Merge pull request #609 from return42/fix-bing-news
[fix] engine bing-news: replace the http:// by https://
This commit is contained in: commit 4da1e0026c
@@ -1,16 +1,27 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Bing (News)
+# lint: pylint
+"""Bing (News)
 """
 
+from urllib.parse import (
+    urlencode,
+    urlparse,
+    parse_qsl,
+    quote,
+)
 from datetime import datetime
 from dateutil import parser
-from urllib.parse import urlencode, urlparse, parse_qsl
 from lxml import etree
 from lxml.etree import XPath
-from searx.utils import match_language, eval_xpath_getindex
-from searx.engines.bing import language_aliases
-from searx.engines.bing import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
+from searx.utils import (
+    match_language,
+    eval_xpath_getindex
+)
+from searx.engines.bing import (  # pylint: disable=unused-import
+    language_aliases,
+    _fetch_supported_languages,
+    supported_languages_url,
+)
 
 # about
 about = {
@@ -31,69 +42,71 @@ time_range_support = True
 base_url = 'https://www.bing.com/'
 search_string = 'news/search?{query}&first={offset}&format=RSS'
 search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS'
-time_range_dict = {'day': '7',
-                   'week': '8',
-                   'month': '9'}
+time_range_dict = {
+    'day': '7',
+    'week': '8',
+    'month': '9'
+}
 
-
-# remove click
 def url_cleanup(url_string):
+    """remove click"""
+
     parsed_url = urlparse(url_string)
     if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
         query = dict(parse_qsl(parsed_url.query))
-        return query.get('url', None)
+        url_string = query.get('url', None)
     return url_string
 
-
-# replace the http://*bing4.com/th?id=... by https://www.bing.com/th?id=...
 def image_url_cleanup(url_string):
-    parsed_url = urlparse(url_string)
-    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
-        query = dict(parse_qsl(parsed_url.query))
-        return "https://www.bing.com/th?id=" + query.get('id')
-    return url_string
+    """replace the http://*bing.com/th?id=... by https://www.bing.com/th?id=..."""
 
+    parsed_url = urlparse(url_string)
+    if parsed_url.netloc.endswith('bing.com') and parsed_url.path == '/th':
+        query = dict(parse_qsl(parsed_url.query))
+        url_string = "https://www.bing.com/th?id=" + quote(query.get('id'))
+    return url_string
 
 def _get_url(query, language, offset, time_range):
     if time_range in time_range_dict:
         search_path = search_string_with_time.format(
-            query=urlencode({'q': query, 'setmkt': language}),
-            offset=offset,
-            interval=time_range_dict[time_range])
+            query = urlencode({
+                'q': query,
+                'setmkt': language
+            }),
+            offset = offset,
+            interval = time_range_dict[time_range]
+        )
     else:
         # e.g. setmkt=de-de&setlang=de
         search_path = search_string.format(
-            query=urlencode({'q': query, 'setmkt': language}),
-            offset=offset)
+            query = urlencode({
+                'q': query,
+                'setmkt': language
+            }),
+            offset = offset
+        )
     return base_url + search_path
 
-
-# do search-request
 def request(query, params):
+
     if params['time_range'] and params['time_range'] not in time_range_dict:
         return params
 
     offset = (params['pageno'] - 1) * 10 + 1
-
     if params['language'] == 'all':
         language = 'en-US'
     else:
         language = match_language(params['language'], supported_languages, language_aliases)
-
     params['url'] = _get_url(query, language, offset, params['time_range'])
 
     return params
 
-
-# get response from search-request
 def response(resp):
+
     results = []
-
     rss = etree.fromstring(resp.content)
+    namespaces = rss.nsmap
 
-    ns = rss.nsmap
-
-    # parse results
     for item in rss.xpath('./channel/item'):
         # url / title / content
         url = url_cleanup(eval_xpath_getindex(item, './link/text()', 0, default=None))
@@ -110,22 +123,26 @@ def response(resp):
             publishedDate = datetime.now()
 
         # thumbnail
-        thumbnail = eval_xpath_getindex(item, XPath('./News:Image/text()', namespaces=ns), 0, default=None)
+        thumbnail = eval_xpath_getindex(
+            item, XPath('./News:Image/text()', namespaces=namespaces), 0, default=None)
         if thumbnail is not None:
             thumbnail = image_url_cleanup(thumbnail)
 
         # append result
         if thumbnail is not None:
-            results.append({'url': url,
-                            'title': title,
-                            'publishedDate': publishedDate,
-                            'content': content,
-                            'img_src': thumbnail})
+            results.append({
+                'url': url,
+                'title': title,
+                'publishedDate': publishedDate,
+                'content': content,
+                'img_src': thumbnail
+            })
         else:
-            results.append({'url': url,
-                            'title': title,
-                            'publishedDate': publishedDate,
-                            'content': content})
+            results.append({
+                'url': url,
+                'title': title,
+                'publishedDate': publishedDate,
+                'content': content
+            })
 
-    # return results
     return results
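Note: the core of the fix is in the two cleanup helpers. The netloc check is widened from bing4.com to any host ending in bing.com, the rebuilt thumbnail URL is forced onto https://www.bing.com, and the id parameter is re-quoted. The following is a standalone sketch of that patched logic using plain urllib; the sample URLs are made up for illustration, not real Bing responses.

# Standalone sketch of the patched helpers (plain urllib, no searx imports).
# The sample URLs below are made up for illustration.
from urllib.parse import urlparse, parse_qsl, quote

def url_cleanup(url_string):
    """Unwrap Bing's click-tracking redirect and return the target URL."""
    parsed_url = urlparse(url_string)
    if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
        query = dict(parse_qsl(parsed_url.query))
        url_string = query.get('url', None)
    return url_string

def image_url_cleanup(url_string):
    """Rewrite http://*bing.com/th?id=... thumbnails to https://www.bing.com/th?id=..."""
    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing.com') and parsed_url.path == '/th':
        query = dict(parse_qsl(parsed_url.query))
        url_string = "https://www.bing.com/th?id=" + quote(query.get('id'))
    return url_string

print(url_cleanup('https://www.bing.com/news/apiclick.aspx?ref=FexRss&url=https%3a%2f%2fexample.org%2farticle'))
# https://example.org/article
print(image_url_cleanup('http://th.bing.com/th?id=OIP.abc123&pid=News'))
# https://www.bing.com/th?id=OIP.abc123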
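Note: _get_url itself is only reformatted by this patch. For reference, this is the kind of RSS request URL it assembles; the query, market and time-range values below are illustrative.

# Worked example of the RSS request URL built by _get_url (illustrative values).
from urllib.parse import urlencode

base_url = 'https://www.bing.com/'
search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS'
time_range_dict = {'day': '7', 'week': '8', 'month': '9'}

pageno = 1
offset = (pageno - 1) * 10 + 1   # same offset formula as request()
url = base_url + search_string_with_time.format(
    query=urlencode({'q': 'example query', 'setmkt': 'en-US'}),
    offset=offset,
    interval=time_range_dict['week'],
)
print(url)
# https://www.bing.com/news/search?q=example+query&setmkt=en-US&first=1&qft=interval%3d"8"&format=RSS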
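Note: response() resolves the News: prefix of the thumbnail XPath through the nsmap of the feed's root element. A minimal sketch of that lookup with plain lxml (instead of searx's eval_xpath_getindex helper); the xmlns:News URI and the feed content below are made up for illustration, the real feed declares its own URI which is picked up via rss.nsmap.

# Minimal sketch of the namespace-aware thumbnail lookup (plain lxml).
from lxml import etree
from lxml.etree import XPath

sample_feed = b"""<rss xmlns:News="http://example.org/news-namespace">
  <channel>
    <item>
      <link>https://example.org/article</link>
      <News:Image>http://th.bing.com/th?id=OIP.abc123&amp;pid=News</News:Image>
    </item>
  </channel>
</rss>"""

rss = etree.fromstring(sample_feed)
namespaces = rss.nsmap
get_image = XPath('./News:Image/text()', namespaces=namespaces)
for item in rss.xpath('./channel/item'):
    thumbnail = get_image(item)[0]
    print(thumbnail)   # would then be passed through image_url_cleanup()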