[fix] rewrite Yahoo-News engine

Many things have changed since the last review of this engine.  This patch
fixes the XPath selectors, implements suggestions and is a complete review /
rewrite of the engine.

Signed-off-by: Markus Heiser <markus@darmarit.de>
parent 0d8b369b5b
commit d2faea423a
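For reference, the rewritten request() (in the engine diff below) drops all language and cookie handling and only fills the query and a 1-based offset into the URL. A quick sketch of the URLs it produces; the query string "climate change" is an arbitrary example:

    from urllib.parse import urlencode

    # the new format string, as introduced in the diff below
    search_url = (
        'https://news.search.yahoo.com/search'
        '?{query}&b={offset}'
    )

    for pageno in (1, 2, 3):
        offset = (pageno - 1) * 10 + 1   # 10 results per page, b= is 1-based
        print(search_url.format(offset=offset, query=urlencode({'p': 'climate change'})))

    # https://news.search.yahoo.com/search?p=climate+change&b=1
    # https://news.search.yahoo.com/search?p=climate+change&b=11
    # https://news.search.yahoo.com/search?p=climate+change&b=21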
--- a/Makefile
+++ b/Makefile
@@ -196,6 +196,7 @@ PYLINT_FILES=\
 	searx/engines/google_images.py \
 	searx/engines/mediathekviewweb.py \
 	searx/engines/google_scholar.py \
+	searx/engines/yahoo_news.py \
 	searx_extra/update/update_external_bangs.py
 
 test.pylint: pyenvinstall
--- a/searx/engines/yahoo_news.py
+++ b/searx/engines/yahoo_news.py
@@ -1,16 +1,35 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Yahoo (News)
+"""Yahoo (News)
+
+Yahoo News is "English only" and do not offer localized nor language queries.
+
 """
 
+# pylint: disable=invalid-name, missing-function-docstring
+
 import re
-from datetime import datetime, timedelta
 from urllib.parse import urlencode
-from lxml import html
-from searx.engines.yahoo import parse_url, language_aliases
-from searx.engines.yahoo import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
+from datetime import datetime, timedelta
 from dateutil import parser
-from searx.utils import extract_text, extract_url, match_language
+from lxml import html
+
+from searx import logger
+from searx.utils import (
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)
+
+from searx.engines.yahoo import parse_url
+
+# pylint: disable=unused-import
+from searx.engines.yahoo import (
+    _fetch_supported_languages,
+    supported_languages_url,
+)
+# pylint: enable=unused-import
+
+logger = logger.getChild('yahoo_news engine')
 
 # about
 about = {
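The rewrite replaces most bare result.xpath(...) calls with the eval_xpath_list / eval_xpath_getindex helpers from searx.utils. As a rough sketch of their semantics in plain lxml terms (simplified; the real helpers additionally validate the expression and raise searx-specific exceptions):

    from lxml import html

    def xpath_list_sketch(element, xpath_spec):
        # evaluate an XPath that is expected to yield a list
        res = element.xpath(xpath_spec)
        if not isinstance(res, list):
            raise ValueError('xpath %r does not return a list' % xpath_spec)
        return res

    def xpath_getindex_sketch(element, xpath_spec, index, default=None):
        # pick one item from an XPath result, falling back to a default
        res = element.xpath(xpath_spec)
        if -len(res) <= index < len(res):
            return res[index]
        return default

    doc = html.fromstring('<li><h4><a href="https://example.org/a">title</a></h4></li>')
    print(xpath_getindex_sketch(doc, './/h4/a/@href', 0, None))     # 'https://example.org/a'
    print(xpath_getindex_sketch(doc, './/img/@data-src', 0, None))  # None -> result gets skipped

Returning a default of None instead of raising keeps response() free of IndexError when a result block lacks a link or a thumbnail.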
@@ -22,90 +41,78 @@ about = {
     "results": 'HTML',
 }
 
-# engine dependent config
-categories = ['news']
+language_support = False
+time_range_support = False
+safesearch = False
 paging = True
+categories = ['news']
 
 # search-url
-search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}'  # noqa
+search_url = (
+    'https://news.search.yahoo.com/search'
+    '?{query}&b={offset}'
+    )
 
-# specific xpath variables
-results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li'
-url_xpath = './/h3/a/@href'
-title_xpath = './/h3/a'
-content_xpath = './/div[@class="compText"]'
-publishedDate_xpath = './/span[contains(@class,"tri")]'
-suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
+AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')
+AGO_TIMEDELTA = {
+  'minute': timedelta(minutes=1),
+  'hour': timedelta(hours=1),
+  'day': timedelta(days=1),
+  'week': timedelta(days=7),
+  'month': timedelta(days=30),
+  'year': timedelta(days=365),
+}
 
 
-# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
 
-    if params['language'] == 'all':
-        language = 'en'
-    else:
-        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
-
-    params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}),
-                                      lang=language)
-
-    # TODO required?
-    params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\
-        .format(lang=language)
+    params['url'] = search_url.format(
+        offset = offset,
+        query = urlencode({'p': query})
+    )
+    logger.debug("query_url --> %s", params['url'])
     return params
 
 
-def sanitize_url(url):
-    if ".yahoo.com/" in url:
-        return re.sub("\\;\\_ylt\\=.+$", "", url)
-    else:
-        return url
-
-
-# get response from search-request
 def response(resp):
     results = []
-
     dom = html.fromstring(resp.text)
 
     # parse results
-    for result in dom.xpath(results_xpath):
-        urls = result.xpath(url_xpath)
-        if len(urls) != 1:
+    for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):
+
+        url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
+        if url is None:
             continue
-        url = sanitize_url(parse_url(extract_url(urls, search_url)))
-        title = extract_text(result.xpath(title_xpath)[0])
-        content = extract_text(result.xpath(content_xpath)[0])
-
-        # parse publishedDate
-        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
-
-        # still useful ?
-        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
-            publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))
-        elif re.match("^[0-9]+ days? ago$", publishedDate):
-            publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group()))
-        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
-            timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now()\
-                - timedelta(hours=int(timeNumbers[0]))\
-                - timedelta(minutes=int(timeNumbers[1]))
+        url = parse_url(url)
+        title = extract_text(result.xpath('.//h4/a'))
+        content = extract_text(result.xpath('.//p'))
+        img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)
+
+        item = {
+            'url': url,
+            'title': title,
+            'content': content,
+            'img_src' : img_src
+        }
+
+        pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
+        ago = AGO_RE.search(pub_date)
+        if ago:
+            number = int(ago.group(1))
+            delta = AGO_TIMEDELTA[ago.group(2)]
+            pub_date = datetime.now() - delta * number
         else:
             try:
-                publishedDate = parser.parse(publishedDate)
-            except:
-                publishedDate = datetime.now()
+                pub_date = parser.parse(pub_date)
+            except parser.ParserError:
+                pub_date = None
 
-        if publishedDate.year == 1900:
-            publishedDate = publishedDate.replace(year=datetime.now().year)
-
-        # append result
-        results.append({'url': url,
-            'title': title,
-            'content': content,
-                        'publishedDate': publishedDate})
+        if pub_date is not None:
+            item['publishedDate'] = pub_date
+        results.append(item)
+
+    for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
+        results.append({'suggestion': extract_text(suggestion)})
 
-    # return results
     return results
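The AGO_RE / AGO_TIMEDELTA pair replaces the old chain of hand-written re.match branches for relative dates. A standalone check of that logic (the sample strings are assumptions about typical content of the s-time span):

    import re
    from datetime import datetime, timedelta

    AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')
    AGO_TIMEDELTA = {
        'minute': timedelta(minutes=1),
        'hour': timedelta(hours=1),
        'day': timedelta(days=1),
        'week': timedelta(days=7),
        'month': timedelta(days=30),
        'year': timedelta(days=365),
    }

    for sample in ('10 minutes ago', '3 hours ago', '2 days ago', '1 month ago'):
        ago = AGO_RE.search(sample)
        number = int(ago.group(1))
        delta = AGO_TIMEDELTA[ago.group(2)]
        print('%-15s -> %s' % (sample, datetime.now() - delta * number))

Strings that do not match (for example an absolute date) fall through to dateutil's parser.parse(), and on parser.ParserError the date is now dropped instead of being faked with datetime.now() as the old code did.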
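To sanity-check the new selectors without network access, something along these lines can be used; the sample markup is a hypothetical reduction of a Yahoo News result list, not captured output:

    from lxml import html

    SAMPLE = '''
    <ol class="searchCenterMiddle">
      <li>
        <h4><a href="https://example.org/article">Example headline</a></h4>
        <p>Example teaser text.</p>
        <img data-src="https://example.org/thumb.jpg">
        <span class="s-time">3 hours ago</span>
      </li>
    </ol>
    '''

    dom = html.fromstring(SAMPLE)
    for result in dom.xpath('//ol[contains(@class,"searchCenterMiddle")]//li'):
        print(result.xpath('.//h4/a/@href'))        # ['https://example.org/article']
        print(result.xpath('string(.//h4/a)'))      # extract_text() stand-in
        print(result.xpath('string(.//p)'))
        print(result.xpath('.//img/@data-src'))
        print(result.xpath('string(.//span[contains(@class,"s-time")])'))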