| 
									
										
										
										
											2021-01-13 11:31:25 +01:00
										 |  |  | # SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							| 
									
										
										
										
											2021-04-26 20:18:20 +02:00
										 |  |  | # lint: pylint | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  | """Yahoo (News)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Yahoo News is "English only" and does not offer localized or language-specific queries. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-13 11:31:25 +01:00
										 |  |  | """
 | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-07 13:26:59 +02:00
										 |  |  | # pylint: disable=invalid-name | 
					
						
							| 
									
										
										
										
											2014-03-04 13:11:53 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-11-30 18:43:03 +01:00
										 |  |  | import re | 
					
						
							| 
									
										
										
										
											2020-08-06 17:42:46 +02:00
										 |  |  | from urllib.parse import urlencode | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  | from datetime import datetime, timedelta | 
					
						
							| 
									
										
										
										
											2014-03-18 13:19:50 +01:00
										 |  |  | from dateutil import parser | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  | from lxml import html | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from searx.utils import ( | 
					
						
							|  |  |  |     eval_xpath_list, | 
					
						
							|  |  |  |     eval_xpath_getindex, | 
					
						
							|  |  |  |     extract_text, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from searx.engines.yahoo import parse_url | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-13 11:31:25 +01:00
										 |  |  | # about | 
					
						
							|  |  |  | about = { | 
					
						
							|  |  |  |     "website": 'https://news.yahoo.com', | 
					
						
							|  |  |  |     "wikidata_id": 'Q3044717', | 
					
						
							|  |  |  |     "official_api_documentation": 'https://developer.yahoo.com/api/', | 
					
						
							|  |  |  |     "use_official_api": False, | 
					
						
							|  |  |  |     "require_api_key": False, | 
					
						
							|  |  |  |     "results": 'HTML', | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  | language_support = False | 
					
						
							|  |  |  | time_range_support = False | 
					
						
							|  |  |  | safesearch = False | 
					
						
							| 
									
										
										
										
											2014-09-01 16:17:29 +02:00
										 |  |  | paging = True | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  | categories = ['news'] | 
					
						
							| 
									
										
										
										
											2014-09-01 16:17:29 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # search-url | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  | search_url = ( | 
					
						
							|  |  |  |     'https://news.search.yahoo.com/search' | 
					
						
							|  |  |  |     '?{query}&b={offset}' | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)') | 
					
						
							|  |  |  | AGO_TIMEDELTA = { | 
					
						
							|  |  |  |   'minute': timedelta(minutes=1), | 
					
						
							|  |  |  |   'hour': timedelta(hours=1), | 
					
						
							|  |  |  |   'day': timedelta(days=1), | 
					
						
							|  |  |  |   'week': timedelta(days=7), | 
					
						
							|  |  |  |   'month': timedelta(days=30), | 
					
						
							|  |  |  |   'year': timedelta(days=365), | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2014-03-04 13:11:53 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | def request(query, params): | 
					
						
							|  |  |  |     offset = (params['pageno'] - 1) * 10 + 1 | 
					
						
							| 
									
										
										
										
											2014-09-01 16:17:29 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  |     params['url'] = search_url.format( | 
					
						
							|  |  |  |         offset = offset, | 
					
						
							|  |  |  |         query = urlencode({'p': query}) | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     logger.debug("query_url --> %s", params['url']) | 
					
						
							| 
									
										
										
										
											2014-03-04 13:11:53 +01:00
										 |  |  |     return params | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def response(resp): | 
					
						
							|  |  |  |     results = [] | 
					
						
							|  |  |  |     dom = html.fromstring(resp.text) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-01 16:17:29 +02:00
										 |  |  |     # parse results | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  |     for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         url = eval_xpath_getindex(result, './/h4/a/@href', 0, None) | 
					
						
							|  |  |  |         if url is None: | 
					
						
							| 
									
										
										
										
											2015-05-02 21:08:56 +02:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  |         url = parse_url(url) | 
					
						
							|  |  |  |         title = extract_text(result.xpath('.//h4/a')) | 
					
						
							|  |  |  |         content = extract_text(result.xpath('.//p')) | 
					
						
							|  |  |  |         img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         item = { | 
					
						
							|  |  |  |             'url': url, | 
					
						
							|  |  |  |             'title': title, | 
					
						
							|  |  |  |             'content': content, | 
					
						
							|  |  |  |             'img_src' : img_src | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]')) | 
					
						
							|  |  |  |         ago = AGO_RE.search(pub_date) | 
					
						
							|  |  |  |         if ago: | 
					
						
							|  |  |  |             number = int(ago.group(1)) | 
					
						
							|  |  |  |             delta = AGO_TIMEDELTA[ago.group(2)] | 
					
						
							|  |  |  |             pub_date = datetime.now() - delta * number | 
					
						
							| 
									
										
										
										
											2014-03-15 19:20:29 +01:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2017-04-08 19:42:50 +02:00
										 |  |  |             try: | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  |                 pub_date = parser.parse(pub_date) | 
					
						
							|  |  |  |             except parser.ParserError: | 
					
						
							|  |  |  |                 pub_date = None | 
					
						
							| 
									
										
										
										
											2014-03-15 19:20:29 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  |         if pub_date is not None: | 
					
						
							|  |  |  |             item['publishedDate'] = pub_date | 
					
						
							|  |  |  |         results.append(item) | 
					
						
							| 
									
										
										
										
											2014-03-15 19:20:29 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-03-08 09:41:32 +01:00
										 |  |  |         for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'): | 
					
						
							|  |  |  |             results.append({'suggestion': extract_text(suggestion)}) | 
					
						
							| 
									
										
										
										
											2014-03-04 13:11:53 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return results |