| 
									
										
										
										
											2021-01-13 11:31:25 +01:00
										 |  |  | # SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							| 
									
										
										
										
											2019-11-29 18:56:29 +01:00
										 |  |  | """
 | 
					
						
							| 
									
										
										
										
											2021-01-13 11:31:25 +01:00
										 |  |  |  DuckDuckGo (Instant Answer API) | 
					
						
							| 
									
										
										
										
											2019-11-29 18:56:29 +01:00
										 |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-14 23:54:33 +02:00
										 |  |  | import json | 
					
						
							| 
									
										
										
										
											2020-12-06 10:14:09 +01:00
										 |  |  | from urllib.parse import urlencode, urlparse, urljoin | 
					
						
							| 
									
										
										
										
											2014-09-28 16:51:41 +02:00
										 |  |  | from lxml import html | 
					
						
							| 
									
										
										
										
											2020-10-26 19:25:28 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | from searx import logger | 
					
						
							|  |  |  | from searx.data import WIKIDATA_UNITS | 
					
						
							| 
									
										
										
										
											2020-11-02 11:19:53 +01:00
										 |  |  | from searx.engines.duckduckgo import language_aliases | 
					
						
							| 
									
										
										
										
											2020-11-16 09:43:23 +01:00
										 |  |  | from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import | 
					
						
							| 
									
										
										
										
											2020-10-26 19:25:28 +01:00
										 |  |  | from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function | 
					
						
							|  |  |  | from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom | 
					
						
							|  |  |  | 
 | 
					
						
# per-engine child logger, prefixes log lines with the engine name
logger = logger.getChild('duckduckgo_definitions')

# about
about = {
    "website": 'https://duckduckgo.com/',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

# Instant Answer API endpoint; {query} is filled by request() with the
# urlencoded search terms (flags: see the official API documentation)
URL = 'https://api.duckduckgo.com/'\
    + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

# accepted URL prefixes for Wikidata entity references (both schemes)
WIKIDATA_PREFIX = [
    'http://www.wikidata.org/entity/',
    'https://www.wikidata.org/entity/'
]

# rewrite http: URLs to https: — used on infobox ids so they can be
# merged with wikidata's infobox (see response())
replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def is_broken_text(text): | 
					
						
							|  |  |  |     """ duckduckgo may return something like "<a href="xxxx">http://somewhere Related website<a/>"
 | 
					
						
							| 
									
										
										
										
											2016-04-18 17:52:16 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-26 19:25:28 +01:00
										 |  |  |     The href URL is broken, the "Related website" may contains some HTML. | 
					
						
							| 
									
										
										
										
											2013-10-14 23:54:33 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-26 19:25:28 +01:00
										 |  |  |     The best solution seems to ignore these results. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     return text.startswith('http') and ' ' in text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def result_to_text(text, htmlResult): | 
					
						
							| 
									
										
										
										
											2014-09-28 16:51:41 +02:00
										 |  |  |     # TODO : remove result ending with "Meaning" or "Category" | 
					
						
							| 
									
										
										
										
											2020-10-26 19:25:28 +01:00
										 |  |  |     result = None | 
					
						
							| 
									
										
										
										
											2014-09-28 16:51:41 +02:00
										 |  |  |     dom = html.fromstring(htmlResult) | 
					
						
							|  |  |  |     a = dom.xpath('//a') | 
					
						
							| 
									
										
										
										
											2014-12-07 16:36:20 +01:00
										 |  |  |     if len(a) >= 1: | 
					
						
							| 
									
										
										
										
											2020-10-26 19:25:28 +01:00
										 |  |  |         result = extract_text(a[0]) | 
					
						
							| 
									
										
										
										
											2014-09-28 16:51:41 +02:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2020-10-26 19:25:28 +01:00
										 |  |  |         result = text | 
					
						
							|  |  |  |     if not is_broken_text(result): | 
					
						
							|  |  |  |         return result | 
					
						
							|  |  |  |     return None | 
					
						
							| 
									
										
										
										
											2014-09-28 16:51:41 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-12-07 16:36:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-14 23:54:33 +02:00
										 |  |  | def request(query, params): | 
					
						
							| 
									
										
										
										
											2020-10-26 19:25:28 +01:00
										 |  |  |     params['url'] = URL.format(query=urlencode({'q': query})) | 
					
						
							| 
									
										
										
										
											2018-03-01 05:30:48 +01:00
										 |  |  |     language = match_language(params['language'], supported_languages, language_aliases) | 
					
						
							| 
									
										
										
										
											2019-11-29 18:56:29 +01:00
										 |  |  |     language = language.split('-')[0] | 
					
						
							|  |  |  |     params['headers']['Accept-Language'] = language | 
					
						
							| 
									
										
										
										
											2013-10-14 23:54:33 +02:00
										 |  |  |     return params | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
def response(resp):
    """Parse the Instant Answer JSON payload into searx results.

    From the single JSON document this builds, in order:
    * an 'answer' result (skipped for 'calc'/'ip' answer types),
    * plain url results for the official website(s) and the abstract,
    * 'suggestion' results from related topics,
    * one infobox (or a single url result when the data is too thin).
    """
    results = []

    search_res = json.loads(resp.text)

    # search_res.get('Entity') possible values (not exhaustive) :
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television  / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * company

    # accumulators for the infobox built at the end of this function
    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
        # 'calc' and 'ip' answers just echo derived values; not useful here
        if search_res.get('AnswerType') not in ['calc', 'ip']:
            results.append({'answer': html_to_text(answer)})

    # add infobox
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image
    image = search_res.get('Image')
    image = None if image == '' else image
    # relative image paths are served by duckduckgo itself
    if image is not None and urlparse(image).netloc == '':
        image = urljoin('https://duckduckgo.com', image)

    # urls
    # Official website, Wikipedia page
    for ddg_result in search_res.get('Results', []):
        firstURL = ddg_result.get('FirstURL')
        text = ddg_result.get('Text')
        if firstURL is not None and text is not None:
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL')
            text = ddg_result.get('Text')
            # NOTE(review): assumes 'Text' is present whenever 'FirstURL'
            # is — a missing 'Text' would make is_broken_text raise
            if not is_broken_text(text):
                suggestion = result_to_text(text,
                                            ddg_result.get('Result'))
                # don't suggest the query's own heading back to the user
                if suggestion != heading and suggestion is not None:
                    results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            # grouped topics: the suggestions list is appended to
            # relatedTopics first, then filled in place below
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''),
                                  'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('Text'),
                                            topic_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem always in english
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'),
                     'url': abstractURL,
                     'official': True})
        results.append({'url': abstractURL,
                        'title': heading})

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem always in english
        # a definition URL overrides the abstract URL as infobox id
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'),
                     'url': definitionURL})

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)

    # attributes
    # some will be converted to urls
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox')
        if 'content' in infobox:
            # default zoom; refined from the 'area' attribute when present
            osm_zoom = 17
            coordinates = None
            for info in infobox.get('content'):
                data_type = info.get('data_type')
                data_label = info.get('label')
                data_value = info.get('value')

                # Workaround: ddg may return a double quote
                if data_value == '""':
                    continue

                # Is it an external URL ?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url = get_external_url(data_type, data_value)
                if external_url is not None:
                    urls.append({'title': data_label,
                                 'url': external_url})
                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to a javascript
                    # ignore google_play_artist_id: service shutdown
                    pass
                elif data_type == 'string' and data_label == 'Website':
                    # There is already an URL for the website
                    pass
                elif data_type == 'area':
                    attributes.append({'label': data_label,
                                       'value': area_to_str(data_value),
                                       'entity': 'P2046'})
                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
                elif data_type == 'coordinates':
                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
                        # coordinate on Earth
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinate NOT on Earth
                        attributes.append({'label': data_label,
                                           'value': data_value,
                                           'entity': 'P625'})
                elif data_type == 'string':
                    attributes.append({'label': data_label,
                                       'value': data_value})

            # Earth coordinates become an OpenStreetMap link, using the
            # zoom derived from the area attribute (if one was seen)
            if coordinates:
                data_label = coordinates.get('label')
                data_value = coordinates.get('value')
                latitude = data_value.get('latitude')
                longitude = data_value.get('longitude')
                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
                urls.append({'title': 'OpenStreetMap',
                             'url': url,
                             'entity': 'P625'})

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'
        # too little data for an infobox: emit a single url result instead
        if image is None and len(attributes) == 0 and len(urls) == 1 and\
           len(relatedTopics) == 0 and len(content) == 0:
            results.append({'url': urls[0]['url'],
                            'title': heading,
                            'content': content})
        else:
            results.append({'infobox': heading,
                            'id': infobox_id,
                            'content': content,
                            'img_src': image,
                            'attributes': attributes,
                            'urls': urls,
                            'relatedTopics': relatedTopics})

    return results
					
						
							| 
									
										
										
										
											2020-10-26 19:25:28 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def unit_to_str(unit): | 
					
						
							|  |  |  |     for prefix in WIKIDATA_PREFIX: | 
					
						
							|  |  |  |         if unit.startswith(prefix): | 
					
						
							|  |  |  |             wikidata_entity = unit[len(prefix):] | 
					
						
							|  |  |  |             return WIKIDATA_UNITS.get(wikidata_entity, unit) | 
					
						
							|  |  |  |     return unit | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def area_to_str(area): | 
					
						
							|  |  |  |     """parse {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}""" | 
					
						
							|  |  |  |     unit = unit_to_str(area.get('unit')) | 
					
						
							|  |  |  |     if unit is not None: | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             amount = float(area.get('amount')) | 
					
						
							|  |  |  |             return '{} {}'.format(amount, unit) | 
					
						
							|  |  |  |         except ValueError: | 
					
						
							|  |  |  |             pass | 
					
						
							|  |  |  |     return '{} {}'.format(area.get('amount', ''), area.get('unit', '')) |