| 
									
										
										
										
											2021-01-13 11:31:25 +01:00
										 |  |  | # SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  | # lint: pylint | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | """
 | 
					
						
							|  |  |  | DuckDuckGo Lite | 
					
						
							|  |  |  | ~~~~~~~~~~~~~~~ | 
					
						
							| 
									
										
										
										
											2015-05-02 15:45:17 +02:00
										 |  |  | """
 | 
					
						
							| 
									
										
										
										
											2014-09-02 17:14:57 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | from typing import TYPE_CHECKING | 
					
						
							| 
									
										
										
										
											2023-04-03 09:52:16 +02:00
										 |  |  | import re | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | from urllib.parse import urlencode | 
					
						
							| 
									
										
										
										
											2022-10-04 19:20:32 +02:00
										 |  |  | import json | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | import babel | 
					
						
							|  |  |  | import lxml.html | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | from searx import ( | 
					
						
							|  |  |  |     locales, | 
					
						
							|  |  |  |     redislib, | 
					
						
							| 
									
										
										
										
											2023-04-03 09:52:16 +02:00
										 |  |  |     external_bang, | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  | from searx.utils import ( | 
					
						
							|  |  |  |     eval_xpath, | 
					
						
							|  |  |  |     eval_xpath_getindex, | 
					
						
							|  |  |  |     extract_text, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-06-25 12:37:31 +02:00
										 |  |  | from searx.network import get  # see https://github.com/searxng/searxng/issues/762 | 
					
						
							|  |  |  | from searx import redisdb | 
					
						
							| 
									
										
										
										
											2022-10-04 19:20:32 +02:00
										 |  |  | from searx.enginelib.traits import EngineTraits | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | 
 | 
					
						
# Declarations for static type checkers only.  ``logger`` is used by the
# functions of this module but is never assigned here -- presumably it is
# injected into the module from outside at load time (TODO confirm).
if TYPE_CHECKING:
    import logging

    logger: logging.Logger

# Bare annotation, no value is bound at this point; presumably the engine
# traits object is also set from outside this module (TODO confirm).
traits: EngineTraits
					
						
							| 
									
										
										
										
											2013-10-14 23:09:13 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-13 11:31:25 +01:00
										 |  |  | about = { | 
					
						
							| 
									
										
										
										
											2022-12-22 14:49:58 +01:00
										 |  |  |     "website": 'https://lite.duckduckgo.com/lite/', | 
					
						
							| 
									
										
										
										
											2021-01-13 11:31:25 +01:00
										 |  |  |     "wikidata_id": 'Q12805', | 
					
						
							|  |  |  |     "use_official_api": False, | 
					
						
							|  |  |  |     "require_api_key": False, | 
					
						
							|  |  |  |     "results": 'HTML', | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | send_accept_language_header = True | 
					
						
							|  |  |  | """DuckDuckGo-Lite tries to guess user's prefered language from the HTTP
 | 
					
						
							|  |  |  | ``Accept-Language``.  Optional the user can select a region filter (but not a | 
					
						
							|  |  |  | language). | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-02 17:14:57 +02:00
										 |  |  | # engine dependent config | 
					
						
							| 
									
										
										
										
											2021-12-22 16:58:52 +01:00
										 |  |  | categories = ['general', 'web'] | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  | paging = True | 
					
						
							| 
									
										
										
										
											2016-07-18 16:15:37 +02:00
										 |  |  | time_range_support = True | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | safesearch = True  # user can't select but the results are filtered | 
					
						
							| 
									
										
										
										
											2013-10-14 23:09:13 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | url = 'https://lite.duckduckgo.com/lite/' | 
					
						
							|  |  |  | # url_ping = 'https://duckduckgo.com/t/sl_l' | 
					
						
							| 
									
										
										
										
											2018-03-01 05:30:48 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  | time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} | 
					
						
							| 
									
										
										
										
											2023-09-22 08:53:19 +02:00
										 |  |  | form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'} | 
					
						
							| 
									
										
										
										
											2014-03-29 16:38:45 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-02 17:14:57 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | def cache_vqd(query, value): | 
					
						
							| 
									
										
										
										
											2023-10-10 11:08:14 +02:00
										 |  |  |     """Caches a ``vqd`` value from a query.""" | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     c = redisdb.client() | 
					
						
							|  |  |  |     if c: | 
					
						
							| 
									
										
										
										
											2023-09-22 08:53:19 +02:00
										 |  |  |         logger.debug("cache vqd value: %s", value) | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |         key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) | 
					
						
							| 
									
										
										
										
											2023-09-22 08:53:19 +02:00
										 |  |  |         c.set(key, value, ex=600) | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-05 20:25:13 +02:00
										 |  |  | def get_vqd(query): | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     """Returns the ``vqd`` that fits to the *query*.  If there is no ``vqd`` cached
 | 
					
						
							| 
									
										
										
										
											2023-09-22 08:53:19 +02:00
										 |  |  |     (:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     response. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-10 11:08:14 +02:00
										 |  |  |     .. hint:: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |        If an empty string is returned there are no results for the ``query`` and | 
					
						
							|  |  |  |        therefore no ``vqd`` value. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     DDG's bot detection is sensitive to the ``vqd`` value.  For some search terms | 
					
						
							|  |  |  |     (such as extremely long search terms that are often sent by bots), no ``vqd`` | 
					
						
							|  |  |  |     value can be determined. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     If SearXNG cannot determine a ``vqd`` value, then no request should go out | 
					
						
							|  |  |  |     to DDG: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         A request with a wrong ``vqd`` value leads to DDG temporarily putting | 
					
						
							|  |  |  |         SearXNG's IP on a block list. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Requests from IPs in this block list run into timeouts. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Not sure, but it seems the block list is a sliding window: to get my IP rid | 
					
						
							|  |  |  |     from the bot list I had to cool down my IP for 1h (send no requests from | 
					
						
							|  |  |  |     that IP to DDG). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     TL;DR; the ``vqd`` value is needed to pass DDG's bot protection and is used | 
					
						
							|  |  |  |     by all request to DDG: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     - DuckDuckGo Lite: ``https://lite.duckduckgo.com/lite`` (POST form data) | 
					
						
							|  |  |  |     - DuckDuckGo Web: ``https://links.duckduckgo.com/d.js?q=...&vqd=...`` | 
					
						
							|  |  |  |     - DuckDuckGo Images: ``https://duckduckgo.com/i.js??q=...&vqd=...`` | 
					
						
							|  |  |  |     - DuckDuckGo Videos: ``https://duckduckgo.com/v.js??q=...&vqd=...`` | 
					
						
							|  |  |  |     - DuckDuckGo News: ``https://duckduckgo.com/news.js??q=...&vqd=...`` | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2023-10-10 11:08:14 +02:00
										 |  |  |     value = '' | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     c = redisdb.client() | 
					
						
							|  |  |  |     if c: | 
					
						
							|  |  |  |         key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) | 
					
						
							|  |  |  |         value = c.get(key) | 
					
						
							| 
									
										
										
										
											2023-10-10 11:08:14 +02:00
										 |  |  |         if value or value == b'': | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |             value = value.decode('utf-8') | 
					
						
							|  |  |  |             logger.debug("re-use cached vqd value: %s", value) | 
					
						
							|  |  |  |             return value | 
					
						
							| 
									
										
										
										
											2019-01-06 15:27:46 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-22 08:53:19 +02:00
										 |  |  |     query_url = 'https://lite.duckduckgo.com/lite/?{args}'.format(args=urlencode({'q': query})) | 
					
						
							|  |  |  |     res = get(query_url) | 
					
						
							|  |  |  |     doc = lxml.html.fromstring(res.text) | 
					
						
							| 
									
										
										
										
											2023-10-10 09:04:00 +02:00
										 |  |  |     value = doc.xpath("//input[@name='vqd']/@value") | 
					
						
							|  |  |  |     if value: | 
					
						
							|  |  |  |         value = value[0] | 
					
						
							|  |  |  |     else: | 
					
						
							| 
									
										
										
										
											2023-10-10 11:08:14 +02:00
										 |  |  |         # Some search terms do not have results and therefore no vqd value.  If | 
					
						
							|  |  |  |         # no vqd value can be determined for the search term, an empty string is | 
					
						
							|  |  |  |         # chached. | 
					
						
							| 
									
										
										
										
											2023-10-10 09:04:00 +02:00
										 |  |  |         value = '' | 
					
						
							| 
									
										
										
										
											2023-10-10 11:08:14 +02:00
										 |  |  |     logger.debug("new vqd value: '%s'", value) | 
					
						
							| 
									
										
										
										
											2023-09-22 08:53:19 +02:00
										 |  |  |     cache_vqd(query, value) | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     return value | 
					
						
							| 
									
										
										
										
											2018-03-01 05:30:48 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'): | 
					
						
							|  |  |  |     """Get DuckDuckGo's language identifier from SearXNG's locale.
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-15 09:53:03 +02:00
										 |  |  |     DuckDuckGo defines its languages by region codes (see | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     :py:obj:`fetch_traits`). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     To get region and language of a DDG service use: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     .. code: python | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |        eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) | 
					
						
							|  |  |  |        eng_lang = get_ddg_lang(traits, params['searxng_locale']) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     It might confuse, but the ``l`` value of the cookie is what SearXNG calls | 
					
						
							|  |  |  |     the *region*: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     .. code:: python | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'} | 
					
						
							|  |  |  |         params['cookies']['ad'] = eng_lang | 
					
						
							|  |  |  |         params['cookies']['ah'] = eng_region | 
					
						
							|  |  |  |         params['cookies']['l'] = eng_region | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     .. hint:: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |        `DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language | 
					
						
							|  |  |  |        selection to the user, only a region can be selected by the user | 
					
						
							|  |  |  |        (``eng_region`` from the example above).  DDG-lite stores the selected | 
					
						
							|  |  |  |        region in a cookie:: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |          params['cookies']['kl'] = eng_region  # 'ar-es' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2023-06-25 12:37:31 +02:00
										 |  |  |     return eng_traits.custom['lang_region'].get(  # type: ignore | 
					
						
							|  |  |  |         sxng_locale, eng_traits.get_language(sxng_locale, default) | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ddg_reg_map = { | 
					
						
							|  |  |  |     'tw-tzh': 'zh_TW', | 
					
						
							|  |  |  |     'hk-tzh': 'zh_HK', | 
					
						
							|  |  |  |     'ct-ca': 'skip',  # ct-ca and es-ca both map to ca_ES | 
					
						
							|  |  |  |     'es-ca': 'ca_ES', | 
					
						
							|  |  |  |     'id-en': 'id_ID', | 
					
						
							|  |  |  |     'no-no': 'nb_NO', | 
					
						
							|  |  |  |     'jp-jp': 'ja_JP', | 
					
						
							|  |  |  |     'kr-kr': 'ko_KR', | 
					
						
							|  |  |  |     'xa-ar': 'ar_SA', | 
					
						
							|  |  |  |     'sl-sl': 'sl_SI', | 
					
						
							|  |  |  |     'th-en': 'th_TH', | 
					
						
							|  |  |  |     'vn-en': 'vi_VN', | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ddg_lang_map = { | 
					
						
							|  |  |  |     # use ar --> ar_EG (Egypt's arabic) | 
					
						
							|  |  |  |     "ar_DZ": 'lang_region', | 
					
						
							|  |  |  |     "ar_JO": 'lang_region', | 
					
						
							|  |  |  |     "ar_SA": 'lang_region', | 
					
						
							|  |  |  |     # use bn --> bn_BD | 
					
						
							|  |  |  |     'bn_IN': 'lang_region', | 
					
						
							|  |  |  |     # use de --> de_DE | 
					
						
							|  |  |  |     'de_CH': 'lang_region', | 
					
						
							|  |  |  |     # use en --> en_US, | 
					
						
							|  |  |  |     'en_AU': 'lang_region', | 
					
						
							|  |  |  |     'en_CA': 'lang_region', | 
					
						
							|  |  |  |     'en_GB': 'lang_region', | 
					
						
							|  |  |  |     # Esperanto | 
					
						
							|  |  |  |     'eo_XX': 'eo', | 
					
						
							|  |  |  |     # use es --> es_ES, | 
					
						
							|  |  |  |     'es_AR': 'lang_region', | 
					
						
							|  |  |  |     'es_CL': 'lang_region', | 
					
						
							|  |  |  |     'es_CO': 'lang_region', | 
					
						
							|  |  |  |     'es_CR': 'lang_region', | 
					
						
							|  |  |  |     'es_EC': 'lang_region', | 
					
						
							|  |  |  |     'es_MX': 'lang_region', | 
					
						
							|  |  |  |     'es_PE': 'lang_region', | 
					
						
							|  |  |  |     'es_UY': 'lang_region', | 
					
						
							|  |  |  |     'es_VE': 'lang_region', | 
					
						
							|  |  |  |     # use fr --> rf_FR | 
					
						
							|  |  |  |     'fr_CA': 'lang_region', | 
					
						
							|  |  |  |     'fr_CH': 'lang_region', | 
					
						
							|  |  |  |     'fr_BE': 'lang_region', | 
					
						
							|  |  |  |     # use nl --> nl_NL | 
					
						
							|  |  |  |     'nl_BE': 'lang_region', | 
					
						
							|  |  |  |     # use pt --> pt_PT | 
					
						
							|  |  |  |     'pt_BR': 'lang_region', | 
					
						
							|  |  |  |     # skip these languages | 
					
						
							|  |  |  |     'od_IN': 'skip', | 
					
						
							|  |  |  |     'io_XX': 'skip', | 
					
						
							|  |  |  |     'tokipona_XX': 'skip', | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2017-05-21 05:33:08 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def request(query, params): | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-10 11:08:14 +02:00
										 |  |  |     # request needs a vqd argument | 
					
						
							|  |  |  |     vqd = get_vqd(query) | 
					
						
							|  |  |  |     if not vqd: | 
					
						
							|  |  |  |         # some search terms do not have results and therefore no vqd value | 
					
						
							|  |  |  |         params['url'] = None | 
					
						
							|  |  |  |         return params | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-04-03 09:52:16 +02:00
										 |  |  |     # quote ddg bangs | 
					
						
							|  |  |  |     query_parts = [] | 
					
						
							|  |  |  |     # for val in re.split(r'(\s+)', query): | 
					
						
							|  |  |  |     for val in re.split(r'(\s+)', query): | 
					
						
							|  |  |  |         if not val.strip(): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]): | 
					
						
							|  |  |  |             val = f"'{val}'" | 
					
						
							|  |  |  |         query_parts.append(val) | 
					
						
							|  |  |  |     query = ' '.join(query_parts) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) | 
					
						
							|  |  |  |     # eng_lang = get_ddg_lang(traits, params['searxng_locale']) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-09 15:01:40 +02:00
										 |  |  |     params['url'] = url | 
					
						
							|  |  |  |     params['method'] = 'POST' | 
					
						
							|  |  |  |     params['data']['q'] = query | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # The API is not documented, so we do some reverse engineering and emulate | 
					
						
							|  |  |  |     # what https://lite.duckduckgo.com/lite/ does when you press "next Page" | 
					
						
							|  |  |  |     # link again and again .. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     params['headers']['Content-Type'] = 'application/x-www-form-urlencoded' | 
					
						
							| 
									
										
										
										
											2023-10-10 11:08:14 +02:00
										 |  |  |     params['data']['vqd'] = vqd | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # initial page does not have an offset | 
					
						
							|  |  |  |     if params['pageno'] == 2: | 
					
						
							|  |  |  |         # second page does have an offset of 30 | 
					
						
							|  |  |  |         offset = (params['pageno'] - 1) * 30 | 
					
						
							|  |  |  |         params['data']['s'] = offset | 
					
						
							|  |  |  |         params['data']['dc'] = offset + 1 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     elif params['pageno'] > 2: | 
					
						
							|  |  |  |         # third and following pages do have an offset of 30 + n*50 | 
					
						
							|  |  |  |         offset = 30 + (params['pageno'] - 2) * 50 | 
					
						
							|  |  |  |         params['data']['s'] = offset | 
					
						
							|  |  |  |         params['data']['dc'] = offset + 1 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # initial page does not have additional data in the input form | 
					
						
							|  |  |  |     if params['pageno'] > 1: | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-22 08:53:19 +02:00
										 |  |  |         params['data']['o'] = form_data.get('o', 'json') | 
					
						
							|  |  |  |         params['data']['api'] = form_data.get('api', 'd.js') | 
					
						
							|  |  |  |         params['data']['nextParams'] = form_data.get('nextParams', '') | 
					
						
							|  |  |  |         params['data']['v'] = form_data.get('v', 'l') | 
					
						
							| 
									
										
										
										
											2023-10-10 08:13:07 +02:00
										 |  |  |         params['headers']['Referer'] = 'https://lite.duckduckgo.com/' | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     params['data']['kl'] = eng_region | 
					
						
							|  |  |  |     params['cookies']['kl'] = eng_region | 
					
						
							| 
									
										
										
										
											2016-07-18 16:15:37 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  |     params['data']['df'] = '' | 
					
						
							| 
									
										
										
										
											2021-02-09 12:07:19 +01:00
										 |  |  |     if params['time_range'] in time_range_dict: | 
					
						
							|  |  |  |         params['data']['df'] = time_range_dict[params['time_range']] | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  |         params['cookies']['df'] = time_range_dict[params['time_range']] | 
					
						
							| 
									
										
										
										
											2021-02-09 12:07:19 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  |     logger.debug("param data: %s", params['data']) | 
					
						
							|  |  |  |     logger.debug("param cookies: %s", params['cookies']) | 
					
						
							| 
									
										
										
										
											2013-10-14 23:09:13 +02:00
										 |  |  |     return params | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-14 23:09:13 +02:00
										 |  |  | def response(resp): | 
					
						
							| 
									
										
										
										
											2021-02-09 12:07:19 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  |     if resp.status_code == 303: | 
					
						
							|  |  |  |         return [] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-02-09 14:36:43 +01:00
										 |  |  |     results = [] | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     doc = lxml.html.fromstring(resp.text) | 
					
						
							| 
									
										
										
										
											2014-09-02 17:14:57 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  |     result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if len(result_table) == 2: | 
					
						
							|  |  |  |         # some locales (at least China) does not have a "next page" button and | 
					
						
							|  |  |  |         # the layout of the HTML tables is different. | 
					
						
							|  |  |  |         result_table = result_table[1] | 
					
						
							|  |  |  |     elif not len(result_table) >= 3: | 
					
						
							| 
									
										
										
										
											2023-09-22 08:53:19 +02:00
										 |  |  |         # no more results | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  |         return [] | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     else: | 
					
						
							|  |  |  |         result_table = result_table[2] | 
					
						
							|  |  |  |         # update form data from response | 
					
						
							|  |  |  |         form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..') | 
					
						
							|  |  |  |         if len(form): | 
					
						
							| 
									
										
										
										
											2023-09-22 08:53:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |             form = form[0] | 
					
						
							|  |  |  |             form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0] | 
					
						
							|  |  |  |             form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0] | 
					
						
							|  |  |  |             form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0] | 
					
						
							|  |  |  |             logger.debug('form_data: %s', form_data) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             value = eval_xpath(form, '//input[@name="vqd"]/@value')[0] | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |             query = resp.search_params['data']['q'] | 
					
						
							| 
									
										
										
										
											2023-09-22 08:53:19 +02:00
										 |  |  |             cache_vqd(query, value) | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     tr_rows = eval_xpath(result_table, './/tr') | 
					
						
							|  |  |  |     # In the last <tr> is the form of the 'previous/next page' links | 
					
						
							|  |  |  |     tr_rows = tr_rows[:-1] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     len_tr_rows = len(tr_rows) | 
					
						
							|  |  |  |     offset = 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     while len_tr_rows >= offset + 4: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # assemble table rows we need to scrap | 
					
						
							|  |  |  |         tr_title = tr_rows[offset] | 
					
						
							|  |  |  |         tr_content = tr_rows[offset + 1] | 
					
						
							|  |  |  |         offset += 4 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # ignore sponsored Adds <tr class="result-sponsored"> | 
					
						
							|  |  |  |         if tr_content.get('class') == 'result-sponsored': | 
					
						
							| 
									
										
										
										
											2013-10-15 19:11:43 +02:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2014-09-02 17:14:57 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  |         a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None) | 
					
						
							|  |  |  |         if a_tag is None: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2014-09-02 17:14:57 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-30 16:40:00 +02:00
										 |  |  |         td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None) | 
					
						
							|  |  |  |         if td_content is None: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2014-03-21 16:33:17 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |         results.append( | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 'title': a_tag.text_content(), | 
					
						
							|  |  |  |                 'content': extract_text(td_content), | 
					
						
							|  |  |  |                 'url': a_tag.get('href'), | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2020-06-13 23:42:16 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-15 19:11:43 +02:00
										 |  |  |     return results | 
					
						
							| 
									
										
										
										
											2016-11-06 03:51:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | def fetch_traits(engine_traits: EngineTraits): | 
					
						
							|  |  |  |     """Fetch languages & regions from DuckDuckGo.
 | 
					
						
							| 
									
										
										
										
											2016-11-06 03:51:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``). | 
					
						
							|  |  |  |     DuckDuckGo's language "Browsers prefered language" (``wt_WT``) makes no | 
					
						
							|  |  |  |     sense in a SearXNG request since SearXNG's ``all`` will not add a | 
					
						
							|  |  |  |     ``Accept-Language`` HTTP header.  The value in ``engine_traits.all_locale`` | 
					
						
							|  |  |  |     is ``wt-wt`` (the region). | 
					
						
							| 
									
										
										
										
											2022-10-04 19:20:32 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-15 09:53:03 +02:00
										 |  |  |     Beside regions DuckDuckGo also defines its languages by region codes.  By | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     example these are the english languages in DuckDuckGo: | 
					
						
							| 
									
										
										
										
											2022-10-04 19:20:32 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     - en_US | 
					
						
							|  |  |  |     - en_AU | 
					
						
							|  |  |  |     - en_CA | 
					
						
							|  |  |  |     - en_GB | 
					
						
							| 
									
										
										
										
											2022-10-04 19:20:32 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from | 
					
						
							|  |  |  |     SearXNG's locale. | 
					
						
							| 
									
										
										
										
											2022-10-04 19:20:32 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     # pylint: disable=too-many-branches, too-many-statements | 
					
						
							|  |  |  |     # fetch regions | 
					
						
							| 
									
										
										
										
											2022-10-04 19:20:32 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     engine_traits.all_locale = 'wt-wt' | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     # updated from u588 to u661 / should be updated automatically? | 
					
						
							| 
									
										
										
										
											2023-06-25 12:37:31 +02:00
										 |  |  |     resp = get('https://duckduckgo.com/util/u661.js') | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-25 12:37:31 +02:00
										 |  |  |     if not resp.ok:  # type: ignore | 
					
						
							| 
									
										
										
										
											2022-10-04 19:20:32 +02:00
										 |  |  |         print("ERROR: response from DuckDuckGo is not OK.") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-25 12:37:31 +02:00
										 |  |  |     pos = resp.text.find('regions:{') + 8  # type: ignore | 
					
						
							|  |  |  |     js_code = resp.text[pos:]  # type: ignore | 
					
						
							| 
									
										
										
										
											2022-10-04 19:20:32 +02:00
										 |  |  |     pos = js_code.find('}') + 1 | 
					
						
							|  |  |  |     regions = json.loads(js_code[:pos]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for eng_tag, name in regions.items(): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if eng_tag == 'wt-wt': | 
					
						
							|  |  |  |             engine_traits.all_locale = 'wt-wt' | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |         region = ddg_reg_map.get(eng_tag) | 
					
						
							| 
									
										
										
										
											2022-10-04 19:20:32 +02:00
										 |  |  |         if region == 'skip': | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if not region: | 
					
						
							|  |  |  |             eng_territory, eng_lang = eng_tag.split('-') | 
					
						
							|  |  |  |             region = eng_lang + '_' + eng_territory.upper() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         try: | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |             sxng_tag = locales.region_tag(babel.Locale.parse(region)) | 
					
						
							| 
									
										
										
										
											2022-10-04 19:20:32 +02:00
										 |  |  |         except babel.UnknownLocaleError: | 
					
						
							|  |  |  |             print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region)) | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         conflict = engine_traits.regions.get(sxng_tag) | 
					
						
							|  |  |  |         if conflict: | 
					
						
							|  |  |  |             if conflict != eng_tag: | 
					
						
							|  |  |  |                 print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         engine_traits.regions[sxng_tag] = eng_tag | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # fetch languages | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     engine_traits.custom['lang_region'] = {} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-25 12:37:31 +02:00
										 |  |  |     pos = resp.text.find('languages:{') + 10  # type: ignore | 
					
						
							|  |  |  |     js_code = resp.text[pos:]  # type: ignore | 
					
						
							| 
									
										
										
										
											2022-11-05 15:10:52 +01:00
										 |  |  |     pos = js_code.find('}') + 1 | 
					
						
							|  |  |  |     js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"') | 
					
						
							|  |  |  |     languages = json.loads(js_code) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for eng_lang, name in languages.items(): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if eng_lang == 'wt_WT': | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         babel_tag = ddg_lang_map.get(eng_lang, eng_lang) | 
					
						
							|  |  |  |         if babel_tag == 'skip': | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             if babel_tag == 'lang_region': | 
					
						
							|  |  |  |                 sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang)) | 
					
						
							|  |  |  |                 engine_traits.custom['lang_region'][sxng_tag] = eng_lang | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         except babel.UnknownLocaleError: | 
					
						
							|  |  |  |             print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang)) | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         conflict = engine_traits.languages.get(sxng_tag) | 
					
						
							|  |  |  |         if conflict: | 
					
						
							|  |  |  |             if conflict != eng_lang: | 
					
						
							|  |  |  |                 print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang)) | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         engine_traits.languages[sxng_tag] = eng_lang |