[fix] duckduckgo extra: crashes and returns no results
This commit is contained in:
		
							parent
							
								
									c4b874e9b0
								
							
						
					
					
						commit
						f0f0b2d4c9
					
				@ -1,12 +1,14 @@
 | 
				
			|||||||
# SPDX-License-Identifier: AGPL-3.0-or-later
 | 
					# SPDX-License-Identifier: AGPL-3.0-or-later
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
DuckDuckGo Lite
 | 
					DuckDuckGo WEB
 | 
				
			||||||
~~~~~~~~~~~~~~~
 | 
					~~~~~~~~~~~~~~
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from __future__ import annotations
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from typing import TYPE_CHECKING
 | 
					from typing import TYPE_CHECKING
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
from urllib.parse import urlencode
 | 
					from urllib.parse import urlencode, quote_plus
 | 
				
			||||||
import json
 | 
					import json
 | 
				
			||||||
import babel
 | 
					import babel
 | 
				
			||||||
import lxml.html
 | 
					import lxml.html
 | 
				
			||||||
@ -18,12 +20,12 @@ from searx import (
 | 
				
			|||||||
)
 | 
					)
 | 
				
			||||||
from searx.utils import (
 | 
					from searx.utils import (
 | 
				
			||||||
    eval_xpath,
 | 
					    eval_xpath,
 | 
				
			||||||
 | 
					    extr,
 | 
				
			||||||
    extract_text,
 | 
					    extract_text,
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
from searx.network import get  # see https://github.com/searxng/searxng/issues/762
 | 
					from searx.network import get  # see https://github.com/searxng/searxng/issues/762
 | 
				
			||||||
from searx import redisdb
 | 
					from searx import redisdb
 | 
				
			||||||
from searx.enginelib.traits import EngineTraits
 | 
					from searx.enginelib.traits import EngineTraits
 | 
				
			||||||
from searx.utils import extr
 | 
					 | 
				
			||||||
from searx.exceptions import SearxEngineCaptchaException
 | 
					from searx.exceptions import SearxEngineCaptchaException
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if TYPE_CHECKING:
 | 
					if TYPE_CHECKING:
 | 
				
			||||||
@ -60,42 +62,30 @@ form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
 | 
				
			|||||||
__CACHE = []
 | 
					__CACHE = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _cache_key(data: dict):
 | 
					def _cache_key(query: str, region: str):
 | 
				
			||||||
    return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{data['q']}//{data['kl']}")
 | 
					    return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{query}//{region}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def cache_vqd(data: dict, value):
 | 
					def cache_vqd(query: str, region: str, value: str):
 | 
				
			||||||
    """Caches a ``vqd`` value from a query."""
 | 
					    """Caches a ``vqd`` value from a query."""
 | 
				
			||||||
    c = redisdb.client()
 | 
					    c = redisdb.client()
 | 
				
			||||||
    if c:
 | 
					    if c:
 | 
				
			||||||
        logger.debug("cache vqd value: %s", value)
 | 
					        logger.debug("VALKEY cache vqd value: %s (%s)", value, region)
 | 
				
			||||||
        c.set(_cache_key(data), value, ex=600)
 | 
					        c.set(_cache_key(query, region), value, ex=600)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        logger.debug("MEM cache vqd value: %s", value)
 | 
					        logger.debug("MEM cache vqd value: %s (%s)", value, region)
 | 
				
			||||||
        if len(__CACHE) > 100:  # cache vqd from last 100 queries
 | 
					        if len(__CACHE) > 100:  # cache vqd from last 100 queries
 | 
				
			||||||
            __CACHE.pop(0)
 | 
					            __CACHE.pop(0)
 | 
				
			||||||
        __CACHE.append((_cache_key(data), value))
 | 
					        __CACHE.append((_cache_key(query, region), value))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_vqd(data):
 | 
					def get_vqd(query: str, region: str, force_request: bool = False):
 | 
				
			||||||
    """Returns the ``vqd`` that fits to the *query* (``data`` from HTTP POST).
 | 
					    """Returns the ``vqd`` that fits to the *query*.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DDG's bot detection is sensitive to the ``vqd`` value.  For some search terms
 | 
					    :param query: The query term
 | 
				
			||||||
    (such as extremely long search terms that are often sent by bots), no ``vqd``
 | 
					    :param region: DDG's region code
 | 
				
			||||||
    value can be determined.
 | 
					    :param force_request: force a request to get a vqd value from DDG
 | 
				
			||||||
 | 
					 | 
				
			||||||
    If SearXNG cannot determine a ``vqd`` value, then no request should go out
 | 
					 | 
				
			||||||
    to DDG:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        A request with a wrong ``vqd`` value leads to DDG temporarily putting
 | 
					 | 
				
			||||||
        SearXNG's IP on a block list.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        Requests from IPs in this block list run into timeouts.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Not sure, but it seems the block list is a sliding window: to get my IP rid
 | 
					 | 
				
			||||||
    from the bot list I had to cool down my IP for 1h (send no requests from
 | 
					 | 
				
			||||||
    that IP to DDG).
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    TL;DR; the ``vqd`` value is needed to pass DDG's bot protection and is used
 | 
					    TL;DR; the ``vqd`` value is needed to pass DDG's bot protection and is used
 | 
				
			||||||
    by all requests to DDG:
 | 
					    by all requests to DDG:
 | 
				
			||||||
@ -106,23 +96,46 @@ def get_vqd(data):
 | 
				
			|||||||
    - DuckDuckGo Videos: ``https://duckduckgo.com/v.js?q=...&vqd=...``
 | 
					    - DuckDuckGo Videos: ``https://duckduckgo.com/v.js?q=...&vqd=...``
 | 
				
			||||||
    - DuckDuckGo News: ``https://duckduckgo.com/news.js?q=...&vqd=...``
 | 
					    - DuckDuckGo News: ``https://duckduckgo.com/news.js?q=...&vqd=...``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    """
 | 
					    DDG's bot detection is sensitive to the ``vqd`` value.  For some search terms
 | 
				
			||||||
 | 
					    (such as extremely long search terms that are often sent by bots), no ``vqd``
 | 
				
			||||||
 | 
					    value can be determined.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    If SearXNG cannot determine a ``vqd`` value, then no request should go out
 | 
				
			||||||
 | 
					    to DDG.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    .. attention::
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					       A request with a wrong ``vqd`` value leads to DDG temporarily putting
 | 
				
			||||||
 | 
					       SearXNG's IP on a block list.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Requests from IPs in this block list run into timeouts.  Not sure, but it
 | 
				
			||||||
 | 
					    seems the block list is a sliding window: to get my IP off the bot list
 | 
				
			||||||
 | 
					    I had to cool down my IP for 1h (send no requests from that IP to DDG).
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    key = _cache_key(query, region)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    key = _cache_key(data)
 | 
					 | 
				
			||||||
    value = None
 | 
					 | 
				
			||||||
    c = redisdb.client()
 | 
					    c = redisdb.client()
 | 
				
			||||||
    if c:
 | 
					    if c:
 | 
				
			||||||
        value = c.get(key)
 | 
					        value = c.get(key)
 | 
				
			||||||
        if value or value == b'':
 | 
					        if value or value == b'':
 | 
				
			||||||
            value = value.decode('utf-8')
 | 
					            value = value.decode('utf-8')  # type: ignore
 | 
				
			||||||
            logger.debug("re-use CACHED vqd value: %s", value)
 | 
					            logger.debug("re-use CACHED vqd value: %s", value)
 | 
				
			||||||
            return value
 | 
					            return value
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    else:
 | 
					    for k, value in __CACHE:
 | 
				
			||||||
        for k, value in __CACHE:
 | 
					        if k == key:
 | 
				
			||||||
            if k == key:
 | 
					            logger.debug("MEM re-use CACHED vqd value: %s", value)
 | 
				
			||||||
                logger.debug("MEM re-use CACHED vqd value: %s", value)
 | 
					            return value
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if force_request:
 | 
				
			||||||
 | 
					        resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}')
 | 
				
			||||||
 | 
					        if resp.status_code == 200:  # type: ignore
 | 
				
			||||||
 | 
					            value = extr(resp.text, 'vqd="', '"')  # type: ignore
 | 
				
			||||||
 | 
					            if value:
 | 
				
			||||||
 | 
					                logger.debug("vqd value from DDG request: %s", value)
 | 
				
			||||||
 | 
					                cache_vqd(query, region, value)
 | 
				
			||||||
                return value
 | 
					                return value
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return None
 | 
					    return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -251,7 +264,7 @@ def request(query, params):
 | 
				
			|||||||
            for x in query.split()
 | 
					            for x in query.split()
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
 | 
					    eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
 | 
				
			||||||
    if eng_region == "wt-wt":
 | 
					    if eng_region == "wt-wt":
 | 
				
			||||||
        # https://html.duckduckgo.com/html sets an empty value for "all".
 | 
					        # https://html.duckduckgo.com/html sets an empty value for "all".
 | 
				
			||||||
        eng_region = ""
 | 
					        eng_region = ""
 | 
				
			||||||
@ -310,10 +323,7 @@ def request(query, params):
 | 
				
			|||||||
        params['data']['v'] = form_data.get('v', 'l')
 | 
					        params['data']['v'] = form_data.get('v', 'l')
 | 
				
			||||||
        params['headers']['Referer'] = url
 | 
					        params['headers']['Referer'] = url
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # from here on no more params['data'] should be set, since this dict is
 | 
					        vqd = get_vqd(query, eng_region, force_request=False)
 | 
				
			||||||
        # needed to get a vqd value from the cache ..
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        vqd = get_vqd(params['data'])
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Certain conditions must be met in order to call up one of the
 | 
					        # Certain conditions must be met in order to call up one of the
 | 
				
			||||||
        # following pages ...
 | 
					        # following pages ...
 | 
				
			||||||
@ -362,7 +372,7 @@ def response(resp):
 | 
				
			|||||||
        form = form[0]
 | 
					        form = form[0]
 | 
				
			||||||
        form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
 | 
					        form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        cache_vqd(resp.search_params["data"], form_vqd)
 | 
					        cache_vqd(resp.search_params['data']['q'], resp.search_params['data']['kl'], form_vqd)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # just select "web-result" and ignore results of class "result--ad result--ad--small"
 | 
					    # just select "web-result" and ignore results of class "result--ad result--ad--small"
 | 
				
			||||||
    for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):
 | 
					    for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):
 | 
				
			||||||
@ -379,7 +389,7 @@ def response(resp):
 | 
				
			|||||||
        results.append(item)
 | 
					        results.append(item)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    zero_click_info_xpath = '//div[@id="zero_click_abstract"]'
 | 
					    zero_click_info_xpath = '//div[@id="zero_click_abstract"]'
 | 
				
			||||||
    zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip()
 | 
					    zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip()  # type: ignore
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if zero_click and (
 | 
					    if zero_click and (
 | 
				
			||||||
        "Your IP address is" not in zero_click
 | 
					        "Your IP address is" not in zero_click
 | 
				
			||||||
@ -432,7 +442,7 @@ def fetch_traits(engine_traits: EngineTraits):
 | 
				
			|||||||
    if not resp.ok:  # type: ignore
 | 
					    if not resp.ok:  # type: ignore
 | 
				
			||||||
        print("ERROR: response from DuckDuckGo is not OK.")
 | 
					        print("ERROR: response from DuckDuckGo is not OK.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    js_code = extr(resp.text, 'regions:', ',snippetLengths')
 | 
					    js_code = extr(resp.text, 'regions:', ',snippetLengths')  # type: ignore
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    regions = json.loads(js_code)
 | 
					    regions = json.loads(js_code)
 | 
				
			||||||
    for eng_tag, name in regions.items():
 | 
					    for eng_tag, name in regions.items():
 | 
				
			||||||
@ -466,7 +476,7 @@ def fetch_traits(engine_traits: EngineTraits):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    engine_traits.custom['lang_region'] = {}
 | 
					    engine_traits.custom['lang_region'] = {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    js_code = extr(resp.text, 'languages:', ',regions')
 | 
					    js_code = extr(resp.text, 'languages:', ',regions')  # type: ignore
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    languages = js_variable_to_python(js_code)
 | 
					    languages = js_variable_to_python(js_code)
 | 
				
			||||||
    for eng_lang, name in languages.items():
 | 
					    for eng_lang, name in languages.items():
 | 
				
			||||||
 | 
				
			|||||||
@ -4,16 +4,15 @@ DuckDuckGo Extra (images, videos, news)
 | 
				
			|||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 | 
					~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from __future__ import annotations
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from datetime import datetime
 | 
					from datetime import datetime
 | 
				
			||||||
from typing import TYPE_CHECKING
 | 
					from typing import TYPE_CHECKING
 | 
				
			||||||
from urllib.parse import urlencode
 | 
					from urllib.parse import urlencode
 | 
				
			||||||
from searx.utils import get_embeded_stream_url
 | 
					from searx.utils import get_embeded_stream_url
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from searx.engines.duckduckgo import fetch_traits  # pylint: disable=unused-import
 | 
					from searx.engines.duckduckgo import fetch_traits  # pylint: disable=unused-import
 | 
				
			||||||
from searx.engines.duckduckgo import (
 | 
					from searx.engines.duckduckgo import get_ddg_lang, get_vqd
 | 
				
			||||||
    get_ddg_lang,
 | 
					 | 
				
			||||||
    get_vqd,
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
from searx.enginelib.traits import EngineTraits
 | 
					from searx.enginelib.traits import EngineTraits
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if TYPE_CHECKING:
 | 
					if TYPE_CHECKING:
 | 
				
			||||||
@ -48,15 +47,16 @@ search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'}
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def request(query, params):
 | 
					def request(query, params):
 | 
				
			||||||
 | 
					    eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # request needs a vqd argument
 | 
					    # request needs a vqd argument
 | 
				
			||||||
    vqd = get_vqd(query)
 | 
					    vqd = get_vqd(query, eng_region, force_request=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if not vqd:
 | 
					    if not vqd:
 | 
				
			||||||
        # some search terms do not have results and therefore no vqd value
 | 
					        # some search terms do not have results and therefore no vqd value
 | 
				
			||||||
        params['url'] = None
 | 
					        params['url'] = None
 | 
				
			||||||
        return params
 | 
					        return params
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
 | 
					 | 
				
			||||||
    eng_lang = get_ddg_lang(traits, params['searxng_locale'])
 | 
					    eng_lang = get_ddg_lang(traits, params['searxng_locale'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    args = {
 | 
					    args = {
 | 
				
			||||||
@ -86,6 +86,12 @@ def request(query, params):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}'
 | 
					    params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # sending these two headers prevents rate limiting for the query
 | 
				
			||||||
 | 
					    params['headers'] = {
 | 
				
			||||||
 | 
					        'Referer': 'https://duckduckgo.com/',
 | 
				
			||||||
 | 
					        'X-Requested-With': 'XMLHttpRequest',
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return params
 | 
					    return params
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user