| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | # SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							| 
									
										
										
										
											2021-04-26 20:18:20 +02:00
										 |  |  | # lint: pylint | 
					
						
							| 
									
										
										
										
											2021-06-21 18:15:40 +02:00
										 |  |  | """This is the implementation of the google WEB engine.  Some of these
 | 
					
						
							|  |  |  | implementations are shared by other engines: | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-21 18:15:40 +02:00
										 |  |  | - :ref:`google images engine` | 
					
						
							|  |  |  | - :ref:`google news engine` | 
					
						
							|  |  |  | - :ref:`google videos engine` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The google WEB engine itself has a special setup option: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | .. code:: yaml | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   - name: google | 
					
						
							|  |  |  |     ... | 
					
						
							| 
									
										
										
										
											2022-05-10 22:44:35 +02:00
										 |  |  |     use_mobile_ui: false | 
					
						
							| 
									
										
										
										
											2021-06-21 18:15:40 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-10 22:44:35 +02:00
										 |  |  | ``use_mobile_ui``: (default: ``false``) | 
					
						
							| 
									
										
										
										
											2021-06-21 18:15:40 +02:00
										 |  |  |   Enables the use of the *mobile endpoint* to bypass the google blocking (see | 
					
						
							|  |  |  |   :issue:`159`).  On the mobile UI of Google Search, the button :guilabel:`More | 
					
						
							|  |  |  |   results` is not affected by Google rate limiting and we can still do requests | 
					
						
							|  |  |  |   while actively blocked by the original Google search.  By activating | 
					
						
							|  |  |  |   ``use_mobile_ui`` this behavior is simulated by adding the parameter | 
					
						
							|  |  |  |   ``async=use_ac:true,_fmt:pc`` to the :py:func:`request`. | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-03-18 19:59:01 +01:00
										 |  |  | from urllib.parse import urlencode | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | from lxml import html | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  | from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex | 
					
						
							|  |  |  | from searx.exceptions import SearxEngineCaptchaException | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-13 11:31:25 +01:00
# about: static metadata describing this engine (website, Wikidata ID, link to
# the official API documentation, whether the official API / an API key is
# used, and the format of the results).
about = {
    "website": 'https://www.google.com',
    "wikidata_id": 'Q9366',
    "official_api_documentation": 'https://developers.google.com/custom-search/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-01 15:10:05 +02:00
# engine dependent config
categories = ['general', 'web']
# request() derives the ``start`` URL argument from ``params['pageno']``
paging = True
# request() adds a ``tbs=qdr:<x>`` argument for the keys of time_range_dict
time_range_support = True
# request() adds a ``safe=<level>`` argument taken from filter_mapping
safesearch = True
# when true, request() adds ``asearch=arc`` and ``async=use_ac:true,_fmt:pc``
# to the URL and sends ``Accept: */*`` (see the module docstring)
use_mobile_ui = False
supported_languages_url = 'https://www.google.com/preferences?#languages'
					
						
							| 
									
										
										
										
											2015-05-30 17:41:40 +02:00
										 |  |  | 
 | 
					
						
# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
#
# Maps an upper-case country code to the Google domain used for that country.
# get_lang_info() prefixes the domain with ``www.`` and falls back to
# ``google.com`` for country codes that are not listed here.
google_domains = {
    'BG': 'google.bg',  # Bulgaria
    'CZ': 'google.cz',  # Czech Republic
    'DE': 'google.de',  # Germany
    'DK': 'google.dk',  # Denmark
    'AT': 'google.at',  # Austria
    'CH': 'google.ch',  # Switzerland
    'GR': 'google.gr',  # Greece
    'AU': 'google.com.au',  # Australia
    'CA': 'google.ca',  # Canada
    'GB': 'google.co.uk',  # United Kingdom
    'ID': 'google.co.id',  # Indonesia
    'IE': 'google.ie',  # Ireland
    'IN': 'google.co.in',  # India
    'MY': 'google.com.my',  # Malaysia
    'NZ': 'google.co.nz',  # New Zealand
    'PH': 'google.com.ph',  # Philippines
    'SG': 'google.com.sg',  # Singapore
    'US': 'google.com',  # United States (google.us) redirects to .com
    'ZA': 'google.co.za',  # South Africa
    'AR': 'google.com.ar',  # Argentina
    'CL': 'google.cl',  # Chile
    'ES': 'google.es',  # Spain
    'MX': 'google.com.mx',  # Mexico
    'EE': 'google.ee',  # Estonia
    'FI': 'google.fi',  # Finland
    'BE': 'google.be',  # Belgium
    'FR': 'google.fr',  # France
    'IL': 'google.co.il',  # Israel
    'HR': 'google.hr',  # Croatia
    'HU': 'google.hu',  # Hungary
    'IT': 'google.it',  # Italy
    'JP': 'google.co.jp',  # Japan
    'KR': 'google.co.kr',  # South Korea
    'LT': 'google.lt',  # Lithuania
    'LV': 'google.lv',  # Latvia
    'NO': 'google.no',  # Norway
    'NL': 'google.nl',  # Netherlands
    'PL': 'google.pl',  # Poland
    'BR': 'google.com.br',  # Brazil
    'PT': 'google.pt',  # Portugal
    'RO': 'google.ro',  # Romania
    'RU': 'google.ru',  # Russia
    'SK': 'google.sk',  # Slovakia
    'SI': 'google.si',  # Slovenia
    'SE': 'google.se',  # Sweden
    'TH': 'google.co.th',  # Thailand
    'TR': 'google.com.tr',  # Turkey
    'UA': 'google.com.ua',  # Ukraine
    'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
    'HK': 'google.com.hk',  # Hong Kong
    'TW': 'google.com.tw',  # Taiwan
}
					
						
							| 
									
										
										
										
											2014-01-29 19:28:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
# Maps the time range names to the one-letter code that request() appends to
# the ``tbs=qdr:`` URL argument.
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

# Filter results. 0: None, 1: Moderate, 2: Strict
#
# The value is sent as the ``safe`` URL argument.  request() only adds the
# argument for a truthy safesearch level, so 'off' (level 0) is never sent.
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
					
						
							| 
									
										
										
										
											2015-06-05 11:23:24 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
# specific xpath variables
# ------------------------

# google results are grouped into <div class="jtfYYd ..." ../>
# NOTE(review): this expression matches only when the class attribute is
# exactly "jtfYYd", while the comment above suggests additional class names
# and the other selectors use contains() -- confirm this is intended.
results_xpath = '//div[@class="jtfYYd"]'

# google *sections* are no usual *results*, we ignore them
g_section_with_header = './g-section-with-header'

# the title is a h3 tag relative to the result group
title_xpath = './/h3[1]'

# in the result group there is <div class="yuRUbf" ../> its first child is a <a
# href=...>
href_xpath = './/div[@class="yuRUbf"]//a/@href'

# in the result group there is <div class="VwiC3b ..." ../> containing the *content*
content_xpath = './/div[contains(@class, "VwiC3b")]'

# Suggestions are links placed in a *card-section*, we extract only the text
# from the links not the links itself.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
					
						
							| 
									
										
										
										
											2020-07-08 00:46:03 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  | def get_lang_info(params, lang_list, custom_aliases, supported_any_language): | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  |     """Composing various language properties for the google engines.
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-21 18:15:40 +02:00
										 |  |  |     This function is called by the various google engines (:ref:`google web | 
					
						
							|  |  |  |     engine`, :ref:`google images engine`, :ref:`google news engine` and | 
					
						
							|  |  |  |     :ref:`google videos engine`). | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     :param dict param: request parameters of the engine | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     :param list lang_list: list of supported languages of the engine | 
					
						
							|  |  |  |         :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     :param dict lang_list: custom aliases for non standard language codes | 
					
						
							| 
									
										
										
										
											2021-06-21 18:15:40 +02:00
										 |  |  |         (used when calling :py:func:`searx.utils.match_language`) | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     :param bool supported_any_language: When a language is not specified, the | 
					
						
							|  |  |  |         language interpretation is left up to Google to decide how the search | 
					
						
							|  |  |  |         results should be delivered.  This argument is ``True`` for the google | 
					
						
							|  |  |  |         engine and ``False`` for the other engines (google-images, -news, | 
					
						
							|  |  |  |         -scholar, -videos). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     :rtype: dict | 
					
						
							|  |  |  |     :returns: | 
					
						
							|  |  |  |         Py-Dictionary with the key/value pairs: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         language: | 
					
						
							| 
									
										
										
										
											2021-06-21 18:15:40 +02:00
										 |  |  |             Return value from :py:func:`searx.utils.match_language` | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         country: | 
					
						
							|  |  |  |             The country code (e.g. US, AT, CA, FR, DE ..) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         subdomain: | 
					
						
							|  |  |  |             Google subdomain :py:obj:`google_domains` that fits to the country | 
					
						
							|  |  |  |             code. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         params: | 
					
						
							|  |  |  |             Py-Dictionary with additional request arguments (can be passed to | 
					
						
							|  |  |  |             :py:func:`urllib.parse.urlencode`). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         headers: | 
					
						
							|  |  |  |             Py-Dictionary with additional HTTP headers (can be passed to | 
					
						
							|  |  |  |             request's headers) | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     ret_val = { | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |         'language': None, | 
					
						
							|  |  |  |         'country': None, | 
					
						
							|  |  |  |         'subdomain': None, | 
					
						
							|  |  |  |         'params': {}, | 
					
						
							|  |  |  |         'headers': {}, | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # language ... | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     _lang = params['language'] | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  |     _any_language = _lang.lower() == 'all' | 
					
						
							|  |  |  |     if _any_language: | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |         _lang = 'en-US' | 
					
						
							|  |  |  |     language = match_language(_lang, lang_list, custom_aliases) | 
					
						
							|  |  |  |     ret_val['language'] = language | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  |     # country ... | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  |     _l = _lang.split('-') | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |     if len(_l) == 2: | 
					
						
							|  |  |  |         country = _l[1] | 
					
						
							| 
									
										
										
										
											2018-03-01 05:30:48 +01:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |         country = _l[0].upper() | 
					
						
							|  |  |  |         if country == 'EN': | 
					
						
							|  |  |  |             country = 'US' | 
					
						
							|  |  |  |     ret_val['country'] = country | 
					
						
							| 
									
										
										
										
											2018-03-01 05:30:48 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  |     # subdomain ... | 
					
						
							| 
									
										
										
										
											2015-05-30 17:41:40 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |     ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com') | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  |     # params & headers | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     lang_country = '%s-%s' % (language, country)  # (en-US, en-EN, de-DE, de-AU, fr-FR ..) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # hl parameter: | 
					
						
							|  |  |  |     #   https://developers.google.com/custom-search/docs/xml_results#hlsp The | 
					
						
							|  |  |  |     # Interface Language: | 
					
						
							|  |  |  |     #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ret_val['params']['hl'] = lang_list.get(lang_country, language) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # lr parameter: | 
					
						
							|  |  |  |     #   The lr (language restrict) parameter restricts search results to | 
					
						
							|  |  |  |     #   documents written in a particular language. | 
					
						
							|  |  |  |     #   https://developers.google.com/custom-search/docs/xml_results#lrsp | 
					
						
							|  |  |  |     #   Language Collection Values: | 
					
						
							|  |  |  |     #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if _any_language and supported_any_language: | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # interpretation is left up to Google (based on whoogle) | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # - add parameter ``source=lnt`` | 
					
						
							|  |  |  |         # - don't use parameter ``lr`` | 
					
						
							|  |  |  |         # - don't add a ``Accept-Language`` HTTP header. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  |         ret_val['params']['source'] = 'lnt' | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # restricts search results to documents written in a particular | 
					
						
							|  |  |  |         # language. | 
					
						
							|  |  |  |         ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  |         # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5 | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |         ret_val['headers']['Accept-Language'] = ','.join( | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 lang_country, | 
					
						
							|  |  |  |                 language + ';q=0.8,', | 
					
						
							|  |  |  |                 'en;q=0.6', | 
					
						
							|  |  |  |                 '*;q=0.5', | 
					
						
							|  |  |  |             ] | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |     return ret_val | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-22 18:49:45 +01:00
										 |  |  | def detect_google_sorry(resp): | 
					
						
							| 
									
										
										
										
											2021-03-18 19:59:01 +01:00
										 |  |  |     if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'): | 
					
						
							| 
									
										
										
										
											2021-01-22 18:49:45 +01:00
										 |  |  |         raise SearxEngineCaptchaException() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-08 00:46:03 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | def request(query, params): | 
					
						
							|  |  |  |     """Google search request""" | 
					
						
							| 
									
										
										
										
											2015-05-30 17:41:40 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     offset = (params['pageno'] - 1) * 10 | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |     lang_info = get_lang_info(params, supported_languages, language_aliases, True) | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-21 12:18:28 +02:00
										 |  |  |     additional_parameters = {} | 
					
						
							|  |  |  |     if use_mobile_ui: | 
					
						
							|  |  |  |         additional_parameters = { | 
					
						
							| 
									
										
										
										
											2021-07-15 15:00:32 +02:00
										 |  |  |             'asearch': 'arc', | 
					
						
							| 
									
										
										
										
											2021-06-21 16:09:16 +02:00
										 |  |  |             'async': 'use_ac:true,_fmt:pc', | 
					
						
							| 
									
										
										
										
											2021-06-21 12:18:28 +02:00
										 |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |     # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |     query_url = ( | 
					
						
							|  |  |  |         'https://' | 
					
						
							|  |  |  |         + lang_info['subdomain'] | 
					
						
							|  |  |  |         + '/search' | 
					
						
							|  |  |  |         + "?" | 
					
						
							|  |  |  |         + urlencode( | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 'q': query, | 
					
						
							|  |  |  |                 **lang_info['params'], | 
					
						
							|  |  |  |                 'ie': "utf8", | 
					
						
							|  |  |  |                 'oe': "utf8", | 
					
						
							|  |  |  |                 'start': offset, | 
					
						
							|  |  |  |                 'filter': '0', | 
					
						
							|  |  |  |                 **additional_parameters, | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2014-09-01 15:10:05 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     if params['time_range'] in time_range_dict: | 
					
						
							|  |  |  |         query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) | 
					
						
							|  |  |  |     if params['safesearch']: | 
					
						
							|  |  |  |         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |     params['url'] = query_url | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  |     params['headers'].update(lang_info['headers']) | 
					
						
							| 
									
										
										
										
											2021-06-21 12:18:28 +02:00
										 |  |  |     if use_mobile_ui: | 
					
						
							|  |  |  |         params['headers']['Accept'] = '*/*' | 
					
						
							|  |  |  |     else: | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |         params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' | 
					
						
							| 
									
										
										
										
											2014-01-29 19:28:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     return params | 
					
						
							| 
									
										
										
										
											2014-01-29 19:28:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-08 00:46:03 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-29 19:28:38 +01:00
										 |  |  | def response(resp): | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     """Get response from google's search request""" | 
					
						
							| 
									
										
										
										
											2014-09-01 15:10:05 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-22 18:49:45 +01:00
										 |  |  |     detect_google_sorry(resp) | 
					
						
							| 
									
										
										
										
											2017-12-05 20:38:34 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-22 18:49:45 +01:00
										 |  |  |     results = [] | 
					
						
							| 
									
										
										
										
											2015-05-30 17:41:40 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # convert the text to dom | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
										 |  |  |     dom = html.fromstring(resp.text) | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     # results --> answer | 
					
						
							| 
									
										
										
										
											2021-06-21 16:46:08 +02:00
										 |  |  |     answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') | 
					
						
							|  |  |  |     if answer_list: | 
					
						
							|  |  |  |         answer_list = [_.xpath("normalize-space()") for _ in answer_list] | 
					
						
							|  |  |  |         results.append({'answer': ' '.join(answer_list)}) | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2021-04-11 22:12:53 +02:00
										 |  |  |         logger.debug("did not find 'answer'") | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |         # results --> number_of_results | 
					
						
							| 
									
										
										
										
											2021-06-21 12:18:28 +02:00
										 |  |  |         if not use_mobile_ui: | 
					
						
							|  |  |  |             try: | 
					
						
							|  |  |  |                 _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0) | 
					
						
							|  |  |  |                 _digit = ''.join([n for n in _txt if n.isdigit()]) | 
					
						
							|  |  |  |                 number_of_results = int(_digit) | 
					
						
							|  |  |  |                 results.append({'number_of_results': number_of_results}) | 
					
						
							|  |  |  |             except Exception as e:  # pylint: disable=broad-except | 
					
						
							|  |  |  |                 logger.debug("did not 'number_of_results'") | 
					
						
							|  |  |  |                 logger.error(e, exc_info=True) | 
					
						
							| 
									
										
										
										
											2017-01-05 17:20:12 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-01 15:10:05 +02:00
										 |  |  |     # parse results | 
					
						
							| 
									
										
										
										
											2022-01-18 11:05:45 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-09 14:34:14 +01:00
										 |  |  |     for result in eval_xpath_list(dom, results_xpath): | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # google *sections* | 
					
						
							|  |  |  |         if extract_text(eval_xpath(result, g_section_with_header)): | 
					
						
							|  |  |  |             logger.debug("ingoring <g-section-with-header>") | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
										 |  |  |         try: | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |             title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) | 
					
						
							|  |  |  |             if title_tag is None: | 
					
						
							| 
									
										
										
										
											2020-10-01 09:44:29 +02:00
										 |  |  |                 # this not one of the common google results *section* | 
					
						
							| 
									
										
										
										
											2022-01-18 13:23:35 +01:00
										 |  |  |                 logger.debug('ingoring item from the result_xpath list: missing title') | 
					
						
							| 
									
										
										
										
											2020-10-01 09:44:29 +02:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |             title = extract_text(title_tag) | 
					
						
							| 
									
										
										
										
											2021-01-26 12:51:54 +01:00
										 |  |  |             url = eval_xpath_getindex(result, href_xpath, 0, None) | 
					
						
							|  |  |  |             if url is None: | 
					
						
							|  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |             content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True) | 
					
						
							| 
									
										
										
										
											2022-01-18 13:23:35 +01:00
										 |  |  |             if content is None: | 
					
						
							|  |  |  |                 logger.debug('ingoring item from the result_xpath list: missing content of title "%s"', title) | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             logger.debug('add link to results: %s', title) | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |             results.append({'url': url, 'title': title, 'content': content}) | 
					
						
							| 
									
										
										
										
											2022-01-18 13:23:35 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |         except Exception as e:  # pylint: disable=broad-except | 
					
						
							|  |  |  |             logger.error(e, exc_info=True) | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
										 |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # parse suggestion | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |     for suggestion in eval_xpath_list(dom, suggestion_xpath): | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
										 |  |  |         # append suggestion | 
					
						
							| 
									
										
										
										
											2016-12-09 11:44:24 +01:00
										 |  |  |         results.append({'suggestion': extract_text(suggestion)}) | 
					
						
							| 
									
										
										
										
											2014-09-01 15:10:05 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # return results | 
					
						
							| 
									
										
										
										
											2014-01-29 19:28:38 +01:00
										 |  |  |     return results | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-08 00:46:03 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-11-06 03:51:38 +01:00
										 |  |  | # get supported languages from their site | 
					
						
							| 
									
										
										
										
											2016-12-15 07:34:43 +01:00
										 |  |  | def _fetch_supported_languages(resp): | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     ret_val = {} | 
					
						
							| 
									
										
										
										
											2016-12-15 07:34:43 +01:00
										 |  |  |     dom = html.fromstring(resp.text) | 
					
						
							| 
									
										
										
										
											2016-11-06 03:51:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |     radio_buttons = eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]') | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     for x in radio_buttons: | 
					
						
							|  |  |  |         name = x.get("data-name") | 
					
						
							| 
									
										
										
										
											2020-09-21 08:01:06 +02:00
										 |  |  |         code = x.get("value").split('_')[-1] | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |         ret_val[code] = {"name": name} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return ret_val |