| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | # SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							| 
									
										
										
										
											2021-04-26 20:18:20 +02:00
										 |  |  | # lint: pylint | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | """Google (Web)
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  | For detailed description of the *REST-full* API see: `Query Parameter | 
					
						
							|  |  |  | Definitions`_. | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  | .. _Query Parameter Definitions: | 
					
						
							|  |  |  |    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # pylint: disable=invalid-name, missing-function-docstring | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-03-18 19:59:01 +01:00
										 |  |  | from urllib.parse import urlencode | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | from lxml import html | 
					
						
							| 
									
										
										
										
											2016-11-30 18:43:03 +01:00
										 |  |  | from searx import logger | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  | from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex | 
					
						
							|  |  |  | from searx.exceptions import SearxEngineCaptchaException | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-10-29 12:47:12 +01:00
# engine specific child of the searx root logger
logger = logger.getChild('google engine')

# about (engine metadata, e.g. shown in searx's engine overview)
about = {
    "website": 'https://www.google.com',
    "wikidata_id": 'Q9366',
    "official_api_documentation": 'https://developers.google.com/custom-search/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-01 15:10:05 +02:00
# engine dependent config
categories = ['general']
paging = True               # request() uses params['pageno'] to compute the start offset
time_range_support = True   # day/week/month/year, see time_range_dict
safesearch = True           # safesearch levels 0..2, see filter_mapping
supported_languages_url = 'https://www.google.com/preferences?#languages'
					
						
							| 
									
										
										
										
											2015-05-30 17:41:40 +02:00
										 |  |  | 
 | 
					
						
# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
# Maps an upper-case country code to the google domain queried for that
# country (used by get_lang_info to build the request's subdomain).
google_domains = {
    'BG': 'google.bg',      # Bulgaria
    'CZ': 'google.cz',      # Czech Republic
    'DE': 'google.de',      # Germany
    'DK': 'google.dk',      # Denmark
    'AT': 'google.at',      # Austria
    'CH': 'google.ch',      # Switzerland
    'GR': 'google.gr',      # Greece
    'AU': 'google.com.au',  # Australia
    'CA': 'google.ca',      # Canada
    'GB': 'google.co.uk',   # United Kingdom
    'ID': 'google.co.id',   # Indonesia
    'IE': 'google.ie',      # Ireland
    'IN': 'google.co.in',   # India
    'MY': 'google.com.my',  # Malaysia
    'NZ': 'google.co.nz',   # New Zealand
    'PH': 'google.com.ph',  # Philippines
    'SG': 'google.com.sg',  # Singapore
    'US': 'google.com',     # United States (google.us) redirects to .com
    'ZA': 'google.co.za',   # South Africa
    'AR': 'google.com.ar',  # Argentina
    'CL': 'google.cl',      # Chile
    'ES': 'google.es',      # Spain
    'MX': 'google.com.mx',  # Mexico
    'EE': 'google.ee',      # Estonia
    'FI': 'google.fi',      # Finland
    'BE': 'google.be',      # Belgium
    'FR': 'google.fr',      # France
    'IL': 'google.co.il',   # Israel
    'HR': 'google.hr',      # Croatia
    'HU': 'google.hu',      # Hungary
    'IT': 'google.it',      # Italy
    'JP': 'google.co.jp',   # Japan
    'KR': 'google.co.kr',   # South Korea
    'LT': 'google.lt',      # Lithuania
    'LV': 'google.lv',      # Latvia
    'NO': 'google.no',      # Norway
    'NL': 'google.nl',      # Netherlands
    'PL': 'google.pl',      # Poland
    'BR': 'google.com.br',  # Brazil
    'PT': 'google.pt',      # Portugal
    'RO': 'google.ro',      # Romania
    'RU': 'google.ru',      # Russia
    'SK': 'google.sk',      # Slovakia
    'SI': 'google.si',      # Slovenia
    'SE': 'google.se',      # Sweden
    'TH': 'google.co.th',   # Thailand
    'TR': 'google.com.tr',  # Turkey
    'UA': 'google.com.ua',  # Ukraine
    'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
    'HK': 'google.com.hk',  # Hong Kong
    'TW': 'google.com.tw'   # Taiwan
}
					
						
							| 
									
										
										
										
											2014-01-29 19:28:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
# Maps searx's time range names to google's ``tbs=qdr:`` values (see request()).
time_range_dict = {
    'day': 'd',
    'week': 'w',
    'month': 'm',
    'year': 'y'
}

# Filter results. 0: None, 1: Moderate, 2: Strict
# Maps searx's safesearch level to google's ``safe=`` parameter (see request()).
filter_mapping = {
    0: 'off',
    1: 'medium',
    2: 'high'
}
					
						
							| 
									
										
										
										
											2015-06-05 11:23:24 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
# specific xpath variables
# ------------------------

# google results are grouped into <div class="g" ../>
results_xpath = '//div[@class="g"]'

# google *sections* are no usual *results*, we ignore them
g_section_with_header = './g-section-with-header'

# the title is a h3 tag relative to the result group
title_xpath = './/h3[1]'

# in the result group there is <div class="yuRUbf" ../> its first child is a <a
# href=...>
href_xpath = './/div[@class="yuRUbf"]//a/@href'

# in the result group there is <div class="IsZvec" ../> containing the *content*
content_xpath = './/div[@class="IsZvec"]'

# Suggestions are links placed in a *card-section*, we extract only the text
# from the links, not the links themselves.
suggestion_xpath = '//div[contains(@class, "card-section")]//a'

# Since google does *auto-correction* on the first query these are not really
# *spelling suggestions*, we use them anyway.
spelling_suggestion_xpath = '//div[@class="med"]/p/a'
					
						
							| 
									
										
										
										
											2014-12-05 20:03:16 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-08 00:46:03 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  | def get_lang_info(params, lang_list, custom_aliases, supported_any_language): | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  |     """Composing various language properties for the google engines.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     This function is called by the various google engines (google itself, | 
					
						
							|  |  |  |     google-images, -news, -scholar, -videos). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     :param dict param: request parameters of the engine | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     :param list lang_list: list of supported languages of the engine | 
					
						
							|  |  |  |         :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     :param dict lang_list: custom aliases for non standard language codes | 
					
						
							|  |  |  |         (used when calling :py:func:`searx.utils.match_language) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     :param bool supported_any_language: When a language is not specified, the | 
					
						
							|  |  |  |         language interpretation is left up to Google to decide how the search | 
					
						
							|  |  |  |         results should be delivered.  This argument is ``True`` for the google | 
					
						
							|  |  |  |         engine and ``False`` for the other engines (google-images, -news, | 
					
						
							|  |  |  |         -scholar, -videos). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     :rtype: dict | 
					
						
							|  |  |  |     :returns: | 
					
						
							|  |  |  |         Py-Dictionary with the key/value pairs: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         language: | 
					
						
							|  |  |  |             Return value from :py:func:`searx.utils.match_language | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         country: | 
					
						
							|  |  |  |             The country code (e.g. US, AT, CA, FR, DE ..) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         subdomain: | 
					
						
							|  |  |  |             Google subdomain :py:obj:`google_domains` that fits to the country | 
					
						
							|  |  |  |             code. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         params: | 
					
						
							|  |  |  |             Py-Dictionary with additional request arguments (can be passed to | 
					
						
							|  |  |  |             :py:func:`urllib.parse.urlencode`). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         headers: | 
					
						
							|  |  |  |             Py-Dictionary with additional HTTP headers (can be passed to | 
					
						
							|  |  |  |             request's headers) | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     ret_val = { | 
					
						
							|  |  |  |         'language' : None, | 
					
						
							|  |  |  |         'country' : None, | 
					
						
							|  |  |  |         'subdomain' : None, | 
					
						
							|  |  |  |         'params' : {}, | 
					
						
							|  |  |  |         'headers' : {}, | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # language ... | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     _lang = params['language'] | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  |     _any_language = _lang.lower() == 'all' | 
					
						
							|  |  |  |     if _any_language: | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |         _lang = 'en-US' | 
					
						
							|  |  |  |     language = match_language(_lang, lang_list, custom_aliases) | 
					
						
							|  |  |  |     ret_val['language'] = language | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  |     # country ... | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  |     _l = _lang.split('-') | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |     if len(_l) == 2: | 
					
						
							|  |  |  |         country = _l[1] | 
					
						
							| 
									
										
										
										
											2018-03-01 05:30:48 +01:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |         country = _l[0].upper() | 
					
						
							|  |  |  |         if country == 'EN': | 
					
						
							|  |  |  |             country = 'US' | 
					
						
							|  |  |  |     ret_val['country'] = country | 
					
						
							| 
									
										
										
										
											2018-03-01 05:30:48 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  |     # subdomain ... | 
					
						
							| 
									
										
										
										
											2015-05-30 17:41:40 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |     ret_val['subdomain']  = 'www.' + google_domains.get(country.upper(), 'google.com') | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  |     # params & headers | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     lang_country = '%s-%s' % (language, country)  # (en-US, en-EN, de-DE, de-AU, fr-FR ..) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # hl parameter: | 
					
						
							|  |  |  |     #   https://developers.google.com/custom-search/docs/xml_results#hlsp The | 
					
						
							|  |  |  |     # Interface Language: | 
					
						
							|  |  |  |     #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ret_val['params']['hl'] = lang_list.get(lang_country, language) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # lr parameter: | 
					
						
							|  |  |  |     #   The lr (language restrict) parameter restricts search results to | 
					
						
							|  |  |  |     #   documents written in a particular language. | 
					
						
							|  |  |  |     #   https://developers.google.com/custom-search/docs/xml_results#lrsp | 
					
						
							|  |  |  |     #   Language Collection Values: | 
					
						
							|  |  |  |     #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if _any_language and supported_any_language: | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # interpretation is left up to Google (based on whoogle) | 
					
						
							|  |  |  |         # | 
					
						
							|  |  |  |         # - add parameter ``source=lnt`` | 
					
						
							|  |  |  |         # - don't use parameter ``lr`` | 
					
						
							|  |  |  |         # - don't add a ``Accept-Language`` HTTP header. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  |         ret_val['params']['source'] = 'lnt' | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2021-06-11 16:06:36 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # restricts search results to documents written in a particular | 
					
						
							|  |  |  |         # language. | 
					
						
							|  |  |  |         ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  |         # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5 | 
					
						
							|  |  |  |         ret_val['headers']['Accept-Language'] = ','.join([ | 
					
						
							|  |  |  |             lang_country, | 
					
						
							|  |  |  |             language + ';q=0.8,', | 
					
						
							|  |  |  |             'en;q=0.6', | 
					
						
							|  |  |  |             '*;q=0.5', | 
					
						
							|  |  |  |         ]) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |     return ret_val | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-22 18:49:45 +01:00
										 |  |  | def detect_google_sorry(resp): | 
					
						
							| 
									
										
										
										
											2021-03-18 19:59:01 +01:00
										 |  |  |     if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'): | 
					
						
							| 
									
										
										
										
											2021-01-22 18:49:45 +01:00
										 |  |  |         raise SearxEngineCaptchaException() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-08 00:46:03 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | def request(query, params): | 
					
						
							|  |  |  |     """Google search request""" | 
					
						
							| 
									
										
										
										
											2015-05-30 17:41:40 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     offset = (params['pageno'] - 1) * 10 | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     lang_info = get_lang_info( | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |         # pylint: disable=undefined-variable | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  |         params, supported_languages, language_aliases, True | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |     # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium | 
					
						
							|  |  |  |     query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |         'q': query, | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  |         **lang_info['params'], | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |         'ie': "utf8", | 
					
						
							|  |  |  |         'oe': "utf8", | 
					
						
							|  |  |  |         'start': offset, | 
					
						
							|  |  |  |     }) | 
					
						
							| 
									
										
										
										
											2014-09-01 15:10:05 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     if params['time_range'] in time_range_dict: | 
					
						
							|  |  |  |         query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) | 
					
						
							|  |  |  |     if params['safesearch']: | 
					
						
							|  |  |  |         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) | 
					
						
							| 
									
										
										
										
											2021-01-26 11:49:27 +01:00
										 |  |  |     params['url'] = query_url | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-06 08:18:07 +02:00
										 |  |  |     params['headers'].update(lang_info['headers']) | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     params['headers']['Accept'] = ( | 
					
						
							|  |  |  |         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' | 
					
						
							| 
									
										
										
										
											2020-07-08 00:46:03 +02:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2014-01-29 19:28:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     return params | 
					
						
							| 
									
										
										
										
											2014-01-29 19:28:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-08 00:46:03 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-29 19:28:38 +01:00
										 |  |  | def response(resp): | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     """Get response from google's search request""" | 
					
						
							| 
									
										
										
										
											2014-09-01 15:10:05 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-22 18:49:45 +01:00
										 |  |  |     detect_google_sorry(resp) | 
					
						
							| 
									
										
										
										
											2017-12-05 20:38:34 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-22 18:49:45 +01:00
										 |  |  |     results = [] | 
					
						
							| 
									
										
										
										
											2015-05-30 17:41:40 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # convert the text to dom | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
										 |  |  |     dom = html.fromstring(resp.text) | 
					
						
							| 
									
										
										
										
											2014-01-29 19:28:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     # results --> answer | 
					
						
							|  |  |  |     answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()') | 
					
						
							|  |  |  |     if answer: | 
					
						
							|  |  |  |         results.append({'answer': ' '.join(answer)}) | 
					
						
							|  |  |  |     else: | 
					
						
							| 
									
										
										
										
											2021-04-11 22:12:53 +02:00
										 |  |  |         logger.debug("did not find 'answer'") | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # results --> number_of_results | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |         try: | 
					
						
							|  |  |  |             _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0) | 
					
						
							|  |  |  |             _digit = ''.join([n for n in _txt if n.isdigit()]) | 
					
						
							|  |  |  |             number_of_results = int(_digit) | 
					
						
							|  |  |  |             results.append({'number_of_results': number_of_results}) | 
					
						
							|  |  |  |         except Exception as e:  # pylint: disable=broad-except | 
					
						
							|  |  |  |             logger.debug("did not 'number_of_results'") | 
					
						
							|  |  |  |             logger.error(e, exc_info=True) | 
					
						
							| 
									
										
										
										
											2017-01-05 17:20:12 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-01 15:10:05 +02:00
										 |  |  |     # parse results | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |     for result in eval_xpath_list(dom, results_xpath): | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # google *sections* | 
					
						
							|  |  |  |         if extract_text(eval_xpath(result, g_section_with_header)): | 
					
						
							|  |  |  |             logger.debug("ingoring <g-section-with-header>") | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
										 |  |  |         try: | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |             title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) | 
					
						
							|  |  |  |             if title_tag is None: | 
					
						
							| 
									
										
										
										
											2020-10-01 09:44:29 +02:00
										 |  |  |                 # this not one of the common google results *section* | 
					
						
							|  |  |  |                 logger.debug('ingoring <div class="g" ../> section: missing title') | 
					
						
							|  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |             title = extract_text(title_tag) | 
					
						
							| 
									
										
										
										
											2021-01-26 12:51:54 +01:00
										 |  |  |             url = eval_xpath_getindex(result, href_xpath, 0, None) | 
					
						
							|  |  |  |             if url is None: | 
					
						
							|  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |             content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True) | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |             results.append({ | 
					
						
							| 
									
										
										
										
											2020-07-08 00:46:03 +02:00
										 |  |  |                 'url': url, | 
					
						
							|  |  |  |                 'title': title, | 
					
						
							|  |  |  |                 'content': content | 
					
						
							|  |  |  |             }) | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |         except Exception as e:  # pylint: disable=broad-except | 
					
						
							|  |  |  |             logger.error(e, exc_info=True) | 
					
						
							| 
									
										
										
										
											2020-07-08 00:46:03 +02:00
										 |  |  |             # from lxml import etree | 
					
						
							|  |  |  |             # logger.debug(etree.tostring(result, pretty_print=True)) | 
					
						
							|  |  |  |             # import pdb | 
					
						
							|  |  |  |             # pdb.set_trace() | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
										 |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # parse suggestion | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |     for suggestion in eval_xpath_list(dom, suggestion_xpath): | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
										 |  |  |         # append suggestion | 
					
						
							| 
									
										
										
										
											2016-12-09 11:44:24 +01:00
										 |  |  |         results.append({'suggestion': extract_text(suggestion)}) | 
					
						
							| 
									
										
										
										
											2014-09-01 15:10:05 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |     for correction in eval_xpath_list(dom, spelling_suggestion_xpath): | 
					
						
							| 
									
										
										
										
											2017-01-14 09:40:37 +01:00
										 |  |  |         results.append({'correction': extract_text(correction)}) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-01 15:10:05 +02:00
										 |  |  |     # return results | 
					
						
							| 
									
										
										
										
											2014-01-29 19:28:38 +01:00
										 |  |  |     return results | 
					
						
							| 
									
										
										
										
											2014-09-14 14:40:55 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-08 00:46:03 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-11-06 03:51:38 +01:00
										 |  |  | # get supported languages from their site | 
					
						
							| 
									
										
										
										
											2016-12-15 07:34:43 +01:00
										 |  |  | def _fetch_supported_languages(resp): | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |     ret_val = {} | 
					
						
							| 
									
										
										
										
											2016-12-15 07:34:43 +01:00
										 |  |  |     dom = html.fromstring(resp.text) | 
					
						
							| 
									
										
										
										
											2016-11-06 03:51:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-26 17:22:54 +01:00
										 |  |  |     radio_buttons = eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]') | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     for x in radio_buttons: | 
					
						
							|  |  |  |         name = x.get("data-name") | 
					
						
							| 
									
										
										
										
											2020-09-21 08:01:06 +02:00
										 |  |  |         code = x.get("value").split('_')[-1] | 
					
						
							| 
									
										
										
										
											2020-07-07 21:50:59 +02:00
										 |  |  |         ret_val[code] = {"name": name} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return ret_val |