Merge pull request #134 from searxng/google-default-language
[enh] google engine: supports "default language"
This commit is contained in:
		
						commit
						52db47a1c9
					
				| @ -133,56 +133,121 @@ suggestion_xpath = '//div[contains(@class, "card-section")]//a' | |||||||
| spelling_suggestion_xpath = '//div[@class="med"]/p/a' | spelling_suggestion_xpath = '//div[@class="med"]/p/a' | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def get_lang_info(params, lang_list, custom_aliases): | def get_lang_info(params, lang_list, custom_aliases, supported_any_language): | ||||||
|     ret_val = {} |     """Composing various language properties for the google engines. | ||||||
|  | 
 | ||||||
|  |     This function is called by the various google engines (google itself, | ||||||
|  |     google-images, -news, -scholar, -videos). | ||||||
|  | 
 | ||||||
|  |     :param dict params: request parameters of the engine | ||||||
|  | 
 | ||||||
|  |     :param list lang_list: list of supported languages of the engine | ||||||
|  |         :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>` | ||||||
|  | 
 | ||||||
|  |     :param dict custom_aliases: custom aliases for non-standard language codes | ||||||
|  |         (used when calling :py:func:`searx.utils.match_language`) | ||||||
|  | 
 | ||||||
|  |     :param bool supported_any_language: When a language is not specified, the | ||||||
|  |         language interpretation is left up to Google to decide how the search | ||||||
|  |         results should be delivered.  This argument is ``True`` for the google | ||||||
|  |         engine and ``False`` for the other engines (google-images, -news, | ||||||
|  |         -scholar, -videos). | ||||||
|  | 
 | ||||||
|  |     :rtype: dict | ||||||
|  |     :returns: | ||||||
|  |         Py-Dictionary with the key/value pairs: | ||||||
|  | 
 | ||||||
|  |         language: | ||||||
|  |             Return value from :py:func:`searx.utils.match_language` | ||||||
|  | 
 | ||||||
|  |         country: | ||||||
|  |             The country code (e.g. US, AT, CA, FR, DE ..) | ||||||
|  | 
 | ||||||
|  |         subdomain: | ||||||
|  |             Google subdomain :py:obj:`google_domains` that fits to the country | ||||||
|  |             code. | ||||||
|  | 
 | ||||||
|  |         params: | ||||||
|  |             Py-Dictionary with additional request arguments (can be passed to | ||||||
|  |             :py:func:`urllib.parse.urlencode`). | ||||||
|  | 
 | ||||||
|  |         headers: | ||||||
|  |             Py-Dictionary with additional HTTP headers (can be passed to | ||||||
|  |             request's headers) | ||||||
|  |     """ | ||||||
|  |     ret_val = { | ||||||
|  |         'language' : None, | ||||||
|  |         'country' : None, | ||||||
|  |         'subdomain' : None, | ||||||
|  |         'params' : {}, | ||||||
|  |         'headers' : {}, | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     # language ... | ||||||
| 
 | 
 | ||||||
|     _lang = params['language'] |     _lang = params['language'] | ||||||
|     if _lang.lower() == 'all': |     _any_language = _lang.lower() == 'all' | ||||||
|  |     if _any_language: | ||||||
|         _lang = 'en-US' |         _lang = 'en-US' | ||||||
| 
 |  | ||||||
|     language = match_language(_lang, lang_list, custom_aliases) |     language = match_language(_lang, lang_list, custom_aliases) | ||||||
|     ret_val['language'] = language |     ret_val['language'] = language | ||||||
| 
 | 
 | ||||||
|     # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...) |     # country ... | ||||||
|     _l = _lang.split('-') |  | ||||||
| 
 | 
 | ||||||
|     # the country code (US, AT, CA) |     _l = _lang.split('-') | ||||||
|     if len(_l) == 2: |     if len(_l) == 2: | ||||||
|         country = _l[1] |         country = _l[1] | ||||||
|     else: |     else: | ||||||
|         country = _l[0].upper() |         country = _l[0].upper() | ||||||
|         if country == 'EN': |         if country == 'EN': | ||||||
|             country = 'US' |             country = 'US' | ||||||
| 
 |  | ||||||
|     ret_val['country'] = country |     ret_val['country'] = country | ||||||
| 
 | 
 | ||||||
|     # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR) |     # subdomain ... | ||||||
|     lang_country = '%s-%s' % (language, country) |  | ||||||
| 
 | 
 | ||||||
|     # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5 |  | ||||||
|     ret_val['Accept-Language'] = ','.join([ |  | ||||||
|         lang_country, |  | ||||||
|         language + ';q=0.8,', |  | ||||||
|         'en;q=0.6', |  | ||||||
|         '*;q=0.5', |  | ||||||
|     ]) |  | ||||||
| 
 |  | ||||||
|     # subdomain |  | ||||||
|     ret_val['subdomain']  = 'www.' + google_domains.get(country.upper(), 'google.com') |     ret_val['subdomain']  = 'www.' + google_domains.get(country.upper(), 'google.com') | ||||||
| 
 | 
 | ||||||
|  |     # params & headers | ||||||
|  | 
 | ||||||
|  |     lang_country = '%s-%s' % (language, country)  # (en-US, en-EN, de-DE, de-AU, fr-FR ..) | ||||||
|  | 
 | ||||||
|     # hl parameter: |     # hl parameter: | ||||||
|     #   https://developers.google.com/custom-search/docs/xml_results#hlsp The |     #   https://developers.google.com/custom-search/docs/xml_results#hlsp The | ||||||
|     # Interface Language: |     # Interface Language: | ||||||
|     #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages |     #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages | ||||||
| 
 | 
 | ||||||
|     ret_val['hl'] = lang_list.get(lang_country, language) |     ret_val['params']['hl'] = lang_list.get(lang_country, language) | ||||||
| 
 | 
 | ||||||
|     # lr parameter: |     # lr parameter: | ||||||
|  |     #   The lr (language restrict) parameter restricts search results to | ||||||
|  |     #   documents written in a particular language. | ||||||
|     #   https://developers.google.com/custom-search/docs/xml_results#lrsp |     #   https://developers.google.com/custom-search/docs/xml_results#lrsp | ||||||
|     #   Language Collection Values: |     #   Language Collection Values: | ||||||
|     #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections |     #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections | ||||||
| 
 | 
 | ||||||
|     ret_val['lr'] = "lang_" + lang_list.get(lang_country, language) |     if _any_language and supported_any_language: | ||||||
|  | 
 | ||||||
|  |         # interpretation is left up to Google (based on whoogle) | ||||||
|  |         # | ||||||
|  |         # - add parameter ``source=lnt`` | ||||||
|  |         # - don't use parameter ``lr`` | ||||||
|  |         # - don't add an ``Accept-Language`` HTTP header. | ||||||
|  | 
 | ||||||
|  |         ret_val['params']['source'] = 'lnt' | ||||||
|  | 
 | ||||||
|  |     else: | ||||||
|  | 
 | ||||||
|  |         # restricts search results to documents written in a particular | ||||||
|  |         # language. | ||||||
|  |         ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language) | ||||||
|  | 
 | ||||||
|  |         # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5 | ||||||
|  |         ret_val['headers']['Accept-Language'] = ','.join([ | ||||||
|  |             lang_country, | ||||||
|  |             language + ';q=0.8,', | ||||||
|  |             'en;q=0.6', | ||||||
|  |             '*;q=0.5', | ||||||
|  |         ]) | ||||||
| 
 | 
 | ||||||
|     return ret_val |     return ret_val | ||||||
| 
 | 
 | ||||||
| @ -198,14 +263,13 @@ def request(query, params): | |||||||
| 
 | 
 | ||||||
|     lang_info = get_lang_info( |     lang_info = get_lang_info( | ||||||
|         # pylint: disable=undefined-variable |         # pylint: disable=undefined-variable | ||||||
|         params, supported_languages, language_aliases |         params, supported_languages, language_aliases, True | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium |     # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium | ||||||
|     query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ |     query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ | ||||||
|         'q': query, |         'q': query, | ||||||
|         'hl': lang_info['hl'], |         **lang_info['params'], | ||||||
|         'lr': lang_info['lr'], |  | ||||||
|         'ie': "utf8", |         'ie': "utf8", | ||||||
|         'oe': "utf8", |         'oe': "utf8", | ||||||
|         'start': offset, |         'start': offset, | ||||||
| @ -215,12 +279,9 @@ def request(query, params): | |||||||
|         query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) |         query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) | ||||||
|     if params['safesearch']: |     if params['safesearch']: | ||||||
|         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) |         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) | ||||||
| 
 |  | ||||||
|     logger.debug("query_url --> %s", query_url) |  | ||||||
|     params['url'] = query_url |     params['url'] = query_url | ||||||
| 
 | 
 | ||||||
|     logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language']) |     params['headers'].update(lang_info['headers']) | ||||||
|     params['headers']['Accept-Language'] = lang_info['Accept-Language'] |  | ||||||
|     params['headers']['Accept'] = ( |     params['headers']['Accept'] = ( | ||||||
|         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' |         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' | ||||||
|     ) |     ) | ||||||
|  | |||||||
| @ -101,14 +101,15 @@ def request(query, params): | |||||||
| 
 | 
 | ||||||
|     lang_info = get_lang_info( |     lang_info = get_lang_info( | ||||||
|         # pylint: disable=undefined-variable |         # pylint: disable=undefined-variable | ||||||
|         params, supported_languages, language_aliases |         params, supported_languages, language_aliases, False | ||||||
|     ) |     ) | ||||||
|  |     logger.debug( | ||||||
|  |         "HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language']) | ||||||
| 
 | 
 | ||||||
|     query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ |     query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ | ||||||
|         'q': query, |         'q': query, | ||||||
|         'tbm': "isch", |         'tbm': "isch", | ||||||
|         'hl': lang_info['hl'], |         **lang_info['params'], | ||||||
|         'lr': lang_info['lr'], |  | ||||||
|         'ie': "utf8", |         'ie': "utf8", | ||||||
|         'oe': "utf8", |         'oe': "utf8", | ||||||
|         'num': 30, |         'num': 30, | ||||||
| @ -118,12 +119,9 @@ def request(query, params): | |||||||
|         query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) |         query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) | ||||||
|     if params['safesearch']: |     if params['safesearch']: | ||||||
|         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) |         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) | ||||||
| 
 |  | ||||||
|     logger.debug("query_url --> %s", query_url) |  | ||||||
|     params['url'] = query_url |     params['url'] = query_url | ||||||
| 
 | 
 | ||||||
|     logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language']) |     params['headers'].update(lang_info['headers']) | ||||||
|     params['headers']['Accept-Language'] = lang_info['Accept-Language'] |  | ||||||
|     params['headers']['Accept'] = ( |     params['headers']['Accept'] = ( | ||||||
|         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' |         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' | ||||||
|     ) |     ) | ||||||
|  | |||||||
| @ -83,8 +83,10 @@ def request(query, params): | |||||||
| 
 | 
 | ||||||
|     lang_info = get_lang_info( |     lang_info = get_lang_info( | ||||||
|         # pylint: disable=undefined-variable |         # pylint: disable=undefined-variable | ||||||
|         params, supported_languages, language_aliases |         params, supported_languages, language_aliases, False | ||||||
|     ) |     ) | ||||||
|  |     logger.debug( | ||||||
|  |         "HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language']) | ||||||
| 
 | 
 | ||||||
|     # google news has only one domain |     # google news has only one domain | ||||||
|     lang_info['subdomain'] = 'news.google.com' |     lang_info['subdomain'] = 'news.google.com' | ||||||
| @ -102,18 +104,14 @@ def request(query, params): | |||||||
| 
 | 
 | ||||||
|     query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ |     query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ | ||||||
|         'q': query, |         'q': query, | ||||||
|         'hl': lang_info['hl'], |         **lang_info['params'], | ||||||
|         'lr': lang_info['lr'], |  | ||||||
|         'ie': "utf8", |         'ie': "utf8", | ||||||
|         'oe': "utf8", |         'oe': "utf8", | ||||||
|         'gl': lang_info['country'], |         'gl': lang_info['country'], | ||||||
|     }) + ('&ceid=%s' % ceid)  # ceid includes a ':' character which must not be urlencoded |     }) + ('&ceid=%s' % ceid)  # ceid includes a ':' character which must not be urlencoded | ||||||
| 
 |  | ||||||
|     logger.debug("query_url --> %s", query_url) |  | ||||||
|     params['url'] = query_url |     params['url'] = query_url | ||||||
| 
 | 
 | ||||||
|     logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language']) |     params['headers'].update(lang_info['headers']) | ||||||
|     params['headers']['Accept-Language'] = lang_info['Accept-Language'] |  | ||||||
|     params['headers']['Accept'] = ( |     params['headers']['Accept'] = ( | ||||||
|         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' |         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' | ||||||
|         ) |         ) | ||||||
|  | |||||||
| @ -77,31 +77,26 @@ def request(query, params): | |||||||
|     offset = (params['pageno'] - 1) * 10 |     offset = (params['pageno'] - 1) * 10 | ||||||
|     lang_info = get_lang_info( |     lang_info = get_lang_info( | ||||||
|         # pylint: disable=undefined-variable |         # pylint: disable=undefined-variable | ||||||
| 
 |         params, supported_languages, language_aliases, False | ||||||
| 
 |  | ||||||
|         # params, {}, language_aliases |  | ||||||
| 
 |  | ||||||
|         params, supported_languages, language_aliases |  | ||||||
|     ) |     ) | ||||||
|  |     logger.debug( | ||||||
|  |         "HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language']) | ||||||
|  | 
 | ||||||
|     # subdomain is: scholar.google.xy |     # subdomain is: scholar.google.xy | ||||||
|     lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.") |     lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.") | ||||||
| 
 | 
 | ||||||
|     query_url = 'https://'+ lang_info['subdomain'] + '/scholar' + "?" + urlencode({ |     query_url = 'https://'+ lang_info['subdomain'] + '/scholar' + "?" + urlencode({ | ||||||
|         'q':  query, |         'q':  query, | ||||||
|         'hl': lang_info['hl'], |         **lang_info['params'], | ||||||
|         'lr': lang_info['lr'], |  | ||||||
|         'ie': "utf8", |         'ie': "utf8", | ||||||
|         'oe':  "utf8", |         'oe':  "utf8", | ||||||
|         'start' : offset, |         'start' : offset, | ||||||
|     }) |     }) | ||||||
| 
 | 
 | ||||||
|     query_url += time_range_url(params) |     query_url += time_range_url(params) | ||||||
| 
 |  | ||||||
|     logger.debug("query_url --> %s", query_url) |  | ||||||
|     params['url'] = query_url |     params['url'] = query_url | ||||||
| 
 | 
 | ||||||
|     logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language']) |     params['headers'].update(lang_info['headers']) | ||||||
|     params['headers']['Accept-Language'] = lang_info['Accept-Language'] |  | ||||||
|     params['headers']['Accept'] = ( |     params['headers']['Accept'] = ( | ||||||
|         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' |         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' | ||||||
|     ) |     ) | ||||||
|  | |||||||
| @ -119,14 +119,15 @@ def request(query, params): | |||||||
| 
 | 
 | ||||||
|     lang_info = get_lang_info( |     lang_info = get_lang_info( | ||||||
|         # pylint: disable=undefined-variable |         # pylint: disable=undefined-variable | ||||||
|         params, supported_languages, language_aliases |         params, supported_languages, language_aliases, False | ||||||
|     ) |     ) | ||||||
|  |     logger.debug( | ||||||
|  |         "HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language']) | ||||||
| 
 | 
 | ||||||
|     query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ |     query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ | ||||||
|         'q':   query, |         'q':   query, | ||||||
|         'tbm': "vid", |         'tbm': "vid", | ||||||
|         'hl': lang_info['hl'], |         **lang_info['params'], | ||||||
|         'lr': lang_info['lr'], |  | ||||||
|         'ie': "utf8", |         'ie': "utf8", | ||||||
|         'oe': "utf8", |         'oe': "utf8", | ||||||
|     }) |     }) | ||||||
| @ -135,12 +136,9 @@ def request(query, params): | |||||||
|         query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) |         query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) | ||||||
|     if params['safesearch']: |     if params['safesearch']: | ||||||
|         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) |         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) | ||||||
| 
 |  | ||||||
|     logger.debug("query_url --> %s", query_url) |  | ||||||
|     params['url'] = query_url |     params['url'] = query_url | ||||||
| 
 | 
 | ||||||
|     logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language']) |     params['headers'].update(lang_info['headers']) | ||||||
|     params['headers']['Accept-Language'] = lang_info['Accept-Language'] |  | ||||||
|     params['headers']['Accept'] = ( |     params['headers']['Accept'] = ( | ||||||
|         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' |         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' | ||||||
|         ) |         ) | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user