| 
									
										
										
										
											2021-02-25 17:42:52 +01:00
										 |  |  | #!/usr/bin/env python | 
					
						
							| 
									
										
										
										
											2022-01-03 12:58:48 +01:00
										 |  |  | # lint: pylint | 
					
						
							| 
									
										
										
										
											2021-10-03 15:12:09 +02:00
										 |  |  | # SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							| 
									
										
										
										
											2022-09-29 20:54:46 +02:00
"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/sxng_locales.py`

:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
  Persistence of engines traits, fetched from the engines.

:origin:`searx/sxng_locales.py`
  Is generated from the language & region tags that are supported by a minimum
  number of engines.
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The script :origin:`searxng_extra/update/update_engine_traits.py` is called in | 
					
						
							|  |  |  | the :origin:`CI Update data ... <.github/workflows/data-update.yml>` | 
					
						
							| 
									
										
										
										
											2022-01-03 12:40:06 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | """
 | 
					
						
							| 
									
										
										
										
											2016-11-06 03:51:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:58:48 +01:00
										 |  |  | # pylint: disable=invalid-name | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
										 |  |  | from unicodedata import lookup | 
					
						
							| 
									
										
										
										
											2021-01-24 14:25:27 +01:00
										 |  |  | from pathlib import Path | 
					
						
							| 
									
										
										
										
											2020-09-14 09:07:45 +02:00
										 |  |  | from pprint import pformat | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  | import babel | 
					
						
							| 
									
										
										
										
											2018-02-14 23:17:46 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-24 14:25:27 +01:00
										 |  |  | from searx import settings, searx_dir | 
					
						
							| 
									
										
										
										
											2022-09-29 20:54:46 +02:00
										 |  |  | from searx import network | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  | from searx.engines import load_engines | 
					
						
							| 
									
										
										
										
											2022-09-29 20:54:46 +02:00
										 |  |  | from searx.enginelib.traits import EngineTraitsMap | 
					
						
							| 
									
										
										
										
											2016-11-06 03:51:38 +01:00
										 |  |  | 
 | 
					
						
# Output files.

# Path of the generated Python module; it is (re)written by
# write_languages_file().
languages_file = Path(searx_dir) / 'sxng_locales.py'

# First part of the generated module: coding line, module docstring and the
# opening of the ``sxng_locales`` tuple.  The backslash right after the opening
# quotes keeps the literal from starting with an empty line.
languages_file_header = """\
# -*- coding: utf-8 -*-
'''List of SearXNG's locale codes.

This file is generated automatically by::

   ./manage pyenv.cmd searxng_extra/update/update_engine_traits.py
'''

sxng_locales = (
"""

# Last part of the generated module: a trailing comma, the closing parenthesis
# of the ``sxng_locales`` tuple and a string literal that documents the layout
# of the tuple items.  The ``\\U...`` escapes in this (non-raw) literal are
# resolved when this script is loaded, i.e. the generated text contains the
# real characters.
languages_file_footer = """,
)
'''
A list of five-digit tuples:

0. SearXNG's internal locale tag (a language or region tag)
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
   Empty string for language tags.
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
   are represented by a globe (\U0001F310)

.. code:: python

   ('en',    'English', '',              'English', '\U0001f310'),
   ('en-CA', 'English', 'Canada',        'English', '\U0001f1e8\U0001f1e6'),
   ('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
   ..
   ('fr',    'Français', '',             'French',  '\U0001f310'),
   ('fr-BE', 'Français', 'Belgique',     'French',  '\U0001f1e7\U0001f1ea'),
   ('fr-CA', 'Français', 'Canada',       'French',  '\U0001f1e8\U0001f1e6'),

:meta hide-value:
'''
"""
 | 
					
						
							| 
									
										
										
										
											2018-02-14 23:17:46 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
# Flag overrides used by get_unicode_flag(): a key is looked up first with the
# locale's language code and afterwards with the lower-cased territory code.  A
# hit takes precedence over the flag that would otherwise be composed from the
# territory's two regional indicator letters.
lang2emoji = {
    'ha': '\U0001F1F3\U0001F1EA',  # Hausa / Niger
    'bs': '\U0001F1E7\U0001F1E6',  # Bosnian / Bosnia & Herzegovina
    'jp': '\U0001F1EF\U0001F1F5',  # Japanese
    'ua': '\U0001F1FA\U0001F1E6',  # Ukrainian
    'he': '\U0001F1EE\U0001F1F1',  # Hebrew
}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  | def main(): | 
					
						
							|  |  |  |     load_engines(settings['engines']) | 
					
						
							|  |  |  |     # traits_map = EngineTraitsMap.from_data() | 
					
						
							|  |  |  |     traits_map = fetch_traits_map() | 
					
						
							|  |  |  |     sxng_tag_list = filter_locales(traits_map) | 
					
						
							|  |  |  |     write_languages_file(sxng_tag_list) | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  | def fetch_traits_map(): | 
					
						
							|  |  |  |     """Fetchs supported languages for each engine and writes json file with those.""" | 
					
						
							|  |  |  |     network.set_timeout_for_thread(10.0) | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  |     def log(msg): | 
					
						
							|  |  |  |         print(msg) | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  |     traits_map = EngineTraitsMap.fetch_traits(log=log) | 
					
						
							|  |  |  |     print("fetched properties from %s engines" % len(traits_map)) | 
					
						
							|  |  |  |     print("write json file: %s" % traits_map.ENGINE_TRAITS_FILE) | 
					
						
							|  |  |  |     traits_map.save_data() | 
					
						
							|  |  |  |     return traits_map | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  | def filter_locales(traits_map: EngineTraitsMap): | 
					
						
							|  |  |  |     """Filter language & region tags by a threshold.""" | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-29 01:14:29 +01:00
										 |  |  |     min_eng_per_region = 15 | 
					
						
							|  |  |  |     min_eng_per_lang = 20 | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  |     _ = {} | 
					
						
							|  |  |  |     for eng in traits_map.values(): | 
					
						
							|  |  |  |         for reg in eng.regions.keys(): | 
					
						
							|  |  |  |             _[reg] = _.get(reg, 0) + 1 | 
					
						
							| 
									
										
										
										
											2016-11-06 03:51:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  |     regions = set(k for k, v in _.items() if v >= min_eng_per_region) | 
					
						
							|  |  |  |     lang_from_region = set(k.split('-')[0] for k in regions) | 
					
						
							| 
									
										
										
										
											2016-11-06 03:51:38 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  |     _ = {} | 
					
						
							|  |  |  |     for eng in traits_map.values(): | 
					
						
							|  |  |  |         for lang in eng.languages.keys(): | 
					
						
							|  |  |  |             # ignore script types like zh_Hant, zh_Hans or sr_Latin, pa_Arab (they | 
					
						
							|  |  |  |             # already counted by existence of 'zh' or 'sr', 'pa') | 
					
						
							|  |  |  |             if '_' in lang: | 
					
						
							|  |  |  |                 # print("ignore %s" % lang) | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             _[lang] = _.get(lang, 0) + 1 | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  |     languages = set(k for k, v in _.items() if v >= min_eng_per_lang) | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  |     sxng_tag_list = set() | 
					
						
							|  |  |  |     sxng_tag_list.update(regions) | 
					
						
							|  |  |  |     sxng_tag_list.update(lang_from_region) | 
					
						
							|  |  |  |     sxng_tag_list.update(languages) | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  |     return sxng_tag_list | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def write_languages_file(sxng_tag_list): | 
					
						
							| 
									
										
										
										
											2020-09-14 09:07:45 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-01 17:22:22 +01:00
										 |  |  |     language_codes = [] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  |     for sxng_tag in sorted(sxng_tag_list): | 
					
						
							|  |  |  |         sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-') | 
					
						
							| 
									
										
										
										
											2022-01-01 17:22:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  |         flag = get_unicode_flag(sxng_locale) or '' | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-01 17:22:22 +01:00
										 |  |  |         item = ( | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  |             sxng_tag, | 
					
						
							|  |  |  |             sxng_locale.get_language_name().title(), | 
					
						
							|  |  |  |             sxng_locale.get_territory_name() or '', | 
					
						
							|  |  |  |             sxng_locale.english_name.split(' (')[0], | 
					
						
							| 
									
										
										
										
											2022-03-16 18:07:00 +01:00
										 |  |  |             UnicodeEscape(flag), | 
					
						
							| 
									
										
										
										
											2022-01-01 17:22:22 +01:00
										 |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         language_codes.append(item) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     language_codes = tuple(language_codes) | 
					
						
							| 
									
										
										
										
											2020-09-14 09:07:45 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:58:48 +01:00
										 |  |  |     with open(languages_file, 'w', encoding='utf-8') as new_file: | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  |         file_content = "{header} {language_codes}{footer}".format( | 
					
						
							|  |  |  |             header=languages_file_header, | 
					
						
							|  |  |  |             language_codes=pformat(language_codes, width=120, indent=4)[1:-1], | 
					
						
							|  |  |  |             footer=languages_file_footer, | 
					
						
							| 
									
										
										
										
											2020-09-14 09:07:45 +02:00
										 |  |  |         ) | 
					
						
							|  |  |  |         new_file.write(file_content) | 
					
						
							|  |  |  |         new_file.close() | 
					
						
							| 
									
										
										
										
											2016-11-06 03:51:38 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-10 19:31:22 +02:00
										 |  |  | class UnicodeEscape(str): | 
					
						
							|  |  |  |     """Escape unicode string in :py:obj:`pprint.pformat`""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __repr__(self): | 
					
						
							|  |  |  |         return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_unicode_flag(locale: babel.Locale): | 
					
						
							|  |  |  |     """Determine a unicode flag (emoji) that fits to the ``locale``""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     emoji = lang2emoji.get(locale.language) | 
					
						
							|  |  |  |     if emoji: | 
					
						
							|  |  |  |         return emoji | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if not locale.territory: | 
					
						
							|  |  |  |         return '\U0001F310' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     emoji = lang2emoji.get(locale.territory.lower()) | 
					
						
							|  |  |  |     if emoji: | 
					
						
							|  |  |  |         return emoji | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[0]) | 
					
						
							|  |  |  |         c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[1]) | 
					
						
							|  |  |  |         # print("OK   : %s --> %s%s" % (locale, c1, c2)) | 
					
						
							|  |  |  |     except KeyError as exc: | 
					
						
							|  |  |  |         print("ERROR: %s --> %s" % (locale, exc)) | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return c1 + c2 | 
					
						
							| 
									
										
										
										
											2022-09-29 20:54:46 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# Run the update only when this file is executed as a script (not on import).
if __name__ == "__main__":
    main()