commit
						d669da81fb
					
				| @ -1,8 +0,0 @@ | ||||
| .. _autodetect search language: | ||||
| 
 | ||||
| ====================== | ||||
| Search language plugin | ||||
| ====================== | ||||
| 
 | ||||
| .. automodule:: searx.plugins.autodetect_search_language | ||||
|   :members: | ||||
| @ -1,97 +0,0 @@ | ||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | ||||
| # lint: pylint | ||||
| """Plugin to detect the search language from the search query. | ||||
| 
 | ||||
| The language detection is done by using the fastText_ library (`python | ||||
| fasttext`_). fastText_ distributes the `language identification model`_, for | ||||
| reference: | ||||
| 
 | ||||
| - `FastText.zip: Compressing text classification models`_ | ||||
| - `Bag of Tricks for Efficient Text Classification`_ | ||||
| 
 | ||||
| The `language identification model`_ supports the language codes (ISO-639-3):: | ||||
| 
 | ||||
|    af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr | ||||
|    ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa | ||||
|    fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io | ||||
|    is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv | ||||
|    mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn | ||||
|    no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd | ||||
|    sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep | ||||
|    vi vls vo wa war wuu xal xmf yi yo yue zh | ||||
| 
 | ||||
| The `language identification model`_ is harmonized with the SearXNG's language | ||||
| (locale) model.  General conditions of SearXNG's locale model are: | ||||
| 
 | ||||
| a. SearXNG's locale of a query is passed to the | ||||
|    :py:obj:`searx.locales.get_engine_locale` to get a language and/or region | ||||
|    code that is used by an engine. | ||||
| 
 | ||||
| b. SearXNG and most of the engines do not support all the languages from the | ||||
|    language model and there might also be a discrepancy in the ISO-639-3 and | ||||
|    ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`).  Furthermore, | ||||
|    in SearXNG the locales like ``zh-TW`` (``zh-CN``) are mapped to | ||||
|    ``zh_Hant`` (``zh_Hans``). | ||||
| 
 | ||||
| Conclusion: This plugin only auto-detects the languages a user can select in | ||||
| the language menu (:py:obj:`supported_langs`). | ||||
| 
 | ||||
| SearXNG's locale of a query comes from (*highest wins*): | ||||
| 
 | ||||
| 1. The ``Accept-Language`` header from user's HTTP client. | ||||
| 2. The user selects a locale in the preferences. | ||||
| 3. The user selects a locale from the menu in the query form (e.g. ``:zh-TW``) | ||||
| 4. This plugin is activated in the preferences and the locale (only the language | ||||
|    code / no region code) comes from fastText's language detection. | ||||
| 
 | ||||
| Conclusion: There is a conflict between the language selected by the user and | ||||
| the language from language detection of this plugin.  For example, the user | ||||
| explicitly selects the German locale via the search syntax to search for a term | ||||
| that is identified as an English term (try ``:de-DE thermomix``, for example). | ||||
| 
 | ||||
| .. hint:: | ||||
| 
 | ||||
|    To SearXNG maintainers; please take into account: under some circumstances | ||||
|    the auto-detection of the language of this plugin could be detrimental to | ||||
|    users' expectations.  It's not recommended to activate this plugin by | ||||
|    default. It should always be the user's decision whether to activate this | ||||
|    plugin or not. | ||||
| 
 | ||||
| .. _fastText: https://fasttext.cc/ | ||||
| .. _python fasttext: https://pypi.org/project/fasttext/ | ||||
| .. _language identification model: https://fasttext.cc/docs/en/language-identification.html | ||||
| .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759 | ||||
| .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651 | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| from flask_babel import gettext | ||||
| import babel | ||||
| 
 | ||||
| from searx.utils import detect_language | ||||
| from searx.languages import language_codes | ||||
| 
 | ||||
| name = gettext('Autodetect search language') | ||||
| description = gettext('Automatically detect the query search language and switch to it.') | ||||
| preference_section = 'general' | ||||
| default_on = False | ||||
| 
 | ||||
| supported_langs = set() | ||||
| """Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" | ||||
| 
 | ||||
| 
 | ||||
| def pre_search(request, search):  # pylint: disable=unused-argument | ||||
|     lang = detect_language(search.search_query.query, min_probability=0) | ||||
|     if lang in supported_langs: | ||||
|         search.search_query.lang = lang | ||||
|         try: | ||||
|             search.search_query.locale = babel.Locale.parse(lang) | ||||
|         except babel.core.UnknownLocaleError: | ||||
|             pass | ||||
|     return True | ||||
| 
 | ||||
| 
 | ||||
| def init(app, settings):  # pylint: disable=unused-argument | ||||
|     for searxng_locale in language_codes: | ||||
|         supported_langs.add(searxng_locale[0].split('-')[0]) | ||||
|     return True | ||||
| @ -154,7 +154,7 @@ class SearchLanguageSetting(EnumStringSetting): | ||||
|     """Available choices may change, so user's value may not be in choices anymore""" | ||||
| 
 | ||||
|     def _validate_selection(self, selection): | ||||
|         if selection != '' and not VALID_LANGUAGE_CODE.match(selection): | ||||
|         if selection != '' and selection != 'auto' and not VALID_LANGUAGE_CODE.match(selection): | ||||
|             raise ValidationException('Invalid language code: "{0}"'.format(selection)) | ||||
| 
 | ||||
|     def parse(self, data: str): | ||||
|  | ||||
| @ -104,7 +104,7 @@ class LanguageParser(QueryPartParser): | ||||
|                     break | ||||
| 
 | ||||
|         # user may set a valid, yet not selectable language | ||||
|         if VALID_LANGUAGE_CODE.match(value): | ||||
|         if VALID_LANGUAGE_CODE.match(value) or value == 'auto': | ||||
|             lang_parts = value.split('-') | ||||
|             if len(lang_parts) > 1: | ||||
|                 value = lang_parts[0].lower() + '-' + lang_parts[1].upper() | ||||
|  | ||||
| @ -3,10 +3,12 @@ | ||||
| # pylint: disable=missing-module-docstring, too-few-public-methods | ||||
| 
 | ||||
| import threading | ||||
| from copy import copy | ||||
| from timeit import default_timer | ||||
| from uuid import uuid4 | ||||
| 
 | ||||
| import flask | ||||
| import babel | ||||
| 
 | ||||
| from searx import settings | ||||
| from searx.answerers import ask | ||||
| @ -20,6 +22,7 @@ from searx.network import initialize as initialize_network, check_network_config | ||||
| from searx.metrics import initialize as initialize_metrics, counter_inc, histogram_observe_time | ||||
| from searx.search.processors import PROCESSORS, initialize as initialize_processors | ||||
| from searx.search.checker import initialize as initialize_checker | ||||
| from searx.utils import detect_language | ||||
| 
 | ||||
| 
 | ||||
| logger = logger.getChild('search') | ||||
| @ -37,18 +40,57 @@ def initialize(settings_engines=None, enable_checker=False, check_network=False, | ||||
|         initialize_checker() | ||||
| 
 | ||||
| 
 | ||||
| def replace_auto_language(search_query: SearchQuery): | ||||
|     """ | ||||
|     Do nothing except if `search_query.lang` is "auto". | ||||
|     In this case: | ||||
|     * the value "auto" is replaced by the detected language of the query. | ||||
|       The default value is "all" when no language is detected. | ||||
|     * `search_query.locale` is updated accordingly | ||||
| 
 | ||||
|     Use :py:obj:`searx.utils.detect_language` with `only_search_languages=True` to keep | ||||
|     only languages supported by the engines. | ||||
|     """ | ||||
|     if search_query.lang != 'auto': | ||||
|         return | ||||
| 
 | ||||
|     detected_lang = detect_language(search_query.query, threshold=0.0, only_search_languages=True) | ||||
|     if detected_lang is None: | ||||
|         # fallback to 'all' if no language has been detected | ||||
|         search_query.lang = 'all' | ||||
|         search_query.locale = None | ||||
|         return | ||||
|     search_query.lang = detected_lang | ||||
|     try: | ||||
|         search_query.locale = babel.Locale.parse(search_query.lang) | ||||
|     except babel.core.UnknownLocaleError: | ||||
|         search_query.locale = None | ||||
| 
 | ||||
| 
 | ||||
| class Search: | ||||
|     """Search information container""" | ||||
| 
 | ||||
|     __slots__ = "search_query", "result_container", "start_time", "actual_timeout" | ||||
| 
 | ||||
|     def __init__(self, search_query: SearchQuery): | ||||
|         """Initialize the Search | ||||
| 
 | ||||
|         search_query is copied | ||||
|         """ | ||||
|         # init vars | ||||
|         super().__init__() | ||||
|         self.search_query = search_query | ||||
|         self.result_container = ResultContainer() | ||||
|         self.start_time = None | ||||
|         self.actual_timeout = None | ||||
|         self.search_query = copy(search_query) | ||||
|         self.update_search_query(self.search_query) | ||||
| 
 | ||||
|     def update_search_query(self, search_query: SearchQuery): | ||||
|         """Update search_query. | ||||
| 
 | ||||
|         call replace_auto_language to replace the "auto" language | ||||
|         """ | ||||
|         replace_auto_language(search_query) | ||||
| 
 | ||||
|     def search_external_bang(self): | ||||
|         """ | ||||
|  | ||||
| @ -109,3 +109,16 @@ class SearchQuery: | ||||
|                 self.external_bang, | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     def __copy__(self): | ||||
|         return SearchQuery( | ||||
|             self.query, | ||||
|             self.engineref_list, | ||||
|             self.lang, | ||||
|             self.safesearch, | ||||
|             self.pageno, | ||||
|             self.time_range, | ||||
|             self.timeout_limit, | ||||
|             self.external_bang, | ||||
|             self.engine_data, | ||||
|         ) | ||||
|  | ||||
| @ -18,7 +18,7 @@ searx_dir = abspath(dirname(__file__)) | ||||
| 
 | ||||
| logger = logging.getLogger('searx') | ||||
| OUTPUT_FORMATS = ['html', 'csv', 'json', 'rss'] | ||||
| LANGUAGE_CODES = ['all'] + list(l[0] for l in languages) | ||||
| LANGUAGE_CODES = ['all', 'auto'] + list(l[0] for l in languages) | ||||
| SIMPLE_STYLE = ('auto', 'light', 'dark') | ||||
| CATEGORIES_AS_TABS = { | ||||
|     'general': {}, | ||||
|  | ||||
| @ -1,5 +1,9 @@ | ||||
| <select class="language" id="language" name="language" aria-label="{{ _('Search language') }}">{{- '' -}} | ||||
| 	<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option> | ||||
| 	<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}> | ||||
| 		{{- _('Auto-detect') -}} | ||||
| 		{%- if current_language == 'auto' %} ({{ search_language }}){%- endif -%} | ||||
| 	</option> | ||||
| 	{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%} | ||||
| 	<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}> | ||||
| 		{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %} | ||||
|  | ||||
| @ -116,12 +116,15 @@ | ||||
|       <p class="value">{{- '' -}} | ||||
|         <select name='language' aria-labelledby="pref_language" aria-describedby="desc_language">{{- '' -}} | ||||
|           <option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option> | ||||
|           <option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }}</option> | ||||
|           {%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%} | ||||
|           <option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}</option> | ||||
|           {%- endfor -%} | ||||
|         </select>{{- '' -}} | ||||
|       </p> | ||||
|       <div class="description" id="desc_language">{{ _('What language do you prefer for search?') }}</div> | ||||
|       <div class="description" id="desc_language"> | ||||
|         {{- _('What language do you prefer for search?') }} {{ _('Choose Auto-detect to let SearXNG detect the language of your query.') -}} | ||||
|       </div> | ||||
|     </fieldset> | ||||
|     {% endif %} | ||||
|     {% if 'autocomplete' not in locked_preferences %} | ||||
|  | ||||
| @ -53,6 +53,9 @@ _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {} | ||||
| _FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None | ||||
| """fasttext model to predict language of a search term""" | ||||
| 
 | ||||
| SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in language_codes]) | ||||
| """Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" | ||||
| 
 | ||||
| 
 | ||||
| class _NotSetClass:  # pylint: disable=too-few-public-methods | ||||
|     """Internal class for this module, do not create instance of this class. | ||||
| @ -637,11 +640,72 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText": | ||||
|     return _FASTTEXT_MODEL | ||||
| 
 | ||||
| 
 | ||||
| def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]: | ||||
|     """https://fasttext.cc/docs/en/language-identification.html""" | ||||
| def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]: | ||||
|     """Detect the language of the ``text`` parameter. | ||||
| 
 | ||||
|     :param str text: The string whose language is to be detected. | ||||
| 
 | ||||
|     :param float threshold: Threshold filters the returned labels by a threshold | ||||
|         on probability.  A choice of 0.3 will return labels with at least 0.3 | ||||
|         probability. | ||||
| 
 | ||||
|     :param bool only_search_languages: If ``True``, returns only supported | ||||
|         SearXNG search languages.  see :py:obj:`searx.languages` | ||||
| 
 | ||||
|     :rtype: str, None | ||||
|     :returns: | ||||
|         The detected language code or ``None``. See below. | ||||
| 
 | ||||
|     :raises ValueError: If ``text`` is not a string. | ||||
| 
 | ||||
|     The language detection is done by using `a fork`_ of the fastText_ library | ||||
|     (`python fasttext`_). fastText_ distributes the `language identification | ||||
|     model`_, for reference: | ||||
| 
 | ||||
|     - `FastText.zip: Compressing text classification models`_ | ||||
|     - `Bag of Tricks for Efficient Text Classification`_ | ||||
| 
 | ||||
|     The `language identification model`_ supports the language codes | ||||
|     (ISO-639-3):: | ||||
| 
 | ||||
|         af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs | ||||
|         bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es | ||||
|         et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia | ||||
|         id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li | ||||
|         lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah | ||||
|         nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru | ||||
|         rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl | ||||
|         tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh | ||||
| 
 | ||||
|     By using ``only_search_languages=True`` the `language identification model`_ | ||||
|     is harmonized with the SearXNG's language (locale) model.  General | ||||
|     conditions of SearXNG's locale model are: | ||||
| 
 | ||||
|     a. SearXNG's locale of a query is passed to the | ||||
|        :py:obj:`searx.locales.get_engine_locale` to get a language and/or region | ||||
|        code that is used by an engine. | ||||
| 
 | ||||
|     b. Most of SearXNG's engines do not support all the languages from `language | ||||
|        identification model`_ and there is also a discrepancy in the ISO-639-3 | ||||
|        (fastText) and ISO-639-2 (SearXNG) handling.  Furthermore, in SearXNG the | ||||
|        locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant`` | ||||
|        (``zh_Hans``) while the `language identification model`_ reduces both to | ||||
|        ``zh``. | ||||
| 
 | ||||
|     .. _a fork: https://github.com/searxng/fasttext-predict | ||||
|     .. _fastText: https://fasttext.cc/ | ||||
|     .. _python fasttext: https://pypi.org/project/fasttext/ | ||||
|     .. _language identification model: https://fasttext.cc/docs/en/language-identification.html | ||||
|     .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759 | ||||
|     .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651 | ||||
| 
 | ||||
|     """ | ||||
|     if not isinstance(text, str): | ||||
|         raise ValueError('text must a str') | ||||
|     r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold) | ||||
|     if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability: | ||||
|         return r[0][0].split('__label__')[1] | ||||
|     if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0: | ||||
|         language = r[0][0].split('__label__')[1] | ||||
|         if only_search_languages and language not in SEARCH_LANGUAGE_CODES: | ||||
|             return None | ||||
|         return language | ||||
|     return None | ||||
|  | ||||
| @ -63,7 +63,7 @@ def parse_lang(preferences: Preferences, form: Dict[str, str], raw_text_query: R | ||||
|         query_lang = preferences.get_value('language') | ||||
| 
 | ||||
|     # check language | ||||
|     if not VALID_LANGUAGE_CODE.match(query_lang): | ||||
|     if not VALID_LANGUAGE_CODE.match(query_lang) and query_lang != 'auto': | ||||
|         raise SearxParameterException('language', query_lang) | ||||
| 
 | ||||
|     return query_lang | ||||
|  | ||||
| @ -810,6 +810,9 @@ def search(): | ||||
|         ) | ||||
|     ) | ||||
| 
 | ||||
|     # search_query.lang contains the user choice (all, auto, en, ...) | ||||
|     # when the user choice is "auto", search.search_query.lang contains the detected language | ||||
|     # otherwise it is equal to search_query.lang | ||||
|     return render( | ||||
|         # fmt: off | ||||
|         'results.html', | ||||
| @ -834,6 +837,11 @@ def search(): | ||||
|             settings['search']['languages'], | ||||
|             fallback=request.preferences.get_value("language") | ||||
|         ), | ||||
|         search_language = match_language( | ||||
|             search.search_query.lang, | ||||
|             settings['search']['languages'], | ||||
|             fallback=request.preferences.get_value("language") | ||||
|         ), | ||||
|         timeout_limit = request.form.get('timeout_limit', None) | ||||
|         # fmt: on | ||||
|     ) | ||||
|  | ||||
| @ -91,6 +91,17 @@ class TestLanguageParser(SearxTestCase): | ||||
|         self.assertIn('all', query.languages) | ||||
|         self.assertFalse(query.specific) | ||||
| 
 | ||||
|     def test_auto_language_code(self): | ||||
|         language = 'auto' | ||||
|         query_text = 'una consulta' | ||||
|         full_query = ':' + language + ' ' + query_text | ||||
|         query = RawTextQuery(full_query, []) | ||||
| 
 | ||||
|         self.assertEqual(query.getFullQuery(), full_query) | ||||
|         self.assertEqual(len(query.query_parts), 1) | ||||
|         self.assertIn('auto', query.languages) | ||||
|         self.assertFalse(query.specific) | ||||
| 
 | ||||
|     def test_invalid_language_code(self): | ||||
|         language = 'not_a_language' | ||||
|         query_text = 'the query' | ||||
|  | ||||
| @ -1,5 +1,7 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
| 
 | ||||
| from copy import copy | ||||
| 
 | ||||
| import searx.search | ||||
| from searx.search import SearchQuery, EngineRef | ||||
| from searx import settings | ||||
| @ -34,6 +36,11 @@ class SearchQueryTestCase(SearxTestCase): | ||||
|         self.assertEqual(s, s) | ||||
|         self.assertNotEqual(s, t) | ||||
| 
 | ||||
|     def test_copy(self): | ||||
|         s = SearchQuery('test', [EngineRef('bing', 'general')], 'all', 0, 1, None, None, None) | ||||
|         t = copy(s) | ||||
|         self.assertEqual(s, t) | ||||
| 
 | ||||
| 
 | ||||
| class SearchTestCase(SearxTestCase): | ||||
|     @classmethod | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user