Merge pull request #2019 from ArtikusHG/fasttext
Replace langdetect with fasttext (follow-up to #1969)
commit b927482195
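This PR consolidates language detection into a single `detect_language()` helper in `searx.utils`, backed by fasttext's compressed `lid.176.ftz` model, lazily loaded from `searx/data`. The search-language plugin, the engine checker, and the engine-descriptions updater previously each carried their own detection code; all three now call the shared helper. A minimal usage sketch, with expected values taken from the unit tests added at the bottom of this diff:

```python
from searx.utils import detect_language

# A confident prediction returns a language code such as 'en';
# newlines are tolerated (they are replaced before prediction).
detect_language('The quick brown fox jumps over\nthe lazy dog')  # 'en'

# Empty or mixed-language input falls below min_probability -> None.
detect_language('')                             # None
detect_language('The いろはにほへと Pijamalı')      # None

# Non-str input is rejected explicitly.
detect_language(None)                           # raises ValueError
```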
@@ -11,7 +11,6 @@ httpx[http2]==0.21.2
 Brotli==1.0.9
 uvloop==0.17.0
 httpx-socks[asyncio]==0.7.2
-langdetect==1.0.9
 setproctitle==1.3.2
 redis==4.4.0
 markdown-it-py==2.1.0
@@ -66,40 +66,22 @@ that is identified as an English term (try ``:de-DE thermomix``, for example).
 """
 
 from flask_babel import gettext
-import fasttext
 import babel
 
-from searx.data import data_dir
+from searx.utils import detect_language
 from searx.languages import language_codes
 
-# Monkey patch: prevent fasttext from showing a (useless) warning when loading a
-# model.
-fasttext.FastText.eprint = lambda x: None
-
 name = gettext('Autodetect search language')
 description = gettext('Automatically detect the query search language and switch to it.')
 preference_section = 'general'
 default_on = False
 
-lang_model: fasttext.FastText._FastText = None
-"""fasttext model to predict laguage of a search term"""
-
 supported_langs = set()
 """Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
 
 
-def get_model():
-    # lazy load, in order to to save memory
-    global lang_model  # pylint: disable=global-statement
-    if lang_model is None:
-        lang_model = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
-    return lang_model
-
-
 def pre_search(request, search):  # pylint: disable=unused-argument
-    prediction = get_model().predict(search.search_query.query, k=1, threshold=0.3)
-    if prediction:
-        lang = prediction[0][0].split('__label__')[1]
+    lang = detect_language(search.search_query.query, min_probability=0)
     if lang in supported_langs:
         search.search_query.lang = lang
         try:
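Besides the de-duplication, this hunk removes a fragile guard in `pre_search()`: fasttext's `predict()` returns a `(labels, probabilities)` pair, which is a truthy two-element tuple even when no label clears the threshold, so the old `if prediction:` test never failed and `prediction[0][0]` could raise an `IndexError` on short or ambiguous queries. A sketch of that failure mode (the empty result is a hypothetical illustration):

```python
# What predict() yields when nothing clears the threshold: an empty
# labels tuple paired with an empty probabilities array.
prediction = ((), [])

# The pair itself is truthy, so the old guard passed regardless.
assert prediction

# prediction[0][0]  # would raise IndexError here

# detect_language() checks len(labels) and len(probabilities) before
# unpacking, and simply returns None for low-confidence input.
```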
@@ -10,12 +10,10 @@ from timeit import default_timer
 from urllib.parse import urlparse
 
 import re
-from langdetect import detect_langs
-from langdetect.lang_detect_exception import LangDetectException
 import httpx
 
 from searx import network, logger
-from searx.utils import gen_useragent
+from searx.utils import gen_useragent, detect_language
 from searx.results import ResultContainer
 from searx.search.models import SearchQuery, EngineRef
 from searx.search.processors import EngineProcessor
@@ -208,14 +206,10 @@ class ResultContainerTests:
         self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
 
     def _add_language(self, text: str) -> typing.Optional[str]:
-        try:
-            r = detect_langs(str(text))  # pylint: disable=E1101
-        except LangDetectException:
-            return None
-
-        if len(r) > 0 and r[0].prob > 0.95:
-            self.languages.add(r[0].lang)
-            self.test_results.add_language(r[0].lang)
+        lang_str = detect_language(text)
+        if lang_str:
+            self.languages.add(lang_str)
+            self.test_results.add_language(lang_str)
         return None
 
     def _check_result(self, result):
@@ -15,6 +15,7 @@ from os.path import splitext, join
 from random import choice
 from html.parser import HTMLParser
 from urllib.parse import urljoin, urlparse
+import fasttext
 
 from lxml import html
 from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@@ -22,7 +23,7 @@ from babel.core import get_global
 
 
 from searx import settings
-from searx.data import USER_AGENTS
+from searx.data import USER_AGENTS, data_dir
 from searx.version import VERSION_TAG
 from searx.languages import language_codes
 from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
@@ -50,6 +51,12 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = {
 _XPATH_CACHE: Dict[str, XPath] = {}
 _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
 
+_FASTTEXT_MODEL: Optional[fasttext.FastText._FastText] = None
+"""fasttext model to predict the language of a search term"""
+
+# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
+fasttext.FastText.eprint = lambda x: None
+
 
 class _NotSetClass:  # pylint: disable=too-few-public-methods
     """Internal class for this module, do not create instance of this class.
@@ -621,3 +628,21 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index:
         # to record xpath_spec
         raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
     return default
+
+
+def _get_fasttext_model() -> fasttext.FastText._FastText:
+    # lazy load, in order to save memory
+    global _FASTTEXT_MODEL  # pylint: disable=global-statement
+    if _FASTTEXT_MODEL is None:
+        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
+    return _FASTTEXT_MODEL
+
+
+def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
+    """https://fasttext.cc/docs/en/language-identification.html"""
+    if not isinstance(text, str):
+        raise ValueError('text must be a str')
+    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
+    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability:
+        return r[0][0].split('__label__')[1]
+    return None
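Note that the new helper applies two separate cut-offs: `threshold` is forwarded to fasttext's `predict()` and filters labels before they are returned, while `min_probability` is checked afterwards against the top label's probability. The call sites in this diff use them differently, roughly as follows:

```python
from searx.utils import detect_language

# The autodetect plugin disables the second cut-off, so any label that
# clears fasttext's own 0.3 threshold is accepted:
detect_language('thermomix', min_probability=0)

# The checker and the descriptions updater keep the stricter default and
# only record a language when the top probability exceeds 0.5:
detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')  # 'tr'
```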
@@ -17,14 +17,11 @@ from os.path import join
 
 from lxml.html import fromstring
 
-from langdetect import detect_langs
-from langdetect.lang_detect_exception import LangDetectException
-
 from searx.engines import wikidata, set_loggers
 from searx.utils import extract_text, match_language
 from searx.locales import LOCALE_NAMES, locales_initialize
 from searx import searx_dir
-from searx.utils import gen_useragent
+from searx.utils import gen_useragent, detect_language
 import searx.search
 import searx.network
 
@@ -117,17 +114,6 @@ def get_wikipedia_summary(lang, pageid):
         return None
 
 
-def detect_language(text):
-    try:
-        r = detect_langs(str(text))  # pylint: disable=E1101
-    except LangDetectException:
-        return None
-
-    if len(r) > 0 and r[0].prob > 0.95:
-        return r[0].lang
-    return None
-
-
 def get_website_description(url, lang1, lang2=None):
     headers = {
         'User-Agent': gen_useragent(),
@@ -232,3 +232,25 @@ class TestXPathUtils(SearxTestCase):
         with self.assertRaises(SearxEngineXPathException) as context:
             utils.eval_xpath_getindex(doc, 'count(//i)', 1)
         self.assertEqual(context.exception.message, 'the result is not a list')
+
+    def test_detect_language(self):
+        # make sure newlines are not an issue:
+        # fasttext's predict() does not accept them.
+        lang = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
+        self.assertEqual(lang, 'en')
+
+        lang = utils.detect_language('いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす')
+        self.assertEqual(lang, 'ja')
+
+        lang = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
+        self.assertEqual(lang, 'tr')
+
+        lang = utils.detect_language('')
+        self.assertIsNone(lang)
+
+        # mixed languages --> None
+        lang = utils.detect_language('The いろはにほへと Pijamalı')
+        self.assertIsNone(lang)
+
+        with self.assertRaises(ValueError):
+            utils.detect_language(None)