Merge pull request #2269 from return42/locale-revision
Revision of the locale and language handling in SearXNG.

Commit f950119ca8
.github/workflows/data-update.yml (2 changes, vendored)
@@ -17,7 +17,7 @@ jobs:
           - update_currencies.py
           - update_external_bangs.py
           - update_firefox_version.py
-          - update_languages.py
+          - update_engine_traits.py
           - update_wikidata_units.py
           - update_engine_descriptions.py
     steps:
@@ -42,7 +42,7 @@ Explanation of the :ref:`general engine configuration` shown in the table
         - Timeout
         - Weight
         - Paging
-        - Language
+        - Language, Region
         - Safe search
         - Time range
@@ -569,10 +569,13 @@ engine is shown.  Most of the options have a default value or even are optional.
   To disable by default the engine, but not deleting it.  It will allow the user
   to manually activate it in the settings.
 
+``inactive``: optional
+  Remove the engine from the settings (*disabled & removed*).
+
 ``language`` : optional
   If you want to use another language for a specific engine, you can define it
-  by using the full ISO code of language and country, like ``fr_FR``, ``en_US``,
-  ``de_DE``.
+  by using the ISO code of language (and region), like ``fr``, ``en-US``,
+  ``de-DE``.
 
 ``tokens`` : optional
   A list of secret tokens to make this engine *private*, more details see
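A quick sanity check on the new code style: SearXNG's locale strings use ``-`` as the separator and can be parsed by babel, as the engine traits added in this commit rely on.  A minimal sketch (values are only illustrative):

.. code:: python

   import babel

   # region code: babel needs sep='-' for SearXNG's locale strings
   babel.Locale.parse('en-US', sep='-')   # -> Locale('en', territory='US')
   # plain language code
   babel.Locale.parse('fr')               # -> Locale('fr')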
@@ -127,6 +127,10 @@ extensions = [
     'notfound.extension',  # https://github.com/readthedocs/sphinx-notfound-page
 ]
 
+autodoc_default_options = {
+    'member-order': 'groupwise',
+}
+
 myst_enable_extensions = [
   "replacements", "smartquotes"
 ]
@@ -135,6 +139,7 @@ suppress_warnings = ['myst.domains']
 
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3/", None),
+    "babel" : ("https://babel.readthedocs.io/en/latest/", None),
     "flask": ("https://flask.palletsprojects.com/", None),
     "flask_babel": ("https://python-babel.github.io/flask-babel/", None),
     # "werkzeug": ("https://werkzeug.palletsprojects.com/", None),
@@ -54,6 +54,7 @@ Engine File
                                        - ``offline`` :ref:`[ref] <offline engines>`
                                        - ``online_dictionary``
                                        - ``online_currency``
+                                       - ``online_url_search``
    ======================= =========== ========================================================
 
 .. _engine settings:
@@ -131,8 +132,10 @@ Passed Arguments (request)
 These arguments can be used to construct the search query.  Furthermore,
 parameters with default value can be redefined for special purposes.
 
+.. _engine request online:
 
-.. table:: If the ``engine_type`` is ``online``
+.. table:: If the ``engine_type`` is :py:obj:`online
+           <searx.search.processors.online.OnlineProcessor.get_params>`
    :width: 100%
 
    ====================== ============== ========================================================================
@@ -149,12 +152,16 @@ parameters with default value can be redefined for special purposes.
    safesearch             int            ``0``, between ``0`` and ``2`` (normal, moderate, strict)
    time_range             Optional[str]  ``None``, can be ``day``, ``week``, ``month``, ``year``
    pageno                 int            current pagenumber
-   language               str            specific language code like ``'en_US'``, or ``'all'`` if unspecified
+   searxng_locale         str            SearXNG's locale selected by user.  Specific language code like
+                                         ``'en'``, ``'en-US'``, or ``'all'`` if unspecified.
    ====================== ============== ========================================================================
 
 
-.. table:: If the ``engine_type`` is ``online_dictionary``, in addition to the
-           ``online`` arguments:
+.. _engine request online_dictionary:
+
+.. table:: If the ``engine_type`` is :py:obj:`online_dictionary
+           <searx.search.processors.online_dictionary.OnlineDictionaryProcessor.get_params>`,
+           in addition to the :ref:`online <engine request online>` arguments:
    :width: 100%
 
    ====================== ============== ========================================================================
@@ -165,8 +172,11 @@ parameters with default value can be redefined for special purposes.
    query                  str            the text query without the languages
    ====================== ============== ========================================================================
 
-.. table:: If the ``engine_type`` is ``online_currency```, in addition to the
-           ``online`` arguments:
+.. _engine request online_currency:
+
+.. table:: If the ``engine_type`` is :py:obj:`online_currency
+           <searx.search.processors.online_currency.OnlineCurrencyProcessor.get_params>`,
+           in addition to the :ref:`online <engine request online>` arguments:
    :width: 100%
 
    ====================== ============== ========================================================================
@@ -179,6 +189,26 @@ parameters with default value can be redefined for special purposes.
    to_name                str            currency name
    ====================== ============== ========================================================================
 
+.. _engine request online_url_search:
+
+.. table:: If the ``engine_type`` is :py:obj:`online_url_search
+           <searx.search.processors.online_url_search.OnlineUrlSearchProcessor.get_params>`,
+           in addition to the :ref:`online <engine request online>` arguments:
+   :width: 100%
+
+   ====================== ============== ========================================================================
+   argument               type           default-value, information
+   ====================== ============== ========================================================================
+   search_url             dict           URLs from the search query:
+
+                                         .. code:: python
+
+                                            {
+                                              'http': str,
+                                              'ftp': str,
+                                              'data:image': str
+                                            }
+   ====================== ============== ========================================================================
 
 Specify Request
 ---------------
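To illustrate the new ``searxng_locale`` argument, a minimal, hypothetical engine request sketch (``example.org`` and its query parameters are assumptions; ``traits`` is injected into the engine module when the engine is loaded):

.. code:: python

   from urllib.parse import urlencode

   def request(query, params):
       # map SearXNG's locale to the region code of this (hypothetical) engine
       eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
       params['url'] = 'https://example.org/search?' + urlencode(
           {'q': query, 'region': eng_region, 'page': params['pageno']}
       )
       return params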
@@ -52,12 +52,12 @@ Scripts to update static data in :origin:`searx/data/`
   :members:
 
 
-``update_languages.py``
-=======================
+``update_engine_traits.py``
+===========================
 
-:origin:`[source] <searxng_extra/update/update_languages.py>`
+:origin:`[source] <searxng_extra/update/update_engine_traits.py>`
 
-.. automodule:: searxng_extra.update.update_languages
+.. automodule:: searxng_extra.update.update_engine_traits
   :members:
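Based on the :py:obj:`EngineTraitsMap` API added in this commit, the core of such an update script plausibly boils down to the following sketch (the actual script may differ):

.. code:: python

   from searx import settings
   from searx.engines import load_engines
   from searx.enginelib.traits import EngineTraitsMap

   load_engines(settings['engines'])                      # fetch_traits() needs loaded engines
   traits_map = EngineTraitsMap.fetch_traits(log=print)   # ask each origin engine
   traits_map.save_data()                                 # write searx/data/engine_traits.json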
docs/src/searx.engine.archlinux.rst (new file, +9)
@@ -0,0 +1,9 @@
+.. _archlinux engine:
+
+==========
+Arch Linux
+==========
+
+.. automodule:: searx.engines.archlinux
+  :members:
+
docs/src/searx.engine.dailymotion.rst (new file, +8)
@@ -0,0 +1,8 @@
+.. _dailymotion engine:
+
+===========
+Dailymotion
+===========
+
+.. automodule:: searx.engines.dailymotion
+  :members:
docs/src/searx.engine.duckduckgo.rst (new file, +22)
@@ -0,0 +1,22 @@
+.. _duckduckgo engines:
+
+==================
+DuckDuckGo engines
+==================
+
+.. contents:: Contents
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+.. automodule:: searx.engines.duckduckgo
+   :members:
+
+.. automodule:: searx.engines.duckduckgo_images
+   :members:
+
+.. automodule:: searx.engines.duckduckgo_definitions
+   :members:
+
+.. automodule:: searx.engines.duckduckgo_weather
+   :members:
docs/src/searx.enginelib.rst (new file, +17)
@@ -0,0 +1,17 @@
+.. _searx.enginelib:
+
+============
+Engine model
+============
+
+.. automodule:: searx.enginelib
+  :members:
+
+.. _searx.enginelib.traits:
+
+=============
+Engine traits
+=============
+
+.. automodule:: searx.enginelib.traits
+  :members:
docs/src/searx.engines.bing.rst (new file, +43)
@@ -0,0 +1,43 @@
+.. _bing engines:
+
+============
+Bing Engines
+============
+
+.. contents:: Contents
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+
+.. _bing web engine:
+
+Bing WEB
+========
+
+.. automodule:: searx.engines.bing
+  :members:
+
+.. _bing images engine:
+
+Bing Images
+===========
+
+.. automodule:: searx.engines.bing_images
+  :members:
+
+.. _bing videos engine:
+
+Bing Videos
+===========
+
+.. automodule:: searx.engines.bing_videos
+  :members:
+
+.. _bing news engine:
+
+Bing News
+=========
+
+.. automodule:: searx.engines.bing_news
+  :members:
@@ -12,15 +12,21 @@ Google Engines
 
 .. _google API:
 
-google API
+Google API
 ==========
 
 .. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 
+SearXNG's implementation of the Google API is mainly done in
+:py:obj:`get_google_info <searx.engines.google.get_google_info>`.
+
 For detailed description of the *REST-full* API see: `Query Parameter
-Definitions`_.  Not all parameters can be appied and some engines are *special*
-(e.g. :ref:`google news engine`).
+Definitions`_.  The linked API documentation can sometimes be helpful during
+reverse engineering.  However, we cannot use it in the freely accessible WEB
+services; not all parameters can be applied and some engines are more *special*
+than others (e.g. :ref:`google news engine`).
 
 
 .. _google web engine:
 
@@ -30,6 +36,13 @@ Google WEB
 .. automodule:: searx.engines.google
   :members:
 
+.. _google autocomplete:
+
+Google Autocomplete
+====================
+
+.. autofunction:: searx.autocomplete.google_complete
+
 .. _google images engine:
 
 Google Images
@@ -53,3 +66,11 @@ Google News
 
 .. automodule:: searx.engines.google_news
   :members:
+
+.. _google scholar engine:
+
+Google Scholar
+==============
+
+.. automodule:: searx.engines.google_scholar
+  :members:
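For a feel of what ``get_google_info`` returns, a sketch based on its use in the new ``google_complete`` autocompleter further down in this commit (returned values are only illustrative):

.. code:: python

   from searx import settings
   from searx.engines import engines, google, load_engines

   load_engines(settings['engines'])
   info = google.get_google_info({'searxng_locale': 'de-AT'}, engines['google'].traits)
   info['subdomain']       # e.g. 'www.google.at'
   info['params']['hl']    # e.g. 'de'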
docs/src/searx.engines.peertube.rst (new file, +27)
@@ -0,0 +1,27 @@
+.. _peertube engines:
+
+================
+Peertube Engines
+================
+
+.. contents:: Contents
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+
+.. _peertube video engine:
+
+Peertube Video
+==============
+
+.. automodule:: searx.engines.peertube
+  :members:
+
+.. _sepiasearch engine:
+
+SepiaSearch
+===========
+
+.. automodule:: searx.engines.sepiasearch
+  :members:
@@ -1,8 +1,8 @@
-.. _load_engines:
+.. _searx.engines:
 
-============
-Load Engines
-============
+=================
+SearXNG's engines
+=================
 
 .. automodule:: searx.engines
   :members:
docs/src/searx.engines.startpage.rst (new file, +13)
@@ -0,0 +1,13 @@
+.. _startpage engines:
+
+=================
+Startpage engines
+=================
+
+.. contents:: Contents
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+.. automodule:: searx.engines.startpage
+   :members:
docs/src/searx.engines.wikipedia.rst (new file, +27)
@@ -0,0 +1,27 @@
+.. _wikimedia engines:
+
+=========
+Wikimedia
+=========
+
+.. contents:: Contents
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+
+.. _wikipedia engine:
+
+Wikipedia
+=========
+
+.. automodule:: searx.engines.wikipedia
+  :members:
+
+.. _wikidata engine:
+
+Wikidata
+=========
+
+.. automodule:: searx.engines.wikidata
+  :members:
@@ -4,5 +4,17 @@
 Locales
 =======
 
+.. contents:: Contents
+   :depth: 2
+   :local:
+   :backlinks: entry
+
 .. automodule:: searx.locales
   :members:
+
+
+SearXNG's locale codes
+======================
+
+.. automodule:: searx.sxng_locales
+  :members:
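The *best fits* rules live in :py:obj:`searx.locales.get_engine_locale`, which the new engine traits delegate to.  A minimal sketch of an exact-match lookup (the mapping values are engine specific and only illustrative):

.. code:: python

   from searx.locales import get_engine_locale

   engine_regions = {'fr-BE': 'fr_BE', 'fr-FR': 'fr_FR'}
   get_engine_locale('fr-BE', engine_regions, default='en_US')   # -> 'fr_BE'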
docs/src/searx.search.processors.rst (new file, +47)
@@ -0,0 +1,47 @@
+.. _searx.search.processors:
+
+=================
+Search processors
+=================
+
+.. contents:: Contents
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+
+Abstract processor class
+========================
+
+.. automodule:: searx.search.processors.abstract
+  :members:
+
+Offline processor
+=================
+
+.. automodule:: searx.search.processors.offline
+  :members:
+
+Online processor
+================
+
+.. automodule:: searx.search.processors.online
+  :members:
+
+Online currency processor
+=========================
+
+.. automodule:: searx.search.processors.online_currency
+  :members:
+
+Online Dictionary processor
+===========================
+
+.. automodule:: searx.search.processors.online_dictionary
+  :members:
+
+Online URL search processor
+===========================
+
+.. automodule:: searx.search.processors.online_url_search
+  :members:
manage (2 changes)
@@ -63,7 +63,7 @@ PYLINT_SEARXNG_DISABLE_OPTION="\
 I,C,R,\
 W0105,W0212,W0511,W0603,W0613,W0621,W0702,W0703,W1401,\
 E1136"
-PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="supported_languages,language_aliases,logger,categories"
+PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="traits,supported_languages,language_aliases,logger,categories"
 PYLINT_OPTIONS="-m pylint -j 0 --rcfile .pylintrc"
 
 help() {
@@ -1,5 +1,5 @@
 certifi==2022.12.7
-babel==2.11.0
+babel==2.12.1
 flask-babel==3.0.1
 flask==2.2.3
 jinja2==3.1.2
@@ -5,20 +5,20 @@
 """
 # pylint: disable=use-dict-literal
 
-from json import loads
+import json
 from urllib.parse import urlencode
 
-from lxml import etree
+import lxml
 from httpx import HTTPError
 
 from searx import settings
-from searx.data import ENGINES_LANGUAGES
+from searx.engines import (
+    engines,
+    google,
+)
 from searx.network import get as http_get
 from searx.exceptions import SearxEngineResponseException
 
-# a fetch_supported_languages() for XPath engines isn't available right now
-# _brave = ENGINES_LANGUAGES['brave'].keys()
-
 
 def get(*args, **kwargs):
     if 'timeout' not in kwargs:
@@ -55,34 +55,58 @@ def dbpedia(query, _lang):
     results = []
 
     if response.ok:
-        dom = etree.fromstring(response.content)
+        dom = lxml.etree.fromstring(response.content)
         results = dom.xpath('//Result/Label//text()')
 
     return results
 
 
-def duckduckgo(query, _lang):
-    # duckduckgo autocompleter
-    url = 'https://ac.duckduckgo.com/ac/?{0}&type=list'
+def duckduckgo(query, sxng_locale):
+    """Autocomplete from DuckDuckGo. Supports DuckDuckGo's languages"""
 
-    resp = loads(get(url.format(urlencode(dict(q=query)))).text)
-    if len(resp) > 1:
-        return resp[1]
-    return []
+    traits = engines['duckduckgo'].traits
+    args = {
+        'q': query,
+        'kl': traits.get_region(sxng_locale, traits.all_locale),
+    }
+
+    url = 'https://duckduckgo.com/ac/?type=list&' + urlencode(args)
+    resp = get(url)
+
+    ret_val = []
+    if resp.ok:
+        j = resp.json()
+        if len(j) > 1:
+            ret_val = j[1]
+    return ret_val
 
 
-def google(query, lang):
-    # google autocompleter
-    autocomplete_url = 'https://suggestqueries.google.com/complete/search?client=toolbar&'
+def google_complete(query, sxng_locale):
+    """Autocomplete from Google.  Supports Google's languages and subdomains
+    (:py:obj:`searx.engines.google.get_google_info`) by using the async REST
+    API::
 
-    response = get(autocomplete_url + urlencode(dict(hl=lang, q=query)))
+        https://{subdomain}/complete/search?{args}
 
+    """
+
+    google_info = google.get_google_info({'searxng_locale': sxng_locale}, engines['google'].traits)
+
+    url = 'https://{subdomain}/complete/search?{args}'
+    args = urlencode(
+        {
+            'q': query,
+            'client': 'gws-wiz',
+            'hl': google_info['params']['hl'],
+        }
+    )
     results = []
-
-    if response.ok:
-        dom = etree.fromstring(response.text)
-        results = dom.xpath('//suggestion/@data')
-
+    resp = get(url.format(subdomain=google_info['subdomain'], args=args))
+    if resp.ok:
+        json_txt = resp.text[resp.text.find('[') : resp.text.find(']', -3) + 1]
+        data = json.loads(json_txt)
+        for item in data[0]:
+            results.append(lxml.html.fromstring(item[0]).text_content())
     return results
@@ -109,9 +133,9 @@ def seznam(query, _lang):
     ]
 
 
-def startpage(query, lang):
-    # startpage autocompleter
-    lui = ENGINES_LANGUAGES['startpage'].get(lang, 'english')
+def startpage(query, sxng_locale):
+    """Autocomplete from Startpage. Supports Startpage's languages"""
+    lui = engines['startpage'].traits.get_language(sxng_locale, 'english')
     url = 'https://startpage.com/suggestions?{query}'
     resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui})))
     data = resp.json()
@@ -122,20 +146,20 @@ def swisscows(query, _lang):
     # swisscows autocompleter
     url = 'https://swisscows.ch/api/suggest?{query}&itemsCount=5'
 
-    resp = loads(get(url.format(query=urlencode({'query': query}))).text)
+    resp = json.loads(get(url.format(query=urlencode({'query': query}))).text)
     return resp
 
 
-def qwant(query, lang):
-    # qwant autocompleter (additional parameter : lang=en_en&count=xxx )
-    url = 'https://api.qwant.com/api/suggest?{query}'
-
-    resp = get(url.format(query=urlencode({'q': query, 'lang': lang})))
-
+def qwant(query, sxng_locale):
+    """Autocomplete from Qwant. Supports Qwant's regions."""
     results = []
 
+    locale = engines['qwant'].traits.get_region(sxng_locale, 'en_US')
+    url = 'https://api.qwant.com/v3/suggest?{query}'
+    resp = get(url.format(query=urlencode({'q': query, 'locale': locale, 'version': '2'})))
+
     if resp.ok:
-        data = loads(resp.text)
+        data = resp.json()
         if data['status'] == 'success':
             for item in data['data']['items']:
                 results.append(item['value'])
@@ -143,21 +167,38 @@ def qwant(query, sxng_locale):
     return results
 
 
-def wikipedia(query, lang):
-    # wikipedia autocompleter
-    url = 'https://' + lang + '.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json'
+def wikipedia(query, sxng_locale):
+    """Autocomplete from Wikipedia. Supports Wikipedia's languages (aka netloc)."""
+    results = []
+    eng_traits = engines['wikipedia'].traits
+    wiki_lang = eng_traits.get_language(sxng_locale, 'en')
+    wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org')
 
-    resp = loads(get(url.format(urlencode(dict(search=query)))).text)
-    if len(resp) > 1:
-        return resp[1]
-    return []
+    url = 'https://{wiki_netloc}/w/api.php?{args}'
+    args = urlencode(
+        {
+            'action': 'opensearch',
+            'format': 'json',
+            'formatversion': '2',
+            'search': query,
+            'namespace': '0',
+            'limit': '10',
+        }
+    )
+    resp = get(url.format(args=args, wiki_netloc=wiki_netloc))
+    if resp.ok:
+        data = resp.json()
+        if len(data) > 1:
+            results = data[1]
+
+    return results
 
 
 def yandex(query, _lang):
     # yandex autocompleter
     url = "https://suggest.yandex.com/suggest-ff.cgi?{0}"
 
-    resp = loads(get(url.format(urlencode(dict(part=query)))).text)
+    resp = json.loads(get(url.format(urlencode(dict(part=query)))).text)
     if len(resp) > 1:
         return resp[1]
     return []
@@ -166,7 +207,7 @@ def yandex(query, _lang):
 backends = {
     'dbpedia': dbpedia,
     'duckduckgo': duckduckgo,
-    'google': google,
+    'google': google_complete,
     'seznam': seznam,
     'startpage': startpage,
     'swisscows': swisscows,
@@ -177,12 +218,11 @@ backends = {
 }
 
 
-def search_autocomplete(backend_name, query, lang):
+def search_autocomplete(backend_name, query, sxng_locale):
     backend = backends.get(backend_name)
     if backend is None:
         return []
-
     try:
-        return backend(query, lang)
+        return backend(query, sxng_locale)
     except (HTTPError, SearxEngineResponseException):
         return []
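Usage sketch of the reworked entry point (the suggestion list is only illustrative; the backends resolve the engine's region or language from its traits, so engines have to be loaded first):

.. code:: python

   from searx import settings
   from searx.engines import load_engines
   from searx.autocomplete import search_autocomplete

   load_engines(settings['engines'])
   search_autocomplete('duckduckgo', 'privacy', 'fr-BE')   # e.g. ['privacy', ...]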
@@ -7,7 +7,7 @@
 """
 
 __all__ = [
-    'ENGINES_LANGUAGES',
+    'ENGINE_TRAITS',
     'CURRENCIES',
     'USER_AGENTS',
     'EXTERNAL_URLS',
@@ -42,7 +42,6 @@ def ahmia_blacklist_loader():
         return f.read().split()
 
 
-ENGINES_LANGUAGES = _load('engines_languages.json')
 CURRENCIES = _load('currencies.json')
 USER_AGENTS = _load('useragents.json')
 EXTERNAL_URLS = _load('external_urls.json')
@@ -50,3 +49,4 @@ WIKIDATA_UNITS = _load('wikidata_units.json')
 EXTERNAL_BANGS = _load('external_bangs.json')
 OSM_KEYS_TAGS = _load('osm_keys_tags.json')
 ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
+ENGINE_TRAITS = _load('engine_traits.json')
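One entry of ``engine_traits.json`` mirrors the fields of the :py:obj:`EngineTraits` dataclass added below; a sketch with illustrative values:

.. code:: python

   ENGINE_TRAITS['example engine'] = {
       'all_locale': 'wt-wt',            # engine's value for SearXNG's ``all``
       'data_type': 'traits_v1',
       'languages': {'fr': 'fr'},        # SearXNG language --> engine language
       'regions': {'fr-BE': 'fr-be'},    # SearXNG region --> engine region
       'custom': {},
   }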
searx/data/engine_traits.json (new file, 3810 lines)
(File diff suppressed because it is too large)
searx/enginelib/__init__.py (new file, +136)
@@ -0,0 +1,136 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Engine related implementations
+
+.. note::
+
+   The long term goal is to modularize all relevant implementations to the
+   engines here in this Python package.  In addition to improved modularization,
+   this will also be necessary in part because the probability of circular
+   imports will increase due to the increased typification of implementations in
+   the future.
+
+   ToDo:
+
+   - move :py:obj:`searx.engines.load_engine` to a new module `searx.enginelib`.
+"""
+
+
+from __future__ import annotations
+from typing import Union, Dict, List, Callable, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from searx.enginelib import traits
+
+
+class Engine:  # pylint: disable=too-few-public-methods
+    """Class of engine instances built from YAML settings.
+
+    Further documentation see :ref:`general engine configuration`.
+
+    .. hint::
+
+       This class is currently never initialized and only used for type hinting.
+    """
+
+    # Common options in the engine module
+
+    engine_type: str
+    """Type of the engine (:origin:`searx/search/processors`)"""
+
+    paging: bool
+    """Engine supports multiple pages."""
+
+    time_range_support: bool
+    """Engine supports search time range."""
+
+    safesearch: bool
+    """Engine supports SafeSearch"""
+
+    language_support: bool
+    """Engine supports languages (locales) search."""
+
+    language: str
+    """For an engine, when there is ``language: ...`` in the YAML settings the engine
+    does support only this one language:
+
+    .. code:: yaml
+
+      - name: google french
+        engine: google
+        language: fr
+    """
+
+    region: str
+    """For an engine, when there is ``region: ...`` in the YAML settings the engine
+    does support only this one region:
+
+    .. code:: yaml
+
+      - name: google belgium
+        engine: google
+        region: fr-BE
+    """
+
+    fetch_traits: Callable
+    """Function to fetch engine's traits from origin."""
+
+    traits: traits.EngineTraits
+    """Traits of the engine."""
+
+    # settings.yml
+
+    categories: List[str]
+    """Tabs, in which the engine is working."""
+
+    name: str
+    """Name that will be used across SearXNG to define this engine.  In settings, on
+    the result page .."""
+
+    engine: str
+    """Name of the python file used to handle requests and responses to and from
+    this search engine (file name from :origin:`searx/engines` without
+    ``.py``)."""
+
+    enable_http: bool
+    """Enable HTTP (by default only HTTPS is enabled)."""
+
+    shortcut: str
+    """Code used to execute bang requests (``!foo``)"""
+
+    timeout: float
+    """Specific timeout for search-engine."""
+
+    display_error_messages: bool
+    """Display error messages on the web UI."""
+
+    proxies: dict
+    """Set proxies for a specific engine (YAML):
+
+    .. code:: yaml
+
+       proxies :
+         http:  socks5://proxy:port
+         https: socks5://proxy:port
+    """
+
+    disabled: bool
+    """To disable by default the engine, but not deleting it.  It will allow the
+    user to manually activate it in the settings."""
+
+    inactive: bool
+    """Remove the engine from the settings (*disabled & removed*)."""
+
+    about: dict
+    """Additional fields describing the engine.
+
+    .. code:: yaml
+
+       about:
+          website: https://example.com
+          wikidata_id: Q306656
+          official_api_documentation: https://example.com/api-doc
+          use_official_api: true
+          require_api_key: true
+          results: HTML
+    """
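Since :py:obj:`Engine` is only a typing shim, a typical use is annotating helpers that receive a loaded engine namespace (a sketch):

.. code:: python

   from searx.enginelib import Engine

   def dump_locales(engine: Engine) -> None:
       # ``engine`` is a module namespace built by searx.engines.load_engine()
       print(engine.name, sorted(engine.traits.regions), sorted(engine.traits.languages))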
searx/enginelib/traits.py (new file, +250)
							| @ -0,0 +1,250 @@ | |||||||
|  | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
|  | # lint: pylint | ||||||
|  | """Engine's traits are fetched from the origin engines and stored in a JSON file | ||||||
|  | in the *data folder*.  Most often traits are languages and region codes and | ||||||
|  | their mapping from SearXNG's representation to the representation in the origin | ||||||
|  | search engine.  For new traits new properties can be added to the class | ||||||
|  | :py:class:`EngineTraits`. | ||||||
|  | 
 | ||||||
|  | To load traits from the persistence :py:obj:`EngineTraitsMap.from_data` can be | ||||||
|  | used. | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | from __future__ import annotations | ||||||
|  | import json | ||||||
|  | import dataclasses | ||||||
|  | from typing import Dict, Union, Callable, Optional, TYPE_CHECKING | ||||||
|  | from typing_extensions import Literal, Self | ||||||
|  | 
 | ||||||
|  | from searx import locales | ||||||
|  | from searx.data import data_dir, ENGINE_TRAITS | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     from . import Engine | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class EngineTraitsEncoder(json.JSONEncoder): | ||||||
|  |     """Encodes :class:`EngineTraits` to a serializable object, see | ||||||
|  |     :class:`json.JSONEncoder`.""" | ||||||
|  | 
 | ||||||
|  |     def default(self, o): | ||||||
|  |         """Return dictionary of a :class:`EngineTraits` object.""" | ||||||
|  |         if isinstance(o, EngineTraits): | ||||||
|  |             return o.__dict__ | ||||||
|  |         return super().default(o) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @dataclasses.dataclass | ||||||
|  | class EngineTraits: | ||||||
|  |     """The class is intended to be instantiated for each engine.""" | ||||||
|  | 
 | ||||||
|  |     regions: Dict[str, str] = dataclasses.field(default_factory=dict) | ||||||
|  |     """Maps SearXNG's internal representation of a region to the one of the engine. | ||||||
|  | 
 | ||||||
|  |     SearXNG's internal representation can be parsed by babel and the value is | ||||||
|  |     send to the engine: | ||||||
|  | 
 | ||||||
|  |     .. code:: python | ||||||
|  | 
 | ||||||
|  |        regions ={ | ||||||
|  |            'fr-BE' : <engine's region name>, | ||||||
|  |        } | ||||||
|  | 
 | ||||||
|  |        for key, egnine_region regions.items(): | ||||||
|  |           searxng_region = babel.Locale.parse(key, sep='-') | ||||||
|  |           ... | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     languages: Dict[str, str] = dataclasses.field(default_factory=dict) | ||||||
|  |     """Maps SearXNG's internal representation of a language to the one of the engine. | ||||||
|  | 
 | ||||||
|  |     SearXNG's internal representation can be parsed by babel and the value is | ||||||
|  |     send to the engine: | ||||||
|  | 
 | ||||||
|  |     .. code:: python | ||||||
|  | 
 | ||||||
|  |        languages = { | ||||||
|  |            'ca' : <engine's language name>, | ||||||
|  |        } | ||||||
|  | 
 | ||||||
|  |        for key, egnine_lang in languages.items(): | ||||||
|  |           searxng_lang = babel.Locale.parse(key) | ||||||
|  |           ... | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     all_locale: Optional[str] = None | ||||||
|  |     """To which locale value SearXNG's ``all`` language is mapped (shown a "Default | ||||||
|  |     language"). | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     data_type: Literal['traits_v1'] = 'traits_v1' | ||||||
|  |     """Data type, default is 'traits_v1'. | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     custom: Dict[str, Dict] = dataclasses.field(default_factory=dict) | ||||||
|  |     """A place to store engine's custom traits, not related to the SearXNG core | ||||||
|  | 
 | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     def get_language(self, searxng_locale: str, default=None): | ||||||
|  |         """Return engine's language string that *best fits* to SearXNG's locale. | ||||||
|  | 
 | ||||||
|  |         :param searxng_locale: SearXNG's internal representation of locale | ||||||
|  |           selected by the user. | ||||||
|  | 
 | ||||||
|  |         :param default: engine's default language | ||||||
|  | 
 | ||||||
|  |         The *best fits* rules are implemented in | ||||||
|  |         :py:obj:`locales.get_engine_locale`.  Except for the special value ``all`` | ||||||
|  |         which is determined from :py:obj`EngineTraits.all_language`. | ||||||
|  |         """ | ||||||
|  |         if searxng_locale == 'all' and self.all_locale is not None: | ||||||
|  |             return self.all_locale | ||||||
|  |         return locales.get_engine_locale(searxng_locale, self.languages, default=default) | ||||||
|  | 
 | ||||||
|  |     def get_region(self, searxng_locale: str, default=None): | ||||||
|  |         """Return engine's region string that best fits to SearXNG's locale. | ||||||
|  | 
 | ||||||
|  |         :param searxng_locale: SearXNG's internal representation of locale | ||||||
|  |           selected by the user. | ||||||
|  | 
 | ||||||
|  |         :param default: engine's default region | ||||||
|  | 
 | ||||||
|  |         The *best fits* rules are implemented in | ||||||
|  |         :py:obj:`locales.get_engine_locale`.  Except for the special value ``all`` | ||||||
|  |         which is determined from :py:obj`EngineTraits.all_language`. | ||||||
|  |         """ | ||||||
|  |         if searxng_locale == 'all' and self.all_locale is not None: | ||||||
|  |             return self.all_locale | ||||||
|  |         return locales.get_engine_locale(searxng_locale, self.regions, default=default) | ||||||
|  | 
 | ||||||
|  |     def is_locale_supported(self, searxng_locale: str) -> bool: | ||||||
|  |         """A *locale* (SearXNG's internal representation) is considered to be supported | ||||||
|  |         by the engine if the *region* or the *language* is supported by the | ||||||
|  |         engine.  For verification the functions :py:func:`self.get_region` and | ||||||
|  |         :py:func:`self.get_region` are used. | ||||||
|  |         """ | ||||||
|  |         if self.data_type == 'traits_v1': | ||||||
|  |             return bool(self.get_region(searxng_locale) or self.get_language(searxng_locale)) | ||||||
|  | 
 | ||||||
|  |         raise TypeError('engine traits of type %s is unknown' % self.data_type) | ||||||
|  | 
 | ||||||
|  |     def copy(self): | ||||||
|  |         """Create a copy of the dataclass object.""" | ||||||
|  |         return EngineTraits(**dataclasses.asdict(self)) | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def fetch_traits(cls, engine: Engine) -> Union[Self, None]: | ||||||
|  |         """Call a function ``fetch_traits(engine_traits)`` from engines namespace to fetch | ||||||
|  |         and set properties from the origin engine in the object ``engine_traits``.  If | ||||||
|  |         function does not exists, ``None`` is returned. | ||||||
|  |         """ | ||||||
|  | 
 | ||||||
|  |         fetch_traits = getattr(engine, 'fetch_traits', None) | ||||||
|  |         engine_traits = None | ||||||
|  | 
 | ||||||
|  |         if fetch_traits: | ||||||
|  |             engine_traits = cls() | ||||||
|  |             fetch_traits(engine_traits) | ||||||
|  |         return engine_traits | ||||||
|  | 
 | ||||||
|  |     def set_traits(self, engine: Engine): | ||||||
|  |         """Set traits from self object in a :py:obj:`.Engine` namespace. | ||||||
|  | 
 | ||||||
|  |         :param engine: engine instance build by :py:func:`searx.engines.load_engine` | ||||||
|  |         """ | ||||||
|  | 
 | ||||||
|  |         if self.data_type == 'traits_v1': | ||||||
|  |             self._set_traits_v1(engine) | ||||||
|  |         else: | ||||||
|  |             raise TypeError('engine traits of type %s is unknown' % self.data_type) | ||||||
|  | 
 | ||||||
|  |     def _set_traits_v1(self, engine: Engine): | ||||||
|  |         # For an engine, when there is `language: ...` in the YAML settings the engine | ||||||
|  |         # does support only this one language (region):: | ||||||
|  |         # | ||||||
|  |         #   - name: google italian | ||||||
|  |         #     engine: google | ||||||
|  |         #     language: it | ||||||
|  |         #     region: it-IT | ||||||
|  | 
 | ||||||
|  |         traits = self.copy() | ||||||
|  | 
 | ||||||
|  |         _msg = "settings.yml - engine: '%s' / %s: '%s' not supported" | ||||||
|  | 
 | ||||||
|  |         languages = traits.languages | ||||||
|  |         if hasattr(engine, 'language'): | ||||||
|  |             if engine.language not in languages: | ||||||
|  |                 raise ValueError(_msg % (engine.name, 'language', engine.language)) | ||||||
|  |             traits.languages = {engine.language: languages[engine.language]} | ||||||
|  | 
 | ||||||
|  |         regions = traits.regions | ||||||
|  |         if hasattr(engine, 'region'): | ||||||
|  |             if engine.region not in regions: | ||||||
|  |                 raise ValueError(_msg % (engine.name, 'region', engine.region)) | ||||||
|  |             traits.regions = {engine.region: regions[engine.region]} | ||||||
|  | 
 | ||||||
|  |         engine.language_support = bool(traits.languages or traits.regions) | ||||||
|  | 
 | ||||||
|  |         # set the copied & modified traits in engine's namespace | ||||||
|  |         engine.traits = traits | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class EngineTraitsMap(Dict[str, EngineTraits]): | ||||||
|  |     """A python dictionary to map :class:`EngineTraits` by engine name.""" | ||||||
|  | 
 | ||||||
|  |     ENGINE_TRAITS_FILE = (data_dir / 'engine_traits.json').resolve() | ||||||
|  |     """File with persistence of the :py:obj:`EngineTraitsMap`.""" | ||||||
|  | 
 | ||||||
|  |     def save_data(self): | ||||||
|  |         """Store EngineTraitsMap in in file :py:obj:`self.ENGINE_TRAITS_FILE`""" | ||||||
|  |         with open(self.ENGINE_TRAITS_FILE, 'w', encoding='utf-8') as f: | ||||||
|  |             json.dump(self, f, indent=2, sort_keys=True, cls=EngineTraitsEncoder) | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def from_data(cls) -> Self: | ||||||
|  |         """Instantiate :class:`EngineTraitsMap` object from :py:obj:`ENGINE_TRAITS`""" | ||||||
|  |         obj = cls() | ||||||
|  |         for k, v in ENGINE_TRAITS.items(): | ||||||
|  |             obj[k] = EngineTraits(**v) | ||||||
|  |         return obj | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def fetch_traits(cls, log: Callable) -> Self: | ||||||
|  |         from searx import engines  # pylint: disable=cyclic-import, import-outside-toplevel | ||||||
|  | 
 | ||||||
|  |         names = list(engines.engines) | ||||||
|  |         names.sort() | ||||||
|  |         obj = cls() | ||||||
|  | 
 | ||||||
|  |         for engine_name in names: | ||||||
|  |             engine = engines.engines[engine_name] | ||||||
|  | 
 | ||||||
|  |             traits = EngineTraits.fetch_traits(engine) | ||||||
|  |             if traits is not None: | ||||||
|  |                 log("%-20s: SearXNG languages --> %s " % (engine_name, len(traits.languages))) | ||||||
|  |                 log("%-20s: SearXNG regions   --> %s" % (engine_name, len(traits.regions))) | ||||||
|  |                 obj[engine_name] = traits | ||||||
|  | 
 | ||||||
|  |         return obj | ||||||
|  | 
 | ||||||
|  |     def set_traits(self, engine: Engine): | ||||||
|  |         """Set traits in a :py:obj:`Engine` namespace. | ||||||
|  | 
 | ||||||
|  |         :param engine: engine instance built by :py:func:`searx.engines.load_engine` | ||||||
|  |         """ | ||||||
|  | 
 | ||||||
|  |         engine_traits = EngineTraits(data_type='traits_v1') | ||||||
|  |         if engine.name in self.keys(): | ||||||
|  |             engine_traits = self[engine.name] | ||||||
|  | 
 | ||||||
|  |         elif engine.engine in self.keys(): | ||||||
|  |             # The key of the dictionary traits_map is the *engine name* | ||||||
|  |             # configured in settings.yml.  When multiple engines are configured | ||||||
|  |             # in settings.yml to use the same origin engine (python module) | ||||||
|  |             # these additional engines can use the languages from the origin | ||||||
|  |             # engine.  For this use the configured ``engine: ...`` from | ||||||
|  |             # settings.yml | ||||||
|  |             engine_traits = self[engine.engine] | ||||||
|  | 
 | ||||||
|  |         engine_traits.set_traits(engine) | ||||||
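To make the mechanics above concrete, a minimal sketch (the YAML values and assertion results are illustrative, not taken from a real settings.yml):

.. code:: python

   from searx.engines import load_engine

   # hypothetical engine configuration, as it would come from settings.yml
   engine = load_engine({
       'name': 'google italian', 'engine': 'google',
       'shortcut': 'goit', 'language': 'it', 'region': 'it-IT',
   })
   # load_engine() applies EngineTraitsMap.set_traits(); the copied traits
   # are narrowed to the pinned language / region
   assert list(engine.traits.languages) == ['it']
   assert list(engine.traits.regions) == ['it-IT']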
| @ -11,24 +11,22 @@ usage:: | |||||||
| 
 | 
 | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
|  | from __future__ import annotations | ||||||
|  | 
 | ||||||
| import sys | import sys | ||||||
| import copy | import copy | ||||||
| from typing import Dict, List, Optional |  | ||||||
| 
 |  | ||||||
| from os.path import realpath, dirname | from os.path import realpath, dirname | ||||||
| from babel.localedata import locale_identifiers |  | ||||||
| from searx import logger, settings |  | ||||||
| from searx.data import ENGINES_LANGUAGES |  | ||||||
| from searx.network import get |  | ||||||
| from searx.utils import load_module, match_language, gen_useragent |  | ||||||
| 
 | 
 | ||||||
|  | from typing import TYPE_CHECKING, Dict, Optional | ||||||
|  | 
 | ||||||
|  | from searx import logger, settings | ||||||
|  | from searx.utils import load_module | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     from searx.enginelib import Engine | ||||||
| 
 | 
 | ||||||
| logger = logger.getChild('engines') | logger = logger.getChild('engines') | ||||||
| ENGINE_DIR = dirname(realpath(__file__)) | ENGINE_DIR = dirname(realpath(__file__)) | ||||||
| BABEL_LANGS = [ |  | ||||||
|     lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0] |  | ||||||
|     for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers()) |  | ||||||
| ] |  | ||||||
| ENGINE_DEFAULT_ARGS = { | ENGINE_DEFAULT_ARGS = { | ||||||
|     "engine_type": "online", |     "engine_type": "online", | ||||||
|     "inactive": False, |     "inactive": False, | ||||||
| @ -36,8 +34,6 @@ ENGINE_DEFAULT_ARGS = { | |||||||
|     "timeout": settings["outgoing"]["request_timeout"], |     "timeout": settings["outgoing"]["request_timeout"], | ||||||
|     "shortcut": "-", |     "shortcut": "-", | ||||||
|     "categories": ["general"], |     "categories": ["general"], | ||||||
|     "supported_languages": [], |  | ||||||
|     "language_aliases": {}, |  | ||||||
|     "paging": False, |     "paging": False, | ||||||
|     "safesearch": False, |     "safesearch": False, | ||||||
|     "time_range_support": False, |     "time_range_support": False, | ||||||
| @ -52,24 +48,6 @@ ENGINE_DEFAULT_ARGS = { | |||||||
| OTHER_CATEGORY = 'other' | OTHER_CATEGORY = 'other' | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Engine:  # pylint: disable=too-few-public-methods |  | ||||||
|     """This class is currently never initialized and only used for type hinting.""" |  | ||||||
| 
 |  | ||||||
|     name: str |  | ||||||
|     engine: str |  | ||||||
|     shortcut: str |  | ||||||
|     categories: List[str] |  | ||||||
|     supported_languages: List[str] |  | ||||||
|     about: dict |  | ||||||
|     inactive: bool |  | ||||||
|     disabled: bool |  | ||||||
|     language_support: bool |  | ||||||
|     paging: bool |  | ||||||
|     safesearch: bool |  | ||||||
|     time_range_support: bool |  | ||||||
|     timeout: float |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Defaults for the namespace of an engine module, see :py:func:`load_engine` | # Defaults for the namespace of an engine module, see :py:func:`load_engine` | ||||||
| 
 | 
 | ||||||
| categories = {'general': []} | categories = {'general': []} | ||||||
| @ -136,9 +114,15 @@ def load_engine(engine_data: dict) -> Optional[Engine]: | |||||||
|         return None |         return None | ||||||
| 
 | 
 | ||||||
|     update_engine_attributes(engine, engine_data) |     update_engine_attributes(engine, engine_data) | ||||||
|     set_language_attributes(engine) |  | ||||||
|     update_attributes_for_tor(engine) |     update_attributes_for_tor(engine) | ||||||
| 
 | 
 | ||||||
|  |     # avoid cyclic imports | ||||||
|  |     # pylint: disable=import-outside-toplevel | ||||||
|  |     from searx.enginelib.traits import EngineTraitsMap | ||||||
|  | 
 | ||||||
|  |     trait_map = EngineTraitsMap.from_data() | ||||||
|  |     trait_map.set_traits(engine) | ||||||
|  | 
 | ||||||
|     if not is_engine_active(engine): |     if not is_engine_active(engine): | ||||||
|         return None |         return None | ||||||
| 
 | 
 | ||||||
| @ -190,60 +174,6 @@ def update_engine_attributes(engine: Engine, engine_data): | |||||||
|             setattr(engine, arg_name, copy.deepcopy(arg_value)) |             setattr(engine, arg_name, copy.deepcopy(arg_value)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def set_language_attributes(engine: Engine): |  | ||||||
|     # assign supported languages from json file |  | ||||||
|     if engine.name in ENGINES_LANGUAGES: |  | ||||||
|         engine.supported_languages = ENGINES_LANGUAGES[engine.name] |  | ||||||
| 
 |  | ||||||
|     elif engine.engine in ENGINES_LANGUAGES: |  | ||||||
|         # The key of the dictionary ENGINES_LANGUAGES is the *engine name* |  | ||||||
|         # configured in settings.xml.  When multiple engines are configured in |  | ||||||
|         # settings.yml to use the same origin engine (python module) these |  | ||||||
|         # additional engines can use the languages from the origin engine. |  | ||||||
|         # For this use the configured ``engine: ...`` from settings.yml |  | ||||||
|         engine.supported_languages = ENGINES_LANGUAGES[engine.engine] |  | ||||||
| 
 |  | ||||||
|     if hasattr(engine, 'language'): |  | ||||||
|         # For an engine, when there is `language: ...` in the YAML settings, the |  | ||||||
|         # engine supports only one language, in this case |  | ||||||
|         # engine.supported_languages should contains this value defined in |  | ||||||
|         # settings.yml |  | ||||||
|         if engine.language not in engine.supported_languages: |  | ||||||
|             raise ValueError( |  | ||||||
|                 "settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language) |  | ||||||
|             ) |  | ||||||
| 
 |  | ||||||
|         if isinstance(engine.supported_languages, dict): |  | ||||||
|             engine.supported_languages = {engine.language: engine.supported_languages[engine.language]} |  | ||||||
|         else: |  | ||||||
|             engine.supported_languages = [engine.language] |  | ||||||
| 
 |  | ||||||
|     # find custom aliases for non standard language codes |  | ||||||
|     for engine_lang in engine.supported_languages: |  | ||||||
|         iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None) |  | ||||||
|         if ( |  | ||||||
|             iso_lang |  | ||||||
|             and iso_lang != engine_lang |  | ||||||
|             and not engine_lang.startswith(iso_lang) |  | ||||||
|             and iso_lang not in engine.supported_languages |  | ||||||
|         ): |  | ||||||
|             engine.language_aliases[iso_lang] = engine_lang |  | ||||||
| 
 |  | ||||||
|     # language_support |  | ||||||
|     engine.language_support = len(engine.supported_languages) > 0 |  | ||||||
| 
 |  | ||||||
|     # assign language fetching method if auxiliary method exists |  | ||||||
|     if hasattr(engine, '_fetch_supported_languages'): |  | ||||||
|         headers = { |  | ||||||
|             'User-Agent': gen_useragent(), |  | ||||||
|             'Accept-Language': "en-US,en;q=0.5",  # bing needs to set the English language |  | ||||||
|         } |  | ||||||
|         engine.fetch_supported_languages = ( |  | ||||||
|             # pylint: disable=protected-access |  | ||||||
|             lambda: engine._fetch_supported_languages(get(engine.supported_languages_url, headers=headers)) |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def update_attributes_for_tor(engine: Engine) -> bool: | def update_attributes_for_tor(engine: Engine) -> bool: | ||||||
|     if using_tor_proxy(engine) and hasattr(engine, 'onion_url'): |     if using_tor_proxy(engine) and hasattr(engine, 'onion_url'): | ||||||
|         engine.search_url = engine.onion_url + getattr(engine, 'search_path', '') |         engine.search_url = engine.onion_url + getattr(engine, 'search_path', '') | ||||||
|  | |||||||
| @ -1,15 +1,32 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
|  | # lint: pylint | ||||||
| """ | """ | ||||||
|  Arch Linux Wiki | Arch Linux Wiki | ||||||
|  | ~~~~~~~~~~~~~~~ | ||||||
|  | 
 | ||||||
|  | This implementation does not use an official API: MediaWiki provides an API, | ||||||
|  | but the Arch Wiki blocks access to it. | ||||||
| 
 | 
 | ||||||
|  API: Mediawiki provides API, but Arch Wiki blocks access to it |  | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| from urllib.parse import urlencode, urljoin | from typing import TYPE_CHECKING | ||||||
| from lxml import html | from urllib.parse import urlencode, urljoin, urlparse | ||||||
|  | import lxml | ||||||
|  | import babel | ||||||
|  | 
 | ||||||
|  | from searx import network | ||||||
| from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex | from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex | ||||||
|  | from searx.enginelib.traits import EngineTraits | ||||||
|  | from searx.locales import language_tag | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| # about |  | ||||||
| about = { | about = { | ||||||
|     "website": 'https://wiki.archlinux.org/', |     "website": 'https://wiki.archlinux.org/', | ||||||
|     "wikidata_id": 'Q101445877', |     "wikidata_id": 'Q101445877', | ||||||
| @ -22,125 +39,113 @@ about = { | |||||||
| # engine dependent config | # engine dependent config | ||||||
| categories = ['it', 'software wikis'] | categories = ['it', 'software wikis'] | ||||||
| paging = True | paging = True | ||||||
| base_url = 'https://wiki.archlinux.org' | main_wiki = 'wiki.archlinux.org' | ||||||
| 
 |  | ||||||
| # xpath queries |  | ||||||
| xpath_results = '//ul[@class="mw-search-results"]/li' |  | ||||||
| xpath_link = './/div[@class="mw-search-result-heading"]/a' |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # cut 'en' from 'en-US', 'de' from 'de-CH', and so on |  | ||||||
| def locale_to_lang_code(locale): |  | ||||||
|     if locale.find('-') >= 0: |  | ||||||
|         locale = locale.split('-')[0] |  | ||||||
|     return locale |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # wikis for some languages were moved off from the main site, we need to make |  | ||||||
| # requests to correct URLs to be able to get results in those languages |  | ||||||
| lang_urls = { |  | ||||||
|     # fmt: off |  | ||||||
|     'all': { |  | ||||||
|         'base': 'https://wiki.archlinux.org', |  | ||||||
|         'search': '/index.php?title=Special:Search&offset={offset}&{query}' |  | ||||||
|     }, |  | ||||||
|     'de': { |  | ||||||
|         'base': 'https://wiki.archlinux.de', |  | ||||||
|         'search': '/index.php?title=Spezial:Suche&offset={offset}&{query}' |  | ||||||
|     }, |  | ||||||
|     'fr': { |  | ||||||
|         'base': 'https://wiki.archlinux.fr', |  | ||||||
|         'search': '/index.php?title=Spécial:Recherche&offset={offset}&{query}' |  | ||||||
|     }, |  | ||||||
|     'ja': { |  | ||||||
|         'base': 'https://wiki.archlinuxjp.org', |  | ||||||
|         'search': '/index.php?title=特別:検索&offset={offset}&{query}' |  | ||||||
|     }, |  | ||||||
|     'ro': { |  | ||||||
|         'base': 'http://wiki.archlinux.ro', |  | ||||||
|         'search': '/index.php?title=Special:Căutare&offset={offset}&{query}' |  | ||||||
|     }, |  | ||||||
|     'tr': { |  | ||||||
|         'base': 'http://archtr.org/wiki', |  | ||||||
|         'search': '/index.php?title=Özel:Ara&offset={offset}&{query}' |  | ||||||
|     } |  | ||||||
|     # fmt: on |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # get base & search URLs for selected language |  | ||||||
| def get_lang_urls(language): |  | ||||||
|     if language in lang_urls: |  | ||||||
|         return lang_urls[language] |  | ||||||
|     return lang_urls['all'] |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Language names to build search requests for |  | ||||||
| # those languages which are hosted on the main site. |  | ||||||
| main_langs = { |  | ||||||
|     'ar': 'العربية', |  | ||||||
|     'bg': 'Български', |  | ||||||
|     'cs': 'Česky', |  | ||||||
|     'da': 'Dansk', |  | ||||||
|     'el': 'Ελληνικά', |  | ||||||
|     'es': 'Español', |  | ||||||
|     'he': 'עברית', |  | ||||||
|     'hr': 'Hrvatski', |  | ||||||
|     'hu': 'Magyar', |  | ||||||
|     'it': 'Italiano', |  | ||||||
|     'ko': '한국어', |  | ||||||
|     'lt': 'Lietuviškai', |  | ||||||
|     'nl': 'Nederlands', |  | ||||||
|     'pl': 'Polski', |  | ||||||
|     'pt': 'Português', |  | ||||||
|     'ru': 'Русский', |  | ||||||
|     'sl': 'Slovenský', |  | ||||||
|     'th': 'ไทย', |  | ||||||
|     'uk': 'Українська', |  | ||||||
|     'zh': '简体中文', |  | ||||||
| } |  | ||||||
| supported_languages = dict(lang_urls, **main_langs) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # do search-request |  | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     # translate the locale (e.g. 'en-US') to language code ('en') |  | ||||||
|     language = locale_to_lang_code(params['language']) |  | ||||||
| 
 | 
 | ||||||
|     # if our language is hosted on the main site, we need to add its name |     sxng_lang = params['searxng_locale'].split('-')[0] | ||||||
|     # to the query in order to narrow the results to that language |     netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) | ||||||
|     if language in main_langs: |     title = traits.custom['title'].get(sxng_lang, 'Special:Search') | ||||||
|         query += ' (' + main_langs[language] + ')' |     base_url = 'https://' + netloc + '/index.php?' | ||||||
| 
 |  | ||||||
|     # prepare the request parameters |  | ||||||
|     query = urlencode({'search': query}) |  | ||||||
|     offset = (params['pageno'] - 1) * 20 |     offset = (params['pageno'] - 1) * 20 | ||||||
| 
 | 
 | ||||||
|     # get request URLs for our language of choice |     if netloc == main_wiki: | ||||||
|     urls = get_lang_urls(language) |         eng_lang: str = traits.get_language(sxng_lang, 'English') | ||||||
|     search_url = urls['base'] + urls['search'] |         query += ' (' + eng_lang + ')' | ||||||
|  |     elif netloc == 'wiki.archlinuxcn.org': | ||||||
|  |         base_url = 'https://' + netloc + '/wzh/index.php?' | ||||||
| 
 | 
 | ||||||
|     params['url'] = search_url.format(query=query, offset=offset) |     args = { | ||||||
|  |         'search': query, | ||||||
|  |         'title': title, | ||||||
|  |         'limit': 20, | ||||||
|  |         'offset': offset, | ||||||
|  |         'profile': 'default', | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|  |     params['url'] = base_url + urlencode(args) | ||||||
|     return params |     return params | ||||||
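A rough sketch of the URL this assembles for a German query, assuming the fetched traits map ``de`` to ``wiki.archlinux.de`` (values illustrative):

.. code:: python

   from urllib.parse import urlencode

   # hypothetical excerpt of the fetched traits
   wiki_netloc = {'de': 'wiki.archlinux.de'}
   title = {'de': 'Spezial:Suche'}

   sxng_lang = 'de-DE'.split('-')[0]
   args = {
       'search': 'arch linux', 'title': title[sxng_lang],
       'limit': 20, 'offset': 0, 'profile': 'default',
   }
   url = 'https://' + wiki_netloc[sxng_lang] + '/index.php?' + urlencode(args)
   # -> https://wiki.archlinux.de/index.php?search=arch+linux&title=Spezial%3ASuche&limit=20&offset=0&profile=default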
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # get response from search-request |  | ||||||
| def response(resp): | def response(resp): | ||||||
|     # get the base URL for the language in which request was made |  | ||||||
|     language = locale_to_lang_code(resp.search_params['language']) |  | ||||||
|     base_url = get_lang_urls(language)['base'] |  | ||||||
| 
 | 
 | ||||||
|     results = [] |     results = [] | ||||||
|  |     dom = lxml.html.fromstring(resp.text) | ||||||
| 
 | 
 | ||||||
|     dom = html.fromstring(resp.text) |     # get the base URL for the language in which request was made | ||||||
|  |     sxng_lang = resp.search_params['searxng_locale'].split('-')[0] | ||||||
|  |     netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) | ||||||
|  |     base_url = 'https://' + netloc + '/index.php?' | ||||||
| 
 | 
 | ||||||
|     # parse results |     for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'): | ||||||
|     for result in eval_xpath_list(dom, xpath_results): |         link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0) | ||||||
|         link = eval_xpath_getindex(result, xpath_link, 0) |         content = extract_text(result.xpath('.//div[@class="searchresult"]')) | ||||||
|         href = urljoin(base_url, link.attrib.get('href')) |         results.append( | ||||||
|         title = extract_text(link) |             { | ||||||
| 
 |                 'url': urljoin(base_url, link.get('href')), | ||||||
|         results.append({'url': href, 'title': title}) |                 'title': extract_text(link), | ||||||
|  |                 'content': content, | ||||||
|  |             } | ||||||
|  |         ) | ||||||
| 
 | 
 | ||||||
|     return results |     return results | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def fetch_traits(engine_traits: EngineTraits): | ||||||
|  |     """Fetch languages from Archlinix-Wiki.  The location of the Wiki address of a | ||||||
|  |     language is mapped in a :py:obj:`custom field | ||||||
|  |     <searx.enginelib.traits.EngineTraits.custom>` (``wiki_netloc``).  Depending | ||||||
|  |     on the location, the ``title`` argument in the request is translated. | ||||||
|  | 
 | ||||||
|  |     .. code:: python | ||||||
|  | 
 | ||||||
|  |        "custom": { | ||||||
|  |          "wiki_netloc": { | ||||||
|  |            "de": "wiki.archlinux.de", | ||||||
|  |             # ... | ||||||
|  |            "zh": "wiki.archlinuxcn.org" | ||||||
|  |          }, | ||||||
|  |          "title": { | ||||||
|  |            "de": "Spezial:Suche", | ||||||
|  |             # ... | ||||||
|  |            "zh": "Special:\u641c\u7d22" | ||||||
|  |          }, | ||||||
|  |        }, | ||||||
|  | 
 | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     engine_traits.custom['wiki_netloc'] = {} | ||||||
|  |     engine_traits.custom['title'] = {} | ||||||
|  | 
 | ||||||
|  |     title_map = { | ||||||
|  |         'de': 'Spezial:Suche', | ||||||
|  |         'fa': 'ویژه:جستجو', | ||||||
|  |         'ja': '特別:検索', | ||||||
|  |         'zh': 'Special:搜索', | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     resp = network.get('https://wiki.archlinux.org/') | ||||||
|  |     if not resp.ok: | ||||||
|  |         print("ERROR: response from wiki.archlinix.org is not OK.") | ||||||
|  | 
 | ||||||
|  |     dom = lxml.html.fromstring(resp.text) | ||||||
|  |     for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"): | ||||||
|  | 
 | ||||||
|  |         sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-')) | ||||||
|  |         # zh_Hans --> zh | ||||||
|  |         sxng_tag = sxng_tag.split('_')[0] | ||||||
|  | 
 | ||||||
|  |         netloc = urlparse(a.get('href')).netloc | ||||||
|  |         if netloc != 'wiki.archlinux.org': | ||||||
|  |             title = title_map.get(sxng_tag) | ||||||
|  |             if not title: | ||||||
|  |                 print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag)) | ||||||
|  |                 continue | ||||||
|  |             engine_traits.custom['wiki_netloc'][sxng_tag] = netloc | ||||||
|  |             engine_traits.custom['title'][sxng_tag] = title | ||||||
|  | 
 | ||||||
|  |         eng_tag = extract_text(eval_xpath_list(a, ".//span")) | ||||||
|  |         engine_traits.languages[sxng_tag] = eng_tag | ||||||
|  | 
 | ||||||
|  |     engine_traits.languages['en'] = 'English' | ||||||
|  | |||||||
| @ -1,16 +1,53 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """Bing (Web) | """This is the implementation of the Bing-WEB engine. Some of this | ||||||
|  | implementations are shared by other engines: | ||||||
|  | 
 | ||||||
|  | - :ref:`bing images engine` | ||||||
|  | - :ref:`bing news engine` | ||||||
|  | - :ref:`bing videos engine` | ||||||
|  | 
 | ||||||
|  | On the `preference page`_ Bing offers a lot of languages and regions (see the | ||||||
|  | sections 'Search results languages' and 'Country/region').  However, the | ||||||
|  | abundant choice does not correspond to reality: Bing has a full-text indexer | ||||||
|  | only for a limited number of languages.  For example, you can select a | ||||||
|  | language like Māori, but you will never get a result in this language. | ||||||
|  | 
 | ||||||
|  | What comes a bit closer to the truth are the `search-APIs`_, but they don't seem | ||||||
|  | to be completely correct either (if you take a closer look you will find some | ||||||
|  | inaccuracies there too): | ||||||
|  | 
 | ||||||
|  | - :py:obj:`searx.engines.bing.bing_traits_url` | ||||||
|  | - :py:obj:`searx.engines.bing_videos.bing_traits_url` | ||||||
|  | - :py:obj:`searx.engines.bing_images.bing_traits_url` | ||||||
|  | - :py:obj:`searx.engines.bing_news.bing_traits_url` | ||||||
|  | 
 | ||||||
|  | .. _preference page: https://www.bing.com/account/general | ||||||
|  | .. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/ | ||||||
| 
 | 
 | ||||||
| - https://github.com/searx/searx/issues/2019#issuecomment-648227442 |  | ||||||
| """ | """ | ||||||
| # pylint: disable=too-many-branches | # pylint: disable=too-many-branches, invalid-name | ||||||
| 
 | 
 | ||||||
|  | from typing import TYPE_CHECKING | ||||||
|  | import datetime | ||||||
| import re | import re | ||||||
| from urllib.parse import urlencode, urlparse, parse_qs | import uuid | ||||||
|  | from urllib.parse import urlencode | ||||||
| from lxml import html | from lxml import html | ||||||
| from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex | import babel | ||||||
| from searx.network import multi_requests, Request | import babel.languages | ||||||
|  | 
 | ||||||
|  | from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex | ||||||
|  | from searx import network | ||||||
|  | from searx.locales import language_tag, region_tag | ||||||
|  | from searx.enginelib.traits import EngineTraits | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
| 
 | 
 | ||||||
| about = { | about = { | ||||||
|     "website": 'https://www.bing.com', |     "website": 'https://www.bing.com', | ||||||
| @ -21,56 +58,124 @@ about = { | |||||||
|     "results": 'HTML', |     "results": 'HTML', | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | send_accept_language_header = True | ||||||
|  | """Bing tries to guess user's language and territory from the HTTP | ||||||
|  | Accept-Language.  Optional the user can select a search-language (can be | ||||||
|  | different to the UI language) and a region (market code).""" | ||||||
|  | 
 | ||||||
| # engine dependent config | # engine dependent config | ||||||
| categories = ['general', 'web'] | categories = ['general', 'web'] | ||||||
| paging = True | paging = True | ||||||
| time_range_support = False | time_range_support = True | ||||||
| safesearch = False | safesearch = True | ||||||
| send_accept_language_header = True | safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}  # cookie: ADLT=STRICT | ||||||
| supported_languages_url = 'https://www.bing.com/account/general' |  | ||||||
| language_aliases = {} |  | ||||||
| 
 | 
 | ||||||
| # search-url | base_url = 'https://www.bing.com/search' | ||||||
| base_url = 'https://www.bing.com/' | """Bing (Web) search URL""" | ||||||
| 
 | 
 | ||||||
| # initial query:     https://www.bing.com/search?q=foo&search=&form=QBLH | bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes' | ||||||
| inital_query = 'search?{query}&search=&form=QBLH' | """Bing (Web) search API description""" | ||||||
| 
 |  | ||||||
| # following queries: https://www.bing.com/search?q=foo&search=&first=11&FORM=PERE |  | ||||||
| page_query = 'search?{query}&search=&first={offset}&FORM=PERE' |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _get_offset_from_pageno(pageno): | def _get_offset_from_pageno(pageno): | ||||||
|     return (pageno - 1) * 10 + 1 |     return (pageno - 1) * 10 + 1 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def set_bing_cookies(params, engine_language, engine_region, SID): | ||||||
|  | 
 | ||||||
|  |     # set cookies | ||||||
|  |     # ----------- | ||||||
|  | 
 | ||||||
|  |     params['cookies']['_EDGE_V'] = '1' | ||||||
|  | 
 | ||||||
|  |     # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw | ||||||
|  |     _EDGE_S = [ | ||||||
|  |         'F=1', | ||||||
|  |         'SID=%s' % SID, | ||||||
|  |         'mkt=%s' % engine_region.lower(), | ||||||
|  |         'ui=%s' % engine_language.lower(), | ||||||
|  |     ] | ||||||
|  |     params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S) | ||||||
|  |     logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S']) | ||||||
|  | 
 | ||||||
|  |     # "_EDGE_CD": "m=zh-tw", | ||||||
|  | 
 | ||||||
|  |     _EDGE_CD = [  # pylint: disable=invalid-name | ||||||
|  |         'm=%s' % engine_region.lower(),  # search region: zh-cn | ||||||
|  |         'u=%s' % engine_language.lower(),  # UI: en-us | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|  |     params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';' | ||||||
|  |     logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD']) | ||||||
|  | 
 | ||||||
|  |     SRCHHPGUSR = [  # pylint: disable=invalid-name | ||||||
|  |         'SRCHLANG=%s' % engine_language, | ||||||
|  |         # Trying to set the ADLT cookie here seems to have no effect; I assume | ||||||
|  |         # some age verification by a cookie (and/or session ID) is needed to | ||||||
|  |         # disable SafeSearch. | ||||||
|  |         'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'), | ||||||
|  |     ] | ||||||
|  |     params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR) | ||||||
|  |     logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR']) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
|  |     """Assemble a Bing-Web request.""" | ||||||
| 
 | 
 | ||||||
|     offset = _get_offset_from_pageno(params.get('pageno', 1)) |     engine_region = traits.get_region(params['searxng_locale'], 'en-US') | ||||||
|  |     engine_language = traits.get_language(params['searxng_locale'], 'en') | ||||||
| 
 | 
 | ||||||
|     # logger.debug("params['pageno'] --> %s", params.get('pageno')) |     SID = uuid.uuid1().hex.upper() | ||||||
|     # logger.debug("          offset --> %s", offset) |     CVID = uuid.uuid1().hex.upper() | ||||||
| 
 | 
 | ||||||
|     search_string = page_query |     set_bing_cookies(params, engine_language, engine_region, SID) | ||||||
|     if offset == 1: |  | ||||||
|         search_string = inital_query |  | ||||||
| 
 | 
 | ||||||
|     if params['language'] == 'all': |     # build URL query | ||||||
|         lang = 'EN' |     # --------------- | ||||||
|     else: |  | ||||||
|         lang = match_language(params['language'], supported_languages, language_aliases) |  | ||||||
| 
 | 
 | ||||||
|     query = 'language:{} {}'.format(lang.split('-')[0].upper(), query) |     # query term | ||||||
|  |     page = int(params.get('pageno', 1)) | ||||||
|  |     query_params = { | ||||||
|  |         # fmt: off | ||||||
|  |         'q': query, | ||||||
|  |         'pq': query, | ||||||
|  |         'cvid': CVID, | ||||||
|  |         'qs': 'n', | ||||||
|  |         'sp': '-1' | ||||||
|  |         # fmt: on | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     search_path = search_string.format(query=urlencode({'q': query}), offset=offset) |     # page | ||||||
| 
 |     if page > 1: | ||||||
|     if offset > 1: |         referer = base_url + '?' + urlencode(query_params) | ||||||
|         referer = base_url + inital_query.format(query=urlencode({'q': query})) |  | ||||||
|         params['headers']['Referer'] = referer |         params['headers']['Referer'] = referer | ||||||
|         logger.debug("headers.Referer --> %s", referer) |         logger.debug("headers.Referer --> %s", referer) | ||||||
| 
 | 
 | ||||||
|     params['url'] = base_url + search_path |     query_params['first'] = _get_offset_from_pageno(page) | ||||||
|     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' | 
 | ||||||
|  |     if page == 2: | ||||||
|  |         query_params['FORM'] = 'PERE' | ||||||
|  |     elif page > 2: | ||||||
|  |         query_params['FORM'] = 'PERE%s' % (page - 2) | ||||||
|  | 
 | ||||||
|  |     filters = '' | ||||||
|  |     if params['time_range']: | ||||||
|  |         query_params['filt'] = 'custom' | ||||||
|  | 
 | ||||||
|  |         if params['time_range'] == 'day': | ||||||
|  |             filters = 'ex1:"ez1"' | ||||||
|  |         elif params['time_range'] == 'week': | ||||||
|  |             filters = 'ex1:"ez2"' | ||||||
|  |         elif params['time_range'] == 'month': | ||||||
|  |             filters = 'ex1:"ez3"' | ||||||
|  |         elif params['time_range'] == 'year': | ||||||
|  |             epoch_1970 = datetime.date(1970, 1, 1) | ||||||
|  |             today_no = (datetime.date.today() - epoch_1970).days | ||||||
|  |             filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no) | ||||||
|  | 
 | ||||||
|  |     params['url'] = base_url + '?' + urlencode(query_params) | ||||||
|  |     if filters: | ||||||
|  |         params['url'] = params['url'] + '&filters=' + filters | ||||||
|     return params |     return params | ||||||
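The ``ez5`` filter encodes a day span counted from the Unix epoch; a small sketch of what the ``year`` branch computes (the example date is arbitrary):

.. code:: python

   import datetime

   epoch_1970 = datetime.date(1970, 1, 1)
   today_no = (datetime.date.today() - epoch_1970).days
   filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
   # e.g. on 2023-03-01 (day 19417): ex1:"ez5_19052_19417"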
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -107,7 +212,8 @@ def response(resp): | |||||||
|             url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite')) |             url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite')) | ||||||
|             # Bing can shorten the URL either at the end or in the middle of the string |             # Bing can shorten the URL either at the end or in the middle of the string | ||||||
|             if ( |             if ( | ||||||
|                 url_cite.startswith('https://') |                 url_cite | ||||||
|  |                 and url_cite.startswith('https://') | ||||||
|                 and '…' not in url_cite |                 and '…' not in url_cite | ||||||
|                 and '...' not in url_cite |                 and '...' not in url_cite | ||||||
|                 and '›' not in url_cite |                 and '›' not in url_cite | ||||||
| @ -127,9 +233,9 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|     # resolve all Bing redirections in parallel |     # resolve all Bing redirections in parallel | ||||||
|     request_list = [ |     request_list = [ | ||||||
|         Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve |         network.Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve | ||||||
|     ] |     ] | ||||||
|     response_list = multi_requests(request_list) |     response_list = network.multi_requests(request_list) | ||||||
|     for i, redirect_response in enumerate(response_list): |     for i, redirect_response in enumerate(response_list): | ||||||
|         if not isinstance(redirect_response, Exception): |         if not isinstance(redirect_response, Exception): | ||||||
|             results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location'] |             results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location'] | ||||||
| @ -157,27 +263,71 @@ def response(resp): | |||||||
|     return results |     return results | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # get supported languages from their site | def fetch_traits(engine_traits: EngineTraits): | ||||||
| def _fetch_supported_languages(resp): |     """Fetch languages and regions from Bing-Web.""" | ||||||
| 
 | 
 | ||||||
|     lang_tags = set() |     xpath_market_codes = '//table[1]/tbody/tr/td[3]' | ||||||
|  |     # xpath_country_codes = '//table[2]/tbody/tr/td[2]' | ||||||
|  |     xpath_language_codes = '//table[3]/tbody/tr/td[2]' | ||||||
|  | 
 | ||||||
|  |     _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str): | ||||||
|  | 
 | ||||||
|  |     # insert alias to map from a language (zh) to a language + script (zh_Hans) | ||||||
|  |     engine_traits.languages['zh'] = 'zh-hans' | ||||||
|  | 
 | ||||||
|  |     resp = network.get(url) | ||||||
|  | 
 | ||||||
|  |     if not resp.ok: | ||||||
|  |         print("ERROR: response from peertube is not OK.") | ||||||
| 
 | 
 | ||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
|     lang_links = eval_xpath(dom, '//div[@id="language-section"]//li') |  | ||||||
| 
 | 
 | ||||||
|     for _li in lang_links: |     map_lang = {'jp': 'ja'} | ||||||
|  |     for td in eval_xpath(dom, xpath_language_codes): | ||||||
|  |         eng_lang = td.text | ||||||
| 
 | 
 | ||||||
|         href = eval_xpath(_li, './/@href')[0] |         if eng_lang in ('en-gb', 'pt-br'): | ||||||
|         (_scheme, _netloc, _path, _params, query, _fragment) = urlparse(href) |             # language 'en' is already in the list and a language 'en-gb' can't | ||||||
|         query = parse_qs(query, keep_blank_values=True) |             # be handled in SearXNG, same with pt-br which is covered by pt-pt. | ||||||
|  |             continue | ||||||
| 
 | 
 | ||||||
|         # fmt: off |         babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_') | ||||||
|         setlang = query.get('setlang', [None, ])[0] |         try: | ||||||
|         # example: 'mn-Cyrl-MN' --> '['mn', 'Cyrl-MN'] |             sxng_tag = language_tag(babel.Locale.parse(babel_lang)) | ||||||
|         lang, nation = (setlang.split('-', maxsplit=1) + [None,])[:2]  # fmt: skip |         except babel.UnknownLocaleError: | ||||||
|         # fmt: on |             print("ERROR: language (%s) is unknown by babel" % (eng_lang)) | ||||||
|  |             continue | ||||||
|  |         conflict = engine_traits.languages.get(sxng_tag) | ||||||
|  |         if conflict: | ||||||
|  |             if conflict != eng_lang: | ||||||
|  |                 print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang)) | ||||||
|  |             continue | ||||||
|  |         engine_traits.languages[sxng_tag] = eng_lang | ||||||
| 
 | 
 | ||||||
|         tag = lang + '-' + nation if nation else lang |     map_region = { | ||||||
|         lang_tags.add(tag) |         'en-ID': 'id_ID', | ||||||
|  |         'no-NO': 'nb_NO', | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     return list(lang_tags) |     for td in eval_xpath(dom, xpath_market_codes): | ||||||
|  |         eng_region = td.text | ||||||
|  |         babel_region = map_region.get(eng_region, eng_region).replace('-', '_') | ||||||
|  | 
 | ||||||
|  |         if eng_region == 'en-WW': | ||||||
|  |             engine_traits.all_locale = eng_region | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         try: | ||||||
|  |             sxng_tag = region_tag(babel.Locale.parse(babel_region)) | ||||||
|  |         except babel.UnknownLocaleError: | ||||||
|  |             print("ERROR: region (%s) is unknown by babel" % (eng_region)) | ||||||
|  |             continue | ||||||
|  |         conflict = engine_traits.regions.get(sxng_tag) | ||||||
|  |         if conflict: | ||||||
|  |             if conflict != eng_region: | ||||||
|  |                 print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region)) | ||||||
|  |             continue | ||||||
|  |         engine_traits.regions[sxng_tag] = eng_region | ||||||
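For orientation, the rough shape of the traits object this produces (all values illustrative; they depend on the scraped tables):

.. code:: python

   engine_traits.languages   # e.g. {'de': 'de', 'zh': 'zh-hans', ...}
   engine_traits.regions     # e.g. {'de-DE': 'de-DE', 'zh-TW': 'zh-tw', ...}
   engine_traits.all_locale  # 'en-WW', the *Worldwide aggregate* market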
|  | |||||||
| @ -1,20 +1,30 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """Bing (Images) | """Bing-Images: description see :py:obj:`searx.engines.bing`. | ||||||
| 
 |  | ||||||
| """ | """ | ||||||
|  | # pylint: disable=invalid-name | ||||||
| 
 | 
 | ||||||
| from json import loads | 
 | ||||||
|  | from typing import TYPE_CHECKING | ||||||
|  | import uuid | ||||||
|  | import json | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| 
 | 
 | ||||||
| from lxml import html | from lxml import html | ||||||
| 
 | 
 | ||||||
| from searx.utils import match_language | from searx.enginelib.traits import EngineTraits | ||||||
| from searx.engines.bing import language_aliases | from searx.engines.bing import ( | ||||||
| from searx.engines.bing import (  # pylint: disable=unused-import |     set_bing_cookies, | ||||||
|     _fetch_supported_languages, |     _fetch_traits, | ||||||
|     supported_languages_url, |  | ||||||
| ) | ) | ||||||
|  | from searx.engines.bing import send_accept_language_header  # pylint: disable=unused-import | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -31,77 +41,92 @@ categories = ['images', 'web'] | |||||||
| paging = True | paging = True | ||||||
| safesearch = True | safesearch = True | ||||||
| time_range_support = True | time_range_support = True | ||||||
| send_accept_language_header = True |  | ||||||
| supported_languages_url = 'https://www.bing.com/account/general' |  | ||||||
| number_of_results = 28 |  | ||||||
| 
 | 
 | ||||||
| # search-url | base_url = 'https://www.bing.com/images/async' | ||||||
| base_url = 'https://www.bing.com/' | """Bing (Images) search URL""" | ||||||
| search_string = ( | 
 | ||||||
|  | bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-image-search/reference/market-codes' | ||||||
|  | """Bing (Images) search API description""" | ||||||
|  | 
 | ||||||
|  | time_map = { | ||||||
|     # fmt: off |     # fmt: off | ||||||
|     'images/search' |     'day': 60 * 24, | ||||||
|     '?{query}' |     'week': 60 * 24 * 7, | ||||||
|     '&count={count}' |     'month': 60 * 24 * 31, | ||||||
|     '&first={first}' |     'year': 60 * 24 * 365, | ||||||
|     '&tsc=ImageHoverTitle' |  | ||||||
|     # fmt: on |     # fmt: on | ||||||
| ) | } | ||||||
| time_range_string = '&qft=+filterui:age-lt{interval}' |  | ||||||
| time_range_dict = {'day': '1440', 'week': '10080', 'month': '43200', 'year': '525600'} |  | ||||||
| 
 |  | ||||||
| # safesearch definitions |  | ||||||
| safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # do search-request |  | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     offset = ((params['pageno'] - 1) * number_of_results) + 1 |     """Assemble a Bing-Image request.""" | ||||||
| 
 | 
 | ||||||
|     search_path = search_string.format(query=urlencode({'q': query}), count=number_of_results, first=offset) |     engine_region = traits.get_region(params['searxng_locale'], 'en-US') | ||||||
|  |     engine_language = traits.get_language(params['searxng_locale'], 'en') | ||||||
| 
 | 
 | ||||||
|     language = match_language(params['language'], supported_languages, language_aliases).lower() |     SID = uuid.uuid1().hex.upper() | ||||||
|  |     set_bing_cookies(params, engine_language, engine_region, SID) | ||||||
| 
 | 
 | ||||||
|     params['cookies']['SRCHHPGUSR'] = 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') |     # build URL query | ||||||
|  |     # - example: https://www.bing.com/images/async?q=foo&first=155&count=35 | ||||||
| 
 | 
 | ||||||
|     params['cookies']['_EDGE_S'] = 'mkt=' + language + '&ui=' + language + '&F=1' |     query_params = { | ||||||
|  |         # fmt: off | ||||||
|  |         'q': query, | ||||||
|  |         'async' : 'content', | ||||||
|  |         # to simplify the page count, let's use the default of 35 images per page | ||||||
|  |         'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1, | ||||||
|  |         'count' : 35, | ||||||
|  |         # fmt: on | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     params['url'] = base_url + search_path |     # time range | ||||||
|     if params['time_range'] in time_range_dict: |     # - example: one year (525600 minutes) 'qft=+filterui:age-lt525600' | ||||||
|         params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) | 
 | ||||||
|  |     if params['time_range']: | ||||||
|  |         query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']] | ||||||
|  | 
 | ||||||
|  |     params['url'] = base_url + '?' + urlencode(query_params) | ||||||
| 
 | 
 | ||||||
|     return params |     return params | ||||||
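For page 2 with a ``week`` time range, the assembled async URL looks roughly like (query term hypothetical):

.. code:: python

   from urllib.parse import urlencode

   page = 2
   args = {
       'q': 'tux', 'async': 'content',
       'first': (page - 1) * 35 + 1, 'count': 35,
       'qft': 'filterui:age-lt%s' % (60 * 24 * 7),  # time_range 'week'
   }
   url = 'https://www.bing.com/images/async?' + urlencode(args)
   # -> .../images/async?q=tux&async=content&first=36&count=35&qft=filterui%3Aage-lt10080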
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # get response from search-request |  | ||||||
| def response(resp): | def response(resp): | ||||||
|     results = [] |     """Get response from Bing-Images""" | ||||||
| 
 | 
 | ||||||
|  |     results = [] | ||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
| 
 | 
 | ||||||
|     # parse results |     for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'): | ||||||
|     for result in dom.xpath('//div[@class="imgpt"]'): |  | ||||||
|         img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0] |  | ||||||
|         # Microsoft seems to experiment with this code so don't make the path too specific, |  | ||||||
|         # just catch the text section for the first anchor in img_info assuming this to be |  | ||||||
|         # the originating site. |  | ||||||
|         source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0] |  | ||||||
| 
 | 
 | ||||||
|         m = loads(result.xpath('./a/@m')[0]) |         metadata = result.xpath('.//a[@class="iusc"]/@m') | ||||||
|  |         if not metadata: | ||||||
|  |             continue | ||||||
| 
 | 
 | ||||||
|         # strip 'Unicode private use area' highlighting, they render to Tux |         metadata = json.loads(result.xpath('.//a[@class="iusc"]/@m')[0]) | ||||||
|         # the Linux penguin and a standing diamond on my machine... |         title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip() | ||||||
|         title = m.get('t', '').replace('\ue000', '').replace('\ue001', '') |         img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip() | ||||||
|  |         source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip() | ||||||
|         results.append( |         results.append( | ||||||
|             { |             { | ||||||
|                 'template': 'images.html', |                 'template': 'images.html', | ||||||
|                 'url': m['purl'], |                 'url': metadata['purl'], | ||||||
|                 'thumbnail_src': m['turl'], |                 'thumbnail_src': metadata['turl'], | ||||||
|                 'img_src': m['murl'], |                 'img_src': metadata['murl'], | ||||||
|                 'content': '', |                 'content': metadata['desc'], | ||||||
|                 'title': title, |                 'title': title, | ||||||
|                 'source': source, |                 'source': source, | ||||||
|                 'img_format': img_format, |                 'img_format': img_format, | ||||||
|             } |             } | ||||||
|         ) |         ) | ||||||
| 
 |  | ||||||
|     return results |     return results | ||||||
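The ``m`` attribute parsed above is a JSON object; the keys this code relies on look roughly like (values hypothetical):

.. code:: python

   metadata = {
       'purl': 'https://example.org/page.html',     # URL of the page embedding the image
       'murl': 'https://example.org/image.jpg',     # URL of the full-size image
       'turl': 'https://tse1.mm.bing.net/th?id=…',  # URL of the thumbnail
       'desc': 'a short description of the image',
   }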
|  | 
 | ||||||
|  | 
 | ||||||
|  | def fetch_traits(engine_traits: EngineTraits): | ||||||
|  |     """Fetch languages and regions from Bing-News.""" | ||||||
|  | 
 | ||||||
|  |     xpath_market_codes = '//table[1]/tbody/tr/td[3]' | ||||||
|  |     # xpath_country_codes = '//table[2]/tbody/tr/td[2]' | ||||||
|  |     xpath_language_codes = '//table[3]/tbody/tr/td[2]' | ||||||
|  | 
 | ||||||
|  |     _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) | ||||||
|  | |||||||
| @ -1,24 +1,30 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """Bing (News) | """Bing-News: description see :py:obj:`searx.engines.bing`. | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| from urllib.parse import ( | # pylint: disable=invalid-name | ||||||
|     urlencode, | 
 | ||||||
|     urlparse, | from typing import TYPE_CHECKING | ||||||
|     parse_qsl, | import uuid | ||||||
|     quote, | from urllib.parse import urlencode | ||||||
| ) | 
 | ||||||
| from datetime import datetime | from lxml import html | ||||||
| from dateutil import parser | 
 | ||||||
| from lxml import etree | from searx.enginelib.traits import EngineTraits | ||||||
| from lxml.etree import XPath | from searx.engines.bing import ( | ||||||
| from searx.utils import match_language, eval_xpath_getindex |     set_bing_cookies, | ||||||
| from searx.engines.bing import (  # pylint: disable=unused-import |     _fetch_traits, | ||||||
|     language_aliases, |  | ||||||
|     _fetch_supported_languages, |  | ||||||
|     supported_languages_url, |  | ||||||
| ) | ) | ||||||
|  | from searx.engines.bing import send_accept_language_header  # pylint: disable=unused-import | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -34,108 +40,111 @@ about = { | |||||||
| categories = ['news'] | categories = ['news'] | ||||||
| paging = True | paging = True | ||||||
| time_range_support = True | time_range_support = True | ||||||
| send_accept_language_header = True | time_map = { | ||||||
|  |     'day': '4', | ||||||
|  |     'week': '8', | ||||||
|  |     'month': '9', | ||||||
|  | } | ||||||
|  | """A string '4' means *last hour*. We use *last hour* for ``day`` here since the | ||||||
|  | difference of *last day* and *last week* in the result list is just marginally. | ||||||
|  | """ | ||||||
| 
 | 
 | ||||||
| # search-url | base_url = 'https://www.bing.com/news/infinitescrollajax' | ||||||
| base_url = 'https://www.bing.com/' | """Bing (News) search URL""" | ||||||
| search_string = 'news/search?{query}&first={offset}&format=RSS' |  | ||||||
| search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS' |  | ||||||
| time_range_dict = {'day': '7', 'week': '8', 'month': '9'} |  | ||||||
| 
 | 
 | ||||||
|  | bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes' | ||||||
|  | """Bing (News) search API description""" | ||||||
| 
 | 
 | ||||||
| def url_cleanup(url_string): | mkt_alias = { | ||||||
|     """remove click""" |     'zh': 'en-WW', | ||||||
| 
 |     'zh-CN': 'en-WW', | ||||||
|     parsed_url = urlparse(url_string) | } | ||||||
|     if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx': | """Bing News has an official market code 'zh-CN' but we won't get a result with | ||||||
|         query = dict(parse_qsl(parsed_url.query)) | this market code.  For 'zh' and 'zh-CN' we better use the *Worldwide aggregate* | ||||||
|         url_string = query.get('url', None) | market code (en-WW). | ||||||
|     return url_string | """ | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def image_url_cleanup(url_string): |  | ||||||
|     """replace the http://*bing.com/th?id=... by https://www.bing.com/th?id=...""" |  | ||||||
| 
 |  | ||||||
|     parsed_url = urlparse(url_string) |  | ||||||
|     if parsed_url.netloc.endswith('bing.com') and parsed_url.path == '/th': |  | ||||||
|         query = dict(parse_qsl(parsed_url.query)) |  | ||||||
|         url_string = "https://www.bing.com/th?id=" + quote(query.get('id')) |  | ||||||
|     return url_string |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def _get_url(query, language, offset, time_range): |  | ||||||
|     if time_range in time_range_dict: |  | ||||||
|         search_path = search_string_with_time.format( |  | ||||||
|             # fmt: off |  | ||||||
|             query = urlencode({ |  | ||||||
|                 'q': query, |  | ||||||
|                 'setmkt': language |  | ||||||
|             }), |  | ||||||
|             offset = offset, |  | ||||||
|             interval = time_range_dict[time_range] |  | ||||||
|             # fmt: on |  | ||||||
|         ) |  | ||||||
|     else: |  | ||||||
|         # e.g. setmkt=de-de&setlang=de |  | ||||||
|         search_path = search_string.format( |  | ||||||
|             # fmt: off |  | ||||||
|             query = urlencode({ |  | ||||||
|                 'q': query, |  | ||||||
|                 'setmkt': language |  | ||||||
|             }), |  | ||||||
|             offset = offset |  | ||||||
|             # fmt: on |  | ||||||
|         ) |  | ||||||
|     return base_url + search_path |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
|  |     """Assemble a Bing-News request.""" | ||||||
| 
 | 
 | ||||||
|     if params['time_range'] and params['time_range'] not in time_range_dict: |     sxng_locale = params['searxng_locale'] | ||||||
|         return params |     engine_region = traits.get_region(mkt_alias.get(sxng_locale, sxng_locale), traits.all_locale) | ||||||
|  |     engine_language = traits.get_language(sxng_locale, 'en') | ||||||
| 
 | 
 | ||||||
|     offset = (params['pageno'] - 1) * 10 + 1 |     SID = uuid.uuid1().hex.upper() | ||||||
|     if params['language'] == 'all': |     set_bing_cookies(params, engine_language, engine_region, SID) | ||||||
|         language = 'en-US' | 
 | ||||||
|     else: |     # build URL query | ||||||
|         language = match_language(params['language'], supported_languages, language_aliases) |     # | ||||||
|     params['url'] = _get_url(query, language, offset, params['time_range']) |     # example: https://www.bing.com/news/infinitescrollajax?q=london&first=1 | ||||||
|  | 
 | ||||||
|  |     query_params = { | ||||||
|  |         # fmt: off | ||||||
|  |         'q': query, | ||||||
|  |         'InfiniteScroll': 1, | ||||||
|  |         # to simplify the page count, let's use the default of 10 results per page | ||||||
|  |         'first' : (int(params.get('pageno', 1)) - 1) * 10 + 1, | ||||||
|  |         # fmt: on | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if params['time_range']: | ||||||
|  |         # qft=interval:"7" | ||||||
|  |         query_params['qft'] = 'qft=interval="%s"' % time_map.get(params['time_range'], '9') | ||||||
|  | 
 | ||||||
|  |     params['url'] = base_url + '?' + urlencode(query_params) | ||||||
| 
 | 
 | ||||||
|     return params |     return params | ||||||
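A sketch of the infinite-scroll URL this builds for page 3 without a time range (query term hypothetical):

.. code:: python

   from urllib.parse import urlencode

   page = 3
   args = {'q': 'london', 'InfiniteScroll': 1, 'first': (page - 1) * 10 + 1}
   url = 'https://www.bing.com/news/infinitescrollajax?' + urlencode(args)
   # -> .../news/infinitescrollajax?q=london&InfiniteScroll=1&first=21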
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def response(resp): | def response(resp): | ||||||
|     results = [] |     """Get response from Bing-News""" | ||||||
|     results = [] |     results = [] | ||||||
|     rss = etree.fromstring(resp.content) |  | ||||||
|     namespaces = rss.nsmap |  | ||||||
| 
 | 
 | ||||||
|     for item in rss.xpath('./channel/item'): |     if not resp.ok or not resp.text: | ||||||
|         # url / title / content |         return results | ||||||
|         url = url_cleanup(eval_xpath_getindex(item, './link/text()', 0, default=None)) |  | ||||||
|         title = eval_xpath_getindex(item, './title/text()', 0, default=url) |  | ||||||
|         content = eval_xpath_getindex(item, './description/text()', 0, default='') |  | ||||||
| 
 | 
 | ||||||
|         # publishedDate |     dom = html.fromstring(resp.text) | ||||||
|         publishedDate = eval_xpath_getindex(item, './pubDate/text()', 0, default=None) |  | ||||||
|         try: |  | ||||||
|             publishedDate = parser.parse(publishedDate, dayfirst=False) |  | ||||||
|         except TypeError: |  | ||||||
|             publishedDate = datetime.now() |  | ||||||
|         except ValueError: |  | ||||||
|             publishedDate = datetime.now() |  | ||||||
| 
 | 
 | ||||||
|         # thumbnail |     for newsitem in dom.xpath('//div[contains(@class, "newsitem")]'): | ||||||
|         thumbnail = eval_xpath_getindex(item, XPath('./News:Image/text()', namespaces=namespaces), 0, default=None) | 
 | ||||||
|         if thumbnail is not None: |         url = newsitem.xpath('./@url')[0] | ||||||
|             thumbnail = image_url_cleanup(thumbnail) |         title = ' '.join(newsitem.xpath('.//div[@class="caption"]//a[@class="title"]/text()')).strip() | ||||||
|  |         content = ' '.join(newsitem.xpath('.//div[@class="snippet"]/text()')).strip() | ||||||
|  |         thumbnail = None | ||||||
|  |         author = newsitem.xpath('./@data-author')[0] | ||||||
|  |         metadata = ' '.join(newsitem.xpath('.//div[@class="source"]/span/text()')).strip() | ||||||
|  | 
 | ||||||
|  |         img_src = newsitem.xpath('.//a[@class="imagelink"]//img/@src') | ||||||
|  |         if img_src: | ||||||
|  |             thumbnail = 'https://www.bing.com/' + img_src[0] | ||||||
| 
 | 
 | ||||||
|         # append result |  | ||||||
|         if thumbnail is not None: |  | ||||||
|         results.append( |         results.append( | ||||||
|                 {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content, 'img_src': thumbnail} |             { | ||||||
|  |                 'url': url, | ||||||
|  |                 'title': title, | ||||||
|  |                 'content': content, | ||||||
|  |                 'img_src': thumbnail, | ||||||
|  |                 'author': author, | ||||||
|  |                 'metadata': metadata, | ||||||
|  |             } | ||||||
|         ) |         ) | ||||||
|         else: |  | ||||||
|             results.append({'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}) |  | ||||||
| 
 | 
 | ||||||
|     return results |     return results | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def fetch_traits(engine_traits: EngineTraits): | ||||||
|  |     """Fetch languages and regions from Bing-News. | ||||||
|  | 
 | ||||||
|  |     The :py:obj:`description <searx.engines.bing_news.bing_traits_url>` of the | ||||||
|  |     first table says *"query parameter when calling the Video Search API."* | ||||||
|  |     That's why I use the 4th table ("News Category API markets") for the | ||||||
|  |     ``xpath_market_codes``. | ||||||
|  | 
 | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     xpath_market_codes = '//table[4]/tbody/tr/td[3]' | ||||||
|  |     # xpath_country_codes = '//table[2]/tbody/tr/td[2]' | ||||||
|  |     xpath_language_codes = '//table[3]/tbody/tr/td[2]' | ||||||
|  | 
 | ||||||
|  |     _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) | ||||||
|  | |||||||
| @ -1,21 +1,30 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """Bing (Videos) | """Bing-Videos: description see :py:obj:`searx.engines.bing`. | ||||||
| 
 |  | ||||||
| """ | """ | ||||||
|  | # pylint: disable=invalid-name | ||||||
| 
 | 
 | ||||||
| from json import loads | from typing import TYPE_CHECKING | ||||||
|  | import uuid | ||||||
|  | import json | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| 
 | 
 | ||||||
| from lxml import html | from lxml import html | ||||||
| 
 | 
 | ||||||
| from searx.utils import match_language | from searx.enginelib.traits import EngineTraits | ||||||
| from searx.engines.bing import language_aliases | from searx.engines.bing import ( | ||||||
| 
 |     set_bing_cookies, | ||||||
| from searx.engines.bing import (  # pylint: disable=unused-import |     _fetch_traits, | ||||||
|     _fetch_supported_languages, |  | ||||||
|     supported_languages_url, |  | ||||||
| ) | ) | ||||||
|  | from searx.engines.bing import send_accept_language_header  # pylint: disable=unused-import | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| about = { | about = { | ||||||
|     "website": 'https://www.bing.com/videos', |     "website": 'https://www.bing.com/videos', | ||||||
| @ -26,65 +35,76 @@ about = { | |||||||
|     "results": 'HTML', |     "results": 'HTML', | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | # engine dependent config | ||||||
| categories = ['videos', 'web'] | categories = ['videos', 'web'] | ||||||
| paging = True | paging = True | ||||||
| safesearch = True | safesearch = True | ||||||
| time_range_support = True | time_range_support = True | ||||||
| send_accept_language_header = True |  | ||||||
| number_of_results = 28 |  | ||||||
| 
 | 
 | ||||||
| base_url = 'https://www.bing.com/' | base_url = 'https://www.bing.com/videos/asyncv2' | ||||||
| search_string = ( | """Bing (Videos) async search URL.""" | ||||||
|  | 
 | ||||||
|  | bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-video-search/reference/market-codes' | ||||||
|  | """Bing (Video) search API description""" | ||||||
|  | 
 | ||||||
|  | time_map = { | ||||||
|     # fmt: off |     # fmt: off | ||||||
|     'videos/search' |     'day': 60 * 24, | ||||||
|     '?{query}' |     'week': 60 * 24 * 7, | ||||||
|     '&count={count}' |     'month': 60 * 24 * 31, | ||||||
|     '&first={first}' |     'year': 60 * 24 * 365, | ||||||
|     '&scope=video' |  | ||||||
|     '&FORM=QBLH' |  | ||||||
|     # fmt: on |     # fmt: on | ||||||
| ) | } | ||||||
| time_range_string = '&qft=+filterui:videoage-lt{interval}' |  | ||||||
| time_range_dict = {'day': '1440', 'week': '10080', 'month': '43200', 'year': '525600'} |  | ||||||
| 
 |  | ||||||
| # safesearch definitions |  | ||||||
| safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # do search-request |  | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     offset = ((params['pageno'] - 1) * number_of_results) + 1 |     """Assemble a Bing-Video request.""" | ||||||
| 
 | 
 | ||||||
|     search_path = search_string.format(query=urlencode({'q': query}), count=number_of_results, first=offset) |     engine_region = traits.get_region(params['searxng_locale'], 'en-US') | ||||||
|  |     engine_language = traits.get_language(params['searxng_locale'], 'en') | ||||||
| 
 | 
 | ||||||
|     # safesearch cookie |     SID = uuid.uuid1().hex.upper() | ||||||
|     params['cookies']['SRCHHPGUSR'] = 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') |     set_bing_cookies(params, engine_language, engine_region, SID) | ||||||
| 
 | 
 | ||||||
|     # language cookie |     # build URL query | ||||||
|     language = match_language(params['language'], supported_languages, language_aliases).lower() |     # | ||||||
|     params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1' |     # example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35 | ||||||
| 
 | 
 | ||||||
|     # query and paging |     query_params = { | ||||||
|     params['url'] = base_url + search_path |         # fmt: off | ||||||
|  |         'q': query, | ||||||
|  |         'async' : 'content', | ||||||
|  |         # to simplify the page count, let's use the default of 35 results per page | ||||||
|  |         'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1, | ||||||
|  |         'count' : 35, | ||||||
|  |         # fmt: on | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     # time range |     # time range | ||||||
|     if params['time_range'] in time_range_dict: |     # | ||||||
|         params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) |     # example: one week (10080 minutes) '&qft= filterui:videoage-lt10080'  '&form=VRFLTR' | ||||||
|  | 
 | ||||||
|  |     if params['time_range']: | ||||||
|  |         query_params['form'] = 'VRFLTR' | ||||||
|  |         query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']] | ||||||
|  | 
 | ||||||
|  |     params['url'] = base_url + '?' + urlencode(query_params) | ||||||
| 
 | 
 | ||||||
|     return params |     return params | ||||||
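To illustrate ``time_map``: a one-week filter resolves to 60 * 24 * 7 = 10080 minutes, so the URL assembled above looks roughly like this (query value assumed for illustration):

.. code:: python

   # time_range 'week' --> qft=' filterui:videoage-lt10080', urlencoded:
   # https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1
   #     &count=35&form=VRFLTR&qft=+filterui%3Avideoage-lt10080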
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # get response from search-request |  | ||||||
| def response(resp): | def response(resp): | ||||||
|  |     """Get response from Bing-Video""" | ||||||
|     results = [] |     results = [] | ||||||
| 
 | 
 | ||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
| 
 | 
 | ||||||
|     for result in dom.xpath('//div[@class="dg_u"]/div[contains(@class, "mc_vtvc")]'): |     for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'): | ||||||
|         metadata = loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0]) |         metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0]) | ||||||
|         info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip() |         info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip() | ||||||
|         content = '{0} - {1}'.format(metadata['du'], info) |         content = '{0} - {1}'.format(metadata['du'], info) | ||||||
|         thumbnail = '{0}th?id={1}'.format(base_url, metadata['thid']) |         thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0] | ||||||
|  | 
 | ||||||
|         results.append( |         results.append( | ||||||
|             { |             { | ||||||
|                 'url': metadata['murl'], |                 'url': metadata['murl'], | ||||||
| @ -96,3 +116,13 @@ def response(resp): | |||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|     return results |     return results | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def fetch_traits(engine_traits: EngineTraits): | ||||||
|  |     """Fetch languages and regions from Bing-Videos.""" | ||||||
|  | 
 | ||||||
|  |     xpath_market_codes = '//table[1]/tbody/tr/td[3]' | ||||||
|  |     # xpath_country_codes = '//table[2]/tbody/tr/td[2]' | ||||||
|  |     xpath_language_codes = '//table[3]/tbody/tr/td[2]' | ||||||
|  | 
 | ||||||
|  |     _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) | ||||||
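Once these traits are fetched, ``request()`` resolves SearXNG's locale against them. The return values below are assumptions about the fetched data; the fallback behaviour is what ``EngineTraits.get_region`` is used for in this diff:

.. code:: python

   traits.get_region('zh-TW', 'en-US')  # -> 'zh-TW', if Bing lists that market
   traits.get_region('xx-XX', 'en-US')  # -> 'en-US' (fallback for unknown locales)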
|  | |||||||
| @ -1,17 +1,35 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| """Dailymotion (Videos) | # lint: pylint | ||||||
|  | """ | ||||||
|  | Dailymotion (Videos) | ||||||
|  | ~~~~~~~~~~~~~~~~~~~~ | ||||||
|  | 
 | ||||||
|  | .. _REST GET: https://developers.dailymotion.com/tools/ | ||||||
|  | .. _Global API Parameters: https://developers.dailymotion.com/api/#global-parameters | ||||||
|  | .. _Video filters API: https://developers.dailymotion.com/api/#video-filters | ||||||
|  | .. _Fields selection: https://developers.dailymotion.com/api/#fields-selection | ||||||
| 
 | 
 | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| from typing import Set | from typing import TYPE_CHECKING | ||||||
|  | 
 | ||||||
| from datetime import datetime, timedelta | from datetime import datetime, timedelta | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| import time | import time | ||||||
| import babel | import babel | ||||||
| 
 | 
 | ||||||
| from searx.exceptions import SearxEngineAPIException | from searx.exceptions import SearxEngineAPIException | ||||||
| from searx.network import raise_for_httperror | from searx import network | ||||||
| from searx.utils import html_to_text | from searx.utils import html_to_text | ||||||
|  | from searx.locales import region_tag, language_tag | ||||||
|  | from searx.enginelib.traits import EngineTraits | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -37,11 +55,24 @@ time_delta_dict = { | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| safesearch = True | safesearch = True | ||||||
| safesearch_params = {2: '&is_created_for_kids=true', 1: '&is_created_for_kids=true', 0: ''} | safesearch_params = { | ||||||
|  |     2: {'is_created_for_kids': 'true'}, | ||||||
|  |     1: {'is_created_for_kids': 'true'}, | ||||||
|  |     0: {}, | ||||||
|  | } | ||||||
|  | """True if this video is "Created for Kids" / intends to target an audience | ||||||
|  | under the age of 16 (``is_created_for_kids`` in `Video filters API`_ ) | ||||||
|  | """ | ||||||
| 
 | 
 | ||||||
| # search-url | family_filter_map = { | ||||||
| # - https://developers.dailymotion.com/tools/ |     2: 'true', | ||||||
| # - https://www.dailymotion.com/doc/api/obj-video.html |     1: 'true', | ||||||
|  |     0: 'false', | ||||||
|  | } | ||||||
|  | """By default, the family filter is turned on. Setting this parameter to | ||||||
|  | ``false`` will stop filtering-out explicit content from searches and global | ||||||
|  | contexts (``family_filter`` in `Global API Parameters`_ ). | ||||||
|  | """ | ||||||
| 
 | 
 | ||||||
| result_fields = [ | result_fields = [ | ||||||
|     'allow_embed', |     'allow_embed', | ||||||
| @ -53,27 +84,21 @@ result_fields = [ | |||||||
|     'thumbnail_360_url', |     'thumbnail_360_url', | ||||||
|     'id', |     'id', | ||||||
| ] | ] | ||||||
| search_url = ( | """`Fields selection`_, by default, a few fields are returned. To request more | ||||||
|     'https://api.dailymotion.com/videos?' | specific fields, the ``fields`` parameter is used with the list of fields | ||||||
|     'fields={fields}&password_protected={password_protected}&private={private}&sort={sort}&limit={limit}' | SearXNG needs in the response to build a video result list. | ||||||
| ).format( | """ | ||||||
|     fields=','.join(result_fields), | 
 | ||||||
|     password_protected='false', | search_url = 'https://api.dailymotion.com/videos?' | ||||||
|     private='false', | """URL to retrieve a list of videos. | ||||||
|     sort='relevance', | 
 | ||||||
|     limit=number_of_results, | - `REST GET`_ | ||||||
| ) | - `Global API Parameters`_ | ||||||
|  | - `Video filters API`_ | ||||||
|  | """ | ||||||
|  | 
 | ||||||
| iframe_src = "https://www.dailymotion.com/embed/video/{video_id}" | iframe_src = "https://www.dailymotion.com/embed/video/{video_id}" | ||||||
| 
 | """URL template to embed video in SearXNG's result list.""" | ||||||
| # The request query filters by 'languages' & 'country', therefore instead of |  | ||||||
| # fetching only languages we need to fetch locales. |  | ||||||
| supported_languages_url = 'https://api.dailymotion.com/locales' |  | ||||||
| supported_languages_iso639: Set[str] = set() |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def init(_engine_settings): |  | ||||||
|     global supported_languages_iso639 |  | ||||||
|     supported_languages_iso639 = set([language.split('_')[0] for language in supported_languages]) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
| @ -81,34 +106,42 @@ def request(query, params): | |||||||
|     if not query: |     if not query: | ||||||
|         return False |         return False | ||||||
| 
 | 
 | ||||||
|     language = params['language'] |     eng_region = traits.get_region(params['searxng_locale'], 'en_US') | ||||||
|     if language == 'all': |     eng_lang = traits.get_language(params['searxng_locale'], 'en') | ||||||
|         language = 'en-US' |  | ||||||
|     locale = babel.Locale.parse(language, sep='-') |  | ||||||
| 
 | 
 | ||||||
|     language_iso639 = locale.language |     args = { | ||||||
|     if locale.language not in supported_languages_iso639: |  | ||||||
|         language_iso639 = 'en' |  | ||||||
| 
 |  | ||||||
|     query_args = { |  | ||||||
|         'search': query, |         'search': query, | ||||||
|         'languages': language_iso639, |         'family_filter': family_filter_map.get(params['safesearch'], 'false'), | ||||||
|  |         'thumbnail_ratio': 'original',  # original|widescreen|square | ||||||
|  |         # https://developers.dailymotion.com/api/#video-filters | ||||||
|  |         'languages': eng_lang, | ||||||
|         'page': params['pageno'], |         'page': params['pageno'], | ||||||
|  |         'password_protected': 'false', | ||||||
|  |         'private': 'false', | ||||||
|  |         'sort': 'relevance', | ||||||
|  |         'limit': number_of_results, | ||||||
|  |         'fields': ','.join(result_fields), | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     if locale.territory: |     args.update(safesearch_params.get(params['safesearch'], {})) | ||||||
|         localization = locale.language + '_' + locale.territory | 
 | ||||||
|         if localization in supported_languages: |     # Don't add localization and country arguments if the user selected only a | ||||||
|             query_args['country'] = locale.territory |     # language (:de, :en, ..) and not a region | ||||||
|  | 
 | ||||||
|  |     if len(params['searxng_locale'].split('-')) > 1: | ||||||
|  |         # https://developers.dailymotion.com/api/#global-parameters | ||||||
|  |         args['localization'] = eng_region | ||||||
|  |         args['country'] = eng_region.split('_')[1] | ||||||
|  |         # Insufficient rights for the `ams_country' parameter of route `GET /videos' | ||||||
|  |         # 'ams_country': eng_region.split('_')[1], | ||||||
| 
 | 
 | ||||||
|     time_delta = time_delta_dict.get(params["time_range"]) |     time_delta = time_delta_dict.get(params["time_range"]) | ||||||
|     if time_delta: |     if time_delta: | ||||||
|         created_after = datetime.now() - time_delta |         created_after = datetime.now() - time_delta | ||||||
|         query_args['created_after'] = datetime.timestamp(created_after) |         args['created_after'] = datetime.timestamp(created_after) | ||||||
| 
 | 
 | ||||||
|     query_str = urlencode(query_args) |     query_str = urlencode(args) | ||||||
|     params['url'] = search_url + '&' + query_str + safesearch_params.get(params['safesearch'], '') |     params['url'] = search_url + query_str | ||||||
|     params['raise_for_httperror'] = False |  | ||||||
| 
 | 
 | ||||||
|     return params |     return params | ||||||
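As a worked example (query and locale assumed, field order and elided values illustrative): for a ``de-DE`` search the request above yields roughly:

.. code:: python

   # searxng_locale 'de-DE' --> languages=de, localization=de_DE, country=DE
   # https://api.dailymotion.com/videos?search=foo&family_filter=false
   #     &thumbnail_ratio=original&languages=de&page=1&password_protected=false
   #     &private=false&sort=relevance&limit=<number_of_results>&fields=<result_fields>
   #     &localization=de_DE&country=DE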
| 
 | 
 | ||||||
| @ -123,7 +156,7 @@ def response(resp): | |||||||
|     if 'error' in search_res: |     if 'error' in search_res: | ||||||
|         raise SearxEngineAPIException(search_res['error'].get('message')) |         raise SearxEngineAPIException(search_res['error'].get('message')) | ||||||
| 
 | 
 | ||||||
|     raise_for_httperror(resp) |     network.raise_for_httperror(resp) | ||||||
| 
 | 
 | ||||||
|     # parse results |     # parse results | ||||||
|     for res in search_res.get('list', []): |     for res in search_res.get('list', []): | ||||||
| @ -167,7 +200,53 @@ def response(resp): | |||||||
|     return results |     return results | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # get supported languages from their site | def fetch_traits(engine_traits: EngineTraits): | ||||||
| def _fetch_supported_languages(resp): |     """Fetch locales & languages from dailymotion. | ||||||
|     response_json = resp.json() | 
 | ||||||
|     return [item['locale'] for item in response_json['list']] |     Locales fetched from `api/locales <https://api.dailymotion.com/locales>`_. | ||||||
|  |     There are duplications in the locale codes returned from Dailymotion which | ||||||
|  |     can be ignored:: | ||||||
|  | 
 | ||||||
|  |       en_EN --> en_GB, en_US | ||||||
|  |       ar_AA --> ar_EG, ar_AE, ar_SA | ||||||
|  | 
 | ||||||
|  |     The language list `api/languages <https://api.dailymotion.com/languages>`_ | ||||||
|  |     contains over 7000 *language* codes (see PR1071_).  We use only those | ||||||
|  |     language codes that are used in the locales. | ||||||
|  | 
 | ||||||
|  |     .. _PR1071: https://github.com/searxng/searxng/pull/1071 | ||||||
|  | 
 | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     resp = network.get('https://api.dailymotion.com/locales') | ||||||
|  |     if not resp.ok: | ||||||
|  |         print("ERROR: response from dailymotion/locales is not OK.") | ||||||
|  | 
 | ||||||
|  |     for item in resp.json()['list']: | ||||||
|  |         eng_tag = item['locale'] | ||||||
|  |         if eng_tag in ('en_EN', 'ar_AA'): | ||||||
|  |             continue | ||||||
|  |         try: | ||||||
|  |             sxng_tag = region_tag(babel.Locale.parse(eng_tag)) | ||||||
|  |         except babel.UnknownLocaleError: | ||||||
|  |             print("ERROR: item unknown --> %s" % item) | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         conflict = engine_traits.regions.get(sxng_tag) | ||||||
|  |         if conflict: | ||||||
|  |             if conflict != eng_tag: | ||||||
|  |                 print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) | ||||||
|  |             continue | ||||||
|  |         engine_traits.regions[sxng_tag] = eng_tag | ||||||
|  | 
 | ||||||
|  |     locale_lang_list = [x.split('_')[0] for x in engine_traits.regions.values()] | ||||||
|  | 
 | ||||||
|  |     resp = network.get('https://api.dailymotion.com/languages') | ||||||
|  |     if not resp.ok: | ||||||
|  |         print("ERROR: response from dailymotion/languages is not OK.") | ||||||
|  | 
 | ||||||
|  |     for item in resp.json()['list']: | ||||||
|  |         eng_tag = item['code'] | ||||||
|  |         if eng_tag in locale_lang_list: | ||||||
|  |             sxng_tag = language_tag(babel.Locale.parse(eng_tag)) | ||||||
|  |             engine_traits.languages[sxng_tag] = eng_tag | ||||||
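The net effect (entries assumed for illustration) is a region table keyed by SearXNG tags and a language table restricted to languages that actually occur in one of the locales:

.. code:: python

   # engine_traits.regions['fr-CA']  == 'fr_CA'
   # engine_traits.languages['fr']   == 'fr'
   # 'en_EN' / 'ar_AA' are skipped; the 7000+ exotic codes from
   # api/languages never enter the table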
|  | |||||||
| @ -63,7 +63,7 @@ def search(query, request_params): | |||||||
|     for row in result_list: |     for row in result_list: | ||||||
|         entry = { |         entry = { | ||||||
|             'query': query, |             'query': query, | ||||||
|             'language': request_params['language'], |             'language': request_params['searxng_locale'], | ||||||
|             'value': row.get("value"), |             'value': row.get("value"), | ||||||
|             # choose a result template or comment out to use the *default* |             # choose a result template or comment out to use the *default* | ||||||
|             'template': 'key-value.html', |             'template': 'key-value.html', | ||||||
|  | |||||||
| @ -1,71 +1,207 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """DuckDuckGo Lite | """ | ||||||
|  | DuckDuckGo Lite | ||||||
|  | ~~~~~~~~~~~~~~~ | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| from json import loads | from typing import TYPE_CHECKING | ||||||
| 
 | from urllib.parse import urlencode | ||||||
| from lxml.html import fromstring | import json | ||||||
|  | import babel | ||||||
|  | import lxml.html | ||||||
| 
 | 
 | ||||||
|  | from searx import ( | ||||||
|  |     network, | ||||||
|  |     locales, | ||||||
|  |     redislib, | ||||||
|  | ) | ||||||
|  | from searx import redisdb | ||||||
| from searx.utils import ( | from searx.utils import ( | ||||||
|     dict_subset, |  | ||||||
|     eval_xpath, |     eval_xpath, | ||||||
|     eval_xpath_getindex, |     eval_xpath_getindex, | ||||||
|     extract_text, |     extract_text, | ||||||
|     match_language, |  | ||||||
| ) | ) | ||||||
| from searx.network import get | from searx.enginelib.traits import EngineTraits | ||||||
|  | from searx.exceptions import SearxEngineAPIException | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
| 
 | 
 | ||||||
| # about |  | ||||||
| about = { | about = { | ||||||
|     "website": 'https://lite.duckduckgo.com/lite/', |     "website": 'https://lite.duckduckgo.com/lite/', | ||||||
|     "wikidata_id": 'Q12805', |     "wikidata_id": 'Q12805', | ||||||
|     "official_api_documentation": 'https://duckduckgo.com/api', |  | ||||||
|     "use_official_api": False, |     "use_official_api": False, | ||||||
|     "require_api_key": False, |     "require_api_key": False, | ||||||
|     "results": 'HTML', |     "results": 'HTML', | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | send_accept_language_header = True | ||||||
|  | """DuckDuckGo-Lite tries to guess user's prefered language from the HTTP | ||||||
|  | ``Accept-Language``.  Optional the user can select a region filter (but not a | ||||||
|  | language). | ||||||
|  | """ | ||||||
|  | 
 | ||||||
| # engine dependent config | # engine dependent config | ||||||
| categories = ['general', 'web'] | categories = ['general', 'web'] | ||||||
| paging = True | paging = True | ||||||
| supported_languages_url = 'https://duckduckgo.com/util/u588.js' |  | ||||||
| time_range_support = True | time_range_support = True | ||||||
| send_accept_language_header = True | safesearch = True  # the user can't select it, but the results are filtered | ||||||
| 
 | 
 | ||||||
| language_aliases = { | url = 'https://lite.duckduckgo.com/lite/' | ||||||
|     'ar-SA': 'ar-XA', | # url_ping = 'https://duckduckgo.com/t/sl_l' | ||||||
|     'es-419': 'es-XL', |  | ||||||
|     'ja': 'jp-JP', |  | ||||||
|     'ko': 'kr-KR', |  | ||||||
|     'sl-SI': 'sl-SL', |  | ||||||
|     'zh-TW': 'tzh-TW', |  | ||||||
|     'zh-HK': 'tzh-HK', |  | ||||||
| } |  | ||||||
| 
 | 
 | ||||||
| time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} | time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} | ||||||
|  | form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'} | ||||||
| 
 | 
 | ||||||
| # search-url |  | ||||||
| url = 'https://lite.duckduckgo.com/lite/' |  | ||||||
| url_ping = 'https://duckduckgo.com/t/sl_l' |  | ||||||
| 
 | 
 | ||||||
| # match query's language to a region code that duckduckgo will accept | def cache_vqd(query, value): | ||||||
| def get_region_code(lang, lang_list=None): |     """Caches a ``vqd`` value for a query. | ||||||
|     if lang == 'all': |  | ||||||
|         return None |  | ||||||
| 
 | 
 | ||||||
|     lang_code = match_language(lang, lang_list or [], language_aliases, 'wt-WT') |     The vqd value depends on the query string and is needed for the follow-up | ||||||
|     lang_parts = lang_code.split('-') |     pages or the images loaded by an XMLHttpRequest: | ||||||
| 
 | 
 | ||||||
|     # country code goes first |     - DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...` | ||||||
|     return lang_parts[1].lower() + '-' + lang_parts[0].lower() |     - DuckDuckGo Images: `https://duckduckgo.com/i.js?q=...&vqd=...` | ||||||
|  | 
 | ||||||
|  |     """ | ||||||
|  |     c = redisdb.client() | ||||||
|  |     if c: | ||||||
|  |         logger.debug("cache vqd value: %s", value) | ||||||
|  |         key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) | ||||||
|  |         c.set(key, value, ex=600) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def get_vqd(query, headers): | ||||||
|  |     """Returns the ``vqd`` that fits to the *query*.  If there is no ``vqd`` cached | ||||||
|  |     (:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the | ||||||
|  |     response. | ||||||
|  | 
 | ||||||
|  |     """ | ||||||
|  |     value = None | ||||||
|  |     c = redisdb.client() | ||||||
|  |     if c: | ||||||
|  |         key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) | ||||||
|  |         value = c.get(key) | ||||||
|  |         if value: | ||||||
|  |             value = value.decode('utf-8') | ||||||
|  |             logger.debug("re-use cached vqd value: %s", value) | ||||||
|  |             return value | ||||||
|  | 
 | ||||||
|  |     query_url = 'https://duckduckgo.com/?{query}&iar=images'.format(query=urlencode({'q': query})) | ||||||
|  |     res = network.get(query_url, headers=headers) | ||||||
|  |     content = res.text | ||||||
|  |     if content.find('vqd=\'') == -1: | ||||||
|  |         raise SearxEngineAPIException('Request failed') | ||||||
|  |     value = content[content.find('vqd=\'') + 5 :] | ||||||
|  |     value = value[: value.find('\'')] | ||||||
|  |     logger.debug("new vqd value: %s", value) | ||||||
|  |     cache_vqd(query, value) | ||||||
|  |     return value | ||||||
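Taken together, a follow-up request only pays the extra round trip when the cache has no entry yet; a minimal usage sketch (query and headers assumed):

.. code:: python

   headers = {'User-Agent': 'Mozilla/5.0'}   # assumed example headers
   vqd = get_vqd('searxng', headers)  # first call: network round trip to DDG
   vqd = get_vqd('searxng', headers)  # served from cache (TTL 600s), if a
                                      # Redis DB is configured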
|  | 
 | ||||||
|  | 
 | ||||||
|  | def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'): | ||||||
|  |     """Get DuckDuckGo's language identifier from SearXNG's locale. | ||||||
|  | 
 | ||||||
|  |     DuckDuckGo defines its languages by region codes (see | ||||||
|  |     :py:obj:`fetch_traits`). | ||||||
|  | 
 | ||||||
|  |     To get region and language of a DDG service use: | ||||||
|  | 
 | ||||||
|  |     .. code:: python | ||||||
|  | 
 | ||||||
|  |        eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) | ||||||
|  |        eng_lang = get_ddg_lang(traits, params['searxng_locale']) | ||||||
|  | 
 | ||||||
|  |     It may be confusing, but the ``l`` value of the cookie is what SearXNG calls | ||||||
|  |     the *region*: | ||||||
|  | 
 | ||||||
|  |     .. code:: python | ||||||
|  | 
 | ||||||
|  |         # !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'} | ||||||
|  |         params['cookies']['ad'] = eng_lang | ||||||
|  |         params['cookies']['ah'] = eng_region | ||||||
|  |         params['cookies']['l'] = eng_region | ||||||
|  | 
 | ||||||
|  |     .. hint:: | ||||||
|  | 
 | ||||||
|  |        `DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language | ||||||
|  |        selection to the user; only a region can be selected | ||||||
|  |        (``eng_region`` from the example above).  DDG-lite stores the selected | ||||||
|  |        region in a cookie:: | ||||||
|  | 
 | ||||||
|  |          params['cookies']['kl'] = eng_region  # 'ar-es' | ||||||
|  | 
 | ||||||
|  |     """ | ||||||
|  |     return eng_traits.custom['lang_region'].get(sxng_locale, eng_traits.get_language(sxng_locale, default)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ddg_reg_map = { | ||||||
|  |     'tw-tzh': 'zh_TW', | ||||||
|  |     'hk-tzh': 'zh_HK', | ||||||
|  |     'ct-ca': 'skip',  # ct-ca and es-ca both map to ca_ES | ||||||
|  |     'es-ca': 'ca_ES', | ||||||
|  |     'id-en': 'id_ID', | ||||||
|  |     'no-no': 'nb_NO', | ||||||
|  |     'jp-jp': 'ja_JP', | ||||||
|  |     'kr-kr': 'ko_KR', | ||||||
|  |     'xa-ar': 'ar_SA', | ||||||
|  |     'sl-sl': 'sl_SI', | ||||||
|  |     'th-en': 'th_TH', | ||||||
|  |     'vn-en': 'vi_VN', | ||||||
|  | } | ||||||
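The map above only lists the exceptions; the regular case is a plain swap of DuckDuckGo's ``<territory>-<language>`` order into babel's ``<language>_<TERRITORY>``. A hypothetical helper mirroring the logic inside :py:obj:`fetch_traits` below:

.. code:: python

   def ddg_region_to_babel(eng_tag):
       """Hypothetical: 'tw-tzh' --> 'zh_TW' (exception),
       'de-de' --> 'de_DE' (regular case)."""
       region = ddg_reg_map.get(eng_tag)
       if region:
           return region  # may also be the sentinel 'skip'
       territory, lang = eng_tag.split('-')
       return lang + '_' + territory.upper()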
|  | 
 | ||||||
|  | ddg_lang_map = { | ||||||
|  |     # use ar --> ar_EG (Egypt's arabic) | ||||||
|  |     "ar_DZ": 'lang_region', | ||||||
|  |     "ar_JO": 'lang_region', | ||||||
|  |     "ar_SA": 'lang_region', | ||||||
|  |     # use bn --> bn_BD | ||||||
|  |     'bn_IN': 'lang_region', | ||||||
|  |     # use de --> de_DE | ||||||
|  |     'de_CH': 'lang_region', | ||||||
|  |     # use en --> en_US, | ||||||
|  |     'en_AU': 'lang_region', | ||||||
|  |     'en_CA': 'lang_region', | ||||||
|  |     'en_GB': 'lang_region', | ||||||
|  |     # Esperanto | ||||||
|  |     'eo_XX': 'eo', | ||||||
|  |     # use es --> es_ES, | ||||||
|  |     'es_AR': 'lang_region', | ||||||
|  |     'es_CL': 'lang_region', | ||||||
|  |     'es_CO': 'lang_region', | ||||||
|  |     'es_CR': 'lang_region', | ||||||
|  |     'es_EC': 'lang_region', | ||||||
|  |     'es_MX': 'lang_region', | ||||||
|  |     'es_PE': 'lang_region', | ||||||
|  |     'es_UY': 'lang_region', | ||||||
|  |     'es_VE': 'lang_region', | ||||||
|  |     # use fr --> fr_FR | ||||||
|  |     'fr_CA': 'lang_region', | ||||||
|  |     'fr_CH': 'lang_region', | ||||||
|  |     'fr_BE': 'lang_region', | ||||||
|  |     # use nl --> nl_NL | ||||||
|  |     'nl_BE': 'lang_region', | ||||||
|  |     # use pt --> pt_PT | ||||||
|  |     'pt_BR': 'lang_region', | ||||||
|  |     # skip these languages | ||||||
|  |     'od_IN': 'skip', | ||||||
|  |     'io_XX': 'skip', | ||||||
|  |     'tokipona_XX': 'skip', | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
| 
 | 
 | ||||||
|  |     eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) | ||||||
|  |     # eng_lang = get_ddg_lang(traits, params['searxng_locale']) | ||||||
|  | 
 | ||||||
|     params['url'] = url |     params['url'] = url | ||||||
|     params['method'] = 'POST' |     params['method'] = 'POST' | ||||||
| 
 |  | ||||||
|     params['data']['q'] = query |     params['data']['q'] = query | ||||||
| 
 | 
 | ||||||
|     # The API is not documented, so we do some reverse engineering and emulate |     # The API is not documented, so we do some reverse engineering and emulate | ||||||
| @ -88,23 +224,19 @@ def request(query, params): | |||||||
|         params['data']['s'] = offset |         params['data']['s'] = offset | ||||||
|         params['data']['dc'] = offset + 1 |         params['data']['dc'] = offset + 1 | ||||||
| 
 | 
 | ||||||
|  |     # request needs a vqd argument | ||||||
|  |     params['data']['vqd'] = get_vqd(query, params["headers"]) | ||||||
|  | 
 | ||||||
|     # initial page does not have additional data in the input form |     # initial page does not have additional data in the input form | ||||||
|     if params['pageno'] > 1: |     if params['pageno'] > 1: | ||||||
|         # request the second page (and more pages) needs 'o' and 'api' arguments |  | ||||||
|         params['data']['o'] = 'json' |  | ||||||
|         params['data']['api'] = 'd.js' |  | ||||||
| 
 | 
 | ||||||
|     # initial page does not have additional data in the input form |         params['data']['o'] = form_data.get('o', 'json') | ||||||
|     if params['pageno'] > 2: |         params['data']['api'] = form_data.get('api', 'd.js') | ||||||
|         # request the third page (and more pages) some more arguments |         params['data']['nextParams'] = form_data.get('nextParams', '') | ||||||
|         params['data']['nextParams'] = '' |         params['data']['v'] = form_data.get('v', 'l') | ||||||
|         params['data']['v'] = '' |  | ||||||
|         params['data']['vqd'] = '' |  | ||||||
| 
 | 
 | ||||||
|     region_code = get_region_code(params['language'], supported_languages) |     params['data']['kl'] = eng_region | ||||||
|     if region_code: |     params['cookies']['kl'] = eng_region | ||||||
|         params['data']['kl'] = region_code |  | ||||||
|         params['cookies']['kl'] = region_code |  | ||||||
| 
 | 
 | ||||||
|     params['data']['df'] = '' |     params['data']['df'] = '' | ||||||
|     if params['time_range'] in time_range_dict: |     if params['time_range'] in time_range_dict: | ||||||
| @ -116,26 +248,40 @@ def request(query, params): | |||||||
|     return params |     return params | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # get response from search-request |  | ||||||
| def response(resp): | def response(resp): | ||||||
| 
 | 
 | ||||||
|     headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie']) |  | ||||||
|     get(url_ping, headers=headers_ping) |  | ||||||
| 
 |  | ||||||
|     if resp.status_code == 303: |     if resp.status_code == 303: | ||||||
|         return [] |         return [] | ||||||
| 
 | 
 | ||||||
|     results = [] |     results = [] | ||||||
|     doc = fromstring(resp.text) |     doc = lxml.html.fromstring(resp.text) | ||||||
| 
 | 
 | ||||||
|     result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') |     result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') | ||||||
|     if not len(result_table) >= 3: | 
 | ||||||
|  |     if len(result_table) == 2: | ||||||
|  |         # some locales (at least China) do not have a "next page" button and | ||||||
|  |         # the layout of the HTML tables is different. | ||||||
|  |         result_table = result_table[1] | ||||||
|  |     elif not len(result_table) >= 3: | ||||||
|         # no more results |         # no more results | ||||||
|         return [] |         return [] | ||||||
|  |     else: | ||||||
|         result_table = result_table[2] |         result_table = result_table[2] | ||||||
|  |         # update form data from response | ||||||
|  |         form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..') | ||||||
|  |         if len(form): | ||||||
|  | 
 | ||||||
|  |             form = form[0] | ||||||
|  |             form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0] | ||||||
|  |             form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0] | ||||||
|  |             form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0] | ||||||
|  |             logger.debug('form_data: %s', form_data) | ||||||
|  | 
 | ||||||
|  |             value = eval_xpath(form, '//input[@name="vqd"]/@value')[0] | ||||||
|  |             query = resp.search_params['data']['q'] | ||||||
|  |             cache_vqd(query, value) | ||||||
| 
 | 
 | ||||||
|     tr_rows = eval_xpath(result_table, './/tr') |     tr_rows = eval_xpath(result_table, './/tr') | ||||||
| 
 |  | ||||||
|     # In the last <tr> is the form of the 'previous/next page' links |     # In the last <tr> is the form of the 'previous/next page' links | ||||||
|     tr_rows = tr_rows[:-1] |     tr_rows = tr_rows[:-1] | ||||||
| 
 | 
 | ||||||
| @ -172,15 +318,105 @@ def response(resp): | |||||||
|     return results |     return results | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # get supported languages from their site | def fetch_traits(engine_traits: EngineTraits): | ||||||
| def _fetch_supported_languages(resp): |     """Fetch languages & regions from DuckDuckGo. | ||||||
| 
 | 
 | ||||||
|     # response is a js file with regions as an embedded object |     SearXNG's ``all`` locale maps DuckDuckGo's "All regions" (``wt-wt``). | ||||||
|     response_page = resp.text |     DuckDuckGo's language "Browser's preferred language" (``wt_WT``) makes no | ||||||
|     response_page = response_page[response_page.find('regions:{') + 8 :] |     sense in a SearXNG request since SearXNG's ``all`` will not add an | ||||||
|     response_page = response_page[: response_page.find('}') + 1] |     ``Accept-Language`` HTTP header.  The value in ``engine_traits.all_locale`` | ||||||
|  |     is ``wt-wt`` (the region). | ||||||
| 
 | 
 | ||||||
|     regions_json = loads(response_page) |     Besides regions, DuckDuckGo also defines its languages by region codes.  For | ||||||
|     supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) |     example, these are the English languages in DuckDuckGo: | ||||||
| 
 | 
 | ||||||
|     return list(supported_languages) |     - en_US | ||||||
|  |     - en_AU | ||||||
|  |     - en_CA | ||||||
|  |     - en_GB | ||||||
|  | 
 | ||||||
|  |     The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from | ||||||
|  |     SearXNG's locale. | ||||||
|  | 
 | ||||||
|  |     """ | ||||||
|  |     # pylint: disable=too-many-branches, too-many-statements | ||||||
|  |     # fetch regions | ||||||
|  | 
 | ||||||
|  |     engine_traits.all_locale = 'wt-wt' | ||||||
|  | 
 | ||||||
|  |     # updated from u588 to u661 / should be updated automatically? | ||||||
|  |     resp = network.get('https://duckduckgo.com/util/u661.js') | ||||||
|  | 
 | ||||||
|  |     if not resp.ok: | ||||||
|  |         print("ERROR: response from DuckDuckGo is not OK.") | ||||||
|  | 
 | ||||||
|  |     pos = resp.text.find('regions:{') + 8 | ||||||
|  |     js_code = resp.text[pos:] | ||||||
|  |     pos = js_code.find('}') + 1 | ||||||
|  |     regions = json.loads(js_code[:pos]) | ||||||
|  | 
 | ||||||
|  |     for eng_tag, name in regions.items(): | ||||||
|  | 
 | ||||||
|  |         if eng_tag == 'wt-wt': | ||||||
|  |             engine_traits.all_locale = 'wt-wt' | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         region = ddg_reg_map.get(eng_tag) | ||||||
|  |         if region == 'skip': | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         if not region: | ||||||
|  |             eng_territory, eng_lang = eng_tag.split('-') | ||||||
|  |             region = eng_lang + '_' + eng_territory.upper() | ||||||
|  | 
 | ||||||
|  |         try: | ||||||
|  |             sxng_tag = locales.region_tag(babel.Locale.parse(region)) | ||||||
|  |         except babel.UnknownLocaleError: | ||||||
|  |             print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region)) | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         conflict = engine_traits.regions.get(sxng_tag) | ||||||
|  |         if conflict: | ||||||
|  |             if conflict != eng_tag: | ||||||
|  |                 print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) | ||||||
|  |             continue | ||||||
|  |         engine_traits.regions[sxng_tag] = eng_tag | ||||||
|  | 
 | ||||||
|  |     # fetch languages | ||||||
|  | 
 | ||||||
|  |     engine_traits.custom['lang_region'] = {} | ||||||
|  | 
 | ||||||
|  |     pos = resp.text.find('languages:{') + 10 | ||||||
|  |     js_code = resp.text[pos:] | ||||||
|  |     pos = js_code.find('}') + 1 | ||||||
|  |     js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"') | ||||||
|  |     languages = json.loads(js_code) | ||||||
|  | 
 | ||||||
|  |     for eng_lang, name in languages.items(): | ||||||
|  | 
 | ||||||
|  |         if eng_lang == 'wt_WT': | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         babel_tag = ddg_lang_map.get(eng_lang, eng_lang) | ||||||
|  |         if babel_tag == 'skip': | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         try: | ||||||
|  | 
 | ||||||
|  |             if babel_tag == 'lang_region': | ||||||
|  |                 sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang)) | ||||||
|  |                 engine_traits.custom['lang_region'][sxng_tag] = eng_lang | ||||||
|  |                 continue | ||||||
|  | 
 | ||||||
|  |             sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag)) | ||||||
|  | 
 | ||||||
|  |         except babel.UnknownLocaleError: | ||||||
|  |             print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang)) | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         conflict = engine_traits.languages.get(sxng_tag) | ||||||
|  |         if conflict: | ||||||
|  |             if conflict != eng_lang: | ||||||
|  |                 print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang)) | ||||||
|  |             continue | ||||||
|  |         engine_traits.languages[sxng_tag] = eng_lang | ||||||
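The ``languages:{...}`` object in the fetched JS file uses unquoted keys, so it is not valid JSON; the string surgery above turns it into JSON before parsing. A worked example (input shortened, and assuming the names contain neither colons nor commas):

.. code:: python

   js = 'ar_DZ:"Arabic (Algeria)",de_DE:"German"}'   # js_code[1:pos], '{' stripped
   json_text = '{"' + js.replace(':', '":').replace(',', ',"')
   # -> '{"ar_DZ":"Arabic (Algeria)","de_DE":"German"}'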
|  | |||||||
| @ -1,22 +1,33 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """DuckDuckGo (Instant Answer API) | """ | ||||||
|  | DuckDuckGo Instant Answer API | ||||||
|  | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||||
|  | 
 | ||||||
|  | The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented but from | ||||||
|  | reverse engineering we can see that some services (e.g. instant answers) are | ||||||
|  | still in use by the DDG search engine. | ||||||
|  | 
 | ||||||
|  | As far as we can tell, the *instant answers* API does not support languages, or | ||||||
|  | at least we could not find out how language support is supposed to work.  It seems that | ||||||
|  | most of the features are based on English terms. | ||||||
| 
 | 
 | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| import json | from typing import TYPE_CHECKING | ||||||
|  | 
 | ||||||
| from urllib.parse import urlencode, urlparse, urljoin | from urllib.parse import urlencode, urlparse, urljoin | ||||||
| from lxml import html | from lxml import html | ||||||
| 
 | 
 | ||||||
| from searx.data import WIKIDATA_UNITS | from searx.data import WIKIDATA_UNITS | ||||||
| from searx.engines.duckduckgo import language_aliases | from searx.utils import extract_text, html_to_text, get_string_replaces_function | ||||||
| from searx.engines.duckduckgo import (  # pylint: disable=unused-import |  | ||||||
|     _fetch_supported_languages, |  | ||||||
|     supported_languages_url, |  | ||||||
| ) |  | ||||||
| from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function |  | ||||||
| from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom | from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom | ||||||
| 
 | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
|     "website": 'https://duckduckgo.com/', |     "website": 'https://duckduckgo.com/', | ||||||
| @ -37,7 +48,7 @@ replace_http_by_https = get_string_replaces_function({'http:': 'https:'}) | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def is_broken_text(text): | def is_broken_text(text): | ||||||
|     """duckduckgo may return something like "<a href="xxxx">http://somewhere Related website<a/>" |     """duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>`` | ||||||
| 
 | 
 | ||||||
|     The href URL is broken, the "Related website" may contain some HTML. |     The href URL is broken, the "Related website" may contain some HTML. | ||||||
| 
 | 
 | ||||||
| @ -62,8 +73,6 @@ def result_to_text(text, htmlResult): | |||||||
| 
 | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     params['url'] = URL.format(query=urlencode({'q': query})) |     params['url'] = URL.format(query=urlencode({'q': query})) | ||||||
|     language = match_language(params['language'], supported_languages, language_aliases) |  | ||||||
|     language = language.split('-')[0] |  | ||||||
|     return params |     return params | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -71,7 +80,7 @@ def response(resp): | |||||||
|     # pylint: disable=too-many-locals, too-many-branches, too-many-statements |     # pylint: disable=too-many-locals, too-many-branches, too-many-statements | ||||||
|     results = [] |     results = [] | ||||||
| 
 | 
 | ||||||
|     search_res = json.loads(resp.text) |     search_res = resp.json() | ||||||
| 
 | 
 | ||||||
|     # search_res.get('Entity') possible values (not exhaustive) : |     # search_res.get('Entity') possible values (not exhaustive) : | ||||||
|     # * continent / country / department / location / waterfall |     # * continent / country / department / location / waterfall | ||||||
| @ -235,7 +244,7 @@ def unit_to_str(unit): | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def area_to_str(area): | def area_to_str(area): | ||||||
|     """parse {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}""" |     """parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``""" | ||||||
|     unit = unit_to_str(area.get('unit')) |     unit = unit_to_str(area.get('unit')) | ||||||
|     if unit is not None: |     if unit is not None: | ||||||
|         try: |         try: | ||||||
|  | |||||||
| @ -1,26 +1,30 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| """ | """ | ||||||
|  DuckDuckGo (Images) | DuckDuckGo Images | ||||||
|  | ~~~~~~~~~~~~~~~~~ | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| from json import loads | from typing import TYPE_CHECKING | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from searx.exceptions import SearxEngineAPIException | 
 | ||||||
| from searx.engines.duckduckgo import get_region_code | from searx.engines.duckduckgo import fetch_traits  # pylint: disable=unused-import | ||||||
| from searx.engines.duckduckgo import (  # pylint: disable=unused-import | from searx.engines.duckduckgo import ( | ||||||
|     _fetch_supported_languages, |     get_ddg_lang, | ||||||
|     supported_languages_url, |     get_vqd, | ||||||
| ) | ) | ||||||
| from searx.network import get | from searx.enginelib.traits import EngineTraits | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
|     "website": 'https://duckduckgo.com/', |     "website": 'https://duckduckgo.com/', | ||||||
|     "wikidata_id": 'Q12805', |     "wikidata_id": 'Q12805', | ||||||
|     "official_api_documentation": { |  | ||||||
|         'url': 'https://duckduckgo.com/api', |  | ||||||
|         'comment': 'but images are not supported', |  | ||||||
|     }, |  | ||||||
|     "use_official_api": False, |     "use_official_api": False, | ||||||
|     "require_api_key": False, |     "require_api_key": False, | ||||||
|     "results": 'JSON (site requires js to get images)', |     "results": 'JSON (site requires js to get images)', | ||||||
| @ -32,70 +36,64 @@ paging = True | |||||||
| safesearch = True | safesearch = True | ||||||
| send_accept_language_header = True | send_accept_language_header = True | ||||||
| 
 | 
 | ||||||
| # search-url | safesearch_cookies = {0: '-2', 1: None, 2: '1'} | ||||||
| images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}' | safesearch_args = {0: '1', 1: None, 2: '1'} | ||||||
| site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images' |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # run query in site to get vqd number needed for requesting images |  | ||||||
| # TODO: find a way to get this number without an extra request (is it a hash of the query?) |  | ||||||
| def get_vqd(query, headers): |  | ||||||
|     query_url = site_url.format(query=urlencode({'q': query})) |  | ||||||
|     res = get(query_url, headers=headers) |  | ||||||
|     content = res.text |  | ||||||
|     if content.find('vqd=\'') == -1: |  | ||||||
|         raise SearxEngineAPIException('Request failed') |  | ||||||
|     vqd = content[content.find('vqd=\'') + 5 :] |  | ||||||
|     vqd = vqd[: vqd.find('\'')] |  | ||||||
|     return vqd |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # do search-request |  | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     # to avoid running actual external requests when testing |  | ||||||
|     if 'is_test' not in params: |  | ||||||
|         vqd = get_vqd(query, params['headers']) |  | ||||||
|     else: |  | ||||||
|         vqd = '12345' |  | ||||||
| 
 | 
 | ||||||
|     offset = (params['pageno'] - 1) * 50 |     eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) | ||||||
|  |     eng_lang = get_ddg_lang(traits, params['searxng_locale']) | ||||||
| 
 | 
 | ||||||
|     safesearch = params['safesearch'] - 1 |     args = { | ||||||
|  |         'q': query, | ||||||
|  |         'o': 'json', | ||||||
|  |         # 'u': 'bing', | ||||||
|  |         'l': eng_region, | ||||||
|  |         'vqd': get_vqd(query, params["headers"]), | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     region_code = get_region_code(params['language'], lang_list=supported_languages) |     if params['pageno'] > 1: | ||||||
|     if region_code: |         args['s'] = (params['pageno'] - 1) * 100 | ||||||
|         params['url'] = images_url.format( | 
 | ||||||
|             query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd |     params['cookies']['ad'] = eng_lang  # zh_CN | ||||||
|         ) |     params['cookies']['ah'] = eng_region  # "us-en,de-de" | ||||||
|     else: |     params['cookies']['l'] = eng_region  # "hk-tzh" | ||||||
|         params['url'] = images_url.format(query=urlencode({'q': query}), offset=offset, safesearch=safesearch, vqd=vqd) |     logger.debug("cookies: %s", params['cookies']) | ||||||
|  | 
 | ||||||
|  |     safe_search = safesearch_cookies.get(params['safesearch']) | ||||||
|  |     if safe_search is not None: | ||||||
|  |         params['cookies']['p'] = safe_search  # "-2", "1" | ||||||
|  |     safe_search = safesearch_args.get(params['safesearch']) | ||||||
|  |     if safe_search is not None: | ||||||
|  |         args['p'] = safe_search  # "-1", "1" | ||||||
|  | 
 | ||||||
|  |     args = urlencode(args) | ||||||
|  |     params['url'] = 'https://duckduckgo.com/i.js?{args}&f={f}'.format(args=args, f=',,,,,') | ||||||
|  | 
 | ||||||
|  |     params['headers']['Accept'] = 'application/json, text/javascript, */*; q=0.01' | ||||||
|  |     params['headers']['Referer'] = 'https://duckduckgo.com/' | ||||||
|  |     params['headers']['X-Requested-With'] = 'XMLHttpRequest' | ||||||
|  |     logger.debug("headers: %s", params['headers']) | ||||||
| 
 | 
 | ||||||
|     return params |     return params | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # get response from search-request |  | ||||||
| def response(resp): | def response(resp): | ||||||
|     results = [] |     results = [] | ||||||
|  |     res_json = resp.json() | ||||||
| 
 | 
 | ||||||
|     content = resp.text |  | ||||||
|     res_json = loads(content) |  | ||||||
| 
 |  | ||||||
|     # parse results |  | ||||||
|     for result in res_json['results']: |     for result in res_json['results']: | ||||||
|         title = result['title'] |  | ||||||
|         url = result['url'] |  | ||||||
|         thumbnail = result['thumbnail'] |  | ||||||
|         image = result['image'] |  | ||||||
| 
 |  | ||||||
|         # append result |  | ||||||
|         results.append( |         results.append( | ||||||
|             { |             { | ||||||
|                 'template': 'images.html', |                 'template': 'images.html', | ||||||
|                 'title': title, |                 'title': result['title'], | ||||||
|                 'content': '', |                 'content': '', | ||||||
|                 'thumbnail_src': thumbnail, |                 'thumbnail_src': result['thumbnail'], | ||||||
|                 'img_src': image, |                 'img_src': result['image'], | ||||||
|                 'url': url, |                 'url': result['url'], | ||||||
|  |                 'img_format': '%s x %s' % (result['width'], result['height']), | ||||||
|  |                 'source': result['source'], | ||||||
|             } |             } | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -1,13 +1,29 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """DuckDuckGo Weather""" | """ | ||||||
|  | DuckDuckGo Weather | ||||||
|  | ~~~~~~~~~~~~~~~~~~ | ||||||
|  | """ | ||||||
| 
 | 
 | ||||||
|  | from typing import TYPE_CHECKING | ||||||
| from json import loads | from json import loads | ||||||
| from urllib.parse import quote | from urllib.parse import quote | ||||||
| 
 | 
 | ||||||
| from datetime import datetime | from datetime import datetime | ||||||
| from flask_babel import gettext | from flask_babel import gettext | ||||||
| 
 | 
 | ||||||
|  | from searx.engines.duckduckgo import fetch_traits  # pylint: disable=unused-import | ||||||
|  | from searx.engines.duckduckgo import get_ddg_lang | ||||||
|  | from searx.enginelib.traits import EngineTraits | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| about = { | about = { | ||||||
|     "website": 'https://duckduckgo.com/', |     "website": 'https://duckduckgo.com/', | ||||||
|     "wikidata_id": 'Q12805', |     "wikidata_id": 'Q12805', | ||||||
| @ -17,9 +33,11 @@ about = { | |||||||
|     "results": "JSON", |     "results": "JSON", | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| categories = ["others"] | send_accept_language_header = True | ||||||
| 
 | 
 | ||||||
| url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}" | # engine dependent config | ||||||
|  | categories = ["others"] | ||||||
|  | URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def generate_condition_table(condition): | def generate_condition_table(condition): | ||||||
| @ -72,8 +90,17 @@ def generate_day_table(day): | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     params["url"] = url.format(query=quote(query), lang=params['language'].split('-')[0]) |  | ||||||
| 
 | 
 | ||||||
|  |     eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) | ||||||
|  |     eng_lang = get_ddg_lang(traits, params['searxng_locale']) | ||||||
|  | 
 | ||||||
|  |     # !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'} | ||||||
|  |     params['cookies']['ad'] = eng_lang | ||||||
|  |     params['cookies']['ah'] = eng_region | ||||||
|  |     params['cookies']['l'] = eng_region | ||||||
|  |     logger.debug("cookies: %s", params['cookies']) | ||||||
|  | 
 | ||||||
|  |     params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0]) | ||||||
|     return params |     return params | ||||||
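For example (query and locale assumed): a user with the ``es-AR`` locale searching for *paris* gets ``eng_lang = 'es_AR'``, whose language part builds the URL:

.. code:: python

   # eng_lang 'es_AR' --> lang 'es'
   # https://duckduckgo.com/js/spice/forecast/paris/es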
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -25,6 +25,7 @@ base_url = 'https://wiki.gentoo.org' | |||||||
| # xpath queries | # xpath queries | ||||||
| xpath_results = '//ul[@class="mw-search-results"]/li' | xpath_results = '//ul[@class="mw-search-results"]/li' | ||||||
| xpath_link = './/div[@class="mw-search-result-heading"]/a' | xpath_link = './/div[@class="mw-search-result-heading"]/a' | ||||||
|  | xpath_content = './/div[@class="searchresult"]' | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # cut 'en' from 'en-US', 'de' from 'de-CH', and so on | # cut 'en' from 'en-US', 'de' from 'de-CH', and so on | ||||||
| @ -77,8 +78,6 @@ main_langs = { | |||||||
|     'uk': 'Українська', |     'uk': 'Українська', | ||||||
|     'zh': '简体中文', |     'zh': '简体中文', | ||||||
| } | } | ||||||
| supported_languages = dict(lang_urls, **main_langs) |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| # do search-request | # do search-request | ||||||
| def request(query, params): | def request(query, params): | ||||||
| @ -118,7 +117,8 @@ def response(resp): | |||||||
|         link = result.xpath(xpath_link)[0] |         link = result.xpath(xpath_link)[0] | ||||||
|         href = urljoin(base_url, link.attrib.get('href')) |         href = urljoin(base_url, link.attrib.get('href')) | ||||||
|         title = extract_text(link) |         title = extract_text(link) | ||||||
|  |         content = extract_text(result.xpath(xpath_content)) | ||||||
| 
 | 
 | ||||||
|         results.append({'url': href, 'title': title}) |         results.append({'url': href, 'title': title, 'content': content}) | ||||||
| 
 | 
 | ||||||
|     return results |     return results | ||||||
|  | |||||||
| @ -1,34 +1,39 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """This is the implementation of the google WEB engine.  Some of this | """This is the implementation of the Google WEB engine.  Some of this | ||||||
| implementations are shared by other engines: | implementations (manly the :py:obj:`get_google_info`) are shared by other | ||||||
|  | engines: | ||||||
| 
 | 
 | ||||||
| - :ref:`google images engine` | - :ref:`google images engine` | ||||||
| - :ref:`google news engine` | - :ref:`google news engine` | ||||||
| - :ref:`google videos engine` | - :ref:`google videos engine` | ||||||
| 
 | - :ref:`google scholar engine` | ||||||
| The google WEB engine itself has a special setup option: | - :ref:`google autocomplete` | ||||||
| 
 |  | ||||||
| .. code:: yaml |  | ||||||
| 
 |  | ||||||
|   - name: google |  | ||||||
|     ... |  | ||||||
|     use_mobile_ui: false |  | ||||||
| 
 |  | ||||||
| ``use_mobile_ui``: (default: ``false``) |  | ||||||
|   Enables to use *mobile endpoint* to bypass the google blocking (see |  | ||||||
|   :issue:`159`).  On the mobile UI of Google Search, the button :guilabel:`More |  | ||||||
|   results` is not affected by Google rate limiting and we can still do requests |  | ||||||
|   while actively blocked by the original Google search.  By activate |  | ||||||
|   ``use_mobile_ui`` this behavior is simulated by adding the parameter |  | ||||||
|   ``async=use_ac:true,_fmt:pc`` to the :py:func:`request`. |  | ||||||
| 
 | 
 | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
|  | from typing import TYPE_CHECKING | ||||||
|  | 
 | ||||||
|  | import re | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from lxml import html | from lxml import html | ||||||
| from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex | import babel | ||||||
|  | import babel.core | ||||||
|  | import babel.languages | ||||||
|  | 
 | ||||||
|  | from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex | ||||||
|  | from searx.locales import language_tag, region_tag, get_offical_locales | ||||||
|  | from searx import network | ||||||
| from searx.exceptions import SearxEngineCaptchaException | from searx.exceptions import SearxEngineCaptchaException | ||||||
|  | from searx.enginelib.traits import EngineTraits | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -45,64 +50,6 @@ categories = ['general', 'web'] | |||||||
| paging = True | paging = True | ||||||
| time_range_support = True | time_range_support = True | ||||||
| safesearch = True | safesearch = True | ||||||
| send_accept_language_header = True |  | ||||||
| use_mobile_ui = False |  | ||||||
| supported_languages_url = 'https://www.google.com/preferences?#languages' |  | ||||||
| 
 |  | ||||||
| # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests |  | ||||||
| google_domains = { |  | ||||||
|     'BG': 'google.bg',  # Bulgaria |  | ||||||
|     'CZ': 'google.cz',  # Czech Republic |  | ||||||
|     'DE': 'google.de',  # Germany |  | ||||||
|     'DK': 'google.dk',  # Denmark |  | ||||||
|     'AT': 'google.at',  # Austria |  | ||||||
|     'CH': 'google.ch',  # Switzerland |  | ||||||
|     'GR': 'google.gr',  # Greece |  | ||||||
|     'AU': 'google.com.au',  # Australia |  | ||||||
|     'CA': 'google.ca',  # Canada |  | ||||||
|     'GB': 'google.co.uk',  # United Kingdom |  | ||||||
|     'ID': 'google.co.id',  # Indonesia |  | ||||||
|     'IE': 'google.ie',  # Ireland |  | ||||||
|     'IN': 'google.co.in',  # India |  | ||||||
|     'MY': 'google.com.my',  # Malaysia |  | ||||||
|     'NZ': 'google.co.nz',  # New Zealand |  | ||||||
|     'PH': 'google.com.ph',  # Philippines |  | ||||||
|     'SG': 'google.com.sg',  # Singapore |  | ||||||
|     'US': 'google.com',  # United States (google.us) redirects to .com |  | ||||||
|     'ZA': 'google.co.za',  # South Africa |  | ||||||
|     'AR': 'google.com.ar',  # Argentina |  | ||||||
|     'CL': 'google.cl',  # Chile |  | ||||||
|     'ES': 'google.es',  # Spain |  | ||||||
|     'MX': 'google.com.mx',  # Mexico |  | ||||||
|     'EE': 'google.ee',  # Estonia |  | ||||||
|     'FI': 'google.fi',  # Finland |  | ||||||
|     'BE': 'google.be',  # Belgium |  | ||||||
|     'FR': 'google.fr',  # France |  | ||||||
|     'IL': 'google.co.il',  # Israel |  | ||||||
|     'HR': 'google.hr',  # Croatia |  | ||||||
|     'HU': 'google.hu',  # Hungary |  | ||||||
|     'IT': 'google.it',  # Italy |  | ||||||
|     'JP': 'google.co.jp',  # Japan |  | ||||||
|     'KR': 'google.co.kr',  # South Korea |  | ||||||
|     'LT': 'google.lt',  # Lithuania |  | ||||||
|     'LV': 'google.lv',  # Latvia |  | ||||||
|     'NO': 'google.no',  # Norway |  | ||||||
|     'NL': 'google.nl',  # Netherlands |  | ||||||
|     'PL': 'google.pl',  # Poland |  | ||||||
|     'BR': 'google.com.br',  # Brazil |  | ||||||
|     'PT': 'google.pt',  # Portugal |  | ||||||
|     'RO': 'google.ro',  # Romania |  | ||||||
|     'RU': 'google.ru',  # Russia |  | ||||||
|     'SK': 'google.sk',  # Slovakia |  | ||||||
|     'SI': 'google.si',  # Slovenia |  | ||||||
|     'SE': 'google.se',  # Sweden |  | ||||||
|     'TH': 'google.co.th',  # Thailand |  | ||||||
|     'TR': 'google.com.tr',  # Turkey |  | ||||||
|     'UA': 'google.com.ua',  # Ukraine |  | ||||||
|     'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN |  | ||||||
|     'HK': 'google.com.hk',  # Hong Kong |  | ||||||
|     'TW': 'google.com.tw',  # Taiwan |  | ||||||
| } |  | ||||||
| 
 | 
 | ||||||
| time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} | time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} | ||||||
| 
 | 
 | ||||||
| @ -112,50 +59,50 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'} | |||||||
| # specific xpath variables | # specific xpath variables | ||||||
| # ------------------------ | # ------------------------ | ||||||
| 
 | 
 | ||||||
| results_xpath = './/div[@data-sokoban-container]' | results_xpath = './/div[contains(@jscontroller, "SC7lYd")]' | ||||||
| title_xpath = './/a/h3[1]' | title_xpath = './/a/h3[1]' | ||||||
| href_xpath = './/a[h3]/@href' | href_xpath = './/a[h3]/@href' | ||||||
| content_xpath = './/div[@data-content-feature=1]' | content_xpath = './/div[@data-sncf]' | ||||||
| 
 |  | ||||||
| # google *sections* are no usual *results*, we ignore them |  | ||||||
| g_section_with_header = './g-section-with-header' |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| # Suggestions are links placed in a *card-section*, we extract only the text | # Suggestions are links placed in a *card-section*, we extract only the text | ||||||
| # from the links not the links itself. | # from the links not the links itself. | ||||||
| suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a' | suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a' | ||||||
| 
 | 
 | ||||||
|  | # UI_ASYNC = 'use_ac:true,_fmt:html' # returns an HTTP 500 when the user | ||||||
|  | #                                    # searches for celebrities like | ||||||
|  | #                                    # '!google natasha allegri' or '!google chris evans' | ||||||
|  | UI_ASYNC = 'use_ac:true,_fmt:prog' | ||||||
|  | """Format of the response from UI's async request.""" | ||||||
| 
 | 
 | ||||||
| def get_lang_info(params, lang_list, custom_aliases, supported_any_language): | 
 | ||||||
|     """Composing various language properties for the google engines. | def get_google_info(params, eng_traits): | ||||||
|  |     """Composing various (language) properties for the google engines (:ref:`google | ||||||
|  |     API`). | ||||||
| 
 | 
 | ||||||
|     This function is called by the various google engines (:ref:`google web |     This function is called by the various google engines (:ref:`google web | ||||||
|     engine`, :ref:`google images engine`, :ref:`google news engine` and |     engine`, :ref:`google images engine`, :ref:`google news engine` and | ||||||
|     :ref:`google videos engine`). |     :ref:`google videos engine`). | ||||||
| 
 | 
 | ||||||
|     :param dict param: request parameters of the engine |     :param dict params: Request parameters of the engine.  At least | ||||||
|  |         a ``searxng_locale`` key should be in the dictionary. | ||||||
| 
 | 
 | ||||||
|     :param list lang_list: list of supported languages of the engine |     :param eng_traits: Engine's traits fetched from google preferences | ||||||
|         :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>` |         (:py:obj:`searx.enginelib.traits.EngineTraits`) | ||||||
| 
 |  | ||||||
|     :param dict lang_list: custom aliases for non standard language codes |  | ||||||
|         (used when calling :py:func:`searx.utils.match_language`) |  | ||||||
| 
 |  | ||||||
|     :param bool supported_any_language: When a language is not specified, the |  | ||||||
|         language interpretation is left up to Google to decide how the search |  | ||||||
|         results should be delivered.  This argument is ``True`` for the google |  | ||||||
|         engine and ``False`` for the other engines (google-images, -news, |  | ||||||
|         -scholar, -videos). |  | ||||||
| 
 | 
 | ||||||
|     :rtype: dict |     :rtype: dict | ||||||
|     :returns: |     :returns: | ||||||
|         Py-Dictionary with the key/value pairs: |         Py-Dictionary with the key/value pairs: | ||||||
| 
 | 
 | ||||||
|         language: |         language: | ||||||
|             Return value from :py:func:`searx.utils.match_language` |             The language code that is used by google (e.g. ``lang_en`` or | ||||||
|  |             ``lang_zh-TW``) | ||||||
| 
 | 
 | ||||||
|         country: |         country: | ||||||
|             The country code (e.g. US, AT, CA, FR, DE ..) |             The country code that is used by google (e.g. ``US`` or ``TW``) | ||||||
|  | 
 | ||||||
|  |         locale: | ||||||
|  |             An instance of :py:obj:`babel.core.Locale` built from the | ||||||
|  |             ``searxng_locale`` value. | ||||||
| 
 | 
 | ||||||
|         subdomain: |         subdomain: | ||||||
|             Google subdomain :py:obj:`google_domains` that fits to the country |             Google subdomain :py:obj:`google_domains` that fits to the country | ||||||
| @ -165,52 +112,67 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): | |||||||
|             Py-Dictionary with additional request arguments (can be passed to |             Py-Dictionary with additional request arguments (can be passed to | ||||||
|             :py:func:`urllib.parse.urlencode`). |             :py:func:`urllib.parse.urlencode`). | ||||||
| 
 | 
 | ||||||
|  |             - ``hl`` parameter: specifies the language of the user interface. | ||||||
|  |             - ``lr`` parameter: restricts search results to documents written in | ||||||
|  |               a particular language. | ||||||
|  |             - ``cr`` parameter: restricts search results to documents | ||||||
|  |               originating in a particular country. | ||||||
|  |             - ``ie`` parameter: sets the character encoding scheme that should | ||||||
|  |               be used to interpret the query string ('utf8'). | ||||||
|  |             - ``oe`` parameter: sets the character encoding scheme that should | ||||||
|  |               be used to decode the XML result ('utf8'). | ||||||
|  | 
 | ||||||
|         headers: |         headers: | ||||||
|             Py-Dictionary with additional HTTP headers (can be passed to |             Py-Dictionary with additional HTTP headers (can be passed to | ||||||
|             request's headers) |             request's headers) | ||||||
|  | 
 | ||||||
|  |             - ``Accept: */*`` | ||||||
|  | 
 | ||||||
|     """ |     """ | ||||||
|  | 
 | ||||||
|     ret_val = { |     ret_val = { | ||||||
|         'language': None, |         'language': None, | ||||||
|         'country': None, |         'country': None, | ||||||
|         'subdomain': None, |         'subdomain': None, | ||||||
|         'params': {}, |         'params': {}, | ||||||
|         'headers': {}, |         'headers': {}, | ||||||
|  |         'cookies': {}, | ||||||
|  |         'locale': None, | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     # language ... |     sxng_locale = params.get('searxng_locale', 'all') | ||||||
|  |     try: | ||||||
|  |         locale = babel.Locale.parse(sxng_locale, sep='-') | ||||||
|  |     except babel.core.UnknownLocaleError: | ||||||
|  |         locale = None | ||||||
| 
 | 
 | ||||||
|     _lang = params['language'] |     eng_lang = eng_traits.get_language(sxng_locale, 'lang_en') | ||||||
|     _any_language = _lang.lower() == 'all' |     lang_code = eng_lang.split('_')[-1]  # lang_zh-TW --> zh-TW / lang_en --> en | ||||||
|     if _any_language: |     country = eng_traits.get_region(sxng_locale, eng_traits.all_locale) | ||||||
|         _lang = 'en-US' |  | ||||||
|     language = match_language(_lang, lang_list, custom_aliases) |  | ||||||
|     ret_val['language'] = language |  | ||||||
| 
 | 
 | ||||||
|     # country ... |     # Test zh_hans & zh_hant --> in the topmost links of the result lists of | ||||||
|  |     # TW and HK you should find a wiktionary.org zh_hant link.  In the result | ||||||
|  |     # list of zh-CN there should be no hant link; instead you should find | ||||||
|  |     # zh.m.wikipedia.org/zh somewhere near the top. | ||||||
| 
 | 
 | ||||||
|     _l = _lang.split('-') |     # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5 | ||||||
|     if len(_l) == 2: |     # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5 | ||||||
|         country = _l[1] | 
 | ||||||
|     else: |     ret_val['language'] = eng_lang | ||||||
|         country = _l[0].upper() |  | ||||||
|         if country == 'EN': |  | ||||||
|             country = 'US' |  | ||||||
|     ret_val['country'] = country |     ret_val['country'] = country | ||||||
| 
 |     ret_val['locale'] = locale | ||||||
|     # subdomain ... |     ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com') | ||||||
| 
 |  | ||||||
|     ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com') |  | ||||||
| 
 |  | ||||||
|     # params & headers |  | ||||||
| 
 |  | ||||||
|     lang_country = '%s-%s' % (language, country)  # (en-US, en-EN, de-DE, de-AU, fr-FR ..) |  | ||||||
| 
 | 
 | ||||||
|     # hl parameter: |     # hl parameter: | ||||||
|     #   https://developers.google.com/custom-search/docs/xml_results#hlsp The |     #   The hl parameter specifies the interface language (host language) of | ||||||
|     # Interface Language: |     #   your user interface. To improve the performance and the quality of your | ||||||
|  |     #   search results, you are strongly encouraged to set this parameter | ||||||
|  |     #   explicitly. | ||||||
|  |     #   https://developers.google.com/custom-search/docs/xml_results#hlsp | ||||||
|  |     # The Interface Language: | ||||||
|     #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages |     #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages | ||||||
| 
 | 
 | ||||||
|     ret_val['params']['hl'] = lang_list.get(lang_country, language) |     ret_val['params']['hl'] = lang_code | ||||||
| 
 | 
 | ||||||
|     # lr parameter: |     # lr parameter: | ||||||
|     #   The lr (language restrict) parameter restricts search results to |     #   The lr (language restrict) parameter restricts search results to | ||||||
| @ -218,22 +180,72 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): | |||||||
|     #   https://developers.google.com/custom-search/docs/xml_results#lrsp |     #   https://developers.google.com/custom-search/docs/xml_results#lrsp | ||||||
|     #   Language Collection Values: |     #   Language Collection Values: | ||||||
|     #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections |     #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections | ||||||
| 
 |  | ||||||
|     if _any_language and supported_any_language: |  | ||||||
| 
 |  | ||||||
|         # interpretation is left up to Google (based on whoogle) |  | ||||||
|     # |     # | ||||||
|         # - add parameter ``source=lnt`` |     # To select 'all' languages an empty 'lr' value is used. | ||||||
|         # - don't use parameter ``lr`` |     # | ||||||
|         # - don't add a ``Accept-Language`` HTTP header. |     # Unlike other google services, Google Scholar supports selecting more | ||||||
|  |     # than one language. The languages are separated by a pipe '|' (logical OR). | ||||||
|  |     # For example: &lr=lang_zh-TW%7Clang_de selects articles written in | ||||||
|  |     # traditional Chinese OR German. | ||||||
| 
 | 
 | ||||||
|         ret_val['params']['source'] = 'lnt' |     ret_val['params']['lr'] = eng_lang | ||||||
|  |     if sxng_locale == 'all': | ||||||
|  |         ret_val['params']['lr'] = '' | ||||||
| 
 | 
 | ||||||
|     else: |     # cr parameter: | ||||||
|  |     #   The cr parameter restricts search results to documents originating in a | ||||||
|  |     #   particular country. | ||||||
|  |     #   https://developers.google.com/custom-search/docs/xml_results#crsp | ||||||
| 
 | 
 | ||||||
|         # restricts search results to documents written in a particular |     ret_val['params']['cr'] = 'country' + country | ||||||
|         # language. |     if sxng_locale == 'all': | ||||||
|         ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language) |         ret_val['params']['cr'] = '' | ||||||
|  | 
 | ||||||
|  |     # gl parameter: (mandatory for Google News) | ||||||
|  |     #   The gl parameter value is a two-letter country code. For WebSearch | ||||||
|  |     #   results, the gl parameter boosts search results whose country of origin | ||||||
|  |     #   matches the parameter value. See the Country Codes section for a list of | ||||||
|  |     #   valid values. | ||||||
|  |     #   Specifying a gl parameter value in WebSearch requests should improve the | ||||||
|  |     #   relevance of results. This is particularly true for international | ||||||
|  |     #   customers and, even more specifically, for customers in English-speaking | ||||||
|  |     #   countries other than the United States. | ||||||
|  |     #   https://developers.google.com/custom-search/docs/xml_results#glsp | ||||||
|  | 
 | ||||||
|  |     ret_val['params']['gl'] = country | ||||||
|  | 
 | ||||||
|  |     # ie parameter: | ||||||
|  |     #   The ie parameter sets the character encoding scheme that should be used | ||||||
|  |     #   to interpret the query string. The default ie value is latin1. | ||||||
|  |     #   https://developers.google.com/custom-search/docs/xml_results#iesp | ||||||
|  | 
 | ||||||
|  |     ret_val['params']['ie'] = 'utf8' | ||||||
|  | 
 | ||||||
|  |     # oe parameter: | ||||||
|  |     #   The oe parameter sets the character encoding scheme that should be used | ||||||
|  |     #   to decode the XML result. The default oe value is latin1. | ||||||
|  |     #   https://developers.google.com/custom-search/docs/xml_results#oesp | ||||||
|  | 
 | ||||||
|  |     ret_val['params']['oe'] = 'utf8' | ||||||
|  | 
 | ||||||
|  |     # num parameter: | ||||||
|  |     #   The num parameter identifies the number of search results to return. | ||||||
|  |     #   The default num value is 10, and the maximum value is 20. If you request | ||||||
|  |     #   more than 20 results, only 20 results will be returned. | ||||||
|  |     #   https://developers.google.com/custom-search/docs/xml_results#numsp | ||||||
|  | 
 | ||||||
|  |     # HINT: seems to have no effect (tested in google WEB & Images) | ||||||
|  |     # ret_val['params']['num'] = 20 | ||||||
|  | 
 | ||||||
|  |     # HTTP headers | ||||||
|  | 
 | ||||||
|  |     ret_val['headers']['Accept'] = '*/*' | ||||||
|  | 
 | ||||||
|  |     # Cookies | ||||||
|  | 
 | ||||||
|  |     # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746 | ||||||
|  |     # - https://github.com/searxng/searxng/issues/1555 | ||||||
|  |     ret_val['cookies']['CONSENT'] = "YES+" | ||||||
| 
 | 
 | ||||||
|     return ret_val |     return ret_val | ||||||
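For illustration, a plausible return value of ``get_google_info()`` for the SearXNG locale ``de-AT`` is sketched below; the concrete values are assumptions (they depend on the traits fetched from Google, e.g. that ``de-AT`` maps to ``lang_de`` / ``AT`` and that ``google.at`` is among the supported domains).

.. code:: python

   import babel

   google_info = {
       'language': 'lang_de',
       'country': 'AT',
       'locale': babel.Locale.parse('de-AT', sep='-'),
       'subdomain': 'www.google.at',
       'params': {'hl': 'de', 'lr': 'lang_de', 'cr': 'countryAT',
                  'gl': 'AT', 'ie': 'utf8', 'oe': 'utf8'},
       'headers': {'Accept': '*/*'},
       'cookies': {'CONSENT': 'YES+'},
   }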
| 
 | 
 | ||||||
| @ -245,33 +257,34 @@ def detect_google_sorry(resp): | |||||||
| 
 | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     """Google search request""" |     """Google search request""" | ||||||
| 
 |     # pylint: disable=line-too-long | ||||||
|     offset = (params['pageno'] - 1) * 10 |     offset = (params['pageno'] - 1) * 10 | ||||||
| 
 |     google_info = get_google_info(params, traits) | ||||||
|     lang_info = get_lang_info(params, supported_languages, language_aliases, True) |  | ||||||
| 
 |  | ||||||
|     additional_parameters = {} |  | ||||||
|     if use_mobile_ui: |  | ||||||
|         additional_parameters = { |  | ||||||
|             'asearch': 'arc', |  | ||||||
|             'async': 'use_ac:true,_fmt:prog', |  | ||||||
|         } |  | ||||||
| 
 | 
 | ||||||
|     # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium |     # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium | ||||||
|     query_url = ( |     query_url = ( | ||||||
|         'https://' |         'https://' | ||||||
|         + lang_info['subdomain'] |         + google_info['subdomain'] | ||||||
|         + '/search' |         + '/search' | ||||||
|         + "?" |         + "?" | ||||||
|         + urlencode( |         + urlencode( | ||||||
|             { |             { | ||||||
|                 'q': query, |                 'q': query, | ||||||
|                 **lang_info['params'], |                 **google_info['params'], | ||||||
|                 'ie': "utf8", |  | ||||||
|                 'oe': "utf8", |  | ||||||
|                 'start': offset, |  | ||||||
|                 'filter': '0', |                 'filter': '0', | ||||||
|                 **additional_parameters, |                 'start': offset, | ||||||
|  |                 # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i', | ||||||
|  |                 # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG', | ||||||
|  |                 # 'cs' : 1, | ||||||
|  |                 # 'sa': 'N', | ||||||
|  |                 # 'yv': 3, | ||||||
|  |                 # 'prmd': 'vin', | ||||||
|  |                 # 'ei': 'GASaY6TxOcy_xc8PtYeY6AE', | ||||||
|  |                 # 'sa': 'N', | ||||||
|  |                 # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg' | ||||||
|  |                 # formerly known as use_mobile_ui | ||||||
|  |                 'asearch': 'arc', | ||||||
|  |                 'async': UI_ASYNC, | ||||||
|             } |             } | ||||||
|         ) |         ) | ||||||
|     ) |     ) | ||||||
| @ -282,25 +295,38 @@ def request(query, params): | |||||||
|         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) |         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) | ||||||
|     params['url'] = query_url |     params['url'] = query_url | ||||||
| 
 | 
 | ||||||
|     params['cookies']['CONSENT'] = "YES+" |     params['cookies'] = google_info['cookies'] | ||||||
|     params['headers'].update(lang_info['headers']) |     params['headers'].update(google_info['headers']) | ||||||
|     if use_mobile_ui: |  | ||||||
|         params['headers']['Accept'] = '*/*' |  | ||||||
|     else: |  | ||||||
|         params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' |  | ||||||
| 
 |  | ||||||
|     return params |     return params | ||||||
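Putting the pieces together, the URL assembled by ``request()`` for page one of a hypothetical query might look like the sketch below (it reuses the assumed ``de-AT`` values from above).

.. code:: python

   from urllib.parse import urlencode

   query_url = 'https://www.google.at/search?' + urlencode({
       'q': 'corona',
       'hl': 'de', 'lr': 'lang_de', 'cr': 'countryAT', 'gl': 'AT',
       'ie': 'utf8', 'oe': 'utf8',
       'filter': '0',
       'start': 0,                        # (pageno - 1) * 10
       'asearch': 'arc',
       'async': 'use_ac:true,_fmt:prog',  # UI_ASYNC
   })
   print(query_url)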
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | # =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA | ||||||
|  | # ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26; | ||||||
|  | RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def _parse_data_images(dom): | ||||||
|  |     data_image_map = {} | ||||||
|  |     for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()): | ||||||
|  |         end_pos = data_image.rfind('=') | ||||||
|  |         if end_pos > 0: | ||||||
|  |             data_image = data_image[: end_pos + 1] | ||||||
|  |         data_image_map[img_id] = data_image | ||||||
|  |     logger.debug('data:image objects --> %s', list(data_image_map.keys())) | ||||||
|  |     return data_image_map | ||||||
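The regular expression and the trimming step in ``_parse_data_images()`` can be exercised on a synthetic payload modeled after the comment above (the blob below is invented for illustration):

.. code:: python

   import re

   RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')

   blob = '[3,"dimg_abc_137"]a87;data:image/jpeg;base64,/9j/AAAA==26;'
   img_id, data_image = RE_DATA_IMAGE.search(blob).groups()
   # cut off the trailing length suffix ('26') after the base64 padding
   data_image = data_image[: data_image.rfind('=') + 1]
   print(img_id, '-->', data_image)
   # dimg_abc_137 --> data:image/jpeg;base64,/9j/AAAA==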
|  | 
 | ||||||
|  | 
 | ||||||
| def response(resp): | def response(resp): | ||||||
|     """Get response from google's search request""" |     """Get response from google's search request""" | ||||||
| 
 |     # pylint: disable=too-many-branches, too-many-statements | ||||||
|     detect_google_sorry(resp) |     detect_google_sorry(resp) | ||||||
| 
 | 
 | ||||||
|     results = [] |     results = [] | ||||||
| 
 | 
 | ||||||
|     # convert the text to dom |     # convert the text to dom | ||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
|  |     data_image_map = _parse_data_images(dom) | ||||||
|  | 
 | ||||||
|     # results --> answer |     # results --> answer | ||||||
|     answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') |     answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') | ||||||
|     if answer_list: |     if answer_list: | ||||||
| @ -309,25 +335,9 @@ def response(resp): | |||||||
|     else: |     else: | ||||||
|         logger.debug("did not find 'answer'") |         logger.debug("did not find 'answer'") | ||||||
| 
 | 
 | ||||||
|         # results --> number_of_results |  | ||||||
|         if not use_mobile_ui: |  | ||||||
|             try: |  | ||||||
|                 _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0) |  | ||||||
|                 _digit = ''.join([n for n in _txt if n.isdigit()]) |  | ||||||
|                 number_of_results = int(_digit) |  | ||||||
|                 results.append({'number_of_results': number_of_results}) |  | ||||||
|             except Exception as e:  # pylint: disable=broad-except |  | ||||||
|                 logger.debug("did not 'number_of_results'") |  | ||||||
|                 logger.error(e, exc_info=True) |  | ||||||
| 
 |  | ||||||
|     # parse results |     # parse results | ||||||
| 
 | 
 | ||||||
|     for result in eval_xpath_list(dom, results_xpath): |     for result in eval_xpath_list(dom, results_xpath):  # pylint: disable=too-many-nested-blocks | ||||||
| 
 |  | ||||||
|         # google *sections* |  | ||||||
|         if extract_text(eval_xpath(result, g_section_with_header)): |  | ||||||
|             logger.debug("ignoring <g-section-with-header>") |  | ||||||
|             continue |  | ||||||
| 
 | 
 | ||||||
|         try: |         try: | ||||||
|             title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) |             title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) | ||||||
| @ -336,16 +346,30 @@ def response(resp): | |||||||
|                 logger.debug('ignoring item from the result_xpath list: missing title') |                 logger.debug('ignoring item from the result_xpath list: missing title') | ||||||
|                 continue |                 continue | ||||||
|             title = extract_text(title_tag) |             title = extract_text(title_tag) | ||||||
|  | 
 | ||||||
|             url = eval_xpath_getindex(result, href_xpath, 0, None) |             url = eval_xpath_getindex(result, href_xpath, 0, None) | ||||||
|             if url is None: |             if url is None: | ||||||
|  |                 logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title) | ||||||
|                 continue |                 continue | ||||||
|             content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True) | 
 | ||||||
|             if content is None: |             content_nodes = eval_xpath(result, content_xpath) | ||||||
|  |             content = extract_text(content_nodes) | ||||||
|  | 
 | ||||||
|  |             if not content: | ||||||
|                 logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title) |                 logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title) | ||||||
|                 continue |                 continue | ||||||
| 
 | 
 | ||||||
|             logger.debug('add link to results: %s', title) |             img_src = content_nodes[0].xpath('.//img/@src') | ||||||
|             results.append({'url': url, 'title': title, 'content': content}) |             if img_src: | ||||||
|  |                 img_src = img_src[0] | ||||||
|  |                 if img_src.startswith('data:image'): | ||||||
|  |                     img_id = content_nodes[0].xpath('.//img/@id') | ||||||
|  |                     if img_id: | ||||||
|  |                         img_src = data_image_map.get(img_id[0]) | ||||||
|  |             else: | ||||||
|  |                 img_src = None | ||||||
|  | 
 | ||||||
|  |             results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src}) | ||||||
| 
 | 
 | ||||||
|         except Exception as e:  # pylint: disable=broad-except |         except Exception as e:  # pylint: disable=broad-except | ||||||
|             logger.error(e, exc_info=True) |             logger.error(e, exc_info=True) | ||||||
| @ -361,15 +385,107 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # get supported languages from their site | # get supported languages from their site | ||||||
| def _fetch_supported_languages(resp): | 
 | ||||||
|     ret_val = {} | 
 | ||||||
|  | skip_countries = [ | ||||||
|  |     # countries whose official language is not in Google's list of languages | ||||||
|  |     'AL',  # Albania (sq) | ||||||
|  |     'AZ',  # Azerbaijan (az) | ||||||
|  |     'BD',  # Bangladesh (bn) | ||||||
|  |     'BN',  # Brunei Darussalam (ms) | ||||||
|  |     'BT',  # Bhutan (dz) | ||||||
|  |     'ET',  # Ethiopia (am) | ||||||
|  |     'GE',  # Georgia (ka, os) | ||||||
|  |     'GL',  # Greenland (kl) | ||||||
|  |     'KH',  # Cambodia (km) | ||||||
|  |     'LA',  # Laos (lo) | ||||||
|  |     'LK',  # Sri Lanka (si, ta) | ||||||
|  |     'ME',  # Montenegro (sr) | ||||||
|  |     'MK',  # North Macedonia (mk, sq) | ||||||
|  |     'MM',  # Myanmar (my) | ||||||
|  |     'MN',  # Mongolia (mn) | ||||||
|  |     'MV',  # Maldives (dv) // dv_MV is unknown by babel | ||||||
|  |     'MY',  # Malaysia (ms) | ||||||
|  |     'NP',  # Nepal (ne) | ||||||
|  |     'TJ',  # Tajikistan (tg) | ||||||
|  |     'TM',  # Turkmenistan (tk) | ||||||
|  |     'UZ',  # Uzbekistan (uz) | ||||||
|  | ] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True): | ||||||
|  |     """Fetch languages from Google.""" | ||||||
|  |     # pylint: disable=import-outside-toplevel, too-many-branches | ||||||
|  | 
 | ||||||
|  |     engine_traits.custom['supported_domains'] = {} | ||||||
|  | 
 | ||||||
|  |     resp = network.get('https://www.google.com/preferences') | ||||||
|  |     if not resp.ok: | ||||||
|  |         raise RuntimeError("Response from Google's preferences is not OK.") | ||||||
|  | 
 | ||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
| 
 | 
 | ||||||
|     radio_buttons = eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]') |     # supported language codes | ||||||
| 
 | 
 | ||||||
|     for x in radio_buttons: |     lang_map = {'no': 'nb'} | ||||||
|         name = x.get("data-name") |     for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'): | ||||||
|         code = x.get("value").split('_')[-1] |  | ||||||
|         ret_val[code] = {"name": name} |  | ||||||
| 
 | 
 | ||||||
|     return ret_val |         eng_lang = x.get("value").split('_')[-1] | ||||||
|  |         try: | ||||||
|  |             locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-') | ||||||
|  |         except babel.UnknownLocaleError: | ||||||
|  |             print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang)) | ||||||
|  |             continue | ||||||
|  |         sxng_lang = language_tag(locale) | ||||||
|  | 
 | ||||||
|  |         conflict = engine_traits.languages.get(sxng_lang) | ||||||
|  |         if conflict: | ||||||
|  |             if conflict != eng_lang: | ||||||
|  |                 print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang)) | ||||||
|  |             continue | ||||||
|  |         engine_traits.languages[sxng_lang] = 'lang_' + eng_lang | ||||||
|  | 
 | ||||||
|  |     # alias languages | ||||||
|  |     engine_traits.languages['zh'] = 'lang_zh-CN' | ||||||
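A sketch of the normalization step in the loop above: babel resolves likely subtags, so Google's ``zh-TW`` value ends up under the SearXNG language tag ``zh-Hant`` (``language_tag()`` is assumed to render *language* plus *script*, if any).

.. code:: python

   import babel

   locale = babel.Locale.parse('zh-TW', sep='-')
   sxng_lang = locale.language
   if locale.script:
       sxng_lang += '-' + locale.script  # babel: TW --> script 'Hant'
   print(sxng_lang)  # zh-Hant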
|  | 
 | ||||||
|  |     # supported region codes | ||||||
|  | 
 | ||||||
|  |     for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'): | ||||||
|  |         eng_country = x.get("value") | ||||||
|  | 
 | ||||||
|  |         if eng_country in skip_countries: | ||||||
|  |             continue | ||||||
|  |         if eng_country == 'ZZ': | ||||||
|  |             engine_traits.all_locale = 'ZZ' | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True) | ||||||
|  | 
 | ||||||
|  |         if not sxng_locales: | ||||||
|  |             print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country)) | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         for sxng_locale in sxng_locales: | ||||||
|  |             engine_traits.regions[region_tag(sxng_locale)] = eng_country | ||||||
|  | 
 | ||||||
|  |     # alias regions | ||||||
|  |     engine_traits.regions['zh-CN'] = 'HK' | ||||||
|  | 
 | ||||||
|  |     # supported domains | ||||||
|  | 
 | ||||||
|  |     if add_domains: | ||||||
|  |         resp = network.get('https://www.google.com/supported_domains') | ||||||
|  |         if not resp.ok: | ||||||
|  |             raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.") | ||||||
|  | 
 | ||||||
|  |         for domain in resp.text.split(): | ||||||
|  |             domain = domain.strip() | ||||||
|  |             if not domain or domain in [ | ||||||
|  |                 '.google.com', | ||||||
|  |             ]: | ||||||
|  |                 continue | ||||||
|  |             region = domain.split('.')[-1].upper() | ||||||
|  |             engine_traits.custom['supported_domains'][region] = 'www' + domain | ||||||
|  |             if region == 'HK': | ||||||
|  |                 # There is no google.cn, we use .com.hk for zh-CN | ||||||
|  |                 engine_traits.custom['supported_domains']['CN'] = 'www' + domain | ||||||
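The domain mapping above derives the region key from the last label of each entry in https://www.google.com/supported_domains; a sketch with assumed sample lines:

.. code:: python

   for domain in ['.google.de', '.google.com.hk']:
       region = domain.split('.')[-1].upper()
       print(region, '-->', 'www' + domain)
   # DE --> www.google.de
   # HK --> www.google.com.hk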
|  | |||||||
| @ -1,31 +1,38 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """This is the implementation of the google images engine using the google | """This is the implementation of the Google Images engine using the internal | ||||||
| internal API used the Google Go Android app. | Google API used by the Google Go Android app. | ||||||
| 
 | 
 | ||||||
| This internal API offers results in | This internal API offers results in | ||||||
| 
 | 
 | ||||||
| - JSON (_fmt:json) | - JSON (``_fmt:json``) | ||||||
| - Protobuf (_fmt:pb) | - Protobuf_ (``_fmt:pb``) | ||||||
| - Protobuf compressed? (_fmt:pc) | - Protobuf_ compressed? (``_fmt:pc``) | ||||||
| - HTML (_fmt:html) | - HTML (``_fmt:html``) | ||||||
| - Protobuf encoded in JSON (_fmt:jspb). | - Protobuf_ encoded in JSON (``_fmt:jspb``). | ||||||
| 
 | 
 | ||||||
|  | .. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
|  | from typing import TYPE_CHECKING | ||||||
|  | 
 | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from json import loads | from json import loads | ||||||
| 
 | 
 | ||||||
|  | from searx.engines.google import fetch_traits  # pylint: disable=unused-import | ||||||
| from searx.engines.google import ( | from searx.engines.google import ( | ||||||
|     get_lang_info, |     get_google_info, | ||||||
|     time_range_dict, |     time_range_dict, | ||||||
|     detect_google_sorry, |     detect_google_sorry, | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| # pylint: disable=unused-import | if TYPE_CHECKING: | ||||||
| from searx.engines.google import supported_languages_url, _fetch_supported_languages |     import logging | ||||||
|  |     from searx.enginelib.traits import EngineTraits | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  |     traits: EngineTraits | ||||||
| 
 | 
 | ||||||
| # pylint: enable=unused-import |  | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -40,7 +47,6 @@ about = { | |||||||
| # engine dependent config | # engine dependent config | ||||||
| categories = ['images', 'web'] | categories = ['images', 'web'] | ||||||
| paging = True | paging = True | ||||||
| use_locale_domain = True |  | ||||||
| time_range_support = True | time_range_support = True | ||||||
| safesearch = True | safesearch = True | ||||||
| send_accept_language_header = True | send_accept_language_header = True | ||||||
| @ -51,20 +57,18 @@ filter_mapping = {0: 'images', 1: 'active', 2: 'active'} | |||||||
| def request(query, params): | def request(query, params): | ||||||
|     """Google-Image search request""" |     """Google-Image search request""" | ||||||
| 
 | 
 | ||||||
|     lang_info = get_lang_info(params, supported_languages, language_aliases, False) |     google_info = get_google_info(params, traits) | ||||||
| 
 | 
 | ||||||
|     query_url = ( |     query_url = ( | ||||||
|         'https://' |         'https://' | ||||||
|         + lang_info['subdomain'] |         + google_info['subdomain'] | ||||||
|         + '/search' |         + '/search' | ||||||
|         + "?" |         + "?" | ||||||
|         + urlencode( |         + urlencode( | ||||||
|             { |             { | ||||||
|                 'q': query, |                 'q': query, | ||||||
|                 'tbm': "isch", |                 'tbm': "isch", | ||||||
|                 **lang_info['params'], |                 **google_info['params'], | ||||||
|                 'ie': "utf8", |  | ||||||
|                 'oe': "utf8", |  | ||||||
|                 'asearch': 'isch', |                 'asearch': 'isch', | ||||||
|                 'async': '_fmt:json,p:1,ijn:' + str(params['pageno']), |                 'async': '_fmt:json,p:1,ijn:' + str(params['pageno']), | ||||||
|             } |             } | ||||||
| @ -77,9 +81,8 @@ def request(query, params): | |||||||
|         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) |         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) | ||||||
|     params['url'] = query_url |     params['url'] = query_url | ||||||
| 
 | 
 | ||||||
|     params['headers'].update(lang_info['headers']) |     params['cookies'] = google_info['cookies'] | ||||||
|     params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip' |     params['headers'].update(google_info['headers']) | ||||||
|     params['headers']['Accept'] = '*/*' |  | ||||||
|     return params |     return params | ||||||
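The response format of the internal API is selected via the ``_fmt`` field of the ``async`` URL parameter; a minimal sketch of the query-string fragment for page 2 of a JSON request (parameter names as used in ``request()`` above):

.. code:: python

   from urllib.parse import urlencode

   pageno = 2
   print(urlencode({
       'tbm': 'isch',
       'asearch': 'isch',
       'async': '_fmt:json,p:1,ijn:' + str(pageno),
   }))
   # tbm=isch&asearch=isch&async=_fmt%3Ajson%2Cp%3A1%2Cijn%3A2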
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -111,7 +114,11 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|         copyright_notice = item["result"].get('iptc', {}).get('copyright_notice') |         copyright_notice = item["result"].get('iptc', {}).get('copyright_notice') | ||||||
|         if copyright_notice: |         if copyright_notice: | ||||||
|             result_item['source'] += ' / ' + copyright_notice |             result_item['source'] += ' | ' + copyright_notice | ||||||
|  | 
 | ||||||
|  |         freshness_date = item["result"].get("freshness_date") | ||||||
|  |         if freshness_date: | ||||||
|  |             result_item['source'] += ' | ' + freshness_date | ||||||
| 
 | 
 | ||||||
|         file_size = item.get('gsa', {}).get('file_size') |         file_size = item.get('gsa', {}).get('file_size') | ||||||
|         if file_size: |         if file_size: | ||||||
|  | |||||||
| @ -1,24 +1,40 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """This is the implementation of the google news engine.  The google news API | """This is the implementation of the Google News engine. | ||||||
| ignores some parameters from the common :ref:`google API`: |  | ||||||
| 
 | 
 | ||||||
| - num_ : the number of search results is ignored | Google News has a different region handling compared to Google WEB. | ||||||
|  | 
 | ||||||
|  | - the ``ceid`` argument has to be set (:py:obj:`ceid_list`) | ||||||
|  | - the hl_ argument has to be set correctly (and differently from Google WEB) | ||||||
|  | - the gl_ argument is mandatory | ||||||
|  | 
 | ||||||
|  | If one of these arguments is not set correctly, the request is redirected to | ||||||
|  | the CONSENT dialog:: | ||||||
|  | 
 | ||||||
|  |   https://consent.google.com/m?continue= | ||||||
|  | 
 | ||||||
|  | The google news API ignores some parameters from the common :ref:`google API`: | ||||||
|  | 
 | ||||||
|  | - num_ : the number of search results is ignored / there is no paging; all | ||||||
|  |   results for a query term are in the first response. | ||||||
| - save_ : is ignored / Google-News results are always *SafeSearch* | - save_ : is ignored / Google-News results are always *SafeSearch* | ||||||
| 
 | 
 | ||||||
|  | .. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp | ||||||
|  | .. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp | ||||||
| .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp | .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp | ||||||
| .. _save: https://developers.google.com/custom-search/docs/xml_results#safesp | .. _save: https://developers.google.com/custom-search/docs/xml_results#safesp | ||||||
| 
 |  | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| # pylint: disable=invalid-name | from typing import TYPE_CHECKING | ||||||
| 
 | 
 | ||||||
| import binascii | import binascii | ||||||
| import re | import re | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from base64 import b64decode | from base64 import b64decode | ||||||
| from lxml import html | from lxml import html | ||||||
|  | import babel | ||||||
| 
 | 
 | ||||||
|  | from searx import locales | ||||||
| from searx.utils import ( | from searx.utils import ( | ||||||
|     eval_xpath, |     eval_xpath, | ||||||
|     eval_xpath_list, |     eval_xpath_list, | ||||||
| @ -26,18 +42,19 @@ from searx.utils import ( | |||||||
|     extract_text, |     extract_text, | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| # pylint: disable=unused-import | from searx.engines.google import fetch_traits as _fetch_traits  # pylint: disable=unused-import | ||||||
| from searx.engines.google import ( | from searx.engines.google import ( | ||||||
|     supported_languages_url, |     get_google_info, | ||||||
|     _fetch_supported_languages, |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| # pylint: enable=unused-import |  | ||||||
| 
 |  | ||||||
| from searx.engines.google import ( |  | ||||||
|     get_lang_info, |  | ||||||
|     detect_google_sorry, |     detect_google_sorry, | ||||||
| ) | ) | ||||||
|  | from searx.enginelib.traits import EngineTraits | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -49,70 +66,77 @@ about = { | |||||||
|     "results": 'HTML', |     "results": 'HTML', | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| # compared to other google engines google-news has a different time range |  | ||||||
| # support.  The time range is included in the search term. |  | ||||||
| time_range_dict = { |  | ||||||
|     'day': 'when:1d', |  | ||||||
|     'week': 'when:7d', |  | ||||||
|     'month': 'when:1m', |  | ||||||
|     'year': 'when:1y', |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| # engine dependent config | # engine dependent config | ||||||
| 
 |  | ||||||
| categories = ['news'] | categories = ['news'] | ||||||
| paging = False | paging = False | ||||||
| use_locale_domain = True | time_range_support = False | ||||||
| time_range_support = True |  | ||||||
| 
 | 
 | ||||||
| # Google-News results are always *SafeSearch*. Option 'safesearch' is set to | # Google-News results are always *SafeSearch*. Option 'safesearch' is set to | ||||||
| # False here, otherwise checker will report safesearch-errors:: | # False here, otherwise checker will report safesearch-errors:: | ||||||
| # | # | ||||||
| #  safesearch : results are identical for safesearch=0 and safesearch=2 | #  safesearch : results are identical for safesearch=0 and safesearch=2 | ||||||
| safesearch = False | safesearch = True | ||||||
| send_accept_language_header = True | # send_accept_language_header = True | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     """Google-News search request""" |     """Google-News search request""" | ||||||
| 
 | 
 | ||||||
|     lang_info = get_lang_info(params, supported_languages, language_aliases, False) |     sxng_locale = params.get('searxng_locale', 'en-US') | ||||||
|  |     ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en') | ||||||
|  |     google_info = get_google_info(params, traits) | ||||||
|  |     google_info['subdomain'] = 'news.google.com'  # google news has only one domain | ||||||
| 
 | 
 | ||||||
|     # google news has only one domain |     ceid_region, ceid_lang = ceid.split(':') | ||||||
|     lang_info['subdomain'] = 'news.google.com' |     ceid_lang, ceid_suffix = ( | ||||||
|  |         ceid_lang.split('-') | ||||||
|  |         + [ | ||||||
|  |             None, | ||||||
|  |         ] | ||||||
|  |     )[:2] | ||||||
| 
 | 
 | ||||||
|     ceid = "%s:%s" % (lang_info['country'], lang_info['language']) |     google_info['params']['hl'] = ceid_lang | ||||||
| 
 | 
 | ||||||
|     # google news redirects en to en-US |     if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']: | ||||||
|     if lang_info['params']['hl'] == 'en': |  | ||||||
|         lang_info['params']['hl'] = 'en-US' |  | ||||||
| 
 | 
 | ||||||
|     # Very special to google-news compared to other google engines, the time |         if ceid_region.lower() == ceid_lang: | ||||||
|     # range is included in the search term. |             google_info['params']['hl'] = ceid_lang + '-' + ceid_region | ||||||
|     if params['time_range']: |         else: | ||||||
|         query += ' ' + time_range_dict[params['time_range']] |             google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix | ||||||
|  | 
 | ||||||
|  |     elif ceid_region.lower() != ceid_lang: | ||||||
|  | 
 | ||||||
|  |         if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']: | ||||||
|  |             google_info['params']['hl'] = ceid_lang | ||||||
|  |         else: | ||||||
|  |             google_info['params']['hl'] = ceid_lang + '-' + ceid_region | ||||||
|  | 
 | ||||||
|  |     google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0] | ||||||
|  |     google_info['params']['gl'] = ceid_region | ||||||
| 
 | 
 | ||||||
|     query_url = ( |     query_url = ( | ||||||
|         'https://' |         'https://' | ||||||
|         + lang_info['subdomain'] |         + google_info['subdomain'] | ||||||
|         + '/search' |         + "/search?" | ||||||
|         + "?" |         + urlencode( | ||||||
|         + urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'gl': lang_info['country']}) |             { | ||||||
|  |                 'q': query, | ||||||
|  |                 **google_info['params'], | ||||||
|  |             } | ||||||
|  |         ) | ||||||
|  |         # ceid includes a ':' character which must not be urlencoded | ||||||
|         + ('&ceid=%s' % ceid) |         + ('&ceid=%s' % ceid) | ||||||
|     )  # ceid includes a ':' character which must not be urlencoded |     ) | ||||||
|  | 
 | ||||||
|     params['url'] = query_url |     params['url'] = query_url | ||||||
| 
 |     params['cookies'] = google_info['cookies'] | ||||||
|     params['cookies']['CONSENT'] = "YES+" |     params['headers'].update(google_info['headers']) | ||||||
|     params['headers'].update(lang_info['headers']) |  | ||||||
|     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' |  | ||||||
| 
 |  | ||||||
|     return params |     return params | ||||||
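A self-contained sketch of the ``ceid`` unpacking used above to derive the ``hl`` argument (the helper below merely reproduces the split; the values come from :py:obj:`ceid_list`):

.. code:: python

   def split_ceid(ceid):
       ceid_region, ceid_lang = ceid.split(':')
       ceid_lang, ceid_suffix = (ceid_lang.split('-') + [None])[:2]
       return ceid_region, ceid_lang, ceid_suffix

   print(split_ceid('CN:zh-Hans'))  # ('CN', 'zh', 'Hans')
   print(split_ceid('AT:de'))       # ('AT', 'de', None)
   print(split_ceid('BR:pt-419'))   # ('BR', 'pt', '419')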
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def response(resp): | def response(resp): | ||||||
|     """Get response from google's search request""" |     """Get response from google's search request""" | ||||||
|     results = [] |     results = [] | ||||||
| 
 |  | ||||||
|     detect_google_sorry(resp) |     detect_google_sorry(resp) | ||||||
| 
 | 
 | ||||||
|     # convert the text to dom |     # convert the text to dom | ||||||
| @ -152,8 +176,8 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|         # The pub_date is mostly a string like 'yesterday', not a real |         # The pub_date is mostly a string like 'yesterday', not a real | ||||||
|         # timezone date or time.  Therefore we can't use publishedDate. |         # timezone date or time.  Therefore we can't use publishedDate. | ||||||
|         pub_date = extract_text(eval_xpath(result, './article/div[1]/div[1]/time')) |         pub_date = extract_text(eval_xpath(result, './article//time')) | ||||||
|         pub_origin = extract_text(eval_xpath(result, './article/div[1]/div[1]/a')) |         pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]')) | ||||||
| 
 | 
 | ||||||
|         content = ' / '.join([x for x in [pub_origin, pub_date] if x]) |         content = ' / '.join([x for x in [pub_origin, pub_date] if x]) | ||||||
| 
 | 
 | ||||||
| @ -174,3 +198,127 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|     # return results |     # return results | ||||||
|     return results |     return results | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ceid_list = [ | ||||||
|  |     'AE:ar', | ||||||
|  |     'AR:es-419', | ||||||
|  |     'AT:de', | ||||||
|  |     'AU:en', | ||||||
|  |     'BD:bn', | ||||||
|  |     'BE:fr', | ||||||
|  |     'BE:nl', | ||||||
|  |     'BG:bg', | ||||||
|  |     'BR:pt-419', | ||||||
|  |     'BW:en', | ||||||
|  |     'CA:en', | ||||||
|  |     'CA:fr', | ||||||
|  |     'CH:de', | ||||||
|  |     'CH:fr', | ||||||
|  |     'CL:es-419', | ||||||
|  |     'CN:zh-Hans', | ||||||
|  |     'CO:es-419', | ||||||
|  |     'CU:es-419', | ||||||
|  |     'CZ:cs', | ||||||
|  |     'DE:de', | ||||||
|  |     'EG:ar', | ||||||
|  |     'ES:es', | ||||||
|  |     'ET:en', | ||||||
|  |     'FR:fr', | ||||||
|  |     'GB:en', | ||||||
|  |     'GH:en', | ||||||
|  |     'GR:el', | ||||||
|  |     'HK:zh-Hant', | ||||||
|  |     'HU:hu', | ||||||
|  |     'ID:en', | ||||||
|  |     'ID:id', | ||||||
|  |     'IE:en', | ||||||
|  |     'IL:en', | ||||||
|  |     'IL:he', | ||||||
|  |     'IN:bn', | ||||||
|  |     'IN:en', | ||||||
|  |     'IN:hi', | ||||||
|  |     'IN:ml', | ||||||
|  |     'IN:mr', | ||||||
|  |     'IN:ta', | ||||||
|  |     'IN:te', | ||||||
|  |     'IT:it', | ||||||
|  |     'JP:ja', | ||||||
|  |     'KE:en', | ||||||
|  |     'KR:ko', | ||||||
|  |     'LB:ar', | ||||||
|  |     'LT:lt', | ||||||
|  |     'LV:en', | ||||||
|  |     'LV:lv', | ||||||
|  |     'MA:fr', | ||||||
|  |     'MX:es-419', | ||||||
|  |     'MY:en', | ||||||
|  |     'NA:en', | ||||||
|  |     'NG:en', | ||||||
|  |     'NL:nl', | ||||||
|  |     'NO:no', | ||||||
|  |     'NZ:en', | ||||||
|  |     'PE:es-419', | ||||||
|  |     'PH:en', | ||||||
|  |     'PK:en', | ||||||
|  |     'PL:pl', | ||||||
|  |     'PT:pt-150', | ||||||
|  |     'RO:ro', | ||||||
|  |     'RS:sr', | ||||||
|  |     'RU:ru', | ||||||
|  |     'SA:ar', | ||||||
|  |     'SE:sv', | ||||||
|  |     'SG:en', | ||||||
|  |     'SI:sl', | ||||||
|  |     'SK:sk', | ||||||
|  |     'SN:fr', | ||||||
|  |     'TH:th', | ||||||
|  |     'TR:tr', | ||||||
|  |     'TW:zh-Hant', | ||||||
|  |     'TZ:en', | ||||||
|  |     'UA:ru', | ||||||
|  |     'UA:uk', | ||||||
|  |     'UG:en', | ||||||
|  |     'US:en', | ||||||
|  |     'US:es-419', | ||||||
|  |     'VE:es-419', | ||||||
|  |     'VN:vi', | ||||||
|  |     'ZA:en', | ||||||
|  |     'ZW:en', | ||||||
|  | ] | ||||||
|  | """List of region/language combinations supported by Google News.  Values of the | ||||||
|  | ``ceid`` argument of the Google News REST API.""" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | _skip_values = [ | ||||||
|  |     'ET:en',  # english (ethiopia) | ||||||
|  |     'ID:en',  # english (indonesia) | ||||||
|  |     'LV:en',  # english (latvia) | ||||||
|  | ] | ||||||
|  | 
 | ||||||
|  | _ceid_locale_map = {'NO:no': 'nb-NO'} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def fetch_traits(engine_traits: EngineTraits): | ||||||
|  |     _fetch_traits(engine_traits, add_domains=False) | ||||||
|  | 
 | ||||||
|  |     engine_traits.custom['ceid'] = {} | ||||||
|  | 
 | ||||||
|  |     for ceid in ceid_list: | ||||||
|  |         if ceid in _skip_values: | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         region, lang = ceid.split(':') | ||||||
|  |         x = lang.split('-') | ||||||
|  |         if len(x) > 1: | ||||||
|  |             if x[1] not in ['Hant', 'Hans']: | ||||||
|  |                 lang = x[0] | ||||||
|  | 
 | ||||||
|  |         sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region) | ||||||
|  |         try: | ||||||
|  |             locale = babel.Locale.parse(sxng_locale, sep='-') | ||||||
|  |         except babel.UnknownLocaleError: | ||||||
|  |             print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale)) | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid | ||||||
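For illustration, assuming ``locales.region_tag()`` renders a locale as *language-territory*, the loop above would fill ``engine_traits.custom['ceid']`` roughly like this:

.. code:: python

   {
       'de-AT': 'AT:de',
       'pt-BR': 'BR:pt-419',   # the '419' suffix is dropped before parsing
       'zh-CN': 'CN:zh-Hans',  # 'Hans'/'Hant' suffixes are kept for parsing
       'nb-NO': 'NO:no',       # via _ceid_locale_map
       # ...
   }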
|  | |||||||
| @ -1,19 +1,18 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """Google (Scholar) | """This is the implementation of the Google Scholar engine. | ||||||
| 
 | 
 | ||||||
| For detailed description of the *REST-full* API see: `Query Parameter | Compared to other Google services the Scholar engine has a simple GET REST-API | ||||||
| Definitions`_. | and there is no `async` API.  Even though the API is slightly vintage we | ||||||
| 
 | can make use of the :ref:`google API` to assemble the arguments of the GET | ||||||
| .. _Query Parameter Definitions: | request. | ||||||
|    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions |  | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| # pylint: disable=invalid-name | from typing import TYPE_CHECKING | ||||||
|  | from typing import Optional | ||||||
| 
 | 
 | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from datetime import datetime | from datetime import datetime | ||||||
| from typing import Optional |  | ||||||
| from lxml import html | from lxml import html | ||||||
| 
 | 
 | ||||||
| from searx.utils import ( | from searx.utils import ( | ||||||
| @ -23,19 +22,21 @@ from searx.utils import ( | |||||||
|     extract_text, |     extract_text, | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
|  | from searx.exceptions import SearxEngineCaptchaException | ||||||
|  | 
 | ||||||
|  | from searx.engines.google import fetch_traits  # pylint: disable=unused-import | ||||||
| from searx.engines.google import ( | from searx.engines.google import ( | ||||||
|     get_lang_info, |     get_google_info, | ||||||
|     time_range_dict, |     time_range_dict, | ||||||
|     detect_google_sorry, |  | ||||||
| ) | ) | ||||||
|  | from searx.enginelib.traits import EngineTraits | ||||||
| 
 | 
 | ||||||
| # pylint: disable=unused-import | if TYPE_CHECKING: | ||||||
| from searx.engines.google import ( |     import logging | ||||||
|     supported_languages_url, |  | ||||||
|     _fetch_supported_languages, |  | ||||||
| ) |  | ||||||
| 
 | 
 | ||||||
| # pylint: enable=unused-import |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -51,53 +52,62 @@ about = { | |||||||
| categories = ['science', 'scientific publications'] | categories = ['science', 'scientific publications'] | ||||||
| paging = True | paging = True | ||||||
| language_support = True | language_support = True | ||||||
| use_locale_domain = True |  | ||||||
| time_range_support = True | time_range_support = True | ||||||
| safesearch = False | safesearch = False | ||||||
| send_accept_language_header = True | send_accept_language_header = True | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def time_range_url(params): | def time_range_args(params): | ||||||
|     """Returns a URL query component for a google-Scholar time range based on |     """Returns a dictionary with a time range arguments based on | ||||||
|     ``params['time_range']``.  Google-Scholar does only support ranges in years. |     ``params['time_range']``. | ||||||
|     To have any effect, all the Searx ranges (*day*, *week*, *month*, *year*) | 
 | ||||||
|     are mapped to *year*.  If no range is set, an empty string is returned. |     Google Scholar supports a detailed search by year.  Searching by *last | ||||||
|     Example:: |     month* or *last week* (as offered by SearXNG) is uncommon for scientific | ||||||
|  |     publications and is not supported by Google Scholar. | ||||||
|  | 
 | ||||||
|  |     To limit the result list when the user selects a range, all the SearXNG | ||||||
|  |     ranges (*day*, *week*, *month*, *year*) are mapped to *year*.  If no range | ||||||
|  |     is set, an empty dictionary of arguments is returned.  Example: when the | ||||||
|  |     user selects a time range (current year minus one in 2022): | ||||||
|  | 
 | ||||||
|  |     .. code:: python | ||||||
|  | 
 | ||||||
|  |         { 'as_ylo' : 2021 } | ||||||
| 
 | 
 | ||||||
|         &as_ylo=2019 |  | ||||||
|     """ |     """ | ||||||
|     # as_ylo=2016&as_yhi=2019 |     ret_val = {} | ||||||
|     ret_val = '' |  | ||||||
|     if params['time_range'] in time_range_dict: |     if params['time_range'] in time_range_dict: | ||||||
|         ret_val = urlencode({'as_ylo': datetime.now().year - 1}) |         ret_val['as_ylo'] = datetime.now().year - 1 | ||||||
|     return '&' + ret_val |     return ret_val | ||||||
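A quick sketch of both branches of ``time_range_args`` (assuming ``time_range_dict`` holds the usual *day/week/month/year* keys from the google engine)::

    time_range_args({'time_range': 'week'})   # --> {'as_ylo': <current year - 1>}
    time_range_args({'time_range': None})     # --> {}  (no range selected)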
|  | 
 | ||||||
|  | 
 | ||||||
|  | def detect_google_captcha(dom): | ||||||
|  |     """In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is | ||||||
|  |     not redirected to ``sorry.google.com``. | ||||||
|  |     """ | ||||||
|  |     if eval_xpath(dom, "//form[@id='gs_captcha_f']"): | ||||||
|  |         raise SearxEngineCaptchaException() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     """Google-Scholar search request""" |     """Google-Scholar search request""" | ||||||
| 
 | 
 | ||||||
|     offset = (params['pageno'] - 1) * 10 |     google_info = get_google_info(params, traits) | ||||||
|     lang_info = get_lang_info(params, supported_languages, language_aliases, False) |  | ||||||
| 
 |  | ||||||
|     # subdomain is: scholar.google.xy |     # subdomain is: scholar.google.xy | ||||||
|     lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.") |     google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.") | ||||||
| 
 | 
 | ||||||
|     query_url = ( |     args = { | ||||||
|         'https://' |         'q': query, | ||||||
|         + lang_info['subdomain'] |         **google_info['params'], | ||||||
|         + '/scholar' |         'start': (params['pageno'] - 1) * 10, | ||||||
|         + "?" |         'as_sdt': '2007',  # include patents / to disable set '0,5' | ||||||
|         + urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset}) |         'as_vis': '0',  # include citations / to disable set '1' | ||||||
|     ) |     } | ||||||
|  |     args.update(time_range_args(params)) | ||||||
| 
 | 
 | ||||||
|     query_url += time_range_url(params) |     params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args) | ||||||
|     params['url'] = query_url |     params['cookies'] = google_info['cookies'] | ||||||
| 
 |     params['headers'].update(google_info['headers']) | ||||||
|     params['cookies']['CONSENT'] = "YES+" |  | ||||||
|     params['headers'].update(lang_info['headers']) |  | ||||||
|     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' |  | ||||||
| 
 |  | ||||||
|     # params['google_subdomain'] = subdomain |  | ||||||
|     return params |     return params | ||||||
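Assembled, the Scholar GET request looks roughly like this (a sketch for page 2; the language/region arguments contributed by ``get_google_info`` depend on the selected locale and are elided, and ``as_ylo`` is only present when a time range is selected)::

    https://scholar.google.com/scholar?q=searxng&start=10&as_sdt=2007&as_vis=0&as_ylo=2024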
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -138,19 +148,15 @@ def parse_gs_a(text: Optional[str]): | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def response(resp):  # pylint: disable=too-many-locals | def response(resp):  # pylint: disable=too-many-locals | ||||||
|     """Get response from google's search request""" |     """Parse response from Google Scholar""" | ||||||
|     results = [] |     results = [] | ||||||
| 
 | 
 | ||||||
|     detect_google_sorry(resp) |  | ||||||
| 
 |  | ||||||
|     # which subdomain ? |  | ||||||
|     # subdomain = resp.search_params.get('google_subdomain') |  | ||||||
| 
 |  | ||||||
|     # convert the text to dom |     # convert the text to dom | ||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
|  |     detect_google_captcha(dom) | ||||||
| 
 | 
 | ||||||
|     # parse results |     # parse results | ||||||
|     for result in eval_xpath_list(dom, '//div[@data-cid]'): |     for result in eval_xpath_list(dom, '//div[@data-rp]'): | ||||||
| 
 | 
 | ||||||
|         title = extract_text(eval_xpath(result, './/h3[1]//a')) |         title = extract_text(eval_xpath(result, './/h3[1]//a')) | ||||||
| 
 | 
 | ||||||
| @ -158,7 +164,7 @@ def response(resp):  # pylint: disable=too-many-locals | |||||||
|             # this is a [ZITATION] block |             # this is a [ZITATION] block | ||||||
|             continue |             continue | ||||||
| 
 | 
 | ||||||
|         pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]')) |         pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) | ||||||
|         if pub_type: |         if pub_type: | ||||||
|             pub_type = pub_type[1:-1].lower() |             pub_type = pub_type[1:-1].lower() | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -1,6 +1,6 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """This is the implementation of the google videos engine. | """This is the implementation of the Google Videos engine. | ||||||
| 
 | 
 | ||||||
| .. admonition:: Content-Security-Policy (CSP) | .. admonition:: Content-Security-Policy (CSP) | ||||||
| 
 | 
 | ||||||
| @ -14,9 +14,8 @@ | |||||||
| 
 | 
 | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| # pylint: disable=invalid-name | from typing import TYPE_CHECKING | ||||||
| 
 | 
 | ||||||
| import re |  | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from lxml import html | from lxml import html | ||||||
| 
 | 
 | ||||||
| @ -27,20 +26,22 @@ from searx.utils import ( | |||||||
|     extract_text, |     extract_text, | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
|  | from searx.engines.google import fetch_traits  # pylint: disable=unused-import | ||||||
| from searx.engines.google import ( | from searx.engines.google import ( | ||||||
|     get_lang_info, |     get_google_info, | ||||||
|     time_range_dict, |     time_range_dict, | ||||||
|     filter_mapping, |     filter_mapping, | ||||||
|     g_section_with_header, |  | ||||||
|     title_xpath, |  | ||||||
|     suggestion_xpath, |     suggestion_xpath, | ||||||
|     detect_google_sorry, |     detect_google_sorry, | ||||||
| ) | ) | ||||||
|  | from searx.enginelib.traits import EngineTraits | ||||||
| 
 | 
 | ||||||
| # pylint: disable=unused-import | if TYPE_CHECKING: | ||||||
| from searx.engines.google import supported_languages_url, _fetch_supported_languages |     import logging | ||||||
| 
 | 
 | ||||||
| # pylint: enable=unused-import |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -55,70 +56,32 @@ about = { | |||||||
| # engine dependent config | # engine dependent config | ||||||
| 
 | 
 | ||||||
| categories = ['videos', 'web'] | categories = ['videos', 'web'] | ||||||
| paging = False | paging = True | ||||||
| language_support = True | language_support = True | ||||||
| use_locale_domain = True |  | ||||||
| time_range_support = True | time_range_support = True | ||||||
| safesearch = True | safesearch = True | ||||||
| send_accept_language_header = True |  | ||||||
| 
 |  | ||||||
| RE_CACHE = {} |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def _re(regexpr): |  | ||||||
|     """returns compiled regular expression""" |  | ||||||
|     RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr)) |  | ||||||
|     return RE_CACHE[regexpr] |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def scrap_out_thumbs_src(dom): |  | ||||||
|     ret_val = {} |  | ||||||
|     thumb_name = 'dimg_' |  | ||||||
|     for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'): |  | ||||||
|         _script = script.text |  | ||||||
|         # "dimg_35":"https://i.ytimg.c....", |  | ||||||
|         _dimurl = _re("s='([^']*)").findall(_script) |  | ||||||
|         for k, v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)').findall(_script): |  | ||||||
|             v = v.replace(r'\u003d', '=') |  | ||||||
|             v = v.replace(r'\u0026', '&') |  | ||||||
|             ret_val[k] = v |  | ||||||
|     logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) |  | ||||||
|     return ret_val |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def scrap_out_thumbs(dom): |  | ||||||
|     """Scrap out thumbnail data from <script> tags.""" |  | ||||||
|     ret_val = {} |  | ||||||
|     thumb_name = 'dimg_' |  | ||||||
| 
 |  | ||||||
|     for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'): |  | ||||||
|         _script = script.text |  | ||||||
| 
 |  | ||||||
|         # var s='data:image/jpeg;base64, ...' |  | ||||||
|         _imgdata = _re("s='([^']*)").findall(_script) |  | ||||||
|         if not _imgdata: |  | ||||||
|             continue |  | ||||||
| 
 |  | ||||||
|         # var ii=['dimg_17'] |  | ||||||
|         for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script): |  | ||||||
|             # At least the equal sign in the URL needs to be decoded |  | ||||||
|             ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=") |  | ||||||
| 
 |  | ||||||
|     logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) |  | ||||||
|     return ret_val |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     """Google-Video search request""" |     """Google-Video search request""" | ||||||
| 
 | 
 | ||||||
|     lang_info = get_lang_info(params, supported_languages, language_aliases, False) |     google_info = get_google_info(params, traits) | ||||||
| 
 | 
 | ||||||
|     query_url = ( |     query_url = ( | ||||||
|         'https://' |         'https://' | ||||||
|         + lang_info['subdomain'] |         + google_info['subdomain'] | ||||||
|         + '/search' |         + '/search' | ||||||
|         + "?" |         + "?" | ||||||
|         + urlencode({'q': query, 'tbm': "vid", **lang_info['params'], 'ie': "utf8", 'oe': "utf8"}) |         + urlencode( | ||||||
|  |             { | ||||||
|  |                 'q': query, | ||||||
|  |                 'tbm': "vid", | ||||||
|  |                 'start': 10 * params['pageno'], | ||||||
|  |                 **google_info['params'], | ||||||
|  |                 'asearch': 'arc', | ||||||
|  |                 'async': 'use_ac:true,_fmt:html', | ||||||
|  |             } | ||||||
|  |         ) | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     if params['time_range'] in time_range_dict: |     if params['time_range'] in time_range_dict: | ||||||
| @ -127,9 +90,8 @@ def request(query, params): | |||||||
|         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) |         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) | ||||||
|     params['url'] = query_url |     params['url'] = query_url | ||||||
| 
 | 
 | ||||||
|     params['cookies']['CONSENT'] = "YES+" |     params['cookies'] = google_info['cookies'] | ||||||
|     params['headers'].update(lang_info['headers']) |     params['headers'].update(google_info['headers']) | ||||||
|     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' |  | ||||||
|     return params |     return params | ||||||
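The ``asearch``/``async`` pair requests Google's HTML fragment variant of the result page; a sketch of the assembled URL (``start`` follows the ``10 * pageno`` computation above; locale arguments and URL-encoding are elided)::

    https://www.google.com/search?q=searxng&tbm=vid&start=10&asearch=arc&async=use_ac:true,_fmt:html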
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -141,43 +103,30 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|     # convert the text to dom |     # convert the text to dom | ||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
|     vidthumb_imgdata = scrap_out_thumbs(dom) |  | ||||||
|     thumbs_src = scrap_out_thumbs_src(dom) |  | ||||||
|     logger.debug(str(thumbs_src)) |  | ||||||
| 
 | 
 | ||||||
|     # parse results |     # parse results | ||||||
|     for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'): |     for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'): | ||||||
| 
 | 
 | ||||||
|         # ignore google *sections* |         img_src = eval_xpath_getindex(result, './/img/@src', 0, None) | ||||||
|         if extract_text(eval_xpath(result, g_section_with_header)): |         if img_src is None: | ||||||
|             logger.debug("ignoring <g-section-with-header>") |  | ||||||
|             continue |             continue | ||||||
| 
 | 
 | ||||||
|         # ingnore articles without an image id / e.g. news articles |         title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0)) | ||||||
|         img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None) |         url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0) | ||||||
|         if img_id is None: |  | ||||||
|             logger.error("no img_id found in item %s (news article?)", len(results) + 1) |  | ||||||
|             continue |  | ||||||
| 
 | 
 | ||||||
|         img_src = vidthumb_imgdata.get(img_id, None) |  | ||||||
|         if not img_src: |  | ||||||
|             img_src = thumbs_src.get(img_id, "") |  | ||||||
| 
 |  | ||||||
|         title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) |  | ||||||
|         url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0) |  | ||||||
|         length = extract_text(eval_xpath(result, './/div[contains(@class, "P7xzyf")]/span/span')) |  | ||||||
|         c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0) |         c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0) | ||||||
|         content = extract_text(c_node) |         content = extract_text(c_node) | ||||||
|         pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]')) |         pub_info = extract_text(eval_xpath(result, './/div[@class="P7xzyf"]')) | ||||||
|  |         length = extract_text(eval_xpath(result, './/div[@class="J1mWY"]')) | ||||||
| 
 | 
 | ||||||
|         results.append( |         results.append( | ||||||
|             { |             { | ||||||
|                 'url': url, |                 'url': url, | ||||||
|                 'title': title, |                 'title': title, | ||||||
|                 'content': content, |                 'content': content, | ||||||
|                 'length': length, |  | ||||||
|                 'author': pub_info, |                 'author': pub_info, | ||||||
|                 'thumbnail': img_src, |                 'thumbnail': img_src, | ||||||
|  |                 'length': length, | ||||||
|                 'template': 'videos.html', |                 'template': 'videos.html', | ||||||
|             } |             } | ||||||
|         ) |         ) | ||||||
|  | |||||||
| @ -1,18 +1,30 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| """ | # lint: pylint | ||||||
|  peertube (Videos) | """Peertube and :py:obj:`SepiaSearch <searx.engines.sepiasearch>` share | ||||||
|  | (more or less) the same REST API and the schema of the JSON result is identical. | ||||||
|  | 
 | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| from json import loads | import re | ||||||
| from datetime import datetime |  | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from searx.utils import html_to_text | from datetime import datetime | ||||||
|  | from dateutil.parser import parse | ||||||
|  | from dateutil.relativedelta import relativedelta | ||||||
|  | 
 | ||||||
|  | import babel | ||||||
|  | 
 | ||||||
|  | from searx import network | ||||||
|  | from searx.locales import language_tag | ||||||
|  | from searx.utils import html_to_text | ||||||
|  | from searx.enginelib.traits import EngineTraits | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
| 
 | 
 | ||||||
| # about |  | ||||||
| about = { | about = { | ||||||
|  |     # pylint: disable=line-too-long | ||||||
|     "website": 'https://joinpeertube.org', |     "website": 'https://joinpeertube.org', | ||||||
|     "wikidata_id": 'Q50938515', |     "wikidata_id": 'Q50938515', | ||||||
|     "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html', |     "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos', | ||||||
|     "use_official_api": True, |     "use_official_api": True, | ||||||
|     "require_api_key": False, |     "require_api_key": False, | ||||||
|     "results": 'JSON', |     "results": 'JSON', | ||||||
| @ -22,66 +34,155 @@ about = { | |||||||
| categories = ["videos"] | categories = ["videos"] | ||||||
| paging = True | paging = True | ||||||
| base_url = "https://peer.tube" | base_url = "https://peer.tube" | ||||||
| supported_languages_url = 'https://peer.tube/api/v1/videos/languages' | """Base URL of the Peertube instance.  A list of instances is available at: | ||||||
|  | 
 | ||||||
|  | - https://instances.joinpeertube.org/instances | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | time_range_support = True | ||||||
|  | time_range_table = { | ||||||
|  |     'day': relativedelta(), | ||||||
|  |     'week': relativedelta(weeks=-1), | ||||||
|  |     'month': relativedelta(months=-1), | ||||||
|  |     'year': relativedelta(years=-1), | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | safesearch = True | ||||||
|  | safesearch_table = {0: 'both', 1: 'false', 2: 'false'} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def minute_to_hm(minute): | ||||||
|  |     if isinstance(minute, int): | ||||||
|  |         return "%d:%02d" % (divmod(minute, 60)) | ||||||
|  |     return None | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # do search-request |  | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     sanitized_url = base_url.rstrip("/") |     """Assemble request for the Peertube API""" | ||||||
|     pageno = (params["pageno"] - 1) * 15 | 
 | ||||||
|     search_url = sanitized_url + "/api/v1/search/videos/?pageno={pageno}&{query}" |     if not query: | ||||||
|     query_dict = {"search": query} |         return False | ||||||
|     language = params["language"].split("-")[0] | 
 | ||||||
|     if "all" != language and language in supported_languages: |     # eng_region = traits.get_region(params['searxng_locale'], 'en_US') | ||||||
|         query_dict["languageOneOf"] = language |     eng_lang = traits.get_language(params['searxng_locale'], None) | ||||||
|     params["url"] = search_url.format(query=urlencode(query_dict), pageno=pageno) | 
 | ||||||
|  |     params['url'] = ( | ||||||
|  |         base_url.rstrip("/") | ||||||
|  |         + "/api/v1/search/videos?" | ||||||
|  |         + urlencode( | ||||||
|  |             { | ||||||
|  |                 'search': query, | ||||||
|  |                 'searchTarget': 'search-index',  # Vidiversum | ||||||
|  |                 'resultType': 'videos', | ||||||
|  |                 'start': (params['pageno'] - 1) * 10, | ||||||
|  |                 'count': 10, | ||||||
|  |                 # -createdAt: sort by date ascending / createdAt: date descending | ||||||
|  |                 'sort': '-match',  # sort by *match descending* | ||||||
|  |                 'nsfw': safesearch_table[params['safesearch']], | ||||||
|  |             } | ||||||
|  |         ) | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     if eng_lang is not None: | ||||||
|  |         params['url'] += '&languageOneOf[]=' + eng_lang | ||||||
|  |         params['url'] += '&boostLanguages[]=' + eng_lang | ||||||
|  | 
 | ||||||
|  |     if params['time_range'] in time_range_table: | ||||||
|  |         time = datetime.now().date() + time_range_table[params['time_range']] | ||||||
|  |         params['url'] += '&startDate=' + time.isoformat() | ||||||
|  | 
 | ||||||
|     return params |     return params | ||||||
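Two of the arguments above come from small lookup tables; a sketch of their effect (``relativedelta`` shifts the current date backwards for the ``startDate`` filter)::

    safesearch_table[0]                                 # --> 'both' (nsfw unfiltered)
    datetime.now().date() + time_range_table['month']   # --> date one month back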
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _get_offset_from_pageno(pageno): |  | ||||||
|     return (pageno - 1) * 15 + 1 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # get response from search-request |  | ||||||
| def response(resp): | def response(resp): | ||||||
|     sanitized_url = base_url.rstrip("/") |     return video_response(resp) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def video_response(resp): | ||||||
|  |     """Parse video response from SepiaSearch and Peertube instances.""" | ||||||
|     results = [] |     results = [] | ||||||
| 
 | 
 | ||||||
|     search_res = loads(resp.text) |     json_data = resp.json() | ||||||
| 
 | 
 | ||||||
|     # return empty array if there are no results |     if 'data' not in json_data: | ||||||
|     if "data" not in search_res: |  | ||||||
|         return [] |         return [] | ||||||
| 
 | 
 | ||||||
|     # parse results |     for result in json_data['data']: | ||||||
|     for res in search_res["data"]: |         metadata = [ | ||||||
|         title = res["name"] |             x | ||||||
|         url = sanitized_url + "/videos/watch/" + res["uuid"] |             for x in [ | ||||||
|         description = res["description"] |                 result.get('channel', {}).get('displayName'), | ||||||
|         if description: |                 result.get('channel', {}).get('name') + '@' + result.get('channel', {}).get('host'), | ||||||
|             content = html_to_text(res["description"]) |                 ', '.join(result.get('tags', [])), | ||||||
|         else: |             ] | ||||||
|             content = "" |             if x | ||||||
|         thumbnail = sanitized_url + res["thumbnailPath"] |         ] | ||||||
|         publishedDate = datetime.strptime(res["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ") |  | ||||||
| 
 | 
 | ||||||
|         results.append( |         results.append( | ||||||
|             { |             { | ||||||
|                 "template": "videos.html", |                 'url': result['url'], | ||||||
|                 "url": url, |                 'title': result['name'], | ||||||
|                 "title": title, |                 'content': html_to_text(result.get('description') or ''), | ||||||
|                 "content": content, |                 'author': result.get('account', {}).get('displayName'), | ||||||
|                 "publishedDate": publishedDate, |                 'length': minute_to_hm(result.get('duration')), | ||||||
|                 "iframe_src": sanitized_url + res["embedPath"], |                 'template': 'videos.html', | ||||||
|                 "thumbnail": thumbnail, |                 'publishedDate': parse(result['publishedAt']), | ||||||
|  |                 'iframe_src': result.get('embedUrl'), | ||||||
|  |                 'thumbnail': result.get('thumbnailUrl') or result.get('previewUrl'), | ||||||
|  |                 'metadata': ' | '.join(metadata), | ||||||
|             } |             } | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|     # return results |  | ||||||
|     return results |     return results | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _fetch_supported_languages(resp): | def fetch_traits(engine_traits: EngineTraits): | ||||||
|     videolanguages = resp.json() |     """Fetch languages from peertube's search-index source code. | ||||||
|     peertube_languages = list(videolanguages.keys()) | 
 | ||||||
|     return peertube_languages |     See videoLanguages_ in commit `8ed5c729 - Refactor and redesign client`_ | ||||||
|  | 
 | ||||||
|  |     .. _8ed5c729 - Refactor and redesign client: | ||||||
|  |        https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729 | ||||||
|  |     .. _videoLanguages: | ||||||
|  |        https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729#3d8747f9a60695c367c70bb64efba8f403721fad_0_291 | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     resp = network.get( | ||||||
|  |         'https://framagit.org/framasoft/peertube/search-index/-/raw/master/client/src/components/Filters.vue', | ||||||
|  |         # the response from search-index repository is very slow | ||||||
|  |         timeout=60, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     if not resp.ok: | ||||||
|  |         print("ERROR: response from peertube is not OK.") | ||||||
|  |         return | ||||||
|  | 
 | ||||||
|  |     js_lang = re.search(r"videoLanguages \(\)[^\n]+(.*?)\]", resp.text, re.DOTALL) | ||||||
|  |     if not js_lang: | ||||||
|  |         print("ERROR: can't determine languages from peertube") | ||||||
|  |         return | ||||||
|  | 
 | ||||||
|  |     for lang in re.finditer(r"\{ id: '([a-z]+)', label:", js_lang.group(1)): | ||||||
|  |         try: | ||||||
|  |             eng_tag = lang.group(1) | ||||||
|  |             if eng_tag == 'oc': | ||||||
|  |                 # Occitan is not known by babel; its closest relative is Catalan | ||||||
|  |                 # but 'ca' is already in the list of engine_traits.languages --> | ||||||
|  |                 # 'oc' will be ignored. | ||||||
|  |                 continue | ||||||
|  | 
 | ||||||
|  |             sxng_tag = language_tag(babel.Locale.parse(eng_tag)) | ||||||
|  | 
 | ||||||
|  |         except babel.UnknownLocaleError: | ||||||
|  |             print("ERROR: %s is unknown by babel" % eng_tag) | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         conflict = engine_traits.languages.get(sxng_tag) | ||||||
|  |         if conflict: | ||||||
|  |             if conflict != eng_tag: | ||||||
|  |                 print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) | ||||||
|  |             continue | ||||||
|  |         engine_traits.languages[sxng_tag] = eng_tag | ||||||
|  | 
 | ||||||
|  |     engine_traits.languages['zh_Hans'] = 'zh' | ||||||
|  |     engine_traits.languages['zh_Hant'] = 'zh' | ||||||
|  | |||||||
| @ -34,7 +34,9 @@ import babel | |||||||
| 
 | 
 | ||||||
| from searx.exceptions import SearxEngineAPIException | from searx.exceptions import SearxEngineAPIException | ||||||
| from searx.network import raise_for_httperror | from searx.network import raise_for_httperror | ||||||
| from searx.locales import get_engine_locale | from searx.enginelib.traits import EngineTraits | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -49,7 +51,6 @@ about = { | |||||||
| # engine dependent config | # engine dependent config | ||||||
| categories = [] | categories = [] | ||||||
| paging = True | paging = True | ||||||
| supported_languages_url = about['website'] |  | ||||||
| qwant_categ = None  # web|news|images|videos | qwant_categ = None  # web|news|images|videos | ||||||
| 
 | 
 | ||||||
| safesearch = True | safesearch = True | ||||||
| @ -95,7 +96,7 @@ def request(query, params): | |||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     # add qwant's locale |     # add qwant's locale | ||||||
|     q_locale = get_engine_locale(params['language'], supported_languages, default='en_US') |     q_locale = traits.get_region(params["searxng_locale"], default='en_US') | ||||||
|     params['url'] += '&locale=' + q_locale |     params['url'] += '&locale=' + q_locale | ||||||
| 
 | 
 | ||||||
|     # add safesearch option |     # add safesearch option | ||||||
| @ -243,15 +244,20 @@ def response(resp): | |||||||
|     return results |     return results | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _fetch_supported_languages(resp): | def fetch_traits(engine_traits: EngineTraits): | ||||||
| 
 | 
 | ||||||
|  |     # pylint: disable=import-outside-toplevel | ||||||
|  |     from searx import network | ||||||
|  |     from searx.locales import region_tag | ||||||
|  | 
 | ||||||
|  |     resp = network.get(about['website']) | ||||||
|     text = resp.text |     text = resp.text | ||||||
|     text = text[text.find('INITIAL_PROPS') :] |     text = text[text.find('INITIAL_PROPS') :] | ||||||
|     text = text[text.find('{') : text.find('</script>')] |     text = text[text.find('{') : text.find('</script>')] | ||||||
| 
 | 
 | ||||||
|     q_initial_props = loads(text) |     q_initial_props = loads(text) | ||||||
|     q_locales = q_initial_props.get('locales') |     q_locales = q_initial_props.get('locales') | ||||||
|     q_valid_locales = [] |     eng_tag_list = set() | ||||||
| 
 | 
 | ||||||
|     for country, v in q_locales.items(): |     for country, v in q_locales.items(): | ||||||
|         for lang in v['langs']: |         for lang in v['langs']: | ||||||
| @ -261,25 +267,18 @@ def _fetch_supported_languages(resp): | |||||||
|                 # qwant-news does not support all locales from qwant-web: |                 # qwant-news does not support all locales from qwant-web: | ||||||
|                 continue |                 continue | ||||||
| 
 | 
 | ||||||
|             q_valid_locales.append(_locale) |             eng_tag_list.add(_locale) | ||||||
| 
 | 
 | ||||||
|     supported_languages = {} |     for eng_tag in eng_tag_list: | ||||||
| 
 |  | ||||||
|     for q_locale in q_valid_locales: |  | ||||||
|         try: |         try: | ||||||
|             locale = babel.Locale.parse(q_locale, sep='_') |             sxng_tag = region_tag(babel.Locale.parse(eng_tag, sep='_')) | ||||||
|         except babel.core.UnknownLocaleError: |         except babel.UnknownLocaleError: | ||||||
|             print("ERROR: can't determine babel locale of quant's locale %s" % q_locale) |             print("ERROR: can't determine babel locale of quant's locale %s" % eng_tag) | ||||||
|             continue |             continue | ||||||
| 
 | 
 | ||||||
|         # note: supported_languages (dict) |         conflict = engine_traits.regions.get(sxng_tag) | ||||||
|         # |         if conflict: | ||||||
|         #   dict's key is a string build up from a babel.Locale object / the |             if conflict != eng_tag: | ||||||
|         #   notation 'xx-XX' (and 'xx') conforms to SearXNG's locale (and |                 print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) | ||||||
|         #   language) notation and dict's values are the locale strings used by |             continue | ||||||
|         #   the engine. |         engine_traits.regions[sxng_tag] = eng_tag | ||||||
| 
 |  | ||||||
|         searxng_locale = locale.language + '-' + locale.territory  # --> params['language'] |  | ||||||
|         supported_languages[searxng_locale] = q_locale |  | ||||||
| 
 |  | ||||||
|     return supported_languages |  | ||||||
|  | |||||||
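A hedged sketch of one round of the mapping loop above, assuming ``region_tag`` joins language and territory with a hyphen::

    locale = babel.Locale.parse('en_US', sep='_')   # Qwant's locale notation
    sxng_tag = region_tag(locale)                   # --> 'en-US' (assumed)
    engine_traits.regions[sxng_tag] = 'en_US'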
| @ -1,70 +1,80 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| """ | # lint: pylint | ||||||
|  SepiaSearch (Videos) | """SepiaSearch uses the same languages as :py:obj:`Peertube | ||||||
|  | <searx.engines.peertube>` and the response is identical to the response from the | ||||||
|  | peertube engine. | ||||||
|  | 
 | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| from json import loads | from typing import TYPE_CHECKING | ||||||
| from dateutil import parser, relativedelta | 
 | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from datetime import datetime | from datetime import datetime | ||||||
| 
 | 
 | ||||||
| # about | from searx.engines.peertube import fetch_traits  # pylint: disable=unused-import | ||||||
|  | from searx.engines.peertube import ( | ||||||
|  |     # pylint: disable=unused-import | ||||||
|  |     video_response, | ||||||
|  |     safesearch_table, | ||||||
|  |     time_range_table, | ||||||
|  | ) | ||||||
|  | from searx.enginelib.traits import EngineTraits | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
|  | 
 | ||||||
| about = { | about = { | ||||||
|  |     # pylint: disable=line-too-long | ||||||
|     "website": 'https://sepiasearch.org', |     "website": 'https://sepiasearch.org', | ||||||
|     "wikidata_id": None, |     "wikidata_id": None, | ||||||
|     "official_api_documentation": "https://framagit.org/framasoft/peertube/search-index/-/tree/master/server/controllers/api",  # NOQA |     "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos', | ||||||
|     "use_official_api": True, |     "use_official_api": True, | ||||||
|     "require_api_key": False, |     "require_api_key": False, | ||||||
|     "results": 'JSON', |     "results": 'JSON', | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | # engine dependent config | ||||||
| categories = ['videos'] | categories = ['videos'] | ||||||
| paging = True | paging = True | ||||||
|  | 
 | ||||||
|  | base_url = 'https://sepiasearch.org' | ||||||
|  | 
 | ||||||
| time_range_support = True | time_range_support = True | ||||||
| safesearch = True | safesearch = True | ||||||
| supported_languages = [ |  | ||||||
|     # fmt: off |  | ||||||
|     'en', 'fr', 'ja', 'eu', 'ca', 'cs', 'eo', 'el', |  | ||||||
|     'de', 'it', 'nl', 'es', 'oc', 'gd', 'zh', 'pt', |  | ||||||
|     'sv', 'pl', 'fi', 'ru' |  | ||||||
|     # fmt: on |  | ||||||
| ] |  | ||||||
| base_url = 'https://sepiasearch.org/api/v1/search/videos' |  | ||||||
| 
 |  | ||||||
| safesearch_table = {0: 'both', 1: 'false', 2: 'false'} |  | ||||||
| 
 |  | ||||||
| time_range_table = { |  | ||||||
|     'day': relativedelta.relativedelta(), |  | ||||||
|     'week': relativedelta.relativedelta(weeks=-1), |  | ||||||
|     'month': relativedelta.relativedelta(months=-1), |  | ||||||
|     'year': relativedelta.relativedelta(years=-1), |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def minute_to_hm(minute): |  | ||||||
|     if isinstance(minute, int): |  | ||||||
|         return "%d:%02d" % (divmod(minute, 60)) |  | ||||||
|     return None |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
|  |     """Assemble request for the SepiaSearch API""" | ||||||
|  | 
 | ||||||
|  |     if not query: | ||||||
|  |         return False | ||||||
|  | 
 | ||||||
|  |     # eng_region = traits.get_region(params['searxng_locale'], 'en_US') | ||||||
|  |     eng_lang = traits.get_language(params['searxng_locale'], None) | ||||||
|  | 
 | ||||||
|     params['url'] = ( |     params['url'] = ( | ||||||
|         base_url |         base_url.rstrip("/") | ||||||
|         + '?' |         + "/api/v1/search/videos?" | ||||||
|         + urlencode( |         + urlencode( | ||||||
|             { |             { | ||||||
|                 'search': query, |                 'search': query, | ||||||
|                 'start': (params['pageno'] - 1) * 10, |                 'start': (params['pageno'] - 1) * 10, | ||||||
|                 'count': 10, |                 'count': 10, | ||||||
|                 'sort': '-match', |                 # -createdAt: sort by date ascending / createdAt: date descending | ||||||
|  |                 'sort': '-match',  # sort by *match descending* | ||||||
|                 'nsfw': safesearch_table[params['safesearch']], |                 'nsfw': safesearch_table[params['safesearch']], | ||||||
|             } |             } | ||||||
|         ) |         ) | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     language = params['language'].split('-')[0] |     if eng_lang is not None: | ||||||
|     if language in supported_languages: |         params['url'] += '&languageOneOf[]=' + eng_lang | ||||||
|         params['url'] += '&languageOneOf[]=' + language |         params['url'] += '&boostLanguages[]=' + eng_lang | ||||||
|  | 
 | ||||||
|     if params['time_range'] in time_range_table: |     if params['time_range'] in time_range_table: | ||||||
|         time = datetime.now().date() + time_range_table[params['time_range']] |         time = datetime.now().date() + time_range_table[params['time_range']] | ||||||
|         params['url'] += '&startDate=' + time.isoformat() |         params['url'] += '&startDate=' + time.isoformat() | ||||||
| @ -73,34 +83,4 @@ def request(query, params): | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def response(resp): | def response(resp): | ||||||
|     results = [] |     return video_response(resp) | ||||||
| 
 |  | ||||||
|     search_results = loads(resp.text) |  | ||||||
| 
 |  | ||||||
|     if 'data' not in search_results: |  | ||||||
|         return [] |  | ||||||
| 
 |  | ||||||
|     for result in search_results['data']: |  | ||||||
|         title = result['name'] |  | ||||||
|         content = result['description'] |  | ||||||
|         thumbnail = result['thumbnailUrl'] |  | ||||||
|         publishedDate = parser.parse(result['publishedAt']) |  | ||||||
|         author = result.get('account', {}).get('displayName') |  | ||||||
|         length = minute_to_hm(result.get('duration')) |  | ||||||
|         url = result['url'] |  | ||||||
| 
 |  | ||||||
|         results.append( |  | ||||||
|             { |  | ||||||
|                 'url': url, |  | ||||||
|                 'title': title, |  | ||||||
|                 'content': content, |  | ||||||
|                 'author': author, |  | ||||||
|                 'length': length, |  | ||||||
|                 'template': 'videos.html', |  | ||||||
|                 'publishedDate': publishedDate, |  | ||||||
|                 'iframe_src': result.get('embedUrl'), |  | ||||||
|                 'thumbnail': thumbnail, |  | ||||||
|             } |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
|     return results |  | ||||||
|  | |||||||
| @ -1,28 +1,108 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """Startpage (Web) | """Startpage's language & region selectors are a mess .. | ||||||
|  | 
 | ||||||
|  | .. _startpage regions: | ||||||
|  | 
 | ||||||
|  | Startpage regions | ||||||
|  | ================= | ||||||
|  | 
 | ||||||
|  | In the list of regions there are tags we need to map to common region tags:: | ||||||
|  | 
 | ||||||
|  |   pt-BR_BR --> pt_BR | ||||||
|  |   zh-CN_CN --> zh_Hans_CN | ||||||
|  |   zh-TW_TW --> zh_Hant_TW | ||||||
|  |   zh-TW_HK --> zh_Hant_HK | ||||||
|  |   en-GB_GB --> en_GB | ||||||
|  | 
 | ||||||
|  | and there is at least one tag with a three letter language tag (ISO 639-2):: | ||||||
|  | 
 | ||||||
|  |   fil_PH --> fil_PH | ||||||
|  | 
 | ||||||
|  | The locale code ``no_NO`` from Startpage does not exist and is mapped to | ||||||
|  | ``nb-NO`` (see the sketch after this docstring):: | ||||||
|  | 
 | ||||||
|  |     babel.core.UnknownLocaleError: unknown locale 'no_NO' | ||||||
|  | 
 | ||||||
|  | For reference see the language-subtag-registry at IANA; ``no`` is the macrolanguage [1]_ | ||||||
|  | and W3C recommends the more specific subtag over the macrolanguage [2]_. | ||||||
|  | 
 | ||||||
|  | .. [1] `iana: language-subtag-registry | ||||||
|  |    <https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry>`_ :: | ||||||
|  | 
 | ||||||
|  |       type: language | ||||||
|  |       Subtag: nb | ||||||
|  |       Description: Norwegian Bokmål | ||||||
|  |       Added: 2005-10-16 | ||||||
|  |       Suppress-Script: Latn | ||||||
|  |       Macrolanguage: no | ||||||
|  | 
 | ||||||
|  | .. [2] | ||||||
|  |    Use macrolanguages with care.  Some language subtags have a Scope field set to | ||||||
|  |    macrolanguage, i.e. this primary language subtag encompasses a number of more | ||||||
|  |    specific primary language subtags in the registry.  ...  As we recommended for | ||||||
|  |    the collection subtags mentioned above, in most cases you should try to use | ||||||
|  |    the more specific subtags ... `W3: The primary language subtag | ||||||
|  |    <https://www.w3.org/International/questions/qa-choosing-language-tags#langsubtag>`_ | ||||||
|  | 
 | ||||||
|  | .. _startpage languages: | ||||||
|  | 
 | ||||||
|  | Startpage languages | ||||||
|  | =================== | ||||||
|  | 
 | ||||||
|  | :py:obj:`send_accept_language_header`: | ||||||
|  |   The displayed names in Startpage's settings page depend on the location of | ||||||
|  |   the IP when the ``Accept-Language`` HTTP header is unset.  In :py:obj:`fetch_traits` | ||||||
|  |   we use:: | ||||||
|  | 
 | ||||||
|  |     'Accept-Language': "en-US,en;q=0.5", | ||||||
|  |     .. | ||||||
|  | 
 | ||||||
|  |   to get uniform names independent of the IP. | ||||||
|  | 
 | ||||||
|  | .. _startpage categories: | ||||||
|  | 
 | ||||||
|  | Startpage categories | ||||||
|  | ==================== | ||||||
|  | 
 | ||||||
|  | Startpage's category (for Web-search, News, Videos, ..) is set by | ||||||
|  | :py:obj:`startpage_categ` in settings.yml:: | ||||||
|  | 
 | ||||||
|  |   - name: startpage | ||||||
|  |     engine: startpage | ||||||
|  |     startpage_categ: web | ||||||
|  |     ... | ||||||
|  | 
 | ||||||
|  | .. hint:: | ||||||
|  | 
 | ||||||
|  |    The default category is ``web``; categories other than ``web`` are not | ||||||
|  |    yet implemented. | ||||||
| 
 | 
 | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
|  | from typing import TYPE_CHECKING | ||||||
|  | from collections import OrderedDict | ||||||
| import re | import re | ||||||
| from time import time |  | ||||||
| 
 |  | ||||||
| from urllib.parse import urlencode |  | ||||||
| from unicodedata import normalize, combining | from unicodedata import normalize, combining | ||||||
|  | from time import time | ||||||
| from datetime import datetime, timedelta | from datetime import datetime, timedelta | ||||||
| 
 | 
 | ||||||
| from dateutil import parser | import dateutil.parser | ||||||
| from lxml import html | import lxml.html | ||||||
| from babel import Locale | import babel | ||||||
| from babel.localedata import locale_identifiers |  | ||||||
| 
 | 
 | ||||||
| from searx.network import get | from searx import network | ||||||
| from searx.utils import extract_text, eval_xpath, match_language | from searx.utils import extract_text, eval_xpath, gen_useragent | ||||||
| from searx.exceptions import ( | from searx.exceptions import SearxEngineCaptchaException | ||||||
|     SearxEngineResponseException, | from searx.locales import region_tag | ||||||
|     SearxEngineCaptchaException, | from searx.enginelib.traits import EngineTraits | ||||||
| ) |  | ||||||
| 
 | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger: logging.Logger | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -34,18 +114,28 @@ about = { | |||||||
|     "results": 'HTML', |     "results": 'HTML', | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | startpage_categ = 'web' | ||||||
|  | """Startpage's category, visit :ref:`startpage categories`. | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | send_accept_language_header = True | ||||||
|  | """Startpage tries to guess user's language and territory from the HTTP | ||||||
|  | ``Accept-Language``.  Optional the user can select a search-language (can be | ||||||
|  | different to the UI language) and a region filter. | ||||||
|  | """ | ||||||
|  | 
 | ||||||
| # engine dependent config | # engine dependent config | ||||||
| categories = ['general', 'web'] | categories = ['general', 'web'] | ||||||
| # there is a mechanism to block "bot" search |  | ||||||
| # (probably the parameter qid), require |  | ||||||
| # storing of qid's between mulitble search-calls |  | ||||||
| 
 |  | ||||||
| paging = True | paging = True | ||||||
| supported_languages_url = 'https://www.startpage.com/do/settings' | time_range_support = True | ||||||
|  | safesearch = True | ||||||
|  | 
 | ||||||
|  | time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} | ||||||
|  | safesearch_dict = {0: '0', 1: '1', 2: '1'} | ||||||
| 
 | 
 | ||||||
| # search-url | # search-url | ||||||
| base_url = 'https://startpage.com/' | base_url = 'https://www.startpage.com' | ||||||
| search_url = base_url + 'sp/search?' | search_url = base_url + '/sp/search' | ||||||
| 
 | 
 | ||||||
| # specific xpath variables | # specific xpath variables | ||||||
| # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] | # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] | ||||||
| @ -53,92 +143,193 @@ search_url = base_url + 'sp/search?' | |||||||
| results_xpath = '//div[@class="w-gl__result__main"]' | results_xpath = '//div[@class="w-gl__result__main"]' | ||||||
| link_xpath = './/a[@class="w-gl__result-title result-link"]' | link_xpath = './/a[@class="w-gl__result-title result-link"]' | ||||||
| content_xpath = './/p[@class="w-gl__description"]' | content_xpath = './/p[@class="w-gl__description"]' | ||||||
|  | search_form_xpath = '//form[@id="search"]' | ||||||
|  | """XPath of Startpage's origin search form | ||||||
|  | 
 | ||||||
|  | .. code:: html | ||||||
|  | 
 | ||||||
|  |     <form action="/sp/search" method="post"> | ||||||
|  |       <input type="text" name="query"  value="" ..> | ||||||
|  |       <input type="hidden" name="t" value="device"> | ||||||
|  |       <input type="hidden" name="lui" value="english"> | ||||||
|  |       <input type="hidden" name="sc" value="Q7Mt5TRqowKB00"> | ||||||
|  |       <input type="hidden" name="cat" value="web"> | ||||||
|  |       <input type="hidden" class="abp" id="abp-input" name="abp" value="1"> | ||||||
|  |     </form> | ||||||
|  | """ | ||||||
| 
 | 
 | ||||||
| # timestamp of the last fetch of 'sc' code | # timestamp of the last fetch of 'sc' code | ||||||
| sc_code_ts = 0 | sc_code_ts = 0 | ||||||
| sc_code = '' | sc_code = '' | ||||||
|  | sc_code_cache_sec = 30 | ||||||
|  | """Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`.""" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def raise_captcha(resp): | def get_sc_code(searxng_locale, params): | ||||||
|  |     """Get an actual ``sc`` argument from Startpage's search form (HTML page). | ||||||
| 
 | 
 | ||||||
|     if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): |     Startpage puts a ``sc`` argument on every HTML :py:obj:`search form | ||||||
|         raise SearxEngineCaptchaException() |     <search_form_xpath>`.  Without this argument Startpage considers the request | ||||||
|  |     to be from a bot.  We do not know what is encoded in the value of the ``sc`` | ||||||
|  |     argument, but it seems to be a kind of *time-stamp*. | ||||||
| 
 | 
 | ||||||
| 
 |     Startpage's search form generates a new sc-code on each request.  This | ||||||
| def get_sc_code(headers): |     function scrapes a new sc-code from Startpage's home page every | ||||||
|     """Get an actual `sc` argument from startpage's home page. |     :py:obj:`sc_code_cache_sec` seconds. | ||||||
| 
 |  | ||||||
|     Startpage puts a `sc` argument on every link.  Without this argument |  | ||||||
|     startpage considers the request is from a bot.  We do not know what is |  | ||||||
|     encoded in the value of the `sc` argument, but it seems to be a kind of a |  | ||||||
|     *time-stamp*.  This *time-stamp* is valid for a few hours. |  | ||||||
| 
 |  | ||||||
|     This function scrap a new *time-stamp* from startpage's home page every hour |  | ||||||
|     (3000 sec). |  | ||||||
| 
 | 
 | ||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|     global sc_code_ts, sc_code  # pylint: disable=global-statement |     global sc_code_ts, sc_code  # pylint: disable=global-statement | ||||||
| 
 | 
 | ||||||
|     if time() > (sc_code_ts + 3000): |     if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)): | ||||||
|         logger.debug("query new sc time-stamp ...") |         logger.debug("get_sc_code: reuse '%s'", sc_code) | ||||||
|  |         return sc_code | ||||||
| 
 | 
 | ||||||
|         resp = get(base_url, headers=headers) |     headers = {**params['headers']} | ||||||
|         raise_captcha(resp) |     headers['Origin'] = base_url | ||||||
|         dom = html.fromstring(resp.text) |     headers['Referer'] = base_url + '/' | ||||||
|  |     # headers['Connection'] = 'keep-alive' | ||||||
|  |     # headers['Accept-Encoding'] = 'gzip, deflate, br' | ||||||
|  |     # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' | ||||||
|  |     # headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0' | ||||||
|  | 
 | ||||||
|  |     # add Accept-Language header | ||||||
|  |     if searxng_locale == 'all': | ||||||
|  |         searxng_locale = 'en-US' | ||||||
|  |     locale = babel.Locale.parse(searxng_locale, sep='-') | ||||||
|  | 
 | ||||||
|  |     if send_accept_language_header: | ||||||
|  |         ac_lang = locale.language | ||||||
|  |         if locale.territory: | ||||||
|  |             ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % ( | ||||||
|  |                 locale.language, | ||||||
|  |                 locale.territory, | ||||||
|  |                 locale.language, | ||||||
|  |             ) | ||||||
|  |         headers['Accept-Language'] = ac_lang | ||||||
|  | 
 | ||||||
|  |     get_sc_url = base_url + '/?sc=%s' % (sc_code) | ||||||
|  |     logger.debug("query new sc time-stamp ... %s", get_sc_url) | ||||||
|  |     logger.debug("headers: %s", headers) | ||||||
|  |     resp = network.get(get_sc_url, headers=headers) | ||||||
|  | 
 | ||||||
|  |     # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers) | ||||||
|  |     # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg | ||||||
|  |     # ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21 | ||||||
|  | 
 | ||||||
|  |     if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): | ||||||
|  |         raise SearxEngineCaptchaException( | ||||||
|  |             message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha", | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     dom = lxml.html.fromstring(resp.text) | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|             # <input type="hidden" name="sc" value="..."> |         sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0] | ||||||
|             sc_code = eval_xpath(dom, '//input[@name="sc"]/@value')[0] |  | ||||||
|     except IndexError as exc: |     except IndexError as exc: | ||||||
|             # suspend startpage API --> https://github.com/searxng/searxng/pull/695 |         logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695") | ||||||
|             raise SearxEngineResponseException( |         raise SearxEngineCaptchaException( | ||||||
|                 suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!" |             message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url, | ||||||
|         ) from exc |         ) from exc | ||||||
| 
 | 
 | ||||||
|     sc_code_ts = time() |     sc_code_ts = time() | ||||||
|         logger.debug("new value is: %s", sc_code) |     logger.debug("get_sc_code: new value is: %s", sc_code) | ||||||
| 
 |  | ||||||
|     return sc_code |     return sc_code | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # do search-request |  | ||||||
| def request(query, params): | def request(query, params): | ||||||
|  |     """Assemble a Startpage request. | ||||||
| 
 | 
 | ||||||
|     # pylint: disable=line-too-long |     To avoid CAPTCHA we need to send a well formed HTTP POST request with a | ||||||
|     # The format string from Startpage's FFox add-on [1]:: |     cookie.  We need to form a request that is identical to the request built by | ||||||
|     # |     Startpage's search form: | ||||||
|     #     https://www.startpage.com/do/dsearch?query={searchTerms}&cat=web&pl=ext-ff&language=__MSG_extensionUrlLanguage__&extVersion=1.3.0 |  | ||||||
|     # |  | ||||||
|     # [1] https://addons.mozilla.org/en-US/firefox/addon/startpage-private-search/ |  | ||||||
| 
 | 
 | ||||||
|  |     - in the cookie the **region** is selected | ||||||
|  |     - in the HTTP POST data the **language** is selected | ||||||
|  | 
 | ||||||
|  |     Additionally the arguments from Startpage's search form need to be set in | ||||||
|  |     the HTTP POST data / compare the ``<input>`` elements: :py:obj:`search_form_xpath`. | ||||||
|  |     """ | ||||||
|  |     if startpage_categ == 'web': | ||||||
|  |         return _request_cat_web(query, params) | ||||||
|  | 
 | ||||||
|  |     logger.error("Startpages's category '%' is not yet implemented.", startpage_categ) | ||||||
|  |     return params | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def _request_cat_web(query, params): | ||||||
|  | 
 | ||||||
|  |     engine_region = traits.get_region(params['searxng_locale'], 'en-US') | ||||||
|  |     engine_language = traits.get_language(params['searxng_locale'], 'en') | ||||||
|  | 
 | ||||||
|  |     # build arguments | ||||||
|     args = { |     args = { | ||||||
|         'query': query, |         'query': query, | ||||||
|         'page': params['pageno'], |  | ||||||
|         'cat': 'web', |         'cat': 'web', | ||||||
|         # 'pl': 'ext-ff', |         't': 'device', | ||||||
|         # 'extVersion': '1.3.0', |         'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers, | ||||||
|         # 'abp': "-1", |         'with_date': time_range_dict.get(params['time_range'], ''), | ||||||
|         'sc': get_sc_code(params['headers']), |  | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     # set language if specified |     if engine_language: | ||||||
|     if params['language'] != 'all': |         args['language'] = engine_language | ||||||
|         lang_code = match_language(params['language'], supported_languages, fallback=None) |         args['lui'] = engine_language | ||||||
|         if lang_code: | 
 | ||||||
|             language_name = supported_languages[lang_code]['alias'] |     args['abp'] = '1' | ||||||
|             args['language'] = language_name |     if params['pageno'] > 1: | ||||||
|             args['lui'] = language_name |         args['page'] = params['pageno'] | ||||||
|  | 
 | ||||||
|  |     # build cookie | ||||||
|  |     lang_homepage = 'en' | ||||||
|  |     cookie = OrderedDict() | ||||||
|  |     cookie['date_time'] = 'world' | ||||||
|  |     cookie['disable_family_filter'] = safesearch_dict[params['safesearch']] | ||||||
|  |     cookie['disable_open_in_new_window'] = '0' | ||||||
|  |     cookie['enable_post_method'] = '1'  # hint: POST | ||||||
|  |     cookie['enable_proxy_safety_suggest'] = '1' | ||||||
|  |     cookie['enable_stay_control'] = '1' | ||||||
|  |     cookie['instant_answers'] = '1' | ||||||
|  |     cookie['lang_homepage'] = 's/device/%s/' % lang_homepage | ||||||
|  |     cookie['num_of_results'] = '10' | ||||||
|  |     cookie['suggestions'] = '1' | ||||||
|  |     cookie['wt_unit'] = 'celsius' | ||||||
|  | 
 | ||||||
|  |     if engine_language: | ||||||
|  |         cookie['language'] = engine_language | ||||||
|  |         cookie['language_ui'] = engine_language | ||||||
|  | 
 | ||||||
|  |     if engine_region: | ||||||
|  |         cookie['search_results_region'] = engine_region | ||||||
|  | 
 | ||||||
|  |     params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()]) | ||||||
|  |     logger.debug('cookie preferences: %s', params['cookies']['preferences']) | ||||||
|  | 
 | ||||||
|  |     # POST request | ||||||
|  |     logger.debug("data: %s", args) | ||||||
|  |     params['data'] = args | ||||||
|  |     params['method'] = 'POST' | ||||||
|  |     params['url'] = search_url | ||||||
|  |     params['headers']['Origin'] = base_url | ||||||
|  |     params['headers']['Referer'] = base_url + '/' | ||||||
|  |     # is the Accept header needed? | ||||||
|  |     # params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' | ||||||
| 
 | 
 | ||||||
|     params['url'] = search_url + urlencode(args) |  | ||||||
|     return params |     return params | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
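
The ``preferences`` cookie built above is a flat string in Startpage's own encoding: each name/value pair is glued with ``EEE`` and the pairs are joined with ``N1N``.  A minimal, self-contained sketch (with a reduced, hypothetical set of cookie entries):

.. code:: python

   from collections import OrderedDict

   # reduced, hypothetical subset of the preference cookie
   cookie = OrderedDict()
   cookie['disable_family_filter'] = '0'
   cookie['enable_post_method'] = '1'
   cookie['search_results_region'] = 'en-US'

   # 'EEE' separates name from value, 'N1N' separates the pairs
   preferences = 'N1N'.join('%sEEE%s' % kv for kv in cookie.items())
   print(preferences)
   # disable_family_filterEEE0N1Nenable_post_methodEEE1N1Nsearch_results_regionEEEen-US
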
 # get response from search-request
 def response(resp):
-    results = []
-
-    dom = html.fromstring(resp.text)
+    dom = lxml.html.fromstring(resp.text)
+
+    if startpage_categ == 'web':
+        return _response_cat_web(dom)
+
+    logger.error("Startpage's category '%s' is not yet implemented.", startpage_categ)
+    return []
+
+
+def _response_cat_web(dom):
+    results = []

     # parse results
     for result in eval_xpath(dom, results_xpath):
@@ -173,7 +364,7 @@ def response(resp):
             content = content[date_pos:]

             try:
-                published_date = parser.parse(date_string, dayfirst=True)
+                published_date = dateutil.parser.parse(date_string, dayfirst=True)
             except ValueError:
                 pass

@@ -199,62 +390,103 @@ def response(resp):
     return results

-# get supported languages from their site
-def _fetch_supported_languages(resp):
-    # startpage's language selector is a mess each option has a displayed name
-    # and a value, either of which may represent the language name in the native
-    # script, the language name in English, an English transliteration of the
-    # native name, the English name of the writing script used by the language,
-    # or occasionally something else entirely.
-
-    # this cases are so special they need to be hardcoded, a couple of them are misspellings
-    language_names = {
-        'english_uk': 'en-GB',
-        'fantizhengwen': ['zh-TW', 'zh-HK'],
-        'hangul': 'ko',
-        'malayam': 'ml',
-        'norsk': 'nb',
-        'sinhalese': 'si',
-        'sudanese': 'su',
-    }
-
-    # get the English name of every language known by babel
-    language_names.update(
-        {
-            # fmt: off
-            name.lower(): lang_code
-            # pylint: disable=protected-access
-            for lang_code, name in Locale('en')._data['languages'].items()
-            # fmt: on
-        }
-    )
+def fetch_traits(engine_traits: EngineTraits):
+    """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
+    regions>` from Startpage."""
+    # pylint: disable=too-many-branches
+
+    headers = {
+        'User-Agent': gen_useragent(),
+        'Accept-Language': "en-US,en;q=0.5",  # request the settings page in English
+    }
+    resp = network.get('https://www.startpage.com/do/settings', headers=headers)
+
+    if not resp.ok:
+        print("ERROR: response from Startpage is not OK.")
+
+    dom = lxml.html.fromstring(resp.text)
+
+    # regions
+
+    sp_region_names = []
+    for option in dom.xpath('//form[@name="settings"]//select[@name="search_results_region"]/option'):
+        sp_region_names.append(option.get('value'))
+
+    for eng_tag in sp_region_names:
+        if eng_tag == 'all':
+            continue
+        babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag)  # norway
+
+        if '-' in babel_region_tag:
+            l, r = babel_region_tag.split('-')
+            r = r.split('_')[-1]
+            sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))
+
+        else:
+            try:
+                sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep='_'))
+
+            except babel.UnknownLocaleError:
+                print("ERROR: can't determine babel locale of startpage's locale %s" % eng_tag)
+                continue
+
+        conflict = engine_traits.regions.get(sxng_tag)
+        if conflict:
+            if conflict != eng_tag:
+                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
+            continue
+        engine_traits.regions[sxng_tag] = eng_tag
+
+    # languages
+
+    catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()}

     # get the native name of every language known by babel

-    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, locale_identifiers()):
-        native_name = Locale(lang_code).get_language_name().lower()
+    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()):
+        native_name = babel.Locale(lang_code).get_language_name().lower()
         # add native name exactly as it is
-        language_names[native_name] = lang_code
+        catalog_engine2code[native_name] = lang_code

         # add "normalized" language name (i.e. français becomes francais and español becomes espanol)
         unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
         if len(unaccented_name) == len(unaccented_name.encode()):
             # add only if result is ascii (otherwise "normalization" didn't work)
-            language_names[unaccented_name] = lang_code
+            catalog_engine2code[unaccented_name] = lang_code
+
+    # values that can't be determined by babel's language names
+
+    catalog_engine2code.update(
+        {
+            # traditional chinese used in ..
+            'fantizhengwen': 'zh_Hant',
+            # Korean alphabet
+            'hangul': 'ko',
+            # Malayalam is one of 22 scheduled languages of India.
+            'malayam': 'ml',
+            'norsk': 'nb',
+            'sinhalese': 'si',
+        }
+    )
+
+    skip_eng_tags = {
+        'english_uk',  # SearXNG lang 'en' already maps to 'english'
+    }

-    dom = html.fromstring(resp.text)
-    sp_lang_names = []
     for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):
-        sp_lang_names.append((option.get('value'), extract_text(option).lower()))

-    supported_languages = {}
-    for sp_option_value, sp_option_text in sp_lang_names:
-        lang_code = language_names.get(sp_option_value) or language_names.get(sp_option_text)
-        if isinstance(lang_code, str):
-            supported_languages[lang_code] = {'alias': sp_option_value}
-        elif isinstance(lang_code, list):
-            for _lc in lang_code:
-                supported_languages[_lc] = {'alias': sp_option_value}
-        else:
-            print('Unknown language option in Startpage: {} ({})'.format(sp_option_value, sp_option_text))
+        eng_tag = option.get('value')
+        if eng_tag in skip_eng_tags:
+            continue
+        name = extract_text(option).lower()

-    return supported_languages
+        sxng_tag = catalog_engine2code.get(eng_tag)
+        if sxng_tag is None:
+            sxng_tag = catalog_engine2code[name]
+
+        conflict = engine_traits.languages.get(sxng_tag)
+        if conflict:
+            if conflict != eng_tag:
+                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
+            continue
+        engine_traits.languages[sxng_tag] = eng_tag
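
In the region loop above, Startpage's option values (e.g. ``en-GB``, ``no_NO``) are normalized to SearXNG region tags via babel.  A reduced, standalone sketch of that mapping (the tag values are illustrative):

.. code:: python

   import babel

   def to_sxng_region(eng_tag: str) -> str:
       # mirror of the normalization in fetch_traits above
       eng_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag)  # babel knows 'nb', not 'no'
       if '-' in eng_tag:
           lang, terr = eng_tag.split('-')
           locale = babel.Locale.parse(lang + '_' + terr.split('_')[-1], sep='_')
       else:
           locale = babel.Locale.parse(eng_tag, sep='_')
       return locale.language + '-' + locale.territory

   print(to_sxng_region('en-GB'))  # --> en-GB
   print(to_sxng_region('no_NO'))  # --> nb-NO
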
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""Wikidata
+"""This module implements the Wikidata engine.  Some implementations are shared
+with the :ref:`wikipedia engine`.

 """
 # pylint: disable=missing-class-docstring

+from typing import TYPE_CHECKING
 from hashlib import md5
 from urllib.parse import urlencode, unquote
 from json import loads
@@ -13,12 +16,17 @@
 from searx.data import WIKIDATA_UNITS
 from searx.network import post, get
-from searx.utils import match_language, searx_useragent, get_string_replaces_function
+from searx.utils import searx_useragent, get_string_replaces_function
 from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
-from searx.engines.wikipedia import (  # pylint: disable=unused-import
-    _fetch_supported_languages,
-    supported_languages_url,
-)
+from searx.engines.wikipedia import fetch_traits as _fetch_traits
+from searx.enginelib.traits import EngineTraits
+
+if TYPE_CHECKING:
+    import logging
+
+    logger: logging.Logger
+
+traits: EngineTraits

 # about
 about = {
@@ -154,33 +162,35 @@ def send_wikidata_query(query, method='GET'):


 def request(query, params):
-    language = params['language'].split('-')[0]
-    if language == 'all':
-        language = 'en'
-    else:
-        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
+
+    # wikidata does not support zh-classical (zh_Hans); zh-TW, zh-HK and zh-CN
+    # are mapped to zh
+    sxng_lang = params['searxng_locale'].split('-')[0]
+    language = traits.get_language(sxng_lang, 'en')

     query, attributes = get_query(query, language)
+    logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))

     params['method'] = 'POST'
     params['url'] = SPARQL_ENDPOINT_URL
     params['data'] = {'query': query}
     params['headers'] = get_headers()
-
     params['language'] = language
     params['attributes'] = attributes
+
     return params


 def response(resp):
+
     results = []
     jsonresponse = loads(resp.content.decode())

-    language = resp.search_params['language'].lower()
+    language = resp.search_params['language']
     attributes = resp.search_params['attributes']
+    logger.debug("response --> language %s // len(attributes): %s", language, len(attributes))

     seen_entities = set()
-
     for result in jsonresponse.get('results', {}).get('bindings', []):
         attribute_result = {key: value['value'] for key, value in result.items()}
         entity_url = attribute_result['item']
@@ -756,3 +766,15 @@ def init(engine_settings=None):  # pylint: disable=unused-argument
         lang = result['name']['xml:lang']
         entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '')
         WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize()
+
+
+def fetch_traits(engine_traits: EngineTraits):
+    """Use languages evaluated from :py:obj:`wikipedia.fetch_traits
+    <searx.engines.wikipedia.fetch_traits>` except zh-classical (zh_Hans), which
+    is not supported by Wikidata."""
+
+    _fetch_traits(engine_traits)
+    # wikidata does not support zh-classical (zh_Hans)
+    engine_traits.languages.pop('zh_Hans')
+    # wikidata does not have net-locations for the languages
+    engine_traits.custom['wiki_netloc'] = {}
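
The net effect of this ``fetch_traits`` wrapper is easiest to see on a toy traits table (the attribute names follow the usage above; the table contents are hypothetical):

.. code:: python

   # hypothetical tables as wikipedia.fetch_traits might have filled them
   languages = {'en': 'en', 'zh': 'zh', 'zh_Hans': 'zh', 'zh_Hant': 'zh-classical'}
   wiki_netloc = {'en': 'en.wikipedia.org', 'zh-classical': 'zh-classical.wikipedia.org'}

   # what the wikidata wrapper does on top of the wikipedia traits
   languages.pop('zh_Hans')   # zh_Hans is not supported by wikidata
   wiki_netloc = {}           # wikidata has no per-language net-locations

   print(sorted(languages))   # --> ['en', 'zh', 'zh_Hant']
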
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -1,13 +1,26 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Wikipedia (Web)
-
+# lint: pylint
+"""This module implements the Wikipedia engine.  Some of these implementations
+are shared by other engines:
+
+- :ref:`wikidata engine`
+
+The list of supported languages is fetched from the article linked by
+:py:obj:`wikipedia_article_depth`.  Unlike traditional search engines, Wikipedia
+does not have one site for all languages; there is a separate Wikipedia for
+every language (:py:obj:`fetch_traits`).
 """

-from urllib.parse import quote
-from json import loads
-from lxml.html import fromstring
-from searx.utils import match_language, searx_useragent
-from searx.network import raise_for_httperror
+import urllib.parse
+import babel
+
+from lxml import html
+
+from searx import network
+from searx.locales import language_tag
+from searx.enginelib.traits import EngineTraits
+
+traits: EngineTraits

 # about
 about = {
@@ -19,32 +32,40 @@ about = {
     "results": 'JSON',
 }

-
 send_accept_language_header = True

-# search-url
-search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
-supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
-language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")}
+wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
+"""The *editing depth* of Wikipedia is one of several possible rough indicators
+of the encyclopedia's collaborative quality, showing how frequently its articles
+are updated.  The measurement of depth was introduced after some limitations of
+the classic measurement of article count were realized.
+"""
+
+# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
+rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
+"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of
+the first paragraph of the page in plain text and HTML as well as the type of
+page.  This is useful for page previews (fka. Hovercards, aka. Popups) on the web
+and link previews in the apps.
+
+.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_
+
+"""


-# set language in base_url
-def url_lang(lang):
-    lang_pre = lang.split('-')[0]
-    if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
-        return 'en'
-    return match_language(lang, supported_languages, language_aliases).split('-')[0]
-
-
-# do search-request
 def request(query, params):
+    """Assemble a request (`wikipedia rest_v1 summary API`_)."""
     if query.islower():
         query = query.title()

-    language = url_lang(params['language'])
-    params['url'] = search_url.format(title=quote(query), language=language)
+    engine_language = traits.get_language(params['searxng_locale'], 'en')
+    wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'en.wikipedia.org')
+    title = urllib.parse.quote(query)
+
+    # '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
+    # '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
+    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)

-    params['headers']['User-Agent'] = searx_useragent()
     params['raise_for_httperror'] = False
     params['soft_max_redirects'] = 2

@@ -53,13 +74,14 @@ def request(query, params):

 # get response from search-request
 def response(resp):
+
+    results = []
     if resp.status_code == 404:
         return []
-
     if resp.status_code == 400:
         try:
-            api_result = loads(resp.text)
-        except:
+            api_result = resp.json()
+        except Exception:  # pylint: disable=broad-except
             pass
         else:
             if (
@@ -68,20 +90,14 @@ def response(resp):
             ):
                 return []

-    raise_for_httperror(resp)
-
-    results = []
-    api_result = loads(resp.text)
-
-    # skip disambiguation pages
-    if api_result.get('type') != 'standard':
-        return []
+    network.raise_for_httperror(resp)

+    api_result = resp.json()
     title = api_result['title']
     wikipedia_link = api_result['content_urls']['desktop']['page']
+    results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})

-    results.append({'url': wikipedia_link, 'title': title})
-
+    if api_result.get('type') == 'standard':
         results.append(
             {
                 'infobox': title,
@@ -95,22 +111,114 @@ def response(resp):
     return results


-# get supported languages from their site
-def _fetch_supported_languages(resp):
-    supported_languages = {}
-    dom = fromstring(resp.text)
-    tables = dom.xpath('//table[contains(@class,"sortable")]')
-    for table in tables:
-        # exclude header row
-        trs = table.xpath('.//tr')[1:]
-        for tr in trs:
-            td = tr.xpath('./td')
-            code = td[3].xpath('./a')[0].text
-            name = td[1].xpath('./a')[0].text
-            english_name = td[1].xpath('./a')[0].text
-            articles = int(td[4].xpath('./a')[0].text.replace(',', ''))
-            # exclude languages with too few articles
-            if articles >= 100:
-                supported_languages[code] = {"name": name, "english_name": english_name}
-
-    return supported_languages
+# Nonstandard language codes
+#
+# These Wikipedias use language codes that do not conform to the ISO 639
+# standard (which is how wiki subdomains are chosen nowadays).
+
+lang_map = {
+    'be-tarask': 'bel',
+    'ak': 'aka',
+    'als': 'gsw',
+    'bat-smg': 'sgs',
+    'cbk-zam': 'cbk',
+    'fiu-vro': 'vro',
+    'map-bms': 'map',
+    'nrm': 'nrf',
+    'roa-rup': 'rup',
+    'nds-nl': 'nds',
+    # 'simple': invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
+    'zh-min-nan': 'nan',
+    'zh-yue': 'yue',
+    'an': 'arg',
+    'zh-classical': 'zh-Hant',  # babel maps classical to zh-Hans (for whatever reason)
+}
+
+unknown_langs = [
+    'an',  # Aragonese
+    'ba',  # Bashkir
+    'bar',  # Bavarian
+    'bcl',  # Central Bicolano
+    'be-tarask',  # Belarusian variant / Belarusian is already covered by 'be'
+    'bpy',  # Bishnupriya Manipuri is unknown by babel
+    'hif',  # Fiji Hindi
+    'ilo',  # Ilokano
+    'li',  # Limburgish
+    'sco',  # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
+    'sh',  # Serbo-Croatian
+    'simple',  # Simple English is not known by babel as a natural language distinct from English
+    'vo',  # Volapük
+    'wa',  # Walloon
+]
+
+
+def fetch_traits(engine_traits: EngineTraits):
+    """Fetch languages from Wikipedia.
+
+    The location of the Wikipedia address of a language is mapped in a
+    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
+    (``wiki_netloc``).  Here is a reduced example:
+
+    .. code:: python
+
+       traits.custom['wiki_netloc'] = {
+           "en": "en.wikipedia.org",
+           ..
+           "gsw": "als.wikipedia.org",
+           ..
+           "zh": "zh.wikipedia.org",
+           "zh-classical": "zh-classical.wikipedia.org"
+       }
+
+    """
+
+    engine_traits.custom['wiki_netloc'] = {}
+
+    # insert alias to map from a region like zh-CN to a language zh_Hans
+    engine_traits.languages['zh_Hans'] = 'zh'
+
+    resp = network.get(wikipedia_article_depth)
+    if not resp.ok:
+        print("ERROR: response from Wikipedia is not OK.")
+
+    dom = html.fromstring(resp.text)
+    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):
+
+        cols = row.xpath('./td')
+        if not cols:
+            continue
+        cols = [c.text_content().strip() for c in cols]
+
+        depth = float(cols[3].replace('-', '0').replace(',', ''))
+        articles = int(cols[4].replace(',', ''))
+
+        if articles < 10000:
+            # exclude languages with too few articles
+            continue
+
+        if int(depth) < 20:
+            # Rough indicator of a Wikipedia's quality, showing how frequently
+            # its articles are updated.
+            continue
+
+        eng_tag = cols[2]
+        wiki_url = row.xpath('./td[3]/a/@href')[0]
+        wiki_url = urllib.parse.urlparse(wiki_url)
+
+        if eng_tag in unknown_langs:
+            continue
+
+        try:
+            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
+        except babel.UnknownLocaleError:
+            print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
+            continue
+
+        conflict = engine_traits.languages.get(sxng_tag)
+        if conflict:
+            if conflict != eng_tag:
+                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
+            continue
+
+        engine_traits.languages[sxng_tag] = eng_tag
+        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc
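
The two thresholds in ``fetch_traits`` above (at least 10,000 articles and an editing depth of at least 20) can be sketched in isolation; the row values below are hypothetical:

.. code:: python

   rows = [
       # (eng_tag, depth, articles)
       ('en', 1200.0, 6600000),
       ('xx',    5.0,   20000),  # depth too low --> skipped
       ('yy',   40.0,    9000),  # too few articles --> skipped
   ]
   keep = [tag for tag, depth, articles in rows if articles >= 10000 and int(depth) >= 20]
   print(keep)  # --> ['en']
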
--- a/searx/engines/yahoo.py
+++ b/searx/engines/yahoo.py
@@ -17,8 +17,10 @@ from searx.utils import (
     eval_xpath_getindex,
     eval_xpath_list,
     extract_text,
-    match_language,
 )
+from searx.enginelib.traits import EngineTraits
+
+traits: EngineTraits

 # about
 about = {
@@ -34,8 +36,7 @@ about = {
 categories = ['general', 'web']
 paging = True
 time_range_support = True
-supported_languages_url = 'https://search.yahoo.com/preferences/languages'
-"""Supported languages are read from Yahoo preference page."""
+# send_accept_language_header = True

 time_range_dict = {
     'day': ('1d', 'd'),
@@ -43,15 +44,10 @@ time_range_dict = {
     'month': ('1m', 'm'),
 }

-language_aliases = {
-    'zh-HK': 'zh_chs',
-    'zh-CN': 'zh_chs',  # dead since 2015 / routed to hk.search.yahoo.com
-    'zh-TW': 'zh_cht',
-}
-
 lang2domain = {
     'zh_chs': 'hk.search.yahoo.com',
     'zh_cht': 'tw.search.yahoo.com',
+    'any': 'search.yahoo.com',
     'en': 'search.yahoo.com',
     'bg': 'search.yahoo.com',
     'cs': 'search.yahoo.com',
@@ -67,21 +63,23 @@ lang2domain = {
 }
 """Map language to domain"""

-
-def _get_language(params):
-
-    lang = language_aliases.get(params['language'])
-    if lang is None:
-        lang = match_language(params['language'], supported_languages, language_aliases)
-    lang = lang.split('-')[0]
-    logger.debug("params['language']: %s --> %s", params['language'], lang)
-    return lang
+locale_aliases = {
+    'zh': 'zh_Hans',
+    'zh-HK': 'zh_Hans',
+    'zh-CN': 'zh_Hans',  # dead since 2015 / routed to hk.search.yahoo.com
+    'zh-TW': 'zh_Hant',
+}


 def request(query, params):
     """build request"""
+
+    lang = locale_aliases.get(params['language'], None)
+    if not lang:
+        lang = params['language'].split('-')[0]
+    lang = traits.get_language(lang, traits.all_locale)
+
     offset = (params['pageno'] - 1) * 7 + 1
-    lang = _get_language(params)
     age, btf = time_range_dict.get(params['time_range'], ('', ''))

     args = urlencode(
@@ -154,13 +152,37 @@ def response(resp):
     return results


-# get supported languages from their site
-def _fetch_supported_languages(resp):
-    supported_languages = []
+def fetch_traits(engine_traits: EngineTraits):
+    """Fetch languages from Yahoo"""
+
+    # pylint: disable=import-outside-toplevel
+    import babel
+    from searx import network
+    from searx.locales import language_tag
+
+    engine_traits.all_locale = 'any'
+
+    resp = network.get('https://search.yahoo.com/preferences/languages')
+    if not resp.ok:
+        print("ERROR: response from Yahoo is not OK.")
+
     dom = html.fromstring(resp.text)
     offset = len('lang_')

+    eng2sxng = {'zh_chs': 'zh_Hans', 'zh_cht': 'zh_Hant'}
+
     for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'):
-        supported_languages.append(val[offset:])
+        eng_tag = val[offset:]
+
+        try:
+            sxng_tag = language_tag(babel.Locale.parse(eng2sxng.get(eng_tag, eng_tag)))
+        except babel.UnknownLocaleError:
+            print('ERROR: unknown language --> %s' % eng_tag)
+            continue

-    return supported_languages
+        conflict = engine_traits.languages.get(sxng_tag)
+        if conflict:
+            if conflict != eng_tag:
+                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
+            continue
+        engine_traits.languages[sxng_tag] = eng_tag
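
The normalization performed by ``language_tag`` above can be sketched standalone (the ``eng2sxng`` aliases mirror the mapping in the code):

.. code:: python

   import babel

   def language_tag(locale: babel.Locale) -> str:
       # language plus script, if any -- same contract as searx.locales.language_tag
       return locale.language + ('_' + locale.script if locale.script else '')

   eng2sxng = {'zh_chs': 'zh_Hans', 'zh_cht': 'zh_Hant'}
   for eng_tag in ('fr', 'zh_chs'):
       print(eng_tag, '-->', language_tag(babel.Locale.parse(eng2sxng.get(eng_tag, eng_tag))))
   # fr --> fr
   # zh_chs --> zh_Hans
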
							
								
								
									
--- a/searx/locales.py
+++ b/searx/locales.py
@@ -4,11 +4,11 @@
 """Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`.
 """

-from typing import Set
+from typing import Set, Optional, List
 import os
 import pathlib

-from babel import Locale
+import babel
 from babel.support import Translations
 import babel.languages
 import babel.core
@@ -134,7 +134,7 @@ def locales_initialize(directory=None):
     flask_babel.get_translations = get_translations

     for tag, descr in ADDITIONAL_TRANSLATIONS.items():
-        locale = Locale.parse(LOCALE_BEST_MATCH[tag], sep='-')
+        locale = babel.Locale.parse(LOCALE_BEST_MATCH[tag], sep='-')
         LOCALE_NAMES[tag] = descr
         if locale.text_direction == 'rtl':
             RTL_LOCALES.add(tag)
@@ -142,7 +142,7 @@ def locales_initialize(directory=None):
     for tag in LOCALE_BEST_MATCH:
         descr = LOCALE_NAMES.get(tag)
         if not descr:
-            locale = Locale.parse(tag, sep='-')
+            locale = babel.Locale.parse(tag, sep='-')
             LOCALE_NAMES[tag] = get_locale_descr(locale, tag.replace('-', '_'))
             if locale.text_direction == 'rtl':
                 RTL_LOCALES.add(tag)
@@ -154,12 +154,77 @@ def locales_initialize(directory=None):
         tag = dirname.replace('_', '-')
         descr = LOCALE_NAMES.get(tag)
         if not descr:
-            locale = Locale.parse(dirname)
+            locale = babel.Locale.parse(dirname)
             LOCALE_NAMES[tag] = get_locale_descr(locale, dirname)
             if locale.text_direction == 'rtl':
                 RTL_LOCALES.add(tag)

+def region_tag(locale: babel.Locale) -> str:
+    """Returns SearXNG's region tag from the locale (e.g. zh-TW, en-US)."""
+    if not locale.territory:
+        raise ValueError('%s is missing a territory' % locale)
+    return locale.language + '-' + locale.territory
+
+
+def language_tag(locale: babel.Locale) -> str:
+    """Returns SearXNG's language tag from the locale; if the locale has a
+    script, the tag includes the script name (e.g. en, zh_Hant).
+    """
+    sxng_lang = locale.language
+    if locale.script:
+        sxng_lang += '_' + locale.script
+    return sxng_lang
+
+
+def get_locale(locale_tag: str) -> Optional[babel.Locale]:
+    """Returns a :py:obj:`babel.Locale` object parsed from argument
+    ``locale_tag``."""
+    try:
+        locale = babel.Locale.parse(locale_tag, sep='-')
+        return locale
+
+    except babel.core.UnknownLocaleError:
+        return None
+
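
A quick sketch of the two tag helpers (babel resolves the likely script for a tag like ``zh_TW``, so the language tag carries the script):

.. code:: python

   import babel
   from searx.locales import language_tag, region_tag

   locale = babel.Locale.parse('zh_TW')
   print(region_tag(locale))    # --> zh-TW
   print(language_tag(locale))  # --> zh_Hant
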
+def get_offical_locales(
+    territory: str, languages=None, regional: bool = False, de_facto: bool = True
+) -> Set[babel.Locale]:
+    """Returns a set of :py:obj:`babel.Locale` with languages from
+    :py:obj:`babel.languages.get_official_languages`.
+
+    :param territory: The territory (country or region) code.
+
+    :param languages: A list of language codes the languages from
+      :py:obj:`babel.languages.get_official_languages` should be in
+      (intersection).  If this argument is ``None``, all official languages in
+      this territory are used.
+
+    :param regional: If the regional flag is set, then languages which are
+      regionally official are also returned.
+
+    :param de_facto: If the de_facto flag is set to ``False``, then languages
+      which are *de facto* official are not returned.
+
+    """
+    ret_val = set()
+    o_languages = babel.languages.get_official_languages(territory, regional=regional, de_facto=de_facto)
+
+    if languages:
+        languages = [l.lower() for l in languages]
+        o_languages = set(l for l in o_languages if l.lower() in languages)
+
+    for lang in o_languages:
+        try:
+            locale = babel.Locale.parse(lang + '_' + territory)
+            ret_val.add(locale)
+        except babel.UnknownLocaleError:
+            continue
+
+    return ret_val
+
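
A usage sketch for ``get_offical_locales`` (the exact result depends on babel's CLDR data; Belgium's official languages include Dutch and French):

.. code:: python

   from searx.locales import get_offical_locales

   # official languages of Belgium, narrowed to the two requested ones
   locales = get_offical_locales('BE', languages=['nl', 'fr'])
   print(sorted(str(l) for l in locales))
   # expected (per babel's CLDR data): ['fr_BE', 'nl_BE']
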
 def get_engine_locale(searxng_locale, engine_locales, default=None):
     """Return engine's language (aka locale) string that best fits to argument
     ``searxng_locale``.
@@ -177,6 +242,10 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
           ...
           'pl-PL'          : 'pl_PL',
           'pt-PT'          : 'pt_PT'
+          ..
+          'zh'             : 'zh'
+          'zh_Hans'        : 'zh'
+          'zh_Hant'        : 'zh-classical'
       }

     .. hint::
@@ -210,13 +279,13 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
       engine.

     """
-    # pylint: disable=too-many-branches
+    # pylint: disable=too-many-branches, too-many-return-statements

     engine_locale = engine_locales.get(searxng_locale)

     if engine_locale is not None:
-        # There was a 1:1 mapping (e.g. "fr-BE --> fr_BE" or "fr --> fr_FR"), no
-        # need to narrow language nor territory.
+        # There was a 1:1 mapping (e.g. a region "fr-BE --> fr_BE" or a language
+        # "zh --> zh"), no need to narrow language-script nor territory.
         return engine_locale

     try:
@@ -227,6 +296,12 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
         except babel.core.UnknownLocaleError:
             return default

+    searxng_lang = language_tag(locale)
+    engine_locale = engine_locales.get(searxng_lang)
+    if engine_locale is not None:
+        # There was a 1:1 mapping (e.g. "zh-HK --> zh_Hant" or "zh-CN --> zh_Hans")
+        return engine_locale
+
     # SearXNG's selected locale is not supported by the engine ..

     if locale.territory:
@@ -247,10 +322,6 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):

     if locale.language:

-        searxng_lang = locale.language
-        if locale.script:
-            searxng_lang += '_' + locale.script
-
         terr_lang_dict = {}
         for territory, langs in babel.core.get_global("territory_languages").items():
             if not langs.get(searxng_lang, {}).get('official_status'):
@@ -303,3 +374,98 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
         engine_locale = default

     return default

+def match_locale(searxng_locale: str, locale_tag_list: List[str], fallback: Optional[str] = None) -> Optional[str]:
+    """Return tag from ``locale_tag_list`` that best fits to ``searxng_locale``.
+
+    :param str searxng_locale: SearXNG's internal representation of locale (de,
+        de-DE, fr-BE, zh, zh-CN, zh-TW ..).
+
+    :param list locale_tag_list: The list of locale tags to select from
+
+    :param str fallback: fallback locale tag (if unset --> ``None``)
+
+    The rules to find a match are implemented in :py:obj:`get_engine_locale`;
+    the ``engine_locales`` argument is built up by :py:obj:`build_engine_locales`.
+
+    .. hint::
+
+       The *SearXNG locale* string and the members of ``locale_tag_list`` have to
+       be known by babel!  The :py:obj:`ADDITIONAL_TRANSLATIONS` are used in the
+       UI and are not known by babel --> they will be ignored.
+    """
+
+    # searxng_locale = 'es'
+    # locale_tag_list = ['es-AR', 'es-ES', 'es-MX']
+
+    if not searxng_locale:
+        return fallback
+
+    locale = get_locale(searxng_locale)
+    if locale is None:
+        return fallback
+
+    # normalize to a SearXNG locale that can be passed to get_engine_locale
+
+    searxng_locale = language_tag(locale)
+    if locale.territory:
+        searxng_locale = region_tag(locale)
+
+    # clean up locale_tag_list
+
+    tag_list = []
+    for tag in locale_tag_list:
+        if tag in ('all', 'auto') or tag in ADDITIONAL_TRANSLATIONS:
+            continue
+        tag_list.append(tag)
+
+    # emulate fetch_traits
+    engine_locales = build_engine_locales(tag_list)
+    return get_engine_locale(searxng_locale, engine_locales, default=fallback)
+
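
A short usage sketch; the second call mirrors the doctest in ``build_engine_locales`` below:

.. code:: python

   from searx.locales import match_locale

   # exact region hit
   print(match_locale('es-ES', ['es-AR', 'es-ES', 'es-MX']))  # --> 'es-ES'

   # 'zh-Hans' has no direct entry in the list, but its script maps it to 'zh-CN'
   print(match_locale('zh-Hans', ['en', 'en-US', 'zh', 'zh-CN', 'zh-TW']))  # --> 'zh-CN'
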
+def build_engine_locales(tag_list: List[str]):
+    """From a list of locale tags a dictionary is built that can be passed by
+    argument ``engine_locales`` to :py:obj:`get_engine_locale`.  This function
+    is mainly used by :py:obj:`match_locale` and is similar to what the
+    ``fetch_traits(..)`` functions of the engines do.
+
+    If there are territory codes in the ``tag_list`` that have a *script code*,
+    additional keys are added to the returned dictionary.
+
+    .. code:: python
+
+       >>> import locales
+       >>> engine_locales = locales.build_engine_locales(['en', 'en-US', 'zh', 'zh-CN', 'zh-TW'])
+       >>> engine_locales
+       {
+           'en': 'en', 'en-US': 'en-US',
+           'zh': 'zh', 'zh-CN': 'zh-CN', 'zh_Hans': 'zh-CN',
+           'zh-TW': 'zh-TW', 'zh_Hant': 'zh-TW'
+       }
+       >>> get_engine_locale('zh-Hans', engine_locales)
+       'zh-CN'
+
+    This function is a good example to understand the language/region model
+    of SearXNG:
+
+      SearXNG only distinguishes between **search languages** and **search
+      regions**; by adding the *script-tags*, languages with *script-tags* can
+      be assigned to the **regions** that SearXNG supports.
+
+    """
+    engine_locales = {}
+
+    for tag in tag_list:
+        locale = get_locale(tag)
+        if locale is None:
+            logger.warning("build_engine_locales: skip locale tag %s / unknown by babel", tag)
+            continue
+        if locale.territory:
+            engine_locales[region_tag(locale)] = tag
+            if locale.script:
+                engine_locales[language_tag(locale)] = tag
+        else:
+            engine_locales[language_tag(locale)] = tag
+    return engine_locales

--- a/searx/preferences.py
+++ b/searx/preferences.py
@@ -13,7 +13,7 @@ from typing import Iterable, Dict, List
 import flask

 from searx import settings, autocomplete
-from searx.engines import Engine
+from searx.enginelib import Engine
 from searx.plugins import Plugin
 from searx.locales import LOCALE_NAMES
 from searx.webutils import VALID_LANGUAGE_CODE

--- a/searx/query.py
+++ b/searx/query.py
@@ -4,7 +4,7 @@ from abc import abstractmethod, ABC
 import re

 from searx import settings
-from searx.languages import language_codes
+from searx.sxng_locales import sxng_locales
 from searx.engines import categories, engines, engine_shortcuts
 from searx.external_bang import get_bang_definition_and_autocomplete
 from searx.search import EngineRef
@@ -84,7 +84,7 @@ class LanguageParser(QueryPartParser):
         found = False
         # check if any language-code is equal with
         # declared language-codes
-        for lc in language_codes:
+        for lc in sxng_locales:
             lang_id, lang_name, country, english_name, _flag = map(str.lower, lc)

             # if correct language-code is found
@@ -125,7 +125,7 @@ class LanguageParser(QueryPartParser):
                     self.raw_text_query.autocomplete_list.append(lang)
             return

-        for lc in language_codes:
+        for lc in sxng_locales:
             if lc[0] not in settings['search']['languages']:
                 continue
             lang_id, lang_name, country, english_name, _flag = map(str.lower, lc)

--- a/searx/search/processors/__init__.py
+++ b/searx/search/processors/__init__.py
@@ -30,7 +30,10 @@ from .abstract import EngineProcessor

 logger = logger.getChild('search.processors')
 PROCESSORS: Dict[str, EngineProcessor] = {}
-"""Cache request processores, stored by *engine-name* (:py:func:`initialize`)"""
+"""Cache request processors, stored by *engine-name* (:py:func:`initialize`)
+
+:meta hide-value:
+"""


 def get_processor_class(engine_type):
| @ -138,7 +138,8 @@ class EngineProcessor(ABC): | |||||||
|         return False |         return False | ||||||
| 
 | 
 | ||||||
|     def get_params(self, search_query, engine_category): |     def get_params(self, search_query, engine_category): | ||||||
|         """Returns a set of *request params* or ``None`` if request is not supported. |         """Returns a set of (see :ref:`request params <engine request arguments>`) or | ||||||
|  |         ``None`` if request is not supported. | ||||||
| 
 | 
 | ||||||
|         Not supported conditions (``None`` is returned): |         Not supported conditions (``None`` is returned): | ||||||
| 
 | 
 | ||||||
| @ -159,11 +160,20 @@ class EngineProcessor(ABC): | |||||||
|         params['safesearch'] = search_query.safesearch |         params['safesearch'] = search_query.safesearch | ||||||
|         params['time_range'] = search_query.time_range |         params['time_range'] = search_query.time_range | ||||||
|         params['engine_data'] = search_query.engine_data.get(self.engine_name, {}) |         params['engine_data'] = search_query.engine_data.get(self.engine_name, {}) | ||||||
|  |         params['searxng_locale'] = search_query.lang | ||||||
|  | 
 | ||||||
|  |         # deprecated / vintage --> use params['searxng_locale'] | ||||||
|  |         # | ||||||
|  |         # Conditions related to engine's traits are implemented in engine.traits | ||||||
|  |         # module. Don't do 'locale' decissions here in the abstract layer of the | ||||||
|  |         # search processor, just pass the value from user's choice unchanged to | ||||||
|  |         # the engine request. | ||||||
| 
 | 
 | ||||||
|         if hasattr(self.engine, 'language') and self.engine.language: |         if hasattr(self.engine, 'language') and self.engine.language: | ||||||
|             params['language'] = self.engine.language |             params['language'] = self.engine.language | ||||||
|         else: |         else: | ||||||
|             params['language'] = search_query.lang |             params['language'] = search_query.lang | ||||||
|  | 
 | ||||||
|         return params |         return params | ||||||
| 
 | 
 | ||||||
|     @abstractmethod |     @abstractmethod | ||||||
|  | |||||||
| @ -51,6 +51,9 @@ class OnlineProcessor(EngineProcessor): | |||||||
|         super().initialize() |         super().initialize() | ||||||
| 
 | 
 | ||||||
|     def get_params(self, search_query, engine_category): |     def get_params(self, search_query, engine_category): | ||||||
|  |         """Returns a set of :ref:`request params <engine request online>` or ``None`` | ||||||
|  |         if request is not supported. | ||||||
|  |         """ | ||||||
|         params = super().get_params(search_query, engine_category) |         params = super().get_params(search_query, engine_category) | ||||||
|         if params is None: |         if params is None: | ||||||
|             return None |             return None | ||||||
| @ -184,11 +187,6 @@ class OnlineProcessor(EngineProcessor): | |||||||
|             self.handle_exception(result_container, e, suspend=True) |             self.handle_exception(result_container, e, suspend=True) | ||||||
|             self.logger.exception('CAPTCHA') |             self.logger.exception('CAPTCHA') | ||||||
|         except SearxEngineTooManyRequestsException as e: |         except SearxEngineTooManyRequestsException as e: | ||||||
|             if "google" in self.engine_name: |  | ||||||
|                 self.logger.warn( |  | ||||||
|                     "Set to 'true' the use_mobile_ui parameter in the 'engines:'" |  | ||||||
|                     " section of your settings.yml file if google is blocked for you." |  | ||||||
|                 ) |  | ||||||
|             self.handle_exception(result_container, e, suspend=True) |             self.handle_exception(result_container, e, suspend=True) | ||||||
|             self.logger.exception('Too many requests') |             self.logger.exception('Too many requests') | ||||||
|         except SearxEngineAccessDeniedException as e: |         except SearxEngineAccessDeniedException as e: | ||||||
| @ -223,7 +221,7 @@ class OnlineProcessor(EngineProcessor): | |||||||
|                 'test': ['unique_results'], |                 'test': ['unique_results'], | ||||||
|             } |             } | ||||||
| 
 | 
 | ||||||
|         if getattr(self.engine, 'supported_languages', []): |         if getattr(self.engine, 'traits', False): | ||||||
|             tests['lang_fr'] = { |             tests['lang_fr'] = { | ||||||
|                 'matrix': {'query': 'paris', 'lang': 'fr'}, |                 'matrix': {'query': 'paris', 'lang': 'fr'}, | ||||||
|                 'result_container': ['not_empty', ('has_language', 'fr')], |                 'result_container': ['not_empty', ('has_language', 'fr')], | ||||||
|  | |||||||
| @ -38,8 +38,8 @@ class OnlineCurrencyProcessor(OnlineProcessor): | |||||||
|     engine_type = 'online_currency' |     engine_type = 'online_currency' | ||||||
| 
 | 
 | ||||||
|     def get_params(self, search_query, engine_category): |     def get_params(self, search_query, engine_category): | ||||||
|         """Returns a set of *request params* or ``None`` if search query does not match |         """Returns a set of :ref:`request params <engine request online_currency>` | ||||||
|         to :py:obj:`parser_re`.""" |         or ``None`` if the search query does not match :py:obj:`parser_re`.""" | ||||||
| 
 | 
 | ||||||
|         params = super().get_params(search_query, engine_category) |         params = super().get_params(search_query, engine_category) | ||||||
|         if params is None: |         if params is None: | ||||||
|  | |||||||
| @ -18,8 +18,9 @@ class OnlineDictionaryProcessor(OnlineProcessor): | |||||||
|     engine_type = 'online_dictionary' |     engine_type = 'online_dictionary' | ||||||
| 
 | 
 | ||||||
|     def get_params(self, search_query, engine_category): |     def get_params(self, search_query, engine_category): | ||||||
|         """Returns a set of *request params* or ``None`` if search query does not match |         """Returns a set of :ref:`request params <engine request online_dictionary>` or | ||||||
|         to :py:obj:`parser_re`.""" |         ``None`` if the search query does not match :py:obj:`parser_re`. | ||||||
|  |         """ | ||||||
|         params = super().get_params(search_query, engine_category) |         params = super().get_params(search_query, engine_category) | ||||||
|         if params is None: |         if params is None: | ||||||
|             return None |             return None | ||||||
|  | |||||||
| @ -20,9 +20,10 @@ class OnlineUrlSearchProcessor(OnlineProcessor): | |||||||
|     engine_type = 'online_url_search' |     engine_type = 'online_url_search' | ||||||
| 
 | 
 | ||||||
|     def get_params(self, search_query, engine_category): |     def get_params(self, search_query, engine_category): | ||||||
|         """Returns a set of *request params* or ``None`` if search query does not match |         """Returns a set of :ref:`request params <engine request online>` or ``None`` if | ||||||
|         to at least one of :py:obj:`re_search_urls`. |         the search query does not match :py:obj:`re_search_urls`. | ||||||
|         """ |         """ | ||||||
|  | 
 | ||||||
|         params = super().get_params(search_query, engine_category) |         params = super().get_params(search_query, engine_category) | ||||||
|         if params is None: |         if params is None: | ||||||
|             return None |             return None | ||||||
|  | |||||||
| @ -731,22 +731,9 @@ engines: | |||||||
|   - name: google |   - name: google | ||||||
|     engine: google |     engine: google | ||||||
|     shortcut: go |     shortcut: go | ||||||
|     # see https://docs.searxng.org/src/searx.engines.google.html#module-searx.engines.google |  | ||||||
|     use_mobile_ui: false |  | ||||||
|     # additional_tests: |     # additional_tests: | ||||||
|     #   android: *test_android |     #   android: *test_android | ||||||
| 
 | 
 | ||||||
|   # - name: google italian |  | ||||||
|   #   engine: google |  | ||||||
|   #   shortcut: goit |  | ||||||
|   #   use_mobile_ui: false |  | ||||||
|   #   language: it |  | ||||||
| 
 |  | ||||||
|   # - name: google mobile ui |  | ||||||
|   #   engine: google |  | ||||||
|   #   shortcut: gomui |  | ||||||
|   #   use_mobile_ui: true |  | ||||||
| 
 |  | ||||||
|   - name: google images |   - name: google images | ||||||
|     engine: google_images |     engine: google_images | ||||||
|     shortcut: goi |     shortcut: goi | ||||||
| @ -1758,9 +1745,8 @@ engines: | |||||||
|     engine: peertube |     engine: peertube | ||||||
|     shortcut: ptb |     shortcut: ptb | ||||||
|     paging: true |     paging: true | ||||||
|     # https://instances.joinpeertube.org/instances |     # for alternatives see: https://instances.joinpeertube.org/instances | ||||||
|     base_url: https://peertube.biz/ |     # base_url: https://tube.4aem.com | ||||||
|     # base_url: https://tube.tardis.world/ |  | ||||||
|     categories: videos |     categories: videos | ||||||
|     disabled: true |     disabled: true | ||||||
|     timeout: 6.0 |     timeout: 6.0 | ||||||
|  | |||||||
| @ -12,13 +12,13 @@ import logging | |||||||
| from base64 import b64decode | from base64 import b64decode | ||||||
| from os.path import dirname, abspath | from os.path import dirname, abspath | ||||||
| 
 | 
 | ||||||
| from searx.languages import language_codes as languages | from .sxng_locales import sxng_locales | ||||||
| 
 | 
 | ||||||
| searx_dir = abspath(dirname(__file__)) | searx_dir = abspath(dirname(__file__)) | ||||||
| 
 | 
 | ||||||
| logger = logging.getLogger('searx') | logger = logging.getLogger('searx') | ||||||
| OUTPUT_FORMATS = ['html', 'csv', 'json', 'rss'] | OUTPUT_FORMATS = ['html', 'csv', 'json', 'rss'] | ||||||
| LANGUAGE_CODES = ['all', 'auto'] + list(l[0] for l in languages) | SXNG_LOCALE_TAGS = ['all', 'auto'] + list(l[0] for l in sxng_locales) | ||||||
| SIMPLE_STYLE = ('auto', 'light', 'dark') | SIMPLE_STYLE = ('auto', 'light', 'dark') | ||||||
| CATEGORIES_AS_TABS = { | CATEGORIES_AS_TABS = { | ||||||
|     'general': {}, |     'general': {}, | ||||||
| @ -156,8 +156,8 @@ SCHEMA = { | |||||||
|         'safe_search': SettingsValue((0, 1, 2), 0), |         'safe_search': SettingsValue((0, 1, 2), 0), | ||||||
|         'autocomplete': SettingsValue(str, ''), |         'autocomplete': SettingsValue(str, ''), | ||||||
|         'autocomplete_min': SettingsValue(int, 4), |         'autocomplete_min': SettingsValue(int, 4), | ||||||
|         'default_lang': SettingsValue(tuple(LANGUAGE_CODES + ['']), ''), |         'default_lang': SettingsValue(tuple(SXNG_LOCALE_TAGS + ['']), ''), | ||||||
|         'languages': SettingSublistValue(LANGUAGE_CODES, LANGUAGE_CODES), |         'languages': SettingSublistValue(SXNG_LOCALE_TAGS, SXNG_LOCALE_TAGS), | ||||||
|         'ban_time_on_fail': SettingsValue(numbers.Real, 5), |         'ban_time_on_fail': SettingsValue(numbers.Real, 5), | ||||||
|         'max_ban_time_on_fail': SettingsValue(numbers.Real, 120), |         'max_ban_time_on_fail': SettingsValue(numbers.Real, 120), | ||||||
|         'suspended_times': { |         'suspended_times': { | ||||||
|  | |||||||
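A quick sanity check of the renamed constant, runnable from a SearXNG checkout (a sketch; 'all' and 'auto' are meta values, not locales):

.. code:: python

   from searx.sxng_locales import sxng_locales

   SXNG_LOCALE_TAGS = ['all', 'auto'] + [l[0] for l in sxng_locales]
   assert 'auto' in SXNG_LOCALE_TAGS
   assert 'de-DE' in SXNG_LOCALE_TAGS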
| @ -1,73 +1,120 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
| # list of language codes | '''List of SearXNG's locale codes. | ||||||
| # this file is generated automatically by utils/fetch_languages.py | 
 | ||||||
| language_codes = ( | This file is generated automatically by:: | ||||||
|     ('af-ZA', 'Afrikaans', 'Suid-Afrika', 'Afrikaans', '\U0001f1ff\U0001f1e6'), | 
 | ||||||
|     ('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'), |    ./manage pyenv.cmd searxng_extra/update/update_engine_traits.py | ||||||
|     ('be-BY', 'Беларуская', 'Беларусь', 'Belarusian', '\U0001f1e7\U0001f1fe'), | ''' | ||||||
|  | 
 | ||||||
|  | sxng_locales = ( | ||||||
|  |     ('ar', 'العربية', '', 'Arabic', '\U0001f310'), | ||||||
|  |     ('bg', 'Български', '', 'Bulgarian', '\U0001f310'), | ||||||
|     ('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'), |     ('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'), | ||||||
|  |     ('ca', 'Català', '', 'Catalan', '\U0001f310'), | ||||||
|     ('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'), |     ('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'), | ||||||
|  |     ('cs', 'Čeština', '', 'Czech', '\U0001f310'), | ||||||
|     ('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'), |     ('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'), | ||||||
|  |     ('da', 'Dansk', '', 'Danish', '\U0001f310'), | ||||||
|     ('da-DK', 'Dansk', 'Danmark', 'Danish', '\U0001f1e9\U0001f1f0'), |     ('da-DK', 'Dansk', 'Danmark', 'Danish', '\U0001f1e9\U0001f1f0'), | ||||||
|     ('de', 'Deutsch', '', 'German', '\U0001f310'), |     ('de', 'Deutsch', '', 'German', '\U0001f310'), | ||||||
|     ('de-AT', 'Deutsch', 'Österreich', 'German', '\U0001f1e6\U0001f1f9'), |     ('de-AT', 'Deutsch', 'Österreich', 'German', '\U0001f1e6\U0001f1f9'), | ||||||
|     ('de-CH', 'Deutsch', 'Schweiz', 'German', '\U0001f1e8\U0001f1ed'), |     ('de-CH', 'Deutsch', 'Schweiz', 'German', '\U0001f1e8\U0001f1ed'), | ||||||
|     ('de-DE', 'Deutsch', 'Deutschland', 'German', '\U0001f1e9\U0001f1ea'), |     ('de-DE', 'Deutsch', 'Deutschland', 'German', '\U0001f1e9\U0001f1ea'), | ||||||
|  |     ('el', 'Ελληνικά', '', 'Greek', '\U0001f310'), | ||||||
|     ('el-GR', 'Ελληνικά', 'Ελλάδα', 'Greek', '\U0001f1ec\U0001f1f7'), |     ('el-GR', 'Ελληνικά', 'Ελλάδα', 'Greek', '\U0001f1ec\U0001f1f7'), | ||||||
|     ('en', 'English', '', 'English', '\U0001f310'), |     ('en', 'English', '', 'English', '\U0001f310'), | ||||||
|     ('en-AU', 'English', 'Australia', 'English', '\U0001f1e6\U0001f1fa'), |     ('en-AU', 'English', 'Australia', 'English', '\U0001f1e6\U0001f1fa'), | ||||||
|     ('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'), |     ('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'), | ||||||
|     ('en-GB', 'English', 'United Kingdom', 'English', '\U0001f1ec\U0001f1e7'), |     ('en-GB', 'English', 'United Kingdom', 'English', '\U0001f1ec\U0001f1e7'), | ||||||
|     ('en-IE', 'English', 'Ireland', 'English', '\U0001f1ee\U0001f1ea'), |     ('en-IE', 'English', 'Ireland', 'English', '\U0001f1ee\U0001f1ea'), | ||||||
|  |     ('en-IN', 'English', 'India', 'English', '\U0001f1ee\U0001f1f3'), | ||||||
|     ('en-MY', 'English', 'Malaysia', 'English', '\U0001f1f2\U0001f1fe'), |     ('en-MY', 'English', 'Malaysia', 'English', '\U0001f1f2\U0001f1fe'), | ||||||
|     ('en-NZ', 'English', 'New Zealand', 'English', '\U0001f1f3\U0001f1ff'), |     ('en-NZ', 'English', 'New Zealand', 'English', '\U0001f1f3\U0001f1ff'), | ||||||
|  |     ('en-PH', 'English', 'Philippines', 'English', '\U0001f1f5\U0001f1ed'), | ||||||
|     ('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'), |     ('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'), | ||||||
|  |     ('en-ZA', 'English', 'South Africa', 'English', '\U0001f1ff\U0001f1e6'), | ||||||
|     ('es', 'Español', '', 'Spanish', '\U0001f310'), |     ('es', 'Español', '', 'Spanish', '\U0001f310'), | ||||||
|     ('es-AR', 'Español', 'Argentina', 'Spanish', '\U0001f1e6\U0001f1f7'), |     ('es-AR', 'Español', 'Argentina', 'Spanish', '\U0001f1e6\U0001f1f7'), | ||||||
|     ('es-CL', 'Español', 'Chile', 'Spanish', '\U0001f1e8\U0001f1f1'), |     ('es-CL', 'Español', 'Chile', 'Spanish', '\U0001f1e8\U0001f1f1'), | ||||||
|     ('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'), |     ('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'), | ||||||
|     ('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'), |     ('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'), | ||||||
|  |     ('es-US', 'Español', 'Estados Unidos', 'Spanish', '\U0001f1fa\U0001f1f8'), | ||||||
|  |     ('et', 'Eesti', '', 'Estonian', '\U0001f310'), | ||||||
|     ('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'), |     ('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'), | ||||||
|     ('fa-IR', 'فارسی', 'ایران', 'Persian', '\U0001f1ee\U0001f1f7'), |     ('fi', 'Suomi', '', 'Finnish', '\U0001f310'), | ||||||
|     ('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'), |     ('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'), | ||||||
|     ('fil-PH', 'Filipino', 'Pilipinas', 'Filipino', '\U0001f1f5\U0001f1ed'), |  | ||||||
|     ('fr', 'Français', '', 'French', '\U0001f310'), |     ('fr', 'Français', '', 'French', '\U0001f310'), | ||||||
|     ('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'), |     ('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'), | ||||||
|     ('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'), |     ('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'), | ||||||
|     ('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'), |     ('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'), | ||||||
|     ('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'), |     ('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'), | ||||||
|     ('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'), |     ('he', 'עברית', '', 'Hebrew', '\U0001f1ee\U0001f1f1'), | ||||||
|     ('hi-IN', 'हिन्दी', 'भारत', 'Hindi', '\U0001f1ee\U0001f1f3'), |     ('hi', 'हिन्दी', '', 'Hindi', '\U0001f310'), | ||||||
|     ('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'), |     ('hr', 'Hrvatski', '', 'Croatian', '\U0001f310'), | ||||||
|  |     ('hu', 'Magyar', '', 'Hungarian', '\U0001f310'), | ||||||
|     ('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'), |     ('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'), | ||||||
|  |     ('id', 'Indonesia', '', 'Indonesian', '\U0001f310'), | ||||||
|     ('id-ID', 'Indonesia', 'Indonesia', 'Indonesian', '\U0001f1ee\U0001f1e9'), |     ('id-ID', 'Indonesia', 'Indonesia', 'Indonesian', '\U0001f1ee\U0001f1e9'), | ||||||
|     ('is-IS', 'Íslenska', 'Ísland', 'Icelandic', '\U0001f1ee\U0001f1f8'), |     ('is', 'Íslenska', '', 'Icelandic', '\U0001f310'), | ||||||
|  |     ('it', 'Italiano', '', 'Italian', '\U0001f310'), | ||||||
|  |     ('it-CH', 'Italiano', 'Svizzera', 'Italian', '\U0001f1e8\U0001f1ed'), | ||||||
|     ('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'), |     ('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'), | ||||||
|  |     ('ja', '日本語', '', 'Japanese', '\U0001f310'), | ||||||
|     ('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'), |     ('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'), | ||||||
|  |     ('ko', '한국어', '', 'Korean', '\U0001f310'), | ||||||
|     ('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'), |     ('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'), | ||||||
|     ('lt-LT', 'Lietuvių', 'Lietuva', 'Lithuanian', '\U0001f1f1\U0001f1f9'), |     ('lt', 'Lietuvių', '', 'Lithuanian', '\U0001f310'), | ||||||
|     ('lv-LV', 'Latviešu', 'Latvija', 'Latvian', '\U0001f1f1\U0001f1fb'), |     ('lv', 'Latviešu', '', 'Latvian', '\U0001f310'), | ||||||
|  |     ('nb', 'Norsk Bokmål', '', 'Norwegian Bokmål', '\U0001f310'), | ||||||
|  |     ('nb-NO', 'Norsk Bokmål', 'Norge', 'Norwegian Bokmål', '\U0001f1f3\U0001f1f4'), | ||||||
|     ('nl', 'Nederlands', '', 'Dutch', '\U0001f310'), |     ('nl', 'Nederlands', '', 'Dutch', '\U0001f310'), | ||||||
|     ('nl-BE', 'Nederlands', 'België', 'Dutch', '\U0001f1e7\U0001f1ea'), |     ('nl-BE', 'Nederlands', 'België', 'Dutch', '\U0001f1e7\U0001f1ea'), | ||||||
|     ('nl-NL', 'Nederlands', 'Nederland', 'Dutch', '\U0001f1f3\U0001f1f1'), |     ('nl-NL', 'Nederlands', 'Nederland', 'Dutch', '\U0001f1f3\U0001f1f1'), | ||||||
|     ('no-NO', 'Norsk', '', 'Norwegian (Bokmål)', '\U0001f1f3\U0001f1f4'), |     ('pl', 'Polski', '', 'Polish', '\U0001f310'), | ||||||
|     ('pl-PL', 'Polski', 'Polska', 'Polish', '\U0001f1f5\U0001f1f1'), |     ('pl-PL', 'Polski', 'Polska', 'Polish', '\U0001f1f5\U0001f1f1'), | ||||||
|     ('pt', 'Português', '', 'Portuguese', '\U0001f310'), |     ('pt', 'Português', '', 'Portuguese', '\U0001f310'), | ||||||
|     ('pt-BR', 'Português', 'Brasil', 'Portuguese', '\U0001f1e7\U0001f1f7'), |     ('pt-BR', 'Português', 'Brasil', 'Portuguese', '\U0001f1e7\U0001f1f7'), | ||||||
|     ('pt-PT', 'Português', 'Portugal', 'Portuguese', '\U0001f1f5\U0001f1f9'), |     ('pt-PT', 'Português', 'Portugal', 'Portuguese', '\U0001f1f5\U0001f1f9'), | ||||||
|  |     ('ro', 'Română', '', 'Romanian', '\U0001f310'), | ||||||
|     ('ro-RO', 'Română', 'România', 'Romanian', '\U0001f1f7\U0001f1f4'), |     ('ro-RO', 'Română', 'România', 'Romanian', '\U0001f1f7\U0001f1f4'), | ||||||
|  |     ('ru', 'Русский', '', 'Russian', '\U0001f310'), | ||||||
|     ('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'), |     ('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'), | ||||||
|     ('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'), |     ('sk', 'Slovenčina', '', 'Slovak', '\U0001f310'), | ||||||
|     ('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'), |     ('sl', 'Slovenščina', '', 'Slovenian', '\U0001f310'), | ||||||
|     ('sr-RS', 'Српски', 'Србија', 'Serbian', '\U0001f1f7\U0001f1f8'), |     ('sr', 'Српски', '', 'Serbian', '\U0001f310'), | ||||||
|  |     ('sv', 'Svenska', '', 'Swedish', '\U0001f310'), | ||||||
|     ('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'), |     ('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'), | ||||||
|     ('sw-TZ', 'Kiswahili', 'Tanzania', 'Swahili', '\U0001f1f9\U0001f1ff'), |     ('th', 'ไทย', '', 'Thai', '\U0001f310'), | ||||||
|     ('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'), |     ('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'), | ||||||
|  |     ('tr', 'Türkçe', '', 'Turkish', '\U0001f310'), | ||||||
|     ('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'), |     ('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'), | ||||||
|     ('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'), |     ('uk', 'Українська', '', 'Ukrainian', '\U0001f310'), | ||||||
|     ('vi-VN', 'Tiếng Việt', 'Việt Nam', 'Vietnamese', '\U0001f1fb\U0001f1f3'), |     ('vi', 'Tiếng Việt', '', 'Vietnamese', '\U0001f310'), | ||||||
|     ('zh', '中文', '', 'Chinese', '\U0001f310'), |     ('zh', '中文', '', 'Chinese', '\U0001f310'), | ||||||
|     ('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'), |     ('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'), | ||||||
|     ('zh-HK', '中文', '中國香港', 'Chinese', '\U0001f1ed\U0001f1f0'), |     ('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'), | ||||||
|     ('zh-TW', '中文', '台灣', 'Chinese', '\U0001f1f9\U0001f1fc'), |     ('zh-TW', '中文', '台灣', 'Chinese', '\U0001f1f9\U0001f1fc'), | ||||||
| ) | ) | ||||||
|  | ''' | ||||||
|  | A list of five-element tuples: | ||||||
|  | 
 | ||||||
|  | 0. SearXNG's internal locale tag (a language or region tag) | ||||||
|  | 1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`) | ||||||
|  | 2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`). | ||||||
|  |    Empty string for language tags. | ||||||
|  | 3. English language name (from :py:obj:`babel.core.Locale.english_name`) | ||||||
|  | 4. Unicode flag (emoji) that corresponds to SearXNG's internal region tag. | ||||||
|  |    Languages are represented by a globe (🌐). | ||||||
|  | 
 | ||||||
|  | .. code:: python | ||||||
|  | 
 | ||||||
|  |    ('en',    'English', '',              'English', '🌐'), | ||||||
|  |    ('en-CA', 'English', 'Canada',        'English', '🇨🇦'), | ||||||
|  |    ('en-US', 'English', 'United States', 'English', '🇺🇸'), | ||||||
|  |    .. | ||||||
|  |    ('fr',    'Français', '',             'French',  '🌐'), | ||||||
|  |    ('fr-BE', 'Français', 'Belgique',     'French',  '🇧🇪'), | ||||||
|  |    ('fr-CA', 'Français', 'Canada',       'French',  '🇨🇦'), | ||||||
|  | 
 | ||||||
|  | :meta hide-value: | ||||||
|  | ''' | ||||||
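Because the tuples are positional, an index keyed by tag keeps call sites readable; this helper is illustrative and not part of the commit:

.. code:: python

   from searx.sxng_locales import sxng_locales

   by_tag = {
       tag: (lang_name, region_name, english_name, flag)
       for tag, lang_name, region_name, english_name, flag in sxng_locales
   }

   by_tag['fr-CA']   # ('Français', 'Canada', 'French', '🇨🇦')
   by_tag['fr'][1]   # '' -- language tags carry no region name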
| @ -1,12 +1,12 @@ | |||||||
| <select class="language" id="language" name="language" aria-label="{{ _('Search language') }}">{{- '' -}} | <select class="language" id="language" name="language" aria-label="{{ _('Search language') }}">{{- '' -}} | ||||||
| 	<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option> | 	<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }} [all]</option> | ||||||
| 	<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}> | 	<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}> | ||||||
| 		{{- _('Auto-detect') -}} | 		{{- _('Auto-detect') -}} | ||||||
| 		{%- if current_language == 'auto' %} ({{ search_language }}){%- endif -%} | 		{%- if current_language == 'auto' %} ({{ search_language }}){%- endif -%} | ||||||
| 	</option> | 	</option> | ||||||
| 	{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%} | 	{%- for sxng_tag,lang_name,country_name,english_name,flag in sxng_locales | sort(attribute=1) -%} | ||||||
| 	<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}> | 	<option value="{{ sxng_tag }}" {% if sxng_tag == current_language %}selected="selected"{% endif %}> | ||||||
| 		{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %} | 		{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %} - {{ country_name }} {% endif %} [{{sxng_tag}}] | ||||||
| 	</option> | 	</option> | ||||||
| 	{%- endfor -%} | 	{%- endfor -%} | ||||||
| </select> | </select> | ||||||
|  | |||||||
| @ -115,10 +115,10 @@ | |||||||
|       <legend id="pref_language">{{ _('Search language') }}</legend> |       <legend id="pref_language">{{ _('Search language') }}</legend> | ||||||
|       <p class="value">{{- '' -}} |       <p class="value">{{- '' -}} | ||||||
|         <select name='language' aria-labelledby="pref_language" aria-describedby="desc_language">{{- '' -}} |         <select name='language' aria-labelledby="pref_language" aria-describedby="desc_language">{{- '' -}} | ||||||
|           <option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option> |           <option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }} [all]</option> | ||||||
|           <option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }}</option> |           <option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }} [auto]</option> | ||||||
|           {%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%} |           {%- for sxng_tag,lang_name,country_name,english_name,flag in sxng_locales | sort(attribute=1) -%} | ||||||
|           <option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}</option> |           <option value="{{ sxng_tag }}" {% if sxng_tag == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %} - {{ country_name }} {% endif %} [{{sxng_tag}}]</option> | ||||||
|           {%- endfor -%} |           {%- endfor -%} | ||||||
|         </select>{{- '' -}} |         </select>{{- '' -}} | ||||||
|       </p> |       </p> | ||||||
|  | |||||||
| @ -18,13 +18,11 @@ from urllib.parse import urljoin, urlparse | |||||||
| 
 | 
 | ||||||
| from lxml import html | from lxml import html | ||||||
| from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult | from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult | ||||||
| from babel.core import get_global |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| from searx import settings | from searx import settings | ||||||
| from searx.data import USER_AGENTS, data_dir | from searx.data import USER_AGENTS, data_dir | ||||||
| from searx.version import VERSION_TAG | from searx.version import VERSION_TAG | ||||||
| from searx.languages import language_codes | from searx.sxng_locales import sxng_locales | ||||||
| from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException | from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException | ||||||
| from searx import logger | from searx import logger | ||||||
| 
 | 
 | ||||||
| @ -53,8 +51,8 @@ _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {} | |||||||
| _FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None | _FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None | ||||||
| """fasttext model to predict laguage of a search term""" | """fasttext model to predict laguage of a search term""" | ||||||
| 
 | 
 | ||||||
| SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in language_codes]) | SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales]) | ||||||
| """Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" | """Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`).""" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class _NotSetClass:  # pylint: disable=too-few-public-methods | class _NotSetClass:  # pylint: disable=too-few-public-methods | ||||||
| @ -355,102 +353,16 @@ def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]: | |||||||
|     is_abbr = len(lang) == 2 |     is_abbr = len(lang) == 2 | ||||||
|     lang = lang.lower() |     lang = lang.lower() | ||||||
|     if is_abbr: |     if is_abbr: | ||||||
|         for l in language_codes: |         for l in sxng_locales: | ||||||
|             if l[0][:2] == lang: |             if l[0][:2] == lang: | ||||||
|                 return (True, l[0][:2], l[3].lower()) |                 return (True, l[0][:2], l[3].lower()) | ||||||
|         return None |         return None | ||||||
|     for l in language_codes: |     for l in sxng_locales: | ||||||
|         if l[1].lower() == lang or l[3].lower() == lang: |         if l[1].lower() == lang or l[3].lower() == lang: | ||||||
|             return (True, l[0][:2], l[3].lower()) |             return (True, l[0][:2], l[3].lower()) | ||||||
|     return None |     return None | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _get_lang_to_lc_dict(lang_list: List[str]) -> Dict[str, str]: |  | ||||||
|     key = str(lang_list) |  | ||||||
|     value = _LANG_TO_LC_CACHE.get(key, None) |  | ||||||
|     if value is None: |  | ||||||
|         value = {} |  | ||||||
|         for lang in lang_list: |  | ||||||
|             value.setdefault(lang.split('-')[0], lang) |  | ||||||
|         _LANG_TO_LC_CACHE[key] = value |  | ||||||
|     return value |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # babel's get_global contains all sorts of miscellaneous locale and territory related data |  | ||||||
| # see get_global in: https://github.com/python-babel/babel/blob/master/babel/core.py |  | ||||||
| def _get_from_babel(lang_code: str, key): |  | ||||||
|     match = get_global(key).get(lang_code.replace('-', '_')) |  | ||||||
|     # for some keys, such as territory_aliases, match may be a list |  | ||||||
|     if isinstance(match, str): |  | ||||||
|         return match.replace('_', '-') |  | ||||||
|     return match |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def _match_language(lang_code: str, lang_list=[], custom_aliases={}) -> Optional[str]:  # pylint: disable=W0102 |  | ||||||
|     """auxiliary function to match lang_code in lang_list""" |  | ||||||
|     # replace language code with a custom alias if necessary |  | ||||||
|     if lang_code in custom_aliases: |  | ||||||
|         lang_code = custom_aliases[lang_code] |  | ||||||
| 
 |  | ||||||
|     if lang_code in lang_list: |  | ||||||
|         return lang_code |  | ||||||
| 
 |  | ||||||
|     # try to get the most likely country for this language |  | ||||||
|     subtags = _get_from_babel(lang_code, 'likely_subtags') |  | ||||||
|     if subtags: |  | ||||||
|         if subtags in lang_list: |  | ||||||
|             return subtags |  | ||||||
|         subtag_parts = subtags.split('-') |  | ||||||
|         new_code = subtag_parts[0] + '-' + subtag_parts[-1] |  | ||||||
|         if new_code in custom_aliases: |  | ||||||
|             new_code = custom_aliases[new_code] |  | ||||||
|         if new_code in lang_list: |  | ||||||
|             return new_code |  | ||||||
| 
 |  | ||||||
|     # try to get the any supported country for this language |  | ||||||
|     return _get_lang_to_lc_dict(lang_list).get(lang_code) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def match_language(  # pylint: disable=W0102 |  | ||||||
|     locale_code, lang_list=[], custom_aliases={}, fallback: Optional[str] = 'en-US' |  | ||||||
| ) -> Optional[str]: |  | ||||||
|     """get the language code from lang_list that best matches locale_code""" |  | ||||||
|     # try to get language from given locale_code |  | ||||||
|     language = _match_language(locale_code, lang_list, custom_aliases) |  | ||||||
|     if language: |  | ||||||
|         return language |  | ||||||
| 
 |  | ||||||
|     locale_parts = locale_code.split('-') |  | ||||||
|     lang_code = locale_parts[0] |  | ||||||
| 
 |  | ||||||
|     # if locale_code has script, try matching without it |  | ||||||
|     if len(locale_parts) > 2: |  | ||||||
|         language = _match_language(lang_code + '-' + locale_parts[-1], lang_list, custom_aliases) |  | ||||||
|         if language: |  | ||||||
|             return language |  | ||||||
| 
 |  | ||||||
|     # try to get language using an equivalent country code |  | ||||||
|     if len(locale_parts) > 1: |  | ||||||
|         country_alias = _get_from_babel(locale_parts[-1], 'territory_aliases') |  | ||||||
|         if country_alias: |  | ||||||
|             language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases) |  | ||||||
|             if language: |  | ||||||
|                 return language |  | ||||||
| 
 |  | ||||||
|     # try to get language using an equivalent language code |  | ||||||
|     alias = _get_from_babel(lang_code, 'language_aliases') |  | ||||||
|     if alias: |  | ||||||
|         language = _match_language(alias, lang_list, custom_aliases) |  | ||||||
|         if language: |  | ||||||
|             return language |  | ||||||
| 
 |  | ||||||
|     if lang_code != locale_code: |  | ||||||
|         # try to get language from given language without giving the country |  | ||||||
|         language = _match_language(lang_code, lang_list, custom_aliases) |  | ||||||
| 
 |  | ||||||
|     return language or fallback |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def load_module(filename: str, module_dir: str) -> types.ModuleType: | def load_module(filename: str, module_dir: str) -> types.ModuleType: | ||||||
|     modname = splitext(filename)[0] |     modname = splitext(filename)[0] | ||||||
|     modpath = join(module_dir, filename) |     modpath = join(module_dir, filename) | ||||||
|  | |||||||
| @ -89,7 +89,6 @@ from searx.utils import ( | |||||||
|     html_to_text, |     html_to_text, | ||||||
|     gen_useragent, |     gen_useragent, | ||||||
|     dict_subset, |     dict_subset, | ||||||
|     match_language, |  | ||||||
| ) | ) | ||||||
| from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH | from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH | ||||||
| from searx.query import RawTextQuery | from searx.query import RawTextQuery | ||||||
| @ -117,12 +116,13 @@ from searx.locales import ( | |||||||
|     RTL_LOCALES, |     RTL_LOCALES, | ||||||
|     localeselector, |     localeselector, | ||||||
|     locales_initialize, |     locales_initialize, | ||||||
|  |     match_locale, | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| # renaming names from searx imports ... | # renaming names from searx imports ... | ||||||
| from searx.autocomplete import search_autocomplete, backends as autocomplete_backends | from searx.autocomplete import search_autocomplete, backends as autocomplete_backends | ||||||
| from searx.languages import language_codes as languages |  | ||||||
| from searx.redisdb import initialize as redis_initialize | from searx.redisdb import initialize as redis_initialize | ||||||
|  | from searx.sxng_locales import sxng_locales | ||||||
| from searx.search import SearchWithPlugins, initialize as search_initialize | from searx.search import SearchWithPlugins, initialize as search_initialize | ||||||
| from searx.network import stream as http_stream, set_context_network_name | from searx.network import stream as http_stream, set_context_network_name | ||||||
| from searx.search.checker import get_result as checker_get_result | from searx.search.checker import get_result as checker_get_result | ||||||
| @ -227,7 +227,7 @@ def _get_browser_language(req, lang_list): | |||||||
|         if '-' in lang: |         if '-' in lang: | ||||||
|             lang_parts = lang.split('-') |             lang_parts = lang.split('-') | ||||||
|             lang = "{}-{}".format(lang_parts[0], lang_parts[-1].upper()) |             lang = "{}-{}".format(lang_parts[0], lang_parts[-1].upper()) | ||||||
|         locale = match_language(lang, lang_list, fallback=None) |         locale = match_locale(lang, lang_list, fallback=None) | ||||||
|         if locale is not None: |         if locale is not None: | ||||||
|             return locale |             return locale | ||||||
|     return 'en' |     return 'en' | ||||||
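``match_locale`` (from ``searx.locales``) replaces the old ``match_language`` helper here. Its behavior can be probed interactively from a checkout; the results below are illustrative and depend on babel's likely-subtag data:

.. code:: python

   from searx.locales import locales_initialize, match_locale

   locales_initialize()
   match_locale('de-AT', ['de-AT', 'de-DE', 'en'])   # exact hit --> 'de-AT'
   match_locale('de-CH', ['de-DE', 'en'])            # closest fit, e.g. 'de-DE'
   match_locale('xx', ['de', 'en'], fallback='en')   # no match --> fallback 'en'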
| @ -407,7 +407,7 @@ def get_client_settings(): | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def render(template_name: str, **kwargs): | def render(template_name: str, **kwargs): | ||||||
| 
 |     # pylint: disable=too-many-statements | ||||||
|     kwargs['client_settings'] = str( |     kwargs['client_settings'] = str( | ||||||
|         base64.b64encode( |         base64.b64encode( | ||||||
|             bytes( |             bytes( | ||||||
| @ -438,17 +438,20 @@ def render(template_name: str, **kwargs): | |||||||
|     kwargs['OTHER_CATEGORY'] = OTHER_CATEGORY |     kwargs['OTHER_CATEGORY'] = OTHER_CATEGORY | ||||||
| 
 | 
 | ||||||
|     # i18n |     # i18n | ||||||
|     kwargs['language_codes'] = [l for l in languages if l[0] in settings['search']['languages']] |     kwargs['sxng_locales'] = [l for l in sxng_locales if l[0] in settings['search']['languages']] | ||||||
| 
 | 
 | ||||||
|     locale = request.preferences.get_value('locale') |     locale = request.preferences.get_value('locale') | ||||||
|     kwargs['locale_rfc5646'] = _get_locale_rfc5646(locale) |     kwargs['locale_rfc5646'] = _get_locale_rfc5646(locale) | ||||||
| 
 | 
 | ||||||
|     if locale in RTL_LOCALES and 'rtl' not in kwargs: |     if locale in RTL_LOCALES and 'rtl' not in kwargs: | ||||||
|         kwargs['rtl'] = True |         kwargs['rtl'] = True | ||||||
|  | 
 | ||||||
|     if 'current_language' not in kwargs: |     if 'current_language' not in kwargs: | ||||||
|         kwargs['current_language'] = match_language( |         _locale = request.preferences.get_value('language') | ||||||
|             request.preferences.get_value('language'), settings['search']['languages'] |         if _locale in ('auto', 'all'): | ||||||
|         ) |             kwargs['current_language'] = _locale | ||||||
|  |         else: | ||||||
|  |             kwargs['current_language'] = match_locale(_locale, settings['search']['languages']) | ||||||
| 
 | 
 | ||||||
|     # values from settings |     # values from settings | ||||||
|     kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html'] |     kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html'] | ||||||
| @ -810,6 +813,13 @@ def search(): | |||||||
|         ) |         ) | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|  |     if search_query.lang in ('auto', 'all'): | ||||||
|  |         current_language = search_query.lang | ||||||
|  |     else: | ||||||
|  |         current_language = match_locale( | ||||||
|  |             search_query.lang, settings['search']['languages'], fallback=request.preferences.get_value("language") | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|     # search_query.lang contains the user choice (all, auto, en, ...) |     # search_query.lang contains the user choice (all, auto, en, ...) | ||||||
|     # when the user choice is "auto", search.search_query.lang contains the detected language |     # when the user choice is "auto", search.search_query.lang contains the detected language | ||||||
|     # otherwise it is equals to search_query.lang |     # otherwise it is equals to search_query.lang | ||||||
| @ -832,12 +842,8 @@ def search(): | |||||||
|             result_container.unresponsive_engines |             result_container.unresponsive_engines | ||||||
|         ), |         ), | ||||||
|         current_locale = request.preferences.get_value("locale"), |         current_locale = request.preferences.get_value("locale"), | ||||||
|         current_language = match_language( |         current_language = current_language, | ||||||
|             search_query.lang, |         search_language = match_locale( | ||||||
|             settings['search']['languages'], |  | ||||||
|             fallback=request.preferences.get_value("language") |  | ||||||
|         ), |  | ||||||
|         search_language = match_language( |  | ||||||
|             search.search_query.lang, |             search.search_query.lang, | ||||||
|             settings['search']['languages'], |             settings['search']['languages'], | ||||||
|             fallback=request.preferences.get_value("language") |             fallback=request.preferences.get_value("language") | ||||||
| @ -907,16 +913,11 @@ def autocompleter(): | |||||||
|     # and there is a query part |     # and there is a query part | ||||||
|     if len(raw_text_query.autocomplete_list) == 0 and len(sug_prefix) > 0: |     if len(raw_text_query.autocomplete_list) == 0 and len(sug_prefix) > 0: | ||||||
| 
 | 
 | ||||||
|         # get language from cookie |         # get SearXNG's locale and autocomplete backend from cookie | ||||||
|         language = request.preferences.get_value('language') |         sxng_locale = request.preferences.get_value('language') | ||||||
|         if not language or language == 'all': |         backend_name = request.preferences.get_value('autocomplete') | ||||||
|             language = 'en' |  | ||||||
|         else: |  | ||||||
|             language = language.split('-')[0] |  | ||||||
| 
 | 
 | ||||||
|         # run autocompletion |         for result in search_autocomplete(backend_name, sug_prefix, sxng_locale): | ||||||
|         raw_results = search_autocomplete(request.preferences.get_value('autocomplete'), sug_prefix, language) |  | ||||||
|         for result in raw_results: |  | ||||||
|             # attention: this loop will change raw_text_query object and this is |             # attention: this loop will change raw_text_query object and this is | ||||||
|             # the reason why the sug_prefix was stored before (see above) |             # the reason why the sug_prefix was stored before (see above) | ||||||
|             if result != sug_prefix: |             if result != sug_prefix: | ||||||
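The autocompleter now hands the unmodified SearXNG locale to the backend instead of a truncated two-letter code. A usage sketch matching the call in the hunk above (performs a live request and assumes an initialized SearXNG environment; the backend name is just an example from the default settings):

.. code:: python

   from searx.autocomplete import search_autocomplete

   # 'fr-CA' is passed through unchanged and resolved by the backend itself
   for suggestion in search_autocomplete('duckduckgo', 'pari', 'fr-CA'):
       print(suggestion)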
| @ -1001,7 +1002,9 @@ def preferences(): | |||||||
|             'rate80': rate80, |             'rate80': rate80, | ||||||
|             'rate95': rate95, |             'rate95': rate95, | ||||||
|             'warn_timeout': e.timeout > settings['outgoing']['request_timeout'], |             'warn_timeout': e.timeout > settings['outgoing']['request_timeout'], | ||||||
|             'supports_selected_language': _is_selected_language_supported(e, request.preferences), |             'supports_selected_language': e.traits.is_locale_supported( | ||||||
|  |                 str(request.preferences.get_value('language') or 'all') | ||||||
|  |             ), | ||||||
|             'result_count': result_count, |             'result_count': result_count, | ||||||
|         } |         } | ||||||
|     # end of stats |     # end of stats | ||||||
| @ -1052,7 +1055,9 @@ def preferences(): | |||||||
|     # supports |     # supports | ||||||
|     supports = {} |     supports = {} | ||||||
|     for _, e in filtered_engines.items(): |     for _, e in filtered_engines.items(): | ||||||
|         supports_selected_language = _is_selected_language_supported(e, request.preferences) |         supports_selected_language = e.traits.is_locale_supported( | ||||||
|  |             str(request.preferences.get_value('language') or 'all') | ||||||
|  |         ) | ||||||
|         safesearch = e.safesearch |         safesearch = e.safesearch | ||||||
|         time_range_support = e.time_range_support |         time_range_support = e.time_range_support | ||||||
|         for checker_test_name in checker_results.get(e.name, {}).get('errors', {}): |         for checker_test_name in checker_results.get(e.name, {}).get('errors', {}): | ||||||
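The semantics relied on here, roughly (an assumption-laden sketch, not the actual ``EngineTraits.is_locale_supported`` implementation):

.. code:: python

   def is_locale_supported(traits, sxng_locale: str) -> bool:
       # 'all' (and the preference value 'auto') count as supported
       if sxng_locale in ('all', 'auto'):
           return True
       # region tags must be known as regions, language tags as languages
       return sxng_locale in traits.regions or sxng_locale in traits.languages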
| @ -1099,16 +1104,6 @@ def preferences(): | |||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _is_selected_language_supported(engine, preferences: Preferences):  # pylint: disable=redefined-outer-name |  | ||||||
|     language = preferences.get_value('language') |  | ||||||
|     if language == 'all': |  | ||||||
|         return True |  | ||||||
|     x = match_language( |  | ||||||
|         language, getattr(engine, 'supported_languages', []), getattr(engine, 'language_aliases', {}), None |  | ||||||
|     ) |  | ||||||
|     return bool(x) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| @app.route('/image_proxy', methods=['GET']) | @app.route('/image_proxy', methods=['GET']) | ||||||
| def image_proxy(): | def image_proxy(): | ||||||
|     # pylint: disable=too-many-return-statements, too-many-branches |     # pylint: disable=too-many-return-statements, too-many-branches | ||||||
| @ -1327,10 +1322,7 @@ def config(): | |||||||
|         if not request.preferences.validate_token(engine): |         if not request.preferences.validate_token(engine): | ||||||
|             continue |             continue | ||||||
| 
 | 
 | ||||||
|         supported_languages = engine.supported_languages |         _languages = engine.traits.languages.keys() | ||||||
|         if isinstance(engine.supported_languages, dict): |  | ||||||
|             supported_languages = list(engine.supported_languages.keys()) |  | ||||||
| 
 |  | ||||||
|         _engines.append( |         _engines.append( | ||||||
|             { |             { | ||||||
|                 'name': name, |                 'name': name, | ||||||
| @ -1339,7 +1331,8 @@ def config(): | |||||||
|                 'enabled': not engine.disabled, |                 'enabled': not engine.disabled, | ||||||
|                 'paging': engine.paging, |                 'paging': engine.paging, | ||||||
|                 'language_support': engine.language_support, |                 'language_support': engine.language_support, | ||||||
|                 'supported_languages': supported_languages, |                 'languages': list(_languages), | ||||||
|  |                 'regions': list(engine.traits.regions.keys()), | ||||||
|                 'safesearch': engine.safesearch, |                 'safesearch': engine.safesearch, | ||||||
|                 'time_range_support': engine.time_range_support, |                 'time_range_support': engine.time_range_support, | ||||||
|                 'timeout': engine.timeout, |                 'timeout': engine.timeout, | ||||||
|  | |||||||
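For consumers of the ``/config`` endpoint: the former ``supported_languages`` field splits into ``languages`` and ``regions``. A sketch of the new per-engine entry (field names from the hunk above, values purely illustrative):

.. code:: python

   engine_entry = {
       'name': 'example engine',
       'categories': ['general'],
       'shortcut': 'ex',
       'enabled': True,
       'paging': True,
       'language_support': True,
       'languages': ['de', 'en', 'fr'],          # engine.traits.languages keys
       'regions': ['de-DE', 'en-US', 'fr-FR'],   # engine.traits.regions keys
       'safesearch': True,
       'time_range_support': True,
       'timeout': 3.0,
   }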
| @ -1,4 +1,6 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  | from __future__ import annotations | ||||||
|  | 
 | ||||||
| import os | import os | ||||||
| import pathlib | import pathlib | ||||||
| import csv | import csv | ||||||
| @ -8,7 +10,7 @@ import re | |||||||
| import inspect | import inspect | ||||||
| import itertools | import itertools | ||||||
| from datetime import datetime, timedelta | from datetime import datetime, timedelta | ||||||
| from typing import Iterable, List, Tuple, Dict | from typing import Iterable, List, Tuple, Dict, TYPE_CHECKING | ||||||
| 
 | 
 | ||||||
| from io import StringIO | from io import StringIO | ||||||
| from codecs import getincrementalencoder | from codecs import getincrementalencoder | ||||||
| @ -16,7 +18,10 @@ from codecs import getincrementalencoder | |||||||
| from flask_babel import gettext, format_date | from flask_babel import gettext, format_date | ||||||
| 
 | 
 | ||||||
| from searx import logger, settings | from searx import logger, settings | ||||||
| from searx.engines import Engine, OTHER_CATEGORY | from searx.engines import OTHER_CATEGORY | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     from searx.enginelib import Engine | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(-[a-zA-Z]{2})?$') | VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(-[a-zA-Z]{2})?$') | ||||||
|  | |||||||
| @ -18,8 +18,8 @@ from os.path import join | |||||||
| from lxml.html import fromstring | from lxml.html import fromstring | ||||||
| 
 | 
 | ||||||
| from searx.engines import wikidata, set_loggers | from searx.engines import wikidata, set_loggers | ||||||
| from searx.utils import extract_text, match_language | from searx.utils import extract_text | ||||||
| from searx.locales import LOCALE_NAMES, locales_initialize | from searx.locales import LOCALE_NAMES, locales_initialize, match_locale | ||||||
| from searx import searx_dir | from searx import searx_dir | ||||||
| from searx.utils import gen_useragent, detect_language | from searx.utils import gen_useragent, detect_language | ||||||
| import searx.search | import searx.search | ||||||
| @ -225,9 +225,9 @@ def fetch_website_description(engine_name, website): | |||||||
|             fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang]) |             fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang]) | ||||||
|             if fetched_lang is None or desc is None: |             if fetched_lang is None or desc is None: | ||||||
|                 continue |                 continue | ||||||
|             matched_lang = match_language(fetched_lang, LANGUAGES, fallback=None) |             matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=None) | ||||||
|             if matched_lang is None: |             if matched_lang is None: | ||||||
|                 fetched_wikipedia_lang = match_language(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None) |                 fetched_wikipedia_lang = match_locale(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None) | ||||||
|                 matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang) |                 matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang) | ||||||
|             if matched_lang is not None: |             if matched_lang is not None: | ||||||
|                 update_description(engine_name, matched_lang, desc, website, replace=False) |                 update_description(engine_name, matched_lang, desc, website, replace=False) | ||||||
|  | |||||||
198 searxng_extra/update/update_engine_traits.py (new executable file)
| @ -0,0 +1,198 @@ | |||||||
|  | #!/usr/bin/env python | ||||||
|  | # lint: pylint | ||||||
|  | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
|  | """Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/languages.py` | ||||||
|  | 
 | ||||||
|  | :py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`: | ||||||
|  |   Persistence of engines traits, fetched from the engines. | ||||||
|  | 
 | ||||||
|  | :origin:`searx/sxng_locales.py` | ||||||
|  |   Is generated from intersecting each engine's supported traits. | ||||||
|  | 
 | ||||||
|  | The script :origin:`searxng_extra/update/update_engine_traits.py` is called in | ||||||
|  | the :origin:`CI Update data ... <.github/workflows/data-update.yml>` workflow. | ||||||
|  | 
 | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | # pylint: disable=invalid-name | ||||||
|  | from unicodedata import lookup | ||||||
|  | from pathlib import Path | ||||||
|  | from pprint import pformat | ||||||
|  | import babel | ||||||
|  | 
 | ||||||
|  | from searx import settings, searx_dir | ||||||
|  | from searx import network | ||||||
|  | from searx.engines import load_engines | ||||||
|  | from searx.enginelib.traits import EngineTraitsMap | ||||||
|  | 
 | ||||||
|  | # Output files. | ||||||
|  | languages_file = Path(searx_dir) / 'sxng_locales.py' | ||||||
|  | languages_file_header = """\ | ||||||
|  | # -*- coding: utf-8 -*- | ||||||
|  | '''List of SearXNG's locale codes. | ||||||
|  | 
 | ||||||
|  | This file is generated automatically by:: | ||||||
|  | 
 | ||||||
|  |    ./manage pyenv.cmd searxng_extra/update/update_engine_traits.py | ||||||
|  | ''' | ||||||
|  | 
 | ||||||
|  | sxng_locales = ( | ||||||
|  | """ | ||||||
|  | languages_file_footer = """, | ||||||
|  | ) | ||||||
|  | ''' | ||||||
|  | A list of five-element tuples: | ||||||
|  | 
 | ||||||
|  | 0. SearXNG's internal locale tag (a language or region tag) | ||||||
|  | 1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`) | ||||||
|  | 2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`). | ||||||
|  |    Empty string for language tags. | ||||||
|  | 3. English language name (from :py:obj:`babel.core.Locale.english_name`) | ||||||
|  | 4. Unicode flag (emoji) that corresponds to SearXNG's internal region tag. | ||||||
|  |    Languages are represented by a globe (\U0001F310). | ||||||
|  | 
 | ||||||
|  | .. code:: python | ||||||
|  | 
 | ||||||
|  |    ('en',    'English', '',              'English', '\U0001f310'), | ||||||
|  |    ('en-CA', 'English', 'Canada',        'English', '\U0001f1e8\U0001f1e6'), | ||||||
|  |    ('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'), | ||||||
|  |    .. | ||||||
|  |    ('fr',    'Français', '',             'French',  '\U0001f310'), | ||||||
|  |    ('fr-BE', 'Français', 'Belgique',     'French',  '\U0001f1e7\U0001f1ea'), | ||||||
|  |    ('fr-CA', 'Français', 'Canada',       'French',  '\U0001f1e8\U0001f1e6'), | ||||||
|  | 
 | ||||||
|  | :meta hide-value: | ||||||
|  | ''' | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | lang2emoji = { | ||||||
|  |     'ha': '\U0001F1F3\U0001F1EA',  # Hausa / Niger | ||||||
|  |     'bs': '\U0001F1E7\U0001F1E6',  # Bosnian / Bosnia & Herzegovina | ||||||
|  |     'jp': '\U0001F1EF\U0001F1F5',  # Japanese | ||||||
|  |     'ua': '\U0001F1FA\U0001F1E6',  # Ukrainian | ||||||
|  |     'he': '\U0001F1EE\U0001F1F1',  # Hebrew / Israel | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def main(): | ||||||
|  |     load_engines(settings['engines']) | ||||||
|  |     # traits_map = EngineTraitsMap.from_data() | ||||||
|  |     traits_map = fetch_traits_map() | ||||||
|  |     sxng_tag_list = filter_locales(traits_map) | ||||||
|  |     write_languages_file(sxng_tag_list) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def fetch_traits_map(): | ||||||
|  |     """Fetchs supported languages for each engine and writes json file with those.""" | ||||||
|  |     network.set_timeout_for_thread(10.0) | ||||||
|  | 
 | ||||||
|  |     def log(msg): | ||||||
|  |         print(msg) | ||||||
|  | 
 | ||||||
|  |     traits_map = EngineTraitsMap.fetch_traits(log=log) | ||||||
|  |     print("fetched properties from %s engines" % len(traits_map)) | ||||||
|  |     print("write json file: %s" % traits_map.ENGINE_TRAITS_FILE) | ||||||
|  |     traits_map.save_data() | ||||||
|  |     return traits_map | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def filter_locales(traits_map: EngineTraitsMap): | ||||||
|  |     """Filter language & region tags by a threshold.""" | ||||||
|  | 
 | ||||||
|  |     min_eng_per_region = 11 | ||||||
|  |     min_eng_per_lang = 13 | ||||||
|  | 
 | ||||||
|  |     _ = {} | ||||||
|  |     for eng in traits_map.values(): | ||||||
|  |         for reg in eng.regions.keys(): | ||||||
|  |             _[reg] = _.get(reg, 0) + 1 | ||||||
|  | 
 | ||||||
|  |     regions = set(k for k, v in _.items() if v >= min_eng_per_region) | ||||||
|  |     lang_from_region = set(k.split('-')[0] for k in regions) | ||||||
|  | 
 | ||||||
|  |     _ = {} | ||||||
|  |     for eng in traits_map.values(): | ||||||
|  |         for lang in eng.languages.keys(): | ||||||
|  |             # ignore script types like zh_Hant, zh_Hans or sr_Latn, pa_Arab (they are | ||||||
|  |             # already counted by the existence of 'zh', 'sr' or 'pa') | ||||||
|  |             if '_' in lang: | ||||||
|  |                 # print("ignore %s" % lang) | ||||||
|  |                 continue | ||||||
|  |             _[lang] = _.get(lang, 0) + 1 | ||||||
|  | 
 | ||||||
|  |     languages = set(k for k, v in _.items() if v >= min_eng_per_lang) | ||||||
|  | 
 | ||||||
|  |     sxng_tag_list = set() | ||||||
|  |     sxng_tag_list.update(regions) | ||||||
|  |     sxng_tag_list.update(lang_from_region) | ||||||
|  |     sxng_tag_list.update(languages) | ||||||
|  | 
 | ||||||
|  |     return sxng_tag_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def write_languages_file(sxng_tag_list): | ||||||
|  | 
 | ||||||
|  |     language_codes = [] | ||||||
|  | 
 | ||||||
|  |     for sxng_tag in sorted(sxng_tag_list): | ||||||
|  |         sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-') | ||||||
|  | 
 | ||||||
|  |         flag = get_unicode_flag(sxng_locale) or '' | ||||||
|  | 
 | ||||||
|  |         item = ( | ||||||
|  |             sxng_tag, | ||||||
|  |             sxng_locale.get_language_name().title(), | ||||||
|  |             sxng_locale.get_territory_name() or '', | ||||||
|  |             sxng_locale.english_name.split(' (')[0], | ||||||
|  |             UnicodeEscape(flag), | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         language_codes.append(item) | ||||||
|  | 
 | ||||||
|  |     language_codes = tuple(language_codes) | ||||||
|  | 
 | ||||||
|  |     with open(languages_file, 'w', encoding='utf-8') as new_file: | ||||||
|  |         file_content = "{header} {language_codes}{footer}".format( | ||||||
|  |             header=languages_file_header, | ||||||
|  |             language_codes=pformat(language_codes, width=120, indent=4)[1:-1], | ||||||
|  |             footer=languages_file_footer, | ||||||
|  |         ) | ||||||
|  |         new_file.write(file_content) | ||||||
|  |         new_file.close() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class UnicodeEscape(str): | ||||||
|  |     """Escape unicode string in :py:obj:`pprint.pformat`""" | ||||||
|  | 
 | ||||||
|  |     def __repr__(self): | ||||||
|  |         return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def get_unicode_flag(locale: babel.Locale): | ||||||
|  |     """Determine a unicode flag (emoji) that fits to the ``locale``""" | ||||||
|  | 
 | ||||||
|  |     emoji = lang2emoji.get(locale.language) | ||||||
|  |     if emoji: | ||||||
|  |         return emoji | ||||||
|  | 
 | ||||||
|  |     if not locale.territory: | ||||||
|  |         return '\U0001F310' | ||||||
|  | 
 | ||||||
|  |     emoji = lang2emoji.get(locale.territory.lower()) | ||||||
|  |     if emoji: | ||||||
|  |         return emoji | ||||||
|  | 
 | ||||||
|  |     try: | ||||||
|  |         c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[0]) | ||||||
|  |         c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[1]) | ||||||
|  |         # print("OK   : %s --> %s%s" % (locale, c1, c2)) | ||||||
|  |     except KeyError as exc: | ||||||
|  |         print("ERROR: %s --> %s" % (locale, exc)) | ||||||
|  |         return None | ||||||
|  | 
 | ||||||
|  |     return c1 + c2 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | if __name__ == "__main__": | ||||||
|  |     main() | ||||||
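The flag lookup above derives emoji from Unicode regional-indicator code points, and the ``UnicodeEscape`` helper keeps those flags as ``\U0001F1EB``-style escapes when ``pformat`` writes the generated file. A minimal standalone sketch of the regional-indicator technique (not part of this commit; ``flag_for`` is a hypothetical helper):

    from unicodedata import lookup

    def flag_for(territory: str) -> str:
        # Pair the two REGIONAL INDICATOR SYMBOL LETTER code points that
        # correspond to an ISO 3166-1 alpha-2 territory code.
        return ''.join(
            lookup('REGIONAL INDICATOR SYMBOL LETTER ' + c)
            for c in territory.upper()
        )

    print(flag_for('fr'))  # U+1F1EB U+1F1F7
    print(flag_for('ca'))  # U+1F1E8 U+1F1E6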
| @ -1,313 +0,0 @@ | |||||||
| #!/usr/bin/env python |  | ||||||
| # lint: pylint |  | ||||||
| 
 |  | ||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later |  | ||||||
| """This script generates languages.py from intersecting each engine's supported |  | ||||||
| languages. |  | ||||||
| 
 |  | ||||||
| Output files: :origin:`searx/data/engines_languages.json` and |  | ||||||
| :origin:`searx/languages.py` (:origin:`CI Update data ... |  | ||||||
| <.github/workflows/data-update.yml>`). |  | ||||||
| 
 |  | ||||||
| """ |  | ||||||
| 
 |  | ||||||
| # pylint: disable=invalid-name |  | ||||||
| from unicodedata import lookup |  | ||||||
| import json |  | ||||||
| from pathlib import Path |  | ||||||
| from pprint import pformat |  | ||||||
| from babel import Locale, UnknownLocaleError |  | ||||||
| from babel.languages import get_global |  | ||||||
| from babel.core import parse_locale |  | ||||||
| 
 |  | ||||||
| from searx import settings, searx_dir |  | ||||||
| from searx.engines import load_engines, engines |  | ||||||
| from searx.network import set_timeout_for_thread |  | ||||||
| 
 |  | ||||||
| # Output files. |  | ||||||
| engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json' |  | ||||||
| languages_file = Path(searx_dir) / 'languages.py' |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Fetches supported languages for each engine and writes json file with those. |  | ||||||
| def fetch_supported_languages(): |  | ||||||
|     set_timeout_for_thread(10.0) |  | ||||||
| 
 |  | ||||||
|     engines_languages = {} |  | ||||||
|     names = list(engines) |  | ||||||
|     names.sort() |  | ||||||
| 
 |  | ||||||
|     for engine_name in names: |  | ||||||
|         if hasattr(engines[engine_name], 'fetch_supported_languages'): |  | ||||||
|             engines_languages[engine_name] = engines[engine_name].fetch_supported_languages() |  | ||||||
|             print("fetched %s languages from engine %s" % (len(engines_languages[engine_name]), engine_name)) |  | ||||||
|             if type(engines_languages[engine_name]) == list:  # pylint: disable=unidiomatic-typecheck |  | ||||||
|                 engines_languages[engine_name] = sorted(engines_languages[engine_name]) |  | ||||||
| 
 |  | ||||||
|     print("fetched languages from %s engines" % len(engines_languages)) |  | ||||||
| 
 |  | ||||||
|     # write json file |  | ||||||
|     with open(engines_languages_file, 'w', encoding='utf-8') as f: |  | ||||||
|         json.dump(engines_languages, f, indent=2, sort_keys=True) |  | ||||||
| 
 |  | ||||||
|     return engines_languages |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Get babel Locale object from lang_code if possible. |  | ||||||
| def get_locale(lang_code): |  | ||||||
|     try: |  | ||||||
|         locale = Locale.parse(lang_code, sep='-') |  | ||||||
|         return locale |  | ||||||
|     except (UnknownLocaleError, ValueError): |  | ||||||
|         return None |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| lang2emoji = { |  | ||||||
|     'ha': '\U0001F1F3\U0001F1EA',  # Hausa / Niger |  | ||||||
|     'bs': '\U0001F1E7\U0001F1E6',  # Bosnian / Bosnia & Herzegovina |  | ||||||
|     'jp': '\U0001F1EF\U0001F1F5',  # Japanese |  | ||||||
|     'ua': '\U0001F1FA\U0001F1E6',  # Ukrainian |  | ||||||
|     'he': '\U0001F1EE\U0001F1F7',  # Hebrew |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def get_unicode_flag(lang_code): |  | ||||||
|     """Determine a unicode flag (emoji) that fits to the ``lang_code``""" |  | ||||||
| 
 |  | ||||||
|     emoji = lang2emoji.get(lang_code.lower()) |  | ||||||
|     if emoji: |  | ||||||
|         return emoji |  | ||||||
| 
 |  | ||||||
|     if len(lang_code) == 2: |  | ||||||
|         return '\U0001F310' |  | ||||||
| 
 |  | ||||||
|     language = territory = script = variant = '' |  | ||||||
|     try: |  | ||||||
|         language, territory, script, variant = parse_locale(lang_code, '-') |  | ||||||
|     except ValueError as exc: |  | ||||||
|         print(exc) |  | ||||||
| 
 |  | ||||||
|     # https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2 |  | ||||||
|     if not territory: |  | ||||||
|         # https://www.unicode.org/emoji/charts/emoji-list.html#country-flag |  | ||||||
|         emoji = lang2emoji.get(language) |  | ||||||
|         if not emoji: |  | ||||||
|             print( |  | ||||||
|                 "%s --> language: %s / territory: %s / script: %s / variant: %s" |  | ||||||
|                 % (lang_code, language, territory, script, variant) |  | ||||||
|             ) |  | ||||||
|         return emoji |  | ||||||
| 
 |  | ||||||
|     emoji = lang2emoji.get(territory.lower()) |  | ||||||
|     if emoji: |  | ||||||
|         return emoji |  | ||||||
| 
 |  | ||||||
|     try: |  | ||||||
|         c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[0]) |  | ||||||
|         c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[1]) |  | ||||||
|         # print("%s --> territory: %s --> %s%s" %(lang_code, territory, c1, c2 )) |  | ||||||
|     except KeyError as exc: |  | ||||||
|         print("%s --> territory: %s --> %s" % (lang_code, territory, exc)) |  | ||||||
|         return None |  | ||||||
| 
 |  | ||||||
|     return c1 + c2 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def get_territory_name(lang_code): |  | ||||||
|     country_name = None |  | ||||||
|     locale = get_locale(lang_code) |  | ||||||
|     try: |  | ||||||
|         if locale is not None: |  | ||||||
|             country_name = locale.get_territory_name() |  | ||||||
|     except FileNotFoundError as exc: |  | ||||||
|         print("ERROR: %s --> %s" % (locale, exc)) |  | ||||||
|     return country_name |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Join all language lists. |  | ||||||
| def join_language_lists(engines_languages): |  | ||||||
|     language_list = {} |  | ||||||
|     for engine_name in engines_languages: |  | ||||||
|         for lang_code in engines_languages[engine_name]: |  | ||||||
| 
 |  | ||||||
|             # apply custom fixes if necessary |  | ||||||
|             if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values(): |  | ||||||
|                 lang_code = next( |  | ||||||
|                     lc for lc, alias in engines[engine_name].language_aliases.items() if lang_code == alias |  | ||||||
|                 ) |  | ||||||
| 
 |  | ||||||
|             locale = get_locale(lang_code) |  | ||||||
| 
 |  | ||||||
|             # ensure that lang_code uses standard language and country codes |  | ||||||
|             if locale and locale.territory: |  | ||||||
|                 lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory) |  | ||||||
|             short_code = lang_code.split('-')[0] |  | ||||||
| 
 |  | ||||||
|             # add language without country if not in list |  | ||||||
|             if short_code not in language_list: |  | ||||||
|                 if locale: |  | ||||||
|                     # get language's data from babel's Locale object |  | ||||||
|                     language_name = locale.get_language_name().title() |  | ||||||
|                     english_name = locale.english_name.split(' (')[0] |  | ||||||
|                 elif short_code in engines_languages['wikipedia']: |  | ||||||
|                     # get language's data from wikipedia if not known by babel |  | ||||||
|                     language_name = engines_languages['wikipedia'][short_code]['name'] |  | ||||||
|                     english_name = engines_languages['wikipedia'][short_code]['english_name'] |  | ||||||
|                 else: |  | ||||||
|                     language_name = None |  | ||||||
|                     english_name = None |  | ||||||
| 
 |  | ||||||
|                 # add language to list |  | ||||||
|                 language_list[short_code] = { |  | ||||||
|                     'name': language_name, |  | ||||||
|                     'english_name': english_name, |  | ||||||
|                     'counter': set(), |  | ||||||
|                     'countries': {}, |  | ||||||
|                 } |  | ||||||
| 
 |  | ||||||
|             # add language with country if not in list |  | ||||||
|             if lang_code != short_code and lang_code not in language_list[short_code]['countries']: |  | ||||||
|                 country_name = '' |  | ||||||
|                 if locale: |  | ||||||
|                     # get country name from babel's Locale object |  | ||||||
|                     try: |  | ||||||
|                         country_name = locale.get_territory_name() |  | ||||||
|                     except FileNotFoundError as exc: |  | ||||||
|                         print("ERROR: %s --> %s" % (locale, exc)) |  | ||||||
|                         locale = None |  | ||||||
| 
 |  | ||||||
|                 language_list[short_code]['countries'][lang_code] = { |  | ||||||
|                     'country_name': country_name, |  | ||||||
|                     'counter': set(), |  | ||||||
|                 } |  | ||||||
| 
 |  | ||||||
|             # count engine for both language_country combination and language alone |  | ||||||
|             language_list[short_code]['counter'].add(engine_name) |  | ||||||
|             if lang_code != short_code: |  | ||||||
|                 language_list[short_code]['countries'][lang_code]['counter'].add(engine_name) |  | ||||||
| 
 |  | ||||||
|     return language_list |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Filter language list so it only includes the most supported languages and countries |  | ||||||
| def filter_language_list(all_languages): |  | ||||||
|     min_engines_per_lang = 12 |  | ||||||
|     min_engines_per_country = 7 |  | ||||||
|     # pylint: disable=consider-using-dict-items, consider-iterating-dictionary |  | ||||||
|     main_engines = [ |  | ||||||
|         engine_name |  | ||||||
|         for engine_name in engines.keys() |  | ||||||
|         if 'general' in engines[engine_name].categories |  | ||||||
|         and engines[engine_name].supported_languages |  | ||||||
|         and not engines[engine_name].disabled |  | ||||||
|     ] |  | ||||||
| 
 |  | ||||||
|     # filter list to include only languages supported by most engines or all default general engines |  | ||||||
|     filtered_languages = { |  | ||||||
|         code: lang |  | ||||||
|         for code, lang in all_languages.items() |  | ||||||
|         if ( |  | ||||||
|             len(lang['counter']) >= min_engines_per_lang |  | ||||||
|             or all(main_engine in lang['counter'] for main_engine in main_engines) |  | ||||||
|         ) |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     def _copy_lang_data(lang, country_name=None): |  | ||||||
|         new_dict = {} |  | ||||||
|         new_dict['name'] = all_languages[lang]['name'] |  | ||||||
|         new_dict['english_name'] = all_languages[lang]['english_name'] |  | ||||||
|         if country_name: |  | ||||||
|             new_dict['country_name'] = country_name |  | ||||||
|         return new_dict |  | ||||||
| 
 |  | ||||||
|     # for each language get country codes supported by most engines or at least one country code |  | ||||||
|     filtered_languages_with_countries = {} |  | ||||||
|     for lang, lang_data in filtered_languages.items(): |  | ||||||
|         countries = lang_data['countries'] |  | ||||||
|         filtered_countries = {} |  | ||||||
| 
 |  | ||||||
|         # get language's country codes with enough supported engines |  | ||||||
|         for lang_country, country_data in countries.items(): |  | ||||||
|             if len(country_data['counter']) >= min_engines_per_country: |  | ||||||
|                 filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name']) |  | ||||||
| 
 |  | ||||||
|         # add language without countries too if there's more than one country to choose from |  | ||||||
|         if len(filtered_countries) > 1: |  | ||||||
|             filtered_countries[lang] = _copy_lang_data(lang, None) |  | ||||||
|         elif len(filtered_countries) == 1: |  | ||||||
|             lang_country = next(iter(filtered_countries)) |  | ||||||
| 
 |  | ||||||
|         # if no country has enough engines try to get most likely country code from babel |  | ||||||
|         if not filtered_countries: |  | ||||||
|             lang_country = None |  | ||||||
|             subtags = get_global('likely_subtags').get(lang) |  | ||||||
|             if subtags: |  | ||||||
|                 country_code = subtags.split('_')[-1] |  | ||||||
|                 if len(country_code) == 2: |  | ||||||
|                     lang_country = "{lang}-{country}".format(lang=lang, country=country_code) |  | ||||||
| 
 |  | ||||||
|             if lang_country: |  | ||||||
|                 filtered_countries[lang_country] = _copy_lang_data(lang, None) |  | ||||||
|             else: |  | ||||||
|                 filtered_countries[lang] = _copy_lang_data(lang, None) |  | ||||||
| 
 |  | ||||||
|         filtered_languages_with_countries.update(filtered_countries) |  | ||||||
| 
 |  | ||||||
|     return filtered_languages_with_countries |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class UnicodeEscape(str): |  | ||||||
|     """Escape unicode string in :py:obj:`pprint.pformat`""" |  | ||||||
| 
 |  | ||||||
|     def __repr__(self): |  | ||||||
|         return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Write languages.py. |  | ||||||
| def write_languages_file(languages): |  | ||||||
|     file_headers = ( |  | ||||||
|         "# -*- coding: utf-8 -*-", |  | ||||||
|         "# list of language codes", |  | ||||||
|         "# this file is generated automatically by utils/fetch_languages.py", |  | ||||||
|         "language_codes = (\n", |  | ||||||
|     ) |  | ||||||
| 
 |  | ||||||
|     language_codes = [] |  | ||||||
| 
 |  | ||||||
|     for code in sorted(languages): |  | ||||||
| 
 |  | ||||||
|         name = languages[code]['name'] |  | ||||||
|         if name is None: |  | ||||||
|             print("ERROR: languages['%s'] --> %s" % (code, languages[code])) |  | ||||||
|             continue |  | ||||||
| 
 |  | ||||||
|         flag = get_unicode_flag(code) or '' |  | ||||||
|         item = ( |  | ||||||
|             code, |  | ||||||
|             languages[code]['name'].split(' (')[0], |  | ||||||
|             get_territory_name(code) or '', |  | ||||||
|             languages[code].get('english_name') or '', |  | ||||||
|             UnicodeEscape(flag), |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
|         language_codes.append(item) |  | ||||||
| 
 |  | ||||||
|     language_codes = tuple(language_codes) |  | ||||||
| 
 |  | ||||||
|     with open(languages_file, 'w', encoding='utf-8') as new_file: |  | ||||||
|         file_content = "{file_headers} {language_codes},\n)\n".format( |  | ||||||
|             # fmt: off |  | ||||||
|             file_headers = '\n'.join(file_headers), |  | ||||||
|             language_codes = pformat(language_codes, indent=4)[1:-1] |  | ||||||
|             # fmt: on |  | ||||||
|         ) |  | ||||||
|         new_file.write(file_content) |  | ||||||
|         new_file.close() |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| if __name__ == "__main__": |  | ||||||
|     load_engines(settings['engines']) |  | ||||||
|     _engines_languages = fetch_supported_languages() |  | ||||||
|     _all_languages = join_language_lists(_engines_languages) |  | ||||||
|     _filtered_languages = filter_language_list(_all_languages) |  | ||||||
|     write_languages_file(_filtered_languages) |  | ||||||
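For comparison, the removed script's country fallback leaned on babel's likely-subtags data to pick a territory for a bare language tag. A short sketch of that lookup (a sketch only; the exact mapping depends on the CLDR data shipped with babel):

    from babel.core import get_global

    subtags = get_global('likely_subtags').get('fr')  # e.g. 'fr_Latn_FR'
    country_code = subtags.split('_')[-1]
    if len(country_code) == 2:
        print('fr-' + country_code)                   # fr-FR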
| @ -50,7 +50,7 @@ from pathlib import Path | |||||||
| from searx import searx_dir | from searx import searx_dir | ||||||
| from searx.network import set_timeout_for_thread | from searx.network import set_timeout_for_thread | ||||||
| from searx.engines import wikidata, set_loggers | from searx.engines import wikidata, set_loggers | ||||||
| from searx.languages import language_codes | from searx.sxng_locales import sxng_locales | ||||||
| from searx.engines.openstreetmap import get_key_rank, VALUE_TO_LINK | from searx.engines.openstreetmap import get_key_rank, VALUE_TO_LINK | ||||||
| 
 | 
 | ||||||
| set_loggers(wikidata, 'wikidata') | set_loggers(wikidata, 'wikidata') | ||||||
| @ -76,7 +76,7 @@ GROUP BY ?key ?item ?itemLabel | |||||||
| ORDER BY ?key ?item ?itemLabel | ORDER BY ?key ?item ?itemLabel | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| LANGUAGES = [l[0].lower() for l in language_codes] | LANGUAGES = [l[0].lower() for l in sxng_locales] | ||||||
| 
 | 
 | ||||||
| PRESET_KEYS = { | PRESET_KEYS = { | ||||||
|     ('wikidata',): {'en': 'Wikidata'}, |     ('wikidata',): {'en': 'Wikidata'}, | ||||||
|  | |||||||
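For context: ``LANGUAGES`` above lowercases the first field of each ``sxng_locales`` entry, which is a 5-tuple of tag, native language name, native territory name, English name and flag. A sketch with two sample entries copied from the generated file shown earlier in this diff:

    sxng_locales = (
        ('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
        ('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
    )
    LANGUAGES = [l[0].lower() for l in sxng_locales]
    print(LANGUAGES)  # ['en-us', 'fr-be']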
							
								
								
									
111 tests/unit/test_locales.py (new file)
							| @ -0,0 +1,111 @@ | |||||||
|  | # -*- coding: utf-8 -*- | ||||||
|  | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
|  | # lint: pylint | ||||||
|  | """Test some code from module :py:obj:`searx.locales`""" | ||||||
|  | 
 | ||||||
|  | from searx import locales | ||||||
|  | from searx.sxng_locales import sxng_locales | ||||||
|  | from tests import SearxTestCase | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class TestLocales(SearxTestCase): | ||||||
|  |     """Implemented tests: | ||||||
|  | 
 | ||||||
|  |     - :py:obj:`searx.locales.match_locale` | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     def test_match_locale(self): | ||||||
|  | 
 | ||||||
|  |         locale_tag_list = [x[0] for x in sxng_locales] | ||||||
|  | 
 | ||||||
|  |         # Test SearXNG search languages | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(locales.match_locale('de', locale_tag_list), 'de') | ||||||
|  |         self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr') | ||||||
|  |         self.assertEqual(locales.match_locale('zh', locale_tag_list), 'zh') | ||||||
|  | 
 | ||||||
|  |         # Test SearXNG search regions | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(locales.match_locale('ca-es', locale_tag_list), 'ca-ES') | ||||||
|  |         self.assertEqual(locales.match_locale('de-at', locale_tag_list), 'de-AT') | ||||||
|  |         self.assertEqual(locales.match_locale('de-de', locale_tag_list), 'de-DE') | ||||||
|  |         self.assertEqual(locales.match_locale('en-UK', locale_tag_list), 'en-GB') | ||||||
|  |         self.assertEqual(locales.match_locale('fr-be', locale_tag_list), 'fr-BE') | ||||||
|  |         self.assertEqual(locales.match_locale('fr-ca', locale_tag_list), 'fr-CA') | ||||||
|  |         self.assertEqual(locales.match_locale('fr-ch', locale_tag_list), 'fr-CH') | ||||||
|  |         self.assertEqual(locales.match_locale('zh-cn', locale_tag_list), 'zh-CN') | ||||||
|  |         self.assertEqual(locales.match_locale('zh-tw', locale_tag_list), 'zh-TW') | ||||||
|  |         self.assertEqual(locales.match_locale('zh-hk', locale_tag_list), 'zh-HK') | ||||||
|  | 
 | ||||||
|  |         # Test language script code | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(locales.match_locale('zh-hans', locale_tag_list), 'zh-CN') | ||||||
|  |         self.assertEqual(locales.match_locale('zh-hans-cn', locale_tag_list), 'zh-CN') | ||||||
|  |         self.assertEqual(locales.match_locale('zh-hant', locale_tag_list), 'zh-TW') | ||||||
|  |         self.assertEqual(locales.match_locale('zh-hant-tw', locale_tag_list), 'zh-TW') | ||||||
|  | 
 | ||||||
|  |         # Test individual locale lists | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(locales.match_locale('es', [], fallback='fallback'), 'fallback') | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(locales.match_locale('de', ['de-CH', 'de-DE']), 'de-DE') | ||||||
|  |         self.assertEqual(locales.match_locale('es', ['ES']), 'ES') | ||||||
|  |         self.assertEqual(locales.match_locale('es', ['es-AR', 'es-ES', 'es-MX']), 'es-ES') | ||||||
|  |         self.assertEqual(locales.match_locale('es-AR', ['es-AR', 'es-ES', 'es-MX']), 'es-AR') | ||||||
|  |         self.assertEqual(locales.match_locale('es-CO', ['es-AR', 'es-ES']), 'es-ES') | ||||||
|  |         self.assertEqual(locales.match_locale('es-CO', ['es-AR']), 'es-AR') | ||||||
|  | 
 | ||||||
|  |         # Tests from the commit message of 9ae409a05a | ||||||
|  | 
 | ||||||
|  |         # Assumption: | ||||||
|  |         #   A. When a user selects a language, the results should be optimized | ||||||
|  |         #      according to the selected language. | ||||||
|  |         # | ||||||
|  |         #   B. When a user selects a language and a territory, the results should be | ||||||
|  |         #      optimized with first priority on the territory and second on the language. | ||||||
|  | 
 | ||||||
|  |         # Assume we have an engine that supports the following locales: | ||||||
|  |         locale_tag_list = ['zh-CN', 'zh-HK', 'nl-BE', 'fr-CA'] | ||||||
|  | 
 | ||||||
|  |         # Examples (Assumption A.) | ||||||
|  |         # ------------------------ | ||||||
|  | 
 | ||||||
|  |         # A user selects region 'zh-TW', which should end in 'zh-HK'. | ||||||
|  |         # hint: CN is 'Hans', while HK ('Hant') fits TW ('Hant') better. | ||||||
|  |         self.assertEqual(locales.match_locale('zh-TW', locale_tag_list), 'zh-HK') | ||||||
|  | 
 | ||||||
|  |         # A user selects only the language 'zh', which should end in 'zh-CN'. | ||||||
|  |         self.assertEqual(locales.match_locale('zh', locale_tag_list), 'zh-CN') | ||||||
|  | 
 | ||||||
|  |         # A user selects only the language 'fr', which should end in 'fr-CA'. | ||||||
|  |         self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-CA') | ||||||
|  | 
 | ||||||
|  |         # The difference in territory priority is best shown with an | ||||||
|  |         # engine that supports the following locales: | ||||||
|  |         locale_tag_list = ['fr-FR', 'fr-CA', 'en-GB', 'nl-BE'] | ||||||
|  | 
 | ||||||
|  |         # A user selects only a language | ||||||
|  |         self.assertEqual(locales.match_locale('en', locale_tag_list), 'en-GB') | ||||||
|  | 
 | ||||||
|  |         # hint: the engine supports fr-FR and fr-CA; since no territory is | ||||||
|  |         # given, fr-FR takes priority. | ||||||
|  |         self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-FR') | ||||||
|  | 
 | ||||||
|  |         # Examples (Assumption B.) | ||||||
|  |         # ------------------------ | ||||||
|  | 
 | ||||||
|  |         # A user selects region 'fr-BE', which should end in 'nl-BE'. | ||||||
|  |         self.assertEqual(locales.match_locale('fr-BE', locale_tag_list), 'nl-BE') | ||||||
|  | 
 | ||||||
|  |         # If the user selects a language and there are two locales like the | ||||||
|  |         # following: | ||||||
|  | 
 | ||||||
|  |         locale_tag_list = ['fr-BE', 'fr-CH'] | ||||||
|  | 
 | ||||||
|  |         # The get_engine_locale function selects the locale by looking at the | ||||||
|  |         # "population percent"; this percentage is higher in BE (68%) than in CH (21%). | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-BE') | ||||||
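The "population percent" used in the last assertion comes from babel's CLDR tables and can be inspected directly (a sketch; exact percentages vary with the babel release):

    from babel.languages import get_territory_language_info

    fr_in_be = get_territory_language_info('BE')['fr']['population_percent']  # ~68
    fr_in_ch = get_territory_language_info('CH')['fr']['population_percent']  # ~21
    print(fr_in_be > fr_in_ch)  # True: a bare 'fr' resolves to fr-BE, not fr-CH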
| @ -87,39 +87,6 @@ class TestUtils(SearxTestCase): | |||||||
|         html = '<p><b>Lorem ipsum</i>dolor sit amet</p>' |         html = '<p><b>Lorem ipsum</i>dolor sit amet</p>' | ||||||
|         self.assertEqual(utils.html_to_text(html), "Lorem ipsum") |         self.assertEqual(utils.html_to_text(html), "Lorem ipsum") | ||||||
| 
 | 
 | ||||||
|     def test_match_language(self): |  | ||||||
|         self.assertEqual(utils.match_language('es', ['es']), 'es') |  | ||||||
|         self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback') |  | ||||||
|         self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp') |  | ||||||
| 
 |  | ||||||
|         # handle script tags |  | ||||||
|         self.assertEqual(utils.match_language('zh-CN', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hans-CN') |  | ||||||
|         self.assertEqual(utils.match_language('zh-TW', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hant-TW') |  | ||||||
|         self.assertEqual(utils.match_language('zh-Hans-CN', ['zh-CN', 'zh-TW']), 'zh-CN') |  | ||||||
|         self.assertEqual(utils.match_language('zh-Hant-TW', ['zh-CN', 'zh-TW']), 'zh-TW') |  | ||||||
|         self.assertEqual(utils.match_language('zh-Hans', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-CN') |  | ||||||
|         self.assertEqual(utils.match_language('zh-Hant', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-TW') |  | ||||||
| 
 |  | ||||||
|         aliases = {'en-GB': 'en-UK', 'he': 'iw'} |  | ||||||
| 
 |  | ||||||
|         # guess country |  | ||||||
|         self.assertEqual(utils.match_language('de-DE', ['de']), 'de') |  | ||||||
|         self.assertEqual(utils.match_language('de', ['de-DE']), 'de-DE') |  | ||||||
|         self.assertEqual(utils.match_language('es-CO', ['es-AR', 'es-ES', 'es-MX']), 'es-ES') |  | ||||||
|         self.assertEqual(utils.match_language('es-CO', ['es-MX']), 'es-MX') |  | ||||||
|         self.assertEqual(utils.match_language('en-UK', ['en-AU', 'en-GB', 'en-US']), 'en-GB') |  | ||||||
|         self.assertEqual(utils.match_language('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), 'en-UK') |  | ||||||
| 
 |  | ||||||
|         # language aliases |  | ||||||
|         self.assertEqual(utils.match_language('iw', ['he']), 'he') |  | ||||||
|         self.assertEqual(utils.match_language('he', ['iw'], aliases), 'iw') |  | ||||||
|         self.assertEqual(utils.match_language('iw-IL', ['he']), 'he') |  | ||||||
|         self.assertEqual(utils.match_language('he-IL', ['iw'], aliases), 'iw') |  | ||||||
|         self.assertEqual(utils.match_language('iw', ['he-IL']), 'he-IL') |  | ||||||
|         self.assertEqual(utils.match_language('he', ['iw-IL'], aliases), 'iw-IL') |  | ||||||
|         self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL') |  | ||||||
|         self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL') |  | ||||||
| 
 |  | ||||||
|     def test_ecma_unscape(self): |     def test_ecma_unscape(self): | ||||||
|         self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space') |         self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space') | ||||||
|         self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó') |         self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó') | ||||||
|  | |||||||
| @ -52,9 +52,6 @@ enabled_plugins: | |||||||
| 
 | 
 | ||||||
| engines: | engines: | ||||||
| 
 | 
 | ||||||
|   - name: google |  | ||||||
|     use_mobile_ui: true |  | ||||||
| 
 |  | ||||||
| #   - name: fdroid | #   - name: fdroid | ||||||
| #     disabled: false | #     disabled: false | ||||||
| # | # | ||||||
|  | |||||||