| 
									
										
										
										
											2023-08-05 19:46:04 +02:00
										 |  |  | # SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | """Brave supports the categories listed in :py:obj:`brave_category` (General,
 | 
					
						
							|  |  |  | news, videos, images).  The support of :py:obj:`paging` and :py:obj:`time range | 
					
						
							|  |  |  | <time_range_support>` is limited (see remarks). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Configured ``brave`` engines: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | .. code:: yaml | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   - name: brave | 
					
						
							|  |  |  |     engine: brave | 
					
						
							|  |  |  |     ... | 
					
						
							|  |  |  |     brave_category: search | 
					
						
							|  |  |  |     time_range_support: true | 
					
						
							|  |  |  |     paging: true | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   - name: brave.images | 
					
						
							|  |  |  |     engine: brave | 
					
						
							|  |  |  |     ... | 
					
						
							|  |  |  |     brave_category: images | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   - name: brave.videos | 
					
						
							|  |  |  |     engine: brave | 
					
						
							|  |  |  |     ... | 
					
						
							|  |  |  |     brave_category: videos | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   - name: brave.news | 
					
						
							|  |  |  |     engine: brave | 
					
						
							|  |  |  |     ... | 
					
						
							|  |  |  |     brave_category: news | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-10 05:14:54 +01:00
										 |  |  |   - name: brave.goggles | 
					
						
							|  |  |  |     engine: brave | 
					
						
							|  |  |  |     time_range_support: true | 
					
						
							|  |  |  |     paging: true | 
					
						
							|  |  |  |     ... | 
					
						
							|  |  |  |     brave_category: goggles | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | .. _brave regions: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Brave regions | 
					
						
							|  |  |  | ============= | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Brave uses two-letter tags for the regions like ``ca`` while SearXNG deals with | 
					
						
							| 
									
										
										
										
											2023-09-15 09:53:03 +02:00
										 |  |  | locales.  To get a mapping, all *official de-facto* languages of the Brave | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | region are mapped to regions in SearXNG (see :py:obj:`babel | 
					
						
							|  |  |  | <babel.languages.get_official_languages>`): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | .. code:: python | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     "regions": { | 
					
						
							|  |  |  |       .. | 
					
						
							|  |  |  |       "en-CA": "ca", | 
					
						
							|  |  |  |       "fr-CA": "ca", | 
					
						
							|  |  |  |       .. | 
					
						
							|  |  |  |      } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | .. note:: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |    The language (aka region) support of Brave's index is limited to very basic | 
					
						
							|  |  |  |    languages.  The search results for languages like Chinese or Arabic are of | 
					
						
							|  |  |  |    low quality. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-10 05:14:54 +01:00
										 |  |  | .. _brave googles: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Brave Goggles | 
					
						
							|  |  |  | ============= | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | .. _list of Goggles: https://search.brave.com/goggles/discover | 
					
						
							|  |  |  | .. _Goggles Whitepaper: https://brave.com/static-assets/files/goggles.pdf | 
					
						
							|  |  |  | .. _Goggles Quickstart: https://github.com/brave/goggles-quickstart | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Goggles allow you to choose, alter, or extend the ranking of Brave Search | 
					
						
							|  |  |  | results (`Goggles Whitepaper`_).  Goggles are openly developed by the community | 
					
						
							|  |  |  | of Brave Search users. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Select from the `list of Goggles`_ people have published, or create your own | 
					
						
							|  |  |  | (`Goggles Quickstart`_). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | .. _brave languages: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Brave languages | 
					
						
							|  |  |  | =============== | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-15 09:53:03 +02:00
										 |  |  | Brave's language support is limited to the UI (menus, area local notations, | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | etc).  Brave's index only seems to support a locale, but it does not seem to | 
					
						
							|  |  |  | support any languages in its index.  The choice of available languages is very | 
					
						
							| 
									
										
										
										
											2023-09-15 09:53:03 +02:00
										 |  |  | small (and it's not clear to me where the difference in UI is when switching | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | from en-us to en-ca or en-gb). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the | 
					
						
							|  |  |  | UI languages are stored in a custom field named ``ui_lang``: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | .. code:: python | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     "custom": { | 
					
						
							|  |  |  |       "ui_lang": { | 
					
						
							|  |  |  |         "ca": "ca", | 
					
						
							|  |  |  |         "de-DE": "de-de", | 
					
						
							|  |  |  |         "en-CA": "en-ca", | 
					
						
							|  |  |  |         "en-GB": "en-gb", | 
					
						
							|  |  |  |         "en-US": "en-us", | 
					
						
							|  |  |  |         "es": "es", | 
					
						
							|  |  |  |         "fr-CA": "fr-ca", | 
					
						
							|  |  |  |         "fr-FR": "fr-fr", | 
					
						
							|  |  |  |         "ja-JP": "ja-jp", | 
					
						
							|  |  |  |         "pt-BR": "pt-br", | 
					
						
							|  |  |  |         "sq-AL": "sq-al" | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     }, | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | Implementations | 
					
						
							|  |  |  | =============== | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-05 19:46:04 +02:00
										 |  |  | """
 | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-10 05:14:54 +01:00
										 |  |  | from typing import Any, TYPE_CHECKING | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | from urllib.parse import ( | 
					
						
							|  |  |  |     urlencode, | 
					
						
							|  |  |  |     urlparse, | 
					
						
							|  |  |  |     parse_qs, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-08-05 19:46:04 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-09 17:03:07 +01:00
										 |  |  | from dateutil import parser | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | from lxml import html | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | from searx import locales | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | from searx.utils import ( | 
					
						
							|  |  |  |     extract_text, | 
					
						
							| 
									
										
										
										
											2024-01-09 17:03:07 +01:00
										 |  |  |     eval_xpath, | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |     eval_xpath_list, | 
					
						
							|  |  |  |     eval_xpath_getindex, | 
					
						
							| 
									
										
										
										
											2023-09-09 12:18:39 +02:00
										 |  |  |     js_variable_to_python, | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | from searx.enginelib.traits import EngineTraits | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | if TYPE_CHECKING: | 
					
						
							|  |  |  |     import logging | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     logger: logging.Logger | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | traits: EngineTraits | 
					
						
							| 
									
										
										
										
											2023-08-05 19:46:04 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | about = { | 
					
						
							|  |  |  |     "website": 'https://search.brave.com/', | 
					
						
							|  |  |  |     "wikidata_id": 'Q22906900', | 
					
						
							|  |  |  |     "official_api_documentation": None, | 
					
						
							|  |  |  |     "use_official_api": False, | 
					
						
							|  |  |  |     "require_api_key": False, | 
					
						
							|  |  |  |     "results": 'HTML', | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-05 19:46:04 +02:00
										 |  |  | base_url = "https://search.brave.com/" | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | categories = [] | 
					
						
							|  |  |  | brave_category = 'search' | 
					
						
							| 
									
										
										
										
											2024-02-10 05:14:54 +01:00
										 |  |  | Goggles = Any | 
					
						
							|  |  |  | """Brave supports common web-search, videos, images, news, and goggles search.
 | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | - ``search``: Common WEB search | 
					
						
							|  |  |  | - ``videos``: search for videos | 
					
						
							|  |  |  | - ``images``: search for images | 
					
						
							|  |  |  | - ``news``: search for news | 
					
						
							| 
									
										
										
										
											2024-02-10 05:14:54 +01:00
										 |  |  | - ``goggles``: Common WEB search with custom rules | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | brave_spellcheck = False | 
					
						
							|  |  |  | """Brave supports some kind of spell checking.  When activated, Brave tries to
 | 
					
						
							|  |  |  | fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``.  In | 
					
						
							|  |  |  | the UI of Brave the user gets warned about this, since we can not warn the user | 
					
						
							|  |  |  | in SearXNG, the spellchecking is disabled by default. | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | send_accept_language_header = True | 
					
						
							| 
									
										
										
										
											2023-08-05 19:46:04 +02:00
										 |  |  | paging = False | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | """Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
 | 
					
						
							| 
									
										
										
										
											2024-02-10 05:14:54 +01:00
										 |  |  | category All) and in the goggles category."""
 | 
					
						
							| 
									
										
										
										
											2023-11-14 08:25:06 +01:00
										 |  |  | max_page = 10 | 
					
						
							|  |  |  | """Tested 9 pages maximum (``&offset=8``), to be save max is set to 10.  Trying
 | 
					
						
							|  |  |  | to do more won't return any result and you will most likely be flagged as a bot. | 
					
						
							|  |  |  | """
 | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | safesearch = True | 
					
						
							|  |  |  | safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'}  # cookie: safesearch=off | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | time_range_support = False | 
					
						
							|  |  |  | """Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
 | 
					
						
							| 
									
										
										
										
											2024-02-10 05:14:54 +01:00
										 |  |  | category All) and in the goggles category."""
 | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | time_range_map = { | 
					
						
							|  |  |  |     'day': 'pd', | 
					
						
							|  |  |  |     'week': 'pw', | 
					
						
							|  |  |  |     'month': 'pm', | 
					
						
							|  |  |  |     'year': 'py', | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2023-08-05 20:25:10 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-05 19:46:04 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | def request(query, params): | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787 | 
					
						
							|  |  |  |     params['headers']['Accept-Encoding'] = 'gzip, deflate' | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-05 19:46:04 +02:00
										 |  |  |     args = { | 
					
						
							|  |  |  |         'q': query, | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |     if brave_spellcheck: | 
					
						
							|  |  |  |         args['spellcheck'] = '1' | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-10 05:14:54 +01:00
										 |  |  |     if brave_category in ('search', 'goggles'): | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |         if params.get('pageno', 1) - 1: | 
					
						
							|  |  |  |             args['offset'] = params.get('pageno', 1) - 1 | 
					
						
							|  |  |  |         if time_range_map.get(params['time_range']): | 
					
						
							|  |  |  |             args['tf'] = time_range_map.get(params['time_range']) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-10 05:14:54 +01:00
										 |  |  |     if brave_category == 'goggles': | 
					
						
							|  |  |  |         args['goggles_id'] = Goggles | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |     params["url"] = f"{base_url}{brave_category}?{urlencode(args)}" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  |     # set properties in the cookies | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  |     params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off') | 
					
						
							|  |  |  |     # the useLocation is IP based, we use cookie 'country' for the region | 
					
						
							|  |  |  |     params['cookies']['useLocation'] = '0' | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |     params['cookies']['summarizer'] = '0' | 
					
						
							| 
									
										
										
										
											2023-08-05 19:46:04 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  |     engine_region = traits.get_region(params['searxng_locale'], 'all') | 
					
						
							|  |  |  |     params['cookies']['country'] = engine_region.split('-')[-1].lower()  # type: ignore | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  |     ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us') | 
					
						
							|  |  |  |     params['cookies']['ui_lang'] = ui_lang | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     logger.debug("cookies %s", params['cookies']) | 
					
						
							| 
									
										
										
										
											2023-08-05 19:46:04 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-05 20:25:10 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-09 17:03:07 +01:00
										 |  |  | def _extract_published_date(published_date_raw): | 
					
						
							|  |  |  |     if published_date_raw is None: | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         return parser.parse(published_date_raw) | 
					
						
							|  |  |  |     except parser.ParserError: | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-05 19:46:04 +02:00
										 |  |  | def response(resp): | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-10 05:14:54 +01:00
										 |  |  |     if brave_category in ('search', 'goggles'): | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |         return _parse_search(resp) | 
					
						
							| 
									
										
										
										
											2023-08-05 19:46:04 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-05 20:25:10 +02:00
										 |  |  |     datastr = "" | 
					
						
							|  |  |  |     for line in resp.text.split("\n"): | 
					
						
							|  |  |  |         if "const data = " in line: | 
					
						
							|  |  |  |             datastr = line.replace("const data = ", "").strip()[:-1] | 
					
						
							|  |  |  |             break | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-09 12:18:39 +02:00
										 |  |  |     json_data = js_variable_to_python(datastr) | 
					
						
							| 
									
										
										
										
											2023-08-05 20:35:04 +02:00
										 |  |  |     json_resp = json_data[1]['data']['body']['response'] | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if brave_category == 'news': | 
					
						
							| 
									
										
										
										
											2023-09-15 20:57:03 +02:00
										 |  |  |         return _parse_news(json_resp['news']) | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if brave_category == 'images': | 
					
						
							|  |  |  |         return _parse_images(json_resp) | 
					
						
							|  |  |  |     if brave_category == 'videos': | 
					
						
							|  |  |  |         return _parse_videos(json_resp) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-10 12:19:03 +02:00
										 |  |  |     raise ValueError(f"Unsupported brave category: {brave_category}") | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def _parse_search(resp): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     result_list = [] | 
					
						
							|  |  |  |     dom = html.fromstring(resp.text) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None) | 
					
						
							|  |  |  |     if answer_tag: | 
					
						
							| 
									
										
										
										
											2023-08-08 18:18:28 +02:00
										 |  |  |         url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None) | 
					
						
							|  |  |  |         result_list.append({'answer': extract_text(answer_tag), 'url': url}) | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]' | 
					
						
							| 
									
										
										
										
											2023-09-12 09:23:24 +02:00
										 |  |  |     xpath_results = '//div[contains(@class, "snippet ")]' | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     for result in eval_xpath_list(dom, xpath_results): | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-12 09:23:24 +02:00
										 |  |  |         url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None) | 
					
						
							| 
									
										
										
										
											2024-01-29 17:58:53 +01:00
										 |  |  |         title_tag = eval_xpath_getindex( | 
					
						
							|  |  |  |             result, './/a[contains(@class, "h")]//div[contains(@class, "title")]', 0, default=None | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-09-21 15:30:00 +02:00
										 |  |  |         if url is None or title_tag is None or not urlparse(url).netloc:  # partial url likely means it's an ad | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-09 17:03:07 +01:00
										 |  |  |         content_tag = eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='') | 
					
						
							|  |  |  |         pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")') | 
					
						
							| 
									
										
										
										
											2024-05-12 17:52:52 +02:00
										 |  |  |         thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='') | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         item = { | 
					
						
							|  |  |  |             'url': url, | 
					
						
							|  |  |  |             'title': extract_text(title_tag), | 
					
						
							|  |  |  |             'content': extract_text(content_tag), | 
					
						
							| 
									
										
										
										
											2024-01-09 17:03:07 +01:00
										 |  |  |             'publishedDate': _extract_published_date(pub_date_raw), | 
					
						
							| 
									
										
										
										
											2024-05-12 17:52:52 +02:00
										 |  |  |             'thumbnail': thumbnail, | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         video_tag = eval_xpath_getindex( | 
					
						
							|  |  |  |             result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  |         if video_tag is not None: | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-15 09:53:03 +02:00
										 |  |  |             # In my tests a video tag in the WEB search was most often not a | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |             # video, except the ones from youtube .. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             iframe_src = _get_iframe_src(url) | 
					
						
							|  |  |  |             if iframe_src: | 
					
						
							|  |  |  |                 item['iframe_src'] = iframe_src | 
					
						
							|  |  |  |                 item['template'] = 'videos.html' | 
					
						
							|  |  |  |                 item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='') | 
					
						
							| 
									
										
										
										
											2024-01-09 17:03:07 +01:00
										 |  |  |                 pub_date_raw = extract_text( | 
					
						
							|  |  |  |                     eval_xpath(video_tag, './/div[contains(@class, "snippet-attributes")]/div/text()') | 
					
						
							|  |  |  |                 ) | 
					
						
							|  |  |  |                 item['publishedDate'] = _extract_published_date(pub_date_raw) | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |             else: | 
					
						
							| 
									
										
										
										
											2024-05-12 17:52:52 +02:00
										 |  |  |                 item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='') | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         result_list.append(item) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return result_list | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def _get_iframe_src(url): | 
					
						
							|  |  |  |     parsed_url = urlparse(url) | 
					
						
							|  |  |  |     if parsed_url.path == '/watch' and parsed_url.query: | 
					
						
							|  |  |  |         video_id = parse_qs(parsed_url.query).get('v', [])  # type: ignore | 
					
						
							|  |  |  |         if video_id: | 
					
						
							|  |  |  |             return 'https://www.youtube-nocookie.com/embed/' + video_id[0]  # type: ignore | 
					
						
							|  |  |  |     return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def _parse_news(json_resp): | 
					
						
							|  |  |  |     result_list = [] | 
					
						
							| 
									
										
										
										
											2023-08-05 20:25:10 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-05 20:35:04 +02:00
										 |  |  |     for result in json_resp["results"]: | 
					
						
							| 
									
										
										
										
											2023-08-05 20:25:10 +02:00
										 |  |  |         item = { | 
					
						
							|  |  |  |             'url': result['url'], | 
					
						
							|  |  |  |             'title': result['title'], | 
					
						
							|  |  |  |             'content': result['description'], | 
					
						
							| 
									
										
										
										
											2024-01-09 17:03:07 +01:00
										 |  |  |             'publishedDate': _extract_published_date(result['age']), | 
					
						
							| 
									
										
										
										
											2023-08-05 20:25:10 +02:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2023-09-15 20:57:03 +02:00
										 |  |  |         if result['thumbnail'] is not None: | 
					
						
							| 
									
										
										
										
											2024-05-12 17:52:52 +02:00
										 |  |  |             item['thumbnail'] = result['thumbnail']['src'] | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |         result_list.append(item) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return result_list | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def _parse_images(json_resp): | 
					
						
							|  |  |  |     result_list = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for result in json_resp["results"]: | 
					
						
							|  |  |  |         item = { | 
					
						
							|  |  |  |             'url': result['url'], | 
					
						
							|  |  |  |             'title': result['title'], | 
					
						
							|  |  |  |             'content': result['description'], | 
					
						
							|  |  |  |             'template': 'images.html', | 
					
						
							| 
									
										
										
										
											2024-02-20 10:51:58 +01:00
										 |  |  |             'resolution': result['properties']['format'], | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |             'source': result['source'], | 
					
						
							|  |  |  |             'img_src': result['properties']['url'], | 
					
						
							| 
									
										
										
										
											2024-02-20 10:51:58 +01:00
										 |  |  |             'thumbnail_src': result['thumbnail']['src'], | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |         } | 
					
						
							|  |  |  |         result_list.append(item) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return result_list | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def _parse_videos(json_resp): | 
					
						
							|  |  |  |     result_list = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for result in json_resp["results"]: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         url = result['url'] | 
					
						
							|  |  |  |         item = { | 
					
						
							|  |  |  |             'url': url, | 
					
						
							|  |  |  |             'title': result['title'], | 
					
						
							|  |  |  |             'content': result['description'], | 
					
						
							|  |  |  |             'template': 'videos.html', | 
					
						
							|  |  |  |             'length': result['video']['duration'], | 
					
						
							|  |  |  |             'duration': result['video']['duration'], | 
					
						
							| 
									
										
										
										
											2024-01-09 17:03:07 +01:00
										 |  |  |             'publishedDate': _extract_published_date(result['age']), | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-15 21:33:23 +02:00
										 |  |  |         if result['thumbnail'] is not None: | 
					
						
							| 
									
										
										
										
											2023-08-05 20:25:10 +02:00
										 |  |  |             item['thumbnail'] = result['thumbnail']['src'] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |         iframe_src = _get_iframe_src(url) | 
					
						
							|  |  |  |         if iframe_src: | 
					
						
							|  |  |  |             item['iframe_src'] = iframe_src | 
					
						
							| 
									
										
										
										
											2023-08-05 20:35:04 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |         result_list.append(item) | 
					
						
							| 
									
										
										
										
											2023-08-05 20:25:10 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-06 19:35:56 +02:00
										 |  |  |     return result_list | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def fetch_traits(engine_traits: EngineTraits): | 
					
						
							|  |  |  |     """Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
 | 
					
						
							|  |  |  |     regions>` from Brave."""
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-26 12:59:56 +02:00
										 |  |  |     # pylint: disable=import-outside-toplevel, too-many-branches | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     import babel.languages | 
					
						
							|  |  |  |     from searx.locales import region_tag, language_tag | 
					
						
							|  |  |  |     from searx.network import get  # see https://github.com/searxng/searxng/issues/762 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     engine_traits.custom["ui_lang"] = {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     headers = { | 
					
						
							|  |  |  |         'Accept-Encoding': 'gzip, deflate', | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     lang_map = {'no': 'nb'}  # norway | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # languages (UI) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     resp = get('https://search.brave.com/settings', headers=headers) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if not resp.ok:  # type: ignore | 
					
						
							|  |  |  |         print("ERROR: response from Brave is not OK.") | 
					
						
							|  |  |  |     dom = html.fromstring(resp.text)  # type: ignore | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for option in dom.xpath('//div[@id="language-select"]//option'): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         ui_lang = option.get('value') | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             if '-' in ui_lang: | 
					
						
							|  |  |  |                 sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-')) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 sxng_tag = language_tag(babel.Locale.parse(ui_lang)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         except babel.UnknownLocaleError: | 
					
						
							|  |  |  |             print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang) | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         conflict = engine_traits.custom["ui_lang"].get(sxng_tag) | 
					
						
							|  |  |  |         if conflict: | 
					
						
							|  |  |  |             if conflict != ui_lang: | 
					
						
							|  |  |  |                 print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang)) | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         engine_traits.custom["ui_lang"][sxng_tag] = ui_lang | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # search regions of brave | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-26 12:59:56 +02:00
										 |  |  |     resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js', headers=headers) | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-26 12:59:56 +02:00
										 |  |  |     if not resp.ok:  # type: ignore | 
					
						
							|  |  |  |         print("ERROR: response from Brave is not OK.") | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-26 12:59:56 +02:00
										 |  |  |     country_js = resp.text[resp.text.index("options:{all") + len('options:') :] | 
					
						
							|  |  |  |     country_js = country_js[: country_js.index("},k={default")] | 
					
						
							|  |  |  |     country_tags = js_variable_to_python(country_js) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for k, v in country_tags.items(): | 
					
						
							|  |  |  |         if k == 'all': | 
					
						
							|  |  |  |             engine_traits.all_locale = 'all' | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         country_tag = v['value'] | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-15 09:53:03 +02:00
										 |  |  |         # add official languages of the country .. | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  |         for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True): | 
					
						
							|  |  |  |             lang_tag = lang_map.get(lang_tag, lang_tag) | 
					
						
							|  |  |  |             sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper()))) | 
					
						
							| 
									
										
										
										
											2023-09-26 12:59:56 +02:00
										 |  |  |             # print("%-20s: %s <-- %s" % (v['label'], country_tag, sxng_tag)) | 
					
						
							| 
									
										
										
										
											2023-08-08 11:20:10 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |             conflict = engine_traits.regions.get(sxng_tag) | 
					
						
							|  |  |  |             if conflict: | 
					
						
							|  |  |  |                 if conflict != country_tag: | 
					
						
							|  |  |  |                     print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag)) | 
					
						
							|  |  |  |                     continue | 
					
						
							|  |  |  |             engine_traits.regions[sxng_tag] = country_tag |