| 
									
										
										
										
											2021-01-13 11:31:25 +01:00
										 |  |  | # SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  | """The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
 | 
					
						
							|  |  |  | the `MediaWiki Action API`_.  For a `query action`_ all Wikimedia wikis have | 
					
						
							|  |  |  | endpoints that follow this pattern:: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     https://{base_url}/w/api.php?action=query&list=search&format=json | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | .. note:: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |    In its current state, this engine is implemented to parse JSON result | 
					
						
							|  |  |  |    (`format=json`_) from a search query (`list=search`_).  If you need other | 
					
						
							|  |  |  |    ``action`` and ``list`` types ask SearXNG developers to extend the | 
					
						
							|  |  |  |    implementation according to your needs. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | .. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page | 
					
						
							|  |  |  | .. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query | 
					
						
							|  |  |  | .. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch | 
					
						
							|  |  |  | .. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Configuration | 
					
						
							|  |  |  | ============= | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Request: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | - :py:obj:`base_url` | 
					
						
							|  |  |  | - :py:obj:`search_type` | 
					
						
							|  |  |  | - :py:obj:`srenablerewrites` | 
					
						
							|  |  |  | - :py:obj:`srsort` | 
					
						
							|  |  |  | - :py:obj:`srprop` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Implementations | 
					
						
							|  |  |  | =============== | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-05-02 15:45:17 +02:00
										 |  |  | """
 | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  | from __future__ import annotations | 
					
						
							|  |  |  | from typing import TYPE_CHECKING | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  | from datetime import datetime | 
					
						
							| 
									
										
										
										
											2020-08-06 17:42:46 +02:00
										 |  |  | from urllib.parse import urlencode, quote | 
					
						
							| 
									
										
										
										
											2013-10-23 23:53:27 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-03 19:07:22 +02:00
										 |  |  | from searx.utils import html_to_text | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  | from searx.enginelib.traits import EngineTraits | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | if TYPE_CHECKING: | 
					
						
							|  |  |  |     import logging | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     logger: logging.Logger | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | traits: EngineTraits | 
					
						
							| 
									
										
										
										
											2023-08-03 19:07:22 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-13 11:31:25 +01:00
										 |  |  | # about | 
					
						
							|  |  |  | about = { | 
					
						
							|  |  |  |     "website": None, | 
					
						
							|  |  |  |     "wikidata_id": None, | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  |     "official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query', | 
					
						
							| 
									
										
										
										
											2021-01-13 11:31:25 +01:00
										 |  |  |     "use_official_api": True, | 
					
						
							|  |  |  |     "require_api_key": False, | 
					
						
							|  |  |  |     "results": 'JSON', | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | # engine dependent config | 
					
						
							|  |  |  | categories = ['general'] | 
					
						
							|  |  |  | paging = True | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  | number_of_results = 5 | 
					
						
							| 
									
										
										
										
											2014-12-16 17:10:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  | search_type: str = 'nearmatch' | 
					
						
							|  |  |  | """Which type of search to perform.  One of the following values: ``nearmatch``,
 | 
					
						
							|  |  |  | ``text`` or ``title``. | 
					
						
							| 
									
										
										
										
											2014-12-16 17:10:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  | See ``srwhat`` argument in `list=search`_ documentation. | 
					
						
							|  |  |  | """
 | 
					
						
							| 
									
										
										
										
											2013-10-23 23:53:27 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  | srenablerewrites: bool = True | 
					
						
							|  |  |  | """Enable internal query rewriting (Type: boolean).  Some search backends can
 | 
					
						
							|  |  |  | rewrite the query into another which is thought to provide better results, for | 
					
						
							|  |  |  | instance by correcting spelling errors. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | See ``srenablerewrites`` argument in `list=search`_ documentation. | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | srsort: str = 'relevance' | 
					
						
							|  |  |  | """Set the sort order of returned results.  One of the following values:
 | 
					
						
							|  |  |  | ``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``, | 
					
						
							|  |  |  | ``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``, | 
					
						
							|  |  |  | ``none``, ``random``, ``relevance``, ``user_random``. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | See ``srenablerewrites`` argument in `list=search`_ documentation. | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet' | 
					
						
							|  |  |  | """Which properties to return.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | See ``srprop`` argument in `list=search`_ documentation. | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | base_url: str = 'https://{language}.wikipedia.org/' | 
					
						
							|  |  |  | """Base URL of the Wikimedia wiki.
 | 
					
						
							| 
									
										
										
										
											2014-09-04 21:19:11 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  | ``{language}``: | 
					
						
							|  |  |  |   ISO 639-1 language code (en, de, fr ..) of the search language. | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-07-02 20:43:23 +02:00
										 |  |  | api_path: str = 'w/api.php' | 
					
						
							|  |  |  | """The path the PHP api is listening on.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The default path should work fine usually. | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  | timestamp_format = '%Y-%m-%dT%H:%M:%SZ' | 
					
						
							|  |  |  | """The longhand version of MediaWiki time strings.""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def request(query, params): | 
					
						
							| 
									
										
										
										
											2014-09-04 21:19:11 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  |     # write search-language back to params, required in response | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  |     if params['language'] == 'all': | 
					
						
							|  |  |  |         params['language'] = 'en' | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         params['language'] = params['language'].split('-')[0] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-07-02 20:43:23 +02:00
										 |  |  |     api_url = f"{base_url.rstrip('/')}/{api_path}?".format(language=params['language']) | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  |     offset = (params['pageno'] - 1) * number_of_results | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  |     args = { | 
					
						
							|  |  |  |         'action': 'query', | 
					
						
							|  |  |  |         'list': 'search', | 
					
						
							|  |  |  |         'format': 'json', | 
					
						
							|  |  |  |         'srsearch': query, | 
					
						
							|  |  |  |         'sroffset': offset, | 
					
						
							|  |  |  |         'srlimit': number_of_results, | 
					
						
							|  |  |  |         'srwhat': search_type, | 
					
						
							|  |  |  |         'srprop': srprop, | 
					
						
							|  |  |  |         'srsort': srsort, | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if srenablerewrites: | 
					
						
							|  |  |  |         args['srenablerewrites'] = '1' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     params['url'] = api_url + urlencode(args) | 
					
						
							| 
									
										
										
										
											2013-10-23 23:53:27 +02:00
										 |  |  |     return params | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | # get response from search-request | 
					
						
							| 
									
										
										
										
											2013-10-23 23:53:27 +02:00
										 |  |  | def response(resp): | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  |     results = [] | 
					
						
							| 
									
										
										
										
											2023-08-03 19:07:22 +02:00
										 |  |  |     search_results = resp.json() | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # return empty array if there are no results | 
					
						
							|  |  |  |     if not search_results.get('query', {}).get('search'): | 
					
						
							|  |  |  |         return [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for result in search_results['query']['search']: | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-02-09 21:21:59 +01:00
										 |  |  |         if result.get('snippet', '').startswith('#REDIRECT'): | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         title = result['title'] | 
					
						
							|  |  |  |         sectiontitle = result.get('sectiontitle') | 
					
						
							|  |  |  |         content = html_to_text(result.get('snippet', '')) | 
					
						
							|  |  |  |         metadata = html_to_text(result.get('categorysnippet', '')) | 
					
						
							|  |  |  |         timestamp = result.get('timestamp') | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |         url = ( | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  |             base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode()) | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  |         if sectiontitle: | 
					
						
							|  |  |  |             # in case of sectiontitle create a link to the section in the wiki page | 
					
						
							|  |  |  |             url += '#' + quote(sectiontitle.replace(' ', '_').encode()) | 
					
						
							|  |  |  |             title += ' / ' + sectiontitle | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         item = {'url': url, 'title': title, 'content': content, 'metadata': metadata} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if timestamp: | 
					
						
							|  |  |  |             item['publishedDate'] = datetime.strptime(timestamp, timestamp_format) | 
					
						
							| 
									
										
										
										
											2014-09-04 21:19:11 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-04 16:54:22 +02:00
										 |  |  |         results.append(item) | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # return results | 
					
						
							|  |  |  |     return results |