| 
									
										
										
										
											2015-05-02 15:45:17 +02:00
"""
 general mediawiki-engine (Web)

 @website     websites built on mediawiki (https://www.mediawiki.org)
 @provide-api yes (http://www.mediawiki.org/wiki/API:Search)

 @using-api   yes
 @results     JSON
 @stable      yes
 @parse       url, title

 @todo        content
"""
 | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-23 23:53:27 +02:00
										 |  |  | from json import loads | 
					
						
							| 
									
										
										
										
											2014-09-04 21:19:11 +02:00
										 |  |  | from string import Formatter | 
					
						
							| 
									
										
										
										
											2020-08-06 17:42:46 +02:00
										 |  |  | from urllib.parse import urlencode, quote | 
					
						
							| 
									
										
										
										
											2013-10-23 23:53:27 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | # engine dependent config | 
					
						
							|  |  |  | categories = ['general'] | 
					
						
							|  |  |  | language_support = True | 
					
						
							|  |  |  | paging = True | 
					
						
							|  |  |  | number_of_results = 1 | 
					
						
							| 
									
										
										
										
											2017-05-18 22:19:44 +02:00
										 |  |  | search_type = 'nearmatch'  # possible values: title, text, nearmatch | 
					
						
							| 
									
										
										
										
											2014-09-04 21:19:11 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | # search-url | 
					
						
							|  |  |  | base_url = 'https://{language}.wikipedia.org/' | 
					
						
							| 
									
										
										
										
											2016-02-08 21:51:47 +01:00
										 |  |  | search_postfix = 'w/api.php?action=query'\ | 
					
						
							|  |  |  |     '&list=search'\ | 
					
						
							|  |  |  |     '&{query}'\ | 
					
						
							|  |  |  |     '&format=json'\ | 
					
						
							|  |  |  |     '&sroffset={offset}'\ | 
					
						
							| 
									
										
										
										
											2016-02-09 21:21:59 +01:00
										 |  |  |     '&srlimit={limit}'\ | 
					
						
							| 
									
										
										
										
											2017-05-18 22:04:31 +02:00
										 |  |  |     '&srwhat={searchtype}' | 
					
						
							| 
									
										
										
										
											2013-10-23 23:53:27 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | # do search-request | 
					
						
							|  |  |  | def request(query, params): | 
					
						
							|  |  |  |     offset = (params['pageno'] - 1) * number_of_results | 
					
						
							| 
									
										
										
										
											2014-12-16 17:10:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-04 21:19:11 +02:00
										 |  |  |     string_args = dict(query=urlencode({'srsearch': query}), | 
					
						
							| 
									
										
										
										
											2014-12-16 17:10:20 +01:00
										 |  |  |                        offset=offset, | 
					
						
							| 
									
										
										
										
											2017-05-18 21:34:54 +02:00
										 |  |  |                        limit=number_of_results, | 
					
						
							|  |  |  |                        searchtype=search_type) | 
					
						
							| 
									
										
										
										
											2014-12-16 17:10:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-04 23:53:13 +02:00
										 |  |  |     format_strings = list(Formatter().parse(base_url)) | 
					
						
							| 
									
										
										
										
											2013-10-23 23:53:27 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-01-06 15:27:46 +01:00
										 |  |  |     if params['language'] == 'all': | 
					
						
							|  |  |  |         language = 'en' | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         language = params['language'].split('-')[0] | 
					
						
							| 
									
										
										
										
											2014-09-04 21:19:11 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-02-08 21:51:47 +01:00
										 |  |  |     # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)] | 
					
						
							|  |  |  |     if any(x[1] == 'language' for x in format_strings): | 
					
						
							| 
									
										
										
										
											2014-09-04 21:19:11 +02:00
										 |  |  |         string_args['language'] = language | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  |     # write search-language back to params, required in response | 
					
						
							|  |  |  |     params['language'] = language | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-02-08 21:51:47 +01:00
										 |  |  |     search_url = base_url + search_postfix | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-04 21:19:11 +02:00
										 |  |  |     params['url'] = search_url.format(**string_args) | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-23 23:53:27 +02:00
										 |  |  |     return params | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | # get response from search-request | 
					
						
							| 
									
										
										
										
											2013-10-23 23:53:27 +02:00
										 |  |  | def response(resp): | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  |     results = [] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-23 23:53:27 +02:00
										 |  |  |     search_results = loads(resp.text) | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # return empty array if there are no results | 
					
						
							|  |  |  |     if not search_results.get('query', {}).get('search'): | 
					
						
							|  |  |  |         return [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # parse results | 
					
						
							|  |  |  |     for result in search_results['query']['search']: | 
					
						
							| 
									
										
										
										
											2016-02-09 21:21:59 +01:00
										 |  |  |         if result.get('snippet', '').startswith('#REDIRECT'): | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2014-12-16 17:10:20 +01:00
										 |  |  |         url = base_url.format(language=resp.search_params['language']) +\ | 
					
						
							| 
									
										
										
										
											2020-08-06 17:42:46 +02:00
										 |  |  |             'wiki/' + quote(result['title'].replace(' ', '_').encode()) | 
					
						
							| 
									
										
										
										
											2014-09-04 21:19:11 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-03 11:40:29 +02:00
										 |  |  |         # append result | 
					
						
							|  |  |  |         results.append({'url': url, | 
					
						
							|  |  |  |                         'title': result['title'], | 
					
						
							|  |  |  |                         'content': ''}) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # return results | 
					
						
							|  |  |  |     return results |