[feat] plugins: new rerank results plugin
This commit is contained in:
		
							parent
							
								
									738906358b
								
							
						
					
					
						commit
						9f9ae5427d
					
				
							
								
								
									
										77
									
								
								searx/plugins/rerank.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										77
									
								
								searx/plugins/rerank.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,77 @@ | |||||||
|  | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
|  | """Plugin which reranks the search results using the Okapi BM25 algorithm. | ||||||
|  | Before enabling the Rerank plugin, you must install the pip package ``bm25s``. | ||||||
|  | 
 | ||||||
|  | Enable in ``settings.yml``: | ||||||
|  | 
 | ||||||
|  | .. code:: yaml | ||||||
|  | 
 | ||||||
|  |   enabled_plugins: | ||||||
|  |     .. | ||||||
|  |     - 'Rerank plugin' | ||||||
|  | 
 | ||||||
|  | By default, the engine list is retained, so results found by multiple engines receive a score boost. | ||||||
|  | The following setting can be used to ensure that the engine list only contains the first engine. | ||||||
|  | This will prevent overlapping search engine results from affecting the ranking: | ||||||
|  | 
 | ||||||
|  | .. code:: yaml | ||||||
|  | 
 | ||||||
|  |   rerank: | ||||||
|  |     remove_extra_engines: true | ||||||
|  | 
 | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | from searx import settings | ||||||
|  | 
 | ||||||
|  | try: | ||||||
|  |     import bm25s | ||||||
|  | except ImportError: | ||||||
|  |     # Import error is ignored because the admin has to install bm25s manually to use the plugin | ||||||
|  |     pass | ||||||
|  | 
 | ||||||
# Module-level plugin attributes (presumably read by the plugin loader -- confirm
# against the SearXNG plugin documentation).
name = 'Rerank plugin'
description = 'Rerank search results, ignoring original engine ranking'
default_on = False
preference_section = 'general'

# Language codes for which a dedicated stopword list is requested from bm25s.
# Any other locale falls back to the bm25s default ('en').
stopword_langs = ['en', 'de', 'nl', 'fr', 'es', 'pt', 'it', 'ru', 'sv', 'no', 'zh']

# When true, each result keeps only its main engine, so overlapping engine
# results no longer influence the ranking.
# ``settings.get('rerank')`` is None when the ``rerank:`` key exists but is left
# empty in settings.yml; ``or {}`` keeps the lookup from raising AttributeError.
remove_extra_engines = (settings.get('rerank') or {}).get('remove_extra_engines', False)
|  | 
 | ||||||
|  | 
 | ||||||
def post_search(_request, search):
    """Rerank the merged search results with the Okapi BM25 algorithm.

    For every merged result the title, content and URL are joined into one
    document.  The documents are indexed with ``bm25s`` and scored against the
    search query.  The resulting order is written back into each result's
    ``positions`` list, replacing the positions reported by the engines.

    When ``remove_extra_engines`` is set, a result keeps a single position and
    only its main engine.  Otherwise the new position is repeated once per
    original position, so the number of positions is preserved.

    :param _request: unused.
    :param search: search object; ``search.result_container._merged_results``,
        ``search.search_query.query`` and ``search.search_query.locale`` are read.
    :returns: always ``True``.
    """
    # pylint: disable=protected-access
    results = search.result_container._merged_results
    if not results:
        # Nothing to rerank: do not build a BM25 index over an empty corpus
        # or ask the retriever for k=0 documents.
        return True

    query = search.search_query.query
    locale = search.search_query.locale

    # Determine the stopwords based on the selected locale.
    # ``True`` lets bm25s fall back to its default stopword list.
    stopwords = locale.language if locale and locale.language in stopword_langs else True

    documents = [
        f"{result.get('title', '')} | {result.get('content', '')} | {result.get('url', '')}" for result in results
    ]

    # Progress bars are disabled everywhere: this runs once per search request
    # and must not write progress output to the server's stderr.
    retriever = bm25s.BM25()
    retriever.index(bm25s.tokenize(documents, stopwords=stopwords, show_progress=False), show_progress=False)
    query_tokens = bm25s.tokenize(query, stopwords=stopwords, show_progress=False)

    # Without a corpus attached to the retriever, return_as='documents' yields
    # the ranked indices into ``results``; row 0 belongs to our single query.
    indices = retriever.retrieve(query_tokens, k=len(results), return_as='documents', show_progress=False)

    for position, index in enumerate(indices[0], start=1):
        result = results[index]
        if 'positions' not in result:
            continue
        if remove_extra_engines:
            # Only keep the main engine and set our ranking
            result['positions'] = [position]
            result['engines'] = {result['engine']}
        else:
            # Overwrite all engine positions with the new ranking.
            # Results returned from multiple engines keep one entry per engine,
            # so they will still get a score boost.
            result['positions'] = [position] * len(result['positions'])

    return True
		Loading…
	
		Reference in New Issue
	
	Block a user