[mod] result scoring separated

parent fa6b980edd
commit 9688495b9f

@@ -123,6 +123,46 @@ def highlight_content(content, query):
 
     return content
 
+def score_results(results):
+    flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
+    flat_len = len(flat_res)
+    engines_len = len(results)
+    results = []
+    # deduplication + scoring
+    for i,res in enumerate(flat_res):
+        res['parsed_url'] = urlparse(res['url'])
+        res['engines'] = [res['engine']]
+        weight = 1.0
+        if hasattr(engines[res['engine']], 'weight'):
+            weight = float(engines[res['engine']].weight)
+        elif res['engine'] in settings.weights:
+            weight = float(settings.weights[res['engine']])
+        score = int((flat_len - i)/engines_len)*weight+1
+        duplicated = False
+        for new_res in results:
+            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path
+            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path
+            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
+               p1 == p2 and\
+               res['parsed_url'].query == new_res['parsed_url'].query and\
+               res.get('template') == new_res.get('template'):
+                duplicated = new_res
+                break
+        if duplicated:
+            if len(res.get('content', '')) > len(duplicated.get('content', '')):
+                duplicated['content'] = res['content']
+            duplicated['score'] += score
+            duplicated['engines'].append(res['engine'])
+            if duplicated['parsed_url'].scheme == 'https':
+                continue
+            elif res['parsed_url'].scheme == 'https':
+                duplicated['url'] = res['parsed_url'].geturl()
+                duplicated['parsed_url'] = res['parsed_url']
+        else:
+            res['score'] = score
+            results.append(res)
+    return sorted(results, key=itemgetter('score'), reverse=True)
+
 def search(query, request, selected_engines):
     global engines, categories, number_of_searches
     requests = []
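
The extracted score_results() helper keeps the original behaviour: the per-engine result lists are interleaved round-robin (izip_longest plus chain.from_iterable, with the padding entries filtered out), and each result is scored from its interleaved position, multiplied by an optional per-engine weight. Below is a minimal sketch of just that interleaving and scoring step, rewritten for Python 3 (the diff targets Python 2, hence izip_longest), with made-up engine names and deduplication left out:

from itertools import chain, zip_longest   # Python 3 equivalent of izip_longest

# Hypothetical engine output, keyed by engine name (illustrative data only)
results = {
    'engine_a': [{'url': 'https://a.example/1'}, {'url': 'https://a.example/2'}],
    'engine_b': [{'url': 'https://b.example/1'}],
}

# Interleave the per-engine lists round-robin and drop the padding Nones
flat_res = [r for r in chain.from_iterable(zip_longest(*results.values())) if r]
flat_len = len(flat_res)      # 3
engines_len = len(results)    # 2

for i, res in enumerate(flat_res):
    weight = 1.0              # engines can override this via a 'weight' attribute or settings.weights
    score = int((flat_len - i) / engines_len) * weight + 1
    print(res['url'], score)  # positions 0, 1, 2 -> scores 2.0, 2.0, 1.0

With three results from two engines, positions 0, 1 and 2 score 2.0, 2.0 and 1.0, so results ranked higher by their own engine come out ahead after sorting.
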
@@ -165,43 +205,8 @@ def search(query, request, selected_engines):
     for engine_name,engine_results in results.items():
         engines[engine_name].stats['search_count'] += 1
         engines[engine_name].stats['result_count'] += len(engine_results)
-    flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
-    flat_len = len(flat_res)
-    engines_len = len(selected_engines)
-    results = []
-    # deduplication + scoring
-    for i,res in enumerate(flat_res):
-        res['parsed_url'] = urlparse(res['url'])
-        res['engines'] = [res['engine']]
-        weight = 1.0
-        if hasattr(engines[res['engine']], 'weight'):
-            weight = float(engines[res['engine']].weight)
-        elif res['engine'] in settings.weights:
-            weight = float(settings.weights[res['engine']])
-        score = int((flat_len - i)/engines_len)*weight+1
-        duplicated = False
-        for new_res in results:
-            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path
-            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path
-            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
-               p1 == p2 and\
-               res['parsed_url'].query == new_res['parsed_url'].query and\
-               res.get('template') == new_res.get('template'):
-                duplicated = new_res
-                break
-        if duplicated:
-            if len(res.get('content', '')) > len(duplicated.get('content', '')):
-                duplicated['content'] = res['content']
-            duplicated['score'] += score
-            duplicated['engines'].append(res['engine'])
-            if duplicated['parsed_url'].scheme == 'https':
-                continue
-            elif res['parsed_url'].scheme == 'https':
-                duplicated['url'] = res['parsed_url'].geturl()
-                duplicated['parsed_url'] = res['parsed_url']
-        else:
-            res['score'] = score
-            results.append(res)
+
+    results = score_results(results)
 
     for result in results:
         if 'content' in result:
@@ -209,7 +214,7 @@ def search(query, request, selected_engines):
         for res_engine in result['engines']:
             engines[result['engine']].stats['score_count'] += result['score']
 
-    return sorted(results, key=itemgetter('score'), reverse=True)
+    return results
 
 def get_engines_stats():
     pageloads = []
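
In the moved code, two results count as duplicates when they share the host, the path (ignoring a trailing slash), the query string and the result template; the surviving entry keeps the longer content snippet, accumulates the score and engine list, and prefers the https URL. A condensed Python 3 illustration of just that duplicate test (the helper name same_result is not part of the codebase):

from urllib.parse import urlparse   # the diff uses Python 2's urlparse module

def same_result(a, b):
    """Duplicate test mirroring score_results(): host, path (ignoring a
    trailing slash), query string and template must all match."""
    pa, pb = urlparse(a['url']), urlparse(b['url'])

    def strip_slash(path):
        return path[:-1] if path.endswith('/') else path

    return (pa.netloc == pb.netloc
            and strip_slash(pa.path) == strip_slash(pb.path)
            and pa.query == pb.query
            and a.get('template') == b.get('template'))

print(same_result({'url': 'https://example.org/page/'},
                  {'url': 'http://example.org/page'}))   # True -> merged; the https URL is kept

Since score_results() already returns the list sorted by score, search() now ends with a plain return results instead of sorting a second time.
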