[fix] duplicate results for www. and non-www. domains
This commit is contained in:
		
							parent
							
								
									78d42f094c
								
							
						
					
					
						commit
						b226e6462b
					
				| @ -154,16 +154,24 @@ def score_results(results): | |||||||
|     # deduplication + scoring |     # deduplication + scoring | ||||||
|     for i, res in enumerate(flat_res): |     for i, res in enumerate(flat_res): | ||||||
|         res['parsed_url'] = urlparse(res['url']) |         res['parsed_url'] = urlparse(res['url']) | ||||||
|  |         res['host'] = res['parsed_url'].netloc | ||||||
|  | 
 | ||||||
|  |         if res['host'].startswith('www.'): | ||||||
|  |             res['host'] = res['host'].replace('www.', '', 1) | ||||||
|  | 
 | ||||||
|         res['engines'] = [res['engine']] |         res['engines'] = [res['engine']] | ||||||
|         weight = 1.0 |         weight = 1.0 | ||||||
|  | 
 | ||||||
|         if hasattr(engines[res['engine']], 'weight'): |         if hasattr(engines[res['engine']], 'weight'): | ||||||
|             weight = float(engines[res['engine']].weight) |             weight = float(engines[res['engine']].weight) | ||||||
|  | 
 | ||||||
|         score = int((flat_len - i) / engines_len) * weight + 1 |         score = int((flat_len - i) / engines_len) * weight + 1 | ||||||
|         duplicated = False |         duplicated = False | ||||||
|  | 
 | ||||||
|         for new_res in results: |         for new_res in results: | ||||||
|             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa |             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa | ||||||
|             p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa |             p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa | ||||||
|             if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\ |             if res['host'] == new_res['host'] and\ | ||||||
|                p1 == p2 and\ |                p1 == p2 and\ | ||||||
|                res['parsed_url'].query == new_res['parsed_url'].query and\ |                res['parsed_url'].query == new_res['parsed_url'].query and\ | ||||||
|                res.get('template') == new_res.get('template'): |                res.get('template') == new_res.get('template'): | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user