[fix] compare unquoted URL paths to avoid duplicate results
This commit is contained in:
		
							parent
							
								
									cebf5868b3
								
							
						
					
					
						commit
						a07b2b514c
					
				| @ -21,7 +21,7 @@ import sys | |||||||
| from imp import load_source | from imp import load_source | ||||||
| from itertools import izip_longest, chain | from itertools import izip_longest, chain | ||||||
| from operator import itemgetter | from operator import itemgetter | ||||||
| from urlparse import urlparse | from urlparse import urlparse, unquote | ||||||
| from datetime import datetime | from datetime import datetime | ||||||
| import grequests | import grequests | ||||||
| from flask.ext.babel import gettext | from flask.ext.babel import gettext | ||||||
| @ -153,7 +153,9 @@ def score_results(results): | |||||||
|     results = [] |     results = [] | ||||||
|     # deduplication + scoring |     # deduplication + scoring | ||||||
|     for i, res in enumerate(flat_res): |     for i, res in enumerate(flat_res): | ||||||
|  | 
 | ||||||
|         res['parsed_url'] = urlparse(res['url']) |         res['parsed_url'] = urlparse(res['url']) | ||||||
|  | 
 | ||||||
|         res['host'] = res['parsed_url'].netloc |         res['host'] = res['parsed_url'].netloc | ||||||
| 
 | 
 | ||||||
|         if res['host'].startswith('www.'): |         if res['host'].startswith('www.'): | ||||||
| @ -172,7 +174,7 @@ def score_results(results): | |||||||
|             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa |             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa | ||||||
|             p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa |             p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa | ||||||
|             if res['host'] == new_res['host'] and\ |             if res['host'] == new_res['host'] and\ | ||||||
|                p1 == p2 and\ |                unquote(p1) == unquote(p2) and\ | ||||||
|                res['parsed_url'].query == new_res['parsed_url'].query and\ |                res['parsed_url'].query == new_res['parsed_url'].query and\ | ||||||
|                res.get('template') == new_res.get('template'): |                res.get('template') == new_res.get('template'): | ||||||
|                 duplicated = new_res |                 duplicated = new_res | ||||||
| @ -222,6 +224,10 @@ def search(query, request, selected_engines, pageno=1, lang='all'): | |||||||
|         request_params['language'] = lang |         request_params['language'] = lang | ||||||
|         request_params = engine.request(query.encode('utf-8'), request_params) |         request_params = engine.request(query.encode('utf-8'), request_params) | ||||||
| 
 | 
 | ||||||
|  |         if request_params['url'] is None: | ||||||
|  |             # TODO add support of offline engines | ||||||
|  |             pass | ||||||
|  | 
 | ||||||
|         callback = make_callback( |         callback = make_callback( | ||||||
|             selected_engine['name'], |             selected_engine['name'], | ||||||
|             results, |             results, | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user