Merge pull request #99 from dalf/master
[enh] stick results from the same category and template and [fix] rewrite the google engine
This commit is contained in:
		
						commit
						090254feca
					
				| @ -1,15 +1,17 @@ | |||||||
| ## Google (Web) | ## Google (Web) | ||||||
| #  | #  | ||||||
| # @website     https://www.google.com | # @website     https://www.google.com | ||||||
| # @provide-api yes (https://developers.google.com/web-search/docs/), deprecated! | # @provide-api yes (https://developers.google.com/custom-search/) | ||||||
| #  | #  | ||||||
| # @using-api   yes | # @using-api   no | ||||||
| # @results     JSON | # @results     HTML | ||||||
| # @stable      yes (but deprecated) | # @stable      no (HTML can change) | ||||||
| # @parse       url, title, content | # @parse       url, title, content, suggestion | ||||||
| 
 | 
 | ||||||
| from urllib import urlencode | from urllib import urlencode | ||||||
| from json import loads | from urlparse import unquote,urlparse,parse_qsl | ||||||
|  | from lxml import html | ||||||
|  | from searx.engines.xpath import extract_text, extract_url | ||||||
| 
 | 
 | ||||||
| # engine dependent config | # engine dependent config | ||||||
| categories = ['general'] | categories = ['general'] | ||||||
| @ -17,21 +19,45 @@ paging = True | |||||||
| language_support = True | language_support = True | ||||||
| 
 | 
 | ||||||
| # search-url | # search-url | ||||||
| url = 'https://ajax.googleapis.com/' | google_hostname = 'www.google.com' | ||||||
| search_url = url + 'ajax/services/search/web?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa | search_path = '/search' | ||||||
|  | redirect_path = '/url' | ||||||
|  | images_path = '/images' | ||||||
|  | search_url = 'https://' + google_hostname + search_path + '?{query}&start={offset}&gbv=1' | ||||||
| 
 | 
 | ||||||
|  | # specific xpath variables | ||||||
|  | results_xpath= '//li[@class="g"]' | ||||||
|  | url_xpath = './/h3/a/@href' | ||||||
|  | title_xpath = './/h3' | ||||||
|  | content_xpath = './/span[@class="st"]' | ||||||
|  | suggestion_xpath = '//p[@class="_Bmc"]' | ||||||
|  | 
 | ||||||
|  | images_xpath = './/div/a' | ||||||
|  | image_url_xpath = './@href' | ||||||
|  | image_img_src_xpath = './img/@src' | ||||||
|  | 
 | ||||||
|  | # remove google-specific tracking-url | ||||||
|  | def parse_url(url_string): | ||||||
|  |     parsed_url = urlparse(url_string) | ||||||
|  |     if parsed_url.netloc in [google_hostname, ''] and parsed_url.path==redirect_path: | ||||||
|  |         query = dict(parse_qsl(parsed_url.query)) | ||||||
|  |         return query['q'] | ||||||
|  |     else: | ||||||
|  |         return url_string | ||||||
| 
 | 
 | ||||||
| # do search-request | # do search-request | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     offset = (params['pageno'] - 1) * 8 |     offset = (params['pageno'] - 1) * 10 | ||||||
| 
 | 
 | ||||||
|     language = 'en-US' |     if params['language'] == 'all': | ||||||
|     if params['language'] != 'all': |         language = 'en' | ||||||
|         language = params['language'].replace('_', '-') |     else: | ||||||
|  |         language = params['language'].replace('_','-').lower() | ||||||
| 
 | 
 | ||||||
|     params['url'] = search_url.format(offset=offset, |     params['url'] = search_url.format(offset=offset, | ||||||
|                                       query=urlencode({'q': query}), |                                       query=urlencode({'q': query})) | ||||||
|                                       language=language) | 
 | ||||||
|  |     params['headers']['Accept-Language'] = language | ||||||
| 
 | 
 | ||||||
|     return params |     return params | ||||||
| 
 | 
 | ||||||
| @ -40,18 +66,50 @@ def request(query, params): | |||||||
| def response(resp): | def response(resp): | ||||||
|     results = [] |     results = [] | ||||||
| 
 | 
 | ||||||
|     search_res = loads(resp.text) |     dom = html.fromstring(resp.text) | ||||||
| 
 |  | ||||||
|     # return empty array if there are no results |  | ||||||
|     if not search_res.get('responseData', {}).get('results'): |  | ||||||
|         return [] |  | ||||||
| 
 | 
 | ||||||
|     # parse results |     # parse results | ||||||
|     for result in search_res['responseData']['results']: |     for result in dom.xpath(results_xpath): | ||||||
|         # append result |         title = extract_text(result.xpath(title_xpath)[0]) | ||||||
|         results.append({'url': result['unescapedUrl'], |         try: | ||||||
|                         'title': result['titleNoFormatting'], |             url = parse_url(extract_url(result.xpath(url_xpath), search_url)) | ||||||
|                         'content': result['content']}) |             parsed_url = urlparse(url) | ||||||
|  |             if parsed_url.netloc==google_hostname and parsed_url.path==search_path: | ||||||
|  |                 # remove the link to google news | ||||||
|  |                 continue | ||||||
|  | 
 | ||||||
|  |             if parsed_url.netloc==google_hostname and parsed_url.path==images_path: | ||||||
|  |                 # images result | ||||||
|  |                 results = results + parse_images(result) | ||||||
|  |             else: | ||||||
|  |                 # normal result | ||||||
|  |                 content = extract_text(result.xpath(content_xpath)[0]) | ||||||
|  |                 # append result | ||||||
|  |                 results.append({'url': url,  | ||||||
|  |                                 'title': title,  | ||||||
|  |                                 'content': content}) | ||||||
|  |         except: | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |     # parse suggestion | ||||||
|  |     for suggestion in dom.xpath(suggestion_xpath): | ||||||
|  |         # append suggestion | ||||||
|  |         results.append({'suggestion': extract_text(suggestion)}) | ||||||
| 
 | 
 | ||||||
|     # return results |     # return results | ||||||
|     return results |     return results | ||||||
|  | 
 | ||||||
|  | def parse_images(result): | ||||||
|  |     results = [] | ||||||
|  |     for image in result.xpath(images_xpath): | ||||||
|  |         url = parse_url(extract_text(image.xpath(image_url_xpath)[0])) | ||||||
|  |         img_src = extract_text(image.xpath(image_img_src_xpath)[0]) | ||||||
|  |          | ||||||
|  |         # append result | ||||||
|  |         results.append({'url': url, | ||||||
|  |                         'title': '', | ||||||
|  |                         'content': '', | ||||||
|  |                         'img_src': img_src, | ||||||
|  |                         'template': 'images.html'}) | ||||||
|  | 
 | ||||||
|  |     return results | ||||||
|  | |||||||
| @ -49,7 +49,8 @@ def score_results(results): | |||||||
|     flat_len = len(flat_res) |     flat_len = len(flat_res) | ||||||
|     engines_len = len(results) |     engines_len = len(results) | ||||||
|     results = [] |     results = [] | ||||||
|     # deduplication + scoring | 
 | ||||||
|  |     # pass 1: deduplication + scoring | ||||||
|     for i, res in enumerate(flat_res): |     for i, res in enumerate(flat_res): | ||||||
| 
 | 
 | ||||||
|         res['parsed_url'] = urlparse(res['url']) |         res['parsed_url'] = urlparse(res['url']) | ||||||
| @ -90,7 +91,42 @@ def score_results(results): | |||||||
|         else: |         else: | ||||||
|             res['score'] = score |             res['score'] = score | ||||||
|             results.append(res) |             results.append(res) | ||||||
|     return sorted(results, key=itemgetter('score'), reverse=True) |     results = sorted(results, key=itemgetter('score'), reverse=True) | ||||||
|  | 
 | ||||||
|  |     # pass 2 : group results by category and template | ||||||
|  |     gresults = [] | ||||||
|  |     categoryPositions = {} | ||||||
|  | 
 | ||||||
|  |     for i, res in enumerate(results): | ||||||
|  |         # FIXME : handle more than one category per engine | ||||||
|  |         category = engines[res['engine']].categories[0] + ':' + '' if 'template' not in res else res['template']  | ||||||
|  | 
 | ||||||
|  |         current = None if category not in categoryPositions else categoryPositions[category] | ||||||
|  | 
 | ||||||
|  |         # group with previous results using the same category if the group can accept more results and is not too far from the current position | ||||||
|  |         if current != None and (current['count'] > 0) and (len(gresults) - current['index'] < 20): | ||||||
|  |             # group with the previous results using the same category with this one | ||||||
|  |             index = current['index'] | ||||||
|  |             gresults.insert(index, res) | ||||||
|  | 
 | ||||||
|  |             # update every index after the current one (including the current one) | ||||||
|  |             for k in categoryPositions: | ||||||
|  |                 v = categoryPositions[k]['index'] | ||||||
|  |                 if v >= index: | ||||||
|  |                     categoryPositions[k]['index'] = v+1 | ||||||
|  | 
 | ||||||
|  |             # update this category | ||||||
|  |             current['count'] -= 1 | ||||||
|  | 
 | ||||||
|  |         else: | ||||||
|  |             # no usable group for this category: append at the end and start a new group | ||||||
|  |             gresults.append(res) | ||||||
|  | 
 | ||||||
|  |             # update categoryPositions | ||||||
|  |             categoryPositions[category] = { 'index' : len(gresults), 'count' : 8 } | ||||||
|  | 
 | ||||||
|  |     # return gresults | ||||||
|  |     return gresults | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Search(object): | class Search(object): | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user