[fix] highlighting only html
This commit is contained in:
		
							parent
							
								
									04c408389d
								
							
						
					
					
						commit
						7b4ec5c5e9
					
				| @ -25,7 +25,6 @@ from urlparse import urlparse | ||||
| from searx import settings | ||||
| import ConfigParser | ||||
| import sys | ||||
| import re | ||||
| from datetime import datetime | ||||
| 
 | ||||
| engine_dir = dirname(realpath(__file__)) | ||||
| @ -106,31 +105,6 @@ def make_callback(engine_name, results, suggestions, callback, params): | ||||
|         results[engine_name] = cb_res | ||||
|     return process_callback | ||||
| 
 | ||||
| def highlight_content(content, query): | ||||
| 
 | ||||
|     if not content: | ||||
|         return None | ||||
|     # ignoring html contents | ||||
|     # TODO better html content detection | ||||
|     if content.find('<') != -1: | ||||
|         return content | ||||
| 
 | ||||
|     query = query.decode('utf-8') | ||||
|     if content.lower().find(query.lower()) > -1: | ||||
|         query_regex = u'({0})'.format(re.escape(query)) | ||||
|         content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U) | ||||
|     else: | ||||
|         regex_parts = [] | ||||
|         for chunk in query.split(): | ||||
|             if len(chunk) == 1: | ||||
|                 regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk))) | ||||
|             else: | ||||
|                 regex_parts.append(u'{0}'.format(re.escape(chunk))) | ||||
|         query_regex = u'({0})'.format('|'.join(regex_parts)) | ||||
|         content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U) | ||||
| 
 | ||||
|     return content | ||||
| 
 | ||||
| def score_results(results): | ||||
|     flat_res = filter(None, chain.from_iterable(izip_longest(*results.values()))) | ||||
|     flat_len = len(flat_res) | ||||
| @ -218,8 +192,6 @@ def search(query, request, selected_engines): | ||||
|     results = score_results(results) | ||||
| 
 | ||||
|     for result in results: | ||||
|         if 'content' in result: | ||||
|             result['content'] = highlight_content(result['content'], query) | ||||
|         for res_engine in result['engines']: | ||||
|             engines[result['engine']].stats['score_count'] += result['score'] | ||||
| 
 | ||||
|  | ||||
| @ -3,6 +3,32 @@ from HTMLParser import HTMLParser | ||||
| import csv | ||||
| import codecs | ||||
| import cStringIO | ||||
| import re | ||||
| 
 | ||||
| def highlight_content(content, query): | ||||
| 
 | ||||
|     if not content: | ||||
|         return None | ||||
|     # ignoring html contents | ||||
|     # TODO better html content detection | ||||
|     if content.find('<') != -1: | ||||
|         return content | ||||
| 
 | ||||
|     query = query.decode('utf-8') | ||||
|     if content.lower().find(query.lower()) > -1: | ||||
|         query_regex = u'({0})'.format(re.escape(query)) | ||||
|         content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U) | ||||
|     else: | ||||
|         regex_parts = [] | ||||
|         for chunk in query.split(): | ||||
|             if len(chunk) == 1: | ||||
|                 regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk))) | ||||
|             else: | ||||
|                 regex_parts.append(u'{0}'.format(re.escape(chunk))) | ||||
|         query_regex = u'({0})'.format('|'.join(regex_parts)) | ||||
|         content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U) | ||||
| 
 | ||||
|     return content | ||||
| 
 | ||||
| class HTMLTextExtractor(HTMLParser): | ||||
|     def __init__(self): | ||||
|  | ||||
| @ -29,6 +29,7 @@ import json | ||||
| import cStringIO | ||||
| from searx.utils import UnicodeWriter | ||||
| from flask import send_from_directory | ||||
| from searx.utils import highlight_content, html_to_text | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| @ -104,6 +105,14 @@ def index(): | ||||
|     results, suggestions = search(query, request, selected_engines) | ||||
| 
 | ||||
|     for result in results: | ||||
|         if request_data.get('format', 'html') == 'html': | ||||
|             if 'content' in result: | ||||
|                 result['content'] = highlight_content(result['content'], query) | ||||
|             result['title'] = highlight_content(result['title'], query) | ||||
|         else: | ||||
|             if 'content' in result: | ||||
|                 result['content'] = html_to_text(result['content']).strip() | ||||
|             result['title'] = html_to_text(result['title']).strip() | ||||
|         if len(result['url']) > 74: | ||||
|             result['pretty_url'] = result['url'][:35] + '[..]' + result['url'][-35:] | ||||
|         else: | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user