Merge pull request #452 from pointhi/engine_fix
[enh] fix content fetching and parse the published date from the description for startpage and ixquick
This commit is contained in:
		
						commit
						3a2f29344a
					
				| @ -12,6 +12,8 @@ | ||||
| 
 | ||||
| from lxml import html | ||||
| from cgi import escape | ||||
| from dateutil import parser | ||||
| from datetime import datetime, timedelta | ||||
| import re | ||||
| from searx.engines.xpath import extract_text | ||||
| 
 | ||||
| @ -79,15 +81,44 @@ def response(resp): | ||||
| 
 | ||||
|         title = escape(extract_text(link)) | ||||
| 
 | ||||
|         if result.xpath('./p[@class="desc"]'): | ||||
|             content = escape(extract_text(result.xpath('./p[@class="desc"]'))) | ||||
|         if result.xpath('./p[@class="desc clk"]'): | ||||
|             content = escape(extract_text(result.xpath('./p[@class="desc clk"]'))) | ||||
|         else: | ||||
|             content = '' | ||||
| 
 | ||||
|         # append result | ||||
|         results.append({'url': url, | ||||
|                         'title': title, | ||||
|                         'content': content}) | ||||
|         published_date = None | ||||
| 
 | ||||
|         # check if search result starts with something like: "2 Sep 2014 ... " | ||||
|         if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content): | ||||
|             date_pos = content.find('...')+4 | ||||
|             date_string = content[0:date_pos-5] | ||||
|             published_date = parser.parse(date_string, dayfirst=True) | ||||
| 
 | ||||
|             # fix content string | ||||
|             content = content[date_pos:] | ||||
| 
 | ||||
|         # check if search result starts with something like: "5 days ago ... " | ||||
|         elif re.match("^[0-9]+ days? ago \.\.\. ", content): | ||||
|             date_pos = content.find('...')+4 | ||||
|             date_string = content[0:date_pos-5] | ||||
| 
 | ||||
|             # calculate datetime | ||||
|             published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) | ||||
| 
 | ||||
|             # fix content string | ||||
|             content = content[date_pos:] | ||||
| 
 | ||||
|         if published_date: | ||||
|             # append result | ||||
|             results.append({'url': url, | ||||
|                             'title': title, | ||||
|                             'content': content, | ||||
|                             'publishedDate': published_date}) | ||||
|         else: | ||||
|             # append result | ||||
|             results.append({'url': url, | ||||
|                             'title': title, | ||||
|                             'content': content}) | ||||
| 
 | ||||
|     # return results | ||||
|     return results | ||||
|  | ||||
| @ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase): | ||||
|                 </a> | ||||
|                 <span id='title_stars_2' name='title_stars_2'>  </span> | ||||
|             </h3> | ||||
|             <p class='desc'> | ||||
|             <p class='desc clk'> | ||||
|                 This should be the content. | ||||
|             </p> | ||||
|             <p> | ||||
| @ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase): | ||||
|                 </a> | ||||
|                 <span id='title_stars_2' name='title_stars_2'>  </span> | ||||
|             </h3> | ||||
|             <p class='desc'> | ||||
|             <p class='desc clk'> | ||||
|                 This should be the content. | ||||
|             </p> | ||||
|             <p> | ||||
| @ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase): | ||||
|             <h3> | ||||
|                 <span id='title_stars_2' name='title_stars_2'>  </span> | ||||
|             </h3> | ||||
|             <p class='desc'> | ||||
|             <p class='desc clk'> | ||||
|                 This should be the content. | ||||
|             </p> | ||||
|             <p> | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user