Merge pull request #452 from pointhi/engine_fix
[enh] fix content fetching and parse the published date from the description for the startpage and ixquick engines
This commit is contained in:
		
						commit
						3a2f29344a
					
				| @ -12,6 +12,8 @@ | |||||||
| 
 | 
 | ||||||
| from lxml import html | from lxml import html | ||||||
| from cgi import escape | from cgi import escape | ||||||
|  | from dateutil import parser | ||||||
|  | from datetime import datetime, timedelta | ||||||
| import re | import re | ||||||
| from searx.engines.xpath import extract_text | from searx.engines.xpath import extract_text | ||||||
| 
 | 
 | ||||||
| @ -79,15 +81,44 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|         title = escape(extract_text(link)) |         title = escape(extract_text(link)) | ||||||
| 
 | 
 | ||||||
|         if result.xpath('./p[@class="desc"]'): |         if result.xpath('./p[@class="desc clk"]'): | ||||||
|             content = escape(extract_text(result.xpath('./p[@class="desc"]'))) |             content = escape(extract_text(result.xpath('./p[@class="desc clk"]'))) | ||||||
|         else: |         else: | ||||||
|             content = '' |             content = '' | ||||||
| 
 | 
 | ||||||
|         # append result |         published_date = None | ||||||
|         results.append({'url': url, | 
 | ||||||
|                         'title': title, |         # check if search result starts with something like: "2 Sep 2014 ... " | ||||||
|                         'content': content}) |         if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content): | ||||||
|  |             date_pos = content.find('...')+4 | ||||||
|  |             date_string = content[0:date_pos-5] | ||||||
|  |             published_date = parser.parse(date_string, dayfirst=True) | ||||||
|  | 
 | ||||||
|  |             # fix content string | ||||||
|  |             content = content[date_pos:] | ||||||
|  | 
 | ||||||
|  |         # check if search result starts with something like: "5 days ago ... " | ||||||
|  |         elif re.match("^[0-9]+ days? ago \.\.\. ", content): | ||||||
|  |             date_pos = content.find('...')+4 | ||||||
|  |             date_string = content[0:date_pos-5] | ||||||
|  | 
 | ||||||
|  |             # calculate datetime | ||||||
|  |             published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) | ||||||
|  | 
 | ||||||
|  |             # fix content string | ||||||
|  |             content = content[date_pos:] | ||||||
|  | 
 | ||||||
|  |         if published_date: | ||||||
|  |             # append result | ||||||
|  |             results.append({'url': url, | ||||||
|  |                             'title': title, | ||||||
|  |                             'content': content, | ||||||
|  |                             'publishedDate': published_date}) | ||||||
|  |         else: | ||||||
|  |             # append result | ||||||
|  |             results.append({'url': url, | ||||||
|  |                             'title': title, | ||||||
|  |                             'content': content}) | ||||||
| 
 | 
 | ||||||
|     # return results |     # return results | ||||||
|     return results |     return results | ||||||
|  | |||||||
| @ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase): | |||||||
|                 </a> |                 </a> | ||||||
|                 <span id='title_stars_2' name='title_stars_2'>  </span> |                 <span id='title_stars_2' name='title_stars_2'>  </span> | ||||||
|             </h3> |             </h3> | ||||||
|             <p class='desc'> |             <p class='desc clk'> | ||||||
|                 This should be the content. |                 This should be the content. | ||||||
|             </p> |             </p> | ||||||
|             <p> |             <p> | ||||||
| @ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase): | |||||||
|                 </a> |                 </a> | ||||||
|                 <span id='title_stars_2' name='title_stars_2'>  </span> |                 <span id='title_stars_2' name='title_stars_2'>  </span> | ||||||
|             </h3> |             </h3> | ||||||
|             <p class='desc'> |             <p class='desc clk'> | ||||||
|                 This should be the content. |                 This should be the content. | ||||||
|             </p> |             </p> | ||||||
|             <p> |             <p> | ||||||
| @ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase): | |||||||
|             <h3> |             <h3> | ||||||
|                 <span id='title_stars_2' name='title_stars_2'>  </span> |                 <span id='title_stars_2' name='title_stars_2'>  </span> | ||||||
|             </h3> |             </h3> | ||||||
|             <p class='desc'> |             <p class='desc clk'> | ||||||
|                 This should be the content. |                 This should be the content. | ||||||
|             </p> |             </p> | ||||||
|             <p> |             <p> | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user