Improve publishedDate extraction from Yahoo
This commit is contained in:
		
							parent
							
								
									794165d19c
								
							
						
					
					
						commit
						5538c6771a
					
				| @ -4,7 +4,8 @@ from urllib import urlencode | |||||||
| from lxml import html | from lxml import html | ||||||
| from searx.engines.xpath import extract_text, extract_url | from searx.engines.xpath import extract_text, extract_url | ||||||
| from searx.engines.yahoo import parse_url | from searx.engines.yahoo import parse_url | ||||||
| from datetime import datetime | from datetime import datetime, timedelta | ||||||
|  | import re | ||||||
| 
 | 
 | ||||||
| categories = ['news'] | categories = ['news'] | ||||||
| search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}' | search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}' | ||||||
| @ -39,9 +40,21 @@ def response(resp): | |||||||
|         url = parse_url(extract_url(result.xpath(url_xpath), search_url)) |         url = parse_url(extract_url(result.xpath(url_xpath), search_url)) | ||||||
|         title = extract_text(result.xpath(title_xpath)[0]) |         title = extract_text(result.xpath(title_xpath)[0]) | ||||||
|         content = extract_text(result.xpath(content_xpath)[0]) |         content = extract_text(result.xpath(content_xpath)[0]) | ||||||
| # Feb 20 04:02am |         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0]) | ||||||
|         publishedDate = datetime.strptime(extract_text(result.xpath(publishedDate_xpath)[0]),"%b %d %H:%M%p") | 
 | ||||||
|         #publishedDate.replace(year=2014) |         if re.match("^[0-9]+ minute(s|) ago$", publishedDate): | ||||||
|  |             publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group())) | ||||||
|  |         else: | ||||||
|  |             if re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate): | ||||||
|  |                 timeNumbers = re.findall(r'\d+', publishedDate) | ||||||
|  |                 publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0])) - timedelta(minutes=int(timeNumbers[1])) | ||||||
|  |             else: | ||||||
|  |                 # TODO year in string possible? | ||||||
|  |                 publishedDate = datetime.strptime(publishedDate,"%b %d %H:%M%p") | ||||||
|  | 
 | ||||||
|  |         if publishedDate.year == 1900: | ||||||
|  |             publishedDate = publishedDate.replace(year=datetime.now().year) | ||||||
|  | 
 | ||||||
|         results.append({'url': url, 'title': title, 'content': content,'publishedDate':publishedDate}) |         results.append({'url': url, 'title': title, 'content': content,'publishedDate':publishedDate}) | ||||||
| 
 | 
 | ||||||
|     if not suggestion_xpath: |     if not suggestion_xpath: | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user