2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from lxml import html
							 | 
						
					
						
							
								
									
										
										
										
											2014-03-04 19:43:41 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from lxml.etree import _ElementStringResult, _ElementUnicodeResult
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-23 11:08:08 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from searx.utils import html_to_text
							 | 
						
					
						
							
								
									
										
										
										
											2016-11-30 18:43:03 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from searx.url_utils import unquote, urlencode, urljoin, urlparse
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								search_url = None
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								url_xpath = None
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								content_xpath = None
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								title_xpath = None
							 | 
						
					
						
							
								
									
										
										
										
											2016-11-30 18:43:03 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								paging = False
							 | 
						
					
						
							
								
									
										
										
										
											2013-11-13 19:33:09 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								suggestion_xpath = ''
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 13:45:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								results_xpath = ''
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2016-03-28 15:15:03 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								# parameters for engines with paging support
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								#
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								# number of results on each page
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								# (only needed if the site requires not a page number, but an offset)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								page_size = 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								# number of the first page (usually 0 or 1)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								first_page_num = 1
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								'''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								if xpath_results is list, extract the text from each result and concat the list
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								if xpath_results is a xml element, extract all the text node from it
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								   ( text_content() method from lxml )
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								if xpath_results is a string element, then it's already done
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								'''
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def extract_text(xpath_results):
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if type(xpath_results) == list:
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        # it's list of result : concat everything using recursive call
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        result = ''
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for e in xpath_results:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            result = result + extract_text(e)
							 | 
						
					
						
							
								
									
										
										
										
											2015-01-25 20:04:44 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        return result.strip()
							 | 
						
					
						
							
								
									
										
										
										
											2014-03-04 19:43:41 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        # it's a string
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return ''.join(xpath_results)
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        # it's a element
							 | 
						
					
						
							
								
									
										
										
										
											2016-12-31 13:56:09 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        text = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        text = text.strip().replace('\n', ' ')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return ' '.join(text.split())
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-30 02:36:05 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def extract_url(xpath_results, search_url):
							 | 
						
					
						
							
								
									
										
										
										
											2017-01-17 12:14:33 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if xpath_results == []:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        raise Exception('Empty url resultset')
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    url = extract_text(xpath_results)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if url.startswith('//'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # add http or https to this kind of url //example.com/
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        parsed_search_url = urlparse(search_url)
							 | 
						
					
						
							
								
									
										
										
										
											2018-04-09 03:35:34 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    elif url.startswith('/'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # fix relative url to the search engine
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        url = urljoin(search_url, url)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # normalize url
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    url = normalize_url(url)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return url
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def normalize_url(url):
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parsed_url = urlparse(url)
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # add a / at this end of the url if there is no path
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if not parsed_url.netloc:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        raise Exception('Cannot parse url')
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-27 12:01:03 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not parsed_url.path:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        url += '/'
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # FIXME : hack for yahoo
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if parsed_url.hostname == 'search.yahoo.com'\
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								       and parsed_url.path.startswith('/r'):
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        p = parsed_url.path
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        mark = p.find('/**')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if mark != -1:
							 | 
						
					
						
							
								
									
										
										
										
											2016-01-18 12:47:31 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            return unquote(p[mark + 3:]).decode('utf-8')
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return url
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def request(query, params):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    query = urlencode({'q': query})[2:]
							 | 
						
					
						
							
								
									
										
										
										
											2016-03-28 15:15:03 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    fp = {'query': query}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if paging and search_url.find('{pageno}') >= 0:
							 | 
						
					
						
							
								
									
										
										
										
											2016-08-14 13:46:54 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
							 | 
						
					
						
							
								
									
										
										
										
											2016-03-28 15:15:03 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    params['url'] = search_url.format(**fp)
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    params['query'] = query
							 | 
						
					
						
							
								
									
										
										
										
											2016-03-28 15:15:03 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return params
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def response(resp):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    results = []
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    dom = html.fromstring(resp.text)
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 13:45:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if results_xpath:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for result in dom.xpath(results_xpath):
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-30 02:36:05 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            url = extract_url(result.xpath(url_xpath), search_url)
							 | 
						
					
						
							
								
									
										
										
										
											2017-01-17 12:14:33 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            title = extract_text(result.xpath(title_xpath))
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            content = extract_text(result.xpath(content_xpath))
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 13:45:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            results.append({'url': url, 'title': title, 'content': content})
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-12 18:48:38 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        for url, title, content in zip(
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-30 03:10:20 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            (extract_url(x, search_url) for
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								             x in dom.xpath(url_xpath)),
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            map(extract_text, dom.xpath(title_xpath)),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            map(extract_text, dom.xpath(content_xpath))
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ):
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 13:45:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            results.append({'url': url, 'title': title, 'content': content})
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2013-11-13 19:33:09 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    if not suggestion_xpath:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return results
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for suggestion in dom.xpath(suggestion_xpath):
							 | 
						
					
						
							
								
									
										
										
										
											2014-01-05 14:06:52 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        results.append({'suggestion': extract_text(suggestion)})
							 | 
						
					
						
							
								
									
										
										
										
											2013-10-26 02:22:20 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return results
							 |