fix startpage engine and add comments
* add language support * remove code that is not required * improve google-ad detection (no more false detections, I hope) * other improvements
This commit is contained in:
		
							parent
							
								
									a46bbb4042
								
							
						
					
					
						commit
						678a80f043
					
				| @ -1,47 +1,79 @@ | |||||||
|  | ## Startpage (Web) | ||||||
|  | #  | ||||||
|  | # @website     https://startpage.com | ||||||
|  | # @provide-api no (nothing found) | ||||||
|  | #  | ||||||
|  | # @using-api   no | ||||||
|  | # @results     HTML | ||||||
|  | # @stable      no (HTML can change) | ||||||
|  | # @parse       url, title, content | ||||||
|  | # | ||||||
|  | # @todo        paging | ||||||
|  | 
 | ||||||
| from urllib import urlencode | from urllib import urlencode | ||||||
| from lxml import html | from lxml import html | ||||||
| from cgi import escape | from cgi import escape | ||||||
|  | import re | ||||||
| 
 | 
 | ||||||
| base_url = None | # engine dependent config | ||||||
| search_url = None | categories = ['general'] | ||||||
|  | # there is a mechanism to block "bot" searches (probably the parameter qid), which requires storing qids between multiple search calls | ||||||
|  | #paging = False  | ||||||
|  | language_support = True | ||||||
| 
 | 
 | ||||||
| # TODO paging | # search-url | ||||||
| paging = False | base_url = 'https://startpage.com/' | ||||||
| # TODO complete list of country mapping | search_url = base_url + 'do/search' | ||||||
| country_map = {'en_US': 'eng', | 
 | ||||||
|                'en_UK': 'uk', | # specific xpath variables | ||||||
|                'nl_NL': 'ned'} | # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] | ||||||
|  | # not ads: div[@class="result"] are the direct children of div[@id="results"] | ||||||
|  | results_xpath = '//div[@class="result"]' | ||||||
|  | link_xpath = './/h3/a' | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | # do search-request | ||||||
| def request(query, params): | def request(query, params): | ||||||
|  |     offset = (params['pageno'] - 1) * 10 | ||||||
|     query = urlencode({'q': query})[2:] |     query = urlencode({'q': query})[2:] | ||||||
|  | 
 | ||||||
|     params['url'] = search_url |     params['url'] = search_url | ||||||
|     params['method'] = 'POST' |     params['method'] = 'POST' | ||||||
|     params['data'] = {'query': query, |     params['data'] = {'query': query, | ||||||
|                       'startat': (params['pageno'] - 1) * 10}  # offset |                       'startat': offset}    | ||||||
|     country = country_map.get(params['language'], 'eng') | 
 | ||||||
|     params['cookies']['preferences'] = \ |     # set language if specified | ||||||
|         'lang_homepageEEEs/air/{country}/N1NsslEEE1N1Nfont_sizeEEEmediumN1Nrecent_results_filterEEE1N1Nlanguage_uiEEEenglishN1Ndisable_open_in_new_windowEEE0N1Ncolor_schemeEEEnewN1Nnum_of_resultsEEE10N1N'.format(country=country)  # noqa |     if params['language'] != 'all': | ||||||
|  |         params['data']['with_language'] = 'lang_' + params['language'].split('_')[0] | ||||||
|  | 
 | ||||||
|     return params |     return params | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | # get response from search-request | ||||||
| def response(resp): | def response(resp): | ||||||
|     results = [] |     results = [] | ||||||
|  | 
 | ||||||
|     dom = html.fromstring(resp.content) |     dom = html.fromstring(resp.content) | ||||||
|     # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] |      | ||||||
|     # not ads: div[@class="result"] are the direct children of div[@id="results"] |     # parse results | ||||||
|     for result in dom.xpath('//div[@class="result"]'): |     for result in dom.xpath(results_xpath): | ||||||
|         link = result.xpath('.//h3/a')[0] |         link = result.xpath(link_xpath)[0] | ||||||
|         url = link.attrib.get('href') |         url = link.attrib.get('href') | ||||||
|         if url.startswith('http://www.google.')\ |  | ||||||
|            or url.startswith('https://www.google.'): |  | ||||||
|             continue |  | ||||||
|         title = escape(link.text_content()) |         title = escape(link.text_content()) | ||||||
| 
 | 
 | ||||||
|         content = '' |         # block google-ad URLs | ||||||
|  |         if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url): | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|         if result.xpath('./p[@class="desc"]'): |         if result.xpath('./p[@class="desc"]'): | ||||||
|             content = escape(result.xpath('./p[@class="desc"]')[0].text_content()) |             content = escape(result.xpath('./p[@class="desc"]')[0].text_content()) | ||||||
|  |         else: | ||||||
|  |             content = '' | ||||||
| 
 | 
 | ||||||
|         results.append({'url': url, 'title': title, 'content': content}) |         # append result | ||||||
|  |         results.append({'url': url,  | ||||||
|  |                         'title': title,  | ||||||
|  |                         'content': content}) | ||||||
| 
 | 
 | ||||||
|  |     # return results | ||||||
|     return results |     return results | ||||||
|  | |||||||
| @ -94,8 +94,6 @@ engines: | |||||||
| 
 | 
 | ||||||
|   - name : startpage |   - name : startpage | ||||||
|     engine : startpage |     engine : startpage | ||||||
|     base_url : 'https://startpage.com/' |  | ||||||
|     search_url : 'https://startpage.com/do/search' |  | ||||||
|     shortcut : sp |     shortcut : sp | ||||||
| 
 | 
 | ||||||
| # +30% page load time | # +30% page load time | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user