| 
									
										
										
										
											2013-11-09 18:39:20 +01:00
										 |  |  | from urllib import urlencode | 
					
						
							| 
									
										
										
										
											2013-10-19 18:29:39 +02:00
										 |  |  | from lxml import html | 
					
						
							| 
									
										
										
										
											2013-10-24 23:43:39 +02:00
										 |  |  | from urlparse import urlparse | 
					
						
							| 
									
										
										
										
											2013-10-19 18:29:39 +02:00
										 |  |  | from cgi import escape | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-19 18:30:08 +02:00
										 |  |  | base_url = 'https://startpage.com/' | 
					
						
							| 
									
										
										
										
											2013-10-19 18:29:39 +02:00
										 |  |  | search_url = base_url+'do/search' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def request(query, params): | 
					
						
							|  |  |  |     global search_url | 
					
						
							| 
									
										
										
										
											2013-11-09 18:39:20 +01:00
										 |  |  |     query = urlencode({'q': query})[2:] | 
					
						
							| 
									
										
										
										
											2013-10-19 18:29:39 +02:00
										 |  |  |     params['url'] = search_url | 
					
						
							|  |  |  |     params['method'] = 'POST' | 
					
						
							|  |  |  |     params['data'] = {'query': query} | 
					
						
							|  |  |  |     return params | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def response(resp): | 
					
						
							|  |  |  |     global base_url | 
					
						
							|  |  |  |     results = [] | 
					
						
							| 
									
										
										
										
											2013-10-24 21:00:44 +02:00
										 |  |  |     dom = html.fromstring(resp.content) | 
					
						
							| 
									
										
										
										
											2014-01-05 14:00:10 +01:00
										 |  |  |     # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] | 
					
						
							|  |  |  |     # not ads : div[@class="result"] are the direct childs of div[@id="results"] | 
					
						
							|  |  |  |     for result in dom.xpath('//div[@id="results"]/div[@class="result"]'): | 
					
						
							| 
									
										
										
										
											2013-10-19 18:29:39 +02:00
										 |  |  |         link = result.xpath('.//h3/a')[0] | 
					
						
							| 
									
										
										
										
											2013-10-24 23:43:39 +02:00
										 |  |  |         url = link.attrib.get('href') | 
					
						
							| 
									
										
										
										
											2013-10-19 20:12:22 +02:00
										 |  |  |         parsed_url = urlparse(url) | 
					
						
							| 
									
										
										
										
											2014-01-05 14:00:10 +01:00
										 |  |  |         title = link.text_content() | 
					
						
							|  |  |  |         content = result.xpath('./p[@class="desc"]')[0].text_content() | 
					
						
							| 
									
										
										
										
											2013-10-19 18:29:39 +02:00
										 |  |  |         results.append({'url': url, 'title': title, 'content': content}) | 
					
						
							|  |  |  |     return results |