| 
									
										
										
										
											2021-02-11 12:32:58 +01:00
										 |  |  | # SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  |  Seznam | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-03-18 19:59:01 +01:00
										 |  |  | from urllib.parse import urlencode | 
					
						
							| 
									
										
										
										
											2021-02-11 12:32:58 +01:00
										 |  |  | from lxml import html | 
					
						
							| 
									
										
										
											
												[httpx] replace searx.poolrequests by searx.network
settings.yml:
* outgoing.networks:
   * can contains network definition
   * propertiers: enable_http, verify, http2, max_connections, max_keepalive_connections,
     keepalive_expiry, local_addresses, support_ipv4, support_ipv6, proxies, max_redirects, retries
   * retries: 0 by default, number of times searx retries to send the HTTP request (using different IP & proxy each time)
   * local_addresses can be "192.168.0.1/24" (it supports IPv6)
   * support_ipv4 & support_ipv6: both True by default
     see https://github.com/searx/searx/pull/1034
* each engine can define a "network" section:
   * either a full network description
   * either reference an existing network
* all HTTP requests of engine use the same HTTP configuration (it was not the case before, see proxy configuration in master)
											
										 
											2021-04-05 10:43:33 +02:00
										 |  |  | from searx.network import get | 
					
						
							| 
									
										
										
										
											2021-02-11 12:32:58 +01:00
										 |  |  | from searx.exceptions import SearxEngineAccessDeniedException | 
					
						
							| 
									
										
										
										
											2021-03-27 15:29:00 +01:00
										 |  |  | from searx.utils import ( | 
					
						
							|  |  |  |     extract_text, | 
					
						
							|  |  |  |     eval_xpath_list, | 
					
						
							|  |  |  |     eval_xpath_getindex, | 
					
						
							|  |  |  |     eval_xpath, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2021-02-11 12:32:58 +01:00
										 |  |  | 
 | 
					
						
# about — engine metadata shown on the searx "about"/preferences pages
about = {
    "website": "https://www.seznam.cz/",
    "wikidata_id": "Q3490485",
    "official_api_documentation": "https://api.sklik.cz/",
    "use_official_api": False,  # results are scraped from the HTML pages
    "require_api_key": False,
    "results": "HTML",
    "language": "cz",
}

# all queries go against the Seznam search front page
base_url = 'https://search.seznam.cz/'
 | 
					
						
							|  |  |  | def request(query, params): | 
					
						
							|  |  |  |     response_index = get(base_url, headers=params['headers'], raise_for_httperror=True) | 
					
						
							|  |  |  |     dom = html.fromstring(response_index.text) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-03-27 15:29:00 +01:00
										 |  |  |     url_params = { | 
					
						
							|  |  |  |         'q': query, | 
					
						
							|  |  |  |         'oq': query, | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2021-02-11 12:32:58 +01:00
										 |  |  |     for e in eval_xpath_list(dom, '//input[@type="hidden"]'): | 
					
						
							|  |  |  |         name = e.get('name') | 
					
						
							|  |  |  |         value = e.get('value') | 
					
						
							|  |  |  |         url_params[name] = value | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     params['url'] = base_url + '?' + urlencode(url_params) | 
					
						
							|  |  |  |     params['cookies'] = response_index.cookies | 
					
						
							|  |  |  |     return params | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def response(resp): | 
					
						
							| 
									
										
										
										
											2021-03-18 19:59:01 +01:00
										 |  |  |     if resp.url.path.startswith('/verify'): | 
					
						
							| 
									
										
										
										
											2021-02-11 12:32:58 +01:00
										 |  |  |         raise SearxEngineAccessDeniedException() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     results = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     dom = html.fromstring(resp.content.decode()) | 
					
						
							| 
									
										
										
										
											2021-03-27 15:29:00 +01:00
										 |  |  |     for result_element in eval_xpath_list(dom, '//div[@data-dot="results"]/div'): | 
					
						
							| 
									
										
										
										
											2021-07-26 19:57:55 +02:00
										 |  |  |         result_data = eval_xpath_getindex(result_element, './/div[contains(@class, "bec586")]', 0, default=None) | 
					
						
							| 
									
										
										
										
											2021-03-27 15:29:00 +01:00
										 |  |  |         if result_data is None: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         title_element = eval_xpath_getindex(result_element, './/h3/a', 0) | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |         results.append( | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 'url': title_element.get('href'), | 
					
						
							|  |  |  |                 'title': extract_text(title_element), | 
					
						
							|  |  |  |                 'content': extract_text(eval_xpath(result_data, './/div[@class="_3eded7"]')), | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2021-02-11 12:32:58 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return results |