| 
									
										
										
										
											2014-09-13 18:25:25 +02:00
										 |  |  | '''
 | 
					
						
							|  |  |  | searx is free software: you can redistribute it and/or modify | 
					
						
							|  |  |  | it under the terms of the GNU Affero General Public License as published by | 
					
						
							|  |  |  | the Free Software Foundation, either version 3 of the License, or | 
					
						
							|  |  |  | (at your option) any later version. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | searx is distributed in the hope that it will be useful, | 
					
						
							|  |  |  | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
					
						
							|  |  |  | GNU Affero General Public License for more details. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | You should have received a copy of the GNU Affero General Public License | 
					
						
							|  |  |  | along with searx. If not, see < http://www.gnu.org/licenses/ >. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | (C) 2013- by Adam Tauber, <asciimoo@gmail.com> | 
					
						
							|  |  |  | '''
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-10 18:08:14 +02:00
										 |  |  | import typing | 
					
						
							| 
									
										
										
										
											2016-07-31 23:39:58 +02:00
										 |  |  | import gc | 
					
						
							| 
									
										
										
										
											2014-12-05 19:24:11 +01:00
										 |  |  | import threading | 
					
						
							| 
									
										
										
										
											2014-12-14 01:18:01 +01:00
										 |  |  | from time import time | 
					
						
							| 
									
										
										
										
											2016-09-06 00:36:33 +02:00
										 |  |  | from uuid import uuid4 | 
					
						
							| 
									
										
										
										
											2020-08-06 17:42:46 +02:00
										 |  |  | from _thread import start_new_thread | 
					
						
							| 
									
										
										
										
											2020-07-03 15:25:04 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-12-29 11:08:19 +01:00
										 |  |  | import requests.exceptions | 
					
						
							| 
									
										
										
										
											2016-07-31 23:39:58 +02:00
										 |  |  | import searx.poolrequests as requests_lib | 
					
						
							| 
									
										
										
										
											2020-09-22 13:59:27 +02:00
										 |  |  | from searx.engines import engines, settings | 
					
						
							| 
									
										
										
										
											2016-11-19 20:53:51 +01:00
										 |  |  | from searx.answerers import ask | 
					
						
							| 
									
										
										
										
											2020-07-03 15:25:04 +02:00
										 |  |  | from searx.external_bang import get_bang_url | 
					
						
							| 
									
										
										
										
											2016-04-08 16:38:05 +02:00
										 |  |  | from searx.utils import gen_useragent | 
					
						
							| 
									
										
										
										
											2015-10-03 17:26:07 +02:00
										 |  |  | from searx.results import ResultContainer | 
					
						
							| 
									
										
										
										
											2015-01-09 04:13:05 +01:00
										 |  |  | from searx import logger | 
					
						
							| 
									
										
										
										
											2016-10-22 13:10:31 +02:00
										 |  |  | from searx.plugins import plugins | 
					
						
							| 
									
										
										
										
											2014-07-07 13:59:27 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-11-30 18:43:03 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-01-09 04:13:05 +01:00
										 |  |  | logger = logger.getChild('search') | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-08-02 13:50:51 +02:00
										 |  |  | max_request_timeout = settings.get('outgoing', {}).get('max_request_timeout' or None) | 
					
						
							|  |  |  | if max_request_timeout is None: | 
					
						
							|  |  |  |     logger.info('max_request_timeout={0}'.format(max_request_timeout)) | 
					
						
							|  |  |  | else: | 
					
						
							|  |  |  |     if isinstance(max_request_timeout, float): | 
					
						
							|  |  |  |         logger.info('max_request_timeout={0} second(s)'.format(max_request_timeout)) | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         logger.critical('outgoing.max_request_timeout if defined has to be float') | 
					
						
							| 
									
										
										
										
											2020-11-16 09:43:23 +01:00
										 |  |  |         import sys | 
					
						
							|  |  |  |         sys.exit(1) | 
					
						
							| 
									
										
										
										
											2014-07-07 13:59:27 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-22 16:22:22 +02:00
										 |  |  | class EngineRef: | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-10 18:08:14 +02:00
										 |  |  |     __slots__ = 'name', 'category', 'from_bang' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __init__(self, name: str, category: str, from_bang: bool=False): | 
					
						
							| 
									
										
										
										
											2020-09-22 16:22:22 +02:00
										 |  |  |         self.name = name | 
					
						
							|  |  |  |         self.category = category | 
					
						
							|  |  |  |         self.from_bang = from_bang | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-10 10:17:49 +01:00
										 |  |  |     def __repr__(self): | 
					
						
							|  |  |  |         return "EngineRef({!r}, {!r}, {!r})".format(self.name, self.category, self.from_bang) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __eq__(self, other): | 
					
						
							|  |  |  |         return self.name == other.name and self.category == other.category and self.from_bang == other.from_bang | 
					
						
							| 
									
										
										
										
											2020-09-22 16:22:22 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-22 13:59:27 +02:00
										 |  |  | class SearchQuery: | 
					
						
							|  |  |  |     """container for all the search parameters (query, language, etc...)""" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-10 18:08:14 +02:00
										 |  |  |     __slots__ = 'query', 'engineref_list', 'categories', 'lang', 'safesearch', 'pageno', 'time_range',\ | 
					
						
							|  |  |  |                 'timeout_limit', 'external_bang' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __init__(self, | 
					
						
							|  |  |  |                  query: str, | 
					
						
							|  |  |  |                  engineref_list: typing.List[EngineRef], | 
					
						
							|  |  |  |                  categories: typing.List[str], | 
					
						
							|  |  |  |                  lang: str, | 
					
						
							| 
									
										
										
										
											2020-10-06 15:23:19 +02:00
										 |  |  |                  safesearch: int, | 
					
						
							| 
									
										
										
										
											2020-09-10 18:08:14 +02:00
										 |  |  |                  pageno: int, | 
					
						
							|  |  |  |                  time_range: typing.Optional[str], | 
					
						
							|  |  |  |                  timeout_limit: typing.Optional[float]=None, | 
					
						
							| 
									
										
										
										
											2020-10-06 15:23:19 +02:00
										 |  |  |                  external_bang: typing.Optional[str]=None): | 
					
						
							| 
									
										
										
										
											2020-09-22 13:59:27 +02:00
										 |  |  |         self.query = query | 
					
						
							| 
									
										
										
										
											2020-09-22 16:22:22 +02:00
										 |  |  |         self.engineref_list = engineref_list | 
					
						
							| 
									
										
										
										
											2020-09-22 13:59:27 +02:00
										 |  |  |         self.categories = categories | 
					
						
							|  |  |  |         self.lang = lang | 
					
						
							|  |  |  |         self.safesearch = safesearch | 
					
						
							|  |  |  |         self.pageno = pageno | 
					
						
							| 
									
										
										
										
											2020-09-22 16:31:17 +02:00
										 |  |  |         self.time_range = time_range | 
					
						
							| 
									
										
										
										
											2020-09-22 13:59:27 +02:00
										 |  |  |         self.timeout_limit = timeout_limit | 
					
						
							|  |  |  |         self.external_bang = external_bang | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-10 10:17:49 +01:00
										 |  |  |     def __repr__(self): | 
					
						
							|  |  |  |         return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ | 
					
						
							|  |  |  |                format(self.query, self.engineref_list, self.categories, self.lang, self.safesearch, | 
					
						
							|  |  |  |                       self.pageno, self.time_range, self.timeout_limit, self.external_bang) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def __eq__(self, other): | 
					
						
							|  |  |  |         return self.query == other.query\ | 
					
						
							|  |  |  |             and self.engineref_list == other.engineref_list\ | 
					
						
							|  |  |  |             and self.categories == self.categories\ | 
					
						
							|  |  |  |             and self.lang == other.lang\ | 
					
						
							|  |  |  |             and self.safesearch == other.safesearch\ | 
					
						
							|  |  |  |             and self.pageno == other.pageno\ | 
					
						
							|  |  |  |             and self.time_range == other.time_range\ | 
					
						
							|  |  |  |             and self.timeout_limit == other.timeout_limit\ | 
					
						
							|  |  |  |             and self.external_bang == other.external_bang | 
					
						
							| 
									
										
										
										
											2020-09-22 13:59:27 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-23 11:56:57 +02:00
										 |  |  | def send_http_request(engine, request_params): | 
					
						
							| 
									
										
										
										
											2016-12-29 11:08:19 +01:00
										 |  |  |     # create dictionary which contain all | 
					
						
							|  |  |  |     # informations about the request | 
					
						
							|  |  |  |     request_args = dict( | 
					
						
							|  |  |  |         headers=request_params['headers'], | 
					
						
							|  |  |  |         cookies=request_params['cookies'], | 
					
						
							| 
									
										
										
										
											2020-10-09 15:05:13 +02:00
										 |  |  |         verify=request_params['verify'], | 
					
						
							|  |  |  |         auth=request_params['auth'] | 
					
						
							| 
									
										
										
										
											2016-12-29 11:08:19 +01:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-01 15:57:35 +02:00
										 |  |  |     # setting engine based proxies | 
					
						
							|  |  |  |     if hasattr(engine, 'proxies'): | 
					
						
							| 
									
										
										
										
											2020-11-16 12:44:07 +01:00
										 |  |  |         request_args['proxies'] = requests_lib.get_proxies(engine.proxies) | 
					
						
							| 
									
										
										
										
											2020-09-01 15:57:35 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-12-29 11:08:19 +01:00
										 |  |  |     # specific type of request (GET or POST) | 
					
						
							|  |  |  |     if request_params['method'] == 'GET': | 
					
						
							|  |  |  |         req = requests_lib.get | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         req = requests_lib.post | 
					
						
							| 
									
										
										
										
											2020-10-23 20:19:48 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     request_args['data'] = request_params['data'] | 
					
						
							| 
									
										
										
										
											2016-11-05 13:45:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-12-29 11:08:19 +01:00
										 |  |  |     # send the request | 
					
						
							| 
									
										
										
										
											2017-07-23 11:56:57 +02:00
										 |  |  |     return req(request_params['url'], **request_args) | 
					
						
							| 
									
										
										
										
											2016-11-05 13:45:20 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-09-23 17:14:32 +02:00
										 |  |  | def search_one_http_request(engine, query, request_params): | 
					
						
							| 
									
										
										
										
											2016-11-05 13:45:20 +01:00
										 |  |  |     # update request parameters dependent on | 
					
						
							|  |  |  |     # search-engine (contained in engines folder) | 
					
						
							|  |  |  |     engine.request(query, request_params) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-12-29 11:08:19 +01:00
										 |  |  |     # ignoring empty urls | 
					
						
							| 
									
										
										
										
											2016-11-05 13:45:20 +01:00
										 |  |  |     if request_params['url'] is None: | 
					
						
							| 
									
										
										
										
											2019-07-17 10:38:45 +02:00
										 |  |  |         return None | 
					
						
							| 
									
										
										
										
											2016-11-05 13:45:20 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if not request_params['url']: | 
					
						
							| 
									
										
										
										
											2019-07-17 10:38:45 +02:00
										 |  |  |         return None | 
					
						
							| 
									
										
										
										
											2016-11-05 13:45:20 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # send request | 
					
						
							| 
									
										
										
										
											2017-07-23 11:56:57 +02:00
										 |  |  |     response = send_http_request(engine, request_params) | 
					
						
							| 
									
										
										
										
											2016-11-05 13:45:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-12-29 11:08:19 +01:00
										 |  |  |     # parse the response | 
					
						
							|  |  |  |     response.search_params = request_params | 
					
						
							|  |  |  |     return engine.response(response) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-09-23 17:14:32 +02:00
def search_one_http_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    """Run one HTTP engine request and record results, timings and errors.

    Intended to run in its own worker thread; never raises.  On success the
    parsed results go into *result_container* and the engine statistics are
    updated.  On a requests-level failure (timeout or other RequestException)
    the engine's continuous-error counter is incremented and the engine is
    suspended until ``suspend_end_time``; any other exception is logged and
    reported as an unexpected crash without suspending the engine.
    """
    # set timeout for all HTTP requests
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    requests_lib.reset_time_for_thread()

    # the engine module object for this search
    engine = engines[engine_name]

    # suppose everything will be alright
    requests_exception = False

    try:
        # send requests and parse the results
        search_results = search_one_http_request(engine, query, request_params)

        # check if the engine accepted the request
        if search_results is not None:
            # yes, so add results
            result_container.extend(engine_name, search_results)

            # update engine time when there is no exception
            engine_time = time() - start_time
            page_load_time = requests_lib.get_time_for_thread()
            result_container.add_timing(engine_name, engine_time, page_load_time)
            # NOTE(review): this acquires a freshly created RLock, so it does
            # not actually serialize access to the shared stats across
            # threads — confirm whether a shared module-level lock was meant
            with threading.RLock():
                engine.stats['engine_time'] += engine_time
                engine.stats['engine_time_count'] += 1
                # update stats with the total HTTP time
                engine.stats['page_load_time'] += page_load_time
                engine.stats['page_load_count'] += 1

    except Exception as e:
        # Timing
        engine_time = time() - start_time
        page_load_time = requests_lib.get_time_for_thread()
        result_container.add_timing(engine_name, engine_time, page_load_time)

        # Record the errors (same fresh-RLock caveat as above)
        with threading.RLock():
            engine.stats['errors'] += 1

        if (issubclass(e.__class__, requests.exceptions.Timeout)):
            result_container.add_unresponsive_engine(engine_name, 'timeout')
            # requests timeout (connect or read)
            logger.error("engine {0} : HTTP requests timeout"
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, engine_time, timeout_limit, e.__class__.__name__))
            requests_exception = True
        elif (issubclass(e.__class__, requests.exceptions.RequestException)):
            result_container.add_unresponsive_engine(engine_name, 'request exception')
            # other requests exception
            logger.exception("engine {0} : requests exception"
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, engine_time, timeout_limit, e))
            requests_exception = True
        else:
            result_container.add_unresponsive_engine(engine_name, 'unexpected crash', str(e))
            # others errors
            logger.exception('engine {0} : exception : {1}'.format(engine_name, e))

    # suspend or not the engine if there are HTTP errors
    with threading.RLock():
        if requests_exception:
            # update continuous_errors / suspend_end_time: ban time grows
            # linearly with consecutive failures, capped by the settings
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(settings['search']['max_ban_time_on_fail'],
                                                   engine.continuous_errors * settings['search']['ban_time_on_fail'])
        else:
            # no HTTP error (perhaps an engine error)
            # anyway, reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
					
						
							| 
									
										
										
										
											2016-11-05 13:45:20 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-14 10:25:29 +02:00
										 |  |  | def record_offline_engine_stats_on_error(engine, result_container, start_time): | 
					
						
							|  |  |  |     engine_time = time() - start_time | 
					
						
							|  |  |  |     result_container.add_timing(engine.name, engine_time, engine_time) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with threading.RLock(): | 
					
						
							|  |  |  |         engine.stats['errors'] += 1 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def search_one_offline_request(engine, query, request_params): | 
					
						
							|  |  |  |     return engine.search(query, request_params) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def search_one_offline_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit): | 
					
						
							|  |  |  |     engine = engines[engine_name] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         search_results = search_one_offline_request(engine, query, request_params) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if search_results: | 
					
						
							|  |  |  |             result_container.extend(engine_name, search_results) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             engine_time = time() - start_time | 
					
						
							|  |  |  |             result_container.add_timing(engine_name, engine_time, engine_time) | 
					
						
							|  |  |  |             with threading.RLock(): | 
					
						
							|  |  |  |                 engine.stats['engine_time'] += engine_time | 
					
						
							|  |  |  |                 engine.stats['engine_time_count'] += 1 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     except ValueError as e: | 
					
						
							|  |  |  |         record_offline_engine_stats_on_error(engine, result_container, start_time) | 
					
						
							|  |  |  |         logger.exception('engine {0} : invalid input : {1}'.format(engine_name, e)) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         record_offline_engine_stats_on_error(engine, result_container, start_time) | 
					
						
							|  |  |  |         result_container.add_unresponsive_engine(engine_name, 'unexpected crash', str(e)) | 
					
						
							|  |  |  |         logger.exception('engine {0} : exception : {1}'.format(engine_name, e)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit): | 
					
						
							|  |  |  |     if engines[engine_name].offline: | 
					
						
							|  |  |  |         return search_one_offline_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit)  # noqa | 
					
						
							|  |  |  |     return search_one_http_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-12-30 17:37:46 +01:00
										 |  |  | def search_multiple_requests(requests, result_container, start_time, timeout_limit): | 
					
						
							| 
									
										
										
										
											2016-09-06 00:36:33 +02:00
										 |  |  |     search_id = uuid4().__str__() | 
					
						
							| 
									
										
										
										
											2016-11-05 13:45:20 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     for engine_name, query, request_params in requests: | 
					
						
							| 
									
										
										
										
											2014-12-05 19:24:11 +01:00
										 |  |  |         th = threading.Thread( | 
					
						
							| 
									
										
										
										
											2016-12-29 11:08:19 +01:00
										 |  |  |             target=search_one_request_safe, | 
					
						
							| 
									
										
										
										
											2016-12-30 17:37:46 +01:00
										 |  |  |             args=(engine_name, query, request_params, result_container, start_time, timeout_limit), | 
					
						
							| 
									
										
										
										
											2016-09-06 00:36:33 +02:00
										 |  |  |             name=search_id, | 
					
						
							| 
									
										
										
										
											2014-12-05 19:24:11 +01:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2014-12-19 13:59:41 +01:00
										 |  |  |         th._engine_name = engine_name | 
					
						
							| 
									
										
										
										
											2014-12-05 19:24:11 +01:00
										 |  |  |         th.start() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for th in threading.enumerate(): | 
					
						
							| 
									
										
										
										
											2016-09-06 00:36:33 +02:00
										 |  |  |         if th.name == search_id: | 
					
						
							| 
									
										
										
										
											2016-11-05 13:45:20 +01:00
										 |  |  |             remaining_time = max(0.0, timeout_limit - (time() - start_time)) | 
					
						
							| 
									
										
										
										
											2014-12-14 01:18:01 +01:00
										 |  |  |             th.join(remaining_time) | 
					
						
							| 
									
										
										
										
											2020-07-30 23:28:54 +02:00
										 |  |  |             if th.is_alive(): | 
					
						
							| 
									
										
										
										
											2020-04-17 16:31:02 +02:00
										 |  |  |                 result_container.add_unresponsive_engine(th._engine_name, 'timeout') | 
					
						
							| 
									
										
										
										
											2015-01-09 04:13:05 +01:00
										 |  |  |                 logger.warning('engine timeout: {0}'.format(th._engine_name)) | 
					
						
							| 
									
										
										
										
											2014-12-14 01:18:01 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-12-05 19:24:11 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-13 18:25:25 +02:00
# get default request parameters
					
						
							| 
									
										
										
										
											2014-07-07 13:59:27 +02:00
										 |  |  | def default_request_params(): | 
					
						
							|  |  |  |     return { | 
					
						
							| 
									
										
										
										
											2014-12-29 21:31:04 +01:00
										 |  |  |         'method': 'GET', | 
					
						
							|  |  |  |         'headers': {}, | 
					
						
							|  |  |  |         'data': {}, | 
					
						
							|  |  |  |         'url': '', | 
					
						
							|  |  |  |         'cookies': {}, | 
					
						
							| 
									
										
										
										
											2020-10-09 15:05:13 +02:00
										 |  |  |         'verify': True, | 
					
						
							|  |  |  |         'auth': None | 
					
						
							| 
									
										
										
										
											2014-12-29 21:31:04 +01:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2014-07-07 13:59:27 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-08-12 09:42:27 +02:00
class Search:
    """Container for a single search run.

    Holds the parsed query in ``search_query``, accumulates engine output
    in ``result_container`` and records the round's timing data in
    ``start_time`` / ``actual_timeout``.
    """

    __slots__ = "search_query", "result_container", "start_time", "actual_timeout"

    def __init__(self, search_query):
        # init vars
        super().__init__()
        self.search_query = search_query
        self.result_container = ResultContainer()
        # filled in later: start_time by search(), actual_timeout by search_standard()
        self.start_time = None
        self.actual_timeout = None

    def search_external_bang(self):
        """
        Check if there is an external bang in the query.
        If yes, set ``self.result_container.redirect_url`` and return True;
        the rest of the search is then skipped.
        """
        if self.search_query.external_bang:
            self.result_container.redirect_url = get_bang_url(self.search_query)

            # This means there was a valid bang and the
            # rest of the search does not need to be continued
            if isinstance(self.result_container.redirect_url, str):
                return True
        return False

    def search_answerers(self):
        """
        Check if an answerer handles the query.
        If yes, extend ``self.result_container`` with the answers and
        return True; the engine search is then skipped.
        """
        answerers_results = ask(self.search_query)

        if answerers_results:
            for results in answerers_results:
                self.result_container.extend('answer', results)
            return True
        return False

    def _is_accepted(self, engine_name, engine):
        """Return True when *engine* can serve the current query."""
        # skip suspended engines
        if engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', engine_name)
            return False

        # if paging is not supported, skip
        if self.search_query.pageno > 1 and not engine.paging:
            return False

        # if time_range is not supported, skip
        if self.search_query.time_range and not engine.time_range_support:
            return False

        return True

    def _get_params(self, engineref, user_agent):
        """Build the request parameters for one engine reference.

        Returns ``(request_params, engine.timeout)``, or ``(None, None)``
        when the engine is unknown or rejected by ``_is_accepted``.
        """
        if engineref.name not in engines:
            return None, None

        engine = engines[engineref.name]

        if not self._is_accepted(engineref.name, engine):
            return None, None

        # set default request parameters (offline engines need no HTTP params)
        request_params = {}
        if not engine.offline:
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent

            # an engine-level language setting overrides the query language
            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = self.search_query.lang

            request_params['safesearch'] = self.search_query.safesearch
            request_params['time_range'] = self.search_query.time_range

        request_params['category'] = engineref.category
        request_params['pageno'] = self.search_query.pageno

        return request_params, engine.timeout

    # build the per-engine request list and the timeout for the whole round
    def _get_requests(self):
        # init vars
        requests = []

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # max of all selected engine timeout
        default_timeout = 0

        # start search-request for all selected engines
        for engineref in self.search_query.engineref_list:
            # set default request parameters
            request_params, engine_timeout = self._get_params(engineref, user_agent)
            if request_params is None:
                # engine unknown or rejected (suspended / no paging / no time range)
                continue

            # append request to list
            requests.append((engineref.name, self.search_query.query, request_params))

            # update default_timeout
            default_timeout = max(default_timeout, engine_timeout)

        # adjust timeout against the user-supplied and the global limit
        actual_timeout = default_timeout
        query_timeout = self.search_query.timeout_limit

        if max_request_timeout is None and query_timeout is None:
            # No max, no user query: default_timeout
            pass
        elif max_request_timeout is None and query_timeout is not None:
            # No max, but user query: From user query except if above default
            actual_timeout = min(default_timeout, query_timeout)
        elif max_request_timeout is not None and query_timeout is None:
            # Max, no user query: Default except if above max
            actual_timeout = min(default_timeout, max_request_timeout)
        elif max_request_timeout is not None and query_timeout is not None:
            # Max & user query: From user query except if above max
            actual_timeout = min(query_timeout, max_request_timeout)

        logger.debug("actual_timeout={0} (default_timeout={1}, ?timeout_limit={2}, max_request_timeout={3})"
                     .format(actual_timeout, default_timeout, query_timeout, max_request_timeout))

        return requests, actual_timeout

    def search_standard(self):
        """
        Run the regular engine search.
        Updates self.result_container and self.actual_timeout.
        """
        requests, self.actual_timeout = self._get_requests()

        # send all search-request
        if requests:
            search_multiple_requests(requests, self.result_container, self.start_time, self.actual_timeout)
            # trigger a garbage collection in a separate thread
            start_new_thread(gc.collect, tuple())

        # results, suggestions, answers and infoboxes are in self.result_container
        return True

    # do search-request
    def search(self):
        # external bangs and answerers short-circuit the engine search
        self.start_time = time()

        if not self.search_external_bang():
            if not self.search_answerers():
                self.search_standard()

        return self.result_container
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class SearchWithPlugins(Search): | 
					
						
							| 
									
										
										
										
											2016-10-22 14:01:53 +02:00
										 |  |  |     """Similar to the Search class but call the plugins.""" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-10 18:08:14 +02:00
										 |  |  |     __slots__ = 'ordered_plugin_list', 'request' | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-01-02 12:06:04 +01:00
										 |  |  |     def __init__(self, search_query, ordered_plugin_list, request): | 
					
						
							| 
									
										
										
										
											2020-08-12 09:42:27 +02:00
										 |  |  |         super().__init__(search_query) | 
					
						
							| 
									
										
										
										
											2017-01-02 12:06:04 +01:00
										 |  |  |         self.ordered_plugin_list = ordered_plugin_list | 
					
						
							| 
									
										
										
										
											2016-10-22 13:10:31 +02:00
										 |  |  |         self.request = request | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def search(self): | 
					
						
							| 
									
										
										
										
											2017-01-02 12:06:04 +01:00
										 |  |  |         if plugins.call(self.ordered_plugin_list, 'pre_search', self.request, self): | 
					
						
							| 
									
										
										
										
											2020-08-12 09:42:27 +02:00
										 |  |  |             super().search() | 
					
						
							| 
									
										
										
										
											2016-10-22 13:10:31 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-01-02 12:06:04 +01:00
										 |  |  |         plugins.call(self.ordered_plugin_list, 'post_search', self.request, self) | 
					
						
							| 
									
										
										
										
											2016-10-22 14:01:53 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         results = self.result_container.get_ordered_results() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         for result in results: | 
					
						
							| 
									
										
										
										
											2017-01-02 12:06:04 +01:00
										 |  |  |             plugins.call(self.ordered_plugin_list, 'on_result', self.request, self, result) | 
					
						
							| 
									
										
										
										
											2016-10-22 14:01:53 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-10-22 13:10:31 +02:00
										 |  |  |         return self.result_container |