Merge pull request #343 from dalf/fix-checker-memory-issue

[fix] checker: fix memory usage

commit c23aa5760c
@@ -9,6 +9,7 @@ from types import MethodType
 from timeit import default_timer

 import httpx
+import anyio
 import h2.exceptions

 from .network import get_network, initialize
@@ -166,7 +167,7 @@ async def stream_chunk_to_queue(network, queue, method, url, **kwargs):
             async for chunk in response.aiter_raw(65536):
                 if len(chunk) > 0:
                     queue.put(chunk)
-    except httpx.StreamClosed:
+    except (httpx.StreamClosed, anyio.ClosedResourceError):
         # the response was queued before the exception.
         # the exception was raised on aiter_raw.
         # we do nothing here: in the finally block, None will be queued
@@ -183,11 +184,35 @@ async def stream_chunk_to_queue(network, queue, method, url, **kwargs):
         queue.put(None)


+def _stream_generator(method, url, **kwargs):
+    queue = SimpleQueue()
+    network = get_context_network()
+    future = asyncio.run_coroutine_threadsafe(
+        stream_chunk_to_queue(network, queue, method, url, **kwargs),
+        get_loop()
+    )
+
+    # yield chunks
+    obj_or_exception = queue.get()
+    while obj_or_exception is not None:
+        if isinstance(obj_or_exception, Exception):
+            raise obj_or_exception
+        yield obj_or_exception
+        obj_or_exception = queue.get()
+    future.result()
+
+
 def _close_response_method(self):
     asyncio.run_coroutine_threadsafe(
         self.aclose(),
         get_loop()
     )
+    # reach the end of self._generator (_stream_generator) to avoid a memory leak.
+    # it makes sure that:
+    # * the httpx response is closed (see the stream_chunk_to_queue function)
+    # * future.result() is called in _stream_generator
+    for _ in self._generator:  # pylint: disable=protected-access
+        continue


 def stream(method, url, **kwargs):
@@ -202,25 +227,15 @@ def stream(method, url, **kwargs):
     httpx.Client.stream requires to write the httpx.HTTPTransport version of the
     the httpx.AsyncHTTPTransport declared above.
     """
-    queue = SimpleQueue()
-    network = get_context_network()
-    future = asyncio.run_coroutine_threadsafe(
-        stream_chunk_to_queue(network, queue, method, url, **kwargs),
-        get_loop()
-    )
+    generator = _stream_generator(method, url, **kwargs)

     # yield response
-    response = queue.get()
+    response = next(generator)  # pylint: disable=stop-iteration-return
     if isinstance(response, Exception):
         raise response
+
+    response._generator = generator  # pylint: disable=protected-access
     response.close = MethodType(_close_response_method, response)
     yield response

-    # yield chunks
-    chunk_or_exception = queue.get()
-    while chunk_or_exception is not None:
-        if isinstance(chunk_or_exception, Exception):
-            raise chunk_or_exception
-        yield chunk_or_exception
-        chunk_or_exception = queue.get()
-    future.result()
+    yield from generator
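Note on the hunks above: stream() is a synchronous generator, but the actual I/O runs as a coroutine on searx's network event loop. The refactor moves the queue-draining loop into _stream_generator so a reference to it can be attached to the response; _close_response_method then exhausts that generator, which guarantees the queue is emptied, the httpx response is closed, and future.result() is reached even when the caller stops iterating early, so chunks no longer pile up unconsumed. Below is a minimal, self-contained sketch of this sync-over-async bridge; _produce and sync_generator are hypothetical stand-ins for stream_chunk_to_queue and _stream_generator, and get_loop() is simulated with a background thread:

    import asyncio
    import threading
    from queue import SimpleQueue

    # a dedicated event loop running in a background thread (stands in for get_loop())
    _loop = asyncio.new_event_loop()
    threading.Thread(target=_loop.run_forever, daemon=True).start()

    async def _produce(queue):
        # stands in for stream_chunk_to_queue(): put items, forward exceptions,
        # and always put the None sentinel so the sync consumer is unblocked
        try:
            for i in range(3):
                await asyncio.sleep(0.01)
                queue.put('chunk-%d' % i)
        except Exception as e:  # pylint: disable=broad-except
            queue.put(e)
        finally:
            queue.put(None)

    def sync_generator():
        # stands in for _stream_generator(): bridge the coroutine to sync callers
        queue = SimpleQueue()
        future = asyncio.run_coroutine_threadsafe(_produce(queue), _loop)
        item = queue.get()
        while item is not None:
            if isinstance(item, Exception):
                raise item
            yield item
            item = queue.get()
        future.result()  # propagate any error raised by the coroutine itself

    for chunk in sync_generator():
        print(chunk)

The None sentinel plus the trailing future.result() mirrors the real code: items flow through the thread-safe queue, and an exception raised inside the coroutine is re-raised in the consuming thread.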
@@ -6,6 +6,7 @@ import asyncio
 import logging
 import threading

+import anyio
 import httpcore
 import httpx
from httpx_socks import AsyncProxyTransport
@@ -102,6 +103,9 @@ class AsyncProxyTransportFixed(AsyncProxyTransport):
                 # then each new request creates a new stream and raise the same WriteError
                 await close_connections_for_url(self, url)
                 raise e
+            except anyio.ClosedResourceError as e:
+                await close_connections_for_url(self, url)
+                raise httpx.CloseError from e
             except httpx.RemoteProtocolError as e:
                 # in case of httpx.RemoteProtocolError: Server disconnected
                 await close_connections_for_url(self, url)
@@ -130,6 +134,9 @@ class AsyncHTTPTransportFixed(httpx.AsyncHTTPTransport):
                 # then each new request creates a new stream and raise the same WriteError
                 await close_connections_for_url(self._pool, url)
                 raise e
+            except anyio.ClosedResourceError as e:
+                await close_connections_for_url(self._pool, url)
+                raise httpx.CloseError from e
             except httpx.RemoteProtocolError as e:
                 # in case of httpx.RemoteProtocolError: Server disconnected
                 await close_connections_for_url(self._pool, url)
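Both transport subclasses gain the same three lines: a low-level anyio.ClosedResourceError, which would otherwise leak through the httpx abstraction, is converted into an httpx exception after the stale connections are dropped (httpx.CloseError exists in the httpx releases this code targets; later httpx versions reworked the exception hierarchy). A sketch of the translation idea in isolation; send_translating_errors is a hypothetical helper and transport is any httpx-style async transport:

    import anyio
    import httpx

    async def send_translating_errors(transport, *args, **kwargs):
        # forward the request through the transport, mapping anyio's error
        # onto the httpx hierarchy so callers only need to catch
        # httpx.HTTPError (the exact handle_async_request signature varies
        # across httpx versions, hence *args/**kwargs)
        try:
            return await transport.handle_async_request(*args, **kwargs)
        except anyio.ClosedResourceError as e:
            raise httpx.CloseError from e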
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later

+import gc
 import typing
 import types
 import functools
@@ -14,6 +15,7 @@ from langdetect.lang_detect_exception import LangDetectException
 import httpx

 from searx import network, logger
+from searx.utils import gen_useragent
 from searx.results import ResultContainer
 from searx.search.models import SearchQuery, EngineRef
 from searx.search.processors import EngineProcessor
@@ -58,7 +60,50 @@ def _is_url(url):


 @functools.lru_cache(maxsize=8192)
-def _is_url_image(image_url):
+def _download_and_check_if_image(image_url: str) -> bool:
+    """Download a URL and check if the Content-Type starts with "image/".
+    This function should not be called directly: use _is_url_image,
+    otherwise the functools.lru_cache cache may contain data: URLs, which can be huge.
+    """
+    retry = 2
+
+    while retry > 0:
+        a = time()
+        try:
+            # use "image_proxy" (avoid HTTP/2)
+            network.set_context_network_name('image_proxy')
+            stream = network.stream('GET', image_url, timeout=10.0, allow_redirects=True, headers={
+                'User-Agent': gen_useragent(),
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+                'Accept-Language': 'en-US;q=0.5,en;q=0.3',
+                'Accept-Encoding': 'gzip, deflate, br',
+                'DNT': '1',
+                'Connection': 'keep-alive',
+                'Upgrade-Insecure-Requests': '1',
+                'Sec-GPC': '1',
+                'Cache-Control': 'max-age=0'
+            })
+            r = next(stream)
+            r.close()
+            if r.status_code == 200:
+                is_image = r.headers.get('content-type', '').startswith('image/')
+            else:
+                is_image = False
+            del r
+            del stream
+            return is_image
+        except httpx.TimeoutException:
+            logger.error('Timeout for %s: %i', image_url, int(time() - a))
+            retry -= 1
+        except httpx.HTTPError:
+            logger.exception('Exception for %s', image_url)
+            return False
+    return False
+
+
+def _is_url_image(image_url) -> bool:
+    """Validate image_url, then delegate the download
+    to _download_and_check_if_image."""
     if not isinstance(image_url, str):
         return False

@@ -71,32 +116,7 @@ def _is_url_image(image_url):
     if not _is_url(image_url):
         return False

-    retry = 2
-
-    while retry > 0:
-        a = time()
-        try:
-            network.set_timeout_for_thread(10.0, time())
-            r = network.get(image_url, timeout=10.0, allow_redirects=True, headers={
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
-                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-                'Accept-Language': 'en-US;q=0.5,en;q=0.3',
-                'Accept-Encoding': 'gzip, deflate, br',
-                'DNT': '1',
-                'Connection': 'keep-alive',
-                'Upgrade-Insecure-Requests': '1',
-                'Sec-GPC': '1',
-                'Cache-Control': 'max-age=0'
-            })
-            if r.headers["content-type"].startswith('image/'):
-                return True
-            return False
-        except httpx.TimeoutException:
-            logger.error('Timeout for %s: %i', image_url, int(time() - a))
-            retry -= 1
-        except httpx.HTTPError:
-            logger.exception('Exception for %s', image_url)
-            return False
+    return _download_and_check_if_image(image_url)


 def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]:
@@ -414,3 +434,7 @@ class Checker:
     def run(self):
         for test_name in self.tests:
             self.run_test(test_name)
+            # clear the cache between tests
+            _download_and_check_if_image.cache_clear()
+            # force a garbage collection
+            gc.collect()
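The memory fix in this last file is twofold. First, the lru_cache now decorates _download_and_check_if_image, which only ever receives real URLs, instead of _is_url_image, which was also called with data: URLs; as the new docstring notes, cache keys are therefore ordinary URL strings rather than potentially huge data: URLs, and the cached values are plain booleans. Second, Checker.run() clears that cache and forces a garbage collection after every test. A small sketch of these functools cache controls; expensive_check is a hypothetical stand-in for _download_and_check_if_image:

    import functools
    import gc

    @functools.lru_cache(maxsize=8192)
    def expensive_check(key: str) -> bool:
        # the cached value is a small bool, so each of the up to 8192
        # entries costs little more than its key string
        return key.endswith('.png')

    expensive_check('https://example.org/a.png')
    print(expensive_check.cache_info())  # CacheInfo(hits=0, misses=1, maxsize=8192, currsize=1)

    # what Checker.run() now does between tests: drop every cached entry,
    # then ask the collector to reclaim whatever became unreachable
    expensive_check.cache_clear()
    gc.collect()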