477 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			477 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
|  | # SPDX-License-Identifier: AGPL-3.0-or-later | ||
|  | """Implementations for caching favicons.
 | ||
|  | 
 | ||
|  | :py:obj:`FaviconCacheConfig`: | ||
|  |   Configuration of the favicon cache | ||
|  | 
 | ||
|  | :py:obj:`FaviconCache`: | ||
|  |   Abstract base class for the implementation of a favicon cache. | ||
|  | 
 | ||
|  | :py:obj:`FaviconCacheSQLite`: | ||
|  |   Favicon cache that manages the favicon BLOBs in a SQLite DB. | ||
|  | 
 | ||
|  | :py:obj:`FaviconCacheNull`: | ||
|  |   Fallback solution if the configured cache cannot be used for system reasons. | ||
|  | 
 | ||
|  | ---- | ||
|  | 
 | ||
|  | """
 | ||
|  | 
 | ||
|  | from __future__ import annotations | ||
|  | from typing import Literal | ||
|  | 
 | ||
|  | import abc | ||
|  | import dataclasses | ||
|  | import hashlib | ||
|  | import logging | ||
|  | import pathlib | ||
|  | import sqlite3 | ||
|  | import tempfile | ||
|  | import time | ||
|  | import typer | ||
|  | 
 | ||
|  | from pydantic import BaseModel | ||
|  | 
 | ||
|  | from searx import sqlitedb | ||
|  | from searx import logger | ||
|  | from searx.utils import humanize_bytes, humanize_number | ||
|  | 
 | ||
# Global cache instance.  Only declared here; the object is assigned by init().
CACHE: "FaviconCache"
# Marker that stands in for real icon data when a resolver delivered no
# favicon (``data is None`` in the ``set`` methods of the cache classes).
FALLBACK_ICON = b"FALLBACK_ICON"

# Module logger: a child of the imported searx logger (rebinds the name).
logger = logger.getChild('favicons.cache')
# Typer application the CLI commands of this module are registered on.
app = typer.Typer()
|  | 
 | ||
|  | 
 | ||
@app.command()
def state():
    """show state of the cache"""
    # Build the statistics first, then print their default textual report.
    stats = CACHE.state()
    print(stats.report())
|  | 
 | ||
|  | 
 | ||
@app.command()
def maintenance(force: bool = True, debug: bool = False):
    """perform maintenance of the cache"""
    root_logger = logging.getLogger()
    if not debug:
        # Quiet mode: drop all handlers of the root logger and emit only the
        # bare messages of this module's logger.
        root_logger.handlers = []
        plain_handler = logging.StreamHandler()
        plain_handler.setFormatter(logging.Formatter("%(message)s"))
        logger.addHandler(plain_handler)
        logger.setLevel(logging.DEBUG)
    else:
        root_logger.setLevel(logging.DEBUG)

    # Snapshot the statistics before and after the run to report the difference.
    before = CACHE.state()
    CACHE.maintenance(force=force)
    after = CACHE.state()
    shrunk_by = before - after

    print("The cache has been reduced by:")
    listing = shrunk_by.report("\n- {descr}: {val}")
    print(listing.lstrip("\n"))
|  | 
 | ||
|  | 
 | ||
def init(cfg: "FaviconCacheConfig"):
    """Create the global ``CACHE`` object selected by ``cfg.db_type``.

    ``sqlite`` selects :py:obj:`FaviconCacheSQLite`, or :py:obj:`FaviconCacheNull`
    if the SQLite library is too old; ``mem`` selects :py:obj:`FaviconCacheMEM`.
    Any other value raises :py:obj:`NotImplementedError`.
    """

    global CACHE  # pylint: disable=global-statement

    db_type = cfg.db_type

    if db_type == "mem":
        logger.error("Favicons are cached in memory, don't use this in production!")
        CACHE = FaviconCacheMEM(cfg)
        return

    if db_type != "sqlite":
        raise NotImplementedError(f"favicons db_type '{cfg.db_type}' is unknown")

    # db_type is "sqlite": fall back to the null cache if the library is too old.
    if sqlite3.sqlite_version_info <= (3, 35):
        logger.critical(
            "Disable favicon caching completely: SQLite library (%s) is too old! (require >= 3.35)",
            sqlite3.sqlite_version,
        )
        CACHE = FaviconCacheNull(cfg)
        return

    CACHE = FaviconCacheSQLite(cfg)
|  | 
 | ||
|  | 
 | ||
class FaviconCacheConfig(BaseModel):
    """Configuration of the favicon cache.

    ``db_type`` selects the cache implementation that :py:obj:`init` creates.
    The remaining settings (``db_url`` and the upper case options) are read by
    :py:obj:`FaviconCacheSQLite`.
    """

    db_type: Literal["sqlite", "mem"] = "sqlite"
    """Type of the database:

    ``sqlite``:
      :py:obj:`.cache.FaviconCacheSQLite`

    ``mem``:
      :py:obj:`.cache.FaviconCacheMEM` (not recommended)
    """

    db_url: pathlib.Path = pathlib.Path(tempfile.gettempdir()) / "faviconcache.db"
    """URL of the SQLite DB, the path to the database file."""

    HOLD_TIME: int = 60 * 60 * 24 * 30  # 30 days
    """Hold time (default in sec.), after which a BLOB is removed from the cache."""

    LIMIT_TOTAL_BYTES: int = 1024 * 1024 * 50  # 50 MB
    """Maximum of bytes (default) stored in the cache of all blobs.  Note: The
    limit is only reached at each maintenance interval after which the oldest
    BLOBs are deleted; the limit is exceeded during the maintenance period. If
    the maintenance period is *too long* or maintenance is switched off
    completely, the cache grows uncontrollably."""

    BLOB_MAX_BYTES: int = 1024 * 20  # 20 KB
    """The maximum BLOB size in bytes that a favicon may have so that it can be
    saved in the cache.  If the favicon is larger, it is not saved in the cache
    and must be requested by the client via the proxy."""

    MAINTENANCE_PERIOD: int = 60 * 60
    """Maintenance period in seconds / when :py:obj:`MAINTENANCE_MODE` is set to
    ``auto``."""

    MAINTENANCE_MODE: Literal["auto", "off"] = "auto"
    """Type of maintenance mode

    ``auto``:
      Maintenance is carried out automatically as part of the maintenance
      intervals (:py:obj:`MAINTENANCE_PERIOD`); no external process is required.

    ``off``:
      Maintenance is switched off and must be carried out by an external process
      if required.
    """
|  | 
 | ||
|  | 
 | ||
@dataclasses.dataclass
class FaviconCacheStats:
    """Dataclass which provides information on the status of the cache.

    Every counter is optional; ``None`` means *not available* and is rendered
    as ``--`` by :py:obj:`FaviconCacheStats.report`.
    """

    favicons: int | None = None
    bytes: int | None = None
    domains: int | None = None
    resolvers: int | None = None

    # (field name, human readable description, formatter) of each reported
    # counter.  Not annotated, so it is a plain class attribute and not a
    # dataclass field.
    field_descr = (
        ("favicons", "number of favicons in cache", humanize_number),
        ("bytes", "total size (approx. bytes) of cache", humanize_bytes),
        ("domains", "total number of domains in cache", humanize_number),
        ("resolvers", "number of resolvers", str),
    )

    def __sub__(self, other) -> FaviconCacheStats:
        """Return the field-wise difference ``self - other``.

        A field that is ``None`` on either side is left unset (``None``) in
        the result.

        :raises TypeError: if ``other`` is not an instance of this class.
        """
        if not isinstance(other, self.__class__):
            # The message names the operator that is actually implemented here.
            raise TypeError(f"unsupported operand type(s) for -: '{self.__class__}' and '{type(other)}'")
        kwargs = {}
        for field, _, _ in self.field_descr:
            self_val, other_val = getattr(self, field), getattr(other, field)
            if None in (self_val, other_val):
                continue
            if isinstance(self_val, int):
                kwargs[field] = self_val - other_val
            else:
                kwargs[field] = self_val
        return self.__class__(**kwargs)

    def report(self, fmt: str = "{descr}: {val}\n") -> str:
        """Render all counters into one string.

        :param fmt: template applied to each counter; it may use the
          placeholders ``{descr}`` and ``{val}``.
        :returns: the concatenation of the rendered counters, in the order of
          ``field_descr``.
        """
        parts = []
        for field, descr, cast in self.field_descr:
            val = getattr(self, field)
            # Unset counters are shown as "--" instead of being formatted.
            shown = "--" if val is None else cast(val)
            parts.append(fmt.format(descr=descr, val=shown))
        return "".join(parts)
|  | 
 | ||
|  | 
 | ||
class FaviconCache(abc.ABC):
    """Abstract base class for the implementation of a favicon cache.

    A cache instance is callable: ``cache(resolver, authority)`` looks up an
    entry, :py:obj:`FaviconCache.set` stores one.
    """

    @abc.abstractmethod
    def __init__(self, cfg: FaviconCacheConfig):
        """An instance of the favicon cache is built up from the configuration."""

    @abc.abstractmethod
    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Returns ``None`` or the tuple of ``(data, mime)`` that has been
        registered in the cache.  The ``None`` indicates that there was no entry
        in the cache.  Both items of the tuple may be ``None``."""

    @abc.abstractmethod
    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Set data and mime-type in the cache.  If data is None, the
        :py:obj:`FALLBACK_ICON` is registered in the cache.  The return value
        tells whether the entry has been stored."""

    @abc.abstractmethod
    def state(self) -> FaviconCacheStats:
        """Returns a :py:obj:`FaviconCacheStats` (key/values) with information
        on the state of the cache."""

    @abc.abstractmethod
    def maintenance(self, force=False):
        """Performs maintenance on the cache.  What ``force`` means is up to
        the implementation."""
|  | 
 | ||
|  | 
 | ||
class FaviconCacheNull(FaviconCache):
    """Fallback cache that never stores anything.

    :py:obj:`init` selects it when :py:obj:`FaviconCacheSQLite` cannot be
    used because the installed SQLite library is too old.  Every lookup is a
    miss and every attempt to store an entry is rejected.
    """

    def __init__(self, cfg: FaviconCacheConfig):
        """Accept the configuration for interface compatibility; it is ignored."""

    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Always a cache miss."""
        return None

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Never stores anything, therefore always ``False``."""
        return False

    def state(self):
        """Statistics of an empty cache: zero favicons, all other counters unset."""
        empty = FaviconCacheStats(favicons=0)
        return empty

    def maintenance(self, force=False):
        """Nothing to maintain."""
|  | 
 | ||
|  | 
 | ||
class FaviconCacheSQLite(sqlitedb.SQLiteAppl, FaviconCache):
    """Favicon cache that manages the favicon BLOBs in a SQLite DB.  The DB
    model in the SQLite DB is implemented using the abstract class
    :py:obj:`sqlitedb.SQLiteAppl`.

    The following configurations are required / supported:

    - :py:obj:`FaviconCacheConfig.db_url`
    - :py:obj:`FaviconCacheConfig.HOLD_TIME`
    - :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES`
    - :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`
    - :py:obj:`FaviconCacheConfig.MAINTENANCE_PERIOD`
    - :py:obj:`FaviconCacheConfig.MAINTENANCE_MODE`
    """

    DB_SCHEMA = 1

    DDL_BLOBS = """\
CREATE TABLE IF NOT EXISTS blobs (
  sha256     TEXT,
  bytes_c    INTEGER,
  mime       TEXT NOT NULL,
  data       BLOB NOT NULL,
  PRIMARY KEY (sha256))"""

    """Table to store BLOB objects by their sha256 hash values."""

    DDL_BLOB_MAP = """\
CREATE TABLE IF NOT EXISTS blob_map (
    m_time     INTEGER DEFAULT (strftime('%s', 'now')),  -- last modified (unix epoch) time in sec.
    sha256     TEXT,
    resolver   TEXT,
    authority  TEXT,
    PRIMARY KEY (resolver, authority))"""

    """Table to map from (resolver, authority) to sha256 hash values."""

    DDL_CREATE_TABLES = {
        "blobs": DDL_BLOBS,
        "blob_map": DDL_BLOB_MAP,
    }

    SQL_DROP_LEFTOVER_BLOBS = (
        "DELETE FROM blobs WHERE sha256 IN ("
        " SELECT b.sha256"
        "   FROM blobs b"
        "   LEFT JOIN blob_map bm"
        "     ON b.sha256 = bm.sha256"
        "  WHERE bm.sha256 IS NULL)"
    )
    """Delete blobs.sha256 (BLOBs) no longer in blob_map.sha256."""

    SQL_ITER_BLOBS_SHA256_BYTES_C = (
        "SELECT b.sha256, b.bytes_c FROM blobs b"
        "  JOIN blob_map bm "
        "    ON b.sha256 = bm.sha256"
        " ORDER BY bm.m_time ASC"
    )

    SQL_INSERT_BLOBS = (
        "INSERT INTO blobs (sha256, bytes_c, mime, data) VALUES (?, ?, ?, ?)"
        "    ON CONFLICT (sha256) DO NOTHING"
    )  # fmt: skip

    SQL_INSERT_BLOB_MAP = (
        "INSERT INTO blob_map (sha256, resolver, authority) VALUES (?, ?, ?)"
        "    ON CONFLICT DO UPDATE "
        "   SET sha256=excluded.sha256, m_time=strftime('%s', 'now')"
    )

    def __init__(self, cfg: FaviconCacheConfig):
        """An instance of the favicon cache is built up from the configuration."""

        # db_url is typed as pathlib.Path; a Path never compares equal to a
        # str, so the check has to be done on its string form.
        if str(cfg.db_url) == ":memory:":
            logger.critical("don't use SQLite DB in :memory: in production!!")
        super().__init__(cfg.db_url)
        self.cfg = cfg

    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Look up the favicon registered for ``(resolver, authority)``.

        :returns: ``None`` if there is no mapping for the pair; otherwise the
          tuple ``(data, mime)``.  The tuple is ``(None, None)`` if the mapping
          points to :py:obj:`FALLBACK_ICON` or if the mapped BLOB does not
          exist in the ``blobs`` table.
        """

        sql = "SELECT sha256 FROM blob_map WHERE resolver = ? AND authority = ?"
        res = self.DB.execute(sql, (resolver, authority)).fetchone()
        if res is None:
            return None

        data, mime = (None, None)
        sha256 = res[0]
        if sha256 == FALLBACK_ICON:
            return data, mime

        sql = "SELECT data, mime FROM blobs WHERE sha256 = ?"
        res = self.DB.execute(sql, (sha256,)).fetchone()
        if res is not None:
            data, mime = res
        return data, mime

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Register ``data`` / ``mime`` for ``(resolver, authority)``.

        If ``data`` is ``None`` the pair is mapped to :py:obj:`FALLBACK_ICON`
        and no BLOB is stored.  In maintenance mode ``auto`` a maintenance
        cycle is run first once the maintenance period has expired.

        :returns: ``False`` if nothing has been stored (``data`` without a
          mime-type, or ``data`` larger than ``BLOB_MAX_BYTES``), otherwise
          ``True``.
        """

        if self.cfg.MAINTENANCE_MODE == "auto" and int(time.time()) > self.next_maintenance_time:
            # Should automatic maintenance be moved to a new thread?
            self.maintenance()

        if data is not None and mime is None:
            logger.error(
                "favicon resolver %s tries to cache mime-type None for authority %s",
                resolver,
                authority,
            )
            return False

        bytes_c = len(data or b"")
        if bytes_c > self.cfg.BLOB_MAX_BYTES:
            # Same message text as before, formatted lazily by the logger.
            logger.info(
                "favicon of resolver: %s / authority: %s to big to cache (bytes: %s) ",
                resolver,
                authority,
                bytes_c,
            )
            return False

        if data is None:
            sha256 = FALLBACK_ICON
        else:
            sha256 = hashlib.sha256(data).hexdigest()

        with self.connect() as conn:
            # The fallback marker has no BLOB, only the mapping is written.
            if sha256 != FALLBACK_ICON:
                conn.execute(self.SQL_INSERT_BLOBS, (sha256, bytes_c, mime, data))
            conn.execute(self.SQL_INSERT_BLOB_MAP, (sha256, resolver, authority))

        return True

    @property
    def next_maintenance_time(self) -> int:
        """Returns (unix epoch) time of the next maintenance."""

        # presumably m_time() is the unix epoch (sec.) at which the property
        # was last set -- verify against sqlitedb.SQLiteAppl
        return self.cfg.MAINTENANCE_PERIOD + self.properties.m_time("LAST_MAINTENANCE")

    def maintenance(self, force=False):
        """Remove outdated entries and shrink the cache to its size limit.

        1. Mappings older than ``HOLD_TIME`` are deleted, followed by the
           BLOBs that no mapping refers to any more.
        2. If the BLOBs still sum up to more than ``LIMIT_TOTAL_BYTES``, BLOBs
           are deleted in the order of their oldest mapping (together with all
           mappings that refer to them) until more than the excess is removed.

        :param force: run the maintenance even if the next maintenance time
          has not been reached yet.
        """

        # Prevent parallel DB maintenance cycles from other DB connections
        # (e.g. in multi thread or process environments).

        if not force and int(time.time()) < self.next_maintenance_time:
            logger.debug("no maintenance required yet, next maintenance interval is in the future")
            return
        self.properties.set("LAST_MAINTENANCE", "")  # hint: this (also) sets the m_time of the property!

        # do maintenance tasks

        with self.connect() as conn:

            # drop items not in HOLD time
            res = conn.execute(
                "DELETE FROM blob_map"
                " WHERE cast(m_time as integer) < cast(strftime('%s', 'now') as integer) - ?",
                (self.cfg.HOLD_TIME,),
            )
            logger.debug("dropped %s obsolete blob_map items from db", res.rowcount)
            res = conn.execute(self.SQL_DROP_LEFTOVER_BLOBS)
            logger.debug("dropped %s obsolete BLOBS from db", res.rowcount)

            # drop old items to be in LIMIT_TOTAL_BYTES
            total_bytes = conn.execute("SELECT SUM(bytes_c) FROM blobs").fetchone()[0] or 0
            if total_bytes > self.cfg.LIMIT_TOTAL_BYTES:

                excess = total_bytes - self.cfg.LIMIT_TOTAL_BYTES
                dropped_bytes = 0
                # sha256 --> bytes_c of the BLOBs to drop (insertion ordered)
                victims = {}
                for sha256, bytes_c in conn.execute(self.SQL_ITER_BLOBS_SHA256_BYTES_C):
                    # The join yields one row per mapping: a BLOB shared by
                    # several mappings must be counted only once, otherwise
                    # the loop stops before enough bytes are selected.
                    if sha256 in victims:
                        continue
                    victims[sha256] = bytes_c
                    dropped_bytes += bytes_c
                    if dropped_bytes > excess:
                        break
                if victims:
                    # Bound parameters, one statement per hash: no SQL is built
                    # from values and no limit on the number of variables applies.
                    sha_params = [(sha256,) for sha256 in victims]
                    conn.executemany("DELETE FROM blobs WHERE sha256 = ?", sha_params)
                    conn.executemany("DELETE FROM blob_map WHERE sha256 = ?", sha_params)
                    logger.debug("dropped %s blobs with total size of %s bytes", len(victims), dropped_bytes)

    def _query_val(self, sql, default=None):
        """Return the first column of the first row of ``sql``, or ``default``
        if there is no row or the value is NULL."""
        val = self.DB.execute(sql).fetchone()
        if val is not None:
            val = val[0]
        if val is None:
            val = default
        return val

    def state(self) -> FaviconCacheStats:
        """Count BLOBs, their total size and the distinct authorities and
        resolvers that are stored in the DB."""
        return FaviconCacheStats(
            favicons=self._query_val("SELECT count(*) FROM blobs", 0),
            bytes=self._query_val("SELECT SUM(bytes_c) FROM blobs", 0),
            domains=self._query_val("SELECT count(*) FROM (SELECT authority FROM blob_map GROUP BY authority)", 0),
            resolvers=self._query_val("SELECT count(*) FROM (SELECT resolver FROM blob_map GROUP BY resolver)", 0),
        )
|  | 
 | ||
|  | 
 | ||
class FaviconCacheMEM(FaviconCache):
    """Proof-of-concept favicon cache that keeps everything in two dicts in
    the memory of the process.

    .. attention::

       Don't use it in production, it will blow up your memory!!

    """

    def __init__(self, cfg):
        """Keep the configuration and create the empty stores."""
        self.cfg = cfg
        # sha256 hex digest --> favicon bytes
        self._data = {}
        # "<resolver>:<authority>" --> (sha256 hex digest, mime-type)
        self._sha_mime = {}

    def __call__(self, resolver: str, authority: str) -> None | tuple[bytes | None, str | None]:
        """Return ``None`` on a cache miss, otherwise ``(data, mime)``;
        ``data`` is ``None`` if the fallback icon is registered."""

        entry = self._sha_mime.get(f"{resolver}:{authority}")
        if entry is None:
            return None

        digest, mime = entry
        blob = self._data.get(digest)
        return (None if blob == FALLBACK_ICON else blob), mime

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Store ``data`` / ``mime`` for ``(resolver, authority)``.  Without
        ``data`` the fallback icon is stored; ``data`` without a mime-type is
        rejected (``False``)."""

        if data is not None and mime is None:
            logger.error(
                "favicon resolver %s tries to cache mime-type None for authority %s",
                resolver,
                authority,
            )
            return False

        if data is None:
            data, mime = FALLBACK_ICON, None

        key = f"{resolver}:{authority}"
        digest = hashlib.sha256(data).hexdigest()
        self._data[digest] = data
        self._sha_mime[key] = (digest, mime)
        return True

    def state(self):
        """Only the number of stored BLOBs is reported."""
        return FaviconCacheStats(favicons=len(self._data))

    def maintenance(self, force=False):
        """No maintenance is implemented for the in-memory cache."""