diff --git a/AUTHORS.rst b/AUTHORS.rst
index 95d154b12..a0258b9b1 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -175,3 +175,4 @@ features or generally made searx better:
 - Daniel Kukula `<https://github.com/dkuku>`
 - Patrick Evans `https://github.com/holysoles`
 - Daniel Mowitz `<https://daniel.mowitz.rocks>`
+- SentientTapeDrive `<https://github.com/SentientTapeDrive>`_ `<https://thefubar.company>`_
diff --git a/docs/dev/engines/online/kagi.rst b/docs/dev/engines/online/kagi.rst
new file mode 100644
index 000000000..f8835c628
--- /dev/null
+++ b/docs/dev/engines/online/kagi.rst
@@ -0,0 +1,105 @@
.. _kagi engine:

Kagi
====

The Kagi engine scrapes search results from Kagi's HTML search interface.

Example
-------

Configuration
~~~~~~~~~~~~~

.. code:: yaml

   - name: kagi
     engine: kagi
     shortcut: kg
     categories: [general, web]
     timeout: 4.0
     api_key: "YOUR-KAGI-TOKEN"  # required
     about:
       website: https://kagi.com
       use_official_api: false
       require_api_key: true
       results: HTML

Parameters
~~~~~~~~~~

``api_key`` : required
  The Kagi API token used for authentication.  It can be obtained from your
  Kagi account settings.

``pageno`` : optional
  The page number for paginated results.  Defaults to 1.

Example Request
~~~~~~~~~~~~~~~

The token is a module attribute (normally populated from ``settings.yml``);
the engine fills in the request URL and the browser-like headers itself:

.. code:: python

    from searx.engines import kagi

    kagi.api_key = 'YOUR-KAGI-TOKEN'

    params = {
        'pageno': 1,
        'headers': {},
        'cookies': {},
    }
    request_params = kagi.request('test query', params)

Example Response
~~~~~~~~~~~~~~~~

.. code:: python

    [
        # Search result
        {
            'url': 'https://example.com/',
            'title': 'Example Title',
            'content': 'Example content snippet...',
            'domain': 'example.com'
        }
    ]

Implementation
--------------

The engine performs the following steps:

1. Constructs a GET request to ``https://kagi.com/html/search`` with:

   - ``q`` parameter for the search query
   - ``token`` parameter for authentication; omitted once session cookies
     have been captured (see the sketch after this list)
   - ``batch`` parameter for pagination

2. Parses the HTML response using XPath to extract:

   - Result titles
   - URLs
   - Content snippets
   - Domain information

3. Handles various error cases:

   - 401: Invalid API token
   - 429: Rate limit exceeded
   - Other non-200 status codes
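
As a quick illustration of step 1, the request URL can be rebuilt with
nothing but ``urllib.parse.urlencode``; the token value is a placeholder:

.. code:: python

    from urllib.parse import urlencode

    base_url = 'https://kagi.com/html/search'

    # First request: no session cookies are cached yet, so the token is sent.
    url = base_url + '?' + urlencode({'q': 'test query', 'token': 'YOUR-KAGI-TOKEN', 'batch': 2})
    print(url)
    # https://kagi.com/html/search?q=test+query&token=YOUR-KAGI-TOKEN&batch=2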

Dependencies
------------

- lxml: For HTML parsing and XPath evaluation
- urllib.parse: For URL handling and encoding
- searx.utils: For text extraction and XPath helpers

Notes
-----

- The engine requires a valid Kagi API token to function
- Results are scraped from Kagi's HTML interface rather than using an official API
- Rate limiting may apply based on your Kagi subscription level
- The engine sets specific browser-like headers to ensure reliable scraping
diff --git a/searx/engines/kagi.py b/searx/engines/kagi.py
new file mode 100644
index 000000000..84c0006f4
--- /dev/null
+++ b/searx/engines/kagi.py
@@ -0,0 +1,148 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Kagi Search

Scrapes Kagi's HTML search results.
"""

from urllib.parse import urlencode
from lxml import html

from searx.utils import extract_text, eval_xpath, eval_xpath_list
from searx.exceptions import SearxEngineAPIException
from searx import logger

logger = logger.getChild('kagi')

about = {
    "website": 'https://kagi.com',
    "wikidata_id": None,
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": True,
    "results": 'HTML',
}

categories = ['general', 'web']
paging = True
time_range_support = False
safesearch = False

base_url = 'https://kagi.com/html/search'

api_key = None  # Set in settings.yml

# Module-level cookie cache: once Kagi has answered a token-authenticated
# request, these session cookies are reused so the token can be left out
# of follow-up requests.
kagi_cookies = {'kagi_session': None, '_kagi_search_': None}


def request(query, params):
    if not api_key:
        raise SearxEngineAPIException('missing Kagi API key')

    page = params['pageno']

    if 'cookies' not in params:
        params['cookies'] = {}

    # Merge the cached cookies into the request, but never overwrite values
    # the caller already set and never send unset (None) values.
    for name, value in kagi_cookies.items():
        if value is not None and name not in params['cookies']:
            params['cookies'][name] = value

    session_cookie = params['cookies'].get('kagi_session')
    search_cookie = params['cookies'].get('_kagi_search_')

    if session_cookie and search_cookie:
        logger.debug(
            "using Kagi cookies for authentication - session: %s, search: %s",
            session_cookie,
            search_cookie,
        )
        search_url = base_url + '?' + urlencode({'q': query, 'batch': page})
    else:
        missing = [name for name in ('kagi_session', '_kagi_search_') if not params['cookies'].get(name)]
        logger.debug("missing cookies %s, using the API key for initial authentication", missing)
        search_url = base_url + '?' + urlencode({'q': query, 'token': api_key, 'batch': page})

    params['url'] = search_url
    params['headers'].update(
        {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
        }
    )
    params['allow_redirects'] = True
    params['verify'] = True
    params['max_redirects'] = 1

    return params
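

# response() below does three things:
#   1. it captures ``Set-Cookie`` headers and refreshes the module-level
#      cookie cache, so later requests can omit the ``token`` parameter,
#   2. it maps error status codes (401 invalid token, 429 rate limit,
#      any other non-200) to SearxEngineAPIException,
#   3. it extracts title, URL, snippet and domain from each result block
#      via XPath.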
def response(resp):
    results = []

    if 'set-cookie' in resp.headers:
        for cookie in resp.headers.get_list('set-cookie'):
            cookie_parts = cookie.split('=', 1)
            if len(cookie_parts) != 2:
                logger.warning("failed to parse Kagi cookie: %s", cookie)
                continue

            name = cookie_parts[0].strip()
            value = cookie_parts[1].split(';')[0].strip()

            if name == 'kagi_session':
                if value != kagi_cookies['kagi_session']:
                    kagi_cookies['kagi_session'] = value
                    resp.search_params['cookies']['kagi_session'] = value
                    logger.debug("updated kagi_session cookie")
            elif name == '_kagi_search_':  # exact match for the search cookie
                if value != kagi_cookies['_kagi_search_']:
                    kagi_cookies['_kagi_search_'] = value
                    resp.search_params['cookies']['_kagi_search_'] = value
                    logger.debug("updated _kagi_search_ cookie")

    logger.debug(
        "global Kagi cookies - session: %s, search: %s",
        kagi_cookies['kagi_session'],
        kagi_cookies['_kagi_search_'],
    )
    logger.debug(
        "request Kagi cookies - session: %s, search: %s",
        resp.search_params['cookies'].get('kagi_session'),
        resp.search_params['cookies'].get('_kagi_search_'),
    )

    if resp.status_code == 401:
        kagi_cookies['kagi_session'] = None
        kagi_cookies['_kagi_search_'] = None
        resp.search_params['cookies'].clear()
        logger.debug("cleared invalid Kagi cookies")
        raise SearxEngineAPIException('Invalid Kagi authentication')
    if resp.status_code == 429:
        raise SearxEngineAPIException('Kagi rate limit exceeded')
    if resp.status_code != 200:
        raise SearxEngineAPIException(f'Unexpected HTTP status code: {resp.status_code}')

    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[contains(@class, "_0_SRI")]'):
        try:
            title_tag = eval_xpath(result, './/a[contains(@class, "__sri_title_link")]')[0]
        except IndexError:
            # skip result blocks without a title link
            continue

        title = extract_text(title_tag)
        url = title_tag.get('href')
        content_tag = eval_xpath(result, './/div[contains(@class, "__sri-desc")]')
        content = extract_text(content_tag[0]) if content_tag else ''
        domain = eval_xpath(result, './/span[contains(@class, "host")]/text()')
        domain = domain[0] if domain else ''

        results.append({'url': url, 'title': title, 'content': content, 'domain': domain})

    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 23092b800..ff42679b8 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -2484,6 +2484,19 @@ engines:
     shortcut: pgo
     disabled: true
 
+  - name: kagi
+    engine: kagi
+    shortcut: kg
+    categories: [general, web]
+    disabled: true
+    timeout: 4.0
+    api_key: ""
+    about:
+      website: https://kagi.com
+      use_official_api: false
+      require_api_key: true
+      results: HTML
+
   # Doku engine lets you access to any Doku wiki instance:
   # A public one or a privete/corporate one.
   # - name: ubuntuwiki
diff --git a/tests/unit/test_engine_kagi.py b/tests/unit/test_engine_kagi.py
new file mode 100644
index 000000000..4ff6a52ca
--- /dev/null
+++ b/tests/unit/test_engine_kagi.py
@@ -0,0 +1,152 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring

import mock

from searx.engines import kagi
from searx.exceptions import SearxEngineAPIException
from tests import SearxTestCase


class TestKagiEngine(SearxTestCase):

    def setUp(self):
        self.test_html = """
        <div class="_0_main-search-results">
            <div class="_0_SRI search-result">
                <div class="_0_TITLE __sri-title">
                    <h3 class="__sri-title-box">
                        <a class="__sri_title_link _ext_t" href="https://example1.com">Result 1</a>
                    </h3>
                </div>
                <div class="__sri-url-box">
                    <span class="host">example1.com</span>
                </div>
                <div class="__sri-body">
                    <div class="__sri-desc">Content 1</div>
                </div>
            </div>
            <div class="_0_SRI search-result">
                <div class="_0_TITLE __sri-title">
                    <h3 class="__sri-title-box">
                        <a class="__sri_title_link _ext_t" href="https://example2.com">Result 2</a>
                    </h3>
                </div>
                <div class="__sri-url-box">
                    <span class="host">example2.com</span>
                </div>
                <div class="__sri-body">
                    <div class="__sri-desc">Content 2</div>
                </div>
            </div>
        </div>
        """

    def test_request(self):
        # Test with missing API key
        kagi.api_key = None
        params = {'pageno': 1, 'headers': {}}
        self.assertRaises(SearxEngineAPIException, kagi.request, 'test query', params)

        # Test with valid API key but no cookies
        kagi.api_key = 'test_token'
        params = {'pageno': 1, 'headers': {}, 'cookies': {}}
        query = 'test query'
        request_params = kagi.request(query, params)

        self.assertIn('url', request_params)
        self.assertIn('token=test_token', request_params['url'])
        self.assertIn('q=test+query', request_params['url'])
        self.assertEqual(request_params['max_redirects'], 1)
        self.assertTrue(request_params['allow_redirects'])

        # Test with both required cookies
        params['cookies']['kagi_session'] = 'test_session'
        params['cookies']['_kagi_search_'] = 'test_search'
        request_params = kagi.request(query, params)
        self.assertNotIn('token=', request_params['url'])
        self.assertIn('q=test+query', request_params['url'])
        self.assertEqual(request_params['max_redirects'], 1)
        self.assertTrue(request_params['allow_redirects'])

        # Test with missing search cookie
        params['cookies'] = {'kagi_session': 'test_session'}
        request_params = kagi.request(query, params)
        self.assertIn('token=', request_params['url'])

        # Test with missing session cookie
        params['cookies'] = {'_kagi_search_': 'test_search'}
        request_params = kagi.request(query, params)
        self.assertIn('token=', request_params['url'])

        # Test pagination
        params['pageno'] = 2
        request_params = kagi.request(query, params)
        self.assertIn('batch=2', request_params['url'])
        self.assertEqual(request_params['max_redirects'], 1)
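
    # ``mock.Mock`` supports attaching supported magic methods directly on an
    # instance (unittest.mock installs them on the instance's class), so
    # assigning ``__contains__`` in the helper below is enough to make the
    # membership check ``'set-cookie' in resp.headers`` return True.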
    def test_response(self):
        def verify_cookie_capture(cookie_headers, expected_session, expected_search):
            mock_headers = mock.Mock()
            mock_headers.get_list = mock.Mock(return_value=cookie_headers)
            mock_headers.__contains__ = mock.Mock(return_value=True)

            response = mock.Mock(
                text=self.test_html, status_code=200, headers=mock_headers, search_params={'cookies': {}}
            )
            results = kagi.response(response)

            self.assertEqual(response.search_params['cookies'].get('kagi_session'), expected_session)
            self.assertEqual(response.search_params['cookies'].get('_kagi_search_'), expected_search)
            return results

        # Test cookie capture with standard attributes
        results = verify_cookie_capture(
            ['kagi_session=test_session; Path=/; HttpOnly', '_kagi_search_=test_search; Path=/; HttpOnly'],
            'test_session',
            'test_search',
        )

        # Test cookie capture with additional attributes
        results = verify_cookie_capture(
            [
                'kagi_session=test_session2; Path=/; HttpOnly; SameSite=Lax',
                '_kagi_search_=test_search2; Domain=.kagi.com; Path=/; SameSite=Lax',
            ],
            'test_session2',
            'test_search2',
        )

        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 2)  # two search results

        # Check first result
        self.assertEqual(results[0]['title'], 'Result 1')
        self.assertEqual(results[0]['url'], 'https://example1.com')
        self.assertEqual(results[0]['content'], 'Content 1')
        self.assertEqual(results[0]['domain'], 'example1.com')

        # Check second result
        self.assertEqual(results[1]['title'], 'Result 2')
        self.assertEqual(results[1]['url'], 'https://example2.com')
        self.assertEqual(results[1]['content'], 'Content 2')
        self.assertEqual(results[1]['domain'], 'example2.com')

    def test_response_error_handling(self):
        # Test invalid token/cookie response
        response = mock.Mock(
            text='', status_code=401, search_params={'cookies': {'kagi_session': 'invalid_session'}}, headers={}
        )
        self.assertRaises(SearxEngineAPIException, kagi.response, response)
        # Verify the invalid cookie was cleared
        self.assertNotIn('kagi_session', response.search_params['cookies'])

        # Test rate limit response
        response = mock.Mock(text='', status_code=429, search_params={'cookies': {}}, headers={})
        self.assertRaises(SearxEngineAPIException, kagi.response, response)

        # Test other error response
        response = mock.Mock(text='', status_code=500, search_params={'cookies': {}}, headers={})
        self.assertRaises(SearxEngineAPIException, kagi.response, response)
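
    def test_response_missing_domain(self):
        # Illustrative edge case (minimal hand-written markup, not captured
        # from Kagi): a result block without a ``host`` span should fall
        # back to an empty domain instead of failing.
        minimal_html = """
        <div class="_0_SRI">
            <a class="__sri_title_link" href="https://example3.com">Result 3</a>
        </div>
        """
        response = mock.Mock(text=minimal_html, status_code=200, headers={}, search_params={'cookies': {}})
        results = kagi.response(response)

        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['url'], 'https://example3.com')
        self.assertEqual(results[0]['title'], 'Result 3')
        self.assertEqual(results[0]['content'], '')
        self.assertEqual(results[0]['domain'], '')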