chore: update md, py, toml files

commit ccb50fbdbb

37 .gitignore vendored Normal file
@@ -0,0 +1,37 @@
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
.pytest_cache/
.coverage
htmlcov/
.tox/
.nox/
.hypothesis/
*.log
.env
.venv
venv/
ENV/
.idea/
.vscode/
*.swp
*.swo
*~
.DS_Store

10 CHANGELOG.md Normal file
@@ -0,0 +1,10 @@
# Changelog



## Version 1.1.0 - 2026-01-01

update md, py, toml files

**Changes:** 10 files, 1034 lines
**Languages:** Markdown (182 lines), Other (68 lines), Python (729 lines), TOML (50 lines), Text (5 lines)

31 Makefile Normal file
@@ -0,0 +1,31 @@
.PHONY: install dev run test test-unit test-integration clean build uninstall

install:
	pip install -r requirements.txt

dev:
	pip install -e .

run:
	python -m rsearch

test: test-integration

test-unit:
	pytest tests/ -v --ignore=tests/test_providers.py

test-integration:
	pytest tests/test_providers.py -v

test-quick:
	curl -s "http://localhost:8080/health" | python -m json.tool
	curl -s "http://localhost:8080/search?query=python&count=3" | python -m json.tool

clean:
	rm -rf __pycache__ *.egg-info dist build .eggs rsearch/__pycache__ tests/__pycache__ .pytest_cache

build:
	python -m build

uninstall:
	pip uninstall -y rsearch

182 README.md Normal file
@@ -0,0 +1,182 @@
# rsearch

Author: retoor <retoor@molodetz.nl>

Multi-source search aggregator API that queries multiple search engines and returns unified results without requiring API keys.

## Features

- 7 search providers with automatic fallback
- No API keys required (HTML scraping + public APIs)
- Async architecture for performance
- Unified JSON response format
- Fixed provider ordering by result quality
- Comprehensive integration tests

## Search Providers

| Provider | Type | Description |
|----------|------|-------------|
| Brave | Scraping | High quality web results |
| DuckDuckGo HTML | Scraping | Reliable lightweight version |
| Bing | Scraping | Microsoft search engine |
| Mojeek | Scraping | Independent search index |
| DuckDuckGo | API | Instant answers |
| Wikipedia | API | Encyclopedia reference |
| Wikidata | API | Structured knowledge base |

## Installation

Install dependencies:

```bash
pip install -r requirements.txt
```

Install as package (development mode):

```bash
pip install -e .
```

Install with test dependencies:

```bash
pip install -e ".[test]"
```

Or using make:

```bash
make dev
```

## Usage

Run as module:

```bash
python -m rsearch
```

Or after installation:

```bash
rsearch
```

Or using make:

```bash
make run
```
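Or drive the aggregator directly from asyncio code, without the HTTP server (a minimal sketch, assuming the package is installed):

```python
# Sketch: call MultiSearch.search() directly instead of going through HTTP.
import asyncio

from rsearch import MultiSearch


async def demo() -> None:
    searcher = MultiSearch()
    # Returns the unified response dict of the first provider that succeeds.
    result = await searcher.search("python programming", 3)
    print(result["source"], result["count"])
    for item in result["results"]:
        print("-", item["title"], item["url"])


asyncio.run(demo())
```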

### Command Line Options

```
usage: rsearch [-h] [-H HOST] [-p PORT] [-l {DEBUG,INFO,WARNING,ERROR}] [-v]

options:
  -h, --help        show help message
  -H, --host HOST   Host to bind to (default: 0.0.0.0)
  -p, --port PORT   Port to listen on (default: 8080)
  -l, --log-level   Log level: DEBUG, INFO, WARNING, ERROR (default: INFO)
  -v, --version     show version number
```

Examples:

```bash
rsearch --port 9000                    # Run on port 9000
rsearch --host 127.0.0.1 --port 3000   # Bind to localhost:3000
rsearch --log-level DEBUG              # Enable debug logging
```

## Testing

Run integration tests:

```bash
make test
```

Or directly with pytest:

```bash
pytest tests/test_providers.py -v
```

Quick API test (requires running server):

```bash
make test-quick
```

## API Endpoints

### Search

```
GET /search?query=<q>&count=<n>
```

Parameters:
- `query`: Search term (required)
- `count`: Number of results (default: 10, max: 100)

Response:
```json
{
  "query": "python",
  "source": "brave",
  "count": 3,
  "results": [
    {
      "title": "Welcome to Python.org",
      "url": "https://www.python.org/",
      "description": "The official home of the Python Programming Language",
      "source": "brave",
      "extra": {}
    }
  ],
  "timestamp": "2024-01-01T12:00:00.000000Z",
  "success": true,
  "error": null
}
```
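
For example, the endpoint can be queried from Python with aiohttp (a sketch only; `fetch_results` is an illustrative helper, and a server is assumed to be running on localhost:8080):

```python
# Sketch of a /search client; fetch_results is not part of the rsearch package.
import asyncio

import aiohttp


async def fetch_results(query: str, count: int = 10) -> list:
    async with aiohttp.ClientSession() as session:
        async with session.get(
            "http://localhost:8080/search",
            params={"query": query, "count": str(count)},
        ) as resp:
            data = await resp.json()
    # Failures still come back as JSON, with success=false and an error message.
    if not data.get("success"):
        raise RuntimeError(data.get("error") or "search failed")
    return data["results"]


if __name__ == "__main__":
    for item in asyncio.run(fetch_results("python", count=3)):
        print(item["source"], item["title"], item["url"])
```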

### Health Check

```
GET /health
```

Response:
```json
{
  "status": "ok",
  "services": ["brave", "duckduckgo_html", "bing", "mojeek", "duckduckgo", "wikipedia", "wikidata"],
  "timestamp": "2024-01-01T12:00:00.000000Z"
}
```

## Project Structure

```
rsearch/
├── rsearch/
│   ├── __init__.py
│   ├── __main__.py
│   └── app.py
├── tests/
│   ├── __init__.py
│   └── test_providers.py
├── requirements.txt
├── pyproject.toml
├── Makefile
└── README.md
```

## License

MIT

50 pyproject.toml Normal file
@@ -0,0 +1,50 @@
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "rsearch"
version = "1.1.0"
description = "Multi-source search aggregator API"
authors = [
    {name = "retoor", email = "retoor@molodetz.nl"}
]
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.9"
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
]
dependencies = [
    "aiohttp>=3.9.0",
    "beautifulsoup4>=4.12.0",
    "lxml>=5.0.0",
]

[project.optional-dependencies]
test = [
    "pytest>=7.0.0",
    "pytest-asyncio>=0.21.0",
]

[project.scripts]
rsearch = "rsearch:main"

[project.urls]
Homepage = "https://github.com/retoor/rsearch"

[tool.setuptools.packages.find]
where = ["."]
include = ["rsearch*"]

[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]

5 requirements.txt Normal file
@@ -0,0 +1,5 @@
aiohttp>=3.9.0
beautifulsoup4>=4.12.0
lxml>=5.0.0
pytest>=7.0.0
pytest-asyncio>=0.21.0

6 rsearch/__init__.py Normal file
@@ -0,0 +1,6 @@
# retoor <retoor@molodetz.nl>

from rsearch.app import MultiSearch, create_app, main

__version__ = "1.1.0"
__all__ = ["MultiSearch", "create_app", "main"]

6 rsearch/__main__.py Normal file
@@ -0,0 +1,6 @@
# retoor <retoor@molodetz.nl>

from rsearch.app import main

if __name__ == "__main__":
    main()

549 rsearch/app.py Normal file
@@ -0,0 +1,549 @@
# retoor <retoor@molodetz.nl>

import aiohttp
import argparse
import asyncio
from aiohttp import web
from datetime import datetime
from html import unescape
from typing import Dict, Optional, List, Callable
from urllib.parse import unquote, parse_qs, urlparse
import random
import logging
import re
import sys
from bs4 import BeautifulSoup
logger = logging.getLogger("search-api")


class MultiSearch:
    def __init__(self):
        self.services: List[Callable[[str, int], asyncio.Future]] = [
            self.brave_search,
            self.duckduckgo_html_search,
            self.bing_search,
            self.mojeek_search,
            self.duckduckgo_search,
            self.wikipedia_search,
            self.wikidata_search,
        ]
        self.user_agents = [
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
        ]

    def _headers(self) -> Dict[str, str]:
        return {
            "User-Agent": random.choice(self.user_agents),
            "Accept": "application/json, text/html;q=0.9",
            "Accept-Language": "en-US,en;q=0.9",
            "Connection": "keep-alive",
        }

    async def _safe_get(
        self,
        session: aiohttp.ClientSession,
        url: str,
        params: Optional[Dict] = None,
        timeout: int = 10,
    ) -> Optional[Dict]:
        try:
            async with session.get(
                url,
                params=params,
                headers=self._headers(),
                timeout=aiohttp.ClientTimeout(total=timeout),
            ) as resp:
                if resp.status != 200:
                    return None
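                # Some APIs serve JSON with a non-JSON content type, so skip
                # aiohttp's content-type check; if parsing still fails, hand
                # back the raw body instead of erroring out.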
                try:
                    return await resp.json(content_type=None)
                except Exception:
                    text = await resp.text()
                    return {"_raw": text}
        except Exception as e:
            logger.warning(f"GET {url} failed: {e}")
            return None

    def _scrape_headers(self) -> Dict[str, str]:
        ua = random.choice(self.user_agents)
        headers = {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "DNT": "1",
        }
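        # Client-hint and Sec-Fetch headers are only added for Chrome UAs;
        # Edge is excluded explicitly because its UA string also contains "Chrome".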
if "Chrome" in ua and "Edg" not in ua:
|
||||||
|
headers["Sec-Ch-Ua"] = '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"'
|
||||||
|
headers["Sec-Ch-Ua-Mobile"] = "?0"
|
||||||
|
headers["Sec-Ch-Ua-Platform"] = '"Linux"' if "Linux" in ua else '"Windows"'
|
||||||
|
headers["Sec-Fetch-Dest"] = "document"
|
||||||
|
headers["Sec-Fetch-Mode"] = "navigate"
|
||||||
|
headers["Sec-Fetch-Site"] = "none"
|
||||||
|
headers["Sec-Fetch-User"] = "?1"
|
||||||
|
return headers
|
||||||
|
|
||||||
|
async def _safe_get_html(
|
||||||
|
self,
|
||||||
|
session: aiohttp.ClientSession,
|
||||||
|
url: str,
|
||||||
|
params: Optional[Dict] = None,
|
||||||
|
timeout: int = 15,
|
||||||
|
) -> Optional[str]:
|
||||||
|
try:
|
||||||
|
async with session.get(
|
||||||
|
url,
|
||||||
|
params=params,
|
||||||
|
headers=self._scrape_headers(),
|
||||||
|
timeout=aiohttp.ClientTimeout(total=timeout),
|
||||||
|
allow_redirects=True,
|
||||||
|
) as resp:
|
||||||
|
if resp.status != 200:
|
||||||
|
return None
|
||||||
|
return await resp.text()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"HTML GET {url} failed: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _clean_html(self, html: str) -> str:
|
||||||
|
if not html:
|
||||||
|
return ""
|
||||||
|
text = re.sub(r'<[^>]+>', ' ', html)
|
||||||
|
text = re.sub(r'\s+', ' ', text)
|
||||||
|
return unescape(text).strip()
|
||||||
|
|
||||||
|
async def duckduckgo_search(self, query: str, count: int) -> Optional[Dict]:
|
||||||
|
url = "https://api.duckduckgo.com/"
|
||||||
|
params = {
|
||||||
|
"q": query,
|
||||||
|
"format": "json",
|
||||||
|
"no_html": "1",
|
||||||
|
"skip_disambig": "1",
|
||||||
|
}
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
data = await self._safe_get(session, url, params, timeout=10)
|
||||||
|
if not data or not isinstance(data, dict):
|
||||||
|
return None
|
||||||
|
|
||||||
|
results = []
|
||||||
|
|
||||||
|
if data.get("AbstractText"):
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
"title": data.get("Heading", "Instant Answer"),
|
||||||
|
"url": data.get("AbstractURL", ""),
|
||||||
|
"description": data.get("AbstractText", "")[:500],
|
||||||
|
"source": "duckduckgo_instant",
|
||||||
|
"extra": {
|
||||||
|
"image": data.get("Image", ""),
|
||||||
|
"abstract_source": data.get("AbstractSource", ""),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
for item in data.get("Results", [])[: max(0, count - len(results))]:
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
"title": item.get("Text", "")[:200],
|
||||||
|
"url": item.get("FirstURL", ""),
|
||||||
|
"description": item.get("Text", "")[:500],
|
||||||
|
"source": "duckduckgo",
|
||||||
|
"extra": {
|
||||||
|
"icon": item.get("Icon", ""),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
if not results:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return self._wrap(query, "duckduckgo", results[:count])
|
||||||
|
|
||||||
|
async def wikipedia_search(self, query: str, count: int) -> Optional[Dict]:
|
||||||
|
url = "https://en.wikipedia.org/w/api.php"
|
||||||
|
params = {
|
||||||
|
"action": "query",
|
||||||
|
"format": "json",
|
||||||
|
"list": "search",
|
||||||
|
"srsearch": query,
|
||||||
|
"srlimit": min(count, 50),
|
||||||
|
}
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
data = await self._safe_get(session, url, params, timeout=10)
|
||||||
|
if not data or not isinstance(data, dict):
|
||||||
|
return None
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for item in data.get("query", {}).get("search", [])[:count]:
|
||||||
|
snippet = item.get("snippet", "")
|
||||||
|
snippet = snippet.replace("<span class='searchmatch'>", "").replace(
|
||||||
|
"</span>", ""
|
||||||
|
)
|
||||||
|
snippet = unescape(snippet)
|
||||||
|
title = item.get("title", "")
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
"title": title,
|
||||||
|
"url": "https://en.wikipedia.org/wiki/" + title.replace(" ", "_"),
|
||||||
|
"description": snippet[:500],
|
||||||
|
"source": "wikipedia",
|
||||||
|
"extra": {
|
||||||
|
"pageid": item.get("pageid"),
|
||||||
|
"size": item.get("size"),
|
||||||
|
"wordcount": item.get("wordcount"),
|
||||||
|
"timestamp": item.get("timestamp"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
if not results:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return self._wrap(query, "wikipedia", results)
|
||||||
|
|
||||||
|
async def wikidata_search(self, query: str, count: int) -> Optional[Dict]:
|
||||||
|
url = "https://www.wikidata.org/w/api.php"
|
||||||
|
params = {
|
||||||
|
"action": "wbsearchentities",
|
||||||
|
"search": query,
|
||||||
|
"format": "json",
|
||||||
|
"language": "en",
|
||||||
|
"limit": min(count, 50),
|
||||||
|
}
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
data = await self._safe_get(session, url, params, timeout=10)
|
||||||
|
if not data or not isinstance(data, dict):
|
||||||
|
return None
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for item in data.get("search", [])[:count]:
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
"title": item.get("label", ""),
|
||||||
|
"url": item.get("url", ""),
|
||||||
|
"description": (item.get("description") or "")[:500],
|
||||||
|
"source": "wikidata",
|
||||||
|
"extra": {
|
||||||
|
"id": item.get("id"),
|
||||||
|
"aliases": (item.get("aliases") or [])[:5],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
if not results:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return self._wrap(query, "wikidata", results)
|
||||||
|
|
||||||
|
async def jina_search(self, query: str, count: int) -> Optional[Dict]:
|
||||||
|
# This may change; if JSON not available, this will just return None
|
||||||
|
url = f"https://s.jina.ai/{query}"
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
try:
|
||||||
|
async with session.get(
|
||||||
|
url,
|
||||||
|
headers={"Accept": "application/json", **self._headers()},
|
||||||
|
timeout=aiohttp.ClientTimeout(total=15),
|
||||||
|
) as resp:
|
||||||
|
if resp.status != 200:
|
||||||
|
return None
|
||||||
|
data = await resp.json()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Jina search failed: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
if not isinstance(data, dict):
|
||||||
|
return None
|
||||||
|
|
||||||
|
raw_list = data.get("data") or data.get("results") or []
|
||||||
|
if not isinstance(raw_list, list):
|
||||||
|
return None
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for item in raw_list[:count]:
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
"title": (item.get("title") or "")[:200],
|
||||||
|
"url": item.get("url", ""),
|
||||||
|
"description": (item.get("description") or "")[:500],
|
||||||
|
"source": "jina",
|
||||||
|
"extra": {},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
if not results:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return self._wrap(query, "jina", results)
|
||||||
|
|
||||||
|
async def brave_search(self, query: str, count: int) -> Optional[Dict]:
|
||||||
|
url = "https://search.brave.com/search"
|
||||||
|
params = {"q": query, "source": "web"}
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
html = await self._safe_get_html(session, url, params)
|
||||||
|
if not html:
|
||||||
|
return None
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
results = []
|
||||||
|
for snippet in soup.select('div[data-type="web"][data-pos]')[:count]:
|
||||||
|
link = snippet.select_one('a[href^="http"]')
|
||||||
|
title_div = snippet.select_one('div[class*="title"]')
|
||||||
|
desc_div = snippet.select_one('div.generic-snippet div.content')
|
||||||
|
if not desc_div:
|
||||||
|
desc_div = snippet.select_one('div[class*="snippet-description"]')
|
||||||
|
if link:
|
||||||
|
title = ""
|
||||||
|
if title_div:
|
||||||
|
title = title_div.get("title") or title_div.get_text(strip=True)
|
||||||
|
desc = ""
|
||||||
|
if desc_div:
|
||||||
|
desc = desc_div.get_text(strip=True)
|
||||||
|
results.append({
|
||||||
|
"title": title[:200],
|
||||||
|
"url": link.get("href", ""),
|
||||||
|
"description": desc[:500],
|
||||||
|
"source": "brave",
|
||||||
|
"extra": {},
|
||||||
|
})
|
||||||
|
if not results:
|
||||||
|
return None
|
||||||
|
return self._wrap(query, "brave", results)
|
||||||
|
|
||||||
|
async def duckduckgo_html_search(self, query: str, count: int) -> Optional[Dict]:
|
||||||
|
url = "https://html.duckduckgo.com/html/"
|
||||||
|
params = {"q": query}
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
html = await self._safe_get_html(session, url, params)
|
||||||
|
if not html:
|
||||||
|
return None
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
results = []
|
||||||
|
for result in soup.select('div.result.results_links')[:count * 2]:
|
||||||
|
link = result.select_one('a.result__a')
|
||||||
|
snippet = result.select_one('a.result__snippet')
|
||||||
|
if not link:
|
||||||
|
continue
|
||||||
|
result_url = link.get("href", "")
|
||||||
|
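            # DuckDuckGo's HTML frontend wraps external links in a redirect of
            # the form /l/?uddg=<url-encoded target>; unwrap it to get the real URL.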
if "uddg=" in result_url:
|
||||||
|
parsed = urlparse(result_url)
|
||||||
|
qs = parse_qs(parsed.query)
|
||||||
|
if "uddg" in qs:
|
||||||
|
result_url = unquote(qs["uddg"][0])
|
||||||
|
elif result_url.startswith("//"):
|
||||||
|
result_url = "https:" + result_url
|
||||||
|
if result_url.startswith("/") or "duckduckgo.com" in result_url:
|
||||||
|
continue
|
||||||
|
results.append({
|
||||||
|
"title": link.get_text(strip=True)[:200],
|
||||||
|
"url": result_url,
|
||||||
|
"description": (snippet.get_text(strip=True) if snippet else "")[:500],
|
||||||
|
"source": "duckduckgo_html",
|
||||||
|
"extra": {},
|
||||||
|
})
|
||||||
|
if len(results) >= count:
|
||||||
|
break
|
||||||
|
if not results:
|
||||||
|
return None
|
||||||
|
return self._wrap(query, "duckduckgo_html", results)
|
||||||
|
|
||||||
|
async def bing_search(self, query: str, count: int) -> Optional[Dict]:
|
||||||
|
url = "https://www.bing.com/search"
|
||||||
|
params = {"q": query, "count": min(count, 50)}
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
html = await self._safe_get_html(session, url, params)
|
||||||
|
if not html:
|
||||||
|
return None
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
results = []
|
||||||
|
for item in soup.select('li.b_algo')[:count]:
|
||||||
|
link = item.select_one('h2 a')
|
||||||
|
desc = item.select_one('p')
|
||||||
|
if link:
|
||||||
|
results.append({
|
||||||
|
"title": link.get_text(strip=True)[:200],
|
||||||
|
"url": link.get("href", ""),
|
||||||
|
"description": (desc.get_text(strip=True) if desc else "")[:500],
|
||||||
|
"source": "bing",
|
||||||
|
"extra": {},
|
||||||
|
})
|
||||||
|
if not results:
|
||||||
|
return None
|
||||||
|
return self._wrap(query, "bing", results)
|
||||||
|
|
||||||
|
async def mojeek_search(self, query: str, count: int) -> Optional[Dict]:
|
||||||
|
url = "https://www.mojeek.com/search"
|
||||||
|
params = {"q": query}
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
html = await self._safe_get_html(session, url, params)
|
||||||
|
if not html:
|
||||||
|
return None
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
results = []
|
||||||
|
for item in soup.select('ul.results-standard li')[:count]:
|
||||||
|
link = item.select_one('a.ob')
|
||||||
|
title_el = item.select_one('a.title')
|
||||||
|
desc = item.select_one('p.s')
|
||||||
|
if link:
|
||||||
|
title = title_el.get_text(strip=True) if title_el else link.get_text(strip=True)
|
||||||
|
results.append({
|
||||||
|
"title": title[:200],
|
||||||
|
"url": link.get("href", ""),
|
||||||
|
"description": (desc.get_text(strip=True) if desc else "")[:500],
|
||||||
|
"source": "mojeek",
|
||||||
|
"extra": {},
|
||||||
|
})
|
||||||
|
if not results:
|
||||||
|
return None
|
||||||
|
return self._wrap(query, "mojeek", results)
|
||||||
|
|
||||||
|
def _wrap(self, query: str, service: str, results: List[Dict]) -> Dict:
|
||||||
|
return {
|
||||||
|
"query": query,
|
||||||
|
"source": service,
|
||||||
|
"count": len(results),
|
||||||
|
"results": results,
|
||||||
|
"timestamp": datetime.utcnow().isoformat() + "Z",
|
||||||
|
"success": True,
|
||||||
|
"error": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
async def search(self, query: str, count: int) -> Dict:
|
||||||
|
query = (query or "").strip()
|
||||||
|
if not query:
|
||||||
|
return {
|
||||||
|
"query": "",
|
||||||
|
"source": "none",
|
||||||
|
"count": 0,
|
||||||
|
"results": [],
|
||||||
|
"timestamp": datetime.utcnow().isoformat() + "Z",
|
||||||
|
"success": False,
|
||||||
|
"error": "Empty query",
|
||||||
|
}
|
||||||
|
|
||||||
|
count = max(1, min(int(count), 100))
|
||||||
|
services = self.services
|
||||||
|
|
||||||
|
logger.info(f"search '{query}' count={count} services={len(services)}")
|
||||||
|
|
||||||
|
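        # Fallback strategy: providers are tried strictly in the fixed quality
        # order declared in __init__; the first one that returns a successful,
        # non-empty result short-circuits the loop and is returned as-is.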
        for fn in services:
            name = fn.__name__
            try:
                result = await asyncio.wait_for(fn(query, count), timeout=20)
            except asyncio.TimeoutError:
                logger.warning(f"{name} timed out")
                continue
            except Exception as e:
                logger.warning(f"{name} failed: {e}")
                continue

            if result and result.get("success") and result.get("count", 0) > 0:
                logger.info(f"using {result.get('source')} for '{query}'")
                return result

        return {
            "query": query,
            "source": "none",
            "count": 0,
            "results": [],
            "timestamp": datetime.utcnow().isoformat() + "Z",
            "success": False,
            "error": "All services failed",
        }


searcher = MultiSearch()


async def handle_search(request: web.Request) -> web.Response:
    q = request.query.get("query", "")
    count_raw = request.query.get("count", "10")
    try:
        count = int(count_raw)
    except ValueError:
        count = 10
    result = await searcher.search(q, count)
    status = 200 if result.get("success") else 400
    return web.json_response(result, status=status)


async def handle_health(request: web.Request) -> web.Response:
    return web.json_response(
        {
            "status": "ok",
            "services": [
                "brave",
                "duckduckgo_html",
                "bing",
                "mojeek",
                "duckduckgo",
                "wikipedia",
                "wikidata",
            ],
            "timestamp": datetime.utcnow().isoformat() + "Z",
        }
    )


def create_app() -> web.Application:
    app = web.Application()
    app.router.add_get("/search", handle_search)
    app.router.add_get("/health", handle_health)
    return app


def parse_args(args=None):
    parser = argparse.ArgumentParser(
        prog="rsearch",
        description="Multi-source search aggregator API"
    )
    parser.add_argument(
        "-H", "--host",
        default="0.0.0.0",
        help="Host to bind to (default: 0.0.0.0)"
    )
    parser.add_argument(
        "-p", "--port",
        type=int,
        default=8080,
        help="Port to listen on (default: 8080)"
    )
    parser.add_argument(
        "-l", "--log-level",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        default="INFO",
        help="Log level (default: INFO)"
    )
    parser.add_argument(
        "-v", "--version",
        action="version",
version="%(prog)s 1.0.0"
|
||||||
|
    )
    return parser.parse_args(args)


def main(args=None):
    opts = parse_args(args)
    logging.basicConfig(
        level=getattr(logging, opts.log_level),
        format="%(asctime)s %(levelname)s %(name)s: %(message)s"
    )
    app = create_app()
    logger.info(f"Starting server on {opts.host}:{opts.port}")
    web.run_app(app, host=opts.host, port=opts.port, print=None)


if __name__ == "__main__":
    main()

1 tests/__init__.py Normal file
@@ -0,0 +1 @@
# retoor <retoor@molodetz.nl>

167 tests/test_providers.py Normal file
@@ -0,0 +1,167 @@
# retoor <retoor@molodetz.nl>

import asyncio
import pytest
from rsearch.app import MultiSearch


@pytest.fixture
def searcher():
    return MultiSearch()


@pytest.fixture
def query():
    return "python programming"


@pytest.fixture
def count():
    return 3


class TestProviders:

    @pytest.mark.asyncio
    async def test_brave_search(self, searcher, query, count):
        result = await asyncio.wait_for(searcher.brave_search(query, count), timeout=20)
        assert result is not None
        assert result.get("success") is True
        assert result.get("count", 0) > 0
        assert result.get("source") == "brave"
        assert len(result.get("results", [])) > 0
        first = result["results"][0]
        assert "title" in first
        assert "url" in first
        assert "description" in first

    @pytest.mark.asyncio
    async def test_duckduckgo_html_search(self, searcher, query, count):
        result = await asyncio.wait_for(searcher.duckduckgo_html_search(query, count), timeout=20)
        assert result is not None
        assert result.get("success") is True
        assert result.get("count", 0) > 0
        assert result.get("source") == "duckduckgo_html"
        assert len(result.get("results", [])) > 0
        first = result["results"][0]
        assert "title" in first
        assert "url" in first
        assert first["url"].startswith("http")

    @pytest.mark.asyncio
    async def test_bing_search(self, searcher, query, count):
        result = await asyncio.wait_for(searcher.bing_search(query, count), timeout=20)
        assert result is not None
        assert result.get("success") is True
        assert result.get("count", 0) > 0
        assert result.get("source") == "bing"
        assert len(result.get("results", [])) > 0

    @pytest.mark.asyncio
    async def test_mojeek_search(self, searcher, query, count):
        result = await asyncio.wait_for(searcher.mojeek_search(query, count), timeout=20)
        assert result is not None
        assert result.get("success") is True
        assert result.get("count", 0) > 0
        assert result.get("source") == "mojeek"
        assert len(result.get("results", [])) > 0
        first = result["results"][0]
        assert not first["title"].startswith("http"), "Title should not be a URL"

    @pytest.mark.asyncio
    async def test_duckduckgo_api_search(self, searcher, query, count):
        result = await asyncio.wait_for(searcher.duckduckgo_search(query, count), timeout=20)
        if result is not None:
            assert result.get("source") == "duckduckgo"
            if result.get("count", 0) > 0:
                assert result.get("success") is True

    @pytest.mark.asyncio
    async def test_wikipedia_search(self, searcher, query, count):
        result = await asyncio.wait_for(searcher.wikipedia_search(query, count), timeout=20)
        assert result is not None
        assert result.get("success") is True
        assert result.get("count", 0) > 0
        assert result.get("source") == "wikipedia"
        assert len(result.get("results", [])) > 0
        first = result["results"][0]
        assert "wikipedia.org" in first["url"]

    @pytest.mark.asyncio
    async def test_wikidata_search(self, searcher, query, count):
        result = await asyncio.wait_for(searcher.wikidata_search(query, count), timeout=20)
        assert result is not None
        assert result.get("success") is True
        assert result.get("count", 0) > 0
        assert result.get("source") == "wikidata"
        assert len(result.get("results", [])) > 0


class TestSearchAggregator:

    @pytest.mark.asyncio
    async def test_search_returns_results(self, searcher, query, count):
        result = await searcher.search(query, count)
        assert result is not None
        assert result.get("success") is True
        assert result.get("count", 0) > 0
        assert result.get("query") == query
        assert "timestamp" in result
        assert "results" in result

    @pytest.mark.asyncio
    async def test_search_empty_query(self, searcher, count):
        result = await searcher.search("", count)
        assert result is not None
        assert result.get("success") is False
        assert result.get("error") == "Empty query"

    @pytest.mark.asyncio
    async def test_search_count_limit(self, searcher, query):
        result = await searcher.search(query, 5)
        assert result is not None
        if result.get("success"):
            assert result.get("count", 0) <= 5

    @pytest.mark.asyncio
    async def test_result_format(self, searcher, query, count):
        result = await searcher.search(query, count)
        assert "query" in result
        assert "source" in result
        assert "count" in result
        assert "results" in result
        assert "timestamp" in result
        assert "success" in result
        assert "error" in result


class TestAllProviders:

    @pytest.mark.asyncio
    async def test_all_providers_return_valid_format(self, searcher, query, count):
        providers = [
            ("brave", searcher.brave_search),
            ("duckduckgo_html", searcher.duckduckgo_html_search),
            ("bing", searcher.bing_search),
            ("mojeek", searcher.mojeek_search),
            ("duckduckgo", searcher.duckduckgo_search),
            ("wikipedia", searcher.wikipedia_search),
            ("wikidata", searcher.wikidata_search),
        ]

        for name, fn in providers:
            try:
                result = await asyncio.wait_for(fn(query, count), timeout=20)
                if result is not None:
                    assert "query" in result, f"{name}: missing query"
                    assert "source" in result, f"{name}: missing source"
                    assert "count" in result, f"{name}: missing count"
                    assert "results" in result, f"{name}: missing results"
                    assert "timestamp" in result, f"{name}: missing timestamp"
                    assert "success" in result, f"{name}: missing success"
            except asyncio.TimeoutError:
                pytest.skip(f"{name} timed out")


if __name__ == "__main__":
    pytest.main([__file__, "-v"])