retoor 2025-10-06 07:48:53 +02:00
parent 018b4e431a
commit 30c3821d98
3 changed files with 205 additions and 22 deletions

app.py

@@ -4,11 +4,17 @@ from fastapi.templating import Jinja2Templates
import dataset
import asyncio
from datetime import datetime
import chromadb
from chromadb.config import Settings
app = FastAPI(title="RSS Feed Manager")
# Database setup
# Database and ChromaDB setup (accessible by background tasks)
db = dataset.connect('sqlite:///feeds.db')
chroma_client = chromadb.Client(
Settings(is_persistent=True, persist_directory="chroma_db")
)
chroma_collection = chroma_client.get_or_create_collection(name="articles")
# Templates setup
templates = Jinja2Templates(directory="templates")
@@ -18,20 +24,81 @@ from routers import router as manage_router, run_sync_task
app.include_router(manage_router)
@app.on_event("startup")
async def startup_event():
# Ensure feeds table exists
feeds_table = db['feeds']
# Start background sync task
asyncio.create_task(hourly_sync_task())
async def hourly_sync_task():
"""Periodically fetches new articles from RSS feeds."""
await asyncio.sleep(15)
while True:
await asyncio.sleep(3600) # Wait 1 hour
print("Hourly Sync: Starting feed synchronization.")
try:
await run_sync_task()
print("Hourly Sync: Feed synchronization finished.")
except Exception as e:
print(f"Error in hourly sync: {e}")
await asyncio.sleep(3600) # Wait 1 hour
async def chroma_sync_task():
"""
A continuous background service that syncs the latest 100 articles,
checking against ChromaDB to avoid duplicates.
"""
print("Chroma Sync Service: Task started.")
articles_table = db['articles']
while True:
try:
print("Chroma Sync Service: Checking latest 100 articles from the database...")
# 1. Fetch the 100 most recent articles from SQLite
latest_articles = list(articles_table.find(order_by='-id', _limit=100))
if not latest_articles:
print("Chroma Sync Service: No articles in the database yet. Waiting...")
await asyncio.sleep(10)
continue
# 2. Get the IDs to check against ChromaDB
guids_to_check = [article['guid'] for article in latest_articles]
# 3. Check which articles already exist in ChromaDB
existing_chroma_docs = chroma_collection.get(ids=guids_to_check)
existing_guids = set(existing_chroma_docs['ids'])
# 4. Filter out the articles that are already synced
articles_to_index = [article for article in latest_articles if article['guid'] not in existing_guids]
if articles_to_index:
print(f"Chroma Sync Service: Found {len(articles_to_index)} new articles to index.")
documents, metadatas, ids = [], [], []
for article in articles_to_index:
doc_content = f"{article.get('title', '')}\n{article.get('description', '')}\n{article.get('content', '')}"
metadata = {key: str(value) for key, value in article.items()}
documents.append(doc_content)
metadatas.append(metadata)
ids.append(article['guid'])
# 5. Index the new batch to ChromaDB
chroma_collection.upsert(ids=ids, documents=documents, metadatas=metadatas)
print(f"Chroma Sync Service: Successfully indexed {len(articles_to_index)} articles.")
else:
print("Chroma Sync Service: Latest 100 articles are already synced.")
await asyncio.sleep(10) # Wait 10 seconds before the next check.
except Exception as e:
print(f"Error in Chroma sync service: {e}")
await asyncio.sleep(30) # Wait longer after an error
@app.on_event("startup")
async def startup_event():
# Ensure tables exist
db['feeds']
db['articles']
# Start background tasks
print("Application startup: Initializing background tasks.")
asyncio.create_task(hourly_sync_task())
asyncio.create_task(chroma_sync_task())
if __name__ == "__main__":
import uvicorn

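Note: app.py opens the vector store via chromadb.Client(Settings(is_persistent=True, persist_directory="chroma_db")) while the router module uses chromadb.PersistentClient(path="chroma_db"); both appear to target the same on-disk store. A minimal sketch for sanity-checking the shared collection outside the app (assumptions: chromadb >= 0.4 is installed, the chroma_db directory already exists, and the GUIDs below are hypothetical):

import chromadb

# Open the same persistent store the app writes to.
client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_or_create_collection(name="articles")
print("articles indexed in ChromaDB:", collection.count())

# The dedupe pattern used by chroma_sync_task: look up candidate ids first,
# then upsert only the ones that are not indexed yet.
candidate_guids = ["example-guid-1", "example-guid-2"]  # hypothetical ids
existing = set(collection.get(ids=candidate_guids)["ids"])
new_guids = [g for g in candidate_guids if g not in existing]
print("would index:", new_guids)
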
View File

@@ -1,5 +1,5 @@
from fastapi import APIRouter, Request, UploadFile, File, Form, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi import APIRouter, Request, UploadFile, File, Form, WebSocket, WebSocketDisconnect, Query
from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
from fastapi.templating import Jinja2Templates
import dataset
import json
@@ -9,11 +9,18 @@ import asyncio
from datetime import datetime
import time
import trafilatura
import chromadb
from chromadb.config import Settings # Import Settings
router = APIRouter()
templates = Jinja2Templates(directory="templates")
db = dataset.connect('sqlite:///feeds.db')
# ChromaDB setup
# This creates a persistent database in the 'chroma_db' directory
chroma_client = chromadb.PersistentClient(path="chroma_db")
chroma_collection = chroma_client.get_or_create_collection(name="articles")
# Browser-like headers
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
@@ -104,6 +111,15 @@ async def perform_sync():
articles_table.upsert(article_data, ['guid'])
# Index the article to ChromaDB
doc_content = f"{article_data.get('title', '')}\n{article_data.get('description', '')}\n{article_data.get('content', '')}"
metadata = {key: str(value) for key, value in article_data.items()}
chroma_collection.upsert(
documents=[doc_content],
metadatas=[metadata],
ids=[article_data['guid']]
)
feeds_table.update({
'id': [f for f in feeds if f['url'] == feed_url][0]['id'],
'last_synced': datetime.now().isoformat()
@@ -347,6 +363,15 @@ async def websocket_sync(websocket: WebSocket):
articles_table.upsert(article_data, ['guid'])
# Index the article to ChromaDB
doc_content = f"{article_data.get('title', '')}\n{article_data.get('description', '')}"
metadata = {key: str(value) for key, value in article_data.items() if key != 'content'} # Exclude large content from metadata
chroma_collection.upsert(
documents=[doc_content],
metadatas=[metadata],
ids=[article_data['guid']]
)
total_articles_added += articles_count
feeds_table.update({
@@ -398,6 +423,92 @@ async def websocket_sync(websocket: WebSocket):
"message": str(e)
})
## --- API Endpoints ---
@router.post("/api/sync-to-chroma", tags=["API"], status_code=200)
async def sync_all_articles_to_chroma():
"""
Manually synchronizes all articles from the SQLite database to the ChromaDB vector store.
This is useful for initializing the search index with existing data.
"""
articles_table = db['articles']
all_articles = list(articles_table.all())
if not all_articles:
return JSONResponse(content={"status": "noop", "message": "No articles in the database to sync."})
documents, metadatas, ids = [], [], []
for article in all_articles:
# The document is what ChromaDB searches against; title, description, and content are combined.
doc_content = f"{article.get('title', '')}\n{article.get('description', '')}\n{article.get('content', '')}"
# Metadata must have values of type str, int, float, or bool.
metadata = {key: str(value) for key, value in article.items()}
documents.append(doc_content)
metadatas.append(metadata)
ids.append(article['guid'])
# Upsert in batches to keep individual ChromaDB requests small
batch_size = 100
for i in range(0, len(ids), batch_size):
chroma_collection.upsert(
ids=ids[i:i+batch_size],
documents=documents[i:i+batch_size],
metadatas=metadatas[i:i+batch_size]
)
return JSONResponse(content={
"status": "success",
"message": f"Successfully indexed {len(all_articles)} articles to ChromaDB."
})
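A hedged usage sketch for this manual backfill endpoint (assumptions not in this commit: the app is served at http://localhost:8000 and the requests package is installed):

import requests

# Trigger a one-off full sync of every SQLite article into ChromaDB.
resp = requests.post("http://localhost:8000/api/sync-to-chroma")
resp.raise_for_status()
print(resp.json())  # e.g. {"status": "success", "message": "Successfully indexed ... articles to ChromaDB."}
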
@router.get("/api/search", tags=["API"])
async def search_articles(
q: str = Query(None, description="The search term to query for."),
limit: int = Query(20, ge=1, le=100, description="The maximum number of results to return."),
page: int = Query(1, ge=1, description="The page number for paginated results (used when 'q' is not provided).")
):
"""
Searches for articles within the ChromaDB vector store.
- If **q** is provided, performs a similarity search based on the query text.
- If **q** is not provided, returns a paginated list of all articles, sorted by insertion order.
"""
if q:
# Perform a similarity search
results = chroma_collection.query(
query_texts=[q],
n_results=limit,
include=['metadatas', 'distances']
)
# Format results into a cleaner list of objects
formatted_results = []
if results and results.get('ids', [[]])[0]:
for i, doc_id in enumerate(results['ids'][0]):
res = results['metadatas'][0][i]
res['distance'] = results['distances'][0][i]
formatted_results.append(res)
return JSONResponse(content={"results": formatted_results})
else:
# Return a paginated list of articles
page_limit = limit  # honor the 'limit' query parameter as the page size
offset = (page - 1) * page_limit
results = chroma_collection.get(
limit=page_limit,
offset=offset,
include=['metadatas']
)
return JSONResponse(content={"results": results['metadatas']})
## --- HTML Page Routes ---
@router.get("/newspapers", response_class=HTMLResponse)
async def newspapers_list(request: Request):
newspapers_table = db['newspapers']
@@ -441,17 +552,22 @@ async def newspaper_latest(request: Request):
newspapers_table = db['newspapers']
newspaper = None
try:
newspaper = list(db.query("select * from newspapers order by id desc limit 1"))[0]
newspapers = list(db.query("select * from newspapers order by id desc limit 10"))
except IndexError:
pass
if not newspaper:
return RedirectResponse(url="/newspapers")
for newspaper in newspapers:
articles = json.loads(newspaper['articles_json'])
if articles:
for article in articles:
for key, value in article.items():
article[key] = str(value).strip()
return templates.TemplateResponse("newspaper_view.html", {
"request": request,
"newspaper": newspaper,
"articles": articles
})
return RedirectResponse(url="/newspapers")

View File

@@ -191,7 +191,7 @@
</div>
<div class="footer">
Generated from RSS Feed Manager | Synchronized at {{ newspaper.created_at[:19].replace('T', ' ') }}
Molodetz News - always up to date | Synchronized at {{ newspaper.created_at[:19].replace('T', ' ') }}
</div>
</div>
</body>