From 30c3821d98d85f7808ea89e6b0d79e984181d890 Mon Sep 17 00:00:00 2001
From: retoor
Date: Mon, 6 Oct 2025 07:48:53 +0200
Subject: [PATCH] Update.

---
 app.py                        |  85 ++++++++++++++++++---
 routers.py                    | 140 +++++++++++++++++++++++++++++++---
 templates/newspaper_view.html |   2 +-
 3 files changed, 205 insertions(+), 22 deletions(-)

diff --git a/app.py b/app.py
index ef81a4e..d9e0670 100644
--- a/app.py
+++ b/app.py
@@ -4,11 +4,17 @@ from fastapi.templating import Jinja2Templates
 import dataset
 import asyncio
 from datetime import datetime
+import chromadb
+from chromadb.config import Settings
 
 app = FastAPI(title="RSS Feed Manager")
 
-# Database setup
+# Database and ChromaDB setup (accessible by background tasks)
 db = dataset.connect('sqlite:///feeds.db')
+chroma_client = chromadb.Client(
+    Settings(is_persistent=True, persist_directory="chroma_db")
+)
+chroma_collection = chroma_client.get_or_create_collection(name="articles")
 
 # Templates setup
 templates = Jinja2Templates(directory="templates")
@@ -18,20 +24,81 @@ from routers import router as manage_router, run_sync_task
 
 app.include_router(manage_router)
 
-@app.on_event("startup")
-async def startup_event():
-    # Ensure feeds table exists
-    feeds_table = db['feeds']
-    # Start background sync task
-    asyncio.create_task(hourly_sync_task())
-
 async def hourly_sync_task():
+    """Periodically fetches new articles from RSS feeds."""
+    await asyncio.sleep(15)
     while True:
-        await asyncio.sleep(3600)  # Wait 1 hour
+        print("Hourly Sync: Starting feed synchronization.")
         try:
             await run_sync_task()
+            print("Hourly Sync: Feed synchronization finished.")
         except Exception as e:
             print(f"Error in hourly sync: {e}")
+        await asyncio.sleep(3600)  # Wait 1 hour
+
+async def chroma_sync_task():
+    """
+    A continuous background service that syncs the latest 100 articles,
+    checking against ChromaDB to avoid duplicates.
+    """
+    print("Chroma Sync Service: Task started.")
+    articles_table = db['articles']
+
+    while True:
+        try:
+            print("Chroma Sync Service: Checking latest 100 articles from the database...")
+            # 1. Fetch the 100 most recent articles from SQLite
+            latest_articles = list(articles_table.find(order_by='-id', _limit=100))
+
+            if not latest_articles:
+                print("Chroma Sync Service: No articles in the database yet. Waiting...")
+                await asyncio.sleep(10)
+                continue
+
+            # 2. Get the IDs to check against ChromaDB
+            guids_to_check = [article['guid'] for article in latest_articles]
+
+            # 3. Check which articles already exist in ChromaDB
+            existing_chroma_docs = chroma_collection.get(ids=guids_to_check)
+            existing_guids = set(existing_chroma_docs['ids'])
+
+            # 4. Filter out the articles that are already synced
+            articles_to_index = [article for article in latest_articles if article['guid'] not in existing_guids]
+
+            if articles_to_index:
+                print(f"Chroma Sync Service: Found {len(articles_to_index)} new articles to index.")
+                documents, metadatas, ids = [], [], []
+
+                for article in articles_to_index:
+                    doc_content = f"{article.get('title', '')}\n{article.get('description', '')}\n{article.get('content', '')}"
+                    metadata = {key: str(value) for key, value in article.items()}
+
+                    documents.append(doc_content)
+                    metadatas.append(metadata)
+                    ids.append(article['guid'])
+
+                # 5. Index the new batch to ChromaDB
+                chroma_collection.upsert(ids=ids, documents=documents, metadatas=metadatas)
+                print(f"Chroma Sync Service: Successfully indexed {len(articles_to_index)} articles.")
+
+            else:
+                print("Chroma Sync Service: Latest 100 articles are already synced.")
+
+            await asyncio.sleep(10)  # Wait 10 seconds before the next check.
+
+        except Exception as e:
+            print(f"Error in Chroma sync service: {e}")
+            await asyncio.sleep(30)  # Wait longer after an error
+
+@app.on_event("startup")
+async def startup_event():
+    # Ensure tables exist
+    db['feeds']
+    db['articles']
+    # Start background tasks
+    print("Application startup: Initializing background tasks.")
+    asyncio.create_task(hourly_sync_task())
+    asyncio.create_task(chroma_sync_task())
 
 if __name__ == "__main__":
     import uvicorn
diff --git a/routers.py b/routers.py
index 8c5b817..e1d1c73 100644
--- a/routers.py
+++ b/routers.py
@@ -1,5 +1,5 @@
-from fastapi import APIRouter, Request, UploadFile, File, Form, WebSocket, WebSocketDisconnect
-from fastapi.responses import HTMLResponse, RedirectResponse
+from fastapi import APIRouter, Request, UploadFile, File, Form, WebSocket, WebSocketDisconnect, Query
+from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
 from fastapi.templating import Jinja2Templates
 import dataset
 import json
@@ -9,11 +9,18 @@ import asyncio
 from datetime import datetime
 import time
 import trafilatura
+import chromadb
+from chromadb.config import Settings
 
 router = APIRouter()
 templates = Jinja2Templates(directory="templates")
 db = dataset.connect('sqlite:///feeds.db')
+
+# ChromaDB setup
+# This creates a persistent database in the 'chroma_db' directory
+chroma_client = chromadb.PersistentClient(path="chroma_db")
+chroma_collection = chroma_client.get_or_create_collection(name="articles")
 
 # Browser-like headers
 HEADERS = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
@@ -103,6 +110,15 @@ async def perform_sync():
                 total_articles_added += 1
 
                 articles_table.upsert(article_data, ['guid'])
+
+                # Index the article to ChromaDB
+                doc_content = f"{article_data.get('title', '')}\n{article_data.get('description', '')}\n{article_data.get('content', '')}"
+                metadata = {key: str(value) for key, value in article_data.items()}
+                chroma_collection.upsert(
+                    documents=[doc_content],
+                    metadatas=[metadata],
+                    ids=[article_data['guid']]
+                )
 
             feeds_table.update({
                 'id': [f for f in feeds if f['url'] == feed_url][0]['id'],
@@ -346,6 +362,15 @@ async def websocket_sync(websocket: WebSocket):
                     articles_count += 1
 
                     articles_table.upsert(article_data, ['guid'])
+
+                    # Index the article to ChromaDB
+                    doc_content = f"{article_data.get('title', '')}\n{article_data.get('description', '')}"
+                    metadata = {key: str(value) for key, value in article_data.items() if key != 'content'}  # Exclude large content from metadata
+                    chroma_collection.upsert(
+                        documents=[doc_content],
+                        metadatas=[metadata],
+                        ids=[article_data['guid']]
+                    )
 
             total_articles_added += articles_count
 
@@ -398,6 +423,92 @@ async def websocket_sync(websocket: WebSocket):
             "message": str(e)
         })
 
+## --- API Endpoints ---
+
+@router.post("/api/sync-to-chroma", tags=["API"], status_code=200)
+async def sync_all_articles_to_chroma():
+    """
+    Manually synchronizes all articles from the SQLite database to the ChromaDB vector store.
+    This is useful for initializing the search index with existing data.
+    """
+    articles_table = db['articles']
+    all_articles = list(articles_table.all())
+
+    if not all_articles:
+        return JSONResponse(content={"status": "noop", "message": "No articles in the database to sync."})
+
+    documents, metadatas, ids = [], [], []
+
+    for article in all_articles:
+        # The document is what ChromaDB will search against; title, description and content combined work well.
+        doc_content = f"{article.get('title', '')}\n{article.get('description', '')}\n{article.get('content', '')}"
+
+        # Metadata must have values of type str, int, float, or bool.
+        metadata = {key: str(value) for key, value in article.items()}
+
+        documents.append(doc_content)
+        metadatas.append(metadata)
+        ids.append(article['guid'])
+
+    # Upsert in batches to be memory-efficient
+    batch_size = 100
+    for i in range(0, len(ids), batch_size):
+        chroma_collection.upsert(
+            ids=ids[i:i+batch_size],
+            documents=documents[i:i+batch_size],
+            metadatas=metadatas[i:i+batch_size]
+        )
+
+    return JSONResponse(content={
+        "status": "success",
+        "message": f"Successfully indexed {len(all_articles)} articles to ChromaDB."
+    })
+
+@router.get("/api/search", tags=["API"])
+async def search_articles(
+    q: str = Query(None, description="The search term to query for."),
+    limit: int = Query(20, ge=1, le=100, description="The maximum number of results to return."),
+    page: int = Query(1, ge=1, description="The page number for paginated results (used when 'q' is not provided).")
+):
+    """
+    Searches for articles within the ChromaDB vector store.
+
+    - If **q** is provided, performs a similarity search based on the query text.
+    - If **q** is not provided, returns a paginated list of all articles, sorted by insertion order.
+    """
+    if q:
+        # Perform a similarity search
+        results = chroma_collection.query(
+            query_texts=[q],
+            n_results=limit,
+            include=['metadatas', 'distances']
+        )
+
+        # Format results into a cleaner list of objects
+        formatted_results = []
+        if results and results.get('ids', [[]])[0]:
+            for i, doc_id in enumerate(results['ids'][0]):
+                res = results['metadatas'][0][i]
+                res['distance'] = results['distances'][0][i]
+                formatted_results.append(res)
+
+        return JSONResponse(content={"results": formatted_results})
+
+    else:
+        # Return a paginated list of articles
+        page_limit = 20
+        offset = (page - 1) * page_limit
+
+        results = chroma_collection.get(
+            limit=page_limit,
+            offset=offset,
+            include=['metadatas']
+        )
+
+        return JSONResponse(content={"results": results['metadatas']})
+
+## --- HTML Page Routes ---
+
 @router.get("/newspapers", response_class=HTMLResponse)
 async def newspapers_list(request: Request):
     newspapers_table = db['newspapers']
@@ -441,17 +552,22 @@ async def newspaper_latest(request: Request):
     newspapers_table = db['newspapers']
     newspaper = None
     try:
-        newspaper = list(db.query("select * from newspapers order by id desc limit 1"))[0]
+        newspapers = list(db.query("select * from newspapers order by id desc limit 10"))
     except IndexError:
         pass
 
-    if not newspaper:
-        return RedirectResponse(url="/newspapers")
-
-    articles = json.loads(newspaper['articles_json'])
-
-    return templates.TemplateResponse("newspaper_view.html", {
-        "request": request,
-        "newspaper": newspaper,
-        "articles": articles
-    })
+    for newspaper in newspapers:
+        articles = json.loads(newspaper['articles_json'])
+        if articles:
+            for article in articles:
+                for key, value in article.items():
+                    article[key] = str(value).strip()
+            return templates.TemplateResponse("newspaper_view.html", {
+                "request": request,
+                "newspaper": newspaper,
+                "articles": articles
+            })
+
+    return RedirectResponse(url="/newspapers")
diff --git a/templates/newspaper_view.html b/templates/newspaper_view.html
index 485aa32..f0d7c28 100644
--- a/templates/newspaper_view.html
+++ b/templates/newspaper_view.html
@@ -191,7 +191,7 @@
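
Once the patch is applied, the new endpoints can be smoke-tested from any HTTP client. A minimal sketch, assuming the app is served by uvicorn at http://localhost:8000, that the requests package is installed, and using "climate policy" as a placeholder query:

    # Hypothetical smoke test for the endpoints added in this patch.
    import requests

    BASE = "http://localhost:8000"

    # One-time backfill of the existing SQLite articles into the Chroma index.
    print(requests.post(f"{BASE}/api/sync-to-chroma").json())

    # Similarity search; each result is the article metadata plus a 'distance' score.
    hits = requests.get(f"{BASE}/api/search", params={"q": "climate policy", "limit": 5}).json()
    for hit in hits["results"]:
        print(hit.get("title"), "-", hit.get("distance"))

    # Without 'q': a paginated listing pulled straight from the collection.
    page2 = requests.get(f"{BASE}/api/search", params={"page": 2}).json()
    print(len(page2["results"]), "articles on page 2")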
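
To check that the background chroma_sync_task is actually writing to disk, the persistent collection can also be opened directly with the chromadb client. A sketch that only assumes the 'chroma_db' directory and 'articles' collection names used in this patch:

    # Inspect the persistent index outside the app (run from the project directory).
    import chromadb

    client = chromadb.PersistentClient(path="chroma_db")
    collection = client.get_or_create_collection(name="articles")

    print("indexed articles:", collection.count())

    # Spot-check a few entries; the ids are the article GUIDs upserted by the sync tasks.
    peek = collection.get(limit=3, include=["metadatas"])
    for guid, meta in zip(peek["ids"], peek["metadatas"]):
        print(guid, "->", meta.get("title"))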