From 30c3821d98d85f7808ea89e6b0d79e984181d890 Mon Sep 17 00:00:00 2001
From: retoor
Date: Mon, 6 Oct 2025 07:48:53 +0200
Subject: [PATCH] Update.

---
 app.py                        |  85 ++++++++++++++++++---
 routers.py                    | 140 +++++++++++++++++++++++++++++++---
 templates/newspaper_view.html |   2 +-
 3 files changed, 205 insertions(+), 22 deletions(-)

diff --git a/app.py b/app.py
index ef81a4e..d9e0670 100644
--- a/app.py
+++ b/app.py
@@ -4,11 +4,17 @@ from fastapi.templating import Jinja2Templates
 import dataset
 import asyncio
 from datetime import datetime
+import chromadb
+from chromadb.config import Settings
 
 app = FastAPI(title="RSS Feed Manager")
 
-# Database setup
+# Database and ChromaDB setup (accessible by background tasks)
 db = dataset.connect('sqlite:///feeds.db')
+chroma_client = chromadb.Client(
+    Settings(is_persistent=True, persist_directory="chroma_db")
+)
+chroma_collection = chroma_client.get_or_create_collection(name="articles")
 
 # Templates setup
 templates = Jinja2Templates(directory="templates")
@@ -18,20 +24,81 @@ from routers import router as manage_router, run_sync_task
 
 app.include_router(manage_router)
 
-@app.on_event("startup")
-async def startup_event():
-    # Ensure feeds table exists
-    feeds_table = db['feeds']
-    # Start background sync task
-    asyncio.create_task(hourly_sync_task())
-
 async def hourly_sync_task():
+    """Periodically fetches new articles from RSS feeds."""
+    await asyncio.sleep(15)
     while True:
-        await asyncio.sleep(3600)  # Wait 1 hour
+        print("Hourly Sync: Starting feed synchronization.")
         try:
             await run_sync_task()
+            print("Hourly Sync: Feed synchronization finished.")
         except Exception as e:
             print(f"Error in hourly sync: {e}")
+        await asyncio.sleep(3600)  # Wait 1 hour
+
+async def chroma_sync_task():
+    """
+    A continuous background service that syncs the latest 100 articles,
+    checking against ChromaDB to avoid duplicates.
+    """
+    print("Chroma Sync Service: Task started.")
+    articles_table = db['articles']
+
+    while True:
+        try:
+            print("Chroma Sync Service: Checking latest 100 articles from the database...")
+            # 1. Fetch the 100 most recent articles from SQLite
+            latest_articles = list(articles_table.find(order_by='-id', _limit=100))
+
+            if not latest_articles:
+                print("Chroma Sync Service: No articles in the database yet. Waiting...")
+                await asyncio.sleep(10)
+                continue
+
+            # 2. Get the IDs to check against ChromaDB
+            guids_to_check = [article['guid'] for article in latest_articles]
+
+            # 3. Check which articles already exist in ChromaDB
+            existing_chroma_docs = chroma_collection.get(ids=guids_to_check)
+            existing_guids = set(existing_chroma_docs['ids'])
+
+            # 4. Filter out the articles that are already synced
+            articles_to_index = [article for article in latest_articles if article['guid'] not in existing_guids]
+
+            if articles_to_index:
+                print(f"Chroma Sync Service: Found {len(articles_to_index)} new articles to index.")
+                documents, metadatas, ids = [], [], []
+
+                for article in articles_to_index:
+                    doc_content = f"{article.get('title', '')}\n{article.get('description', '')}\n{article.get('content', '')}"
+                    metadata = {key: str(value) for key, value in article.items()}
+
+                    documents.append(doc_content)
+                    metadatas.append(metadata)
+                    ids.append(article['guid'])
+
+                # 5. Index the new batch to ChromaDB
+                chroma_collection.upsert(ids=ids, documents=documents, metadatas=metadatas)
+                print(f"Chroma Sync Service: Successfully indexed {len(articles_to_index)} articles.")
+
+            else:
+                print("Chroma Sync Service: Latest 100 articles are already synced.")
+
+            await asyncio.sleep(10)  # Wait 10 seconds before the next check.
+
+        except Exception as e:
+            print(f"Error in Chroma sync service: {e}")
+            await asyncio.sleep(30)  # Wait longer after an error
+
+@app.on_event("startup")
+async def startup_event():
+    # Ensure tables exist
+    db['feeds']
+    db['articles']
+    # Start background tasks
+    print("Application startup: Initializing background tasks.")
+    asyncio.create_task(hourly_sync_task())
+    asyncio.create_task(chroma_sync_task())
 
 if __name__ == "__main__":
     import uvicorn
diff --git a/routers.py b/routers.py
index 8c5b817..e1d1c73 100644
--- a/routers.py
+++ b/routers.py
@@ -1,5 +1,5 @@
-from fastapi import APIRouter, Request, UploadFile, File, Form, WebSocket, WebSocketDisconnect
-from fastapi.responses import HTMLResponse, RedirectResponse
+from fastapi import APIRouter, Request, UploadFile, File, Form, WebSocket, WebSocketDisconnect, Query
+from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
 from fastapi.templating import Jinja2Templates
 import dataset
 import json
@@ -9,11 +9,18 @@ import asyncio
 from datetime import datetime
 import time
 import trafilatura
+import chromadb
+from chromadb.config import Settings
 
 router = APIRouter()
 templates = Jinja2Templates(directory="templates")
 db = dataset.connect('sqlite:///feeds.db')
+
+# ChromaDB setup
+# This creates a persistent database in the 'chroma_db' directory
+chroma_client = chromadb.PersistentClient(path="chroma_db")
+chroma_collection = chroma_client.get_or_create_collection(name="articles")
 
 # Browser-like headers
 HEADERS = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
@@ -103,6 +110,15 @@ async def perform_sync():
                 total_articles_added += 1
 
                 articles_table.upsert(article_data, ['guid'])
+
+                # Index the article to ChromaDB
+                doc_content = f"{article_data.get('title', '')}\n{article_data.get('description', '')}\n{article_data.get('content', '')}"
+                metadata = {key: str(value) for key, value in article_data.items()}
+                chroma_collection.upsert(
+                    documents=[doc_content],
+                    metadatas=[metadata],
+                    ids=[article_data['guid']]
+                )
 
             feeds_table.update({
                 'id': [f for f in feeds if f['url'] == feed_url][0]['id'],
@@ -346,6 +362,15 @@ async def websocket_sync(websocket: WebSocket):
                     articles_count += 1
 
                     articles_table.upsert(article_data, ['guid'])
+
+                    # Index the article to ChromaDB
+                    doc_content = f"{article_data.get('title', '')}\n{article_data.get('description', '')}"
+                    metadata = {key: str(value) for key, value in article_data.items() if key != 'content'}  # Exclude large content from metadata
+                    chroma_collection.upsert(
+                        documents=[doc_content],
+                        metadatas=[metadata],
+                        ids=[article_data['guid']]
+                    )
 
             total_articles_added += articles_count
 
@@ -398,6 +423,92 @@ async def websocket_sync(websocket: WebSocket):
             "message": str(e)
         })
 
+## --- API Endpoints ---
+
+@router.post("/api/sync-to-chroma", tags=["API"], status_code=200)
+async def sync_all_articles_to_chroma():
+    """
+    Manually synchronizes all articles from the SQLite database to the ChromaDB vector store.
+    This is useful for initializing the search index with existing data.
+    """
+    articles_table = db['articles']
+    all_articles = list(articles_table.all())
+
+    if not all_articles:
+        return JSONResponse(content={"status": "noop", "message": "No articles in the database to sync."})
+
+    documents, metadatas, ids = [], [], []
+
+    for article in all_articles:
+        # The document is what ChromaDB will search against; title, description and content combined work well.
+        doc_content = f"{article.get('title', '')}\n{article.get('description', '')}\n{article.get('content', '')}"
+
+        # Metadata must have values of type str, int, float, or bool.
+        metadata = {key: str(value) for key, value in article.items()}
+
+        documents.append(doc_content)
+        metadatas.append(metadata)
+        ids.append(article['guid'])
+
+    # Upsert in batches to be memory-efficient
+    batch_size = 100
+    for i in range(0, len(ids), batch_size):
+        chroma_collection.upsert(
+            ids=ids[i:i+batch_size],
+            documents=documents[i:i+batch_size],
+            metadatas=metadatas[i:i+batch_size]
+        )
+
+    return JSONResponse(content={
+        "status": "success",
+        "message": f"Successfully indexed {len(all_articles)} articles to ChromaDB."
+    })
+
+@router.get("/api/search", tags=["API"])
+async def search_articles(
+    q: str = Query(None, description="The search term to query for."),
+    limit: int = Query(20, ge=1, le=100, description="The maximum number of results to return."),
+    page: int = Query(1, ge=1, description="The page number for paginated results (used when 'q' is not provided).")
+):
+    """
+    Searches for articles within the ChromaDB vector store.
+
+    - If **q** is provided, performs a similarity search based on the query text.
+    - If **q** is not provided, returns a paginated list of all articles, sorted by insertion order.
+    """
+    if q:
+        # Perform a similarity search
+        results = chroma_collection.query(
+            query_texts=[q],
+            n_results=limit,
+            include=['metadatas', 'distances']
+        )
+
+        # Format results into a cleaner list of objects
+        formatted_results = []
+        if results and results.get('ids', [[]])[0]:
+            for i, doc_id in enumerate(results['ids'][0]):
+                res = results['metadatas'][0][i]
+                res['distance'] = results['distances'][0][i]
+                formatted_results.append(res)
+
+        return JSONResponse(content={"results": formatted_results})
+
+    else:
+        # Return a paginated list of articles
+        page_limit = 20
+        offset = (page - 1) * page_limit
+
+        results = chroma_collection.get(
+            limit=page_limit,
+            offset=offset,
+            include=['metadatas']
+        )
+
+        return JSONResponse(content={"results": results['metadatas']})
+
+## --- HTML Page Routes ---
+
 @router.get("/newspapers", response_class=HTMLResponse)
 async def newspapers_list(request: Request):
     newspapers_table = db['newspapers']
@@ -441,17 +552,22 @@ async def newspaper_latest(request: Request):
     newspapers_table = db['newspapers']
     newspaper = None
     try:
-        newspaper = list(db.query("select * from newspapers order by id desc limit 1"))[0]
+        newspapers = list(db.query("select * from newspapers order by id desc limit 10"))
     except IndexError:
         pass
 
-    if not newspaper:
-        return RedirectResponse(url="/newspapers")
-
-    articles = json.loads(newspaper['articles_json'])
-
-    return templates.TemplateResponse("newspaper_view.html", {
-        "request": request,
-        "newspaper": newspaper,
-        "articles": articles
-    })
+    for newspaper in newspapers:
+        articles = json.loads(newspaper['articles_json'])
+        if articles:
+            for article in articles:
+                for key, value in article.items():
+                    article[key] = str(value).strip()
+            return templates.TemplateResponse("newspaper_view.html", {
+                "request": request,
+                "newspaper": newspaper,
+                "articles": articles
+            })
+
+    return RedirectResponse(url="/newspapers")
diff --git a/templates/newspaper_view.html b/templates/newspaper_view.html
index 485aa32..f0d7c28 100644
--- a/templates/newspaper_view.html
+++ b/templates/newspaper_view.html
@@ -191,7 +191,7 @@
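
Once the patch is applied, the new endpoints can be smoke-tested from any HTTP client. A minimal sketch, assuming the app is served by uvicorn at http://localhost:8000, that the requests package is installed, and using "climate policy" as a placeholder query:

    # Hypothetical smoke test for the endpoints added in this patch.
    import requests

    BASE = "http://localhost:8000"

    # One-time backfill of the existing SQLite articles into the Chroma index.
    print(requests.post(f"{BASE}/api/sync-to-chroma").json())

    # Similarity search; each result is the article metadata plus a 'distance' score.
    hits = requests.get(f"{BASE}/api/search", params={"q": "climate policy", "limit": 5}).json()
    for hit in hits["results"]:
        print(hit.get("title"), "-", hit.get("distance"))

    # Without 'q': a paginated listing pulled straight from the collection.
    page2 = requests.get(f"{BASE}/api/search", params={"page": 2}).json()
    print(len(page2["results"]), "articles on page 2")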
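
To check that the background chroma_sync_task is actually writing to disk, the persistent collection can also be opened directly with the chromadb client. A sketch that only assumes the 'chroma_db' directory and 'articles' collection names used in this patch:

    # Inspect the persistent index outside the app (run from the project directory).
    import chromadb

    client = chromadb.PersistentClient(path="chroma_db")
    collection = client.get_or_create_collection(name="articles")

    print("indexed articles:", collection.count())

    # Spot-check a few entries; the ids are the article GUIDs upserted by the sync tasks.
    peek = collection.get(limit=3, include=["metadatas"])
    for guid, meta in zip(peek["ids"], peek["metadatas"]):
        print(guid, "->", meta.get("title"))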