Update.
This commit is contained in:
parent 018b4e431a
commit 30c3821d98
85 app.py
@@ -4,11 +4,17 @@ from fastapi.templating import Jinja2Templates
import dataset
import asyncio
from datetime import datetime
import chromadb
from chromadb.config import Settings

app = FastAPI(title="RSS Feed Manager")

# Database setup
# Database and ChromaDB setup (accessible by background tasks)
db = dataset.connect('sqlite:///feeds.db')
chroma_client = chromadb.Client(
    Settings(is_persistent=True, persist_directory="chroma_db")
)
chroma_collection = chroma_client.get_or_create_collection(name="articles")

# Templates setup
templates = Jinja2Templates(directory="templates")
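Note: the Client(Settings(...)) construction above and the PersistentClient used later in routers.py both open the same on-disk store in "chroma_db", so they share the "articles" collection. For reference only, a minimal sketch of the shorter equivalent constructor, assuming a chromadb release that ships PersistentClient (0.4 or newer); this is not part of the commit:

# Sketch: equivalent persistent setup with the newer client constructor (assumption: chromadb >= 0.4)
import chromadb

chroma_client = chromadb.PersistentClient(path="chroma_db")  # same directory as above
chroma_collection = chroma_client.get_or_create_collection(name="articles")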
@@ -18,20 +24,81 @@ from routers import router as manage_router, run_sync_task

app.include_router(manage_router)

@app.on_event("startup")
async def startup_event():
    # Ensure feeds table exists
    feeds_table = db['feeds']
    # Start background sync task
    asyncio.create_task(hourly_sync_task())

async def hourly_sync_task():
    """Periodically fetches new articles from RSS feeds."""
    await asyncio.sleep(15)
    while True:
        await asyncio.sleep(3600)  # Wait 1 hour
        print("Hourly Sync: Starting feed synchronization.")
        try:
            await run_sync_task()
            print("Hourly Sync: Feed synchronization finished.")
        except Exception as e:
            print(f"Error in hourly sync: {e}")
        await asyncio.sleep(3600)  # Wait 1 hour

async def chroma_sync_task():
    """
    A continuous background service that syncs the latest 100 articles,
    checking against ChromaDB to avoid duplicates.
    """
    print("Chroma Sync Service: Task started.")
    articles_table = db['articles']

    while True:
        try:
            print("Chroma Sync Service: Checking latest 100 articles from the database...")
            # 1. Fetch the 100 most recent articles from SQLite
            latest_articles = list(articles_table.find(order_by='-id', _limit=100))

            if not latest_articles:
                print("Chroma Sync Service: No articles in the database yet. Waiting...")
                await asyncio.sleep(10)
                continue

            # 2. Get the IDs to check against ChromaDB
            guids_to_check = [article['guid'] for article in latest_articles]

            # 3. Check which articles already exist in ChromaDB
            existing_chroma_docs = chroma_collection.get(ids=guids_to_check)
            existing_guids = set(existing_chroma_docs['ids'])

            # 4. Filter out the articles that are already synced
            articles_to_index = [article for article in latest_articles if article['guid'] not in existing_guids]

            if articles_to_index:
                print(f"Chroma Sync Service: Found {len(articles_to_index)} new articles to index.")
                documents, metadatas, ids = [], [], []

                for article in articles_to_index:
                    doc_content = f"{article.get('title', '')}\n{article.get('description', '')}\n{article.get('content', '')}"
                    metadata = {key: str(value) for key, value in article.items()}

                    documents.append(doc_content)
                    metadatas.append(metadata)
                    ids.append(article['guid'])

                # 5. Index the new batch to ChromaDB
                chroma_collection.upsert(ids=ids, documents=documents, metadatas=metadatas)
                print(f"Chroma Sync Service: Successfully indexed {len(articles_to_index)} articles.")

            else:
                print("Chroma Sync Service: Latest 100 articles are already synced.")

            await asyncio.sleep(10)  # Wait 10 seconds before the next check.

        except Exception as e:
            print(f"Error in Chroma sync service: {e}")
            await asyncio.sleep(30)  # Wait longer after an error

@app.on_event("startup")
async def startup_event():
    # Ensure tables exist
    db['feeds']
    db['articles']
    # Start background tasks
    print("Application startup: Initializing background tasks.")
    asyncio.create_task(hourly_sync_task())
    asyncio.create_task(chroma_sync_task())

if __name__ == "__main__":
    import uvicorn
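The hunk above wires both background loops from an @app.on_event("startup") hook. On FastAPI versions that deprecate that decorator, the same startup work can be expressed through a lifespan handler instead; a minimal sketch under that assumption, not part of this commit:

# Sketch: lifespan-based startup (assumption: a FastAPI version supporting the lifespan parameter)
from contextlib import asynccontextmanager
from fastapi import FastAPI

@asynccontextmanager
async def lifespan(app: FastAPI):
    db['feeds']       # ensure tables exist, as in startup_event above
    db['articles']
    asyncio.create_task(hourly_sync_task())
    asyncio.create_task(chroma_sync_task())
    yield             # application serves requests; shutdown cleanup would go after the yield

app = FastAPI(title="RSS Feed Manager", lifespan=lifespan)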
140 routers.py
@@ -1,5 +1,5 @@
from fastapi import APIRouter, Request, UploadFile, File, Form, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi import APIRouter, Request, UploadFile, File, Form, WebSocket, WebSocketDisconnect, Query
from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
from fastapi.templating import Jinja2Templates
import dataset
import json
@@ -9,11 +9,18 @@ import asyncio
from datetime import datetime
import time
import trafilatura
import chromadb
from chromadb.config import Settings  # Import Settings

router = APIRouter()
templates = Jinja2Templates(directory="templates")
db = dataset.connect('sqlite:///feeds.db')

# ChromaDB setup
# This creates a persistent database in the 'chroma_db' directory
chroma_client = chromadb.PersistentClient(path="chroma_db")
chroma_collection = chroma_client.get_or_create_collection(name="articles")

# Browser-like headers
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
@@ -103,6 +110,15 @@ async def perform_sync():
            total_articles_added += 1

            articles_table.upsert(article_data, ['guid'])

            # Index the article to ChromaDB
            doc_content = f"{article_data.get('title', '')}\n{article_data.get('description', '')}\n{article_data.get('content', '')}"
            metadata = {key: str(value) for key, value in article_data.items()}
            chroma_collection.upsert(
                documents=[doc_content],
                metadatas=[metadata],
                ids=[article_data['guid']]
            )

        feeds_table.update({
            'id': [f for f in feeds if f['url'] == feed_url][0]['id'],
@@ -346,6 +362,15 @@ async def websocket_sync(websocket: WebSocket):
            articles_count += 1

            articles_table.upsert(article_data, ['guid'])

            # Index the article to ChromaDB
            doc_content = f"{article_data.get('title', '')}\n{article_data.get('description', '')}"
            metadata = {key: str(value) for key, value in article_data.items() if key != 'content'}  # Exclude large content from metadata
            chroma_collection.upsert(
                documents=[doc_content],
                metadatas=[metadata],
                ids=[article_data['guid']]
            )

        total_articles_added += articles_count
@@ -398,6 +423,92 @@ async def websocket_sync(websocket: WebSocket):
            "message": str(e)
        })

## --- API Endpoints ---

@router.post("/api/sync-to-chroma", tags=["API"], status_code=200)
async def sync_all_articles_to_chroma():
    """
    Manually synchronizes all articles from the SQLite database to the ChromaDB vector store.
    This is useful for initializing the search index with existing data.
    """
    articles_table = db['articles']
    all_articles = list(articles_table.all())

    if not all_articles:
        return JSONResponse(content={"status": "noop", "message": "No articles in the database to sync."})

    documents, metadatas, ids = [], [], []

    for article in all_articles:
        # The document is what ChromaDB will search against. A combo of title and content is good.
        doc_content = f"{article.get('title', '')}\n{article.get('description', '')}\n{article.get('content', '')}"

        # Metadata must have values of type str, int, float, or bool.
        metadata = {key: str(value) for key, value in article.items()}

        documents.append(doc_content)
        metadatas.append(metadata)
        ids.append(article['guid'])

    # Upsert in batches to be memory-efficient
    batch_size = 100
    for i in range(0, len(ids), batch_size):
        chroma_collection.upsert(
            ids=ids[i:i+batch_size],
            documents=documents[i:i+batch_size],
            metadatas=metadatas[i:i+batch_size]
        )

    return JSONResponse(content={
        "status": "success",
        "message": f"Successfully indexed {len(all_articles)} articles to ChromaDB."
    })
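A quick way to exercise this backfill endpoint once the app is running; a sketch using the requests library, where the base URL (host and port passed to uvicorn) is an assumption, not part of the commit:

# Sketch: trigger a one-off backfill of the ChromaDB index (assumed base URL)
import requests

resp = requests.post("http://localhost:8000/api/sync-to-chroma")
print(resp.json())  # expected shape: {"status": "success", "message": "Successfully indexed ... articles to ChromaDB."}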
@router.get("/api/search", tags=["API"])
|
||||
async def search_articles(
|
||||
q: str = Query(None, description="The search term to query for."),
|
||||
limit: int = Query(20, ge=1, le=100, description="The maximum number of results to return."),
|
||||
page: int = Query(1, ge=1, description="The page number for paginated results (used when 'q' is not provided).")
|
||||
):
|
||||
"""
|
||||
Searches for articles within the ChromaDB vector store.
|
||||
|
||||
- If **q** is provided, performs a similarity search based on the query text.
|
||||
- If **q** is not provided, returns a paginated list of all articles, sorted by insertion order.
|
||||
"""
|
||||
if q:
|
||||
# Perform a similarity search
|
||||
results = chroma_collection.query(
|
||||
query_texts=[q],
|
||||
n_results=limit,
|
||||
include=['metadatas', 'distances']
|
||||
)
|
||||
|
||||
# Format results into a cleaner list of objects
|
||||
formatted_results = []
|
||||
if results and results.get('ids', [[]])[0]:
|
||||
for i, doc_id in enumerate(results['ids'][0]):
|
||||
res = results['metadatas'][0][i]
|
||||
res['distance'] = results['distances'][0][i]
|
||||
formatted_results.append(res)
|
||||
|
||||
return JSONResponse(content={"results": formatted_results})
|
||||
|
||||
else:
|
||||
# Return a paginated list of articles
|
||||
page_limit = 20
|
||||
offset = (page - 1) * page_limit
|
||||
|
||||
results = chroma_collection.get(
|
||||
limit=page_limit,
|
||||
offset=offset,
|
||||
include=['metadatas']
|
||||
)
|
||||
|
||||
return JSONResponse(content={"results": results['metadatas']})
|
||||
|
||||
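For completeness, a small client-side sketch of both modes of this endpoint (similarity search and paginated listing). The base URL and the query text are illustrative assumptions:

# Sketch: querying the search endpoint (assumed base URL and query text)
import requests

hits = requests.get(
    "http://localhost:8000/api/search",
    params={"q": "artificial intelligence", "limit": 5},
).json()["results"]
for hit in hits:
    print(hit.get("title"), hit.get("distance"))

# Without 'q', the endpoint returns a paginated metadata listing
page2 = requests.get("http://localhost:8000/api/search", params={"page": 2}).json()["results"]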
## --- HTML Page Routes ---

@router.get("/newspapers", response_class=HTMLResponse)
async def newspapers_list(request: Request):
    newspapers_table = db['newspapers']
@@ -441,17 +552,22 @@ async def newspaper_latest(request: Request):
    newspapers_table = db['newspapers']
    newspaper = None
    try:
        newspaper = list(db.query("select * from newspapers order by id desc limit 1"))[0]
        newspapers = list(db.query("select * from newspapers order by id desc limit 10"))
    except IndexError:
        pass

    if not newspaper:
        return RedirectResponse(url="/newspapers")
    for newspaper in newspapers:


    articles = json.loads(newspaper['articles_json'])

    return templates.TemplateResponse("newspaper_view.html", {
        "request": request,
        "newspaper": newspaper,
        "articles": articles
    })
        articles = json.loads(newspaper['articles_json'])
        if articles:
            for article in articles:
                for key, value in article.items():
                    article[key] = str(value).strip()
            return templates.TemplateResponse("newspaper_view.html", {
                "request": request,
                "newspaper": newspaper,
                "articles": articles
            })

    return RedirectResponse(url="/newspapers")
@@ -191,7 +191,7 @@
            </div>

            <div class="footer">
                Generated from RSS Feed Manager | Synchronized at {{ newspaper.created_at[:19].replace('T', ' ') }}
                Molodetz News - always up to date | Synchronized at {{ newspaper.created_at[:19].replace('T', ' ') }}
            </div>
        </div>
    </body>