import asyncio
import hashlib
import io
import json
import os
import re
import uuid
from datetime import datetime
from typing import Optional, List
from contextlib import asynccontextmanager

from fastapi import FastAPI, UploadFile, File, Form, WebSocket, WebSocketDisconnect, HTTPException, BackgroundTasks
from fastapi.responses import FileResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from sqlmodel import Field, SQLModel, create_engine, Session, select
import chromadb
from chromadb.config import Settings
import tiktoken
import openai
import pypdf

# Configuration
UPLOAD_DIR = "uploads"
DB_URL = "sqlite:///./database.db"
CHROMA_DIR = "./chroma_db"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "your-api-key-here")
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(CHROMA_DIR, exist_ok=True)

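# The OpenAI key is read from the environment; a typical way to provide it before starting
# the server (illustrative):
#   export OPENAI_API_KEY=sk-...
# The "your-api-key-here" fallback is only a placeholder and will not authenticate.
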
# Database Models
class Document(SQLModel, table=True):
    id: Optional[str] = Field(default=None, primary_key=True)
    name: str
    filename: str
    markdown_content: str = Field(default="")
    upload_time: datetime
    status: str = "pending"
    downloads: int = 0

class SearchResult(SQLModel, table=True):
    id: Optional[int] = Field(default=None, primary_key=True)
    query: str
    slug: str = Field(index=True)
    results_json: str
    created_at: datetime
    tokens_used: int = 0
    cost_eur: float = 0.0

class PromptResult(SQLModel, table=True):
    id: Optional[int] = Field(default=None, primary_key=True)
    query: str
    slug: str = Field(index=True)
    prompt_response: str
    search_results_json: str
    created_at: datetime
    input_tokens: int = 0
    output_tokens: int = 0
    cost_eur: float = 0.0

# Database Setup
engine = create_engine(DB_URL, connect_args={"check_same_thread": False})

def init_db():
    SQLModel.metadata.create_all(engine)

# ChromaDB Setup
chroma_client = chromadb.PersistentClient(path=CHROMA_DIR, settings=Settings(anonymized_telemetry=False))
collection = chroma_client.get_or_create_collection("documents")

# OpenAI Setup
openai.api_key = OPENAI_API_KEY
tokenizer = tiktoken.get_encoding("cl100k_base")

# WebSocket Manager
class ConnectionManager:
    def __init__(self):
        self.active_connections: dict[str, WebSocket] = {}

    async def connect(self, document_id: str, websocket: WebSocket):
        await websocket.accept()
        self.active_connections[document_id] = websocket

    def disconnect(self, document_id: str):
        if document_id in self.active_connections:
            del self.active_connections[document_id]

    async def send_message(self, document_id: str, message: dict):
        if document_id in self.active_connections:
            try:
                await self.active_connections[document_id].send_json(message)
            except Exception:
                # The client went away mid-send; drop the stale connection.
                self.disconnect(document_id)

manager = ConnectionManager()

# Lifespan
@asynccontextmanager
async def lifespan(app: FastAPI):
    init_db()
    yield

# FastAPI App
app = FastAPI(lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Helper Functions
def generate_slug(text: str) -> str:
    """Generate URL-friendly slug from text"""
    slug = re.sub(r'[^\w\s-]', '', text.lower())
    slug = re.sub(r'[-\s]+', '-', slug)
    return slug[:50] + "-" + hashlib.md5(text.encode()).hexdigest()[:8]

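# Illustrative example (hypothetical query): generate_slug("What are the key findings?")
# returns something like "what-are-the-key-findings-XXXXXXXX", i.e. the lowercased, hyphenated
# query truncated to 50 characters plus the first 8 hex characters of its MD5 hash, so the
# same query always maps to the same cache slug.
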
def convert_to_markdown(filepath: str) -> str:
    """Convert various document formats to markdown"""
    ext = os.path.splitext(filepath)[1].lower()

    if ext == '.pdf':
        markdown_text = ""
        with open(filepath, 'rb') as f:
            pdf_reader = pypdf.PdfReader(f)
            for page_num, page in enumerate(pdf_reader.pages):
                text = page.extract_text()
                markdown_text += f"\n\n## Page {page_num + 1}\n\n{text}"
        return markdown_text

    elif ext == '.md':
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            return f.read()

    else:
        # Plain-text fallback: pass lines through, reducing whitespace-only lines to empty lines.
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        lines = content.split('\n')
        markdown_lines = []
        for line in lines:
            if line.strip():
                markdown_lines.append(line)
            else:
                markdown_lines.append('')
        return '\n'.join(markdown_lines)

def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[dict]:
    tokens = tokenizer.encode(text)
    chunks = []
    start = 0

    # Extract page numbers from markdown headers
    page_pattern = re.compile(r'##\s*Page\s+(\d+)', re.IGNORECASE)

    while start < len(tokens):
        end = start + chunk_size
        chunk_tokens = tokens[start:end]
        chunk_str = tokenizer.decode(chunk_tokens)

        # Find page number in chunk
        page_match = page_pattern.search(chunk_str)
        page_num = int(page_match.group(1)) if page_match else 1

        chunks.append({
            "text": chunk_str,
            "page": page_num
        })
        start = end - overlap

    return chunks

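# Sketch of the window arithmetic above (with the defaults CHUNK_SIZE=500, CHUNK_OVERLAP=50):
# chunk 0 covers tokens [0, 500), chunk 1 covers [450, 950), chunk 2 covers [900, 1400), ...
# so consecutive chunks share 50 tokens and a short sentence cut at a boundary still appears
# whole in at least one chunk.
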
async def get_embedding(text: str) -> tuple[List[float], int]:
    # The OpenAI client call is blocking, so run it in a worker thread to keep the event loop free.
    response = await asyncio.to_thread(
        openai.embeddings.create,
        input=text,
        model="text-embedding-3-small"
    )
    tokens_used = response.usage.total_tokens
    return response.data[0].embedding, tokens_used

async def detect_prompt_intent(query: str) -> bool:
    """Detect if query is a prompt vs simple search"""
    prompt_keywords = ['make', 'create', 'list', 'summarize', 'explain', 'compare', 'analyze', 'generate', 'write', 'show me', 'give me', 'find all', 'extract', 'what are', 'how many']
    query_lower = query.lower()

    # Check for question words or action verbs
    if any(keyword in query_lower for keyword in prompt_keywords):
        return True
    if query_lower.endswith('?') and len(query.split()) > 3:
        return True
    return False

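# Illustrative behaviour of the heuristic above (hypothetical queries):
#   "summarize the quarterly report"  -> True   (contains the keyword "summarize")
#   "when is the deadline mentioned?" -> True   (ends with "?" and has more than 3 words)
#   "invoice 2023"                    -> False  (treated as a plain search)
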
async def execute_prompt(query: str, search_results: List[dict]) -> tuple[str, int, int, float]:
    """Execute prompt using GPT-4o-mini with search results as context"""

    # Prepare context from search results
    context = "\n\n".join([
        f"Document: {r['name']}\nPage: {r.get('page', 'N/A')}\nContent: {r['snippet']}"
        for r in search_results[:10]  # Use top 10 results
    ])

    messages = [
        {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided document context. Be concise and accurate."},
        {"role": "user", "content": f"Context from documents:\n\n{context}\n\nUser query: {query}\n\nPlease answer based on the context provided."}
    ]

    response = await asyncio.to_thread(
        openai.chat.completions.create,
        model="gpt-4o-mini",
        messages=messages,
        temperature=0.7,
        max_tokens=1000
    )

    input_tokens = response.usage.prompt_tokens
    output_tokens = response.usage.completion_tokens

    # Calculate cost: GPT-4o-mini pricing
    # $0.150 per 1M input tokens, $0.600 per 1M output tokens
    cost_usd = (input_tokens / 1_000_000 * 0.150) + (output_tokens / 1_000_000 * 0.600)
    cost_eur = cost_usd * 0.92

    return response.choices[0].message.content, input_tokens, output_tokens, cost_eur

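# Worked example of the cost formula above (illustrative numbers): 2,000 input tokens and
# 500 output tokens cost (2000 / 1e6) * 0.150 + (500 / 1e6) * 0.600 = $0.0006, which is
# about 0.00055 EUR at the fixed 0.92 USD-to-EUR rate used throughout this file.
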
async def process_document(document_id: str, filepath: str):
    total_tokens = 0

    await manager.send_message(document_id, {
        "step": "reading",
        "progress": 5,
        "message": "Reading file...",
        "tokens": 0,
        "cost_eur": 0.0
    })

    try:
        await manager.send_message(document_id, {
            "step": "converting",
            "progress": 15,
            "message": "Converting to markdown...",
            "tokens": 0,
            "cost_eur": 0.0
        })

        content = await asyncio.to_thread(convert_to_markdown, filepath)

        with Session(engine) as session:
            doc = session.get(Document, document_id)
            if doc:
                doc.markdown_content = content
                session.add(doc)
                session.commit()

        await manager.send_message(document_id, {
            "step": "chunking",
            "progress": 25,
            "message": "Splitting text into chunks...",
            "tokens": 0,
            "cost_eur": 0.0
        })

        chunks = chunk_text(content)

        await manager.send_message(document_id, {
            "step": "embedding",
            "progress": 35,
            "message": f"Processing {len(chunks)} chunks...",
            "tokens": 0,
            "cost_eur": 0.0
        })

        for i, chunk_data in enumerate(chunks):
            embedding, tokens = await get_embedding(chunk_data["text"])
            total_tokens += tokens

            # text-embedding-3-small: $0.02 per 1M tokens ($0.00002 per 1K), converted at a fixed 0.92 USD-to-EUR rate
            cost_usd = (total_tokens / 1000) * 0.00002
            cost_eur = cost_usd * 0.92

            collection.add(
                ids=[f"{document_id}_chunk_{i}"],
                embeddings=[embedding],
                documents=[chunk_data["text"]],
                metadatas=[{
                    "document_id": document_id,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "page": chunk_data["page"]
                }]
            )

            progress = 35 + int((i + 1) / len(chunks) * 55)
            await manager.send_message(document_id, {
                "step": "embedding",
                "progress": progress,
                "message": f"Embedded chunk {i + 1}/{len(chunks)}",
                "tokens": total_tokens,
                "cost_eur": cost_eur
            })

        await manager.send_message(document_id, {
            "step": "indexing",
            "progress": 95,
            "message": "Finalizing index...",
            "tokens": total_tokens,
            "cost_eur": (total_tokens / 1000) * 0.00002 * 0.92
        })

        with Session(engine) as session:
            doc = session.get(Document, document_id)
            if doc:
                doc.status = "completed"
                session.add(doc)
                session.commit()

        await manager.send_message(document_id, {
            "step": "completed",
            "progress": 100,
            "message": "Processing complete!",
            "tokens": total_tokens,
            "cost_eur": (total_tokens / 1000) * 0.00002 * 0.92
        })

    except Exception as e:
        with Session(engine) as session:
            doc = session.get(Document, document_id)
            if doc:
                doc.status = "failed"
                session.add(doc)
                session.commit()

        await manager.send_message(document_id, {
            "step": "error",
            "progress": 0,
            "message": f"Error: {str(e)}",
            "tokens": total_tokens,
            "cost_eur": 0.0
        })

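# For reference, the progress milestones pushed over the WebSocket by process_document:
#   reading 5% -> converting 15% -> chunking 25% -> embedding 35-90% -> indexing 95% -> completed 100%
# (or a terminal "error" message if any step raises).
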
# API Endpoints
@app.post("/api/upload")
async def upload_documents(
    background_tasks: BackgroundTasks,
    files: List[UploadFile] = File(...)
):
    uploaded_docs = []

    for file in files:
        doc_id = str(uuid.uuid4())
        file_ext = os.path.splitext(file.filename)[1]
        filename = f"{doc_id}{file_ext}"
        filepath = os.path.join(UPLOAD_DIR, filename)

        content = await file.read()
        with open(filepath, 'wb') as f:
            f.write(content)

        document = Document(
            id=doc_id,
            name=file.filename,
            filename=filename,
            upload_time=datetime.now(),
            status="processing"
        )

        with Session(engine) as session:
            session.add(document)
            session.commit()

        background_tasks.add_task(process_document, doc_id, filepath)

        uploaded_docs.append({
            "document_id": doc_id,
            "name": file.filename,
            "status": "processing"
        })

    return {"documents": uploaded_docs}

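# Example upload (hypothetical client, assuming the default host/port from the bottom of this
# file): each file goes in a separate "files" form field and is processed in the background:
#   curl -F "files=@report.pdf" -F "files=@notes.md" http://localhost:9900/api/upload
# The response carries one {"document_id", "name", "status"} entry per file; progress can then
# be followed on the /ws/status/{document_id} WebSocket.
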
@app.get("/api/search")
async def search_documents(query: str, page: int = 1, page_size: int = 10):
    try:
        slug = generate_slug(query)

        # Check cache first
        with Session(engine) as session:
            cached = session.exec(select(SearchResult).where(SearchResult.slug == slug)).first()
            if cached:
                results_data = json.loads(cached.results_json)
                # Apply the same pagination to cached results as to fresh ones.
                start_idx = (page - 1) * page_size
                end_idx = page * page_size
                return {
                    "results": results_data["results"][start_idx:end_idx],
                    "total": results_data["total"],
                    "page": page,
                    "page_size": page_size,
                    "slug": slug,
                    "tokens": cached.tokens_used,
                    "cost_eur": cached.cost_eur,
                    "cached": True
                }

        collection_count = collection.count()
        if collection_count == 0:
            return {
                "results": [],
                "total": 0,
                "page": page,
                "page_size": page_size,
                "slug": slug,
                "tokens": 0,
                "cost_eur": 0.0
            }

        query_embedding, tokens = await get_embedding(query)
        cost_eur = (tokens / 1000) * 0.00002 * 0.92

        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=min(50, collection_count)
        )

        search_results = []

        if results['ids'] and len(results['ids'][0]) > 0:
            for i in range(len(results['ids'][0])):
                document_id = results['metadatas'][0][i]['document_id']
                snippet = results['documents'][0][i]
                distance = results['distances'][0][i] if 'distances' in results else 0
                page_num = results['metadatas'][0][i].get('page', 1)

                with Session(engine) as session:
                    doc = session.get(Document, document_id)
                    if doc:
                        search_results.append({
                            "document_id": document_id,
                            "name": doc.name,
                            "snippet": snippet[:300] + "..." if len(snippet) > 300 else snippet,
                            "chunk_index": results['metadatas'][0][i]['chunk_index'],
                            "page": page_num,
                            "upload_date": doc.upload_time.isoformat(),
                            "score": 1 - distance
                        })

        # Cache results
        with Session(engine) as session:
            search_cache = SearchResult(
                query=query,
                slug=slug,
                results_json=json.dumps({"results": search_results, "total": len(search_results)}),
                created_at=datetime.now(),
                tokens_used=tokens,
                cost_eur=cost_eur
            )
            session.add(search_cache)
            session.commit()

        start_idx = (page - 1) * page_size
        end_idx = page * page_size

        return {
            "results": search_results[start_idx:end_idx],
            "total": len(search_results),
            "page": page,
            "page_size": page_size,
            "slug": slug,
            "tokens": tokens,
            "cost_eur": cost_eur
        }

    except Exception as e:
        print(f"Search error: {str(e)}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))

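# Example response entry (illustrative shape, not real data): a hit looks like
#   {"document_id": "...", "name": "report.pdf", "snippet": "...", "chunk_index": 4,
#    "page": 3, "upload_date": "2024-01-01T12:00:00", "score": 0.42}
# where score is 1 minus the vector distance reported by Chroma, and the returned slug lets
# the same result set be reloaded later via GET /api/search/{slug} without re-embedding.
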
@app.get("/api/search/{slug}")
async def get_cached_search(slug: str):
    with Session(engine) as session:
        cached = session.exec(select(SearchResult).where(SearchResult.slug == slug)).first()
        if not cached:
            raise HTTPException(status_code=404, detail="Search not found")

        results_data = json.loads(cached.results_json)
        return {
            "query": cached.query,
            "results": results_data["results"],
            "total": results_data["total"],
            "slug": slug,
            "tokens": cached.tokens_used,
            "cost_eur": cached.cost_eur
        }

@app.post("/api/prompt")
async def execute_prompt_endpoint(query: str):
    try:
        slug = generate_slug(query)

        # Check cache
        with Session(engine) as session:
            cached = session.exec(select(PromptResult).where(PromptResult.slug == slug)).first()
            if cached:
                return {
                    "response": cached.prompt_response,
                    "search_results": json.loads(cached.search_results_json),
                    "slug": slug,
                    "input_tokens": cached.input_tokens,
                    "output_tokens": cached.output_tokens,
                    "cost_eur": cached.cost_eur,
                    "cached": True
                }

        # First get search results
        search_response = await search_documents(query, page=1, page_size=20)

        # Execute prompt with results
        response_text, input_tokens, output_tokens, cost_eur = await execute_prompt(
            query,
            search_response["results"]
        )

        # Add search cost to total
        total_cost = cost_eur + search_response.get("cost_eur", 0)

        # Cache prompt result
        with Session(engine) as session:
            prompt_cache = PromptResult(
                query=query,
                slug=slug,
                prompt_response=response_text,
                search_results_json=json.dumps(search_response["results"]),
                created_at=datetime.now(),
                input_tokens=input_tokens,
                output_tokens=output_tokens,
                cost_eur=total_cost
            )
            session.add(prompt_cache)
            session.commit()

        return {
            "response": response_text,
            "search_results": search_response["results"],
            "slug": slug,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost_eur": total_cost
        }

    except Exception as e:
        print(f"Prompt error: {str(e)}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))

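# Example call (hypothetical): the prompt endpoint takes `query` as a query parameter, e.g.
#   curl -X POST "http://localhost:9900/api/prompt?query=summarize%20the%20uploaded%20reports"
# It reuses the /api/search pipeline for context, sends the top hits to GPT-4o-mini, and
# caches the answer under a slug retrievable from GET /api/prompt/{slug}.
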
@app.get("/api/prompt/{slug}")
async def get_cached_prompt(slug: str):
    with Session(engine) as session:
        cached = session.exec(select(PromptResult).where(PromptResult.slug == slug)).first()
        if not cached:
            raise HTTPException(status_code=404, detail="Prompt result not found")

        return {
            "query": cached.query,
            "response": cached.prompt_response,
            "search_results": json.loads(cached.search_results_json),
            "slug": slug,
            "input_tokens": cached.input_tokens,
            "output_tokens": cached.output_tokens,
            "cost_eur": cached.cost_eur
        }

@app.get("/api/documents")
async def list_documents():
    with Session(engine) as session:
        documents = session.exec(select(Document)).all()
        return [{
            "id": doc.id,
            "name": doc.name,
            "upload_time": doc.upload_time.isoformat(),
            "status": doc.status,
            "downloads": doc.downloads
        } for doc in documents]

@app.get("/api/document/{document_id}")
async def get_document(document_id: str):
    with Session(engine) as session:
        doc = session.get(Document, document_id)
        if not doc:
            raise HTTPException(status_code=404, detail="Document not found")

        return {
            "id": doc.id,
            "name": doc.name,
            "markdown_content": doc.markdown_content,
            "upload_time": doc.upload_time.isoformat(),
            "status": doc.status
        }

@app.get("/api/download/{document_id}")
async def download_document(document_id: str):
    with Session(engine) as session:
        doc = session.get(Document, document_id)
        if not doc:
            raise HTTPException(status_code=404, detail="Document not found")

        doc.downloads += 1
        session.add(doc)
        session.commit()

        filepath = os.path.join(UPLOAD_DIR, doc.filename)
        if not os.path.exists(filepath):
            raise HTTPException(status_code=404, detail="File not found")

        return FileResponse(filepath, filename=doc.name)

@app.websocket("/ws/status/{document_id}")
async def websocket_endpoint(websocket: WebSocket, document_id: str):
    await manager.connect(document_id, websocket)
    try:
        # Keep the socket open; status updates are pushed by process_document via the manager.
        while True:
            await websocket.receive_text()
    except WebSocketDisconnect:
        manager.disconnect(document_id)

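# Minimal client sketch (hypothetical, using the third-party `websockets` package) for watching
# processing progress; the server only pushes JSON status messages and ignores client input:
#
#   import asyncio, json, websockets
#
#   async def watch(document_id: str):
#       async with websockets.connect(f"ws://localhost:9900/ws/status/{document_id}") as ws:
#           while True:
#               status = json.loads(await ws.recv())
#               print(status["step"], status["progress"], status["message"])
#               if status["step"] in ("completed", "error"):
#                   break
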
@app.get("/")
async def read_root():
    return FileResponse("index.html")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=9900)