import json
import sqlite3
import threading
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from .semantic_index import SemanticIndex


@dataclass
class KnowledgeEntry:
    entry_id: str
    category: str
    content: str
    metadata: Dict[str, Any]
    created_at: float
    updated_at: float
    access_count: int = 0
    importance_score: float = 1.0

    def to_dict(self) -> Dict[str, Any]:
        return {
            "entry_id": self.entry_id,
            "category": self.category,
            "content": self.content,
            "metadata": self.metadata,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "access_count": self.access_count,
            "importance_score": self.importance_score,
        }


class KnowledgeStore:
    """SQLite-backed store of knowledge entries with an in-memory semantic index.

    The database is the source of truth; the semantic index is rebuilt from it
    on startup and kept in sync on writes.
    """

    def __init__(self, db_path: str):
        self.db_path = db_path
        # check_same_thread=False allows use from multiple threads; all
        # connection access is serialized through self.lock.
        self.conn = sqlite3.connect(self.db_path, check_same_thread=False)
        self.lock = threading.Lock()
        self.semantic_index = SemanticIndex()
        self._initialize_store()
        self._load_index()

    def _initialize_store(self):
        """Create the table and indexes if they do not already exist."""
        with self.lock:
            cursor = self.conn.cursor()
            cursor.execute(
                """
                CREATE TABLE IF NOT EXISTS knowledge_entries (
                    entry_id TEXT PRIMARY KEY,
                    category TEXT NOT NULL,
                    content TEXT NOT NULL,
                    metadata TEXT,
                    created_at REAL NOT NULL,
                    updated_at REAL NOT NULL,
                    access_count INTEGER DEFAULT 0,
                    importance_score REAL DEFAULT 1.0
                )
                """
            )
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_category"
                " ON knowledge_entries(category)"
            )
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_importance"
                " ON knowledge_entries(importance_score DESC)"
            )
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_created"
                " ON knowledge_entries(created_at DESC)"
            )
            self.conn.commit()

    def _load_index(self):
        """Rebuild the semantic index from all persisted entries."""
        with self.lock:
            cursor = self.conn.cursor()
            cursor.execute("SELECT entry_id, content FROM knowledge_entries")
            for row in cursor.fetchall():
                self.semantic_index.add_document(row[0], row[1])

    def add_entry(self, entry: KnowledgeEntry):
        """Insert or replace an entry and index its content."""
        with self.lock:
            cursor = self.conn.cursor()
            cursor.execute(
                """
                INSERT OR REPLACE INTO knowledge_entries
                    (entry_id, category, content, metadata, created_at,
                     updated_at, access_count, importance_score)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    entry.entry_id,
                    entry.category,
                    entry.content,
                    json.dumps(entry.metadata),
                    entry.created_at,
                    entry.updated_at,
                    entry.access_count,
                    entry.importance_score,
                ),
            )
            self.conn.commit()
            self.semantic_index.add_document(entry.entry_id, entry.content)

    def get_entry(self, entry_id: str) -> Optional[KnowledgeEntry]:
        """Fetch a single entry by ID, bumping its access count."""
        with self.lock:
            cursor = self.conn.cursor()
            cursor.execute(
                """
                SELECT entry_id, category, content, metadata, created_at,
                       updated_at, access_count, importance_score
                FROM knowledge_entries
                WHERE entry_id = ?
                """,
                (entry_id,),
            )
            row = cursor.fetchone()
            if row:
                cursor.execute(
                    """
                    UPDATE knowledge_entries
                    SET access_count = access_count + 1
                    WHERE entry_id = ?
                    """,
                    (entry_id,),
                )
                self.conn.commit()
                return KnowledgeEntry(
                    entry_id=row[0],
                    category=row[1],
                    content=row[2],
                    metadata=json.loads(row[3]) if row[3] else {},
                    created_at=row[4],
                    updated_at=row[5],
                    access_count=row[6] + 1,
                    importance_score=row[7],
                )
            return None

    def search_entries(
        self, query: str, category: Optional[str] = None, top_k: int = 5
    ) -> List[KnowledgeEntry]:
        """Hybrid search: semantic similarity combined with full-text matching."""
        # Over-fetch from both sources so the merged ranking has candidates
        # to choose from.
        semantic_results = self.semantic_index.search(query, top_k * 2)
        fts_results = self._fts_search(query, top_k * 2)

        # Combine and deduplicate results with weighted scoring.
        combined_results: Dict[str, float] = {}

        # Semantic results carry weight 0.7.
        for entry_id, score in semantic_results:
            combined_results[entry_id] = score * 0.7

        # FTS results carry weight 1.0, so exact matches take priority.
        for entry_id, score in fts_results:
            combined_results[entry_id] = max(
                combined_results.get(entry_id, 0.0), score
            )

        # Rank by combined score.
        sorted_results = sorted(
            combined_results.items(), key=lambda x: x[1], reverse=True
        )

        with self.lock:
            cursor = self.conn.cursor()
            entries = []
            # Note: the category filter is applied after ranking, so fewer
            # than top_k results may be returned when a category is given.
            for entry_id, score in sorted_results[:top_k]:
                if category:
                    cursor.execute(
                        """
                        SELECT entry_id, category, content, metadata, created_at,
                               updated_at, access_count, importance_score
                        FROM knowledge_entries
                        WHERE entry_id = ? AND category = ?
                        """,
                        (entry_id, category),
                    )
                else:
                    cursor.execute(
                        """
                        SELECT entry_id, category, content, metadata, created_at,
                               updated_at, access_count, importance_score
                        FROM knowledge_entries
                        WHERE entry_id = ?
                        """,
                        (entry_id,),
                    )
                row = cursor.fetchone()
                if row:
                    entry = KnowledgeEntry(
                        entry_id=row[0],
                        category=row[1],
                        content=row[2],
                        metadata=json.loads(row[3]) if row[3] else {},
                        created_at=row[4],
                        updated_at=row[5],
                        access_count=row[6],
                        importance_score=row[7],
                    )
                    # Surface the combined search score for callers.
                    entry.metadata["search_score"] = score
                    entries.append(entry)
            return entries

    def _fts_search(self, query: str, top_k: int = 10) -> List[Tuple[str, float]]:
        """Full-text search with exact phrase and partial word matching."""
        query_lower = query.lower()
        query_words = query_lower.split()
        if not query_words:
            return []

        with self.lock:
            cursor = self.conn.cursor()
            # Fetch candidates containing any query word. (Filtering on the
            # full phrase here would make the partial-match scoring below
            # unreachable, since every candidate would already contain the
            # whole query.)
            like_clauses = " OR ".join(
                ["LOWER(content) LIKE ?"] * len(query_words)
            )
            cursor.execute(
                "SELECT entry_id, content FROM knowledge_entries"
                f" WHERE {like_clauses}",
                [f"%{word}%" for word in query_words],
            )
            rows = cursor.fetchall()

        exact_matches = []
        partial_matches = []
        query_word_set = set(query_words)

        for entry_id, content in rows:
            content_lower = content.lower()

            # An exact phrase match gets the highest score.
            if query_lower in content_lower:
                exact_matches.append((entry_id, 1.0))
                continue

            # Otherwise score by word overlap.
            content_words = set(content_lower.split())
            matching_words = len(query_word_set & content_words)
            if matching_words > 0:
                word_overlap_score = matching_words / len(query_word_set)

                # Bonus for consecutive word sequences (2- and 3-grams).
                consecutive_bonus = 0.0
                for i in range(len(query_words)):
                    for j in range(i + 2, min(i + 4, len(query_words) + 1)):
                        phrase = " ".join(query_words[i:j])
                        if phrase in content_lower:
                            consecutive_bonus += 0.2 * (j - i)

                total_score = min(0.99, word_overlap_score + consecutive_bonus)
                partial_matches.append((entry_id, total_score))

        # Combine results, prioritizing exact matches.
        all_results = exact_matches + partial_matches
        all_results.sort(key=lambda x: x[1], reverse=True)
        return all_results[:top_k]

    def get_by_category(self, category: str, limit: int = 20) -> List[KnowledgeEntry]:
        """Return entries in a category, most important and newest first."""
        with self.lock:
            cursor = self.conn.cursor()
            cursor.execute(
                """
                SELECT entry_id, category, content, metadata, created_at,
                       updated_at, access_count, importance_score
                FROM knowledge_entries
                WHERE category = ?
                ORDER BY importance_score DESC, created_at DESC
                LIMIT ?
                """,
                (category, limit),
            )
            entries = []
            for row in cursor.fetchall():
                entries.append(
                    KnowledgeEntry(
                        entry_id=row[0],
                        category=row[1],
                        content=row[2],
                        metadata=json.loads(row[3]) if row[3] else {},
                        created_at=row[4],
                        updated_at=row[5],
                        access_count=row[6],
                        importance_score=row[7],
                    )
                )
            return entries

    def update_importance(self, entry_id: str, importance_score: float):
        """Set an entry's importance score and touch its updated_at timestamp."""
        with self.lock:
            cursor = self.conn.cursor()
            cursor.execute(
                """
                UPDATE knowledge_entries
                SET importance_score = ?, updated_at = ?
                WHERE entry_id = ?
                """,
                (importance_score, time.time(), entry_id),
            )
            self.conn.commit()

    def delete_entry(self, entry_id: str) -> bool:
        """Delete an entry; returns True if a row was removed."""
        with self.lock:
            cursor = self.conn.cursor()
            cursor.execute(
                "DELETE FROM knowledge_entries WHERE entry_id = ?", (entry_id,)
            )
            deleted = cursor.rowcount > 0
            self.conn.commit()
            if deleted:
                self.semantic_index.remove_document(entry_id)
            return deleted

    def get_statistics(self) -> Dict[str, Any]:
        """Aggregate counts over the store and the semantic index."""
        with self.lock:
            cursor = self.conn.cursor()

            cursor.execute("SELECT COUNT(*) FROM knowledge_entries")
            total_entries = cursor.fetchone()[0]

            cursor.execute("SELECT COUNT(DISTINCT category) FROM knowledge_entries")
            total_categories = cursor.fetchone()[0]

            cursor.execute(
                """
                SELECT category, COUNT(*) AS count
                FROM knowledge_entries
                GROUP BY category
                ORDER BY count DESC
                """
            )
            category_counts = {row[0]: row[1] for row in cursor.fetchall()}

            cursor.execute("SELECT SUM(access_count) FROM knowledge_entries")
            total_accesses = cursor.fetchone()[0] or 0

            return {
                "total_entries": total_entries,
                "total_categories": total_categories,
                "category_distribution": category_counts,
                "total_accesses": total_accesses,
                "vocabulary_size": len(self.semantic_index.vocabulary),
            }