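"""SQLite-backed knowledge store with hybrid retrieval.

Entries are persisted in a SQLite table and mirrored into an in-memory
SemanticIndex; search blends semantic similarity with exact and partial
full-text matching.
"""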
import json
import sqlite3
import threading
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from .semantic_index import SemanticIndex

@dataclass
class KnowledgeEntry:
    """A single stored knowledge item plus its bookkeeping fields."""

    entry_id: str
    category: str
    content: str
    metadata: Dict[str, Any]
    created_at: float
    updated_at: float
    access_count: int = 0
    importance_score: float = 1.0

    def to_dict(self) -> Dict[str, Any]:
        return {
            "entry_id": self.entry_id,
            "category": self.category,
            "content": self.content,
            "metadata": self.metadata,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "access_count": self.access_count,
            "importance_score": self.importance_score,
        }
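    # Hypothetical convenience, not in the original module: rebuild an
    # entry from the mapping produced by to_dict(), e.g. after a JSON
    # round-trip. The keys match the dataclass fields, so ** unpacking
    # works directly.
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "KnowledgeEntry":
        return cls(**data)
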
class KnowledgeStore:
    """Persistent knowledge store backed by SQLite plus a semantic index."""

    def __init__(self, db_path: str):
        self.db_path = db_path
        # check_same_thread=False lets the connection be shared across
        # threads; all access is serialized through self.lock below.
        self.conn = sqlite3.connect(self.db_path, check_same_thread=False)
        self.lock = threading.Lock()
        self.semantic_index = SemanticIndex()
        self._initialize_store()
        self._load_index()

    def _initialize_store(self):
        with self.lock:
            cursor = self.conn.cursor()

            cursor.execute(
                """
                CREATE TABLE IF NOT EXISTS knowledge_entries (
                    entry_id TEXT PRIMARY KEY,
                    category TEXT NOT NULL,
                    content TEXT NOT NULL,
                    metadata TEXT,
                    created_at REAL NOT NULL,
                    updated_at REAL NOT NULL,
                    access_count INTEGER DEFAULT 0,
                    importance_score REAL DEFAULT 1.0
                )
                """
            )

            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_category ON knowledge_entries(category)"
            )
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_importance ON knowledge_entries(importance_score DESC)"
            )
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_created ON knowledge_entries(created_at DESC)"
            )

            self.conn.commit()

    def _load_index(self):
        """Rebuild the in-memory semantic index from the persisted rows."""
        with self.lock:
            cursor = self.conn.cursor()

            cursor.execute("SELECT entry_id, content FROM knowledge_entries")
            for row in cursor.fetchall():
                self.semantic_index.add_document(row[0], row[1])

    def add_entry(self, entry: KnowledgeEntry):
        """Insert or replace an entry, keeping the semantic index in sync."""
        with self.lock:
            cursor = self.conn.cursor()

            cursor.execute(
                """
                INSERT OR REPLACE INTO knowledge_entries
                (entry_id, category, content, metadata, created_at, updated_at, access_count, importance_score)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    entry.entry_id,
                    entry.category,
                    entry.content,
                    json.dumps(entry.metadata),
                    entry.created_at,
                    entry.updated_at,
                    entry.access_count,
                    entry.importance_score,
                ),
            )

            self.conn.commit()

            self.semantic_index.add_document(entry.entry_id, entry.content)

    def get_entry(self, entry_id: str) -> Optional[KnowledgeEntry]:
        """Fetch one entry by id, bumping its access count as a side effect."""
        with self.lock:
            cursor = self.conn.cursor()

            cursor.execute(
                """
                SELECT entry_id, category, content, metadata, created_at, updated_at, access_count, importance_score
                FROM knowledge_entries
                WHERE entry_id = ?
                """,
                (entry_id,),
            )

            row = cursor.fetchone()

            if row:
                cursor.execute(
                    """
                    UPDATE knowledge_entries
                    SET access_count = access_count + 1
                    WHERE entry_id = ?
                    """,
                    (entry_id,),
                )
                self.conn.commit()

                return KnowledgeEntry(
                    entry_id=row[0],
                    category=row[1],
                    content=row[2],
                    metadata=json.loads(row[3]) if row[3] else {},
                    created_at=row[4],
                    updated_at=row[5],
                    # Reflect the increment just written to the database.
                    access_count=row[6] + 1,
                    importance_score=row[7],
                )

        return None

    def search_entries(
        self, query: str, category: Optional[str] = None, top_k: int = 5
    ) -> List[KnowledgeEntry]:
        """Hybrid search: semantic similarity blended with exact-match FTS."""
        # Combine semantic search with exact matching.
        semantic_results = self.semantic_index.search(query, top_k * 2)

        # Add FTS (Full Text Search) with exact word/phrase matching.
        fts_results = self._fts_search(query, top_k * 2)

        # Combine and deduplicate results with weighted scoring.
        combined_results = {}

        # Semantic results are down-weighted to 0.7.
        for entry_id, score in semantic_results:
            combined_results[entry_id] = score * 0.7

        # FTS results keep full weight (1.0), so exact matches win ties:
        # e.g. a 0.8 semantic hit scores 0.56, while a 1.0 exact-phrase
        # hit keeps 1.0 and outranks it.
        for entry_id, score in fts_results:
            if entry_id in combined_results:
                combined_results[entry_id] = max(combined_results[entry_id], score * 1.0)
            else:
                combined_results[entry_id] = score * 1.0

        # Sort by combined score.
        sorted_results = sorted(combined_results.items(), key=lambda x: x[1], reverse=True)

        with self.lock:
            cursor = self.conn.cursor()

            entries = []
            for entry_id, score in sorted_results[:top_k]:
                if category:
                    # Note: the category filter is applied after ranking,
                    # so fewer than top_k entries may be returned.
                    cursor.execute(
                        """
                        SELECT entry_id, category, content, metadata, created_at, updated_at, access_count, importance_score
                        FROM knowledge_entries
                        WHERE entry_id = ? AND category = ?
                        """,
                        (entry_id, category),
                    )
                else:
                    cursor.execute(
                        """
                        SELECT entry_id, category, content, metadata, created_at, updated_at, access_count, importance_score
                        FROM knowledge_entries
                        WHERE entry_id = ?
                        """,
                        (entry_id,),
                    )

                row = cursor.fetchone()
                if row:
                    entry = KnowledgeEntry(
                        entry_id=row[0],
                        category=row[1],
                        content=row[2],
                        metadata=json.loads(row[3]) if row[3] else {},
                        created_at=row[4],
                        updated_at=row[5],
                        access_count=row[6],
                        importance_score=row[7],
                    )
                    # Add search score to metadata for context.
                    entry.metadata["search_score"] = score
                    entries.append(entry)

        return entries

    def _fts_search(self, query: str, top_k: int = 10) -> List[Tuple[str, float]]:
        """Full Text Search with exact phrase and partial word matching."""
        with self.lock:
            cursor = self.conn.cursor()

            # Prepare query for FTS.
            query_lower = query.lower()
            query_words = query_lower.split()
            if not query_words:
                return []

            # Fetch candidates containing ANY query word. Fetching only
            # full-phrase matches here would make the partial-match
            # scoring below unreachable.
            like_clause = " OR ".join(["LOWER(content) LIKE ?"] * len(query_words))
            cursor.execute(
                f"""
                SELECT entry_id, content
                FROM knowledge_entries
                WHERE {like_clause}
                """,
                [f"%{word}%" for word in query_words],
            )

            exact_matches = []
            partial_matches = []

            for row in cursor.fetchall():
                entry_id, content = row
                content_lower = content.lower()

                # Exact phrase match gets highest score.
                if query_lower in content_lower:
                    exact_matches.append((entry_id, 1.0))
                    continue

                # Count matching words.
                content_words = set(content_lower.split())
                query_word_set = set(query_words)
                matching_words = len(query_word_set & content_words)

                if matching_words > 0:
                    # Score based on word overlap.
                    word_overlap_score = matching_words / len(query_word_set)

                    # Bonus for consecutive multi-word sequences (length 2-3);
                    # starting at i + 2 avoids re-crediting single words
                    # already counted in the overlap score.
                    consecutive_bonus = 0.0
                    for i in range(len(query_words)):
                        for j in range(i + 2, min(i + 4, len(query_words) + 1)):
                            phrase = " ".join(query_words[i:j])
                            if phrase in content_lower:
                                consecutive_bonus += 0.2 * (j - i)

                    total_score = min(0.99, word_overlap_score + consecutive_bonus)
                    partial_matches.append((entry_id, total_score))

            # Combine results, prioritizing exact matches.
            all_results = exact_matches + partial_matches
            all_results.sort(key=lambda x: x[1], reverse=True)

            return all_results[:top_k]

    def get_by_category(self, category: str, limit: int = 20) -> List[KnowledgeEntry]:
        with self.lock:
            cursor = self.conn.cursor()

            cursor.execute(
                """
                SELECT entry_id, category, content, metadata, created_at, updated_at, access_count, importance_score
                FROM knowledge_entries
                WHERE category = ?
                ORDER BY importance_score DESC, created_at DESC
                LIMIT ?
                """,
                (category, limit),
            )

            entries = []
            for row in cursor.fetchall():
                entries.append(
                    KnowledgeEntry(
                        entry_id=row[0],
                        category=row[1],
                        content=row[2],
                        metadata=json.loads(row[3]) if row[3] else {},
                        created_at=row[4],
                        updated_at=row[5],
                        access_count=row[6],
                        importance_score=row[7],
                    )
                )

            return entries

    def update_importance(self, entry_id: str, importance_score: float):
        with self.lock:
            cursor = self.conn.cursor()

            cursor.execute(
                """
                UPDATE knowledge_entries
                SET importance_score = ?, updated_at = ?
                WHERE entry_id = ?
                """,
                (importance_score, time.time(), entry_id),
            )

            self.conn.commit()

    def delete_entry(self, entry_id: str) -> bool:
        with self.lock:
            cursor = self.conn.cursor()

            cursor.execute("DELETE FROM knowledge_entries WHERE entry_id = ?", (entry_id,))
            deleted = cursor.rowcount > 0

            self.conn.commit()

            if deleted:
                self.semantic_index.remove_document(entry_id)

            return deleted

    def get_statistics(self) -> Dict[str, Any]:
        with self.lock:
            cursor = self.conn.cursor()

            cursor.execute("SELECT COUNT(*) FROM knowledge_entries")
            total_entries = cursor.fetchone()[0]

            cursor.execute("SELECT COUNT(DISTINCT category) FROM knowledge_entries")
            total_categories = cursor.fetchone()[0]

            cursor.execute(
                """
                SELECT category, COUNT(*) as count
                FROM knowledge_entries
                GROUP BY category
                ORDER BY count DESC
                """
            )
            category_counts = {row[0]: row[1] for row in cursor.fetchall()}

            cursor.execute("SELECT SUM(access_count) FROM knowledge_entries")
            total_accesses = cursor.fetchone()[0] or 0

            return {
                "total_entries": total_entries,
                "total_categories": total_categories,
                "category_distribution": category_counts,
                "total_accesses": total_accesses,
                "vocabulary_size": len(self.semantic_index.vocabulary),
            }
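
# Minimal usage sketch, kept as a comment because the relative import
# above means this module must live inside a package. The package and
# module names below are hypothetical; the KnowledgeStore/KnowledgeEntry
# API is as defined in this file.
#
#     import time
#     from mypackage.knowledge_store import KnowledgeEntry, KnowledgeStore
#
#     store = KnowledgeStore("knowledge.db")
#     now = time.time()
#     store.add_entry(KnowledgeEntry(
#         entry_id="k1",
#         category="facts",
#         content="SQLite stores an entire database in a single file.",
#         metadata={"source": "notes"},
#         created_at=now,
#         updated_at=now,
#     ))
#     for hit in store.search_entries("sqlite file", top_k=3):
#         print(hit.entry_id, hit.metadata.get("search_score"))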