import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, Set, Tuple


class SemanticIndex:
    """In-memory TF-IDF index with cosine-similarity search.

    Documents are tokenized to lowercase alphanumeric terms; each document
    is stored as a sparse TF-IDF vector, and queries are ranked by cosine
    similarity against those vectors.
    """

    def __init__(self):
        self.documents: Dict[str, str] = {}  # doc_id -> raw text
        self.vocabulary: Set[str] = set()  # union of tokens across current docs
        self.idf_scores: Dict[str, float] = {}  # token -> inverse doc frequency
        self.doc_vectors: Dict[str, Dict[str, float]] = {}  # doc_id -> TF-IDF vector

    def _tokenize(self, text: str) -> List[str]:
        """Lowercase, replace non-alphanumerics with spaces, split on whitespace."""
        text = re.sub(r"[^a-z0-9\s]", " ", text.lower())
        return text.split()

    def _compute_tf(self, tokens: List[str]) -> Dict[str, float]:
        """Return term frequency (count / total tokens) per distinct token."""
        if not tokens:
            return {}
        total = len(tokens)
        return {term: count / total for term, count in Counter(tokens).items()}

    def _compute_idf(self):
        """Recompute IDF scores over the current document set.

        With a single document, log(N / df) would zero out every term, so all
        tokens get a flat 1.0 instead.  An empty corpus clears the scores
        (the previous revision left them stale after the last removal).
        """
        doc_count = len(self.documents)
        if doc_count == 0:
            self.idf_scores = {}
            return
        doc_freq = defaultdict(int)
        for text in self.documents.values():
            for token in set(self._tokenize(text)):
                doc_freq[token] += 1
        if doc_count == 1:
            self.idf_scores = {token: 1.0 for token in doc_freq}
        else:
            self.idf_scores = {
                token: math.log(doc_count / count)
                for token, count in doc_freq.items()
            }

    def _rebuild_vectors(self):
        """Recompute every document's TF-IDF vector from current IDF scores.

        Needed whenever the corpus changes: IDF is a global statistic, so any
        add/remove invalidates ALL previously built vectors, not just one.
        """
        self.doc_vectors = {}
        for doc_id, text in self.documents.items():
            tf = self._compute_tf(self._tokenize(text))
            self.doc_vectors[doc_id] = {
                token: freq * self.idf_scores.get(token, 0.0)
                for token, freq in tf.items()
            }

    def add_document(self, doc_id: str, text: str):
        """Add (or replace) a document and refresh the whole index.

        BUGFIX: adding a document changes IDF for the entire corpus, so all
        document vectors are rebuilt — the earlier revision rebuilt only the
        new document's vector, leaving every other vector stale.
        """
        self.documents[doc_id] = text
        self.vocabulary.update(self._tokenize(text))
        self._compute_idf()
        self._rebuild_vectors()

    def remove_document(self, doc_id: str):
        """Remove a document, then refresh vocabulary, IDF, and all vectors.

        The vocabulary is rebuilt from the remaining documents so tokens
        unique to the removed document do not linger.
        """
        if doc_id not in self.documents:
            return
        del self.documents[doc_id]
        self.doc_vectors.pop(doc_id, None)
        self.vocabulary = set()
        for text in self.documents.values():
            self.vocabulary.update(self._tokenize(text))
        self._compute_idf()
        self._rebuild_vectors()

    def search(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
        """Return up to *top_k* (doc_id, cosine_similarity) pairs, best first.

        Query tokens absent from the corpus get IDF 0 and thus contribute
        nothing; an empty or fully-unknown query yields all-zero scores.
        """
        query_tf = self._compute_tf(self._tokenize(query))
        query_vector = {
            token: freq * self.idf_scores.get(token, 0.0)
            for token, freq in query_tf.items()
        }
        scores = [
            (doc_id, self._cosine_similarity(query_vector, doc_vector))
            for doc_id, doc_vector in self.doc_vectors.items()
        ]
        scores.sort(key=lambda item: item[1], reverse=True)
        return scores[:top_k]

    def _cosine_similarity(
        self, vec1: Dict[str, float], vec2: Dict[str, float]
    ) -> float:
        """Cosine similarity of two sparse vectors; 0.0 if either has zero norm."""
        # Iterating vec1's keys suffices: tokens missing from either side
        # contribute 0 to the dot product.
        dot = sum(val * vec2.get(token, 0.0) for token, val in vec1.items())
        norm1 = math.sqrt(sum(v * v for v in vec1.values()))
        norm2 = math.sqrt(sum(v * v for v in vec2.values()))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot / (norm1 * norm2)