import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, Set, Tuple


class SemanticIndex:
    """In-memory TF-IDF index with cosine-similarity search over raw text."""

    def __init__(self):
        # Raw document text keyed by document id.
        self.documents: Dict[str, str] = {}
        # All tokens seen across the corpus.
        self.vocabulary: Set[str] = set()
        # Inverse document frequency per token.
        self.idf_scores: Dict[str, float] = {}
        # TF-IDF weight vector per document.
        self.doc_vectors: Dict[str, Dict[str, float]] = {}

    def _tokenize(self, text: str) -> List[str]:
        """Lowercase the text, replace punctuation with spaces, split on whitespace."""
        text = text.lower()
        text = re.sub(r"[^a-z0-9\s]", " ", text)
        return text.split()

    def _compute_tf(self, tokens: List[str]) -> Dict[str, float]:
        """Return the term frequency of each token, normalized by token count."""
        total_terms = len(tokens)
        if total_terms == 0:
            # Avoid division by zero for empty documents or queries.
            return {}
        term_count = Counter(tokens)
        return {term: count / total_terms for term, count in term_count.items()}

    def _compute_idf(self):
        """Recompute inverse document frequency scores for the current corpus."""
        doc_count = len(self.documents)
        if doc_count == 0:
            return

        # Count how many documents contain each token at least once.
        token_doc_count = defaultdict(int)
        for doc_text in self.documents.values():
            for token in set(self._tokenize(doc_text)):
                token_doc_count[token] += 1

        if doc_count == 1:
            # log(1/1) would zero out every weight, so fall back to a flat IDF.
            self.idf_scores = {token: 1.0 for token in token_doc_count}
        else:
            self.idf_scores = {
                token: math.log(doc_count / count)
                for token, count in token_doc_count.items()
            }

    def add_document(self, doc_id: str, text: str):
        """Index a document and refresh TF-IDF weights across the corpus."""
        self.documents[doc_id] = text
        self.vocabulary.update(self._tokenize(text))

        # Adding a document changes the IDF of every term, so recompute IDF
        # and rebuild all document vectors to keep their weights comparable.
        self._compute_idf()
        for existing_id, existing_text in self.documents.items():
            tf_scores = self._compute_tf(self._tokenize(existing_text))
            self.doc_vectors[existing_id] = {
                token: tf * self.idf_scores.get(token, 0.0)
                for token, tf in tf_scores.items()
            }

    def remove_document(self, doc_id: str):
        """Remove a document and refresh the weights of the remaining ones."""
        self.documents.pop(doc_id, None)
        self.doc_vectors.pop(doc_id, None)

        # The corpus changed, so IDF scores and the remaining vectors are stale.
        self._compute_idf()
        for remaining_id, remaining_text in self.documents.items():
            tf_scores = self._compute_tf(self._tokenize(remaining_text))
            self.doc_vectors[remaining_id] = {
                token: tf * self.idf_scores.get(token, 0.0)
                for token, tf in tf_scores.items()
            }

    def search(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
        """Return the top_k documents ranked by cosine similarity to the query."""
        query_tokens = self._tokenize(query)
        query_tf = self._compute_tf(query_tokens)

        # Weight the query with the corpus IDF; terms unseen in the corpus get zero.
        query_vector = {
            token: tf * self.idf_scores.get(token, 0.0)
            for token, tf in query_tf.items()
        }

        scores = [
            (doc_id, self._cosine_similarity(query_vector, doc_vector))
            for doc_id, doc_vector in self.doc_vectors.items()
        ]
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:top_k]

    def _cosine_similarity(
        self, vec1: Dict[str, float], vec2: Dict[str, float]
    ) -> float:
        """Cosine similarity between two sparse token-weight vectors."""
        dot_product = sum(
            vec1.get(token, 0.0) * vec2.get(token, 0.0)
            for token in set(vec1) | set(vec2)
        )
        norm1 = math.sqrt(sum(val**2 for val in vec1.values()))
        norm2 = math.sqrt(sum(val**2 for val in vec2.values()))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot_product / (norm1 * norm2)
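

if __name__ == "__main__":
    # Minimal usage sketch. The document ids and texts below are illustrative
    # examples chosen for this demo; they are not part of the original module.
    index = SemanticIndex()
    index.add_document("doc1", "The quick brown fox jumps over the lazy dog")
    index.add_document("doc2", "A fast auburn fox leaped over a sleepy hound")
    index.add_document("doc3", "Quarterly earnings lifted the stock price sharply")

    # Rank documents against a short query; doc1 should score highest because
    # it contains both query terms after tokenization.
    for doc_id, score in index.search("quick fox", top_k=2):
        print(f"{doc_id}: {score:.3f}")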