import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, Set, Tuple


class SemanticIndex:
    """In-memory TF-IDF index with cosine-similarity search over plain-text documents."""

    def __init__(self):
        self.documents: Dict[str, str] = {}
        self.vocabulary: Set[str] = set()
        self.idf_scores: Dict[str, float] = {}
        self.doc_vectors: Dict[str, Dict[str, float]] = {}

    def _tokenize(self, text: str) -> List[str]:
        # Lowercase, replace non-alphanumeric characters with spaces, then split on whitespace.
        text = text.lower()
        text = re.sub(r"[^a-z0-9\s]", " ", text)
        tokens = text.split()
        return tokens

    def _compute_tf(self, tokens: List[str]) -> Dict[str, float]:
        term_count = Counter(tokens)
        total_terms = len(tokens)
        if total_terms == 0:
            # Guard against empty documents or queries (would otherwise divide by zero).
            return {}
        return {term: count / total_terms for term, count in term_count.items()}

    def _compute_idf(self):
        doc_count = len(self.documents)
        if doc_count == 0:
            return
        # Count how many documents each token appears in.
        token_doc_count = defaultdict(int)
        for doc_text in self.documents.values():
            tokens = set(self._tokenize(doc_text))
            for token in tokens:
                token_doc_count[token] += 1
        if doc_count == 1:
            # With a single document, log(1/1) would zero out every weight,
            # so fall back to a constant IDF of 1.0.
            self.idf_scores = {token: 1.0 for token in token_doc_count}
        else:
            self.idf_scores = {
                token: math.log(doc_count / count)
                for token, count in token_doc_count.items()
            }

    def add_document(self, doc_id: str, text: str):
        self.documents[doc_id] = text
        tokens = self._tokenize(text)
        self.vocabulary.update(tokens)
        # Recompute IDF over the whole corpus. Note that vectors of previously
        # indexed documents are not rebuilt here, so their weights can go stale
        # as the corpus grows.
        self._compute_idf()
        tf_scores = self._compute_tf(tokens)
        self.doc_vectors[doc_id] = {
            token: tf_scores.get(token, 0) * self.idf_scores.get(token, 0)
            for token in tokens
        }

    def remove_document(self, doc_id: str):
        if doc_id in self.documents:
            del self.documents[doc_id]
        if doc_id in self.doc_vectors:
            del self.doc_vectors[doc_id]
        # Refresh IDF for the remaining corpus (the vocabulary set is left as-is).
        self._compute_idf()

    def search(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
        query_tokens = self._tokenize(query)
        query_tf = self._compute_tf(query_tokens)
        # Weight the query with the corpus IDF; terms unseen in the corpus get weight 0.
        query_vector = {
            token: query_tf.get(token, 0) * self.idf_scores.get(token, 0)
            for token in query_tokens
        }
        scores = []
        for doc_id, doc_vector in self.doc_vectors.items():
            similarity = self._cosine_similarity(query_vector, doc_vector)
            scores.append((doc_id, similarity))
        # Best matches first.
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:top_k]

    def _cosine_similarity(self, vec1: Dict[str, float], vec2: Dict[str, float]) -> float:
        dot_product = sum(
            vec1.get(token, 0) * vec2.get(token, 0) for token in set(vec1) | set(vec2)
        )
        norm1 = math.sqrt(sum(val**2 for val in vec1.values()))
        norm2 = math.sqrt(sum(val**2 for val in vec2.values()))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot_product / (norm1 * norm2)
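

# Minimal usage sketch (illustrative only: the document IDs and texts below are
# made-up examples, not part of the index implementation above).
if __name__ == "__main__":
    index = SemanticIndex()
    index.add_document("doc1", "The quick brown fox jumps over the lazy dog.")
    index.add_document("doc2", "A fast auburn fox leaped across a sleepy hound.")
    index.add_document("doc3", "Stock markets rallied after the earnings announcement.")

    # search() returns up to top_k (doc_id, cosine similarity) pairs, best match first.
    for doc_id, score in index.search("quick fox", top_k=2):
        print(f"{doc_id}: {score:.3f}")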