import re
from collections import defaultdict
from typing import Any, Dict, List


class FactExtractor:
    """Extract facts, key terms, relationships, and metadata from free text
    using simple regex heuristics (no external NLP dependencies)."""

    def __init__(self):
        # Each entry is (regex pattern, fact type); confidence values are
        # assigned heuristically when a pattern matches.
        self.fact_patterns = [
            (r"([A-Z][a-z]+ [A-Z][a-z]+) is (a|an) ([^.]+)", "definition"),
            (r"([A-Z][a-z]+) (was|is) (born|created|founded) in (\d{4})", "temporal"),
            (r"([A-Z][a-z]+) (invented|created|developed) ([^.]+)", "attribution"),
            (r"([^.]+) (costs?|worth) (\$[\d,]+)", "numeric"),
            (r"([A-Z][a-z]+) (lives?|works?|located) in ([A-Z][a-z]+)", "location"),
        ]

    def extract_facts(self, text: str) -> List[Dict[str, Any]]:
        """Return pattern-based facts plus capitalized entity phrases found in text."""
        facts = []
        for pattern, fact_type in self.fact_patterns:
            matches = re.finditer(pattern, text)
            for match in matches:
                facts.append(
                    {
                        "type": fact_type,
                        "text": match.group(0),
                        "components": match.groups(),
                        "confidence": 0.7,
                    }
                )

        # Multi-word capitalized phrases are recorded as lower-confidence entities.
        noun_phrases = self._extract_noun_phrases(text)
        for phrase in noun_phrases:
            if len(phrase.split()) >= 2:
                facts.append(
                    {
                        "type": "entity",
                        "text": phrase,
                        "components": [phrase],
                        "confidence": 0.5,
                    }
                )
        return facts

    def _extract_noun_phrases(self, text: str) -> List[str]:
        """Collect runs of two or more consecutive capitalized words per sentence."""
        sentences = re.split(r"[.!?]", text)
        phrases = []
        for sentence in sentences:
            words = sentence.split()
            current_phrase = []
            for word in words:
                if word and word[0].isupper() and len(word) > 1:
                    current_phrase.append(word)
                else:
                    if len(current_phrase) >= 2:
                        phrases.append(" ".join(current_phrase))
                    current_phrase = []
            # Flush a phrase that runs to the end of the sentence.
            if len(current_phrase) >= 2:
                phrases.append(" ".join(current_phrase))
        return list(set(phrases))

    def extract_key_terms(self, text: str, top_k: int = 10) -> List[tuple]:
        """Return the top_k most frequent content words (4+ letters, stopwords removed)."""
        words = re.findall(r"\b[a-z]{4,}\b", text.lower())
        stopwords = {
            "this", "that", "these", "those", "what", "which", "where", "when",
            "with", "from", "have", "been", "were", "will", "would", "could",
            "should", "about", "their", "there", "other", "than", "then", "them",
            "some", "more", "very", "such", "into", "through", "during", "before",
            "after", "above", "below", "between", "under", "again", "further",
            "once", "here", "both", "each", "doing", "only", "over", "same",
            "being", "does", "just", "also", "make", "made", "know", "like",
        }
        filtered_words = [w for w in words if w not in stopwords]
        word_freq = defaultdict(int)
        for word in filtered_words:
            word_freq[word] += 1
        sorted_terms = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        return sorted_terms[:top_k]

    def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
        """Extract subject-predicate-object relationships via regex patterns."""
        relationships = []
        relationship_patterns = [
            (
                r"([A-Z][a-z]+) (works for|employed by|member of) ([A-Z][a-z]+)",
                "employment",
            ),
            (r"([A-Z][a-z]+) (owns|has|possesses) ([^.]+)", "ownership"),
            (
                r"([A-Z][a-z]+) (located in|part of|belongs to) ([A-Z][a-z]+)",
                "location",
            ),
            (r"([A-Z][a-z]+) (uses|utilizes|implements) ([^.]+)", "usage"),
        ]
        for pattern, rel_type in relationship_patterns:
            matches = re.finditer(pattern, text)
            for match in matches:
                relationships.append(
                    {
                        "type": rel_type,
                        "subject": match.group(1),
                        "predicate": match.group(2),
                        "object": match.group(3),
                        "confidence": 0.6,
                    }
                )
        return relationships

    def extract_metadata(self, text: str) -> Dict[str, Any]:
        """Compute basic counts and extract URLs, emails, dates, and numbers."""
        word_count = len(text.split())
        # Count only non-empty segments so a trailing terminator is not counted
        # as an extra sentence.
        sentence_count = len([s for s in re.split(r"[.!?]+", text) if s.strip()])

        urls = re.findall(r"https?://[^\s]+", text)
        email_addresses = re.findall(
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", text
        )
        dates = re.findall(
            r"\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b", text
        )
        numbers = re.findall(r"\b\d+(?:,\d{3})*(?:\.\d+)?\b", text)

        return {
            "word_count": word_count,
            "sentence_count": sentence_count,
            "avg_words_per_sentence": round(word_count / max(sentence_count, 1), 2),
            "urls": urls,
            "email_addresses": email_addresses,
            "dates": dates,
            "numeric_values": numbers,
            "has_code": bool(re.search(r"```|def |class |import |function ", text)),
            "has_questions": bool(re.search(r"\?", text)),
        }

    def categorize_content(self, text: str) -> List[str]:
        """Assign coarse content categories based on keyword matches."""
        categories = []
        category_keywords = {
            "programming": ["code", "function", "class", "variable",
                            "programming", "software", "debug"],
            "data": ["data", "database", "query", "table", "record",
                     "statistics", "analysis"],
            "documentation": ["documentation", "guide", "tutorial", "manual",
                              "readme", "explain"],
            "configuration": ["config", "settings", "configuration", "setup",
                              "install", "deployment"],
            "testing": ["test", "testing", "validate", "verification",
                        "quality", "assertion"],
            "research": ["research", "study", "analysis", "investigation",
                         "findings", "results"],
            "planning": ["plan", "planning", "schedule", "roadmap",
                         "milestone", "timeline"],
        }
        text_lower = text.lower()
        for category, keywords in category_keywords.items():
            if any(keyword in text_lower for keyword in keywords):
                categories.append(category)
        return categories if categories else ["general"]
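

# Minimal usage sketch, illustrative only: the sample text below is made up and
# is not part of the module; it simply exercises the extractors defined above.
if __name__ == "__main__":
    extractor = FactExtractor()
    sample = (
        "Marie Curie is a physicist. Marie Curie was born in 1867. "
        "See https://example.com for more."
    )
    print(extractor.extract_facts(sample))
    print(extractor.extract_key_terms(sample))
    print(extractor.extract_relationships(sample))
    print(extractor.extract_metadata(sample))
    print(extractor.categorize_content(sample))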