import re
from collections import Counter, defaultdict
from typing import Any, Dict, List
class FactExtractor:
    """Extract structured facts, key terms, relationships, and summary
    metadata from free-form text using regular-expression heuristics.

    All extraction is pattern-based (no NLP model); confidence scores are
    fixed heuristic constants, not calibrated probabilities.
    """

    def __init__(self):
        # (regex, fact type) pairs. The patterns are kept as raw strings
        # (rather than pre-compiled) so the attribute stays inspectable;
        # the `re` module caches compiled patterns internally.
        self.fact_patterns = [
            (r"([A-Z][a-z]+ [A-Z][a-z]+) is (a|an) ([^.]+)", "definition"),
            (r"([A-Z][a-z]+) (was|is) (born|created|founded) in (\d{4})", "temporal"),
            (r"([A-Z][a-z]+) (invented|created|developed) ([^.]+)", "attribution"),
            (r"([^.]+) (costs?|worth) (\$[\d,]+)", "numeric"),
            (r"([A-Z][a-z]+) (lives?|works?|located) in ([A-Z][a-z]+)", "location"),
        ]

    def extract_facts(self, text: str) -> List[Dict[str, Any]]:
        """Return pattern-matched facts plus capitalized noun-phrase entities.

        Each fact is a dict with keys ``type``, ``text`` (the full match),
        ``components`` (the capture groups), and a heuristic ``confidence``
        (0.7 for pattern matches, 0.5 for bare entities).
        """
        facts: List[Dict[str, Any]] = []

        for pattern, fact_type in self.fact_patterns:
            for match in re.finditer(pattern, text):
                facts.append(
                    {
                        "type": fact_type,
                        "text": match.group(0),
                        "components": match.groups(),
                        "confidence": 0.7,
                    }
                )

        # Multi-word capitalized runs become lower-confidence "entity" facts.
        for phrase in self._extract_noun_phrases(text):
            if len(phrase.split()) >= 2:
                facts.append(
                    {
                        "type": "entity",
                        "text": phrase,
                        "components": [phrase],
                        "confidence": 0.5,
                    }
                )

        return facts

    def _extract_noun_phrases(self, text: str) -> List[str]:
        """Collect runs of two or more consecutive capitalized words.

        Sentences are split on ``.!?``; within each sentence, a run breaks
        at the first word that is not capitalized or is a single character.
        Note: duplicates are removed via a set, so the returned order is
        unspecified.
        """
        phrases: List[str] = []

        for sentence in re.split(r"[.!?]", text):
            current_run: List[str] = []
            for word in sentence.split():
                if word and word[0].isupper() and len(word) > 1:
                    current_run.append(word)
                else:
                    if len(current_run) >= 2:
                        phrases.append(" ".join(current_run))
                    current_run = []
            # Flush a run that extends to the end of the sentence.
            if len(current_run) >= 2:
                phrases.append(" ".join(current_run))

        return list(set(phrases))

    def extract_key_terms(self, text: str, top_k: int = 10) -> List[tuple]:
        """Return up to ``top_k`` ``(word, frequency)`` pairs, most frequent
        first, considering only lowercase words of 4+ letters and skipping
        a small built-in stopword list.

        Ties keep first-occurrence order (``Counter.most_common`` is stable,
        matching the previous manual sort).
        """
        words = re.findall(r"\b[a-z]{4,}\b", text.lower())

        stopwords = {
            "this", "that", "these", "those", "what", "which", "where",
            "when", "with", "from", "have", "been", "were", "will",
            "would", "could", "should", "about", "their", "there",
            "other", "than", "then", "them", "some", "more", "very",
            "such", "into", "through", "during", "before", "after",
            "above", "below", "between", "under", "again", "further",
            "once", "here", "both", "each", "doing", "only", "over",
            "same", "being", "does", "just", "also", "make", "made",
            "know", "like",
        }

        # Counter replaces the manual defaultdict loop + sorted() call.
        counts = Counter(w for w in words if w not in stopwords)
        return counts.most_common(top_k)

    def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
        """Return subject–predicate–object triples matched by a small set of
        relationship patterns.

        Each relationship is a dict with ``type``, ``subject``, ``predicate``,
        ``object``, and a fixed ``confidence`` of 0.6.
        """
        relationship_patterns = [
            (
                r"([A-Z][a-z]+) (works for|employed by|member of) ([A-Z][a-z]+)",
                "employment",
            ),
            (r"([A-Z][a-z]+) (owns|has|possesses) ([^.]+)", "ownership"),
            (
                r"([A-Z][a-z]+) (located in|part of|belongs to) ([A-Z][a-z]+)",
                "location",
            ),
            (r"([A-Z][a-z]+) (uses|utilizes|implements) ([^.]+)", "usage"),
        ]

        relationships: List[Dict[str, Any]] = []
        for pattern, rel_type in relationship_patterns:
            for match in re.finditer(pattern, text):
                relationships.append(
                    {
                        "type": rel_type,
                        "subject": match.group(1),
                        "predicate": match.group(2),
                        "object": match.group(3),
                        "confidence": 0.6,
                    }
                )

        return relationships

    def extract_metadata(self, text: str) -> Dict[str, Any]:
        """Return summary statistics plus extracted URLs, e-mail addresses,
        dates, and numeric literals for *text*.

        Bug fix: empty fragments produced by splitting on sentence
        terminators are no longer counted, so ``"Hello."`` is one sentence
        rather than two (previously the trailing empty split inflated
        ``sentence_count`` and deflated ``avg_words_per_sentence``).
        """
        word_count = len(text.split())
        # Filter blank fragments (e.g. after a trailing period) before counting.
        sentences = [s for s in re.split(r"[.!?]", text) if s.strip()]
        sentence_count = len(sentences)

        urls = re.findall(r"https?://[^\s]+", text)
        email_addresses = re.findall(
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", text
        )
        dates = re.findall(
            r"\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b", text
        )
        numbers = re.findall(r"\b\d+(?:,\d{3})*(?:\.\d+)?\b", text)

        return {
            "word_count": word_count,
            "sentence_count": sentence_count,
            # max(..., 1) guards division by zero for empty input.
            "avg_words_per_sentence": round(word_count / max(sentence_count, 1), 2),
            "urls": urls,
            "email_addresses": email_addresses,
            "dates": dates,
            "numeric_values": numbers,
            "has_code": bool(re.search(r"```|def |class |import |function ", text)),
            "has_questions": "?" in text,
        }

    def categorize_content(self, text: str) -> List[str]:
        """Return the categories whose keywords appear in *text*
        (case-insensitive substring match), or ``["general"]`` if none do.

        Note: matching is substring-based, so e.g. "testing" in the text
        also triggers the "test" keyword.
        """
        category_keywords = {
            "programming": [
                "code", "function", "class", "variable",
                "programming", "software", "debug",
            ],
            "data": [
                "data", "database", "query", "table",
                "record", "statistics", "analysis",
            ],
            "documentation": [
                "documentation", "guide", "tutorial",
                "manual", "readme", "explain",
            ],
            "configuration": [
                "config", "settings", "configuration",
                "setup", "install", "deployment",
            ],
            "testing": [
                "test", "testing", "validate",
                "verification", "quality", "assertion",
            ],
            "research": [
                "research", "study", "analysis",
                "investigation", "findings", "results",
            ],
            "planning": [
                "plan", "planning", "schedule",
                "roadmap", "milestone", "timeline",
            ],
        }

        text_lower = text.lower()
        categories = [
            category
            for category, keywords in category_keywords.items()
            if any(keyword in text_lower for keyword in keywords)
        ]

        return categories if categories else ["general"]