import re
import json
from typing import List, Dict, Any, Set
from collections import defaultdict


class FactExtractor:
    """Rule-based extraction of facts, key terms, relationships, and
    metadata from plain text using regular-expression patterns."""

    def __init__(self):
        # Each entry pairs a regex with the fact type its groups capture.
        self.fact_patterns = [
            (r'([A-Z][a-z]+ [A-Z][a-z]+) is (a|an) ([^.]+)', 'definition'),
            (r'([A-Z][a-z]+) (was|is) (born|created|founded) in (\d{4})', 'temporal'),
            (r'([A-Z][a-z]+) (invented|created|developed) ([^.]+)', 'attribution'),
            (r'([^.]+) (costs?|worth) (\$[\d,]+)', 'numeric'),
            (r'([A-Z][a-z]+) (lives?|works?|located) in ([A-Z][a-z]+)', 'location'),
        ]

    def extract_facts(self, text: str) -> List[Dict[str, Any]]:
        """Match the configured patterns and capitalized noun phrases,
        returning one fact dict per match with a heuristic confidence."""
        facts = []
        for pattern, fact_type in self.fact_patterns:
            matches = re.finditer(pattern, text)
            for match in matches:
                facts.append({
                    'type': fact_type,
                    'text': match.group(0),
                    'components': match.groups(),
                    'confidence': 0.7
                })
        # Multi-word capitalized phrases become lower-confidence entity facts.
        noun_phrases = self._extract_noun_phrases(text)
        for phrase in noun_phrases:
            if len(phrase.split()) >= 2:
                facts.append({
                    'type': 'entity',
                    'text': phrase,
                    'components': [phrase],
                    'confidence': 0.5
                })
        return facts

    def _extract_noun_phrases(self, text: str) -> List[str]:
        """Collect runs of two or more capitalized words from each sentence."""
        sentences = re.split(r'[.!?]', text)
        phrases = []
        for sentence in sentences:
            words = sentence.split()
            current_phrase = []
            for word in words:
                if word and word[0].isupper() and len(word) > 1:
                    current_phrase.append(word)
                else:
                    if len(current_phrase) >= 2:
                        phrases.append(' '.join(current_phrase))
                    current_phrase = []
            # Flush a phrase that runs to the end of the sentence.
            if len(current_phrase) >= 2:
                phrases.append(' '.join(current_phrase))
        return list(set(phrases))

    def extract_key_terms(self, text: str, top_k: int = 10) -> List[tuple]:
        """Return the top_k most frequent lowercase words of four or more
        letters, excluding a small built-in stopword list."""
        words = re.findall(r'\b[a-z]{4,}\b', text.lower())
        stopwords = {
            'this', 'that', 'these', 'those', 'what', 'which', 'where', 'when',
            'with', 'from', 'have', 'been', 'were', 'will', 'would', 'could',
            'should', 'about', 'their', 'there', 'other', 'than', 'then', 'them',
            'some', 'more', 'very', 'such', 'into', 'through', 'during', 'before',
            'after', 'above', 'below', 'between', 'under', 'again', 'further',
            'once', 'here', 'both', 'each', 'doing', 'only', 'over', 'same',
            'being', 'does', 'just', 'also', 'make', 'made', 'know', 'like'
        }
        filtered_words = [w for w in words if w not in stopwords]
        word_freq = defaultdict(int)
        for word in filtered_words:
            word_freq[word] += 1
        sorted_terms = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        return sorted_terms[:top_k]

    def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
        """Extract subject-predicate-object triples via simple regex patterns."""
        relationships = []
        relationship_patterns = [
            (r'([A-Z][a-z]+) (works for|employed by|member of) ([A-Z][a-z]+)', 'employment'),
            (r'([A-Z][a-z]+) (owns|has|possesses) ([^.]+)', 'ownership'),
            (r'([A-Z][a-z]+) (located in|part of|belongs to) ([A-Z][a-z]+)', 'location'),
            (r'([A-Z][a-z]+) (uses|utilizes|implements) ([^.]+)', 'usage'),
        ]
        for pattern, rel_type in relationship_patterns:
            matches = re.finditer(pattern, text)
            for match in matches:
                relationships.append({
                    'type': rel_type,
                    'subject': match.group(1),
                    'predicate': match.group(2),
                    'object': match.group(3),
                    'confidence': 0.6
                })
        return relationships

    def extract_metadata(self, text: str) -> Dict[str, Any]:
        """Compute basic statistics and pull out URLs, emails, dates, and numbers."""
        word_count = len(text.split())
        # Skip empty fragments left by the split so the count is not inflated.
        sentence_count = len([s for s in re.split(r'[.!?]', text) if s.strip()])
        urls = re.findall(r'https?://[^\s]+', text)
        email_addresses = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
        dates = re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b', text)
        numbers = re.findall(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b', text)
        return {
            'word_count': word_count,
            'sentence_count': sentence_count,
            'avg_words_per_sentence': round(word_count / max(sentence_count, 1), 2),
            'urls': urls,
            'email_addresses': email_addresses,
            'dates': dates,
            'numeric_values': numbers,
            'has_code': bool(re.search(r'```|def |class |import |function ', text)),
            'has_questions': bool(re.search(r'\?', text))
        }

    def categorize_content(self, text: str) -> List[str]:
        """Assign coarse categories by keyword presence; default to 'general'."""
        categories = []
        category_keywords = {
            'programming': ['code', 'function', 'class', 'variable', 'programming', 'software', 'debug'],
            'data': ['data', 'database', 'query', 'table', 'record', 'statistics', 'analysis'],
            'documentation': ['documentation', 'guide', 'tutorial', 'manual', 'readme', 'explain'],
            'configuration': ['config', 'settings', 'configuration', 'setup', 'install', 'deployment'],
            'testing': ['test', 'testing', 'validate', 'verification', 'quality', 'assertion'],
            'research': ['research', 'study', 'analysis', 'investigation', 'findings', 'results'],
            'planning': ['plan', 'planning', 'schedule', 'roadmap', 'milestone', 'timeline'],
        }
        text_lower = text.lower()
        for category, keywords in category_keywords.items():
            if any(keyword in text_lower for keyword in keywords):
                categories.append(category)
        return categories if categories else ['general']
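

# Minimal usage sketch (not part of the original module): the sample text
# below is an illustrative assumption, chosen only to exercise each method.
if __name__ == '__main__':
    extractor = FactExtractor()
    sample = (
        "Ada Lovelace is a pioneer of computing. "
        "Ada Lovelace developed the first published algorithm."
    )
    print(extractor.extract_facts(sample))
    print(extractor.extract_key_terms(sample, top_k=5))
    print(extractor.extract_relationships(sample))
    print(extractor.extract_metadata(sample))
    print(extractor.categorize_content(sample))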