147 lines
5.9 KiB
Python
147 lines
5.9 KiB
Python
|
|
import json
import re
from collections import Counter, defaultdict
from typing import Any, Dict, List, Set
|
||
|
|
|
||
|
|
class FactExtractor:
    """Rule-based extraction of facts, key terms, relationships, and
    metadata from free text using regular-expression patterns."""

    def __init__(self):
        """Set up the (regex, fact-type) table consumed by extract_facts."""
        # Each entry pairs a capture pattern with the label attached to
        # any fact it produces.
        self.fact_patterns = [
            (r'([A-Z][a-z]+ [A-Z][a-z]+) is (a|an) ([^.]+)', 'definition'),
            (r'([A-Z][a-z]+) (was|is) (born|created|founded) in (\d{4})', 'temporal'),
            (r'([A-Z][a-z]+) (invented|created|developed) ([^.]+)', 'attribution'),
            (r'([^.]+) (costs?|worth) (\$[\d,]+)', 'numeric'),
            (r'([A-Z][a-z]+) (lives?|works?|located) in ([A-Z][a-z]+)', 'location'),
        ]
def extract_facts(self, text: str) -> List[Dict[str, Any]]:
|
||
|
|
facts = []
|
||
|
|
|
||
|
|
for pattern, fact_type in self.fact_patterns:
|
||
|
|
matches = re.finditer(pattern, text)
|
||
|
|
for match in matches:
|
||
|
|
facts.append({
|
||
|
|
'type': fact_type,
|
||
|
|
'text': match.group(0),
|
||
|
|
'components': match.groups(),
|
||
|
|
'confidence': 0.7
|
||
|
|
})
|
||
|
|
|
||
|
|
noun_phrases = self._extract_noun_phrases(text)
|
||
|
|
for phrase in noun_phrases:
|
||
|
|
if len(phrase.split()) >= 2:
|
||
|
|
facts.append({
|
||
|
|
'type': 'entity',
|
||
|
|
'text': phrase,
|
||
|
|
'components': [phrase],
|
||
|
|
'confidence': 0.5
|
||
|
|
})
|
||
|
|
|
||
|
|
return facts
|
||
|
|
|
||
|
|
def _extract_noun_phrases(self, text: str) -> List[str]:
|
||
|
|
sentences = re.split(r'[.!?]', text)
|
||
|
|
phrases = []
|
||
|
|
|
||
|
|
for sentence in sentences:
|
||
|
|
words = sentence.split()
|
||
|
|
current_phrase = []
|
||
|
|
|
||
|
|
for word in words:
|
||
|
|
if word and word[0].isupper() and len(word) > 1:
|
||
|
|
current_phrase.append(word)
|
||
|
|
else:
|
||
|
|
if len(current_phrase) >= 2:
|
||
|
|
phrases.append(' '.join(current_phrase))
|
||
|
|
current_phrase = []
|
||
|
|
|
||
|
|
if len(current_phrase) >= 2:
|
||
|
|
phrases.append(' '.join(current_phrase))
|
||
|
|
|
||
|
|
return list(set(phrases))
|
||
|
|
|
||
|
|
def extract_key_terms(self, text: str, top_k: int = 10) -> List[tuple]:
|
||
|
|
words = re.findall(r'\b[a-z]{4,}\b', text.lower())
|
||
|
|
|
||
|
|
stopwords = {
|
||
|
|
'this', 'that', 'these', 'those', 'what', 'which', 'where', 'when',
|
||
|
|
'with', 'from', 'have', 'been', 'were', 'will', 'would', 'could',
|
||
|
|
'should', 'about', 'their', 'there', 'other', 'than', 'then', 'them',
|
||
|
|
'some', 'more', 'very', 'such', 'into', 'through', 'during', 'before',
|
||
|
|
'after', 'above', 'below', 'between', 'under', 'again', 'further',
|
||
|
|
'once', 'here', 'both', 'each', 'doing', 'only', 'over', 'same',
|
||
|
|
'being', 'does', 'just', 'also', 'make', 'made', 'know', 'like'
|
||
|
|
}
|
||
|
|
|
||
|
|
filtered_words = [w for w in words if w not in stopwords]
|
||
|
|
|
||
|
|
word_freq = defaultdict(int)
|
||
|
|
for word in filtered_words:
|
||
|
|
word_freq[word] += 1
|
||
|
|
|
||
|
|
sorted_terms = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
|
||
|
|
return sorted_terms[:top_k]
|
||
|
|
|
||
|
|
def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
|
||
|
|
relationships = []
|
||
|
|
|
||
|
|
relationship_patterns = [
|
||
|
|
(r'([A-Z][a-z]+) (works for|employed by|member of) ([A-Z][a-z]+)', 'employment'),
|
||
|
|
(r'([A-Z][a-z]+) (owns|has|possesses) ([^.]+)', 'ownership'),
|
||
|
|
(r'([A-Z][a-z]+) (located in|part of|belongs to) ([A-Z][a-z]+)', 'location'),
|
||
|
|
(r'([A-Z][a-z]+) (uses|utilizes|implements) ([^.]+)', 'usage'),
|
||
|
|
]
|
||
|
|
|
||
|
|
for pattern, rel_type in relationship_patterns:
|
||
|
|
matches = re.finditer(pattern, text)
|
||
|
|
for match in matches:
|
||
|
|
relationships.append({
|
||
|
|
'type': rel_type,
|
||
|
|
'subject': match.group(1),
|
||
|
|
'predicate': match.group(2),
|
||
|
|
'object': match.group(3),
|
||
|
|
'confidence': 0.6
|
||
|
|
})
|
||
|
|
|
||
|
|
return relationships
|
||
|
|
|
||
|
|
def extract_metadata(self, text: str) -> Dict[str, Any]:
|
||
|
|
word_count = len(text.split())
|
||
|
|
sentence_count = len(re.split(r'[.!?]', text))
|
||
|
|
|
||
|
|
urls = re.findall(r'https?://[^\s]+', text)
|
||
|
|
email_addresses = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
|
||
|
|
dates = re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b', text)
|
||
|
|
numbers = re.findall(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b', text)
|
||
|
|
|
||
|
|
return {
|
||
|
|
'word_count': word_count,
|
||
|
|
'sentence_count': sentence_count,
|
||
|
|
'avg_words_per_sentence': round(word_count / max(sentence_count, 1), 2),
|
||
|
|
'urls': urls,
|
||
|
|
'email_addresses': email_addresses,
|
||
|
|
'dates': dates,
|
||
|
|
'numeric_values': numbers,
|
||
|
|
'has_code': bool(re.search(r'```|def |class |import |function ', text)),
|
||
|
|
'has_questions': bool(re.search(r'\?', text))
|
||
|
|
}
|
||
|
|
|
||
|
|
def categorize_content(self, text: str) -> List[str]:
|
||
|
|
categories = []
|
||
|
|
|
||
|
|
category_keywords = {
|
||
|
|
'programming': ['code', 'function', 'class', 'variable', 'programming', 'software', 'debug'],
|
||
|
|
'data': ['data', 'database', 'query', 'table', 'record', 'statistics', 'analysis'],
|
||
|
|
'documentation': ['documentation', 'guide', 'tutorial', 'manual', 'readme', 'explain'],
|
||
|
|
'configuration': ['config', 'settings', 'configuration', 'setup', 'install', 'deployment'],
|
||
|
|
'testing': ['test', 'testing', 'validate', 'verification', 'quality', 'assertion'],
|
||
|
|
'research': ['research', 'study', 'analysis', 'investigation', 'findings', 'results'],
|
||
|
|
'planning': ['plan', 'planning', 'schedule', 'roadmap', 'milestone', 'timeline'],
|
||
|
|
}
|
||
|
|
|
||
|
|
text_lower = text.lower()
|
||
|
|
for category, keywords in category_keywords.items():
|
||
|
|
if any(keyword in text_lower for keyword in keywords):
|
||
|
|
categories.append(category)
|
||
|
|
|
||
|
|
return categories if categories else ['general']
|