import re
import json
from typing import List, Dict, Any, Set, Tuple
from collections import Counter, defaultdict


class FactExtractor:
    """Heuristic, regex-based extraction of facts and metadata from text.

    All methods are pure functions of their ``text`` argument; the only
    instance state is ``fact_patterns``, a list of ``(regex, fact_type)``
    pairs that callers may extend after construction.
    """

    # Sentence boundary used by both noun-phrase extraction and metadata.
    _SENTENCE_BOUNDARY = re.compile(r'[.!?]')

    # (regex, relationship type); groups are subject, predicate, object.
    _RELATIONSHIP_PATTERNS = (
        (r'([A-Z][a-z]+) (works for|employed by|member of) ([A-Z][a-z]+)', 'employment'),
        (r'([A-Z][a-z]+) (owns|has|possesses) ([^.]+)', 'ownership'),
        (r'([A-Z][a-z]+) (located in|part of|belongs to) ([A-Z][a-z]+)', 'location'),
        (r'([A-Z][a-z]+) (uses|utilizes|implements) ([^.]+)', 'usage'),
    )

    # Common words (4+ letters) that carry no topical signal.
    _STOPWORDS = frozenset({
        'this', 'that', 'these', 'those', 'what', 'which', 'where', 'when',
        'with', 'from', 'have', 'been', 'were', 'will', 'would', 'could',
        'should', 'about', 'their', 'there', 'other', 'than', 'then', 'them',
        'some', 'more', 'very', 'such', 'into', 'through', 'during', 'before',
        'after', 'above', 'below', 'between', 'under', 'again', 'further',
        'once', 'here', 'both', 'each', 'doing', 'only', 'over', 'same',
        'being', 'does', 'just', 'also', 'make', 'made', 'know', 'like',
    })

    # Category -> keywords; insertion order fixes the order of results.
    _CATEGORY_KEYWORDS = {
        'programming': ('code', 'function', 'class', 'variable', 'programming', 'software', 'debug'),
        'data': ('data', 'database', 'query', 'table', 'record', 'statistics', 'analysis'),
        'documentation': ('documentation', 'guide', 'tutorial', 'manual', 'readme', 'explain'),
        'configuration': ('config', 'settings', 'configuration', 'setup', 'install', 'deployment'),
        'testing': ('test', 'testing', 'validate', 'verification', 'quality', 'assertion'),
        'research': ('research', 'study', 'analysis', 'investigation', 'findings', 'results'),
        'planning': ('plan', 'planning', 'schedule', 'roadmap', 'milestone', 'timeline'),
    }

    def __init__(self):
        """Install the default ``(regex, fact_type)`` pattern list."""
        self.fact_patterns = [
            (r'([A-Z][a-z]+ [A-Z][a-z]+) is (a|an) ([^.]+)', 'definition'),
            (r'([A-Z][a-z]+) (was|is) (born|created|founded) in (\d{4})', 'temporal'),
            (r'([A-Z][a-z]+) (invented|created|developed) ([^.]+)', 'attribution'),
            (r'([^.]+) (costs?|worth) (\$[\d,]+)', 'numeric'),
            (r'([A-Z][a-z]+) (lives?|works?|located) in ([A-Z][a-z]+)', 'location'),
        ]

    def extract_facts(self, text: str) -> List[Dict[str, Any]]:
        """Return pattern-matched facts followed by capitalised entities.

        Each fact is a dict with keys ``type``, ``text``, ``components`` and
        ``confidence``. Pattern facts (confidence 0.7) carry the regex groups
        as a tuple in ``components``; entity facts (confidence 0.5) carry a
        one-element list holding the phrase.
        """
        facts: List[Dict[str, Any]] = []
        for pattern, fact_type in self.fact_patterns:
            for match in re.finditer(pattern, text):
                facts.append({
                    'type': fact_type,
                    'text': match.group(0),
                    'components': match.groups(),
                    'confidence': 0.7
                })

        for phrase in self._extract_noun_phrases(text):
            # Guard kept in case a subclass returns single-word phrases.
            if len(phrase.split()) >= 2:
                facts.append({
                    'type': 'entity',
                    'text': phrase,
                    'components': [phrase],
                    'confidence': 0.5
                })
        return facts

    def _extract_noun_phrases(self, text: str) -> List[str]:
        """Return unique runs of two or more consecutive capitalised words.

        A word counts as capitalised when it starts with an uppercase
        character and is longer than one character. Runs never cross a
        ``.``, ``!`` or ``?``. Phrases are returned in order of first
        appearance, so the result is deterministic across runs.
        """
        phrases: List[str] = []
        for sentence in self._SENTENCE_BOUNDARY.split(text):
            run: List[str] = []
            for word in sentence.split():
                if word[0].isupper() and len(word) > 1:
                    run.append(word)
                    continue
                if len(run) >= 2:
                    phrases.append(' '.join(run))
                run = []
            # Flush a run that reaches the end of the sentence.
            if len(run) >= 2:
                phrases.append(' '.join(run))
        # dict.fromkeys de-duplicates while preserving first-seen order
        # (a plain set would yield a hash-dependent order).
        return list(dict.fromkeys(phrases))

    def extract_key_terms(self, text: str, top_k: int = 10) -> List[Tuple[str, int]]:
        """Return up to ``top_k`` ``(word, count)`` pairs, most frequent first.

        Only lowercase-folded ASCII words of four or more letters that are
        not stopwords are counted. Ties keep order of first occurrence.
        """
        words = re.findall(r'\b[a-z]{4,}\b', text.lower())
        counts = Counter(word for word in words if word not in self._STOPWORDS)
        # sorted() is stable, so equal counts stay in first-seen order.
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        return ranked[:top_k]

    def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
        """Return subject/predicate/object triples found by fixed patterns.

        Each result has keys ``type``, ``subject``, ``predicate``, ``object``
        and a constant ``confidence`` of 0.6. Results are grouped by pattern,
        in the order the patterns are declared.
        """
        relationships: List[Dict[str, Any]] = []
        for pattern, rel_type in self._RELATIONSHIP_PATTERNS:
            for match in re.finditer(pattern, text):
                relationships.append({
                    'type': rel_type,
                    'subject': match.group(1),
                    'predicate': match.group(2),
                    'object': match.group(3),
                    'confidence': 0.6
                })
        return relationships

    def extract_metadata(self, text: str) -> Dict[str, Any]:
        """Return surface statistics and embedded tokens found in ``text``.

        ``sentence_count`` counts the non-blank segments between ``.``,
        ``!`` and ``?``; it is a heuristic, so dots inside numbers or URLs
        also act as boundaries. Empty text yields a count of 0 and an
        average of 0.0.
        """
        word_count = len(text.split())
        # Ignore blank segments: splitting "Hi." yields a trailing '' that
        # must not be counted as a sentence.
        sentence_count = sum(
            1 for segment in self._SENTENCE_BOUNDARY.split(text) if segment.strip()
        )
        urls = re.findall(r'https?://[^\s]+', text)
        # TLD class is [A-Za-z]; a '|' inside a character class is a
        # literal pipe, not alternation.
        email_addresses = re.findall(
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text
        )
        dates = re.findall(
            r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b', text
        )
        numbers = re.findall(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b', text)
        return {
            'word_count': word_count,
            'sentence_count': sentence_count,
            'avg_words_per_sentence': round(word_count / max(sentence_count, 1), 2),
            'urls': urls,
            'email_addresses': email_addresses,
            'dates': dates,
            'numeric_values': numbers,
            'has_code': bool(re.search(r'```|def |class |import |function ', text)),
            'has_questions': bool(re.search(r'\?', text))
        }

    def categorize_content(self, text: str) -> List[str]:
        """Return every category with a keyword in ``text``, else ``['general']``.

        Matching is a case-insensitive substring test, so a keyword also
        matches inside longer words (``'test'`` matches ``'latest'``).
        """
        text_lower = text.lower()
        categories = [
            category
            for category, keywords in self._CATEGORY_KEYWORDS.items()
            if any(keyword in text_lower for keyword in keywords)
        ]
        return categories if categories else ['general']