import re
from collections import defaultdict
from typing import Any, Dict, List, Tuple


class FactExtractor:
    """Regex- and heuristic-based extraction of facts from plain text.

    Everything here is pattern matching on surface text: there is no
    linguistic parsing, and the ``confidence`` values attached to results are
    fixed constants per extraction kind, not computed scores.
    """

    # Words ignored by ``extract_key_terms``. Built once at class creation
    # instead of on every call.
    _STOPWORDS = frozenset(
        {
            "this", "that", "these", "those", "what", "which", "where",
            "when", "with", "from", "have", "been", "were", "will", "would",
            "could", "should", "about", "their", "there", "other", "than",
            "then", "them", "some", "more", "very", "such", "into", "through",
            "during", "before", "after", "above", "below", "between", "under",
            "again", "further", "once", "here", "both", "each", "doing",
            "only", "over", "same", "being", "does", "just", "also", "make",
            "made", "know", "like",
        }
    )

    # (regex, relationship type) pairs used by ``extract_relationships``.
    _RELATIONSHIP_PATTERNS = (
        (
            r"([A-Z][a-z]+) (works for|employed by|member of) ([A-Z][a-z]+)",
            "employment",
        ),
        (r"([A-Z][a-z]+) (owns|has|possesses) ([^.]+)", "ownership"),
        (
            r"([A-Z][a-z]+) (located in|part of|belongs to) ([A-Z][a-z]+)",
            "location",
        ),
        (r"([A-Z][a-z]+) (uses|utilizes|implements) ([^.]+)", "usage"),
    )

    # Category name -> keywords used by ``categorize_content``. Dict order is
    # the order in which categories are reported.
    _CATEGORY_KEYWORDS = {
        "programming": [
            "code", "function", "class", "variable", "programming",
            "software", "debug",
        ],
        "data": [
            "data", "database", "query", "table", "record", "statistics",
            "analysis",
        ],
        "documentation": [
            "documentation", "guide", "tutorial", "manual", "readme",
            "explain",
        ],
        "configuration": [
            "config", "settings", "configuration", "setup", "install",
            "deployment",
        ],
        "testing": [
            "test", "testing", "validate", "verification", "quality",
            "assertion",
        ],
        "research": [
            "research", "study", "analysis", "investigation", "findings",
            "results",
        ],
        "planning": [
            "plan", "planning", "schedule", "roadmap", "milestone",
            "timeline",
        ],
    }

    def __init__(self):
        """Set up the ``(regex, fact type)`` pairs used by ``extract_facts``."""
        self.fact_patterns = [
            (r"([A-Z][a-z]+ [A-Z][a-z]+) is (a|an) ([^.]+)", "definition"),
            (
                r"([A-Z][a-z]+) (was|is) (born|created|founded) in (\d{4})",
                "temporal",
            ),
            (r"([A-Z][a-z]+) (invented|created|developed) ([^.]+)", "attribution"),
            (r"([^.]+) (costs?|worth) (\$[\d,]+)", "numeric"),
            (r"([A-Z][a-z]+) (lives?|works?|located) in ([A-Z][a-z]+)", "location"),
        ]

    def extract_facts(self, text: str) -> List[Dict[str, Any]]:
        """Return pattern-based facts followed by multi-word entity facts.

        Each fact is a dict with ``type``, ``text``, ``components`` and
        ``confidence`` keys. Pattern facts carry the regex groups as a tuple
        and confidence 0.7; entity facts carry ``[phrase]`` and confidence
        0.5. Pattern facts are emitted in ``self.fact_patterns`` order.
        """
        facts: List[Dict[str, Any]] = []

        for pattern, fact_type in self.fact_patterns:
            for match in re.finditer(pattern, text):
                facts.append(
                    {
                        "type": fact_type,
                        "text": match.group(0),
                        "components": match.groups(),
                        "confidence": 0.7,
                    }
                )

        for phrase in self._extract_noun_phrases(text):
            # Single capitalized words are too noisy to report as entities.
            if len(phrase.split()) >= 2:
                facts.append(
                    {
                        "type": "entity",
                        "text": phrase,
                        "components": [phrase],
                        "confidence": 0.5,
                    }
                )

        return facts

    def _extract_noun_phrases(self, text: str) -> List[str]:
        """Return unique runs of consecutive capitalized words.

        The text is split into sentences on ``.``, ``!`` and ``?``; inside a
        sentence, every maximal run of whitespace-separated words that start
        with an uppercase character and are longer than one character forms
        a phrase (a run of length one yields the single word).

        Duplicates are removed while keeping first-occurrence order, so the
        result is deterministic across interpreter runs.
        """
        phrases: List[str] = []

        for sentence in re.split(r"[.!?]", text):
            run: List[str] = []
            for word in sentence.split():
                if len(word) > 1 and word[0].isupper():
                    run.append(word)
                    continue
                if run:
                    phrases.append(" ".join(run))
                    run = []
            # Flush a run that reaches the end of the sentence.
            if run:
                phrases.append(" ".join(run))

        # dict.fromkeys dedupes in insertion order; a plain set would make
        # the order depend on string hash randomization.
        return list(dict.fromkeys(phrases))

    def extract_key_terms(self, text: str, top_k: int = 10) -> List[Tuple[str, int]]:
        """Return the ``top_k`` most frequent non-stop-word terms.

        Terms are lowercase ASCII-letter words of at least four characters.
        The result is a list of ``(term, count)`` pairs sorted by descending
        count; ties keep the order of first appearance in ``text``.
        """
        word_freq: Dict[str, int] = defaultdict(int)
        for word in re.findall(r"\b[a-z]{4,}\b", text.lower()):
            if word not in self._STOPWORDS:
                word_freq[word] += 1

        # sorted() is stable, so equal counts stay in first-seen order.
        ranked = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
        return ranked[:top_k]

    def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
        """Return subject/predicate/object triples matched by fixed patterns.

        Each relationship is a dict with ``type``, ``subject``,
        ``predicate``, ``object`` and a constant ``confidence`` of 0.6.
        Results are grouped by pattern, in pattern-table order.
        """
        relationships: List[Dict[str, Any]] = []

        for pattern, rel_type in self._RELATIONSHIP_PATTERNS:
            for match in re.finditer(pattern, text):
                relationships.append(
                    {
                        "type": rel_type,
                        "subject": match.group(1),
                        "predicate": match.group(2),
                        "object": match.group(3),
                        "confidence": 0.6,
                    }
                )

        return relationships

    def extract_metadata(self, text: str) -> Dict[str, Any]:
        """Return simple surface statistics and pattern hits for ``text``.

        The result contains word and sentence counts, the average words per
        sentence (rounded to two decimals), lists of URLs, email addresses,
        date-like strings and numeric values, and two boolean flags
        (``has_code``, ``has_questions``).
        """
        stripped = text.strip()
        word_count = len(text.split()) if stripped else 0
        # Splitting an empty string yields [""], which the filter drops, so
        # blank input gives a sentence count of 0.
        sentence_count = len([s for s in re.split(r"[.!?]", stripped) if s.strip()])

        urls = re.findall(r"https?://[^\s]+", text)
        # The TLD class is [A-Za-z]; a "|" inside a character class is a
        # literal pipe, not alternation, and must not be accepted here.
        email_addresses = re.findall(
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", text
        )
        # NOTE: the last alternative treats any standalone 4-digit number as
        # a date (intended for years), so values like "1234" are included.
        dates = re.findall(
            r"\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b|\b\d{4}\b",
            text,
        )
        numbers = re.findall(r"\b\d+(?:,\d{3})*(?:\.\d+)?\b", text)

        return {
            "word_count": word_count,
            "sentence_count": sentence_count,
            # max(..., 1) avoids division by zero for text without sentences.
            "avg_words_per_sentence": round(word_count / max(sentence_count, 1), 2),
            "urls": urls,
            "email_addresses": email_addresses,
            "dates": dates,
            "numeric_values": numbers,
            "has_code": bool(re.search(r"```|def |class |import |function ", text)),
            "has_questions": bool(re.search(r"\?", text)),
        }

    def categorize_content(self, text: str) -> List[str]:
        """Return the categories whose keywords occur in ``text``.

        Matching is a case-insensitive substring test, so a keyword also
        matches inside longer words (e.g. "test" in "latest"). Categories
        are returned in table order; ``["general"]`` is returned when
        nothing matches.
        """
        text_lower = text.lower()
        categories = [
            category
            for category, keywords in self._CATEGORY_KEYWORDS.items()
            if any(keyword in text_lower for keyword in keywords)
        ]
        return categories if categories else ["general"]