import re
from collections import Counter, defaultdict
from typing import Any, Dict, List
class FactExtractor:
    """Extract structured facts, key terms, relationships, and summary
    metadata from free-form text using regular-expression heuristics.

    All extraction is pattern-based (no NLP model); confidence scores are
    fixed heuristic constants, not calibrated probabilities.
    """

    def __init__(self):
        # (regex, fact type) pairs. The patterns are kept as raw strings
        # (rather than pre-compiled) so the attribute stays inspectable;
        # the `re` module caches compiled patterns internally.
        self.fact_patterns = [
            (r"([A-Z][a-z]+ [A-Z][a-z]+) is (a|an) ([^.]+)", "definition"),
            (r"([A-Z][a-z]+) (was|is) (born|created|founded) in (\d{4})", "temporal"),
            (r"([A-Z][a-z]+) (invented|created|developed) ([^.]+)", "attribution"),
            (r"([^.]+) (costs?|worth) (\$[\d,]+)", "numeric"),
            (r"([A-Z][a-z]+) (lives?|works?|located) in ([A-Z][a-z]+)", "location"),
        ]

    def extract_facts(self, text: str) -> List[Dict[str, Any]]:
        """Return pattern-matched facts plus capitalized noun-phrase entities.

        Each fact is a dict with keys ``type``, ``text`` (the full match),
        ``components`` (the capture groups), and a heuristic ``confidence``
        (0.7 for pattern matches, 0.5 for bare entities).
        """
        facts: List[Dict[str, Any]] = []

        for pattern, fact_type in self.fact_patterns:
            for match in re.finditer(pattern, text):
                facts.append(
                    {
                        "type": fact_type,
                        "text": match.group(0),
                        "components": match.groups(),
                        "confidence": 0.7,
                    }
                )

        # Multi-word capitalized runs become lower-confidence "entity" facts.
        for phrase in self._extract_noun_phrases(text):
            if len(phrase.split()) >= 2:
                facts.append(
                    {
                        "type": "entity",
                        "text": phrase,
                        "components": [phrase],
                        "confidence": 0.5,
                    }
                )

        return facts

    def _extract_noun_phrases(self, text: str) -> List[str]:
        """Collect runs of two or more consecutive capitalized words.

        Sentences are split on ``.!?``; within each sentence, a run breaks
        at the first word that is not capitalized or is a single character.
        Note: duplicates are removed via a set, so the returned order is
        unspecified.
        """
        phrases: List[str] = []

        for sentence in re.split(r"[.!?]", text):
            current_run: List[str] = []
            for word in sentence.split():
                if word and word[0].isupper() and len(word) > 1:
                    current_run.append(word)
                else:
                    if len(current_run) >= 2:
                        phrases.append(" ".join(current_run))
                    current_run = []
            # Flush a run that extends to the end of the sentence.
            if len(current_run) >= 2:
                phrases.append(" ".join(current_run))

        return list(set(phrases))

    def extract_key_terms(self, text: str, top_k: int = 10) -> List[tuple]:
        """Return up to ``top_k`` ``(word, frequency)`` pairs, most frequent
        first, considering only lowercase words of 4+ letters and skipping
        a small built-in stopword list.

        Ties keep first-occurrence order (``Counter.most_common`` is stable,
        matching the previous manual sort).
        """
        words = re.findall(r"\b[a-z]{4,}\b", text.lower())

        stopwords = {
            "this", "that", "these", "those", "what", "which", "where",
            "when", "with", "from", "have", "been", "were", "will",
            "would", "could", "should", "about", "their", "there",
            "other", "than", "then", "them", "some", "more", "very",
            "such", "into", "through", "during", "before", "after",
            "above", "below", "between", "under", "again", "further",
            "once", "here", "both", "each", "doing", "only", "over",
            "same", "being", "does", "just", "also", "make", "made",
            "know", "like",
        }

        # Counter replaces the manual defaultdict loop + sorted() call.
        counts = Counter(w for w in words if w not in stopwords)
        return counts.most_common(top_k)

    def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
        """Return subject–predicate–object triples matched by a small set of
        relationship patterns.

        Each relationship is a dict with ``type``, ``subject``, ``predicate``,
        ``object``, and a fixed ``confidence`` of 0.6.
        """
        relationship_patterns = [
            (
                r"([A-Z][a-z]+) (works for|employed by|member of) ([A-Z][a-z]+)",
                "employment",
            ),
            (r"([A-Z][a-z]+) (owns|has|possesses) ([^.]+)", "ownership"),
            (
                r"([A-Z][a-z]+) (located in|part of|belongs to) ([A-Z][a-z]+)",
                "location",
            ),
            (r"([A-Z][a-z]+) (uses|utilizes|implements) ([^.]+)", "usage"),
        ]

        relationships: List[Dict[str, Any]] = []
        for pattern, rel_type in relationship_patterns:
            for match in re.finditer(pattern, text):
                relationships.append(
                    {
                        "type": rel_type,
                        "subject": match.group(1),
                        "predicate": match.group(2),
                        "object": match.group(3),
                        "confidence": 0.6,
                    }
                )

        return relationships

    def extract_metadata(self, text: str) -> Dict[str, Any]:
        """Return summary statistics plus extracted URLs, e-mail addresses,
        dates, and numeric literals for *text*.

        Bug fix: empty fragments produced by splitting on sentence
        terminators are no longer counted, so ``"Hello."`` is one sentence
        rather than two (previously the trailing empty split inflated
        ``sentence_count`` and deflated ``avg_words_per_sentence``).
        """
        word_count = len(text.split())
        # Filter blank fragments (e.g. after a trailing period) before counting.
        sentences = [s for s in re.split(r"[.!?]", text) if s.strip()]
        sentence_count = len(sentences)

        urls = re.findall(r"https?://[^\s]+", text)
        email_addresses = re.findall(
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", text
        )
        dates = re.findall(
            r"\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b", text
        )
        numbers = re.findall(r"\b\d+(?:,\d{3})*(?:\.\d+)?\b", text)

        return {
            "word_count": word_count,
            "sentence_count": sentence_count,
            # max(..., 1) guards division by zero for empty input.
            "avg_words_per_sentence": round(word_count / max(sentence_count, 1), 2),
            "urls": urls,
            "email_addresses": email_addresses,
            "dates": dates,
            "numeric_values": numbers,
            "has_code": bool(re.search(r"```|def |class |import |function ", text)),
            "has_questions": "?" in text,
        }

    def categorize_content(self, text: str) -> List[str]:
        """Return the categories whose keywords appear in *text*
        (case-insensitive substring match), or ``["general"]`` if none do.

        Note: matching is substring-based, so e.g. "testing" in the text
        also triggers the "test" keyword.
        """
        category_keywords = {
            "programming": [
                "code", "function", "class", "variable",
                "programming", "software", "debug",
            ],
            "data": [
                "data", "database", "query", "table",
                "record", "statistics", "analysis",
            ],
            "documentation": [
                "documentation", "guide", "tutorial",
                "manual", "readme", "explain",
            ],
            "configuration": [
                "config", "settings", "configuration",
                "setup", "install", "deployment",
            ],
            "testing": [
                "test", "testing", "validate",
                "verification", "quality", "assertion",
            ],
            "research": [
                "research", "study", "analysis",
                "investigation", "findings", "results",
            ],
            "planning": [
                "plan", "planning", "schedule",
                "roadmap", "milestone", "timeline",
            ],
        }

        text_lower = text.lower()
        categories = [
            category
            for category, keywords in category_keywords.items()
            if any(keyword in text_lower for keyword in keywords)
        ]

        return categories if categories else ["general"]