import re
from collections import defaultdict
from typing import Any, Dict, List


class FactExtractor:
    """Extract facts, key terms, relationships, and metadata from free text
    using simple regex heuristics (no external NLP dependencies)."""

    def __init__(self):
        # Each entry is (regex pattern, fact type); confidence values are
        # assigned heuristically when a pattern matches.
        self.fact_patterns = [
            (r"([A-Z][a-z]+ [A-Z][a-z]+) is (a|an) ([^.]+)", "definition"),
            (r"([A-Z][a-z]+) (was|is) (born|created|founded) in (\d{4})", "temporal"),
            (r"([A-Z][a-z]+) (invented|created|developed) ([^.]+)", "attribution"),
            (r"([^.]+) (costs?|worth) (\$[\d,]+)", "numeric"),
            (r"([A-Z][a-z]+) (lives?|works?|located) in ([A-Z][a-z]+)", "location"),
        ]

    def extract_facts(self, text: str) -> List[Dict[str, Any]]:
        """Return pattern-based facts plus capitalized entity phrases found in text."""
        facts = []
        for pattern, fact_type in self.fact_patterns:
            matches = re.finditer(pattern, text)
            for match in matches:
                facts.append(
                    {
                        "type": fact_type,
                        "text": match.group(0),
                        "components": match.groups(),
                        "confidence": 0.7,
                    }
                )

        # Multi-word capitalized phrases are recorded as lower-confidence entities.
        noun_phrases = self._extract_noun_phrases(text)
        for phrase in noun_phrases:
            if len(phrase.split()) >= 2:
                facts.append(
                    {
                        "type": "entity",
                        "text": phrase,
                        "components": [phrase],
                        "confidence": 0.5,
                    }
                )
        return facts

    def _extract_noun_phrases(self, text: str) -> List[str]:
        """Collect runs of two or more consecutive capitalized words per sentence."""
        sentences = re.split(r"[.!?]", text)
        phrases = []
        for sentence in sentences:
            words = sentence.split()
            current_phrase = []
            for word in words:
                if word and word[0].isupper() and len(word) > 1:
                    current_phrase.append(word)
                else:
                    if len(current_phrase) >= 2:
                        phrases.append(" ".join(current_phrase))
                    current_phrase = []
            # Flush a phrase that runs to the end of the sentence.
            if len(current_phrase) >= 2:
                phrases.append(" ".join(current_phrase))
        return list(set(phrases))

    def extract_key_terms(self, text: str, top_k: int = 10) -> List[tuple]:
        """Return the top_k most frequent content words (4+ letters, stopwords removed)."""
        words = re.findall(r"\b[a-z]{4,}\b", text.lower())
        stopwords = {
            "this", "that", "these", "those", "what", "which", "where", "when",
            "with", "from", "have", "been", "were", "will", "would", "could",
            "should", "about", "their", "there", "other", "than", "then", "them",
            "some", "more", "very", "such", "into", "through", "during", "before",
            "after", "above", "below", "between", "under", "again", "further",
            "once", "here", "both", "each", "doing", "only", "over", "same",
            "being", "does", "just", "also", "make", "made", "know", "like",
        }
        filtered_words = [w for w in words if w not in stopwords]
        word_freq = defaultdict(int)
        for word in filtered_words:
            word_freq[word] += 1
        sorted_terms = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        return sorted_terms[:top_k]

    def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
        """Extract subject-predicate-object relationships via regex patterns."""
        relationships = []
        relationship_patterns = [
            (
                r"([A-Z][a-z]+) (works for|employed by|member of) ([A-Z][a-z]+)",
                "employment",
            ),
            (r"([A-Z][a-z]+) (owns|has|possesses) ([^.]+)", "ownership"),
            (
                r"([A-Z][a-z]+) (located in|part of|belongs to) ([A-Z][a-z]+)",
                "location",
            ),
            (r"([A-Z][a-z]+) (uses|utilizes|implements) ([^.]+)", "usage"),
        ]
        for pattern, rel_type in relationship_patterns:
            matches = re.finditer(pattern, text)
            for match in matches:
                relationships.append(
                    {
                        "type": rel_type,
                        "subject": match.group(1),
                        "predicate": match.group(2),
                        "object": match.group(3),
                        "confidence": 0.6,
                    }
                )
        return relationships

    def extract_metadata(self, text: str) -> Dict[str, Any]:
        """Compute basic counts and extract URLs, emails, dates, and numbers."""
        word_count = len(text.split())
        # Count only non-empty segments so a trailing terminator is not counted
        # as an extra sentence.
        sentence_count = len([s for s in re.split(r"[.!?]+", text) if s.strip()])

        urls = re.findall(r"https?://[^\s]+", text)
        email_addresses = re.findall(
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", text
        )
        dates = re.findall(
            r"\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b", text
        )
        numbers = re.findall(r"\b\d+(?:,\d{3})*(?:\.\d+)?\b", text)

        return {
            "word_count": word_count,
            "sentence_count": sentence_count,
            "avg_words_per_sentence": round(word_count / max(sentence_count, 1), 2),
            "urls": urls,
            "email_addresses": email_addresses,
            "dates": dates,
            "numeric_values": numbers,
            "has_code": bool(re.search(r"```|def |class |import |function ", text)),
            "has_questions": bool(re.search(r"\?", text)),
        }

    def categorize_content(self, text: str) -> List[str]:
        """Assign coarse content categories based on keyword matches."""
        categories = []
        category_keywords = {
            "programming": ["code", "function", "class", "variable",
                            "programming", "software", "debug"],
            "data": ["data", "database", "query", "table", "record",
                     "statistics", "analysis"],
            "documentation": ["documentation", "guide", "tutorial", "manual",
                              "readme", "explain"],
            "configuration": ["config", "settings", "configuration", "setup",
                              "install", "deployment"],
            "testing": ["test", "testing", "validate", "verification",
                        "quality", "assertion"],
            "research": ["research", "study", "analysis", "investigation",
                         "findings", "results"],
            "planning": ["plan", "planning", "schedule", "roadmap",
                         "milestone", "timeline"],
        }
        text_lower = text.lower()
        for category, keywords in category_keywords.items():
            if any(keyword in text_lower for keyword in keywords):
                categories.append(category)
        return categories if categories else ["general"]
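

# Minimal usage sketch, illustrative only: the sample text below is made up and
# is not part of the module; it simply exercises the extractors defined above.
if __name__ == "__main__":
    extractor = FactExtractor()
    sample = (
        "Marie Curie is a physicist. Marie Curie was born in 1867. "
        "See https://example.com for more."
    )
    print(extractor.extract_facts(sample))
    print(extractor.extract_key_terms(sample))
    print(extractor.extract_relationships(sample))
    print(extractor.extract_metadata(sample))
    print(extractor.categorize_content(sample))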