147 lines
5.9 KiB
Python
147 lines
5.9 KiB
Python
|
|
import json
import re
from collections import Counter, defaultdict
from typing import Any, Dict, List, Set
|
||
|
|
|
||
|
|
class FactExtractor:
    """Rule-based extraction of facts, key terms, relationships, and
    metadata from free text using regular-expression patterns."""

    def __init__(self):
        """Set up the (regex, fact-type) table consumed by extract_facts."""
        # Each entry pairs a capture pattern with the label attached to
        # any fact it produces.
        self.fact_patterns = [
            (r'([A-Z][a-z]+ [A-Z][a-z]+) is (a|an) ([^.]+)', 'definition'),
            (r'([A-Z][a-z]+) (was|is) (born|created|founded) in (\d{4})', 'temporal'),
            (r'([A-Z][a-z]+) (invented|created|developed) ([^.]+)', 'attribution'),
            (r'([^.]+) (costs?|worth) (\$[\d,]+)', 'numeric'),
            (r'([A-Z][a-z]+) (lives?|works?|located) in ([A-Z][a-z]+)', 'location'),
        ]
def extract_facts(self, text: str) -> List[Dict[str, Any]]:
|
||
|
|
facts = []
|
||
|
|
|
||
|
|
for pattern, fact_type in self.fact_patterns:
|
||
|
|
matches = re.finditer(pattern, text)
|
||
|
|
for match in matches:
|
||
|
|
facts.append({
|
||
|
|
'type': fact_type,
|
||
|
|
'text': match.group(0),
|
||
|
|
'components': match.groups(),
|
||
|
|
'confidence': 0.7
|
||
|
|
})
|
||
|
|
|
||
|
|
noun_phrases = self._extract_noun_phrases(text)
|
||
|
|
for phrase in noun_phrases:
|
||
|
|
if len(phrase.split()) >= 2:
|
||
|
|
facts.append({
|
||
|
|
'type': 'entity',
|
||
|
|
'text': phrase,
|
||
|
|
'components': [phrase],
|
||
|
|
'confidence': 0.5
|
||
|
|
})
|
||
|
|
|
||
|
|
return facts
|
||
|
|
|
||
|
|
def _extract_noun_phrases(self, text: str) -> List[str]:
|
||
|
|
sentences = re.split(r'[.!?]', text)
|
||
|
|
phrases = []
|
||
|
|
|
||
|
|
for sentence in sentences:
|
||
|
|
words = sentence.split()
|
||
|
|
current_phrase = []
|
||
|
|
|
||
|
|
for word in words:
|
||
|
|
if word and word[0].isupper() and len(word) > 1:
|
||
|
|
current_phrase.append(word)
|
||
|
|
else:
|
||
|
|
if len(current_phrase) >= 2:
|
||
|
|
phrases.append(' '.join(current_phrase))
|
||
|
|
current_phrase = []
|
||
|
|
|
||
|
|
if len(current_phrase) >= 2:
|
||
|
|
phrases.append(' '.join(current_phrase))
|
||
|
|
|
||
|
|
return list(set(phrases))
|
||
|
|
|
||
|
|
def extract_key_terms(self, text: str, top_k: int = 10) -> List[tuple]:
|
||
|
|
words = re.findall(r'\b[a-z]{4,}\b', text.lower())
|
||
|
|
|
||
|
|
stopwords = {
|
||
|
|
'this', 'that', 'these', 'those', 'what', 'which', 'where', 'when',
|
||
|
|
'with', 'from', 'have', 'been', 'were', 'will', 'would', 'could',
|
||
|
|
'should', 'about', 'their', 'there', 'other', 'than', 'then', 'them',
|
||
|
|
'some', 'more', 'very', 'such', 'into', 'through', 'during', 'before',
|
||
|
|
'after', 'above', 'below', 'between', 'under', 'again', 'further',
|
||
|
|
'once', 'here', 'both', 'each', 'doing', 'only', 'over', 'same',
|
||
|
|
'being', 'does', 'just', 'also', 'make', 'made', 'know', 'like'
|
||
|
|
}
|
||
|
|
|
||
|
|
filtered_words = [w for w in words if w not in stopwords]
|
||
|
|
|
||
|
|
word_freq = defaultdict(int)
|
||
|
|
for word in filtered_words:
|
||
|
|
word_freq[word] += 1
|
||
|
|
|
||
|
|
sorted_terms = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
|
||
|
|
return sorted_terms[:top_k]
|
||
|
|
|
||
|
|
def extract_relationships(self, text: str) -> List[Dict[str, Any]]:
|
||
|
|
relationships = []
|
||
|
|
|
||
|
|
relationship_patterns = [
|
||
|
|
(r'([A-Z][a-z]+) (works for|employed by|member of) ([A-Z][a-z]+)', 'employment'),
|
||
|
|
(r'([A-Z][a-z]+) (owns|has|possesses) ([^.]+)', 'ownership'),
|
||
|
|
(r'([A-Z][a-z]+) (located in|part of|belongs to) ([A-Z][a-z]+)', 'location'),
|
||
|
|
(r'([A-Z][a-z]+) (uses|utilizes|implements) ([^.]+)', 'usage'),
|
||
|
|
]
|
||
|
|
|
||
|
|
for pattern, rel_type in relationship_patterns:
|
||
|
|
matches = re.finditer(pattern, text)
|
||
|
|
for match in matches:
|
||
|
|
relationships.append({
|
||
|
|
'type': rel_type,
|
||
|
|
'subject': match.group(1),
|
||
|
|
'predicate': match.group(2),
|
||
|
|
'object': match.group(3),
|
||
|
|
'confidence': 0.6
|
||
|
|
})
|
||
|
|
|
||
|
|
return relationships
|
||
|
|
|
||
|
|
def extract_metadata(self, text: str) -> Dict[str, Any]:
|
||
|
|
word_count = len(text.split())
|
||
|
|
sentence_count = len(re.split(r'[.!?]', text))
|
||
|
|
|
||
|
|
urls = re.findall(r'https?://[^\s]+', text)
|
||
|
|
email_addresses = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
|
||
|
|
dates = re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b', text)
|
||
|
|
numbers = re.findall(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b', text)
|
||
|
|
|
||
|
|
return {
|
||
|
|
'word_count': word_count,
|
||
|
|
'sentence_count': sentence_count,
|
||
|
|
'avg_words_per_sentence': round(word_count / max(sentence_count, 1), 2),
|
||
|
|
'urls': urls,
|
||
|
|
'email_addresses': email_addresses,
|
||
|
|
'dates': dates,
|
||
|
|
'numeric_values': numbers,
|
||
|
|
'has_code': bool(re.search(r'```|def |class |import |function ', text)),
|
||
|
|
'has_questions': bool(re.search(r'\?', text))
|
||
|
|
}
|
||
|
|
|
||
|
|
def categorize_content(self, text: str) -> List[str]:
|
||
|
|
categories = []
|
||
|
|
|
||
|
|
category_keywords = {
|
||
|
|
'programming': ['code', 'function', 'class', 'variable', 'programming', 'software', 'debug'],
|
||
|
|
'data': ['data', 'database', 'query', 'table', 'record', 'statistics', 'analysis'],
|
||
|
|
'documentation': ['documentation', 'guide', 'tutorial', 'manual', 'readme', 'explain'],
|
||
|
|
'configuration': ['config', 'settings', 'configuration', 'setup', 'install', 'deployment'],
|
||
|
|
'testing': ['test', 'testing', 'validate', 'verification', 'quality', 'assertion'],
|
||
|
|
'research': ['research', 'study', 'analysis', 'investigation', 'findings', 'results'],
|
||
|
|
'planning': ['plan', 'planning', 'schedule', 'roadmap', 'milestone', 'timeline'],
|
||
|
|
}
|
||
|
|
|
||
|
|
text_lower = text.lower()
|
||
|
|
for category, keywords in category_keywords.items():
|
||
|
|
if any(keyword in text_lower for keyword in keywords):
|
||
|
|
categories.append(category)
|
||
|
|
|
||
|
|
return categories if categories else ['general']
|