rp/test_fact_extractor.py at 0f5dce1617a75f63a79013782582529df17a4871

 from pr.memory.fact_extractor import FactExtractor
 class TestFactExtractor:
     """Exercise the extraction helpers exposed by ``FactExtractor``.

     Each test runs against the extractor instance that ``setup_method``
     creates and stores on ``self.extractor``.
     """

     @staticmethod
     def _of_type(records, kind):
         """Return the records whose ``"type"`` entry equals ``kind``."""
         return [record for record in records if record["type"] == kind]

     def setup_method(self):
         """Build the extractor used by the test about to run."""
         self.extractor = FactExtractor()

     def test_init(self):
         """A new extractor exposes a non-empty ``fact_patterns``."""
         patterns = self.extractor.fact_patterns
         assert patterns is not None
         assert len(patterns) > 0

     def test_extract_facts_definition(self):
         """Two "X is a Y" sentences produce definition facts."""
         extracted = self.extractor.extract_facts(
             "John Smith is a software engineer. Python is a programming language."
         )
         assert len(extracted) >= 2
         # At least one of the extracted facts must be tagged as a definition.
         assert len(self._of_type(extracted, "definition")) >= 1

     def test_extract_facts_temporal(self):
         """Sentences mentioning years produce temporal facts."""
         extracted = self.extractor.extract_facts(
             "John was born in 1990. The company was founded in 2010."
         )
         assert len(self._of_type(extracted, "temporal")) >= 1

     def test_extract_facts_attribution(self):
         """"X invented/developed Y" sentences produce attribution facts."""
         extracted = self.extractor.extract_facts(
             "John invented the widget. Mary developed the software."
         )
         assert len(self._of_type(extracted, "attribution")) >= 1

     def test_extract_facts_numeric(self):
         """Sentences with dollar amounts produce numeric facts."""
         extracted = self.extractor.extract_facts(
             "The car costs $25,000. The house is worth $500,000."
         )
         assert len(self._of_type(extracted, "numeric")) >= 1

     def test_extract_facts_location(self):
         """"lives in" / "located in" sentences produce location facts."""
         extracted = self.extractor.extract_facts(
             "John lives in San Francisco. The office is located in New York."
         )
         assert len(self._of_type(extracted, "location")) >= 1

     def test_extract_facts_entity(self):
         """Text with capitalized names produces entity facts."""
         extracted = self.extractor.extract_facts(
             "John Smith works at Google Inc. He uses Python programming."
         )
         assert len(self._of_type(extracted, "entity")) >= 1

     def test_extract_noun_phrases(self):
         """Capitalized names show up among the extracted noun phrases."""
         found = self.extractor._extract_noun_phrases(
             "John Smith is a software engineer at Google. He works on Python Projects."
         )
         for expected in ("John Smith", "Google", "Python Projects"):
             assert expected in found

     def test_extract_noun_phrases_capitalized(self):
         """All-lowercase text yields no noun phrases."""
         found = self.extractor._extract_noun_phrases(
             "the quick brown fox jumps over the lazy dog"
         )
         # Nothing in the sentence is capitalized, so nothing should match.
         assert len(found) == 0

     def test_extract_key_terms(self):
         """Key terms respect ``top_k`` and include the main content words."""
         ranked = self.extractor.extract_key_terms(
             "Python is a programming language used for software development and data analysis.",
             top_k=5,
         )
         assert len(ranked) <= 5
         # Each result is indexable; its first element is the term itself.
         words = [entry[0] for entry in ranked]
         for expected in ("programming", "language", "software"):
             assert expected in words

     def test_extract_key_terms_stopwords_filtered(self):
         """Common stopwords never appear among the key terms."""
         ranked = self.extractor.extract_key_terms(
             "This is a test of the system that should work properly."
         )
         words = [entry[0] for entry in ranked]
         # Every one of these stopwords occurs in the input sentence.
         for stopword in ("this", "is", "a", "of", "the"):
             assert stopword not in words

     def test_extract_relationships_employment(self):
         """"works for" / "employed by" produce employment relationships."""
         found = self.extractor.extract_relationships(
             "John works for Google. Mary is employed by Microsoft."
         )
         assert len(self._of_type(found, "employment")) >= 1

     def test_extract_relationships_ownership(self):
         """"owns" / "has" produce ownership relationships."""
         found = self.extractor.extract_relationships(
             "John owns a car. Mary has a house."
         )
         assert len(self._of_type(found, "ownership")) >= 1

     def test_extract_relationships_location(self):
         """"located in" / "belongs to" produce location relationships."""
         found = self.extractor.extract_relationships(
             "John located in New York. The factory belongs to Google."
         )
         assert len(self._of_type(found, "location")) >= 1

     def test_extract_relationships_usage(self):
         """"uses" / "implements" produce usage relationships."""
         found = self.extractor.extract_relationships(
             "John uses Python. The company implements agile methodology."
         )
         assert len(self._of_type(found, "usage")) >= 1

     def test_extract_metadata(self):
         """Metadata reports counts, URLs, e-mails, dates and numbers."""
         text = "This is a test document. It contains some information about Python programming. You can visit https://python.org for more details. Contact john@example.com for questions. The project started in 2020 and costs $10,000."
         info = self.extractor.extract_metadata(text)
         # Scalar statistics must all be positive for non-empty prose.
         for key in ("word_count", "sentence_count", "avg_words_per_sentence"):
             assert info[key] > 0
         # The text holds one URL, one e-mail, a year and a dollar amount.
         for key in ("urls", "email_addresses", "dates", "numeric_values"):
             assert len(info[key]) > 0
         # The sample is plain prose, so the code flag must stay off.
         assert info["has_code"] is False

     def test_extract_metadata_with_code(self):
         """A snippet containing a ``def`` is flagged as code."""
         info = self.extractor.extract_metadata(
             "Here is a function: def hello(): print('Hello, world!')"
         )
         assert info["has_code"] is True

     def test_extract_metadata_with_questions(self):
         """Text made of questions sets the ``has_questions`` flag."""
         info = self.extractor.extract_metadata(
             "What is Python? How does it work? Why use it?"
         )
         assert info["has_questions"] is True

     def test_categorize_content_programming(self):
         """Programming vocabulary maps to the "programming" category."""
         assert "programming" in self.extractor.categorize_content(
             "Python is a programming language used for code development and debugging."
         )

     def test_categorize_content_data(self):
         """Database vocabulary maps to the "data" category."""
         assert "data" in self.extractor.categorize_content(
             "The database contains records and tables with statistical analysis."
         )

     def test_categorize_content_documentation(self):
         """Guide/tutorial vocabulary maps to the "documentation" category."""
         assert "documentation" in self.extractor.categorize_content(
             "This guide explains how to use the tutorial and manual."
         )

     def test_categorize_content_configuration(self):
         """Settings/setup vocabulary maps to the "configuration" category."""
         assert "configuration" in self.extractor.categorize_content(
             "Configure the settings and setup the deployment environment."
         )

     def test_categorize_content_testing(self):
         """Test/validate vocabulary maps to the "testing" category."""
         assert "testing" in self.extractor.categorize_content(
             "Run the tests to validate the functionality and verify quality."
         )

     def test_categorize_content_research(self):
         """Study/findings vocabulary maps to the "research" category."""
         assert "research" in self.extractor.categorize_content(
             "The study investigates findings and results from the analysis."
         )

     def test_categorize_content_planning(self):
         """Schedule/milestone vocabulary maps to the "planning" category."""
         assert "planning" in self.extractor.categorize_content(
             "Plan the project schedule with milestones and timeline."
         )

     def test_categorize_content_general(self):
         """Text without topical keywords is categorized as "general"."""
         assert "general" in self.extractor.categorize_content(
             "This is some random text without specific keywords."
         )

     def test_extract_facts_empty_text(self):
         """An empty string yields no facts."""
         assert len(self.extractor.extract_facts("")) == 0

     def test_extract_key_terms_empty_text(self):
         """An empty string yields no key terms."""
         assert len(self.extractor.extract_key_terms("")) == 0

     def test_extract_relationships_empty_text(self):
         """An empty string yields no relationships."""
         assert len(self.extractor.extract_relationships("")) == 0

     def test_extract_metadata_empty_text(self):
         """An empty string yields zeroed word and sentence statistics."""
         info = self.extractor.extract_metadata("")
         zeroed = (
             ("word_count", 0),
             ("sentence_count", 0),
             ("avg_words_per_sentence", 0.0),
         )
         for key, expected in zeroed:
             assert info[key] == expected