# Code-viewer header captured with this listing (not part of the module): 243 lines, 9.9 KiB, Python.

from rp.memory.fact_extractor import FactExtractor
class TestFactExtractor:
    """Tests for ``FactExtractor``.

    Exercises fact extraction, noun-phrase extraction, key-term extraction,
    relationship extraction, metadata extraction and content categorization,
    plus the empty-input behaviour of each extraction entry point.
    """

    def setup_method(self):
        """Set up test fixture."""
        # pytest invokes setup_method before every test method, so each test
        # works with its own extractor instance.
        self.extractor = FactExtractor()

    def test_init(self):
        """Test FactExtractor initialization."""
        assert self.extractor.fact_patterns is not None
        assert len(self.extractor.fact_patterns) > 0

    # ------------------------------------------------------------------
    # extract_facts
    # ------------------------------------------------------------------

    def test_extract_facts_definition(self):
        """Test extracting definition facts."""
        text = "John Smith is a software engineer. Python is a programming language."
        facts = self.extractor.extract_facts(text)
        assert len(facts) >= 2
        # Check for definition pattern matches
        definition_facts = [f for f in facts if f["type"] == "definition"]
        assert len(definition_facts) >= 1

    def test_extract_facts_temporal(self):
        """Test extracting temporal facts."""
        text = "John was born in 1990. The company was founded in 2010."
        facts = self.extractor.extract_facts(text)
        temporal_facts = [f for f in facts if f["type"] == "temporal"]
        assert len(temporal_facts) >= 1

    def test_extract_facts_attribution(self):
        """Test extracting attribution facts."""
        text = "John invented the widget. Mary developed the software."
        facts = self.extractor.extract_facts(text)
        attribution_facts = [f for f in facts if f["type"] == "attribution"]
        assert len(attribution_facts) >= 1

    def test_extract_facts_numeric(self):
        """Test extracting numeric facts."""
        text = "The car costs $25,000. The house is worth $500,000."
        facts = self.extractor.extract_facts(text)
        numeric_facts = [f for f in facts if f["type"] == "numeric"]
        assert len(numeric_facts) >= 1

    def test_extract_facts_location(self):
        """Test extracting location facts."""
        text = "John lives in San Francisco. The office is located in New York."
        facts = self.extractor.extract_facts(text)
        location_facts = [f for f in facts if f["type"] == "location"]
        assert len(location_facts) >= 1

    def test_extract_facts_entity(self):
        """Test extracting entity facts from noun phrases."""
        text = "John Smith works at Google Inc. He uses Python programming."
        facts = self.extractor.extract_facts(text)
        entity_facts = [f for f in facts if f["type"] == "entity"]
        assert len(entity_facts) >= 1

    # ------------------------------------------------------------------
    # _extract_noun_phrases
    # ------------------------------------------------------------------

    def test_extract_noun_phrases(self):
        """Test noun phrase extraction."""
        text = "John Smith is a software engineer at Google. He works on Python Projects."
        phrases = self.extractor._extract_noun_phrases(text)
        assert "John Smith" in phrases
        assert "Google" in phrases
        assert "Python Projects" in phrases

    def test_extract_noun_phrases_capitalized(self):
        """Test that only capitalized noun phrases are extracted."""
        text = "the quick brown fox jumps over the lazy dog"
        phrases = self.extractor._extract_noun_phrases(text)
        # Should be empty since no capitalized words
        assert len(phrases) == 0

    # ------------------------------------------------------------------
    # extract_key_terms
    # ------------------------------------------------------------------

    def test_extract_key_terms(self):
        """Test key term extraction."""
        text = "Python is a programming language used for software development and data analysis."
        terms = self.extractor.extract_key_terms(text, top_k=5)
        assert len(terms) <= 5
        # Should contain programming, language, software, development, data, analysis
        # Each returned term is indexed with [0] to obtain the word itself
        # (presumably (word, score) pairs -- confirm against FactExtractor).
        term_words = [term[0] for term in terms]
        assert "programming" in term_words
        assert "language" in term_words
        assert "software" in term_words

    def test_extract_key_terms_stopwords_filtered(self):
        """Test that stopwords are filtered from key terms."""
        text = "This is a test of the system that should work properly."
        terms = self.extractor.extract_key_terms(text)
        term_words = [term[0] for term in terms]
        # Stopwords should not appear
        assert "this" not in term_words
        assert "is" not in term_words
        assert "a" not in term_words
        assert "of" not in term_words
        assert "the" not in term_words

    # ------------------------------------------------------------------
    # extract_relationships
    # ------------------------------------------------------------------

    def test_extract_relationships_employment(self):
        """Test extracting employment relationships."""
        text = "John works for Google. Mary is employed by Microsoft."
        relationships = self.extractor.extract_relationships(text)
        employment_rels = [r for r in relationships if r["type"] == "employment"]
        assert len(employment_rels) >= 1

    def test_extract_relationships_ownership(self):
        """Test extracting ownership relationships."""
        text = "John owns a car. Mary has a house."
        relationships = self.extractor.extract_relationships(text)
        ownership_rels = [r for r in relationships if r["type"] == "ownership"]
        assert len(ownership_rels) >= 1

    def test_extract_relationships_location(self):
        """Test extracting location relationships."""
        text = "John located in New York. The factory belongs to Google."
        relationships = self.extractor.extract_relationships(text)
        location_rels = [r for r in relationships if r["type"] == "location"]
        assert len(location_rels) >= 1

    def test_extract_relationships_usage(self):
        """Test extracting usage relationships."""
        text = "John uses Python. The company implements agile methodology."
        relationships = self.extractor.extract_relationships(text)
        usage_rels = [r for r in relationships if r["type"] == "usage"]
        assert len(usage_rels) >= 1

    # ------------------------------------------------------------------
    # extract_metadata
    # ------------------------------------------------------------------

    def test_extract_metadata(self):
        """Test metadata extraction."""
        # Adjacent literals concatenate into the same single-line text; the
        # sample carries a URL, an e-mail address, a year and a dollar amount
        # so every metadata field checked below has something to find.
        text = (
            "This is a test document. "
            "It contains some information about Python programming. "
            "You can visit https://python.org for more details. "
            "Contact john@example.com for questions. "
            "The project started in 2020 and costs $10,000."
        )
        metadata = self.extractor.extract_metadata(text)
        assert metadata["word_count"] > 0
        assert metadata["sentence_count"] > 0
        assert metadata["avg_words_per_sentence"] > 0
        assert len(metadata["urls"]) > 0
        assert len(metadata["email_addresses"]) > 0
        assert len(metadata["dates"]) > 0
        assert len(metadata["numeric_values"]) > 0
        assert metadata["has_code"] is False  # No code in this text

    def test_extract_metadata_with_code(self):
        """Test metadata extraction with code content."""
        text = "Here is a function: def hello(): print('Hello, world!')"
        metadata = self.extractor.extract_metadata(text)
        assert metadata["has_code"] is True

    def test_extract_metadata_with_questions(self):
        """Test metadata extraction with questions."""
        text = "What is Python? How does it work? Why use it?"
        metadata = self.extractor.extract_metadata(text)
        assert metadata["has_questions"] is True

    # ------------------------------------------------------------------
    # categorize_content
    # ------------------------------------------------------------------

    def test_categorize_content_programming(self):
        """Test content categorization for programming."""
        text = "Python is a programming language used for code development and debugging."
        categories = self.extractor.categorize_content(text)
        assert "programming" in categories

    def test_categorize_content_data(self):
        """Test content categorization for data."""
        text = "The database contains records and tables with statistical analysis."
        categories = self.extractor.categorize_content(text)
        assert "data" in categories

    def test_categorize_content_documentation(self):
        """Test content categorization for documentation."""
        text = "This guide explains how to use the tutorial and manual."
        categories = self.extractor.categorize_content(text)
        assert "documentation" in categories

    def test_categorize_content_configuration(self):
        """Test content categorization for configuration."""
        text = "Configure the settings and setup the deployment environment."
        categories = self.extractor.categorize_content(text)
        assert "configuration" in categories

    def test_categorize_content_testing(self):
        """Test content categorization for testing."""
        text = "Run the tests to validate the functionality and verify quality."
        categories = self.extractor.categorize_content(text)
        assert "testing" in categories

    def test_categorize_content_research(self):
        """Test content categorization for research."""
        text = "The study investigates findings and results from the analysis."
        categories = self.extractor.categorize_content(text)
        assert "research" in categories

    def test_categorize_content_planning(self):
        """Test content categorization for planning."""
        text = "Plan the project schedule with milestones and timeline."
        categories = self.extractor.categorize_content(text)
        assert "planning" in categories

    def test_categorize_content_general(self):
        """Test content categorization defaults to general."""
        text = "This is some random text without specific keywords."
        categories = self.extractor.categorize_content(text)
        assert "general" in categories

    # ------------------------------------------------------------------
    # Empty-input behaviour
    # ------------------------------------------------------------------

    def test_extract_facts_empty_text(self):
        """Test fact extraction with empty text."""
        facts = self.extractor.extract_facts("")
        assert len(facts) == 0

    def test_extract_key_terms_empty_text(self):
        """Test key term extraction with empty text."""
        terms = self.extractor.extract_key_terms("")
        assert len(terms) == 0

    def test_extract_relationships_empty_text(self):
        """Test relationship extraction with empty text."""
        relationships = self.extractor.extract_relationships("")
        assert len(relationships) == 0

    def test_extract_metadata_empty_text(self):
        """Test metadata extraction with empty text."""
        metadata = self.extractor.extract_metadata("")
        assert metadata["word_count"] == 0
        assert metadata["sentence_count"] == 0
        assert metadata["avg_words_per_sentence"] == 0.0