|
from pr.memory.fact_extractor import FactExtractor
|
|
|
|
|
|
class TestFactExtractor:
    """Pytest suite exercising the public and private API of ``FactExtractor``.

    A new extractor is created in ``setup_method`` and stored on
    ``self.extractor`` for each test to use.
    """

    @staticmethod
    def _of_type(items, kind):
        """Return the entries of ``items`` whose ``"type"`` value equals ``kind``."""
        return [item for item in items if item["type"] == kind]

    def setup_method(self):
        """Build the extractor instance used by the test."""
        self.extractor = FactExtractor()

    def test_init(self):
        """A new extractor exposes a non-empty ``fact_patterns`` collection."""
        patterns = self.extractor.fact_patterns
        assert patterns is not None
        assert len(patterns) > 0

    def test_extract_facts_definition(self):
        """Two "X is a Y" sentences yield facts, including a definition one."""
        sample = "John Smith is a software engineer. Python is a programming language."
        found = self.extractor.extract_facts(sample)

        assert len(found) >= 2
        # At least one of the returned facts must be tagged as a definition.
        definitions = self._of_type(found, "definition")
        assert len(definitions) >= 1

    def test_extract_facts_temporal(self):
        """Sentences mentioning years yield at least one temporal fact."""
        sample = "John was born in 1990. The company was founded in 2010."
        found = self.extractor.extract_facts(sample)

        temporal = self._of_type(found, "temporal")
        assert len(temporal) >= 1

    def test_extract_facts_attribution(self):
        """"Invented"/"developed" sentences yield at least one attribution fact."""
        sample = "John invented the widget. Mary developed the software."
        found = self.extractor.extract_facts(sample)

        attributions = self._of_type(found, "attribution")
        assert len(attributions) >= 1

    def test_extract_facts_numeric(self):
        """Sentences with dollar amounts yield at least one numeric fact."""
        sample = "The car costs $25,000. The house is worth $500,000."
        found = self.extractor.extract_facts(sample)

        numerics = self._of_type(found, "numeric")
        assert len(numerics) >= 1

    def test_extract_facts_location(self):
        """"Lives in"/"located in" sentences yield at least one location fact."""
        sample = "John lives in San Francisco. The office is located in New York."
        found = self.extractor.extract_facts(sample)

        locations = self._of_type(found, "location")
        assert len(locations) >= 1

    def test_extract_facts_entity(self):
        """Text with capitalized names yields at least one entity fact."""
        sample = "John Smith works at Google Inc. He uses Python programming."
        found = self.extractor.extract_facts(sample)

        entities = self._of_type(found, "entity")
        assert len(entities) >= 1

    def test_extract_noun_phrases(self):
        """Capitalized names in the text are reported as noun phrases."""
        sample = "John Smith is a software engineer at Google. He works on Python Projects."
        phrases = self.extractor._extract_noun_phrases(sample)

        for expected in ("John Smith", "Google", "Python Projects"):
            assert expected in phrases

    def test_extract_noun_phrases_capitalized(self):
        """All-lowercase text produces no noun phrases."""
        sample = "the quick brown fox jumps over the lazy dog"
        phrases = self.extractor._extract_noun_phrases(sample)

        # Nothing is capitalized, so nothing should be picked up.
        assert len(phrases) == 0

    def test_extract_key_terms(self):
        """``top_k`` caps the result and the main content words are present."""
        sample = "Python is a programming language used for software development and data analysis."
        ranked = self.extractor.extract_key_terms(sample, top_k=5)

        assert len(ranked) <= 5
        # Each result entry carries the term itself in position 0.
        words = [entry[0] for entry in ranked]
        for expected in ("programming", "language", "software"):
            assert expected in words

    def test_extract_key_terms_stopwords_filtered(self):
        """Common stopwords never show up among the key terms."""
        sample = "This is a test of the system that should work properly."
        ranked = self.extractor.extract_key_terms(sample)

        words = [entry[0] for entry in ranked]
        # None of these function words may survive the filtering.
        for stopword in ("this", "is", "a", "of", "the"):
            assert stopword not in words

    def test_extract_relationships_employment(self):
        """"Works for"/"employed by" text yields an employment relationship."""
        sample = "John works for Google. Mary is employed by Microsoft."
        found = self.extractor.extract_relationships(sample)

        employment = self._of_type(found, "employment")
        assert len(employment) >= 1

    def test_extract_relationships_ownership(self):
        """"Owns"/"has" text yields an ownership relationship."""
        sample = "John owns a car. Mary has a house."
        found = self.extractor.extract_relationships(sample)

        ownership = self._of_type(found, "ownership")
        assert len(ownership) >= 1

    def test_extract_relationships_location(self):
        """"Located in" text yields a location relationship."""
        sample = "John located in New York. The factory belongs to Google."
        found = self.extractor.extract_relationships(sample)

        locations = self._of_type(found, "location")
        assert len(locations) >= 1

    def test_extract_relationships_usage(self):
        """"Uses"/"implements" text yields a usage relationship."""
        sample = "John uses Python. The company implements agile methodology."
        found = self.extractor.extract_relationships(sample)

        usage = self._of_type(found, "usage")
        assert len(usage) >= 1

    def test_extract_metadata(self):
        """Metadata for prose with a URL, e-mail, year and price is populated."""
        sample = (
            "This is a test document. "
            "It contains some information about Python programming. "
            "You can visit https://python.org for more details. "
            "Contact john@example.com for questions. "
            "The project started in 2020 and costs $10,000."
        )
        info = self.extractor.extract_metadata(sample)

        # Scalar statistics must all be positive for non-empty text.
        for key in ("word_count", "sentence_count", "avg_words_per_sentence"):
            assert info[key] > 0
        # Each collection must have picked up at least one item.
        for key in ("urls", "email_addresses", "dates", "numeric_values"):
            assert len(info[key]) > 0
        # The sample is plain prose, so the code flag must be exactly False.
        assert info["has_code"] is False

    def test_extract_metadata_with_code(self):
        """Text embedding a function definition sets ``has_code``."""
        sample = "Here is a function: def hello(): print('Hello, world!')"
        info = self.extractor.extract_metadata(sample)

        assert info["has_code"] is True

    def test_extract_metadata_with_questions(self):
        """Text made of questions sets ``has_questions``."""
        sample = "What is Python? How does it work? Why use it?"
        info = self.extractor.extract_metadata(sample)

        assert info["has_questions"] is True

    def test_categorize_content_programming(self):
        """Programming vocabulary is categorized as ``programming``."""
        sample = "Python is a programming language used for code development and debugging."
        labels = self.extractor.categorize_content(sample)

        assert "programming" in labels

    def test_categorize_content_data(self):
        """Database/statistics vocabulary is categorized as ``data``."""
        sample = "The database contains records and tables with statistical analysis."
        labels = self.extractor.categorize_content(sample)

        assert "data" in labels

    def test_categorize_content_documentation(self):
        """Guide/tutorial vocabulary is categorized as ``documentation``."""
        sample = "This guide explains how to use the tutorial and manual."
        labels = self.extractor.categorize_content(sample)

        assert "documentation" in labels

    def test_categorize_content_configuration(self):
        """Settings/deployment vocabulary is categorized as ``configuration``."""
        sample = "Configure the settings and setup the deployment environment."
        labels = self.extractor.categorize_content(sample)

        assert "configuration" in labels

    def test_categorize_content_testing(self):
        """Validation vocabulary is categorized as ``testing``."""
        sample = "Run the tests to validate the functionality and verify quality."
        labels = self.extractor.categorize_content(sample)

        assert "testing" in labels

    def test_categorize_content_research(self):
        """Study/findings vocabulary is categorized as ``research``."""
        sample = "The study investigates findings and results from the analysis."
        labels = self.extractor.categorize_content(sample)

        assert "research" in labels

    def test_categorize_content_planning(self):
        """Schedule/milestone vocabulary is categorized as ``planning``."""
        sample = "Plan the project schedule with milestones and timeline."
        labels = self.extractor.categorize_content(sample)

        assert "planning" in labels

    def test_categorize_content_general(self):
        """Text without category keywords is categorized as ``general``."""
        sample = "This is some random text without specific keywords."
        labels = self.extractor.categorize_content(sample)

        assert "general" in labels

    def test_extract_facts_empty_text(self):
        """An empty string yields no facts."""
        found = self.extractor.extract_facts("")
        assert len(found) == 0

    def test_extract_key_terms_empty_text(self):
        """An empty string yields no key terms."""
        ranked = self.extractor.extract_key_terms("")
        assert len(ranked) == 0

    def test_extract_relationships_empty_text(self):
        """An empty string yields no relationships."""
        found = self.extractor.extract_relationships("")
        assert len(found) == 0

    def test_extract_metadata_empty_text(self):
        """An empty string yields zeroed word and sentence statistics."""
        info = self.extractor.extract_metadata("")

        expectations = (
            ("word_count", 0),
            ("sentence_count", 0),
            ("avg_words_per_sentence", 0.0),
        )
        for key, expected in expectations:
            assert info[key] == expected
|