# 2025-11-07 18:50:28 +01:00
from rp . memory . fact_extractor import FactExtractor
# 2025-11-06 15:15:06 +01:00
class TestFactExtractor:
    """Pytest suite exercising the public surface of ``FactExtractor``.

    Each test builds on the fresh extractor created in ``setup_method`` and
    checks one extraction feature: facts, noun phrases, key terms,
    relationships, metadata, or content categories.

    NOTE(review): dictionary keys and expected values are written without
    surrounding whitespace (e.g. ``"type"``, ``"word_count"``) and the
    "empty text" cases pass ``""``. Confirm these match the keys and values
    that ``FactExtractor`` actually produces.
    """

    def setup_method(self):
        """Create a fresh ``FactExtractor`` before every test method."""
        self.extractor = FactExtractor()

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------

    def test_init(self):
        """A new extractor exposes a non-empty ``fact_patterns`` collection."""
        assert self.extractor.fact_patterns is not None
        assert len(self.extractor.fact_patterns) > 0

    # ------------------------------------------------------------------
    # extract_facts
    # ------------------------------------------------------------------

    def test_extract_facts_definition(self):
        """Two "X is a Y" sentences yield facts, at least one a definition."""
        text = "John Smith is a software engineer. Python is a programming language."
        facts = self.extractor.extract_facts(text)
        assert len(facts) >= 2
        # Check for definition pattern matches
        definition_facts = [f for f in facts if f["type"] == "definition"]
        assert len(definition_facts) >= 1

    def test_extract_facts_temporal(self):
        """Sentences mentioning years yield at least one temporal fact."""
        text = "John was born in 1990. The company was founded in 2010."
        facts = self.extractor.extract_facts(text)
        temporal_facts = [f for f in facts if f["type"] == "temporal"]
        assert len(temporal_facts) >= 1

    def test_extract_facts_attribution(self):
        """"X invented/developed Y" sentences yield an attribution fact."""
        text = "John invented the widget. Mary developed the software."
        facts = self.extractor.extract_facts(text)
        attribution_facts = [f for f in facts if f["type"] == "attribution"]
        assert len(attribution_facts) >= 1

    def test_extract_facts_numeric(self):
        """Sentences with dollar amounts yield at least one numeric fact."""
        text = "The car costs $25,000. The house is worth $500,000."
        facts = self.extractor.extract_facts(text)
        numeric_facts = [f for f in facts if f["type"] == "numeric"]
        assert len(numeric_facts) >= 1

    def test_extract_facts_location(self):
        """"lives in" / "located in" sentences yield a location fact."""
        text = "John lives in San Francisco. The office is located in New York."
        facts = self.extractor.extract_facts(text)
        location_facts = [f for f in facts if f["type"] == "location"]
        assert len(location_facts) >= 1

    def test_extract_facts_entity(self):
        """Text containing capitalized names yields at least one entity fact."""
        text = "John Smith works at Google Inc. He uses Python programming."
        facts = self.extractor.extract_facts(text)
        entity_facts = [f for f in facts if f["type"] == "entity"]
        assert len(entity_facts) >= 1

    # ------------------------------------------------------------------
    # _extract_noun_phrases
    # ------------------------------------------------------------------

    def test_extract_noun_phrases(self):
        """Capitalized single- and multi-word names are returned as phrases."""
        text = (
            "John Smith is a software engineer at Google. "
            "He works on Python Projects."
        )
        phrases = self.extractor._extract_noun_phrases(text)
        assert "John Smith" in phrases
        assert "Google" in phrases
        assert "Python Projects" in phrases

    def test_extract_noun_phrases_capitalized(self):
        """An all-lowercase sentence yields no noun phrases."""
        text = "the quick brown fox jumps over the lazy dog"
        phrases = self.extractor._extract_noun_phrases(text)
        # Should be empty since no capitalized words
        assert len(phrases) == 0

    # ------------------------------------------------------------------
    # extract_key_terms
    # ------------------------------------------------------------------

    def test_extract_key_terms(self):
        """``top_k`` caps the result and content words are among the terms."""
        text = (
            "Python is a programming language used for software development "
            "and data analysis."
        )
        terms = self.extractor.extract_key_terms(text, top_k=5)
        assert len(terms) <= 5
        # Candidates are programming, language, software, development, data,
        # analysis; only the three below are asserted.
        term_words = [term[0] for term in terms]
        assert "programming" in term_words
        assert "language" in term_words
        assert "software" in term_words

    def test_extract_key_terms_stopwords_filtered(self):
        """Common stopwords never appear among the extracted key terms."""
        text = "This is a test of the system that should work properly."
        terms = self.extractor.extract_key_terms(text)
        term_words = [term[0] for term in terms]
        # Stopwords should not appear
        assert "this" not in term_words
        assert "is" not in term_words
        assert "a" not in term_words
        assert "of" not in term_words
        assert "the" not in term_words

    # ------------------------------------------------------------------
    # extract_relationships
    # ------------------------------------------------------------------

    def test_extract_relationships_employment(self):
        """"works for" / "is employed by" yields an employment relationship."""
        text = "John works for Google. Mary is employed by Microsoft."
        relationships = self.extractor.extract_relationships(text)
        employment_rels = [r for r in relationships if r["type"] == "employment"]
        assert len(employment_rels) >= 1

    def test_extract_relationships_ownership(self):
        """"owns" / "has" sentences yield an ownership relationship."""
        text = "John owns a car. Mary has a house."
        relationships = self.extractor.extract_relationships(text)
        ownership_rels = [r for r in relationships if r["type"] == "ownership"]
        assert len(ownership_rels) >= 1

    def test_extract_relationships_location(self):
        """"located in" / "belongs to" yields a location relationship."""
        text = "John located in New York. The factory belongs to Google."
        relationships = self.extractor.extract_relationships(text)
        location_rels = [r for r in relationships if r["type"] == "location"]
        assert len(location_rels) >= 1

    def test_extract_relationships_usage(self):
        """"uses" / "implements" sentences yield a usage relationship."""
        text = "John uses Python. The company implements agile methodology."
        relationships = self.extractor.extract_relationships(text)
        usage_rels = [r for r in relationships if r["type"] == "usage"]
        assert len(usage_rels) >= 1

    # ------------------------------------------------------------------
    # extract_metadata
    # ------------------------------------------------------------------

    def test_extract_metadata(self):
        """Counts, URLs, e-mails, dates and numbers are found in plain prose."""
        text = (
            "This is a test document. "
            "It contains some information about Python programming. "
            "You can visit https://python.org for more details. "
            "Contact john@example.com for questions. "
            "The project started in 2020 and costs $10,000."
        )
        metadata = self.extractor.extract_metadata(text)
        assert metadata["word_count"] > 0
        assert metadata["sentence_count"] > 0
        assert metadata["avg_words_per_sentence"] > 0
        assert len(metadata["urls"]) > 0
        assert len(metadata["email_addresses"]) > 0
        assert len(metadata["dates"]) > 0
        assert len(metadata["numeric_values"]) > 0
        assert metadata["has_code"] is False  # No code in this text

    def test_extract_metadata_with_code(self):
        """Text embedding a Python function definition sets ``has_code``."""
        text = "Here is a function: def hello(): print('Hello, world!')"
        metadata = self.extractor.extract_metadata(text)
        assert metadata["has_code"] is True

    def test_extract_metadata_with_questions(self):
        """Text made of questions sets ``has_questions``."""
        text = "What is Python? How does it work? Why use it?"
        metadata = self.extractor.extract_metadata(text)
        assert metadata["has_questions"] is True

    # ------------------------------------------------------------------
    # categorize_content
    # ------------------------------------------------------------------

    def test_categorize_content_programming(self):
        """Programming vocabulary maps to the ``programming`` category."""
        text = (
            "Python is a programming language used for code development "
            "and debugging."
        )
        categories = self.extractor.categorize_content(text)
        assert "programming" in categories

    def test_categorize_content_data(self):
        """Database/statistics vocabulary maps to the ``data`` category."""
        text = "The database contains records and tables with statistical analysis."
        categories = self.extractor.categorize_content(text)
        assert "data" in categories

    def test_categorize_content_documentation(self):
        """Guide/tutorial vocabulary maps to the ``documentation`` category."""
        text = "This guide explains how to use the tutorial and manual."
        categories = self.extractor.categorize_content(text)
        assert "documentation" in categories

    def test_categorize_content_configuration(self):
        """Settings/setup vocabulary maps to the ``configuration`` category."""
        text = "Configure the settings and setup the deployment environment."
        categories = self.extractor.categorize_content(text)
        assert "configuration" in categories

    def test_categorize_content_testing(self):
        """Test/validate vocabulary maps to the ``testing`` category."""
        text = "Run the tests to validate the functionality and verify quality."
        categories = self.extractor.categorize_content(text)
        assert "testing" in categories

    def test_categorize_content_research(self):
        """Study/findings vocabulary maps to the ``research`` category."""
        text = "The study investigates findings and results from the analysis."
        categories = self.extractor.categorize_content(text)
        assert "research" in categories

    def test_categorize_content_planning(self):
        """Schedule/milestone vocabulary maps to the ``planning`` category."""
        text = "Plan the project schedule with milestones and timeline."
        categories = self.extractor.categorize_content(text)
        assert "planning" in categories

    def test_categorize_content_general(self):
        """Text without category keywords falls back to ``general``."""
        text = "This is some random text without specific keywords."
        categories = self.extractor.categorize_content(text)
        assert "general" in categories

    # ------------------------------------------------------------------
    # Empty-input edge cases
    # ------------------------------------------------------------------

    def test_extract_facts_empty_text(self):
        """An empty string produces no facts."""
        facts = self.extractor.extract_facts("")
        assert len(facts) == 0

    def test_extract_key_terms_empty_text(self):
        """An empty string produces no key terms."""
        terms = self.extractor.extract_key_terms("")
        assert len(terms) == 0

    def test_extract_relationships_empty_text(self):
        """An empty string produces no relationships."""
        relationships = self.extractor.extract_relationships("")
        assert len(relationships) == 0

    def test_extract_metadata_empty_text(self):
        """An empty string reports zero words, sentences and average length."""
        metadata = self.extractor.extract_metadata("")
        assert metadata["word_count"] == 0
        assert metadata["sentence_count"] == 0
        assert metadata["avg_words_per_sentence"] == 0.0