|
|
|
#!/usr/bin/env python3
|
|
"""
|
|
Autonomous Agent Complex Task Testing Framework
|
|
Tests agent capability to execute and complete multi-step tasks independently
|
|
"""
|
|
|
|
import json
import subprocess
import time
from collections import deque
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List
|
|
|
|
class TaskComplexity(Enum):
    """Difficulty tier of a test case; the value doubles as a 1-5 rating."""

    BASIC = 1         # single-step file/directory operations
    INTERMEDIATE = 2  # multi-step data processing or a single API call
    ADVANCED = 3      # pipelines, parallel fetches, nested error handling
    EXPERT = 4        # multi-stage pipelines with state/retry logic
    EXTREME = 5       # full systems (ETL, refactoring, test generation)
|
|
|
|
@dataclass
class TestCase:
    """Declarative description of one agent task plus how to verify it."""

    id: str                                    # unique identifier, e.g. "T001"
    name: str                                  # human-readable title
    complexity: TaskComplexity                 # difficulty tier (1-5)
    task_description: str                      # prompt passed verbatim to the agent
    verification_checks: List[Dict[str, Any]]  # consumed by AgentTester.verify_output
    timeout_seconds: int                       # wall-clock limit for the agent run
    expected_steps: List[str]                  # informational; printed, not enforced
    success_criteria: Dict[str, Any]           # e.g. {'min_verifications': N}
|
|
|
|
@dataclass
class TestResult:
    """Outcome of a single TestCase execution."""

    test_id: str                           # matches TestCase.id
    success: bool                          # overall pass/fail per success_criteria
    execution_time: float                  # seconds spent running the agent
    steps_completed: List[str]             # currently always [] (not tracked)
    verification_results: Dict[str, bool]  # check name -> passed
    error_message: str = ""
    agent_output: str = ""                 # truncated output preview
    # FIX: was `output_tail: List[str] = None`, a None default on a list-typed
    # field (violates the annotation and forces None checks downstream).
    # default_factory gives each instance its own fresh list.
    output_tail: List[str] = field(default_factory=list)
|
|
|
|
class AgentTester:
|
|
def __init__(self, log_file="agent_test_results.json", tail_lines=20):
|
|
self.log_file = log_file
|
|
self.results = []
|
|
self.tail_lines = tail_lines
|
|
|
|
def cleanup_directory(self):
|
|
"""Delete all files in current directory except test.py"""
|
|
import os
|
|
import shutil
|
|
print(" -> Cleaning up directory...")
|
|
for item in os.listdir('.'):
|
|
if item == 'test.py':
|
|
continue
|
|
try:
|
|
if os.path.isfile(item) or os.path.islink(item):
|
|
os.unlink(item)
|
|
elif os.path.isdir(item):
|
|
shutil.rmtree(item)
|
|
except Exception as e:
|
|
print(f" ! Failed to delete {item}: {e}")
|
|
|
|
def execute_agent_task(self, task: str, timeout: int) -> tuple[str, float, List[str]]:
|
|
"""Execute agent command and return output with timing and tail"""
|
|
start_time = time.time()
|
|
output_lines = []
|
|
tail_buffer = deque(maxlen=self.tail_lines)
|
|
db_file = ".r.db"
|
|
|
|
try:
|
|
process = subprocess.Popen(
|
|
['r', task],
|
|
stdout=subprocess.PIPE,
|
|
stderr=subprocess.STDOUT,
|
|
text=True,
|
|
bufsize=1
|
|
)
|
|
|
|
while True:
|
|
line = process.stdout.readline()
|
|
if line == '' and process.poll() is not None:
|
|
break
|
|
if line:
|
|
output_lines.append(line.rstrip())
|
|
tail_buffer.append(line.rstrip())
|
|
print(f" │ {line.rstrip()}")
|
|
|
|
process.wait(timeout=timeout)
|
|
execution_time = time.time() - start_time
|
|
full_output = '\\n'.join(output_lines)
|
|
tail = list(tail_buffer)
|
|
|
|
return full_output, execution_time, tail
|
|
|
|
except subprocess.TimeoutExpired:
|
|
process.kill()
|
|
execution_time = time.time() - start_time
|
|
return "TIMEOUT_ERROR", execution_time, list(tail_buffer)
|
|
except Exception as e:
|
|
execution_time = time.time() - start_time
|
|
return f"EXECUTION_ERROR: {str(e)}", execution_time, []
|
|
|
|
def verify_output(self, output: str, checks: List[Dict[str, Any]]) -> Dict[str, bool]:
|
|
"""Run verification checks on agent output"""
|
|
import os
|
|
import re
|
|
|
|
results = {}
|
|
for check in checks:
|
|
check_type = check['type']
|
|
check_name = check['name']
|
|
|
|
try:
|
|
if check_type == 'contains':
|
|
results[check_name] = check['value'] in output
|
|
|
|
elif check_type == 'not_contains':
|
|
results[check_name] = check['value'] not in output
|
|
|
|
elif check_type == 'file_exists':
|
|
results[check_name] = os.path.exists(check['path'])
|
|
|
|
elif check_type == 'command_success':
|
|
cmd_result = subprocess.run(
|
|
check['command'],
|
|
shell=True,
|
|
capture_output=True,
|
|
timeout=10
|
|
)
|
|
results[check_name] = cmd_result.returncode == 0
|
|
|
|
elif check_type == 'json_valid':
|
|
# FIX: Validate specific JSON file if path provided, otherwise validate output
|
|
if 'path' in check:
|
|
if os.path.exists(check['path']):
|
|
with open(check['path'], 'r') as f:
|
|
json.load(f) # Raises JSONDecodeError if invalid
|
|
results[check_name] = True
|
|
else:
|
|
results[check_name] = False
|
|
else:
|
|
# Fallback: validate output (likely to fail for agent logs)
|
|
json.loads(output)
|
|
results[check_name] = True
|
|
|
|
elif check_type == 'step_count':
|
|
step_count = len([l for l in output.split('\\n') if l.strip()])
|
|
results[check_name] = step_count >= check['min_steps']
|
|
|
|
elif check_type == 'regex_match':
|
|
results[check_name] = bool(re.search(check['pattern'], output))
|
|
|
|
elif check_type == 'line_count':
|
|
if os.path.exists(check['path']):
|
|
with open(check['path'], 'r') as f:
|
|
count = len(f.readlines())
|
|
results[check_name] = count >= check.get('min', 0) and count <= check.get('max', float('inf'))
|
|
else:
|
|
results[check_name] = False
|
|
|
|
except json.JSONDecodeError:
|
|
results[check_name] = False
|
|
except Exception:
|
|
results[check_name] = False
|
|
|
|
return results
|
|
|
|
def run_test(self, test: TestCase) -> TestResult:
|
|
"""Execute a single test case"""
|
|
self.cleanup_directory()
|
|
print(f"\n{'═'*70}")
|
|
print(f"[{test.id}] {test.name}")
|
|
print(f"{'═'*70}")
|
|
print(f"Complexity: {test.complexity.name} ({test.complexity.value}/5)")
|
|
print(f"Timeout: {test.timeout_seconds}s")
|
|
print(f"\nTask Description:")
|
|
print(f" {test.task_description}")
|
|
print(f"\nExpected Steps: {' → '.join(test.expected_steps)}")
|
|
print(f"\n{'─'*70}")
|
|
print("Agent Output (live stream):")
|
|
print(f"{'─'*70}")
|
|
|
|
output, exec_time, tail = self.execute_agent_task(
|
|
test.task_description,
|
|
test.timeout_seconds
|
|
)
|
|
|
|
print(f"{'─'*70}")
|
|
print(f"\nOutput Tail (last {self.tail_lines} lines):")
|
|
print(f"{'─'*70}")
|
|
for i, line in enumerate(tail, 1):
|
|
print(f" {i:2d} │ {line}")
|
|
print(f"{'─'*70}")
|
|
|
|
verification_results = self.verify_output(output, test.verification_checks)
|
|
|
|
# Determine success based on criteria
|
|
success = all(verification_results.values())
|
|
if test.success_criteria.get('min_verifications'):
|
|
passed = sum(verification_results.values())
|
|
required = test.success_criteria['min_verifications']
|
|
success = passed >= required
|
|
|
|
result = TestResult(
|
|
test_id=test.id,
|
|
success=success,
|
|
execution_time=exec_time,
|
|
steps_completed=[],
|
|
verification_results=verification_results,
|
|
agent_output=output[:1000],
|
|
output_tail=tail
|
|
)
|
|
|
|
self.results.append(result)
|
|
|
|
print(f"\nVerification Results:")
|
|
for check_name, passed in verification_results.items():
|
|
status = "✓ PASS" if passed else "✗ FAIL"
|
|
print(f" {status}: {check_name}")
|
|
|
|
print(f"\n{'═'*70}")
|
|
print(f"Result: {'✓ SUCCESS' if success else '✗ FAILED'}")
|
|
print(f"Execution time: {exec_time:.2f}s")
|
|
print(f"Verifications: {sum(verification_results.values())}/{len(verification_results)} passed")
|
|
print(f"{'═'*70}")
|
|
if not success:
|
|
print("FAILURE OF TESTS, NOT ALL TESTS COMPLETED SUCCESFULLY. PLEASE OPTIMIZE THE AGENT EXECUTING THIS.")
|
|
exit()
|
|
return result
|
|
|
|
def save_results(self):
|
|
"""Save test results to JSON file"""
|
|
output = {
|
|
'timestamp': datetime.now().isoformat(),
|
|
'total_tests': len(self.results),
|
|
'passed': sum(1 for r in self.results if r.success),
|
|
'failed': sum(1 for r in self.results if not r.success),
|
|
'average_execution_time': sum(r.execution_time for r in self.results) / len(self.results) if self.results else 0,
|
|
'results': [
|
|
{
|
|
'test_id': r.test_id,
|
|
'success': r.success,
|
|
'execution_time': r.execution_time,
|
|
'verifications': r.verification_results,
|
|
'output_preview': r.agent_output,
|
|
'output_tail': r.output_tail
|
|
}
|
|
for r in self.results
|
|
]
|
|
}
|
|
|
|
with open(self.log_file, 'w') as f:
|
|
json.dump(output, f, indent=2)
|
|
|
|
print(f"\n{'='*70}")
|
|
print(f"TEST RESULTS SUMMARY")
|
|
print(f"{'='*70}")
|
|
print(f"Total Tests: {output['total_tests']}")
|
|
print(f"Passed: {output['passed']} ({output['passed']/output['total_tests']*100:.1f}%)")
|
|
print(f"Failed: {output['failed']}")
|
|
print(f"Average Execution Time: {output['average_execution_time']:.2f}s")
|
|
print(f"Results saved to: {self.log_file}")
|
|
print(f"{'='*70}")
|
|
|
|
# Define Test Suite
# Fifteen TestCases, ordered by complexity tier (BASIC → EXTREME). Each case
# supplies the agent prompt, verification checks (see AgentTester.verify_output
# for check types), a timeout, and the minimum number of checks that must pass.
TEST_SUITE = [
    # BASIC COMPLEXITY
    TestCase(
        id="T001",
        name="Simple File Creation",
        complexity=TaskComplexity.BASIC,
        task_description="Create a file named test_output.txt with the text 'Hello World'",
        verification_checks=[
            {'type': 'file_exists', 'name': 'file_created', 'path': 'test_output.txt'},
            {'type': 'command_success', 'name': 'content_correct',
             'command': 'grep -q "Hello World" test_output.txt'}
        ],
        timeout_seconds=30,
        expected_steps=['create_file', 'write_content'],
        success_criteria={'min_verifications': 2}
    ),

    TestCase(
        id="T002",
        name="Directory Operations with File Hierarchy",
        complexity=TaskComplexity.BASIC,
        task_description="Create a directory called test_dir with two subdirectories named src and docs, create three empty files inside src named file1.txt, file2.txt, file3.txt, and create a README.md in docs with the text 'Documentation folder', then list the entire directory tree",
        verification_checks=[
            {'type': 'file_exists', 'name': 'dir_exists', 'path': 'test_dir'},
            {'type': 'file_exists', 'name': 'src_exists', 'path': 'test_dir/src'},
            {'type': 'file_exists', 'name': 'docs_exists', 'path': 'test_dir/docs'},
            {'type': 'file_exists', 'name': 'file1_exists', 'path': 'test_dir/src/file1.txt'},
            {'type': 'file_exists', 'name': 'file2_exists', 'path': 'test_dir/src/file2.txt'},
            {'type': 'file_exists', 'name': 'file3_exists', 'path': 'test_dir/src/file3.txt'},
            {'type': 'file_exists', 'name': 'readme_exists', 'path': 'test_dir/docs/README.md'},
        ],
        timeout_seconds=45,
        expected_steps=['mkdir_nested', 'touch_files', 'create_readme', 'tree_list'],
        success_criteria={'min_verifications': 6}
    ),

    # INTERMEDIATE COMPLEXITY
    TestCase(
        id="T003",
        name="Advanced Data Processing with Statistics",
        complexity=TaskComplexity.INTERMEDIATE,
        task_description="Create a CSV file with 10 rows of sample employee data (name,age,city,salary) named data.csv, then read it and calculate the average age, median salary, and count of employees per city, and write detailed statistics to summary.txt with proper formatting",
        verification_checks=[
            {'type': 'file_exists', 'name': 'csv_created', 'path': 'data.csv'},
            {'type': 'file_exists', 'name': 'summary_created', 'path': 'summary.txt'},
            # 10 data rows plus optional header/trailing newline slack.
            {'type': 'line_count', 'name': 'csv_has_rows', 'path': 'data.csv', 'min': 10, 'max': 12},
            {'type': 'command_success', 'name': 'summary_has_stats',
             'command': 'grep -iqE "(average|median|count)" summary.txt'},
            {'type': 'command_success', 'name': 'summary_has_numbers',
             'command': 'grep -qE "[0-9]+" summary.txt'}
        ],
        timeout_seconds=75,
        expected_steps=['create_csv', 'write_data', 'read_csv', 'calculate_stats', 'format_summary', 'write_summary'],
        success_criteria={'min_verifications': 4}
    ),

    TestCase(
        id="T004",
        name="API Request with Data Transformation and Caching",
        complexity=TaskComplexity.INTERMEDIATE,
        task_description="Make a GET request to https://api.github.com/repos/torvalds/linux, extract the stargazers_count, forks_count, and open_issues_count fields, calculate the engagement ratio (stars/forks), save raw JSON to cache.json, and create a formatted report in repo_stats.txt with all metrics",
        verification_checks=[
            {'type': 'file_exists', 'name': 'cache_exists', 'path': 'cache.json'},
            {'type': 'file_exists', 'name': 'stats_exists', 'path': 'repo_stats.txt'},
            {'type': 'json_valid', 'name': 'valid_cache_json', 'path': 'cache.json'},  # FIXED: Added path
            {'type': 'command_success', 'name': 'has_metrics',
             'command': 'grep -iqE "(stars|forks|ratio)" repo_stats.txt'},
        ],
        timeout_seconds=60,
        expected_steps=['api_request', 'parse_json', 'extract_fields', 'calculate_ratio', 'cache_data', 'write_report'],
        success_criteria={'min_verifications': 3}
    ),

    TestCase(
        id="T005",
        name="Web Scraping with Retry Logic",
        complexity=TaskComplexity.INTERMEDIATE,
        task_description="Fetch the public API at https://jsonplaceholder.typicode.com/users, extract all email domains, count occurrences of each domain, sort by frequency, and save to domains.txt. If the request fails, retry up to 3 times with exponential backoff",
        verification_checks=[
            {'type': 'file_exists', 'name': 'output_exists', 'path': 'domains.txt'},
            {'type': 'command_success', 'name': 'has_domains',
             'command': 'grep -qE "@" domains.txt'},
            {'type': 'command_success', 'name': 'has_counts',
             'command': 'grep -qE "[0-9]+" domains.txt'},
        ],
        timeout_seconds=90,
        expected_steps=['api_request', 'parse_users', 'extract_domains', 'count_frequency', 'sort_results', 'write_output'],
        success_criteria={'min_verifications': 2}
    ),

    # ADVANCED COMPLEXITY
    TestCase(
        id="T006",
        name="Conditional Logic with Nested Error Handling",
        complexity=TaskComplexity.ADVANCED,
        task_description="Try to read a file called config.json. If it doesn't exist, create it with default configuration {'debug': true, 'timeout': 30, 'retry': 3}. Then validate the JSON structure, check if all required keys exist, append a timestamp field, and create a backup file config.backup.json. Write validation results to validation.log",
        verification_checks=[
            {'type': 'file_exists', 'name': 'config_exists', 'path': 'config.json'},
            {'type': 'file_exists', 'name': 'backup_exists', 'path': 'config.backup.json'},
            {'type': 'file_exists', 'name': 'log_exists', 'path': 'validation.log'},
            {'type': 'json_valid', 'name': 'valid_json', 'path': 'config.json'},  # FIXED: Added path
            {'type': 'command_success', 'name': 'has_timestamp',
             'command': 'grep -q "timestamp" config.json'}
        ],
        timeout_seconds=60,
        expected_steps=['check_file', 'create_default', 'validate_structure', 'append_timestamp', 'create_backup', 'log_validation'],
        success_criteria={'min_verifications': 4}
    ),

    TestCase(
        id="T007",
        name="Multi-Format Data Pipeline with Transformations",
        complexity=TaskComplexity.ADVANCED,
        task_description="Create a JSON file with 5 product entries (id, name, price, category, stock), convert it to CSV format, filter products where stock > 0, apply a 10% discount to all prices, convert the result to a markdown table with formatted prices ($XX.XX), and save it to products_report.md. Also generate a JSON summary with total_products, total_value, and categories array",
        verification_checks=[
            {'type': 'file_exists', 'name': 'json_exists', 'path': 'products.json'},
            {'type': 'file_exists', 'name': 'csv_exists', 'path': 'products.csv'},
            {'type': 'file_exists', 'name': 'markdown_exists', 'path': 'products_report.md'},
            {'type': 'file_exists', 'name': 'summary_exists', 'path': 'summary.json'},
            {'type': 'command_success', 'name': 'markdown_has_table',
             'command': 'grep -q "|" products_report.md'},
            {'type': 'command_success', 'name': 'has_dollar_signs',
             'command': 'grep -q "$" products_report.md'},
            {'type': 'json_valid', 'name': 'valid_summary_json', 'path': 'summary.json'}  # FIXED: Added path
        ],
        timeout_seconds=120,
        expected_steps=['create_json', 'json_to_csv', 'filter_stock', 'apply_discount', 'format_prices', 'csv_to_markdown', 'generate_summary'],
        success_criteria={'min_verifications': 5}
    ),

    TestCase(
        id="T008",
        name="Parallel Data Processing with Aggregation",
        complexity=TaskComplexity.ADVANCED,
        task_description="Fetch data from https://jsonplaceholder.typicode.com/posts and https://jsonplaceholder.typicode.com/comments simultaneously, join them based on postId, count comments per post, identify the top 5 most commented posts, and create a detailed HTML report (report.html) with a table and summary statistics",
        verification_checks=[
            {'type': 'file_exists', 'name': 'report_exists', 'path': 'report.html'},
            {'type': 'command_success', 'name': 'has_table',
             'command': 'grep -q "<table>" report.html'},
            {'type': 'command_success', 'name': 'has_html_structure',
             'command': 'grep -q "</html>" report.html'},
            {'type': 'command_success', 'name': 'has_comments_data',
             'command': 'grep -qE "comment" report.html'},
        ],
        timeout_seconds=150,
        expected_steps=['parallel_fetch', 'join_data', 'count_comments', 'find_top_5', 'generate_html', 'write_report'],
        success_criteria={'min_verifications': 3}
    ),

    # EXPERT COMPLEXITY
    TestCase(
        id="T009",
        name="Multi-Stage Data Pipeline with Error Recovery",
        complexity=TaskComplexity.EXPERT,
        task_description="Create a complete data pipeline: 1) Download data from https://jsonplaceholder.typicode.com/posts, 2) Filter posts with userId=1, 3) Extract titles and bodies, 4) Calculate word count for each, 5) Sort by word count descending, 6) Save to processed_posts.txt, 7) Create a summary.json with total_posts, average_word_count, longest_title, and first 3 titles, 8) Generate a CSV with columns: id, title_length, body_word_count, 9) Create execution_log.txt documenting each pipeline stage with timestamps",
        verification_checks=[
            {'type': 'file_exists', 'name': 'processed_exists', 'path': 'processed_posts.txt'},
            {'type': 'file_exists', 'name': 'summary_exists', 'path': 'summary.json'},
            {'type': 'file_exists', 'name': 'csv_exists', 'path': 'pipeline_data.csv'},
            {'type': 'file_exists', 'name': 'log_exists', 'path': 'execution_log.txt'},
            {'type': 'json_valid', 'name': 'valid_json', 'path': 'summary.json'},  # FIXED: Added path
            {'type': 'command_success', 'name': 'has_posts',
             'command': 'test $(wc -l < processed_posts.txt) -ge 5'},
            {'type': 'command_success', 'name': 'csv_has_header',
             'command': 'head -1 pipeline_data.csv | grep -q ","'},
            {'type': 'command_success', 'name': 'log_has_timestamps',
             'command': 'grep -qE "[0-9]{4}-[0-9]{2}-[0-9]{2}" execution_log.txt'},
        ],
        timeout_seconds=180,
        expected_steps=['download', 'filter', 'extract', 'calculate_words', 'sort', 'save', 'create_summary', 'generate_csv', 'log_execution'],
        success_criteria={'min_verifications': 6}
    ),

    TestCase(
        id="T010",
        name="Self-Correcting Script with Comprehensive Error Handling",
        complexity=TaskComplexity.EXPERT,
        task_description="Create a Python script named safe_calculator.py that: 1) Reads two numbers from input_data.txt (one per line), 2) Performs division, multiplication, and power operations, 3) Handles FileNotFoundError by creating input_data.txt with default values [10, 2], 4) Handles ZeroDivisionError gracefully, 5) Handles ValueError for non-numeric input, 6) Writes results to results.txt, 7) Writes detailed error log to error.log with timestamps and stack traces, 8) Includes unit tests in the script that can be run with pytest",
        verification_checks=[
            {'type': 'file_exists', 'name': 'script_created', 'path': 'safe_calculator.py'},
            {'type': 'command_success', 'name': 'script_runnable',
             'command': 'python3 -m py_compile safe_calculator.py'},
            {'type': 'command_success', 'name': 'has_error_handling',
             'command': 'grep -q "except" safe_calculator.py'},
            {'type': 'command_success', 'name': 'has_logging',
             'command': 'grep -qE "(logging|error)" safe_calculator.py'},
        ],
        timeout_seconds=120,
        expected_steps=['create_script', 'add_file_handling', 'add_zero_division', 'add_value_error', 'add_logging', 'add_tests', 'test_execution'],
        success_criteria={'min_verifications': 3}
    ),

    TestCase(
        id="T011",
        name="Repository Analysis with Statistical Modeling",
        complexity=TaskComplexity.EXPERT,
        task_description="Analyze the current directory structure: 1) Find all Python files recursively, 2) Count total lines, comment lines, and code lines in each, 3) Calculate complexity metrics (functions per file, average function length), 4) Identify files with highest complexity, 5) Create detailed_report.txt with per-file analysis, 6) Create metrics.csv with columns: filename, total_lines, code_lines, comment_ratio, function_count, 7) Create summary.json with aggregate statistics and recommendations, 8) Generate a bar chart data file (visualization_data.csv) suitable for plotting",
        verification_checks=[
            {'type': 'file_exists', 'name': 'report_exists', 'path': 'detailed_report.txt'},
            {'type': 'file_exists', 'name': 'metrics_exists', 'path': 'metrics.csv'},
            {'type': 'file_exists', 'name': 'summary_exists', 'path': 'summary.json'},
            {'type': 'file_exists', 'name': 'viz_exists', 'path': 'visualization_data.csv'},
            {'type': 'json_valid', 'name': 'valid_summary_json', 'path': 'summary.json'},  # FIXED: Added path
            {'type': 'command_success', 'name': 'csv_has_header',
             'command': 'head -1 metrics.csv | grep -q "filename"'},
            {'type': 'command_success', 'name': 'report_has_analysis',
             'command': 'grep -qE "(lines|functions|complexity)" detailed_report.txt'},
        ],
        timeout_seconds=150,
        expected_steps=['scan_directory', 'find_python_files', 'analyze_each_file', 'calculate_metrics', 'identify_complex', 'create_report', 'generate_csv', 'create_summary', 'generate_viz_data'],
        success_criteria={'min_verifications': 5}
    ),

    TestCase(
        id="T012",
        name="Distributed Task Simulation with State Management",
        complexity=TaskComplexity.EXPERT,
        task_description="Simulate a distributed job queue: 1) Create 10 'job' files (job_1.txt to job_10.txt) with random task descriptions, 2) Process each job sequentially, simulating work with sleep, 3) Track state in state.json (pending, processing, completed), 4) Handle 'failures' for jobs 3 and 7 (retry up to 3 times), 5) Log all state transitions to transitions.log with timestamps, 6) Create final_report.txt with success/failure counts, total processing time, and retry statistics, 7) Clean up successful job files but keep failed ones",
        verification_checks=[
            {'type': 'file_exists', 'name': 'state_exists', 'path': 'state.json'},
            {'type': 'file_exists', 'name': 'log_exists', 'path': 'transitions.log'},
            {'type': 'file_exists', 'name': 'report_exists', 'path': 'final_report.txt'},
            {'type': 'json_valid', 'name': 'valid_state_json', 'path': 'state.json'},  # FIXED: Added path
            {'type': 'command_success', 'name': 'has_transitions',
             'command': 'grep -qE "(pending|processing|completed)" transitions.log'},
            {'type': 'command_success', 'name': 'has_statistics',
             'command': 'grep -qE "(success|failure|retry)" final_report.txt'},
        ],
        timeout_seconds=200,
        expected_steps=['create_jobs', 'init_state', 'process_queue', 'handle_failures', 'retry_logic', 'log_transitions', 'generate_report', 'cleanup'],
        success_criteria={'min_verifications': 4}
    ),

    # EXTREME COMPLEXITY
    TestCase(
        id="T013",
        name="Full-Stack Data Application with ETL Pipeline",
        complexity=TaskComplexity.EXTREME,
        task_description="Build a complete ETL system: 1) Extract data from multiple APIs (GitHub repos, JSONPlaceholder posts/users), 2) Transform data by normalizing structures, joining related data, calculating derived metrics, 3) Load into an SQLite database with proper schema (tables: repositories, posts, users, metrics), 4) Create database indexes for performance, 5) Generate SQL views for common queries, 6) Export aggregated data to multiple formats (JSON, CSV, Markdown report), 7) Create a Python query script (query_db.py) with functions to search the database, 8) Generate comprehensive documentation (README.md) with schema diagram and usage examples, 9) Create validation tests and execution log",
        verification_checks=[
            {'type': 'file_exists', 'name': 'db_exists', 'path': 'data.db'},
            {'type': 'file_exists', 'name': 'query_script_exists', 'path': 'query_db.py'},
            {'type': 'file_exists', 'name': 'readme_exists', 'path': 'README.md'},
            {'type': 'file_exists', 'name': 'json_export_exists', 'path': 'export_data.json'},
            {'type': 'file_exists', 'name': 'csv_export_exists', 'path': 'export_data.csv'},
            {'type': 'command_success', 'name': 'db_has_tables',
             'command': 'sqlite3 data.db ".tables" | grep -q "repositories"'},
            {'type': 'command_success', 'name': 'script_runnable',
             'command': 'python3 -m py_compile query_db.py'},
            {'type': 'command_success', 'name': 'readme_has_schema',
             'command': 'grep -qE "(schema|table|database)" README.md'},
        ],
        timeout_seconds=300,
        expected_steps=['extract_apis', 'transform_data', 'create_schema', 'load_database', 'create_indexes', 'create_views', 'export_formats', 'create_query_script', 'generate_docs', 'validate'],
        success_criteria={'min_verifications': 6}
    ),

    TestCase(
        id="T014",
        name="Autonomous Code Refactoring Agent",
        complexity=TaskComplexity.EXTREME,
        task_description="Create a code analysis and refactoring system: 1) Scan all Python files in current directory, 2) Identify code smells (long functions >50 lines, deep nesting >3 levels, duplicate code blocks), 3) Generate refactoring suggestions for each file, 4) Create refactored versions with suffix '_refactored.py', 5) Run automated tests to ensure functionality preserved, 6) Generate side-by-side diff reports (diff_report.html), 7) Calculate and compare complexity metrics before/after, 8) Create improvement_summary.json with metrics improvements, 9) Document refactoring patterns applied in patterns.md, 10) Generate rollback script (rollback.sh)",
        verification_checks=[
            {'type': 'file_exists', 'name': 'diff_report_exists', 'path': 'diff_report.html'},
            {'type': 'file_exists', 'name': 'summary_exists', 'path': 'improvement_summary.json'},
            {'type': 'file_exists', 'name': 'patterns_exists', 'path': 'patterns.md'},
            {'type': 'file_exists', 'name': 'rollback_exists', 'path': 'rollback.sh'},
            {'type': 'json_valid', 'name': 'valid_summary_json', 'path': 'improvement_summary.json'},  # FIXED: Added path
            {'type': 'command_success', 'name': 'has_html_structure',
             'command': 'grep -q "<html>" diff_report.html'},
            {'type': 'command_success', 'name': 'patterns_has_examples',
             'command': 'grep -qE "(before|after|pattern)" patterns.md'},
        ],
        timeout_seconds=400,
        expected_steps=['scan_files', 'detect_smells', 'generate_suggestions', 'refactor_code', 'run_tests', 'create_diffs', 'calculate_metrics', 'generate_summary', 'document_patterns', 'create_rollback'],
        success_criteria={'min_verifications': 5}
    ),

    TestCase(
        id="T015",
        name="Intelligent Testing Framework Generator",
        complexity=TaskComplexity.EXTREME,
        task_description="Build a meta-testing system: 1) Analyze all Python modules in current directory, 2) Extract functions and their signatures, 3) Infer parameter types and generate test cases, 4) Create pytest test files for each module (test_*.py), 5) Generate fixtures for common data types, 6) Create parametrized tests for edge cases (empty, null, boundary values), 7) Add mocking for external dependencies, 8) Generate test coverage report (coverage.html), 9) Create CI/CD configuration (.github/workflows/test.yml), 10) Generate comprehensive test documentation (test_guide.md) with examples",
        verification_checks=[
            {'type': 'file_exists', 'name': 'coverage_exists', 'path': 'coverage.html'},
            {'type': 'file_exists', 'name': 'ci_config_exists', 'path': '.github/workflows/test.yml'},
            {'type': 'file_exists', 'name': 'test_guide_exists', 'path': 'test_guide.md'},
            {'type': 'command_success', 'name': 'has_test_files',
             'command': 'ls test_*.py 2>/dev/null | head -1'},
            {'type': 'command_success', 'name': 'tests_runnable',
             'command': 'python3 -m py_compile test_*.py 2>/dev/null'},
            {'type': 'command_success', 'name': 'has_fixtures',
             'command': 'grep -q "@pytest.fixture" test_*.py 2>/dev/null'},
        ],
        timeout_seconds=350,
        expected_steps=['analyze_modules', 'extract_functions', 'infer_types', 'generate_tests', 'create_fixtures', 'add_parametrized', 'add_mocking', 'run_coverage', 'generate_ci_config', 'create_docs'],
        success_criteria={'min_verifications': 4}
    ),
]
|
|
|
|
def main():
    """Entry point: print the suite overview, run every test, save results."""
    tester = AgentTester(tail_lines=20)
    banner = "=" * 70

    print(banner)
    print("AUTONOMOUS AGENT COMPLEX TASK TEST SUITE")
    print(banner)
    print(f"Total test cases: {len(TEST_SUITE)}")
    print(f"Complexity levels: BASIC (1), INTERMEDIATE (2), ADVANCED (3), EXPERT (4), EXTREME (5)")
    print(f"Output tail length: {tester.tail_lines} lines")
    print(banner)

    # Bucket the suite by complexity tier name for the distribution table.
    grouped = {}
    for case in TEST_SUITE:
        grouped.setdefault(case.complexity.name, []).append(case)

    print("\nTest Distribution:")
    for tier in ('BASIC', 'INTERMEDIATE', 'ADVANCED', 'EXPERT', 'EXTREME'):
        count = len(grouped.get(tier, []))
        print(f" {tier}: {count} tests")
    print(banner)

    # Run every case; Ctrl-C stops the suite, any other error skips the case.
    for case in TEST_SUITE:
        try:
            tester.run_test(case)
            time.sleep(3)  # Pause between tests
        except KeyboardInterrupt:
            print("\n\nTest suite interrupted by user")
            break
        except Exception as e:
            print(f"ERROR running test {case.id}: {e}")
            import traceback
            traceback.print_exc()
            continue

    tester.save_results()
|
|
|
|
# Run the suite only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|