#!/usr/bin/env python3
"""
Autonomous Agent Complex Task Testing Framework
Tests agent capability to execute and complete multi-step tasks independently
"""

import json
import os
import re
import shutil
import subprocess
import sys
import time
from collections import deque
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List


class TaskComplexity(Enum):
    """Difficulty tiers for test cases, from trivial (1) to extreme (5)."""
    BASIC = 1
    INTERMEDIATE = 2
    ADVANCED = 3
    EXPERT = 4
    EXTREME = 5


@dataclass
class TestCase:
    """One agent task: the prompt, how to verify it, and pass criteria."""
    id: str                                   # unique identifier, e.g. "T001"
    name: str                                 # human-readable title
    complexity: TaskComplexity
    task_description: str                     # natural-language prompt passed to the agent
    verification_checks: List[Dict[str, Any]] # checks consumed by AgentTester.verify_output
    timeout_seconds: int                      # wall-clock budget for the agent run
    expected_steps: List[str]                 # informational only; printed, never verified
    success_criteria: Dict[str, Any]          # e.g. {'min_verifications': N}


@dataclass
class TestResult:
    """Outcome of a single TestCase run."""
    test_id: str
    success: bool
    execution_time: float
    steps_completed: List[str]
    verification_results: Dict[str, bool]
    error_message: str = ""
    agent_output: str = ""                    # truncated preview (first 1000 chars)
    # FIX: was `List[str] = None`; a default_factory gives a correct type and
    # avoids a None sentinel with a non-Optional annotation.
    output_tail: List[str] = field(default_factory=list)


class AgentTester:
    """Runs each TestCase through the external `r` agent CLI and verifies results."""

    def __init__(self, log_file="agent_test_results.json", tail_lines=20):
        self.log_file = log_file          # where save_results() writes the JSON summary
        self.results = []                 # accumulated TestResult objects
        self.tail_lines = tail_lines      # how many trailing output lines to keep/show

    def cleanup_directory(self):
        """Delete all files in current directory except test.py"""
        print(" -> Cleaning up directory...")
        for item in os.listdir('.'):
            if item == 'test.py':
                continue
            try:
                if os.path.isfile(item) or os.path.islink(item):
                    os.unlink(item)
                elif os.path.isdir(item):
                    shutil.rmtree(item)
            except Exception as e:
                # Best-effort cleanup: report and continue rather than abort the run.
                print(f" ! Failed to delete {item}: {e}")

    def execute_agent_task(self, task: str, timeout: int) -> tuple[str, float, List[str]]:
        """Execute agent command and return output with timing and tail.

        Returns (full_output, execution_time_seconds, tail_lines). On timeout the
        output is the sentinel "TIMEOUT_ERROR"; on launch failure it is
        "EXECUTION_ERROR: <reason>".
        """
        start_time = time.time()
        output_lines = []
        tail_buffer = deque(maxlen=self.tail_lines)
        try:
            process = subprocess.Popen(
                ['r', task],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1
            )
            while True:
                # FIX: enforce the timeout while streaming. The original only
                # passed `timeout` to process.wait(), which ran after the stream
                # had already been drained to EOF, so the timeout never fired.
                # (Best effort: the check runs between lines; a readline that
                # blocks with no output at all can still stall.)
                if time.time() - start_time > timeout:
                    raise subprocess.TimeoutExpired(process.args, timeout)
                line = process.stdout.readline()
                if line == '' and process.poll() is not None:
                    break
                if line:
                    stripped = line.rstrip()
                    output_lines.append(stripped)
                    tail_buffer.append(stripped)
                    print(f" │ {stripped}")
            process.wait(timeout=timeout)
            execution_time = time.time() - start_time
            # FIX: join with a real newline — was '\\n', the literal two
            # characters backslash+n, which corrupted multi-line output and
            # broke the step_count verification's line splitting.
            full_output = '\n'.join(output_lines)
            return full_output, execution_time, list(tail_buffer)
        except subprocess.TimeoutExpired:
            process.kill()
            execution_time = time.time() - start_time
            return "TIMEOUT_ERROR", execution_time, list(tail_buffer)
        except Exception as e:
            execution_time = time.time() - start_time
            return f"EXECUTION_ERROR: {str(e)}", execution_time, []

    def verify_output(self, output: str, checks: List[Dict[str, Any]]) -> Dict[str, bool]:
        """Run verification checks on agent output.

        Each check dict needs 'type' and 'name'; other keys depend on the type.
        Returns {check_name: passed}. Any exception in a check marks it failed.
        """
        results = {}
        for check in checks:
            check_type = check['type']
            check_name = check['name']
            try:
                if check_type == 'contains':
                    results[check_name] = check['value'] in output
                elif check_type == 'not_contains':
                    results[check_name] = check['value'] not in output
                elif check_type == 'file_exists':
                    results[check_name] = os.path.exists(check['path'])
                elif check_type == 'command_success':
                    cmd_result = subprocess.run(
                        check['command'],
                        shell=True,
                        capture_output=True,
                        timeout=10
                    )
                    results[check_name] = cmd_result.returncode == 0
                elif check_type == 'json_valid':
                    # Validate specific JSON file if path provided, otherwise
                    # validate the agent output itself.
                    if 'path' in check:
                        if os.path.exists(check['path']):
                            with open(check['path'], 'r') as f:
                                json.load(f)  # Raises JSONDecodeError if invalid
                            results[check_name] = True
                        else:
                            results[check_name] = False
                    else:
                        # Fallback: validate output (likely to fail for agent logs)
                        json.loads(output)
                        results[check_name] = True
                elif check_type == 'step_count':
                    # FIX: split on a real newline (was the literal '\\n',
                    # which never matched, collapsing everything to one step).
                    step_count = len([l for l in output.split('\n') if l.strip()])
                    results[check_name] = step_count >= check['min_steps']
                elif check_type == 'regex_match':
                    results[check_name] = bool(re.search(check['pattern'], output))
                elif check_type == 'line_count':
                    if os.path.exists(check['path']):
                        with open(check['path'], 'r') as f:
                            count = len(f.readlines())
                        results[check_name] = check.get('min', 0) <= count <= check.get('max', float('inf'))
                    else:
                        results[check_name] = False
            except json.JSONDecodeError:
                results[check_name] = False
            except Exception:
                results[check_name] = False
        return results

    def run_test(self, test: TestCase) -> TestResult:
        """Execute a single test case.

        Cleans the working directory, streams the agent run, verifies output,
        records a TestResult, and terminates the whole process on failure.
        """
        self.cleanup_directory()
        print(f"\n{'═'*70}")
        print(f"[{test.id}] {test.name}")
        print(f"{'═'*70}")
        print(f"Complexity: {test.complexity.name} ({test.complexity.value}/5)")
        print(f"Timeout: {test.timeout_seconds}s")
        print(f"\nTask Description:")
        print(f" {test.task_description}")
        print(f"\nExpected Steps: {' → '.join(test.expected_steps)}")
        print(f"\n{'─'*70}")
        print("Agent Output (live stream):")
        print(f"{'─'*70}")
        output, exec_time, tail = self.execute_agent_task(
            test.task_description,
            test.timeout_seconds
        )
        print(f"{'─'*70}")
        print(f"\nOutput Tail (last {self.tail_lines} lines):")
        print(f"{'─'*70}")
        for i, line in enumerate(tail, 1):
            print(f" {i:2d} │ {line}")
        print(f"{'─'*70}")
        verification_results = self.verify_output(output, test.verification_checks)
        # Determine success: all checks by default, or a minimum pass count
        # when the test declares 'min_verifications'.
        success = all(verification_results.values())
        if test.success_criteria.get('min_verifications'):
            passed = sum(verification_results.values())
            required = test.success_criteria['min_verifications']
            success = passed >= required
        result = TestResult(
            test_id=test.id,
            success=success,
            execution_time=exec_time,
            steps_completed=[],
            verification_results=verification_results,
            agent_output=output[:1000],
            output_tail=tail
        )
        self.results.append(result)
        print(f"\nVerification Results:")
        for check_name, passed in verification_results.items():
            status = "✓ PASS" if passed else "✗ FAIL"
            print(f" {status}: {check_name}")
        print(f"\n{'═'*70}")
        print(f"Result: {'✓ SUCCESS' if success else '✗ FAILED'}")
        print(f"Execution time: {exec_time:.2f}s")
        print(f"Verifications: {sum(verification_results.values())}/{len(verification_results)} passed")
        print(f"{'═'*70}")
        if not success:
            # FIX: typo "SUCCESFULLY" corrected; sys.exit(1) instead of bare
            # exit() so the process returns a failure status code.
            print("FAILURE OF TESTS, NOT ALL TESTS COMPLETED SUCCESSFULLY. PLEASE OPTIMIZE THE AGENT EXECUTING THIS.")
            sys.exit(1)
        return result

    def save_results(self):
        """Save test results to JSON file"""
        output = {
            'timestamp': datetime.now().isoformat(),
            'total_tests': len(self.results),
            'passed': sum(1 for r in self.results if r.success),
            'failed': sum(1 for r in self.results if not r.success),
            'average_execution_time': sum(r.execution_time for r in self.results) / len(self.results) if self.results else 0,
            'results': [
                {
                    'test_id': r.test_id,
                    'success': r.success,
                    'execution_time': r.execution_time,
                    'verifications': r.verification_results,
                    'output_preview': r.agent_output,
                    'output_tail': r.output_tail
                }
                for r in self.results
            ]
        }
        with open(self.log_file, 'w') as f:
            json.dump(output, f, indent=2)
        # FIX: guard the percentage against a zero-test run (the average above
        # was already guarded, but this print was not).
        pass_pct = (output['passed'] / output['total_tests'] * 100) if output['total_tests'] else 0.0
        print(f"\n{'='*70}")
        print(f"TEST RESULTS SUMMARY")
        print(f"{'='*70}")
        print(f"Total Tests: {output['total_tests']}")
        print(f"Passed: {output['passed']} ({pass_pct:.1f}%)")
        print(f"Failed: {output['failed']}")
        print(f"Average Execution Time: {output['average_execution_time']:.2f}s")
        print(f"Results saved to: {self.log_file}")
        print(f"{'='*70}")


# Define Test Suite
TEST_SUITE = [
    # BASIC COMPLEXITY
    TestCase(
        id="T001",
        name="Simple File Creation",
        complexity=TaskComplexity.BASIC,
        task_description="Create a file named test_output.txt with the text 'Hello World'",
        verification_checks=[
            {'type': 'file_exists', 'name': 'file_created', 'path': 'test_output.txt'},
            {'type': 'command_success', 'name': 'content_correct',
             'command': 'grep -q "Hello World" test_output.txt'}
        ],
        timeout_seconds=30,
        expected_steps=['create_file', 'write_content'],
        success_criteria={'min_verifications': 2}
    ),
    TestCase(
        id="T002",
        name="Directory Operations with File Hierarchy",
        complexity=TaskComplexity.BASIC,
        task_description="Create a directory called test_dir with two subdirectories named src and docs, create three empty files inside src named file1.txt, file2.txt, file3.txt, and create a README.md in docs with the text 'Documentation folder', then list the entire directory tree",
        verification_checks=[
            {'type': 'file_exists', 'name': 'dir_exists', 'path': 'test_dir'},
            {'type': 'file_exists', 'name': 'src_exists', 'path': 'test_dir/src'},
            {'type': 'file_exists', 'name': 'docs_exists', 'path': 'test_dir/docs'},
            {'type': 'file_exists', 'name': 'file1_exists', 'path': 'test_dir/src/file1.txt'},
            {'type': 'file_exists', 'name': 'file2_exists', 'path': 'test_dir/src/file2.txt'},
            {'type': 'file_exists', 'name': 'file3_exists', 'path': 'test_dir/src/file3.txt'},
            {'type': 'file_exists', 'name': 'readme_exists', 'path': 'test_dir/docs/README.md'},
        ],
        timeout_seconds=45,
        expected_steps=['mkdir_nested', 'touch_files', 'create_readme', 'tree_list'],
        success_criteria={'min_verifications': 6}
    ),
    # INTERMEDIATE COMPLEXITY
    TestCase(
        id="T003",
        name="Advanced Data Processing with Statistics",
        complexity=TaskComplexity.INTERMEDIATE,
        task_description="Create a CSV file with 10 rows of sample employee data (name,age,city,salary) named data.csv, then read it and calculate the average age, median salary, and count of employees per city, and write detailed statistics to summary.txt with proper formatting",
        verification_checks=[
            {'type': 'file_exists', 'name': 'csv_created', 'path': 'data.csv'},
            {'type': 'file_exists', 'name': 'summary_created', 'path': 'summary.txt'},
            {'type': 'line_count', 'name': 'csv_has_rows', 'path': 'data.csv', 'min': 10, 'max': 12},
            {'type': 'command_success', 'name': 'summary_has_stats',
             'command': 'grep -iqE "(average|median|count)" summary.txt'},
            {'type': 'command_success', 'name': 'summary_has_numbers',
             'command': 'grep -qE "[0-9]+" summary.txt'}
        ],
        timeout_seconds=75,
        expected_steps=['create_csv', 'write_data', 'read_csv', 'calculate_stats', 'format_summary', 'write_summary'],
        success_criteria={'min_verifications': 4}
    ),
    TestCase(
        id="T004",
        name="API Request with Data Transformation and Caching",
        complexity=TaskComplexity.INTERMEDIATE,
        task_description="Make a GET request to https://api.github.com/repos/torvalds/linux, extract the stargazers_count, forks_count, and open_issues_count fields, calculate the engagement ratio (stars/forks), save raw JSON to cache.json, and create a formatted report in repo_stats.txt with all metrics",
        verification_checks=[
            {'type': 'file_exists', 'name': 'cache_exists', 'path': 'cache.json'},
            {'type': 'file_exists', 'name': 'stats_exists', 'path': 'repo_stats.txt'},
            {'type': 'json_valid', 'name': 'valid_cache_json', 'path': 'cache.json'},
            {'type': 'command_success', 'name': 'has_metrics',
             'command': 'grep -iqE "(stars|forks|ratio)" repo_stats.txt'},
        ],
        timeout_seconds=60,
        expected_steps=['api_request', 'parse_json', 'extract_fields', 'calculate_ratio', 'cache_data', 'write_report'],
        success_criteria={'min_verifications': 3}
    ),
    TestCase(
        id="T005",
        name="Web Scraping with Retry Logic",
        complexity=TaskComplexity.INTERMEDIATE,
        task_description="Fetch the public API at https://jsonplaceholder.typicode.com/users, extract all email domains, count occurrences of each domain, sort by frequency, and save to domains.txt. If the request fails, retry up to 3 times with exponential backoff",
        verification_checks=[
            {'type': 'file_exists', 'name': 'output_exists', 'path': 'domains.txt'},
            {'type': 'command_success', 'name': 'has_domains',
             'command': 'grep -qE "@" domains.txt'},
            {'type': 'command_success', 'name': 'has_counts',
             'command': 'grep -qE "[0-9]+" domains.txt'},
        ],
        timeout_seconds=90,
        expected_steps=['api_request', 'parse_users', 'extract_domains', 'count_frequency', 'sort_results', 'write_output'],
        success_criteria={'min_verifications': 2}
    ),
    # ADVANCED COMPLEXITY
    TestCase(
        id="T006",
        name="Conditional Logic with Nested Error Handling",
        complexity=TaskComplexity.ADVANCED,
        task_description="Try to read a file called config.json. If it doesn't exist, create it with default configuration {'debug': true, 'timeout': 30, 'retry': 3}. Then validate the JSON structure, check if all required keys exist, append a timestamp field, and create a backup file config.backup.json. Write validation results to validation.log",
        verification_checks=[
            {'type': 'file_exists', 'name': 'config_exists', 'path': 'config.json'},
            {'type': 'file_exists', 'name': 'backup_exists', 'path': 'config.backup.json'},
            {'type': 'file_exists', 'name': 'log_exists', 'path': 'validation.log'},
            {'type': 'json_valid', 'name': 'valid_json', 'path': 'config.json'},
            {'type': 'command_success', 'name': 'has_timestamp',
             'command': 'grep -q "timestamp" config.json'}
        ],
        timeout_seconds=60,
        expected_steps=['check_file', 'create_default', 'validate_structure', 'append_timestamp', 'create_backup', 'log_validation'],
        success_criteria={'min_verifications': 4}
    ),
    TestCase(
        id="T007",
        name="Multi-Format Data Pipeline with Transformations",
        complexity=TaskComplexity.ADVANCED,
        task_description="Create a JSON file with 5 product entries (id, name, price, category, stock), convert it to CSV format, filter products where stock > 0, apply a 10% discount to all prices, convert the result to a markdown table with formatted prices ($XX.XX), and save it to products_report.md. Also generate a JSON summary with total_products, total_value, and categories array",
        verification_checks=[
            {'type': 'file_exists', 'name': 'json_exists', 'path': 'products.json'},
            {'type': 'file_exists', 'name': 'csv_exists', 'path': 'products.csv'},
            {'type': 'file_exists', 'name': 'markdown_exists', 'path': 'products_report.md'},
            {'type': 'file_exists', 'name': 'summary_exists', 'path': 'summary.json'},
            {'type': 'command_success', 'name': 'markdown_has_table',
             'command': 'grep -q "|" products_report.md'},
            {'type': 'command_success', 'name': 'has_dollar_signs',
             'command': 'grep -q "$" products_report.md'},
            {'type': 'json_valid', 'name': 'valid_summary_json', 'path': 'summary.json'}
        ],
        timeout_seconds=120,
        expected_steps=['create_json', 'json_to_csv', 'filter_stock', 'apply_discount', 'format_prices', 'csv_to_markdown', 'generate_summary'],
        success_criteria={'min_verifications': 5}
    ),
    TestCase(
        id="T008",
        name="Parallel Data Processing with Aggregation",
        complexity=TaskComplexity.ADVANCED,
        task_description="Fetch data from https://jsonplaceholder.typicode.com/posts and https://jsonplaceholder.typicode.com/comments simultaneously, join them based on postId, count comments per post, identify the top 5 most commented posts, and create a detailed HTML report (report.html) with a table and summary statistics",
        verification_checks=[
            {'type': 'file_exists', 'name': 'report_exists', 'path': 'report.html'},
            # NOTE(review): grep pattern is empty — always succeeds; likely
            # "<table>" was lost in transit. Confirm the intended pattern.
            {'type': 'command_success', 'name': 'has_table',
             'command': 'grep -q "" report.html'},
            # NOTE(review): empty pattern here too — presumably "<html>".
            {'type': 'command_success', 'name': 'has_html_structure',
             'command': 'grep -q "" report.html'},
            {'type': 'command_success', 'name': 'has_comments_data',
             'command': 'grep -qE "comment" report.html'},
        ],
        timeout_seconds=150,
        expected_steps=['parallel_fetch', 'join_data', 'count_comments', 'find_top_5', 'generate_html', 'write_report'],
        success_criteria={'min_verifications': 3}
    ),
    # EXPERT COMPLEXITY
    TestCase(
        id="T009",
        name="Multi-Stage Data Pipeline with Error Recovery",
        complexity=TaskComplexity.EXPERT,
        task_description="Create a complete data pipeline: 1) Download data from https://jsonplaceholder.typicode.com/posts, 2) Filter posts with userId=1, 3) Extract titles and bodies, 4) Calculate word count for each, 5) Sort by word count descending, 6) Save to processed_posts.txt, 7) Create a summary.json with total_posts, average_word_count, longest_title, and first 3 titles, 8) Generate a CSV with columns: id, title_length, body_word_count, 9) Create execution_log.txt documenting each pipeline stage with timestamps",
        verification_checks=[
            {'type': 'file_exists', 'name': 'processed_exists', 'path': 'processed_posts.txt'},
            {'type': 'file_exists', 'name': 'summary_exists', 'path': 'summary.json'},
            {'type': 'file_exists', 'name': 'csv_exists', 'path': 'pipeline_data.csv'},
            {'type': 'file_exists', 'name': 'log_exists', 'path': 'execution_log.txt'},
            {'type': 'json_valid', 'name': 'valid_json', 'path': 'summary.json'},
            {'type': 'command_success', 'name': 'has_posts',
             'command': 'test $(wc -l < processed_posts.txt) -ge 5'},
            {'type': 'command_success', 'name': 'csv_has_header',
             'command': 'head -1 pipeline_data.csv | grep -q ","'},
            {'type': 'command_success', 'name': 'log_has_timestamps',
             'command': 'grep -qE "[0-9]{4}-[0-9]{2}-[0-9]{2}" execution_log.txt'},
        ],
        timeout_seconds=180,
        expected_steps=['download', 'filter', 'extract', 'calculate_words', 'sort', 'save', 'create_summary', 'generate_csv', 'log_execution'],
        success_criteria={'min_verifications': 6}
    ),
    TestCase(
        id="T010",
        name="Self-Correcting Script with Comprehensive Error Handling",
        complexity=TaskComplexity.EXPERT,
        task_description="Create a Python script named safe_calculator.py that: 1) Reads two numbers from input_data.txt (one per line), 2) Performs division, multiplication, and power operations, 3) Handles FileNotFoundError by creating input_data.txt with default values [10, 2], 4) Handles ZeroDivisionError gracefully, 5) Handles ValueError for non-numeric input, 6) Writes results to results.txt, 7) Writes detailed error log to error.log with timestamps and stack traces, 8) Includes unit tests in the script that can be run with pytest",
        verification_checks=[
            {'type': 'file_exists', 'name': 'script_created', 'path': 'safe_calculator.py'},
            {'type': 'command_success', 'name': 'script_runnable',
             'command': 'python3 -m py_compile safe_calculator.py'},
            {'type': 'command_success', 'name': 'has_error_handling',
             'command': 'grep -q "except" safe_calculator.py'},
            {'type': 'command_success', 'name': 'has_logging',
             'command': 'grep -qE "(logging|error)" safe_calculator.py'},
        ],
        timeout_seconds=120,
        expected_steps=['create_script', 'add_file_handling', 'add_zero_division', 'add_value_error', 'add_logging', 'add_tests', 'test_execution'],
        success_criteria={'min_verifications': 3}
    ),
    TestCase(
        id="T011",
        name="Repository Analysis with Statistical Modeling",
        complexity=TaskComplexity.EXPERT,
        task_description="Analyze the current directory structure: 1) Find all Python files recursively, 2) Count total lines, comment lines, and code lines in each, 3) Calculate complexity metrics (functions per file, average function length), 4) Identify files with highest complexity, 5) Create detailed_report.txt with per-file analysis, 6) Create metrics.csv with columns: filename, total_lines, code_lines, comment_ratio, function_count, 7) Create summary.json with aggregate statistics and recommendations, 8) Generate a bar chart data file (visualization_data.csv) suitable for plotting",
        verification_checks=[
            {'type': 'file_exists', 'name': 'report_exists', 'path': 'detailed_report.txt'},
            {'type': 'file_exists', 'name': 'metrics_exists', 'path': 'metrics.csv'},
            {'type': 'file_exists', 'name': 'summary_exists', 'path': 'summary.json'},
            {'type': 'file_exists', 'name': 'viz_exists', 'path': 'visualization_data.csv'},
            {'type': 'json_valid', 'name': 'valid_summary_json', 'path': 'summary.json'},
            {'type': 'command_success', 'name': 'csv_has_header',
             'command': 'head -1 metrics.csv | grep -q "filename"'},
            {'type': 'command_success', 'name': 'report_has_analysis',
             'command': 'grep -qE "(lines|functions|complexity)" detailed_report.txt'},
        ],
        timeout_seconds=150,
        expected_steps=['scan_directory', 'find_python_files', 'analyze_each_file', 'calculate_metrics', 'identify_complex', 'create_report', 'generate_csv', 'create_summary', 'generate_viz_data'],
        success_criteria={'min_verifications': 5}
    ),
    TestCase(
        id="T012",
        name="Distributed Task Simulation with State Management",
        complexity=TaskComplexity.EXPERT,
        task_description="Simulate a distributed job queue: 1) Create 10 'job' files (job_1.txt to job_10.txt) with random task descriptions, 2) Process each job sequentially, simulating work with sleep, 3) Track state in state.json (pending, processing, completed), 4) Handle 'failures' for jobs 3 and 7 (retry up to 3 times), 5) Log all state transitions to transitions.log with timestamps, 6) Create final_report.txt with success/failure counts, total processing time, and retry statistics, 7) Clean up successful job files but keep failed ones",
        verification_checks=[
            {'type': 'file_exists', 'name': 'state_exists', 'path': 'state.json'},
            {'type': 'file_exists', 'name': 'log_exists', 'path': 'transitions.log'},
            {'type': 'file_exists', 'name': 'report_exists', 'path': 'final_report.txt'},
            {'type': 'json_valid', 'name': 'valid_state_json', 'path': 'state.json'},
            {'type': 'command_success', 'name': 'has_transitions',
             'command': 'grep -qE "(pending|processing|completed)" transitions.log'},
            {'type': 'command_success', 'name': 'has_statistics',
             'command': 'grep -qE "(success|failure|retry)" final_report.txt'},
        ],
        timeout_seconds=200,
        expected_steps=['create_jobs', 'init_state', 'process_queue', 'handle_failures', 'retry_logic', 'log_transitions', 'generate_report', 'cleanup'],
        success_criteria={'min_verifications': 4}
    ),
    # EXTREME COMPLEXITY
    TestCase(
        id="T013",
        name="Full-Stack Data Application with ETL Pipeline",
        complexity=TaskComplexity.EXTREME,
        task_description="Build a complete ETL system: 1) Extract data from multiple APIs (GitHub repos, JSONPlaceholder posts/users), 2) Transform data by normalizing structures, joining related data, calculating derived metrics, 3) Load into an SQLite database with proper schema (tables: repositories, posts, users, metrics), 4) Create database indexes for performance, 5) Generate SQL views for common queries, 6) Export aggregated data to multiple formats (JSON, CSV, Markdown report), 7) Create a Python query script (query_db.py) with functions to search the database, 8) Generate comprehensive documentation (README.md) with schema diagram and usage examples, 9) Create validation tests and execution log",
        verification_checks=[
            {'type': 'file_exists', 'name': 'db_exists', 'path': 'data.db'},
            {'type': 'file_exists', 'name': 'query_script_exists', 'path': 'query_db.py'},
            {'type': 'file_exists', 'name': 'readme_exists', 'path': 'README.md'},
            {'type': 'file_exists', 'name': 'json_export_exists', 'path': 'export_data.json'},
            {'type': 'file_exists', 'name': 'csv_export_exists', 'path': 'export_data.csv'},
            {'type': 'command_success', 'name': 'db_has_tables',
             'command': 'sqlite3 data.db ".tables" | grep -q "repositories"'},
            {'type': 'command_success', 'name': 'script_runnable',
             'command': 'python3 -m py_compile query_db.py'},
            {'type': 'command_success', 'name': 'readme_has_schema',
             'command': 'grep -qE "(schema|table|database)" README.md'},
        ],
        timeout_seconds=300,
        expected_steps=['extract_apis', 'transform_data', 'create_schema', 'load_database', 'create_indexes', 'create_views', 'export_formats', 'create_query_script', 'generate_docs', 'validate'],
        success_criteria={'min_verifications': 6}
    ),
    TestCase(
        id="T014",
        name="Autonomous Code Refactoring Agent",
        complexity=TaskComplexity.EXTREME,
        task_description="Create a code analysis and refactoring system: 1) Scan all Python files in current directory, 2) Identify code smells (long functions >50 lines, deep nesting >3 levels, duplicate code blocks), 3) Generate refactoring suggestions for each file, 4) Create refactored versions with suffix '_refactored.py', 5) Run automated tests to ensure functionality preserved, 6) Generate side-by-side diff reports (diff_report.html), 7) Calculate and compare complexity metrics before/after, 8) Create improvement_summary.json with metrics improvements, 9) Document refactoring patterns applied in patterns.md, 10) Generate rollback script (rollback.sh)",
        verification_checks=[
            {'type': 'file_exists', 'name': 'diff_report_exists', 'path': 'diff_report.html'},
            {'type': 'file_exists', 'name': 'summary_exists', 'path': 'improvement_summary.json'},
            {'type': 'file_exists', 'name': 'patterns_exists', 'path': 'patterns.md'},
            {'type': 'file_exists', 'name': 'rollback_exists', 'path': 'rollback.sh'},
            {'type': 'json_valid', 'name': 'valid_summary_json', 'path': 'improvement_summary.json'},
            # NOTE(review): empty grep pattern — always succeeds; probably
            # "<html>" was lost in transit. Confirm intended pattern.
            {'type': 'command_success', 'name': 'has_html_structure',
             'command': 'grep -q "" diff_report.html'},
            {'type': 'command_success', 'name': 'patterns_has_examples',
             'command': 'grep -qE "(before|after|pattern)" patterns.md'},
        ],
        timeout_seconds=400,
        expected_steps=['scan_files', 'detect_smells', 'generate_suggestions', 'refactor_code', 'run_tests', 'create_diffs', 'calculate_metrics', 'generate_summary', 'document_patterns', 'create_rollback'],
        success_criteria={'min_verifications': 5}
    ),
    TestCase(
        id="T015",
        name="Intelligent Testing Framework Generator",
        complexity=TaskComplexity.EXTREME,
        task_description="Build a meta-testing system: 1) Analyze all Python modules in current directory, 2) Extract functions and their signatures, 3) Infer parameter types and generate test cases, 4) Create pytest test files for each module (test_*.py), 5) Generate fixtures for common data types, 6) Create parametrized tests for edge cases (empty, null, boundary values), 7) Add mocking for external dependencies, 8) Generate test coverage report (coverage.html), 9) Create CI/CD configuration (.github/workflows/test.yml), 10) Generate comprehensive test documentation (test_guide.md) with examples",
        verification_checks=[
            {'type': 'file_exists', 'name': 'coverage_exists', 'path': 'coverage.html'},
            {'type': 'file_exists', 'name': 'ci_config_exists', 'path': '.github/workflows/test.yml'},
            {'type': 'file_exists', 'name': 'test_guide_exists', 'path': 'test_guide.md'},
            {'type': 'command_success', 'name': 'has_test_files',
             'command': 'ls test_*.py 2>/dev/null | head -1'},
            {'type': 'command_success', 'name': 'tests_runnable',
             'command': 'python3 -m py_compile test_*.py 2>/dev/null'},
            {'type': 'command_success', 'name': 'has_fixtures',
             'command': 'grep -q "@pytest.fixture" test_*.py 2>/dev/null'},
        ],
        timeout_seconds=350,
        expected_steps=['analyze_modules', 'extract_functions', 'infer_types', 'generate_tests', 'create_fixtures', 'add_parametrized', 'add_mocking', 'run_coverage', 'generate_ci_config', 'create_docs'],
        success_criteria={'min_verifications': 4}
    ),
]


def main():
    """Run the full TEST_SUITE sequentially and write the JSON summary."""
    tester = AgentTester(tail_lines=20)
    print("="*70)
    print("AUTONOMOUS AGENT COMPLEX TASK TEST SUITE")
    print("="*70)
    print(f"Total test cases: {len(TEST_SUITE)}")
    print(f"Complexity levels: BASIC (1), INTERMEDIATE (2), ADVANCED (3), EXPERT (4), EXTREME (5)")
    print(f"Output tail length: {tester.tail_lines} lines")
    print("="*70)
    # Group tests by complexity for the distribution summary.
    by_complexity = {}
    for test in TEST_SUITE:
        level = test.complexity.name
        by_complexity.setdefault(level, []).append(test)
    print("\nTest Distribution:")
    for level in ['BASIC', 'INTERMEDIATE', 'ADVANCED', 'EXPERT', 'EXTREME']:
        count = len(by_complexity.get(level, []))
        print(f" {level}: {count} tests")
    print("="*70)
    for test in TEST_SUITE:
        try:
            tester.run_test(test)
            time.sleep(3)  # Pause between tests
        except KeyboardInterrupt:
            print("\n\nTest suite interrupted by user")
            break
        except Exception as e:
            print(f"ERROR running test {test.id}: {e}")
            import traceback
            traceback.print_exc()
            continue
    tester.save_results()


if __name__ == "__main__":
    main()