#!/usr/bin/env python3
"""
Autonomous Agent Complex Task Testing Framework
Tests agent capability to execute and complete multi-step tasks independently
"""

from collections import deque
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
import json
import os
import re
import shutil
import subprocess
import sys
import threading
import time
from typing import Any, Dict, Iterable, List, Optional, Sequence


class TaskComplexity(Enum):
    """Difficulty tier of a test case, from 1 (easiest) to 5 (hardest)."""

    BASIC = 1
    INTERMEDIATE = 2
    ADVANCED = 3
    EXPERT = 4
    EXTREME = 5


@dataclass
class TestCase:
    """Declarative description of one task handed to the agent."""

    # Short identifier printed in the report header and stored in the results.
    id: str
    # Human-readable title printed next to the id.
    name: str
    complexity: TaskComplexity
    # Passed verbatim to the agent command as its final argument.
    task_description: str
    # Each check is a dict with at least 'type' and 'name'; the remaining keys
    # depend on the type (see AgentTester.verify_output).
    verification_checks: List[Dict[str, Any]]
    # Wall-clock budget for the agent process.
    timeout_seconds: int
    # Only printed for the reader; not compared against the agent's output.
    expected_steps: List[str]
    # The only key read by run_test is 'min_verifications'.
    success_criteria: Dict[str, Any]


@dataclass
class TestResult:
    """Outcome of running a single TestCase."""

    test_id: str
    success: bool
    execution_time: float
    steps_completed: List[str]
    # Maps check name -> whether that check passed.
    verification_results: Dict[str, bool]
    error_message: str = ""
    agent_output: str = ""
    # None means "no tail recorded"; a shared mutable default is avoided.
    output_tail: Optional[List[str]] = None


class AgentTester:
    """Runs test cases against an external agent command and records results."""

    def __init__(self, log_file="agent_test_results.json", tail_lines=20,
                 agent_command: Sequence[str] = ("r",)):
        """Create a tester.

        Args:
            log_file: Path of the JSON report written by save_results().
            tail_lines: How many trailing output lines to keep per test.
            agent_command: Program (plus any fixed arguments) used to launch
                the agent; the task description is appended as the last
                argument. Defaults to the ``r`` executable.
        """
        self.log_file = log_file
        self.results = []
        self.tail_lines = tail_lines
        # A bare string is treated as one program name, not a list of chars.
        if isinstance(agent_command, str):
            self.agent_command = [agent_command]
        else:
            self.agent_command = list(agent_command)

    def cleanup_directory(self, keep: Iterable[str] = ("test.py",)):
        """Delete every entry in the current directory except those in *keep*.

        WARNING: destructive. Files and symlinks are unlinked and directories
        are removed recursively. Entries that cannot be removed are reported
        and skipped.

        Args:
            keep: Names (relative to the current directory) to leave in place.
        """
        keep_names = {keep} if isinstance(keep, str) else set(keep)
        print(" -> Cleaning up directory...")
        for item in os.listdir('.'):
            if item in keep_names:
                continue
            try:
                # Symlinks are unlinked rather than followed, so a link to a
                # directory never triggers a recursive delete of its target.
                if os.path.isfile(item) or os.path.islink(item):
                    os.unlink(item)
                elif os.path.isdir(item):
                    shutil.rmtree(item)
            except OSError as e:
                print(f" ! Failed to delete {item}: {e}")

    def execute_agent_task(self, task: str, timeout: int) -> tuple[str, float, List[str]]:
        """Run the agent on *task*, streaming its output to stdout.

        stderr is merged into stdout. A watchdog timer kills the process once
        *timeout* seconds have elapsed, so a hung agent cannot block the
        harness indefinitely.

        Args:
            task: Task description appended to the agent command.
            timeout: Maximum run time in seconds; ``None`` disables the limit.

        Returns:
            ``(output, elapsed_seconds, tail)``. *output* is the captured
            lines joined by newlines, ``"TIMEOUT_ERROR"`` if the watchdog
            fired, or ``"EXECUTION_ERROR: ..."`` if the process could not be
            started or read. *tail* holds the last ``tail_lines`` lines
            (empty for execution errors).
        """
        start_time = time.time()
        output_lines = []
        tail_buffer = deque(maxlen=self.tail_lines)

        try:
            process = subprocess.Popen(
                [*self.agent_command, task],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                errors='replace',  # undecodable bytes must not abort the run
                bufsize=1,
            )
        except Exception as e:
            return f"EXECUTION_ERROR: {str(e)}", time.time() - start_time, []

        timed_out = threading.Event()

        def _kill_on_timeout():
            # Only flag a timeout if the process is genuinely still running.
            if process.poll() is None:
                timed_out.set()
                process.kill()

        watchdog = None
        if timeout is not None:
            watchdog = threading.Timer(timeout, _kill_on_timeout)
            watchdog.daemon = True
            watchdog.start()

        try:
            with process.stdout:
                # Iterating blocks until EOF, which arrives when the process
                # exits on its own or is killed by the watchdog.
                for raw_line in process.stdout:
                    line = raw_line.rstrip()
                    output_lines.append(line)
                    tail_buffer.append(line)
                    print(f" │ {line}")
            process.wait()  # reap the child so no zombie is left behind
        except Exception as e:
            process.kill()
            process.wait()
            return f"EXECUTION_ERROR: {str(e)}", time.time() - start_time, []
        finally:
            if watchdog is not None:
                watchdog.cancel()

        execution_time = time.time() - start_time
        if timed_out.is_set():
            return "TIMEOUT_ERROR", execution_time, list(tail_buffer)
        return '\n'.join(output_lines), execution_time, list(tail_buffer)

    def _evaluate_check(self, check: Dict[str, Any], output: str) -> bool:
        """Evaluate one verification check; raise on malformed/unknown checks."""
        check_type = check['type']

        if check_type == 'contains':
            return check['value'] in output
        if check_type == 'not_contains':
            return check['value'] not in output
        if check_type == 'file_exists':
            return os.path.exists(check['path'])
        if check_type == 'command_success':
            # NOTE: runs through the shell; commands come from the test suite
            # definition and must never be built from untrusted input.
            completed = subprocess.run(
                check['command'],
                shell=True,
                capture_output=True,
                timeout=10,
            )
            return completed.returncode == 0
        if check_type == 'json_valid':
            if 'path' not in check:
                # Fallback: the whole agent output must itself be JSON.
                json.loads(output)
                return True
            if not os.path.exists(check['path']):
                return False
            with open(check['path'], 'r') as f:
                json.load(f)  # raises JSONDecodeError if invalid
            return True
        if check_type == 'step_count':
            non_empty = sum(1 for entry in output.split('\n') if entry.strip())
            return non_empty >= check['min_steps']
        if check_type == 'regex_match':
            return bool(re.search(check['pattern'], output))
        if check_type == 'line_count':
            if not os.path.exists(check['path']):
                return False
            with open(check['path'], 'r') as f:
                count = sum(1 for _ in f)
            return check.get('min', 0) <= count <= check.get('max', float('inf'))

        raise ValueError(f"unknown check type: {check_type!r}")

    def verify_output(self, output: str, checks: List[Dict[str, Any]]) -> Dict[str, bool]:
        """Run verification checks on agent output.

        Supported check types: ``contains``, ``not_contains`` (key ``value``);
        ``file_exists`` (``path``); ``command_success`` (``command``);
        ``json_valid`` (optional ``path``, otherwise *output* is parsed);
        ``step_count`` (``min_steps``); ``regex_match`` (``pattern``);
        ``line_count`` (``path``, optional ``min``/``max``).

        A check that raises, or whose type is unknown, is recorded as failed
        so it can never silently vanish from the result.

        Args:
            output: Text captured from the agent.
            checks: Check definitions; each needs ``name`` and ``type``.

        Returns:
            Mapping of check name to pass/fail.
        """
        results = {}
        for check in checks:
            check_name = check['name']
            try:
                results[check_name] = self._evaluate_check(check, output)
            except json.JSONDecodeError:
                # Invalid JSON is an ordinary failed check, not an error.
                results[check_name] = False
            except Exception as e:
                # One broken check must not abort the whole run, but the
                # reason is reported instead of being swallowed.
                print(f" ! Check {check_name!r} could not be evaluated: {e}")
                results[check_name] = False
        return results

    def run_test(self, test: TestCase) -> TestResult:
        """Execute a single test case and print a live report.

        The working directory is wiped first (see cleanup_directory). The
        test passes when every check passes or, if
        ``success_criteria['min_verifications']`` is set, when at least that
        many checks pass.

        Args:
            test: The test case to run.

        Returns:
            The recorded TestResult (also appended to ``self.results``).

        Raises:
            SystemExit: With status 1 when the test fails, so that a failing
                run is visible to the calling process.
        """
        self.cleanup_directory()

        heavy_rule = '═' * 70
        light_rule = '─' * 70

        print(f"\n{heavy_rule}")
        print(f"[{test.id}] {test.name}")
        print(f"{heavy_rule}")
        print(f"Complexity: {test.complexity.name} ({test.complexity.value}/5)")
        print(f"Timeout: {test.timeout_seconds}s")
        print(f"\nTask Description:")
        print(f" {test.task_description}")
        print(f"\nExpected Steps: {' → '.join(test.expected_steps)}")
        print(f"\n{light_rule}")
        print("Agent Output (live stream):")
        print(f"{light_rule}")

        output, exec_time, tail = self.execute_agent_task(
            test.task_description,
            test.timeout_seconds,
        )

        print(f"{light_rule}")
        print(f"\nOutput Tail (last {self.tail_lines} lines):")
        print(f"{light_rule}")
        for i, line in enumerate(tail, 1):
            print(f" {i:2d} │ {line}")
        print(f"{light_rule}")

        verification_results = self.verify_output(output, test.verification_checks)
        passed_count = sum(verification_results.values())

        # Determine success based on criteria
        required = test.success_criteria.get('min_verifications')
        if required:
            success = passed_count >= required
        else:
            success = all(verification_results.values())

        result = TestResult(
            test_id=test.id,
            success=success,
            execution_time=exec_time,
            steps_completed=[],
            verification_results=verification_results,
            agent_output=output[:1000],  # preview only; keeps the report small
            output_tail=tail,
        )
        self.results.append(result)

        print(f"\nVerification Results:")
        for check_name, passed in verification_results.items():
            status = "✓ PASS" if passed else "✗ FAIL"
            print(f" {status}: {check_name}")

        print(f"\n{heavy_rule}")
        print(f"Result: {'✓ SUCCESS' if success else '✗ FAILED'}")
        print(f"Execution time: {exec_time:.2f}s")
        print(f"Verifications: {passed_count}/{len(verification_results)} passed")
        print(f"{heavy_rule}")

        if not success:
            print("FAILURE OF TESTS, NOT ALL TESTS COMPLETED SUCCESFULLY. PLEASE OPTIMIZE THE AGENT EXECUTING THIS.")
            # sys.exit(1) rather than the site-provided exit(): the latter
            # reports status 0 and is not guaranteed to exist.
            sys.exit(1)

        return result

    def save_results(self):
        """Write all recorded results to ``self.log_file`` and print a summary.

        Safe to call with no recorded results: the averages and the pass
        percentage are reported as 0 instead of dividing by zero.
        """
        total = len(self.results)
        passed = sum(1 for r in self.results if r.success)
        average_time = (
            sum(r.execution_time for r in self.results) / total if total else 0
        )
        pass_rate = passed / total * 100 if total else 0.0

        output = {
            'timestamp': datetime.now().isoformat(),
            'total_tests': total,
            'passed': passed,
            'failed': total - passed,
            'average_execution_time': average_time,
            'results': [
                {
                    'test_id': r.test_id,
                    'success': r.success,
                    'execution_time': r.execution_time,
                    'verifications': r.verification_results,
                    'output_preview': r.agent_output,
                    'output_tail': r.output_tail,
                }
                for r in self.results
            ],
        }

        with open(self.log_file, 'w') as f:
            json.dump(output, f, indent=2)

        print(f"\n{'='*70}")
        print(f"TEST RESULTS SUMMARY")
        print(f"{'='*70}")
        print(f"Total Tests: {output['total_tests']}")
        print(f"Passed: {output['passed']} ({pass_rate:.1f}%)")
        print(f"Failed: {output['failed']}")
        print(f"Average Execution Time: {output['average_execution_time']:.2f}s")
        print(f"Results saved to: {self.log_file}")
        print(f"{'='*70}")


# NOTE(review): the TEST_SUITE literal that opens on the next line is left exactly as found (whitespace collapsed, continuing on the following lines); it must be re-indented before this module can be imported.
# Define Test Suite TEST_SUITE = [ # BASIC COMPLEXITY TestCase( id="T001", name="Simple File Creation", complexity=TaskComplexity.BASIC, task_description="Create a file named test_output.txt with the text 'Hello World'", verification_checks=[ {'type': 'file_exists',
'name': 'file_created', 'path': 'test_output.txt'}, {'type': 'command_success', 'name': 'content_correct', 'command': 'grep -q "Hello World" test_output.txt'} ], timeout_seconds=30, expected_steps=['create_file', 'write_content'], success_criteria={'min_verifications': 2} ), TestCase( id="T002", name="Directory Operations with File Hierarchy", complexity=TaskComplexity.BASIC, task_description="Create a directory called test_dir with two subdirectories named src and docs, create three empty files inside src named file1.txt, file2.txt, file3.txt, and create a README.md in docs with the text 'Documentation folder', then list the entire directory tree", verification_checks=[ {'type': 'file_exists', 'name': 'dir_exists', 'path': 'test_dir'}, {'type': 'file_exists', 'name': 'src_exists', 'path': 'test_dir/src'}, {'type': 'file_exists', 'name': 'docs_exists', 'path': 'test_dir/docs'}, {'type': 'file_exists', 'name': 'file1_exists', 'path': 'test_dir/src/file1.txt'}, {'type': 'file_exists', 'name': 'file2_exists', 'path': 'test_dir/src/file2.txt'}, {'type': 'file_exists', 'name': 'file3_exists', 'path': 'test_dir/src/file3.txt'}, {'type': 'file_exists', 'name': 'readme_exists', 'path': 'test_dir/docs/README.md'}, ], timeout_seconds=45, expected_steps=['mkdir_nested', 'touch_files', 'create_readme', 'tree_list'], success_criteria={'min_verifications': 6} ), # INTERMEDIATE COMPLEXITY TestCase( id="T003", name="Advanced Data Processing with Statistics", complexity=TaskComplexity.INTERMEDIATE, task_description="Create a CSV file with 10 rows of sample employee data (name,age,city,salary) named data.csv, then read it and calculate the average age, median salary, and count of employees per city, and write detailed statistics to summary.txt with proper formatting", verification_checks=[ {'type': 'file_exists', 'name': 'csv_created', 'path': 'data.csv'}, {'type': 'file_exists', 'name': 'summary_created', 'path': 'summary.txt'}, {'type': 'line_count', 'name': 'csv_has_rows', 
'path': 'data.csv', 'min': 10, 'max': 12}, {'type': 'command_success', 'name': 'summary_has_stats', 'command': 'grep -iqE "(average|median|count)" summary.txt'}, {'type': 'command_success', 'name': 'summary_has_numbers', 'command': 'grep -qE "[0-9]+" summary.txt'} ], timeout_seconds=75, expected_steps=['create_csv', 'write_data', 'read_csv', 'calculate_stats', 'format_summary', 'write_summary'], success_criteria={'min_verifications': 4} ), TestCase( id="T004", name="API Request with Data Transformation and Caching", complexity=TaskComplexity.INTERMEDIATE, task_description="Make a GET request to https://api.github.com/repos/torvalds/linux, extract the stargazers_count, forks_count, and open_issues_count fields, calculate the engagement ratio (stars/forks), save raw JSON to cache.json, and create a formatted report in repo_stats.txt with all metrics", verification_checks=[ {'type': 'file_exists', 'name': 'cache_exists', 'path': 'cache.json'}, {'type': 'file_exists', 'name': 'stats_exists', 'path': 'repo_stats.txt'}, {'type': 'json_valid', 'name': 'valid_cache_json', 'path': 'cache.json'}, # FIXED: Added path {'type': 'command_success', 'name': 'has_metrics', 'command': 'grep -iqE "(stars|forks|ratio)" repo_stats.txt'}, ], timeout_seconds=60, expected_steps=['api_request', 'parse_json', 'extract_fields', 'calculate_ratio', 'cache_data', 'write_report'], success_criteria={'min_verifications': 3} ), TestCase( id="T005", name="Web Scraping with Retry Logic", complexity=TaskComplexity.INTERMEDIATE, task_description="Fetch the public API at https://jsonplaceholder.typicode.com/users, extract all email domains, count occurrences of each domain, sort by frequency, and save to domains.txt. 
If the request fails, retry up to 3 times with exponential backoff", verification_checks=[ {'type': 'file_exists', 'name': 'output_exists', 'path': 'domains.txt'}, {'type': 'command_success', 'name': 'has_domains', 'command': 'grep -qE "@" domains.txt'}, {'type': 'command_success', 'name': 'has_counts', 'command': 'grep -qE "[0-9]+" domains.txt'}, ], timeout_seconds=90, expected_steps=['api_request', 'parse_users', 'extract_domains', 'count_frequency', 'sort_results', 'write_output'], success_criteria={'min_verifications': 2} ), # ADVANCED COMPLEXITY TestCase( id="T006", name="Conditional Logic with Nested Error Handling", complexity=TaskComplexity.ADVANCED, task_description="Try to read a file called config.json. If it doesn't exist, create it with default configuration {'debug': true, 'timeout': 30, 'retry': 3}. Then validate the JSON structure, check if all required keys exist, append a timestamp field, and create a backup file config.backup.json. Write validation results to validation.log", verification_checks=[ {'type': 'file_exists', 'name': 'config_exists', 'path': 'config.json'}, {'type': 'file_exists', 'name': 'backup_exists', 'path': 'config.backup.json'}, {'type': 'file_exists', 'name': 'log_exists', 'path': 'validation.log'}, {'type': 'json_valid', 'name': 'valid_json', 'path': 'config.json'}, # FIXED: Added path {'type': 'command_success', 'name': 'has_timestamp', 'command': 'grep -q "timestamp" config.json'} ], timeout_seconds=60, expected_steps=['check_file', 'create_default', 'validate_structure', 'append_timestamp', 'create_backup', 'log_validation'], success_criteria={'min_verifications': 4} ), TestCase( id="T007", name="Multi-Format Data Pipeline with Transformations", complexity=TaskComplexity.ADVANCED, task_description="Create a JSON file with 5 product entries (id, name, price, category, stock), convert it to CSV format, filter products where stock > 0, apply a 10% discount to all prices, convert the result to a markdown table with formatted 
prices ($XX.XX), and save it to products_report.md. Also generate a JSON summary with total_products, total_value, and categories array", verification_checks=[ {'type': 'file_exists', 'name': 'json_exists', 'path': 'products.json'}, {'type': 'file_exists', 'name': 'csv_exists', 'path': 'products.csv'}, {'type': 'file_exists', 'name': 'markdown_exists', 'path': 'products_report.md'}, {'type': 'file_exists', 'name': 'summary_exists', 'path': 'summary.json'}, {'type': 'command_success', 'name': 'markdown_has_table', 'command': 'grep -q "|" products_report.md'}, {'type': 'command_success', 'name': 'has_dollar_signs', 'command': 'grep -q "$" products_report.md'}, {'type': 'json_valid', 'name': 'valid_summary_json', 'path': 'summary.json'} # FIXED: Added path ], timeout_seconds=120, expected_steps=['create_json', 'json_to_csv', 'filter_stock', 'apply_discount', 'format_prices', 'csv_to_markdown', 'generate_summary'], success_criteria={'min_verifications': 5} ), TestCase( id="T008", name="Parallel Data Processing with Aggregation", complexity=TaskComplexity.ADVANCED, task_description="Fetch data from https://jsonplaceholder.typicode.com/posts and https://jsonplaceholder.typicode.com/comments simultaneously, join them based on postId, count comments per post, identify the top 5 most commented posts, and create a detailed HTML report (report.html) with a table and summary statistics", verification_checks=[ {'type': 'file_exists', 'name': 'report_exists', 'path': 'report.html'}, {'type': 'command_success', 'name': 'has_table', 'command': 'grep -q "