#!/usr/bin/env python3
"""Benchmark harness that drives an external agent binary through a task suite."""

import json
import logging
import os
import subprocess
import sys
import time
from datetime import datetime
from typing import Any, Dict, List

# Destination for the combined benchmark log and for raw per-test transcripts.
LOG_FILE = "benchmark_results.log"
AGENT_OUTPUT_DIR = "test_results"
os.makedirs(AGENT_OUTPUT_DIR, exist_ok=True)

# Mirror every log record to both the log file and stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(sys.stdout),
    ],
)
class TestCase:
    """One benchmark task: its definition, validation callback, and run state."""

    def __init__(self, id: str, name: str, description: str, task: str, validation_fn: Any):
        # Static task definition.
        self.id = id
        self.name = name
        self.description = description
        self.task = task
        # Callable taking this TestCase; truthy return means the task passed.
        self.validation_fn = validation_fn
        # Mutable run state, filled in by the benchmark runner.
        self.result = "PENDING"   # PENDING -> PASSED / FAILED / ERROR
        self.output = ""          # captured agent stdout+stderr transcript
        self.execution_time = 0   # seconds; stays 0 until the test has run
def validate_file_exists(path):
    """Return True when *path* names an existing filesystem entry (file or dir)."""
    return os.path.exists(path)
def validate_file_contains(path, text):
    """Return True when the file at *path* exists and contains *text*.

    The match is case-insensitive. Returns False instead of raising when the
    file is missing or unreadable, so validators stay simple pass/fail checks.
    """
    try:
        # EAFP: opening directly avoids the exists()/open() race of the old
        # version and also covers PermissionError. errors="replace" keeps a
        # binary/mis-encoded artifact from aborting with UnicodeDecodeError.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            return text.lower() in f.read().lower()
    except OSError:
        return False
class AgentBenchmark:
    """Runs registered TestCases against an external agent binary and reports results."""

    def __init__(self, binary_path: str = "./r"):
        # Path to the agent executable under test.
        self.binary_path = binary_path
        self.test_cases: List[TestCase] = []

    def add_test(self, test: TestCase):
        """Register a test case for the next run_all() invocation."""
        self.test_cases.append(test)

    def run_all(self):
        """Execute every registered test case in order, then log a summary."""
        logging.info(f"Starting benchmark with {len(self.test_cases)} tasks...")
        for test in self.test_cases:
            self.run_test(test)

        self.summary()

    def run_test(self, test: TestCase):
        """Run one test: spawn the agent, stream/capture its output, validate.

        Updates test.result ("PASSED"/"FAILED"/"ERROR"), test.output and
        test.execution_time, and archives the raw transcript under
        AGENT_OUTPUT_DIR.
        """
        logging.info(f"--- Running Test {test.id}: {test.name} ---")
        start_time = time.time()
        process = None
        full_output: List[str] = []

        try:
            # Execute the agent with stderr merged into stdout so the
            # transcript is one ordered stream; bufsize=1 requests line
            # buffering (valid here because text=True).
            process = subprocess.Popen(
                [self.binary_path, "--verbose", test.task],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1
            )

            logging.info(f"Agent executing Task {test.id}...")

            # NOTE: this read loop blocks until the agent closes stdout, so
            # the wait() timeout below only bounds the post-EOF interval.
            for line in process.stdout:
                full_output.append(line)
                print(line, end="", flush=True)  # Print to screen in real-time

            process.wait(timeout=600)  # 10 minute timeout per task
            test.execution_time = time.time() - start_time
            test.output = "".join(full_output)

            # Save raw agent output for post-mortem inspection.
            output_file = os.path.join(AGENT_OUTPUT_DIR, f"{test.id}_output.txt")
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(f"TASK: {test.task}\n")
                f.write("-" * 40 + "\n")
                f.write(test.output)

            # Validate via the test's own callback.
            if test.validation_fn(test):
                test.result = "PASSED"
                logging.info(f"Test {test.id} PASSED in {test.execution_time:.2f}s")
            else:
                test.result = "FAILED"
                logging.error(f"Test {test.id} FAILED validation")

        except Exception as e:
            logging.error(f"Error executing test {test.id}: {str(e)}")
            test.result = "ERROR"
            # Fix: record elapsed time and keep whatever output we captured
            # (the original left execution_time at 0 on the error path).
            test.execution_time = time.time() - start_time
            test.output = "".join(full_output)
            # Fix: don't leave a wedged agent process behind after a
            # timeout or other failure.
            if process is not None and process.poll() is None:
                process.kill()

    def summary(self):
        """Log a per-test result table and the overall pass count."""
        logging.info("=" * 50)
        logging.info("BENCHMARK SUMMARY")
        logging.info("=" * 50)
        passed = sum(1 for t in self.test_cases if t.result == "PASSED")
        for t in self.test_cases:
            logging.info(f"[{t.result}] {t.id}: {t.name} ({t.execution_time:.2f}s)")

        logging.info("=" * 50)
        logging.info(f"TOTAL PASSED: {passed}/{len(self.test_cases)}")
        logging.info("=" * 50)
# Validation Functions
# Each takes the finished TestCase and returns True when the expected
# on-disk artifact for that task is present.

def v01(t):
    """T01: quicksort implementation written to sorting_algo.py."""
    return validate_file_contains("sorting_algo.py", "def quicksort")


def v02(t):
    """T02: refactor report produced."""
    return validate_file_exists("refactor_report.md")


def v03(t):
    """T03: security scan findings saved."""
    return validate_file_exists("security_scan.txt")


def v04(t):
    """T04: ETL pipeline exported its CSV."""
    return validate_file_exists("data_export.csv")


def v05(t):
    """T05: monitoring script created."""
    return validate_file_exists("system_monitor.py")


def v06(t):
    """T06: cloud comparison document written."""
    return validate_file_exists("cloud_comparison.md")


def v07(t):
    """T07: network report saved."""
    return validate_file_exists("network_report.txt")


def v08(t):
    """T08: SQL dump generated."""
    return validate_file_exists("db_migration.sql")


def v09(t):
    """T09: header maintenance applied (dummy content check)."""
    return validate_file_contains("src/main.c", "retoor")


def v10(t):
    """T10: generated markdown docs present."""
    return validate_file_exists("CODE_DOCS.md")


def v11(t):
    """T11: JSON log-analysis summary written."""
    return validate_file_exists("log_analysis.json")


def v12(t):
    """T12: virtualenv created (POSIX or Windows layout)."""
    return validate_file_exists("venv_test/bin/python") or validate_file_exists("venv_test/Scripts/python.exe")


def v13(t):
    """T13: git history summary written."""
    return validate_file_exists("git_summary.md")


def v14(t):
    """T14: websocket research demo written."""
    return validate_file_exists("research_and_demo.py")


def v15(t):
    """T15: CSV statistics summary saved."""
    return validate_file_exists("stats_summary.txt")
if __name__ == "__main__":
    benchmark = AgentBenchmark()

    # (id, name, description, task, validator) tuples, registered in order.
    TASKS = [
        # 1. Research & Develop
        ("T01", "Research & Develop", "Research Quicksort and implement it",
         "Research the Quicksort algorithm and write a robust Python implementation to 'sorting_algo.py'.", v01),
        # 2. Code Analysis & Refactor
        ("T02", "Refactor Suggestion", "Index project and suggest refactor",
         "Index the current source directory and identify a complex function in src/agent.c. Suggest a refactor and save it to 'refactor_report.md'.", v02),
        # 3. Security Audit
        ("T03", "Security Audit", "Scan for security issues",
         "Perform a security audit of the current directory using your tools. Look for insecure patterns and save findings to 'security_scan.txt'.", v03),
        # 4. Data ETL Pipeline
        ("T04", "Data ETL", "Fetch, process, store, export",
         "Fetch data from https://jsonplaceholder.typicode.com/users, process it to extract just names and emails, store it in a local SQLite table named 'bench_users', and export it to 'data_export.csv'.", v04),
        # 5. System Monitoring
        ("T05", "System Monitor", "Create monitoring script",
         "Write a Python script 'system_monitor.py' that logs CPU and memory usage to 'usage.log' every 5 seconds. Ensure it handles keyboard interrupts.", v05),
        # 6. Web Research
        ("T06", "Web Research", "Compare cloud providers",
         "Research and compare the latest AI offerings from AWS, Azure, and Google Cloud in 2026. Create a comparison table in 'cloud_comparison.md'.", v06),
        # 7. Network Diagnosis
        ("T07", "Network Diagnosis", "Check connectivity and DNS",
         "Check network connectivity to google.com and github.com. Perform DNS lookups and save a report with latency to 'network_report.txt'.", v07),
        # 8. DB Migration
        ("T08", "DB Migration", "Create and migrate schema",
         "Create an SQLite schema for a library system (books, authors), insert 5 sample records, and generate a SQL dump to 'db_migration.sql'.", v08),
        # 9. Code Maintenance
        ("T09", "Code Maintenance", "Verify headers",
         "Ensure all .c and .h files in the src directory start with the comment '// retoor <retoor@molodetz.nl>'. If missing, add it.", v09),
        # 10. Documentation Generator
        ("T10", "Docs Generator", "Generate markdown docs",
         "Analyze src/agent.c and include/agent.h to extract public function signatures and generate a professional 'CODE_DOCS.md'.", v10),
        # 11. Log Analysis
        ("T11", "Log Analysis", "Parse and categorize logs",
         "Create a dummy log file with 20 lines of mixed INFO and ERROR messages. Parse it using Python to count errors and save a JSON summary to 'log_analysis.json'.", v11),
        # 12. Env Setup
        ("T12", "Env Setup", "Create virtualenv",
         "Create a Python virtual environment named 'venv_test' in the current directory.", v12),
        # 13. Git Summary
        ("T13", "Git Summary", "Summarize git history",
         "Get the last 5 git commit messages and summarize the changes in 'git_summary.md'.", v13),
        # 14. Multi-agent Collaboration
        ("T14", "Agent Collaboration", "Research and Code",
         "Spawn a researcher to find the best way to implement a websocket server in Python, then write a functional demo to 'research_and_demo.py'.", v14),
        # 15. CSV Processing
        ("T15", "CSV Stats", "Process large CSV",
         "Create a CSV 'test_data.csv' with 100 rows of random numbers, calculate mean and standard deviation using Python, and save results to 'stats_summary.txt'.", v15),
    ]

    for case in TASKS:
        benchmark.add_test(TestCase(*case))

    benchmark.run_all()