|
#!/usr/bin/env python3
|
|
import subprocess
|
|
import json
|
|
import os
|
|
import time
|
|
import logging
|
|
import sys
|
|
from datetime import datetime
|
|
from typing import List, Dict, Any
|
|
|
|
# Logging configuration: one flat log file plus a directory for raw agent output.
LOG_FILE = "benchmark_results.log"
AGENT_OUTPUT_DIR = "test_results"
os.makedirs(AGENT_OUTPUT_DIR, exist_ok=True)

# Start every session with a fresh (truncated) log file and a header line.
with open(LOG_FILE, 'w') as log_fh:
    log_fh.write(f"=== Benchmark Session Started at {datetime.now()} ===\n")
|
|
|
|
def log_all(message, end="\n"):
    """Write *message* (plus *end*) to stdout and append it to the log file.

    The log file is opened and closed on every call so that output is
    durable immediately, even if the process dies mid-run.
    """
    text = message + end
    sys.stdout.write(text)
    sys.stdout.flush()
    with open(LOG_FILE, 'a') as log_fh:
        log_fh.write(text)
|
|
|
|
class TestCase:
    """One benchmark task plus the mutable state produced by running it."""

    def __init__(self, id: str, name: str, description: str, task: str, validation_fn: Any):
        # Static definition of the task.
        self.id, self.name = id, name
        self.description, self.task = description, task
        self.validation_fn = validation_fn
        # Run state, filled in by the benchmark driver:
        # result moves from PENDING to PASSED/FAILED/ERROR; output collects
        # the agent's combined stream; execution_time is wall-clock seconds.
        self.result, self.output, self.execution_time = "PENDING", "", 0
|
|
|
|
def validate_file_exists(path):
    """Return True when *path* refers to an existing filesystem entry."""
    exists = os.path.exists(path)
    return exists
|
|
|
|
def validate_file_contains(path, text):
    """Case-insensitively check whether the file at *path* contains *text*.

    Returns False when the file is missing or cannot be read (permission
    error, etc.), so a bad artifact fails validation instead of crashing
    the whole benchmark run. Undecodable bytes are replaced rather than
    raised, since validators only look for ASCII marker strings.
    """
    try:
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            return text.lower() in f.read().lower()
    except OSError:
        # Missing file, directory path, permission denied, ... -> not valid.
        return False
|
|
|
|
class AgentBenchmark:
    """Drives an agent binary through a queue of TestCases and reports results.

    Each task is passed as a single CLI argument to *binary_path*; the agent's
    combined stdout/stderr is streamed live to the console and the log file.
    """

    # Seconds to wait for the process to exit AFTER its output stream closes.
    POST_STREAM_TIMEOUT = 600

    def __init__(self, binary_path: str = "./r"):
        self.binary_path = binary_path
        self.test_cases: List[TestCase] = []

    def add_test(self, test: TestCase):
        """Queue *test* for execution by run_all()."""
        self.test_cases.append(test)

    def run_all(self):
        """Run every queued test in registration order, then print a summary."""
        log_all(f"Starting benchmark with {len(self.test_cases)} tasks...")
        for test in self.test_cases:
            self.run_test(test)

        self.summary()

    def run_test(self, test: TestCase):
        """Execute one test: spawn the agent, stream its output, validate.

        Sets test.result to PASSED/FAILED/ERROR and always records
        test.execution_time (the previous version left it at 0 on errors,
        making summary() misreport errored tests).
        """
        log_all("\n" + "=" * 80)
        log_all(f"--- Running Test {test.id}: {test.name} ---")
        log_all(f"Description: {test.description}")
        log_all(f"Task: {test.task}")
        log_all("=" * 80 + "\n")

        start_time = time.time()
        process = None

        try:
            # Execute the agent with combined stdout/stderr, line-buffered so
            # output can be relayed in real time.
            process = subprocess.Popen(
                [self.binary_path, test.task],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1
            )

            full_output = []
            log_all("[Agent Execution Start]")

            # NOTE: this read loop blocks until the agent closes its stdout,
            # so the timeout below only bounds the wait AFTER the stream ends;
            # it is not a true per-task wall-clock limit.
            for line in process.stdout:
                full_output.append(line)
                log_all(line, end="")  # Real-time log to both

            process.wait(timeout=self.POST_STREAM_TIMEOUT)
            test.execution_time = time.time() - start_time
            test.output = "".join(full_output)

            log_all(f"\n[Agent Execution Finished in {test.execution_time:.2f}s]")

            # Save raw agent output to a dedicated file as well.
            output_file = os.path.join(AGENT_OUTPUT_DIR, f"{test.id}_output.txt")
            with open(output_file, 'w') as f:
                f.write(f"TASK: {test.task}\n")
                f.write("-" * 40 + "\n")
                f.write(test.output)

            # Validate via the test's own callback.
            if test.validation_fn(test):
                test.result = "PASSED"
                log_all(f"RESULT: Test {test.id} PASSED")
            else:
                test.result = "FAILED"
                log_all(f"RESULT: Test {test.id} FAILED validation")

        except Exception as e:
            log_all(f"ERROR executing test {test.id}: {str(e)}")
            test.result = "ERROR"
            # Fix: record elapsed time even on failure so summary() is honest.
            test.execution_time = time.time() - start_time
            # Fix: don't leave a half-dead agent process running after an error.
            if process is not None and process.poll() is None:
                process.kill()

    def summary(self):
        """Print per-test results and the overall pass count."""
        log_all("\n" + "=" * 50)
        log_all("BENCHMARK SUMMARY")
        log_all("=" * 50)
        passed = sum(1 for t in self.test_cases if t.result == "PASSED")
        for t in self.test_cases:
            log_all(f"[{t.result}] {t.id}: {t.name} ({t.execution_time:.2f}s)")

        log_all("=" * 50)
        log_all(f"TOTAL PASSED: {passed}/{len(self.test_cases)}")
        log_all("=" * 50)
|
|
|
|
# Validation Functions
# Each v* callback receives the finished TestCase and returns truthy on success.

def v01(t):
    """T01: quicksort implementation was written."""
    return validate_file_contains("sorting_algo.py", "def quicksort")


def v02(t):
    """T02: refactor report exists."""
    return validate_file_exists("refactor_report.md")


def v03(t):
    """T03: security scan output exists."""
    return validate_file_exists("security_scan.txt")


def v04(t):
    """T04: ETL CSV export exists."""
    return validate_file_exists("data_export.csv")


def v05(t):
    """T05: monitoring script exists."""
    return validate_file_exists("system_monitor.py")


def v06(t):
    """T06: cloud comparison doc exists."""
    return validate_file_exists("cloud_comparison.md")


def v07(t):
    """T07: network diagnosis report exists."""
    return validate_file_exists("network_report.txt")


def v08(t):
    """T08: SQL dump exists."""
    return validate_file_exists("db_migration.sql")


def v09(t):
    """T09: header comment was added to src/main.c."""
    return validate_file_contains("src/main.c", "retoor")


def v10(t):
    """T10: generated docs exist."""
    return validate_file_exists("CODE_DOCS.md")


def v11(t):
    """T11: log-analysis JSON exists."""
    return validate_file_exists("log_analysis.json")


def v12(t):
    """T12: virtualenv was created (POSIX or Windows layout)."""
    return (validate_file_exists("venv_test/bin/python")
            or validate_file_exists("venv_test/Scripts/python.exe"))


def v13(t):
    """T13: git summary exists."""
    return validate_file_exists("git_summary.md")


def v14(t):
    """T14: research demo script exists."""
    return validate_file_exists("research_and_demo.py")


def v15(t):
    """T15: stats summary exists."""
    return validate_file_exists("stats_summary.txt")
|
|
|
|
# New Process/Async Magic Validations
# These check artifacts written by the async/process-management tasks.

def v20(t):
    """T20: both parallel scripts reported completion."""
    return (validate_file_contains("parallel_results.txt", "Script A Done")
            and validate_file_contains("parallel_results.txt", "Script B Done"))


def v19(t):
    """T19: the async exit status 99 was captured."""
    return validate_file_contains("exit_code_status.txt", "99")


def v18(t):
    """T18: the long-running process was confirmed terminated."""
    return validate_file_contains("termination_verify.txt", "terminated successfully")


def v17(t):
    """T17: both the Python and the shell halves of the parallel run succeeded."""
    return (validate_file_contains("mixed_async.txt", "Python OK")
            and validate_file_contains("mixed_async.txt", "Shell OK"))


def v16(t):
    """T16: the task was auto-backgrounded on timeout and later finished."""
    return (validate_file_contains("timeout_bg_test.txt", "backgrounded")
            and validate_file_contains("timeout_bg_test.txt", "finished successfully"))
|
|
|
|
if __name__ == "__main__":
    benchmark = AgentBenchmark()

    # Tests are registered newest-first (T20 down to T01); run_all() executes
    # them in registration order. Each TestCase pairs a natural-language task
    # for the agent with a v* validator that checks the produced artifact.

    # --- Async & Process Magic Tests (New) ---

    benchmark.add_test(TestCase("T20", "Parallel Python Magic", "Run two python scripts async together",
        "Run two different Python scripts asynchronously at the same time. Script A: 'import time; time.sleep(5); print(\"Script A Done\")'. Script B: 'import time; time.sleep(5); print(\"Script B Done\")'. Poll both until finished and write their combined outputs to 'parallel_results.txt'.", v20))

    benchmark.add_test(TestCase("T19", "Async Exit Code Verify", "Verify non-zero exit code async",
        "Run a Python script async that exits with code 99 ('import sys; sys.exit(99)'). Poll it, capture the exit status, and save it to 'exit_code_status.txt'.", v19))

    benchmark.add_test(TestCase("T18", "Process Termination Case", "Start long task and cancel it",
        "Start a shell command 'sleep 100' asynchronously. Verify it is running, then terminate it. Save a confirmation that it was terminated to 'termination_verify.txt'.", v18))

    benchmark.add_test(TestCase("T17", "Mixed Parallel Magic", "Python + Terminal async",
        "Execute a Python script ('print(\"Python OK\")') and a Shell command ('echo Shell OK') in parallel using async mode. Wait for both and save results to 'mixed_async.txt'.", v17))

    benchmark.add_test(TestCase("T16", "Timeout Auto-Background", "Verify sync timeout backgrounds task",
        "Execute 'echo Starting; sleep 5; echo Finished' with a 2 second timeout (NOT async). It should background. Poll it until it finishes and save a report to 'timeout_bg_test.txt' confirming it backgrounded and then finished.", v16))

    # --- Original Tests (Reversed) ---

    benchmark.add_test(TestCase("T15", "CSV Stats", "Process large CSV",
        "Create a CSV 'test_data.csv' with 100 rows of random numbers, calculate mean and standard deviation using Python, and save results to 'stats_summary.txt'.", v15))

    benchmark.add_test(TestCase("T14", "Agent Collaboration", "Research and Code",
        "Spawn a researcher to find the best way to implement a websocket server in Python, then write a functional demo to 'research_and_demo.py'.", v14))

    benchmark.add_test(TestCase("T13", "Git Summary", "Summarize git history",
        "Get the last 5 git commit messages and summarize the changes in 'git_summary.md'.", v13))

    benchmark.add_test(TestCase("T12", "Env Setup", "Create virtualenv",
        "Create a Python virtual environment named 'venv_test' in the current directory.", v12))

    benchmark.add_test(TestCase("T11", "Log Analysis", "Parse and categorize logs",
        "Create a dummy log file with 20 lines of mixed INFO and ERROR messages. Parse it using Python to count errors and save a JSON summary to 'log_analysis.json'.", v11))

    benchmark.add_test(TestCase("T10", "Docs Generator", "Generate markdown docs",
        "Analyze src/agent.c and include/agent.h to extract public function signatures and generate a professional 'CODE_DOCS.md'.", v10))

    benchmark.add_test(TestCase("T09", "Code Maintenance", "Verify headers",
        "Ensure all .c and .h files in the src directory start with the comment '// retoor <retoor@molodetz.nl>'. If missing, add it.", v09))

    benchmark.add_test(TestCase("T08", "DB Migration", "Create and migrate schema",
        "Create an SQLite schema for a library system (books, authors), insert 5 sample records, and generate a SQL dump to 'db_migration.sql'.", v08))

    benchmark.add_test(TestCase("T07", "Network Diagnosis", "Check connectivity and DNS",
        "Check network connectivity to google.com and github.com. Perform DNS lookups and save a report with latency to 'network_report.txt'.", v07))

    benchmark.add_test(TestCase("T06", "Web Research", "Compare cloud providers",
        "Research and compare the latest AI offerings from AWS, Azure, and Google Cloud in 2026. Create a comparison table in 'cloud_comparison.md'.", v06))

    benchmark.add_test(TestCase("T05", "System Monitor", "Create monitoring script",
        "Write a Python script 'system_monitor.py' that logs CPU and memory usage to 'usage.log' every 5 seconds. Ensure it handles keyboard interrupts.", v05))

    benchmark.add_test(TestCase("T04", "Data ETL", "Fetch, process, store, export",
        "Fetch data from https://jsonplaceholder.typicode.com/users, process it to extract just names and emails, store it in a local SQLite table named 'bench_users', and export it to 'data_export.csv'.", v04))

    benchmark.add_test(TestCase("T03", "Security Audit", "Scan for security issues",
        "Perform a security audit of the current directory using your tools. Look for insecure patterns and save findings to 'security_scan.txt'.", v03))

    benchmark.add_test(TestCase("T02", "Refactor Suggestion", "Index project and suggest refactor",
        "Index the current source directory and identify a complex function in src/agent.c. Suggest a refactor and save it to 'refactor_report.md'.", v02))

    benchmark.add_test(TestCase("T01", "Research & Develop", "Research Quicksort and implement it",
        "Research the Quicksort algorithm and write a robust Python implementation to 'sorting_algo.py'.", v01))

    # Execute the whole queue and print the summary table.
    benchmark.run_all()
|