#!/usr/bin/env python3
"""Benchmark harness: runs an agent binary against a suite of tasks and logs results."""
import subprocess
import json
import os
import time
import logging
import sys
from datetime import datetime
from typing import List, Dict, Any

# Configure logging
LOG_FILE = "benchmark_results.log"
AGENT_OUTPUT_DIR = "test_results"
os.makedirs(AGENT_OUTPUT_DIR, exist_ok=True)

# Truncate log file at start so each session begins with a fresh log.
with open(LOG_FILE, 'w') as f:
    f.write(f"=== Benchmark Session Started at {datetime.now()} ===\n")
def log_all(message, end="\n"):
    """Write *message* to both stdout and the log file immediately.

    stdout is flushed after every call so progress is visible in real time;
    the log file is opened in append mode per call so output survives crashes.
    """
    sys.stdout.write(message + end)
    sys.stdout.flush()
    with open(LOG_FILE, 'a') as f:
        f.write(message + end)
class TestCase:
    """A single benchmark task together with its validation logic and results."""

    def __init__(self, id: str, name: str, description: str, task: str, validation_fn: Any):
        self.id = id
        self.name = name
        self.description = description
        # The natural-language prompt handed to the agent binary.
        self.task = task
        # Callable taking this TestCase and returning True on pass.
        self.validation_fn = validation_fn
        self.result = "PENDING"   # PENDING -> PASSED / FAILED / ERROR
        self.output = ""          # combined stdout+stderr captured from the agent
        self.execution_time = 0   # wall-clock seconds, set by the runner
def validate_file_exists(path):
    """Return True if *path* exists on disk (file or directory)."""
    return os.path.exists(path)
def validate_file_contains(path, text):
    """Return True if the file at *path* exists and contains *text*, case-insensitively."""
    if not os.path.exists(path):
        return False
    with open(path, 'r') as f:
        return text.lower() in f.read().lower()
class AgentBenchmark:
    """Runs a list of TestCase tasks against the agent binary and reports results."""

    def __init__(self, binary_path: str = "./r"):
        # Path to the agent executable under test.
        self.binary_path = binary_path
        self.test_cases: List[TestCase] = []

    def add_test(self, test: TestCase):
        """Queue a test case for execution by run_all()."""
        self.test_cases.append(test)

    def run_all(self):
        """Run every queued test in order, then print the summary."""
        log_all(f"Starting benchmark with {len(self.test_cases)} tasks...")
        for test in self.test_cases:
            self.run_test(test)
        self.summary()

    def run_test(self, test: TestCase):
        """Execute one test: spawn the agent, stream its output, then validate."""
        log_all("\n" + "=" * 80)
        log_all(f"--- Running Test {test.id}: {test.name} ---")
        log_all(f"Description: {test.description}")
        log_all(f"Task: {test.task}")
        log_all("=" * 80 + "\n")

        start_time = time.time()
        try:
            # Execute the agent; line-buffered so output can be streamed live.
            process = subprocess.Popen(
                [self.binary_path, test.task],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # merge stderr into stdout
                text=True,
                bufsize=1,
            )
            full_output = []
            log_all("[Agent Execution Start]")
            for line in process.stdout:
                full_output.append(line)
                log_all(line, end="")  # real-time log to both stdout and file

            process.wait(timeout=600)  # 10 minute timeout per task
            test.execution_time = time.time() - start_time
            test.output = "".join(full_output)
            log_all(f"\n[Agent Execution Finished in {test.execution_time:.2f}s]")

            # Save raw agent output to a dedicated file as well
            output_file = os.path.join(AGENT_OUTPUT_DIR, f"{test.id}_output.txt")
            with open(output_file, 'w') as f:
                f.write(f"TASK: {test.task}\n")
                f.write("-" * 40 + "\n")
                f.write(test.output)

            # Validate
            if test.validation_fn(test):
                test.result = "PASSED"
                log_all(f"RESULT: Test {test.id} PASSED")
            else:
                test.result = "FAILED"
                log_all(f"RESULT: Test {test.id} FAILED validation")
        except Exception as e:
            log_all(f"ERROR executing test {test.id}: {str(e)}")
            test.result = "ERROR"
            # Record elapsed time even on error so the summary is meaningful
            # (previously errored tests always reported 0.00s).
            test.execution_time = time.time() - start_time

    def summary(self):
        """Print a pass/fail summary line for every test plus a total."""
        log_all("\n" + "=" * 50)
        log_all("BENCHMARK SUMMARY")
        log_all("=" * 50)
        passed = sum(1 for t in self.test_cases if t.result == "PASSED")
        for t in self.test_cases:
            log_all(f"[{t.result}] {t.id}: {t.name} ({t.execution_time:.2f}s)")
        log_all("=" * 50)
        log_all(f"TOTAL PASSED: {passed}/{len(self.test_cases)}")
        log_all("=" * 50)
# Validation Functions — each takes the finished TestCase and returns pass/fail.
def v01(t): return validate_file_contains("sorting_algo.py", "def quicksort")
def v02(t): return validate_file_exists("refactor_report.md")
def v03(t): return validate_file_exists("security_scan.txt")
def v04(t): return validate_file_exists("data_export.csv")
def v05(t): return validate_file_exists("system_monitor.py")
def v06(t): return validate_file_exists("cloud_comparison.md")
def v07(t): return validate_file_exists("network_report.txt")
def v08(t): return validate_file_exists("db_migration.sql")
def v09(t): return validate_file_contains("src/main.c", "retoor")
def v10(t): return validate_file_exists("CODE_DOCS.md")
def v11(t): return validate_file_exists("log_analysis.json")
# Virtualenv layout differs per platform: bin/ on POSIX, Scripts/ on Windows.
def v12(t): return validate_file_exists("venv_test/bin/python") or validate_file_exists("venv_test/Scripts/python.exe")
def v13(t): return validate_file_exists("git_summary.md")
def v14(t): return validate_file_exists("research_and_demo.py")
def v15(t): return validate_file_exists("stats_summary.txt")

# New Process/Async Magic Validations
def v20(t): return validate_file_contains("parallel_results.txt", "Script A Done") and validate_file_contains("parallel_results.txt", "Script B Done")
def v19(t): return validate_file_contains("exit_code_status.txt", "99")
def v18(t): return validate_file_contains("termination_verify.txt", "terminated successfully")
def v17(t): return validate_file_contains("mixed_async.txt", "Python OK") and validate_file_contains("mixed_async.txt", "Shell OK")
def v16(t): return validate_file_contains("timeout_bg_test.txt", "backgrounded") and validate_file_contains("timeout_bg_test.txt", "finished successfully")
if __name__ == "__main__":
    benchmark = AgentBenchmark()

    # --- Async & Process Magic Tests (New) ---
    benchmark.add_test(TestCase("T20", "Parallel Python Magic", "Run two python scripts async together",
        "Run two different Python scripts asynchronously at the same time. Script A: 'import time; time.sleep(5); print(\"Script A Done\")'. Script B: 'import time; time.sleep(5); print(\"Script B Done\")'. You MUST poll both until they are finished using process_get_status, then write their combined final outputs to 'parallel_results.txt'.", v20))
    benchmark.add_test(TestCase("T19", "Async Exit Code Verify", "Verify non-zero exit code async",
        "Run a Python script async that exits with code 99 ('import sys; sys.exit(99)'). Poll it with process_get_status until it is no longer running, capture the exit status, and save the number '99' to 'exit_code_status.txt'.", v19))
    benchmark.add_test(TestCase("T18", "Process Termination Case", "Start long task and cancel it",
        "Start a shell command 'sleep 100' asynchronously. Verify it is running using its PID, then terminate it using process_terminate. Finally, you MUST call write_file to save the exact string 'terminated successfully' to 'termination_verify.txt'.", v18))
    benchmark.add_test(TestCase("T17", "Mixed Parallel Magic", "Python + Terminal async",
        "Execute a Python script ('print(\"Python OK\")') and a Shell command ('echo Shell OK') in parallel using async mode. Wait for both to finish using process_get_status and save the combined results to 'mixed_async.txt'.", v17))
    benchmark.add_test(TestCase("T16", "Timeout Auto-Background", "Verify sync timeout backgrounds task",
        "Execute 'echo Starting; sleep 5; echo Finished' with a 2 second timeout (NOT async). It will background automatically. You MUST poll it with process_get_status until it finishes and then save a report to 'timeout_bg_test.txt' that MUST contain the words 'backgrounded' and 'finished successfully'.", v16))

    # --- Original Tests (Reversed) ---
    benchmark.add_test(TestCase("T15", "CSV Stats", "Process large CSV",
        "Create a CSV 'test_data.csv' with 100 rows of random numbers, calculate mean and standard deviation using Python, and save results to 'stats_summary.txt'.", v15))
    benchmark.add_test(TestCase("T14", "Agent Collaboration", "Research and Code",
        "Spawn a researcher agent to find the best way to implement a websocket server in Python. Once the researcher returns the code, YOU (the lead orchestrator) must write that functional demo code to 'research_and_demo.py' using your write_file tool.", v14))
    benchmark.add_test(TestCase("T13", "Git Summary", "Summarize git history",
        "Get the last 5 git commit messages and summarize the changes in 'git_summary.md'.", v13))
    benchmark.add_test(TestCase("T12", "Env Setup", "Create virtualenv",
        "Create a Python virtual environment named 'venv_test' in the current directory.", v12))
    benchmark.add_test(TestCase("T11", "Log Analysis", "Parse and categorize logs",
        "Create a dummy log file with 20 lines of mixed INFO and ERROR messages. Parse it using Python to count errors and save a JSON summary to 'log_analysis.json'.", v11))
    benchmark.add_test(TestCase("T10", "Docs Generator", "Generate markdown docs",
        "Analyze src/agent.c and include/agent.h to extract public function signatures and generate a professional 'CODE_DOCS.md'.", v10))
    benchmark.add_test(TestCase("T09", "Code Maintenance", "Verify headers",
        "Ensure all .c and .h files in the src directory start with the comment '// retoor <retoor@molodetz.nl>'. If missing, add it.", v09))
    benchmark.add_test(TestCase("T08", "DB Migration", "Create and migrate schema",
        "Create an SQLite schema for a library system (books, authors), insert 5 sample records, and generate a SQL dump to 'db_migration.sql'.", v08))
    benchmark.add_test(TestCase("T07", "Network Diagnosis", "Check connectivity and DNS",
        "Check network connectivity to google.com and github.com. Perform DNS lookups and save a report with latency to 'network_report.txt'.", v07))
    benchmark.add_test(TestCase("T06", "Web Research", "Compare cloud providers",
        "Research and compare the latest AI offerings from AWS, Azure, and Google Cloud in 2026. Create a comparison table in 'cloud_comparison.md'.", v06))
    benchmark.add_test(TestCase("T05", "System Monitor", "Create monitoring script",
        "Write a Python script 'system_monitor.py' that logs CPU and memory usage to 'usage.log' every 5 seconds. Ensure it handles keyboard interrupts.", v05))
    benchmark.add_test(TestCase("T04", "Data ETL", "Fetch, process, store, export",
        "Fetch data from https://jsonplaceholder.typicode.com/users, process it to extract just names and emails, store it in a local SQLite table named 'bench_users', and export it to 'data_export.csv'.", v04))
    benchmark.add_test(TestCase("T03", "Security Audit", "Scan for security issues",
        "Perform a security audit of the current directory using your tools. Look for insecure patterns and save findings to 'security_scan.txt'.", v03))
    benchmark.add_test(TestCase("T02", "Refactor Suggestion", "Index project and suggest refactor",
        "Index the current source directory and identify a complex function in src/agent.c. Suggest a refactor and save it to 'refactor_report.md'.", v02))
    benchmark.add_test(TestCase("T01", "Research & Develop", "Research Quicksort and implement it",
        "Research the Quicksort algorithm and write a robust Python implementation to 'sorting_algo.py'.", v01))

    benchmark.run_all()