# Register benchmark case T20: two Python scripts started asynchronously and
# polled until both finish. `v20` is defined outside this excerpt
# (presumably this case's validator -- confirm against its definition).
benchmark.add_test(
    TestCase(
        "T20",
        "Parallel Python Magic",
        "Run two python scripts async together",
        # Adjacent literals are joined at compile time into one prompt string.
        (
            "Run two different Python scripts asynchronously at the same time. "
            "Script A: 'import time; time.sleep(5); print(\"Script A Done\")'. "
            "Script B: 'import time; time.sleep(5); print(\"Script B Done\")'. "
            "You MUST poll both until they are finished using process_get_status, "
            "then write their combined final outputs to 'parallel_results.txt'."
        ),
        v20,
    )
)
"Run a Python script async that exits with code 99 ('import sys; sys.exit(99)'). Poll it with process_get_status until it is no longer running, capture the exit status, and save the number '99' to 'exit_code_status.txt'.",v19))
# Register benchmark case T18: start a long-running shell command, confirm it
# is running, then terminate it. `v18` is defined outside this excerpt
# (presumably this case's validator -- confirm against its definition).
benchmark.add_test(
    TestCase(
        "T18",
        "Process Termination Case",
        "Start long task and cancel it",
        # Adjacent literals are joined at compile time into one prompt string.
        (
            "Start a shell command 'sleep 100' asynchronously. "
            "Verify it is running using its PID, then terminate it using process_terminate. "
            "Finally, you MUST call write_file to save the exact string "
            "'terminated successfully' to 'termination_verify.txt'."
        ),
        v18,
    )
)
"Execute a Python script ('print(\"Python OK\")') and a Shell command ('echo Shell OK') in parallel using async mode. Wait for both to finish using process_get_status and save the combined results to 'mixed_async.txt'.",v17))
"Execute 'echo Starting; sleep 5; echo Finished' with a 2 second timeout (NOT async). It will background automatically. You MUST poll it with process_get_status until it finishes and then save a report to 'timeout_bg_test.txt' that MUST contain the words 'backgrounded' and 'finished successfully'.",v16))
# --- Original Tests (Reversed) ---
# Register benchmark case T15: generate a CSV and summarise it with Python.
# `v15` is defined outside this excerpt (presumably this case's validator --
# confirm against its definition).
benchmark.add_test(
    TestCase(
        "T15",
        "CSV Stats",
        "Process large CSV",
        # Adjacent literals are joined at compile time into one prompt string.
        (
            "Create a CSV 'test_data.csv' with 100 rows of random numbers, "
            "calculate mean and standard deviation using Python, "
            "and save results to 'stats_summary.txt'."
        ),
        v15,
    )
)
# Register benchmark case T14: delegate research to a sub-agent, then have the
# orchestrator write the resulting demo file itself. `v14` is defined outside
# this excerpt (presumably this case's validator -- confirm).
benchmark.add_test(
    TestCase(
        "T14",
        "Agent Collaboration",
        "Research and Code",
        # Adjacent literals are joined at compile time into one prompt string.
        (
            "Spawn a researcher agent to find the best way to implement a websocket server in Python. "
            "Once the researcher returns the code, YOU (the lead orchestrator) must write "
            "that functional demo code to 'research_and_demo.py' using your write_file tool."
        ),
        v14,
    )
)
"Create a dummy log file with 20 lines of mixed INFO and ERROR messages. Parse it using Python to count errors and save a JSON summary to 'log_analysis.json'.",v11))
"Write a Python script 'system_monitor.py' that logs CPU and memory usage to 'usage.log' every 5 seconds. Ensure it handles keyboard interrupts.",v05))
"Fetch data from https://jsonplaceholder.typicode.com/users, process it to extract just names and emails, store it in a local SQLite table named 'bench_users', and export it to 'data_export.csv'.",v04))
# Register benchmark case T03: audit the current directory and write findings
# to a report file. `v03` is defined outside this excerpt (presumably this
# case's validator -- confirm against its definition).
benchmark.add_test(
    TestCase(
        "T03",
        "Security Audit",
        "Scan for security issues",
        # Adjacent literals are joined at compile time into one prompt string.
        (
            "Perform a security audit of the current directory using your tools. "
            "Look for insecure patterns and save findings to 'security_scan.txt'."
        ),
        v03,
    )
)
# Register benchmark case T02: index the source tree, pick a complex function
# in src/agent.c and write a refactor suggestion. `v02` is defined outside
# this excerpt (presumably this case's validator -- confirm).
benchmark.add_test(
    TestCase(
        "T02",
        "Refactor Suggestion",
        "Index project and suggest refactor",
        # Adjacent literals are joined at compile time into one prompt string.
        (
            "Index the current source directory and identify a complex function in src/agent.c. "
            "Suggest a refactor and save it to 'refactor_report.md'."
        ),
        v02,
    )
)
# Register benchmark case T01: research Quicksort and write a Python
# implementation to a file. `v01` is defined outside this excerpt
# (presumably this case's validator -- confirm against its definition).
benchmark.add_test(
    TestCase(
        "T01",
        "Research & Develop",
        "Research Quicksort and implement it",
        # Adjacent literals are joined at compile time into one prompt string.
        (
            "Research the Quicksort algorithm and write a robust Python implementation "
            "to 'sorting_algo.py'."
        ),
        v01,
    )
)