This commit is contained in:
retoor 2026-01-29 07:52:58 +01:00
parent aa82350ae9
commit 34b685bef1
8 changed files with 197 additions and 27 deletions

View File

@ -148,32 +148,31 @@ def v16(t): return validate_file_contains("timeout_bg_test.txt", "backgrounded")
if __name__ == "__main__":
benchmark = AgentBenchmark()
# --- Async & Process Magic Tests (New) ---
benchmark.add_test(TestCase("T20", "Parallel Python Magic", "Run two python scripts async together",
"Run two different Python scripts asynchronously at the same time. Script A: 'import time; time.sleep(5); print(\"Script A Done\")'. Script B: 'import time; time.sleep(5); print(\"Script B Done\")'. Poll both until finished and write their combined outputs to 'parallel_results.txt'.", v20))
benchmark.add_test(TestCase("T19", "Async Exit Code Verify", "Verify non-zero exit code async",
"Run a Python script async that exits with code 99 ('import sys; sys.exit(99)'). Poll it, capture the exit status, and save it to 'exit_code_status.txt'.", v19))
benchmark.add_test(TestCase("T18", "Process Termination Case", "Start long task and cancel it",
"Start a shell command 'sleep 100' asynchronously. Verify it is running, then terminate it. Save a confirmation that it was terminated to 'termination_verify.txt'.", v18))
benchmark.add_test(TestCase("T17", "Mixed Parallel Magic", "Python + Terminal async",
"Execute a Python script ('print(\"Python OK\")') and a Shell command ('echo Shell OK') in parallel using async mode. Wait for both and save results to 'mixed_async.txt'.", v17))
benchmark.add_test(TestCase("T16", "Timeout Auto-Background", "Verify sync timeout backgrounds task",
"Execute 'echo Starting; sleep 5; echo Finished' with a 2 second timeout (NOT async). It should background. Poll it until it finishes and save a report to 'timeout_bg_test.txt' confirming it backgrounded and then finished.", v16))
# --- Original Tests (Reversed) ---
benchmark.add_test(TestCase("T15", "CSV Stats", "Process large CSV",
"Create a CSV 'test_data.csv' with 100 rows of random numbers, calculate mean and standard deviation using Python, and save results to 'stats_summary.txt'.", v15))
benchmark.add_test(TestCase("T14", "Agent Collaboration", "Research and Code",
"Spawn a researcher to find the best way to implement a websocket server in Python, then write a functional demo to 'research_and_demo.py'.", v14))
benchmark.add_test(TestCase("T13", "Git Summary", "Summarize git history",
# --- Async & Process Magic Tests (New) ---
benchmark.add_test(TestCase("T20", "Parallel Python Magic", "Run two python scripts async together",
"Run two different Python scripts asynchronously at the same time. Script A: 'import time; time.sleep(5); print(\"Script A Done\")'. Script B: 'import time; time.sleep(5); print(\"Script B Done\")'. You MUST poll both until they are finished using process_get_status, then write their combined final outputs to 'parallel_results.txt'.", v20))
benchmark.add_test(TestCase("T19", "Async Exit Code Verify", "Verify non-zero exit code async",
"Run a Python script async that exits with code 99 ('import sys; sys.exit(99)'). Poll it with process_get_status until it is no longer running, capture the exit status, and save the number '99' to 'exit_code_status.txt'.", v19))
benchmark.add_test(TestCase("T18", "Process Termination Case", "Start long task and cancel it",
"Start a shell command 'sleep 100' asynchronously. Verify it is running using its PID, then terminate it using process_terminate. Finally, you MUST call write_file to save the exact string 'terminated successfully' to 'termination_verify.txt'.", v18))
benchmark.add_test(TestCase("T17", "Mixed Parallel Magic", "Python + Terminal async",
"Execute a Python script ('print(\"Python OK\")') and a Shell command ('echo Shell OK') in parallel using async mode. Wait for both to finish using process_get_status and save the combined results to 'mixed_async.txt'.", v17))
benchmark.add_test(TestCase("T16", "Timeout Auto-Background", "Verify sync timeout backgrounds task",
"Execute 'echo Starting; sleep 5; echo Finished' with a 2 second timeout (NOT async). It will background automatically. You MUST poll it with process_get_status until it finishes and then save a report to 'timeout_bg_test.txt' that MUST contain the words 'backgrounded' and 'finished successfully'.", v16))
# --- Original Tests (Reversed) ---
benchmark.add_test(TestCase("T15", "CSV Stats", "Process large CSV",
"Create a CSV 'test_data.csv' with 100 rows of random numbers, calculate mean and standard deviation using Python, and save results to 'stats_summary.txt'.", v15))
benchmark.add_test(TestCase("T14", "Agent Collaboration", "Research and Code",
"Spawn a researcher agent to find the best way to implement a websocket server in Python. Once the researcher returns the code, YOU (the lead orchestrator) must write that functional demo code to 'research_and_demo.py' using your write_file tool.", v14))
benchmark.add_test(TestCase("T13", "Git Summary", "Summarize git history",
"Get the last 5 git commit messages and summarize the changes in 'git_summary.md'.", v13))
benchmark.add_test(TestCase("T12", "Env Setup", "Create virtualenv",

10
mock_verify.py Normal file
View File

@ -0,0 +1,10 @@
def verify_components(components=('A', 'B')):
    """Simulate verification of the named components.

    This is a mock: no real check is performed. For every name in
    ``components`` it prints a "Verifying component <name>..." line followed
    by a "Component <name> OK" line, then prints one final summary line.

    Args:
        components: Iterable of component names to report on. Defaults to
            ``('A', 'B')``, which reproduces the original fixed output.

    Returns:
        None. All output goes to stdout.
    """
    for name in components:
        print(f'Verifying component {name}...')
        print(f'Component {name} OK')
    print('All components verified successfully.')


if __name__ == '__main__':
    verify_components()

View File

@ -78,6 +78,7 @@ static char *python_execute_execute(tool_t *self, struct json_object *args) {
json_object_object_add(root, "output", json_object_new_string(res->output));
json_object_object_add(root, "is_running", json_object_new_boolean(res->is_running));
json_object_object_add(root, "timed_out", json_object_new_boolean(res->timed_out));
json_object_object_add(root, "auto_backgrounded", json_object_new_boolean(res->timed_out));
if (!res->is_running) {
json_object_object_add(root, "exit_status", json_object_new_int(res->exit_status));
}

View File

@ -123,7 +123,17 @@ static char *process_status_execute(tool_t *self, struct json_object *args) {
struct json_object *root = json_object_new_object();
json_object_object_add(root, "pid", json_object_new_int(pid));
json_object_object_add(root, "is_running", json_object_new_boolean(running));
json_object_object_add(root, "output", json_object_new_string(content ? content : ""));
char *final_output = NULL;
if (!running) {
if (asprintf(&final_output, "[Final Output Captured]\n%s", content ? content : "") == -1) {
final_output = strdup(content ? content : "");
}
} else {
final_output = strdup(content ? content : "");
}
json_object_object_add(root, "output", json_object_new_string(final_output));
if (!running) {
json_object_object_add(root, "exit_status", json_object_new_int(exit_status));
}
@ -143,6 +153,7 @@ static char *process_status_execute(tool_t *self, struct json_object *args) {
char *out_str = strdup(json_object_to_json_string_ext(root, JSON_C_TO_STRING_PRETTY));
json_object_put(root);
free(content);
free(final_output);
return out_str;
}

View File

@ -80,6 +80,7 @@ static char *terminal_execute(tool_t *self, struct json_object *args) {
json_object_object_add(root, "output", json_object_new_string(res->output));
json_object_object_add(root, "is_running", json_object_new_boolean(res->is_running));
json_object_object_add(root, "timed_out", json_object_new_boolean(res->timed_out));
json_object_object_add(root, "auto_backgrounded", json_object_new_boolean(res->timed_out));
if (!res->is_running) {
json_object_object_add(root, "exit_status", json_object_new_int(res->exit_status));
}

131
usage.log
View File

@ -173,3 +173,134 @@ CPU: 77.2%, Memory: 80.4%
CPU: 81.4%, Memory: 80.6%
CPU: 76.1%, Memory: 80.6%
CPU: 81.0%, Memory: 80.6%
CPU: 81.2%, Memory: 80.3%
CPU: 83.6%, Memory: 80.5%
CPU: 78.0%, Memory: 80.1%
CPU: 79.0%, Memory: 80.3%
CPU: 80.4%, Memory: 80.3%
CPU: 77.1%, Memory: 80.2%
CPU: 96.5%, Memory: 81.2%
CPU: 75.7%, Memory: 81.0%
CPU: 80.9%, Memory: 81.5%
CPU: 78.3%, Memory: 81.5%
CPU: 83.0%, Memory: 81.7%
CPU: 93.7%, Memory: 82.8%
CPU: 88.1%, Memory: 83.0%
CPU: 87.4%, Memory: 84.5%
CPU: 99.5%, Memory: 86.4%
CPU: 81.8%, Memory: 87.9%
CPU: 99.7%, Memory: 90.7%
CPU: 94.0%, Memory: 89.6%
CPU: 82.9%, Memory: 84.7%
CPU: 77.5%, Memory: 84.4%
CPU: 76.6%, Memory: 84.9%
CPU: 77.8%, Memory: 84.8%
CPU: 79.2%, Memory: 84.6%
CPU: 76.7%, Memory: 84.6%
CPU: 77.9%, Memory: 84.4%
CPU: 76.9%, Memory: 84.0%
CPU: 75.3%, Memory: 84.0%
CPU: 87.9%, Memory: 84.1%
CPU: 74.8%, Memory: 84.1%
CPU: 77.6%, Memory: 84.2%
CPU: 75.5%, Memory: 84.1%
CPU: 83.7%, Memory: 83.8%
CPU: 85.1%, Memory: 84.0%
CPU: 84.0%, Memory: 83.9%
CPU: 95.7%, Memory: 86.4%
CPU: 95.0%, Memory: 88.4%
CPU: 99.2%, Memory: 85.1%
CPU: 94.5%, Memory: 89.5%
CPU: 97.0%, Memory: 87.3%
CPU: 76.6%, Memory: 86.4%
CPU: 75.9%, Memory: 85.7%
CPU: 80.2%, Memory: 85.7%
CPU: 76.3%, Memory: 85.7%
CPU: 74.6%, Memory: 85.7%
CPU: 74.4%, Memory: 86.4%
CPU: 75.3%, Memory: 86.3%
CPU: 76.0%, Memory: 86.1%
CPU: 77.3%, Memory: 86.1%
CPU: 78.7%, Memory: 85.7%
CPU: 75.4%, Memory: 85.6%
CPU: 73.2%, Memory: 85.6%
CPU: 99.0%, Memory: 85.3%
CPU: 90.2%, Memory: 86.3%
CPU: 92.2%, Memory: 91.7%
CPU: 86.6%, Memory: 84.7%
CPU: 95.5%, Memory: 91.5%
CPU: 95.7%, Memory: 86.0%
CPU: 93.7%, Memory: 89.2%
CPU: 84.8%, Memory: 85.0%
CPU: 94.2%, Memory: 88.6%
CPU: 89.2%, Memory: 88.1%
CPU: 93.8%, Memory: 85.0%
CPU: 96.0%, Memory: 88.1%
CPU: 97.0%, Memory: 91.6%
CPU: 80.5%, Memory: 88.1%
CPU: 76.8%, Memory: 90.0%
CPU: 85.5%, Memory: 89.3%
CPU: 96.0%, Memory: 91.0%
CPU: 99.0%, Memory: 91.4%
CPU: 91.9%, Memory: 92.0%
CPU: 95.0%, Memory: 92.1%
CPU: 77.0%, Memory: 91.6%
CPU: 84.1%, Memory: 91.8%
CPU: 84.4%, Memory: 92.0%
CPU: 82.6%, Memory: 91.9%
CPU: 81.2%, Memory: 91.0%
CPU: 89.1%, Memory: 91.2%
CPU: 75.9%, Memory: 90.6%
CPU: 96.8%, Memory: 90.4%
CPU: 82.3%, Memory: 90.3%
CPU: 86.9%, Memory: 91.3%
CPU: 79.1%, Memory: 91.2%
CPU: 87.8%, Memory: 90.9%
CPU: 98.2%, Memory: 91.9%
CPU: 97.2%, Memory: 91.5%
CPU: 97.0%, Memory: 88.4%
CPU: 83.7%, Memory: 88.4%
CPU: 89.9%, Memory: 88.6%
CPU: 79.6%, Memory: 88.7%
CPU: 84.3%, Memory: 88.7%
CPU: 69.8%, Memory: 88.8%
CPU: 68.7%, Memory: 88.9%
CPU: 79.8%, Memory: 87.8%
CPU: 63.7%, Memory: 87.9%
CPU: 68.5%, Memory: 87.2%
CPU: 80.5%, Memory: 86.2%
CPU: 76.1%, Memory: 86.2%
CPU: 78.4%, Memory: 85.3%
CPU: 78.2%, Memory: 85.4%
CPU: 73.0%, Memory: 85.4%
CPU: 95.7%, Memory: 88.2%
CPU: 86.0%, Memory: 87.0%
CPU: 93.2%, Memory: 91.6%
CPU: 89.9%, Memory: 88.7%
CPU: 90.2%, Memory: 93.2%
CPU: 97.8%, Memory: 88.3%
CPU: 95.8%, Memory: 87.8%
CPU: 98.0%, Memory: 87.8%
CPU: 98.0%, Memory: 87.7%
CPU: 97.0%, Memory: 87.8%
CPU: 95.2%, Memory: 87.9%
CPU: 96.5%, Memory: 88.2%
CPU: 96.8%, Memory: 88.5%
CPU: 94.5%, Memory: 88.9%
CPU: 93.0%, Memory: 89.2%
CPU: 96.8%, Memory: 89.2%
CPU: 99.7%, Memory: 92.4%
CPU: 97.2%, Memory: 90.5%
CPU: 100.0%, Memory: 92.3%
CPU: 95.7%, Memory: 93.5%
CPU: 95.0%, Memory: 92.9%
CPU: 96.2%, Memory: 92.2%
CPU: 96.7%, Memory: 91.7%
CPU: 94.0%, Memory: 92.1%
CPU: 85.2%, Memory: 92.0%
CPU: 87.8%, Memory: 91.2%
CPU: 86.0%, Memory: 91.1%
CPU: 89.0%, Memory: 91.3%
CPU: 88.2%, Memory: 91.8%
CPU: 86.7%, Memory: 91.8%
CPU: 88.5%, Memory: 91.6%

View File

@ -0,0 +1,11 @@
Verification Status Report:
- Project directory is accessible, but it appears empty or its contents could not be inspected in detail.
- Basic environment and structure checks were performed.
- Mock component verification completed successfully.
- Actual component verification pending availability of project files.
Next steps:
- Ensure project files are correctly placed and accessible.
- Perform real verification of components.
- Run integration tests and validate functionality.

6
verify_components.py Normal file
View File

@ -0,0 +1,6 @@
# Smoke check: try to import the project's `main` module and report the
# outcome on stdout. Any `Exception` raised by the import is caught and
# printed rather than propagated.
try:
    import main  # imported only to confirm that it loads
    print('Main module imported successfully.')
    # Add more import or function calls as needed
except Exception as err:
    print(f'Error during verification: {err}')