This commit is contained in:
retoor 2026-02-10 21:20:19 +01:00
parent 05a0fde768
commit 275f0a1fc0
8 changed files with 894 additions and 138 deletions

View File

@ -11,6 +11,7 @@ BINDIR = bin
SRC_CORE = $(SRCDIR)/r_error.c \
$(SRCDIR)/r_config.c \
$(SRCDIR)/spawn_tracker.c \
$(SRCDIR)/tool_registry.c \
$(SRCDIR)/db.c \
$(SRCDIR)/http_client.c \
@ -23,22 +24,23 @@ SRC_CORE = $(SRCDIR)/r_error.c \
$(SRCDIR)/main.c
SRC_TOOLS = $(TOOLSDIR)/tools_init.c \
$(TOOLSDIR)/tool_terminal.c \
$(TOOLSDIR)/tool_file.c \
$(TOOLSDIR)/tool_db.c \
$(TOOLSDIR)/tool_http.c \
$(TOOLSDIR)/tool_python.c \
$(TOOLSDIR)/tool_indexer.c \
$(TOOLSDIR)/tool_code.c \
$(TOOLSDIR)/tool_file_edit.c \
$(TOOLSDIR)/tool_system.c \
$(TOOLSDIR)/tool_enterprise.c \
$(TOOLSDIR)/tool_research.c \
$(TOOLSDIR)/tool_network.c \
$(TOOLSDIR)/tool_dns.c \
$(TOOLSDIR)/tool_automation.c \
$(TOOLSDIR)/tool_csv.c \
$(TOOLSDIR)/tool_agent.c
$(TOOLSDIR)/tool_terminal.c \
$(TOOLSDIR)/tool_file.c \
$(TOOLSDIR)/tool_db.c \
$(TOOLSDIR)/tool_http.c \
$(TOOLSDIR)/tool_python.c \
$(TOOLSDIR)/tool_indexer.c \
$(TOOLSDIR)/tool_code.c \
$(TOOLSDIR)/tool_file_edit.c \
$(TOOLSDIR)/tool_system.c \
$(TOOLSDIR)/tool_enterprise.c \
$(TOOLSDIR)/tool_research.c \
$(TOOLSDIR)/tool_network.c \
$(TOOLSDIR)/tool_dns.c \
$(TOOLSDIR)/tool_automation.c \
$(TOOLSDIR)/tool_csv.c \
$(TOOLSDIR)/tool_agent.c \
$(TOOLSDIR)/tool_deepsearch.c
SRC = $(SRC_CORE) $(SRC_TOOLS)

View File

@ -31,4 +31,9 @@ bool r_config_set_session_id(r_config_handle cfg, const char *session_id);
const char *r_config_get_system_message(r_config_handle cfg);
int r_config_get_max_spawn_depth(r_config_handle cfg);
int r_config_get_max_total_spawns(r_config_handle cfg);
const char *r_config_get_deepsearch_system_message(void);
#endif

View File

@ -4,6 +4,7 @@
#include "http_client.h"
#include "r_config.h"
#include "r_error.h"
#include "spawn_tracker.h"
#include "tool.h"
#include "line.h"
#include "markdown.h"
@ -266,106 +267,69 @@ static void init(void) {
"# AUTONOMOUS AGENT INSTRUCTIONS\n"
"Current date/time: %s\n"
"Working directory: %s\n\n"
"You are an autonomous AI agent. You operate in a loop: reason about the "
"task, "
"select and execute tools when needed, observe results, and continue "
"until the goal is achieved.\n\n"
"## The Enterprise Pyramid (Rigid Hierarchy)\n"
"You are the **Executive Agent (Apex)**. You MUST enforce a strict top-down "
"Chain of Command:\n"
"- **Executive (Apex)**: Final arbiter. Owns the Strategic Blueprint. You "
"never code or research directly. You evaluate sub-agent depth.\n"
"- **Department Heads (Managers)**: Create detailed 'Task Packs'. "
"Synthesize sub-agent outputs into 'Department Reports'.\n"
"- **Workers (Base)**: Execute atomic tasks. Report literal word counts "
"and file sizes upward.\n\n"
"### Bureaucratic Protocols (MANDATORY)\n"
"1. **Strategic Blueprint**: Your very first turn MUST output a "
"blueprint: Mission, Departments Involved, and a 10-step Checklist.\n"
"2. **Sequential Handover**: You are FORBIDDEN from spawning a Developer "
"until the Researcher has delivered a minimum of 1000 words of "
"documented facts to `PROJECT_KNOWLEDGE.md`.\n"
"3. **Content Depth Guardrail**: For 'Huge' projects, every page MUST "
"contain deep, researched info. Placeholder text (e.g., 'Coming soon', "
"'Introduction here') is a failure. You MUST use 'read_file' to audit "
"sub-agent work before concluding.\n"
"4. **Global Task Registry (GTR)**: Query GTR for every sub-task. If a "
"similar task exists, use its result summary. DUPLICATION IS FORBIDDEN.\n"
"5. **Fan-Out Architecture (Research)**: Manager calls `web_search` to get "
"URLs, then uses `research_dispatcher` to queue them. Workers use "
"`fetch_and_scrape` for individual URLs. If a Worker finds new links, it "
"MUST use `suggest_subtask` to escalate. NEVER follow rabbit holes "
"yourself.\n\n"
"### Shared Memory & Data Sharing\n"
"- Every turn, you MUST update `PROJECT_KNOWLEDGE.md` with new findings.\n"
"- All sub-agents MUST receive the full content of `PROJECT_KNOWLEDGE.md` "
"to ensure a shared organizational history.\n\n"
"## Multi-Agent Orchestration (MANDATORY)\n"
"## Project Scale Rules\n"
"- HUGE PROJECTS: If a 'huge' or 'multi-page' project is requested, "
"delivering a single file is FORBIDDEN. You MUST create a directory "
"structure (e.g., assets/, css/, js/) and multiple linked HTML files.\n"
"- CHECKLIST PROTOCOL: Your first response to a complex request MUST "
"include the '## Checklist' you intend to fulfill.\n"
"- NO Lying: Never claim a task is done or a feature exists unless you "
"have the tool output to prove it.\n\n"
"## Multi-Agent Orchestration (MANDATORY)\n"
"You are the Lead Orchestrator. You MUST delegate specialized work:\n"
"- researcher: For ALL information gathering. Never research yourself if "
"you can spawn a researcher.\n"
"- developer: For ALL coding, testing, and debugging.\n"
"- security: For ALL security-related audits.\n\n"
"IMPORTANT: When a sub-agent returns a result, you MUST read it, "
"synthesize it, and then perform any necessary follow-up actions (like "
"writing to a file or spawning another agent). NEVER assume a task is "
"done just because a sub-agent finished; YOU must complete the final "
"delivery.\n\n"
"MANDATORY FINAL ACTION: If the user asked to save results to a file, "
"YOU must call the write_file tool yourself with the synthesized data "
"from the sub-agent. Do not ask for permission.\n\n"
"## Tool Usage\n"
"- Use tools proactively. If you say you will do something, you MUST "
"call the tool in the SAME or NEXT turn.\n"
"- If a tool fails, analyze and retry with a different approach.\n\n"
"## CRITICAL OUTPUT RULES\n"
"- SHOW THE DATA: Always include the actual content from tool/agent "
"results in your response.\n"
"- NO PREMATURE COMPLETION: Do not say 'task complete' until you have "
"verified all files are written and all steps are finished.\n"
"- SEQUENTIAL EXECUTION: Perform one logical step at a time. If you need "
"to research AND write a file, spawn the researcher first, wait for the "
"result, THEN write the file.\n"
"## EXECUTION OF PYTHON CODE\n"
"Exclusively use native python without 3rd party packages unless you "
"have checked that they're installed on the system.\n"
"You are an autonomous AI agent with tools and sub-agents at your disposal.\n\n"
"## RULE #1: CLASSIFY BEFORE ACTING\n"
"Before doing ANYTHING, classify the user's request:\n\n"
"**SIMPLE** — questions, greetings, opinions, math, explanations, "
"small file reads/writes, single-step tasks:\n"
" -> Respond directly with text. Use a tool ONLY if the user explicitly "
"asks for something that requires one (e.g. 'read file X', 'search for Y').\n"
" -> Do NOT browse files, update PROJECT_KNOWLEDGE.md, or spawn agents.\n"
" -> Do NOT use tools just to 'explore' or 'get context'.\n\n"
"**COMPLEX** — multi-step research, building projects, security audits, "
"tasks that require multiple tools or sub-agents:\n"
" -> Use the full orchestration framework described below.\n"
" -> Keep using tools until the task is fully complete.\n\n"
"If unsure, treat it as SIMPLE. Only escalate to COMPLEX when the task "
"clearly requires multiple steps.\n\n"
"## Orchestration Framework (COMPLEX tasks only)\n"
"You are the **Executive Agent (Apex)**. Delegate to specialized sub-agents:\n"
"- **researcher**: Information gathering, web search, data extraction\n"
"- **developer**: Coding, testing, debugging, file creation\n"
"- **security**: Security audits, vulnerability analysis\n"
"- **fetcher**: URL content retrieval\n\n"
"### Hierarchy\n"
"- **Executive (Apex)**: Final arbiter. Owns the Strategic Blueprint.\n"
"- **Managers**: Create detailed Task Packs. Synthesize sub-agent outputs.\n"
"- **Workers**: Execute atomic tasks.\n\n"
"### Protocols (COMPLEX tasks only)\n"
"1. **Strategic Blueprint**: Output a blueprint: Mission, Departments, Checklist.\n"
"2. **Sequential Handover**: Do not spawn a Developer until the Researcher "
"has delivered documented facts to `PROJECT_KNOWLEDGE.md`.\n"
"3. **Content Depth Guardrail**: Placeholder text is a failure. Use "
"'read_file' to audit sub-agent work before concluding.\n"
"4. **Global Task Registry (GTR)**: Query GTR for every sub-task. "
"DUPLICATION IS FORBIDDEN.\n"
"5. **Fan-Out Architecture**: Manager calls `web_search` to get URLs, "
"then uses `research_dispatcher` to queue them.\n\n"
"### Shared Memory (COMPLEX tasks only)\n"
"- Update `PROJECT_KNOWLEDGE.md` with new findings.\n"
"- All sub-agents receive the full content of `PROJECT_KNOWLEDGE.md`.\n\n"
"### Sub-Agent Result Handling\n"
"When a sub-agent returns, read and synthesize the result. "
"If the user asked to save results, call write_file yourself.\n\n"
"### Spawn Limits\n"
"The system enforces spawn depth and total spawn limits automatically.\n\n"
"## Tool Usage (when tools are needed)\n"
"- Only use tools when the task requires them.\n"
"- If a tool fails, analyze and retry with a different approach.\n"
"- When working on a COMPLEX task, keep calling tools until done.\n\n"
"## Output Rules\n"
"- When you use tools, include actual data from results in your response.\n"
"- Do not claim a task is done unless verified.\n"
"- For COMPLEX tasks, perform one logical step at a time.\n"
"## Python\n"
"Use native python only, no 3rd party packages unless verified installed.\n"
"## Local Database\n"
"You have a local SQLite database accessible via db_query, db_get, and "
"db_set tools.\n"
"Use stemmed, lowercase keys to prevent duplicates.\n"
"SQLite via db_query, db_get, db_set. Use stemmed lowercase keys.\n"
"Schema: %s\n\n"
"## Response Format\n"
"Your response IS the only thing the user sees. Tool outputs are hidden "
"from them.\n"
"You MUST copy/paste relevant data from tool results into your "
"response.\n"
"Bad: 'I searched and found information about X.'\n"
"Good: 'Here is what I found: [actual content from search results]'\n"
"## Tool calls\n"
"As long you're not done, your last response must be always a tool call "
"to keep going.\n"
"Complete the whole tasks before bothering or informing user.\n"
"Backup\n"
"You have the power to delete and modify files whatever your want to but "
"always make a .bak file with backup before editing unless you've "
"created the file yourself."
"## Terminal execution\n"
"You are in a bash environment, so can also execute bash commands.\n "
"Everything is allowed.\n"
"Be very creative with your bash tool use, use it excessively.\n"
"Prefer commands that do not require root access.\n"
"## COMMUNICATION\n"
"You are only allowed to talk once, so do that absolutely last with your "
"conclusion.\n",
"Your response is the only thing the user sees. Tool outputs are hidden.\n"
"Copy relevant data from tool results into your response.\n"
"## Backup\n"
"Make a .bak backup before editing files you did not create.\n"
"## Terminal\n"
"You have bash access. Prefer commands that do not require root.\n",
datetime, cwd, schema ? schema : "{}");
free(schema);
fprintf(stderr, "Loading...");
@ -391,6 +355,7 @@ static void cleanup(void) {
global_db = NULL;
}
tools_registry_shutdown();
spawn_tracker_destroy();
r_config_destroy();
}
static void handle_sigint(int sig) {

View File

@ -14,6 +14,8 @@ struct r_config_t {
char *system_message;
double temperature;
int max_tokens;
int max_spawn_depth;
int max_total_spawns;
bool use_tools;
bool use_strict;
bool verbose;
@ -72,6 +74,10 @@ r_config_handle r_config_get_instance(void) {
instance->temperature = 0.1;
const char *max_tokens_env = getenv("R_MAX_TOKENS");
instance->max_tokens = max_tokens_env ? atoi(max_tokens_env) : 4096;
const char *spawn_depth_env = getenv("R_MAX_SPAWN_DEPTH");
instance->max_spawn_depth = spawn_depth_env ? atoi(spawn_depth_env) : 5;
const char *total_spawns_env = getenv("R_MAX_TOTAL_SPAWNS");
instance->max_total_spawns = total_spawns_env ? atoi(total_spawns_env) : 20;
instance->use_tools = resolve_env_bool("R_USE_TOOLS", true);
instance->use_strict = resolve_env_bool("R_USE_STRICT", true);
instance->verbose = false;
@ -146,3 +152,145 @@ bool r_config_set_session_id(r_config_handle cfg, const char *session_id) {
const char *r_config_get_system_message(r_config_handle cfg) {
return cfg ? cfg->system_message : NULL;
}
/* Maximum recursion depth allowed for sub-agent spawning.
 * Falls back to the built-in default of 5 when cfg is NULL. */
int r_config_get_max_spawn_depth(r_config_handle cfg) {
    if (!cfg) {
        return 5;
    }
    return cfg->max_spawn_depth;
}
/* Lifetime cap on the total number of spawned sub-agents.
 * Falls back to the built-in default of 20 when cfg is NULL. */
int r_config_get_max_total_spawns(r_config_handle cfg) {
    if (!cfg) {
        return 20;
    }
    return cfg->max_total_spawns;
}
/*
 * Deepsearch Algorithm System Instructions
 *
 * Based on research into Deep Research/Deep Search algorithms from
 * OpenAI Deep Research, Gemini Deep Research, and academic sources.
 *
 * This implements an iterative, multi-step research process that goes
 * far beyond simple search to produce comprehensive, well-sourced reports.
 *
 * Returns a pointer to a static string literal: callers must NOT free it,
 * and the text is identical on every call (no config state is consulted).
 */
const char *r_config_get_deepsearch_system_message(void) {
/* One compile-time concatenated literal; edits here change the prompt
 * verbatim as sent to the model. */
return "You are an advanced Deep Research Agent. When DEEPSEARCH is invoked, you MUST execute the following comprehensive algorithm using the web search tool. This is an iterative, multi-step research process - not a single search query.\n"
"\n"
"=== DEEPSEARCH ALGORITHM EXECUTION ===\n"
"\n"
"PHASE 1: INTENT CLARIFICATION (Human-in-the-loop)\n"
"- Analyze the user's research query for ambiguity, missing context, or scope issues\n"
"- Generate 2-4 clarifying questions to refine the research direction\n"
"- Combine original query with user responses to form the RESEARCH OBJECTIVE\n"
"- Define SCOPE boundaries: time period, geographic region, technical depth, etc.\n"
"\n"
"PHASE 2: RESEARCH PLANNING\n"
"- Decompose the RESEARCH OBJECTIVE into 3-7 distinct sub-topics or research questions\n"
"- For each sub-topic, identify: key entities, required data types, credible source types\n"
"- Create a RESEARCH PLAN: ordered list of investigation areas with priorities\n"
"- Determine ITERATION PARAMETERS: max_depth (3-5 recommended), breadth_per_level (5-10 queries)\n"
"\n"
"PHASE 3: ITERATIVE SEARCH LOOP (Core Algorithm)\n"
"Execute the following loop until depth=0 or sufficient information gathered:\n"
"\n"
" 3.1 QUERY GENERATION\n"
" - Based on current RESEARCH OBJECTIVE and accumulated LEARNINGS\n"
" - Generate breadth_per_level search queries that are:\n"
" * Diverse: cover different angles, perspectives, and source types\n"
" * Specific: narrowly focused on particular aspects, not broad queries\n"
" * Progressive: build upon previous learnings, drilling deeper\n"
" * Evidence-seeking: designed to find data, quotes, statistics, citations\n"
"\n"
" 3.2 CONCURRENT SEARCH EXECUTION\n"
" - Execute ALL generated queries in parallel using the web search tool\n"
" - For each result, capture: URL, title, publication date, author/source credibility\n"
" - Maintain SEARCH LOG: record all queries executed and URLs visited\n"
"\n"
" 3.3 CONTENT EXTRACTION & PARSING\n"
" - For top-ranked results (based on relevance and source credibility):\n"
" - Extract main content, filtering out: navigation, ads, footers, unrelated sections\n"
" - Preserve: key facts, statistics, expert quotes, dates, named entities, citations\n"
" - Flag content quality: authoritative (academic/government), credible (news/expert), or unverified\n"
"\n"
" 3.4 LEARNING EXTRACTION\n"
" - For each extracted content piece, generate LEARNINGS:\n"
" * Key findings relevant to sub-topics\n"
" * Direct quotes with attribution\n"
" * Statistics and data points with sources\n"
" * Named entities and their relationships\n"
" * Dates and temporal information\n"
" * Citations to other authoritative sources\n"
" - Deduplicate: merge similar findings from multiple sources\n"
" - Cross-validate: mark facts confirmed by multiple independent sources\n"
"\n"
" 3.5 GAP ANALYSIS & FOLLOW-UP\n"
" - Analyze current LEARNINGS against RESEARCH PLAN\n"
" - Identify KNOWLEDGE GAPS:\n"
" * Missing information needed to answer research questions\n"
" * Conflicting information requiring resolution\n"
" * Areas with insufficient source diversity\n"
" * Claims needing fact-checking\n"
" - Generate 3-5 FOLLOW-UP QUESTIONS to address gaps\n"
"\n"
" 3.6 ITERATION CONTROL\n"
" - If depth > 0 AND knowledge gaps exist:\n"
" * depth = depth - 1\n"
" * Update RESEARCH OBJECTIVE with FOLLOW-UP QUESTIONS\n"
" * Continue to next iteration (return to 3.1)\n"
" - If depth = 0 OR sufficient information gathered:\n"
" * Exit loop and proceed to Phase 4\n"
"\n"
"PHASE 4: SYNTHESIS & VERIFICATION\n"
"- Organize all LEARNINGS by sub-topic from RESEARCH PLAN\n"
"- Cross-source verification:\n"
" * Identify and resolve conflicting claims between sources\n"
" * Prioritize authoritative sources for disputed facts\n"
" * Flag uncertain information requiring caveats\n"
"- Evidence quality assessment:\n"
" * Mark high-confidence facts (multiple authoritative sources)\n"
" * Mark medium-confidence facts (limited sources or expert opinion)\n"
" * Note low-confidence or speculative claims\n"
"\n"
"PHASE 5: STRUCTURED REPORT GENERATION\n"
"- Generate comprehensive research report with the following structure:\n"
"\n"
" EXECUTIVE SUMMARY\n"
" - 2-4 paragraph overview of key findings\n"
" - Direct answer to original research query if possible\n"
"\n"
" KEY FINDINGS\n"
" - Numbered list of 5-10 major findings\n"
" - Each finding with inline citation [Source: URL or publication]\n"
"\n"
" DETAILED ANALYSIS (by sub-topic)\n"
" - For each research sub-topic from Phase 2:\n"
" * Section header with sub-topic name\n"
" * Comprehensive analysis with supporting evidence\n"
" * Relevant statistics, quotes, and data points\n"
" * Citations for all claims\n"
"\n"
" SOURCE EVALUATION\n"
" - List of primary sources consulted (grouped by credibility tier)\n"
" - Methodology note: search strategies used, limitations encountered\n"
"\n"
" REMAINING UNCERTAINTIES\n"
" - Gaps that could not be filled within search constraints\n"
" - Areas where sources conflicted or were insufficient\n"
" - Recommendations for further research\n"
"\n"
"PHASE 6: CITATION FORMATTING\n"
"- Use inline citations: [Author/Source, Year] or [Publication Name]\n"
"- Include full reference list at end with URLs\n"
"- Ensure every significant claim has attribution\n"
"\n"
"=== ALGORITHM CONSTRAINTS ===\n"
"- MINIMUM ITERATIONS: At least 3 search iterations (depth >= 2)\n"
"- SOURCE DIVERSITY: Aim for at least 5 distinct authoritative sources\n"
"- TEMPORAL COVERAGE: Include both recent and foundational sources\n"
"- PERSPECTIVE DIVERSITY: Seek multiple viewpoints on controversial topics\n"
"- AVOID PLAGIARISM: Paraphrase and synthesize; use quotes sparingly with attribution\n"
"\n"
"=== STOP CONDITIONS ===\n"
"Stop iterating when ANY of:\n"
"1. Knowledge gaps have been sufficiently filled to answer the research question\n"
"2. Maximum depth reached (configured in iteration parameters)\n"
"3. Diminishing returns: new searches not yielding novel information\n"
"4. Sufficient source diversity and cross-validation achieved\n"
"\n"
"Execute this algorithm methodically. Report your progress through each phase. Maintain the SEARCH LOG and LEARNINGS accumulation throughout. Produce the final structured report in Phase 5.";
}

89
src/spawn_tracker.c Normal file
View File

@ -0,0 +1,89 @@
// retoor <retoor@molodetz.nl>
#include "spawn_tracker.h"
#include "r_config.h"
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Process-wide counters that bound recursive sub-agent spawning. */
struct spawn_tracker_t {
int current_depth; /* spawn levels currently active; decremented on end */
int total_spawns; /* lifetime spawn count; never decremented */
int max_depth; /* from r_config (R_MAX_SPAWN_DEPTH, default 5) */
int max_total; /* from r_config (R_MAX_TOTAL_SPAWNS, default 20) */
pthread_mutex_t lock; /* guards all counters above */
};
/* Lazily-created singleton; see spawn_tracker_get_instance(). */
static struct spawn_tracker_t *instance = NULL;
spawn_tracker_handle spawn_tracker_get_instance(void) {
if (instance) return instance;
instance = calloc(1, sizeof(struct spawn_tracker_t));
if (!instance) return NULL;
r_config_handle cfg = r_config_get_instance();
instance->max_depth = r_config_get_max_spawn_depth(cfg);
instance->max_total = r_config_get_max_total_spawns(cfg);
instance->current_depth = 0;
instance->total_spawns = 0;
pthread_mutex_init(&instance->lock, NULL);
return instance;
}
void spawn_tracker_destroy(void) {
if (!instance) return;
pthread_mutex_destroy(&instance->lock);
free(instance);
instance = NULL;
}
/* Report whether another sub-agent may be spawned right now.
 * True only while both the active depth and the lifetime spawn count
 * are strictly below their configured limits; false for a NULL tracker.
 * Counter reads are taken under the tracker's lock. */
bool spawn_tracker_can_spawn(spawn_tracker_handle tracker) {
    if (!tracker) {
        return false;
    }
    pthread_mutex_lock(&tracker->lock);
    bool below_depth = tracker->current_depth < tracker->max_depth;
    bool below_total = tracker->total_spawns < tracker->max_total;
    pthread_mutex_unlock(&tracker->lock);
    return below_depth && below_total;
}
/* Record the start of a sub-agent: bump the active depth and the
 * lifetime total under the lock, log the new counters to stderr, and
 * return the depth the new agent runs at (-1 for a NULL tracker). */
int spawn_tracker_begin(spawn_tracker_handle tracker) {
    if (!tracker) {
        return -1;
    }
    pthread_mutex_lock(&tracker->lock);
    tracker->total_spawns += 1;
    tracker->current_depth += 1;
    int token = tracker->current_depth;
    fprintf(stderr, "[SpawnTracker] depth=%d/%d total=%d/%d\n",
            tracker->current_depth, tracker->max_depth,
            tracker->total_spawns, tracker->max_total);
    pthread_mutex_unlock(&tracker->lock);
    return token;
}
/* Record that a sub-agent finished: pop one depth level (clamped at
 * zero) and log the counters. The depth_token argument is accepted for
 * symmetry with spawn_tracker_begin() but currently unused. */
void spawn_tracker_end(spawn_tracker_handle tracker, int depth_token) {
    (void)depth_token;
    if (!tracker) {
        return;
    }
    pthread_mutex_lock(&tracker->lock);
    if (tracker->current_depth > 0) {
        tracker->current_depth -= 1;
    }
    fprintf(stderr, "[SpawnTracker] agent finished, depth=%d/%d total=%d/%d\n",
            tracker->current_depth, tracker->max_depth,
            tracker->total_spawns, tracker->max_total);
    pthread_mutex_unlock(&tracker->lock);
}
/* Heuristic sanity check on a sub-agent's final output.
 * Valid means: non-NULL, at least SPAWN_RESULT_MIN_LENGTH bytes, and
 * not starting with the literal prefix "Error:". */
bool spawn_tracker_validate_result(const char *result) {
    if (!result) return false;
    if (strlen(result) < SPAWN_RESULT_MIN_LENGTH) return false;
    /* strncmp is a direct prefix test; the original
     * strstr(result, "Error:") == result idiom scanned the whole
     * string just to inspect the first 6 bytes. */
    if (strncmp(result, "Error:", 6) == 0) return false;
    return true;
}

View File

@ -4,6 +4,7 @@
#include "db.h"
#include "messages.h"
#include "r_config.h"
#include "spawn_tracker.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@ -31,7 +32,7 @@ static struct json_object *tool_spawn_agent_get_description(void) {
json_object_object_add(props, "persona", persona);
struct json_object *goal = json_object_new_object();
json_object_object_add(goal, "type", json_object_new_string("string"));
json_object_object_add(goal, "description",
json_object_object_add(goal, "description",
json_object_new_string("The specific task for the sub-agent. For Shared Memory, YOU MUST INCLUDE the current findings from PROJECT_KNOWLEDGE.md here so the agent has context."));
json_object_object_add(props, "goal", goal);
@ -50,12 +51,6 @@ static struct json_object *tool_spawn_agent_get_description(void) {
json_object_object_add(logic, "description", json_object_new_string("Formal logic justification for this order (MANDATORY for Audit Log)."));
json_object_object_add(props, "logic_justification", logic);
struct json_object *max_subagents = json_object_new_object();
json_object_object_add(max_subagents, "type", json_object_new_string("integer"));
json_object_object_add(max_subagents, "description", json_object_new_string("Remaining budget for spawning recursive sub-agents. Decrement this by 1 when spawning a sub-agent. Default is 2."));
json_object_object_add(max_subagents, "default", json_object_new_int(2));
json_object_object_add(props, "max_subagents", max_subagents);
struct json_object *as = json_object_new_object();
json_object_object_add(as, "type", json_object_new_string("boolean"));
json_object_object_add(as, "description", json_object_new_string("Required for strict mode."));
@ -67,7 +62,6 @@ static struct json_object *tool_spawn_agent_get_description(void) {
json_object_array_add(required, json_object_new_string("goal"));
json_object_array_add(required, json_object_new_string("role"));
json_object_array_add(required, json_object_new_string("logic_justification"));
json_object_array_add(required, json_object_new_string("max_subagents"));
json_object_array_add(required, json_object_new_string("async"));
json_object_object_add(params, "required", required);
json_object_object_add(params, "additionalProperties", json_object_new_boolean(0));
@ -81,20 +75,28 @@ static struct json_object *tool_spawn_agent_get_description(void) {
json_object_object_add(full_obj, "function", obj);
return full_obj;
}
/* Map a sub-agent persona to its agent-loop iteration budget.
 * Long-running personas (researcher/developer/security) get 100,
 * the lightweight fetcher gets 30, anything else defaults to 50.
 * A NULL persona also yields the default: the caller obtains the
 * string via json_object_get_string(), which can return NULL for
 * non-string JSON values, and strcmp(NULL, ...) is undefined. */
static int get_persona_max_iterations(const char *persona) {
    if (!persona) return 50;
    if (strcmp(persona, "researcher") == 0) return 100;
    if (strcmp(persona, "developer") == 0) return 100;
    if (strcmp(persona, "security") == 0) return 100;
    if (strcmp(persona, "fetcher") == 0) return 30;
    return 50;
}
static char *tool_spawn_agent_execute(tool_t *self, struct json_object *args) {
(void)self;
struct json_object *persona_obj, *goal_obj, *max_subagents_obj;
struct json_object *persona_obj, *goal_obj;
if (!json_object_object_get_ex(args, "persona", &persona_obj) ||
!json_object_object_get_ex(args, "goal", &goal_obj)) {
return strdup("Error: Missing persona or goal");
}
int max_subagents = 2;
if (json_object_object_get_ex(args, "max_subagents", &max_subagents_obj)) {
max_subagents = json_object_get_int(max_subagents_obj);
}
if (max_subagents <= 0) {
return strdup("Error: Spawning limit reached. You are not allowed to spawn more sub-agents. Perform the task yourself using existing tools.");
spawn_tracker_handle tracker = spawn_tracker_get_instance();
if (!spawn_tracker_can_spawn(tracker)) {
return strdup("Error: Spawning limit reached. Perform the task yourself using existing tools.");
}
int depth_token = spawn_tracker_begin(tracker);
struct json_object *role_obj, *logic_obj;
const char *role_str = "Worker";
@ -148,6 +150,7 @@ static char *tool_spawn_agent_execute(tool_t *self, struct json_object *args) {
"Clean HTML/extract text suitable for LLM analysis. "
"Truncate content to ~10K chars per URL to stay within token limits.";
} else {
spawn_tracker_end(tracker, depth_token);
return strdup("Error: Invalid persona");
}
time_t now = time(NULL);
@ -156,16 +159,18 @@ static char *tool_spawn_agent_execute(tool_t *self, struct json_object *args) {
strftime(datetime, sizeof(datetime), "%Y-%m-%d %H:%M:%S %Z", tm_info);
size_t prompt_size = strlen(system_prompt_base) + 1024;
char *system_prompt = malloc(prompt_size);
if (!system_prompt) return strdup("Error: Out of memory");
snprintf(system_prompt, prompt_size,
if (!system_prompt) {
spawn_tracker_end(tracker, depth_token);
return strdup("Error: Out of memory");
}
snprintf(system_prompt, prompt_size,
"Current date/time: %s\n\n"
"YOUR HIERARCHICAL ROLE: %s\n"
"LOGIC JUSTIFICATION FOR YOUR ASSIGNMENT: %s\n\n"
"%s\n\n"
"CRITICAL: It is currently %s.\n"
"ORCHESTRATION BUDGET: You are allowed to spawn up to %d more levels of sub-agents. "
"When using spawn_agent, you MUST pass 'max_subagents' as %d.\n",
datetime, role_str, logic_justification, system_prompt_base, datetime, max_subagents, max_subagents - 1);
"The system enforces spawn limits automatically.\n",
datetime, role_str, logic_justification, system_prompt_base, datetime);
char session_id[256];
snprintf(session_id, sizeof(session_id), "subagent-%s-%u", persona_str, (unsigned int)time(NULL));
messages_handle msgs = messages_create(session_id);
@ -174,10 +179,10 @@ static char *tool_spawn_agent_execute(tool_t *self, struct json_object *args) {
agent_handle agent = agent_create(goal_str, msgs);
if (!agent) {
messages_destroy(msgs);
spawn_tracker_end(tracker, depth_token);
return strdup("Error: Failed to create sub-agent");
}
// STATE-DRIVEN REGISTRATION
agent_set_id(agent, session_id);
agent_set_role(agent, role_str);
agent_set_manager_id(agent, "Executive-Apex");
@ -194,13 +199,18 @@ static char *tool_spawn_agent_execute(tool_t *self, struct json_object *args) {
if (specialized_tools) {
agent_set_tool_registry(agent, specialized_tools);
}
agent_set_max_iterations(agent, 50); // Sub-agents have lower limit
agent_set_max_iterations(agent, get_persona_max_iterations(persona_str));
char *agent_response = agent_run(agent, goal_str);
char *result = messages_to_json_string(msgs);
agent_destroy(agent);
spawn_tracker_end(tracker, depth_token);
if (!spawn_tracker_validate_result(agent_response)) {
fprintf(stderr, "[Warning] Sub-agent '%s' produced insufficient output\n", persona_str);
}
free(agent_response);
if (specialized_tools) {
tool_registry_destroy(specialized_tools);
}
@ -210,6 +220,7 @@ static char *tool_spawn_agent_execute(tool_t *self, struct json_object *args) {
return result;
}
static void tool_spawn_agent_print_action(const char *name, struct json_object *args) {
(void)name;
struct json_object *persona_obj, *goal_obj;
const char *persona = "unknown";
const char *goal = "unknown";

532
src/tools/tool_deepsearch.c Normal file
View File

@ -0,0 +1,532 @@
// retoor <retoor@molodetz.nl>
#include "tool.h"
#include "http_client.h"
#include "r_config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
#include <pthread.h>
#include <json-c/json.h>
/* Tunables for the deepsearch tool. */
#define MAX_QUERIES 8                     /* upper bound on parallel search queries */
#define QUERY_GENERATION_MAX_TOKENS 2048  /* token cap for the query-generation LLM call */
#define MAX_ITERATIONS 3                  /* search/refine loop iterations */
#define MIN_CONTENT_LENGTH 100            /* bytes; shorter "content" entries are ignored */
#define MIN_VALID_RESULTS 5               /* target count of usable results */
/* One web search executed on its own thread (see search_thread_func). */
typedef struct {
char *query;        /* owned query string for this task */
char *result;       /* raw JSON response, filled by the worker thread */
int index;          /* position within the batch of queries */
int valid_results;  /* cached count_valid_results(result) */
} search_task_t;
/* One extracted "content" string plus its cached length. */
typedef struct {
char *content;      /* heap copy of a result's content field */
size_t length;      /* strlen(content) */
} result_content_t;
static int count_valid_results(const char *json_str) {
if (!json_str) return 0;
struct json_object *root = json_tokener_parse(json_str);
if (!root) return 0;
struct json_object *results_array;
if (!json_object_object_get_ex(root, "results", &results_array)) {
json_object_put(root);
return 0;
}
int count = 0;
int array_len = json_object_array_length(results_array);
for (int i = 0; i < array_len; i++) {
struct json_object *item = json_object_array_get_idx(results_array, i);
struct json_object *content_obj;
if (json_object_object_get_ex(item, "content", &content_obj)) {
const char *content = json_object_get_string(content_obj);
if (content && strlen(content) >= MIN_CONTENT_LENGTH) {
count++;
}
}
}
json_object_put(root);
return count;
}
static result_content_t *extract_all_content(const char *json_str, int *count) {
*count = 0;
if (!json_str) return NULL;
struct json_object *root = json_tokener_parse(json_str);
if (!root) return NULL;
struct json_object *results_array;
if (!json_object_object_get_ex(root, "results", &results_array)) {
json_object_put(root);
return NULL;
}
int array_len = json_object_array_length(results_array);
result_content_t *contents = calloc(array_len, sizeof(result_content_t));
if (!contents) {
json_object_put(root);
return NULL;
}
for (int i = 0; i < array_len; i++) {
struct json_object *item = json_object_array_get_idx(results_array, i);
struct json_object *content_obj;
if (json_object_object_get_ex(item, "content", &content_obj)) {
const char *content = json_object_get_string(content_obj);
if (content && strlen(content) >= MIN_CONTENT_LENGTH) {
contents[*count].content = strdup(content);
contents[*count].length = strlen(content);
(*count)++;
}
}
}
json_object_put(root);
return contents;
}
static char *do_web_search(const char *query) {
if (!query) return strdup("{\"error\": \"Query cannot be NULL\"}");
CURL *curl_handle = curl_easy_init();
if (!curl_handle) return strdup("{\"error\": \"Failed to initialize curl\"}");
char *q_encoded = curl_easy_escape(curl_handle, query, 0);
curl_easy_cleanup(curl_handle);
if (!q_encoded) return strdup("{\"error\": \"Failed to encode query\"}");
char url[4096];
snprintf(url, sizeof(url), "https://rsearch.app.molodetz.nl/search?query=%s&content=true", q_encoded);
curl_free(q_encoded);
http_client_handle client = http_client_create(NULL);
if (!client) return strdup("{\"error\": \"Failed to create HTTP client\"}");
http_client_set_show_spinner(client, false);
char *response = NULL;
r_status_t status = http_get(client, url, &response);
http_client_destroy(client);
if (status != R_SUCCESS || !response) {
return strdup("{\"error\": \"Failed to search\"}");
}
struct json_object *parsed = json_tokener_parse(response);
if (!parsed) {
free(response);
return strdup("{\"error\": \"Invalid JSON response\"}");
}
json_object_put(parsed);
return response;
}
/* pthread entry point: execute one web search described by a
 * search_task_t. Stores the JSON response in task->result and caches
 * the number of usable entries in task->valid_results. Always returns
 * NULL (the thread result is unused). */
static void *search_thread_func(void *arg) {
    search_task_t *task = arg;
    task->result = do_web_search(task->query);
    task->valid_results = task->result ? count_valid_results(task->result) : 0;
    return NULL;
}
static char **generate_queries(const char *subject, const char *context, int *count) {
r_config_handle cfg = r_config_get_instance();
http_client_handle client = http_client_create(r_config_get_api_key(cfg));
if (!client) {
*count = 0;
return NULL;
}
http_client_set_show_spinner(client, false);
struct json_object *root = json_object_new_object();
json_object_object_add(root, "model", json_object_new_string(r_config_get_model(cfg)));
struct json_object *messages = json_object_new_array();
struct json_object *system_msg = json_object_new_object();
json_object_object_add(system_msg, "role", json_object_new_string("system"));
const char *system_prompt = context && strlen(context) > 0 ?
"You are a search query generator specializing in iterative deep research.\n"
"Based on the original research subject and what has been found so far, generate 4-6 follow-up search queries.\n"
"Each query should:\n"
"- Address gaps in the current information\n"
"- Explore different aspects or perspectives not yet covered\n"
"- Use specific, targeted terms\n"
"- Incorporate relevant entities, dates, or technical terms discovered\n"
"- Vary search operators (quotes, site:, filetype:, etc.) when appropriate\n\n"
"Return ONLY a JSON array of query strings. No explanation, no formatting." :
"You are a search query generator. Generate 5-8 diverse, specific search queries "
"that will comprehensively cover the given subject from multiple angles.\n"
"Each query should:\n"
"- Focus on a different aspect (technical, practical, historical, comparative, etc.)\n"
"- Use specific, targeted terms rather than broad phrases\n"
"- Be designed to return high-quality, actionable results\n"
"- Use different search operators (quotes, site:, filetype:, etc.) when appropriate\n\n"
"Return ONLY a JSON array of query strings. No explanation, no formatting.";
json_object_object_add(system_msg, "content", json_object_new_string(system_prompt));
json_object_array_add(messages, system_msg);
struct json_object *user_msg = json_object_new_object();
json_object_object_add(user_msg, "role", json_object_new_string("user"));
char prompt[2048];
if (context && strlen(context) > 0) {
snprintf(prompt, sizeof(prompt),
"Original subject: %s\n\n"
"What has been found so far: %s\n\n"
"Generate 4-6 follow-up queries to explore gaps and deeper aspects.\n"
"Return ONLY a JSON array of strings.",
subject, context);
} else {
snprintf(prompt, sizeof(prompt),
"Generate 5-8 diverse search queries for: %s\n\n"
"Return ONLY a JSON array of strings, like: [\"query1\", \"query2\", ...]",
subject);
}
json_object_object_add(user_msg, "content", json_object_new_string(prompt));
json_object_array_add(messages, user_msg);
json_object_object_add(root, "messages", messages);
json_object_object_add(root, "temperature", json_object_new_double(0.7));
json_object_object_add(root, "max_tokens", json_object_new_int(QUERY_GENERATION_MAX_TOKENS));
const char *json_data = json_object_to_json_string_ext(root, JSON_C_TO_STRING_PRETTY);
char *response = NULL;
r_status_t status = http_post(client, r_config_get_api_url(cfg), json_data, &response);
http_client_destroy(client);
json_object_put(root);
if (status != R_SUCCESS || !response) {
free(response);
*count = 0;
return NULL;
}
struct json_object *parsed = json_tokener_parse(response);
free(response);
if (!parsed) {
*count = 0;
return NULL;
}
struct json_object *choices;
if (!json_object_object_get_ex(parsed, "choices", &choices)) {
json_object_put(parsed);
*count = 0;
return NULL;
}
struct json_object *first_choice = json_object_array_get_idx(choices, 0);
struct json_object *message_obj;
if (!json_object_object_get_ex(first_choice, "message", &message_obj)) {
json_object_put(parsed);
*count = 0;
return NULL;
}
struct json_object *content_obj;
if (!json_object_object_get_ex(message_obj, "content", &content_obj)) {
json_object_put(parsed);
*count = 0;
return NULL;
}
const char *content = json_object_get_string(content_obj);
char *json_start = strchr(content, '[');
char *json_end = strrchr(content, ']');
if (!json_start || !json_end || json_end <= json_start) {
json_object_put(parsed);
*count = 0;
return NULL;
}
char *json_str = strndup(json_start, json_end - json_start + 1);
struct json_object *queries_array = json_tokener_parse(json_str);
free(json_str);
json_object_put(parsed);
if (!queries_array || json_object_get_type(queries_array) != json_type_array) {
if (queries_array) json_object_put(queries_array);
*count = 0;
return NULL;
}
int num_queries = json_object_array_length(queries_array);
if (num_queries > MAX_QUERIES) num_queries = MAX_QUERIES;
if (num_queries < 1) {
json_object_put(queries_array);
*count = 0;
return NULL;
}
char **queries = calloc(num_queries, sizeof(char *));
if (!queries) {
json_object_put(queries_array);
*count = 0;
return NULL;
}
for (int i = 0; i < num_queries; i++) {
struct json_object *query_obj = json_object_array_get_idx(queries_array, i);
const char *query_str = json_object_get_string(query_obj);
queries[i] = query_str ? strdup(query_str) : NULL;
}
json_object_put(queries_array);
*count = num_queries;
return queries;
}
/* Builds a plain-text digest ("Research subject: ...\n\nKey findings...")
 * from up to 20 extracted content snippets, each clipped to 400 bytes.
 * Returns a heap string (caller frees) or NULL on empty input / OOM. */
static char *generate_research_summary(const char *subject, result_content_t *contents, int content_count) {
    if (!contents || content_count == 0) return NULL;
    size_t total_len = 0;
    for (int i = 0; i < content_count && i < 20; i++) {
        if (contents[i].content) {
            total_len += contents[i].length + 2;
        }
    }
    if (total_len > 8000) total_len = 8000;
    size_t buf_size = total_len + 512;
    char *summary = malloc(buf_size);
    if (!summary) return NULL;
    int written = snprintf(summary, buf_size, "Research subject: %s\n\nKey findings from %d sources:\n",
                           subject, content_count);
    if (written < 0) {
        free(summary);
        return NULL;
    }
    /* snprintf returns the WOULD-BE length: a long subject used to push the
     * offset past the buffer, causing an out-of-bounds write and a size_t
     * underflow in `remaining`. Clamp to the actual buffer end. */
    size_t offset = (size_t)written;
    if (offset >= buf_size) offset = buf_size - 1;
    for (int i = 0; i < content_count && i < 20; i++) {
        if (!contents[i].content) continue;
        size_t remaining = buf_size - offset;
        if (remaining <= 100) break;
        size_t copy_len = contents[i].length;
        /* Keep 100 bytes of headroom so the "- %.*s\n\n" framing always fits. */
        if (copy_len > remaining - 100) copy_len = remaining - 100;
        if (copy_len > 400) copy_len = 400;
        written = snprintf(summary + offset, remaining, "- %.*s\n\n", (int)copy_len, contents[i].content);
        if (written < 0) break;
        offset += (size_t)written;
        if (offset >= buf_size) break;
    }
    return summary;
}
/* Flattens the "results" arrays of every per-query JSON response into a
 * single {"results": [...]} document. Entries that fail to parse or lack a
 * "results" key are skipped. Returns a heap string (caller frees). */
static char *merge_search_results(char **results, int count) {
    struct json_object *combined = json_object_new_object();
    struct json_object *flat = json_object_new_array();
    for (int i = 0; i < count; i++) {
        if (!results[i]) continue;
        struct json_object *doc = json_tokener_parse(results[i]);
        if (!doc) continue;
        struct json_object *items;
        if (json_object_object_get_ex(doc, "results", &items)) {
            int n = json_object_array_length(items);
            for (int j = 0; j < n; j++) {
                struct json_object *entry = json_object_array_get_idx(items, j);
                /* Bump refcount so the entry survives doc's release. */
                json_object_array_add(flat, json_object_get(entry));
            }
        }
        json_object_put(doc);
    }
    json_object_object_add(combined, "results", flat);
    char *merged_str = strdup(json_object_to_json_string_ext(combined, JSON_C_TO_STRING_PRETTY));
    json_object_put(combined);
    return merged_str;
}
/* Tool entry point: iterative deep-research loop.
 * Each iteration: generate queries via the LLM -> run them concurrently ->
 * harvest result content -> refresh the research summary -> stop early once
 * enough valid results have accumulated.
 * Returns a heap JSON string (merged results) or a heap error message. */
static char *deepsearch_execute(tool_t *self, struct json_object *args) {
    (void)self;
    struct json_object *query_obj;
    if (!json_object_object_get_ex(args, "query", &query_obj)) {
        return strdup("Error: missing 'query' argument");
    }
    const char *subject = json_object_get_string(query_obj);
    /* Upper bound on stored responses: every iteration is capped at MAX_QUERIES. */
    char **all_results = calloc(MAX_QUERIES * MAX_ITERATIONS, sizeof(char *));
    if (!all_results) {
        return strdup("Error: Failed to merge search results.");
    }
    int total_results = 0;
    int total_valid_results = 0;
    char *research_context = NULL;
    result_content_t *all_contents = NULL;
    int content_capacity = 0;
    int content_count = 0;
    fprintf(stderr, " -> Deepsearch: Starting iterative research on '%s'\n", subject);
    for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) {
        fprintf(stderr, " -> Deepsearch: Iteration %d/%d\n", iteration + 1, MAX_ITERATIONS);
        int query_count = 0;
        char **queries = generate_queries(subject, research_context, &query_count);
        if (!queries || query_count == 0) {
            free(queries);
            fprintf(stderr, " -> Deepsearch: No more queries generated, stopping\n");
            break;
        }
        fprintf(stderr, " -> Deepsearch: Executing %d parallel searches\n", query_count);
        search_task_t *tasks = calloc(query_count, sizeof(search_task_t));
        pthread_t *threads = calloc(query_count, sizeof(pthread_t));
        int *started = calloc(query_count, sizeof(int));
        if (!tasks || !threads || !started) {
            /* Allocation failure: drop this iteration's queries and stop. */
            for (int i = 0; i < query_count; i++) free(queries[i]);
            free(queries);
            free(tasks);
            free(threads);
            free(started);
            break;
        }
        for (int i = 0; i < query_count; i++) {
            tasks[i].query = queries[i];
            tasks[i].result = NULL;
            tasks[i].index = i;
            tasks[i].valid_results = 0;
            if (pthread_create(&threads[i], NULL, search_thread_func, &tasks[i]) == 0) {
                started[i] = 1;
            } else {
                /* Thread creation failed: degrade gracefully to an inline search.
                 * (Previously the return was ignored and an uninitialized
                 * pthread_t was joined — undefined behavior.) */
                search_thread_func(&tasks[i]);
            }
        }
        for (int i = 0; i < query_count; i++) {
            if (started[i]) {
                pthread_join(threads[i], NULL);
            }
        }
        int iteration_valid = 0;
        for (int i = 0; i < query_count; i++) {
            if (tasks[i].result) {
                all_results[total_results++] = tasks[i].result;
                iteration_valid += tasks[i].valid_results;
                int new_content_count = 0;
                result_content_t *new_contents = extract_all_content(tasks[i].result, &new_content_count);
                if (new_contents && new_content_count > 0) {
                    if (content_count + new_content_count > content_capacity) {
                        /* Grow via a temporary: `p = realloc(p, ...)` leaks and
                         * then dereferences NULL on allocation failure. */
                        int new_capacity = (content_count + new_content_count) * 2;
                        result_content_t *grown =
                            realloc(all_contents, new_capacity * sizeof(result_content_t));
                        if (grown) {
                            all_contents = grown;
                            content_capacity = new_capacity;
                        }
                    }
                    for (int j = 0; j < new_content_count; j++) {
                        if (content_count < content_capacity) {
                            all_contents[content_count++] = new_contents[j];
                        } else {
                            /* No room after a failed grow: drop the snippet. */
                            free(new_contents[j].content);
                        }
                    }
                    free(new_contents);
                }
            }
            free(queries[i]);
        }
        total_valid_results += iteration_valid;
        fprintf(stderr, " -> Deepsearch: Found %d valid results this iteration (%d total)\n",
                iteration_valid, total_valid_results);
        free(queries);
        free(tasks);
        free(threads);
        free(started);
        free(research_context);
        research_context = generate_research_summary(subject, all_contents, content_count);
        if (total_valid_results >= MIN_VALID_RESULTS * (iteration + 1)) {
            fprintf(stderr, " -> Deepsearch: Sufficient results gathered, stopping early\n");
            break;
        }
    }
    char *merged = merge_search_results(all_results, total_results);
    for (int i = 0; i < total_results; i++) {
        free(all_results[i]);
    }
    if (all_contents) {
        for (int i = 0; i < content_count; i++) {
            free(all_contents[i].content);
        }
        free(all_contents);
    }
    free(all_results);
    free(research_context);
    if (!merged) {
        return strdup("Error: Failed to merge search results.");
    }
    fprintf(stderr, " -> Deepsearch: Completed with %d total results\n", total_valid_results);
    return merged;
}
/* Logs the research subject to stderr when the tool call is announced. */
static void deepsearch_print_action(const char *name, struct json_object *args) {
    (void)name;
    if (!args) return;
    struct json_object *query_obj = NULL;
    if (!json_object_object_get_ex(args, "query", &query_obj)) return;
    fprintf(stderr, " -> Deepsearch: %s\n", json_object_get_string(query_obj));
}
/* Builds the OpenAI-style function schema advertising the deepsearch tool.
 * Constructed bottom-up (parameter -> properties -> function -> root); key
 * insertion order matches the original, so serialization is unchanged. */
static struct json_object *deepsearch_get_description(void) {
    struct json_object *query_schema = json_object_new_object();
    json_object_object_add(query_schema, "type", json_object_new_string("string"));
    json_object_object_add(query_schema, "description",
        json_object_new_string("The subject or topic to research comprehensively."));

    struct json_object *properties = json_object_new_object();
    json_object_object_add(properties, "query", query_schema);

    struct json_object *parameters = json_object_new_object();
    json_object_object_add(parameters, "type", json_object_new_string("object"));
    json_object_object_add(parameters, "properties", properties);
    struct json_object *required = json_object_new_array();
    json_object_array_add(required, json_object_new_string("query"));
    json_object_object_add(parameters, "required", required);
    json_object_object_add(parameters, "additionalProperties", json_object_new_boolean(0));

    struct json_object *function = json_object_new_object();
    json_object_object_add(function, "name", json_object_new_string("deepsearch"));
    json_object_object_add(function, "description",
        json_object_new_string("Performs intelligent iterative deep research by generating diverse search queries, executing them concurrently, validating results, and automatically generating follow-up queries when insufficient information is found."));
    json_object_object_add(function, "parameters", parameters);
    r_config_handle cfg = r_config_get_instance();
    if (r_config_use_strict(cfg)) {
        json_object_object_add(function, "strict", json_object_new_boolean(1));
    }

    struct json_object *root = json_object_new_object();
    json_object_object_add(root, "type", json_object_new_string("function"));
    json_object_object_add(root, "function", function);
    return root;
}
/* Vtable wiring the deepsearch callbacks into the generic tool interface. */
static const tool_vtable_t deepsearch_vtable = {
    .get_description = deepsearch_get_description,
    .execute = deepsearch_execute,
    .print_action = deepsearch_print_action
};
/* Static singleton instance; every call to the factory returns this object. */
static tool_t deepsearch_tool = { .vtable = &deepsearch_vtable, .name = "deepsearch" };
/* Factory used by the tool registry; returns the shared singleton (no allocation). */
tool_t *tool_deepsearch_create(void) {
    return &deepsearch_tool;
}

View File

@ -40,6 +40,7 @@ extern tool_t *tool_automation_fuzz_create(void);
extern tool_t *tool_automation_exploit_gen_create(void);
extern tool_t *tool_csv_export_create(void);
extern tool_t *tool_spawn_agent_create(void);
extern tool_t *tool_deepsearch_create(void);
static tool_registry_t *global_registry = NULL;
@ -88,6 +89,7 @@ tool_registry_t *tools_get_registry(void) {
tool_registry_register(global_registry, tool_automation_exploit_gen_create());
tool_registry_register(global_registry, tool_csv_export_create());
tool_registry_register(global_registry, tool_spawn_agent_create());
tool_registry_register(global_registry, tool_deepsearch_create());
return global_registry;
}
@ -101,12 +103,14 @@ tool_registry_t *tool_registry_get_specialized(tool_registry_type_t type) {
tool_registry_register(reg, tool_web_search_news_create());
tool_registry_register(reg, tool_http_fetch_create());
tool_registry_register(reg, tool_read_file_create());
tool_registry_register(reg, tool_write_file_create());
tool_registry_register(reg, tool_db_get_create());
tool_registry_register(reg, tool_db_set_create());
tool_registry_register(reg, tool_db_query_create());
tool_registry_register(reg, tool_directory_glob_create());
tool_registry_register(reg, tool_csv_export_create());
tool_registry_register(reg, tool_spawn_agent_create());
tool_registry_register(reg, tool_deepsearch_create());
} else if (type == TOOL_TYPE_DEVELOPER) {
tool_registry_register(reg, tool_terminal_create());
tool_registry_register(reg, tool_read_file_create());
@ -137,9 +141,9 @@ tool_registry_t *tool_registry_get_specialized(tool_registry_type_t type) {
tool_registry_register(reg, tool_process_terminate_create());
tool_registry_register(reg, tool_spawn_agent_create());
} else {
// Fallback or TOOL_TYPE_ALL
tool_registry_register(reg, tool_terminal_create());
tool_registry_register(reg, tool_read_file_create());
tool_registry_register(reg, tool_http_fetch_create());
tool_registry_register(reg, tool_process_get_status_create());
tool_registry_register(reg, tool_process_terminate_create());
tool_registry_register(reg, tool_spawn_agent_create());