Skip to main content

battlecommand_forge/
swebench_tools.rs

//! SWE-bench ReAct agent tool implementations.
//!
//! Seven tools for exploring repos and fixing bugs: `read_file`, `grep_search`,
//! `list_directory`, `run_command`, `write_file`, `apply_edit`, `submit`.
4use serde_json::Value;
5use std::path::Path;
6
/// Number of lines `read_file` returns when the caller gives no `end_line`.
const MAX_READ_LINES: usize = 200;
/// Maximum number of matching lines `grep_search` includes in its reply.
const MAX_GREP_MATCHES: usize = 30;
/// Cap applied separately to the stdout and stderr of `run_command`.
/// Despite the name it is compared against `str::len`, i.e. it is a byte budget.
const MAX_OUTPUT_CHARS: usize = 4096;
/// Wall-clock limit, in seconds, for a single `run_command` invocation.
const COMMAND_TIMEOUT_SECS: u64 = 30;
11
/// Outcome of a single tool invocation, handed back to the agent loop.
///
/// `Clone`, `PartialEq` and `Eq` are derived so results can be stored,
/// replayed and compared directly in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ToolResult {
    /// Name of the tool that produced this result (e.g. `"read_file"`).
    pub tool_name: String,
    /// Human-readable tool output, or an error message when `success` is false.
    pub content: String,
    /// `true` when the tool completed without error; for `run_command` this
    /// means the child exited with status 0.
    pub success: bool,
    /// `true` for results of the workspace-modifying tools (`write_file`,
    /// `apply_edit`), including their failed attempts.
    pub is_write: bool,
    /// `true` only for the result of the `submit` tool.
    pub is_submit: bool,
}
21
22/// Execute a tool by name with given arguments, scoped to workspace.
23pub async fn execute(tool_name: &str, args: &Value, workspace: &str) -> ToolResult {
24    match tool_name {
25        "read_file" => execute_read_file(args, workspace).await,
26        "grep_search" => execute_grep_search(args, workspace).await,
27        "list_directory" => execute_list_directory(args, workspace).await,
28        "run_command" => execute_run_command(args, workspace).await,
29        "write_file" => execute_write_file(args, workspace).await,
30        "apply_edit" => execute_apply_edit(args, workspace).await,
31        "submit" => execute_submit(),
32        _ => ToolResult {
33            tool_name: tool_name.to_string(),
34            content: format!("Unknown tool: {}", tool_name),
35            success: false,
36            is_write: false,
37            is_submit: false,
38        },
39    }
40}
41
42fn resolve_path(workspace: &str, relative: &str) -> Result<String, String> {
43    let cleaned = relative.trim_start_matches("./");
44    let root = Path::new(workspace);
45    let joined = crate::sandbox::validate_path_within(root, cleaned)?;
46    Ok(joined.to_string_lossy().into_owned())
47}
48
/// Return the longest prefix of `s` that is at most `max` bytes long and ends
/// on a UTF-8 character boundary.
///
/// Strings already within the budget are returned unchanged.
fn safe_truncate(s: &str, max: usize) -> &str {
    if s.len() <= max {
        return s;
    }

    // Walk back from `max` to the nearest boundary. Offset 0 is always a
    // boundary, so the search cannot come up empty.
    let cut = (0..=max)
        .rev()
        .find(|&offset| s.is_char_boundary(offset))
        .unwrap_or(0);
    &s[..cut]
}
59
60async fn execute_read_file(args: &Value, workspace: &str) -> ToolResult {
61    let path = args.get("path").and_then(|v| v.as_str()).unwrap_or("");
62    let start_line = args.get("start_line").and_then(|v| v.as_u64()).unwrap_or(1) as usize;
63    let end_line = args
64        .get("end_line")
65        .and_then(|v| v.as_u64())
66        .map(|v| v as usize);
67
68    if path.is_empty() {
69        return ToolResult {
70            tool_name: "read_file".into(),
71            content: "Error: path is required".into(),
72            success: false,
73            is_write: false,
74            is_submit: false,
75        };
76    }
77
78    let full_path = match resolve_path(workspace, path) {
79        Ok(p) => p,
80        Err(e) => {
81            return ToolResult {
82                tool_name: "read_file".into(),
83                content: format!("Error: {}", e),
84                success: false,
85                is_write: false,
86                is_submit: false,
87            }
88        }
89    };
90
91    match tokio::fs::read_to_string(&full_path).await {
92        Ok(contents) => {
93            let lines: Vec<&str> = contents.lines().collect();
94            let total = lines.len();
95            let start = start_line.saturating_sub(1).min(total);
96            let end = end_line.unwrap_or(start + MAX_READ_LINES).min(total);
97
98            let mut output = format!(
99                "File: {} ({} lines total, showing {}-{})\n\n",
100                path,
101                total,
102                start + 1,
103                end
104            );
105            for (i, line) in lines[start..end].iter().enumerate() {
106                output.push_str(&format!("{:>5} | {}\n", start + i + 1, line));
107            }
108            if end < total {
109                output.push_str(&format!(
110                    "\n... {} more lines. Use start_line={} to continue.\n",
111                    total - end,
112                    end + 1
113                ));
114            }
115
116            ToolResult {
117                tool_name: "read_file".into(),
118                content: output,
119                success: true,
120                is_write: false,
121                is_submit: false,
122            }
123        }
124        Err(e) => ToolResult {
125            tool_name: "read_file".into(),
126            content: format!("Error reading {}: {}", path, e),
127            success: false,
128            is_write: false,
129            is_submit: false,
130        },
131    }
132}
133
134async fn execute_grep_search(args: &Value, workspace: &str) -> ToolResult {
135    let pattern = args.get("pattern").and_then(|v| v.as_str()).unwrap_or("");
136    let search_path = args.get("path").and_then(|v| v.as_str()).unwrap_or(".");
137
138    if pattern.is_empty() {
139        return ToolResult {
140            tool_name: "grep_search".into(),
141            content: "Error: pattern is required".into(),
142            success: false,
143            is_write: false,
144            is_submit: false,
145        };
146    }
147
148    let full_path = match resolve_path(workspace, search_path) {
149        Ok(p) => p,
150        Err(e) => {
151            return ToolResult {
152                tool_name: "grep_search".into(),
153                content: format!("Error: {}", e),
154                success: false,
155                is_write: false,
156                is_submit: false,
157            }
158        }
159    };
160
161    let result = tokio::process::Command::new("grep")
162        .args([
163            "-rn",
164            "--include=*.py",
165            "--include=*.pyx",
166            "--include=*.pyi",
167            "--include=*.cfg",
168            "--include=*.toml",
169            "--include=*.txt",
170            "--include=*.rst",
171            "--include=*.md",
172            "--include=*.yml",
173            "--include=*.yaml",
174            "--include=*.json",
175            "--exclude-dir=.git",
176            "--exclude-dir=__pycache__",
177            "--exclude-dir=*.egg-info",
178            "--exclude-dir=.tox",
179            "--exclude-dir=build",
180            pattern,
181            &full_path,
182        ])
183        .output()
184        .await;
185
186    match result {
187        Ok(output) => {
188            let stdout = String::from_utf8_lossy(&output.stdout);
189            let lines: Vec<&str> = stdout.lines().collect();
190            let total_matches = lines.len();
191            let prefix = format!("{}/", workspace.trim_end_matches('/'));
192            let mut result_text = format!("Found {} matches for '{}'\n\n", total_matches, pattern);
193            for line in lines.iter().take(MAX_GREP_MATCHES) {
194                let clean = line.strip_prefix(&prefix).unwrap_or(line);
195                result_text.push_str(clean);
196                result_text.push('\n');
197            }
198            if total_matches > MAX_GREP_MATCHES {
199                result_text.push_str(&format!(
200                    "\n... {} more matches (showing first {})\n",
201                    total_matches - MAX_GREP_MATCHES,
202                    MAX_GREP_MATCHES
203                ));
204            }
205            ToolResult {
206                tool_name: "grep_search".into(),
207                content: result_text,
208                success: true,
209                is_write: false,
210                is_submit: false,
211            }
212        }
213        Err(e) => ToolResult {
214            tool_name: "grep_search".into(),
215            content: format!("Error running grep: {}", e),
216            success: false,
217            is_write: false,
218            is_submit: false,
219        },
220    }
221}
222
223async fn execute_list_directory(args: &Value, workspace: &str) -> ToolResult {
224    let dir_path = args.get("path").and_then(|v| v.as_str()).unwrap_or(".");
225    let full_path = match resolve_path(workspace, dir_path) {
226        Ok(p) => p,
227        Err(e) => {
228            return ToolResult {
229                tool_name: "list_directory".into(),
230                content: format!("Error: {}", e),
231                success: false,
232                is_write: false,
233                is_submit: false,
234            }
235        }
236    };
237
238    let path = Path::new(&full_path);
239    if !path.is_dir() {
240        return ToolResult {
241            tool_name: "list_directory".into(),
242            content: format!("Not a directory: {}", dir_path),
243            success: false,
244            is_write: false,
245            is_submit: false,
246        };
247    }
248
249    let mut entries: Vec<String> = Vec::new();
250    match tokio::fs::read_dir(&full_path).await {
251        Ok(mut dir) => {
252            while let Ok(Some(entry)) = dir.next_entry().await {
253                let name = entry.file_name().to_string_lossy().to_string();
254                if name.starts_with('.')
255                    || name == "__pycache__"
256                    || name == "node_modules"
257                    || name.ends_with(".egg-info")
258                    || name == ".tox"
259                {
260                    continue;
261                }
262                if let Ok(meta) = entry.metadata().await {
263                    if meta.is_dir() {
264                        entries.push(format!("[dir]  {}/", name));
265                    } else {
266                        let size = meta.len();
267                        let size_str = if size < 1024 {
268                            format!("{} B", size)
269                        } else if size < 1024 * 1024 {
270                            format!("{:.1} KB", size as f64 / 1024.0)
271                        } else {
272                            format!("{:.1} MB", size as f64 / (1024.0 * 1024.0))
273                        };
274                        entries.push(format!("[file] {} ({})", name, size_str));
275                    }
276                }
277            }
278            entries.sort();
279            let mut output = format!("Directory: {} ({} entries)\n\n", dir_path, entries.len());
280            for e in &entries {
281                output.push_str(e);
282                output.push('\n');
283            }
284            ToolResult {
285                tool_name: "list_directory".into(),
286                content: output,
287                success: true,
288                is_write: false,
289                is_submit: false,
290            }
291        }
292        Err(e) => ToolResult {
293            tool_name: "list_directory".into(),
294            content: format!("Error reading directory {}: {}", dir_path, e),
295            success: false,
296            is_write: false,
297            is_submit: false,
298        },
299    }
300}
301
/// Programs the ReAct agent is allowed to invoke. Any new entry is a
/// deliberate security decision — keep this list tight.
///
/// `run_command` compares the first parsed argv token against this list by
/// exact string equality, after rewriting a bare `python` to `python3`. A
/// path-qualified name such as `/usr/bin/python3` therefore does not match.
///
/// NOTE(review): several entries (the interpreters, `make`, `find`, `git`,
/// package managers) are presumably able to launch other programs
/// themselves, so this list bounds what the agent can name directly rather
/// than what can ultimately run — confirm the surrounding sandbox covers
/// the rest.
const ALLOWED_RUN_COMMAND_HEADS: &[&str] = &[
    // Python tooling.
    "pytest",
    "python",
    "python3",
    "pip",
    "pip3",
    "uv",
    "ruff",
    "black",
    "mypy",
    "tox",
    "pre-commit",
    // File inspection and basic shell utilities.
    "ls",
    "cat",
    "head",
    "tail",
    "grep",
    "find",
    "wc",
    "echo",
    "true",
    "false",
    "test",
    // File-system mutation.
    "mkdir",
    "rmdir",
    "rm",
    "touch",
    "cp",
    "mv",
    // Version control and build tools.
    "git",
    "make",
    "cargo",
    "npm",
    "yarn",
    "node",
];
340
341async fn execute_run_command(args: &Value, workspace: &str) -> ToolResult {
342    let cmd = args.get("command").and_then(|v| v.as_str()).unwrap_or("");
343    if cmd.is_empty() {
344        return ToolResult {
345            tool_name: "run_command".into(),
346            content: "Error: command is required".into(),
347            success: false,
348            is_write: false,
349            is_submit: false,
350        };
351    }
352
353    // Parse argv with shell quoting rules. We do NOT pass to `sh -c` — running
354    // direct via Command::new(argv[0]).args(argv[1..]) means shell substitution
355    // (`$(...)`, backticks, `&&`, `|`, redirects) is never interpreted, which
356    // closes the substring-blocklist bypass class outright.
357    let mut argv: Vec<String> = match shell_words::split(cmd) {
358        Ok(v) if !v.is_empty() => v,
359        Ok(_) => {
360            return ToolResult {
361                tool_name: "run_command".into(),
362                content: "Error: empty command after parsing".into(),
363                success: false,
364                is_write: false,
365                is_submit: false,
366            };
367        }
368        Err(e) => {
369            return ToolResult {
370                tool_name: "run_command".into(),
371                content: format!(
372                    "Error: cannot parse command (use simple argv form, no unbalanced quotes): {}",
373                    e
374                ),
375                success: false,
376                is_write: false,
377                is_submit: false,
378            };
379        }
380    };
381
382    // Reject shell-composition tokens. They would only have meaning under
383    // `sh -c`; here they'd just be passed as literal args and confuse the
384    // child. Failing loudly is friendlier than silently misexecuting.
385    for token in &argv {
386        if matches!(
387            token.as_str(),
388            "&&" | "||" | ";" | "|" | ">" | ">>" | "<" | "<<" | "&"
389        ) {
390            return ToolResult {
391                tool_name: "run_command".into(),
392                content: format!(
393                    "Error: shell metacharacter '{}' is not supported. Run as separate commands.",
394                    token
395                ),
396                success: false,
397                is_write: false,
398                is_submit: false,
399            };
400        }
401        if token.contains('\0') {
402            return ToolResult {
403                tool_name: "run_command".into(),
404                content: "Error: null byte in argument".into(),
405                success: false,
406                is_write: false,
407                is_submit: false,
408            };
409        }
410    }
411
412    // python → python3 only when argv[0] is exactly the bare interpreter.
413    // The old substring-replace corrupted strings like `pythonic_test_file`.
414    if argv[0] == "python" {
415        argv[0] = "python3".to_string();
416    }
417
418    let head = argv[0].clone();
419    if !ALLOWED_RUN_COMMAND_HEADS.iter().any(|a| *a == head) {
420        return ToolResult {
421            tool_name: "run_command".into(),
422            content: format!(
423                "Error: program '{}' is not in the allowlist. Edit ALLOWED_RUN_COMMAND_HEADS in swebench_tools.rs to permit it.",
424                head
425            ),
426            success: false,
427            is_write: false,
428            is_submit: false,
429        };
430    }
431
432    let result = tokio::time::timeout(
433        std::time::Duration::from_secs(COMMAND_TIMEOUT_SECS),
434        tokio::process::Command::new(&head)
435            .args(&argv[1..])
436            .current_dir(workspace)
437            .output(),
438    )
439    .await;
440
441    match result {
442        Ok(Ok(output)) => {
443            let stdout = String::from_utf8_lossy(&output.stdout);
444            let stderr = String::from_utf8_lossy(&output.stderr);
445            let exit_code = output.status.code().unwrap_or(-1);
446            let mut text = format!("Exit code: {}\n", exit_code);
447            if !stdout.is_empty() {
448                text.push_str(&format!(
449                    "\nSTDOUT:\n{}",
450                    safe_truncate(&stdout, MAX_OUTPUT_CHARS)
451                ));
452                if stdout.len() > MAX_OUTPUT_CHARS {
453                    text.push_str(&format!("\n... truncated ({} chars total)\n", stdout.len()));
454                }
455            }
456            if !stderr.is_empty() {
457                text.push_str(&format!(
458                    "\nSTDERR:\n{}",
459                    safe_truncate(&stderr, MAX_OUTPUT_CHARS)
460                ));
461            }
462            ToolResult {
463                tool_name: "run_command".into(),
464                content: text,
465                success: exit_code == 0,
466                is_write: false,
467                is_submit: false,
468            }
469        }
470        Ok(Err(e)) => ToolResult {
471            tool_name: "run_command".into(),
472            content: format!("Error executing command: {}", e),
473            success: false,
474            is_write: false,
475            is_submit: false,
476        },
477        Err(_) => ToolResult {
478            tool_name: "run_command".into(),
479            content: format!("Command timed out after {}s", COMMAND_TIMEOUT_SECS),
480            success: false,
481            is_write: false,
482            is_submit: false,
483        },
484    }
485}
486
487async fn execute_write_file(args: &Value, workspace: &str) -> ToolResult {
488    let path = args.get("path").and_then(|v| v.as_str()).unwrap_or("");
489    let content = args.get("content").and_then(|v| v.as_str()).unwrap_or("");
490    if path.is_empty() {
491        return ToolResult {
492            tool_name: "write_file".into(),
493            content: "Error: path is required".into(),
494            success: false,
495            is_write: true,
496            is_submit: false,
497        };
498    }
499    let full_path = match resolve_path(workspace, path) {
500        Ok(p) => p,
501        Err(e) => {
502            return ToolResult {
503                tool_name: "write_file".into(),
504                content: format!("Error: {}", e),
505                success: false,
506                is_write: true,
507                is_submit: false,
508            }
509        }
510    };
511
512    if let Some(parent) = Path::new(&full_path).parent() {
513        if let Err(e) = tokio::fs::create_dir_all(parent).await {
514            return ToolResult {
515                tool_name: "write_file".into(),
516                content: format!("Error creating directories: {}", e),
517                success: false,
518                is_write: true,
519                is_submit: false,
520            };
521        }
522    }
523
524    match tokio::fs::write(&full_path, content).await {
525        Ok(()) => {
526            let lines = content.lines().count();
527            ToolResult {
528                tool_name: "write_file".into(),
529                content: format!(
530                    "File written: {} ({} lines, {} bytes)",
531                    path,
532                    lines,
533                    content.len()
534                ),
535                success: true,
536                is_write: true,
537                is_submit: false,
538            }
539        }
540        Err(e) => ToolResult {
541            tool_name: "write_file".into(),
542            content: format!("Error writing {}: {}", path, e),
543            success: false,
544            is_write: true,
545            is_submit: false,
546        },
547    }
548}
549
550async fn execute_apply_edit(args: &Value, workspace: &str) -> ToolResult {
551    let path = args.get("path").and_then(|v| v.as_str()).unwrap_or("");
552    let old_text = args.get("old_text").and_then(|v| v.as_str()).unwrap_or("");
553    let new_text = args.get("new_text").and_then(|v| v.as_str()).unwrap_or("");
554
555    if path.is_empty() || old_text.is_empty() {
556        return ToolResult {
557            tool_name: "apply_edit".into(),
558            content: "Error: path and old_text are required".into(),
559            success: false,
560            is_write: true,
561            is_submit: false,
562        };
563    }
564
565    let full_path = match resolve_path(workspace, path) {
566        Ok(p) => p,
567        Err(e) => {
568            return ToolResult {
569                tool_name: "apply_edit".into(),
570                content: format!("Error: {}", e),
571                success: false,
572                is_write: true,
573                is_submit: false,
574            }
575        }
576    };
577
578    match tokio::fs::read_to_string(&full_path).await {
579        Ok(contents) => {
580            if let Some(pos) = contents.find(old_text) {
581                let new_contents = format!(
582                    "{}{}{}",
583                    &contents[..pos],
584                    new_text,
585                    &contents[pos + old_text.len()..]
586                );
587                let remaining = &new_contents[pos + new_text.len()..];
588                let extra = remaining.matches(old_text).count();
589
590                match tokio::fs::write(&full_path, &new_contents).await {
591                    Ok(()) => {
592                        let mut msg = format!("Edit applied to {}", path);
593                        if extra > 0 {
594                            msg.push_str(&format!(
595                                " (warning: {} more occurrence(s) of old_text remain)",
596                                extra
597                            ));
598                        }
599                        ToolResult {
600                            tool_name: "apply_edit".into(),
601                            content: msg,
602                            success: true,
603                            is_write: true,
604                            is_submit: false,
605                        }
606                    }
607                    Err(e) => ToolResult {
608                        tool_name: "apply_edit".into(),
609                        content: format!("Error writing {}: {}", path, e),
610                        success: false,
611                        is_write: true,
612                        is_submit: false,
613                    },
614                }
615            } else {
616                ToolResult {
617                    tool_name: "apply_edit".into(),
618                    content: format!("Error: old_text not found in {}.\nold_text (first 200 chars): '{}'\nFile preview (first 500 chars):\n{}", path, safe_truncate(old_text, 200), safe_truncate(&contents, 500)),
619                    success: false, is_write: true, is_submit: false,
620                }
621            }
622        }
623        Err(e) => ToolResult {
624            tool_name: "apply_edit".into(),
625            content: format!("Error reading {}: {}", path, e),
626            success: false,
627            is_write: true,
628            is_submit: false,
629        },
630    }
631}
632
633fn execute_submit() -> ToolResult {
634    ToolResult {
635        tool_name: "submit".into(),
636        content: "Submission recorded. Your changes will now be evaluated against the test suite."
637            .into(),
638        success: true,
639        is_write: false,
640        is_submit: true,
641    }
642}
643
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Run `cmd` through `execute_run_command` synchronously, using a
    /// per-process scratch directory under the system temp dir as workspace.
    fn run(cmd: &str) -> ToolResult {
        let scratch = std::env::temp_dir().join(format!("bcf-sw-{}", std::process::id()));
        // Best-effort: a failure here surfaces when the command is spawned.
        let _ = std::fs::create_dir_all(&scratch);
        let workspace = scratch.to_string_lossy().into_owned();

        let payload = json!({ "command": cmd });
        let runtime = tokio::runtime::Runtime::new().unwrap();
        runtime.block_on(execute_run_command(&payload, &workspace))
    }

    // Regression: with no `sh -c` in the picture, `$(...)` is never expanded.
    // The first token is the literal `$(echo`, which is not an allowlisted
    // program, so the call must fail instead of running anything.
    #[test]
    fn test_run_command_rejects_command_substitution() {
        let outcome = run("$(echo rm) -rf /");
        assert!(!outcome.success);
        let explained =
            outcome.content.contains("allowlist") || outcome.content.contains("metacharacter");
        assert!(explained, "unexpected error: {}", outcome.content);
    }

    // Chaining and piping operators are refused with a dedicated message.
    #[test]
    fn test_run_command_rejects_compound_metachars() {
        let compound = ["echo a && echo b", "echo a ; echo b", "echo a | grep b"];
        for cmd in compound.iter() {
            let outcome = run(cmd);
            assert!(!outcome.success, "should have rejected: {}", cmd);
            assert!(
                outcome.content.contains("metacharacter"),
                "expected metachar error for {:?}, got: {}",
                cmd,
                outcome.content
            );
        }
    }

    // Programs outside the allowlist never get spawned.
    #[test]
    fn test_run_command_rejects_unallowlisted_head() {
        let outcome = run("curl http://example.com");
        assert!(!outcome.success);
        assert!(
            outcome.content.contains("allowlist"),
            "got: {}",
            outcome.content
        );
    }

    #[test]
    fn test_run_command_python_rewrite_only_for_bare_head() {
        // A bare `python` head is rewritten to `python3` before the allowlist
        // check. Whether the interpreter then runs or fails to spawn does not
        // matter here, only that the head was not rejected.
        let bare = run("python --version");
        assert!(
            !bare.content.contains("not in the allowlist"),
            "python head should pass allowlist after rewrite: {}",
            bare.content
        );

        // The rewrite is limited to argv[0]: an argument that merely starts
        // with "python" must reach `echo` unmodified.
        let argument = run("echo pythonic_test_file");
        assert!(
            !argument.content.contains("python3ic"),
            "python rewrite should not corrupt non-head tokens: {}",
            argument.content
        );
    }

    // A file name containing ".." is not a traversal and must be accepted.
    #[test]
    fn test_resolve_path_filename_with_dotdot_allowed() {
        let resolved = resolve_path("/tmp/wsroot", "file..py");
        assert!(
            resolved.is_ok(),
            "filename with double-dots should be allowed, got {:?}",
            resolved
        );
    }

    // Parent-directory escapes and absolute paths are refused.
    #[test]
    fn test_resolve_path_blocks_real_traversal() {
        let escapes = ["../etc/passwd", "app/../../etc/shadow", "/etc/passwd"];
        for candidate in escapes.iter() {
            assert!(resolve_path("/tmp/wsroot", candidate).is_err());
        }
    }
}