Skip to main content

battlecommand_forge/
codegen.rs

1//! Multi-file code extraction from LLM responses.
2//!
3//! LLMs often produce output containing multiple files, marked with path headers
4//! like `# filepath: app/config.py` or `### app/models/user.py` before code fences.
5//! This module parses those into individual files for writing to disk.
6
7use anyhow::{Context, Result};
8use std::fs;
9use std::path::{Path, PathBuf};
10
/// A single generated file extracted from LLM output.
///
/// Values are produced by [`extract_files`] and consumed by [`write_files`].
/// Equality compares all three fields, which lets tests assert on whole values.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GeneratedFile {
    /// Path of the file as named in the LLM response; it is not yet joined
    /// to any output directory.
    pub path: PathBuf,
    /// Text that will be written to the file.
    pub content: String,
    /// Language tag: taken from the code fence when present, otherwise
    /// inferred from the file extension or the caller's default.
    pub language: String,
}
18
19/// Extract multiple files from a raw LLM response.
20///
21/// Recognizes these patterns before/inside code fences:
22/// - `# filepath: app/config.py`
23/// - `### app/models/user.py`
24/// - `**app/config.py**`
25/// - `<!-- file: src/index.ts -->`
26/// - First line inside fence: `# app/config.py` (comment-style path)
27pub fn extract_files(raw: &str, default_language: &str) -> Vec<GeneratedFile> {
28    let mut files = Vec::new();
29    let mut pending_path: Option<String> = None;
30    let mut in_fence = false;
31    let mut fence_lang = String::new();
32    let mut fence_content = String::new();
33
34    for line in raw.lines() {
35        let trimmed = line.trim();
36
37        // Detect code fence start
38        if !in_fence && trimmed.starts_with("```") {
39            in_fence = true;
40            fence_lang = trimmed.trim_start_matches('`').trim().to_string();
41            fence_content.clear();
42            continue;
43        }
44
45        // Detect code fence end
46        if in_fence && trimmed == "```" {
47            in_fence = false;
48            let content = fence_content.trim().to_string();
49            if content.is_empty() {
50                pending_path = None;
51                continue;
52            }
53
54            // Determine the file path
55            let file_path = if let Some(ref p) = pending_path {
56                p.clone()
57            } else {
58                // Check if the first line of content is a filepath comment
59                extract_path_from_first_line(&content).unwrap_or_default()
60            };
61
62            if !file_path.is_empty() {
63                // Sanitize path: strip leading #, *, spaces, backticks
64                let file_path = file_path
65                    .trim_start_matches(['#', '*', ' ', '`'])
66                    .to_string();
67
68                if !file_path.is_empty() && looks_like_path(&file_path) {
69                    let lang = if fence_lang.is_empty() {
70                        detect_lang_from_path(&file_path)
71                            .unwrap_or_else(|| default_language.to_string())
72                    } else {
73                        fence_lang.clone()
74                    };
75
76                    // Strip the filepath comment from content if it was the first line
77                    let clean_content = strip_filepath_comment(&content, &file_path);
78
79                    // Strip markdown code fences from non-code content (e.g., ```toml wrapper in .toml files)
80                    let clean_content = strip_inner_code_fences(&clean_content, &file_path);
81
82                    files.push(GeneratedFile {
83                        path: PathBuf::from(&file_path),
84                        content: clean_content,
85                        language: lang,
86                    });
87                }
88            }
89
90            pending_path = None;
91            continue;
92        }
93
94        // Inside a fence — accumulate content
95        if in_fence {
96            fence_content.push_str(line);
97            fence_content.push('\n');
98            continue;
99        }
100
101        // Outside fence — look for file path indicators
102        if let Some(path) = extract_path_from_header(trimmed) {
103            pending_path = Some(path);
104        }
105    }
106
107    files
108}
109
110/// Try to extract a file path from a header/comment line outside a code fence.
111fn extract_path_from_header(line: &str) -> Option<String> {
112    let trimmed = line.trim();
113
114    // `# filepath: app/config.py` or `// filepath: src/main.rs`
115    for prefix in &["# filepath:", "// filepath:", "filepath:"] {
116        if let Some(rest) = trimmed.strip_prefix(prefix) {
117            let path = rest.trim().trim_matches('`');
118            if looks_like_path(path) {
119                return Some(path.to_string());
120            }
121        }
122    }
123
124    // `<!-- file: src/index.ts -->`
125    if trimmed.starts_with("<!--") && trimmed.contains("file:") {
126        if let Some(rest) = trimmed.split("file:").nth(1) {
127            let path = rest.trim().trim_end_matches("-->").trim().trim_matches('`');
128            if looks_like_path(path) {
129                return Some(path.to_string());
130            }
131        }
132    }
133
134    // `### app/models/user.py` or `## app/config.py`
135    if trimmed.starts_with('#') {
136        let after_hashes = trimmed.trim_start_matches('#').trim();
137        // Must contain a dot (extension) and slash (directory) to be a path
138        if looks_like_path(after_hashes) {
139            let path = after_hashes.trim_matches('`').trim_matches('*');
140            return Some(path.to_string());
141        }
142    }
143
144    // `**app/config.py**`
145    if trimmed.starts_with("**") && trimmed.ends_with("**") {
146        let inner = &trimmed[2..trimmed.len() - 2];
147        if looks_like_path(inner) {
148            return Some(inner.to_string());
149        }
150    }
151
152    // `File: app/main.py` or `File: `app/main.py``
153    if let Some(rest) = trimmed.strip_prefix("File:") {
154        let path = rest.trim().trim_matches('`');
155        if looks_like_path(path) {
156            return Some(path.to_string());
157        }
158    }
159
160    None
161}
162
163/// Try to extract a file path from the first line inside a code fence.
164/// E.g., `# app/config.py` as the first line of a Python code block.
165fn extract_path_from_first_line(content: &str) -> Option<String> {
166    let first_line = content.lines().next()?.trim();
167
168    // `# app/config.py`
169    if let Some(rest) = first_line.strip_prefix('#') {
170        let path = rest.trim();
171        if looks_like_path(path) {
172            return Some(path.to_string());
173        }
174    }
175
176    // `// src/index.ts`
177    if let Some(rest) = first_line.strip_prefix("//") {
178        let path = rest.trim();
179        if looks_like_path(path) {
180            return Some(path.to_string());
181        }
182    }
183
184    None
185}
186
/// Remove a leading filepath comment from `content`.
///
/// The first line is dropped when it is a `#` or `//` comment that mentions
/// `path`; the remaining lines are joined with `\n` and trimmed. In every
/// other case `content` is returned unchanged.
fn strip_filepath_comment(content: &str, path: &str) -> String {
    let head = content.lines().next().map(str::trim).unwrap_or("");
    let is_comment = head.starts_with('#') || head.starts_with("//");

    if !(is_comment && head.contains(path)) {
        return content.to_string();
    }

    let body: Vec<&str> = content.lines().skip(1).collect();
    body.join("\n").trim().to_string()
}
199
/// Strip nested code fences from file content.
///
/// When the LLM wraps a non-code file (e.g., pyproject.toml) in ```toml...```,
/// the outer fence is stripped by extract_files but the content still starts
/// with ```toml and ends with ```. This strips those inner fences.
///
/// `content` is returned unchanged when it does not start with a fence, when
/// `path` is not a config/data file, or when the fence opener is the only line
/// (there is no body to unwrap). A missing closing fence is tolerated.
fn strip_inner_code_fences(content: &str, path: &str) -> String {
    // Don't strip from actual code files (Python, Rust, etc.) — only config/data files
    const CONFIG_EXTENSIONS: &[&str] = &[
        ".toml", ".yaml", ".yml", ".json", ".ini", ".cfg", ".env", ".md", ".txt", ".html", ".css",
        ".sql", ".sh",
    ];

    let trimmed = content.trim();
    let is_config_file = CONFIG_EXTENSIONS.iter().any(|ext| path.ends_with(*ext));
    if !trimmed.starts_with("```") || !is_config_file {
        return content.to_string();
    }

    // Everything after the opening ```lang line. Without a newline there is no
    // body at all; slicing at a made-up offset would only mangle the text.
    let body = match trimmed.split_once('\n') {
        Some((_opening_fence, body)) => body,
        None => return content.to_string(),
    };

    // Drop the closing ``` when present.
    let body = body.trim_end();
    let body = body.strip_suffix("```").unwrap_or(body);

    body.trim().to_string()
}
229
/// Check if a string looks like a relative file path with a known extension.
///
/// After trimming, the candidate is rejected when it:
/// - is empty or longer than 200 bytes,
/// - contains whitespace or control characters (prose such as
///   `Step 1: create main.py` is not a path),
/// - is a placeholder copied from instructions (`path/to/...`, `file.py`),
/// - contains `..` (path traversal) or starts with `/` (absolute path),
/// - does not end in one of the recognised extensions.
fn looks_like_path(s: &str) -> bool {
    const KNOWN_EXTENSIONS: &[&str] = &[
        ".py", ".rs", ".ts", ".tsx", ".js", ".jsx", ".go", ".java", ".toml", ".yaml", ".yml",
        ".json", ".md", ".txt", ".html", ".css", ".sql", ".sh", ".cfg", ".ini", ".env",
    ];

    let s = s.trim();
    if s.is_empty() || s.len() > 200 {
        return false;
    }
    // Headings and comments often end in a filename; interior whitespace is the
    // cheapest reliable signal that the text is a sentence, not a path.
    if s.chars().any(|c| c.is_whitespace() || c.is_control()) {
        return false;
    }
    // Reject placeholder paths that LLMs copy from instructions
    if s.starts_with("path/to/") || s == "file.py" {
        return false;
    }
    // Must not contain path traversal and must not be absolute
    if s.contains("..") || s.starts_with('/') {
        return false;
    }
    // Every known extension starts with '.', so this also enforces "has a dot".
    KNOWN_EXTENSIONS.iter().any(|ext| s.ends_with(*ext))
}
259
/// Map a file path to a language name based on its suffix.
///
/// Knows Python, TypeScript (`.ts`/`.tsx`), JavaScript (`.js`/`.jsx`), Rust and
/// Go; returns `None` for anything else.
fn detect_lang_from_path(path: &str) -> Option<String> {
    // The suffixes are mutually exclusive, so lookup order does not matter.
    const LANGUAGE_BY_SUFFIX: &[(&str, &str)] = &[
        (".py", "python"),
        (".ts", "typescript"),
        (".tsx", "typescript"),
        (".js", "javascript"),
        (".jsx", "javascript"),
        (".rs", "rust"),
        (".go", "go"),
    ];

    LANGUAGE_BY_SUFFIX
        .iter()
        .find(|(suffix, _)| path.ends_with(*suffix))
        .map(|(_, language)| (*language).to_string())
}
276
277/// Write a list of generated files to an output directory.
278/// Creates parent directories as needed. Returns paths of all written files.
279pub fn write_files(output_dir: &Path, files: &[GeneratedFile]) -> Result<Vec<PathBuf>> {
280    let mut written = Vec::new();
281    for file in files {
282        let path_str = file.path.display().to_string();
283        let full_path = match crate::sandbox::validate_path_within(output_dir, &path_str) {
284            Ok(p) => p,
285            Err(e) => {
286                eprintln!("[SECURITY] Skipping file write: {}", e);
287                continue;
288            }
289        };
290        if let Some(parent) = full_path.parent() {
291            fs::create_dir_all(parent)
292                .with_context(|| format!("Failed to create dir for {}", file.path.display()))?;
293        }
294        fs::write(&full_path, &file.content)
295            .with_context(|| format!("Failed to write {}", file.path.display()))?;
296        written.push(full_path);
297    }
298    Ok(written)
299}
300
301/// Write boilerplate files (README, Dockerfile, requirements.txt, __init__.py, etc.)
302pub fn write_boilerplate(output_dir: &Path, language: &str, prompt: &str) -> Result<()> {
303    if language == "python" {
304        // __init__.py files for any directories that contain .py files
305        create_init_files(output_dir)?;
306
307        if !output_dir.join("requirements.txt").exists() {
308            fs::write(
309                output_dir.join("requirements.txt"),
310                "fastapi\nuvicorn[standard]\nPyJWT\ncryptography\npydantic[email]\npython-multipart\nslowapi\npytest\nhttpx\npasslib[bcrypt]\npython-jose[cryptography]\nsqlalchemy\n",
311            )?;
312        }
313        if !output_dir.join("Dockerfile").exists() {
314            fs::write(
315                output_dir.join("Dockerfile"),
316                "FROM python:3.12-slim\nWORKDIR /app\nCOPY requirements.txt .\nRUN pip install --no-cache-dir -r requirements.txt\nCOPY . .\nEXPOSE 8000\nCMD [\"uvicorn\", \"app.main:app\", \"--host\", \"0.0.0.0\"]\n",
317            )?;
318        }
319    }
320
321    // README
322    if !output_dir.join("README.md").exists() {
323        fs::write(
324            output_dir.join("README.md"),
325            format!(
326                "# Generated by BattleCommand Forge v1.1\n\n**Prompt:** {}\n\n**Quality gate:** >= 9.2/10\n",
327                prompt
328            ),
329        )?;
330    }
331
332    Ok(())
333}
334
335/// Create __init__.py in every directory that contains .py files.
336fn create_init_files(dir: &Path) -> Result<()> {
337    if !dir.is_dir() {
338        return Ok(());
339    }
340    let has_py = fs::read_dir(dir)?.any(|e| {
341        e.ok()
342            .map(|e| e.path().extension().map(|ext| ext == "py").unwrap_or(false))
343            .unwrap_or(false)
344    });
345    if has_py {
346        let init = dir.join("__init__.py");
347        if !init.exists() {
348            fs::write(&init, "")?;
349        }
350    }
351    for entry in fs::read_dir(dir)? {
352        let entry = entry?;
353        if entry.path().is_dir() {
354            create_init_files(&entry.path())?;
355        }
356    }
357    Ok(())
358}
359
#[cfg(test)]
mod tests {
    use super::*;

    /// Borrow the path of every extracted file, in extraction order.
    fn paths_of(files: &[GeneratedFile]) -> Vec<&Path> {
        files.iter().map(|file| file.path.as_path()).collect()
    }

    /// Markdown headings before each fence name the files.
    #[test]
    fn test_extract_files_with_filepath_headers() {
        let response = "\
Here are the files:

### app/main.py

```python
from fastapi import FastAPI

app = FastAPI()
```

### app/config.py

```python
import os

SECRET = os.getenv(\"SECRET\")
```
";
        let extracted = extract_files(response, "python");

        assert_eq!(
            paths_of(&extracted),
            [Path::new("app/main.py"), Path::new("app/config.py")]
        );
        assert!(extracted[0].content.contains("FastAPI"));
        assert!(extracted[1].content.contains("SECRET"));
    }

    /// A comment on the first line inside the fence names the file.
    #[test]
    fn test_extract_files_with_inline_path() {
        let response = "\
```python
# app/models.py
from sqlalchemy import Column

class User:
    pass
```
";
        let extracted = extract_files(response, "python");

        assert_eq!(paths_of(&extracted), [Path::new("app/models.py")]);
        // The filepath comment should be stripped
        assert!(!extracted[0].content.starts_with("# app/models.py"));
    }

    /// A bold line before the fence names the file.
    #[test]
    fn test_extract_files_bold_header() {
        let response = "\
**app/utils.py**

```python
def helper():
    return 42
```
";
        let extracted = extract_files(response, "python");

        assert_eq!(paths_of(&extracted), [Path::new("app/utils.py")]);
    }

    /// Relative paths with known extensions pass; prose, traversal and
    /// absolute paths do not.
    #[test]
    fn test_looks_like_path() {
        for accepted in ["app/main.py", "src/index.ts"].iter() {
            assert!(looks_like_path(accepted), "should accept {:?}", accepted);
        }
        for rejected in ["just some text", "../etc/passwd", "/root/file.py"].iter() {
            assert!(!looks_like_path(rejected), "should reject {:?}", rejected);
        }
    }

    /// No path detected — should produce 0 files (caller uses fallback).
    #[test]
    fn test_fallback_single_fence_no_path() {
        let response = "```python\nprint('hello')\n```";
        let extracted = extract_files(response, "python");

        assert!(paths_of(&extracted).is_empty());
    }

    /// A `File:` line with a backticked path names the file.
    #[test]
    fn test_file_prefix_header() {
        let response =
            "File: `app/routes.py`\n\n```python\n@app.get('/')\ndef root():\n    pass\n```";
        let extracted = extract_files(response, "python");

        assert_eq!(paths_of(&extracted), [Path::new("app/routes.py")]);
    }
}