Skip to main content

infiniloom_engine/output/
xml.rs

1//! Claude-optimized XML output formatter
2//!
3//! This formatter is designed to maximize LLM comprehension of codebases by:
4//! 1. Providing an executive summary for quick understanding
5//! 2. Identifying entry points and key files
6//! 3. Showing architecture and dependencies
7//! 4. Prioritizing files by importance for code tasks
8//!
9//! Supports both in-memory (`format()`) and streaming (`format_to_writer()`) modes.
10
11use crate::output::{Formatter, StreamingFormatter};
12use crate::repomap::RepoMap;
13use crate::types::{Repository, TokenizerModel};
14use std::io::{self, Write};
15
16/// XML formatter optimized for Claude
17pub struct XmlFormatter {
18    /// Include line numbers in code
19    include_line_numbers: bool,
20    /// Optimize for prompt caching
21    cache_optimized: bool,
22    /// Include CDATA sections for code
23    use_cdata: bool,
24    /// Include file index/summary section
25    show_file_index: bool,
26    /// Token model for counts in output
27    token_model: TokenizerModel,
28}
29
30impl XmlFormatter {
31    /// Create a new XML formatter
32    pub fn new(cache_optimized: bool) -> Self {
33        Self {
34            include_line_numbers: true,
35            cache_optimized,
36            use_cdata: true,
37            show_file_index: true,
38            token_model: TokenizerModel::Claude,
39        }
40    }
41
42    /// Set line numbers option
43    pub fn with_line_numbers(mut self, enabled: bool) -> Self {
44        self.include_line_numbers = enabled;
45        self
46    }
47
48    /// Set CDATA option
49    pub fn with_cdata(mut self, enabled: bool) -> Self {
50        self.use_cdata = enabled;
51        self
52    }
53
54    /// Set file index/summary option
55    pub fn with_file_index(mut self, enabled: bool) -> Self {
56        self.show_file_index = enabled;
57        self
58    }
59
60    /// Set token model for token counts in output
61    pub fn with_model(mut self, model: TokenizerModel) -> Self {
62        self.token_model = model;
63        self
64    }
65
66    /// Estimate output size for pre-allocation
67    fn estimate_output_size(repo: &Repository) -> usize {
68        let base = 2000;
69        let files = repo.files.len() * 500;
70        let content: usize = repo
71            .files
72            .iter()
73            .filter_map(|f| f.content.as_ref())
74            .map(|c| c.len())
75            .sum();
76        base + files + content
77    }
78
79    fn detect_project_type(&self, repo: &Repository) -> String {
80        let has_cargo = repo.files.iter().any(|f| f.relative_path == "Cargo.toml");
81        let has_package_json = repo.files.iter().any(|f| f.relative_path == "package.json");
82        let has_pyproject = repo
83            .files
84            .iter()
85            .any(|f| f.relative_path == "pyproject.toml" || f.relative_path == "setup.py");
86        let has_go_mod = repo.files.iter().any(|f| f.relative_path == "go.mod");
87
88        let has_routes = repo
89            .files
90            .iter()
91            .any(|f| f.relative_path.contains("routes") || f.relative_path.contains("api/"));
92        let has_components = repo
93            .files
94            .iter()
95            .any(|f| f.relative_path.contains("components/") || f.relative_path.contains("views/"));
96
97        if has_cargo {
98            if repo
99                .files
100                .iter()
101                .any(|f| f.relative_path.ends_with("lib.rs"))
102            {
103                "Rust Library"
104            } else {
105                "Rust Application"
106            }
107        } else if has_package_json {
108            if has_components {
109                "Frontend Application (JavaScript/TypeScript)"
110            } else if has_routes {
111                "Backend API (Node.js)"
112            } else {
113                "JavaScript/TypeScript Project"
114            }
115        } else if has_pyproject {
116            if has_routes {
117                "Python Web API"
118            } else {
119                "Python Package"
120            }
121        } else if has_go_mod {
122            "Go Application"
123        } else {
124            "Software Project"
125        }
126        .to_owned()
127    }
128
129    fn is_entry_point(&self, path: &str) -> bool {
130        let entry_patterns = [
131            "main.rs",
132            "main.go",
133            "main.py",
134            "main.ts",
135            "main.js",
136            "main.c",
137            "main.cpp",
138            "index.ts",
139            "index.js",
140            "index.tsx",
141            "index.jsx",
142            "index.py",
143            "app.py",
144            "app.ts",
145            "app.js",
146            "app.tsx",
147            "app.jsx",
148            "app.go",
149            "server.py",
150            "server.ts",
151            "server.js",
152            "server.go",
153            "mod.rs",
154            "lib.rs",
155            "__main__.py",
156            "__init__.py",
157            "cmd/main.go",
158        ];
159        entry_patterns
160            .iter()
161            .any(|p| path.ends_with(p) || path.contains(&format!("/{}", p)))
162    }
163
164    fn get_entry_type(&self, path: &str) -> &'static str {
165        if path.contains("main") {
166            "main"
167        } else if path.contains("index") {
168            "index"
169        } else if path.contains("app") {
170            "app"
171        } else if path.contains("server") {
172            "server"
173        } else if path.contains("lib") {
174            "library"
175        } else if path.contains("mod.rs") {
176            "module"
177        } else {
178            "entry"
179        }
180    }
181
182    fn is_config_file(&self, path: &str) -> bool {
183        let config_files = [
184            "Cargo.toml",
185            "package.json",
186            "pyproject.toml",
187            "go.mod",
188            "pom.xml",
189            "build.gradle",
190            "Gemfile",
191            "requirements.txt",
192            "setup.py",
193            "setup.cfg",
194            "tsconfig.json",
195            "webpack.config",
196            "vite.config",
197            "next.config",
198            "Makefile",
199            "CMakeLists.txt",
200            "Dockerfile",
201            "docker-compose",
202            ".env.example",
203            "config.yaml",
204            "config.yml",
205            "config.json",
206        ];
207        let filename = path.rsplit('/').next().unwrap_or(path);
208        config_files.iter().any(|c| filename.contains(c)) && path.matches('/').count() <= 1
209    }
210
211    // =========================================================================
212    // Streaming methods (write to impl std::io::Write)
213    // =========================================================================
214
215    fn stream_llm_instructions<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
216        writeln!(w, "  <llm_context_guide>")?;
217        writeln!(w, "    <purpose>This is a comprehensive code context for the {} repository, optimized for AI-assisted code understanding and generation.</purpose>", escape_xml(&repo.name))?;
218        writeln!(w, "    <how_to_use>")?;
219        writeln!(w, "      <tip>Start with the &lt;overview&gt; section to understand the project's purpose and structure</tip>")?;
220        writeln!(w, "      <tip>Check &lt;entry_points&gt; to find main application files</tip>")?;
221        writeln!(
222            w,
223            "      <tip>Use &lt;repository_map&gt; to understand relationships between modules</tip>"
224        )?;
225        writeln!(
226            w,
227            "      <tip>Files are ordered by importance - most critical files come first</tip>"
228        )?;
229        writeln!(w, "    </how_to_use>")?;
230        writeln!(w, "  </llm_context_guide>")
231    }
232
233    fn stream_overview<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
234        writeln!(w, "  <overview>")?;
235        let project_type = self.detect_project_type(repo);
236        writeln!(w, "    <project_type>{}</project_type>", escape_xml(&project_type))?;
237
238        if let Some(lang) = repo.metadata.languages.iter().max_by_key(|l| l.files) {
239            writeln!(w, "    <primary_language>{}</primary_language>", escape_xml(&lang.language))?;
240        }
241        if let Some(framework) = &repo.metadata.framework {
242            writeln!(w, "    <framework>{}</framework>", escape_xml(framework))?;
243        }
244
245        writeln!(w, "    <entry_points>")?;
246        let mut entry_count = 0;
247        for file in &repo.files {
248            if self.is_entry_point(&file.relative_path) {
249                if file.relative_path.ends_with("__init__.py")
250                    && file.token_count.get(self.token_model) < 50
251                {
252                    continue;
253                }
254                let entry_type = self.get_entry_type(&file.relative_path);
255                writeln!(
256                    w,
257                    "      <entry path=\"{}\" type=\"{}\" tokens=\"{}\"/>",
258                    escape_xml(&file.relative_path),
259                    entry_type,
260                    file.token_count.get(self.token_model)
261                )?;
262                entry_count += 1;
263                if entry_count >= 10 {
264                    break;
265                }
266            }
267        }
268        writeln!(w, "    </entry_points>")?;
269
270        writeln!(w, "    <config_files>")?;
271        for file in &repo.files {
272            if self.is_config_file(&file.relative_path) {
273                writeln!(
274                    w,
275                    "      <config path=\"{}\" tokens=\"{}\"/>",
276                    escape_xml(&file.relative_path),
277                    file.token_count.get(self.token_model)
278                )?;
279            }
280        }
281        writeln!(w, "    </config_files>")?;
282        writeln!(w, "  </overview>")
283    }
284
285    fn stream_metadata<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
286        writeln!(w, "  <metadata>")?;
287        if let Some(desc) = &repo.metadata.description {
288            writeln!(w, "    <description>{}</description>", escape_xml(desc))?;
289        }
290        writeln!(w, "    <stats>")?;
291        writeln!(w, "      <files>{}</files>", repo.metadata.total_files)?;
292        writeln!(w, "      <lines>{}</lines>", repo.metadata.total_lines)?;
293        writeln!(
294            w,
295            "      <tokens model=\"claude\">{}</tokens>",
296            repo.metadata.total_tokens.get(self.token_model)
297        )?;
298        writeln!(w, "    </stats>")?;
299
300        if !repo.metadata.languages.is_empty() {
301            writeln!(w, "    <languages>")?;
302            for lang in &repo.metadata.languages {
303                writeln!(
304                    w,
305                    "      <language name=\"{}\" files=\"{}\" percentage=\"{:.1}\"/>",
306                    escape_xml(&lang.language),
307                    lang.files,
308                    lang.percentage
309                )?;
310            }
311            writeln!(w, "    </languages>")?;
312        }
313
314        if let Some(ref structure) = repo.metadata.directory_structure {
315            writeln!(w, "    <directory_structure><![CDATA[")?;
316            write!(w, "{}", structure)?;
317            writeln!(w, "]]></directory_structure>")?;
318        }
319
320        if !repo.metadata.external_dependencies.is_empty() {
321            writeln!(
322                w,
323                "    <dependencies count=\"{}\">",
324                repo.metadata.external_dependencies.len()
325            )?;
326            for dep in &repo.metadata.external_dependencies {
327                writeln!(w, "      <dependency name=\"{}\"/>", escape_xml(dep))?;
328            }
329            writeln!(w, "    </dependencies>")?;
330        }
331
332        // Add explicit file extension counts for accurate file counting queries
333        let mut ext_counts: std::collections::HashMap<String, usize> =
334            std::collections::HashMap::new();
335        for file in &repo.files {
336            if let Some(ext) = std::path::Path::new(&file.relative_path).extension() {
337                *ext_counts
338                    .entry(ext.to_string_lossy().to_string())
339                    .or_insert(0) += 1;
340            }
341        }
342        if !ext_counts.is_empty() {
343            writeln!(w, "    <file_extensions>")?;
344            let mut sorted_exts: Vec<_> = ext_counts.iter().collect();
345            sorted_exts.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count descending
346            for (ext, count) in sorted_exts {
347                writeln!(
348                    w,
349                    "      <extension name=\".{}\" count=\"{}\"/>",
350                    escape_xml(ext),
351                    count
352                )?;
353            }
354            writeln!(w, "    </file_extensions>")?;
355        }
356
357        writeln!(w, "  </metadata>")
358    }
359
360    fn stream_git_history<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
361        if let Some(ref git_history) = repo.metadata.git_history {
362            writeln!(w, "  <git_history>")?;
363            if !git_history.commits.is_empty() {
364                writeln!(w, "    <recent_commits count=\"{}\">", git_history.commits.len())?;
365                for commit in &git_history.commits {
366                    writeln!(
367                        w,
368                        "      <commit hash=\"{}\" author=\"{}\" date=\"{}\">",
369                        escape_xml(&commit.short_hash),
370                        escape_xml(&commit.author),
371                        escape_xml(&commit.date)
372                    )?;
373                    writeln!(w, "        <message><![CDATA[{}]]></message>", commit.message)?;
374                    writeln!(w, "      </commit>")?;
375                }
376                writeln!(w, "    </recent_commits>")?;
377            }
378            if !git_history.changed_files.is_empty() {
379                writeln!(
380                    w,
381                    "    <uncommitted_changes count=\"{}\">",
382                    git_history.changed_files.len()
383                )?;
384                for file in &git_history.changed_files {
385                    if let Some(diff) = &file.diff_content {
386                        writeln!(
387                            w,
388                            "      <change path=\"{}\" status=\"{}\">",
389                            escape_xml(&file.path),
390                            escape_xml(&file.status)
391                        )?;
392                        writeln!(w, "        <diff><![CDATA[{}]]></diff>", diff)?;
393                        writeln!(w, "      </change>")?;
394                    } else {
395                        writeln!(
396                            w,
397                            "      <change path=\"{}\" status=\"{}\"/>",
398                            escape_xml(&file.path),
399                            escape_xml(&file.status)
400                        )?;
401                    }
402                }
403                writeln!(w, "    </uncommitted_changes>")?;
404            }
405            writeln!(w, "  </git_history>")?;
406        }
407        Ok(())
408    }
409
410    fn stream_repomap<W: Write>(&self, w: &mut W, map: &RepoMap) -> io::Result<()> {
411        writeln!(w, "  <repository_map token_budget=\"{}\">", map.token_count)?;
412        writeln!(w, "    <summary><![CDATA[{}]]></summary>", map.summary)?;
413
414        writeln!(w, "    <key_symbols>")?;
415        for symbol in &map.key_symbols {
416            writeln!(
417                w,
418                "      <symbol name=\"{}\" type=\"{}\" file=\"{}\" line=\"{}\" rank=\"{}\">",
419                escape_xml(&symbol.name),
420                escape_xml(&symbol.kind),
421                escape_xml(&symbol.file),
422                symbol.line,
423                symbol.rank
424            )?;
425            if let Some(sig) = &symbol.signature {
426                writeln!(w, "        <signature><![CDATA[{}]]></signature>", sig)?;
427            }
428            if let Some(summary) = &symbol.summary {
429                writeln!(w, "        <summary><![CDATA[{}]]></summary>", summary)?;
430            }
431            writeln!(w, "      </symbol>")?;
432        }
433        writeln!(w, "    </key_symbols>")?;
434
435        if !map.module_graph.nodes.is_empty() {
436            writeln!(w, "    <modules>")?;
437            for module in &map.module_graph.nodes {
438                writeln!(
439                    w,
440                    "      <module name=\"{}\" files=\"{}\" tokens=\"{}\"/>",
441                    escape_xml(&module.name),
442                    module.files,
443                    module.tokens
444                )?;
445            }
446            writeln!(w, "    </modules>")?;
447        }
448        writeln!(w, "  </repository_map>")
449    }
450
451    fn stream_file_index<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
452        writeln!(w, "  <file_index entries=\"{}\">", repo.files.len())?;
453        for file in &repo.files {
454            let importance = if file.importance > 0.8 {
455                "critical"
456            } else if file.importance > 0.6 {
457                "high"
458            } else if file.importance > 0.3 {
459                "normal"
460            } else {
461                "low"
462            };
463            writeln!(
464                w,
465                "    <file path=\"{}\" tokens=\"{}\" importance=\"{}\"/>",
466                escape_xml(&file.relative_path),
467                file.token_count.get(self.token_model),
468                importance
469            )?;
470        }
471        writeln!(w, "  </file_index>")
472    }
473
474    fn stream_files<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
475        writeln!(w, "  <files>")?;
476        for file in &repo.files {
477            if let Some(content) = &file.content {
478                writeln!(
479                    w,
480                    "    <file path=\"{}\" language=\"{}\" tokens=\"{}\">",
481                    escape_xml(&file.relative_path),
482                    file.language.as_deref().unwrap_or("unknown"),
483                    file.token_count.get(self.token_model)
484                )?;
485
486                if self.include_line_numbers {
487                    writeln!(w, "      <content line_numbers=\"original\"><![CDATA[")?;
488                    // Check if content has embedded line numbers (format: "N:content")
489                    // This preserves original line numbers when content has been compressed
490                    let first_line = content.lines().next().unwrap_or("");
491                    let has_embedded_line_nums = first_line.contains(':')
492                        && first_line
493                            .split(':')
494                            .next()
495                            .is_some_and(|s| s.parse::<u32>().is_ok());
496
497                    if has_embedded_line_nums {
498                        // Content has embedded line numbers - parse and output
499                        for line in content.lines() {
500                            if let Some((num_str, rest)) = line.split_once(':') {
501                                if let Ok(line_num) = num_str.parse::<u32>() {
502                                    writeln!(w, "{:4} | {}", line_num, rest)?;
503                                } else {
504                                    // Fallback for malformed lines
505                                    writeln!(w, "     | {}", line)?;
506                                }
507                            } else {
508                                writeln!(w, "     | {}", line)?;
509                            }
510                        }
511                    } else {
512                        // No embedded line numbers - use sequential (uncompressed content)
513                        for (i, line) in content.lines().enumerate() {
514                            writeln!(w, "{:4} | {}", i + 1, line)?;
515                        }
516                    }
517                    writeln!(w, "]]></content>")?;
518                } else if self.use_cdata {
519                    writeln!(w, "      <content><![CDATA[{}]]></content>", content)?;
520                } else {
521                    writeln!(w, "      <content>{}</content>", escape_xml(content))?;
522                }
523                writeln!(w, "    </file>")?;
524            }
525        }
526        writeln!(w, "  </files>")
527    }
528}
529
530impl Formatter for XmlFormatter {
531    fn format(&self, repo: &Repository, map: &RepoMap) -> String {
532        // Use streaming internally for consistency
533        let mut output = Vec::with_capacity(Self::estimate_output_size(repo));
534        // Vec<u8> write cannot fail, ignore result
535        drop(self.format_to_writer(repo, map, &mut output));
536        // Use lossy conversion to handle any edge cases with invalid UTF-8
537        String::from_utf8(output)
538            .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
539    }
540
541    fn format_repo(&self, repo: &Repository) -> String {
542        let mut output = Vec::with_capacity(Self::estimate_output_size(repo));
543        // Vec<u8> write cannot fail, ignore result
544        drop(self.format_repo_to_writer(repo, &mut output));
545        // Use lossy conversion to handle any edge cases with invalid UTF-8
546        String::from_utf8(output)
547            .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
548    }
549
550    fn name(&self) -> &'static str {
551        "xml"
552    }
553}
554
555impl StreamingFormatter for XmlFormatter {
556    fn format_to_writer<W: Write>(
557        &self,
558        repo: &Repository,
559        map: &RepoMap,
560        writer: &mut W,
561    ) -> io::Result<()> {
562        writeln!(writer, r#"<?xml version="1.0" encoding="UTF-8"?>"#)?;
563        writeln!(writer, r#"<repository name="{}" version="1.0.0">"#, escape_xml(&repo.name))?;
564
565        self.stream_llm_instructions(writer, repo)?;
566
567        if self.cache_optimized {
568            writeln!(writer, "  <!-- CACHEABLE_PREFIX_START -->")?;
569        }
570
571        self.stream_overview(writer, repo)?;
572        self.stream_metadata(writer, repo)?;
573        self.stream_git_history(writer, repo)?;
574        self.stream_repomap(writer, map)?;
575
576        if self.show_file_index {
577            self.stream_file_index(writer, repo)?;
578        }
579
580        if self.cache_optimized {
581            writeln!(writer, "  <!-- CACHEABLE_PREFIX_END -->")?;
582            writeln!(writer, "  <!-- DYNAMIC_CONTENT_START -->")?;
583        }
584
585        self.stream_files(writer, repo)?;
586
587        if self.cache_optimized {
588            writeln!(writer, "  <!-- DYNAMIC_CONTENT_END -->")?;
589        }
590
591        writeln!(writer, "</repository>")?;
592        Ok(())
593    }
594
595    fn format_repo_to_writer<W: Write>(&self, repo: &Repository, writer: &mut W) -> io::Result<()> {
596        writeln!(writer, r#"<?xml version="1.0" encoding="UTF-8"?>"#)?;
597        writeln!(writer, r#"<repository name="{}">"#, escape_xml(&repo.name))?;
598
599        self.stream_metadata(writer, repo)?;
600        if self.show_file_index {
601            self.stream_file_index(writer, repo)?;
602        }
603        self.stream_files(writer, repo)?;
604
605        writeln!(writer, "</repository>")?;
606        Ok(())
607    }
608}
609
/// Replaces the five XML special characters (`&`, `<`, `>`, `"`, `'`) with
/// their predefined entities in a single pass; all other characters are
/// copied through unchanged.
fn escape_xml(s: &str) -> String {
    // Reserve roughly 10% headroom for the expanded entities.
    let mut escaped = String::with_capacity(s.len() + s.len() / 10);

    for ch in s.chars() {
        let entity = match ch {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            '"' => "&quot;",
            '\'' => "&apos;",
            other => {
                escaped.push(other);
                continue;
            }
        };
        escaped.push_str(entity);
    }

    escaped
}
628
629#[cfg(test)]
630#[allow(clippy::str_to_string)]
631mod tests {
632    use super::*;
633    use crate::repomap::RepoMapGenerator;
634    use crate::types::{LanguageStats, RepoFile, RepoMetadata, TokenCounts};
635
636    fn create_test_repo() -> Repository {
637        Repository {
638            name: "test".to_string(),
639            path: "/tmp/test".into(),
640            files: vec![RepoFile {
641                path: "/tmp/test/main.py".into(),
642                relative_path: "main.py".to_string(),
643                language: Some("python".to_string()),
644                size_bytes: 100,
645                token_count: TokenCounts {
646                    o200k: 48,
647                    cl100k: 49,
648                    claude: 50,
649                    gemini: 47,
650                    llama: 46,
651                    mistral: 46,
652                    deepseek: 46,
653                    qwen: 46,
654                    cohere: 47,
655                    grok: 46,
656                },
657                symbols: Vec::new(),
658                importance: 0.8,
659                content: Some("def main():\n    print('hello')".to_string()),
660            }],
661            metadata: RepoMetadata {
662                total_files: 1,
663                total_lines: 2,
664                total_tokens: TokenCounts {
665                    o200k: 48,
666                    cl100k: 49,
667                    claude: 50,
668                    gemini: 47,
669                    llama: 46,
670                    mistral: 46,
671                    deepseek: 46,
672                    qwen: 46,
673                    cohere: 47,
674                    grok: 46,
675                },
676                languages: vec![LanguageStats {
677                    language: "Python".to_string(),
678                    files: 1,
679                    lines: 2,
680                    percentage: 100.0,
681                }],
682                framework: None,
683                description: None,
684                branch: None,
685                commit: None,
686                directory_structure: Some("main.py\n".to_string()),
687                external_dependencies: vec!["requests".to_string(), "numpy".to_string()],
688                git_history: None,
689            },
690        }
691    }
692
693    #[test]
694    fn test_xml_output() {
695        let repo = create_test_repo();
696        let map = RepoMapGenerator::new(1000).generate(&repo);
697
698        let formatter = XmlFormatter::new(true);
699        let output = formatter.format(&repo, &map);
700
701        assert!(output.contains("<?xml version=\"1.0\""));
702        assert!(output.contains("<repository name=\"test\""));
703        assert!(output.contains("CACHEABLE_PREFIX_START"));
704        assert!(output.contains("<file path=\"main.py\""));
705    }
706
707    #[test]
708    fn test_xml_escaping() {
709        assert_eq!(escape_xml("<test>"), "&lt;test&gt;");
710        assert_eq!(escape_xml("a & b"), "a &amp; b");
711    }
712}