infiniloom_engine/output/
xml.rs

//! Claude-optimized XML output formatter
//!
//! This formatter is designed to maximize LLM comprehension of codebases by:
//! 1. Providing an executive summary for quick understanding
//! 2. Identifying entry points and key files
//! 3. Showing architecture and dependencies
//! 4. Prioritizing files by importance for code tasks
//!
//! Supports both in-memory (`format()`) and streaming (`format_to_writer()`) modes.
10
11use crate::output::{Formatter, StreamingFormatter};
12use crate::repomap::RepoMap;
13use crate::types::{Repository, TokenizerModel};
14use std::io::{self, Write};
15
16/// XML formatter optimized for Claude
17pub struct XmlFormatter {
18    /// Include line numbers in code
19    include_line_numbers: bool,
20    /// Optimize for prompt caching
21    cache_optimized: bool,
22    /// Include CDATA sections for code
23    use_cdata: bool,
24    /// Include file index/summary section
25    show_file_index: bool,
26    /// Token model for counts in output
27    token_model: TokenizerModel,
28}
29
30impl XmlFormatter {
31    /// Create a new XML formatter
32    pub fn new(cache_optimized: bool) -> Self {
33        Self {
34            include_line_numbers: true,
35            cache_optimized,
36            use_cdata: true,
37            show_file_index: true,
38            token_model: TokenizerModel::Claude,
39        }
40    }
41
42    /// Set line numbers option
43    pub fn with_line_numbers(mut self, enabled: bool) -> Self {
44        self.include_line_numbers = enabled;
45        self
46    }
47
48    /// Set CDATA option
49    pub fn with_cdata(mut self, enabled: bool) -> Self {
50        self.use_cdata = enabled;
51        self
52    }
53
54    /// Set file index/summary option
55    pub fn with_file_index(mut self, enabled: bool) -> Self {
56        self.show_file_index = enabled;
57        self
58    }
59
60    /// Set token model for token counts in output
61    pub fn with_model(mut self, model: TokenizerModel) -> Self {
62        self.token_model = model;
63        self
64    }
65
66    /// Estimate output size for pre-allocation
67    fn estimate_output_size(repo: &Repository) -> usize {
68        let base = 2000;
69        let files = repo.files.len() * 500;
70        let content: usize = repo
71            .files
72            .iter()
73            .filter_map(|f| f.content.as_ref())
74            .map(|c| c.len())
75            .sum();
76        base + files + content
77    }
78
79    fn detect_project_type(&self, repo: &Repository) -> String {
80        let has_cargo = repo.files.iter().any(|f| f.relative_path == "Cargo.toml");
81        let has_package_json = repo.files.iter().any(|f| f.relative_path == "package.json");
82        let has_pyproject = repo
83            .files
84            .iter()
85            .any(|f| f.relative_path == "pyproject.toml" || f.relative_path == "setup.py");
86        let has_go_mod = repo.files.iter().any(|f| f.relative_path == "go.mod");
87
88        let has_routes = repo
89            .files
90            .iter()
91            .any(|f| f.relative_path.contains("routes") || f.relative_path.contains("api/"));
92        let has_components = repo
93            .files
94            .iter()
95            .any(|f| f.relative_path.contains("components/") || f.relative_path.contains("views/"));
96
97        if has_cargo {
98            if repo
99                .files
100                .iter()
101                .any(|f| f.relative_path.ends_with("lib.rs"))
102            {
103                "Rust Library"
104            } else {
105                "Rust Application"
106            }
107        } else if has_package_json {
108            if has_components {
109                "Frontend Application (JavaScript/TypeScript)"
110            } else if has_routes {
111                "Backend API (Node.js)"
112            } else {
113                "JavaScript/TypeScript Project"
114            }
115        } else if has_pyproject {
116            if has_routes {
117                "Python Web API"
118            } else {
119                "Python Package"
120            }
121        } else if has_go_mod {
122            "Go Application"
123        } else {
124            "Software Project"
125        }
126        .to_owned()
127    }
128
129    fn is_entry_point(&self, path: &str) -> bool {
130        let entry_patterns = [
131            "main.rs",
132            "main.go",
133            "main.py",
134            "main.ts",
135            "main.js",
136            "main.c",
137            "main.cpp",
138            "index.ts",
139            "index.js",
140            "index.tsx",
141            "index.jsx",
142            "index.py",
143            "app.py",
144            "app.ts",
145            "app.js",
146            "app.tsx",
147            "app.jsx",
148            "app.go",
149            "server.py",
150            "server.ts",
151            "server.js",
152            "server.go",
153            "mod.rs",
154            "lib.rs",
155            "__main__.py",
156            "__init__.py",
157            "cmd/main.go",
158        ];
159        entry_patterns
160            .iter()
161            .any(|p| path.ends_with(p) || path.contains(&format!("/{}", p)))
162    }
163
164    fn get_entry_type(&self, path: &str) -> &'static str {
165        if path.contains("main") {
166            "main"
167        } else if path.contains("index") {
168            "index"
169        } else if path.contains("app") {
170            "app"
171        } else if path.contains("server") {
172            "server"
173        } else if path.contains("lib") {
174            "library"
175        } else if path.contains("mod.rs") {
176            "module"
177        } else {
178            "entry"
179        }
180    }
181
182    fn is_config_file(&self, path: &str) -> bool {
183        let config_files = [
184            "Cargo.toml",
185            "package.json",
186            "pyproject.toml",
187            "go.mod",
188            "pom.xml",
189            "build.gradle",
190            "Gemfile",
191            "requirements.txt",
192            "setup.py",
193            "setup.cfg",
194            "tsconfig.json",
195            "webpack.config",
196            "vite.config",
197            "next.config",
198            "Makefile",
199            "CMakeLists.txt",
200            "Dockerfile",
201            "docker-compose",
202            ".env.example",
203            "config.yaml",
204            "config.yml",
205            "config.json",
206        ];
207        let filename = path.rsplit('/').next().unwrap_or(path);
208        config_files.iter().any(|c| filename.contains(c)) && path.matches('/').count() <= 1
209    }
210
211    // =========================================================================
212    // Streaming methods (write to impl std::io::Write)
213    // =========================================================================
214
215    fn stream_llm_instructions<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
216        writeln!(w, "  <llm_context_guide>")?;
217        writeln!(w, "    <purpose>This is a comprehensive code context for the {} repository, optimized for AI-assisted code understanding and generation.</purpose>", escape_xml(&repo.name))?;
218        writeln!(w, "    <how_to_use>")?;
219        writeln!(w, "      <tip>Start with the &lt;overview&gt; section to understand the project's purpose and structure</tip>")?;
220        writeln!(w, "      <tip>Check &lt;entry_points&gt; to find main application files</tip>")?;
221        writeln!(
222            w,
223            "      <tip>Use &lt;repository_map&gt; to understand relationships between modules</tip>"
224        )?;
225        writeln!(
226            w,
227            "      <tip>Files are ordered by importance - most critical files come first</tip>"
228        )?;
229        writeln!(w, "    </how_to_use>")?;
230        writeln!(w, "  </llm_context_guide>")
231    }
232
233    fn stream_overview<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
234        writeln!(w, "  <overview>")?;
235        let project_type = self.detect_project_type(repo);
236        writeln!(w, "    <project_type>{}</project_type>", escape_xml(&project_type))?;
237
238        if let Some(lang) = repo.metadata.languages.iter().max_by_key(|l| l.files) {
239            writeln!(w, "    <primary_language>{}</primary_language>", escape_xml(&lang.language))?;
240        }
241        if let Some(framework) = &repo.metadata.framework {
242            writeln!(w, "    <framework>{}</framework>", escape_xml(framework))?;
243        }
244
245        writeln!(w, "    <entry_points>")?;
246        let mut entry_count = 0;
247        for file in &repo.files {
248            if self.is_entry_point(&file.relative_path) {
249                if file.relative_path.ends_with("__init__.py")
250                    && file.token_count.get(self.token_model) < 50
251                {
252                    continue;
253                }
254                let entry_type = self.get_entry_type(&file.relative_path);
255                writeln!(
256                    w,
257                    "      <entry path=\"{}\" type=\"{}\" tokens=\"{}\"/>",
258                    escape_xml(&file.relative_path),
259                    entry_type,
260                    file.token_count.get(self.token_model)
261                )?;
262                entry_count += 1;
263                if entry_count >= 10 {
264                    break;
265                }
266            }
267        }
268        writeln!(w, "    </entry_points>")?;
269
270        writeln!(w, "    <config_files>")?;
271        for file in &repo.files {
272            if self.is_config_file(&file.relative_path) {
273                writeln!(
274                    w,
275                    "      <config path=\"{}\" tokens=\"{}\"/>",
276                    escape_xml(&file.relative_path),
277                    file.token_count.get(self.token_model)
278                )?;
279            }
280        }
281        writeln!(w, "    </config_files>")?;
282        writeln!(w, "  </overview>")
283    }
284
285    fn stream_metadata<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
286        writeln!(w, "  <metadata>")?;
287        if let Some(desc) = &repo.metadata.description {
288            writeln!(w, "    <description>{}</description>", escape_xml(desc))?;
289        }
290        writeln!(w, "    <stats>")?;
291        writeln!(w, "      <files>{}</files>", repo.metadata.total_files)?;
292        writeln!(w, "      <lines>{}</lines>", repo.metadata.total_lines)?;
293        writeln!(
294            w,
295            "      <tokens model=\"claude\">{}</tokens>",
296            repo.metadata.total_tokens.get(self.token_model)
297        )?;
298        writeln!(w, "    </stats>")?;
299
300        if !repo.metadata.languages.is_empty() {
301            writeln!(w, "    <languages>")?;
302            for lang in &repo.metadata.languages {
303                writeln!(
304                    w,
305                    "      <language name=\"{}\" files=\"{}\" percentage=\"{:.1}\"/>",
306                    escape_xml(&lang.language),
307                    lang.files,
308                    lang.percentage
309                )?;
310            }
311            writeln!(w, "    </languages>")?;
312        }
313
314        if let Some(ref structure) = repo.metadata.directory_structure {
315            writeln!(w, "    <directory_structure><![CDATA[")?;
316            write!(w, "{}", structure)?;
317            writeln!(w, "]]></directory_structure>")?;
318        }
319
320        if !repo.metadata.external_dependencies.is_empty() {
321            writeln!(
322                w,
323                "    <dependencies count=\"{}\">",
324                repo.metadata.external_dependencies.len()
325            )?;
326            for dep in &repo.metadata.external_dependencies {
327                writeln!(w, "      <dependency name=\"{}\"/>", escape_xml(dep))?;
328            }
329            writeln!(w, "    </dependencies>")?;
330        }
331
332        // Add explicit file extension counts for accurate file counting queries
333        let mut ext_counts: std::collections::HashMap<String, usize> =
334            std::collections::HashMap::new();
335        for file in &repo.files {
336            if let Some(ext) = std::path::Path::new(&file.relative_path).extension() {
337                *ext_counts
338                    .entry(ext.to_string_lossy().to_string())
339                    .or_insert(0) += 1;
340            }
341        }
342        if !ext_counts.is_empty() {
343            writeln!(w, "    <file_extensions>")?;
344            let mut sorted_exts: Vec<_> = ext_counts.iter().collect();
345            sorted_exts.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count descending
346            for (ext, count) in sorted_exts {
347                writeln!(
348                    w,
349                    "      <extension name=\".{}\" count=\"{}\"/>",
350                    escape_xml(ext),
351                    count
352                )?;
353            }
354            writeln!(w, "    </file_extensions>")?;
355        }
356
357        writeln!(w, "  </metadata>")
358    }
359
360    fn stream_git_history<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
361        if let Some(ref git_history) = repo.metadata.git_history {
362            writeln!(w, "  <git_history>")?;
363            if !git_history.commits.is_empty() {
364                writeln!(w, "    <recent_commits count=\"{}\">", git_history.commits.len())?;
365                for commit in &git_history.commits {
366                    writeln!(
367                        w,
368                        "      <commit hash=\"{}\" author=\"{}\" date=\"{}\">",
369                        escape_xml(&commit.short_hash),
370                        escape_xml(&commit.author),
371                        escape_xml(&commit.date)
372                    )?;
373                    writeln!(w, "        <message><![CDATA[{}]]></message>", commit.message)?;
374                    writeln!(w, "      </commit>")?;
375                }
376                writeln!(w, "    </recent_commits>")?;
377            }
378            if !git_history.changed_files.is_empty() {
379                writeln!(
380                    w,
381                    "    <uncommitted_changes count=\"{}\">",
382                    git_history.changed_files.len()
383                )?;
384                for file in &git_history.changed_files {
385                    if let Some(diff) = &file.diff_content {
386                        writeln!(
387                            w,
388                            "      <change path=\"{}\" status=\"{}\">",
389                            escape_xml(&file.path),
390                            escape_xml(&file.status)
391                        )?;
392                        writeln!(w, "        <diff><![CDATA[{}]]></diff>", diff)?;
393                        writeln!(w, "      </change>")?;
394                    } else {
395                        writeln!(
396                            w,
397                            "      <change path=\"{}\" status=\"{}\"/>",
398                            escape_xml(&file.path),
399                            escape_xml(&file.status)
400                        )?;
401                    }
402                }
403                writeln!(w, "    </uncommitted_changes>")?;
404            }
405            writeln!(w, "  </git_history>")?;
406        }
407        Ok(())
408    }
409
410    fn stream_repomap<W: Write>(&self, w: &mut W, map: &RepoMap) -> io::Result<()> {
411        writeln!(w, "  <repository_map token_budget=\"{}\">", map.token_count)?;
412        writeln!(w, "    <summary><![CDATA[{}]]></summary>", map.summary)?;
413
414        writeln!(w, "    <key_symbols>")?;
415        for symbol in &map.key_symbols {
416            writeln!(
417                w,
418                "      <symbol name=\"{}\" type=\"{}\" file=\"{}\" line=\"{}\" rank=\"{}\">",
419                escape_xml(&symbol.name),
420                escape_xml(&symbol.kind),
421                escape_xml(&symbol.file),
422                symbol.line,
423                symbol.rank
424            )?;
425            if let Some(sig) = &symbol.signature {
426                writeln!(w, "        <signature><![CDATA[{}]]></signature>", sig)?;
427            }
428            if let Some(summary) = &symbol.summary {
429                writeln!(w, "        <summary><![CDATA[{}]]></summary>", summary)?;
430            }
431            writeln!(w, "      </symbol>")?;
432        }
433        writeln!(w, "    </key_symbols>")?;
434
435        if !map.module_graph.nodes.is_empty() {
436            writeln!(w, "    <modules>")?;
437            for module in &map.module_graph.nodes {
438                writeln!(
439                    w,
440                    "      <module name=\"{}\" files=\"{}\" tokens=\"{}\"/>",
441                    escape_xml(&module.name),
442                    module.files,
443                    module.tokens
444                )?;
445            }
446            writeln!(w, "    </modules>")?;
447        }
448        writeln!(w, "  </repository_map>")
449    }
450
451    fn stream_file_index<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
452        writeln!(w, "  <file_index entries=\"{}\">", repo.files.len())?;
453        for file in &repo.files {
454            let importance = if file.importance > 0.8 {
455                "critical"
456            } else if file.importance > 0.6 {
457                "high"
458            } else if file.importance > 0.3 {
459                "normal"
460            } else {
461                "low"
462            };
463            writeln!(
464                w,
465                "    <file path=\"{}\" tokens=\"{}\" importance=\"{}\"/>",
466                escape_xml(&file.relative_path),
467                file.token_count.get(self.token_model),
468                importance
469            )?;
470        }
471        writeln!(w, "  </file_index>")
472    }
473
474    fn stream_files<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
475        writeln!(w, "  <files>")?;
476        for file in &repo.files {
477            if let Some(content) = &file.content {
478                writeln!(
479                    w,
480                    "    <file path=\"{}\" language=\"{}\" tokens=\"{}\">",
481                    escape_xml(&file.relative_path),
482                    file.language.as_deref().unwrap_or("unknown"),
483                    file.token_count.get(self.token_model)
484                )?;
485
486                if self.include_line_numbers {
487                    writeln!(w, "      <content line_numbers=\"original\"><![CDATA[")?;
488                    // Check if content has embedded line numbers (format: "N:content")
489                    // This preserves original line numbers when content has been compressed
490                    let first_line = content.lines().next().unwrap_or("");
491                    let has_embedded_line_nums = first_line.contains(':')
492                        && first_line
493                            .split(':')
494                            .next()
495                            .map(|s| s.parse::<u32>().is_ok())
496                            .unwrap_or(false);
497
498                    if has_embedded_line_nums {
499                        // Content has embedded line numbers - parse and output
500                        for line in content.lines() {
501                            if let Some((num_str, rest)) = line.split_once(':') {
502                                if let Ok(line_num) = num_str.parse::<u32>() {
503                                    writeln!(w, "{:4} | {}", line_num, rest)?;
504                                } else {
505                                    // Fallback for malformed lines
506                                    writeln!(w, "     | {}", line)?;
507                                }
508                            } else {
509                                writeln!(w, "     | {}", line)?;
510                            }
511                        }
512                    } else {
513                        // No embedded line numbers - use sequential (uncompressed content)
514                        for (i, line) in content.lines().enumerate() {
515                            writeln!(w, "{:4} | {}", i + 1, line)?;
516                        }
517                    }
518                    writeln!(w, "]]></content>")?;
519                } else if self.use_cdata {
520                    writeln!(w, "      <content><![CDATA[{}]]></content>", content)?;
521                } else {
522                    writeln!(w, "      <content>{}</content>", escape_xml(content))?;
523                }
524                writeln!(w, "    </file>")?;
525            }
526        }
527        writeln!(w, "  </files>")
528    }
529}
530
531impl Formatter for XmlFormatter {
532    fn format(&self, repo: &Repository, map: &RepoMap) -> String {
533        // Use streaming internally for consistency
534        let mut output = Vec::with_capacity(Self::estimate_output_size(repo));
535        // Vec<u8> write cannot fail, ignore result
536        drop(self.format_to_writer(repo, map, &mut output));
537        // Use lossy conversion to handle any edge cases with invalid UTF-8
538        String::from_utf8(output)
539            .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
540    }
541
542    fn format_repo(&self, repo: &Repository) -> String {
543        let mut output = Vec::with_capacity(Self::estimate_output_size(repo));
544        // Vec<u8> write cannot fail, ignore result
545        drop(self.format_repo_to_writer(repo, &mut output));
546        // Use lossy conversion to handle any edge cases with invalid UTF-8
547        String::from_utf8(output)
548            .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
549    }
550
551    fn name(&self) -> &'static str {
552        "xml"
553    }
554}
555
556impl StreamingFormatter for XmlFormatter {
557    fn format_to_writer<W: Write>(
558        &self,
559        repo: &Repository,
560        map: &RepoMap,
561        writer: &mut W,
562    ) -> io::Result<()> {
563        writeln!(writer, r#"<?xml version="1.0" encoding="UTF-8"?>"#)?;
564        writeln!(writer, r#"<repository name="{}" version="1.0.0">"#, escape_xml(&repo.name))?;
565
566        self.stream_llm_instructions(writer, repo)?;
567
568        if self.cache_optimized {
569            writeln!(writer, "  <!-- CACHEABLE_PREFIX_START -->")?;
570        }
571
572        self.stream_overview(writer, repo)?;
573        self.stream_metadata(writer, repo)?;
574        self.stream_git_history(writer, repo)?;
575        self.stream_repomap(writer, map)?;
576
577        if self.show_file_index {
578            self.stream_file_index(writer, repo)?;
579        }
580
581        if self.cache_optimized {
582            writeln!(writer, "  <!-- CACHEABLE_PREFIX_END -->")?;
583            writeln!(writer, "  <!-- DYNAMIC_CONTENT_START -->")?;
584        }
585
586        self.stream_files(writer, repo)?;
587
588        if self.cache_optimized {
589            writeln!(writer, "  <!-- DYNAMIC_CONTENT_END -->")?;
590        }
591
592        writeln!(writer, "</repository>")?;
593        Ok(())
594    }
595
596    fn format_repo_to_writer<W: Write>(&self, repo: &Repository, writer: &mut W) -> io::Result<()> {
597        writeln!(writer, r#"<?xml version="1.0" encoding="UTF-8"?>"#)?;
598        writeln!(writer, r#"<repository name="{}">"#, escape_xml(&repo.name))?;
599
600        self.stream_metadata(writer, repo)?;
601        if self.show_file_index {
602            self.stream_file_index(writer, repo)?;
603        }
604        self.stream_files(writer, repo)?;
605
606        writeln!(writer, "</repository>")?;
607        Ok(())
608    }
609}
610
/// Replace the five XML special characters (`&`, `<`, `>`, `"`, `'`) with
/// their predefined entities, copying every other character unchanged.
///
/// The input is scanned once and the output buffer is pre-sized slightly
/// larger than the input to leave room for a few entities.
fn escape_xml(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len() + s.len() / 10);

    for ch in s.chars() {
        let entity = match ch {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            '"' => "&quot;",
            '\'' => "&apos;",
            other => {
                escaped.push(other);
                continue;
            }
        };
        escaped.push_str(entity);
    }

    escaped
}
629
630#[cfg(test)]
631#[allow(clippy::str_to_string)]
632mod tests {
633    use super::*;
634    use crate::repomap::RepoMapGenerator;
635    use crate::types::{LanguageStats, RepoFile, RepoMetadata, TokenCounts};
636
637    fn create_test_repo() -> Repository {
638        Repository {
639            name: "test".to_string(),
640            path: "/tmp/test".into(),
641            files: vec![RepoFile {
642                path: "/tmp/test/main.py".into(),
643                relative_path: "main.py".to_string(),
644                language: Some("python".to_string()),
645                size_bytes: 100,
646                token_count: TokenCounts {
647                    o200k: 48,
648                    cl100k: 49,
649                    claude: 50,
650                    gemini: 47,
651                    llama: 46,
652                    mistral: 46,
653                    deepseek: 46,
654                    qwen: 46,
655                    cohere: 47,
656                    grok: 46,
657                },
658                symbols: Vec::new(),
659                importance: 0.8,
660                content: Some("def main():\n    print('hello')".to_string()),
661            }],
662            metadata: RepoMetadata {
663                total_files: 1,
664                total_lines: 2,
665                total_tokens: TokenCounts {
666                    o200k: 48,
667                    cl100k: 49,
668                    claude: 50,
669                    gemini: 47,
670                    llama: 46,
671                    mistral: 46,
672                    deepseek: 46,
673                    qwen: 46,
674                    cohere: 47,
675                    grok: 46,
676                },
677                languages: vec![LanguageStats {
678                    language: "Python".to_string(),
679                    files: 1,
680                    lines: 2,
681                    percentage: 100.0,
682                }],
683                framework: None,
684                description: None,
685                branch: None,
686                commit: None,
687                directory_structure: Some("main.py\n".to_string()),
688                external_dependencies: vec!["requests".to_string(), "numpy".to_string()],
689                git_history: None,
690            },
691        }
692    }
693
694    #[test]
695    fn test_xml_output() {
696        let repo = create_test_repo();
697        let map = RepoMapGenerator::new(1000).generate(&repo);
698
699        let formatter = XmlFormatter::new(true);
700        let output = formatter.format(&repo, &map);
701
702        assert!(output.contains("<?xml version=\"1.0\""));
703        assert!(output.contains("<repository name=\"test\""));
704        assert!(output.contains("CACHEABLE_PREFIX_START"));
705        assert!(output.contains("<file path=\"main.py\""));
706    }
707
708    #[test]
709    fn test_xml_escaping() {
710        assert_eq!(escape_xml("<test>"), "&lt;test&gt;");
711        assert_eq!(escape_xml("a & b"), "a &amp; b");
712    }
713}