1use crate::output::{Formatter, StreamingFormatter};
12use crate::repomap::RepoMap;
13use crate::types::{Repository, TokenizerModel};
14use std::io::{self, Write};
15
16pub struct XmlFormatter {
18 include_line_numbers: bool,
20 cache_optimized: bool,
22 use_cdata: bool,
24 show_file_index: bool,
26 token_model: TokenizerModel,
28}
29
30impl XmlFormatter {
31 pub fn new(cache_optimized: bool) -> Self {
33 Self {
34 include_line_numbers: true,
35 cache_optimized,
36 use_cdata: true,
37 show_file_index: true,
38 token_model: TokenizerModel::Claude,
39 }
40 }
41
42 pub fn with_line_numbers(mut self, enabled: bool) -> Self {
44 self.include_line_numbers = enabled;
45 self
46 }
47
48 pub fn with_cdata(mut self, enabled: bool) -> Self {
50 self.use_cdata = enabled;
51 self
52 }
53
54 pub fn with_file_index(mut self, enabled: bool) -> Self {
56 self.show_file_index = enabled;
57 self
58 }
59
60 pub fn with_model(mut self, model: TokenizerModel) -> Self {
62 self.token_model = model;
63 self
64 }
65
66 fn estimate_output_size(repo: &Repository) -> usize {
68 let base = 2000;
69 let files = repo.files.len() * 500;
70 let content: usize = repo
71 .files
72 .iter()
73 .filter_map(|f| f.content.as_ref())
74 .map(|c| c.len())
75 .sum();
76 base + files + content
77 }
78
79 fn detect_project_type(&self, repo: &Repository) -> String {
80 let has_cargo = repo.files.iter().any(|f| f.relative_path == "Cargo.toml");
81 let has_package_json = repo.files.iter().any(|f| f.relative_path == "package.json");
82 let has_pyproject = repo
83 .files
84 .iter()
85 .any(|f| f.relative_path == "pyproject.toml" || f.relative_path == "setup.py");
86 let has_go_mod = repo.files.iter().any(|f| f.relative_path == "go.mod");
87
88 let has_routes = repo
89 .files
90 .iter()
91 .any(|f| f.relative_path.contains("routes") || f.relative_path.contains("api/"));
92 let has_components = repo
93 .files
94 .iter()
95 .any(|f| f.relative_path.contains("components/") || f.relative_path.contains("views/"));
96
97 if has_cargo {
98 if repo
99 .files
100 .iter()
101 .any(|f| f.relative_path.ends_with("lib.rs"))
102 {
103 "Rust Library"
104 } else {
105 "Rust Application"
106 }
107 } else if has_package_json {
108 if has_components {
109 "Frontend Application (JavaScript/TypeScript)"
110 } else if has_routes {
111 "Backend API (Node.js)"
112 } else {
113 "JavaScript/TypeScript Project"
114 }
115 } else if has_pyproject {
116 if has_routes {
117 "Python Web API"
118 } else {
119 "Python Package"
120 }
121 } else if has_go_mod {
122 "Go Application"
123 } else {
124 "Software Project"
125 }
126 .to_owned()
127 }
128
129 fn is_entry_point(&self, path: &str) -> bool {
130 let entry_patterns = [
131 "main.rs",
132 "main.go",
133 "main.py",
134 "main.ts",
135 "main.js",
136 "main.c",
137 "main.cpp",
138 "index.ts",
139 "index.js",
140 "index.tsx",
141 "index.jsx",
142 "index.py",
143 "app.py",
144 "app.ts",
145 "app.js",
146 "app.tsx",
147 "app.jsx",
148 "app.go",
149 "server.py",
150 "server.ts",
151 "server.js",
152 "server.go",
153 "mod.rs",
154 "lib.rs",
155 "__main__.py",
156 "__init__.py",
157 "cmd/main.go",
158 ];
159 entry_patterns
160 .iter()
161 .any(|p| path.ends_with(p) || path.contains(&format!("/{}", p)))
162 }
163
164 fn get_entry_type(&self, path: &str) -> &'static str {
165 if path.contains("main") {
166 "main"
167 } else if path.contains("index") {
168 "index"
169 } else if path.contains("app") {
170 "app"
171 } else if path.contains("server") {
172 "server"
173 } else if path.contains("lib") {
174 "library"
175 } else if path.contains("mod.rs") {
176 "module"
177 } else {
178 "entry"
179 }
180 }
181
182 fn is_config_file(&self, path: &str) -> bool {
183 let config_files = [
184 "Cargo.toml",
185 "package.json",
186 "pyproject.toml",
187 "go.mod",
188 "pom.xml",
189 "build.gradle",
190 "Gemfile",
191 "requirements.txt",
192 "setup.py",
193 "setup.cfg",
194 "tsconfig.json",
195 "webpack.config",
196 "vite.config",
197 "next.config",
198 "Makefile",
199 "CMakeLists.txt",
200 "Dockerfile",
201 "docker-compose",
202 ".env.example",
203 "config.yaml",
204 "config.yml",
205 "config.json",
206 ];
207 let filename = path.rsplit('/').next().unwrap_or(path);
208 config_files.iter().any(|c| filename.contains(c)) && path.matches('/').count() <= 1
209 }
210
211 fn stream_llm_instructions<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
216 writeln!(w, " <llm_context_guide>")?;
217 writeln!(w, " <purpose>This is a comprehensive code context for the {} repository, optimized for AI-assisted code understanding and generation.</purpose>", escape_xml(&repo.name))?;
218 writeln!(w, " <how_to_use>")?;
219 writeln!(w, " <tip>Start with the <overview> section to understand the project's purpose and structure</tip>")?;
220 writeln!(w, " <tip>Check <entry_points> to find main application files</tip>")?;
221 writeln!(
222 w,
223 " <tip>Use <repository_map> to understand relationships between modules</tip>"
224 )?;
225 writeln!(
226 w,
227 " <tip>Files are ordered by importance - most critical files come first</tip>"
228 )?;
229 writeln!(w, " </how_to_use>")?;
230 writeln!(w, " </llm_context_guide>")
231 }
232
233 fn stream_overview<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
234 writeln!(w, " <overview>")?;
235 let project_type = self.detect_project_type(repo);
236 writeln!(w, " <project_type>{}</project_type>", escape_xml(&project_type))?;
237
238 if let Some(lang) = repo.metadata.languages.iter().max_by_key(|l| l.files) {
239 writeln!(w, " <primary_language>{}</primary_language>", escape_xml(&lang.language))?;
240 }
241 if let Some(framework) = &repo.metadata.framework {
242 writeln!(w, " <framework>{}</framework>", escape_xml(framework))?;
243 }
244
245 writeln!(w, " <entry_points>")?;
246 let mut entry_count = 0;
247 for file in &repo.files {
248 if self.is_entry_point(&file.relative_path) {
249 if file.relative_path.ends_with("__init__.py")
250 && file.token_count.get(self.token_model) < 50
251 {
252 continue;
253 }
254 let entry_type = self.get_entry_type(&file.relative_path);
255 writeln!(
256 w,
257 " <entry path=\"{}\" type=\"{}\" tokens=\"{}\"/>",
258 escape_xml(&file.relative_path),
259 entry_type,
260 file.token_count.get(self.token_model)
261 )?;
262 entry_count += 1;
263 if entry_count >= 10 {
264 break;
265 }
266 }
267 }
268 writeln!(w, " </entry_points>")?;
269
270 writeln!(w, " <config_files>")?;
271 for file in &repo.files {
272 if self.is_config_file(&file.relative_path) {
273 writeln!(
274 w,
275 " <config path=\"{}\" tokens=\"{}\"/>",
276 escape_xml(&file.relative_path),
277 file.token_count.get(self.token_model)
278 )?;
279 }
280 }
281 writeln!(w, " </config_files>")?;
282 writeln!(w, " </overview>")
283 }
284
285 fn stream_metadata<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
286 writeln!(w, " <metadata>")?;
287 if let Some(desc) = &repo.metadata.description {
288 writeln!(w, " <description>{}</description>", escape_xml(desc))?;
289 }
290 writeln!(w, " <stats>")?;
291 writeln!(w, " <files>{}</files>", repo.metadata.total_files)?;
292 writeln!(w, " <lines>{}</lines>", repo.metadata.total_lines)?;
293 writeln!(
294 w,
295 " <tokens model=\"claude\">{}</tokens>",
296 repo.metadata.total_tokens.get(self.token_model)
297 )?;
298 writeln!(w, " </stats>")?;
299
300 if !repo.metadata.languages.is_empty() {
301 writeln!(w, " <languages>")?;
302 for lang in &repo.metadata.languages {
303 writeln!(
304 w,
305 " <language name=\"{}\" files=\"{}\" percentage=\"{:.1}\"/>",
306 escape_xml(&lang.language),
307 lang.files,
308 lang.percentage
309 )?;
310 }
311 writeln!(w, " </languages>")?;
312 }
313
314 if let Some(ref structure) = repo.metadata.directory_structure {
315 writeln!(w, " <directory_structure><![CDATA[")?;
316 write!(w, "{}", structure)?;
317 writeln!(w, "]]></directory_structure>")?;
318 }
319
320 if !repo.metadata.external_dependencies.is_empty() {
321 writeln!(
322 w,
323 " <dependencies count=\"{}\">",
324 repo.metadata.external_dependencies.len()
325 )?;
326 for dep in &repo.metadata.external_dependencies {
327 writeln!(w, " <dependency name=\"{}\"/>", escape_xml(dep))?;
328 }
329 writeln!(w, " </dependencies>")?;
330 }
331
332 let mut ext_counts: std::collections::HashMap<String, usize> =
334 std::collections::HashMap::new();
335 for file in &repo.files {
336 if let Some(ext) = std::path::Path::new(&file.relative_path).extension() {
337 *ext_counts
338 .entry(ext.to_string_lossy().to_string())
339 .or_insert(0) += 1;
340 }
341 }
342 if !ext_counts.is_empty() {
343 writeln!(w, " <file_extensions>")?;
344 let mut sorted_exts: Vec<_> = ext_counts.iter().collect();
345 sorted_exts.sort_by(|a, b| b.1.cmp(a.1)); for (ext, count) in sorted_exts {
347 writeln!(
348 w,
349 " <extension name=\".{}\" count=\"{}\"/>",
350 escape_xml(ext),
351 count
352 )?;
353 }
354 writeln!(w, " </file_extensions>")?;
355 }
356
357 writeln!(w, " </metadata>")
358 }
359
360 fn stream_git_history<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
361 if let Some(ref git_history) = repo.metadata.git_history {
362 writeln!(w, " <git_history>")?;
363 if !git_history.commits.is_empty() {
364 writeln!(w, " <recent_commits count=\"{}\">", git_history.commits.len())?;
365 for commit in &git_history.commits {
366 writeln!(
367 w,
368 " <commit hash=\"{}\" author=\"{}\" date=\"{}\">",
369 escape_xml(&commit.short_hash),
370 escape_xml(&commit.author),
371 escape_xml(&commit.date)
372 )?;
373 writeln!(w, " <message><![CDATA[{}]]></message>", commit.message)?;
374 writeln!(w, " </commit>")?;
375 }
376 writeln!(w, " </recent_commits>")?;
377 }
378 if !git_history.changed_files.is_empty() {
379 writeln!(
380 w,
381 " <uncommitted_changes count=\"{}\">",
382 git_history.changed_files.len()
383 )?;
384 for file in &git_history.changed_files {
385 if let Some(diff) = &file.diff_content {
386 writeln!(
387 w,
388 " <change path=\"{}\" status=\"{}\">",
389 escape_xml(&file.path),
390 escape_xml(&file.status)
391 )?;
392 writeln!(w, " <diff><![CDATA[{}]]></diff>", diff)?;
393 writeln!(w, " </change>")?;
394 } else {
395 writeln!(
396 w,
397 " <change path=\"{}\" status=\"{}\"/>",
398 escape_xml(&file.path),
399 escape_xml(&file.status)
400 )?;
401 }
402 }
403 writeln!(w, " </uncommitted_changes>")?;
404 }
405 writeln!(w, " </git_history>")?;
406 }
407 Ok(())
408 }
409
410 fn stream_repomap<W: Write>(&self, w: &mut W, map: &RepoMap) -> io::Result<()> {
411 writeln!(w, " <repository_map token_budget=\"{}\">", map.token_count)?;
412 writeln!(w, " <summary><![CDATA[{}]]></summary>", map.summary)?;
413
414 writeln!(w, " <key_symbols>")?;
415 for symbol in &map.key_symbols {
416 writeln!(
417 w,
418 " <symbol name=\"{}\" type=\"{}\" file=\"{}\" line=\"{}\" rank=\"{}\">",
419 escape_xml(&symbol.name),
420 escape_xml(&symbol.kind),
421 escape_xml(&symbol.file),
422 symbol.line,
423 symbol.rank
424 )?;
425 if let Some(sig) = &symbol.signature {
426 writeln!(w, " <signature><![CDATA[{}]]></signature>", sig)?;
427 }
428 if let Some(summary) = &symbol.summary {
429 writeln!(w, " <summary><![CDATA[{}]]></summary>", summary)?;
430 }
431 writeln!(w, " </symbol>")?;
432 }
433 writeln!(w, " </key_symbols>")?;
434
435 if !map.module_graph.nodes.is_empty() {
436 writeln!(w, " <modules>")?;
437 for module in &map.module_graph.nodes {
438 writeln!(
439 w,
440 " <module name=\"{}\" files=\"{}\" tokens=\"{}\"/>",
441 escape_xml(&module.name),
442 module.files,
443 module.tokens
444 )?;
445 }
446 writeln!(w, " </modules>")?;
447 }
448 writeln!(w, " </repository_map>")
449 }
450
451 fn stream_file_index<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
452 writeln!(w, " <file_index entries=\"{}\">", repo.files.len())?;
453 for file in &repo.files {
454 let importance = if file.importance > 0.8 {
455 "critical"
456 } else if file.importance > 0.6 {
457 "high"
458 } else if file.importance > 0.3 {
459 "normal"
460 } else {
461 "low"
462 };
463 writeln!(
464 w,
465 " <file path=\"{}\" tokens=\"{}\" importance=\"{}\"/>",
466 escape_xml(&file.relative_path),
467 file.token_count.get(self.token_model),
468 importance
469 )?;
470 }
471 writeln!(w, " </file_index>")
472 }
473
474 fn stream_files<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
475 writeln!(w, " <files>")?;
476 for file in &repo.files {
477 if let Some(content) = &file.content {
478 writeln!(
479 w,
480 " <file path=\"{}\" language=\"{}\" tokens=\"{}\">",
481 escape_xml(&file.relative_path),
482 file.language.as_deref().unwrap_or("unknown"),
483 file.token_count.get(self.token_model)
484 )?;
485
486 if self.include_line_numbers {
487 writeln!(w, " <content line_numbers=\"original\"><![CDATA[")?;
488 let first_line = content.lines().next().unwrap_or("");
491 let has_embedded_line_nums = first_line.contains(':')
492 && first_line
493 .split(':')
494 .next()
495 .is_some_and(|s| s.parse::<u32>().is_ok());
496
497 if has_embedded_line_nums {
498 for line in content.lines() {
500 if let Some((num_str, rest)) = line.split_once(':') {
501 if let Ok(line_num) = num_str.parse::<u32>() {
502 writeln!(w, "{:4} | {}", line_num, rest)?;
503 } else {
504 writeln!(w, " | {}", line)?;
506 }
507 } else {
508 writeln!(w, " | {}", line)?;
509 }
510 }
511 } else {
512 for (i, line) in content.lines().enumerate() {
514 writeln!(w, "{:4} | {}", i + 1, line)?;
515 }
516 }
517 writeln!(w, "]]></content>")?;
518 } else if self.use_cdata {
519 writeln!(w, " <content><![CDATA[{}]]></content>", content)?;
520 } else {
521 writeln!(w, " <content>{}</content>", escape_xml(content))?;
522 }
523 writeln!(w, " </file>")?;
524 }
525 }
526 writeln!(w, " </files>")
527 }
528}
529
530impl Formatter for XmlFormatter {
531 fn format(&self, repo: &Repository, map: &RepoMap) -> String {
532 let mut output = Vec::with_capacity(Self::estimate_output_size(repo));
534 drop(self.format_to_writer(repo, map, &mut output));
536 String::from_utf8(output)
538 .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
539 }
540
541 fn format_repo(&self, repo: &Repository) -> String {
542 let mut output = Vec::with_capacity(Self::estimate_output_size(repo));
543 drop(self.format_repo_to_writer(repo, &mut output));
545 String::from_utf8(output)
547 .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
548 }
549
550 fn name(&self) -> &'static str {
551 "xml"
552 }
553}
554
555impl StreamingFormatter for XmlFormatter {
556 fn format_to_writer<W: Write>(
557 &self,
558 repo: &Repository,
559 map: &RepoMap,
560 writer: &mut W,
561 ) -> io::Result<()> {
562 writeln!(writer, r#"<?xml version="1.0" encoding="UTF-8"?>"#)?;
563 writeln!(writer, r#"<repository name="{}" version="1.0.0">"#, escape_xml(&repo.name))?;
564
565 self.stream_llm_instructions(writer, repo)?;
566
567 if self.cache_optimized {
568 writeln!(writer, " <!-- CACHEABLE_PREFIX_START -->")?;
569 }
570
571 self.stream_overview(writer, repo)?;
572 self.stream_metadata(writer, repo)?;
573 self.stream_git_history(writer, repo)?;
574 self.stream_repomap(writer, map)?;
575
576 if self.show_file_index {
577 self.stream_file_index(writer, repo)?;
578 }
579
580 if self.cache_optimized {
581 writeln!(writer, " <!-- CACHEABLE_PREFIX_END -->")?;
582 writeln!(writer, " <!-- DYNAMIC_CONTENT_START -->")?;
583 }
584
585 self.stream_files(writer, repo)?;
586
587 if self.cache_optimized {
588 writeln!(writer, " <!-- DYNAMIC_CONTENT_END -->")?;
589 }
590
591 writeln!(writer, "</repository>")?;
592 Ok(())
593 }
594
595 fn format_repo_to_writer<W: Write>(&self, repo: &Repository, writer: &mut W) -> io::Result<()> {
596 writeln!(writer, r#"<?xml version="1.0" encoding="UTF-8"?>"#)?;
597 writeln!(writer, r#"<repository name="{}">"#, escape_xml(&repo.name))?;
598
599 self.stream_metadata(writer, repo)?;
600 if self.show_file_index {
601 self.stream_file_index(writer, repo)?;
602 }
603 self.stream_files(writer, repo)?;
604
605 writeln!(writer, "</repository>")?;
606 Ok(())
607 }
608}
609
/// Escapes the five XML special characters in `s` so the result is safe in
/// both element text and quoted attribute values.
///
/// `&`, `<`, `>`, `"` and `'` become `&amp;`, `&lt;`, `&gt;`, `&quot;` and
/// `&apos;`. Every other character is copied unchanged.
fn escape_xml(s: &str) -> String {
    // Reserve ~10% headroom for the expanded entities.
    let mut escaped = String::with_capacity(s.len() + s.len() / 10);

    for c in s.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }

    escaped
}
628
629#[cfg(test)]
630#[allow(clippy::str_to_string)]
631mod tests {
632 use super::*;
633 use crate::repomap::RepoMapGenerator;
634 use crate::types::{LanguageStats, RepoFile, RepoMetadata, TokenCounts};
635
636 fn create_test_repo() -> Repository {
637 Repository {
638 name: "test".to_string(),
639 path: "/tmp/test".into(),
640 files: vec![RepoFile {
641 path: "/tmp/test/main.py".into(),
642 relative_path: "main.py".to_string(),
643 language: Some("python".to_string()),
644 size_bytes: 100,
645 token_count: TokenCounts {
646 o200k: 48,
647 cl100k: 49,
648 claude: 50,
649 gemini: 47,
650 llama: 46,
651 mistral: 46,
652 deepseek: 46,
653 qwen: 46,
654 cohere: 47,
655 grok: 46,
656 },
657 symbols: Vec::new(),
658 importance: 0.8,
659 content: Some("def main():\n print('hello')".to_string()),
660 }],
661 metadata: RepoMetadata {
662 total_files: 1,
663 total_lines: 2,
664 total_tokens: TokenCounts {
665 o200k: 48,
666 cl100k: 49,
667 claude: 50,
668 gemini: 47,
669 llama: 46,
670 mistral: 46,
671 deepseek: 46,
672 qwen: 46,
673 cohere: 47,
674 grok: 46,
675 },
676 languages: vec![LanguageStats {
677 language: "Python".to_string(),
678 files: 1,
679 lines: 2,
680 percentage: 100.0,
681 }],
682 framework: None,
683 description: None,
684 branch: None,
685 commit: None,
686 directory_structure: Some("main.py\n".to_string()),
687 external_dependencies: vec!["requests".to_string(), "numpy".to_string()],
688 git_history: None,
689 },
690 }
691 }
692
693 #[test]
694 fn test_xml_output() {
695 let repo = create_test_repo();
696 let map = RepoMapGenerator::new(1000).generate(&repo);
697
698 let formatter = XmlFormatter::new(true);
699 let output = formatter.format(&repo, &map);
700
701 assert!(output.contains("<?xml version=\"1.0\""));
702 assert!(output.contains("<repository name=\"test\""));
703 assert!(output.contains("CACHEABLE_PREFIX_START"));
704 assert!(output.contains("<file path=\"main.py\""));
705 }
706
707 #[test]
708 fn test_xml_escaping() {
709 assert_eq!(escape_xml("<test>"), "<test>");
710 assert_eq!(escape_xml("a & b"), "a & b");
711 }
712}