1use crate::output::{Formatter, StreamingFormatter};
12use crate::repomap::RepoMap;
13use crate::types::{Repository, TokenizerModel};
14use std::io::{self, Write};
15
16pub struct XmlFormatter {
18 include_line_numbers: bool,
20 cache_optimized: bool,
22 use_cdata: bool,
24 show_file_index: bool,
26 token_model: TokenizerModel,
28}
29
30impl XmlFormatter {
31 pub fn new(cache_optimized: bool) -> Self {
33 Self {
34 include_line_numbers: true,
35 cache_optimized,
36 use_cdata: true,
37 show_file_index: true,
38 token_model: TokenizerModel::Claude,
39 }
40 }
41
42 pub fn with_line_numbers(mut self, enabled: bool) -> Self {
44 self.include_line_numbers = enabled;
45 self
46 }
47
48 pub fn with_cdata(mut self, enabled: bool) -> Self {
50 self.use_cdata = enabled;
51 self
52 }
53
54 pub fn with_file_index(mut self, enabled: bool) -> Self {
56 self.show_file_index = enabled;
57 self
58 }
59
60 pub fn with_model(mut self, model: TokenizerModel) -> Self {
62 self.token_model = model;
63 self
64 }
65
66 fn estimate_output_size(repo: &Repository) -> usize {
68 let base = 2000;
69 let files = repo.files.len() * 500;
70 let content: usize = repo
71 .files
72 .iter()
73 .filter_map(|f| f.content.as_ref())
74 .map(|c| c.len())
75 .sum();
76 base + files + content
77 }
78
79 fn detect_project_type(&self, repo: &Repository) -> String {
80 let has_cargo = repo.files.iter().any(|f| f.relative_path == "Cargo.toml");
81 let has_package_json = repo.files.iter().any(|f| f.relative_path == "package.json");
82 let has_pyproject = repo
83 .files
84 .iter()
85 .any(|f| f.relative_path == "pyproject.toml" || f.relative_path == "setup.py");
86 let has_go_mod = repo.files.iter().any(|f| f.relative_path == "go.mod");
87
88 let has_routes = repo
89 .files
90 .iter()
91 .any(|f| f.relative_path.contains("routes") || f.relative_path.contains("api/"));
92 let has_components = repo
93 .files
94 .iter()
95 .any(|f| f.relative_path.contains("components/") || f.relative_path.contains("views/"));
96
97 if has_cargo {
98 if repo
99 .files
100 .iter()
101 .any(|f| f.relative_path.ends_with("lib.rs"))
102 {
103 "Rust Library"
104 } else {
105 "Rust Application"
106 }
107 } else if has_package_json {
108 if has_components {
109 "Frontend Application (JavaScript/TypeScript)"
110 } else if has_routes {
111 "Backend API (Node.js)"
112 } else {
113 "JavaScript/TypeScript Project"
114 }
115 } else if has_pyproject {
116 if has_routes {
117 "Python Web API"
118 } else {
119 "Python Package"
120 }
121 } else if has_go_mod {
122 "Go Application"
123 } else {
124 "Software Project"
125 }
126 .to_owned()
127 }
128
129 fn is_entry_point(&self, path: &str) -> bool {
130 let entry_patterns = [
131 "main.rs",
132 "main.go",
133 "main.py",
134 "main.ts",
135 "main.js",
136 "main.c",
137 "main.cpp",
138 "index.ts",
139 "index.js",
140 "index.tsx",
141 "index.jsx",
142 "index.py",
143 "app.py",
144 "app.ts",
145 "app.js",
146 "app.tsx",
147 "app.jsx",
148 "app.go",
149 "server.py",
150 "server.ts",
151 "server.js",
152 "server.go",
153 "mod.rs",
154 "lib.rs",
155 "__main__.py",
156 "__init__.py",
157 "cmd/main.go",
158 ];
159 entry_patterns
160 .iter()
161 .any(|p| path.ends_with(p) || path.contains(&format!("/{}", p)))
162 }
163
164 fn get_entry_type(&self, path: &str) -> &'static str {
165 if path.contains("main") {
166 "main"
167 } else if path.contains("index") {
168 "index"
169 } else if path.contains("app") {
170 "app"
171 } else if path.contains("server") {
172 "server"
173 } else if path.contains("lib") {
174 "library"
175 } else if path.contains("mod.rs") {
176 "module"
177 } else {
178 "entry"
179 }
180 }
181
182 fn is_config_file(&self, path: &str) -> bool {
183 let config_files = [
184 "Cargo.toml",
185 "package.json",
186 "pyproject.toml",
187 "go.mod",
188 "pom.xml",
189 "build.gradle",
190 "Gemfile",
191 "requirements.txt",
192 "setup.py",
193 "setup.cfg",
194 "tsconfig.json",
195 "webpack.config",
196 "vite.config",
197 "next.config",
198 "Makefile",
199 "CMakeLists.txt",
200 "Dockerfile",
201 "docker-compose",
202 ".env.example",
203 "config.yaml",
204 "config.yml",
205 "config.json",
206 ];
207 let filename = path.rsplit('/').next().unwrap_or(path);
208 config_files.iter().any(|c| filename.contains(c)) && path.matches('/').count() <= 1
209 }
210
211 fn stream_llm_instructions<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
216 writeln!(w, " <llm_context_guide>")?;
217 writeln!(w, " <purpose>This is a comprehensive code context for the {} repository, optimized for AI-assisted code understanding and generation.</purpose>", escape_xml(&repo.name))?;
218 writeln!(w, " <how_to_use>")?;
219 writeln!(w, " <tip>Start with the <overview> section to understand the project's purpose and structure</tip>")?;
220 writeln!(w, " <tip>Check <entry_points> to find main application files</tip>")?;
221 writeln!(
222 w,
223 " <tip>Use <repository_map> to understand relationships between modules</tip>"
224 )?;
225 writeln!(
226 w,
227 " <tip>Files are ordered by importance - most critical files come first</tip>"
228 )?;
229 writeln!(w, " </how_to_use>")?;
230 writeln!(w, " </llm_context_guide>")
231 }
232
233 fn stream_overview<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
234 writeln!(w, " <overview>")?;
235 let project_type = self.detect_project_type(repo);
236 writeln!(w, " <project_type>{}</project_type>", escape_xml(&project_type))?;
237
238 if let Some(lang) = repo.metadata.languages.iter().max_by_key(|l| l.files) {
239 writeln!(w, " <primary_language>{}</primary_language>", escape_xml(&lang.language))?;
240 }
241 if let Some(framework) = &repo.metadata.framework {
242 writeln!(w, " <framework>{}</framework>", escape_xml(framework))?;
243 }
244
245 writeln!(w, " <entry_points>")?;
246 let mut entry_count = 0;
247 for file in &repo.files {
248 if self.is_entry_point(&file.relative_path) {
249 if file.relative_path.ends_with("__init__.py")
250 && file.token_count.get(self.token_model) < 50
251 {
252 continue;
253 }
254 let entry_type = self.get_entry_type(&file.relative_path);
255 writeln!(
256 w,
257 " <entry path=\"{}\" type=\"{}\" tokens=\"{}\"/>",
258 escape_xml(&file.relative_path),
259 entry_type,
260 file.token_count.get(self.token_model)
261 )?;
262 entry_count += 1;
263 if entry_count >= 10 {
264 break;
265 }
266 }
267 }
268 writeln!(w, " </entry_points>")?;
269
270 writeln!(w, " <config_files>")?;
271 for file in &repo.files {
272 if self.is_config_file(&file.relative_path) {
273 writeln!(
274 w,
275 " <config path=\"{}\" tokens=\"{}\"/>",
276 escape_xml(&file.relative_path),
277 file.token_count.get(self.token_model)
278 )?;
279 }
280 }
281 writeln!(w, " </config_files>")?;
282 writeln!(w, " </overview>")
283 }
284
285 fn stream_metadata<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
286 writeln!(w, " <metadata>")?;
287 if let Some(desc) = &repo.metadata.description {
288 writeln!(w, " <description>{}</description>", escape_xml(desc))?;
289 }
290 writeln!(w, " <stats>")?;
291 writeln!(w, " <files>{}</files>", repo.metadata.total_files)?;
292 writeln!(w, " <lines>{}</lines>", repo.metadata.total_lines)?;
293 writeln!(
294 w,
295 " <tokens model=\"claude\">{}</tokens>",
296 repo.metadata.total_tokens.get(self.token_model)
297 )?;
298 writeln!(w, " </stats>")?;
299
300 if !repo.metadata.languages.is_empty() {
301 writeln!(w, " <languages>")?;
302 for lang in &repo.metadata.languages {
303 writeln!(
304 w,
305 " <language name=\"{}\" files=\"{}\" percentage=\"{:.1}\"/>",
306 escape_xml(&lang.language),
307 lang.files,
308 lang.percentage
309 )?;
310 }
311 writeln!(w, " </languages>")?;
312 }
313
314 if let Some(ref structure) = repo.metadata.directory_structure {
315 writeln!(w, " <directory_structure><![CDATA[")?;
316 write!(w, "{}", structure)?;
317 writeln!(w, "]]></directory_structure>")?;
318 }
319
320 if !repo.metadata.external_dependencies.is_empty() {
321 writeln!(
322 w,
323 " <dependencies count=\"{}\">",
324 repo.metadata.external_dependencies.len()
325 )?;
326 for dep in &repo.metadata.external_dependencies {
327 writeln!(w, " <dependency name=\"{}\"/>", escape_xml(dep))?;
328 }
329 writeln!(w, " </dependencies>")?;
330 }
331
332 let mut ext_counts: std::collections::HashMap<String, usize> =
334 std::collections::HashMap::new();
335 for file in &repo.files {
336 if let Some(ext) = std::path::Path::new(&file.relative_path).extension() {
337 *ext_counts
338 .entry(ext.to_string_lossy().to_string())
339 .or_insert(0) += 1;
340 }
341 }
342 if !ext_counts.is_empty() {
343 writeln!(w, " <file_extensions>")?;
344 let mut sorted_exts: Vec<_> = ext_counts.iter().collect();
345 sorted_exts.sort_by(|a, b| b.1.cmp(a.1)); for (ext, count) in sorted_exts {
347 writeln!(
348 w,
349 " <extension name=\".{}\" count=\"{}\"/>",
350 escape_xml(ext),
351 count
352 )?;
353 }
354 writeln!(w, " </file_extensions>")?;
355 }
356
357 writeln!(w, " </metadata>")
358 }
359
360 fn stream_git_history<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
361 if let Some(ref git_history) = repo.metadata.git_history {
362 writeln!(w, " <git_history>")?;
363 if !git_history.commits.is_empty() {
364 writeln!(w, " <recent_commits count=\"{}\">", git_history.commits.len())?;
365 for commit in &git_history.commits {
366 writeln!(
367 w,
368 " <commit hash=\"{}\" author=\"{}\" date=\"{}\">",
369 escape_xml(&commit.short_hash),
370 escape_xml(&commit.author),
371 escape_xml(&commit.date)
372 )?;
373 writeln!(w, " <message><![CDATA[{}]]></message>", commit.message)?;
374 writeln!(w, " </commit>")?;
375 }
376 writeln!(w, " </recent_commits>")?;
377 }
378 if !git_history.changed_files.is_empty() {
379 writeln!(
380 w,
381 " <uncommitted_changes count=\"{}\">",
382 git_history.changed_files.len()
383 )?;
384 for file in &git_history.changed_files {
385 if let Some(diff) = &file.diff_content {
386 writeln!(
387 w,
388 " <change path=\"{}\" status=\"{}\">",
389 escape_xml(&file.path),
390 escape_xml(&file.status)
391 )?;
392 writeln!(w, " <diff><![CDATA[{}]]></diff>", diff)?;
393 writeln!(w, " </change>")?;
394 } else {
395 writeln!(
396 w,
397 " <change path=\"{}\" status=\"{}\"/>",
398 escape_xml(&file.path),
399 escape_xml(&file.status)
400 )?;
401 }
402 }
403 writeln!(w, " </uncommitted_changes>")?;
404 }
405 writeln!(w, " </git_history>")?;
406 }
407 Ok(())
408 }
409
410 fn stream_repomap<W: Write>(&self, w: &mut W, map: &RepoMap) -> io::Result<()> {
411 writeln!(w, " <repository_map token_budget=\"{}\">", map.token_count)?;
412 writeln!(w, " <summary><![CDATA[{}]]></summary>", map.summary)?;
413
414 writeln!(w, " <key_symbols>")?;
415 for symbol in &map.key_symbols {
416 writeln!(
417 w,
418 " <symbol name=\"{}\" type=\"{}\" file=\"{}\" line=\"{}\" rank=\"{}\">",
419 escape_xml(&symbol.name),
420 escape_xml(&symbol.kind),
421 escape_xml(&symbol.file),
422 symbol.line,
423 symbol.rank
424 )?;
425 if let Some(sig) = &symbol.signature {
426 writeln!(w, " <signature><![CDATA[{}]]></signature>", sig)?;
427 }
428 if let Some(summary) = &symbol.summary {
429 writeln!(w, " <summary><![CDATA[{}]]></summary>", summary)?;
430 }
431 writeln!(w, " </symbol>")?;
432 }
433 writeln!(w, " </key_symbols>")?;
434
435 if !map.module_graph.nodes.is_empty() {
436 writeln!(w, " <modules>")?;
437 for module in &map.module_graph.nodes {
438 writeln!(
439 w,
440 " <module name=\"{}\" files=\"{}\" tokens=\"{}\"/>",
441 escape_xml(&module.name),
442 module.files,
443 module.tokens
444 )?;
445 }
446 writeln!(w, " </modules>")?;
447 }
448 writeln!(w, " </repository_map>")
449 }
450
451 fn stream_file_index<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
452 writeln!(w, " <file_index entries=\"{}\">", repo.files.len())?;
453 for file in &repo.files {
454 let importance = if file.importance > 0.8 {
455 "critical"
456 } else if file.importance > 0.6 {
457 "high"
458 } else if file.importance > 0.3 {
459 "normal"
460 } else {
461 "low"
462 };
463 writeln!(
464 w,
465 " <file path=\"{}\" tokens=\"{}\" importance=\"{}\"/>",
466 escape_xml(&file.relative_path),
467 file.token_count.get(self.token_model),
468 importance
469 )?;
470 }
471 writeln!(w, " </file_index>")
472 }
473
474 fn stream_files<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
475 writeln!(w, " <files>")?;
476 for file in &repo.files {
477 if let Some(content) = &file.content {
478 writeln!(
479 w,
480 " <file path=\"{}\" language=\"{}\" tokens=\"{}\">",
481 escape_xml(&file.relative_path),
482 file.language.as_deref().unwrap_or("unknown"),
483 file.token_count.get(self.token_model)
484 )?;
485
486 if self.include_line_numbers {
487 writeln!(w, " <content line_numbers=\"original\"><![CDATA[")?;
488 let first_line = content.lines().next().unwrap_or("");
491 let has_embedded_line_nums = first_line.contains(':')
492 && first_line
493 .split(':')
494 .next()
495 .map(|s| s.parse::<u32>().is_ok())
496 .unwrap_or(false);
497
498 if has_embedded_line_nums {
499 for line in content.lines() {
501 if let Some((num_str, rest)) = line.split_once(':') {
502 if let Ok(line_num) = num_str.parse::<u32>() {
503 writeln!(w, "{:4} | {}", line_num, rest)?;
504 } else {
505 writeln!(w, " | {}", line)?;
507 }
508 } else {
509 writeln!(w, " | {}", line)?;
510 }
511 }
512 } else {
513 for (i, line) in content.lines().enumerate() {
515 writeln!(w, "{:4} | {}", i + 1, line)?;
516 }
517 }
518 writeln!(w, "]]></content>")?;
519 } else if self.use_cdata {
520 writeln!(w, " <content><![CDATA[{}]]></content>", content)?;
521 } else {
522 writeln!(w, " <content>{}</content>", escape_xml(content))?;
523 }
524 writeln!(w, " </file>")?;
525 }
526 }
527 writeln!(w, " </files>")
528 }
529}
530
531impl Formatter for XmlFormatter {
532 fn format(&self, repo: &Repository, map: &RepoMap) -> String {
533 let mut output = Vec::with_capacity(Self::estimate_output_size(repo));
535 drop(self.format_to_writer(repo, map, &mut output));
537 String::from_utf8(output)
539 .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
540 }
541
542 fn format_repo(&self, repo: &Repository) -> String {
543 let mut output = Vec::with_capacity(Self::estimate_output_size(repo));
544 drop(self.format_repo_to_writer(repo, &mut output));
546 String::from_utf8(output)
548 .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
549 }
550
551 fn name(&self) -> &'static str {
552 "xml"
553 }
554}
555
556impl StreamingFormatter for XmlFormatter {
557 fn format_to_writer<W: Write>(
558 &self,
559 repo: &Repository,
560 map: &RepoMap,
561 writer: &mut W,
562 ) -> io::Result<()> {
563 writeln!(writer, r#"<?xml version="1.0" encoding="UTF-8"?>"#)?;
564 writeln!(writer, r#"<repository name="{}" version="1.0.0">"#, escape_xml(&repo.name))?;
565
566 self.stream_llm_instructions(writer, repo)?;
567
568 if self.cache_optimized {
569 writeln!(writer, " <!-- CACHEABLE_PREFIX_START -->")?;
570 }
571
572 self.stream_overview(writer, repo)?;
573 self.stream_metadata(writer, repo)?;
574 self.stream_git_history(writer, repo)?;
575 self.stream_repomap(writer, map)?;
576
577 if self.show_file_index {
578 self.stream_file_index(writer, repo)?;
579 }
580
581 if self.cache_optimized {
582 writeln!(writer, " <!-- CACHEABLE_PREFIX_END -->")?;
583 writeln!(writer, " <!-- DYNAMIC_CONTENT_START -->")?;
584 }
585
586 self.stream_files(writer, repo)?;
587
588 if self.cache_optimized {
589 writeln!(writer, " <!-- DYNAMIC_CONTENT_END -->")?;
590 }
591
592 writeln!(writer, "</repository>")?;
593 Ok(())
594 }
595
596 fn format_repo_to_writer<W: Write>(&self, repo: &Repository, writer: &mut W) -> io::Result<()> {
597 writeln!(writer, r#"<?xml version="1.0" encoding="UTF-8"?>"#)?;
598 writeln!(writer, r#"<repository name="{}">"#, escape_xml(&repo.name))?;
599
600 self.stream_metadata(writer, repo)?;
601 if self.show_file_index {
602 self.stream_file_index(writer, repo)?;
603 }
604 self.stream_files(writer, repo)?;
605
606 writeln!(writer, "</repository>")?;
607 Ok(())
608 }
609}
610
/// Escapes `s` for use in XML text content or in a quoted attribute value.
///
/// The five characters with predefined XML entities are replaced:
/// `&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`, `"` → `&quot;`, `'` → `&apos;`.
/// Every other character is copied unchanged.
fn escape_xml(s: &str) -> String {
    // Reserve a little headroom so typical inputs do not reallocate.
    let mut escaped = String::with_capacity(s.len() + s.len() / 10);

    for c in s.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            _ => escaped.push(c),
        }
    }

    escaped
}
629
630#[cfg(test)]
631#[allow(clippy::str_to_string)]
632mod tests {
633 use super::*;
634 use crate::repomap::RepoMapGenerator;
635 use crate::types::{LanguageStats, RepoFile, RepoMetadata, TokenCounts};
636
637 fn create_test_repo() -> Repository {
638 Repository {
639 name: "test".to_string(),
640 path: "/tmp/test".into(),
641 files: vec![RepoFile {
642 path: "/tmp/test/main.py".into(),
643 relative_path: "main.py".to_string(),
644 language: Some("python".to_string()),
645 size_bytes: 100,
646 token_count: TokenCounts {
647 o200k: 48,
648 cl100k: 49,
649 claude: 50,
650 gemini: 47,
651 llama: 46,
652 mistral: 46,
653 deepseek: 46,
654 qwen: 46,
655 cohere: 47,
656 grok: 46,
657 },
658 symbols: Vec::new(),
659 importance: 0.8,
660 content: Some("def main():\n print('hello')".to_string()),
661 }],
662 metadata: RepoMetadata {
663 total_files: 1,
664 total_lines: 2,
665 total_tokens: TokenCounts {
666 o200k: 48,
667 cl100k: 49,
668 claude: 50,
669 gemini: 47,
670 llama: 46,
671 mistral: 46,
672 deepseek: 46,
673 qwen: 46,
674 cohere: 47,
675 grok: 46,
676 },
677 languages: vec![LanguageStats {
678 language: "Python".to_string(),
679 files: 1,
680 lines: 2,
681 percentage: 100.0,
682 }],
683 framework: None,
684 description: None,
685 branch: None,
686 commit: None,
687 directory_structure: Some("main.py\n".to_string()),
688 external_dependencies: vec!["requests".to_string(), "numpy".to_string()],
689 git_history: None,
690 },
691 }
692 }
693
694 #[test]
695 fn test_xml_output() {
696 let repo = create_test_repo();
697 let map = RepoMapGenerator::new(1000).generate(&repo);
698
699 let formatter = XmlFormatter::new(true);
700 let output = formatter.format(&repo, &map);
701
702 assert!(output.contains("<?xml version=\"1.0\""));
703 assert!(output.contains("<repository name=\"test\""));
704 assert!(output.contains("CACHEABLE_PREFIX_START"));
705 assert!(output.contains("<file path=\"main.py\""));
706 }
707
708 #[test]
709 fn test_xml_escaping() {
710 assert_eq!(escape_xml("<test>"), "<test>");
711 assert_eq!(escape_xml("a & b"), "a & b");
712 }
713}