1use chrono::Utc;
2use ignore::DirEntry;
3use log::{error, info, warn};
4use std::fs;
5use std::io::{self, Read, Seek, SeekFrom, Write};
6use std::path::Path;
7
8use crate::tree::{FileTree, write_tree_to_file};
9use encoding_rs::{Encoding, UTF_8};
10
11#[cfg(feature = "parallel")]
12use crossbeam_channel::{Receiver, Sender, bounded};
13#[cfg(feature = "parallel")]
14use std::thread;
15
/// Options controlling the tree-sitter based enrichment appended to each file section.
#[derive(Debug, Clone, Default)]
pub struct TreeSitterConfig {
    /// Emit extracted signatures for the file. For extensions that tree-sitter
    /// supports, `process_file` then omits the full file content and relies on
    /// the signatures alone.
    pub signatures: bool,
    /// Emit a structure summary for the file.
    pub structure: bool,
    /// NOTE(review): not read anywhere in the visible part of this file —
    /// presumably a truncation mode consumed elsewhere; confirm against callers.
    pub truncate: String,
    /// Visibility filter applied to extracted signatures. Parsed into the
    /// tree-sitter `Visibility` type; a value that fails to parse falls back to
    /// `Visibility::All`.
    pub visibility: String,
}
28
/// Generates the Markdown report for a set of files and writes it to `output_path`.
///
/// The report contains, in order: a title, a one-line description of what was
/// collected, the custom ignore patterns (when any), an xxh3 content hash over the
/// relative paths and bytes of `files`, the rendered `file_tree`, and one section
/// per file produced by [`process_file`].
///
/// With the `parallel` feature the per-file sections are rendered concurrently
/// with rayon and a dedicated writer thread emits them in their original order;
/// without it the files are processed one after another.
///
/// # Arguments
/// * `output_path` - Destination file; missing parent directories are created.
/// * `input_dir` - Name shown in the header; `"."` is replaced by the current
///   directory's name.
/// * `filters` - Extensions listed in the header when non-empty.
/// * `ignores` - Ignore patterns listed in the header when non-empty.
/// * `file_tree` - Tree rendered under "File Tree Structure".
/// * `files` - Entries to hash and render, in output order.
/// * `base_path` - Prefix stripped from each entry's path for display and hashing.
/// * `line_numbers` - Forwarded to [`process_file`].
/// * `encoding_strategy` - Forwarded to [`process_file`].
/// * `max_tokens` - Optional budget; tokens are estimated as bytes / 4. The first
///   file is always written, later files are omitted once the budget would be
///   exceeded and a notice is written instead.
/// * `ts_config` - Forwarded to [`process_file`].
///
/// # Errors
/// Returns an error when the output file or its parent directory cannot be
/// created, when the current directory cannot be determined for `"."`, or when
/// writing the report fails. In parallel mode all per-file failures are collected
/// and returned as a single error; a panic of the writer thread is also reported
/// as an error.
#[allow(clippy::too_many_arguments, unused_variables)]
pub fn generate_markdown(
    output_path: &str,
    input_dir: &str,
    filters: &[String],
    ignores: &[String],
    file_tree: &FileTree,
    files: &[DirEntry],
    base_path: &Path,
    line_numbers: bool,
    encoding_strategy: Option<&str>,
    max_tokens: Option<usize>,
    ts_config: &TreeSitterConfig,
) -> io::Result<()> {
    // Make sure the directory that will hold the report exists.
    if let Some(parent) = Path::new(output_path).parent()
        && !parent.exists()
    {
        fs::create_dir_all(parent)?;
    }

    let mut output = fs::File::create(output_path)?;

    // "." is not a useful label, so show the current directory's name instead
    // (falling back to its full path, then to "project").
    let input_dir_name = if input_dir == "." {
        let current_dir = std::env::current_dir()?;
        current_dir
            .file_name()
            .and_then(|n| n.to_str())
            .unwrap_or_else(|| current_dir.to_str().unwrap_or("project"))
            .to_string()
    } else {
        input_dir.to_string()
    };

    writeln!(output, "# Directory Structure Report\n")?;

    if !filters.is_empty() {
        writeln!(
            output,
            "This document contains files from the `{}` directory with extensions: {}",
            input_dir_name,
            filters.join(", ")
        )?;
    } else {
        writeln!(
            output,
            "This document contains all files from the `{}` directory, optimized for LLM consumption.",
            input_dir_name
        )?;
    }

    if !ignores.is_empty() {
        writeln!(output, "Custom ignored patterns: {}", ignores.join(", "))?;
    }

    // Hash every file's relative path and bytes so the header carries a
    // fingerprint of the collected content.
    let mut content_hasher = xxhash_rust::xxh3::Xxh3::new();
    for entry in files {
        let rel_path = entry.path().strip_prefix(base_path).unwrap_or(entry.path());
        // Use forward slashes so the hash does not depend on the path separator.
        let normalized = rel_path.to_string_lossy().replace('\\', "/");
        content_hasher.update(normalized.as_bytes());
        // NUL separators keep path and content from running into each other.
        content_hasher.update(b"\0");
        // A file that cannot be read contributes only its path to the hash.
        if let Ok(bytes) = std::fs::read(entry.path()) {
            content_hasher.update(&bytes);
        }
        content_hasher.update(b"\0");
    }
    writeln!(output, "Content hash: {:016x}", content_hasher.digest())?;
    writeln!(output)?;

    writeln!(output, "## File Tree Structure\n")?;

    write_tree_to_file(&mut output, file_tree, 0)?;

    writeln!(output)?;

    #[cfg(feature = "parallel")]
    {
        use rayon::prelude::*;

        // Each message is the file's index plus its rendered section (or the
        // error that prevented rendering it).
        type ChunkResult = (usize, io::Result<Vec<u8>>);
        let (sender, receiver): (Sender<ChunkResult>, Receiver<ChunkResult>) =
            bounded(num_cpus::get() * 2);
        let writer_handle = {
            // The writer thread takes ownership of the output file.
            let mut output = output;
            let total_files = files.len();
            let budget = max_tokens;

            thread::spawn(move || -> io::Result<()> {
                // Chunks may arrive out of order; park them here until their
                // turn comes.
                let mut completed_chunks = std::collections::BTreeMap::new();
                let mut next_index = 0;
                let mut errors = Vec::new();
                let mut tokens_used: usize = 0;
                let mut budget_exceeded = false;

                while next_index < total_files {
                    match receiver.recv() {
                        Ok((index, chunk_result)) => {
                            completed_chunks.insert(index, chunk_result);

                            // Flush every chunk that is now contiguous with what
                            // has already been written.
                            while let Some(chunk_result) = completed_chunks.remove(&next_index) {
                                // Once the budget is spent, remaining chunks are
                                // drained and discarded.
                                if budget_exceeded {
                                    next_index += 1;
                                    continue;
                                }

                                match chunk_result {
                                    Ok(buf) => {
                                        // Rough estimate: four bytes per token.
                                        let chunk_tokens = buf.len() / 4;

                                        // `tokens_used > 0` guarantees the first
                                        // file is written even if it alone
                                        // exceeds the budget.
                                        if let Some(max) = budget
                                            && tokens_used + chunk_tokens > max
                                            && tokens_used > 0
                                        {
                                            let remaining = total_files - next_index;
                                            let notice = format!(
                                                "---\n\n_⚠️ Token budget ({}) reached. {} remaining files omitted._\n\n",
                                                max, remaining
                                            );
                                            if let Err(e) = output.write_all(notice.as_bytes()) {
                                                errors.push(format!(
                                                    "Failed to write truncation notice: {}",
                                                    e
                                                ));
                                            }
                                            budget_exceeded = true;
                                            next_index += 1;
                                            continue;
                                        }

                                        tokens_used += chunk_tokens;
                                        if let Err(e) = output.write_all(&buf) {
                                            errors.push(format!(
                                                "Failed to write output for file index {}: {}",
                                                next_index, e
                                            ));
                                        }
                                    }
                                    Err(e) => {
                                        errors.push(format!(
                                            "Failed to process file index {}: {}",
                                            next_index, e
                                        ));
                                    }
                                }
                                next_index += 1;
                            }
                        }
                        // Every sender is gone: nothing more will arrive.
                        Err(_) => break,
                    }
                }

                // Report all collected failures together.
                if !errors.is_empty() {
                    error!(
                        "Encountered {} errors during parallel processing:",
                        errors.len()
                    );
                    for err in &errors {
                        error!("  {}", err);
                    }
                    return Err(std::io::Error::other(format!(
                        "Failed to process {} files: {}",
                        errors.len(),
                        errors.join("; ")
                    )));
                }

                Ok(())
            })
        };

        let ts_config_clone = ts_config.clone();
        // Render each file into its own buffer and hand it to the writer.
        files.par_iter().enumerate().for_each(|(index, entry)| {
            let mut buf = Vec::new();
            let result = process_file(
                base_path,
                entry.path(),
                &mut buf,
                line_numbers,
                encoding_strategy,
                &ts_config_clone,
            )
            .map(|_| buf);

            // A failed send is deliberately ignored.
            let _ = sender.send((index, result));
        });

        // Dropping the last sender lets the writer's `recv` fail and its loop end.
        drop(sender);

        // First `?`: the join/panic error; second `?`: the writer's own result.
        writer_handle
            .join()
            .map_err(|_| std::io::Error::other("Writer thread panicked"))??;
    }

    #[cfg(not(feature = "parallel"))]
    {
        let mut tokens_used: usize = 0;

        for (idx, entry) in files.iter().enumerate() {
            // Estimate tokens from the on-disk size (four bytes per token); a
            // file whose metadata cannot be read counts as zero.
            let file_size = std::fs::metadata(entry.path())
                .map(|m| m.len())
                .unwrap_or(0);
            let estimated_file_tokens = (file_size as usize) / 4;

            if let Some(budget) = max_tokens {
                // `tokens_used > 0` guarantees the first file is always written.
                if tokens_used + estimated_file_tokens > budget && tokens_used > 0 {
                    let remaining = files.len() - idx;
                    writeln!(output, "---\n")?;
                    writeln!(
                        output,
                        "_⚠️ Token budget ({}) reached. {} remaining files omitted._\n",
                        budget, remaining
                    )?;
                    break;
                }
            }

            tokens_used += estimated_file_tokens;
            process_file(
                base_path,
                entry.path(),
                &mut output,
                line_numbers,
                encoding_strategy,
                ts_config,
            )?;
        }
    }

    Ok(())
}
284
285pub fn process_file(
287 base_path: &Path,
288 file_path: &Path,
289 output: &mut impl Write,
290 line_numbers: bool,
291 encoding_strategy: Option<&str>,
292 ts_config: &TreeSitterConfig,
293) -> io::Result<()> {
294 let relative_path = file_path.strip_prefix(base_path).unwrap_or(file_path);
295 info!("Processing file: {}", relative_path.display());
296
297 let metadata = match fs::metadata(file_path) {
298 Ok(meta) => meta,
299 Err(e) => {
300 error!(
301 "Failed to get metadata for {}: {}",
302 relative_path.display(),
303 e
304 );
305 return Ok(());
306 }
307 };
308
309 let modified_time = metadata
310 .modified()
311 .ok()
312 .map(|time| {
313 let system_time: chrono::DateTime<Utc> = time.into();
314 system_time.format("%Y-%m-%d %H:%M:%S UTC").to_string()
315 })
316 .unwrap_or_else(|| "Unknown".to_string());
317
318 writeln!(output)?;
319 writeln!(output, "### File: `{}`", relative_path.display())?;
320
321 writeln!(output)?;
322
323 writeln!(output, "- Size: {} bytes", metadata.len())?;
324 writeln!(output, "- Modified: {}", modified_time)?;
325 writeln!(output)?;
326
327 let extension = file_path
329 .extension()
330 .and_then(|s| s.to_str())
331 .unwrap_or("text");
332 let language = match extension {
333 "rs" => "rust",
334 "js" => "javascript",
335 "ts" => "typescript",
336 "jsx" => "jsx",
337 "tsx" => "tsx",
338 "json" => "json",
339 "toml" => "toml",
340 "md" => "markdown",
341 "yaml" | "yml" => "yaml",
342 "html" => "html",
343 "css" => "css",
344 "py" => "python",
345 "java" => "java",
346 "cpp" => "cpp",
347 "c" => "c",
348 "h" => "c",
349 "hpp" => "cpp",
350 "sql" => "sql",
351 "sh" => "bash",
352 "xml" => "xml",
353 "lock" => "toml",
354 _ => extension,
355 };
356
357 match fs::File::open(file_path) {
359 Ok(mut file) => {
360 let mut sniff = [0u8; 8192];
361 let n = match file.read(&mut sniff) {
362 Ok(n) => n,
363 Err(e) => {
364 warn!(
365 "Could not read file {}: {}. Skipping content.",
366 relative_path.display(),
367 e
368 );
369
370 writeln!(output, "```text")?;
371
372 writeln!(
373 output,
374 "<Could not read file content (e.g., binary file or permission error)>"
375 )?;
376
377 writeln!(output, "```")?;
378
379 return Ok(());
380 }
381 };
382 let slice = &sniff[..n];
383
384 let check_len = if n == sniff.len() {
388 let mut end = n;
390 while end > 0 && end > n.saturating_sub(4) && sniff[end - 1] & 0xC0 == 0x80 {
391 end -= 1; }
393 if end > 0 && end < n {
395 let leading = sniff[end - 1];
396 let expected_len = if leading & 0xE0 == 0xC0 {
397 2
398 } else if leading & 0xF0 == 0xE0 {
399 3
400 } else if leading & 0xF8 == 0xF0 {
401 4
402 } else {
403 1
404 };
405 if end - 1 + expected_len > n {
406 end - 1 } else {
408 n
409 }
410 } else {
411 n
412 }
413 } else {
414 n };
416
417 let is_utf8 = std::str::from_utf8(&sniff[..check_len]).is_ok();
419
420 if is_utf8 && !slice.contains(&0) {
421 } else {
423 let (encoding, _consumed) =
426 encoding_rs::Encoding::for_bom(slice).unwrap_or((encoding_rs::UTF_8, 0));
427
428 let detected_encoding = if encoding == UTF_8 {
430 detect_text_encoding(slice)
432 } else {
433 Some(encoding)
434 };
435
436 match detected_encoding {
437 Some(enc) if enc != UTF_8 => {
438 let strategy = encoding_strategy.unwrap_or("detect");
439 match strategy {
440 "strict" | "skip" => {
441 warn!(
443 "Skipping non-UTF-8 file {} (encoding: {}, strategy: {})",
444 relative_path.display(),
445 enc.name(),
446 strategy
447 );
448 }
449 _ => {
450 match transcode_file_content(file_path, enc) {
452 Ok(transcoded_content) => {
453 info!(
454 "Successfully transcoded {} from {} to UTF-8",
455 relative_path.display(),
456 enc.name()
457 );
458 write_text_content(
459 output,
460 &transcoded_content,
461 language,
462 line_numbers,
463 )?;
464 return Ok(());
465 }
466 Err(e) => {
467 warn!(
468 "Failed to transcode {} from {}: {}. Treating as binary.",
469 relative_path.display(),
470 enc.name(),
471 e
472 );
473 }
474 }
475 }
476 }
477 }
478 _ => {
479 if slice.contains(&0) {
481 warn!(
482 "Detected binary file {} (contains null bytes). Skipping content.",
483 relative_path.display()
484 );
485 } else {
486 warn!(
487 "Could not determine encoding for {}. Treating as binary.",
488 relative_path.display()
489 );
490 }
491 }
492 }
493
494 writeln!(output, "```text")?;
496 writeln!(
497 output,
498 "<Binary file or unsupported encoding: {} bytes>",
499 metadata.len()
500 )?;
501 writeln!(output, "```")?;
502 return Ok(());
503 }
504
505 if let Err(e) = file.seek(SeekFrom::Start(0)) {
507 warn!(
508 "Could not reset file cursor for {}: {}. Skipping content.",
509 relative_path.display(),
510 e
511 );
512 writeln!(output, "```text")?;
513 writeln!(
514 output,
515 "<Could not read file content (e.g., binary file or permission error)>"
516 )?;
517 writeln!(output, "```")?;
518 return Ok(());
519 }
520
521 let content = match std::fs::read_to_string(file_path) {
523 Ok(content) => content,
524 Err(e) => {
525 warn!(
526 "Error reading file {}: {}. Output may be truncated.",
527 relative_path.display(),
528 e
529 );
530 writeln!(output, "```text")?;
531 writeln!(output, "<Error reading file content>")?;
532 writeln!(output, "```")?;
533 return Ok(());
534 }
535 };
536 let signatures_only =
540 ts_config.signatures && crate::tree_sitter::is_supported_extension(extension);
541
542 if !signatures_only {
543 write_text_content(output, &content, language, line_numbers)?;
548 }
549
550 write_tree_sitter_enrichment(output, &content, extension, ts_config)?;
552 }
553 Err(e) => {
554 warn!(
555 "Could not open file {}: {}. Skipping content.",
556 relative_path.display(),
557 e
558 );
559 writeln!(output, "```text")?;
560 writeln!(
561 output,
562 "<Could not read file content (e.g., binary file or permission error)>"
563 )?;
564 writeln!(output, "```")?;
565 }
566 }
567
568 Ok(())
569}
570
571#[allow(unused_variables)]
573pub fn write_tree_sitter_enrichment(
574 output: &mut impl Write,
575 content: &str,
576 extension: &str,
577 ts_config: &TreeSitterConfig,
578) -> io::Result<()> {
579 if !ts_config.signatures && !ts_config.structure {
580 return Ok(());
581 }
582
583 #[cfg(feature = "tree-sitter-base")]
584 {
585 use crate::tree_sitter::language_support::Visibility;
586
587 let vis_filter: Visibility = ts_config.visibility.parse().unwrap_or(Visibility::All);
588
589 if ts_config.structure
590 && let Some(structure) =
591 crate::tree_sitter::extract_structure_for_file(content, extension)
592 {
593 let summary = crate::tree_sitter::structure::format_structure_as_markdown(&structure);
594 if !summary.is_empty() {
595 writeln!(output)?;
596 write!(output, "{}", summary)?;
597 }
598 }
599
600 if ts_config.signatures
601 && let Some(signatures) =
602 crate::tree_sitter::extract_signatures_for_file(content, extension, vis_filter)
603 && !signatures.is_empty()
604 {
605 let language = match extension {
606 "rs" => "rust",
607 "js" | "mjs" | "cjs" => "javascript",
608 "ts" | "tsx" | "mts" | "cts" => "typescript",
609 "py" | "pyw" => "python",
610 "go" => "go",
611 "java" => "java",
612 "c" | "h" => "c",
613 "cpp" | "cxx" | "cc" | "hpp" | "hxx" | "hh" => "cpp",
614 _ => extension,
615 };
616 writeln!(output)?;
617 writeln!(output, "**Signatures:**")?;
618 writeln!(output)?;
619 let formatted = crate::tree_sitter::signatures::format_signatures_as_markdown(
620 &signatures,
621 language,
622 );
623 write!(output, "{}", formatted)?;
624 }
625 }
626
627 #[cfg(not(feature = "tree-sitter-base"))]
628 {
629 }
632
633 Ok(())
634}
635
636fn detect_text_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
638 let encodings = [
640 encoding_rs::WINDOWS_1252,
641 encoding_rs::UTF_16LE,
642 encoding_rs::UTF_16BE,
643 encoding_rs::SHIFT_JIS,
644 ];
645
646 for encoding in &encodings {
647 let (decoded, _, had_errors) = encoding.decode(bytes);
648 if !had_errors && is_likely_text(&decoded) {
649 return Some(encoding);
650 }
651 }
652
653 None
654}
655
/// Heuristically decides whether `content` looks like human-readable text.
///
/// Control characters other than newline, carriage return and tab count as
/// suspicious. The content is rejected when more than one character in twenty is
/// suspicious; once more than 100 characters have been seen the scan stops at the
/// first point where that limit is exceeded. Empty content is accepted.
fn is_likely_text(content: &str) -> bool {
    let mut seen = 0;
    let mut suspicious = 0;

    for c in content.chars() {
        seen += 1;

        let harmless = matches!(c, '\n' | '\r' | '\t') || !c.is_control();
        if !harmless {
            suspicious += 1;
        }

        // Bail out early on long inputs that are already clearly not text.
        let over_limit = suspicious * 20 > seen;
        if over_limit && seen > 100 {
            return false;
        }
    }

    seen == 0 || suspicious * 20 <= seen
}
680
681fn transcode_file_content(file_path: &Path, encoding: &'static Encoding) -> io::Result<String> {
683 let bytes = std::fs::read(file_path)?;
684 let (decoded, _, had_errors) = encoding.decode(&bytes);
685
686 if had_errors {
687 return Err(io::Error::new(
688 io::ErrorKind::InvalidData,
689 format!("Failed to decode file with encoding {}", encoding.name()),
690 ));
691 }
692
693 Ok(decoded.into_owned())
694}
695
/// Writes `content` to `output` as a fenced Markdown code block tagged `language`.
///
/// With `line_numbers`, every line is prefixed with its right-aligned 1-based
/// number and ` | `. Otherwise the content is written verbatim, with a newline
/// added when it does not already end with one.
///
/// Verbatim content may itself contain backtick fences (for example a Markdown
/// file with code samples). To keep such content from closing the block early,
/// the surrounding fence is made one backtick longer than the longest backtick
/// run in the content, with a minimum of three.
///
/// # Errors
/// Returns any error raised while writing to `output`.
fn write_text_content(
    output: &mut impl Write,
    content: &str,
    language: &str,
    line_numbers: bool,
) -> io::Result<()> {
    // Numbered lines never start with a backtick, so they cannot close the block
    // and the standard fence is always sufficient.
    let fence_len = if line_numbers {
        3
    } else {
        longest_backtick_run(content).max(2) + 1
    };
    let fence = "`".repeat(fence_len);

    writeln!(output, "{}{}", fence, language)?;

    if line_numbers {
        for (i, line) in content.lines().enumerate() {
            writeln!(output, "{:>4} | {}", i + 1, line)?;
        }
    } else {
        output.write_all(content.as_bytes())?;
        if !content.ends_with('\n') {
            writeln!(output)?;
        }
    }

    writeln!(output, "{}", fence)?;
    Ok(())
}

/// Returns the length of the longest run of consecutive backticks in `text`.
fn longest_backtick_run(text: &str) -> usize {
    let mut longest = 0;
    let mut current = 0;
    for byte in text.bytes() {
        if byte == b'`' {
            current += 1;
            longest = longest.max(current);
        } else {
            current = 0;
        }
    }
    longest
}
719
720#[cfg(test)]
721mod tests {
722 use super::*;
723 use serial_test::serial;
724 use std::fs;
725 use tempfile::tempdir;
726
727 #[test]
728 fn test_code_block_formatting() {
729 let dir = tempdir().unwrap();
730 let base_path = dir.path();
731 let file_path = base_path.join("test.rs");
732 let output_path = base_path.join("output.md");
733
734 fs::write(
736 &file_path,
737 "fn main() {\n println!(\"Hello, world!\");\n}",
738 )
739 .unwrap();
740
741 let mut output = fs::File::create(&output_path).unwrap();
743
744 process_file(
746 base_path,
747 &file_path,
748 &mut output,
749 false,
750 None,
751 &TreeSitterConfig::default(),
752 )
753 .unwrap();
754
755 let content = fs::read_to_string(&output_path).unwrap();
757
758 assert!(content.contains("```rust"));
760 assert!(content.contains("```") && content.matches("```").count() >= 2);
761 }
762
763 #[test]
764 fn test_markdown_file_formatting() {
765 let dir = tempdir().unwrap();
766 let base_path = dir.path();
767 let file_path = base_path.join("README.md");
768 let output_path = base_path.join("output.md");
769
770 fs::write(&file_path, "# Test\n\nThis is a test markdown file.").unwrap();
772
773 let mut output = fs::File::create(&output_path).unwrap();
775
776 process_file(
778 base_path,
779 &file_path,
780 &mut output,
781 false,
782 None,
783 &TreeSitterConfig::default(),
784 )
785 .unwrap();
786
787 let content = fs::read_to_string(&output_path).unwrap();
789
790 println!("Generated content:\n{}", content);
792
793 assert!(
795 content.contains("```markdown"),
796 "Content should contain '```markdown' but was: {}",
797 content
798 );
799 let code_block_markers = content.matches("```").count();
801
802 assert!(
803 code_block_markers >= 2,
804 "Expected at least 2 code block markers, found {}",
805 code_block_markers
806 );
807 }
808
809 #[test]
810 fn test_line_numbered_code_blocks() {
811 let dir = tempdir().unwrap();
812 let base_path = dir.path();
813 let file_path = base_path.join("lib.rs");
814 let output_path = base_path.join("out.md");
815
816 fs::write(
818 &file_path,
819 "fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n\nfn main() {\n println!(\"{}\", add(1, 2));\n}\n",
820 )
821 .unwrap();
822
823 let mut output = fs::File::create(&output_path).unwrap();
824 process_file(
825 base_path,
826 &file_path,
827 &mut output,
828 true,
829 None,
830 &TreeSitterConfig::default(),
831 )
832 .unwrap();
833
834 let content = fs::read_to_string(&output_path).unwrap();
835
836 assert!(content.contains("```rust"));
838 assert!(content.contains(" 1 | "));
839 assert!(content.contains(" 2 | "));
840
841 let numbered_lines = content
843 .lines()
844 .filter(|l| {
845 l.trim_start()
846 .chars()
847 .next()
848 .map(|c| c.is_ascii_digit())
849 .unwrap_or(false)
850 && l.contains(" | ")
851 })
852 .count();
853 let original_line_count = fs::read_to_string(&file_path).unwrap().lines().count();
854 assert_eq!(numbered_lines, original_line_count);
855
856 assert!(content.contains("```"));
858 }
859
860 #[test]
861 fn test_binary_file_handling() {
862 let dir = tempdir().unwrap();
863 let base_path = dir.path();
864 let file_path = base_path.join("image.bin");
865 let output_path = base_path.join("out.md");
866
867 let bytes = vec![
869 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ];
874 fs::write(&file_path, bytes).unwrap();
875
876 let mut output = fs::File::create(&output_path).unwrap();
877 process_file(
878 base_path,
879 &file_path,
880 &mut output,
881 false,
882 None,
883 &TreeSitterConfig::default(),
884 )
885 .unwrap();
886
887 let content = fs::read_to_string(&output_path).unwrap();
888
889 assert!(content.contains("```text"));
891 assert!(content.contains("<Binary file or unsupported encoding:"));
892
893 let fence_count = content.matches("```").count();
895 assert!(
896 fence_count >= 2,
897 "expected at least opening and closing fences, got {}",
898 fence_count
899 );
900 }
901
902 #[test]
903 fn test_encoding_detection_and_transcoding() {
904 let dir = tempdir().unwrap();
905 let base_path = dir.path();
906 let output_path = base_path.join("out.md");
907
908 let windows1252_content = [
910 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x93, 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x94, 0x0A, ];
914 let file_path = base_path.join("windows1252.txt");
915 fs::write(&file_path, windows1252_content).unwrap();
916
917 let mut output = fs::File::create(&output_path).unwrap();
918 process_file(
919 base_path,
920 &file_path,
921 &mut output,
922 false,
923 Some("detect"),
924 &TreeSitterConfig::default(),
925 )
926 .unwrap();
927
928 let content = fs::read_to_string(&output_path).unwrap();
929
930 assert!(content.contains("Hello"));
932 assert!(content.contains("World"));
933 assert!(content.contains("```txt"));
935
936 let fence_count = content.matches("```").count();
938 assert!(
939 fence_count >= 2,
940 "expected at least opening and closing fences, got {}",
941 fence_count
942 );
943 }
944
945 #[test]
946 fn test_encoding_strategy_strict() {
947 let dir = tempdir().unwrap();
948 let base_path = dir.path();
949 let output_path = base_path.join("out.md");
950
951 let non_utf8_content = [0xFF, 0xFE, 0x41, 0x00]; let file_path = base_path.join("utf16.txt");
954 fs::write(&file_path, non_utf8_content).unwrap();
955
956 let mut output = fs::File::create(&output_path).unwrap();
957 process_file(
958 base_path,
959 &file_path,
960 &mut output,
961 false,
962 Some("strict"),
963 &TreeSitterConfig::default(),
964 )
965 .unwrap();
966
967 let content = fs::read_to_string(&output_path).unwrap();
968
969 assert!(content.contains("<Binary file or unsupported encoding:"));
971 assert!(content.contains("```text"));
972
973 let fence_count = content.matches("```").count();
975 assert!(
976 fence_count >= 2,
977 "expected at least opening and closing fences, got {}",
978 fence_count
979 );
980 }
981
982 #[test]
983 fn test_encoding_strategy_skip() {
984 let dir = tempdir().unwrap();
985 let base_path = dir.path();
986 let output_path = base_path.join("out.md");
987
988 let utf16_content = [0xFF, 0xFE, 0x48, 0x00, 0x69, 0x00]; let file_path = base_path.join("utf16.txt");
991 fs::write(&file_path, utf16_content).unwrap();
992
993 let mut output = fs::File::create(&output_path).unwrap();
994 process_file(
995 base_path,
996 &file_path,
997 &mut output,
998 false,
999 Some("skip"),
1000 &TreeSitterConfig::default(),
1001 )
1002 .unwrap();
1003
1004 let content = fs::read_to_string(&output_path).unwrap();
1005
1006 assert!(content.contains("<Binary file or unsupported encoding:"));
1008 assert!(content.contains("```text"));
1009 }
1010
1011 #[test]
1012 #[serial]
1013 fn test_generate_markdown_with_current_directory() {
1014 let dir = tempdir().unwrap();
1015 let base_path = dir.path();
1016 let output_path = base_path.join("test.md");
1017
1018 fs::write(base_path.join("readme.txt"), "Hello world").unwrap();
1020
1021 let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
1023 let file_tree = crate::tree::build_file_tree(&files, base_path);
1024
1025 let original_dir = std::env::current_dir().unwrap();
1027 std::env::set_current_dir(base_path).unwrap();
1028
1029 let result = generate_markdown(
1031 &output_path.to_string_lossy(),
1032 ".",
1033 &[],
1034 &[],
1035 &file_tree,
1036 &files,
1037 base_path,
1038 false,
1039 None,
1040 None, &TreeSitterConfig::default(),
1042 );
1043
1044 std::env::set_current_dir(original_dir).unwrap();
1046
1047 assert!(result.is_ok());
1048 let content = fs::read_to_string(&output_path).unwrap();
1049 assert!(content.contains("Directory Structure Report"));
1050 }
1051
1052 #[test]
1053 fn test_generate_markdown_creates_output_directory() {
1054 let dir = tempdir().unwrap();
1055 let base_path = dir.path();
1056 let nested_output = base_path.join("nested").join("deep").join("output.md");
1057
1058 fs::write(base_path.join("test.txt"), "content").unwrap();
1060
1061 let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
1062 let file_tree = crate::tree::build_file_tree(&files, base_path);
1063
1064 let result = generate_markdown(
1065 &nested_output.to_string_lossy(),
1066 "test_dir",
1067 &[],
1068 &[],
1069 &file_tree,
1070 &files,
1071 base_path,
1072 false,
1073 None,
1074 None, &TreeSitterConfig::default(),
1076 );
1077
1078 assert!(result.is_ok());
1079 assert!(nested_output.exists());
1080 assert!(nested_output.parent().unwrap().exists());
1081 }
1082
1083 #[test]
1084 fn test_generate_markdown_with_filters_and_ignores() {
1085 let dir = tempdir().unwrap();
1086 let base_path = dir.path();
1087 let output_path = base_path.join("filtered.md");
1088
1089 fs::write(base_path.join("main.rs"), "fn main() {}").unwrap();
1090 fs::write(base_path.join("config.toml"), "[package]").unwrap();
1091 fs::write(base_path.join("readme.md"), "# README").unwrap();
1092
1093 let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
1094 let file_tree = crate::tree::build_file_tree(&files, base_path);
1095
1096 let result = generate_markdown(
1097 &output_path.to_string_lossy(),
1098 "project",
1099 &["rs".to_string(), "toml".to_string()],
1100 &["readme.md".to_string()],
1101 &file_tree,
1102 &files,
1103 base_path,
1104 true,
1105 Some("strict"),
1106 None, &TreeSitterConfig::default(),
1108 );
1109
1110 assert!(result.is_ok());
1111 let content = fs::read_to_string(&output_path).unwrap();
1112 assert!(content.contains("Directory Structure Report"));
1113 assert!(content.contains("main.rs") || content.contains("config.toml"));
1115 }
1116
1117 #[test]
1118 fn test_write_text_content_with_line_numbers() {
1119 let mut output = Vec::new();
1120 let content = "line one\nline two\nline three";
1121
1122 write_text_content(&mut output, content, "rust", true).unwrap();
1123
1124 let result = String::from_utf8(output).unwrap();
1125 assert!(result.contains("```rust"));
1126 assert!(result.contains(" 1 | line one"));
1127 assert!(result.contains(" 2 | line two"));
1128 assert!(result.contains(" 3 | line three"));
1129 assert!(result.contains("```"));
1130 }
1131
1132 #[test]
1133 fn test_write_text_content_without_line_numbers() {
1134 let mut output = Vec::new();
1135 let content = "function test() {\n return true;\n}";
1136
1137 write_text_content(&mut output, content, "javascript", false).unwrap();
1138
1139 let result = String::from_utf8(output).unwrap();
1140 assert!(result.contains("```javascript"));
1141 assert!(result.contains("function test() {"));
1142 assert!(result.contains(" return true;"));
1143 assert!(result.contains("```"));
1144 assert!(!result.contains(" | ")); }
1146
1147 #[test]
1148 fn test_write_text_content_without_trailing_newline() {
1149 let mut output = Vec::new();
1150 let content = "no newline at end"; write_text_content(&mut output, content, "text", false).unwrap();
1153
1154 let result = String::from_utf8(output).unwrap();
1155 assert!(result.contains("```text"));
1156 assert!(result.contains("no newline at end"));
1157 assert!(result.ends_with("```\n")); }
1159
1160 #[test]
1161 fn test_is_likely_text() {
1162 assert!(is_likely_text("Hello world\nThis is normal text"));
1164
1165 assert!(is_likely_text(
1167 "Line 1\nLine 2\tTabbed\r\nWindows line ending"
1168 ));
1169
1170 let mut bad_text = String::new();
1172 for i in 0..200 {
1173 if i % 5 == 0 {
1174 bad_text.push('\x01'); } else {
1176 bad_text.push('a');
1177 }
1178 }
1179 assert!(!is_likely_text(&bad_text));
1180
1181 assert!(is_likely_text(""));
1183 }
1184
1185 #[test]
1186 fn test_detect_text_encoding() {
1187 let utf8_bytes = "Hello world".as_bytes();
1189 let result = detect_text_encoding(utf8_bytes);
1190 assert!(result.is_some() || result.is_none());
1193
1194 let windows1252_bytes = [
1196 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x93, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x94,
1197 ];
1198 let detected = detect_text_encoding(&windows1252_bytes);
1199 assert!(detected.is_some());
1200 }
1201
1202 #[test]
1203 fn test_transcode_file_content() {
1204 let dir = tempdir().unwrap();
1205 let file_path = dir.path().join("windows1252.txt");
1206
1207 let windows1252_content = [
1209 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x93, 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x94, ];
1212 fs::write(&file_path, windows1252_content).unwrap();
1213
1214 let result = transcode_file_content(&file_path, encoding_rs::WINDOWS_1252);
1215 assert!(result.is_ok());
1216
1217 let transcoded = result.unwrap();
1218 assert!(transcoded.contains("Hello"));
1219 assert!(transcoded.contains("World"));
1220 }
1221
1222 #[test]
1223 fn test_process_file_with_metadata_error() {
1224 let dir = tempdir().unwrap();
1225 let base_path = dir.path();
1226 let nonexistent_file = base_path.join("nonexistent.txt");
1227 let output_path = base_path.join("output.md");
1228
1229 let mut output = fs::File::create(&output_path).unwrap();
1230
1231 let result = process_file(
1233 base_path,
1234 &nonexistent_file,
1235 &mut output,
1236 false,
1237 None,
1238 &TreeSitterConfig::default(),
1239 );
1240 assert!(result.is_ok());
1241
1242 let content = fs::read_to_string(&output_path).unwrap();
1244 assert!(content.is_empty() || content.trim().is_empty());
1245 }
1246
1247 #[test]
1248 fn test_process_file_with_different_extensions() {
1249 let dir = tempdir().unwrap();
1250 let base_path = dir.path();
1251 let output_path = base_path.join("output.md");
1252
1253 let test_files = [
1255 ("script.py", "print('hello')", "python"),
1256 ("data.json", r#"{"key": "value"}"#, "json"),
1257 ("config.yaml", "key: value", "yaml"),
1258 ("style.css", "body { margin: 0; }", "css"),
1259 ("page.html", "<html><body>Test</body></html>", "html"),
1260 ("query.sql", "SELECT * FROM users;", "sql"),
1261 ("build.sh", "#!/bin/bash\necho 'building'", "bash"),
1262 ("unknown.xyz", "unknown content", "xyz"),
1263 ];
1264
1265 for (filename, content, expected_lang) in test_files.iter() {
1266 let file_path = base_path.join(filename);
1267 fs::write(&file_path, content).unwrap();
1268
1269 let mut output = fs::File::create(&output_path).unwrap();
1270 process_file(
1271 base_path,
1272 &file_path,
1273 &mut output,
1274 false,
1275 None,
1276 &TreeSitterConfig::default(),
1277 )
1278 .unwrap();
1279
1280 let result = fs::read_to_string(&output_path).unwrap();
1281 assert!(result.contains(&format!("```{}", expected_lang)));
1282 assert!(result.contains(content));
1283 assert!(result.contains(filename));
1284 }
1285 }
1286
1287 #[test]
1288 fn test_process_file_with_seek_error_handling() {
1289 let dir = tempdir().unwrap();
1290 let base_path = dir.path();
1291 let output_path = base_path.join("output.md");
1292
1293 let file_path = base_path.join("test.txt");
1294 fs::write(&file_path, "test content").unwrap();
1295
1296 let mut output = fs::File::create(&output_path).unwrap();
1297
1298 let result = process_file(
1299 base_path,
1300 &file_path,
1301 &mut output,
1302 false,
1303 None,
1304 &TreeSitterConfig::default(),
1305 );
1306
1307 assert!(result.is_ok());
1308 }
1309
1310 #[test]
1311 fn test_process_file_jsx_tsx_extensions() {
1312 let dir = tempdir().unwrap();
1313 let base_path = dir.path();
1314 let output_path = base_path.join("output.md");
1315
1316 let jsx_file = base_path.join("component.jsx");
1317 fs::write(&jsx_file, "const App = () => <div/>;").unwrap();
1318
1319 let mut output = fs::File::create(&output_path).unwrap();
1320 process_file(
1321 base_path,
1322 &jsx_file,
1323 &mut output,
1324 false,
1325 None,
1326 &TreeSitterConfig::default(),
1327 )
1328 .unwrap();
1329
1330 let content = fs::read_to_string(&output_path).unwrap();
1331 assert!(content.contains("```jsx"));
1332 }
1333
1334 #[test]
1335 fn test_process_file_various_lock_extensions() {
1336 let dir = tempdir().unwrap();
1337 let base_path = dir.path();
1338 let output_path = base_path.join("output.md");
1339
1340 let lock_file = base_path.join("Cargo.lock");
1341 fs::write(&lock_file, "[package]\nname = \"test\"").unwrap();
1342
1343 let mut output = fs::File::create(&output_path).unwrap();
1344 process_file(
1345 base_path,
1346 &lock_file,
1347 &mut output,
1348 false,
1349 None,
1350 &TreeSitterConfig::default(),
1351 )
1352 .unwrap();
1353
1354 let content = fs::read_to_string(&output_path).unwrap();
1355 assert!(content.contains("```toml"));
1356 }
1357
/// Checks the fence language tag emitted by `process_file` for Java, C and
/// C++ sources and headers.
///
/// The fence is matched as a whole line rather than as a substring: a plain
/// `contains("```c")` check is also satisfied by "```cpp" (and "```java" by
/// "```javascript"), so it could not detect a `.c`/`.h` file being tagged
/// with the wrong language.
#[test]
fn test_process_file_java_cpp_extensions() {
    let dir = tempdir().unwrap();
    let base_path = dir.path();
    let output_path = base_path.join("output.md");
    let ts_config = TreeSitterConfig::default();

    // (file name, file contents, expected fence language)
    let cases = [
        ("Main.java", "class Main {}", "java"),
        ("main.cpp", "int main() {}", "cpp"),
        ("main.c", "int main() {}", "c"),
        ("header.h", "void func();", "c"),
        ("header.hpp", "void func();", "cpp"),
    ];

    // Create every input up front so the directory contents are the same for
    // each `process_file` call.
    for (file_name, source, _) in cases {
        fs::write(base_path.join(file_name), source).unwrap();
    }

    for (file_name, _, lang) in cases {
        let file = base_path.join(file_name);

        // `File::create` truncates, so each iteration starts from an empty report.
        let mut output = fs::File::create(&output_path).unwrap();
        process_file(base_path, &file, &mut output, false, None, &ts_config).unwrap();

        let content = fs::read_to_string(&output_path).unwrap();
        let fence = format!("```{lang}");
        assert!(
            content.lines().any(|line| line.trim_end() == fence),
            "expected a `{fence}` fence line for {file_name}, got:\n{content}"
        );
    }
}
1401
/// Runs `process_file` with the `"detect"` encoding strategy on a file that
/// starts with the bytes EF BB BF (UTF-8 byte-order mark) followed by
/// `Hello`, and accepts the output if it contains either the text or a code
/// fence.
#[test]
fn test_process_file_with_bom() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();
    let report_path = root.join("output.md");

    let bom_path = root.join("bom.txt");
    fs::write(&bom_path, b"\xEF\xBB\xBFHello").unwrap();

    let ts_config = TreeSitterConfig::default();
    let mut report = fs::File::create(&report_path).unwrap();
    process_file(root, &bom_path, &mut report, false, Some("detect"), &ts_config).unwrap();

    let rendered = fs::read_to_string(&report_path).unwrap();
    let has_text = rendered.contains("Hello");
    let has_fence = rendered.contains("```");
    assert!(has_text || has_fence);
}
1426
/// Expects `detect_text_encoding` to return `Some` for input that begins
/// with the bytes FF FE (the UTF-16LE byte-order mark) followed by "Hi"
/// encoded as two little-endian 16-bit units.
#[test]
fn test_detect_text_encoding_utf16() {
    let sample = [0xFF, 0xFE, 0x48, 0x00, 0x69, 0x00];
    assert!(detect_text_encoding(&sample).is_some());
}
1433
/// Smoke test: `detect_text_encoding` must not panic on a short multi-byte
/// sequence without a byte-order mark.
///
/// The previous assertion (`is_some() || is_none()`) was always true, so it
/// verified nothing beyond the call returning. The call is kept and the
/// vacuous assertion dropped so the test states what it actually checks.
#[test]
fn test_detect_text_encoding_shift_jis() {
    // Intended as Shift_JIS-encoded text (per the original variable name).
    let shift_jis_bytes = [0x82, 0xB1, 0x82, 0xF1, 0x82, 0xC9, 0x82, 0xBF, 0x82, 0xCD];

    // Either outcome is acceptable for this input; only panic-freedom is tested.
    let _ = detect_text_encoding(&shift_jis_bytes);
}
1440
/// Smoke test: `transcode_file_content` must not panic when asked to decode
/// an ASCII file as UTF-16LE.
///
/// The previous assertion (`is_ok() || is_err()`) was always true, so it
/// verified nothing beyond the call returning. The call is kept and the
/// vacuous assertion dropped so the test states what it actually checks.
#[test]
fn test_transcode_file_content_with_errors() {
    let dir = tempdir().unwrap();
    let file_path = dir.path().join("test.txt");

    fs::write(&file_path, b"test content").unwrap();

    // Both `Ok` and `Err` are acceptable here; only panic-freedom is tested.
    let _ = transcode_file_content(&file_path, encoding_rs::UTF_16LE);
}
1451
/// Calls `write_tree_sitter_enrichment` with both `signatures` and
/// `structure` disabled and expects it to succeed without writing any bytes
/// to the sink.
#[test]
fn test_write_tree_sitter_enrichment_no_feature() {
    let ts_config = TreeSitterConfig {
        truncate: "smart".to_string(),
        visibility: "all".to_string(),
        signatures: false,
        structure: false,
    };

    let mut sink = Vec::new();
    let outcome = write_tree_sitter_enrichment(&mut sink, "fn main() {}", "rs", &ts_config);

    assert!(outcome.is_ok());
    assert!(sink.is_empty());
}
1468
/// Runs `generate_markdown` with `max_tokens = Some(100)` over two
/// 50,000-byte files and expects success, with the report either mentioning
/// "Token budget" or staying under 1000 bytes.
#[test]
fn test_generate_markdown_max_tokens_budget() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();
    let report_path = root.join("output.md");

    for (name, fill) in [("file1.txt", "x"), ("file2.txt", "y")] {
        fs::write(root.join(name), fill.repeat(50000)).unwrap();
    }

    // The report file does not exist yet, so it is not part of the collected set.
    let entries = crate::file_utils::collect_files(root, &[], &[], &[]).unwrap();
    let tree = crate::tree::build_file_tree(&entries, root);

    let report_str = report_path.to_string_lossy();
    let ts_config = TreeSitterConfig::default();
    let outcome = generate_markdown(
        &report_str,
        "project",
        &[],
        &[],
        &tree,
        &entries,
        root,
        false,
        None,
        Some(100),
        &ts_config,
    );

    assert!(outcome.is_ok());
    let rendered = fs::read_to_string(&report_path).unwrap();
    let mentions_budget = rendered.contains("Token budget");
    assert!(mentions_budget || rendered.len() < 1000);
}
1499
/// Runs `process_file` on a zero-byte file and expects success, with the
/// output naming the file and reporting `Size: 0 bytes`.
#[test]
fn test_process_file_empty_file() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();
    let report_path = root.join("output.md");

    let blank = root.join("empty.txt");
    fs::write(&blank, "").unwrap();

    let ts_config = TreeSitterConfig::default();
    let mut report = fs::File::create(&report_path).unwrap();
    let outcome = process_file(root, &blank, &mut report, false, None, &ts_config);
    assert!(outcome.is_ok());

    let rendered = fs::read_to_string(&report_path).unwrap();
    for needle in ["empty.txt", "Size: 0 bytes"] {
        assert!(rendered.contains(needle));
    }
}
1524
/// Runs `process_file` on text mixing ASCII, CJK, emoji and Cyrillic
/// characters, passing `true` as the fourth argument (presumably the
/// line-number flag; confirm against `process_file`). Expects success and
/// output containing either the CJK text or a code fence.
#[test]
fn test_process_file_with_multibyte_utf8() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();
    let report_path = root.join("output.md");

    let unicode_path = root.join("unicode.txt");
    fs::write(&unicode_path, "Hello 世界 🌍 Здравствуй").unwrap();

    let ts_config = TreeSitterConfig::default();
    let mut report = fs::File::create(&report_path).unwrap();
    let outcome = process_file(root, &unicode_path, &mut report, true, None, &ts_config);
    assert!(outcome.is_ok());

    let rendered = fs::read_to_string(&report_path).unwrap();
    let has_cjk = rendered.contains("世界");
    let has_fence = rendered.contains("```");
    assert!(has_cjk || has_fence);
}
1549
/// Runs `generate_markdown` with a non-empty `ignores` list and checks that
/// the report contains both the title and the ignore patterns.
///
/// `generate_markdown` writes a `Custom ignored patterns: …` line whenever
/// `ignores` is non-empty. The original test passed an ignore list but only
/// asserted on the title, so the ignore-list branch was never verified.
#[test]
fn test_generate_markdown_with_ignores_list() {
    let dir = tempdir().unwrap();
    let base_path = dir.path();
    let output_path = base_path.join("output.md");

    fs::write(base_path.join("main.rs"), "fn main() {}").unwrap();
    fs::write(base_path.join("test.txt"), "test").unwrap();

    // Collection itself is unfiltered; the ignore list is handed only to
    // `generate_markdown`.
    let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
    let file_tree = crate::tree::build_file_tree(&files, base_path);
    let ignores = ["test.txt".to_string()];

    let result = generate_markdown(
        &output_path.to_string_lossy(),
        "project",
        &[],
        &ignores,
        &file_tree,
        &files,
        base_path,
        false,
        None,
        None,
        &TreeSitterConfig::default(),
    );

    assert!(result.is_ok());
    let content = fs::read_to_string(&output_path).unwrap();
    assert!(content.contains("Directory Structure Report"));
    assert!(
        content.contains("Custom ignored patterns: test.txt"),
        "ignore patterns should be listed in the report header, got:\n{content}"
    );
}
1580}