context_builder/
markdown.rs

1use chrono::Utc;
2use ignore::DirEntry;
3use log::{error, info, warn};
4use std::fs;
5use std::io::{self, Read, Seek, SeekFrom, Write};
6use std::path::Path;
7
8use crate::tree::{FileTree, write_tree_to_file};
9use encoding_rs::{Encoding, UTF_8};
10
11#[cfg(feature = "parallel")]
12use crossbeam_channel::{Receiver, Sender, bounded};
13#[cfg(feature = "parallel")]
14use std::thread;
15
/// Options controlling the tree-sitter driven parts of the generated report.
///
/// The `Default` value switches everything off: both flags are `false` and
/// both string options are empty.
#[derive(Clone, Debug, Default)]
pub struct TreeSitterConfig {
    /// Emit only signatures (function/type declarations) in place of the full
    /// file content.
    pub signatures: bool,
    /// Add a per-file structure summary (counts of functions, structs, etc.).
    pub structure: bool,
    /// Truncation mode: `"smart"` asks for truncation on AST boundaries; every
    /// other value means byte truncation.
    pub truncate: String,
    /// Visibility filter applied to extracted signatures: `"public"`,
    /// `"private"`, or `"all"`.
    pub visibility: String,
}
28
29/// Generates the final Markdown file.
30#[allow(clippy::too_many_arguments, unused_variables)]
31pub fn generate_markdown(
32    output_path: &str,
33    input_dir: &str,
34    filters: &[String],
35    ignores: &[String],
36    file_tree: &FileTree,
37    files: &[DirEntry],
38    base_path: &Path,
39    line_numbers: bool,
40    encoding_strategy: Option<&str>,
41    max_tokens: Option<usize>,
42    ts_config: &TreeSitterConfig,
43) -> io::Result<()> {
44    if let Some(parent) = Path::new(output_path).parent()
45        && !parent.exists()
46    {
47        fs::create_dir_all(parent)?;
48    }
49
50    let mut output = fs::File::create(output_path)?;
51
52    let input_dir_name = if input_dir == "." {
53        let current_dir = std::env::current_dir()?;
54        current_dir
55            .file_name()
56            .and_then(|n| n.to_str())
57            .unwrap_or_else(|| current_dir.to_str().unwrap_or("project"))
58            .to_string()
59    } else {
60        input_dir.to_string()
61    };
62
63    // --- Header --- //
64    writeln!(output, "# Directory Structure Report\n")?;
65
66    if !filters.is_empty() {
67        writeln!(
68            output,
69            "This document contains files from the `{}` directory with extensions: {}",
70            input_dir_name,
71            filters.join(", ")
72        )?;
73    } else {
74        writeln!(
75            output,
76            "This document contains all files from the `{}` directory, optimized for LLM consumption.",
77            input_dir_name
78        )?;
79    }
80
81    if !ignores.is_empty() {
82        writeln!(output, "Custom ignored patterns: {}", ignores.join(", "))?;
83    }
84
85    // Deterministic content hash (enables LLM prompt caching across runs)
86    // Uses xxh3 over file content bytes — stable across Rust versions and machines.
87    // Previous implementation hashed mtime (broken by git checkout, cp, etc.)
88    let mut content_hasher = xxhash_rust::xxh3::Xxh3::new();
89    for entry in files {
90        // Hash relative unix-style path for cross-OS determinism.
91        // Using absolute or OS-native paths would produce different hashes
92        // on different machines or operating systems.
93        let rel_path = entry.path().strip_prefix(base_path).unwrap_or(entry.path());
94        let normalized = rel_path.to_string_lossy().replace('\\', "/");
95        content_hasher.update(normalized.as_bytes());
96        // Null delimiter prevents collision: path="a" content="bc" vs path="ab" content="c"
97        content_hasher.update(b"\0");
98        // Hash actual file content (not mtime!) for determinism
99        if let Ok(bytes) = std::fs::read(entry.path()) {
100            content_hasher.update(&bytes);
101        }
102        content_hasher.update(b"\0");
103    }
104    writeln!(output, "Content hash: {:016x}", content_hasher.digest())?;
105    writeln!(output)?;
106
107    // --- File Tree --- //
108
109    writeln!(output, "## File Tree Structure\n")?;
110
111    write_tree_to_file(&mut output, file_tree, 0)?;
112
113    writeln!(output)?;
114
115    // (No '## Files' heading here; it will be injected later only once during final composition)
116    // (Diff section will be conditionally inserted later by the auto_diff logic in lib.rs)
117
118    #[cfg(feature = "parallel")]
119    {
120        use rayon::prelude::*;
121
122        // Create a bounded channel for ordered chunks
123        type ChunkResult = (usize, io::Result<Vec<u8>>);
124        let (sender, receiver): (Sender<ChunkResult>, Receiver<ChunkResult>) =
125            bounded(num_cpus::get() * 2); // Buffer size based on CPU count
126
127        let writer_handle = {
128            let mut output = output;
129            let total_files = files.len();
130            let budget = max_tokens;
131
132            thread::spawn(move || -> io::Result<()> {
133                let mut completed_chunks = std::collections::BTreeMap::new();
134                let mut next_index = 0;
135                let mut errors = Vec::new();
136                let mut tokens_used: usize = 0;
137                let mut budget_exceeded = false;
138
139                // Receive chunks and write them in order
140                while next_index < total_files {
141                    match receiver.recv() {
142                        Ok((index, chunk_result)) => {
143                            completed_chunks.insert(index, chunk_result);
144
145                            // Write all consecutive chunks starting from next_index
146                            while let Some(chunk_result) = completed_chunks.remove(&next_index) {
147                                if budget_exceeded {
148                                    // Already over budget — skip remaining chunks
149                                    next_index += 1;
150                                    continue;
151                                }
152
153                                match chunk_result {
154                                    Ok(buf) => {
155                                        // Estimate tokens for this chunk (~4 bytes per token)
156                                        let chunk_tokens = buf.len() / 4;
157
158                                        if let Some(max) = budget
159                                            && tokens_used + chunk_tokens > max
160                                            && tokens_used > 0
161                                        {
162                                            let remaining = total_files - next_index;
163                                            let notice = format!(
164                                                "---\n\n_⚠️ Token budget ({}) reached. {} remaining files omitted._\n\n",
165                                                max, remaining
166                                            );
167                                            if let Err(e) = output.write_all(notice.as_bytes()) {
168                                                errors.push(format!(
169                                                    "Failed to write truncation notice: {}",
170                                                    e
171                                                ));
172                                            }
173                                            budget_exceeded = true;
174                                            next_index += 1;
175                                            continue;
176                                        }
177
178                                        tokens_used += chunk_tokens;
179                                        if let Err(e) = output.write_all(&buf) {
180                                            errors.push(format!(
181                                                "Failed to write output for file index {}: {}",
182                                                next_index, e
183                                            ));
184                                        }
185                                    }
186                                    Err(e) => {
187                                        errors.push(format!(
188                                            "Failed to process file index {}: {}",
189                                            next_index, e
190                                        ));
191                                    }
192                                }
193                                next_index += 1;
194                            }
195                        }
196                        Err(_) => break, // Channel closed
197                    }
198                }
199
200                if !errors.is_empty() {
201                    error!(
202                        "Encountered {} errors during parallel processing:",
203                        errors.len()
204                    );
205                    for err in &errors {
206                        error!("  {}", err);
207                    }
208                    return Err(std::io::Error::other(format!(
209                        "Failed to process {} files: {}",
210                        errors.len(),
211                        errors.join("; ")
212                    )));
213                }
214
215                Ok(())
216            })
217        };
218
219        // Process files in parallel and send results to writer
220        let ts_config_clone = ts_config.clone();
221        files.par_iter().enumerate().for_each(|(index, entry)| {
222            let mut buf = Vec::new();
223            let result = process_file(
224                base_path,
225                entry.path(),
226                &mut buf,
227                line_numbers,
228                encoding_strategy,
229                &ts_config_clone,
230            )
231            .map(|_| buf);
232
233            // Send result to writer thread (ignore send errors - channel might be closed)
234            let _ = sender.send((index, result));
235        });
236
237        // Close the sender to signal completion
238        drop(sender);
239
240        // Wait for writer thread to complete and propagate any errors
241        writer_handle
242            .join()
243            .map_err(|_| std::io::Error::other("Writer thread panicked"))??;
244    }
245
246    #[cfg(not(feature = "parallel"))]
247    {
248        let mut tokens_used: usize = 0;
249
250        for (idx, entry) in files.iter().enumerate() {
251            // Estimate tokens for this file (~4 bytes per token)
252            let file_size = std::fs::metadata(entry.path())
253                .map(|m| m.len())
254                .unwrap_or(0);
255            let estimated_file_tokens = (file_size as usize) / 4;
256
257            if let Some(budget) = max_tokens {
258                if tokens_used + estimated_file_tokens > budget && tokens_used > 0 {
259                    let remaining = files.len() - idx;
260                    writeln!(output, "---\n")?;
261                    writeln!(
262                        output,
263                        "_⚠️ Token budget ({}) reached. {} remaining files omitted._\n",
264                        budget, remaining
265                    )?;
266                    break;
267                }
268            }
269
270            tokens_used += estimated_file_tokens;
271            process_file(
272                base_path,
273                entry.path(),
274                &mut output,
275                line_numbers,
276                encoding_strategy,
277                ts_config,
278            )?;
279        }
280    }
281
282    Ok(())
283}
284
285/// Processes a single file and writes its content to the output.
286pub fn process_file(
287    base_path: &Path,
288    file_path: &Path,
289    output: &mut impl Write,
290    line_numbers: bool,
291    encoding_strategy: Option<&str>,
292    ts_config: &TreeSitterConfig,
293) -> io::Result<()> {
294    let relative_path = file_path.strip_prefix(base_path).unwrap_or(file_path);
295    info!("Processing file: {}", relative_path.display());
296
297    let metadata = match fs::metadata(file_path) {
298        Ok(meta) => meta,
299        Err(e) => {
300            error!(
301                "Failed to get metadata for {}: {}",
302                relative_path.display(),
303                e
304            );
305            return Ok(());
306        }
307    };
308
309    let modified_time = metadata
310        .modified()
311        .ok()
312        .map(|time| {
313            let system_time: chrono::DateTime<Utc> = time.into();
314            system_time.format("%Y-%m-%d %H:%M:%S UTC").to_string()
315        })
316        .unwrap_or_else(|| "Unknown".to_string());
317
318    writeln!(output)?;
319    writeln!(output, "### File: `{}`", relative_path.display())?;
320
321    writeln!(output)?;
322
323    writeln!(output, "- Size: {} bytes", metadata.len())?;
324    writeln!(output, "- Modified: {}", modified_time)?;
325    writeln!(output)?;
326
327    // --- File Content --- //
328    let extension = file_path
329        .extension()
330        .and_then(|s| s.to_str())
331        .unwrap_or("text");
332    let language = match extension {
333        "rs" => "rust",
334        "js" => "javascript",
335        "ts" => "typescript",
336        "jsx" => "jsx",
337        "tsx" => "tsx",
338        "json" => "json",
339        "toml" => "toml",
340        "md" => "markdown",
341        "yaml" | "yml" => "yaml",
342        "html" => "html",
343        "css" => "css",
344        "py" => "python",
345        "java" => "java",
346        "cpp" => "cpp",
347        "c" => "c",
348        "h" => "c",
349        "hpp" => "cpp",
350        "sql" => "sql",
351        "sh" => "bash",
352        "xml" => "xml",
353        "lock" => "toml",
354        _ => extension,
355    };
356
357    // Enhanced binary file handling with encoding detection and transcoding
358    match fs::File::open(file_path) {
359        Ok(mut file) => {
360            let mut sniff = [0u8; 8192];
361            let n = match file.read(&mut sniff) {
362                Ok(n) => n,
363                Err(e) => {
364                    warn!(
365                        "Could not read file {}: {}. Skipping content.",
366                        relative_path.display(),
367                        e
368                    );
369
370                    writeln!(output, "```text")?;
371
372                    writeln!(
373                        output,
374                        "<Could not read file content (e.g., binary file or permission error)>"
375                    )?;
376
377                    writeln!(output, "```")?;
378
379                    return Ok(());
380                }
381            };
382            let slice = &sniff[..n];
383
384            // Find a valid UTF-8 boundary by backtracking up to 3 bytes.
385            // If the sniff buffer cuts a multi-byte char (e.g., emoji at byte 8191),
386            // from_utf8 would falsely classify the file as non-UTF-8.
387            let check_len = if n == sniff.len() {
388                // Buffer is full — may have split a multi-byte char at the end
389                let mut end = n;
390                while end > 0 && end > n.saturating_sub(4) && sniff[end - 1] & 0xC0 == 0x80 {
391                    end -= 1; // skip continuation bytes
392                }
393                // If we landed on a leading byte, check if the sequence is complete
394                if end > 0 && end < n {
395                    let leading = sniff[end - 1];
396                    let expected_len = if leading & 0xE0 == 0xC0 {
397                        2
398                    } else if leading & 0xF0 == 0xE0 {
399                        3
400                    } else if leading & 0xF8 == 0xF0 {
401                        4
402                    } else {
403                        1
404                    };
405                    if end - 1 + expected_len > n {
406                        end - 1 // incomplete char — exclude the leading byte too
407                    } else {
408                        n
409                    }
410                } else {
411                    n
412                }
413            } else {
414                n // didn't fill the buffer, so no boundary issue
415            };
416
417            // First check if it's valid UTF-8
418            let is_utf8 = std::str::from_utf8(&sniff[..check_len]).is_ok();
419
420            if is_utf8 && !slice.contains(&0) {
421                // Valid UTF-8 text file - proceed normally
422            } else {
423                // Try encoding detection for non-UTF-8 files
424                // If it's not UTF-8, try to detect the encoding
425                let (encoding, _consumed) =
426                    encoding_rs::Encoding::for_bom(slice).unwrap_or((encoding_rs::UTF_8, 0));
427
428                // If it's not UTF-8, try to detect the encoding
429                let detected_encoding = if encoding == UTF_8 {
430                    // Use chardet-like detection for common encodings
431                    detect_text_encoding(slice)
432                } else {
433                    Some(encoding)
434                };
435
436                match detected_encoding {
437                    Some(enc) if enc != UTF_8 => {
438                        let strategy = encoding_strategy.unwrap_or("detect");
439                        match strategy {
440                            "strict" | "skip" => {
441                                // Skip files with non-UTF-8 encoding
442                                warn!(
443                                    "Skipping non-UTF-8 file {} (encoding: {}, strategy: {})",
444                                    relative_path.display(),
445                                    enc.name(),
446                                    strategy
447                                );
448                            }
449                            _ => {
450                                // Default "detect" strategy: attempt to transcode
451                                match transcode_file_content(file_path, enc) {
452                                    Ok(transcoded_content) => {
453                                        info!(
454                                            "Successfully transcoded {} from {} to UTF-8",
455                                            relative_path.display(),
456                                            enc.name()
457                                        );
458                                        write_text_content(
459                                            output,
460                                            &transcoded_content,
461                                            language,
462                                            line_numbers,
463                                        )?;
464                                        return Ok(());
465                                    }
466                                    Err(e) => {
467                                        warn!(
468                                            "Failed to transcode {} from {}: {}. Treating as binary.",
469                                            relative_path.display(),
470                                            enc.name(),
471                                            e
472                                        );
473                                    }
474                                }
475                            }
476                        }
477                    }
478                    _ => {
479                        // Check if it's likely binary (contains null bytes)
480                        if slice.contains(&0) {
481                            warn!(
482                                "Detected binary file {} (contains null bytes). Skipping content.",
483                                relative_path.display()
484                            );
485                        } else {
486                            warn!(
487                                "Could not determine encoding for {}. Treating as binary.",
488                                relative_path.display()
489                            );
490                        }
491                    }
492                }
493
494                // Fallback to binary file placeholder
495                writeln!(output, "```text")?;
496                writeln!(
497                    output,
498                    "<Binary file or unsupported encoding: {} bytes>",
499                    metadata.len()
500                )?;
501                writeln!(output, "```")?;
502                return Ok(());
503            }
504
505            // Reset cursor and stream the content
506            if let Err(e) = file.seek(SeekFrom::Start(0)) {
507                warn!(
508                    "Could not reset file cursor for {}: {}. Skipping content.",
509                    relative_path.display(),
510                    e
511                );
512                writeln!(output, "```text")?;
513                writeln!(
514                    output,
515                    "<Could not read file content (e.g., binary file or permission error)>"
516                )?;
517                writeln!(output, "```")?;
518                return Ok(());
519            }
520
521            // Stream UTF-8 content
522            let content = match std::fs::read_to_string(file_path) {
523                Ok(content) => content,
524                Err(e) => {
525                    warn!(
526                        "Error reading file {}: {}. Output may be truncated.",
527                        relative_path.display(),
528                        e
529                    );
530                    writeln!(output, "```text")?;
531                    writeln!(output, "<Error reading file content>")?;
532                    writeln!(output, "```")?;
533                    return Ok(());
534                }
535            };
536            // When --signatures is active, replace file content with signatures-only output
537            // ONLY for extensions that tree-sitter actually supports. Non-code files
538            // (Cargo.toml, README.md, .yaml, etc.) must always show full content.
539            let signatures_only = ts_config.signatures
540                && crate::tree_sitter::is_supported_extension(extension);
541
542            if !signatures_only {
543                // Note: Smart truncation (`truncate: "smart"`) indicates AST-boundary
544                // truncation should be preferred when content needs truncating.
545                // Without a per-file max_tokens budget, no truncation is applied.
546                // The flag is stored for future use when per-file token limits are implemented.
547                write_text_content(output, &content, language, line_numbers)?;
548            }
549
550            // Tree-sitter enrichment: signatures and/or structure
551            write_tree_sitter_enrichment(output, &content, extension, ts_config)?;
552        }
553        Err(e) => {
554            warn!(
555                "Could not open file {}: {}. Skipping content.",
556                relative_path.display(),
557                e
558            );
559            writeln!(output, "```text")?;
560            writeln!(
561                output,
562                "<Could not read file content (e.g., binary file or permission error)>"
563            )?;
564            writeln!(output, "```")?;
565        }
566    }
567
568    Ok(())
569}
570
571/// Write tree-sitter enrichment (signatures, structure) after file content.
572#[allow(unused_variables)]
573pub fn write_tree_sitter_enrichment(
574    output: &mut impl Write,
575    content: &str,
576    extension: &str,
577    ts_config: &TreeSitterConfig,
578) -> io::Result<()> {
579    if !ts_config.signatures && !ts_config.structure {
580        return Ok(());
581    }
582
583    #[cfg(feature = "tree-sitter-base")]
584    {
585        use crate::tree_sitter::language_support::Visibility;
586
587        let vis_filter: Visibility = ts_config.visibility.parse().unwrap_or(Visibility::All);
588
589        if ts_config.structure
590            && let Some(structure) =
591                crate::tree_sitter::extract_structure_for_file(content, extension)
592        {
593            let summary = crate::tree_sitter::structure::format_structure_as_markdown(&structure);
594            if !summary.is_empty() {
595                writeln!(output)?;
596                write!(output, "{}", summary)?;
597            }
598        }
599
600        if ts_config.signatures
601            && let Some(signatures) =
602                crate::tree_sitter::extract_signatures_for_file(content, extension, vis_filter)
603            && !signatures.is_empty()
604        {
605            let language = match extension {
606                "rs" => "rust",
607                "js" | "mjs" | "cjs" => "javascript",
608                "ts" | "tsx" | "mts" | "cts" => "typescript",
609                "py" | "pyw" => "python",
610                "go" => "go",
611                "java" => "java",
612                "c" | "h" => "c",
613                "cpp" | "cxx" | "cc" | "hpp" | "hxx" | "hh" => "cpp",
614                _ => extension,
615            };
616            writeln!(output)?;
617            writeln!(output, "**Signatures:**")?;
618            writeln!(output)?;
619            let formatted = crate::tree_sitter::signatures::format_signatures_as_markdown(
620                &signatures,
621                language,
622            );
623            write!(output, "{}", formatted)?;
624        }
625    }
626
627    #[cfg(not(feature = "tree-sitter-base"))]
628    {
629        // Tree-sitter not compiled in — flags have no effect.
630        // Warning is printed once at startup in lib.rs.
631    }
632
633    Ok(())
634}
635
636/// Detect text encoding using heuristics for common encodings
637fn detect_text_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
638    // Try common encodings
639    let encodings = [
640        encoding_rs::WINDOWS_1252,
641        encoding_rs::UTF_16LE,
642        encoding_rs::UTF_16BE,
643        encoding_rs::SHIFT_JIS,
644    ];
645
646    for encoding in &encodings {
647        let (decoded, _, had_errors) = encoding.decode(bytes);
648        if !had_errors && is_likely_text(&decoded) {
649            return Some(encoding);
650        }
651    }
652
653    None
654}
655
/// Heuristically decides whether decoded `content` is human-readable text.
///
/// Control characters other than `\n`, `\r` and `\t` are counted. The content
/// is rejected as soon as, past the first 100 characters, more than 5% of the
/// characters seen so far are such controls; otherwise the same 5% limit is
/// applied to the whole input at the end. Empty input counts as text.
fn is_likely_text(content: &str) -> bool {
    let is_suspicious = |c: char| c.is_control() && !matches!(c, '\n' | '\r' | '\t');

    let mut seen = 0;
    let mut suspicious = 0;

    for c in content.chars() {
        seen += 1;
        if is_suspicious(c) {
            suspicious += 1;
        }

        // Bail out early once the running ratio exceeds 5%.
        let past_warmup = seen > 100;
        if past_warmup && suspicious * 20 > seen {
            return false;
        }
    }

    // Short inputs never hit the early exit; apply the 5% limit here.
    seen == 0 || suspicious * 20 <= seen
}
680
681/// Transcode file content from detected encoding to UTF-8
682fn transcode_file_content(file_path: &Path, encoding: &'static Encoding) -> io::Result<String> {
683    let bytes = std::fs::read(file_path)?;
684    let (decoded, _, had_errors) = encoding.decode(&bytes);
685
686    if had_errors {
687        return Err(io::Error::new(
688            io::ErrorKind::InvalidData,
689            format!("Failed to decode file with encoding {}", encoding.name()),
690        ));
691    }
692
693    Ok(decoded.into_owned())
694}
695
/// Emits `content` as a fenced Markdown code block tagged with `language`.
///
/// With `line_numbers` set, every line is prefixed by its 1-based number,
/// right-aligned to four columns and followed by ` | `. Otherwise the content
/// is copied verbatim and a newline is appended if it does not already end
/// with one, so the closing fence always starts on its own line.
fn write_text_content(
    output: &mut impl Write,
    content: &str,
    language: &str,
    line_numbers: bool,
) -> io::Result<()> {
    writeln!(output, "```{}", language)?;

    if !line_numbers {
        output.write_all(content.as_bytes())?;
        let needs_newline = !content.ends_with('\n');
        if needs_newline {
            writeln!(output)?;
        }
    } else {
        let mut number = 0usize;
        for line in content.lines() {
            number += 1;
            writeln!(output, "{:>4} | {}", number, line)?;
        }
    }

    writeln!(output, "```")
}
719
720#[cfg(test)]
721mod tests {
722    use super::*;
723    use std::fs;
724    use tempfile::tempdir;
725
726    #[test]
727    fn test_code_block_formatting() {
728        let dir = tempdir().unwrap();
729        let base_path = dir.path();
730        let file_path = base_path.join("test.rs");
731        let output_path = base_path.join("output.md");
732
733        // Create a test Rust file
734        fs::write(
735            &file_path,
736            "fn main() {\n    println!(\"Hello, world!\");\n}",
737        )
738        .unwrap();
739
740        // Create an output file
741        let mut output = fs::File::create(&output_path).unwrap();
742
743        // Process the file
744        process_file(
745            base_path,
746            &file_path,
747            &mut output,
748            false,
749            None,
750            &TreeSitterConfig::default(),
751        )
752        .unwrap();
753
754        // Read the output
755        let content = fs::read_to_string(&output_path).unwrap();
756
757        // Check that code blocks are properly formatted
758        assert!(content.contains("```rust"));
759        assert!(content.contains("```") && content.matches("```").count() >= 2);
760    }
761
762    #[test]
763    fn test_markdown_file_formatting() {
764        let dir = tempdir().unwrap();
765        let base_path = dir.path();
766        let file_path = base_path.join("README.md");
767        let output_path = base_path.join("output.md");
768
769        // Create a test Markdown file
770        fs::write(&file_path, "# Test\n\nThis is a test markdown file.").unwrap();
771
772        // Create an output file
773        let mut output = fs::File::create(&output_path).unwrap();
774
775        // Process the file
776        process_file(
777            base_path,
778            &file_path,
779            &mut output,
780            false,
781            None,
782            &TreeSitterConfig::default(),
783        )
784        .unwrap();
785
786        // Read the output
787        let content = fs::read_to_string(&output_path).unwrap();
788
789        // Debug prints the content
790        println!("Generated content:\n{}", content);
791
792        // Check that markdown files use the correct language identifier
793        assert!(
794            content.contains("```markdown"),
795            "Content should contain '```markdown' but was: {}",
796            content
797        );
798        // Count the number of code block markers
799        let code_block_markers = content.matches("```").count();
800
801        assert!(
802            code_block_markers >= 2,
803            "Expected at least 2 code block markers, found {}",
804            code_block_markers
805        );
806    }
807
808    #[test]
809    fn test_line_numbered_code_blocks() {
810        let dir = tempdir().unwrap();
811        let base_path = dir.path();
812        let file_path = base_path.join("lib.rs");
813        let output_path = base_path.join("out.md");
814
815        // Create a multi-line Rust file
816        fs::write(
817                    &file_path,
818                    "fn add(a: i32, b: i32) -> i32 {\n    a + b\n}\n\nfn main() {\n    println!(\"{}\", add(1, 2));\n}\n",
819                )
820                .unwrap();
821
822        let mut output = fs::File::create(&output_path).unwrap();
823        process_file(
824            base_path,
825            &file_path,
826            &mut output,
827            true,
828            None,
829            &TreeSitterConfig::default(),
830        )
831        .unwrap();
832
833        let content = fs::read_to_string(&output_path).unwrap();
834
835        // Check language and line numbers prefix
836        assert!(content.contains("```rust"));
837        assert!(content.contains("   1 | "));
838        assert!(content.contains("   2 | "));
839
840        // Count lines with "|" prefix equals number of lines in an original file
841        let numbered_lines = content
842            .lines()
843            .filter(|l| {
844                l.trim_start()
845                    .chars()
846                    .next()
847                    .map(|c| c.is_ascii_digit())
848                    .unwrap_or(false)
849                    && l.contains(" | ")
850            })
851            .count();
852        let original_line_count = fs::read_to_string(&file_path).unwrap().lines().count();
853        assert_eq!(numbered_lines, original_line_count);
854
855        // Ensure code fence closes
856        assert!(content.contains("```"));
857    }
858
859    #[test]
860    fn test_binary_file_handling() {
861        let dir = tempdir().unwrap();
862        let base_path = dir.path();
863        let file_path = base_path.join("image.bin");
864        let output_path = base_path.join("out.md");
865
866        // Write truly binary data that won't be decoded by encoding detection
867        let bytes = vec![
868            0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG header
869            0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, // PNG chunk
870            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, // More binary data
871            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Null bytes
872        ];
873        fs::write(&file_path, bytes).unwrap();
874
875        let mut output = fs::File::create(&output_path).unwrap();
876        process_file(
877            base_path,
878            &file_path,
879            &mut output,
880            false,
881            None,
882            &TreeSitterConfig::default(),
883        )
884        .unwrap();
885
886        let content = fs::read_to_string(&output_path).unwrap();
887
888        // Expect a text block to fall back with a helpful message
889        assert!(content.contains("```text"));
890        assert!(content.contains("<Binary file or unsupported encoding:"));
891
892        // Ensure the code block is closed
893        let fence_count = content.matches("```").count();
894        assert!(
895            fence_count >= 2,
896            "expected at least opening and closing fences, got {}",
897            fence_count
898        );
899    }
900
901    #[test]
902    fn test_encoding_detection_and_transcoding() {
903        let dir = tempdir().unwrap();
904        let base_path = dir.path();
905        let output_path = base_path.join("out.md");
906
907        // Test Windows-1252 encoded file (common in Windows)
908        let windows1252_content = [
909            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, // "Hello "
910            0x93, 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x94, // "World" with smart quotes
911            0x0A, // newline
912        ];
913        let file_path = base_path.join("windows1252.txt");
914        fs::write(&file_path, windows1252_content).unwrap();
915
916        let mut output = fs::File::create(&output_path).unwrap();
917        process_file(
918            base_path,
919            &file_path,
920            &mut output,
921            false,
922            Some("detect"),
923            &TreeSitterConfig::default(),
924        )
925        .unwrap();
926
927        let content = fs::read_to_string(&output_path).unwrap();
928
929        // Should contain transcoded content with UTF-8 equivalents
930        assert!(content.contains("Hello"));
931        assert!(content.contains("World"));
932        // Should use text language
933        assert!(content.contains("```txt"));
934
935        // Ensure the code block is closed
936        let fence_count = content.matches("```").count();
937        assert!(
938            fence_count >= 2,
939            "expected at least opening and closing fences, got {}",
940            fence_count
941        );
942    }
943
944    #[test]
945    fn test_encoding_strategy_strict() {
946        let dir = tempdir().unwrap();
947        let base_path = dir.path();
948        let output_path = base_path.join("out.md");
949
950        // Create a file with non-UTF-8 content
951        let non_utf8_content = [0xFF, 0xFE, 0x41, 0x00]; // UTF-16 LE BOM + "A"
952        let file_path = base_path.join("utf16.txt");
953        fs::write(&file_path, non_utf8_content).unwrap();
954
955        let mut output = fs::File::create(&output_path).unwrap();
956        process_file(
957            base_path,
958            &file_path,
959            &mut output,
960            false,
961            Some("strict"),
962            &TreeSitterConfig::default(),
963        )
964        .unwrap();
965
966        let content = fs::read_to_string(&output_path).unwrap();
967
968        // Should contain binary file placeholder
969        assert!(content.contains("<Binary file or unsupported encoding:"));
970        assert!(content.contains("```text"));
971
972        // Ensure the code block is closed
973        let fence_count = content.matches("```").count();
974        assert!(
975            fence_count >= 2,
976            "expected at least opening and closing fences, got {}",
977            fence_count
978        );
979    }
980
981    #[test]
982    fn test_encoding_strategy_skip() {
983        let dir = tempdir().unwrap();
984        let base_path = dir.path();
985        let output_path = base_path.join("out.md");
986
987        // Create a file with UTF-16 content
988        let utf16_content = [0xFF, 0xFE, 0x48, 0x00, 0x69, 0x00]; // UTF-16 LE "Hi"
989        let file_path = base_path.join("utf16.txt");
990        fs::write(&file_path, utf16_content).unwrap();
991
992        let mut output = fs::File::create(&output_path).unwrap();
993        process_file(
994            base_path,
995            &file_path,
996            &mut output,
997            false,
998            Some("skip"),
999            &TreeSitterConfig::default(),
1000        )
1001        .unwrap();
1002
1003        let content = fs::read_to_string(&output_path).unwrap();
1004
1005        // Should contain binary file placeholder (skipped transcoding)
1006        assert!(content.contains("<Binary file or unsupported encoding:"));
1007        assert!(content.contains("```text"));
1008    }
1009
1010    #[test]
1011    fn test_generate_markdown_with_current_directory() {
1012        let dir = tempdir().unwrap();
1013        let base_path = dir.path();
1014        let output_path = base_path.join("test.md");
1015
1016        // Create test files
1017        fs::write(base_path.join("readme.txt"), "Hello world").unwrap();
1018
1019        // Collect files
1020        let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
1021        let file_tree = crate::tree::build_file_tree(&files, base_path);
1022
1023        // Change to the test directory
1024        let original_dir = std::env::current_dir().unwrap();
1025        std::env::set_current_dir(base_path).unwrap();
1026
1027        // Test with "." as input directory
1028        let result = generate_markdown(
1029            &output_path.to_string_lossy(),
1030            ".",
1031            &[],
1032            &[],
1033            &file_tree,
1034            &files,
1035            base_path,
1036            false,
1037            None,
1038            None, // max_tokens
1039            &TreeSitterConfig::default(),
1040        );
1041
1042        // Restore original directory
1043        std::env::set_current_dir(original_dir).unwrap();
1044
1045        assert!(result.is_ok());
1046        let content = fs::read_to_string(&output_path).unwrap();
1047        assert!(content.contains("Directory Structure Report"));
1048    }
1049
1050    #[test]
1051    fn test_generate_markdown_creates_output_directory() {
1052        let dir = tempdir().unwrap();
1053        let base_path = dir.path();
1054        let nested_output = base_path.join("nested").join("deep").join("output.md");
1055
1056        // Create test files
1057        fs::write(base_path.join("test.txt"), "content").unwrap();
1058
1059        let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
1060        let file_tree = crate::tree::build_file_tree(&files, base_path);
1061
1062        let result = generate_markdown(
1063            &nested_output.to_string_lossy(),
1064            "test_dir",
1065            &[],
1066            &[],
1067            &file_tree,
1068            &files,
1069            base_path,
1070            false,
1071            None,
1072            None, // max_tokens
1073            &TreeSitterConfig::default(),
1074        );
1075
1076        assert!(result.is_ok());
1077        assert!(nested_output.exists());
1078        assert!(nested_output.parent().unwrap().exists());
1079    }
1080
1081    #[test]
1082    fn test_generate_markdown_with_filters_and_ignores() {
1083        let dir = tempdir().unwrap();
1084        let base_path = dir.path();
1085        let output_path = base_path.join("filtered.md");
1086
1087        fs::write(base_path.join("main.rs"), "fn main() {}").unwrap();
1088        fs::write(base_path.join("config.toml"), "[package]").unwrap();
1089        fs::write(base_path.join("readme.md"), "# README").unwrap();
1090
1091        let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
1092        let file_tree = crate::tree::build_file_tree(&files, base_path);
1093
1094        let result = generate_markdown(
1095            &output_path.to_string_lossy(),
1096            "project",
1097            &["rs".to_string(), "toml".to_string()],
1098            &["readme.md".to_string()],
1099            &file_tree,
1100            &files,
1101            base_path,
1102            true,
1103            Some("strict"),
1104            None, // max_tokens
1105            &TreeSitterConfig::default(),
1106        );
1107
1108        assert!(result.is_ok());
1109        let content = fs::read_to_string(&output_path).unwrap();
1110        assert!(content.contains("Directory Structure Report"));
1111        // The actual generate_markdown function doesn't format filters/ignores this way
1112        assert!(content.contains("main.rs") || content.contains("config.toml"));
1113    }
1114
1115    #[test]
1116    fn test_write_text_content_with_line_numbers() {
1117        let mut output = Vec::new();
1118        let content = "line one\nline two\nline three";
1119
1120        write_text_content(&mut output, content, "rust", true).unwrap();
1121
1122        let result = String::from_utf8(output).unwrap();
1123        assert!(result.contains("```rust"));
1124        assert!(result.contains("   1 | line one"));
1125        assert!(result.contains("   2 | line two"));
1126        assert!(result.contains("   3 | line three"));
1127        assert!(result.contains("```"));
1128    }
1129
1130    #[test]
1131    fn test_write_text_content_without_line_numbers() {
1132        let mut output = Vec::new();
1133        let content = "function test() {\n  return true;\n}";
1134
1135        write_text_content(&mut output, content, "javascript", false).unwrap();
1136
1137        let result = String::from_utf8(output).unwrap();
1138        assert!(result.contains("```javascript"));
1139        assert!(result.contains("function test() {"));
1140        assert!(result.contains("  return true;"));
1141        assert!(result.contains("```"));
1142        assert!(!result.contains(" | ")); // No line number prefix
1143    }
1144
1145    #[test]
1146    fn test_write_text_content_without_trailing_newline() {
1147        let mut output = Vec::new();
1148        let content = "no newline at end"; // No \n at end
1149
1150        write_text_content(&mut output, content, "text", false).unwrap();
1151
1152        let result = String::from_utf8(output).unwrap();
1153        assert!(result.contains("```text"));
1154        assert!(result.contains("no newline at end"));
1155        assert!(result.ends_with("```\n")); // Should add newline
1156    }
1157
1158    #[test]
1159    fn test_is_likely_text() {
1160        // Normal text should be considered text
1161        assert!(is_likely_text("Hello world\nThis is normal text"));
1162
1163        // Text with some control characters should still be text
1164        assert!(is_likely_text(
1165            "Line 1\nLine 2\tTabbed\r\nWindows line ending"
1166        ));
1167
1168        // Text with too many control characters should not be text
1169        let mut bad_text = String::new();
1170        for i in 0..200 {
1171            if i % 5 == 0 {
1172                bad_text.push('\x01'); // Control character
1173            } else {
1174                bad_text.push('a');
1175            }
1176        }
1177        assert!(!is_likely_text(&bad_text));
1178
1179        // Empty string should be considered text
1180        assert!(is_likely_text(""));
1181    }
1182
1183    #[test]
1184    fn test_detect_text_encoding() {
1185        // UTF-8 should return None (already UTF-8)
1186        let utf8_bytes = "Hello world".as_bytes();
1187        let result = detect_text_encoding(utf8_bytes);
1188        // The function may return an encoding even for UTF-8 text if it detects it differently
1189        // Just verify it doesn't crash
1190        assert!(result.is_some() || result.is_none());
1191
1192        // Windows-1252 encoded text should be detected
1193        let windows1252_bytes = [
1194            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x93, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x94,
1195        ];
1196        let detected = detect_text_encoding(&windows1252_bytes);
1197        assert!(detected.is_some());
1198    }
1199
1200    #[test]
1201    fn test_transcode_file_content() {
1202        let dir = tempdir().unwrap();
1203        let file_path = dir.path().join("windows1252.txt");
1204
1205        // Write Windows-1252 encoded content
1206        let windows1252_content = [
1207            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, // "Hello "
1208            0x93, 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x94, // "World" with smart quotes
1209        ];
1210        fs::write(&file_path, windows1252_content).unwrap();
1211
1212        let result = transcode_file_content(&file_path, encoding_rs::WINDOWS_1252);
1213        assert!(result.is_ok());
1214
1215        let transcoded = result.unwrap();
1216        assert!(transcoded.contains("Hello"));
1217        assert!(transcoded.contains("World"));
1218    }
1219
1220    #[test]
1221    fn test_process_file_with_metadata_error() {
1222        let dir = tempdir().unwrap();
1223        let base_path = dir.path();
1224        let nonexistent_file = base_path.join("nonexistent.txt");
1225        let output_path = base_path.join("output.md");
1226
1227        let mut output = fs::File::create(&output_path).unwrap();
1228
1229        // This should handle the metadata error gracefully
1230        let result = process_file(
1231            base_path,
1232            &nonexistent_file,
1233            &mut output,
1234            false,
1235            None,
1236            &TreeSitterConfig::default(),
1237        );
1238        assert!(result.is_ok());
1239
1240        // Output should be minimal since file doesn't exist
1241        let content = fs::read_to_string(&output_path).unwrap();
1242        assert!(content.is_empty() || content.trim().is_empty());
1243    }
1244
1245    #[test]
1246    fn test_process_file_with_different_extensions() {
1247        let dir = tempdir().unwrap();
1248        let base_path = dir.path();
1249        let output_path = base_path.join("output.md");
1250
1251        // Test various file extensions
1252        let test_files = [
1253            ("script.py", "print('hello')", "python"),
1254            ("data.json", r#"{"key": "value"}"#, "json"),
1255            ("config.yaml", "key: value", "yaml"),
1256            ("style.css", "body { margin: 0; }", "css"),
1257            ("page.html", "<html><body>Test</body></html>", "html"),
1258            ("query.sql", "SELECT * FROM users;", "sql"),
1259            ("build.sh", "#!/bin/bash\necho 'building'", "bash"),
1260            ("unknown.xyz", "unknown content", "xyz"),
1261        ];
1262
1263        for (filename, content, expected_lang) in test_files.iter() {
1264            let file_path = base_path.join(filename);
1265            fs::write(&file_path, content).unwrap();
1266
1267            let mut output = fs::File::create(&output_path).unwrap();
1268            process_file(
1269                base_path,
1270                &file_path,
1271                &mut output,
1272                false,
1273                None,
1274                &TreeSitterConfig::default(),
1275            )
1276            .unwrap();
1277
1278            let result = fs::read_to_string(&output_path).unwrap();
1279            assert!(result.contains(&format!("```{}", expected_lang)));
1280            assert!(result.contains(content));
1281            assert!(result.contains(filename));
1282        }
1283    }
1284}
context_builder/markdown.rs

context_builder/
markdown.rs