context_builder/
markdown.rs

1use chrono::Utc;
2use ignore::DirEntry;
3use log::{error, info, warn};
4use std::fs;
5use std::io::{self, Read, Seek, SeekFrom, Write};
6use std::path::Path;
7
8use crate::tree::{FileTree, write_tree_to_file};
9use encoding_rs::{Encoding, UTF_8};
10
11#[cfg(feature = "parallel")]
12use crossbeam_channel::{Receiver, Sender, bounded};
13#[cfg(feature = "parallel")]
14use std::thread;
15
/// Options controlling the optional tree-sitter based additions to each file section.
///
/// The derived `Default` leaves both flags off and both mode strings empty.
#[derive(Clone, Debug, Default)]
pub struct TreeSitterConfig {
    /// When `true`, emit only signatures (function/type declarations) and
    /// leave out the full file content.
    pub signatures: bool,
    /// When `true`, add a per-file structure summary (counts of functions, structs, etc.).
    pub structure: bool,
    /// Truncation mode: `"smart"` asks for truncation on AST boundaries; any
    /// other value means byte truncation.
    pub truncate: String,
    /// Visibility filter applied to extracted signatures: `"public"`,
    /// `"private"`, or `"all"`.
    pub visibility: String,
}
28
29/// Generates the final Markdown file.
30#[allow(clippy::too_many_arguments, unused_variables)]
31pub fn generate_markdown(
32    output_path: &str,
33    input_dir: &str,
34    filters: &[String],
35    ignores: &[String],
36    file_tree: &FileTree,
37    files: &[DirEntry],
38    base_path: &Path,
39    line_numbers: bool,
40    encoding_strategy: Option<&str>,
41    max_tokens: Option<usize>,
42    ts_config: &TreeSitterConfig,
43) -> io::Result<()> {
44    if let Some(parent) = Path::new(output_path).parent()
45        && !parent.exists()
46    {
47        fs::create_dir_all(parent)?;
48    }
49
50    let mut output = fs::File::create(output_path)?;
51
52    let input_dir_name = if input_dir == "." {
53        let current_dir = std::env::current_dir()?;
54        current_dir
55            .file_name()
56            .and_then(|n| n.to_str())
57            .unwrap_or_else(|| current_dir.to_str().unwrap_or("project"))
58            .to_string()
59    } else {
60        input_dir.to_string()
61    };
62
63    // --- Header --- //
64    writeln!(output, "# Directory Structure Report\n")?;
65
66    if !filters.is_empty() {
67        writeln!(
68            output,
69            "This document contains files from the `{}` directory with extensions: {}",
70            input_dir_name,
71            filters.join(", ")
72        )?;
73    } else {
74        writeln!(
75            output,
76            "This document contains all files from the `{}` directory, optimized for LLM consumption.",
77            input_dir_name
78        )?;
79    }
80
81    if !ignores.is_empty() {
82        writeln!(output, "Custom ignored patterns: {}", ignores.join(", "))?;
83    }
84
85    // Deterministic content hash (enables LLM prompt caching across runs)
86    // Uses xxh3 over file content bytes — stable across Rust versions and machines.
87    // Previous implementation hashed mtime (broken by git checkout, cp, etc.)
88    let mut content_hasher = xxhash_rust::xxh3::Xxh3::new();
89    for entry in files {
90        // Hash relative unix-style path for cross-OS determinism.
91        // Using absolute or OS-native paths would produce different hashes
92        // on different machines or operating systems.
93        let rel_path = entry.path().strip_prefix(base_path).unwrap_or(entry.path());
94        let normalized = rel_path.to_string_lossy().replace('\\', "/");
95        content_hasher.update(normalized.as_bytes());
96        // Null delimiter prevents collision: path="a" content="bc" vs path="ab" content="c"
97        content_hasher.update(b"\0");
98        // Hash actual file content (not mtime!) for determinism
99        if let Ok(bytes) = std::fs::read(entry.path()) {
100            content_hasher.update(&bytes);
101        }
102        content_hasher.update(b"\0");
103    }
104    writeln!(output, "Content hash: {:016x}", content_hasher.digest())?;
105    writeln!(output)?;
106
107    // --- File Tree --- //
108
109    writeln!(output, "## File Tree Structure\n")?;
110
111    write_tree_to_file(&mut output, file_tree, 0)?;
112
113    writeln!(output)?;
114
115    // (No '## Files' heading here; it will be injected later only once during final composition)
116    // (Diff section will be conditionally inserted later by the auto_diff logic in lib.rs)
117
118    #[cfg(feature = "parallel")]
119    {
120        use rayon::prelude::*;
121
122        // Create a bounded channel for ordered chunks
123        type ChunkResult = (usize, io::Result<Vec<u8>>);
124        let (sender, receiver): (Sender<ChunkResult>, Receiver<ChunkResult>) =
125            bounded(num_cpus::get() * 2); // Buffer size based on CPU count
126
127        let writer_handle = {
128            let mut output = output;
129            let total_files = files.len();
130            let budget = max_tokens;
131
132            thread::spawn(move || -> io::Result<()> {
133                let mut completed_chunks = std::collections::BTreeMap::new();
134                let mut next_index = 0;
135                let mut errors = Vec::new();
136                let mut tokens_used: usize = 0;
137                let mut budget_exceeded = false;
138
139                // Receive chunks and write them in order
140                while next_index < total_files {
141                    match receiver.recv() {
142                        Ok((index, chunk_result)) => {
143                            completed_chunks.insert(index, chunk_result);
144
145                            // Write all consecutive chunks starting from next_index
146                            while let Some(chunk_result) = completed_chunks.remove(&next_index) {
147                                if budget_exceeded {
148                                    // Already over budget — skip remaining chunks
149                                    next_index += 1;
150                                    continue;
151                                }
152
153                                match chunk_result {
154                                    Ok(buf) => {
155                                        // Estimate tokens for this chunk (~4 bytes per token)
156                                        let chunk_tokens = buf.len() / 4;
157
158                                        if let Some(max) = budget
159                                            && tokens_used + chunk_tokens > max
160                                            && tokens_used > 0
161                                        {
162                                            let remaining = total_files - next_index;
163                                            let notice = format!(
164                                                "---\n\n_⚠️ Token budget ({}) reached. {} remaining files omitted._\n\n",
165                                                max, remaining
166                                            );
167                                            if let Err(e) = output.write_all(notice.as_bytes()) {
168                                                errors.push(format!(
169                                                    "Failed to write truncation notice: {}",
170                                                    e
171                                                ));
172                                            }
173                                            budget_exceeded = true;
174                                            next_index += 1;
175                                            continue;
176                                        }
177
178                                        tokens_used += chunk_tokens;
179                                        if let Err(e) = output.write_all(&buf) {
180                                            errors.push(format!(
181                                                "Failed to write output for file index {}: {}",
182                                                next_index, e
183                                            ));
184                                        }
185                                    }
186                                    Err(e) => {
187                                        errors.push(format!(
188                                            "Failed to process file index {}: {}",
189                                            next_index, e
190                                        ));
191                                    }
192                                }
193                                next_index += 1;
194                            }
195                        }
196                        Err(_) => break, // Channel closed
197                    }
198                }
199
200                if !errors.is_empty() {
201                    error!(
202                        "Encountered {} errors during parallel processing:",
203                        errors.len()
204                    );
205                    for err in &errors {
206                        error!("  {}", err);
207                    }
208                    return Err(std::io::Error::other(format!(
209                        "Failed to process {} files: {}",
210                        errors.len(),
211                        errors.join("; ")
212                    )));
213                }
214
215                Ok(())
216            })
217        };
218
219        // Process files in parallel and send results to writer
220        let ts_config_clone = ts_config.clone();
221        files.par_iter().enumerate().for_each(|(index, entry)| {
222            let mut buf = Vec::new();
223            let result = process_file(
224                base_path,
225                entry.path(),
226                &mut buf,
227                line_numbers,
228                encoding_strategy,
229                &ts_config_clone,
230            )
231            .map(|_| buf);
232
233            // Send result to writer thread (ignore send errors - channel might be closed)
234            let _ = sender.send((index, result));
235        });
236
237        // Close the sender to signal completion
238        drop(sender);
239
240        // Wait for writer thread to complete and propagate any errors
241        writer_handle
242            .join()
243            .map_err(|_| std::io::Error::other("Writer thread panicked"))??;
244    }
245
246    #[cfg(not(feature = "parallel"))]
247    {
248        let mut tokens_used: usize = 0;
249
250        for (idx, entry) in files.iter().enumerate() {
251            // Estimate tokens for this file (~4 bytes per token)
252            let file_size = std::fs::metadata(entry.path())
253                .map(|m| m.len())
254                .unwrap_or(0);
255            let estimated_file_tokens = (file_size as usize) / 4;
256
257            if let Some(budget) = max_tokens {
258                if tokens_used + estimated_file_tokens > budget && tokens_used > 0 {
259                    let remaining = files.len() - idx;
260                    writeln!(output, "---\n")?;
261                    writeln!(
262                        output,
263                        "_⚠️ Token budget ({}) reached. {} remaining files omitted._\n",
264                        budget, remaining
265                    )?;
266                    break;
267                }
268            }
269
270            tokens_used += estimated_file_tokens;
271            process_file(
272                base_path,
273                entry.path(),
274                &mut output,
275                line_numbers,
276                encoding_strategy,
277                ts_config,
278            )?;
279        }
280    }
281
282    Ok(())
283}
284
285/// Processes a single file and writes its content to the output.
286pub fn process_file(
287    base_path: &Path,
288    file_path: &Path,
289    output: &mut impl Write,
290    line_numbers: bool,
291    encoding_strategy: Option<&str>,
292    ts_config: &TreeSitterConfig,
293) -> io::Result<()> {
294    let relative_path = file_path.strip_prefix(base_path).unwrap_or(file_path);
295    info!("Processing file: {}", relative_path.display());
296
297    let metadata = match fs::metadata(file_path) {
298        Ok(meta) => meta,
299        Err(e) => {
300            error!(
301                "Failed to get metadata for {}: {}",
302                relative_path.display(),
303                e
304            );
305            return Ok(());
306        }
307    };
308
309    let modified_time = metadata
310        .modified()
311        .ok()
312        .map(|time| {
313            let system_time: chrono::DateTime<Utc> = time.into();
314            system_time.format("%Y-%m-%d %H:%M:%S UTC").to_string()
315        })
316        .unwrap_or_else(|| "Unknown".to_string());
317
318    writeln!(output)?;
319    writeln!(output, "### File: `{}`", relative_path.display())?;
320
321    writeln!(output)?;
322
323    writeln!(output, "- Size: {} bytes", metadata.len())?;
324    writeln!(output, "- Modified: {}", modified_time)?;
325    writeln!(output)?;
326
327    // --- File Content --- //
328    let extension = file_path
329        .extension()
330        .and_then(|s| s.to_str())
331        .unwrap_or("text");
332    let language = match extension {
333        "rs" => "rust",
334        "js" => "javascript",
335        "ts" => "typescript",
336        "jsx" => "jsx",
337        "tsx" => "tsx",
338        "json" => "json",
339        "toml" => "toml",
340        "md" => "markdown",
341        "yaml" | "yml" => "yaml",
342        "html" => "html",
343        "css" => "css",
344        "py" => "python",
345        "java" => "java",
346        "cpp" => "cpp",
347        "c" => "c",
348        "h" => "c",
349        "hpp" => "cpp",
350        "sql" => "sql",
351        "sh" => "bash",
352        "xml" => "xml",
353        "lock" => "toml",
354        _ => extension,
355    };
356
357    // Enhanced binary file handling with encoding detection and transcoding
358    match fs::File::open(file_path) {
359        Ok(mut file) => {
360            let mut sniff = [0u8; 8192];
361            let n = match file.read(&mut sniff) {
362                Ok(n) => n,
363                Err(e) => {
364                    warn!(
365                        "Could not read file {}: {}. Skipping content.",
366                        relative_path.display(),
367                        e
368                    );
369
370                    writeln!(output, "```text")?;
371
372                    writeln!(
373                        output,
374                        "<Could not read file content (e.g., binary file or permission error)>"
375                    )?;
376
377                    writeln!(output, "```")?;
378
379                    return Ok(());
380                }
381            };
382            let slice = &sniff[..n];
383
384            // Find a valid UTF-8 boundary by backtracking up to 3 bytes.
385            // If the sniff buffer cuts a multi-byte char (e.g., emoji at byte 8191),
386            // from_utf8 would falsely classify the file as non-UTF-8.
387            let check_len = if n == sniff.len() {
388                // Buffer is full — may have split a multi-byte char at the end
389                let mut end = n;
390                while end > 0 && end > n.saturating_sub(4) && sniff[end - 1] & 0xC0 == 0x80 {
391                    end -= 1; // skip continuation bytes
392                }
393                // If we landed on a leading byte, check if the sequence is complete
394                if end > 0 && end < n {
395                    let leading = sniff[end - 1];
396                    let expected_len = if leading & 0xE0 == 0xC0 {
397                        2
398                    } else if leading & 0xF0 == 0xE0 {
399                        3
400                    } else if leading & 0xF8 == 0xF0 {
401                        4
402                    } else {
403                        1
404                    };
405                    if end - 1 + expected_len > n {
406                        end - 1 // incomplete char — exclude the leading byte too
407                    } else {
408                        n
409                    }
410                } else {
411                    n
412                }
413            } else {
414                n // didn't fill the buffer, so no boundary issue
415            };
416
417            // First check if it's valid UTF-8
418            let is_utf8 = std::str::from_utf8(&sniff[..check_len]).is_ok();
419
420            if is_utf8 && !slice.contains(&0) {
421                // Valid UTF-8 text file - proceed normally
422            } else {
423                // Try encoding detection for non-UTF-8 files
424                // If it's not UTF-8, try to detect the encoding
425                let (encoding, _consumed) =
426                    encoding_rs::Encoding::for_bom(slice).unwrap_or((encoding_rs::UTF_8, 0));
427
428                // If it's not UTF-8, try to detect the encoding
429                let detected_encoding = if encoding == UTF_8 {
430                    // Use chardet-like detection for common encodings
431                    detect_text_encoding(slice)
432                } else {
433                    Some(encoding)
434                };
435
436                match detected_encoding {
437                    Some(enc) if enc != UTF_8 => {
438                        let strategy = encoding_strategy.unwrap_or("detect");
439                        match strategy {
440                            "strict" | "skip" => {
441                                // Skip files with non-UTF-8 encoding
442                                warn!(
443                                    "Skipping non-UTF-8 file {} (encoding: {}, strategy: {})",
444                                    relative_path.display(),
445                                    enc.name(),
446                                    strategy
447                                );
448                            }
449                            _ => {
450                                // Default "detect" strategy: attempt to transcode
451                                match transcode_file_content(file_path, enc) {
452                                    Ok(transcoded_content) => {
453                                        info!(
454                                            "Successfully transcoded {} from {} to UTF-8",
455                                            relative_path.display(),
456                                            enc.name()
457                                        );
458                                        write_text_content(
459                                            output,
460                                            &transcoded_content,
461                                            language,
462                                            line_numbers,
463                                        )?;
464                                        return Ok(());
465                                    }
466                                    Err(e) => {
467                                        warn!(
468                                            "Failed to transcode {} from {}: {}. Treating as binary.",
469                                            relative_path.display(),
470                                            enc.name(),
471                                            e
472                                        );
473                                    }
474                                }
475                            }
476                        }
477                    }
478                    _ => {
479                        // Check if it's likely binary (contains null bytes)
480                        if slice.contains(&0) {
481                            warn!(
482                                "Detected binary file {} (contains null bytes). Skipping content.",
483                                relative_path.display()
484                            );
485                        } else {
486                            warn!(
487                                "Could not determine encoding for {}. Treating as binary.",
488                                relative_path.display()
489                            );
490                        }
491                    }
492                }
493
494                // Fallback to binary file placeholder
495                writeln!(output, "```text")?;
496                writeln!(
497                    output,
498                    "<Binary file or unsupported encoding: {} bytes>",
499                    metadata.len()
500                )?;
501                writeln!(output, "```")?;
502                return Ok(());
503            }
504
505            // Reset cursor and stream the content
506            if let Err(e) = file.seek(SeekFrom::Start(0)) {
507                warn!(
508                    "Could not reset file cursor for {}: {}. Skipping content.",
509                    relative_path.display(),
510                    e
511                );
512                writeln!(output, "```text")?;
513                writeln!(
514                    output,
515                    "<Could not read file content (e.g., binary file or permission error)>"
516                )?;
517                writeln!(output, "```")?;
518                return Ok(());
519            }
520
521            // Stream UTF-8 content
522            let content = match std::fs::read_to_string(file_path) {
523                Ok(content) => content,
524                Err(e) => {
525                    warn!(
526                        "Error reading file {}: {}. Output may be truncated.",
527                        relative_path.display(),
528                        e
529                    );
530                    writeln!(output, "```text")?;
531                    writeln!(output, "<Error reading file content>")?;
532                    writeln!(output, "```")?;
533                    return Ok(());
534                }
535            };
536            // When --signatures is active, replace file content with signatures-only output
537            let signatures_only = ts_config.signatures;
538
539            if !signatures_only {
540                // Note: Smart truncation (`truncate: "smart"`) indicates AST-boundary
541                // truncation should be preferred when content needs truncating.
542                // Without a per-file max_tokens budget, no truncation is applied.
543                // The flag is stored for future use when per-file token limits are implemented.
544                write_text_content(output, &content, language, line_numbers)?;
545            }
546
547            // Tree-sitter enrichment: signatures and/or structure
548            write_tree_sitter_enrichment(output, &content, extension, ts_config)?;
549        }
550        Err(e) => {
551            warn!(
552                "Could not open file {}: {}. Skipping content.",
553                relative_path.display(),
554                e
555            );
556            writeln!(output, "```text")?;
557            writeln!(
558                output,
559                "<Could not read file content (e.g., binary file or permission error)>"
560            )?;
561            writeln!(output, "```")?;
562        }
563    }
564
565    Ok(())
566}
567
568/// Write tree-sitter enrichment (signatures, structure) after file content.
569#[allow(unused_variables)]
570fn write_tree_sitter_enrichment(
571    output: &mut impl Write,
572    content: &str,
573    extension: &str,
574    ts_config: &TreeSitterConfig,
575) -> io::Result<()> {
576    if !ts_config.signatures && !ts_config.structure {
577        return Ok(());
578    }
579
580    #[cfg(feature = "tree-sitter-base")]
581    {
582        use crate::tree_sitter::language_support::Visibility;
583
584        let vis_filter: Visibility = ts_config.visibility.parse().unwrap_or(Visibility::All);
585
586        if ts_config.structure
587            && let Some(structure) =
588                crate::tree_sitter::extract_structure_for_file(content, extension)
589        {
590            let summary = crate::tree_sitter::structure::format_structure_as_markdown(&structure);
591            if !summary.is_empty() {
592                writeln!(output)?;
593                write!(output, "{}", summary)?;
594            }
595        }
596
597        if ts_config.signatures
598            && let Some(signatures) =
599                crate::tree_sitter::extract_signatures_for_file(content, extension, vis_filter)
600            && !signatures.is_empty()
601        {
602            let language = match extension {
603                "rs" => "rust",
604                "js" | "mjs" | "cjs" => "javascript",
605                "ts" | "tsx" | "mts" | "cts" => "typescript",
606                "py" | "pyw" => "python",
607                "go" => "go",
608                "java" => "java",
609                "c" | "h" => "c",
610                "cpp" | "cxx" | "cc" | "hpp" | "hxx" | "hh" => "cpp",
611                _ => extension,
612            };
613            writeln!(output)?;
614            writeln!(output, "**Signatures:**")?;
615            writeln!(output)?;
616            let formatted = crate::tree_sitter::signatures::format_signatures_as_markdown(
617                &signatures,
618                language,
619            );
620            write!(output, "{}", formatted)?;
621        }
622    }
623
624    #[cfg(not(feature = "tree-sitter-base"))]
625    {
626        // Tree-sitter not compiled in — flags have no effect.
627        // Warning is printed once at startup in lib.rs.
628    }
629
630    Ok(())
631}
632
633/// Detect text encoding using heuristics for common encodings
634fn detect_text_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
635    // Try common encodings
636    let encodings = [
637        encoding_rs::WINDOWS_1252,
638        encoding_rs::UTF_16LE,
639        encoding_rs::UTF_16BE,
640        encoding_rs::SHIFT_JIS,
641    ];
642
643    for encoding in &encodings {
644        let (decoded, _, had_errors) = encoding.decode(bytes);
645        if !had_errors && is_likely_text(&decoded) {
646            return Some(encoding);
647        }
648    }
649
650    None
651}
652
/// Heuristically decides whether decoded `content` is text.
///
/// Control characters other than `\n`, `\r` and `\t` count as suspicious.
/// Content is accepted when at most 5% of its characters are suspicious.
/// The scan gives up early, returning `false`, as soon as the ratio is
/// exceeded after more than 100 characters have been seen. Empty content is
/// accepted.
fn is_likely_text(content: &str) -> bool {
    let is_suspicious = |ch: char| ch.is_control() && !matches!(ch, '\n' | '\r' | '\t');

    let mut suspicious = 0;
    let mut seen = 0;

    for ch in content.chars() {
        seen += 1;
        suspicious += i32::from(is_suspicious(ch));

        // Past the first 100 characters, bail out once more than 1 in 20 is suspicious.
        if seen > 100 && suspicious * 20 > seen {
            return false;
        }
    }

    // Short inputs get the same 5% allowance; nothing at all counts as text.
    seen == 0 || suspicious * 20 <= seen
}
677
678/// Transcode file content from detected encoding to UTF-8
679fn transcode_file_content(file_path: &Path, encoding: &'static Encoding) -> io::Result<String> {
680    let bytes = std::fs::read(file_path)?;
681    let (decoded, _, had_errors) = encoding.decode(&bytes);
682
683    if had_errors {
684        return Err(io::Error::new(
685            io::ErrorKind::InvalidData,
686            format!("Failed to decode file with encoding {}", encoding.name()),
687        ));
688    }
689
690    Ok(decoded.into_owned())
691}
692
/// Writes `content` as a fenced Markdown code block tagged with `language`.
///
/// With `line_numbers` set, every line is prefixed with its 1-based number,
/// right-aligned to four columns and followed by ` | `. Otherwise the content
/// is written verbatim and a trailing newline is added when missing, so the
/// closing fence always starts on its own line.
///
/// The fence is normally three backticks. In verbatim mode, if a content line
/// itself starts (after optional indentation) with three or more backticks,
/// as Markdown files with code samples do, the fence is lengthened to one
/// backtick more than the longest such run, so the embedded fence cannot
/// close the block early. Numbered lines never start with a backtick, so they
/// always use the three-backtick fence.
///
/// # Errors
///
/// Returns any error raised while writing to `output`.
fn write_text_content(
    output: &mut impl Write,
    content: &str,
    language: &str,
    line_numbers: bool,
) -> io::Result<()> {
    // Longest run of backticks that opens a line in what will be written verbatim.
    let longest_leading_run = if line_numbers {
        0
    } else {
        content
            .lines()
            .map(|line| {
                line.trim_start()
                    .bytes()
                    .take_while(|&byte| byte == b'`')
                    .count()
            })
            .max()
            .unwrap_or(0)
    };
    // At least three backticks, and always longer than any run found above.
    let fence = "`".repeat(longest_leading_run.max(2) + 1);

    writeln!(output, "{}{}", fence, language)?;

    if line_numbers {
        for (i, line) in content.lines().enumerate() {
            writeln!(output, "{:>4} | {}", i + 1, line)?;
        }
    } else {
        output.write_all(content.as_bytes())?;
        if !content.ends_with('\n') {
            writeln!(output)?;
        }
    }

    writeln!(output, "{}", fence)
}
716
717#[cfg(test)]
718mod tests {
719    use super::*;
720    use std::fs;
721    use tempfile::tempdir;
722
723    #[test]
724    fn test_code_block_formatting() {
725        let dir = tempdir().unwrap();
726        let base_path = dir.path();
727        let file_path = base_path.join("test.rs");
728        let output_path = base_path.join("output.md");
729
730        // Create a test Rust file
731        fs::write(
732            &file_path,
733            "fn main() {\n    println!(\"Hello, world!\");\n}",
734        )
735        .unwrap();
736
737        // Create an output file
738        let mut output = fs::File::create(&output_path).unwrap();
739
740        // Process the file
741        process_file(
742            base_path,
743            &file_path,
744            &mut output,
745            false,
746            None,
747            &TreeSitterConfig::default(),
748        )
749        .unwrap();
750
751        // Read the output
752        let content = fs::read_to_string(&output_path).unwrap();
753
754        // Check that code blocks are properly formatted
755        assert!(content.contains("```rust"));
756        assert!(content.contains("```") && content.matches("```").count() >= 2);
757    }
758
759    #[test]
760    fn test_markdown_file_formatting() {
761        let dir = tempdir().unwrap();
762        let base_path = dir.path();
763        let file_path = base_path.join("README.md");
764        let output_path = base_path.join("output.md");
765
766        // Create a test Markdown file
767        fs::write(&file_path, "# Test\n\nThis is a test markdown file.").unwrap();
768
769        // Create an output file
770        let mut output = fs::File::create(&output_path).unwrap();
771
772        // Process the file
773        process_file(
774            base_path,
775            &file_path,
776            &mut output,
777            false,
778            None,
779            &TreeSitterConfig::default(),
780        )
781        .unwrap();
782
783        // Read the output
784        let content = fs::read_to_string(&output_path).unwrap();
785
786        // Debug prints the content
787        println!("Generated content:\n{}", content);
788
789        // Check that markdown files use the correct language identifier
790        assert!(
791            content.contains("```markdown"),
792            "Content should contain '```markdown' but was: {}",
793            content
794        );
795        // Count the number of code block markers
796        let code_block_markers = content.matches("```").count();
797
798        assert!(
799            code_block_markers >= 2,
800            "Expected at least 2 code block markers, found {}",
801            code_block_markers
802        );
803    }
804
805    #[test]
806    fn test_line_numbered_code_blocks() {
807        let dir = tempdir().unwrap();
808        let base_path = dir.path();
809        let file_path = base_path.join("lib.rs");
810        let output_path = base_path.join("out.md");
811
812        // Create a multi-line Rust file
813        fs::write(
814                    &file_path,
815                    "fn add(a: i32, b: i32) -> i32 {\n    a + b\n}\n\nfn main() {\n    println!(\"{}\", add(1, 2));\n}\n",
816                )
817                .unwrap();
818
819        let mut output = fs::File::create(&output_path).unwrap();
820        process_file(
821            base_path,
822            &file_path,
823            &mut output,
824            true,
825            None,
826            &TreeSitterConfig::default(),
827        )
828        .unwrap();
829
830        let content = fs::read_to_string(&output_path).unwrap();
831
832        // Check language and line numbers prefix
833        assert!(content.contains("```rust"));
834        assert!(content.contains("   1 | "));
835        assert!(content.contains("   2 | "));
836
837        // Count lines with "|" prefix equals number of lines in an original file
838        let numbered_lines = content
839            .lines()
840            .filter(|l| {
841                l.trim_start()
842                    .chars()
843                    .next()
844                    .map(|c| c.is_ascii_digit())
845                    .unwrap_or(false)
846                    && l.contains(" | ")
847            })
848            .count();
849        let original_line_count = fs::read_to_string(&file_path).unwrap().lines().count();
850        assert_eq!(numbered_lines, original_line_count);
851
852        // Ensure code fence closes
853        assert!(content.contains("```"));
854    }
855
856    #[test]
857    fn test_binary_file_handling() {
858        let dir = tempdir().unwrap();
859        let base_path = dir.path();
860        let file_path = base_path.join("image.bin");
861        let output_path = base_path.join("out.md");
862
863        // Write truly binary data that won't be decoded by encoding detection
864        let bytes = vec![
865            0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG header
866            0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, // PNG chunk
867            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, // More binary data
868            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Null bytes
869        ];
870        fs::write(&file_path, bytes).unwrap();
871
872        let mut output = fs::File::create(&output_path).unwrap();
873        process_file(
874            base_path,
875            &file_path,
876            &mut output,
877            false,
878            None,
879            &TreeSitterConfig::default(),
880        )
881        .unwrap();
882
883        let content = fs::read_to_string(&output_path).unwrap();
884
885        // Expect a text block to fall back with a helpful message
886        assert!(content.contains("```text"));
887        assert!(content.contains("<Binary file or unsupported encoding:"));
888
889        // Ensure the code block is closed
890        let fence_count = content.matches("```").count();
891        assert!(
892            fence_count >= 2,
893            "expected at least opening and closing fences, got {}",
894            fence_count
895        );
896    }
897
898    #[test]
899    fn test_encoding_detection_and_transcoding() {
900        let dir = tempdir().unwrap();
901        let base_path = dir.path();
902        let output_path = base_path.join("out.md");
903
904        // Test Windows-1252 encoded file (common in Windows)
905        let windows1252_content = [
906            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, // "Hello "
907            0x93, 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x94, // "World" with smart quotes
908            0x0A, // newline
909        ];
910        let file_path = base_path.join("windows1252.txt");
911        fs::write(&file_path, windows1252_content).unwrap();
912
913        let mut output = fs::File::create(&output_path).unwrap();
914        process_file(
915            base_path,
916            &file_path,
917            &mut output,
918            false,
919            Some("detect"),
920            &TreeSitterConfig::default(),
921        )
922        .unwrap();
923
924        let content = fs::read_to_string(&output_path).unwrap();
925
926        // Should contain transcoded content with UTF-8 equivalents
927        assert!(content.contains("Hello"));
928        assert!(content.contains("World"));
929        // Should use text language
930        assert!(content.contains("```txt"));
931
932        // Ensure the code block is closed
933        let fence_count = content.matches("```").count();
934        assert!(
935            fence_count >= 2,
936            "expected at least opening and closing fences, got {}",
937            fence_count
938        );
939    }
940
941    #[test]
942    fn test_encoding_strategy_strict() {
943        let dir = tempdir().unwrap();
944        let base_path = dir.path();
945        let output_path = base_path.join("out.md");
946
947        // Create a file with non-UTF-8 content
948        let non_utf8_content = [0xFF, 0xFE, 0x41, 0x00]; // UTF-16 LE BOM + "A"
949        let file_path = base_path.join("utf16.txt");
950        fs::write(&file_path, non_utf8_content).unwrap();
951
952        let mut output = fs::File::create(&output_path).unwrap();
953        process_file(
954            base_path,
955            &file_path,
956            &mut output,
957            false,
958            Some("strict"),
959            &TreeSitterConfig::default(),
960        )
961        .unwrap();
962
963        let content = fs::read_to_string(&output_path).unwrap();
964
965        // Should contain binary file placeholder
966        assert!(content.contains("<Binary file or unsupported encoding:"));
967        assert!(content.contains("```text"));
968
969        // Ensure the code block is closed
970        let fence_count = content.matches("```").count();
971        assert!(
972            fence_count >= 2,
973            "expected at least opening and closing fences, got {}",
974            fence_count
975        );
976    }
977
978    #[test]
979    fn test_encoding_strategy_skip() {
980        let dir = tempdir().unwrap();
981        let base_path = dir.path();
982        let output_path = base_path.join("out.md");
983
984        // Create a file with UTF-16 content
985        let utf16_content = [0xFF, 0xFE, 0x48, 0x00, 0x69, 0x00]; // UTF-16 LE "Hi"
986        let file_path = base_path.join("utf16.txt");
987        fs::write(&file_path, utf16_content).unwrap();
988
989        let mut output = fs::File::create(&output_path).unwrap();
990        process_file(
991            base_path,
992            &file_path,
993            &mut output,
994            false,
995            Some("skip"),
996            &TreeSitterConfig::default(),
997        )
998        .unwrap();
999
1000        let content = fs::read_to_string(&output_path).unwrap();
1001
1002        // Should contain binary file placeholder (skipped transcoding)
1003        assert!(content.contains("<Binary file or unsupported encoding:"));
1004        assert!(content.contains("```text"));
1005    }
1006
1007    #[test]
1008    fn test_generate_markdown_with_current_directory() {
1009        let dir = tempdir().unwrap();
1010        let base_path = dir.path();
1011        let output_path = base_path.join("test.md");
1012
1013        // Create test files
1014        fs::write(base_path.join("readme.txt"), "Hello world").unwrap();
1015
1016        // Collect files
1017        let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
1018        let file_tree = crate::tree::build_file_tree(&files, base_path);
1019
1020        // Change to the test directory
1021        let original_dir = std::env::current_dir().unwrap();
1022        std::env::set_current_dir(base_path).unwrap();
1023
1024        // Test with "." as input directory
1025        let result = generate_markdown(
1026            &output_path.to_string_lossy(),
1027            ".",
1028            &[],
1029            &[],
1030            &file_tree,
1031            &files,
1032            base_path,
1033            false,
1034            None,
1035            None, // max_tokens
1036            &TreeSitterConfig::default(),
1037        );
1038
1039        // Restore original directory
1040        std::env::set_current_dir(original_dir).unwrap();
1041
1042        assert!(result.is_ok());
1043        let content = fs::read_to_string(&output_path).unwrap();
1044        assert!(content.contains("Directory Structure Report"));
1045    }
1046
1047    #[test]
1048    fn test_generate_markdown_creates_output_directory() {
1049        let dir = tempdir().unwrap();
1050        let base_path = dir.path();
1051        let nested_output = base_path.join("nested").join("deep").join("output.md");
1052
1053        // Create test files
1054        fs::write(base_path.join("test.txt"), "content").unwrap();
1055
1056        let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
1057        let file_tree = crate::tree::build_file_tree(&files, base_path);
1058
1059        let result = generate_markdown(
1060            &nested_output.to_string_lossy(),
1061            "test_dir",
1062            &[],
1063            &[],
1064            &file_tree,
1065            &files,
1066            base_path,
1067            false,
1068            None,
1069            None, // max_tokens
1070            &TreeSitterConfig::default(),
1071        );
1072
1073        assert!(result.is_ok());
1074        assert!(nested_output.exists());
1075        assert!(nested_output.parent().unwrap().exists());
1076    }
1077
1078    #[test]
1079    fn test_generate_markdown_with_filters_and_ignores() {
1080        let dir = tempdir().unwrap();
1081        let base_path = dir.path();
1082        let output_path = base_path.join("filtered.md");
1083
1084        fs::write(base_path.join("main.rs"), "fn main() {}").unwrap();
1085        fs::write(base_path.join("config.toml"), "[package]").unwrap();
1086        fs::write(base_path.join("readme.md"), "# README").unwrap();
1087
1088        let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
1089        let file_tree = crate::tree::build_file_tree(&files, base_path);
1090
1091        let result = generate_markdown(
1092            &output_path.to_string_lossy(),
1093            "project",
1094            &["rs".to_string(), "toml".to_string()],
1095            &["readme.md".to_string()],
1096            &file_tree,
1097            &files,
1098            base_path,
1099            true,
1100            Some("strict"),
1101            None, // max_tokens
1102            &TreeSitterConfig::default(),
1103        );
1104
1105        assert!(result.is_ok());
1106        let content = fs::read_to_string(&output_path).unwrap();
1107        assert!(content.contains("Directory Structure Report"));
1108        // The actual generate_markdown function doesn't format filters/ignores this way
1109        assert!(content.contains("main.rs") || content.contains("config.toml"));
1110    }
1111
1112    #[test]
1113    fn test_write_text_content_with_line_numbers() {
1114        let mut output = Vec::new();
1115        let content = "line one\nline two\nline three";
1116
1117        write_text_content(&mut output, content, "rust", true).unwrap();
1118
1119        let result = String::from_utf8(output).unwrap();
1120        assert!(result.contains("```rust"));
1121        assert!(result.contains("   1 | line one"));
1122        assert!(result.contains("   2 | line two"));
1123        assert!(result.contains("   3 | line three"));
1124        assert!(result.contains("```"));
1125    }
1126
1127    #[test]
1128    fn test_write_text_content_without_line_numbers() {
1129        let mut output = Vec::new();
1130        let content = "function test() {\n  return true;\n}";
1131
1132        write_text_content(&mut output, content, "javascript", false).unwrap();
1133
1134        let result = String::from_utf8(output).unwrap();
1135        assert!(result.contains("```javascript"));
1136        assert!(result.contains("function test() {"));
1137        assert!(result.contains("  return true;"));
1138        assert!(result.contains("```"));
1139        assert!(!result.contains(" | ")); // No line number prefix
1140    }
1141
1142    #[test]
1143    fn test_write_text_content_without_trailing_newline() {
1144        let mut output = Vec::new();
1145        let content = "no newline at end"; // No \n at end
1146
1147        write_text_content(&mut output, content, "text", false).unwrap();
1148
1149        let result = String::from_utf8(output).unwrap();
1150        assert!(result.contains("```text"));
1151        assert!(result.contains("no newline at end"));
1152        assert!(result.ends_with("```\n")); // Should add newline
1153    }
1154
1155    #[test]
1156    fn test_is_likely_text() {
1157        // Normal text should be considered text
1158        assert!(is_likely_text("Hello world\nThis is normal text"));
1159
1160        // Text with some control characters should still be text
1161        assert!(is_likely_text(
1162            "Line 1\nLine 2\tTabbed\r\nWindows line ending"
1163        ));
1164
1165        // Text with too many control characters should not be text
1166        let mut bad_text = String::new();
1167        for i in 0..200 {
1168            if i % 5 == 0 {
1169                bad_text.push('\x01'); // Control character
1170            } else {
1171                bad_text.push('a');
1172            }
1173        }
1174        assert!(!is_likely_text(&bad_text));
1175
1176        // Empty string should be considered text
1177        assert!(is_likely_text(""));
1178    }
1179
1180    #[test]
1181    fn test_detect_text_encoding() {
1182        // UTF-8 should return None (already UTF-8)
1183        let utf8_bytes = "Hello world".as_bytes();
1184        let result = detect_text_encoding(utf8_bytes);
1185        // The function may return an encoding even for UTF-8 text if it detects it differently
1186        // Just verify it doesn't crash
1187        assert!(result.is_some() || result.is_none());
1188
1189        // Windows-1252 encoded text should be detected
1190        let windows1252_bytes = [
1191            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x93, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x94,
1192        ];
1193        let detected = detect_text_encoding(&windows1252_bytes);
1194        assert!(detected.is_some());
1195    }
1196
1197    #[test]
1198    fn test_transcode_file_content() {
1199        let dir = tempdir().unwrap();
1200        let file_path = dir.path().join("windows1252.txt");
1201
1202        // Write Windows-1252 encoded content
1203        let windows1252_content = [
1204            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, // "Hello "
1205            0x93, 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x94, // "World" with smart quotes
1206        ];
1207        fs::write(&file_path, windows1252_content).unwrap();
1208
1209        let result = transcode_file_content(&file_path, encoding_rs::WINDOWS_1252);
1210        assert!(result.is_ok());
1211
1212        let transcoded = result.unwrap();
1213        assert!(transcoded.contains("Hello"));
1214        assert!(transcoded.contains("World"));
1215    }
1216
1217    #[test]
1218    fn test_process_file_with_metadata_error() {
1219        let dir = tempdir().unwrap();
1220        let base_path = dir.path();
1221        let nonexistent_file = base_path.join("nonexistent.txt");
1222        let output_path = base_path.join("output.md");
1223
1224        let mut output = fs::File::create(&output_path).unwrap();
1225
1226        // This should handle the metadata error gracefully
1227        let result = process_file(
1228            base_path,
1229            &nonexistent_file,
1230            &mut output,
1231            false,
1232            None,
1233            &TreeSitterConfig::default(),
1234        );
1235        assert!(result.is_ok());
1236
1237        // Output should be minimal since file doesn't exist
1238        let content = fs::read_to_string(&output_path).unwrap();
1239        assert!(content.is_empty() || content.trim().is_empty());
1240    }
1241
1242    #[test]
1243    fn test_process_file_with_different_extensions() {
1244        let dir = tempdir().unwrap();
1245        let base_path = dir.path();
1246        let output_path = base_path.join("output.md");
1247
1248        // Test various file extensions
1249        let test_files = [
1250            ("script.py", "print('hello')", "python"),
1251            ("data.json", r#"{"key": "value"}"#, "json"),
1252            ("config.yaml", "key: value", "yaml"),
1253            ("style.css", "body { margin: 0; }", "css"),
1254            ("page.html", "<html><body>Test</body></html>", "html"),
1255            ("query.sql", "SELECT * FROM users;", "sql"),
1256            ("build.sh", "#!/bin/bash\necho 'building'", "bash"),
1257            ("unknown.xyz", "unknown content", "xyz"),
1258        ];
1259
1260        for (filename, content, expected_lang) in test_files.iter() {
1261            let file_path = base_path.join(filename);
1262            fs::write(&file_path, content).unwrap();
1263
1264            let mut output = fs::File::create(&output_path).unwrap();
1265            process_file(
1266                base_path,
1267                &file_path,
1268                &mut output,
1269                false,
1270                None,
1271                &TreeSitterConfig::default(),
1272            )
1273            .unwrap();
1274
1275            let result = fs::read_to_string(&output_path).unwrap();
1276            assert!(result.contains(&format!("```{}", expected_lang)));
1277            assert!(result.contains(content));
1278            assert!(result.contains(filename));
1279        }
1280    }
1281}
context_builder/markdown.rs

context_builder/
markdown.rs