context_builder/
markdown.rs

1use chrono::Utc;
2use ignore::DirEntry;
3use log::{error, info, warn};
4use std::fs;
5use std::io::{self, Read, Seek, SeekFrom, Write};
6use std::path::Path;
7
8use crate::tree::{FileTree, write_tree_to_file};
9use encoding_rs::{Encoding, UTF_8};
10
11#[cfg(feature = "parallel")]
12use crossbeam_channel::{Receiver, Sender, bounded};
13#[cfg(feature = "parallel")]
14use std::thread;
15
16/// Generates the final Markdown file.
17#[allow(clippy::too_many_arguments, unused_variables)]
18pub fn generate_markdown(
19    output_path: &str,
20    input_dir: &str,
21    filters: &[String],
22    ignores: &[String],
23    file_tree: &FileTree,
24    files: &[DirEntry],
25    base_path: &Path,
26    line_numbers: bool,
27    encoding_strategy: Option<&str>,
28    max_tokens: Option<usize>,
29) -> io::Result<()> {
30    if let Some(parent) = Path::new(output_path).parent()
31        && !parent.exists()
32    {
33        fs::create_dir_all(parent)?;
34    }
35
36    let mut output = fs::File::create(output_path)?;
37
38    let input_dir_name = if input_dir == "." {
39        let current_dir = std::env::current_dir()?;
40        current_dir
41            .file_name()
42            .unwrap()
43            .to_str()
44            .unwrap()
45            .to_string()
46    } else {
47        input_dir.to_string()
48    };
49
50    // --- Header --- //
51    writeln!(output, "# Directory Structure Report\n")?;
52
53    if !filters.is_empty() {
54        writeln!(
55            output,
56            "This document contains files from the `{}` directory with extensions: {}",
57            input_dir_name,
58            filters.join(", ")
59        )?;
60    } else {
61        writeln!(
62            output,
63            "This document contains all files from the `{}` directory, optimized for LLM consumption.",
64            input_dir_name
65        )?;
66    }
67
68    if !ignores.is_empty() {
69        writeln!(output, "Custom ignored patterns: {}", ignores.join(", "))?;
70    }
71
72    // Deterministic content hash (enables LLM prompt caching across runs)
73    // Uses xxh3 over file content bytes — stable across Rust versions and machines.
74    // Previous implementation hashed mtime (broken by git checkout, cp, etc.)
75    let mut content_hasher = xxhash_rust::xxh3::Xxh3::new();
76    for entry in files {
77        // Hash relative unix-style path for cross-OS determinism.
78        // Using absolute or OS-native paths would produce different hashes
79        // on different machines or operating systems.
80        let rel_path = entry.path().strip_prefix(base_path).unwrap_or(entry.path());
81        let normalized = rel_path.to_string_lossy().replace('\\', "/");
82        content_hasher.update(normalized.as_bytes());
83        // Null delimiter prevents collision: path="a" content="bc" vs path="ab" content="c"
84        content_hasher.update(b"\0");
85        // Hash actual file content (not mtime!) for determinism
86        if let Ok(bytes) = std::fs::read(entry.path()) {
87            content_hasher.update(&bytes);
88        }
89        content_hasher.update(b"\0");
90    }
91    writeln!(output, "Content hash: {:016x}", content_hasher.digest())?;
92    writeln!(output)?;
93
94    // --- File Tree --- //
95
96    writeln!(output, "## File Tree Structure\n")?;
97
98    write_tree_to_file(&mut output, file_tree, 0)?;
99
100    writeln!(output)?;
101
102    // (No '## Files' heading here; it will be injected later only once during final composition)
103    // (Diff section will be conditionally inserted later by the auto_diff logic in lib.rs)
104
105    #[cfg(feature = "parallel")]
106    {
107        use rayon::prelude::*;
108
109        // Create a bounded channel for ordered chunks
110        type ChunkResult = (usize, io::Result<Vec<u8>>);
111        let (sender, receiver): (Sender<ChunkResult>, Receiver<ChunkResult>) =
112            bounded(num_cpus::get() * 2); // Buffer size based on CPU count
113
114        let writer_handle = {
115            let mut output = output;
116            let total_files = files.len();
117            let budget = max_tokens;
118
119            thread::spawn(move || -> io::Result<()> {
120                let mut completed_chunks = std::collections::BTreeMap::new();
121                let mut next_index = 0;
122                let mut errors = Vec::new();
123                let mut tokens_used: usize = 0;
124                let mut budget_exceeded = false;
125
126                // Receive chunks and write them in order
127                while next_index < total_files {
128                    match receiver.recv() {
129                        Ok((index, chunk_result)) => {
130                            completed_chunks.insert(index, chunk_result);
131
132                            // Write all consecutive chunks starting from next_index
133                            while let Some(chunk_result) = completed_chunks.remove(&next_index) {
134                                if budget_exceeded {
135                                    // Already over budget — skip remaining chunks
136                                    next_index += 1;
137                                    continue;
138                                }
139
140                                match chunk_result {
141                                    Ok(buf) => {
142                                        // Estimate tokens for this chunk (~4 bytes per token)
143                                        let chunk_tokens = buf.len() / 4;
144
145                                        if let Some(max) = budget
146                                            && tokens_used + chunk_tokens > max
147                                            && tokens_used > 0
148                                        {
149                                            let remaining = total_files - next_index;
150                                            let notice = format!(
151                                                "---\n\n_⚠️ Token budget ({}) reached. {} remaining files omitted._\n\n",
152                                                max, remaining
153                                            );
154                                            if let Err(e) = output.write_all(notice.as_bytes()) {
155                                                errors.push(format!(
156                                                    "Failed to write truncation notice: {}",
157                                                    e
158                                                ));
159                                            }
160                                            budget_exceeded = true;
161                                            next_index += 1;
162                                            continue;
163                                        }
164
165                                        tokens_used += chunk_tokens;
166                                        if let Err(e) = output.write_all(&buf) {
167                                            errors.push(format!(
168                                                "Failed to write output for file index {}: {}",
169                                                next_index, e
170                                            ));
171                                        }
172                                    }
173                                    Err(e) => {
174                                        errors.push(format!(
175                                            "Failed to process file index {}: {}",
176                                            next_index, e
177                                        ));
178                                    }
179                                }
180                                next_index += 1;
181                            }
182                        }
183                        Err(_) => break, // Channel closed
184                    }
185                }
186
187                if !errors.is_empty() {
188                    error!(
189                        "Encountered {} errors during parallel processing:",
190                        errors.len()
191                    );
192                    for err in &errors {
193                        error!("  {}", err);
194                    }
195                    return Err(std::io::Error::other(format!(
196                        "Failed to process {} files: {}",
197                        errors.len(),
198                        errors.join("; ")
199                    )));
200                }
201
202                Ok(())
203            })
204        };
205
206        // Process files in parallel and send results to writer
207        files.par_iter().enumerate().for_each(|(index, entry)| {
208            let mut buf = Vec::new();
209            let result = process_file(
210                base_path,
211                entry.path(),
212                &mut buf,
213                line_numbers,
214                encoding_strategy,
215            )
216            .map(|_| buf);
217
218            // Send result to writer thread (ignore send errors - channel might be closed)
219            let _ = sender.send((index, result));
220        });
221
222        // Close the sender to signal completion
223        drop(sender);
224
225        // Wait for writer thread to complete and propagate any errors
226        writer_handle
227            .join()
228            .map_err(|_| std::io::Error::other("Writer thread panicked"))??;
229    }
230
231    #[cfg(not(feature = "parallel"))]
232    {
233        let mut tokens_used: usize = 0;
234
235        for (idx, entry) in files.iter().enumerate() {
236            // Estimate tokens for this file (~4 bytes per token)
237            let file_size = std::fs::metadata(entry.path())
238                .map(|m| m.len())
239                .unwrap_or(0);
240            let estimated_file_tokens = (file_size as usize) / 4;
241
242            if let Some(budget) = max_tokens {
243                if tokens_used + estimated_file_tokens > budget && tokens_used > 0 {
244                    let remaining = files.len() - idx;
245                    writeln!(output, "---\n")?;
246                    writeln!(
247                        output,
248                        "_⚠️ Token budget ({}) reached. {} remaining files omitted._\n",
249                        budget, remaining
250                    )?;
251                    break;
252                }
253            }
254
255            tokens_used += estimated_file_tokens;
256            process_file(
257                base_path,
258                entry.path(),
259                &mut output,
260                line_numbers,
261                encoding_strategy,
262            )?;
263        }
264    }
265
266    Ok(())
267}
268
269/// Processes a single file and writes its content to the output.
270pub fn process_file(
271    base_path: &Path,
272
273    file_path: &Path,
274
275    output: &mut impl Write,
276    line_numbers: bool,
277    encoding_strategy: Option<&str>,
278) -> io::Result<()> {
279    let relative_path = file_path.strip_prefix(base_path).unwrap_or(file_path);
280    info!("Processing file: {}", relative_path.display());
281
282    let metadata = match fs::metadata(file_path) {
283        Ok(meta) => meta,
284        Err(e) => {
285            error!(
286                "Failed to get metadata for {}: {}",
287                relative_path.display(),
288                e
289            );
290            return Ok(());
291        }
292    };
293
294    let modified_time = metadata
295        .modified()
296        .ok()
297        .map(|time| {
298            let system_time: chrono::DateTime<Utc> = time.into();
299            system_time.format("%Y-%m-%d %H:%M:%S UTC").to_string()
300        })
301        .unwrap_or_else(|| "Unknown".to_string());
302
303    writeln!(output)?;
304    writeln!(output, "### File: `{}`", relative_path.display())?;
305
306    writeln!(output)?;
307
308    writeln!(output, "- Size: {} bytes", metadata.len())?;
309    writeln!(output, "- Modified: {}", modified_time)?;
310    writeln!(output)?;
311
312    // --- File Content --- //
313    let extension = file_path
314        .extension()
315        .and_then(|s| s.to_str())
316        .unwrap_or("text");
317    let language = match extension {
318        "rs" => "rust",
319        "js" => "javascript",
320        "ts" => "typescript",
321        "jsx" => "jsx",
322        "tsx" => "tsx",
323        "json" => "json",
324        "toml" => "toml",
325        "md" => "markdown",
326        "yaml" | "yml" => "yaml",
327        "html" => "html",
328        "css" => "css",
329        "py" => "python",
330        "java" => "java",
331        "cpp" => "cpp",
332        "c" => "c",
333        "h" => "c",
334        "hpp" => "cpp",
335        "sql" => "sql",
336        "sh" => "bash",
337        "xml" => "xml",
338        "lock" => "toml",
339        _ => extension,
340    };
341
342    // Enhanced binary file handling with encoding detection and transcoding
343    match fs::File::open(file_path) {
344        Ok(mut file) => {
345            let mut sniff = [0u8; 8192];
346            let n = match file.read(&mut sniff) {
347                Ok(n) => n,
348                Err(e) => {
349                    warn!(
350                        "Could not read file {}: {}. Skipping content.",
351                        relative_path.display(),
352                        e
353                    );
354
355                    writeln!(output, "```text")?;
356
357                    writeln!(
358                        output,
359                        "<Could not read file content (e.g., binary file or permission error)>"
360                    )?;
361
362                    writeln!(output, "```")?;
363
364                    return Ok(());
365                }
366            };
367            let slice = &sniff[..n];
368
369            // Find a valid UTF-8 boundary by backtracking up to 3 bytes.
370            // If the sniff buffer cuts a multi-byte char (e.g., emoji at byte 8191),
371            // from_utf8 would falsely classify the file as non-UTF-8.
372            let check_len = if n == sniff.len() {
373                // Buffer is full — may have split a multi-byte char at the end
374                let mut end = n;
375                while end > 0 && end > n.saturating_sub(4) && sniff[end - 1] & 0xC0 == 0x80 {
376                    end -= 1; // skip continuation bytes
377                }
378                // If we landed on a leading byte, check if the sequence is complete
379                if end > 0 && end < n {
380                    let leading = sniff[end - 1];
381                    let expected_len = if leading & 0xE0 == 0xC0 {
382                        2
383                    } else if leading & 0xF0 == 0xE0 {
384                        3
385                    } else if leading & 0xF8 == 0xF0 {
386                        4
387                    } else {
388                        1
389                    };
390                    if end - 1 + expected_len > n {
391                        end - 1 // incomplete char — exclude the leading byte too
392                    } else {
393                        n
394                    }
395                } else {
396                    n
397                }
398            } else {
399                n // didn't fill the buffer, so no boundary issue
400            };
401
402            // First check if it's valid UTF-8
403            let is_utf8 = std::str::from_utf8(&sniff[..check_len]).is_ok();
404
405            if is_utf8 && !slice.contains(&0) {
406                // Valid UTF-8 text file - proceed normally
407            } else {
408                // Try encoding detection for non-UTF-8 files
409                // If it's not UTF-8, try to detect the encoding
410                let (encoding, _consumed) =
411                    encoding_rs::Encoding::for_bom(slice).unwrap_or((encoding_rs::UTF_8, 0));
412
413                // If it's not UTF-8, try to detect the encoding
414                let detected_encoding = if encoding == UTF_8 {
415                    // Use chardet-like detection for common encodings
416                    detect_text_encoding(slice)
417                } else {
418                    Some(encoding)
419                };
420
421                match detected_encoding {
422                    Some(enc) if enc != UTF_8 => {
423                        let strategy = encoding_strategy.unwrap_or("detect");
424                        match strategy {
425                            "strict" | "skip" => {
426                                // Skip files with non-UTF-8 encoding
427                                warn!(
428                                    "Skipping non-UTF-8 file {} (encoding: {}, strategy: {})",
429                                    relative_path.display(),
430                                    enc.name(),
431                                    strategy
432                                );
433                            }
434                            _ => {
435                                // Default "detect" strategy: attempt to transcode
436                                match transcode_file_content(file_path, enc) {
437                                    Ok(transcoded_content) => {
438                                        info!(
439                                            "Successfully transcoded {} from {} to UTF-8",
440                                            relative_path.display(),
441                                            enc.name()
442                                        );
443                                        write_text_content(
444                                            output,
445                                            &transcoded_content,
446                                            language,
447                                            line_numbers,
448                                        )?;
449                                        return Ok(());
450                                    }
451                                    Err(e) => {
452                                        warn!(
453                                            "Failed to transcode {} from {}: {}. Treating as binary.",
454                                            relative_path.display(),
455                                            enc.name(),
456                                            e
457                                        );
458                                    }
459                                }
460                            }
461                        }
462                    }
463                    _ => {
464                        // Check if it's likely binary (contains null bytes)
465                        if slice.contains(&0) {
466                            warn!(
467                                "Detected binary file {} (contains null bytes). Skipping content.",
468                                relative_path.display()
469                            );
470                        } else {
471                            warn!(
472                                "Could not determine encoding for {}. Treating as binary.",
473                                relative_path.display()
474                            );
475                        }
476                    }
477                }
478
479                // Fallback to binary file placeholder
480                writeln!(output, "```text")?;
481                writeln!(
482                    output,
483                    "<Binary file or unsupported encoding: {} bytes>",
484                    metadata.len()
485                )?;
486                writeln!(output, "```")?;
487                return Ok(());
488            }
489
490            // Reset cursor and stream the content
491            if let Err(e) = file.seek(SeekFrom::Start(0)) {
492                warn!(
493                    "Could not reset file cursor for {}: {}. Skipping content.",
494                    relative_path.display(),
495                    e
496                );
497                writeln!(output, "```text")?;
498                writeln!(
499                    output,
500                    "<Could not read file content (e.g., binary file or permission error)>"
501                )?;
502                writeln!(output, "```")?;
503                return Ok(());
504            }
505
506            // Stream UTF-8 content
507            let content = match std::fs::read_to_string(file_path) {
508                Ok(content) => content,
509                Err(e) => {
510                    warn!(
511                        "Error reading file {}: {}. Output may be truncated.",
512                        relative_path.display(),
513                        e
514                    );
515                    writeln!(output, "```text")?;
516                    writeln!(output, "<Error reading file content>")?;
517                    writeln!(output, "```")?;
518                    return Ok(());
519                }
520            };
521
522            write_text_content(output, &content, language, line_numbers)?;
523        }
524        Err(e) => {
525            warn!(
526                "Could not open file {}: {}. Skipping content.",
527                relative_path.display(),
528                e
529            );
530            writeln!(output, "```text")?;
531            writeln!(
532                output,
533                "<Could not read file content (e.g., binary file or permission error)>"
534            )?;
535            writeln!(output, "```")?;
536        }
537    }
538
539    Ok(())
540}
541
542/// Detect text encoding using heuristics for common encodings
543fn detect_text_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
544    // Try common encodings
545    let encodings = [
546        encoding_rs::WINDOWS_1252,
547        encoding_rs::UTF_16LE,
548        encoding_rs::UTF_16BE,
549        encoding_rs::SHIFT_JIS,
550    ];
551
552    for encoding in &encodings {
553        let (decoded, _, had_errors) = encoding.decode(bytes);
554        if !had_errors && is_likely_text(&decoded) {
555            return Some(encoding);
556        }
557    }
558
559    None
560}
561
/// Heuristically decides whether `content` is human-readable text.
///
/// Control characters other than `\n`, `\r` and `\t` count as suspicious. The
/// string is rejected as soon as, after more than 100 characters have been
/// scanned, suspicious characters exceed 5% of those seen so far; otherwise the
/// same 5% limit is applied to the whole string. Empty input is accepted.
fn is_likely_text(content: &str) -> bool {
    let mut seen = 0;
    let mut suspicious = 0;

    for ch in content.chars() {
        seen += 1;

        let benign_whitespace = matches!(ch, '\n' | '\r' | '\t');
        if ch.is_control() && !benign_whitespace {
            suspicious += 1;
        }

        // Bail out early once enough characters were sampled to trust the ratio.
        let sampled_enough = seen > 100;
        if sampled_enough && suspicious * 20 > seen {
            return false;
        }
    }

    // Short inputs (and inputs that never tripped the early exit) get one final
    // check against the same 5% threshold.
    seen == 0 || suspicious * 20 <= seen
}
586
587/// Transcode file content from detected encoding to UTF-8
588fn transcode_file_content(file_path: &Path, encoding: &'static Encoding) -> io::Result<String> {
589    let bytes = std::fs::read(file_path)?;
590    let (decoded, _, had_errors) = encoding.decode(&bytes);
591
592    if had_errors {
593        return Err(io::Error::new(
594            io::ErrorKind::InvalidData,
595            format!("Failed to decode file with encoding {}", encoding.name()),
596        ));
597    }
598
599    Ok(decoded.into_owned())
600}
601
/// Emits `content` as a fenced Markdown code block tagged with `language`.
///
/// With `line_numbers` enabled every line is prefixed by its 1-based number,
/// right-aligned to four columns and followed by ` | `. Otherwise the content is
/// written verbatim and a trailing newline is added if it lacks one, so the
/// closing fence always starts on its own line.
///
/// # Errors
///
/// Returns any error raised while writing to `output`.
fn write_text_content(
    output: &mut impl Write,
    content: &str,
    language: &str,
    line_numbers: bool,
) -> io::Result<()> {
    writeln!(output, "```{}", language)?;

    if !line_numbers {
        output.write_all(content.as_bytes())?;
        let needs_newline = !content.ends_with('\n');
        if needs_newline {
            writeln!(output)?;
        }
    } else {
        for (number, text) in (1usize..).zip(content.lines()) {
            writeln!(output, "{:>4} | {}", number, text)?;
        }
    }

    writeln!(output, "```")
}
625
626#[cfg(test)]
627mod tests {
628    use super::*;
629    use std::fs;
630    use tempfile::tempdir;
631
632    #[test]
633    fn test_code_block_formatting() {
634        let dir = tempdir().unwrap();
635        let base_path = dir.path();
636        let file_path = base_path.join("test.rs");
637        let output_path = base_path.join("output.md");
638
639        // Create a test Rust file
640        fs::write(
641            &file_path,
642            "fn main() {\n    println!(\"Hello, world!\");\n}",
643        )
644        .unwrap();
645
646        // Create an output file
647        let mut output = fs::File::create(&output_path).unwrap();
648
649        // Process the file
650        process_file(base_path, &file_path, &mut output, false, None).unwrap();
651
652        // Read the output
653        let content = fs::read_to_string(&output_path).unwrap();
654
655        // Check that code blocks are properly formatted
656        assert!(content.contains("```rust"));
657        assert!(content.contains("```") && content.matches("```").count() >= 2);
658    }
659
660    #[test]
661    fn test_markdown_file_formatting() {
662        let dir = tempdir().unwrap();
663        let base_path = dir.path();
664        let file_path = base_path.join("README.md");
665        let output_path = base_path.join("output.md");
666
667        // Create a test Markdown file
668        fs::write(&file_path, "# Test\n\nThis is a test markdown file.").unwrap();
669
670        // Create an output file
671        let mut output = fs::File::create(&output_path).unwrap();
672
673        // Process the file
674        process_file(base_path, &file_path, &mut output, false, None).unwrap();
675
676        // Read the output
677        let content = fs::read_to_string(&output_path).unwrap();
678
679        // Debug prints the content
680        println!("Generated content:\n{}", content);
681
682        // Check that markdown files use the correct language identifier
683        assert!(
684            content.contains("```markdown"),
685            "Content should contain '```markdown' but was: {}",
686            content
687        );
688        // Count the number of code block markers
689        let code_block_markers = content.matches("```").count();
690
691        assert!(
692            code_block_markers >= 2,
693            "Expected at least 2 code block markers, found {}",
694            code_block_markers
695        );
696    }
697
698    #[test]
699    fn test_line_numbered_code_blocks() {
700        let dir = tempdir().unwrap();
701        let base_path = dir.path();
702        let file_path = base_path.join("lib.rs");
703        let output_path = base_path.join("out.md");
704
705        // Create a multi-line Rust file
706        fs::write(
707                    &file_path,
708                    "fn add(a: i32, b: i32) -> i32 {\n    a + b\n}\n\nfn main() {\n    println!(\"{}\", add(1, 2));\n}\n",
709                )
710                .unwrap();
711
712        let mut output = fs::File::create(&output_path).unwrap();
713        process_file(base_path, &file_path, &mut output, true, None).unwrap();
714
715        let content = fs::read_to_string(&output_path).unwrap();
716
717        // Check language and line numbers prefix
718        assert!(content.contains("```rust"));
719        assert!(content.contains("   1 | "));
720        assert!(content.contains("   2 | "));
721
722        // Count lines with "|" prefix equals number of lines in an original file
723        let numbered_lines = content
724            .lines()
725            .filter(|l| {
726                l.trim_start()
727                    .chars()
728                    .next()
729                    .map(|c| c.is_ascii_digit())
730                    .unwrap_or(false)
731                    && l.contains(" | ")
732            })
733            .count();
734        let original_line_count = fs::read_to_string(&file_path).unwrap().lines().count();
735        assert_eq!(numbered_lines, original_line_count);
736
737        // Ensure code fence closes
738        assert!(content.contains("```"));
739    }
740
741    #[test]
742    fn test_binary_file_handling() {
743        let dir = tempdir().unwrap();
744        let base_path = dir.path();
745        let file_path = base_path.join("image.bin");
746        let output_path = base_path.join("out.md");
747
748        // Write truly binary data that won't be decoded by encoding detection
749        let bytes = vec![
750            0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG header
751            0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, // PNG chunk
752            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, // More binary data
753            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Null bytes
754        ];
755        fs::write(&file_path, bytes).unwrap();
756
757        let mut output = fs::File::create(&output_path).unwrap();
758        process_file(base_path, &file_path, &mut output, false, None).unwrap();
759
760        let content = fs::read_to_string(&output_path).unwrap();
761
762        // Expect a text block to fall back with a helpful message
763        assert!(content.contains("```text"));
764        assert!(content.contains("<Binary file or unsupported encoding:"));
765
766        // Ensure the code block is closed
767        let fence_count = content.matches("```").count();
768        assert!(
769            fence_count >= 2,
770            "expected at least opening and closing fences, got {}",
771            fence_count
772        );
773    }
774
775    #[test]
776    fn test_encoding_detection_and_transcoding() {
777        let dir = tempdir().unwrap();
778        let base_path = dir.path();
779        let output_path = base_path.join("out.md");
780
781        // Test Windows-1252 encoded file (common in Windows)
782        let windows1252_content = [
783            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, // "Hello "
784            0x93, 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x94, // "World" with smart quotes
785            0x0A, // newline
786        ];
787        let file_path = base_path.join("windows1252.txt");
788        fs::write(&file_path, windows1252_content).unwrap();
789
790        let mut output = fs::File::create(&output_path).unwrap();
791        process_file(base_path, &file_path, &mut output, false, Some("detect")).unwrap();
792
793        let content = fs::read_to_string(&output_path).unwrap();
794
795        // Should contain transcoded content with UTF-8 equivalents
796        assert!(content.contains("Hello"));
797        assert!(content.contains("World"));
798        // Should use text language
799        assert!(content.contains("```txt"));
800
801        // Ensure the code block is closed
802        let fence_count = content.matches("```").count();
803        assert!(
804            fence_count >= 2,
805            "expected at least opening and closing fences, got {}",
806            fence_count
807        );
808    }
809
810    #[test]
811    fn test_encoding_strategy_strict() {
812        let dir = tempdir().unwrap();
813        let base_path = dir.path();
814        let output_path = base_path.join("out.md");
815
816        // Create a file with non-UTF-8 content
817        let non_utf8_content = [0xFF, 0xFE, 0x41, 0x00]; // UTF-16 LE BOM + "A"
818        let file_path = base_path.join("utf16.txt");
819        fs::write(&file_path, non_utf8_content).unwrap();
820
821        let mut output = fs::File::create(&output_path).unwrap();
822        process_file(base_path, &file_path, &mut output, false, Some("strict")).unwrap();
823
824        let content = fs::read_to_string(&output_path).unwrap();
825
826        // Should contain binary file placeholder
827        assert!(content.contains("<Binary file or unsupported encoding:"));
828        assert!(content.contains("```text"));
829
830        // Ensure the code block is closed
831        let fence_count = content.matches("```").count();
832        assert!(
833            fence_count >= 2,
834            "expected at least opening and closing fences, got {}",
835            fence_count
836        );
837    }
838
839    #[test]
840    fn test_encoding_strategy_skip() {
841        let dir = tempdir().unwrap();
842        let base_path = dir.path();
843        let output_path = base_path.join("out.md");
844
845        // Create a file with UTF-16 content
846        let utf16_content = [0xFF, 0xFE, 0x48, 0x00, 0x69, 0x00]; // UTF-16 LE "Hi"
847        let file_path = base_path.join("utf16.txt");
848        fs::write(&file_path, utf16_content).unwrap();
849
850        let mut output = fs::File::create(&output_path).unwrap();
851        process_file(base_path, &file_path, &mut output, false, Some("skip")).unwrap();
852
853        let content = fs::read_to_string(&output_path).unwrap();
854
855        // Should contain binary file placeholder (skipped transcoding)
856        assert!(content.contains("<Binary file or unsupported encoding:"));
857        assert!(content.contains("```text"));
858    }
859
860    #[test]
861    fn test_generate_markdown_with_current_directory() {
862        let dir = tempdir().unwrap();
863        let base_path = dir.path();
864        let output_path = base_path.join("test.md");
865
866        // Create test files
867        fs::write(base_path.join("readme.txt"), "Hello world").unwrap();
868
869        // Collect files
870        let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
871        let file_tree = crate::tree::build_file_tree(&files, base_path);
872
873        // Change to the test directory
874        let original_dir = std::env::current_dir().unwrap();
875        std::env::set_current_dir(base_path).unwrap();
876
877        // Test with "." as input directory
878        let result = generate_markdown(
879            &output_path.to_string_lossy(),
880            ".",
881            &[],
882            &[],
883            &file_tree,
884            &files,
885            base_path,
886            false,
887            None,
888            None, // max_tokens
889        );
890
891        // Restore original directory
892        std::env::set_current_dir(original_dir).unwrap();
893
894        assert!(result.is_ok());
895        let content = fs::read_to_string(&output_path).unwrap();
896        assert!(content.contains("Directory Structure Report"));
897    }
898
899    #[test]
900    fn test_generate_markdown_creates_output_directory() {
901        let dir = tempdir().unwrap();
902        let base_path = dir.path();
903        let nested_output = base_path.join("nested").join("deep").join("output.md");
904
905        // Create test files
906        fs::write(base_path.join("test.txt"), "content").unwrap();
907
908        let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
909        let file_tree = crate::tree::build_file_tree(&files, base_path);
910
911        let result = generate_markdown(
912            &nested_output.to_string_lossy(),
913            "test_dir",
914            &[],
915            &[],
916            &file_tree,
917            &files,
918            base_path,
919            false,
920            None,
921            None, // max_tokens
922        );
923
924        assert!(result.is_ok());
925        assert!(nested_output.exists());
926        assert!(nested_output.parent().unwrap().exists());
927    }
928
929    #[test]
930    fn test_generate_markdown_with_filters_and_ignores() {
931        let dir = tempdir().unwrap();
932        let base_path = dir.path();
933        let output_path = base_path.join("filtered.md");
934
935        fs::write(base_path.join("main.rs"), "fn main() {}").unwrap();
936        fs::write(base_path.join("config.toml"), "[package]").unwrap();
937        fs::write(base_path.join("readme.md"), "# README").unwrap();
938
939        let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap();
940        let file_tree = crate::tree::build_file_tree(&files, base_path);
941
942        let result = generate_markdown(
943            &output_path.to_string_lossy(),
944            "project",
945            &["rs".to_string(), "toml".to_string()],
946            &["readme.md".to_string()],
947            &file_tree,
948            &files,
949            base_path,
950            true,
951            Some("strict"),
952            None, // max_tokens
953        );
954
955        assert!(result.is_ok());
956        let content = fs::read_to_string(&output_path).unwrap();
957        assert!(content.contains("Directory Structure Report"));
958        // The actual generate_markdown function doesn't format filters/ignores this way
959        assert!(content.contains("main.rs") || content.contains("config.toml"));
960    }
961
962    #[test]
963    fn test_write_text_content_with_line_numbers() {
964        let mut output = Vec::new();
965        let content = "line one\nline two\nline three";
966
967        write_text_content(&mut output, content, "rust", true).unwrap();
968
969        let result = String::from_utf8(output).unwrap();
970        assert!(result.contains("```rust"));
971        assert!(result.contains("   1 | line one"));
972        assert!(result.contains("   2 | line two"));
973        assert!(result.contains("   3 | line three"));
974        assert!(result.contains("```"));
975    }
976
977    #[test]
978    fn test_write_text_content_without_line_numbers() {
979        let mut output = Vec::new();
980        let content = "function test() {\n  return true;\n}";
981
982        write_text_content(&mut output, content, "javascript", false).unwrap();
983
984        let result = String::from_utf8(output).unwrap();
985        assert!(result.contains("```javascript"));
986        assert!(result.contains("function test() {"));
987        assert!(result.contains("  return true;"));
988        assert!(result.contains("```"));
989        assert!(!result.contains(" | ")); // No line number prefix
990    }
991
992    #[test]
993    fn test_write_text_content_without_trailing_newline() {
994        let mut output = Vec::new();
995        let content = "no newline at end"; // No \n at end
996
997        write_text_content(&mut output, content, "text", false).unwrap();
998
999        let result = String::from_utf8(output).unwrap();
1000        assert!(result.contains("```text"));
1001        assert!(result.contains("no newline at end"));
1002        assert!(result.ends_with("```\n")); // Should add newline
1003    }
1004
1005    #[test]
1006    fn test_is_likely_text() {
1007        // Normal text should be considered text
1008        assert!(is_likely_text("Hello world\nThis is normal text"));
1009
1010        // Text with some control characters should still be text
1011        assert!(is_likely_text(
1012            "Line 1\nLine 2\tTabbed\r\nWindows line ending"
1013        ));
1014
1015        // Text with too many control characters should not be text
1016        let mut bad_text = String::new();
1017        for i in 0..200 {
1018            if i % 5 == 0 {
1019                bad_text.push('\x01'); // Control character
1020            } else {
1021                bad_text.push('a');
1022            }
1023        }
1024        assert!(!is_likely_text(&bad_text));
1025
1026        // Empty string should be considered text
1027        assert!(is_likely_text(""));
1028    }
1029
1030    #[test]
1031    fn test_detect_text_encoding() {
1032        // UTF-8 should return None (already UTF-8)
1033        let utf8_bytes = "Hello world".as_bytes();
1034        let result = detect_text_encoding(utf8_bytes);
1035        // The function may return an encoding even for UTF-8 text if it detects it differently
1036        // Just verify it doesn't crash
1037        assert!(result.is_some() || result.is_none());
1038
1039        // Windows-1252 encoded text should be detected
1040        let windows1252_bytes = [
1041            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x93, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x94,
1042        ];
1043        let detected = detect_text_encoding(&windows1252_bytes);
1044        assert!(detected.is_some());
1045    }
1046
1047    #[test]
1048    fn test_transcode_file_content() {
1049        let dir = tempdir().unwrap();
1050        let file_path = dir.path().join("windows1252.txt");
1051
1052        // Write Windows-1252 encoded content
1053        let windows1252_content = [
1054            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, // "Hello "
1055            0x93, 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x94, // "World" with smart quotes
1056        ];
1057        fs::write(&file_path, windows1252_content).unwrap();
1058
1059        let result = transcode_file_content(&file_path, encoding_rs::WINDOWS_1252);
1060        assert!(result.is_ok());
1061
1062        let transcoded = result.unwrap();
1063        assert!(transcoded.contains("Hello"));
1064        assert!(transcoded.contains("World"));
1065    }
1066
1067    #[test]
1068    fn test_process_file_with_metadata_error() {
1069        let dir = tempdir().unwrap();
1070        let base_path = dir.path();
1071        let nonexistent_file = base_path.join("nonexistent.txt");
1072        let output_path = base_path.join("output.md");
1073
1074        let mut output = fs::File::create(&output_path).unwrap();
1075
1076        // This should handle the metadata error gracefully
1077        let result = process_file(base_path, &nonexistent_file, &mut output, false, None);
1078        assert!(result.is_ok());
1079
1080        // Output should be minimal since file doesn't exist
1081        let content = fs::read_to_string(&output_path).unwrap();
1082        assert!(content.is_empty() || content.trim().is_empty());
1083    }
1084
1085    #[test]
1086    fn test_process_file_with_different_extensions() {
1087        let dir = tempdir().unwrap();
1088        let base_path = dir.path();
1089        let output_path = base_path.join("output.md");
1090
1091        // Test various file extensions
1092        let test_files = [
1093            ("script.py", "print('hello')", "python"),
1094            ("data.json", r#"{"key": "value"}"#, "json"),
1095            ("config.yaml", "key: value", "yaml"),
1096            ("style.css", "body { margin: 0; }", "css"),
1097            ("page.html", "<html><body>Test</body></html>", "html"),
1098            ("query.sql", "SELECT * FROM users;", "sql"),
1099            ("build.sh", "#!/bin/bash\necho 'building'", "bash"),
1100            ("unknown.xyz", "unknown content", "xyz"),
1101        ];
1102
1103        for (filename, content, expected_lang) in test_files.iter() {
1104            let file_path = base_path.join(filename);
1105            fs::write(&file_path, content).unwrap();
1106
1107            let mut output = fs::File::create(&output_path).unwrap();
1108            process_file(base_path, &file_path, &mut output, false, None).unwrap();
1109
1110            let result = fs::read_to_string(&output_path).unwrap();
1111            assert!(result.contains(&format!("```{}", expected_lang)));
1112            assert!(result.contains(content));
1113            assert!(result.contains(filename));
1114        }
1115    }
1116}
context_builder/markdown.rs

context_builder/
markdown.rs