// src2md 0.1.8
//
// Turn source code into a Markdown document with syntax highlighting, or extract it back.
// Documentation
use crate::utils::get_language_tag;
use anyhow::{Context, Result};
use content_inspector::{ContentType, inspect};
use ignore::DirEntry;
use log::debug;
use memmap2::MmapOptions;
use std::fs::File as StdFile;
use std::path::Path;
use std::str;
use tokio::fs::File;
use tokio::io::{AsyncWriteExt, BufWriter};

/// Magic header that identifies files generated by src2md.
///
/// This is used to prevent reading our own output files during collection.
/// Format: an HTML comment carrying a version identifier (`v1`) for future
/// compatibility, terminated by a newline.
pub const OUTPUT_MAGIC_HEADER: &str = "<!-- src2md:v1 -->\n";

/// The byte sequence to check at the start of files to detect src2md output.
///
/// This is the same text as [`OUTPUT_MAGIC_HEADER`] without the trailing
/// newline. The two literals are written out separately, so they must be
/// updated together if the header format ever changes.
pub const OUTPUT_MAGIC_BYTES: &[u8] = b"<!-- src2md:v1 -->";

/// Streams source files into a single Markdown document made of fenced,
/// language-tagged code blocks.
///
/// The fence length is picked per file, so content that itself contains
/// triple-backtick fences (other Markdown files, for instance) can be
/// embedded without terminating the surrounding block early.
pub struct MarkdownWriter<W>
where
    W: AsyncWriteExt + Unpin,
{
    // Buffered sink that receives the generated Markdown.
    writer: BufWriter<W>,
    // Set once the magic header has been emitted, so it is written only once.
    header_written: bool,
}

impl MarkdownWriter<tokio::fs::File> {
    /// Creates a new `MarkdownWriter` wrapping the given buffered writer.
    ///
    /// Nothing is written at construction time; the magic header is emitted
    /// lazily by the first call to [`write_entry`](Self::write_entry).
    pub fn new(writer: BufWriter<File>) -> Self {
        Self {
            writer,
            header_written: false,
        }
    }

    /// Writes the magic header that identifies this as a src2md output file.
    ///
    /// The header (followed by one blank line) is written at most once per
    /// writer; subsequent calls are no-ops. This header allows us to skip
    /// reading our own output during collection.
    ///
    /// # Errors
    ///
    /// Returns an error if writing to the underlying writer fails.
    async fn ensure_header_written(&mut self) -> Result<()> {
        if self.header_written {
            return Ok(());
        }

        self.writer
            .write_all(OUTPUT_MAGIC_HEADER.as_bytes())
            .await
            .context("Failed to write output header")?;
        self.writer
            .write_all(b"\n")
            .await
            .context("Failed to write newline after header")?;

        // Only mark the header as done once both writes have succeeded.
        self.header_written = true;
        Ok(())
    }

    /// Writes a single file entry to the Markdown output.
    ///
    /// The entry is emitted as a `## <relative path>` heading followed by:
    ///
    /// - for text files, a fenced code block tagged with the language
    ///   returned by `get_language_tag`;
    /// - for binary files, the marker line "(binary file omitted)".
    ///
    /// The fence length is computed from the content so that files which
    /// contain backtick fences of their own are embedded safely. Text that is
    /// not valid UTF-8 is written with invalid sequences replaced by U+FFFD
    /// instead of failing the whole entry.
    ///
    /// `project_root` is stripped from the entry's path for display; if the
    /// path is not under `project_root`, the full path is used.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened or memory-mapped, or if
    /// any write to the output fails.
    pub async fn write_entry(&mut self, entry: &DirEntry, project_root: &Path) -> Result<()> {
        // Ensure the magic header is written first
        self.ensure_header_written().await?;

        let path = entry.path();
        let rel_path = path.strip_prefix(project_root).unwrap_or(path);

        debug!("Processing: {}", rel_path.display());

        // Write the file header
        self.writer
            .write_all(format!("## {}\n\n", rel_path.display()).as_bytes())
            .await
            .with_context(|| format!("Failed to write heading for {}", rel_path.display()))?;

        // Memory-map the file for efficient reading
        let file = StdFile::open(path)
            .with_context(|| format!("Failed to open file: {}", path.display()))?;

        // SAFETY: The mapping is only ever read through shared slices and is
        // dropped before this function returns.
        // NOTE(review): soundness of a file-backed mapping also relies on no
        // other process truncating or rewriting the file while it is mapped;
        // that cannot be enforced from here and is an accepted risk.
        let mmap = unsafe {
            MmapOptions::new()
                .map(&file)
                .with_context(|| format!("Failed to mmap file: {}", path.display()))?
        };

        // Inspect a bounded prefix of the file to detect binary content
        let sample_size = std::cmp::min(8192, mmap.len());
        let content_type = inspect(&mmap[..sample_size]);

        if content_type == ContentType::BINARY {
            self.writer
                .write_all(b"(binary file omitted)\n\n")
                .await
                .with_context(|| {
                    format!("Failed to write binary marker for {}", rel_path.display())
                })?;
            return Ok(());
        }

        let lang = get_language_tag(path);

        // Borrow the text straight from the mapping when it is valid UTF-8
        // (no copy). Re-reading the file with `read_to_string` would fail on
        // exactly the same bytes, so the fallback for non-UTF-8 text is a
        // lossy conversion that substitutes U+FFFD for invalid sequences.
        let lossy;
        let text: &str = match str::from_utf8(&mmap) {
            Ok(valid) => valid,
            Err(err) => {
                debug!(
                    "{} is not valid UTF-8 ({}); replacing invalid sequences",
                    rel_path.display(),
                    err
                );
                lossy = String::from_utf8_lossy(&mmap);
                &lossy
            }
        };

        // Calculate the minimum fence length needed to safely wrap this content
        let fence = calculate_fence(text);

        // Write opening fence with language tag
        self.writer
            .write_all(format!("{}{}\n", fence, lang).as_bytes())
            .await
            .with_context(|| format!("Failed to write opening fence for {}", rel_path.display()))?;

        // Write file content
        self.writer
            .write_all(text.as_bytes())
            .await
            .with_context(|| format!("Failed to write content for {}", rel_path.display()))?;

        // Write closing fence; the leading newline guarantees the fence starts
        // on its own line even when the content has no trailing newline.
        self.writer
            .write_all(format!("\n{}\n\n", fence).as_bytes())
            .await
            .with_context(|| format!("Failed to write closing fence for {}", rel_path.display()))?;

        debug!("Completed: {}", rel_path.display());
        Ok(())
    }

    /// Flushes any buffered data to the underlying writer.
    ///
    /// # Errors
    ///
    /// Returns an error if the flush fails.
    pub async fn flush(&mut self) -> Result<()> {
        self.writer.flush().await.context("Failed to flush output")
    }
}

/// Builds the backtick fence used to wrap `content` in a Markdown code block.
///
/// Every line is examined after stripping leading whitespace, and the longest
/// run of backticks found at the start of any line is recorded. The returned
/// fence is one backtick longer than that run and never shorter than three
/// backticks. Backticks that appear later in a line are ignored.
fn calculate_fence(content: &str) -> String {
    // Seeding the fold with 2 enforces the three-backtick minimum after the
    // final `+ 1`; lines that do not open with a backtick contribute 0.
    let longest_leading_run = content
        .lines()
        .map(|line| {
            line.trim_start()
                .bytes()
                .take_while(|&byte| byte == b'`')
                .count()
        })
        .fold(2, usize::max);

    "`".repeat(longest_leading_run + 1)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_calculate_fence_no_backticks() {
        // Content without any backticks gets the minimum fence.
        let fence = calculate_fence("fn main() {\n    println!(\"Hello\");\n}");
        assert_eq!(fence, "```");
    }

    #[test]
    fn test_calculate_fence_with_triple_backticks() {
        // An embedded triple-backtick fence forces a four-backtick wrapper.
        let fence = calculate_fence("# Example\n\n```rust\nfn main() {}\n```");
        assert_eq!(fence, "````");
    }

    #[test]
    fn test_calculate_fence_with_quad_backticks() {
        // The wrapper is always one backtick longer than the longest run found.
        let fence = calculate_fence("````\nsome code\n````");
        assert_eq!(fence, "`````");
    }

    #[test]
    fn test_calculate_fence_indented_backticks() {
        // Leading whitespace is ignored when looking for fences in the content.
        let fence = calculate_fence("    ```rust\n    fn main() {}\n    ```");
        assert_eq!(fence, "````");
    }

    #[test]
    fn test_calculate_fence_single_backticks() {
        // A single leading backtick (inline code) stays below the 3-backtick minimum.
        let fence = calculate_fence("`inline code` at start of line\nmore text");
        assert_eq!(fence, "```");
    }

    #[test]
    fn test_calculate_fence_double_backticks() {
        // A double leading backtick also stays below the 3-backtick minimum.
        let fence = calculate_fence("``double backtick`` code");
        assert_eq!(fence, "```");
    }

    #[test]
    fn test_magic_header_format() {
        // The header must open like an HTML comment and name the tool and version.
        assert!(OUTPUT_MAGIC_HEADER.starts_with("<!--"));
        for needle in ["src2md", "v1"] {
            assert!(OUTPUT_MAGIC_HEADER.contains(needle));
        }
    }
}