markdownify 0.3.1

markitdown in Rust - convert various documents and files into Markdown
Documentation
use crate::{MarkdownifyInput, error::ParsingError};
use std::{
    collections::BTreeMap,
    io::{Cursor, Read},
    path::Path,
};
use tar::Archive;
use zip::ZipArchive;

/// Converts every file stored in a ZIP archive and renders the combined result.
///
/// Directory entries and entries rejected by `should_skip_file` are ignored.
/// Each remaining entry is read fully into memory, converted through
/// `MarkdownifyInput` (with its extension taken from the entry name,
/// lower-cased), and collected into a `FileTree` that produces the final text.
///
/// `allow_inline_images` is forwarded unchanged to every per-entry conversion.
///
/// # Errors
///
/// Returns `ParsingError::ArchiveError` when the archive cannot be opened or an
/// entry cannot be located or read. Errors from building or converting an
/// individual entry are propagated and abort the whole archive.
pub fn parse_zip(
    content: impl AsRef<[u8]>,
    allow_inline_images: bool,
) -> Result<String, ParsingError> {
    // One place that wraps archive and I/O failures into the crate error type.
    fn archive_err<E: std::fmt::Display>(err: E) -> ParsingError {
        ParsingError::ArchiveError(err.to_string())
    }

    let mut zip = ZipArchive::new(Cursor::new(content)).map_err(archive_err)?;
    let mut tree = FileTree::default();

    for index in 0..zip.len() {
        let mut file = zip.by_index(index).map_err(archive_err)?;

        // Directories carry no content; skipped names are filtered before any read.
        if file.is_dir() || should_skip_file(file.name()) {
            continue;
        }
        let name = file.name().to_string();

        let mut bytes = Vec::new();
        file.read_to_end(&mut bytes).map_err(archive_err)?;

        let mut input = MarkdownifyInput::from_bytes(bytes, name.clone())?;
        input.allow_inline_images(allow_inline_images);
        // The extension is derived from the entry name and normalised to lower case.
        input.ext = Path::new(&name)
            .extension()
            .and_then(|ext| ext.to_str())
            .map(str::to_lowercase);

        let markdown = input.convert()?;
        tree.add_file(name, markdown);
    }

    tree.render()
}

pub fn parse_tar(
    content: impl AsRef<[u8]>,
    allow_inline_images: bool,
) -> Result<String, ParsingError> {
    let mut archive = Archive::new(Cursor::new(content));
    let mut tree = FileTree::default();

    for entry in archive
        .entries()
        .map_err(|e| ParsingError::ArchiveError(e.to_string()))?
    {
        let mut entry = entry.map_err(|e| ParsingError::ArchiveError(e.to_string()))?;
        if !entry.header().entry_type().is_file() {
            continue;
        }

        let name = entry
            .path()
            .map_err(|e| ParsingError::ArchiveError(e.to_string()))?
            .to_string_lossy()
            .to_string();

        let mut contents = Vec::new();
        entry
            .read_to_end(&mut contents)
            .map_err(|e| ParsingError::ArchiveError(e.to_string()))?;

        let mut input = MarkdownifyInput::from_bytes(contents, name.clone())?;
        input.allow_inline_images(allow_inline_images);
        input.ext = Path::new(&name)
            .extension()
            .and_then(|e| e.to_str())
            .map(|e| e.to_lowercase());
        let text = input.convert()?;
        tree.add_file(name, text);
    }

    tree.render()
}

/// Reports whether an archive entry name should be left out of the output.
///
/// A name is skipped when it starts with `__MACOSX/`, contains the sequence
/// `/._` anywhere, or has a final path component that starts with `._`.
fn should_skip_file(name: &str) -> bool {
    if name.starts_with("__MACOSX/") || name.contains("/._") {
        return true;
    }

    // Catches top-level `._*` names, which have no preceding `/`.
    match Path::new(name).file_name() {
        Some(base) => base.to_string_lossy().starts_with("._"),
        None => false,
    }
}

/// Accumulates converted files keyed by their path inside an archive and
/// renders them as one document: a tree listing followed by each file's
/// content.
#[derive(Default)]
pub struct FileTree {
    /// Maps each `/`-separated file path to its converted content. A
    /// `BTreeMap` keeps the paths sorted, so rendering order is deterministic.
    files: BTreeMap<String, String>,
}

impl FileTree {
    pub fn add_file(&mut self, path: String, content: String) {
        self.files.insert(path, content);
    }

    pub fn render(self) -> Result<String, ParsingError> {
        let mut output = String::new();
        output.push_str("```file-tree\n");
        self.render_tree(&mut output);
        output.push_str("```\n\n");

        for (name, content) in self.files {
            output.push_str("# File: ");
            output.push_str(&name);
            output.push_str("\n\n");
            output.push_str(&content);
            output.push_str("\n\n");
        }
        Ok(output)
    }

    fn render_tree(&self, output: &mut String) {
        let mut root: BTreeMap<String, Node> = BTreeMap::new();

        for path in self.files.keys() {
            let parts: Vec<&str> = path.split('/').collect();
            let mut current = &mut root;

            for (i, &part) in parts.iter().enumerate() {
                let is_file = i == parts.len() - 1;
                current = &mut current
                    .entry(part.to_string())
                    .or_insert(Node {
                        children: BTreeMap::new(),
                        is_file,
                    })
                    .children;
            }
        }

        render_tree(&root, output, String::new());
    }
}

/// One entry (file or directory) in the tree built from archive paths.
struct Node {
    /// Entries nested directly below this one, keyed by path segment.
    children: BTreeMap<String, Node>,
    /// `true` for a file; `false` for a directory, which is rendered with a
    /// trailing `/`.
    is_file: bool,
}

/// Appends a box-drawing tree listing of `nodes` to `output`.
///
/// Entries are emitted in the map's sorted key order, one per line. Every line
/// starts with `prefix`, followed by `├── ` (or `└── ` for the last entry at
/// that level) and the entry name. Children are rendered recursively with an
/// extended prefix: four spaces below a last entry, and `│   ` below any other
/// entry so the vertical guide keeps connecting to the siblings that follow.
fn render_tree(nodes: &BTreeMap<String, Node>, output: &mut String, prefix: String) {
    // `saturating_sub` keeps this well-defined for an empty map, where the
    // loop body never runs anyway.
    let last_idx = nodes.len().saturating_sub(1);

    for (idx, (name, node)) in nodes.iter().enumerate() {
        let is_last = idx == last_idx;
        let connector = if is_last { "└── " } else { "├── " };

        output.push_str(&prefix);
        output.push_str(connector);
        output.push_str(name);
        if !node.is_file {
            output.push('/');
        }
        output.push('\n');

        if !node.children.is_empty() {
            // Both extensions are four columns wide so nested levels line up;
            // a non-last entry needs the vertical bar, otherwise its children
            // would appear at the same depth as the entry itself.
            let extension = if is_last { "    " } else { "│   " };
            render_tree(&node.children, output, format!("{prefix}{extension}"));
        }
    }
}