// skymark 0.1.0
//
// HTML-to-Markdown converter prioritizing proper conversion for human readability
// Documentation
#![forbid(unsafe_code)]
#![warn(missing_docs)]

//! HTML-to-Markdown conversion focused on readable Markdown output.
//!
//! The crate exposes a reusable [`HtmlToMarkdown`] converter for repeated work,
//! plus convenience functions for one-off translations.
//!
//! # Examples
//!
//! Convert a single HTML fragment with the default configuration:
//!
//! ```rust
//! use skymark::translate;
//!
//! let markdown = translate("<p>Hello, <strong>world</strong>!</p>");
//!
//! assert_eq!(markdown.trim(), "Hello, **world**!");
//! ```
//!
//! Reuse a configured converter when translating multiple documents with the
//! same options:
//!
//! ```rust
//! use skymark::{CodeBlockStyle, HtmlToMarkdown, Options};
//!
//! let mut options = Options::default();
//! options.code_block_style = CodeBlockStyle::Indented;
//! options.bullet_marker = "-".to_owned();
//!
//! let converter = HtmlToMarkdown::with_options(options);
//! let markdown = converter.translate("<ul><li>One</li><li>Two</li></ul>");
//!
//! assert_eq!(markdown.trim(), "- One\n- Two");
//! ```
//!
//! Translate many named HTML inputs and keep the results in deterministic key
//! order:
//!
//! ```rust
//! use skymark::translate_many;
//!
//! let converted = translate_many([
//!     ("guide", "<h1>Guide</h1>"),
//!     ("intro", "<p>Welcome</p>"),
//! ]);
//!
//! assert_eq!(converted["guide"].trim(), "# Guide");
//! assert_eq!(converted["intro"].trim(), "Welcome");
//! ```

mod config;
mod options;
mod parser;
mod translator;
mod utilities;
mod visitor;

use std::collections::BTreeMap;

/// Re-exported conversion [`Options`] and the [`CodeBlockStyle`] values they select.
pub use options::{CodeBlockStyle, Options};
/// Re-exported translator override types used by [`HtmlToMarkdown::with_options_and_translators`].
pub use translator::{SurroundingNewlines, TranslatorConfig};

use translator::TranslatorCollection;

/// Reusable HTML-to-Markdown converter.
///
/// Construct one instance and call [`translate`](Self::translate) repeatedly
/// when multiple documents share the same configuration.
///
/// All fields are private. Read or adjust the configuration through
/// [`options`](Self::options) and [`options_mut`](Self::options_mut).
pub struct HtmlToMarkdown {
    /// Conversion settings handed out by `options()` and `options_mut()`.
    options: Options,
    /// Translator collection exposed to the rest of the crate through
    /// `translators()`; per the constructor docs it covers normal element
    /// translation.
    translators: TranslatorCollection,
    /// Translator collection exposed to the rest of the crate through
    /// `code_block_translators()`; per the constructor docs it applies while
    /// translating code blocks.
    code_block_translators: TranslatorCollection,
}

impl HtmlToMarkdown {
    /// Builds a converter from [`Options::default`] with no translator
    /// overrides.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use skymark::HtmlToMarkdown;
    ///
    /// let converter = HtmlToMarkdown::new();
    /// let markdown = converter.translate("<p>Hello</p>");
    ///
    /// assert_eq!(markdown.trim(), "Hello");
    /// ```
    #[must_use]
    #[inline]
    pub fn new() -> Self {
        let defaults = Options::default();
        Self::with_options(defaults)
    }

    /// Builds a converter from caller-supplied [`Options`] with no translator
    /// overrides.
    ///
    /// This forwards to [`with_options_and_translators`](Self::with_options_and_translators),
    /// so the default text replacements are still added to `options` before
    /// the converter is built.
    #[must_use]
    #[inline]
    pub fn with_options(options: Options) -> Self {
        // Explicitly typed empty iterators stand in for "no overrides"; the
        // turbofish pins the key type, which nothing else would determine.
        let no_translators = std::iter::empty::<(String, TranslatorConfig)>();
        let no_code_block_translators = std::iter::empty::<(String, TranslatorConfig)>();
        Self::with_options_and_translators(options, no_translators, no_code_block_translators)
    }

    /// Builds a converter from custom options plus translator overrides.
    ///
    /// * `custom_translators` supplies overrides for normal element
    ///   translation.
    /// * `custom_code_block_translators` supplies overrides used while
    ///   translating code blocks.
    ///
    /// Every item of either iterator pairs a tag name, or a comma-separated
    /// list of tag names, with one [`TranslatorConfig`].
    #[must_use]
    #[inline]
    pub fn with_options_and_translators<K1, K2, I1, I2>(
        mut options: Options,
        custom_translators: I1,
        custom_code_block_translators: I2,
    ) -> Self
    where
        K1: Into<String>,
        K2: Into<String>,
        I1: IntoIterator<Item = (K1, TranslatorConfig)>,
        I2: IntoIterator<Item = (K2, TranslatorConfig)>,
    {
        // The default text replacements are added first so that the builder
        // receives the completed options.
        options.add_default_text_replacements();
        config::build_converter(options, custom_translators, custom_code_block_translators)
    }

    /// Borrows the options this converter currently holds.
    ///
    /// Handy for inspecting the effective configuration after construction.
    #[must_use]
    #[inline]
    pub const fn options(&self) -> &Options {
        &self.options
    }

    /// Mutably borrows the options this converter holds.
    ///
    /// Changes made through the returned reference affect translations
    /// performed afterwards by this converter. The accessor itself only hands
    /// out the reference; it does not rebuild the translator collections.
    #[must_use]
    #[inline]
    pub const fn options_mut(&mut self) -> &mut Options {
        &mut self.options
    }

    /// Renders one HTML input as Markdown.
    ///
    /// `html` may be a fragment or a whole document. It is parsed first, and
    /// the resulting node tree is rendered with this converter's current
    /// options and translator overrides.
    #[must_use]
    #[inline]
    pub fn translate(&self, html: &str) -> String {
        let parsed = parser::parse_html(html);
        visitor::get_markdown_for_html_nodes(self, &parsed)
    }

    /// Crate-internal access to the translator collection for normal elements.
    #[must_use]
    pub(crate) const fn translators(&self) -> &TranslatorCollection {
        &self.translators
    }

    /// Crate-internal access to the translator collection for code blocks.
    #[must_use]
    pub(crate) const fn code_block_translators(&self) -> &TranslatorCollection {
        &self.code_block_translators
    }
}

impl Default for HtmlToMarkdown {
    #[inline]
    fn default() -> Self {
        Self::new()
    }
}

/// Renders `html` as Markdown using the default configuration.
///
/// A fresh [`HtmlToMarkdown`] is built on every call, which suits one-off
/// translations; keep a converter around when translating repeatedly.
#[must_use]
#[inline]
pub fn translate(html: &str) -> String {
    let converter = HtmlToMarkdown::new();
    converter.translate(html)
}

/// Renders `html` as Markdown using the supplied [`Options`].
///
/// Shorthand for `HtmlToMarkdown::with_options(options).translate(html)`; the
/// converter is built for this call only.
#[must_use]
#[inline]
pub fn translate_with_options(html: &str, options: Options) -> String {
    let converter = HtmlToMarkdown::with_options(options);
    converter.translate(html)
}

/// Renders several named HTML inputs as Markdown, keyed by name.
///
/// Each item of `files` is a `(name, html)` pair. One default
/// [`HtmlToMarkdown`] is shared by all inputs. The result is a [`BTreeMap`],
/// so entries iterate in sorted name order regardless of input order, which
/// keeps output stable across runs. If a name occurs more than once, the
/// later entry replaces the earlier one.
#[must_use]
#[inline]
pub fn translate_many<'a>(
    files: impl IntoIterator<Item = (&'a str, &'a str)>,
) -> BTreeMap<String, String> {
    let converter = HtmlToMarkdown::new();
    let mut converted = BTreeMap::new();
    for (name, html) in files {
        converted.insert(name.to_owned(), converter.translate(html));
    }
    converted
}