Skip to main content

mdream/
lib.rs

1pub mod consts;
2pub(crate) mod convert;
3pub(crate) mod entities;
4pub(crate) mod helpers;
5pub mod splitter;
6pub(crate) mod tags;
7pub mod types;
8
9use convert::ConvertState;
10use types::{HTMLToMarkdownOptions, MdreamResult};
11
12/// Convert HTML to Markdown in a single pass.
13pub fn html_to_markdown(html: &str, options: HTMLToMarkdownOptions) -> String {
14    html_to_markdown_result(html, options).markdown
15}
16
17/// Convert HTML to Markdown with full results (extraction, frontmatter).
18pub fn html_to_markdown_result(html: &str, options: HTMLToMarkdownOptions) -> MdreamResult {
19    let capacity = (html.len() / 3).clamp(1024, 256 * 1024);
20    let mut state = ConvertState::new(options, capacity);
21    state.process_html(html);
22
23    let extracted = if state.has_extraction {
24        let results = std::mem::take(&mut state.extraction_results);
25        if results.is_empty() { None } else { Some(results) }
26    } else {
27        None
28    };
29
30    let frontmatter = if state.has_frontmatter {
31        let mut entries: Vec<(String, String)> = Vec::new();
32        if let Some(title) = &state.frontmatter_title {
33            entries.push(("title".to_string(), title.clone()));
34        }
35        for (k, v) in &state.frontmatter_meta {
36            entries.push((k.clone(), v.clone()));
37        }
38        if let Some(add) = state.options.plugins.as_ref()
39            .and_then(|p| p.frontmatter.as_ref())
40            .and_then(|f| f.additional_fields.as_ref()) {
41            for (k, v) in add {
42                entries.push((k.clone(), v.clone()));
43            }
44        }
45        Some(entries)
46    } else {
47        None
48    };
49
50    MdreamResult {
51        markdown: state.get_markdown(),
52        extracted,
53        frontmatter,
54    }
55}
56
/// Streaming HTML-to-Markdown converter.
///
/// Feed chunks of HTML via `process_chunk()`, then call `finish()` for remaining output.
pub struct MarkdownStreamProcessor {
    /// Conversion state that every chunk is processed through.
    state: ConvertState,
    /// Text returned by the most recent `process_html` call; it is prepended to
    /// the next chunk (or processed on its own by `finish()`).
    /// NOTE(review): presumably the unparsed tail of the input — confirm
    /// against `ConvertState::process_html`.
    buffer: String,
}
64
65impl MarkdownStreamProcessor {
66    pub fn new(options: HTMLToMarkdownOptions) -> Self {
67        Self {
68            state: ConvertState::new(options, 4096),
69            buffer: String::new(),
70        }
71    }
72
73    pub fn process_chunk(&mut self, chunk: &str) -> String {
74        if self.buffer.is_empty() {
75            self.buffer = self.state.process_html(chunk);
76        } else {
77            self.buffer.push_str(chunk);
78            let full = std::mem::take(&mut self.buffer);
79            self.buffer = self.state.process_html(&full);
80        }
81        self.state.get_markdown_chunk()
82    }
83
84    pub fn finish(&mut self) -> String {
85        if !self.buffer.is_empty() {
86            let chunk = std::mem::take(&mut self.buffer);
87            self.state.process_html(&chunk);
88        }
89        self.state.get_markdown_chunk()
90    }
91}