Skip to main content

mdwright_document/
document.rs

//! Public parsed-document handle.
//!
//! `Document` is the deep facade over `crate::ir::Ir`. Rule authors
//! only see `Document`'s accessors; the IR's representation is free
//! to change without breaking the rule API. The data types returned
//! by accessors are defined once in `crate::ir` and re-exported from
//! this crate so users importing them directly get a stable path.
8
9use pulldown_cmark::html;
10use std::ops::Range;
11
12use crate::ParseError;
13use crate::ParseOptions;
14use crate::format_facts::FormatFacts;
15use crate::gfm::apply_gfm_render_policy;
16use crate::ir::{
17    BlockCheckpointFact, CodeBlock, Frontmatter, Heading, HtmlBlock, InlineCode, InlineHtml, Ir, LinkDef, ListGroup,
18    Suppression, TextSlice,
19};
20use crate::line_index::LineIndex;
21use crate::parse;
22use crate::render::{RenderOptions, RenderProfile, render_cmark_gfm_html};
23use crate::source::ByteSpan;
24use crate::source::{CanonicalSource, Source};
25use mdwright_math::{MathError, MathRegion};
26
27/// Render Markdown to HTML using the same parser options the IR uses.
28///
29/// Kept as a public utility for callers that need the same `CommonMark`
30/// rendering policy as document recognition.
31///
32/// Inputs are routed through document-owned source canonicalisation before
33/// pulldown sees them (CM §2.1 CR / CRLF → LF, CM §2.3 NUL → U+FFFD),
34/// matching what [`Document::parse`] does. Callers that need to render
35/// raw bytes verbatim should reach for `pulldown_cmark::html` directly.
36///
37/// # Errors
38///
39/// Returns [`ParseError`] if parser execution cannot safely recognise
40/// the canonicalised source.
41pub fn render_html(source: &str) -> Result<String, ParseError> {
42    render_html_with_options(source, ParseOptions::default())
43}
44
45/// Render Markdown to HTML under explicit recognition options.
46///
47/// # Errors
48///
49/// Returns [`ParseError`] if parser execution cannot safely recognise
50/// the canonicalised source.
51pub fn render_html_with_options(source: &str, opts: ParseOptions) -> Result<String, ParseError> {
52    render_html_with_render_options(source, opts, RenderOptions::default())
53}
54
55/// Render Markdown to HTML under explicit recognition and render options.
56///
57/// # Errors
58///
59/// Returns [`ParseError`] if parser execution cannot safely recognise
60/// the canonicalised source.
61pub fn render_html_with_render_options(
62    source: &str,
63    opts: ParseOptions,
64    render: RenderOptions,
65) -> Result<String, ParseError> {
66    let src = Source::new(source);
67    let canonical = CanonicalSource::from_source(&src);
68    let events = parse::collect_events_with_offsets(canonical, parse::options(opts))?;
69    let events = apply_gfm_render_policy(canonical.as_str(), events, opts.extensions().gfm);
70    match render.profile() {
71        RenderProfile::Pulldown => {
72            let mut out = String::with_capacity(canonical.as_str().len());
73            html::push_html(&mut out, events.into_iter());
74            Ok(out)
75        }
76        RenderProfile::CmarkGfm => Ok(render_cmark_gfm_html(events)),
77    }
78}
79
80/// A parsed Markdown document.
81///
82/// Construct with [`Document::parse`] or [`Document::parse_with_options`]
83/// and query with the accessors. Linting and formatting are operations
84/// owned by their respective crates.
85///
86/// `Document` owns both the caller-supplied original bytes and the
87/// canonical view pulldown parses against
88/// (CM §2.1 line endings + CM §2.3 NUL → U+FFFD). The IR's byte
89/// ranges and semantic inventories see the canonical bytes; diagnostic
90/// renderers and safe-fix application map those spans back to the
91/// original.
92#[derive(Debug)]
93pub struct Document {
94    source: Source,
95    ir: Ir,
96    parse_options: ParseOptions,
97}
98
99impl Document {
100    /// Parse `source` into the IR.
101    ///
102    /// The library imposes **no** size cap; callers feeding untrusted
103    /// input are responsible for bounding `source.len()` themselves.
104    /// The `mdwright` CLI does this via `--max-input-bytes` (default
105    /// 10 MB).
106    ///
107    /// # Errors
108    ///
109    /// Returns [`ParseError`] if parser execution cannot safely
110    /// recognise the canonicalised source.
111    #[tracing::instrument(level = "info", name = "Document::parse", skip(source), fields(len = source.len()))]
112    pub fn parse(source: &str) -> Result<Self, ParseError> {
113        Self::parse_with_options(source, ParseOptions::default())
114    }
115
116    /// Parse `source` under explicit recognition options.
117    ///
118    /// # Errors
119    ///
120    /// Returns [`ParseError`] if parser execution cannot safely
121    /// recognise the canonicalised source under `opts`.
122    pub fn parse_with_options(source: &str, opts: ParseOptions) -> Result<Self, ParseError> {
123        let source = Source::new(source);
124        let ir = Ir::parse(&source, opts)?;
125        Ok(Self {
126            source,
127            ir,
128            parse_options: opts,
129        })
130    }
131
132    /// Recognition policy used to build this document.
133    #[must_use]
134    pub fn parse_options(&self) -> ParseOptions {
135        self.parse_options
136    }
137
138    /// The canonical source string the IR was parsed against. Equal
139    /// to the caller's input when no CM §2.1 / §2.3 canonicalisation
140    /// was needed; otherwise CRLF → LF and NUL → U+FFFD substitutions
141    /// were applied.
142    #[must_use]
143    pub fn source(&self) -> &str {
144        self.source.canonical()
145    }
146
147    /// The caller's original source bytes, before `CommonMark`
148    /// line-ending and NUL canonicalisation.
149    #[must_use]
150    pub fn original_source(&self) -> &str {
151        self.source.original()
152    }
153
154    /// Translate a canonical byte range into the caller's original
155    /// source coordinates.
156    #[must_use]
157    pub fn canonical_to_original_range(&self, range: Range<usize>) -> Range<usize> {
158        let span = ByteSpan::from_range(range);
159        self.source.to_original(span).range()
160    }
161
162    /// Byte-offset → (line, column) translator.
163    #[must_use]
164    pub fn line_index(&self) -> &LineIndex {
165        self.ir.line_index()
166    }
167
168    pub(crate) fn format_facts(&self) -> &FormatFacts {
169        &self.ir.format_facts
170    }
171
172    /// Contiguous runs of prose text, with backslash escapes
173    /// preserved. Each chunk is bounded by inline code, inline HTML,
174    /// or a soft/hard line break; never crosses a code span.
175    #[must_use]
176    pub fn prose_chunks(&self) -> &[TextSlice] {
177        &self.ir.prose_chunks
178    }
179
180    /// `CommonMark` and GFM autolinks recognised in source order.
181    #[must_use]
182    pub fn autolinks(&self) -> &[crate::AutolinkFact] {
183        &self.ir.autolinks
184    }
185
186    /// Inline code spans in source order. `text` excludes the
187    /// surrounding backticks; `raw_range` covers them.
188    #[must_use]
189    pub fn inline_codes(&self) -> &[InlineCode] {
190        &self.ir.inline_codes
191    }
192
193    /// TeX-style math regions detected in source (`\[ … \]`,
194    /// `\( … \)`, `\begin{env} … \end{env}`, optionally
195    /// `$$ … $$` / `$ … $`). Lint rules that operate on prose
196    /// (e.g., `latex-command`) consult this slice to skip
197    /// diagnostics that fire inside math content. `\alpha` is
198    /// intentional inside `\[ … \]` and a bug outside it.
199    #[must_use]
200    pub fn math_regions(&self) -> &[MathRegion] {
201        &self.ir.math_regions
202    }
203
204    /// Recogniser errors (unmatched delimiter opens, unmatched
205    /// environment `\begin`). Surfaced by the `math/unbalanced-delim`
206    /// and `math/unbalanced-env` lint rules.
207    #[must_use]
208    pub fn math_errors(&self) -> &[MathError] {
209        &self.ir.math_errors
210    }
211
212    /// Fenced and indented code blocks in source order.
213    #[must_use]
214    pub fn code_blocks(&self) -> &[CodeBlock] {
215        &self.ir.code_blocks
216    }
217
218    /// HTML blocks (`CommonMark` §4.6).
219    #[must_use]
220    pub fn html_blocks(&self) -> &[HtmlBlock] {
221        &self.ir.html_blocks
222    }
223
224    /// Inline HTML tags (open, close, self-closing, comment).
225    #[must_use]
226    pub fn inline_html(&self) -> &[InlineHtml] {
227        &self.ir.inline_html
228    }
229
230    /// ATX and setext headings with trimmed text and level.
231    #[must_use]
232    pub fn headings(&self) -> &[Heading] {
233        &self.ir.headings
234    }
235
236    /// Lists in source order. Nested lists are separate entries.
237    #[must_use]
238    pub fn list_groups(&self) -> &[ListGroup] {
239        &self.ir.list_groups
240    }
241
242    /// Each [`ListGroup`] paired with the tree-derived tightness for
243    /// the matching structural list node. Pairing is by
244    /// `raw_range.start`, which is unique across lists in source
245    /// order.
246    #[must_use]
247    pub fn list_tightness_view(&self) -> Vec<(&ListGroup, bool)> {
248        self.ir
249            .list_groups
250            .iter()
251            .filter_map(|g| {
252                self.ir
253                    .list_tightness
254                    .iter()
255                    .find(|(start, _)| *start == g.raw_range.start)
256                    .map(|(_, tight)| (g, *tight))
257            })
258            .collect()
259    }
260
261    /// Link reference definitions. Materialised on demand from the
262    /// document's internal reference table; callers that hit this in a
263    /// hot loop should cache the result.
264    /// The returned slice borrows from `self` (not from source), so the
265    /// `&str` fields have the document's borrow lifetime.
266    #[must_use]
267    pub fn link_defs(&self) -> Vec<LinkDef<'_>> {
268        self.ir
269            .refs
270            .iter()
271            .map(|t| LinkDef {
272                label: t.label_raw.as_str(),
273                dest: t.dest.as_str(),
274                title: t.title.as_deref(),
275                raw_range: t.raw_range.clone(),
276            })
277            .collect()
278    }
279
280    /// Top-level block checkpoints in canonical source coordinates.
281    #[must_use]
282    pub fn block_checkpoints(&self) -> &[BlockCheckpointFact] {
283        &self.ir.block_checkpoints
284    }
285
286    /// Source ranges for links, images, and autolinks that should be
287    /// treated as link-like regions by prose rules.
288    #[must_use]
289    pub fn link_like_ranges(&self) -> &[Range<usize>] {
290        &self.ir.link_like_ranges
291    }
292
293    /// Frontmatter at the document head, if present. Carries both the
294    /// raw slice and a tag for which delimiter (YAML `---` or TOML
295    /// `+++`) the source used.
296    #[must_use]
297    pub fn frontmatter(&self) -> Option<&Frontmatter> {
298        self.ir.frontmatter.as_ref()
299    }
300
301    /// Inline suppression directives parsed from `<!-- mdwright: … -->`
302    /// HTML comments. Returned in source order so linting and tooling
303    /// can show users where suppressions take effect.
304    #[must_use]
305    pub fn suppressions(&self) -> &[Suppression] {
306        &self.ir.suppressions
307    }
308}