mdwright_document/document.rs
1//! Public parsed-document handle.
2//!
3//! `Document` is the deep facade over `crate::ir::Ir`. Rule authors
4//! only see `Document`'s accessors; the IR's representation is free
5//! to change without breaking the rule API. The data types returned
6//! by accessors are defined once in `crate::ir` and re-exported from
7//! this crate so users importing them directly get a stable path.
8
9use pulldown_cmark::html;
10use std::ops::Range;
11
12use crate::ParseError;
13use crate::ParseOptions;
14use crate::format_facts::FormatFacts;
15use crate::gfm::apply_gfm_render_policy;
16use crate::ir::{
17 BlockCheckpointFact, CodeBlock, Frontmatter, Heading, HtmlBlock, InlineCode, InlineHtml, Ir, LinkDef, ListGroup,
18 Suppression, TextSlice,
19};
20use crate::line_index::LineIndex;
21use crate::parse;
22use crate::render::{RenderOptions, RenderProfile, render_cmark_gfm_html};
23use crate::source::ByteSpan;
24use crate::source::{CanonicalSource, Source};
25use mdwright_math::{MathError, MathRegion};
26
27/// Render Markdown to HTML using the same parser options the IR uses.
28///
29/// Kept as a public utility for callers that need the same `CommonMark`
30/// rendering policy as document recognition.
31///
32/// Inputs are routed through document-owned source canonicalisation before
33/// pulldown sees them (CM §2.1 CR / CRLF → LF, CM §2.3 NUL → U+FFFD),
34/// matching what [`Document::parse`] does. Callers that need to render
35/// raw bytes verbatim should reach for `pulldown_cmark::html` directly.
36///
37/// # Errors
38///
39/// Returns [`ParseError`] if parser execution cannot safely recognise
40/// the canonicalised source.
41pub fn render_html(source: &str) -> Result<String, ParseError> {
42 render_html_with_options(source, ParseOptions::default())
43}
44
45/// Render Markdown to HTML under explicit recognition options.
46///
47/// # Errors
48///
49/// Returns [`ParseError`] if parser execution cannot safely recognise
50/// the canonicalised source.
51pub fn render_html_with_options(source: &str, opts: ParseOptions) -> Result<String, ParseError> {
52 render_html_with_render_options(source, opts, RenderOptions::default())
53}
54
55/// Render Markdown to HTML under explicit recognition and render options.
56///
57/// # Errors
58///
59/// Returns [`ParseError`] if parser execution cannot safely recognise
60/// the canonicalised source.
61pub fn render_html_with_render_options(
62 source: &str,
63 opts: ParseOptions,
64 render: RenderOptions,
65) -> Result<String, ParseError> {
66 let src = Source::new(source);
67 let canonical = CanonicalSource::from_source(&src);
68 let events = parse::collect_events_with_offsets(canonical, parse::options(opts))?;
69 let events = apply_gfm_render_policy(canonical.as_str(), events, opts.extensions().gfm);
70 match render.profile() {
71 RenderProfile::Pulldown => {
72 let mut out = String::with_capacity(canonical.as_str().len());
73 html::push_html(&mut out, events.into_iter());
74 Ok(out)
75 }
76 RenderProfile::CmarkGfm => Ok(render_cmark_gfm_html(events)),
77 }
78}
79
/// A parsed Markdown document.
///
/// Construct with [`Document::parse`] or [`Document::parse_with_options`]
/// and query with the accessors. Linting and formatting are operations
/// owned by their respective crates.
///
/// `Document` owns both the caller-supplied original bytes and the
/// canonical view pulldown parses against
/// (CM §2.1 line endings + CM §2.3 NUL → U+FFFD). The IR's byte
/// ranges and semantic inventories see the canonical bytes; diagnostic
/// renderers and safe-fix application map those spans back to the
/// original.
#[derive(Debug)]
pub struct Document {
    // Source handle; serves both the original and the canonical text and
    // translates canonical spans back to original coordinates.
    source: Source,
    // Intermediate representation built from `source` at parse time; the
    // accessors on `Document` read their data from here.
    ir: Ir,
    // Recognition options the document was parsed with, kept so callers can
    // query them later via `parse_options()`.
    parse_options: ParseOptions,
}
98
99impl Document {
100 /// Parse `source` into the IR.
101 ///
102 /// The library imposes **no** size cap; callers feeding untrusted
103 /// input are responsible for bounding `source.len()` themselves.
104 /// The `mdwright` CLI does this via `--max-input-bytes` (default
105 /// 10 MB).
106 ///
107 /// # Errors
108 ///
109 /// Returns [`ParseError`] if parser execution cannot safely
110 /// recognise the canonicalised source.
111 #[tracing::instrument(level = "info", name = "Document::parse", skip(source), fields(len = source.len()))]
112 pub fn parse(source: &str) -> Result<Self, ParseError> {
113 Self::parse_with_options(source, ParseOptions::default())
114 }
115
116 /// Parse `source` under explicit recognition options.
117 ///
118 /// # Errors
119 ///
120 /// Returns [`ParseError`] if parser execution cannot safely
121 /// recognise the canonicalised source under `opts`.
122 pub fn parse_with_options(source: &str, opts: ParseOptions) -> Result<Self, ParseError> {
123 let source = Source::new(source);
124 let ir = Ir::parse(&source, opts)?;
125 Ok(Self {
126 source,
127 ir,
128 parse_options: opts,
129 })
130 }
131
132 /// Recognition policy used to build this document.
133 #[must_use]
134 pub fn parse_options(&self) -> ParseOptions {
135 self.parse_options
136 }
137
138 /// The canonical source string the IR was parsed against. Equal
139 /// to the caller's input when no CM §2.1 / §2.3 canonicalisation
140 /// was needed; otherwise CRLF → LF and NUL → U+FFFD substitutions
141 /// were applied.
142 #[must_use]
143 pub fn source(&self) -> &str {
144 self.source.canonical()
145 }
146
147 /// The caller's original source bytes, before `CommonMark`
148 /// line-ending and NUL canonicalisation.
149 #[must_use]
150 pub fn original_source(&self) -> &str {
151 self.source.original()
152 }
153
154 /// Translate a canonical byte range into the caller's original
155 /// source coordinates.
156 #[must_use]
157 pub fn canonical_to_original_range(&self, range: Range<usize>) -> Range<usize> {
158 let span = ByteSpan::from_range(range);
159 self.source.to_original(span).range()
160 }
161
162 /// Byte-offset → (line, column) translator.
163 #[must_use]
164 pub fn line_index(&self) -> &LineIndex {
165 self.ir.line_index()
166 }
167
168 pub(crate) fn format_facts(&self) -> &FormatFacts {
169 &self.ir.format_facts
170 }
171
172 /// Contiguous runs of prose text, with backslash escapes
173 /// preserved. Each chunk is bounded by inline code, inline HTML,
174 /// or a soft/hard line break; never crosses a code span.
175 #[must_use]
176 pub fn prose_chunks(&self) -> &[TextSlice] {
177 &self.ir.prose_chunks
178 }
179
180 /// `CommonMark` and GFM autolinks recognised in source order.
181 #[must_use]
182 pub fn autolinks(&self) -> &[crate::AutolinkFact] {
183 &self.ir.autolinks
184 }
185
186 /// Inline code spans in source order. `text` excludes the
187 /// surrounding backticks; `raw_range` covers them.
188 #[must_use]
189 pub fn inline_codes(&self) -> &[InlineCode] {
190 &self.ir.inline_codes
191 }
192
193 /// TeX-style math regions detected in source (`\[ … \]`,
194 /// `\( … \)`, `\begin{env} … \end{env}`, optionally
195 /// `$$ … $$` / `$ … $`). Lint rules that operate on prose
196 /// (e.g., `latex-command`) consult this slice to skip
197 /// diagnostics that fire inside math content. `\alpha` is
198 /// intentional inside `\[ … \]` and a bug outside it.
199 #[must_use]
200 pub fn math_regions(&self) -> &[MathRegion] {
201 &self.ir.math_regions
202 }
203
204 /// Recogniser errors (unmatched delimiter opens, unmatched
205 /// environment `\begin`). Surfaced by the `math/unbalanced-delim`
206 /// and `math/unbalanced-env` lint rules.
207 #[must_use]
208 pub fn math_errors(&self) -> &[MathError] {
209 &self.ir.math_errors
210 }
211
212 /// Fenced and indented code blocks in source order.
213 #[must_use]
214 pub fn code_blocks(&self) -> &[CodeBlock] {
215 &self.ir.code_blocks
216 }
217
218 /// HTML blocks (`CommonMark` §4.6).
219 #[must_use]
220 pub fn html_blocks(&self) -> &[HtmlBlock] {
221 &self.ir.html_blocks
222 }
223
224 /// Inline HTML tags (open, close, self-closing, comment).
225 #[must_use]
226 pub fn inline_html(&self) -> &[InlineHtml] {
227 &self.ir.inline_html
228 }
229
230 /// ATX and setext headings with trimmed text and level.
231 #[must_use]
232 pub fn headings(&self) -> &[Heading] {
233 &self.ir.headings
234 }
235
236 /// Lists in source order. Nested lists are separate entries.
237 #[must_use]
238 pub fn list_groups(&self) -> &[ListGroup] {
239 &self.ir.list_groups
240 }
241
242 /// Each [`ListGroup`] paired with the tree-derived tightness for
243 /// the matching structural list node. Pairing is by
244 /// `raw_range.start`, which is unique across lists in source
245 /// order.
246 #[must_use]
247 pub fn list_tightness_view(&self) -> Vec<(&ListGroup, bool)> {
248 self.ir
249 .list_groups
250 .iter()
251 .filter_map(|g| {
252 self.ir
253 .list_tightness
254 .iter()
255 .find(|(start, _)| *start == g.raw_range.start)
256 .map(|(_, tight)| (g, *tight))
257 })
258 .collect()
259 }
260
261 /// Link reference definitions. Materialised on demand from the
262 /// document's internal reference table; callers that hit this in a
263 /// hot loop should cache the result.
264 /// The returned slice borrows from `self` (not from source), so the
265 /// `&str` fields have the document's borrow lifetime.
266 #[must_use]
267 pub fn link_defs(&self) -> Vec<LinkDef<'_>> {
268 self.ir
269 .refs
270 .iter()
271 .map(|t| LinkDef {
272 label: t.label_raw.as_str(),
273 dest: t.dest.as_str(),
274 title: t.title.as_deref(),
275 raw_range: t.raw_range.clone(),
276 })
277 .collect()
278 }
279
280 /// Top-level block checkpoints in canonical source coordinates.
281 #[must_use]
282 pub fn block_checkpoints(&self) -> &[BlockCheckpointFact] {
283 &self.ir.block_checkpoints
284 }
285
286 /// Source ranges for links, images, and autolinks that should be
287 /// treated as link-like regions by prose rules.
288 #[must_use]
289 pub fn link_like_ranges(&self) -> &[Range<usize>] {
290 &self.ir.link_like_ranges
291 }
292
293 /// Frontmatter at the document head, if present. Carries both the
294 /// raw slice and a tag for which delimiter (YAML `---` or TOML
295 /// `+++`) the source used.
296 #[must_use]
297 pub fn frontmatter(&self) -> Option<&Frontmatter> {
298 self.ir.frontmatter.as_ref()
299 }
300
301 /// Inline suppression directives parsed from `<!-- mdwright: … -->`
302 /// HTML comments. Returned in source order so linting and tooling
303 /// can show users where suppressions take effect.
304 #[must_use]
305 pub fn suppressions(&self) -> &[Suppression] {
306 &self.ir.suppressions
307 }
308}