Skip to main content

mdwright_document/
signature.rs

1//! Markdown semantic signatures built from the document parser.
2//!
3//! This module is the document-owned parser oracle used by formatter
4//! verification. It exposes a stable, owned signature of recognised
5//! Markdown structure without exposing `pulldown-cmark` events.
6
7use pulldown_cmark::{CowStr, Event, Tag, TagEnd};
8
9use crate::gfm::apply_gfm_render_policy;
10use crate::source::{CanonicalSource, Source};
11use crate::{ParseError, ParseOptions, parse};
12
13/// A canonical Markdown event stream used for semantic comparison.
14#[derive(Debug, Clone, PartialEq, Eq)]
15pub struct MarkdownSignature {
16    events: Vec<CanonicalEvent>,
17}
18
19impl MarkdownSignature {
20    /// Return a short description of the first event divergence.
21    #[must_use]
22    pub fn first_divergence(&self, other: &Self) -> Option<String> {
23        if self == other {
24            return None;
25        }
26        for (i, (x, y)) in self.events.iter().zip(other.events.iter()).enumerate() {
27            if x != y {
28                return Some(format!(
29                    "event {i}: source = {:?}; formatted = {:?}",
30                    short(x),
31                    short(y)
32                ));
33            }
34        }
35        let (longer, label) = if self.events.len() > other.events.len() {
36            (&self.events, "source")
37        } else {
38            (&other.events, "formatted")
39        };
40        let extra = longer
41            .get(self.events.len().min(other.events.len()))
42            .map_or_else(|| "<eos>".to_owned(), |e| format!("{:?}", short(e)));
43        Some(format!(
44            "stream length differs ({} vs {}); first extra event on {label}: {extra}",
45            self.events.len(),
46            other.events.len(),
47        ))
48    }
49}
50
51#[derive(Debug, Clone, PartialEq, Eq)]
52enum CanonicalEvent {
53    Start(StartTag),
54    End(EndTag),
55    Text(String),
56    VerbatimText(String),
57    Code(String),
58    InlineMath(String),
59    DisplayMath(String),
60    Html(String),
61    InlineHtml(String),
62    FootnoteReference(String),
63    HardBreak,
64    Rule,
65    TaskListMarker(bool),
66}
67
68#[derive(Debug, Clone, PartialEq, Eq)]
69enum StartTag {
70    Paragraph,
71    Heading(u32),
72    BlockQuote,
73    CodeBlock { fenced: bool, info: String },
74    HtmlBlock,
75    List { ordered: bool, start: u64 },
76    Item,
77    FootnoteDefinition(String),
78    DefinitionList,
79    DefinitionListTitle,
80    DefinitionListDefinition,
81    Table(Vec<TableAlign>),
82    TableHead,
83    TableRow,
84    TableCell,
85    Emphasis,
86    Strong,
87    Strikethrough,
88    Superscript,
89    Subscript,
90    Link { dest: String, title: String, id: String },
91    Image { dest: String, title: String, id: String },
92    MetadataBlock,
93}
94
/// Owned counterpart of the parser's table column alignment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TableAlign {
    /// No alignment specified for the column.
    None,
    /// Column is left-aligned.
    Left,
    /// Column is centred.
    Center,
    /// Column is right-aligned.
    Right,
}
102
/// Owned description of a container-closing tag.
#[derive(Debug, Clone, PartialEq, Eq)]
enum EndTag {
    /// End of a paragraph.
    Paragraph,
    /// End of a heading; the payload is the level, 1 through 6.
    Heading(u32),
    /// End of a block quote.
    BlockQuote,
    /// End of a code block.
    CodeBlock,
    /// End of an HTML block.
    HtmlBlock,
    /// End of a list; the flag is the parser's "ordered" marker.
    List(bool),
    /// End of a list item.
    Item,
    /// End of a footnote definition.
    FootnoteDefinition,
    /// End of a definition list.
    DefinitionList,
    /// End of a definition-list term (title).
    DefinitionListTitle,
    /// End of a definition-list definition body.
    DefinitionListDefinition,
    /// End of a table.
    Table,
    /// End of a table header section.
    TableHead,
    /// End of a table row.
    TableRow,
    /// End of a table cell.
    TableCell,
    /// End of an emphasis span.
    Emphasis,
    /// End of a strong-emphasis span.
    Strong,
    /// End of a strikethrough span.
    Strikethrough,
    /// End of a superscript span.
    Superscript,
    /// End of a subscript span.
    Subscript,
    /// End of a link.
    Link,
    /// End of an image.
    Image,
    /// End of a metadata block.
    MetadataBlock,
}
129
130/// Build a semantic signature under explicit recognition policy.
131///
132/// # Errors
133///
134/// Returns [`ParseError`] if parser execution cannot safely recognise
135/// the canonicalised source.
136pub fn markdown_signature(source: &str, opts: ParseOptions) -> Result<MarkdownSignature, ParseError> {
137    let source = Source::new(source);
138    let src = CanonicalSource::from_source(&source);
139    let mut signature_events: Vec<CanonicalEvent> = Vec::new();
140    let mut code_block_depth: u32 = 0;
141    let mut pending: Option<String> = None;
142
143    let flush = |pending: &mut Option<String>, events: &mut Vec<CanonicalEvent>| {
144        if let Some(buf) = pending.take() {
145            let collapsed = collapse_whitespace(&buf);
146            if !collapsed.is_empty() {
147                events.push(CanonicalEvent::Text(collapsed));
148            }
149        }
150    };
151
152    let parser_events = apply_gfm_render_policy(
153        src.as_str(),
154        parse::collect_events_with_offsets(src, parse::options(opts))?,
155        opts.extensions().gfm,
156    );
157    for ev in parser_events {
158        match ev {
159            Event::Start(tag) => {
160                if matches!(tag, Tag::CodeBlock(_)) {
161                    code_block_depth = code_block_depth.saturating_add(1);
162                }
163                flush(&mut pending, &mut signature_events);
164                signature_events.push(CanonicalEvent::Start(canonical_start(tag)));
165            }
166            Event::End(tag) => {
167                if matches!(tag, TagEnd::CodeBlock) {
168                    code_block_depth = code_block_depth.saturating_sub(1);
169                }
170                flush(&mut pending, &mut signature_events);
171                signature_events.push(CanonicalEvent::End(canonical_end(tag)));
172            }
173            Event::Text(s) if code_block_depth > 0 => {
174                flush(&mut pending, &mut signature_events);
175                signature_events.push(CanonicalEvent::VerbatimText(s.into_string()));
176            }
177            Event::Text(s) => {
178                pending.get_or_insert_with(String::new).push_str(&s);
179            }
180            Event::SoftBreak => {
181                let buf = pending.get_or_insert_with(String::new);
182                if !buf.is_empty() && !buf.ends_with(' ') {
183                    buf.push(' ');
184                }
185            }
186            Event::HardBreak => {
187                flush(&mut pending, &mut signature_events);
188                signature_events.push(CanonicalEvent::HardBreak);
189            }
190            Event::Code(s) => {
191                flush(&mut pending, &mut signature_events);
192                signature_events.push(CanonicalEvent::Code(s.into_string()));
193            }
194            Event::InlineMath(s) => {
195                flush(&mut pending, &mut signature_events);
196                signature_events.push(CanonicalEvent::InlineMath(s.into_string()));
197            }
198            Event::DisplayMath(s) => {
199                flush(&mut pending, &mut signature_events);
200                signature_events.push(CanonicalEvent::DisplayMath(s.into_string()));
201            }
202            Event::Html(s) => {
203                flush(&mut pending, &mut signature_events);
204                signature_events.push(CanonicalEvent::Html(s.into_string()));
205            }
206            Event::InlineHtml(s) => {
207                flush(&mut pending, &mut signature_events);
208                signature_events.push(CanonicalEvent::InlineHtml(s.into_string()));
209            }
210            Event::FootnoteReference(s) => {
211                flush(&mut pending, &mut signature_events);
212                signature_events.push(CanonicalEvent::FootnoteReference(s.into_string()));
213            }
214            Event::Rule => {
215                flush(&mut pending, &mut signature_events);
216                signature_events.push(CanonicalEvent::Rule);
217            }
218            Event::TaskListMarker(b) => {
219                flush(&mut pending, &mut signature_events);
220                signature_events.push(CanonicalEvent::TaskListMarker(b));
221            }
222        }
223    }
224    flush(&mut pending, &mut signature_events);
225    Ok(MarkdownSignature {
226        events: signature_events,
227    })
228}
229
230fn cow_to_string(c: CowStr<'_>) -> String {
231    c.into_string()
232}
233
234#[allow(clippy::too_many_lines, reason = "one-to-one variant mapping")]
235fn canonical_start(tag: Tag<'_>) -> StartTag {
236    use pulldown_cmark::{Alignment, CodeBlockKind, HeadingLevel};
237    match tag {
238        Tag::Paragraph => StartTag::Paragraph,
239        Tag::Heading { level, .. } => StartTag::Heading(match level {
240            HeadingLevel::H1 => 1,
241            HeadingLevel::H2 => 2,
242            HeadingLevel::H3 => 3,
243            HeadingLevel::H4 => 4,
244            HeadingLevel::H5 => 5,
245            HeadingLevel::H6 => 6,
246        }),
247        Tag::BlockQuote(_) => StartTag::BlockQuote,
248        Tag::CodeBlock(kind) => match kind {
249            CodeBlockKind::Fenced(info) => StartTag::CodeBlock {
250                fenced: true,
251                info: info.into_string(),
252            },
253            CodeBlockKind::Indented => StartTag::CodeBlock {
254                fenced: false,
255                info: String::new(),
256            },
257        },
258        Tag::HtmlBlock => StartTag::HtmlBlock,
259        Tag::List(start) => StartTag::List {
260            ordered: start.is_some(),
261            start: start.unwrap_or(0),
262        },
263        Tag::Item => StartTag::Item,
264        Tag::FootnoteDefinition(label) => StartTag::FootnoteDefinition(label.into_string()),
265        Tag::DefinitionList => StartTag::DefinitionList,
266        Tag::DefinitionListTitle => StartTag::DefinitionListTitle,
267        Tag::DefinitionListDefinition => StartTag::DefinitionListDefinition,
268        Tag::Table(alignments) => StartTag::Table(
269            alignments
270                .into_iter()
271                .map(|a| match a {
272                    Alignment::None => TableAlign::None,
273                    Alignment::Left => TableAlign::Left,
274                    Alignment::Center => TableAlign::Center,
275                    Alignment::Right => TableAlign::Right,
276                })
277                .collect(),
278        ),
279        Tag::TableHead => StartTag::TableHead,
280        Tag::TableRow => StartTag::TableRow,
281        Tag::TableCell => StartTag::TableCell,
282        Tag::Emphasis => StartTag::Emphasis,
283        Tag::Strong => StartTag::Strong,
284        Tag::Strikethrough => StartTag::Strikethrough,
285        Tag::Superscript => StartTag::Superscript,
286        Tag::Subscript => StartTag::Subscript,
287        Tag::Link {
288            dest_url, title, id, ..
289        } => StartTag::Link {
290            dest: cow_to_string(dest_url),
291            title: cow_to_string(title),
292            id: cow_to_string(id),
293        },
294        Tag::Image {
295            dest_url, title, id, ..
296        } => StartTag::Image {
297            dest: cow_to_string(dest_url),
298            title: cow_to_string(title),
299            id: cow_to_string(id),
300        },
301        Tag::MetadataBlock(_) => StartTag::MetadataBlock,
302    }
303}
304
305fn canonical_end(tag: TagEnd) -> EndTag {
306    use pulldown_cmark::HeadingLevel;
307    match tag {
308        TagEnd::Paragraph => EndTag::Paragraph,
309        TagEnd::Heading(level) => EndTag::Heading(match level {
310            HeadingLevel::H1 => 1,
311            HeadingLevel::H2 => 2,
312            HeadingLevel::H3 => 3,
313            HeadingLevel::H4 => 4,
314            HeadingLevel::H5 => 5,
315            HeadingLevel::H6 => 6,
316        }),
317        TagEnd::BlockQuote(_) => EndTag::BlockQuote,
318        TagEnd::CodeBlock => EndTag::CodeBlock,
319        TagEnd::HtmlBlock => EndTag::HtmlBlock,
320        TagEnd::List(ordered) => EndTag::List(ordered),
321        TagEnd::Item => EndTag::Item,
322        TagEnd::FootnoteDefinition => EndTag::FootnoteDefinition,
323        TagEnd::DefinitionList => EndTag::DefinitionList,
324        TagEnd::DefinitionListTitle => EndTag::DefinitionListTitle,
325        TagEnd::DefinitionListDefinition => EndTag::DefinitionListDefinition,
326        TagEnd::Table => EndTag::Table,
327        TagEnd::TableHead => EndTag::TableHead,
328        TagEnd::TableRow => EndTag::TableRow,
329        TagEnd::TableCell => EndTag::TableCell,
330        TagEnd::Emphasis => EndTag::Emphasis,
331        TagEnd::Strong => EndTag::Strong,
332        TagEnd::Strikethrough => EndTag::Strikethrough,
333        TagEnd::Superscript => EndTag::Superscript,
334        TagEnd::Subscript => EndTag::Subscript,
335        TagEnd::Link => EndTag::Link,
336        TagEnd::Image => EndTag::Image,
337        TagEnd::MetadataBlock(_) => EndTag::MetadataBlock,
338    }
339}
340
/// Normalise whitespace in `s`.
///
/// Leading and trailing whitespace is removed and every interior run of
/// whitespace (as defined by [`char::is_whitespace`]) becomes a single
/// ASCII space. An input consisting only of whitespace yields an empty
/// string.
fn collapse_whitespace(s: &str) -> String {
    // `split_whitespace` yields the maximal non-whitespace runs, so joining
    // them with one space gives the collapsed, trimmed form.
    let words: Vec<&str> = s.split_whitespace().collect();
    words.join(" ")
}
357
358fn short(ev: &CanonicalEvent) -> CanonicalEvent {
359    const MAX: usize = 60;
360    let clip = |s: &str| {
361        if s.chars().count() <= MAX {
362            s.to_owned()
363        } else {
364            let mut t: String = s.chars().take(MAX).collect();
365            t.push_str("...");
366            t
367        }
368    };
369    match ev {
370        CanonicalEvent::Text(s) => CanonicalEvent::Text(clip(s)),
371        CanonicalEvent::VerbatimText(s) => CanonicalEvent::VerbatimText(clip(s)),
372        CanonicalEvent::Code(s) => CanonicalEvent::Code(clip(s)),
373        CanonicalEvent::Html(s) => CanonicalEvent::Html(clip(s)),
374        CanonicalEvent::InlineHtml(s) => CanonicalEvent::InlineHtml(clip(s)),
375        other @ (CanonicalEvent::Start(_)
376        | CanonicalEvent::End(_)
377        | CanonicalEvent::InlineMath(_)
378        | CanonicalEvent::DisplayMath(_)
379        | CanonicalEvent::FootnoteReference(_)
380        | CanonicalEvent::HardBreak
381        | CanonicalEvent::Rule
382        | CanonicalEvent::TaskListMarker(_)) => other.clone(),
383    }
384}