1use pulldown_cmark::{CowStr, Event, Tag, TagEnd};
8
9use crate::gfm::apply_gfm_render_policy;
10use crate::source::{CanonicalSource, Source};
11use crate::{ParseError, ParseOptions, parse};
12
13#[derive(Debug, Clone, PartialEq, Eq)]
15pub struct MarkdownSignature {
16 events: Vec<CanonicalEvent>,
17}
18
19impl MarkdownSignature {
20 #[must_use]
22 pub fn first_divergence(&self, other: &Self) -> Option<String> {
23 if self == other {
24 return None;
25 }
26 for (i, (x, y)) in self.events.iter().zip(other.events.iter()).enumerate() {
27 if x != y {
28 return Some(format!(
29 "event {i}: source = {:?}; formatted = {:?}",
30 short(x),
31 short(y)
32 ));
33 }
34 }
35 let (longer, label) = if self.events.len() > other.events.len() {
36 (&self.events, "source")
37 } else {
38 (&other.events, "formatted")
39 };
40 let extra = longer
41 .get(self.events.len().min(other.events.len()))
42 .map_or_else(|| "<eos>".to_owned(), |e| format!("{:?}", short(e)));
43 Some(format!(
44 "stream length differs ({} vs {}); first extra event on {label}: {extra}",
45 self.events.len(),
46 other.events.len(),
47 ))
48 }
49}
50
51#[derive(Debug, Clone, PartialEq, Eq)]
52enum CanonicalEvent {
53 Start(StartTag),
54 End(EndTag),
55 Text(String),
56 VerbatimText(String),
57 Code(String),
58 InlineMath(String),
59 DisplayMath(String),
60 Html(String),
61 InlineHtml(String),
62 FootnoteReference(String),
63 HardBreak,
64 Rule,
65 TaskListMarker(bool),
66}
67
68#[derive(Debug, Clone, PartialEq, Eq)]
69enum StartTag {
70 Paragraph,
71 Heading(u32),
72 BlockQuote,
73 CodeBlock { fenced: bool, info: String },
74 HtmlBlock,
75 List { ordered: bool, start: u64 },
76 Item,
77 FootnoteDefinition(String),
78 DefinitionList,
79 DefinitionListTitle,
80 DefinitionListDefinition,
81 Table(Vec<TableAlign>),
82 TableHead,
83 TableRow,
84 TableCell,
85 Emphasis,
86 Strong,
87 Strikethrough,
88 Superscript,
89 Subscript,
90 Link { dest: String, title: String, id: String },
91 Image { dest: String, title: String, id: String },
92 MetadataBlock,
93}
94
/// Column alignment of a table, mirroring `pulldown_cmark::Alignment`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum TableAlign {
    /// No alignment specified for the column.
    None,
    /// Left-aligned column.
    Left,
    /// Centered column.
    Center,
    /// Right-aligned column.
    Right,
}
102
/// Owned, simplified form of a `pulldown_cmark::TagEnd`.
#[derive(Clone, Debug, Eq, PartialEq)]
enum EndTag {
    Paragraph,
    /// End of a heading with its numeric level (1 through 6).
    Heading(u32),
    BlockQuote,
    CodeBlock,
    HtmlBlock,
    /// End of a list; carries the flag reported by `TagEnd::List`.
    List(bool),
    Item,
    FootnoteDefinition,
    DefinitionList,
    DefinitionListTitle,
    DefinitionListDefinition,
    Table,
    TableHead,
    TableRow,
    TableCell,
    Emphasis,
    Strong,
    Strikethrough,
    Superscript,
    Subscript,
    Link,
    Image,
    MetadataBlock,
}
129
130pub fn markdown_signature(source: &str, opts: ParseOptions) -> Result<MarkdownSignature, ParseError> {
137 let source = Source::new(source);
138 let src = CanonicalSource::from_source(&source);
139 let mut signature_events: Vec<CanonicalEvent> = Vec::new();
140 let mut code_block_depth: u32 = 0;
141 let mut pending: Option<String> = None;
142
143 let flush = |pending: &mut Option<String>, events: &mut Vec<CanonicalEvent>| {
144 if let Some(buf) = pending.take() {
145 let collapsed = collapse_whitespace(&buf);
146 if !collapsed.is_empty() {
147 events.push(CanonicalEvent::Text(collapsed));
148 }
149 }
150 };
151
152 let parser_events = apply_gfm_render_policy(
153 src.as_str(),
154 parse::collect_events_with_offsets(src, parse::options(opts))?,
155 opts.extensions().gfm,
156 );
157 for ev in parser_events {
158 match ev {
159 Event::Start(tag) => {
160 if matches!(tag, Tag::CodeBlock(_)) {
161 code_block_depth = code_block_depth.saturating_add(1);
162 }
163 flush(&mut pending, &mut signature_events);
164 signature_events.push(CanonicalEvent::Start(canonical_start(tag)));
165 }
166 Event::End(tag) => {
167 if matches!(tag, TagEnd::CodeBlock) {
168 code_block_depth = code_block_depth.saturating_sub(1);
169 }
170 flush(&mut pending, &mut signature_events);
171 signature_events.push(CanonicalEvent::End(canonical_end(tag)));
172 }
173 Event::Text(s) if code_block_depth > 0 => {
174 flush(&mut pending, &mut signature_events);
175 signature_events.push(CanonicalEvent::VerbatimText(s.into_string()));
176 }
177 Event::Text(s) => {
178 pending.get_or_insert_with(String::new).push_str(&s);
179 }
180 Event::SoftBreak => {
181 let buf = pending.get_or_insert_with(String::new);
182 if !buf.is_empty() && !buf.ends_with(' ') {
183 buf.push(' ');
184 }
185 }
186 Event::HardBreak => {
187 flush(&mut pending, &mut signature_events);
188 signature_events.push(CanonicalEvent::HardBreak);
189 }
190 Event::Code(s) => {
191 flush(&mut pending, &mut signature_events);
192 signature_events.push(CanonicalEvent::Code(s.into_string()));
193 }
194 Event::InlineMath(s) => {
195 flush(&mut pending, &mut signature_events);
196 signature_events.push(CanonicalEvent::InlineMath(s.into_string()));
197 }
198 Event::DisplayMath(s) => {
199 flush(&mut pending, &mut signature_events);
200 signature_events.push(CanonicalEvent::DisplayMath(s.into_string()));
201 }
202 Event::Html(s) => {
203 flush(&mut pending, &mut signature_events);
204 signature_events.push(CanonicalEvent::Html(s.into_string()));
205 }
206 Event::InlineHtml(s) => {
207 flush(&mut pending, &mut signature_events);
208 signature_events.push(CanonicalEvent::InlineHtml(s.into_string()));
209 }
210 Event::FootnoteReference(s) => {
211 flush(&mut pending, &mut signature_events);
212 signature_events.push(CanonicalEvent::FootnoteReference(s.into_string()));
213 }
214 Event::Rule => {
215 flush(&mut pending, &mut signature_events);
216 signature_events.push(CanonicalEvent::Rule);
217 }
218 Event::TaskListMarker(b) => {
219 flush(&mut pending, &mut signature_events);
220 signature_events.push(CanonicalEvent::TaskListMarker(b));
221 }
222 }
223 }
224 flush(&mut pending, &mut signature_events);
225 Ok(MarkdownSignature {
226 events: signature_events,
227 })
228}
229
230fn cow_to_string(c: CowStr<'_>) -> String {
231 c.into_string()
232}
233
234#[allow(clippy::too_many_lines, reason = "one-to-one variant mapping")]
235fn canonical_start(tag: Tag<'_>) -> StartTag {
236 use pulldown_cmark::{Alignment, CodeBlockKind, HeadingLevel};
237 match tag {
238 Tag::Paragraph => StartTag::Paragraph,
239 Tag::Heading { level, .. } => StartTag::Heading(match level {
240 HeadingLevel::H1 => 1,
241 HeadingLevel::H2 => 2,
242 HeadingLevel::H3 => 3,
243 HeadingLevel::H4 => 4,
244 HeadingLevel::H5 => 5,
245 HeadingLevel::H6 => 6,
246 }),
247 Tag::BlockQuote(_) => StartTag::BlockQuote,
248 Tag::CodeBlock(kind) => match kind {
249 CodeBlockKind::Fenced(info) => StartTag::CodeBlock {
250 fenced: true,
251 info: info.into_string(),
252 },
253 CodeBlockKind::Indented => StartTag::CodeBlock {
254 fenced: false,
255 info: String::new(),
256 },
257 },
258 Tag::HtmlBlock => StartTag::HtmlBlock,
259 Tag::List(start) => StartTag::List {
260 ordered: start.is_some(),
261 start: start.unwrap_or(0),
262 },
263 Tag::Item => StartTag::Item,
264 Tag::FootnoteDefinition(label) => StartTag::FootnoteDefinition(label.into_string()),
265 Tag::DefinitionList => StartTag::DefinitionList,
266 Tag::DefinitionListTitle => StartTag::DefinitionListTitle,
267 Tag::DefinitionListDefinition => StartTag::DefinitionListDefinition,
268 Tag::Table(alignments) => StartTag::Table(
269 alignments
270 .into_iter()
271 .map(|a| match a {
272 Alignment::None => TableAlign::None,
273 Alignment::Left => TableAlign::Left,
274 Alignment::Center => TableAlign::Center,
275 Alignment::Right => TableAlign::Right,
276 })
277 .collect(),
278 ),
279 Tag::TableHead => StartTag::TableHead,
280 Tag::TableRow => StartTag::TableRow,
281 Tag::TableCell => StartTag::TableCell,
282 Tag::Emphasis => StartTag::Emphasis,
283 Tag::Strong => StartTag::Strong,
284 Tag::Strikethrough => StartTag::Strikethrough,
285 Tag::Superscript => StartTag::Superscript,
286 Tag::Subscript => StartTag::Subscript,
287 Tag::Link {
288 dest_url, title, id, ..
289 } => StartTag::Link {
290 dest: cow_to_string(dest_url),
291 title: cow_to_string(title),
292 id: cow_to_string(id),
293 },
294 Tag::Image {
295 dest_url, title, id, ..
296 } => StartTag::Image {
297 dest: cow_to_string(dest_url),
298 title: cow_to_string(title),
299 id: cow_to_string(id),
300 },
301 Tag::MetadataBlock(_) => StartTag::MetadataBlock,
302 }
303}
304
305fn canonical_end(tag: TagEnd) -> EndTag {
306 use pulldown_cmark::HeadingLevel;
307 match tag {
308 TagEnd::Paragraph => EndTag::Paragraph,
309 TagEnd::Heading(level) => EndTag::Heading(match level {
310 HeadingLevel::H1 => 1,
311 HeadingLevel::H2 => 2,
312 HeadingLevel::H3 => 3,
313 HeadingLevel::H4 => 4,
314 HeadingLevel::H5 => 5,
315 HeadingLevel::H6 => 6,
316 }),
317 TagEnd::BlockQuote(_) => EndTag::BlockQuote,
318 TagEnd::CodeBlock => EndTag::CodeBlock,
319 TagEnd::HtmlBlock => EndTag::HtmlBlock,
320 TagEnd::List(ordered) => EndTag::List(ordered),
321 TagEnd::Item => EndTag::Item,
322 TagEnd::FootnoteDefinition => EndTag::FootnoteDefinition,
323 TagEnd::DefinitionList => EndTag::DefinitionList,
324 TagEnd::DefinitionListTitle => EndTag::DefinitionListTitle,
325 TagEnd::DefinitionListDefinition => EndTag::DefinitionListDefinition,
326 TagEnd::Table => EndTag::Table,
327 TagEnd::TableHead => EndTag::TableHead,
328 TagEnd::TableRow => EndTag::TableRow,
329 TagEnd::TableCell => EndTag::TableCell,
330 TagEnd::Emphasis => EndTag::Emphasis,
331 TagEnd::Strong => EndTag::Strong,
332 TagEnd::Strikethrough => EndTag::Strikethrough,
333 TagEnd::Superscript => EndTag::Superscript,
334 TagEnd::Subscript => EndTag::Subscript,
335 TagEnd::Link => EndTag::Link,
336 TagEnd::Image => EndTag::Image,
337 TagEnd::MetadataBlock(_) => EndTag::MetadataBlock,
338 }
339}
340
/// Normalizes whitespace in `s`.
///
/// Leading and trailing whitespace is removed and every interior run of
/// whitespace characters (as defined by `char::is_whitespace`) is replaced by a
/// single ASCII space. An input consisting only of whitespace yields an empty
/// string.
fn collapse_whitespace(s: &str) -> String {
    let mut words = s.split_whitespace();
    let mut collapsed = String::with_capacity(s.len());
    if let Some(first) = words.next() {
        collapsed.push_str(first);
        for word in words {
            collapsed.push(' ');
            collapsed.push_str(word);
        }
    }
    collapsed
}
357
358fn short(ev: &CanonicalEvent) -> CanonicalEvent {
359 const MAX: usize = 60;
360 let clip = |s: &str| {
361 if s.chars().count() <= MAX {
362 s.to_owned()
363 } else {
364 let mut t: String = s.chars().take(MAX).collect();
365 t.push_str("...");
366 t
367 }
368 };
369 match ev {
370 CanonicalEvent::Text(s) => CanonicalEvent::Text(clip(s)),
371 CanonicalEvent::VerbatimText(s) => CanonicalEvent::VerbatimText(clip(s)),
372 CanonicalEvent::Code(s) => CanonicalEvent::Code(clip(s)),
373 CanonicalEvent::Html(s) => CanonicalEvent::Html(clip(s)),
374 CanonicalEvent::InlineHtml(s) => CanonicalEvent::InlineHtml(clip(s)),
375 other @ (CanonicalEvent::Start(_)
376 | CanonicalEvent::End(_)
377 | CanonicalEvent::InlineMath(_)
378 | CanonicalEvent::DisplayMath(_)
379 | CanonicalEvent::FootnoteReference(_)
380 | CanonicalEvent::HardBreak
381 | CanonicalEvent::Rule
382 | CanonicalEvent::TaskListMarker(_)) => other.clone(),
383 }
384}