doc_chunks/
markdown.rs

1//! Erase cmark syntax
2//!
3//! Resulting overlay is plain and can be fed into a grammar or spell checker.
4
5use super::*;
6
7use indexmap::IndexMap;
8
9use pulldown_cmark::{Event, LinkType, Options, Parser, Tag, TagEnd};
10
11use crate::util::sub_chars;
12use crate::Span;
13use crate::{CheckableChunk, Range};
14
/// Describes whether there is a matching segment in the source, or if it is a
/// placeholder for e.g. a code block or inline code. These placeholders are
/// required for grammar checks.
#[derive(Debug, Clone)]
pub enum SourceRange {
    /// The plain text fragment is taken verbatim from this range of the source.
    Direct(Range),
    /// The source range is represented in the plain text by the given alias
    /// string instead of its original content.
    Alias(Range, String),
}
23
24impl SourceRange {
25    /// Apply an offset to `start` and `end` members, equaling a shift of the
26    /// range.
27    #[allow(dead_code)]
28    pub(crate) fn apply_offset(&mut self, offset: usize) {
29        match self {
30            Self::Direct(range) => apply_offset(range, offset),
31            Self::Alias(range, _) => apply_offset(range, offset),
32        }
33    }
34
35    /// Extract a clone of the inner `Range<usize>`.
36    ///
37    /// Use `deref()` or `*` for a reference.
38    pub fn range(&self) -> Range {
39        match self {
40            Self::Direct(range) => range.clone(),
41            Self::Alias(range, _) => range.clone(),
42        }
43    }
44}
45
46impl std::ops::Deref for SourceRange {
47    type Target = Range;
48    fn deref(&self) -> &Self::Target {
49        match self {
50            Self::Direct(range) => range,
51            Self::Alias(range, _) => range,
52        }
53    }
54}
55
56pub(crate) fn is_html_tag_on_no_scope_list(text: &str) -> bool {
57    use regex::RegexSet;
58    lazy_static::lazy_static! {
59        static ref HTML_TAG_EMPTY_OR_SPECIAL_CASE: RegexSet = RegexSet::new([
60            r####"^<\s*[A-Za-z0-9]+(?:\s+.*)*\s*/>$"####, // any self closing empty
61            r####"^<\s*br\s*>$"####,
62            r####"^</?\s*(?:i|b|span|font|color|style)\s*/?>$"####,
63            r####"^<\s*pre\s*>.*</\s*pre\s*>\s?$"####,
64        ]).unwrap();
65    };
66    HTML_TAG_EMPTY_OR_SPECIAL_CASE.is_match(text)
67}
68
/// Opening and closing `<code>` tags are scoped, a self closing `<code />` and
/// a complete `<pre>` element are not.
#[test]
fn scoped() {
    assert!(!is_html_tag_on_no_scope_list("<code>"));
    assert!(!is_html_tag_on_no_scope_list("</code>"));
    assert!(is_html_tag_on_no_scope_list("<code />"));
    assert!(is_html_tag_on_no_scope_list("<pre>🌡</pre>\n"));
}
76
/// A plain representation of cmark riddled chunk.
#[derive(Clone)]
pub struct PlainOverlay<'a> {
    /// A reference to the underlying [`CheckableChunk`][super::chunk].
    raw: &'a CheckableChunk,
    /// The rendered string with all common mark annotations removed.
    plain: String,
    /// Maps fragments of `plain` back to their origin.
    ///
    /// Key: the range of the fragment within the plain string.
    /// Value: the corresponding area in the full cmark.
    ///
    /// A sorted map is required, so we have the chance of binary search.
    mapping: IndexMap<Range, SourceRange>,
}
89
90impl<'a> PlainOverlay<'a> {
91    /// Track the origin of the annotation free content string fragments in the
92    /// common mark formatted text, to the fragments in the plain string.
93    fn track(
94        s: &str,
95        cmark_range: SourceRange,
96        plain_acc: &mut String,
97        mapping: &mut IndexMap<Range, SourceRange>,
98    ) {
99        // map the range within the plain data,
100        // which is fed to the checker,
101        // back to the repr with markdown modifiers
102
103        // avoid repeated calculation of this
104        let cursor = plain_acc.chars().count();
105        let plain_range = match &cmark_range {
106            SourceRange::Alias(_range, alias) => {
107                if alias.is_empty() {
108                    log::debug!("Alias for {s:?} was empty. Ignoring.");
109                    return;
110                }
111                // limit the lias names to 16 chars, all ascii
112                // and as such byte length equals char length
113                let alias16 = &alias[..std::cmp::min(alias.len(), 16)];
114                plain_acc.push_str(alias16);
115                Range {
116                    start: cursor,
117                    end: cursor + alias16.len(),
118                }
119            }
120            SourceRange::Direct(_range) => {
121                plain_acc.push_str(s);
122                Range {
123                    start: cursor,
124                    end: cursor + s.chars().count(),
125                }
126            }
127        };
128        let _ = mapping.insert(plain_range, cmark_range);
129    }
130
131    /// Append n newlines to the current state string `plain`.
132    fn newlines(plain: &mut String, n: usize) {
133        for _ in 0..n {
134            plain.push('\n');
135        }
136    }
137
138    /// Ranges are mapped `cmark reduced/plain -> raw`.
139    pub fn extract_plain_with_mapping(
140        cmark: &str,
141        ignores: &Ignores,
142    ) -> (String, IndexMap<Range, SourceRange>) {
143        let mut plain = String::with_capacity(cmark.len());
144        let mut mapping = indexmap::IndexMap::with_capacity(128);
145
146        let broken_link_handler = &mut |_broken: pulldown_cmark::BrokenLink| -> Option<(
147            pulldown_cmark::CowStr,
148            pulldown_cmark::CowStr,
149        )> {
150            Some((
151                pulldown_cmark::CowStr::Borrowed(""),
152                pulldown_cmark::CowStr::Borrowed(""),
153            ))
154        };
155        let parser = Parser::new_with_broken_link_callback(
156            cmark,
157            Options::all() ^ Options::ENABLE_SMART_PUNCTUATION,
158            Some(broken_link_handler),
159        );
160
161        let rust_fence =
162            pulldown_cmark::CodeBlockKind::Fenced(pulldown_cmark::CowStr::Borrowed("rust"));
163
164        let mut html_block = 0_usize;
165        let mut code_block = 0_usize;
166        let mut html_code_block = 0_usize;
167        let mut inception = false;
168        let mut skip_link_text = false;
169        let mut skip_table_text = false;
170
171        for (event, byte_range) in parser.into_offset_iter() {
172            if byte_range.start > byte_range.end {
173                log::warn!(
174                    "Dropping event {event:?} due to negative byte range {byte_range:?}, see {}",
175                    "https://github.com/raphlinus/pulldown-cmark/issues/478"
176                );
177                continue;
178            }
179
180            log::trace!("Parsing event (bytes: {byte_range:?}): {event:?}");
181
182            let cursor = cmark.char_indices().enumerate().peekable();
183            let mut char_cursor = 0usize;
184
185            // let the cursor catch up to the current byte position
186            for (char_idx, (byte_offset, _c)) in cursor {
187                char_cursor = char_idx;
188                if byte_offset >= byte_range.start {
189                    break;
190                }
191            }
192            // convert to a character range given the char_cursor
193            // TODO defer the length calculation into the tags, where the string is already extracted.
194            let char_range = {
195                let bytes_start = std::cmp::min(byte_range.start, cmark.len());
196                let bytes_end = std::cmp::min(byte_range.end, cmark.len());
197                assert!(bytes_start <= bytes_end);
198                let char_count = cmark[bytes_start..bytes_end].chars().count();
199                char_cursor..(char_cursor + char_count)
200            };
201
202            match event {
203                Event::InlineHtml(html) => {
204                    if html.starts_with("<code") {
205                        html_code_block += 1;
206                    } else if html.ends_with("code>") {
207                        html_code_block = html_code_block.saturating_sub(1);
208                    }
209                }
210                Event::InlineMath(_s) => {
211                    // skip math content
212                }
213                Event::DisplayMath(_s) => {
214                    // skip math content
215                }
216                Event::Start(tag) => match tag {
217                    Tag::Table(_alignments) => {
218                        skip_table_text = true;
219                    }
220                    Tag::TableCell | Tag::TableHead | Tag::TableRow => {}
221                    Tag::CodeBlock(fenced) => {
222                        code_block += 1;
223                        inception = fenced == rust_fence;
224                    }
225                    Tag::Link {
226                        link_type,
227                        dest_url: _,
228                        title: _,
229                        id: _,
230                    } => {
231                        skip_link_text = match link_type {
232                            LinkType::ReferenceUnknown
233                            | LinkType::Reference
234                            | LinkType::Inline
235                            | LinkType::Collapsed
236                            | LinkType::CollapsedUnknown
237                            | LinkType::Shortcut
238                            | LinkType::ShortcutUnknown => false,
239                            LinkType::Autolink | LinkType::Email => true,
240                        };
241                    }
242                    Tag::List(_) => {
243                        // make sure nested lists are not clumped together
244                        Self::newlines(&mut plain, 1);
245                    }
246                    Tag::Image {
247                        link_type: _,
248                        dest_url: _,
249                        title,
250                        id: _,
251                    } => {
252                        Self::track(
253                            &title,
254                            SourceRange::Direct(char_range),
255                            &mut plain,
256                            &mut mapping,
257                        );
258                    }
259                    _ => {}
260                },
261                Event::End(tag) => {
262                    match tag {
263                        TagEnd::Table { .. } => {
264                            skip_table_text = false;
265                            Self::newlines(&mut plain, 1);
266                        }
267                        TagEnd::Link => {
268                            // the actual rendered content is in a text section
269                        }
270                        TagEnd::Image => {}
271                        TagEnd::Heading(_level) => {
272                            Self::newlines(&mut plain, 2);
273                        }
274                        TagEnd::CodeBlock => {
275                            code_block = code_block.saturating_sub(1);
276
277                            // if fenced == rust_fence {
278                            // TODO validate as if it was another document entity
279                            // }
280                        }
281                        TagEnd::Paragraph => Self::newlines(&mut plain, 2),
282
283                        TagEnd::Item => {
284                            // assure individual list items are not clumped together
285                            Self::newlines(&mut plain, 1);
286                        }
287                        _ => {}
288                    }
289                }
290                Event::Text(s) => {
291                    if html_block > 0 {
292                    } else if html_code_block > 0 {
293                    } else if code_block > 0 {
294                        if inception {
295                            // let offset = char_range.start;
296                            // TODO validate as additional, virtual document
297                            // TODO https://github.com/drahnr/cargo-spellcheck/issues/43
298                            // FIXME must also run the whole syn/ra_syntax pipeline not just another mapping
299                            // let (inner, inner_mapping) = Self::extract_plain_with_mapping(s.as_str());
300                            // mapping.extend(inner_mapping.into_iter().map(|(mut k,mut v)|
301                            //     {
302                            //         apply_offset(&mut k, offset);
303                            //         v.apply_offset(offset);
304                            //         (k,v)
305                            //     }));
306                            // plain.push_str(dbg!(inner.as_str()));
307                        }
308                    } else if skip_link_text {
309                        skip_link_text = false
310                    } else if !skip_table_text {
311                        Self::track(
312                            &s,
313                            SourceRange::Direct(char_range),
314                            &mut plain,
315                            &mut mapping,
316                        );
317                    }
318                }
319                Event::Code(s) => {
320                    // inline code such as `YakShave` shall be ignored, but we must keep a placeholder for grammar
321                    // rules to avoid misleading suggestions.
322                    let shortened_range = Range {
323                        start: char_range.start.saturating_add(1),
324                        end: char_range.end.saturating_sub(1),
325                    };
326                    let alias = cmark[byte_range]
327                        .chars()
328                        .skip(1)
329                        .take(shortened_range.len())
330                        .filter(|x| x.is_ascii_alphanumeric())
331                        .collect::<String>();
332
333                    if !shortened_range.is_empty() && !alias.is_empty() {
334                        Self::track(
335                            &s,
336                            SourceRange::Alias(shortened_range, alias),
337                            &mut plain,
338                            &mut mapping,
339                        );
340                    }
341                }
342                Event::Html(tag) => {
343                    if is_html_tag_on_no_scope_list(&tag) {
344                    } else if tag.ends_with("/>") {
345                        html_block = html_block.saturating_sub(1);
346                    } else {
347                        html_block += 1;
348                    }
349                }
350                Event::FootnoteReference(s) => {
351                    if !ignores.footnote_references && !s.is_empty() {
352                        let char_range = Range {
353                            start: char_range.start + 2,
354                            end: char_range.end - 1,
355                        };
356                        Self::track(
357                            &s,
358                            SourceRange::Direct(char_range),
359                            &mut plain,
360                            &mut mapping,
361                        );
362                    }
363                }
364                Event::SoftBreak => {
365                    Self::newlines(&mut plain, 1);
366                }
367                Event::HardBreak => {
368                    Self::newlines(&mut plain, 2);
369                }
370                Event::Rule => {
371                    Self::newlines(&mut plain, 1);
372                }
373                Event::TaskListMarker(_checked) => {}
374            }
375        }
376
377        // the parser yields single lines as a paragraph, for which we add trailing newlines
378        // which are pointless and clutter the test strings, so track and remove them
379        let trailing_newlines = plain.chars().rev().take_while(|x| *x == '\n').count();
380        if trailing_newlines <= plain.len() {
381            plain.truncate(plain.len() - trailing_newlines)
382        }
383        if let Some((mut plain_range, raw_range)) = mapping.pop() {
384            if plain_range.end > plain.len() {
385                plain_range.end = plain.len();
386            }
387            if plain_range.start > plain_range.end {
388                let content = String::from_iter(
389                    cmark
390                        .char_indices()
391                        .filter(|(idx, _c)| raw_range.contains(idx))
392                        .map(|(_idx, c)| c),
393                );
394                panic!(
395                    "failed: {} <= {}, raw range: {:?}\ncontent: >>{}<<",
396                    plain_range.start, plain_range.end, raw_range, content
397                );
398            }
399            mapping.insert(plain_range, raw_range);
400        }
401        (plain, mapping)
402    }
403
404    /// Create a common mark overlay based on the provided `CheckableChunk`
405    /// reference.
406    // TODO consider returning a `Vec<PlainOverlay<'a>>` to account for list items
407    // or other non-linear information which might not pass a grammar check as a whole
408    pub fn erase_cmark(chunk: &'a CheckableChunk, ignores: &Ignores) -> Self {
409        let (plain, mapping) = Self::extract_plain_with_mapping(chunk.as_str(), ignores);
410        Self {
411            raw: chunk,
412            plain,
413            mapping,
414        }
415    }
416
417    /// Since most checkers will operate on the plain data, an indirection to
418    /// map cmark reduced / plain back to raw ranges, which are then mapped back
419    /// to `Span`s. The returned key `Ranges` are in the condensed domain.
420    pub fn find_spans(&self, condensed_range: Range) -> IndexMap<Range, Span> {
421        let mut active = false;
422        let Range { start, end } = condensed_range;
423        let n = self.mapping.len();
424        self.mapping
425            .iter()
426            .skip_while(|(sub, _raw)| sub.end <= start)
427            .take_while(|(sub, _raw)| sub.start < end)
428            .inspect(|x| {
429                log::trace!(">>> item {:?} ∈ {:?}", condensed_range, x.0);
430            })
431            .filter(|(sub, _)| {
432                // could possibly happen on empty documentation lines with `///`
433                !sub.is_empty()
434            })
435            .filter(|(_, raw)| {
436                // aliases are not required for span search
437                if let SourceRange::Direct(_) = raw {
438                    true
439                } else {
440                    false
441                }
442            })
443            .fold(
444                IndexMap::<Range, Span>::with_capacity(n),
445                |mut acc, (sub, raw)| {
446                    fn recombine(range: Range, offset: usize, len: usize) -> Range {
447                        Range {
448                            start: range.start + offset,
449                            end: range.start + offset + len,
450                        }
451                    }
452
453                    let _ = if sub.contains(&start) {
454                        // calculate the offset between our `condensed_range.start` and
455                        // the `sub` which is one entry in the mappings
456                        let offset = start - sub.start;
457                        let overlay_range = if sub.contains(&(end - 1)) {
458                            // complete start to end
459                            active = false;
460                            start..end
461                        } else {
462                            // only start, continue taking until end
463                            active = true;
464                            start..sub.end
465                        };
466                        let raw = recombine(raw.range(), offset, overlay_range.len());
467                        Some((overlay_range, raw))
468                    // TODO must be implemented properly
469                    // } else if active {
470                    //     let offset = sub.end - end;
471                    //     if sub.contains(&(end - 1)) {
472                    //         active = false;
473                    //         Some((sub.start..end, offset))
474                    //     } else {
475                    //         Some((sub.clone(), offset))
476                    //     }
477                    } else {
478                        None
479                    }
480                    .map(|(sub, raw)| {
481                        log::trace!("convert:  cmark-erased={sub:?} -> raw={raw:?}");
482
483                        if raw.is_empty() {
484                            log::warn!("linear range to spans: {raw:?} empty!");
485                        } else {
486                            let resolved = self.raw.find_spans(raw.clone());
487                            log::trace!("cmark-erased range to spans: {raw:?} -> {resolved:?}");
488                            acc.extend(resolved);
489                        }
490                    });
491                    acc
492                },
493            )
494    }
495
496    /// Obtains a reference to the plain, cmark erased representation.
497    pub fn as_str(&self) -> &str {
498        self.plain.as_str()
499    }
500}
501
502use std::fmt;
503
504impl<'a> fmt::Display for PlainOverlay<'a> {
505    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
506        formatter.write_str(self.plain.as_str())
507    }
508}
509
510impl<'a> fmt::Debug for PlainOverlay<'a> {
511    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
512        use console::Style;
513
514        let styles = [
515            Style::new().italic().bold().dim().red(),
516            Style::new().italic().bold().dim().green(),
517            Style::new().italic().bold().dim().yellow(),
518            Style::new().italic().bold().dim().magenta(),
519            Style::new().italic().bold().dim().cyan(),
520        ];
521
522        let uncovered = Style::new().bold().on_black().dim().white();
523
524        let color_cycle = styles.iter().cycle();
525
526        let commonmark = self.raw.as_str().to_owned();
527
528        let mut coloured_plain = String::with_capacity(1024);
529        let mut coloured_md = String::with_capacity(1024);
530
531        let mut previous_md_end = 0usize;
532        for (plain_range, md_range, style) in
533            itertools::cons_tuples(self.mapping.iter().zip(color_cycle))
534        {
535            // TODO do this properly, `saturating sub` just prevents crashing
536            let delta = md_range.start.saturating_sub(previous_md_end);
537            // take care of the markers and things that are not rendered
538            if delta > 0 {
539                let s = sub_chars(commonmark.as_str(), previous_md_end..md_range.start);
540                coloured_md.push_str(uncovered.apply_to(s.as_str()).to_string().as_str());
541            }
542            previous_md_end = md_range.end;
543
544            let s = sub_chars(commonmark.as_str(), md_range.range());
545            coloured_md.push_str(style.apply_to(s.as_str()).to_string().as_str());
546
547            let s = sub_chars(self.plain.as_str(), plain_range.clone());
548            coloured_plain.push_str(style.apply_to(s.as_str()).to_string().as_str());
549        }
550        // write!(formatter, "{coloured_md}")?;
551
552        writeln!(formatter, "Commonmark:\n{coloured_md}")?;
553        writeln!(formatter, "Plain:\n{coloured_plain}")?;
554        Ok(())
555    }
556}
557
/// Explicitly ignored markdown entities. The `Default` implementation means we
/// do not ignore anything, which is the backwards compatible configuration.
#[derive(Clone, Default)]
pub struct Ignores {
    /// Ignore [footnote references](Event::FootnoteReference).
    ///
    /// If set, the text of footnote references is not added to the plain
    /// representation.
    pub footnote_references: bool,
}