Skip to main content

doc_chunks/
markdown.rs

1//! Erase cmark syntax
2//!
3//! Resulting overlay is plain and can be fed into a grammar or spell checker.
4
5use super::*;
6
7use indexmap::IndexMap;
8
9use pulldown_cmark::{Event, LinkType, Options, Parser, Tag, TagEnd};
10
11use crate::util::sub_chars;
12use crate::Span;
13use crate::{CheckableChunk, Range};
14
/// Describes whether there is a matching segment in the source, or if it is a
/// placeholder for e.g. a code block or inline code. These placeholders are
/// required for grammar checks.
#[derive(Debug, Clone)]
pub enum SourceRange {
    /// The fragment is present as such in the source; holds its range there.
    Direct(Range),
    /// The fragment is represented by a placeholder; holds the range of the
    /// aliased source segment and the placeholder string itself.
    Alias(Range, String),
}
23
24impl SourceRange {
25    /// Apply an offset to `start` and `end` members, equaling a shift of the
26    /// range.
27    #[allow(dead_code)]
28    pub(crate) fn apply_offset(&mut self, offset: usize) {
29        match self {
30            Self::Direct(range) => apply_offset(range, offset),
31            Self::Alias(range, _) => apply_offset(range, offset),
32        }
33    }
34
35    /// Extract a clone of the inner `Range<usize>`.
36    ///
37    /// Use `deref()` or `*` for a reference.
38    pub fn range(&self) -> Range {
39        match self {
40            Self::Direct(range) => range.clone(),
41            Self::Alias(range, _) => range.clone(),
42        }
43    }
44}
45
46impl std::ops::Deref for SourceRange {
47    type Target = Range;
48    fn deref(&self) -> &Self::Target {
49        match self {
50            Self::Direct(range) => range,
51            Self::Alias(range, _) => range,
52        }
53    }
54}
55
56pub(crate) fn is_html_tag_on_no_scope_list(text: &str) -> bool {
57    use regex::RegexSet;
58    lazy_static::lazy_static! {
59        static ref HTML_TAG_EMPTY_OR_SPECIAL_CASE: RegexSet = RegexSet::new([
60            r####"^<\s*[A-Za-z0-9]+(?:\s+.*)*\s*/>$"####, // any self closing empty
61            r####"^<\s*br\s*>$"####,
62            r####"^</?\s*(?:i|b|span|font|color|style)\s*/?>$"####,
63            r####"^<\s*pre\s*>.*</\s*pre\s*>\s?$"####,
64        ]).unwrap();
65    };
66    HTML_TAG_EMPTY_OR_SPECIAL_CASE.is_match(text)
67}
68
#[test]
fn scoped() {
    // plain opening and closing tags are not on the list
    assert!(!is_html_tag_on_no_scope_list("<code>"));
    assert!(!is_html_tag_on_no_scope_list("</code>"));
    // self closing tags and single line `pre` elements are
    assert!(is_html_tag_on_no_scope_list("<code />"));
    assert!(is_html_tag_on_no_scope_list("<pre>🌡</pre>\n"));
}
76
/// A plain representation of a cmark riddled chunk.
///
/// Holds the cmark erased text next to the bookkeeping required to map ranges
/// in that text back to ranges of the underlying chunk.
#[derive(Clone)]
pub struct PlainOverlay<'a> {
    /// A reference to the underlying [`CheckableChunk`][super::chunk].
    raw: &'a CheckableChunk,
    /// The rendered string with all common mark annotations removed.
    plain: String,
    // require a sorted map, so we have the chance of binary search
    // key: plain string range, counted in characters
    // value: the corresponding areas in the full cmark, counted in characters
    mapping: IndexMap<Range, SourceRange>,
}
89
90impl<'a> PlainOverlay<'a> {
91    /// Track the origin of the annotation free content string fragments in the
92    /// common mark formatted text, to the fragments in the plain string.
93    fn track(
94        s: &str,
95        cmark_range: SourceRange,
96        plain_acc: &mut String,
97        mapping: &mut IndexMap<Range, SourceRange>,
98    ) {
99        // map the range within the plain data,
100        // which is fed to the checker,
101        // back to the repr with markdown modifiers
102
103        // avoid repeated calculation of this
104        let cursor = plain_acc.chars().count();
105        let plain_range = match &cmark_range {
106            SourceRange::Alias(_range, alias) => {
107                if alias.is_empty() {
108                    log::debug!("Alias for {s:?} was empty. Ignoring.");
109                    return;
110                }
111                // limit the lias names to 16 chars, all ascii
112                // and as such byte length equals char length
113                let alias16 = &alias[..std::cmp::min(alias.len(), 16)];
114                plain_acc.push_str(alias16);
115                Range {
116                    start: cursor,
117                    end: cursor + alias16.len(),
118                }
119            }
120            SourceRange::Direct(_range) => {
121                plain_acc.push_str(s);
122                Range {
123                    start: cursor,
124                    end: cursor + s.chars().count(),
125                }
126            }
127        };
128        let _ = mapping.insert(plain_range, cmark_range);
129    }
130
131    /// Append n newlines to the current state string `plain`.
132    fn newlines(plain: &mut String, n: usize) {
133        for _ in 0..n {
134            plain.push('\n');
135        }
136    }
137
138    /// Ranges are mapped `cmark reduced/plain -> raw`.
139    pub fn extract_plain_with_mapping(
140        cmark: &str,
141        ignores: &Ignores,
142    ) -> (String, IndexMap<Range, SourceRange>) {
143        let mut plain = String::with_capacity(cmark.len());
144        let mut mapping = indexmap::IndexMap::with_capacity(128);
145
146        let broken_link_handler = &mut |_broken: pulldown_cmark::BrokenLink| -> Option<(
147            pulldown_cmark::CowStr,
148            pulldown_cmark::CowStr,
149        )> {
150            Some((
151                pulldown_cmark::CowStr::Borrowed(""),
152                pulldown_cmark::CowStr::Borrowed(""),
153            ))
154        };
155        let parser = Parser::new_with_broken_link_callback(
156            cmark,
157            Options::all() ^ Options::ENABLE_SMART_PUNCTUATION,
158            Some(broken_link_handler),
159        );
160
161        let rust_fence =
162            pulldown_cmark::CodeBlockKind::Fenced(pulldown_cmark::CowStr::Borrowed("rust"));
163
164        let mut html_block = 0_usize;
165        let mut code_block = 0_usize;
166        let mut html_code_block = 0_usize;
167        let mut inception = false;
168        let mut skip_link_text = false;
169        let mut skip_table_text = false;
170
171        for (event, byte_range) in parser.into_offset_iter() {
172            if byte_range.start > byte_range.end {
173                log::warn!(
174                    "Dropping event {event:?} due to negative byte range {byte_range:?}, see {}",
175                    "https://github.com/raphlinus/pulldown-cmark/issues/478"
176                );
177                continue;
178            }
179
180            log::trace!("Parsing event (bytes: {byte_range:?}): {event:?}");
181
182            let cursor = cmark.char_indices().enumerate().peekable();
183            let mut char_cursor = 0usize;
184
185            // let the cursor catch up to the current byte position
186            for (char_idx, (byte_offset, _c)) in cursor {
187                char_cursor = char_idx;
188                if byte_offset >= byte_range.start {
189                    break;
190                }
191            }
192            // convert to a character range given the char_cursor
193            // TODO defer the length calculation into the tags, where the string is already extracted.
194            let char_range = {
195                let bytes_start = std::cmp::min(byte_range.start, cmark.len());
196                let bytes_end = std::cmp::min(byte_range.end, cmark.len());
197                assert!(bytes_start <= bytes_end);
198                let char_count = cmark[bytes_start..bytes_end].chars().count();
199                char_cursor..(char_cursor + char_count)
200            };
201
202            match event {
203                Event::InlineHtml(html) => {
204                    if html.starts_with("<code") {
205                        html_code_block += 1;
206                    } else if html.ends_with("code>") {
207                        html_code_block = html_code_block.saturating_sub(1);
208                    }
209                }
210                Event::InlineMath(_s) => {
211                    // skip math content
212                }
213                Event::DisplayMath(_s) => {
214                    // skip math content
215                }
216                Event::Start(tag) => match tag {
217                    Tag::Table(_alignments) => {
218                        skip_table_text = true;
219                    }
220                    Tag::TableCell | Tag::TableHead | Tag::TableRow => {}
221                    Tag::CodeBlock(fenced) => {
222                        code_block += 1;
223                        inception = fenced == rust_fence;
224                    }
225                    Tag::HtmlBlock => {
226                        html_block += 1;
227                    }
228                    Tag::Link {
229                        link_type,
230                        dest_url: _,
231                        title: _,
232                        id: _,
233                    } => {
234                        skip_link_text = match link_type {
235                            LinkType::ReferenceUnknown
236                            | LinkType::Reference
237                            | LinkType::Inline
238                            | LinkType::Collapsed
239                            | LinkType::CollapsedUnknown
240                            | LinkType::Shortcut
241                            | LinkType::ShortcutUnknown => false,
242                            LinkType::Autolink | LinkType::Email => true,
243                        };
244                    }
245                    Tag::List(_) => {
246                        // make sure nested lists are not clumped together
247                        Self::newlines(&mut plain, 1);
248                    }
249                    Tag::Image {
250                        link_type: _,
251                        dest_url: _,
252                        title,
253                        id: _,
254                    } => {
255                        Self::track(
256                            &title,
257                            SourceRange::Direct(char_range),
258                            &mut plain,
259                            &mut mapping,
260                        );
261                    }
262                    _ => {}
263                },
264                Event::End(tag) => {
265                    match tag {
266                        TagEnd::Table => {
267                            skip_table_text = false;
268                            Self::newlines(&mut plain, 1);
269                        }
270                        TagEnd::Link => {
271                            // the actual rendered content is in a text section
272                        }
273                        TagEnd::Image => {}
274                        TagEnd::Heading(_level) => {
275                            Self::newlines(&mut plain, 2);
276                        }
277                        TagEnd::CodeBlock => {
278                            code_block = code_block.saturating_sub(1);
279
280                            // if fenced == rust_fence {
281                            // TODO validate as if it was another document entity
282                            // }
283                        }
284                        TagEnd::HtmlBlock => {
285                            html_block = html_block.saturating_sub(1);
286                        }
287                        TagEnd::Paragraph => Self::newlines(&mut plain, 2),
288
289                        TagEnd::Item => {
290                            // assure individual list items are not clumped together
291                            Self::newlines(&mut plain, 1);
292                        }
293                        _ => {}
294                    }
295                }
296                Event::Text(s) => {
297                    if html_block > 0 || html_code_block > 0 {
298                    } else if code_block > 0 {
299                        if inception {
300                            // let offset = char_range.start;
301                            // TODO validate as additional, virtual document
302                            // TODO https://github.com/drahnr/cargo-spellcheck/issues/43
303                            // FIXME must also run the whole syn/ra_syntax pipeline not just another mapping
304                            // let (inner, inner_mapping) = Self::extract_plain_with_mapping(s.as_str());
305                            // mapping.extend(inner_mapping.into_iter().map(|(mut k,mut v)|
306                            //     {
307                            //         apply_offset(&mut k, offset);
308                            //         v.apply_offset(offset);
309                            //         (k,v)
310                            //     }));
311                            // plain.push_str(dbg!(inner.as_str()));
312                        }
313                    } else if skip_link_text {
314                        skip_link_text = false
315                    } else if !skip_table_text {
316                        Self::track(
317                            &s,
318                            SourceRange::Direct(char_range),
319                            &mut plain,
320                            &mut mapping,
321                        );
322                    }
323                }
324                Event::Code(s) => {
325                    // inline code such as `YakShave` shall be ignored, but we must keep a placeholder for grammar
326                    // rules to avoid misleading suggestions.
327                    let shortened_range = Range {
328                        start: char_range.start.saturating_add(1),
329                        end: char_range.end.saturating_sub(1),
330                    };
331                    let alias = cmark[byte_range]
332                        .chars()
333                        .skip(1)
334                        .take(shortened_range.len())
335                        .filter(|x| x.is_ascii_alphanumeric())
336                        .collect::<String>();
337
338                    if !shortened_range.is_empty() && !alias.is_empty() {
339                        Self::track(
340                            &s,
341                            SourceRange::Alias(shortened_range, alias),
342                            &mut plain,
343                            &mut mapping,
344                        );
345                    }
346                }
347                Event::Html(tag) => {
348                    if is_html_tag_on_no_scope_list(&tag) {
349                    }
350                }
351                Event::FootnoteReference(s) => {
352                    if !ignores.footnote_references && !s.is_empty() {
353                        let char_range = Range {
354                            start: char_range.start + 2,
355                            end: char_range.end - 1,
356                        };
357                        Self::track(
358                            &s,
359                            SourceRange::Direct(char_range),
360                            &mut plain,
361                            &mut mapping,
362                        );
363                    }
364                }
365                Event::SoftBreak => {
366                    Self::newlines(&mut plain, 1);
367                }
368                Event::HardBreak => {
369                    Self::newlines(&mut plain, 2);
370                }
371                Event::Rule => {
372                    Self::newlines(&mut plain, 1);
373                }
374                Event::TaskListMarker(_checked) => {}
375            }
376        }
377
378        // the parser yields single lines as a paragraph, for which we add trailing newlines
379        // which are pointless and clutter the test strings, so track and remove them
380        let trailing_newlines = plain.chars().rev().take_while(|x| *x == '\n').count();
381        if trailing_newlines <= plain.len() {
382            plain.truncate(plain.len() - trailing_newlines)
383        }
384        if let Some((mut plain_range, raw_range)) = mapping.pop() {
385            if plain_range.end > plain.len() {
386                plain_range.end = plain.len();
387            }
388            if plain_range.start > plain_range.end {
389                let content = String::from_iter(
390                    cmark
391                        .char_indices()
392                        .filter(|(idx, _c)| raw_range.contains(idx))
393                        .map(|(_idx, c)| c),
394                );
395                panic!(
396                    "failed: {} <= {}, raw range: {:?}\ncontent: >>{}<<",
397                    plain_range.start, plain_range.end, raw_range, content
398                );
399            }
400            mapping.insert(plain_range, raw_range);
401        }
402        (plain, mapping)
403    }
404
405    /// Create a common mark overlay based on the provided `CheckableChunk`
406    /// reference.
407    // TODO consider returning a `Vec<PlainOverlay<'a>>` to account for list items
408    // or other non-linear information which might not pass a grammar check as a whole
409    pub fn erase_cmark(chunk: &'a CheckableChunk, ignores: &Ignores) -> Self {
410        let (plain, mapping) = Self::extract_plain_with_mapping(chunk.as_str(), ignores);
411        Self {
412            raw: chunk,
413            plain,
414            mapping,
415        }
416    }
417
418    /// Since most checkers will operate on the plain data, an indirection to
419    /// map cmark reduced / plain back to raw ranges, which are then mapped back
420    /// to `Span`s. The returned key `Ranges` are in the condensed domain.
421    pub fn find_spans(&self, condensed_range: Range) -> IndexMap<Range, Span> {
422        let mut active = false;
423        let Range { start, end } = condensed_range;
424        let n = self.mapping.len();
425        self.mapping
426            .iter()
427            .skip_while(|(sub, _raw)| sub.end <= start)
428            .take_while(|(sub, _raw)| sub.start < end)
429            .inspect(|x| {
430                log::trace!(">>> item {:?} ∈ {:?}", condensed_range, x.0);
431            })
432            .filter(|(sub, _)| {
433                // could possibly happen on empty documentation lines with `///`
434                !sub.is_empty()
435            })
436            .filter(|(_, raw)| {
437                // aliases are not required for span search
438                matches!(raw, SourceRange::Direct(_))
439            })
440            .fold(
441                IndexMap::<Range, Span>::with_capacity(n),
442                |mut acc, (sub, raw)| {
443                    fn recombine(range: Range, offset: usize, len: usize) -> Range {
444                        Range {
445                            start: range.start + offset,
446                            end: range.start + offset + len,
447                        }
448                    }
449
450                    let _ = if sub.contains(&start) {
451                        // calculate the offset between our `condensed_range.start` and
452                        // the `sub` which is one entry in the mappings
453                        let offset = start - sub.start;
454                        let overlay_range = if sub.contains(&(end - 1)) {
455                            // complete start to end
456                            active = false;
457                            start..end
458                        } else {
459                            // only start, continue taking until end
460                            active = true;
461                            start..sub.end
462                        };
463                        let raw = recombine(raw.range(), offset, overlay_range.len());
464                        Some((overlay_range, raw))
465                    // TODO must be implemented properly
466                    // } else if active {
467                    //     let offset = sub.end - end;
468                    //     if sub.contains(&(end - 1)) {
469                    //         active = false;
470                    //         Some((sub.start..end, offset))
471                    //     } else {
472                    //         Some((sub.clone(), offset))
473                    //     }
474                    } else {
475                        None
476                    }
477                    .map(|(sub, raw)| {
478                        log::trace!("convert:  cmark-erased={sub:?} -> raw={raw:?}");
479
480                        if raw.is_empty() {
481                            log::warn!("linear range to spans: {raw:?} empty!");
482                        } else {
483                            let resolved = self.raw.find_spans(raw.clone());
484                            log::trace!("cmark-erased range to spans: {raw:?} -> {resolved:?}");
485                            acc.extend(resolved);
486                        }
487                    });
488                    acc
489                },
490            )
491    }
492
493    /// Obtains a reference to the plain, cmark erased representation.
494    pub fn as_str(&self) -> &str {
495        self.plain.as_str()
496    }
497}
498
499use std::fmt;
500
501impl<'a> fmt::Display for PlainOverlay<'a> {
502    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
503        formatter.write_str(self.plain.as_str())
504    }
505}
506
507impl<'a> fmt::Debug for PlainOverlay<'a> {
508    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
509        use console::Style;
510
511        let styles = [
512            Style::new().italic().bold().dim().red(),
513            Style::new().italic().bold().dim().green(),
514            Style::new().italic().bold().dim().yellow(),
515            Style::new().italic().bold().dim().magenta(),
516            Style::new().italic().bold().dim().cyan(),
517        ];
518
519        let uncovered = Style::new().bold().on_black().dim().white();
520
521        let color_cycle = styles.iter().cycle();
522
523        let commonmark = self.raw.as_str().to_owned();
524
525        let mut coloured_plain = String::with_capacity(1024);
526        let mut coloured_md = String::with_capacity(1024);
527
528        let mut previous_md_end = 0usize;
529        for (plain_range, md_range, style) in
530            itertools::cons_tuples(self.mapping.iter().zip(color_cycle))
531        {
532            // TODO do this properly, `saturating sub` just prevents crashing
533            let delta = md_range.start.saturating_sub(previous_md_end);
534            // take care of the markers and things that are not rendered
535            if delta > 0 {
536                let s = sub_chars(commonmark.as_str(), previous_md_end..md_range.start);
537                coloured_md.push_str(uncovered.apply_to(s.as_str()).to_string().as_str());
538            }
539            previous_md_end = md_range.end;
540
541            let s = sub_chars(commonmark.as_str(), md_range.range());
542            coloured_md.push_str(style.apply_to(s.as_str()).to_string().as_str());
543
544            let s = sub_chars(self.plain.as_str(), plain_range.clone());
545            coloured_plain.push_str(style.apply_to(s.as_str()).to_string().as_str());
546        }
547        // write!(formatter, "{coloured_md}")?;
548
549        writeln!(formatter, "Commonmark:\n{coloured_md}")?;
550        writeln!(formatter, "Plain:\n{coloured_plain}")?;
551        Ok(())
552    }
553}
554
/// Explicitly ignored markdown entities.  The `Default` implementation means we
/// do not ignore anything, which is the backwards compatible configuration.
#[derive(Debug, Clone, Default)]
pub struct Ignores {
    /// Ignore [footnote references](Event::FootnoteReference).
    ///
    /// If set, footnote references do not end up in the plain text.
    pub footnote_references: bool,
}