satteri-pulldown-cmark 0.4.1

A fork of the pulldown-cmark crate with MDX extensions, used in the satteri project.
Documentation
//! Miscellaneous utilities to increase comfort.
//! Special thanks to:
//!
//! - <https://github.com/BenjaminRi/Redwood-Wiki/blob/master/src/markdown_utils.rs>.
//!   Its author authorized the use of this GPL code in this project in
//!   <https://github.com/raphlinus/pulldown-cmark/issues/507>.
//!
//! - <https://gist.github.com/rambip/a507c312ed61c99c24b2a54f98325721>.
//!   Its author proposed the solution in
//!   <https://github.com/raphlinus/pulldown-cmark/issues/708>.

use alloc::borrow::Cow;
use alloc::string::String;
use core::ops::Range;

use crate::{CowStr, Event};

/// Expand HTML5 character references (`&gt;`, `&amp;`, `&#x3C;`, `&#123;`, …)
/// found in `s`. An `&` that does not start a recognised reference is copied
/// through unchanged.
///
/// JSX text and JSX literal attribute values are both entity-decoded before
/// they reach the runtime, so `<p>&gt;</p>` and `<p title="&gt;"/>` each end
/// up containing a literal `>`. Routing both call sites through this one
/// helper keeps them in agreement about which entities exist.
///
/// Returns `Cow::Borrowed(s)` when `s` contains no `&` at all; otherwise a
/// freshly built `Cow::Owned` string (even if nothing ended up being decoded).
pub fn decode_html_entities(s: &str) -> Cow<'_, str> {
    let bytes = s.as_bytes();
    // `&` is ASCII, so a byte-level search is equivalent to a char search.
    if !bytes.contains(&b'&') {
        return Cow::Borrowed(s);
    }

    let mut decoded = String::with_capacity(s.len());
    let mut pos = 0;
    while pos < bytes.len() {
        if bytes[pos] == b'&' {
            match crate::scanners::scan_entity(&bytes[pos..]) {
                // Zero bytes consumed: not an entity, fall through and copy the `&`.
                (0, _) => {}
                (consumed, replacement) => {
                    // If the scanner consumed input but supplied no replacement,
                    // the consumed bytes are skipped without emitting anything.
                    if let Some(text) = replacement {
                        decoded.push_str(&text);
                    }
                    pos += consumed;
                    continue;
                }
            }
        }

        // Copy exactly one UTF-8 encoded character; its width follows from the
        // lead byte. `pos` always sits on a character boundary here.
        let width = match bytes[pos] {
            0x00..=0xBF => 1,
            0xC0..=0xDF => 2,
            0xE0..=0xEF => 3,
            _ => 4,
        };
        decoded.push_str(&s[pos..pos + width]);
        pos += width;
    }
    Cow::Owned(decoded)
}

/// Merge consecutive `Event::Text` events into only one.
///
/// This is a thin wrapper for iterators that yield bare events: each event is
/// paired with a placeholder range, run through [`TextMergeWithOffset`], and
/// the range is dropped again on the way out.
#[derive(Debug)]
pub struct TextMergeStream<'a, I> {
    /// The offset-aware merger doing the actual work.
    inner: TextMergeWithOffset<'a, DummyOffsets<I>>,
}

impl<'a, I> TextMergeStream<'a, I>
where
    I: Iterator<Item = Event<'a>>,
{
    /// Wrap `iter` so that runs of adjacent text events come out merged.
    pub fn new(iter: I) -> Self {
        let with_placeholder_ranges = DummyOffsets(iter);
        let inner = TextMergeWithOffset::new(with_placeholder_ranges);
        Self { inner }
    }
}

impl<'a, I> Iterator for TextMergeStream<'a, I>
where
    I: Iterator<Item = Event<'a>>,
{
    type Item = Event<'a>;

    /// Pull the next (possibly merged) event, discarding its placeholder range.
    fn next(&mut self) -> Option<Self::Item> {
        let (event, _placeholder_range) = self.inner.next()?;
        Some(event)
    }
}

/// Adapter that tags every event from the wrapped iterator with an empty
/// `0..0` range, giving it the `(event, range)` item shape that
/// `TextMergeWithOffset` consumes.
#[derive(Debug)]
struct DummyOffsets<I>(I);

impl<'a, I> Iterator for DummyOffsets<I>
where
    I: Iterator<Item = Event<'a>>,
{
    type Item = (Event<'a>, Range<usize>);

    /// Forward the next event together with the placeholder range.
    fn next(&mut self) -> Option<Self::Item> {
        let event = self.0.next()?;
        Some((event, 0..0))
    }
}

/// Merge consecutive `Event::Text` events into only one, with offsets.
///
/// Compatible with [`OffsetIter`](crate::OffsetIter).
#[derive(Debug)]
pub struct TextMergeWithOffset<'a, I> {
    /// Underlying source of `(event, range)` pairs.
    iter: I,
    /// One item already pulled from `iter` but not yet handed to the caller.
    last_event: Option<(Event<'a>, Range<usize>)>,
}

impl<'a, I> TextMergeWithOffset<'a, I>
where
    I: Iterator<Item = (Event<'a>, Range<usize>)>,
{
    /// Wrap `iter`; nothing is read from it until the first call to `next`.
    pub fn new(iter: I) -> Self {
        let last_event = None;
        Self { iter, last_event }
    }

    /// Borrow the wrapped iterator (e.g. to retrieve parser state after iteration).
    pub fn inner(&self) -> &I {
        let Self { iter, .. } = self;
        iter
    }
}

impl<'a, I> Iterator for TextMergeWithOffset<'a, I>
where
    I: Iterator<Item = (Event<'a>, Range<usize>)>,
{
    type Item = (Event<'a>, Range<usize>);

    /// Yield the next event, folding any run of adjacent `Event::Text` items
    /// into a single text event whose range spans from the start of the first
    /// to the end of the last.
    ///
    /// One item of lookahead is kept in `last_event`: each call compares the
    /// buffered item with the next one from the underlying iterator to decide
    /// whether a merge has to start.
    fn next(&mut self) -> Option<Self::Item> {
        let buffered = self.last_event.take();
        let upcoming = self.iter.next();

        match (buffered, upcoming) {
            (
                Some((Event::Text(first), first_range)),
                Some((Event::Text(second), second_range)),
            ) => {
                // Two text events in a row: begin accumulating into one buffer.
                let mut merged = first.into_string();
                merged.push_str(&second);
                let mut span = first_range.start..second_range.end;

                // Keep absorbing text events with a loop rather than recursion,
                // which avoids deep stacks and repeated re-concatenation. The
                // first non-text item (or end of input) becomes the new lookahead.
                self.last_event = loop {
                    match self.iter.next() {
                        Some((Event::Text(more), more_range)) => {
                            merged.push_str(&more);
                            span.end = more_range.end;
                        }
                        other => break other,
                    }
                };

                if merged.is_empty() {
                    // The whole run carried no text: drop it and move on.
                    self.next()
                } else {
                    Some((Event::Text(CowStr::Boxed(merged.into_boxed_str())), span))
                }
            }
            (None, Some(first_seen)) => {
                // Nothing is buffered yet: stash this item as lookahead and retry.
                self.last_event = Some(first_seen);
                self.next()
            }
            // Nothing buffered and nothing left to read: the stream is finished.
            (None, None) => None,
            (ready, lookahead) => {
                // Common path: emit the buffered item untouched and buffer the new one.
                self.last_event = lookahead;
                ready
            }
        }
    }
}

#[cfg(test)]
mod test {
    use alloc::vec::Vec;

    use super::*;
    use crate::Parser;

    /// Two lines indented by four spaces; the tests below expect their text
    /// to come out as one merged event.
    const INDENTED_SOURCE: &str = r#"
    first line
    second line
"#;

    /// Merging without offsets produces a single text event for both lines.
    #[test]
    fn text_merge_stream_indent() {
        let merged = TextMergeStream::new(Parser::new(INDENTED_SOURCE));
        let texts: Vec<_> = merged
            .filter(|event| matches!(event, Event::Text(_)))
            .collect();
        let expected = [Event::Text("first line\nsecond line\n".into())];
        assert_eq!(texts, expected);
    }

    /// Merging with offsets produces one text event whose range covers both lines.
    #[test]
    fn text_merge_with_offset_indent() {
        let offsets = Parser::new(INDENTED_SOURCE).into_offset_iter();
        let merged = TextMergeWithOffset::new(offsets);
        let texts: Vec<_> = merged
            .filter(|item| matches!(item, (Event::Text(_), _)))
            .collect();
        let expected = [(Event::Text("first line\nsecond line\n".into()), 5..32)];
        assert_eq!(texts, expected);
    }

    /// A run of adjacent empty text events disappears from the output entirely.
    #[test]
    fn text_merge_empty_is_discarded() {
        let events = [
            Event::Rule,
            Event::Text("".into()),
            Event::Text("".into()),
            Event::Rule,
        ];
        let merged: Vec<_> = TextMergeStream::new(events.into_iter()).collect();
        let expected = [Event::Rule, Event::Rule];
        assert_eq!(merged, expected);
    }
}