Skip to main content

satteri_pulldown_cmark/
utils.rs

1//! Miscellaneous utilities to increase comfort.
2//! Special thanks to:
3//!
4//! - <https://github.com/BenjaminRi/Redwood-Wiki/blob/master/src/markdown_utils.rs>.
5//!   Its author authorized the use of this GPL code in this project in
6//!   <https://github.com/raphlinus/pulldown-cmark/issues/507>.
7//!
8//! - <https://gist.github.com/rambip/a507c312ed61c99c24b2a54f98325721>.
9//!   Its author proposed the solution in
10//!   <https://github.com/raphlinus/pulldown-cmark/issues/708>.
11
12use alloc::borrow::Cow;
13use alloc::string::String;
14use core::ops::Range;
15
16use crate::{CowStr, Event};
17
18/// Decode HTML5 character references (`&gt;`, `&amp;`, `&#x3C;`, `&#123;`, …)
19/// inside a string. Unrecognised `&foo` runs are left as-is.
20///
21/// JSX text and JSX literal attribute values both go through HTML entity
22/// decoding before reaching the runtime: `<p>&gt;</p>` and `<p title="&gt;"/>`
23/// both materialise as a `>` character. This helper is shared so the two
24/// call sites agree on the entity table.
25pub fn decode_html_entities(s: &str) -> Cow<'_, str> {
26    if !s.contains('&') {
27        return Cow::Borrowed(s);
28    }
29    let bytes = s.as_bytes();
30    let mut out = String::with_capacity(s.len());
31    let mut i = 0;
32    while i < bytes.len() {
33        if bytes[i] == b'&' {
34            let (consumed, replacement) = crate::scanners::scan_entity(&bytes[i..]);
35            if consumed > 0 {
36                if let Some(rep) = replacement {
37                    out.push_str(&rep);
38                }
39                i += consumed;
40                continue;
41            }
42        }
43        let b = bytes[i];
44        let ch_len = if b < 0xC0 {
45            1
46        } else if b < 0xE0 {
47            2
48        } else if b < 0xF0 {
49            3
50        } else {
51            4
52        };
53        out.push_str(&s[i..i + ch_len]);
54        i += ch_len;
55    }
56    Cow::Owned(out)
57}
58
59/// Merge consecutive `Event::Text` events into only one.
60#[derive(Debug)]
61pub struct TextMergeStream<'a, I> {
62    inner: TextMergeWithOffset<'a, DummyOffsets<I>>,
63}
64
65impl<'a, I> TextMergeStream<'a, I>
66where
67    I: Iterator<Item = Event<'a>>,
68{
69    pub fn new(iter: I) -> Self {
70        Self {
71            inner: TextMergeWithOffset::new(DummyOffsets(iter)),
72        }
73    }
74}
75
76impl<'a, I> Iterator for TextMergeStream<'a, I>
77where
78    I: Iterator<Item = Event<'a>>,
79{
80    type Item = Event<'a>;
81
82    fn next(&mut self) -> Option<Self::Item> {
83        self.inner.next().map(|(event, _)| event)
84    }
85}
86
87#[derive(Debug)]
88struct DummyOffsets<I>(I);
89
90impl<'a, I> Iterator for DummyOffsets<I>
91where
92    I: Iterator<Item = Event<'a>>,
93{
94    type Item = (Event<'a>, Range<usize>);
95
96    fn next(&mut self) -> Option<Self::Item> {
97        self.0.next().map(|event| (event, 0..0))
98    }
99}
100
101/// Merge consecutive `Event::Text` events into only one, with offsets.
102///
103/// Compatible with with [`OffsetIter`](crate::OffsetIter).
104#[derive(Debug)]
105pub struct TextMergeWithOffset<'a, I> {
106    iter: I,
107    last_event: Option<(Event<'a>, Range<usize>)>,
108}
109
110impl<'a, I> TextMergeWithOffset<'a, I>
111where
112    I: Iterator<Item = (Event<'a>, Range<usize>)>,
113{
114    pub fn new(iter: I) -> Self {
115        Self {
116            iter,
117            last_event: None,
118        }
119    }
120
121    /// Access the inner iterator (e.g. to retrieve parser state after iteration).
122    pub fn inner(&self) -> &I {
123        &self.iter
124    }
125}
126
127impl<'a, I> Iterator for TextMergeWithOffset<'a, I>
128where
129    I: Iterator<Item = (Event<'a>, Range<usize>)>,
130{
131    type Item = (Event<'a>, Range<usize>);
132
133    fn next(&mut self) -> Option<Self::Item> {
134        match (self.last_event.take(), self.iter.next()) {
135            (
136                Some((Event::Text(last_text), last_offset)),
137                Some((Event::Text(next_text), next_offset)),
138            ) => {
139                // We need to start merging consecutive text events together into one
140                let mut string_buf: String = last_text.into_string();
141                string_buf.push_str(&next_text);
142                let mut offset = last_offset;
143                offset.end = next_offset.end;
144                loop {
145                    // Avoid recursion to avoid stack overflow and to optimize concatenation
146                    match self.iter.next() {
147                        Some((Event::Text(next_text), next_offset)) => {
148                            string_buf.push_str(&next_text);
149                            offset.end = next_offset.end;
150                        }
151                        next_event => {
152                            self.last_event = next_event;
153                            if string_buf.is_empty() {
154                                // Discard text event(s) altogether if there is no text
155                                break self.next();
156                            } else {
157                                break Some((
158                                    Event::Text(CowStr::Boxed(string_buf.into_boxed_str())),
159                                    offset,
160                                ));
161                            }
162                        }
163                    }
164                }
165            }
166            (None, Some(next_event)) => {
167                // This only happens once during the first iteration and if there are items
168                self.last_event = Some(next_event);
169                self.next()
170            }
171            (None, None) => {
172                // This happens when the iterator is depleted
173                None
174            }
175            (last_event, next_event) => {
176                // The ordinary case, emit one event after the other without modification
177                self.last_event = next_event;
178                last_event
179            }
180        }
181    }
182}
183
#[cfg(test)]
mod test {
    use alloc::vec::Vec;

    use super::*;
    use crate::Parser;

    /// Two lines indented by four spaces, preceded by a blank line; shared by
    /// the parser-driven tests below.
    const INDENTED_SOURCE: &str = r#"
    first line
    second line
"#;

    /// The plain stream yields the two indented lines as one text event.
    #[test]
    fn text_merge_stream_indent() {
        let merged = TextMergeStream::new(Parser::new(INDENTED_SOURCE));
        let texts: Vec<_> = merged
            .filter(|event| matches!(event, Event::Text(_)))
            .collect();
        let expected = [Event::Text("first line\nsecond line\n".into())];
        assert_eq!(texts, expected);
    }

    /// The offset-aware stream yields the same text with a range spanning
    /// from the start of the first line to the end of the second.
    #[test]
    fn text_merge_with_offset_indent() {
        let offsets = Parser::new(INDENTED_SOURCE).into_offset_iter();
        let merged = TextMergeWithOffset::new(offsets);
        let texts: Vec<_> = merged
            .filter(|item| matches!(item, (Event::Text(_), _)))
            .collect();
        let expected = [(Event::Text("first line\nsecond line\n".into()), 5..32)];
        assert_eq!(texts, expected);
    }

    /// A run of text events that merges to the empty string disappears.
    #[test]
    fn text_merge_empty_is_discarded() {
        let input = [
            Event::Rule,
            Event::Text("".into()),
            Event::Text("".into()),
            Event::Rule,
        ];
        let output: Vec<_> = TextMergeStream::new(input.into_iter()).collect();
        assert_eq!(output, [Event::Rule, Event::Rule]);
    }
}