Skip to main content

panache_parser/parser/utils/
helpers.rs

1//! Shared utilities for block parsing.
2
3use crate::syntax::SyntaxKind;
4use rowan::GreenNodeBuilder;
5
6/// Helper to emit a line's text and newline tokens separately.
7/// Lines from split_lines_inclusive contain trailing newlines (LF or CRLF) that must be separated.
8pub(crate) fn emit_line_tokens(builder: &mut GreenNodeBuilder<'static>, line: &str) {
9    // Handle both CRLF and LF line endings
10    if let Some(text) = line.strip_suffix("\r\n") {
11        builder.token(SyntaxKind::TEXT.into(), text);
12        builder.token(SyntaxKind::NEWLINE.into(), "\r\n");
13    } else if let Some(text) = line.strip_suffix('\n') {
14        builder.token(SyntaxKind::TEXT.into(), text);
15        builder.token(SyntaxKind::NEWLINE.into(), "\n");
16    } else {
17        // No trailing newline (last line of input)
18        builder.token(SyntaxKind::TEXT.into(), line);
19    }
20}
21
22/// Emit a table separator line as distinct marker tokens instead of one
23/// coalesced `TEXT`. Splits the column delimiters (`|` / `+`), dash runs,
24/// equals runs (grid `===` dividers), colons, and interior whitespace into
25/// separate CST tokens so downstream alignment/width derivations read
26/// structure rather than re-scanning a string. Any unexpected bytes fall back
27/// to a `TEXT` token so the emission stays lossless. The concatenation of all
28/// emitted token texts byte-equals `line`.
29///
30/// The caller has already emitted any container prefix (indentation,
31/// blockquote markers) as separate tokens; `line` is the separator tail.
32pub(crate) fn emit_separator_tokens(builder: &mut GreenNodeBuilder<'static>, line: &str) {
33    let (content, newline) = strip_newline(line);
34    let bytes = content.as_bytes();
35    let mut i = 0;
36    while i < bytes.len() {
37        let b = bytes[i];
38        match b {
39            b'|' | b'+' => {
40                builder.token(SyntaxKind::TABLE_SEP_DELIM.into(), &content[i..i + 1]);
41                i += 1;
42            }
43            b':' => {
44                builder.token(SyntaxKind::TABLE_SEP_COLON.into(), &content[i..i + 1]);
45                i += 1;
46            }
47            b'-' => {
48                let start = i;
49                while i < bytes.len() && bytes[i] == b'-' {
50                    i += 1;
51                }
52                builder.token(SyntaxKind::TABLE_SEP_DASHES.into(), &content[start..i]);
53            }
54            b'=' => {
55                let start = i;
56                while i < bytes.len() && bytes[i] == b'=' {
57                    i += 1;
58                }
59                builder.token(SyntaxKind::TABLE_SEP_EQUALS.into(), &content[start..i]);
60            }
61            b' ' | b'\t' => {
62                let start = i;
63                while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
64                    i += 1;
65                }
66                builder.token(SyntaxKind::TABLE_SEP_WHITESPACE.into(), &content[start..i]);
67            }
68            _ => {
69                // Unexpected byte (the block detector validated this is a
70                // separator, but stay lossless and total): accumulate the run
71                // of unrecognized bytes and emit as TEXT. Advance by whole
72                // chars so we never split a multibyte sequence.
73                let start = i;
74                while i < bytes.len()
75                    && !matches!(bytes[i], b'|' | b'+' | b':' | b'-' | b'=' | b' ' | b'\t')
76                {
77                    i += 1;
78                }
79                builder.token(SyntaxKind::TEXT.into(), &content[start..i]);
80            }
81        }
82    }
83    if !newline.is_empty() {
84        builder.token(SyntaxKind::NEWLINE.into(), newline);
85    }
86}
87
/// Strip at most `max_spaces` leading space characters (`' '`) from `line`.
///
/// Only plain spaces count: a tab or any other character ends the run. The
/// result is a sub-slice of `line`, so nothing is allocated. This is the
/// generalized form of `strip_leading_spaces`, which uses a limit of 3.
pub(crate) fn strip_leading_spaces_n(line: &str, max_spaces: usize) -> &str {
    // A space is a single ASCII byte, so counting leading space bytes is the
    // same as counting leading space chars, and the resulting offset is
    // always a char boundary.
    let stripped = line
        .bytes()
        .take(max_spaces)
        .take_while(|&byte| byte == b' ')
        .count();
    &line[stripped..]
}
98
99/// Strip up to 3 leading spaces from a line.
100/// This is a convenience wrapper for the common case in Markdown parsing.
101pub(crate) fn strip_leading_spaces(line: &str) -> &str {
102    strip_leading_spaces_n(line, 3)
103}
104
/// Split a single trailing line terminator (`\n` or `\r\n`) off `line`.
///
/// Returns `(content_without_newline, newline_str)`. `newline_str` is
/// `"\r\n"`, `"\n"`, or `""` when the line has no terminator. At most one
/// terminator is removed, and a lone trailing `\r` is left in the content.
pub(crate) fn strip_newline(line: &str) -> (&str, &str) {
    // CRLF must be tested first, since a CRLF line also ends with '\n'.
    let newline = if line.ends_with("\r\n") {
        "\r\n"
    } else if line.ends_with('\n') {
        "\n"
    } else {
        ""
    };
    // The terminator is ASCII, so the split point is a char boundary.
    let (content, _) = line.split_at(line.len() - newline.len());
    (content, newline)
}
116
/// Strip all trailing `\n` and `\r` bytes from `s`.
///
/// ASCII byte-level equivalent of `s.trim_end_matches(['\r', '\n'])`. The
/// scan works on raw bytes to avoid the slice-pattern `MultiCharEqSearcher`
/// codepath that goes through `char_indices` and shows up as a measurable
/// hot frame in per-line block detect work.
#[inline]
pub(crate) fn trim_end_newlines(s: &str) -> &str {
    // Position one past the last byte that is not a line-ending byte; 0 when
    // the string is empty or consists solely of '\r' / '\n'.
    let end = s
        .as_bytes()
        .iter()
        .rposition(|&byte| byte != b'\n' && byte != b'\r')
        .map_or(0, |last| last + 1);
    // Only ASCII bytes were dropped from the tail, so `end` is a char
    // boundary and plain slicing is enough -- no `unsafe` needed; the
    // boundary check performed by the slice is O(1).
    &s[..end]
}
137
/// Strip all leading ASCII space and tab bytes from `s`.
///
/// Equivalent to `s.trim_start_matches([' ', '\t'])`, but scans raw bytes.
/// Other whitespace (newlines, Unicode spaces) is left untouched.
#[inline]
pub(crate) fn trim_start_spaces_tabs(s: &str) -> &str {
    // Index of the first byte that is neither space nor tab; the whole
    // length when the string is empty or entirely spaces/tabs.
    let start = s
        .as_bytes()
        .iter()
        .position(|&byte| byte != b' ' && byte != b'\t')
        .unwrap_or(s.len());
    // Only ASCII bytes were skipped, so `start` is a char boundary and plain
    // slicing is enough -- no `unsafe` needed.
    &s[start..]
}
155
/// Report whether `s` is a blank line: empty, or made up solely of the ASCII
/// whitespace bytes `' '`, `'\t'`, `'\n'` and `'\r'`.
///
/// Equivalent to `s.trim_end_matches('\n').trim().is_empty()` for
/// ASCII-whitespace inputs, but bypasses the unicode-whitespace iteration in
/// `str::trim`. Any other byte, including non-ASCII whitespace, makes the
/// line non-blank.
#[inline]
pub(crate) fn is_blank_line(s: &str) -> bool {
    s.bytes()
        .all(|byte| matches!(byte, b' ' | b'\t' | b'\n' | b'\r'))
}
166
/// Strip all trailing ASCII space and tab bytes from `s`.
///
/// Equivalent to `s.trim_end_matches([' ', '\t'])`, but scans raw bytes.
/// Line endings and other whitespace are left untouched.
#[inline]
pub(crate) fn trim_end_spaces_tabs(s: &str) -> &str {
    // Position one past the last byte that is neither space nor tab; 0 when
    // the string is empty or entirely spaces/tabs.
    let end = s
        .as_bytes()
        .iter()
        .rposition(|&byte| byte != b' ' && byte != b'\t')
        .map_or(0, |last| last + 1);
    // Only ASCII bytes were dropped from the tail, so `end` is a char
    // boundary and plain slicing is enough -- no `unsafe` needed.
    &s[..end]
}
184
/// Split `input` into lines, keeping each line's terminator attached.
///
/// A line ends at `\n`. A `\r` directly before that `\n` simply stays part of
/// the same line, so both LF and CRLF terminators are preserved verbatim. A
/// `\r` that is not followed by `\n` is ordinary content and does not end a
/// line. A final line without a terminator is included as-is, and empty input
/// yields an empty vector.
pub(crate) fn split_lines_inclusive(input: &str) -> Vec<&str> {
    // `split_inclusive('\n')` cuts after every '\n', never yields a trailing
    // empty piece, and yields nothing for "" -- exactly the contract above.
    input.split_inclusive('\n').collect()
}
221
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_strip_leading_spaces_n() {
        assert_eq!(strip_leading_spaces_n("   text", 3), "text");
        assert_eq!(strip_leading_spaces_n("  text", 3), "text");
        assert_eq!(strip_leading_spaces_n(" text", 3), "text");
        assert_eq!(strip_leading_spaces_n("text", 3), "text");
        assert_eq!(strip_leading_spaces_n("    text", 3), " text");
        // A zero limit strips nothing; tabs are never stripped.
        assert_eq!(strip_leading_spaces_n("  text", 0), "  text");
        assert_eq!(strip_leading_spaces_n(" \ttext", 3), "\ttext");
    }

    #[test]
    fn test_strip_leading_spaces() {
        assert_eq!(strip_leading_spaces("   text"), "text");
        assert_eq!(strip_leading_spaces("    text"), " text");
        assert_eq!(strip_leading_spaces("\ttext"), "\ttext");
    }

    #[test]
    fn test_strip_newline() {
        assert_eq!(strip_newline("text\n"), ("text", "\n"));
        assert_eq!(strip_newline("text\r\n"), ("text", "\r\n"));
        assert_eq!(strip_newline("text"), ("text", ""));
    }

    #[test]
    fn test_trim_end_newlines() {
        assert_eq!(trim_end_newlines("foo\n"), "foo");
        assert_eq!(trim_end_newlines("foo\r\n"), "foo");
        assert_eq!(trim_end_newlines("foo\n\n"), "foo");
        assert_eq!(trim_end_newlines("foo"), "foo");
        assert_eq!(trim_end_newlines(""), "");
        assert_eq!(trim_end_newlines("\n"), "");
        // Non-ASCII byte sequences stay intact.
        assert_eq!(trim_end_newlines("föö\n"), "föö");
    }

    #[test]
    fn test_is_blank_line() {
        assert!(is_blank_line(""));
        assert!(is_blank_line(" \t\r\n"));
        assert!(!is_blank_line("  x\n"));
        // Non-ASCII whitespace does not count as blank.
        assert!(!is_blank_line("\u{a0}"));
    }

    #[test]
    fn test_split_lines_inclusive() {
        assert!(split_lines_inclusive("").is_empty());
        assert_eq!(
            split_lines_inclusive("a\nb\r\nc"),
            vec!["a\n", "b\r\n", "c"]
        );
        assert_eq!(split_lines_inclusive("a\n"), vec!["a\n"]);
        // A lone CR is content, not a line terminator.
        assert_eq!(split_lines_inclusive("a\rb\n"), vec!["a\rb\n"]);
    }

    /// Run `emit_separator_tokens` on `line` inside a `TABLE_SEPARATOR` node
    /// and return the emitted tokens as `(kind, text)` pairs.
    fn separator_tokens(line: &str) -> Vec<(SyntaxKind, String)> {
        let mut builder = GreenNodeBuilder::new();
        builder.start_node(SyntaxKind::TABLE_SEPARATOR.into());
        emit_separator_tokens(&mut builder, line);
        builder.finish_node();
        let node = crate::syntax::SyntaxNode::new_root(builder.finish());
        node.children_with_tokens()
            .filter_map(|el| el.into_token())
            .map(|t| (t.kind(), t.text().to_string()))
            .collect()
    }

    #[test]
    fn test_emit_separator_tokens_reconstruction() {
        // Concatenation of token texts must byte-equal the input.
        for line in [
            "|:--|--:|:-:|\n",
            "+------+:----:+------+\n",
            "+======+======+\r\n",
            "------- ------ ----------\n",
            ":--:",                      // no bounding delims, no newline
            "|:--|--:|?weird|\n",        // unexpected byte falls back to TEXT
            "|:--|héllo wörld|\n",       // multibyte chars in the TEXT fallback
        ] {
            let reconstructed: String = separator_tokens(line)
                .iter()
                .map(|(_, t)| t.as_str())
                .collect();
            assert_eq!(reconstructed, line, "round-trip failed for {line:?}");
        }
    }

    #[test]
    fn test_emit_separator_tokens_kinds() {
        use SyntaxKind::*;
        assert_eq!(
            separator_tokens("|:--|--:|\n"),
            vec![
                (TABLE_SEP_DELIM, "|".to_string()),
                (TABLE_SEP_COLON, ":".to_string()),
                (TABLE_SEP_DASHES, "--".to_string()),
                (TABLE_SEP_DELIM, "|".to_string()),
                (TABLE_SEP_DASHES, "--".to_string()),
                (TABLE_SEP_COLON, ":".to_string()),
                (TABLE_SEP_DELIM, "|".to_string()),
                (NEWLINE, "\n".to_string()),
            ],
        );
        // Interior whitespace in a simple (dash-only) separator.
        assert_eq!(
            separator_tokens("--- ---\n"),
            vec![
                (TABLE_SEP_DASHES, "---".to_string()),
                (TABLE_SEP_WHITESPACE, " ".to_string()),
                (TABLE_SEP_DASHES, "---".to_string()),
                (NEWLINE, "\n".to_string()),
            ],
        );
        // Grid `===` divider.
        assert_eq!(
            separator_tokens("+===+===+\n"),
            vec![
                (TABLE_SEP_DELIM, "+".to_string()),
                (TABLE_SEP_EQUALS, "===".to_string()),
                (TABLE_SEP_DELIM, "+".to_string()),
                (TABLE_SEP_EQUALS, "===".to_string()),
                (TABLE_SEP_DELIM, "+".to_string()),
                (NEWLINE, "\n".to_string()),
            ],
        );
        // Unrecognized multibyte content is kept whole in a single TEXT token.
        assert_eq!(
            separator_tokens("|é|\n"),
            vec![
                (TABLE_SEP_DELIM, "|".to_string()),
                (TEXT, "é".to_string()),
                (TABLE_SEP_DELIM, "|".to_string()),
                (NEWLINE, "\n".to_string()),
            ],
        );
    }

    #[test]
    fn test_trim_spaces_tabs() {
        assert_eq!(trim_start_spaces_tabs("  \tfoo"), "foo");
        assert_eq!(trim_start_spaces_tabs("foo"), "foo");
        assert_eq!(trim_start_spaces_tabs(""), "");
        assert_eq!(trim_end_spaces_tabs("foo  \t"), "foo");
        assert_eq!(trim_end_spaces_tabs("foo"), "foo");
        assert_eq!(trim_end_spaces_tabs(""), "");
        assert_eq!(trim_end_spaces_tabs("föö  "), "föö");
    }
}