Skip to main content

panache_parser/parser/utils/
helpers.rs

1//! Shared utilities for block parsing.
2
3use crate::syntax::SyntaxKind;
4use rowan::GreenNodeBuilder;
5
6/// Helper to emit a line's text and newline tokens separately.
7/// Lines from split_lines_inclusive contain trailing newlines (LF or CRLF) that must be separated.
8pub(crate) fn emit_line_tokens(builder: &mut GreenNodeBuilder<'static>, line: &str) {
9    // Handle both CRLF and LF line endings
10    if let Some(text) = line.strip_suffix("\r\n") {
11        builder.token(SyntaxKind::TEXT.into(), text);
12        builder.token(SyntaxKind::NEWLINE.into(), "\r\n");
13    } else if let Some(text) = line.strip_suffix('\n') {
14        builder.token(SyntaxKind::TEXT.into(), text);
15        builder.token(SyntaxKind::NEWLINE.into(), "\n");
16    } else {
17        // No trailing newline (last line of input)
18        builder.token(SyntaxKind::TEXT.into(), line);
19    }
20}
21
/// Remove at most `max_spaces` leading space characters (`' '`) from `line`.
///
/// Only literal spaces are removed; a tab or any other character ends the
/// run. This generalizes `strip_leading_spaces`, which uses a fixed limit of 3.
pub(crate) fn strip_leading_spaces_n(line: &str, max_spaces: usize) -> &str {
    // A space is a single ASCII byte, so inspecting the first `limit` bytes is
    // equivalent to inspecting the first `limit` chars for this purpose, and
    // the resulting offset always lands on a char boundary.
    let limit = max_spaces.min(line.len());
    let stripped = line.as_bytes()[..limit]
        .iter()
        .position(|&b| b != b' ')
        .unwrap_or(limit);
    &line[stripped..]
}
32
33/// Strip up to 3 leading spaces from a line.
34/// This is a convenience wrapper for the common case in Markdown parsing.
35pub(crate) fn strip_leading_spaces(line: &str) -> &str {
36    strip_leading_spaces_n(line, 3)
37}
38
/// Split a single trailing line terminator off `line`.
///
/// Returns `(content, newline)`, where `newline` is `"\r\n"`, `"\n"`, or `""`
/// when the line has no terminator. At most one terminator is removed, and
/// CRLF is recognized in preference to a bare LF.
pub(crate) fn strip_newline(line: &str) -> (&str, &str) {
    let newline_len = if line.ends_with("\r\n") {
        2
    } else if line.ends_with('\n') {
        1
    } else {
        0
    };
    // The terminator is pure ASCII, so the split point is a char boundary.
    line.split_at(line.len() - newline_len)
}
50
/// Remove every trailing `\n` and `\r` byte from `s`.
///
/// Byte-level equivalent of `s.trim_end_matches(['\r', '\n'])`. It is written
/// by hand on purpose: the slice-pattern form goes through the
/// `MultiCharEqSearcher` / `char_indices` codepath, which shows up as a
/// measurable hot frame in per-line block detection.
#[inline]
pub(crate) fn trim_end_newlines(s: &str) -> &str {
    let mut kept = s.as_bytes();
    // Peel one trailing CR/LF byte per iteration until none is left.
    while let [head @ .., b'\n' | b'\r'] = kept {
        kept = head;
    }
    // SAFETY: only ASCII `\n` / `\r` bytes were dropped from the end, and an
    // ASCII byte is never part of a multi-byte sequence, so `kept` is still
    // valid UTF-8 that ends on a char boundary.
    unsafe { std::str::from_utf8_unchecked(kept) }
}
71
/// Remove every leading ASCII space and tab byte from `s`.
///
/// Byte-level equivalent of `s.trim_start_matches([' ', '\t'])`.
#[inline]
pub(crate) fn trim_start_spaces_tabs(s: &str) -> &str {
    let bytes = s.as_bytes();
    // Index of the first byte that is neither a space nor a tab; when there
    // is none the whole input is stripped.
    let start = bytes
        .iter()
        .position(|&b| b != b' ' && b != b'\t')
        .unwrap_or(bytes.len());
    // SAFETY: only ASCII bytes were skipped at the front, so `start` is a
    // char boundary and the remaining suffix is valid UTF-8.
    unsafe { std::str::from_utf8_unchecked(&bytes[start..]) }
}
89
/// Report whether `s` is a blank line: empty, or made up solely of the ASCII
/// whitespace bytes `' '`, `'\t'`, `'\n'` and `'\r'`.
///
/// For inputs containing only that kind of whitespace this matches
/// `s.trim_end_matches('\n').trim().is_empty()`, while skipping the
/// Unicode-whitespace iteration done by `str::trim`. Any other byte,
/// including other whitespace such as form feed or U+00A0, makes the line
/// non-blank.
#[inline]
pub(crate) fn is_blank_line(s: &str) -> bool {
    s.bytes()
        .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
}
100
/// Remove every trailing ASCII space and tab byte from `s`.
///
/// Byte-level equivalent of `s.trim_end_matches([' ', '\t'])`.
#[inline]
pub(crate) fn trim_end_spaces_tabs(s: &str) -> &str {
    let bytes = s.as_bytes();
    // One past the last byte that is neither a space nor a tab; zero when the
    // input consists only of spaces/tabs (or is empty).
    let end = bytes
        .iter()
        .rposition(|&b| b != b' ' && b != b'\t')
        .map_or(0, |last| last + 1);
    // SAFETY: only ASCII bytes were dropped from the end, so `end` is a char
    // boundary and the remaining prefix is valid UTF-8.
    unsafe { std::str::from_utf8_unchecked(&bytes[..end]) }
}
118
/// Split `input` into lines, keeping each line's terminator (LF or CRLF)
/// attached to the line it ends.
///
/// Lines are cut after every `'\n'`. A CRLF pair stays intact because the
/// `'\r'` directly precedes the `'\n'` and therefore lands in the same slice.
/// A lone `'\r'` is not treated as a line break. Empty input yields an empty
/// vector, and a final line without a terminator is returned as-is; no
/// trailing empty line is produced when the input ends with a terminator.
pub(crate) fn split_lines_inclusive(input: &str) -> Vec<&str> {
    // `split_inclusive('\n')` already has exactly these semantics, so there
    // is no need for a hand-written byte scanner with a separate CRLF branch.
    input.split_inclusive('\n').collect()
}
155
#[cfg(test)]
mod tests {
    use super::*;

    /// Up to three leading spaces are removed; a fourth one is kept.
    #[test]
    fn test_strip_leading_spaces_n() {
        let cases = [
            ("   text", "text"),
            ("  text", "text"),
            (" text", "text"),
            ("text", "text"),
            ("    text", " text"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(
                strip_leading_spaces_n(input, 3),
                expected,
                "input: {:?}",
                input
            );
        }
    }

    /// The terminator is returned separately, or as `""` when absent.
    #[test]
    fn test_strip_newline() {
        let cases = [
            ("text\n", ("text", "\n")),
            ("text\r\n", ("text", "\r\n")),
            ("text", ("text", "")),
        ];
        for &(input, expected) in &cases {
            assert_eq!(strip_newline(input), expected, "input: {:?}", input);
        }
    }

    /// All trailing CR/LF bytes are removed.
    #[test]
    fn test_trim_end_newlines() {
        let cases = [
            ("foo\n", "foo"),
            ("foo\r\n", "foo"),
            ("foo\n\n", "foo"),
            ("foo", "foo"),
            ("", ""),
            ("\n", ""),
            // Non-ASCII byte sequences stay intact.
            ("föö\n", "föö"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(trim_end_newlines(input), expected, "input: {:?}", input);
        }
    }

    /// Leading and trailing space/tab trimming, including non-ASCII content.
    #[test]
    fn test_trim_spaces_tabs() {
        let start_cases = [("  \tfoo", "foo"), ("foo", "foo"), ("", "")];
        for &(input, expected) in &start_cases {
            assert_eq!(
                trim_start_spaces_tabs(input),
                expected,
                "input: {:?}",
                input
            );
        }

        let end_cases = [
            ("foo  \t", "foo"),
            ("foo", "foo"),
            ("", ""),
            ("föö  ", "föö"),
        ];
        for &(input, expected) in &end_cases {
            assert_eq!(
                trim_end_spaces_tabs(input),
                expected,
                "input: {:?}",
                input
            );
        }
    }
}