Skip to main content

perl_parser_core/syntax/
text_line.rs

1//! Text-line cursor helpers.
2//!
3//! This crate has a single responsibility: map cursor offsets to line
4//! boundaries and provide conservative token-boundary primitives for
5//! single-line scanning.
6
7#![deny(unsafe_code)]
8#![warn(rust_2018_idioms)]
9#![warn(missing_docs)]
10
/// Return the byte span of the line containing `cursor_pos`.
///
/// The result is a half-open `(start, end)` pair of byte offsets into `text`:
/// `start` is the first byte of the line and `end` is one past its last byte,
/// i.e. the offset of the terminating `\n` or `text.len()` on the final line.
///
/// Details worth knowing:
///
/// * Only `\n` is treated as a line terminator. The `\r` of a `\r\n` pair is
///   therefore part of the returned span.
/// * A cursor sitting on a `\n` belongs to the line that newline terminates.
/// * `cursor_pos` is clamped to `text.len()`.
/// * `cursor_pos` does not need to fall on a UTF-8 character boundary; the
///   scan works on raw bytes and never panics. Both returned offsets are
///   always valid character boundaries, because each one is `0`,
///   `text.len()`, or directly adjacent to a `\n`.
#[must_use]
pub fn line_bounds_at(text: &str, cursor_pos: usize) -> (usize, usize) {
    let bytes = text.as_bytes();
    let cursor = cursor_pos.min(bytes.len());

    // Search bytes rather than slicing the `&str`: string slicing panics when
    // the cursor lands inside a multi-byte character. `\n` (0x0A) never occurs
    // inside a multi-byte UTF-8 sequence, so the byte scan finds exactly the
    // same newlines a character scan would.
    let (before, after) = bytes.split_at(cursor);
    let start = before
        .iter()
        .rposition(|&b| b == b'\n')
        .map_or(0, |idx| idx + 1);
    let end = after
        .iter()
        .position(|&b| b == b'\n')
        .map_or(bytes.len(), |idx| cursor + idx);
    (start, end)
}
22
/// Report whether `byte` may appear inside an identifier.
///
/// Identifier bytes are the ASCII letters, the ASCII digits and the
/// underscore (`[A-Za-z0-9_]`). Every other value, including all bytes with
/// the high bit set, yields `false`.
#[must_use]
pub fn is_identifier_byte(byte: u8) -> bool {
    matches!(byte, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_')
}
28
29/// Return `true` when token `keyword` bytes in `[start, start + len)` are
30/// bounded on both sides by non-identifier bytes.
31#[must_use]
32pub fn is_keyword_boundary(bytes: &[u8], start: usize, len: usize) -> bool {
33    if start > bytes.len() {
34        return false;
35    }
36
37    let end = start.saturating_add(len);
38    if end > bytes.len() {
39        return false;
40    }
41
42    if start > 0 && is_identifier_byte(bytes[start - 1]) {
43        return false;
44    }
45
46    if end < bytes.len() && is_identifier_byte(bytes[end]) {
47        return false;
48    }
49
50    true
51}
52
/// Move `idx` forward past any run of ASCII whitespace in `bytes`.
///
/// Returns the index of the first byte at or after `idx` that is not ASCII
/// whitespace (as defined by [`u8::is_ascii_whitespace`]), or `bytes.len()`
/// when only whitespace remains. An `idx` at or beyond the end of `bytes` is
/// returned unchanged.
#[must_use]
pub fn skip_ascii_whitespace(bytes: &[u8], mut idx: usize) -> usize {
    // `get` yields `None` for an out-of-range start, leaving `idx` untouched.
    if let Some(rest) = bytes.get(idx..) {
        idx += rest
            .iter()
            .take_while(|b| b.is_ascii_whitespace())
            .count();
    }
    idx
}
61
// Unit tests for the four helpers in this file. They are grouped by the
// function under test; each test pins one input/expected-output pair (or one
// family of inputs) with a plain assertion.
//
// NOTE(review): every test returns `Result<(), Box<dyn std::error::Error>>`
// although none of them uses `?`; presumably a project-wide test convention —
// confirm before simplifying.
#[cfg(test)]
mod tests {
    use super::*;

    // --- line_bounds_at ---
    // Expected values are half-open `(start, end)` byte offsets.

    // Empty text: the only possible span is the empty one at offset 0.
    #[test]
    fn line_bounds_empty_input() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(line_bounds_at("", 0), (0, 0));
        Ok(())
    }

    // With no newline in the text, every cursor position maps to the whole
    // string (the next three tests).
    #[test]
    fn line_bounds_single_line_cursor_at_start() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(line_bounds_at("hello", 0), (0, 5));
        Ok(())
    }

    #[test]
    fn line_bounds_single_line_cursor_at_mid() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(line_bounds_at("hello", 2), (0, 5));
        Ok(())
    }

    #[test]
    fn line_bounds_single_line_cursor_at_end() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(line_bounds_at("hello", 5), (0, 5));
        Ok(())
    }

    #[test]
    fn line_bounds_multiline_cursor_on_first_line() -> Result<(), Box<dyn std::error::Error>> {
        let text = "foo\nbar\nbaz";
        // cursor at 'f' → first line is [0, 3)
        assert_eq!(line_bounds_at(text, 0), (0, 3));
        Ok(())
    }

    #[test]
    fn line_bounds_multiline_cursor_on_second_line() -> Result<(), Box<dyn std::error::Error>> {
        let text = "foo\nbar\nbaz";
        // cursor at 'b' of "bar" (index 4)
        assert_eq!(line_bounds_at(text, 4), (4, 7));
        Ok(())
    }

    #[test]
    fn line_bounds_multiline_cursor_on_last_line() -> Result<(), Box<dyn std::error::Error>> {
        let text = "foo\nbar\nbaz";
        // cursor at 'b' of "baz" (index 8)
        assert_eq!(line_bounds_at(text, 8), (8, 11));
        Ok(())
    }

    // A cursor on a '\n' is attributed to the line that the newline ends.
    #[test]
    fn line_bounds_cursor_on_newline_itself() -> Result<(), Box<dyn std::error::Error>> {
        let text = "foo\nbar";
        // cursor on the '\n' at index 3:
        // start = rfind('\n') in "foo" → None → 0
        // end   = find('\n') in "\nbar" starting at 3 → idx 0 → cursor+0 = 3
        assert_eq!(line_bounds_at(text, 3), (0, 3));
        Ok(())
    }

    #[test]
    fn line_bounds_cursor_past_end() -> Result<(), Box<dyn std::error::Error>> {
        let text = "hello";
        // cursor_pos is clamped to text.len() (5) before use
        assert_eq!(line_bounds_at(text, 100), (0, 5));
        Ok(())
    }

    // Only '\n' ends a line, so with CRLF input the '\r' stays inside the
    // returned span (end = 4 below, one past the '\r' at index 3).
    #[test]
    fn line_bounds_crlf_cursor_on_cr() -> Result<(), Box<dyn std::error::Error>> {
        let text = "foo\r\nbar";
        // cursor on '\r' at index 3
        // start = rfind('\n') in "foo\r" → None → 0
        // end   = find('\n') in "\r\nbar" → index 1 → cursor+1 = 4
        assert_eq!(line_bounds_at(text, 3), (0, 4));
        Ok(())
    }

    #[test]
    fn line_bounds_crlf_cursor_after_lf() -> Result<(), Box<dyn std::error::Error>> {
        let text = "foo\r\nbar";
        // cursor on 'b' at index 5
        // start = rfind('\n') in "foo\r\n" → index 4 → start = 5
        // end   = find('\n') in "bar" → None → text.len() = 8
        assert_eq!(line_bounds_at(text, 5), (5, 8));
        Ok(())
    }

    // --- is_identifier_byte ---
    // The first four tests cover the accepted set `[A-Za-z0-9_]`; the
    // remaining ones cover bytes that must be rejected.

    #[test]
    fn identifier_byte_lowercase_letters() -> Result<(), Box<dyn std::error::Error>> {
        for b in b'a'..=b'z' {
            assert!(is_identifier_byte(b), "expected true for '{}'", b as char);
        }
        Ok(())
    }

    #[test]
    fn identifier_byte_uppercase_letters() -> Result<(), Box<dyn std::error::Error>> {
        for b in b'A'..=b'Z' {
            assert!(is_identifier_byte(b), "expected true for '{}'", b as char);
        }
        Ok(())
    }

    #[test]
    fn identifier_byte_digits() -> Result<(), Box<dyn std::error::Error>> {
        for b in b'0'..=b'9' {
            assert!(is_identifier_byte(b), "expected true for '{}'", b as char);
        }
        Ok(())
    }

    #[test]
    fn identifier_byte_underscore() -> Result<(), Box<dyn std::error::Error>> {
        assert!(is_identifier_byte(b'_'));
        Ok(())
    }

    #[test]
    fn identifier_byte_space_is_false() -> Result<(), Box<dyn std::error::Error>> {
        assert!(!is_identifier_byte(b' '));
        Ok(())
    }

    #[test]
    fn identifier_byte_punctuation_is_false() -> Result<(), Box<dyn std::error::Error>> {
        for b in [b'!', b'@', b'#', b'$', b'%', b'^', b'&', b'*', b'(', b')', b'-', b'+'] {
            assert!(!is_identifier_byte(b), "expected false for '{}'", b as char);
        }
        Ok(())
    }

    #[test]
    fn identifier_byte_control_char_is_false() -> Result<(), Box<dyn std::error::Error>> {
        assert!(!is_identifier_byte(b'\t'));
        assert!(!is_identifier_byte(b'\n'));
        assert!(!is_identifier_byte(0x00));
        Ok(())
    }

    #[test]
    fn identifier_byte_high_bit_is_false() -> Result<(), Box<dyn std::error::Error>> {
        // High-bit bytes are not ASCII alphanumeric and not '_'
        assert!(!is_identifier_byte(0x80));
        assert!(!is_identifier_byte(0xFF));
        Ok(())
    }

    // --- is_keyword_boundary ---
    // Arguments are `(bytes, start, len)`; the token under test is "if"
    // (len 2) in every case except the out-of-range checks.

    #[test]
    fn keyword_boundary_at_index_zero_start() -> Result<(), Box<dyn std::error::Error>> {
        let bytes = b"if foo";
        // "if" at start (index 0, len 2): no preceding byte → left bound ok
        // bytes[2] == b' ' → not identifier → right bound ok
        assert!(is_keyword_boundary(bytes, 0, 2));
        Ok(())
    }

    // `start` beyond the buffer is rejected rather than causing a panic.
    #[test]
    fn keyword_boundary_false_when_start_past_end() -> Result<(), Box<dyn std::error::Error>> {
        let bytes = b"hi";
        assert!(!is_keyword_boundary(bytes, 5, 2));
        Ok(())
    }

    // A token whose end would lie beyond the buffer is rejected as well.
    #[test]
    fn keyword_boundary_false_when_token_runs_past_end() -> Result<(), Box<dyn std::error::Error>> {
        let bytes = b"hi";
        assert!(!is_keyword_boundary(bytes, 0, 10));
        Ok(())
    }

    #[test]
    fn keyword_boundary_false_when_preceded_by_identifier_byte()
    -> Result<(), Box<dyn std::error::Error>> {
        // "if" with a letter immediately before it: "xif "
        let bytes = b"xif bar";
        // start=1, len=2 → bytes[0] = b'x' → identifier → false
        assert!(!is_keyword_boundary(bytes, 1, 2));
        Ok(())
    }

    #[test]
    fn keyword_boundary_false_when_followed_by_identifier_byte()
    -> Result<(), Box<dyn std::error::Error>> {
        // "if" followed immediately by a letter: "iffoo"
        let bytes = b"iffoo";
        // start=0, len=2 → bytes[2] = b'f' → identifier → false
        assert!(!is_keyword_boundary(bytes, 0, 2));
        Ok(())
    }

    #[test]
    fn keyword_boundary_true_at_end_of_input() -> Result<(), Box<dyn std::error::Error>> {
        // "if" at the very end of the buffer with preceding space
        let bytes = b" if";
        // start=1, len=2, end=3 == bytes.len() → right bound ok
        // bytes[0] = b' ' → not identifier → left bound ok
        assert!(is_keyword_boundary(bytes, 1, 2));
        Ok(())
    }

    #[test]
    fn keyword_boundary_true_surrounded_by_whitespace() -> Result<(), Box<dyn std::error::Error>> {
        let bytes = b" if ";
        assert!(is_keyword_boundary(bytes, 1, 2));
        Ok(())
    }

    // Punctuation is not an identifier byte, so it delimits a token just
    // like whitespace does.
    #[test]
    fn keyword_boundary_true_surrounded_by_punctuation() -> Result<(), Box<dyn std::error::Error>> {
        let bytes = b";if;";
        assert!(is_keyword_boundary(bytes, 1, 2));
        Ok(())
    }

    // --- skip_ascii_whitespace ---
    // Arguments are `(bytes, idx)`; the expected value is the index of the
    // first non-whitespace byte at or after `idx`, or the buffer length.

    #[test]
    fn skip_whitespace_empty_input() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(skip_ascii_whitespace(b"", 0), 0);
        Ok(())
    }

    // Nothing to skip: the index comes back unchanged.
    #[test]
    fn skip_whitespace_no_whitespace_at_index() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(skip_ascii_whitespace(b"hello", 0), 0);
        Ok(())
    }

    #[test]
    fn skip_whitespace_space() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(skip_ascii_whitespace(b"   x", 0), 3);
        Ok(())
    }

    #[test]
    fn skip_whitespace_tab() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(skip_ascii_whitespace(b"\t\tx", 0), 2);
        Ok(())
    }

    // Line terminators count as whitespace too, so the scan is not confined
    // to a single line (this test and the next).
    #[test]
    fn skip_whitespace_newline() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(skip_ascii_whitespace(b"\nx", 0), 1);
        Ok(())
    }

    #[test]
    fn skip_whitespace_carriage_return() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(skip_ascii_whitespace(b"\rx", 0), 1);
        Ok(())
    }

    #[test]
    fn skip_whitespace_mixed_whitespace() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(skip_ascii_whitespace(b" \t\n\r!", 0), 4);
        Ok(())
    }

    // With only whitespace remaining, the result is the buffer length.
    #[test]
    fn skip_whitespace_all_whitespace_advances_to_end() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(skip_ascii_whitespace(b"   ", 0), 3);
        Ok(())
    }

    #[test]
    fn skip_whitespace_index_already_past_whitespace() -> Result<(), Box<dyn std::error::Error>> {
        // idx starts after the spaces
        assert_eq!(skip_ascii_whitespace(b"   hello", 3), 3);
        Ok(())
    }

    // Starting inside a whitespace run skips only the remainder of that run.
    #[test]
    fn skip_whitespace_index_mid_whitespace() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(skip_ascii_whitespace(b"x  y", 1), 3);
        Ok(())
    }
}