Skip to main content

rlsp_yaml_parser/
pos.rs

1// SPDX-License-Identifier: MIT
2
3/// A position within the input stream.
4///
5/// `line` is 1-based; `column` is 0-based (codepoints from the start of the line).
6///
7/// Used internally by the lexer and for error reporting. Consumers that need
8/// line/column from a `Span` should use [`LineIndex`] instead.
9#[derive(Debug, Clone, Copy, PartialEq, Eq)]
10pub struct Pos {
11    /// Byte offset from the start of the input (0-based).
12    pub byte_offset: usize,
13    /// Line number (1-based).
14    pub line: usize,
15    /// Codepoint column within the current line (0-based).
16    pub column: usize,
17}
18
19impl Pos {
20    /// The position representing the start of a document.
21    pub const ORIGIN: Self = Self {
22        byte_offset: 0,
23        line: 1,
24        column: 0,
25    };
26
27    /// Advance the position by one character.
28    ///
29    /// If `ch` is a line feed (`\n`) the line counter is incremented and the
30    /// column is reset to 0.  For all other characters the column advances by
31    /// one.  `byte_offset` advances by `ch.len_utf8()`.
32    #[must_use]
33    pub const fn advance(self, ch: char) -> Self {
34        let byte_offset = self.byte_offset + ch.len_utf8();
35        if ch == '\n' {
36            Self {
37                byte_offset,
38                line: self.line + 1,
39                column: 0,
40            }
41        } else {
42            Self {
43                byte_offset,
44                line: self.line,
45                column: self.column + 1,
46            }
47        }
48    }
49}
50
51/// Compute the 0-based column (codepoint count) for a position within a line.
52///
53/// `byte_offset_in_line` must be a valid byte-boundary index into `line_content`.
54/// Uses an ASCII fast path: if the prefix is pure ASCII, the column equals the
55/// byte offset (1 byte = 1 codepoint).
56pub fn column_at(line_content: &str, byte_offset_in_line: usize) -> usize {
57    let prefix = &line_content[..byte_offset_in_line];
58    if prefix.is_ascii() {
59        byte_offset_in_line
60    } else {
61        prefix.chars().count()
62    }
63}
64
65/// Advance `pos` past `content`, assuming `content` contains no line break.
66/// Uses the ASCII fast path in [`column_at`].
67pub fn advance_within_line(pos: Pos, content: &str) -> Pos {
68    Pos {
69        byte_offset: pos.byte_offset + content.len(),
70        line: pos.line,
71        column: pos.column + column_at(content, content.len()),
72    }
73}
74
75// ---------------------------------------------------------------------------
76// LineIndex
77// ---------------------------------------------------------------------------
78
79/// An index over a document's byte offsets that resolves byte offsets to
80/// `(line, column)` pairs on demand.
81///
82/// Constructed once per document from the source `&str`. Stores the byte
83/// offset of each line terminator (`\n`, `\r`, or `\r\n`) in sorted order.
84/// Line/column are resolved via binary search in O(log n) time.
85///
86/// Line numbers are 1-based; column numbers are 0-based codepoint counts.
87#[derive(Debug, Clone, PartialEq, Eq)]
88pub struct LineIndex {
89    /// Sorted byte offsets of newline characters in the source.
90    /// For `\r\n` pairs, stores the offset of the `\r`.
91    newlines: Vec<u32>,
92    /// Full source kept for codepoint-column computation.
93    source: String,
94}
95
96impl LineIndex {
97    /// Build a `LineIndex` from the document source string.
98    #[must_use]
99    pub fn new(source: &str) -> Self {
100        let mut newlines = Vec::new();
101        let mut chars = source.char_indices().peekable();
102        while let Some((i, ch)) = chars.next() {
103            match ch {
104                '\r' => {
105                    #[expect(
106                        clippy::cast_possible_truncation,
107                        reason = "YAML files <= 4 GB; u32 offset is sufficient"
108                    )]
109                    newlines.push(i as u32);
110                    // Consume the following `\n` of a CRLF pair.
111                    if chars.peek().is_some_and(|(_, next)| *next == '\n') {
112                        let _ = chars.next();
113                    }
114                }
115                '\n' => {
116                    #[expect(
117                        clippy::cast_possible_truncation,
118                        reason = "YAML files <= 4 GB; u32 offset is sufficient"
119                    )]
120                    newlines.push(i as u32);
121                }
122                _ => {}
123            }
124        }
125        Self {
126            newlines,
127            source: source.to_owned(),
128        }
129    }
130
131    /// Resolve a byte offset to a `(line, column)` pair.
132    ///
133    /// `line` is 1-based; `column` is the 0-based codepoint count from the
134    /// start of the line.
135    ///
136    /// The offset must be a valid byte boundary within the source string.
137    #[must_use]
138    pub fn line_column(&self, offset: u32) -> (u32, u32) {
139        // Binary-search for the number of newlines before `offset`.
140        // `partition_point` returns the index of the first element >= offset,
141        // so all elements before that index are < offset (i.e., preceding newlines).
142        let newline_idx = self.newlines.partition_point(|&nl| nl < offset);
143        #[expect(
144            clippy::cast_possible_truncation,
145            reason = "line count fits u32 for any realistic document"
146        )]
147        let line = (newline_idx as u32) + 1;
148
149        // Byte offset of the start of this line.
150        let line_start_byte = if newline_idx == 0 {
151            0usize
152        } else {
153            // Safety: newline_idx > 0 here, so newline_idx - 1 is valid.
154            #[expect(clippy::indexing_slicing, reason = "newline_idx > 0 is checked above")]
155            let nl_byte = self.newlines[newline_idx - 1] as usize;
156            // The stored byte is the \r or \n character itself.
157            // Advance past it (and a following \n for CRLF).
158            let nl_char = self
159                .source
160                .get(nl_byte..)
161                .and_then(|s| s.chars().next())
162                .unwrap_or('\n');
163            if nl_char == '\r'
164                && self
165                    .source
166                    .get(nl_byte + 1..nl_byte + 2)
167                    .is_some_and(|s| s == "\n")
168            {
169                nl_byte + 2
170            } else {
171                nl_byte + 1
172            }
173        };
174
175        // Count codepoints from line start to the requested offset.
176        let col_prefix = self
177            .source
178            .get(line_start_byte..offset as usize)
179            .unwrap_or("");
180        #[expect(
181            clippy::cast_possible_truncation,
182            reason = "column fits u32 for any realistic line"
183        )]
184        let column = if col_prefix.is_ascii() {
185            col_prefix.len() as u32
186        } else {
187            col_prefix.chars().count() as u32
188        };
189
190        (line, column)
191    }
192}
193
194// ---------------------------------------------------------------------------
195// Span
196// ---------------------------------------------------------------------------
197
198/// A half-open span `[start, end)` within the input stream, stored as byte
199/// offsets.
200///
201/// Use [`LineIndex`] (available from [`crate::node::Document::line_index`])
202/// to convert offsets to `(line, column)` pairs on demand.
203#[derive(Debug, Clone, Copy, PartialEq, Eq)]
204pub struct Span {
205    /// Inclusive start byte offset of the span.
206    pub start: u32,
207    /// Exclusive end byte offset of the span.
208    pub end: u32,
209}
210
211impl Span {
212    /// Construct a `Span` from a `Pos` pair.
213    ///
214    /// Used internally during parsing to convert eager `Pos` values to compact
215    /// byte-offset spans.
216    #[must_use]
217    #[expect(
218        clippy::cast_possible_truncation,
219        reason = "YAML files <= 4 GB; u32 offset is sufficient"
220    )]
221    pub(crate) const fn from_pos(start: Pos, end: Pos) -> Self {
222        Self {
223            start: start.byte_offset as u32,
224            end: end.byte_offset as u32,
225        }
226    }
227
228    /// Return `(line, column)` for the start of this span.
229    ///
230    /// Line is 1-based; column is 0-based codepoint count from line start.
231    #[must_use]
232    pub fn start_line_column(&self, index: &LineIndex) -> (u32, u32) {
233        index.line_column(self.start)
234    }
235
236    /// Return `(line, column)` for the end of this span.
237    ///
238    /// Line is 1-based; column is 0-based codepoint count from line start.
239    #[must_use]
240    pub fn end_line_column(&self, index: &LineIndex) -> (u32, u32) {
241        index.line_column(self.end)
242    }
243}
244
245// ---------------------------------------------------------------------------
246// Tests
247// ---------------------------------------------------------------------------
248
249#[cfg(test)]
250mod tests {
251    use proptest::prelude::*;
252    use rstest::rstest;
253
254    use super::*;
255
256    // -----------------------------------------------------------------------
257    // Pos tests (kept because Pos is still a public type)
258    // -----------------------------------------------------------------------
259
260    #[test]
261    fn pos_origin_is_start_of_document() {
262        let pos = Pos::ORIGIN;
263        assert_eq!(pos.byte_offset, 0);
264        assert_eq!(pos.line, 1);
265        assert_eq!(pos.column, 0);
266    }
267
268    #[test]
269    fn pos_fields_are_accessible() {
270        let pos = Pos {
271            byte_offset: 10,
272            line: 3,
273            column: 4,
274        };
275        assert_eq!(pos.byte_offset, 10);
276        assert_eq!(pos.line, 3);
277        assert_eq!(pos.column, 4);
278    }
279
280    #[test]
281    fn pos_is_copy() {
282        let pos = Pos::ORIGIN;
283        let pos2 = pos;
284        let _ = pos.byte_offset;
285        let _ = pos2.byte_offset;
286    }
287
288    #[rstest]
289    #[case::ascii_char('a', 1, 1, 1)]
290    #[case::newline('\n', 1, 2, 0)]
291    #[case::multibyte_cjk('中', 3, 1, 1)]
292    fn advance_basic(
293        #[case] ch: char,
294        #[case] expected_byte_offset: usize,
295        #[case] expected_line: usize,
296        #[case] expected_column: usize,
297    ) {
298        let pos = Pos::ORIGIN.advance(ch);
299        assert_eq!(pos.byte_offset, expected_byte_offset);
300        assert_eq!(pos.line, expected_line);
301        assert_eq!(pos.column, expected_column);
302    }
303
304    #[test]
305    fn advance_multiple_lines() {
306        let pos = Pos::ORIGIN
307            .advance('a')
308            .advance('\n')
309            .advance('b')
310            .advance('\n')
311            .advance('c');
312        assert_eq!(pos.line, 3);
313        assert_eq!(pos.column, 1);
314    }
315
316    // -----------------------------------------------------------------------
317    // Span tests
318    // -----------------------------------------------------------------------
319
320    // SZ-1
321    #[test]
322    fn span_size_is_eight_bytes() {
323        assert_eq!(std::mem::size_of::<Span>(), 8);
324    }
325
326    const _: () = assert!(
327        std::mem::size_of::<Span>() == 8,
328        "Span must be exactly 8 bytes"
329    );
330
331    #[test]
332    fn span_is_copy() {
333        let span = Span { start: 0, end: 0 };
334        let span2 = span;
335        let _ = span.start;
336        let _ = span2.start;
337    }
338
339    // -----------------------------------------------------------------------
340    // column_at
341    // -----------------------------------------------------------------------
342
343    #[rstest]
344    #[case::empty_prefix("hello", 0, 0)]
345    #[case::ascii_mid_line("hello world", 5, 5)]
346    #[case::ascii_full_line("abc", 3, 3)]
347    #[case::multibyte_only_prefix("日本語xyz", 9, 3)]
348    #[case::ascii_then_multibyte("ab日本", 8, 4)]
349    #[case::multibyte_then_ascii("日ab", 5, 3)]
350    #[case::full_multibyte_line("日本語", 9, 3)]
351    fn column_at_cases(
352        #[case] line_content: &str,
353        #[case] byte_offset: usize,
354        #[case] expected: usize,
355    ) {
356        assert_eq!(column_at(line_content, byte_offset), expected);
357    }
358
359    // -----------------------------------------------------------------------
360    // advance_within_line
361    // -----------------------------------------------------------------------
362
363    #[rstest]
364    #[case::empty_content(Pos { byte_offset: 5, line: 2, column: 3 }, "", 5, 2, 3)]
365    #[case::ascii_from_origin(Pos::ORIGIN, "hello", 5, 1, 5)]
366    #[case::ascii_mid_line(Pos { byte_offset: 10, line: 3, column: 4 }, "abc", 13, 3, 7)]
367    #[case::multibyte_from_origin(Pos::ORIGIN, "日本語", 9, 1, 3)]
368    #[case::multibyte_mid_line(Pos { byte_offset: 4, line: 1, column: 2 }, "日本語", 13, 1, 5)]
369    #[case::mixed_ascii_then_multibyte(Pos::ORIGIN, "ab日", 5, 1, 3)]
370    fn advance_within_line_fields(
371        #[case] start: Pos,
372        #[case] content: &str,
373        #[case] expected_byte_offset: usize,
374        #[case] expected_line: usize,
375        #[case] expected_column: usize,
376    ) {
377        let result = advance_within_line(start, content);
378        assert_eq!(result.byte_offset, expected_byte_offset);
379        assert_eq!(result.line, expected_line);
380        assert_eq!(result.column, expected_column);
381    }
382
383    #[test]
384    fn advance_within_line_line_field_is_preserved() {
385        let pos = Pos {
386            byte_offset: 0,
387            line: 7,
388            column: 0,
389        };
390        let result = advance_within_line(pos, "xyz");
391        assert_eq!(result.line, 7);
392    }
393
394    #[test]
395    fn advance_within_line_matches_advance_loop_ascii() {
396        let pos = Pos {
397            byte_offset: 2,
398            line: 1,
399            column: 2,
400        };
401        let content = "abc";
402        let expected = content.chars().fold(pos, super::Pos::advance);
403        assert_eq!(advance_within_line(pos, content), expected);
404    }
405
406    #[test]
407    fn advance_within_line_matches_advance_loop_multibyte() {
408        let pos = Pos {
409            byte_offset: 0,
410            line: 1,
411            column: 0,
412        };
413        let content = "日本語xyz";
414        let expected = content.chars().fold(pos, super::Pos::advance);
415        assert_eq!(advance_within_line(pos, content), expected);
416    }
417
418    // -----------------------------------------------------------------------
419    // LI-*: LineIndex unit tests
420    // -----------------------------------------------------------------------
421
422    // LI-1
423    #[test]
424    fn line_index_empty_string_produces_no_newlines() {
425        let idx = LineIndex::new("");
426        assert!(idx.newlines.is_empty());
427        assert_eq!(idx.line_column(0), (1, 0));
428    }
429
430    // LI-2
431    #[test]
432    fn line_index_single_line_no_newline() {
433        let idx = LineIndex::new("hello");
434        assert_eq!(idx.line_column(0), (1, 0));
435        assert_eq!(idx.line_column(4), (1, 4));
436        assert_eq!(idx.line_column(5), (1, 5));
437    }
438
439    // LI-3
440    #[test]
441    fn line_index_single_newline_at_end() {
442        let idx = LineIndex::new("hello\n");
443        assert_eq!(idx.line_column(0), (1, 0));
444        assert_eq!(idx.line_column(4), (1, 4));
445        assert_eq!(idx.line_column(5), (1, 5)); // the \n byte itself
446        assert_eq!(idx.line_column(6), (2, 0)); // byte past the newline
447    }
448
449    // LI-4
450    #[test]
451    fn line_index_multiple_lines_line_numbers_correct() {
452        let idx = LineIndex::new("a\nb\nc");
453        assert_eq!(idx.line_column(0), (1, 0));
454        assert_eq!(idx.line_column(2), (2, 0));
455        assert_eq!(idx.line_column(4), (3, 0));
456    }
457
458    // LI-5
459    #[test]
460    fn line_index_column_is_codepoint_count_not_byte_count() {
461        // "日本語\nfoo" — 日本語 is 9 bytes / 3 codepoints
462        let idx = LineIndex::new("日本語\nfoo");
463        assert_eq!(idx.line_column(9), (1, 3)); // the \n byte
464        assert_eq!(idx.line_column(10), (2, 0));
465        assert_eq!(idx.line_column(11), (2, 1));
466    }
467
468    // LI-6
469    #[test]
470    fn line_index_ascii_fast_path_matches_general_path() {
471        // "abc\nxyz": x=col0, y=col1, z=col2
472        let idx = LineIndex::new("abc\nxyz");
473        assert_eq!(idx.line_column(5), (2, 1)); // 'y'
474    }
475
476    // LI-7
477    #[test]
478    fn line_index_multibyte_mid_line() {
479        // "ab日xyz\nok" — ab=2B, 日=3B, xyz starts at byte 5
480        let idx = LineIndex::new("ab日xyz\nok");
481        assert_eq!(idx.line_column(2), (1, 2));
482        assert_eq!(idx.line_column(5), (1, 3)); // byte after 日 = col 3
483        assert_eq!(idx.line_column(6), (1, 4)); // x
484        assert_eq!(idx.line_column(8), (1, 6)); // z
485        assert_eq!(idx.line_column(9), (2, 0)); // after \n
486    }
487
488    // LI-8
489    #[test]
490    fn line_index_crlf_line_endings() {
491        // "a\r\nb" — \r at byte 1, \n at byte 2; 'b' at byte 3
492        let idx = LineIndex::new("a\r\nb");
493        assert_eq!(idx.line_column(0), (1, 0));
494        assert_eq!(idx.line_column(3), (2, 0));
495    }
496
497    // LI-9
498    #[test]
499    fn line_index_bare_cr_line_endings() {
500        // "a\rb" — \r at byte 1; 'b' at byte 2
501        let idx = LineIndex::new("a\rb");
502        assert_eq!(idx.line_column(0), (1, 0));
503        assert_eq!(idx.line_column(2), (2, 0));
504    }
505
506    // -----------------------------------------------------------------------
507    // RT-*: Property tests — round-trip correctness
508    // -----------------------------------------------------------------------
509
510    /// Oracle: drive `Pos::advance` through each char up to `offset`, return (line, col).
511    fn eager_line_column(source: &str, offset: usize) -> (u32, u32) {
512        let mut pos = Pos::ORIGIN;
513        for ch in source.chars() {
514            if pos.byte_offset >= offset {
515                break;
516            }
517            pos = pos.advance(ch);
518        }
519        #[expect(
520            clippy::cast_possible_truncation,
521            reason = "test oracle: values fit u32 in realistic inputs"
522        )]
523        (pos.line as u32, pos.column as u32)
524    }
525
526    proptest! {
527        // RT-1: ASCII inputs
528        #[test]
529        #[expect(
530            clippy::unwrap_used,
531            clippy::cast_possible_truncation,
532            reason = "proptest: regex strategy is valid; offset fits u32 for any string of length <= 50"
533        )]
534        fn line_index_line_column_matches_advance_loop_ascii(
535            input in proptest::string::string_regex("[a-z\n]{0,50}").unwrap()
536        ) {
537            let idx = LineIndex::new(&input);
538            let mut offset = 0usize;
539            for ch in input.chars() {
540                let expected = eager_line_column(&input, offset);
541                let got = idx.line_column(offset as u32);
542                prop_assert_eq!(
543                    got, expected,
544                    "mismatch at offset {} in {:?}", offset, input
545                );
546                offset += ch.len_utf8();
547            }
548            // Also test at end-of-string offset.
549            if offset <= input.len() {
550                let expected = eager_line_column(&input, offset);
551                let got = idx.line_column(offset as u32);
552                prop_assert_eq!(got, expected, "mismatch at end offset {}", offset);
553            }
554        }
555
556        // RT-2: Unicode inputs including multibyte characters
557        #[test]
558        #[expect(
559            clippy::unwrap_used,
560            clippy::cast_possible_truncation,
561            reason = "proptest: regex strategy is valid; offset fits u32 for any string of length <= 30"
562        )]
563        fn line_index_line_column_matches_advance_loop_multibyte(
564            input in proptest::string::string_regex("[a-z\n\u{4E00}-\u{4E10}]{0,30}").unwrap()
565        ) {
566            let idx = LineIndex::new(&input);
567            let mut offset = 0usize;
568            for ch in input.chars() {
569                let expected = eager_line_column(&input, offset);
570                let got = idx.line_column(offset as u32);
571                prop_assert_eq!(
572                    got, expected,
573                    "mismatch at offset {} in {:?}", offset, input
574                );
575                offset += ch.len_utf8();
576            }
577        }
578    }
579}