Skip to main content

rlsp_yaml_parser/
lines.rs

1// SPDX-License-Identifier: MIT
2
3//! Line-at-a-time buffer with one-line lookahead for the streaming parser.
4//!
5//! `LineBuffer` wraps an `&'input str` and yields one [`Line`] at a time,
6//! always keeping the *next* line primed in an internal slot so callers can
7//! peek at the next line's indent without consuming it.  It never scans the
8//! full input up front, giving O(1) first-event latency.
9
10use std::collections::VecDeque;
11
12use crate::pos::Pos;
13
14// ---------------------------------------------------------------------------
15// Public types
16// ---------------------------------------------------------------------------
17
/// How a [`Line`] is terminated in the source text.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BreakType {
    /// A single line feed (`\n`).
    Lf,
    /// A lone carriage return (`\r`) that is not followed by `\n`.
    Cr,
    /// A carriage return immediately followed by a line feed (`\r\n`).
    CrLf,
    /// No terminator at all: the line runs to the end of the input.
    Eof,
}
30
31impl BreakType {
32    /// Byte length of this line terminator (0 for Eof).
33    #[must_use]
34    pub const fn byte_len(self) -> usize {
35        match self {
36            Self::Lf | Self::Cr => 1,
37            Self::CrLf => 2,
38            Self::Eof => 0,
39        }
40    }
41
42    /// Advance `pos` past this line break.
43    ///
44    /// Each break type requires distinct logic because `Pos::advance(char)`
45    /// operates on individual characters and cannot distinguish bare `\r`
46    /// from `\r\n`.
47    #[must_use]
48    pub const fn advance(self, mut pos: Pos) -> Pos {
49        match self {
50            Self::Lf => pos.advance('\n'),
51            Self::CrLf => {
52                pos.byte_offset += '\r'.len_utf8();
53                pos.advance('\n')
54            }
55            Self::Cr => {
56                pos.byte_offset += '\r'.len_utf8();
57                pos.line += 1;
58                pos.column = 0;
59                pos
60            }
61            Self::Eof => pos,
62        }
63    }
64}
65
66/// A single logical line extracted from the input.
67#[derive(Debug, Clone, PartialEq, Eq)]
68pub struct Line<'input> {
69    /// The line content slice, **excluding** the terminator.
70    pub content: &'input str,
71    /// Byte offset of `content` within the original input string.
72    pub offset: usize,
73    /// Number of leading `SPACE` (`\x20`) characters.  Leading tabs do not
74    /// contribute to indent — they are a YAML syntax error in indentation
75    /// context and are reported by the lexer, not here.
76    pub indent: usize,
77    /// The terminator that ends this line.
78    pub break_type: BreakType,
79    /// Position of the first byte of this line (after BOM stripping on line 1).
80    pub pos: Pos,
81}
82
83// ---------------------------------------------------------------------------
84// Internal helpers
85// ---------------------------------------------------------------------------
86
87/// Detect the line break at the start of `s` and return `(BreakType, rest)`.
88///
89/// CRLF is checked first so that `\r\n` is consumed as a unit rather than
90/// treating `\r` as a bare CR.
91fn detect_break(s: &str) -> (BreakType, &str) {
92    if let Some(rest) = s.strip_prefix("\r\n") {
93        return (BreakType::CrLf, rest);
94    }
95    if let Some(rest) = s.strip_prefix('\r') {
96        return (BreakType::Cr, rest);
97    }
98    if let Some(rest) = s.strip_prefix('\n') {
99        return (BreakType::Lf, rest);
100    }
101    (BreakType::Eof, s)
102}
103
104/// Scan one line from `remaining`, starting at `pos`.
105///
106/// `is_first` controls BOM stripping: if `true` and the slice starts with
107/// U+FEFF (UTF-8 BOM, 3 bytes), the BOM is skipped before content begins.
108///
109/// Returns `Some((line, rest))` or `None` if `remaining` is empty.
110fn scan_line(remaining: &str, pos: Pos, is_first: bool) -> Option<(Line<'_>, &str)> {
111    if remaining.is_empty() {
112        return None;
113    }
114
115    // Strip BOM on first line only.
116    let (content_start, pos) = if is_first && remaining.starts_with('\u{FEFF}') {
117        let bom_len = '\u{FEFF}'.len_utf8(); // 3 bytes
118        (
119            &remaining[bom_len..],
120            Pos {
121                byte_offset: pos.byte_offset + bom_len,
122                ..pos
123            },
124        )
125    } else {
126        (remaining, pos)
127    };
128
129    // Find the end of line content (position of the first \n or \r).
130    let line_end = content_start
131        .find(['\n', '\r'])
132        .unwrap_or(content_start.len());
133
134    let content = &content_start[..line_end];
135    let after_content = &content_start[line_end..];
136
137    // Determine break type and advance past the terminator.
138    // Try CRLF first (must be checked before bare CR).
139    let (break_type, after_break) = detect_break(after_content);
140
141    // Count leading SPACE characters only (tabs do not count).
142    let indent = content.chars().take_while(|&ch| ch == ' ').count();
143
144    // `offset` is the byte offset of `content` within the *original* input.
145    // `pos` already reflects the position after any BOM skip.
146    let offset = pos.byte_offset;
147
148    let line = Line {
149        content,
150        offset,
151        indent,
152        break_type,
153        pos,
154    };
155
156    Some((line, after_break))
157}
158
159// ---------------------------------------------------------------------------
160// LineBuffer
161// ---------------------------------------------------------------------------
162
163/// A one-line-lookahead buffer over a `&'input str`.
164///
165/// Always holds the *next* line pre-parsed.  Callers use [`Self::peek_next`]
166/// to inspect without consuming and [`Self::consume_next`] to advance.
167pub struct LineBuffer<'input> {
168    /// Remaining unparsed input (past the next line's terminator).
169    remaining: &'input str,
170    /// Synthetic lines prepended by the caller (e.g. inline content extracted
171    /// from a sequence- or mapping-entry line).  Drained front-first before
172    /// `next`.  A `VecDeque` supports multiple pending prepends when parsing
173    /// implicit mapping entries that need to inject both key and value lines.
174    prepend: VecDeque<Line<'input>>,
175    /// The pre-parsed next line, if any.
176    next: Option<Line<'input>>,
177    /// Position at the start of `remaining`.
178    remaining_pos: Pos,
179    /// Whether the next line to be parsed from `remaining` is the first line
180    /// of input (used for BOM detection after the initial prime).
181    remaining_is_first: bool,
182    /// Lookahead buffer for [`Self::peek_until_dedent`].
183    lookahead: Vec<Line<'input>>,
184}
185
186impl<'input> LineBuffer<'input> {
187    /// Construct a new `LineBuffer` and prime the next-line slot.
188    #[must_use]
189    pub fn new(input: &'input str) -> Self {
190        let mut buf = Self {
191            remaining: input,
192            prepend: VecDeque::new(),
193            next: None,
194            remaining_pos: Pos::ORIGIN,
195            remaining_is_first: true,
196            lookahead: Vec::new(),
197        };
198        buf.prime();
199        buf
200    }
201
202    /// Prepend a synthetic line that will be returned by the next call to
203    /// [`Self::peek_next`] / [`Self::consume_next`], ahead of any real lines.
204    ///
205    /// Used to re-present inline content extracted from a sequence- or
206    /// mapping-entry line as if it were a separate line.  Multiple prepends
207    /// are supported: each call pushes to the front of the queue, so the last
208    /// prepended line is returned first (LIFO order).  Callers that need FIFO
209    /// order (key before value) should prepend value first, then key.
210    pub fn prepend_line(&mut self, line: Line<'input>) {
211        self.lookahead.clear();
212        self.prepend.push_front(line);
213    }
214
215    /// Look at the next line without consuming it.
216    ///
217    /// Returns the frontmost prepended synthetic line first (if any), then the
218    /// normally buffered next line.
219    #[must_use]
220    pub fn peek_next(&self) -> Option<&Line<'input>> {
221        self.prepend.front().or(self.next.as_ref())
222    }
223
224    /// Returns `true` if the next line comes from the prepend queue (synthetic),
225    /// rather than from the original input stream.
226    #[must_use]
227    pub fn is_next_synthetic(&self) -> bool {
228        !self.prepend.is_empty()
229    }
230
231    /// Convenience: the indent of the next line, without consuming it.
232    #[must_use]
233    pub fn peek_next_indent(&self) -> Option<usize> {
234        self.peek_next().map(|l| l.indent)
235    }
236
237    /// Peek at the second upcoming line without consuming either.
238    ///
239    /// Handles the prepend queue: the second line may come from the prepend
240    /// queue or from the primed `next` slot or from `remaining`.
241    #[must_use]
242    pub fn peek_second(&self) -> Option<Line<'input>> {
243        // Determine where the "first" line comes from, then find the "second".
244        if !self.prepend.is_empty() {
245            // First line is prepend[0]. Second is prepend[1] if it exists,
246            // else self.next.
247            if self.prepend.len() >= 2 {
248                return self.prepend.get(1).cloned();
249            }
250            return self.next.clone();
251        }
252        // First line is self.next. Second is the first line from `remaining`.
253        self.next.as_ref()?; // ensure first exists
254        scan_line(self.remaining, self.remaining_pos, self.remaining_is_first).map(|(line, _)| line)
255    }
256
257    /// Advance: return the currently primed next line and prime the following
258    /// one from the remaining input.  Returns `None` when no lines remain.
259    ///
260    /// Drains prepended synthetic lines (front-first) before the real buffer.
261    pub fn consume_next(&mut self) -> Option<Line<'input>> {
262        // Drain prepend queue front-first.
263        if let Some(line) = self.prepend.pop_front() {
264            return Some(line);
265        }
266        // Clear any cached lookahead — it was based on the old position.
267        self.lookahead.clear();
268        let line = self.next.take()?;
269        self.prime();
270        Some(line)
271    }
272
273    /// True when no more lines are available (buffer is empty, no prepend, and
274    /// input is exhausted).
275    #[must_use]
276    pub fn at_eof(&self) -> bool {
277        self.prepend.is_empty() && self.next.is_none()
278    }
279
280    /// Scan forward without consuming to collect all lines with
281    /// `indent > base_indent`, stopping at the first line with
282    /// `indent <= base_indent`.  Blank lines (empty content) are transparent
283    /// to the scan and are included in the result regardless of their indent.
284    ///
285    /// Returns a slice of the buffered lookahead lines.  Calling this method
286    /// repeatedly (without consuming) returns the same slice.
287    ///
288    /// Note: trailing blank lines in the returned slice are **not** part of
289    /// the block scalar content — per YAML chomping rules, trailing blank
290    /// lines are stripped, clipped, or kept based on the chomping indicator.
291    /// The consumer (lexer, Task 8) is responsible for trimming them.
292    pub fn peek_until_dedent(&mut self, base_indent: usize) -> &[Line<'input>] {
293        // Rebuild the lookahead starting from the next line.
294        self.lookahead.clear();
295
296        // We need to scan from the next primed line plus additional lines
297        // from `remaining`.  Use a local cursor.
298        let mut cursor_remaining = self.remaining;
299        let mut cursor_pos = self.remaining_pos;
300        let mut cursor_is_first = self.remaining_is_first;
301
302        // The first line in the lookahead is `self.next` (if any).
303        // We include it if it is blank or its indent > base_indent.
304        let start_line = match self.next.as_ref() {
305            None => return &self.lookahead,
306            Some(l) => l.clone(),
307        };
308
309        // Process lines in order: start with `self.next`, then scan from
310        // `remaining`.
311        let mut scanning_next = Some(start_line);
312
313        loop {
314            let line = match scanning_next.take() {
315                Some(l) => l,
316                None => {
317                    // Fetch from remaining input.
318                    match scan_line(cursor_remaining, cursor_pos, cursor_is_first) {
319                        None => break,
320                        Some((l, rest)) => {
321                            cursor_pos = pos_after_line(&l);
322                            cursor_remaining = rest;
323                            cursor_is_first = false;
324                            l
325                        }
326                    }
327                }
328            };
329
330            // Blank lines (empty content) are transparent: include them and
331            // keep scanning.
332            if line.content.is_empty() {
333                self.lookahead.push(line);
334                continue;
335            }
336
337            // Stop before the first non-blank line that is dedented.
338            // base_indent == usize::MAX is the "root level" sentinel meaning
339            // no indent threshold — include all non-blank lines.
340            if base_indent != usize::MAX && line.indent <= base_indent {
341                break;
342            }
343
344            self.lookahead.push(line);
345        }
346
347        &self.lookahead
348    }
349
350    // -----------------------------------------------------------------------
351    // Private helpers
352    // -----------------------------------------------------------------------
353
354    /// Parse one more line from `remaining` into `self.next`.
355    fn prime(&mut self) {
356        match scan_line(self.remaining, self.remaining_pos, self.remaining_is_first) {
357            None => {
358                self.next = None;
359            }
360            Some((line, rest)) => {
361                // Advance `remaining_pos` past the line we just parsed.
362                let new_pos = pos_after_line(&line);
363                self.remaining_pos = new_pos;
364                self.remaining = rest;
365                self.remaining_is_first = false;
366                self.next = Some(line);
367            }
368        }
369    }
370}
371
372/// Compute the `Pos` immediately after the terminator of `line`.
373///
374/// O(1) for `Lf`/`Cr`/`CrLf` — the next line is at `line+1, column=0`.
375/// O(content) for `Eof` — the final line has no terminator, so position stays
376/// on the same line; column advances by the char count of the content via the
377/// ASCII fast path in [`crate::pos::column_at`].
378pub fn pos_after_line(line: &Line<'_>) -> Pos {
379    let byte_offset = line.offset + line.content.len() + line.break_type.byte_len();
380    match line.break_type {
381        BreakType::Eof => Pos {
382            byte_offset,
383            line: line.pos.line,
384            column: line.pos.column + crate::pos::column_at(line.content, line.content.len()),
385        },
386        BreakType::Lf | BreakType::Cr | BreakType::CrLf => Pos {
387            byte_offset,
388            line: line.pos.line + 1,
389            column: 0,
390        },
391    }
392}
393
394// ---------------------------------------------------------------------------
395// Tests
396// ---------------------------------------------------------------------------
397
398#[cfg(test)]
399mod tests {
400    use rstest::rstest;
401
402    use super::*;
403
404    // -----------------------------------------------------------------------
405    // BreakType::advance
406    // -----------------------------------------------------------------------
407
408    #[rstest]
409    #[case::break_type_advance_lf(BreakType::Lf, Pos::ORIGIN, 1, 2, 0)]
410    #[case::break_type_advance_crlf(BreakType::CrLf, Pos::ORIGIN, 2, 2, 0)]
411    // \r = 1 byte, \n = 1 byte → 2 bytes total for CrLf
412    #[case::break_type_advance_lf_at_non_origin_pos(BreakType::Lf, Pos { byte_offset: 5, line: 2, column: 3 }, 6, 3, 0)]
413    #[case::break_type_advance_crlf_at_non_origin_pos(BreakType::CrLf, Pos { byte_offset: 5, line: 2, column: 3 }, 7, 3, 0)]
414    #[case::break_type_advance_cr_resets_column(BreakType::Cr, Pos { byte_offset: 3, line: 1, column: 3 }, 4, 2, 0)]
415    fn break_type_advance_all_fields(
416        #[case] break_type: BreakType,
417        #[case] input: Pos,
418        #[case] expected_byte_offset: usize,
419        #[case] expected_line: usize,
420        #[case] expected_column: usize,
421    ) {
422        let after = break_type.advance(input);
423        assert_eq!(after.byte_offset, expected_byte_offset);
424        assert_eq!(after.line, expected_line);
425        assert_eq!(after.column, expected_column);
426    }
427
428    #[test]
429    fn break_type_advance_cr_increments_line() {
430        let pos = Pos::ORIGIN;
431        let after = BreakType::Cr.advance(pos);
432        assert_eq!(after.line, 2);
433    }
434
435    #[test]
436    fn break_type_advance_eof_is_noop() {
437        let pos = Pos {
438            byte_offset: 5,
439            line: 3,
440            column: 2,
441        };
442        let after = BreakType::Eof.advance(pos);
443        assert_eq!(after, pos);
444    }
445
446    // -----------------------------------------------------------------------
447    // new and initial state
448    // -----------------------------------------------------------------------
449
450    #[rstest]
451    #[case::new_single_line_with_lf_primes_first_line("foo\n", "foo", BreakType::Lf)]
452    #[case::new_input_with_only_lf_primes_empty_line("\n", "", BreakType::Lf)]
453    fn new_single_line_peek(
454        #[case] input: &str,
455        #[case] expected_content: &str,
456        #[case] expected_break: BreakType,
457    ) {
458        let buf = LineBuffer::new(input);
459        let Some(line) = buf.peek_next() else {
460            unreachable!("expected a line");
461        };
462        assert_eq!(line.content, expected_content);
463        assert_eq!(line.break_type, expected_break);
464    }
465
466    #[test]
467    fn new_empty_input_at_eof_immediately() {
468        let buf = LineBuffer::new("");
469        assert!(buf.peek_next().is_none());
470        assert!(buf.at_eof());
471    }
472
473    #[test]
474    fn new_single_line_no_newline_primes_eof_line() {
475        let buf = LineBuffer::new("foo");
476        let Some(line) = buf.peek_next() else {
477            unreachable!("expected a line");
478        };
479        assert_eq!(line.content, "foo");
480        assert_eq!(line.break_type, BreakType::Eof);
481        assert_eq!(line.offset, 0);
482    }
483
484    // -----------------------------------------------------------------------
485    // consume_next sequencing
486    // -----------------------------------------------------------------------
487
488    #[test]
489    fn consume_returns_primed_line_and_advances() {
490        let mut buf = LineBuffer::new("a\nb\n");
491        let Some(first) = buf.consume_next() else {
492            unreachable!("expected first line");
493        };
494        assert_eq!(first.content, "a");
495        assert_eq!(first.break_type, BreakType::Lf);
496        let Some(second) = buf.consume_next() else {
497            unreachable!("expected second line");
498        };
499        assert_eq!(second.content, "b");
500        assert_eq!(second.break_type, BreakType::Lf);
501    }
502
503    #[test]
504    fn consume_after_last_line_returns_none() {
505        let mut buf = LineBuffer::new("foo");
506        assert!(buf.consume_next().is_some());
507        assert!(buf.consume_next().is_none());
508    }
509
510    #[test]
511    fn at_eof_false_before_consuming_last_and_true_after() {
512        let mut buf = LineBuffer::new("foo");
513        assert!(!buf.at_eof());
514        buf.consume_next();
515        assert!(buf.at_eof());
516    }
517
518    #[test]
519    fn consume_all_lines_then_peek_returns_none() {
520        let mut buf = LineBuffer::new("a\nb");
521        buf.consume_next();
522        buf.consume_next();
523        assert!(buf.peek_next().is_none());
524    }
525
526    // -----------------------------------------------------------------------
527    // line terminator types
528    // -----------------------------------------------------------------------
529
530    #[rstest]
531    #[case::only_lf_produces_one_empty_line("\n", BreakType::Lf)]
532    #[case::only_cr_produces_one_empty_line("\r", BreakType::Cr)]
533    #[case::only_crlf_produces_one_empty_line_not_two("\r\n", BreakType::CrLf)]
534    fn single_terminator_produces_empty_line(
535        #[case] input: &str,
536        #[case] expected_break: BreakType,
537    ) {
538        let mut buf = LineBuffer::new(input);
539        let Some(line) = buf.consume_next() else {
540            unreachable!("expected a line");
541        };
542        assert_eq!(line.content, "");
543        assert_eq!(line.break_type, expected_break);
544        assert!(buf.consume_next().is_none());
545    }
546
547    #[test]
548    fn lf_terminator_produces_lf_break_type() {
549        let mut buf = LineBuffer::new("a\n");
550        let Some(line) = buf.consume_next() else {
551            unreachable!("expected a line");
552        };
553        assert_eq!(line.break_type, BreakType::Lf);
554    }
555
556    #[test]
557    fn crlf_terminator_produces_crlf_break_type_not_two_lines() {
558        let mut buf = LineBuffer::new("a\r\nb");
559        let Some(first) = buf.consume_next() else {
560            unreachable!("expected first");
561        };
562        assert_eq!(first.content, "a");
563        assert_eq!(first.break_type, BreakType::CrLf);
564        let Some(second) = buf.consume_next() else {
565            unreachable!("expected second");
566        };
567        assert_eq!(second.content, "b");
568        assert_eq!(second.break_type, BreakType::Eof);
569        assert!(buf.consume_next().is_none());
570    }
571
572    #[test]
573    fn bare_cr_terminator_produces_cr_break_type() {
574        let mut buf = LineBuffer::new("a\rb");
575        let Some(first) = buf.consume_next() else {
576            unreachable!("expected first");
577        };
578        assert_eq!(first.content, "a");
579        assert_eq!(first.break_type, BreakType::Cr);
580        let Some(second) = buf.consume_next() else {
581            unreachable!("expected second");
582        };
583        assert_eq!(second.content, "b");
584        assert_eq!(second.break_type, BreakType::Eof);
585    }
586
587    #[test]
588    fn no_terminator_on_last_line_produces_eof_break_type() {
589        let mut buf = LineBuffer::new("a\nb");
590        buf.consume_next();
591        let Some(second) = buf.consume_next() else {
592            unreachable!("expected second");
593        };
594        assert_eq!(second.content, "b");
595        assert_eq!(second.break_type, BreakType::Eof);
596    }
597
598    #[test]
599    fn mixed_line_endings_each_line_has_correct_break_type() {
600        let mut buf = LineBuffer::new("a\nb\r\nc\rd");
601        let types: Vec<BreakType> = (0..4)
602            .filter_map(|_| buf.consume_next().map(|l| l.break_type))
603            .collect();
604        assert_eq!(
605            types,
606            [
607                BreakType::Lf,
608                BreakType::CrLf,
609                BreakType::Cr,
610                BreakType::Eof
611            ]
612        );
613    }
614
615    #[test]
616    fn two_consecutive_lf_produce_two_empty_lines() {
617        let mut buf = LineBuffer::new("\n\n");
618        let Some(first) = buf.consume_next() else {
619            unreachable!("expected first");
620        };
621        assert_eq!(first.content, "");
622        assert_eq!(first.break_type, BreakType::Lf);
623        let Some(second) = buf.consume_next() else {
624            unreachable!("expected second");
625        };
626        assert_eq!(second.content, "");
627        assert_eq!(second.break_type, BreakType::Lf);
628        assert!(buf.consume_next().is_none());
629    }
630
631    #[test]
632    fn trailing_lf_does_not_produce_extra_empty_line() {
633        // A trailing newline terminates the last line; it does not introduce
634        // a new empty line.
635        let mut buf = LineBuffer::new("foo\n");
636        let Some(line) = buf.consume_next() else {
637            unreachable!("expected a line");
638        };
639        assert_eq!(line.content, "foo");
640        assert!(buf.consume_next().is_none());
641    }
642
643    // -----------------------------------------------------------------------
644    // offset and Pos tracking
645    // -----------------------------------------------------------------------
646
647    #[rstest]
648    #[case::pos_line_increments_after_bare_cr("a\rb")]
649    #[case::pos_line_increments_after_crlf("a\r\nb")]
650    fn pos_line_increments_after_terminator(#[case] input: &str) {
651        let mut buf = LineBuffer::new(input);
652        let Some(first) = buf.consume_next() else {
653            unreachable!("expected first");
654        };
655        assert_eq!(first.pos.line, 1);
656        let Some(second) = buf.consume_next() else {
657            unreachable!("expected second");
658        };
659        assert_eq!(second.pos.line, 2);
660        assert_eq!(second.pos.column, 0);
661    }
662
663    #[test]
664    fn offset_is_byte_offset_of_content_start() {
665        let mut buf = LineBuffer::new("foo\nbar\n");
666        let Some(first) = buf.consume_next() else {
667            unreachable!("expected first");
668        };
669        assert_eq!(first.offset, 0);
670        let Some(second) = buf.consume_next() else {
671            unreachable!("expected second");
672        };
673        assert_eq!(second.offset, 4); // "foo\n" = 4 bytes
674    }
675
676    #[test]
677    fn offset_and_pos_byte_offset_agree() {
678        let mut buf = LineBuffer::new("foo\nbar");
679        while let Some(line) = buf.consume_next() {
680            assert_eq!(line.offset, line.pos.byte_offset);
681        }
682    }
683
684    #[test]
685    fn pos_line_number_increments_per_line() {
686        let mut buf = LineBuffer::new("a\nb\nc");
687        let lines: Vec<Line<'_>> = (0..3).filter_map(|_| buf.consume_next()).collect();
688        assert_eq!(lines.len(), 3, "expected 3 lines");
689        assert_eq!(lines.first().map(|l| l.pos.line), Some(1));
690        assert_eq!(lines.get(1).map(|l| l.pos.line), Some(2));
691        assert_eq!(lines.get(2).map(|l| l.pos.line), Some(3));
692    }
693
694    #[test]
695    fn pos_column_is_zero_at_start_of_each_line() {
696        let mut buf = LineBuffer::new("a\nb");
697        while let Some(line) = buf.consume_next() {
698            assert_eq!(line.pos.column, 0);
699        }
700    }
701
702    #[test]
703    fn pos_column_resets_after_bare_cr() {
704        // After consuming a line that ends with bare \r, the next line's
705        // column must be 0, not the column that followed the last content char.
706        let mut buf = LineBuffer::new("abc\rd");
707        buf.consume_next(); // consume "abc"
708        let Some(second) = buf.consume_next() else {
709            unreachable!("expected second");
710        };
711        assert_eq!(second.pos.column, 0);
712    }
713
714    #[test]
715    fn pos_after_mixed_endings_tracks_lines_correctly() {
716        // Input has four lines with three different terminator types.
717        let mut buf = LineBuffer::new("a\nb\r\nc\rd");
718        let lines: Vec<Line<'_>> = (0..4).filter_map(|_| buf.consume_next()).collect();
719        assert_eq!(lines.len(), 4, "expected 4 lines");
720        let line_nums: Vec<usize> = lines.iter().map(|l| l.pos.line).collect();
721        assert_eq!(line_nums, [1, 2, 3, 4]);
722        for line in &lines {
723            assert_eq!(
724                line.pos.column, 0,
725                "line {} should start at column 0",
726                line.pos.line
727            );
728        }
729    }
730
731    #[test]
732    fn multibyte_content_byte_offset_is_byte_based_not_char_based() {
733        // '中' is 3 UTF-8 bytes
734        let mut buf = LineBuffer::new("中\nfoo");
735        let Some(first) = buf.consume_next() else {
736            unreachable!("expected first");
737        };
738        assert_eq!(first.offset, 0);
739        assert_eq!(first.content, "中");
740        let Some(second) = buf.consume_next() else {
741            unreachable!("expected second");
742        };
743        // 3 bytes for '中' + 1 byte for '\n' = 4
744        assert_eq!(second.offset, 4);
745    }
746
747    // -----------------------------------------------------------------------
748    // BOM handling
749    // -----------------------------------------------------------------------
750
751    #[test]
752    fn bom_is_stripped_from_content_of_first_line() {
753        let input = "\u{FEFF}foo\n";
754        let buf = LineBuffer::new(input);
755        let Some(line) = buf.peek_next() else {
756            unreachable!("expected a line");
757        };
758        assert_eq!(line.content, "foo");
759    }
760
761    #[test]
762    fn bom_stripped_line_offset_starts_after_bom_bytes() {
763        let input = "\u{FEFF}foo\n";
764        let buf = LineBuffer::new(input);
765        let Some(line) = buf.peek_next() else {
766            unreachable!("expected a line");
767        };
768        // BOM is U+FEFF = 3 bytes in UTF-8
769        assert_eq!(line.offset, 3);
770        assert_eq!(line.pos.byte_offset, 3);
771    }
772
773    #[test]
774    fn bom_only_stripped_from_first_line() {
775        // A BOM in a non-first line is preserved as data (the lexer will
776        // report it as an error).
777        let input = "foo\n\u{FEFF}bar\n";
778        let mut buf = LineBuffer::new(input);
779        buf.consume_next(); // consume "foo"
780        let Some(second) = buf.consume_next() else {
781            unreachable!("expected second");
782        };
783        assert_eq!(second.content, "\u{FEFF}bar");
784    }
785
786    // -----------------------------------------------------------------------
787    // indent counting
788    // -----------------------------------------------------------------------
789
790    #[rstest]
791    #[case::indent_counts_only_leading_spaces("   foo", 3)]
792    #[case::indent_is_zero_for_no_leading_spaces("foo", 0)]
793    #[case::leading_tab_does_not_count_toward_indent("\tfoo", 0)]
794    #[case::tab_after_spaces_does_not_count("  \tfoo", 2)]
795    #[case::indent_of_blank_line_is_zero("\n", 0)]
796    fn indent_value(#[case] input: &str, #[case] expected: usize) {
797        let buf = LineBuffer::new(input);
798        let Some(line) = buf.peek_next() else {
799            unreachable!("expected a line");
800        };
801        assert_eq!(line.indent, expected);
802    }
803
804    #[test]
805    fn indent_of_spaces_only_line_equals_space_count() {
806        let buf = LineBuffer::new("   \n");
807        let Some(line) = buf.peek_next() else {
808            unreachable!("expected a line");
809        };
810        assert_eq!(line.indent, 3);
811        assert_eq!(line.content, "   ");
812    }
813
814    // -----------------------------------------------------------------------
815    // peek_next_indent
816    // -----------------------------------------------------------------------
817
818    #[rstest]
819    #[case::peek_next_indent_returns_indent_of_next_line("   foo", Some(3))]
820    #[case::peek_next_indent_returns_none_at_eof("", None)]
821    fn peek_next_indent_returns(#[case] input: &str, #[case] expected: Option<usize>) {
822        let buf = LineBuffer::new(input);
823        assert_eq!(buf.peek_next_indent(), expected);
824    }
825
826    #[test]
827    fn peek_next_indent_does_not_consume() {
828        let mut buf = LineBuffer::new("  foo");
829        assert_eq!(buf.peek_next_indent(), Some(2));
830        assert_eq!(buf.peek_next_indent(), Some(2));
831        let Some(line) = buf.consume_next() else {
832            unreachable!("expected a line");
833        };
834        assert_eq!(line.content, "  foo");
835    }
836
837    // -----------------------------------------------------------------------
838    // peek_until_dedent
839    // -----------------------------------------------------------------------
840
841    #[test]
842    fn peek_until_dedent_empty_input_returns_empty_slice() {
843        let mut buf = LineBuffer::new("");
844        assert!(buf.peek_until_dedent(0).is_empty());
845    }
846
847    #[test]
848    fn peek_until_dedent_returns_lines_until_indent_le_base() {
849        let mut buf = LineBuffer::new("  a\n  b\nc\n");
850        let lines = buf.peek_until_dedent(1);
851        assert_eq!(lines.len(), 2);
852        assert_eq!(lines.first().map(|l| l.content), Some("  a"));
853        assert_eq!(lines.get(1).map(|l| l.content), Some("  b"));
854    }
855
856    #[test]
857    fn peek_until_dedent_does_not_consume_lines() {
858        let mut buf = LineBuffer::new("  a\n  b\nc\n");
859        let _ = buf.peek_until_dedent(1);
860        let Some(first) = buf.consume_next() else {
861            unreachable!("expected first");
862        };
863        assert_eq!(first.content, "  a");
864    }
865
866    #[test]
867    fn peek_until_dedent_includes_all_lines_when_no_dedent_occurs() {
868        let mut buf = LineBuffer::new("  a\n  b\n  c");
869        let lines = buf.peek_until_dedent(1);
870        assert_eq!(lines.len(), 3);
871    }
872
873    #[test]
874    fn peek_until_dedent_returns_empty_slice_when_first_line_already_dedented() {
875        let mut buf = LineBuffer::new("a\n  b\n");
876        let lines = buf.peek_until_dedent(1);
877        // "a" has indent 0 <= 1, so stop immediately
878        assert!(lines.is_empty());
879    }
880
881    #[test]
882    fn peek_until_dedent_second_call_returns_same_slice() {
883        let mut buf = LineBuffer::new("  a\n  b\nc");
884        let first_call: Vec<String> = buf
885            .peek_until_dedent(1)
886            .iter()
887            .map(|l| l.content.to_owned())
888            .collect();
889        let second_call: Vec<String> = buf
890            .peek_until_dedent(1)
891            .iter()
892            .map(|l| l.content.to_owned())
893            .collect();
894        assert_eq!(first_call, second_call);
895        assert_eq!(first_call, ["  a", "  b"]);
896    }
897
898    #[test]
899    fn peek_until_dedent_base_zero_stops_at_non_indented_lines() {
900        // base_indent=0: stop at lines with indent <= 0 (i.e., indent == 0).
901        // Both lines here have indent > 0, so all are included.
902        let mut buf = LineBuffer::new("  a\n  b\n");
903        let lines = buf.peek_until_dedent(0);
904        assert_eq!(lines.len(), 2);
905    }
906
907    #[test]
908    fn peek_until_dedent_blank_lines_are_transparent() {
909        // Blank lines (empty content) are transparent: they are included in
910        // the result and do not halt the scan.
911        // "  a" (indent 2 > 1) -> included
912        // ""    (blank)         -> transparent, included
913        // "  b" (indent 2 > 1) -> included
914        // "c"   (indent 0 <= 1) -> stop
915        let mut buf = LineBuffer::new("  a\n\n  b\nc");
916        let lines = buf.peek_until_dedent(1);
917        assert_eq!(lines.len(), 3);
918        assert_eq!(lines.first().map(|l| l.content), Some("  a"));
919        assert_eq!(lines.get(1).map(|l| l.content), Some(""));
920        assert_eq!(lines.get(2).map(|l| l.content), Some("  b"));
921    }
922
923    // -----------------------------------------------------------------------
924    // pos_after_line
925    // -----------------------------------------------------------------------
926
927    #[rstest]
928    #[case::pos_after_line_lf_ascii(Line { content: "hello", offset: 0, indent: 0, break_type: BreakType::Lf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 6, 2, 0)]
929    #[case::pos_after_line_lf_empty_content(Line { content: "", offset: 10, indent: 0, break_type: BreakType::Lf, pos: Pos { byte_offset: 10, line: 3, column: 0 } }, 11, 4, 0)]
930    #[case::pos_after_line_lf_multibyte(Line { content: "日本", offset: 0, indent: 0, break_type: BreakType::Lf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 7, 2, 0)]
931    // 6 bytes + 1 for \n = 7
932    #[case::pos_after_line_cr_ascii(Line { content: "abc", offset: 0, indent: 0, break_type: BreakType::Cr, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 4, 2, 0)]
933    #[case::pos_after_line_cr_empty_content(Line { content: "", offset: 5, indent: 0, break_type: BreakType::Cr, pos: Pos { byte_offset: 5, line: 2, column: 0 } }, 6, 3, 0)]
934    #[case::pos_after_line_crlf_ascii(Line { content: "key: val", offset: 0, indent: 0, break_type: BreakType::CrLf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 10, 2, 0)]
935    #[case::pos_after_line_crlf_empty_content(Line { content: "", offset: 0, indent: 0, break_type: BreakType::CrLf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 2, 2, 0)]
936    #[case::pos_after_line_eof_empty_content(Line { content: "", offset: 20, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 20, line: 5, column: 0 } }, 20, 5, 0)]
937    #[case::pos_after_line_eof_ascii(Line { content: "last", offset: 10, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 10, line: 3, column: 0 } }, 14, 3, 4)]
938    #[case::pos_after_line_eof_ascii_nonzero_start_column(Line { content: "end", offset: 7, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 7, line: 2, column: 5 } }, 10, 2, 8)]
939    #[case::pos_after_line_eof_multibyte(Line { content: "日本語", offset: 0, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 9, 1, 3)]
940    #[case::pos_after_line_eof_mixed_content(Line { content: "ab日", offset: 0, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 5, 1, 3)]
941    fn pos_after_line_cases(
942        #[case] line: Line<'static>,
943        #[case] expected_byte_offset: usize,
944        #[case] expected_line: usize,
945        #[case] expected_column: usize,
946    ) {
947        let result = pos_after_line(&line);
948        assert_eq!(result.byte_offset, expected_byte_offset);
949        assert_eq!(result.line, expected_line);
950        assert_eq!(result.column, expected_column);
951    }
952}