Skip to main content

rlsp_yaml_parser/
lines.rs

1// SPDX-License-Identifier: MIT
2
3//! Line-at-a-time buffer with one-line lookahead for the streaming parser.
4//!
5//! `LineBuffer` wraps an `&'input str` and yields one [`Line`] at a time,
6//! always keeping the *next* line primed in an internal slot so callers can
7//! peek at the next line's indent without consuming it.  It never scans the
8//! full input up front, giving O(1) first-event latency.
9
10use std::collections::VecDeque;
11
12use crate::pos::Pos;
13
14// ---------------------------------------------------------------------------
15// Public types
16// ---------------------------------------------------------------------------
17
/// How a [`Line`] ends: one of the three recognised line terminators, or the
/// end of the input when no terminator is present.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BreakType {
    /// A single line feed, `\n`.
    Lf,
    /// A lone carriage return, `\r`, that is *not* followed by `\n`.
    Cr,
    /// The two-byte sequence `\r\n`, treated as one terminator.
    CrLf,
    /// No terminator at all — the line ran up to the end of the input.
    Eof,
}
30
31impl BreakType {
32    /// Byte length of this line terminator (0 for Eof).
33    #[must_use]
34    pub const fn byte_len(self) -> usize {
35        match self {
36            Self::Lf | Self::Cr => 1,
37            Self::CrLf => 2,
38            Self::Eof => 0,
39        }
40    }
41
42    /// Advance `pos` past this line break.
43    ///
44    /// Each break type requires distinct logic because `Pos::advance(char)`
45    /// operates on individual characters and cannot distinguish bare `\r`
46    /// from `\r\n`.
47    #[must_use]
48    pub const fn advance(self, mut pos: Pos) -> Pos {
49        match self {
50            Self::Lf => pos.advance('\n'),
51            Self::CrLf => {
52                pos.byte_offset += '\r'.len_utf8();
53                pos.advance('\n')
54            }
55            Self::Cr => {
56                pos.byte_offset += '\r'.len_utf8();
57                pos.line += 1;
58                pos.column = 0;
59                pos
60            }
61            Self::Eof => pos,
62        }
63    }
64}
65
66/// A single logical line extracted from the input.
67#[derive(Debug, Clone, PartialEq, Eq)]
68pub struct Line<'input> {
69    /// The line content slice, **excluding** the terminator.
70    pub content: &'input str,
71    /// Byte offset of `content` within the original input string.
72    pub offset: usize,
73    /// Number of leading `SPACE` (`\x20`) characters.  Leading tabs do not
74    /// contribute to indent — they are a YAML syntax error in indentation
75    /// context and are reported by the lexer, not here.
76    pub indent: usize,
77    /// The terminator that ends this line.
78    pub break_type: BreakType,
79    /// Position of the first byte of this line (after BOM stripping on line 1).
80    pub pos: Pos,
81}
82
83// ---------------------------------------------------------------------------
84// Internal helpers
85// ---------------------------------------------------------------------------
86
87/// Detect the line break at the start of `s` and return `(BreakType, rest)`.
88///
89/// CRLF is checked first so that `\r\n` is consumed as a unit rather than
90/// treating `\r` as a bare CR.
91fn detect_break(s: &str) -> (BreakType, &str) {
92    if let Some(rest) = s.strip_prefix("\r\n") {
93        return (BreakType::CrLf, rest);
94    }
95    if let Some(rest) = s.strip_prefix('\r') {
96        return (BreakType::Cr, rest);
97    }
98    if let Some(rest) = s.strip_prefix('\n') {
99        return (BreakType::Lf, rest);
100    }
101    (BreakType::Eof, s)
102}
103
104/// Scan one line from `remaining`, starting at `pos`.
105///
106/// `is_first` controls BOM stripping: if `true` and the slice starts with
107/// U+FEFF (UTF-8 BOM, 3 bytes), the BOM is skipped before content begins.
108///
109/// Returns `Some((line, rest))` or `None` if `remaining` is empty.
110fn scan_line(remaining: &str, pos: Pos, is_first: bool) -> Option<(Line<'_>, &str)> {
111    if remaining.is_empty() {
112        return None;
113    }
114
115    // Strip BOM on first line only.
116    let (content_start, pos) = if is_first && remaining.starts_with('\u{FEFF}') {
117        let bom_len = '\u{FEFF}'.len_utf8(); // 3 bytes
118        (
119            &remaining[bom_len..],
120            Pos {
121                byte_offset: pos.byte_offset + bom_len,
122                ..pos
123            },
124        )
125    } else {
126        (remaining, pos)
127    };
128
129    // Find the end of line content (position of the first \n or \r).
130    let line_end = content_start
131        .find(['\n', '\r'])
132        .unwrap_or(content_start.len());
133
134    let content = &content_start[..line_end];
135    let after_content = &content_start[line_end..];
136
137    // Determine break type and advance past the terminator.
138    // Try CRLF first (must be checked before bare CR).
139    let (break_type, after_break) = detect_break(after_content);
140
141    // Count leading SPACE characters only (tabs do not count).
142    let indent = content.chars().take_while(|&ch| ch == ' ').count();
143
144    // `offset` is the byte offset of `content` within the *original* input.
145    // `pos` already reflects the position after any BOM skip.
146    let offset = pos.byte_offset;
147
148    let line = Line {
149        content,
150        offset,
151        indent,
152        break_type,
153        pos,
154    };
155
156    Some((line, after_break))
157}
158
159// ---------------------------------------------------------------------------
160// LineBuffer
161// ---------------------------------------------------------------------------
162
163/// A one-line-lookahead buffer over a `&'input str`.
164///
165/// Always holds the *next* line pre-parsed.  Callers use [`Self::peek_next`]
166/// to inspect without consuming and [`Self::consume_next`] to advance.
167pub struct LineBuffer<'input> {
168    /// Remaining unparsed input (past the next line's terminator).
169    remaining: &'input str,
170    /// Synthetic lines prepended by the caller (e.g. inline content extracted
171    /// from a sequence- or mapping-entry line).  Drained front-first before
172    /// `next`.  A `VecDeque` supports multiple pending prepends when parsing
173    /// implicit mapping entries that need to inject both key and value lines.
174    prepend: VecDeque<Line<'input>>,
175    /// The pre-parsed next line, if any.
176    next: Option<Line<'input>>,
177    /// Position at the start of `remaining`.
178    remaining_pos: Pos,
179    /// Whether the next line to be parsed from `remaining` is a document-prefix
180    /// position — the first line of input or the first line after a `...`
181    /// document-end marker.  When `true`, `scan_line` strips a leading BOM per
182    /// YAML 1.2 §5.2 / production [202] `l-document-prefix`.
183    remaining_is_first: bool,
184    /// Lookahead buffer for [`Self::peek_until_dedent`].
185    lookahead: Vec<Line<'input>>,
186}
187
188impl<'input> LineBuffer<'input> {
189    /// Construct a new `LineBuffer` and prime the next-line slot.
190    #[must_use]
191    pub fn new(input: &'input str) -> Self {
192        let mut buf = Self {
193            remaining: input,
194            prepend: VecDeque::new(),
195            next: None,
196            remaining_pos: Pos::ORIGIN,
197            remaining_is_first: true,
198            lookahead: Vec::new(),
199        };
200        buf.prime();
201        buf
202    }
203
204    /// Prepend a synthetic line that will be returned by the next call to
205    /// [`Self::peek_next`] / [`Self::consume_next`], ahead of any real lines.
206    ///
207    /// Used to re-present inline content extracted from a sequence- or
208    /// mapping-entry line as if it were a separate line.  Multiple prepends
209    /// are supported: each call pushes to the front of the queue, so the last
210    /// prepended line is returned first (LIFO order).  Callers that need FIFO
211    /// order (key before value) should prepend value first, then key.
212    pub fn prepend_line(&mut self, line: Line<'input>) {
213        self.lookahead.clear();
214        self.prepend.push_front(line);
215    }
216
217    /// Look at the next line without consuming it.
218    ///
219    /// Returns the frontmost prepended synthetic line first (if any), then the
220    /// normally buffered next line.
221    #[must_use]
222    pub fn peek_next(&self) -> Option<&Line<'input>> {
223        self.prepend.front().or(self.next.as_ref())
224    }
225
226    /// Returns `true` if the next line comes from the prepend queue (synthetic),
227    /// rather than from the original input stream.
228    #[must_use]
229    pub fn is_next_synthetic(&self) -> bool {
230        !self.prepend.is_empty()
231    }
232
233    /// Convenience: the indent of the next line, without consuming it.
234    #[must_use]
235    pub fn peek_next_indent(&self) -> Option<usize> {
236        self.peek_next().map(|l| l.indent)
237    }
238
239    /// Peek at the second upcoming line without consuming either.
240    ///
241    /// Handles the prepend queue: the second line may come from the prepend
242    /// queue or from the primed `next` slot or from `remaining`.
243    #[must_use]
244    pub fn peek_second(&self) -> Option<Line<'input>> {
245        // Determine where the "first" line comes from, then find the "second".
246        if !self.prepend.is_empty() {
247            // First line is prepend[0]. Second is prepend[1] if it exists,
248            // else self.next.
249            if self.prepend.len() >= 2 {
250                return self.prepend.get(1).cloned();
251            }
252            return self.next.clone();
253        }
254        // First line is self.next. Second is the first line from `remaining`.
255        self.next.as_ref()?; // ensure first exists
256        scan_line(self.remaining, self.remaining_pos, self.remaining_is_first).map(|(line, _)| line)
257    }
258
259    /// Advance: return the currently primed next line and prime the following
260    /// one from the remaining input.  Returns `None` when no lines remain.
261    ///
262    /// Drains prepended synthetic lines (front-first) before the real buffer.
263    pub fn consume_next(&mut self) -> Option<Line<'input>> {
264        // Drain prepend queue front-first.
265        if let Some(line) = self.prepend.pop_front() {
266            return Some(line);
267        }
268        // Clear any cached lookahead — it was based on the old position.
269        self.lookahead.clear();
270        let line = self.next.take()?;
271        self.prime();
272        Some(line)
273    }
274
275    /// True when no more lines are available (buffer is empty, no prepend, and
276    /// input is exhausted).
277    #[must_use]
278    pub fn at_eof(&self) -> bool {
279        self.prepend.is_empty() && self.next.is_none()
280    }
281
282    /// Strip a leading BOM from the already-primed `next` line if present.
283    ///
284    /// Called after each blank-line-skip in the inter-document preamble
285    /// (`skip_blank_lines_between_docs`).  Per YAML 1.2 §5.2 / production [202]
286    /// `l-document-prefix = c-byte-order-mark? l-comment*`, a BOM is valid at
287    /// the start of any document prefix — not only at stream start.
288    ///
289    /// If `next` starts with U+FEFF, content, offset, and byte position are
290    /// advanced past the 3-byte UTF-8 encoding.  Only the first BOM is stripped;
291    /// a second consecutive BOM in the same line is left as illegal content.
292    pub fn signal_document_boundary(&mut self) {
293        // Strip at most one BOM from the already-primed next line.
294        if let Some(ref mut next) = self.next {
295            if next.content.starts_with('\u{FEFF}') {
296                let bom_len = '\u{FEFF}'.len_utf8(); // 3 bytes
297                next.content = &next.content[bom_len..];
298                next.offset += bom_len;
299                next.pos.byte_offset += bom_len;
300                // Column is unchanged: BOM is zero-width in column terms.
301            }
302        }
303        // Invalidate lookahead that may have peeked the unstripped BOM line.
304        self.lookahead.clear();
305    }
306
307    /// Scan forward without consuming to collect all lines with
308    /// `indent > base_indent`, stopping at the first line with
309    /// `indent <= base_indent`.  Blank lines (empty content) are transparent
310    /// to the scan and are included in the result regardless of their indent.
311    ///
312    /// Returns a slice of the buffered lookahead lines.  Calling this method
313    /// repeatedly (without consuming) returns the same slice.
314    ///
315    /// Note: trailing blank lines in the returned slice are **not** part of
316    /// the block scalar content — per YAML chomping rules, trailing blank
317    /// lines are stripped, clipped, or kept based on the chomping indicator.
318    /// The consumer (lexer, Task 8) is responsible for trimming them.
319    pub fn peek_until_dedent(&mut self, base_indent: usize) -> &[Line<'input>] {
320        // Rebuild the lookahead starting from the next line.
321        self.lookahead.clear();
322
323        // We need to scan from the next primed line plus additional lines
324        // from `remaining`.  Use a local cursor.
325        let mut cursor_remaining = self.remaining;
326        let mut cursor_pos = self.remaining_pos;
327        let mut cursor_is_first = self.remaining_is_first;
328
329        // The first line in the lookahead is `self.next` (if any).
330        // We include it if it is blank or its indent > base_indent.
331        let start_line = match self.next.as_ref() {
332            None => return &self.lookahead,
333            Some(l) => l.clone(),
334        };
335
336        // Process lines in order: start with `self.next`, then scan from
337        // `remaining`.
338        let mut scanning_next = Some(start_line);
339
340        loop {
341            let line = match scanning_next.take() {
342                Some(l) => l,
343                None => {
344                    // Fetch from remaining input.
345                    match scan_line(cursor_remaining, cursor_pos, cursor_is_first) {
346                        None => break,
347                        Some((l, rest)) => {
348                            cursor_pos = pos_after_line(&l);
349                            cursor_remaining = rest;
350                            cursor_is_first = false;
351                            l
352                        }
353                    }
354                }
355            };
356
357            // Blank lines (empty content) are transparent: include them and
358            // keep scanning.
359            if line.content.is_empty() {
360                self.lookahead.push(line);
361                continue;
362            }
363
364            // Stop before the first non-blank line that is dedented.
365            // base_indent == usize::MAX is the "root level" sentinel meaning
366            // no indent threshold — include all non-blank lines.
367            if base_indent != usize::MAX && line.indent <= base_indent {
368                break;
369            }
370
371            self.lookahead.push(line);
372        }
373
374        &self.lookahead
375    }
376
377    // -----------------------------------------------------------------------
378    // Private helpers
379    // -----------------------------------------------------------------------
380
381    /// Parse one more line from `remaining` into `self.next`.
382    fn prime(&mut self) {
383        match scan_line(self.remaining, self.remaining_pos, self.remaining_is_first) {
384            None => {
385                self.next = None;
386            }
387            Some((line, rest)) => {
388                // Advance `remaining_pos` past the line we just parsed.
389                let new_pos = pos_after_line(&line);
390                self.remaining_pos = new_pos;
391                self.remaining = rest;
392                self.remaining_is_first = false;
393                self.next = Some(line);
394            }
395        }
396    }
397}
398
399/// Compute the `Pos` immediately after the terminator of `line`.
400///
401/// O(1) for `Lf`/`Cr`/`CrLf` — the next line is at `line+1, column=0`.
402/// O(content) for `Eof` — the final line has no terminator, so position stays
403/// on the same line; column advances by the char count of the content via the
404/// ASCII fast path in [`crate::pos::column_at`].
405pub fn pos_after_line(line: &Line<'_>) -> Pos {
406    let byte_offset = line.offset + line.content.len() + line.break_type.byte_len();
407    match line.break_type {
408        BreakType::Eof => Pos {
409            byte_offset,
410            line: line.pos.line,
411            column: line.pos.column + crate::pos::column_at(line.content, line.content.len()),
412        },
413        BreakType::Lf | BreakType::Cr | BreakType::CrLf => Pos {
414            byte_offset,
415            line: line.pos.line + 1,
416            column: 0,
417        },
418    }
419}
420
421// ---------------------------------------------------------------------------
422// Tests
423// ---------------------------------------------------------------------------
424
425#[cfg(test)]
426mod tests {
427    use rstest::rstest;
428
429    use super::*;
430
431    // -----------------------------------------------------------------------
432    // BreakType::advance
433    // -----------------------------------------------------------------------
434
435    #[rstest]
436    #[case::break_type_advance_lf(BreakType::Lf, Pos::ORIGIN, 1, 2, 0)]
437    #[case::break_type_advance_crlf(BreakType::CrLf, Pos::ORIGIN, 2, 2, 0)]
438    // \r = 1 byte, \n = 1 byte → 2 bytes total for CrLf
439    #[case::break_type_advance_lf_at_non_origin_pos(BreakType::Lf, Pos { byte_offset: 5, line: 2, column: 3 }, 6, 3, 0)]
440    #[case::break_type_advance_crlf_at_non_origin_pos(BreakType::CrLf, Pos { byte_offset: 5, line: 2, column: 3 }, 7, 3, 0)]
441    #[case::break_type_advance_cr_resets_column(BreakType::Cr, Pos { byte_offset: 3, line: 1, column: 3 }, 4, 2, 0)]
442    fn break_type_advance_all_fields(
443        #[case] break_type: BreakType,
444        #[case] input: Pos,
445        #[case] expected_byte_offset: usize,
446        #[case] expected_line: usize,
447        #[case] expected_column: usize,
448    ) {
449        let after = break_type.advance(input);
450        assert_eq!(after.byte_offset, expected_byte_offset);
451        assert_eq!(after.line, expected_line);
452        assert_eq!(after.column, expected_column);
453    }
454
455    #[test]
456    fn break_type_advance_cr_increments_line() {
457        let pos = Pos::ORIGIN;
458        let after = BreakType::Cr.advance(pos);
459        assert_eq!(after.line, 2);
460    }
461
462    #[test]
463    fn break_type_advance_eof_is_noop() {
464        let pos = Pos {
465            byte_offset: 5,
466            line: 3,
467            column: 2,
468        };
469        let after = BreakType::Eof.advance(pos);
470        assert_eq!(after, pos);
471    }
472
473    // -----------------------------------------------------------------------
474    // new and initial state
475    // -----------------------------------------------------------------------
476
477    #[rstest]
478    #[case::new_single_line_with_lf_primes_first_line("foo\n", "foo", BreakType::Lf)]
479    #[case::new_input_with_only_lf_primes_empty_line("\n", "", BreakType::Lf)]
480    fn new_single_line_peek(
481        #[case] input: &str,
482        #[case] expected_content: &str,
483        #[case] expected_break: BreakType,
484    ) {
485        let buf = LineBuffer::new(input);
486        let Some(line) = buf.peek_next() else {
487            unreachable!("expected a line");
488        };
489        assert_eq!(line.content, expected_content);
490        assert_eq!(line.break_type, expected_break);
491    }
492
493    #[test]
494    fn new_empty_input_at_eof_immediately() {
495        let buf = LineBuffer::new("");
496        assert!(buf.peek_next().is_none());
497        assert!(buf.at_eof());
498    }
499
500    #[test]
501    fn new_single_line_no_newline_primes_eof_line() {
502        let buf = LineBuffer::new("foo");
503        let Some(line) = buf.peek_next() else {
504            unreachable!("expected a line");
505        };
506        assert_eq!(line.content, "foo");
507        assert_eq!(line.break_type, BreakType::Eof);
508        assert_eq!(line.offset, 0);
509    }
510
511    // -----------------------------------------------------------------------
512    // consume_next sequencing
513    // -----------------------------------------------------------------------
514
515    #[test]
516    fn consume_returns_primed_line_and_advances() {
517        let mut buf = LineBuffer::new("a\nb\n");
518        let Some(first) = buf.consume_next() else {
519            unreachable!("expected first line");
520        };
521        assert_eq!(first.content, "a");
522        assert_eq!(first.break_type, BreakType::Lf);
523        let Some(second) = buf.consume_next() else {
524            unreachable!("expected second line");
525        };
526        assert_eq!(second.content, "b");
527        assert_eq!(second.break_type, BreakType::Lf);
528    }
529
530    #[test]
531    fn consume_after_last_line_returns_none() {
532        let mut buf = LineBuffer::new("foo");
533        assert!(buf.consume_next().is_some());
534        assert!(buf.consume_next().is_none());
535    }
536
537    #[test]
538    fn at_eof_false_before_consuming_last_and_true_after() {
539        let mut buf = LineBuffer::new("foo");
540        assert!(!buf.at_eof());
541        buf.consume_next();
542        assert!(buf.at_eof());
543    }
544
545    #[test]
546    fn consume_all_lines_then_peek_returns_none() {
547        let mut buf = LineBuffer::new("a\nb");
548        buf.consume_next();
549        buf.consume_next();
550        assert!(buf.peek_next().is_none());
551    }
552
553    // -----------------------------------------------------------------------
554    // line terminator types
555    // -----------------------------------------------------------------------
556
557    #[rstest]
558    #[case::only_lf_produces_one_empty_line("\n", BreakType::Lf)]
559    #[case::only_cr_produces_one_empty_line("\r", BreakType::Cr)]
560    #[case::only_crlf_produces_one_empty_line_not_two("\r\n", BreakType::CrLf)]
561    fn single_terminator_produces_empty_line(
562        #[case] input: &str,
563        #[case] expected_break: BreakType,
564    ) {
565        let mut buf = LineBuffer::new(input);
566        let Some(line) = buf.consume_next() else {
567            unreachable!("expected a line");
568        };
569        assert_eq!(line.content, "");
570        assert_eq!(line.break_type, expected_break);
571        assert!(buf.consume_next().is_none());
572    }
573
574    #[test]
575    fn lf_terminator_produces_lf_break_type() {
576        let mut buf = LineBuffer::new("a\n");
577        let Some(line) = buf.consume_next() else {
578            unreachable!("expected a line");
579        };
580        assert_eq!(line.break_type, BreakType::Lf);
581    }
582
583    #[test]
584    fn crlf_terminator_produces_crlf_break_type_not_two_lines() {
585        let mut buf = LineBuffer::new("a\r\nb");
586        let Some(first) = buf.consume_next() else {
587            unreachable!("expected first");
588        };
589        assert_eq!(first.content, "a");
590        assert_eq!(first.break_type, BreakType::CrLf);
591        let Some(second) = buf.consume_next() else {
592            unreachable!("expected second");
593        };
594        assert_eq!(second.content, "b");
595        assert_eq!(second.break_type, BreakType::Eof);
596        assert!(buf.consume_next().is_none());
597    }
598
599    #[test]
600    fn bare_cr_terminator_produces_cr_break_type() {
601        let mut buf = LineBuffer::new("a\rb");
602        let Some(first) = buf.consume_next() else {
603            unreachable!("expected first");
604        };
605        assert_eq!(first.content, "a");
606        assert_eq!(first.break_type, BreakType::Cr);
607        let Some(second) = buf.consume_next() else {
608            unreachable!("expected second");
609        };
610        assert_eq!(second.content, "b");
611        assert_eq!(second.break_type, BreakType::Eof);
612    }
613
614    #[test]
615    fn no_terminator_on_last_line_produces_eof_break_type() {
616        let mut buf = LineBuffer::new("a\nb");
617        buf.consume_next();
618        let Some(second) = buf.consume_next() else {
619            unreachable!("expected second");
620        };
621        assert_eq!(second.content, "b");
622        assert_eq!(second.break_type, BreakType::Eof);
623    }
624
625    #[test]
626    fn mixed_line_endings_each_line_has_correct_break_type() {
627        let mut buf = LineBuffer::new("a\nb\r\nc\rd");
628        let types: Vec<BreakType> = (0..4)
629            .filter_map(|_| buf.consume_next().map(|l| l.break_type))
630            .collect();
631        assert_eq!(
632            types,
633            [
634                BreakType::Lf,
635                BreakType::CrLf,
636                BreakType::Cr,
637                BreakType::Eof
638            ]
639        );
640    }
641
642    #[test]
643    fn two_consecutive_lf_produce_two_empty_lines() {
644        let mut buf = LineBuffer::new("\n\n");
645        let Some(first) = buf.consume_next() else {
646            unreachable!("expected first");
647        };
648        assert_eq!(first.content, "");
649        assert_eq!(first.break_type, BreakType::Lf);
650        let Some(second) = buf.consume_next() else {
651            unreachable!("expected second");
652        };
653        assert_eq!(second.content, "");
654        assert_eq!(second.break_type, BreakType::Lf);
655        assert!(buf.consume_next().is_none());
656    }
657
658    #[test]
659    fn trailing_lf_does_not_produce_extra_empty_line() {
660        // A trailing newline terminates the last line; it does not introduce
661        // a new empty line.
662        let mut buf = LineBuffer::new("foo\n");
663        let Some(line) = buf.consume_next() else {
664            unreachable!("expected a line");
665        };
666        assert_eq!(line.content, "foo");
667        assert!(buf.consume_next().is_none());
668    }
669
670    // -----------------------------------------------------------------------
671    // offset and Pos tracking
672    // -----------------------------------------------------------------------
673
674    #[rstest]
675    #[case::pos_line_increments_after_bare_cr("a\rb")]
676    #[case::pos_line_increments_after_crlf("a\r\nb")]
677    fn pos_line_increments_after_terminator(#[case] input: &str) {
678        let mut buf = LineBuffer::new(input);
679        let Some(first) = buf.consume_next() else {
680            unreachable!("expected first");
681        };
682        assert_eq!(first.pos.line, 1);
683        let Some(second) = buf.consume_next() else {
684            unreachable!("expected second");
685        };
686        assert_eq!(second.pos.line, 2);
687        assert_eq!(second.pos.column, 0);
688    }
689
690    #[test]
691    fn offset_is_byte_offset_of_content_start() {
692        let mut buf = LineBuffer::new("foo\nbar\n");
693        let Some(first) = buf.consume_next() else {
694            unreachable!("expected first");
695        };
696        assert_eq!(first.offset, 0);
697        let Some(second) = buf.consume_next() else {
698            unreachable!("expected second");
699        };
700        assert_eq!(second.offset, 4); // "foo\n" = 4 bytes
701    }
702
703    #[test]
704    fn offset_and_pos_byte_offset_agree() {
705        let mut buf = LineBuffer::new("foo\nbar");
706        while let Some(line) = buf.consume_next() {
707            assert_eq!(line.offset, line.pos.byte_offset);
708        }
709    }
710
711    #[test]
712    fn pos_line_number_increments_per_line() {
713        let mut buf = LineBuffer::new("a\nb\nc");
714        let lines: Vec<Line<'_>> = (0..3).filter_map(|_| buf.consume_next()).collect();
715        assert_eq!(lines.len(), 3, "expected 3 lines");
716        assert_eq!(lines.first().map(|l| l.pos.line), Some(1));
717        assert_eq!(lines.get(1).map(|l| l.pos.line), Some(2));
718        assert_eq!(lines.get(2).map(|l| l.pos.line), Some(3));
719    }
720
721    #[test]
722    fn pos_column_is_zero_at_start_of_each_line() {
723        let mut buf = LineBuffer::new("a\nb");
724        while let Some(line) = buf.consume_next() {
725            assert_eq!(line.pos.column, 0);
726        }
727    }
728
729    #[test]
730    fn pos_column_resets_after_bare_cr() {
731        // After consuming a line that ends with bare \r, the next line's
732        // column must be 0, not the column that followed the last content char.
733        let mut buf = LineBuffer::new("abc\rd");
734        buf.consume_next(); // consume "abc"
735        let Some(second) = buf.consume_next() else {
736            unreachable!("expected second");
737        };
738        assert_eq!(second.pos.column, 0);
739    }
740
741    #[test]
742    fn pos_after_mixed_endings_tracks_lines_correctly() {
743        // Input has four lines with three different terminator types.
744        let mut buf = LineBuffer::new("a\nb\r\nc\rd");
745        let lines: Vec<Line<'_>> = (0..4).filter_map(|_| buf.consume_next()).collect();
746        assert_eq!(lines.len(), 4, "expected 4 lines");
747        let line_nums: Vec<usize> = lines.iter().map(|l| l.pos.line).collect();
748        assert_eq!(line_nums, [1, 2, 3, 4]);
749        for line in &lines {
750            assert_eq!(
751                line.pos.column, 0,
752                "line {} should start at column 0",
753                line.pos.line
754            );
755        }
756    }
757
758    #[test]
759    fn multibyte_content_byte_offset_is_byte_based_not_char_based() {
760        // '中' is 3 UTF-8 bytes
761        let mut buf = LineBuffer::new("中\nfoo");
762        let Some(first) = buf.consume_next() else {
763            unreachable!("expected first");
764        };
765        assert_eq!(first.offset, 0);
766        assert_eq!(first.content, "中");
767        let Some(second) = buf.consume_next() else {
768            unreachable!("expected second");
769        };
770        // 3 bytes for '中' + 1 byte for '\n' = 4
771        assert_eq!(second.offset, 4);
772    }
773
774    // -----------------------------------------------------------------------
775    // BOM handling
776    // -----------------------------------------------------------------------
777
778    #[test]
779    fn bom_is_stripped_from_content_of_first_line() {
780        let input = "\u{FEFF}foo\n";
781        let buf = LineBuffer::new(input);
782        let Some(line) = buf.peek_next() else {
783            unreachable!("expected a line");
784        };
785        assert_eq!(line.content, "foo");
786    }
787
788    #[test]
789    fn bom_stripped_line_offset_starts_after_bom_bytes() {
790        let input = "\u{FEFF}foo\n";
791        let buf = LineBuffer::new(input);
792        let Some(line) = buf.peek_next() else {
793            unreachable!("expected a line");
794        };
795        // BOM is U+FEFF = 3 bytes in UTF-8
796        assert_eq!(line.offset, 3);
797        assert_eq!(line.pos.byte_offset, 3);
798    }
799
800    #[test]
801    fn bom_not_stripped_on_non_boundary_mid_content_line() {
802        // A BOM in a non-first, non-boundary line is preserved as data —
803        // `signal_document_boundary` was never called, so it is an error.
804        let input = "foo\n\u{FEFF}bar\n";
805        let mut buf = LineBuffer::new(input);
806        buf.consume_next(); // consume "foo"
807        let Some(second) = buf.consume_next() else {
808            unreachable!("expected second");
809        };
810        assert_eq!(second.content, "\u{FEFF}bar");
811    }
812
813    #[test]
814    fn bom_stripped_after_document_boundary_signal() {
815        // After signal_document_boundary(), the primed next line has its
816        // leading BOM stripped.
817        let input = "foo\n\u{FEFF}bar\n";
818        let mut buf = LineBuffer::new(input);
819        buf.consume_next(); // consume "foo"; primes "\u{FEFF}bar"
820        buf.signal_document_boundary();
821        let Some(second) = buf.peek_next() else {
822            unreachable!("expected second");
823        };
824        assert_eq!(second.content, "bar");
825        assert_eq!(second.offset, 4 + 3); // "foo\n" = 4 bytes + 3-byte BOM
826        assert_eq!(second.pos.byte_offset, 4 + 3);
827    }
828
829    #[test]
830    #[expect(clippy::expect_used, reason = "test code")]
831    fn signal_document_boundary_strips_bom_from_primed_next_line() {
832        // signal_document_boundary() strips the BOM from the already-primed
833        // next line only.  Subsequent lines are not affected — the signal is
834        // a one-shot strip of the primed next slot.
835        let input = "...\n\u{FEFF}doc1\n\u{FEFF}doc2\n";
836        let mut buf = LineBuffer::new(input);
837        buf.consume_next(); // consume "..."; primes "\u{FEFF}doc1" into next
838
839        buf.signal_document_boundary();
840
841        // The already-primed next line has its BOM stripped.
842        let first = buf.consume_next().expect("first line");
843        assert_eq!(
844            first.content, "doc1",
845            "BOM stripped from primed next by signal"
846        );
847
848        // The following line was scanned by prime() with remaining_is_first=false,
849        // so its BOM is NOT stripped — it is illegal content (as in a real stream,
850        // signal_document_boundary would be called again for the next boundary).
851        let second = buf.peek_next().expect("second line");
852        assert_eq!(
853            second.content, "\u{FEFF}doc2",
854            "BOM on subsequent line preserved — not affected by one-shot signal"
855        );
856    }
857
858    #[test]
859    fn bom_stripped_line_offset_correct_after_boundary_signal() {
860        // After signal_document_boundary(), offset and pos.byte_offset advance
861        // past the 3-byte BOM.
862        let input = "\u{FEFF}key: value\n";
863        let buf = LineBuffer::new(input);
864        // Stream start: BOM already stripped by remaining_is_first=true at new().
865        // Verify the offset is 3 (past the BOM).
866        let Some(line) = buf.peek_next() else {
867            unreachable!("expected line");
868        };
869        assert_eq!(line.offset, 3);
870        assert_eq!(line.pos.byte_offset, 3);
871        assert_eq!(line.content, "key: value");
872
873        // Now test via explicit signal on a second document.
874        let input2 = "...\n\u{FEFF}key: value\n";
875        let mut buf2 = LineBuffer::new(input2);
876        buf2.consume_next(); // consume "..."
877        buf2.signal_document_boundary();
878        let Some(line2) = buf2.peek_next() else {
879            unreachable!("expected line2");
880        };
881        // "...\n" is 4 bytes; BOM is 3 bytes → content starts at offset 7.
882        assert_eq!(line2.offset, 4 + 3);
883        assert_eq!(line2.pos.byte_offset, 4 + 3);
884        assert_eq!(line2.content, "key: value");
885    }
886
887    // -----------------------------------------------------------------------
888    // indent counting
889    // -----------------------------------------------------------------------
890
891    #[rstest]
892    #[case::indent_counts_only_leading_spaces("   foo", 3)]
893    #[case::indent_is_zero_for_no_leading_spaces("foo", 0)]
894    #[case::leading_tab_does_not_count_toward_indent("\tfoo", 0)]
895    #[case::tab_after_spaces_does_not_count("  \tfoo", 2)]
896    #[case::indent_of_blank_line_is_zero("\n", 0)]
897    fn indent_value(#[case] input: &str, #[case] expected: usize) {
898        let buf = LineBuffer::new(input);
899        let Some(line) = buf.peek_next() else {
900            unreachable!("expected a line");
901        };
902        assert_eq!(line.indent, expected);
903    }
904
905    #[test]
906    fn indent_of_spaces_only_line_equals_space_count() {
907        let buf = LineBuffer::new("   \n");
908        let Some(line) = buf.peek_next() else {
909            unreachable!("expected a line");
910        };
911        assert_eq!(line.indent, 3);
912        assert_eq!(line.content, "   ");
913    }
914
915    // -----------------------------------------------------------------------
916    // peek_next_indent
917    // -----------------------------------------------------------------------
918
919    #[rstest]
920    #[case::peek_next_indent_returns_indent_of_next_line("   foo", Some(3))]
921    #[case::peek_next_indent_returns_none_at_eof("", None)]
922    fn peek_next_indent_returns(#[case] input: &str, #[case] expected: Option<usize>) {
923        let buf = LineBuffer::new(input);
924        assert_eq!(buf.peek_next_indent(), expected);
925    }
926
927    #[test]
928    fn peek_next_indent_does_not_consume() {
929        let mut buf = LineBuffer::new("  foo");
930        assert_eq!(buf.peek_next_indent(), Some(2));
931        assert_eq!(buf.peek_next_indent(), Some(2));
932        let Some(line) = buf.consume_next() else {
933            unreachable!("expected a line");
934        };
935        assert_eq!(line.content, "  foo");
936    }
937
938    // -----------------------------------------------------------------------
939    // peek_until_dedent
940    // -----------------------------------------------------------------------
941
942    #[test]
943    fn peek_until_dedent_empty_input_returns_empty_slice() {
944        let mut buf = LineBuffer::new("");
945        assert!(buf.peek_until_dedent(0).is_empty());
946    }
947
948    #[test]
949    fn peek_until_dedent_returns_lines_until_indent_le_base() {
950        let mut buf = LineBuffer::new("  a\n  b\nc\n");
951        let lines = buf.peek_until_dedent(1);
952        assert_eq!(lines.len(), 2);
953        assert_eq!(lines.first().map(|l| l.content), Some("  a"));
954        assert_eq!(lines.get(1).map(|l| l.content), Some("  b"));
955    }
956
957    #[test]
958    fn peek_until_dedent_does_not_consume_lines() {
959        let mut buf = LineBuffer::new("  a\n  b\nc\n");
960        let _ = buf.peek_until_dedent(1);
961        let Some(first) = buf.consume_next() else {
962            unreachable!("expected first");
963        };
964        assert_eq!(first.content, "  a");
965    }
966
967    #[test]
968    fn peek_until_dedent_includes_all_lines_when_no_dedent_occurs() {
969        let mut buf = LineBuffer::new("  a\n  b\n  c");
970        let lines = buf.peek_until_dedent(1);
971        assert_eq!(lines.len(), 3);
972    }
973
974    #[test]
975    fn peek_until_dedent_returns_empty_slice_when_first_line_already_dedented() {
976        let mut buf = LineBuffer::new("a\n  b\n");
977        let lines = buf.peek_until_dedent(1);
978        // "a" has indent 0 <= 1, so stop immediately
979        assert!(lines.is_empty());
980    }
981
982    #[test]
983    fn peek_until_dedent_second_call_returns_same_slice() {
984        let mut buf = LineBuffer::new("  a\n  b\nc");
985        let first_call: Vec<String> = buf
986            .peek_until_dedent(1)
987            .iter()
988            .map(|l| l.content.to_owned())
989            .collect();
990        let second_call: Vec<String> = buf
991            .peek_until_dedent(1)
992            .iter()
993            .map(|l| l.content.to_owned())
994            .collect();
995        assert_eq!(first_call, second_call);
996        assert_eq!(first_call, ["  a", "  b"]);
997    }
998
999    #[test]
1000    fn peek_until_dedent_base_zero_stops_at_non_indented_lines() {
1001        // base_indent=0: stop at lines with indent <= 0 (i.e., indent == 0).
1002        // Both lines here have indent > 0, so all are included.
1003        let mut buf = LineBuffer::new("  a\n  b\n");
1004        let lines = buf.peek_until_dedent(0);
1005        assert_eq!(lines.len(), 2);
1006    }
1007
1008    #[test]
1009    fn peek_until_dedent_blank_lines_are_transparent() {
1010        // Blank lines (empty content) are transparent: they are included in
1011        // the result and do not halt the scan.
1012        // "  a" (indent 2 > 1) -> included
1013        // ""    (blank)         -> transparent, included
1014        // "  b" (indent 2 > 1) -> included
1015        // "c"   (indent 0 <= 1) -> stop
1016        let mut buf = LineBuffer::new("  a\n\n  b\nc");
1017        let lines = buf.peek_until_dedent(1);
1018        assert_eq!(lines.len(), 3);
1019        assert_eq!(lines.first().map(|l| l.content), Some("  a"));
1020        assert_eq!(lines.get(1).map(|l| l.content), Some(""));
1021        assert_eq!(lines.get(2).map(|l| l.content), Some("  b"));
1022    }
1023
1024    // -----------------------------------------------------------------------
1025    // pos_after_line
1026    // -----------------------------------------------------------------------
1027
1028    #[rstest]
1029    #[case::pos_after_line_lf_ascii(Line { content: "hello", offset: 0, indent: 0, break_type: BreakType::Lf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 6, 2, 0)]
1030    #[case::pos_after_line_lf_empty_content(Line { content: "", offset: 10, indent: 0, break_type: BreakType::Lf, pos: Pos { byte_offset: 10, line: 3, column: 0 } }, 11, 4, 0)]
1031    #[case::pos_after_line_lf_multibyte(Line { content: "日本", offset: 0, indent: 0, break_type: BreakType::Lf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 7, 2, 0)]
1032    // 6 bytes + 1 for \n = 7
1033    #[case::pos_after_line_cr_ascii(Line { content: "abc", offset: 0, indent: 0, break_type: BreakType::Cr, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 4, 2, 0)]
1034    #[case::pos_after_line_cr_empty_content(Line { content: "", offset: 5, indent: 0, break_type: BreakType::Cr, pos: Pos { byte_offset: 5, line: 2, column: 0 } }, 6, 3, 0)]
1035    #[case::pos_after_line_crlf_ascii(Line { content: "key: val", offset: 0, indent: 0, break_type: BreakType::CrLf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 10, 2, 0)]
1036    #[case::pos_after_line_crlf_empty_content(Line { content: "", offset: 0, indent: 0, break_type: BreakType::CrLf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 2, 2, 0)]
1037    #[case::pos_after_line_eof_empty_content(Line { content: "", offset: 20, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 20, line: 5, column: 0 } }, 20, 5, 0)]
1038    #[case::pos_after_line_eof_ascii(Line { content: "last", offset: 10, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 10, line: 3, column: 0 } }, 14, 3, 4)]
1039    #[case::pos_after_line_eof_ascii_nonzero_start_column(Line { content: "end", offset: 7, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 7, line: 2, column: 5 } }, 10, 2, 8)]
1040    #[case::pos_after_line_eof_multibyte(Line { content: "日本語", offset: 0, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 9, 1, 3)]
1041    #[case::pos_after_line_eof_mixed_content(Line { content: "ab日", offset: 0, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 5, 1, 3)]
1042    fn pos_after_line_cases(
1043        #[case] line: Line<'static>,
1044        #[case] expected_byte_offset: usize,
1045        #[case] expected_line: usize,
1046        #[case] expected_column: usize,
1047    ) {
1048        let result = pos_after_line(&line);
1049        assert_eq!(result.byte_offset, expected_byte_offset);
1050        assert_eq!(result.line, expected_line);
1051        assert_eq!(result.column, expected_column);
1052    }
1053}