Skip to main content

rlsp_yaml_parser/
lines.rs

1// SPDX-License-Identifier: MIT
2
3//! Line-at-a-time buffer with one-line lookahead for the streaming parser.
4//!
5//! `LineBuffer` wraps an `&'input str` and yields one [`Line`] at a time,
6//! always keeping the *next* line primed in an internal slot so callers can
7//! peek at the next line's indent without consuming it.  It never scans the
8//! full input up front, giving O(1) first-event latency.
9
10use std::collections::VecDeque;
11
12use crate::pos::Pos;
13
14// ---------------------------------------------------------------------------
15// Public types
16// ---------------------------------------------------------------------------
17
/// How a [`Line`] ends: one of the three recognised line breaks, or the end
/// of the input.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BreakType {
    /// A single line feed, `\n`.
    Lf,
    /// A lone carriage return, `\r`, that is *not* followed by `\n`.
    Cr,
    /// The two-character sequence `\r\n`, treated as one break.
    CrLf,
    /// No terminator at all — the line runs up to the end of the input.
    Eof,
}
30
31impl BreakType {
32    /// Byte length of this line terminator (0 for Eof).
33    #[must_use]
34    pub const fn byte_len(self) -> usize {
35        match self {
36            Self::Lf | Self::Cr => 1,
37            Self::CrLf => 2,
38            Self::Eof => 0,
39        }
40    }
41
42    /// Advance `pos` past this line break.
43    ///
44    /// Each break type requires distinct logic because `Pos::advance(char)`
45    /// operates on individual characters and cannot distinguish bare `\r`
46    /// from `\r\n`.
47    #[must_use]
48    pub const fn advance(self, mut pos: Pos) -> Pos {
49        match self {
50            Self::Lf => pos.advance('\n'),
51            Self::CrLf => {
52                pos.byte_offset += '\r'.len_utf8();
53                pos.advance('\n')
54            }
55            Self::Cr => {
56                pos.byte_offset += '\r'.len_utf8();
57                pos.line += 1;
58                pos.column = 0;
59                pos
60            }
61            Self::Eof => pos,
62        }
63    }
64}
65
66/// A single logical line extracted from the input.
67#[derive(Debug, Clone, PartialEq, Eq)]
68pub struct Line<'input> {
69    /// The line content slice, **excluding** the terminator.
70    pub content: &'input str,
71    /// Byte offset of `content` within the original input string.
72    pub offset: usize,
73    /// Number of leading `SPACE` (`\x20`) characters.  Leading tabs do not
74    /// contribute to indent — they are a YAML syntax error in indentation
75    /// context and are reported by the lexer, not here.
76    pub indent: usize,
77    /// The terminator that ends this line.
78    pub break_type: BreakType,
79    /// Position of the first byte of this line (after BOM stripping when applicable).
80    pub pos: Pos,
81}
82
83// ---------------------------------------------------------------------------
84// Internal helpers
85// ---------------------------------------------------------------------------
86
87/// Detect the line break at the start of `s` and return `(BreakType, rest)`.
88///
89/// CRLF is checked first so that `\r\n` is consumed as a unit rather than
90/// treating `\r` as a bare CR.
91fn detect_break(s: &str) -> (BreakType, &str) {
92    if let Some(rest) = s.strip_prefix("\r\n") {
93        return (BreakType::CrLf, rest);
94    }
95    if let Some(rest) = s.strip_prefix('\r') {
96        return (BreakType::Cr, rest);
97    }
98    if let Some(rest) = s.strip_prefix('\n') {
99        return (BreakType::Lf, rest);
100    }
101    (BreakType::Eof, s)
102}
103
104/// Scan one line from `remaining`, starting at `pos`.
105///
106/// Returns `Some((line, rest))` or `None` if `remaining` is empty.
107fn scan_line(remaining: &str, pos: Pos) -> Option<(Line<'_>, &str)> {
108    if remaining.is_empty() {
109        return None;
110    }
111
112    // Find the end of line content (position of the first \n or \r).
113    let line_end = remaining.find(['\n', '\r']).unwrap_or(remaining.len());
114
115    let content = &remaining[..line_end];
116    let after_content = &remaining[line_end..];
117
118    // Determine break type and advance past the terminator.
119    // Try CRLF first (must be checked before bare CR).
120    let (break_type, after_break) = detect_break(after_content);
121
122    // Count leading SPACE characters only (tabs do not count).
123    let indent = content.chars().take_while(|&ch| ch == ' ').count();
124
125    // `offset` is the byte offset of `content` within the original input.
126    let offset = pos.byte_offset;
127
128    let line = Line {
129        content,
130        offset,
131        indent,
132        break_type,
133        pos,
134    };
135
136    Some((line, after_break))
137}
138
139// ---------------------------------------------------------------------------
140// LineBuffer
141// ---------------------------------------------------------------------------
142
143/// A one-line-lookahead buffer over a `&'input str`.
144///
145/// Always holds the *next* line pre-parsed.  Callers use [`Self::peek_next`]
146/// to inspect without consuming and [`Self::consume_next`] to advance.
147pub struct LineBuffer<'input> {
148    /// Remaining unparsed input (past the next line's terminator).
149    remaining: &'input str,
150    /// Synthetic lines prepended by the caller (e.g. inline content extracted
151    /// from a sequence- or mapping-entry line).  Drained front-first before
152    /// `next`.  A `VecDeque` supports multiple pending prepends when parsing
153    /// implicit mapping entries that need to inject both key and value lines.
154    prepend: VecDeque<Line<'input>>,
155    /// The pre-parsed next line, if any.
156    next: Option<Line<'input>>,
157    /// Position at the start of `remaining`.
158    remaining_pos: Pos,
159    /// Lookahead buffer for [`Self::peek_until_dedent`].
160    lookahead: Vec<Line<'input>>,
161}
162
163impl<'input> LineBuffer<'input> {
164    /// Construct a new `LineBuffer` and prime the next-line slot.
165    #[must_use]
166    pub fn new(input: &'input str) -> Self {
167        let mut buf = Self {
168            remaining: input,
169            prepend: VecDeque::new(),
170            next: None,
171            remaining_pos: Pos::ORIGIN,
172            lookahead: Vec::new(),
173        };
174        buf.prime();
175        buf
176    }
177
178    /// Prepend a synthetic line that will be returned by the next call to
179    /// [`Self::peek_next`] / [`Self::consume_next`], ahead of any real lines.
180    ///
181    /// Used to re-present inline content extracted from a sequence- or
182    /// mapping-entry line as if it were a separate line.  Multiple prepends
183    /// are supported: each call pushes to the front of the queue, so the last
184    /// prepended line is returned first (LIFO order).  Callers that need FIFO
185    /// order (key before value) should prepend value first, then key.
186    pub fn prepend_line(&mut self, line: Line<'input>) {
187        self.lookahead.clear();
188        self.prepend.push_front(line);
189    }
190
191    /// Look at the next line without consuming it.
192    ///
193    /// Returns the frontmost prepended synthetic line first (if any), then the
194    /// normally buffered next line.
195    #[must_use]
196    pub fn peek_next(&self) -> Option<&Line<'input>> {
197        self.prepend.front().or(self.next.as_ref())
198    }
199
200    /// Returns `true` if the next line comes from the prepend queue (synthetic),
201    /// rather than from the original input stream.
202    #[must_use]
203    pub fn is_next_synthetic(&self) -> bool {
204        !self.prepend.is_empty()
205    }
206
207    /// Convenience: the indent of the next line, without consuming it.
208    #[must_use]
209    pub fn peek_next_indent(&self) -> Option<usize> {
210        self.peek_next().map(|l| l.indent)
211    }
212
213    /// Peek at the second upcoming line without consuming either.
214    ///
215    /// Handles the prepend queue: the second line may come from the prepend
216    /// queue or from the primed `next` slot or from `remaining`.
217    #[must_use]
218    pub fn peek_second(&self) -> Option<Line<'input>> {
219        // Determine where the "first" line comes from, then find the "second".
220        if !self.prepend.is_empty() {
221            // First line is prepend[0]. Second is prepend[1] if it exists,
222            // else self.next.
223            if self.prepend.len() >= 2 {
224                return self.prepend.get(1).cloned();
225            }
226            return self.next.clone();
227        }
228        // First line is self.next. Second is the first line from `remaining`.
229        self.next.as_ref()?; // ensure first exists
230        scan_line(self.remaining, self.remaining_pos).map(|(line, _)| line)
231    }
232
233    /// Advance: return the currently primed next line and prime the following
234    /// one from the remaining input.  Returns `None` when no lines remain.
235    ///
236    /// Drains prepended synthetic lines (front-first) before the real buffer.
237    pub fn consume_next(&mut self) -> Option<Line<'input>> {
238        // Drain prepend queue front-first.
239        if let Some(line) = self.prepend.pop_front() {
240            return Some(line);
241        }
242        // Clear any cached lookahead — it was based on the old position.
243        self.lookahead.clear();
244        let line = self.next.take()?;
245        self.prime();
246        Some(line)
247    }
248
249    /// True when no more lines are available (buffer is empty, no prepend, and
250    /// input is exhausted).
251    #[must_use]
252    pub fn at_eof(&self) -> bool {
253        self.prepend.is_empty() && self.next.is_none()
254    }
255
256    /// Strip a leading BOM from the already-primed `next` line if present.
257    ///
258    /// This is the **sole BOM-strip site**.  It must be called at every document
259    /// prefix position — including stream start and each position after a `...`
260    /// document-end marker.  Per YAML 1.2 §5.2 / production [202]
261    /// `l-document-prefix = c-byte-order-mark? l-comment*`, a BOM is valid at
262    /// the start of any document prefix.
263    ///
264    /// If `next` starts with U+FEFF, content, offset, and byte position are
265    /// advanced past the 3-byte UTF-8 encoding.  Only the first BOM is stripped;
266    /// a second consecutive BOM in the same line is left as illegal content.
267    ///
268    /// Rationale: a BOM inside document body content (not at a document boundary)
269    /// is illegal per §5.2 and should be surfaced as a parse error, not silently
270    /// consumed.  Centralising stripping here ensures the lexer sees a BOM-free
271    /// first byte at every valid boundary position, and sees a raw `U+FEFF` (which
272    /// fails `c-printable`) everywhere else.
273    pub fn signal_document_boundary(&mut self) {
274        // Strip at most one BOM from the already-primed next line.
275        if let Some(ref mut next) = self.next {
276            if next.content.starts_with('\u{FEFF}') {
277                let bom_len = '\u{FEFF}'.len_utf8(); // 3 bytes
278                next.content = &next.content[bom_len..];
279                next.offset += bom_len;
280                next.pos.byte_offset += bom_len;
281                // Column is unchanged: BOM is zero-width in column terms.
282            }
283        }
284        // Invalidate lookahead that may have peeked the unstripped BOM line.
285        self.lookahead.clear();
286    }
287
288    /// Scan forward without consuming to collect all lines with
289    /// `indent > base_indent`, stopping at the first line with
290    /// `indent <= base_indent`.  Blank lines (empty content) are transparent
291    /// to the scan and are included in the result regardless of their indent.
292    ///
293    /// Returns a slice of the buffered lookahead lines.  Calling this method
294    /// repeatedly (without consuming) returns the same slice.
295    ///
296    /// Note: trailing blank lines in the returned slice are **not** part of
297    /// the block scalar content — per YAML chomping rules, trailing blank
298    /// lines are stripped, clipped, or kept based on the chomping indicator.
299    /// The consumer (lexer, Task 8) is responsible for trimming them.
300    pub fn peek_until_dedent(&mut self, base_indent: usize) -> &[Line<'input>] {
301        // Rebuild the lookahead starting from the next line.
302        self.lookahead.clear();
303
304        // We need to scan from the next primed line plus additional lines
305        // from `remaining`.  Use a local cursor.
306        let mut cursor_remaining = self.remaining;
307        let mut cursor_pos = self.remaining_pos;
308
309        // The first line in the lookahead is `self.next` (if any).
310        // We include it if it is blank or its indent > base_indent.
311        let start_line = match self.next.as_ref() {
312            None => return &self.lookahead,
313            Some(l) => l.clone(),
314        };
315
316        // Process lines in order: start with `self.next`, then scan from
317        // `remaining`.
318        let mut scanning_next = Some(start_line);
319
320        loop {
321            let line = match scanning_next.take() {
322                Some(l) => l,
323                None => {
324                    // Fetch from remaining input.
325                    match scan_line(cursor_remaining, cursor_pos) {
326                        None => break,
327                        Some((l, rest)) => {
328                            cursor_pos = pos_after_line(&l);
329                            cursor_remaining = rest;
330                            l
331                        }
332                    }
333                }
334            };
335
336            // Blank lines (empty content) are transparent: include them and
337            // keep scanning.
338            if line.content.is_empty() {
339                self.lookahead.push(line);
340                continue;
341            }
342
343            // Stop before the first non-blank line that is dedented.
344            // base_indent == usize::MAX is the "root level" sentinel meaning
345            // no indent threshold — include all non-blank lines.
346            if base_indent != usize::MAX && line.indent <= base_indent {
347                break;
348            }
349
350            self.lookahead.push(line);
351        }
352
353        &self.lookahead
354    }
355
356    // -----------------------------------------------------------------------
357    // Private helpers
358    // -----------------------------------------------------------------------
359
360    /// Parse one more line from `remaining` into `self.next`.
361    fn prime(&mut self) {
362        match scan_line(self.remaining, self.remaining_pos) {
363            None => {
364                self.next = None;
365            }
366            Some((line, rest)) => {
367                // Advance `remaining_pos` past the line we just parsed.
368                let new_pos = pos_after_line(&line);
369                self.remaining_pos = new_pos;
370                self.remaining = rest;
371                self.next = Some(line);
372            }
373        }
374    }
375}
376
377/// Compute the `Pos` immediately after the terminator of `line`.
378///
379/// O(1) for `Lf`/`Cr`/`CrLf` — the next line is at `line+1, column=0`.
380/// O(content) for `Eof` — the final line has no terminator, so position stays
381/// on the same line; column advances by the char count of the content via the
382/// ASCII fast path in [`crate::pos::column_at`].
383pub fn pos_after_line(line: &Line<'_>) -> Pos {
384    let byte_offset = line.offset + line.content.len() + line.break_type.byte_len();
385    match line.break_type {
386        BreakType::Eof => Pos {
387            byte_offset,
388            line: line.pos.line,
389            column: line.pos.column + crate::pos::column_at(line.content, line.content.len()),
390        },
391        BreakType::Lf | BreakType::Cr | BreakType::CrLf => Pos {
392            byte_offset,
393            line: line.pos.line + 1,
394            column: 0,
395        },
396    }
397}
398
399// ---------------------------------------------------------------------------
400// Tests
401// ---------------------------------------------------------------------------
402
403#[cfg(test)]
404mod tests {
405    use rstest::rstest;
406
407    use super::*;
408
409    // -----------------------------------------------------------------------
410    // BreakType::advance
411    // -----------------------------------------------------------------------
412
413    #[rstest]
414    #[case::break_type_advance_lf(BreakType::Lf, Pos::ORIGIN, 1, 2, 0)]
415    #[case::break_type_advance_crlf(BreakType::CrLf, Pos::ORIGIN, 2, 2, 0)]
416    // \r = 1 byte, \n = 1 byte → 2 bytes total for CrLf
417    #[case::break_type_advance_lf_at_non_origin_pos(BreakType::Lf, Pos { byte_offset: 5, line: 2, column: 3 }, 6, 3, 0)]
418    #[case::break_type_advance_crlf_at_non_origin_pos(BreakType::CrLf, Pos { byte_offset: 5, line: 2, column: 3 }, 7, 3, 0)]
419    #[case::break_type_advance_cr_resets_column(BreakType::Cr, Pos { byte_offset: 3, line: 1, column: 3 }, 4, 2, 0)]
420    fn break_type_advance_all_fields(
421        #[case] break_type: BreakType,
422        #[case] input: Pos,
423        #[case] expected_byte_offset: usize,
424        #[case] expected_line: usize,
425        #[case] expected_column: usize,
426    ) {
427        let after = break_type.advance(input);
428        assert_eq!(after.byte_offset, expected_byte_offset);
429        assert_eq!(after.line, expected_line);
430        assert_eq!(after.column, expected_column);
431    }
432
433    #[test]
434    fn break_type_advance_cr_increments_line() {
435        let pos = Pos::ORIGIN;
436        let after = BreakType::Cr.advance(pos);
437        assert_eq!(after.line, 2);
438    }
439
440    #[test]
441    fn break_type_advance_eof_is_noop() {
442        let pos = Pos {
443            byte_offset: 5,
444            line: 3,
445            column: 2,
446        };
447        let after = BreakType::Eof.advance(pos);
448        assert_eq!(after, pos);
449    }
450
451    // -----------------------------------------------------------------------
452    // new and initial state
453    // -----------------------------------------------------------------------
454
455    #[rstest]
456    #[case::new_single_line_with_lf_primes_first_line("foo\n", "foo", BreakType::Lf)]
457    #[case::new_input_with_only_lf_primes_empty_line("\n", "", BreakType::Lf)]
458    fn new_single_line_peek(
459        #[case] input: &str,
460        #[case] expected_content: &str,
461        #[case] expected_break: BreakType,
462    ) {
463        let buf = LineBuffer::new(input);
464        let Some(line) = buf.peek_next() else {
465            unreachable!("expected a line");
466        };
467        assert_eq!(line.content, expected_content);
468        assert_eq!(line.break_type, expected_break);
469    }
470
471    #[test]
472    fn new_empty_input_at_eof_immediately() {
473        let buf = LineBuffer::new("");
474        assert!(buf.peek_next().is_none());
475        assert!(buf.at_eof());
476    }
477
478    #[test]
479    fn new_single_line_no_newline_primes_eof_line() {
480        let buf = LineBuffer::new("foo");
481        let Some(line) = buf.peek_next() else {
482            unreachable!("expected a line");
483        };
484        assert_eq!(line.content, "foo");
485        assert_eq!(line.break_type, BreakType::Eof);
486        assert_eq!(line.offset, 0);
487    }
488
489    // -----------------------------------------------------------------------
490    // consume_next sequencing
491    // -----------------------------------------------------------------------
492
493    #[test]
494    fn consume_returns_primed_line_and_advances() {
495        let mut buf = LineBuffer::new("a\nb\n");
496        let Some(first) = buf.consume_next() else {
497            unreachable!("expected first line");
498        };
499        assert_eq!(first.content, "a");
500        assert_eq!(first.break_type, BreakType::Lf);
501        let Some(second) = buf.consume_next() else {
502            unreachable!("expected second line");
503        };
504        assert_eq!(second.content, "b");
505        assert_eq!(second.break_type, BreakType::Lf);
506    }
507
508    #[test]
509    fn consume_after_last_line_returns_none() {
510        let mut buf = LineBuffer::new("foo");
511        assert!(buf.consume_next().is_some());
512        assert!(buf.consume_next().is_none());
513    }
514
515    #[test]
516    fn at_eof_false_before_consuming_last_and_true_after() {
517        let mut buf = LineBuffer::new("foo");
518        assert!(!buf.at_eof());
519        buf.consume_next();
520        assert!(buf.at_eof());
521    }
522
523    #[test]
524    fn consume_all_lines_then_peek_returns_none() {
525        let mut buf = LineBuffer::new("a\nb");
526        buf.consume_next();
527        buf.consume_next();
528        assert!(buf.peek_next().is_none());
529    }
530
531    // -----------------------------------------------------------------------
532    // line terminator types
533    // -----------------------------------------------------------------------
534
535    #[rstest]
536    #[case::only_lf_produces_one_empty_line("\n", BreakType::Lf)]
537    #[case::only_cr_produces_one_empty_line("\r", BreakType::Cr)]
538    #[case::only_crlf_produces_one_empty_line_not_two("\r\n", BreakType::CrLf)]
539    fn single_terminator_produces_empty_line(
540        #[case] input: &str,
541        #[case] expected_break: BreakType,
542    ) {
543        let mut buf = LineBuffer::new(input);
544        let Some(line) = buf.consume_next() else {
545            unreachable!("expected a line");
546        };
547        assert_eq!(line.content, "");
548        assert_eq!(line.break_type, expected_break);
549        assert!(buf.consume_next().is_none());
550    }
551
552    #[test]
553    fn lf_terminator_produces_lf_break_type() {
554        let mut buf = LineBuffer::new("a\n");
555        let Some(line) = buf.consume_next() else {
556            unreachable!("expected a line");
557        };
558        assert_eq!(line.break_type, BreakType::Lf);
559    }
560
561    #[test]
562    fn crlf_terminator_produces_crlf_break_type_not_two_lines() {
563        let mut buf = LineBuffer::new("a\r\nb");
564        let Some(first) = buf.consume_next() else {
565            unreachable!("expected first");
566        };
567        assert_eq!(first.content, "a");
568        assert_eq!(first.break_type, BreakType::CrLf);
569        let Some(second) = buf.consume_next() else {
570            unreachable!("expected second");
571        };
572        assert_eq!(second.content, "b");
573        assert_eq!(second.break_type, BreakType::Eof);
574        assert!(buf.consume_next().is_none());
575    }
576
577    #[test]
578    fn bare_cr_terminator_produces_cr_break_type() {
579        let mut buf = LineBuffer::new("a\rb");
580        let Some(first) = buf.consume_next() else {
581            unreachable!("expected first");
582        };
583        assert_eq!(first.content, "a");
584        assert_eq!(first.break_type, BreakType::Cr);
585        let Some(second) = buf.consume_next() else {
586            unreachable!("expected second");
587        };
588        assert_eq!(second.content, "b");
589        assert_eq!(second.break_type, BreakType::Eof);
590    }
591
592    #[test]
593    fn no_terminator_on_last_line_produces_eof_break_type() {
594        let mut buf = LineBuffer::new("a\nb");
595        buf.consume_next();
596        let Some(second) = buf.consume_next() else {
597            unreachable!("expected second");
598        };
599        assert_eq!(second.content, "b");
600        assert_eq!(second.break_type, BreakType::Eof);
601    }
602
603    #[test]
604    fn mixed_line_endings_each_line_has_correct_break_type() {
605        let mut buf = LineBuffer::new("a\nb\r\nc\rd");
606        let types: Vec<BreakType> = (0..4)
607            .filter_map(|_| buf.consume_next().map(|l| l.break_type))
608            .collect();
609        assert_eq!(
610            types,
611            [
612                BreakType::Lf,
613                BreakType::CrLf,
614                BreakType::Cr,
615                BreakType::Eof
616            ]
617        );
618    }
619
620    #[test]
621    fn two_consecutive_lf_produce_two_empty_lines() {
622        let mut buf = LineBuffer::new("\n\n");
623        let Some(first) = buf.consume_next() else {
624            unreachable!("expected first");
625        };
626        assert_eq!(first.content, "");
627        assert_eq!(first.break_type, BreakType::Lf);
628        let Some(second) = buf.consume_next() else {
629            unreachable!("expected second");
630        };
631        assert_eq!(second.content, "");
632        assert_eq!(second.break_type, BreakType::Lf);
633        assert!(buf.consume_next().is_none());
634    }
635
636    #[test]
637    fn trailing_lf_does_not_produce_extra_empty_line() {
638        // A trailing newline terminates the last line; it does not introduce
639        // a new empty line.
640        let mut buf = LineBuffer::new("foo\n");
641        let Some(line) = buf.consume_next() else {
642            unreachable!("expected a line");
643        };
644        assert_eq!(line.content, "foo");
645        assert!(buf.consume_next().is_none());
646    }
647
648    // -----------------------------------------------------------------------
649    // offset and Pos tracking
650    // -----------------------------------------------------------------------
651
652    #[rstest]
653    #[case::pos_line_increments_after_bare_cr("a\rb")]
654    #[case::pos_line_increments_after_crlf("a\r\nb")]
655    fn pos_line_increments_after_terminator(#[case] input: &str) {
656        let mut buf = LineBuffer::new(input);
657        let Some(first) = buf.consume_next() else {
658            unreachable!("expected first");
659        };
660        assert_eq!(first.pos.line, 1);
661        let Some(second) = buf.consume_next() else {
662            unreachable!("expected second");
663        };
664        assert_eq!(second.pos.line, 2);
665        assert_eq!(second.pos.column, 0);
666    }
667
668    #[test]
669    fn offset_is_byte_offset_of_content_start() {
670        let mut buf = LineBuffer::new("foo\nbar\n");
671        let Some(first) = buf.consume_next() else {
672            unreachable!("expected first");
673        };
674        assert_eq!(first.offset, 0);
675        let Some(second) = buf.consume_next() else {
676            unreachable!("expected second");
677        };
678        assert_eq!(second.offset, 4); // "foo\n" = 4 bytes
679    }
680
681    #[test]
682    fn offset_and_pos_byte_offset_agree() {
683        let mut buf = LineBuffer::new("foo\nbar");
684        while let Some(line) = buf.consume_next() {
685            assert_eq!(line.offset, line.pos.byte_offset);
686        }
687    }
688
689    #[test]
690    fn pos_line_number_increments_per_line() {
691        let mut buf = LineBuffer::new("a\nb\nc");
692        let lines: Vec<Line<'_>> = (0..3).filter_map(|_| buf.consume_next()).collect();
693        assert_eq!(lines.len(), 3, "expected 3 lines");
694        assert_eq!(lines.first().map(|l| l.pos.line), Some(1));
695        assert_eq!(lines.get(1).map(|l| l.pos.line), Some(2));
696        assert_eq!(lines.get(2).map(|l| l.pos.line), Some(3));
697    }
698
699    #[test]
700    fn pos_column_is_zero_at_start_of_each_line() {
701        let mut buf = LineBuffer::new("a\nb");
702        while let Some(line) = buf.consume_next() {
703            assert_eq!(line.pos.column, 0);
704        }
705    }
706
707    #[test]
708    fn pos_column_resets_after_bare_cr() {
709        // After consuming a line that ends with bare \r, the next line's
710        // column must be 0, not the column that followed the last content char.
711        let mut buf = LineBuffer::new("abc\rd");
712        buf.consume_next(); // consume "abc"
713        let Some(second) = buf.consume_next() else {
714            unreachable!("expected second");
715        };
716        assert_eq!(second.pos.column, 0);
717    }
718
719    #[test]
720    fn pos_after_mixed_endings_tracks_lines_correctly() {
721        // Input has four lines with three different terminator types.
722        let mut buf = LineBuffer::new("a\nb\r\nc\rd");
723        let lines: Vec<Line<'_>> = (0..4).filter_map(|_| buf.consume_next()).collect();
724        assert_eq!(lines.len(), 4, "expected 4 lines");
725        let line_nums: Vec<usize> = lines.iter().map(|l| l.pos.line).collect();
726        assert_eq!(line_nums, [1, 2, 3, 4]);
727        for line in &lines {
728            assert_eq!(
729                line.pos.column, 0,
730                "line {} should start at column 0",
731                line.pos.line
732            );
733        }
734    }
735
736    #[test]
737    fn multibyte_content_byte_offset_is_byte_based_not_char_based() {
738        // '中' is 3 UTF-8 bytes
739        let mut buf = LineBuffer::new("中\nfoo");
740        let Some(first) = buf.consume_next() else {
741            unreachable!("expected first");
742        };
743        assert_eq!(first.offset, 0);
744        assert_eq!(first.content, "中");
745        let Some(second) = buf.consume_next() else {
746            unreachable!("expected second");
747        };
748        // 3 bytes for '中' + 1 byte for '\n' = 4
749        assert_eq!(second.offset, 4);
750    }
751
752    // -----------------------------------------------------------------------
753    // BOM handling
754    // -----------------------------------------------------------------------
755
756    #[test]
757    fn bom_not_stripped_by_new_before_boundary_signal() {
758        // LineBuffer::new no longer strips the BOM — signal_document_boundary
759        // is the sole BOM-strip site, and it must be called before the first
760        // line is consumed.
761        let input = "\u{FEFF}foo\n";
762        let buf = LineBuffer::new(input);
763        let Some(line) = buf.peek_next() else {
764            unreachable!("expected a line");
765        };
766        assert_eq!(line.content, "\u{FEFF}foo");
767    }
768
769    #[test]
770    fn bom_stripped_from_first_line_via_boundary_signal() {
771        // signal_document_boundary() strips the BOM from the primed first line,
772        // advancing offset and pos.byte_offset past the 3-byte BOM.
773        let input = "\u{FEFF}foo\n";
774        let mut buf = LineBuffer::new(input);
775        buf.signal_document_boundary();
776        let Some(line) = buf.peek_next() else {
777            unreachable!("expected a line");
778        };
779        assert_eq!(line.content, "foo");
780        // BOM is U+FEFF = 3 bytes in UTF-8
781        assert_eq!(line.offset, 3);
782        assert_eq!(line.pos.byte_offset, 3);
783    }
784
785    #[test]
786    fn bom_not_stripped_on_non_boundary_mid_content_line() {
787        // A BOM in a non-first, non-boundary line is preserved as data —
788        // `signal_document_boundary` was never called, so it is an error.
789        let input = "foo\n\u{FEFF}bar\n";
790        let mut buf = LineBuffer::new(input);
791        buf.consume_next(); // consume "foo"
792        let Some(second) = buf.consume_next() else {
793            unreachable!("expected second");
794        };
795        assert_eq!(second.content, "\u{FEFF}bar");
796    }
797
798    #[test]
799    fn bom_stripped_after_document_boundary_signal() {
800        // After signal_document_boundary(), the primed next line has its
801        // leading BOM stripped.
802        let input = "foo\n\u{FEFF}bar\n";
803        let mut buf = LineBuffer::new(input);
804        buf.consume_next(); // consume "foo"; primes "\u{FEFF}bar"
805        buf.signal_document_boundary();
806        let Some(second) = buf.peek_next() else {
807            unreachable!("expected second");
808        };
809        assert_eq!(second.content, "bar");
810        assert_eq!(second.offset, 4 + 3); // "foo\n" = 4 bytes + 3-byte BOM
811        assert_eq!(second.pos.byte_offset, 4 + 3);
812    }
813
814    #[test]
815    #[expect(clippy::expect_used, reason = "test code")]
816    fn signal_document_boundary_strips_bom_from_primed_next_line() {
817        // signal_document_boundary() strips the BOM from the already-primed
818        // next line only.  Subsequent lines are not affected — the signal is
819        // a one-shot strip of the primed next slot.
820        let input = "...\n\u{FEFF}doc1\n\u{FEFF}doc2\n";
821        let mut buf = LineBuffer::new(input);
822        buf.consume_next(); // consume "..."; primes "\u{FEFF}doc1" into next
823
824        buf.signal_document_boundary();
825
826        // The already-primed next line has its BOM stripped.
827        let first = buf.consume_next().expect("first line");
828        assert_eq!(
829            first.content, "doc1",
830            "BOM stripped from primed next by signal"
831        );
832
833        // The following line was scanned by prime() without a boundary signal,
834        // so its BOM is NOT stripped — it is illegal content (as in a real stream,
835        // signal_document_boundary would be called again for the next boundary).
836        let second = buf.peek_next().expect("second line");
837        assert_eq!(
838            second.content, "\u{FEFF}doc2",
839            "BOM on subsequent line preserved — not affected by one-shot signal"
840        );
841    }
842
843    #[test]
844    fn bom_stripped_line_offset_correct_after_boundary_signal() {
845        // After signal_document_boundary(), offset and pos.byte_offset advance
846        // past the 3-byte BOM.
847
848        // Stream start: signal strips BOM from the primed first line.
849        let input = "\u{FEFF}key: value\n";
850        let mut buf = LineBuffer::new(input);
851        buf.signal_document_boundary();
852        let Some(line) = buf.peek_next() else {
853            unreachable!("expected line");
854        };
855        assert_eq!(line.offset, 3);
856        assert_eq!(line.pos.byte_offset, 3);
857        assert_eq!(line.content, "key: value");
858
859        // Inter-document: signal strips BOM from the line after "...".
860        let input2 = "...\n\u{FEFF}key: value\n";
861        let mut buf2 = LineBuffer::new(input2);
862        buf2.consume_next(); // consume "..."
863        buf2.signal_document_boundary();
864        let Some(line2) = buf2.peek_next() else {
865            unreachable!("expected line2");
866        };
867        // "...\n" is 4 bytes; BOM is 3 bytes → content starts at offset 7.
868        assert_eq!(line2.offset, 4 + 3);
869        assert_eq!(line2.pos.byte_offset, 4 + 3);
870        assert_eq!(line2.content, "key: value");
871    }
872
873    // -----------------------------------------------------------------------
874    // indent counting
875    // -----------------------------------------------------------------------
876
877    #[rstest]
878    #[case::indent_counts_only_leading_spaces("   foo", 3)]
879    #[case::indent_is_zero_for_no_leading_spaces("foo", 0)]
880    #[case::leading_tab_does_not_count_toward_indent("\tfoo", 0)]
881    #[case::tab_after_spaces_does_not_count("  \tfoo", 2)]
882    #[case::indent_of_blank_line_is_zero("\n", 0)]
883    fn indent_value(#[case] input: &str, #[case] expected: usize) {
884        let buf = LineBuffer::new(input);
885        let Some(line) = buf.peek_next() else {
886            unreachable!("expected a line");
887        };
888        assert_eq!(line.indent, expected);
889    }
890
891    #[test]
892    fn indent_of_spaces_only_line_equals_space_count() {
893        let buf = LineBuffer::new("   \n");
894        let Some(line) = buf.peek_next() else {
895            unreachable!("expected a line");
896        };
897        assert_eq!(line.indent, 3);
898        assert_eq!(line.content, "   ");
899    }
900
901    // -----------------------------------------------------------------------
902    // peek_next_indent
903    // -----------------------------------------------------------------------
904
905    #[rstest]
906    #[case::peek_next_indent_returns_indent_of_next_line("   foo", Some(3))]
907    #[case::peek_next_indent_returns_none_at_eof("", None)]
908    fn peek_next_indent_returns(#[case] input: &str, #[case] expected: Option<usize>) {
909        let buf = LineBuffer::new(input);
910        assert_eq!(buf.peek_next_indent(), expected);
911    }
912
913    #[test]
914    fn peek_next_indent_does_not_consume() {
915        let mut buf = LineBuffer::new("  foo");
916        assert_eq!(buf.peek_next_indent(), Some(2));
917        assert_eq!(buf.peek_next_indent(), Some(2));
918        let Some(line) = buf.consume_next() else {
919            unreachable!("expected a line");
920        };
921        assert_eq!(line.content, "  foo");
922    }
923
924    // -----------------------------------------------------------------------
925    // peek_until_dedent
926    // -----------------------------------------------------------------------
927
928    #[test]
929    fn peek_until_dedent_empty_input_returns_empty_slice() {
930        let mut buf = LineBuffer::new("");
931        assert!(buf.peek_until_dedent(0).is_empty());
932    }
933
934    #[test]
935    fn peek_until_dedent_returns_lines_until_indent_le_base() {
936        let mut buf = LineBuffer::new("  a\n  b\nc\n");
937        let lines = buf.peek_until_dedent(1);
938        assert_eq!(lines.len(), 2);
939        assert_eq!(lines.first().map(|l| l.content), Some("  a"));
940        assert_eq!(lines.get(1).map(|l| l.content), Some("  b"));
941    }
942
943    #[test]
944    fn peek_until_dedent_does_not_consume_lines() {
945        let mut buf = LineBuffer::new("  a\n  b\nc\n");
946        let _ = buf.peek_until_dedent(1);
947        let Some(first) = buf.consume_next() else {
948            unreachable!("expected first");
949        };
950        assert_eq!(first.content, "  a");
951    }
952
953    #[test]
954    fn peek_until_dedent_includes_all_lines_when_no_dedent_occurs() {
955        let mut buf = LineBuffer::new("  a\n  b\n  c");
956        let lines = buf.peek_until_dedent(1);
957        assert_eq!(lines.len(), 3);
958    }
959
960    #[test]
961    fn peek_until_dedent_returns_empty_slice_when_first_line_already_dedented() {
962        let mut buf = LineBuffer::new("a\n  b\n");
963        let lines = buf.peek_until_dedent(1);
964        // "a" has indent 0 <= 1, so stop immediately
965        assert!(lines.is_empty());
966    }
967
968    #[test]
969    fn peek_until_dedent_second_call_returns_same_slice() {
970        let mut buf = LineBuffer::new("  a\n  b\nc");
971        let first_call: Vec<String> = buf
972            .peek_until_dedent(1)
973            .iter()
974            .map(|l| l.content.to_owned())
975            .collect();
976        let second_call: Vec<String> = buf
977            .peek_until_dedent(1)
978            .iter()
979            .map(|l| l.content.to_owned())
980            .collect();
981        assert_eq!(first_call, second_call);
982        assert_eq!(first_call, ["  a", "  b"]);
983    }
984
985    #[test]
986    fn peek_until_dedent_base_zero_stops_at_non_indented_lines() {
987        // base_indent=0: stop at lines with indent <= 0 (i.e., indent == 0).
988        // Both lines here have indent > 0, so all are included.
989        let mut buf = LineBuffer::new("  a\n  b\n");
990        let lines = buf.peek_until_dedent(0);
991        assert_eq!(lines.len(), 2);
992    }
993
994    #[test]
995    fn peek_until_dedent_blank_lines_are_transparent() {
996        // Blank lines (empty content) are transparent: they are included in
997        // the result and do not halt the scan.
998        // "  a" (indent 2 > 1) -> included
999        // ""    (blank)         -> transparent, included
1000        // "  b" (indent 2 > 1) -> included
1001        // "c"   (indent 0 <= 1) -> stop
1002        let mut buf = LineBuffer::new("  a\n\n  b\nc");
1003        let lines = buf.peek_until_dedent(1);
1004        assert_eq!(lines.len(), 3);
1005        assert_eq!(lines.first().map(|l| l.content), Some("  a"));
1006        assert_eq!(lines.get(1).map(|l| l.content), Some(""));
1007        assert_eq!(lines.get(2).map(|l| l.content), Some("  b"));
1008    }
1009
1010    // -----------------------------------------------------------------------
1011    // pos_after_line
1012    // -----------------------------------------------------------------------
1013
1014    #[rstest]
1015    #[case::pos_after_line_lf_ascii(Line { content: "hello", offset: 0, indent: 0, break_type: BreakType::Lf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 6, 2, 0)]
1016    #[case::pos_after_line_lf_empty_content(Line { content: "", offset: 10, indent: 0, break_type: BreakType::Lf, pos: Pos { byte_offset: 10, line: 3, column: 0 } }, 11, 4, 0)]
1017    #[case::pos_after_line_lf_multibyte(Line { content: "日本", offset: 0, indent: 0, break_type: BreakType::Lf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 7, 2, 0)]
1018    // 6 bytes + 1 for \n = 7
1019    #[case::pos_after_line_cr_ascii(Line { content: "abc", offset: 0, indent: 0, break_type: BreakType::Cr, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 4, 2, 0)]
1020    #[case::pos_after_line_cr_empty_content(Line { content: "", offset: 5, indent: 0, break_type: BreakType::Cr, pos: Pos { byte_offset: 5, line: 2, column: 0 } }, 6, 3, 0)]
1021    #[case::pos_after_line_crlf_ascii(Line { content: "key: val", offset: 0, indent: 0, break_type: BreakType::CrLf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 10, 2, 0)]
1022    #[case::pos_after_line_crlf_empty_content(Line { content: "", offset: 0, indent: 0, break_type: BreakType::CrLf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 2, 2, 0)]
1023    #[case::pos_after_line_eof_empty_content(Line { content: "", offset: 20, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 20, line: 5, column: 0 } }, 20, 5, 0)]
1024    #[case::pos_after_line_eof_ascii(Line { content: "last", offset: 10, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 10, line: 3, column: 0 } }, 14, 3, 4)]
1025    #[case::pos_after_line_eof_ascii_nonzero_start_column(Line { content: "end", offset: 7, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 7, line: 2, column: 5 } }, 10, 2, 8)]
1026    #[case::pos_after_line_eof_multibyte(Line { content: "日本語", offset: 0, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 9, 1, 3)]
1027    #[case::pos_after_line_eof_mixed_content(Line { content: "ab日", offset: 0, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 5, 1, 3)]
1028    fn pos_after_line_cases(
1029        #[case] line: Line<'static>,
1030        #[case] expected_byte_offset: usize,
1031        #[case] expected_line: usize,
1032        #[case] expected_column: usize,
1033    ) {
1034        let result = pos_after_line(&line);
1035        assert_eq!(result.byte_offset, expected_byte_offset);
1036        assert_eq!(result.line, expected_line);
1037        assert_eq!(result.column, expected_column);
1038    }
1039}