Skip to main content

rlsp_yaml_parser/
lines.rs

1// SPDX-License-Identifier: MIT
2
3//! Line-at-a-time buffer with one-line lookahead for the streaming parser.
4//!
5//! `LineBuffer` wraps an `&'input str` and yields one [`Line`] at a time,
6//! always keeping the *next* line primed in an internal slot so callers can
7//! peek at the next line's indent without consuming it.  It never scans the
8//! full input up front, giving O(1) first-event latency.
9
10use std::collections::VecDeque;
11
12use crate::pos::Pos;
13
14// ---------------------------------------------------------------------------
15// Public types
16// ---------------------------------------------------------------------------
17
/// How a [`Line`] is terminated in the input.
///
/// The variant records which terminator sequence followed the line content,
/// or that the input ended without one.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BreakType {
    /// A single line feed, `\n`.
    Lf,
    /// A lone carriage return, `\r`, that is *not* followed by `\n`.
    Cr,
    /// A carriage return immediately followed by a line feed, `\r\n`.
    CrLf,
    /// No terminator at all: the input ended after the line content.
    Eof,
}
30
31impl BreakType {
32    /// Byte length of this line terminator (0 for Eof).
33    #[must_use]
34    pub const fn byte_len(self) -> usize {
35        match self {
36            Self::Lf | Self::Cr => 1,
37            Self::CrLf => 2,
38            Self::Eof => 0,
39        }
40    }
41
42    /// Advance `pos` past this line break.
43    ///
44    /// Each break type requires distinct logic because `Pos::advance(char)`
45    /// operates on individual characters and cannot distinguish bare `\r`
46    /// from `\r\n`.
47    #[must_use]
48    pub const fn advance(self, mut pos: Pos) -> Pos {
49        match self {
50            Self::Lf => pos.advance('\n'),
51            Self::CrLf => {
52                pos.byte_offset += '\r'.len_utf8();
53                pos.advance('\n')
54            }
55            Self::Cr => {
56                pos.byte_offset += '\r'.len_utf8();
57                pos.line += 1;
58                pos.column = 0;
59                pos
60            }
61            Self::Eof => pos,
62        }
63    }
64}
65
66/// A single logical line extracted from the input.
67#[derive(Debug, Clone, PartialEq, Eq)]
68pub struct Line<'input> {
69    /// The line content slice, **excluding** the terminator.
70    pub content: &'input str,
71    /// Byte offset of `content` within the original input string.
72    pub offset: usize,
73    /// Number of leading `SPACE` (`\x20`) characters.  Leading tabs do not
74    /// contribute to indent — they are a YAML syntax error in indentation
75    /// context and are reported by the lexer, not here.
76    pub indent: usize,
77    /// The terminator that ends this line.
78    pub break_type: BreakType,
79    /// Position of the first byte of this line (after BOM stripping on line 1).
80    pub pos: Pos,
81}
82
83// ---------------------------------------------------------------------------
84// Internal helpers
85// ---------------------------------------------------------------------------
86
87/// Detect the line break at the start of `s` and return `(BreakType, rest)`.
88///
89/// CRLF is checked first so that `\r\n` is consumed as a unit rather than
90/// treating `\r` as a bare CR.
91fn detect_break(s: &str) -> (BreakType, &str) {
92    if let Some(rest) = s.strip_prefix("\r\n") {
93        return (BreakType::CrLf, rest);
94    }
95    if let Some(rest) = s.strip_prefix('\r') {
96        return (BreakType::Cr, rest);
97    }
98    if let Some(rest) = s.strip_prefix('\n') {
99        return (BreakType::Lf, rest);
100    }
101    (BreakType::Eof, s)
102}
103
104/// Scan one line from `remaining`, starting at `pos`.
105///
106/// `is_first` controls BOM stripping: if `true` and the slice starts with
107/// U+FEFF (UTF-8 BOM, 3 bytes), the BOM is skipped before content begins.
108///
109/// Returns `Some((line, rest))` or `None` if `remaining` is empty.
110fn scan_line(remaining: &str, pos: Pos, is_first: bool) -> Option<(Line<'_>, &str)> {
111    if remaining.is_empty() {
112        return None;
113    }
114
115    // Strip BOM on first line only.
116    let (content_start, pos) = if is_first && remaining.starts_with('\u{FEFF}') {
117        let bom_len = '\u{FEFF}'.len_utf8(); // 3 bytes
118        (
119            &remaining[bom_len..],
120            Pos {
121                byte_offset: pos.byte_offset + bom_len,
122                ..pos
123            },
124        )
125    } else {
126        (remaining, pos)
127    };
128
129    // Find the end of line content (position of the first \n or \r).
130    let line_end = content_start
131        .find(['\n', '\r'])
132        .unwrap_or(content_start.len());
133
134    let content = &content_start[..line_end];
135    let after_content = &content_start[line_end..];
136
137    // Determine break type and advance past the terminator.
138    // Try CRLF first (must be checked before bare CR).
139    let (break_type, after_break) = detect_break(after_content);
140
141    // Count leading SPACE characters only (tabs do not count).
142    let indent = content.chars().take_while(|&ch| ch == ' ').count();
143
144    // `offset` is the byte offset of `content` within the *original* input.
145    // `pos` already reflects the position after any BOM skip.
146    let offset = pos.byte_offset;
147
148    let line = Line {
149        content,
150        offset,
151        indent,
152        break_type,
153        pos,
154    };
155
156    Some((line, after_break))
157}
158
159// ---------------------------------------------------------------------------
160// LineBuffer
161// ---------------------------------------------------------------------------
162
163/// A one-line-lookahead buffer over a `&'input str`.
164///
165/// Always holds the *next* line pre-parsed.  Callers use [`Self::peek_next`]
166/// to inspect without consuming and [`Self::consume_next`] to advance.
167pub struct LineBuffer<'input> {
168    /// Remaining unparsed input (past the next line's terminator).
169    remaining: &'input str,
170    /// Synthetic lines prepended by the caller (e.g. inline content extracted
171    /// from a sequence- or mapping-entry line).  Drained front-first before
172    /// `next`.  A `VecDeque` supports multiple pending prepends when parsing
173    /// implicit mapping entries that need to inject both key and value lines.
174    prepend: VecDeque<Line<'input>>,
175    /// The pre-parsed next line, if any.
176    next: Option<Line<'input>>,
177    /// Position at the start of `remaining`.
178    remaining_pos: Pos,
179    /// Whether the next line to be parsed from `remaining` is the first line
180    /// of input (used for BOM detection after the initial prime).
181    remaining_is_first: bool,
182    /// Lookahead buffer for [`Self::peek_until_dedent`].
183    lookahead: Vec<Line<'input>>,
184}
185
186impl<'input> LineBuffer<'input> {
187    /// Construct a new `LineBuffer` and prime the next-line slot.
188    #[must_use]
189    pub fn new(input: &'input str) -> Self {
190        let mut buf = Self {
191            remaining: input,
192            prepend: VecDeque::new(),
193            next: None,
194            remaining_pos: Pos::ORIGIN,
195            remaining_is_first: true,
196            lookahead: Vec::new(),
197        };
198        buf.prime();
199        buf
200    }
201
202    /// Prepend a synthetic line that will be returned by the next call to
203    /// [`Self::peek_next`] / [`Self::consume_next`], ahead of any real lines.
204    ///
205    /// Used to re-present inline content extracted from a sequence- or
206    /// mapping-entry line as if it were a separate line.  Multiple prepends
207    /// are supported: each call pushes to the front of the queue, so the last
208    /// prepended line is returned first (LIFO order).  Callers that need FIFO
209    /// order (key before value) should prepend value first, then key.
210    pub fn prepend_line(&mut self, line: Line<'input>) {
211        self.lookahead.clear();
212        self.prepend.push_front(line);
213    }
214
215    /// Look at the next line without consuming it.
216    ///
217    /// Returns the frontmost prepended synthetic line first (if any), then the
218    /// normally buffered next line.
219    #[must_use]
220    pub fn peek_next(&self) -> Option<&Line<'input>> {
221        self.prepend.front().or(self.next.as_ref())
222    }
223
224    /// Returns `true` if the next line comes from the prepend queue (synthetic),
225    /// rather than from the original input stream.
226    #[must_use]
227    pub fn is_next_synthetic(&self) -> bool {
228        !self.prepend.is_empty()
229    }
230
231    /// Convenience: the indent of the next line, without consuming it.
232    #[must_use]
233    pub fn peek_next_indent(&self) -> Option<usize> {
234        self.peek_next().map(|l| l.indent)
235    }
236
237    /// Peek at the second upcoming line without consuming either.
238    ///
239    /// Handles the prepend queue: the second line may come from the prepend
240    /// queue or from the primed `next` slot or from `remaining`.
241    #[must_use]
242    pub fn peek_second(&self) -> Option<Line<'input>> {
243        // Determine where the "first" line comes from, then find the "second".
244        if !self.prepend.is_empty() {
245            // First line is prepend[0]. Second is prepend[1] if it exists,
246            // else self.next.
247            if self.prepend.len() >= 2 {
248                return self.prepend.get(1).cloned();
249            }
250            return self.next.clone();
251        }
252        // First line is self.next. Second is the first line from `remaining`.
253        self.next.as_ref()?; // ensure first exists
254        scan_line(self.remaining, self.remaining_pos, self.remaining_is_first).map(|(line, _)| line)
255    }
256
257    /// Advance: return the currently primed next line and prime the following
258    /// one from the remaining input.  Returns `None` when no lines remain.
259    ///
260    /// Drains prepended synthetic lines (front-first) before the real buffer.
261    pub fn consume_next(&mut self) -> Option<Line<'input>> {
262        // Drain prepend queue front-first.
263        if let Some(line) = self.prepend.pop_front() {
264            return Some(line);
265        }
266        // Clear any cached lookahead — it was based on the old position.
267        self.lookahead.clear();
268        let line = self.next.take()?;
269        self.prime();
270        Some(line)
271    }
272
273    /// True when no more lines are available (buffer is empty, no prepend, and
274    /// input is exhausted).
275    #[must_use]
276    pub fn at_eof(&self) -> bool {
277        self.prepend.is_empty() && self.next.is_none()
278    }
279
280    /// Scan forward without consuming to collect all lines with
281    /// `indent > base_indent`, stopping at the first line with
282    /// `indent <= base_indent`.  Blank lines (empty content) are transparent
283    /// to the scan and are included in the result regardless of their indent.
284    ///
285    /// Returns a slice of the buffered lookahead lines.  Calling this method
286    /// repeatedly (without consuming) returns the same slice.
287    ///
288    /// Note: trailing blank lines in the returned slice are **not** part of
289    /// the block scalar content — per YAML chomping rules, trailing blank
290    /// lines are stripped, clipped, or kept based on the chomping indicator.
291    /// The consumer (lexer, Task 8) is responsible for trimming them.
292    pub fn peek_until_dedent(&mut self, base_indent: usize) -> &[Line<'input>] {
293        // Rebuild the lookahead starting from the next line.
294        self.lookahead.clear();
295
296        // We need to scan from the next primed line plus additional lines
297        // from `remaining`.  Use a local cursor.
298        let mut cursor_remaining = self.remaining;
299        let mut cursor_pos = self.remaining_pos;
300        let mut cursor_is_first = self.remaining_is_first;
301
302        // The first line in the lookahead is `self.next` (if any).
303        // We include it if it is blank or its indent > base_indent.
304        let start_line = match self.next.as_ref() {
305            None => return &self.lookahead,
306            Some(l) => l.clone(),
307        };
308
309        // Process lines in order: start with `self.next`, then scan from
310        // `remaining`.
311        let mut scanning_next = Some(start_line);
312
313        loop {
314            let line = match scanning_next.take() {
315                Some(l) => l,
316                None => {
317                    // Fetch from remaining input.
318                    match scan_line(cursor_remaining, cursor_pos, cursor_is_first) {
319                        None => break,
320                        Some((l, rest)) => {
321                            cursor_pos = pos_after_line(&l);
322                            cursor_remaining = rest;
323                            cursor_is_first = false;
324                            l
325                        }
326                    }
327                }
328            };
329
330            // Blank lines (empty content) are transparent: include them and
331            // keep scanning.
332            if line.content.is_empty() {
333                self.lookahead.push(line);
334                continue;
335            }
336
337            // Stop before the first non-blank line that is dedented.
338            // base_indent == usize::MAX is the "root level" sentinel meaning
339            // no indent threshold — include all non-blank lines.
340            if base_indent != usize::MAX && line.indent <= base_indent {
341                break;
342            }
343
344            self.lookahead.push(line);
345        }
346
347        &self.lookahead
348    }
349
350    // -----------------------------------------------------------------------
351    // Private helpers
352    // -----------------------------------------------------------------------
353
354    /// Parse one more line from `remaining` into `self.next`.
355    fn prime(&mut self) {
356        match scan_line(self.remaining, self.remaining_pos, self.remaining_is_first) {
357            None => {
358                self.next = None;
359            }
360            Some((line, rest)) => {
361                // Advance `remaining_pos` past the line we just parsed.
362                let new_pos = pos_after_line(&line);
363                self.remaining_pos = new_pos;
364                self.remaining = rest;
365                self.remaining_is_first = false;
366                self.next = Some(line);
367            }
368        }
369    }
370}
371
372/// Compute the `Pos` immediately after the terminator of `line`.
373///
374/// O(1) for `Lf`/`Cr`/`CrLf` — the next line is at `line+1, column=0`.
375/// O(content) for `Eof` — the final line has no terminator, so position stays
376/// on the same line; column advances by the char count of the content via the
377/// ASCII fast path in [`crate::pos::column_at`].
378pub fn pos_after_line(line: &Line<'_>) -> Pos {
379    let byte_offset = line.offset + line.content.len() + line.break_type.byte_len();
380    match line.break_type {
381        BreakType::Eof => Pos {
382            byte_offset,
383            line: line.pos.line,
384            column: line.pos.column + crate::pos::column_at(line.content, line.content.len()),
385        },
386        BreakType::Lf | BreakType::Cr | BreakType::CrLf => Pos {
387            byte_offset,
388            line: line.pos.line + 1,
389            column: 0,
390        },
391    }
392}
393
394// ---------------------------------------------------------------------------
395// Tests
396// ---------------------------------------------------------------------------
397
398#[cfg(test)]
399mod tests {
400    use super::*;
401
402    // -----------------------------------------------------------------------
403    // BreakType::advance
404    // -----------------------------------------------------------------------
405
406    #[test]
407    fn break_type_advance_lf() {
408        let pos = Pos::ORIGIN;
409        let after = BreakType::Lf.advance(pos);
410        assert_eq!(after.byte_offset, 1);
411        assert_eq!(after.line, 2);
412        assert_eq!(after.column, 0);
413    }
414
415    #[test]
416    fn break_type_advance_crlf() {
417        let pos = Pos::ORIGIN;
418        let after = BreakType::CrLf.advance(pos);
419        // \r = 1 byte, \n = 1 byte → 2 bytes total
420        assert_eq!(after.byte_offset, 2);
421        assert_eq!(after.line, 2);
422        assert_eq!(after.column, 0);
423    }
424
425    #[test]
426    fn break_type_advance_cr_increments_line() {
427        let pos = Pos::ORIGIN;
428        let after = BreakType::Cr.advance(pos);
429        assert_eq!(after.line, 2);
430    }
431
432    #[test]
433    fn break_type_advance_cr_resets_column() {
434        let pos = Pos {
435            byte_offset: 3,
436            line: 1,
437            column: 3,
438        };
439        let after = BreakType::Cr.advance(pos);
440        assert_eq!(after.column, 0);
441        assert_eq!(after.byte_offset, 4); // \r = 1 byte
442        assert_eq!(after.line, 2);
443    }
444
445    #[test]
446    fn break_type_advance_lf_at_non_origin_pos() {
447        let pos = Pos {
448            byte_offset: 5,
449            line: 2,
450            column: 3,
451        };
452        let after = BreakType::Lf.advance(pos);
453        assert_eq!(after.byte_offset, 6);
454        assert_eq!(after.line, 3);
455        assert_eq!(after.column, 0);
456    }
457
458    #[test]
459    fn break_type_advance_crlf_at_non_origin_pos() {
460        let pos = Pos {
461            byte_offset: 5,
462            line: 2,
463            column: 3,
464        };
465        let after = BreakType::CrLf.advance(pos);
466        assert_eq!(after.byte_offset, 7); // \r (1) + \n (1) = +2
467        assert_eq!(after.line, 3);
468        assert_eq!(after.column, 0);
469    }
470
471    #[test]
472    fn break_type_advance_eof_is_noop() {
473        let pos = Pos {
474            byte_offset: 5,
475            line: 3,
476            column: 2,
477        };
478        let after = BreakType::Eof.advance(pos);
479        assert_eq!(after, pos);
480    }
481
482    // -----------------------------------------------------------------------
483    // new and initial state
484    // -----------------------------------------------------------------------
485
486    #[test]
487    fn new_empty_input_at_eof_immediately() {
488        let buf = LineBuffer::new("");
489        assert!(buf.peek_next().is_none());
490        assert!(buf.at_eof());
491    }
492
493    #[test]
494    fn new_single_line_no_newline_primes_eof_line() {
495        let buf = LineBuffer::new("foo");
496        let Some(line) = buf.peek_next() else {
497            unreachable!("expected a line");
498        };
499        assert_eq!(line.content, "foo");
500        assert_eq!(line.break_type, BreakType::Eof);
501        assert_eq!(line.offset, 0);
502    }
503
504    #[test]
505    fn new_single_line_with_lf_primes_first_line() {
506        let buf = LineBuffer::new("foo\n");
507        let Some(line) = buf.peek_next() else {
508            unreachable!("expected a line");
509        };
510        assert_eq!(line.content, "foo");
511        assert_eq!(line.break_type, BreakType::Lf);
512    }
513
514    #[test]
515    fn new_input_with_only_lf_primes_empty_line() {
516        let buf = LineBuffer::new("\n");
517        let Some(line) = buf.peek_next() else {
518            unreachable!("expected a line");
519        };
520        assert_eq!(line.content, "");
521        assert_eq!(line.break_type, BreakType::Lf);
522    }
523
524    // -----------------------------------------------------------------------
525    // consume_next sequencing
526    // -----------------------------------------------------------------------
527
528    #[test]
529    fn consume_returns_primed_line_and_advances() {
530        let mut buf = LineBuffer::new("a\nb\n");
531        let Some(first) = buf.consume_next() else {
532            unreachable!("expected first line");
533        };
534        assert_eq!(first.content, "a");
535        assert_eq!(first.break_type, BreakType::Lf);
536        let Some(second) = buf.consume_next() else {
537            unreachable!("expected second line");
538        };
539        assert_eq!(second.content, "b");
540        assert_eq!(second.break_type, BreakType::Lf);
541    }
542
543    #[test]
544    fn consume_after_last_line_returns_none() {
545        let mut buf = LineBuffer::new("foo");
546        assert!(buf.consume_next().is_some());
547        assert!(buf.consume_next().is_none());
548    }
549
550    #[test]
551    fn at_eof_false_before_consuming_last_and_true_after() {
552        let mut buf = LineBuffer::new("foo");
553        assert!(!buf.at_eof());
554        buf.consume_next();
555        assert!(buf.at_eof());
556    }
557
558    #[test]
559    fn consume_all_lines_then_peek_returns_none() {
560        let mut buf = LineBuffer::new("a\nb");
561        buf.consume_next();
562        buf.consume_next();
563        assert!(buf.peek_next().is_none());
564    }
565
566    // -----------------------------------------------------------------------
567    // line terminator types
568    // -----------------------------------------------------------------------
569
570    #[test]
571    fn lf_terminator_produces_lf_break_type() {
572        let mut buf = LineBuffer::new("a\n");
573        let Some(line) = buf.consume_next() else {
574            unreachable!("expected a line");
575        };
576        assert_eq!(line.break_type, BreakType::Lf);
577    }
578
579    #[test]
580    fn crlf_terminator_produces_crlf_break_type_not_two_lines() {
581        let mut buf = LineBuffer::new("a\r\nb");
582        let Some(first) = buf.consume_next() else {
583            unreachable!("expected first");
584        };
585        assert_eq!(first.content, "a");
586        assert_eq!(first.break_type, BreakType::CrLf);
587        let Some(second) = buf.consume_next() else {
588            unreachable!("expected second");
589        };
590        assert_eq!(second.content, "b");
591        assert_eq!(second.break_type, BreakType::Eof);
592        assert!(buf.consume_next().is_none());
593    }
594
595    #[test]
596    fn bare_cr_terminator_produces_cr_break_type() {
597        let mut buf = LineBuffer::new("a\rb");
598        let Some(first) = buf.consume_next() else {
599            unreachable!("expected first");
600        };
601        assert_eq!(first.content, "a");
602        assert_eq!(first.break_type, BreakType::Cr);
603        let Some(second) = buf.consume_next() else {
604            unreachable!("expected second");
605        };
606        assert_eq!(second.content, "b");
607        assert_eq!(second.break_type, BreakType::Eof);
608    }
609
610    #[test]
611    fn no_terminator_on_last_line_produces_eof_break_type() {
612        let mut buf = LineBuffer::new("a\nb");
613        buf.consume_next();
614        let Some(second) = buf.consume_next() else {
615            unreachable!("expected second");
616        };
617        assert_eq!(second.content, "b");
618        assert_eq!(second.break_type, BreakType::Eof);
619    }
620
621    #[test]
622    fn mixed_line_endings_each_line_has_correct_break_type() {
623        let mut buf = LineBuffer::new("a\nb\r\nc\rd");
624        let types: Vec<BreakType> = (0..4)
625            .filter_map(|_| buf.consume_next().map(|l| l.break_type))
626            .collect();
627        assert_eq!(
628            types,
629            [
630                BreakType::Lf,
631                BreakType::CrLf,
632                BreakType::Cr,
633                BreakType::Eof
634            ]
635        );
636    }
637
638    #[test]
639    fn only_crlf_produces_one_empty_line_not_two() {
640        let mut buf = LineBuffer::new("\r\n");
641        let Some(line) = buf.consume_next() else {
642            unreachable!("expected a line");
643        };
644        assert_eq!(line.content, "");
645        assert_eq!(line.break_type, BreakType::CrLf);
646        assert!(buf.consume_next().is_none());
647    }
648
649    #[test]
650    fn only_cr_produces_one_empty_line() {
651        let mut buf = LineBuffer::new("\r");
652        let Some(line) = buf.consume_next() else {
653            unreachable!("expected a line");
654        };
655        assert_eq!(line.content, "");
656        assert_eq!(line.break_type, BreakType::Cr);
657        assert!(buf.consume_next().is_none());
658    }
659
660    #[test]
661    fn only_lf_produces_one_empty_line() {
662        let mut buf = LineBuffer::new("\n");
663        let Some(line) = buf.consume_next() else {
664            unreachable!("expected a line");
665        };
666        assert_eq!(line.content, "");
667        assert_eq!(line.break_type, BreakType::Lf);
668        assert!(buf.consume_next().is_none());
669    }
670
671    #[test]
672    fn two_consecutive_lf_produce_two_empty_lines() {
673        let mut buf = LineBuffer::new("\n\n");
674        let Some(first) = buf.consume_next() else {
675            unreachable!("expected first");
676        };
677        assert_eq!(first.content, "");
678        assert_eq!(first.break_type, BreakType::Lf);
679        let Some(second) = buf.consume_next() else {
680            unreachable!("expected second");
681        };
682        assert_eq!(second.content, "");
683        assert_eq!(second.break_type, BreakType::Lf);
684        assert!(buf.consume_next().is_none());
685    }
686
687    #[test]
688    fn trailing_lf_does_not_produce_extra_empty_line() {
689        // A trailing newline terminates the last line; it does not introduce
690        // a new empty line.
691        let mut buf = LineBuffer::new("foo\n");
692        let Some(line) = buf.consume_next() else {
693            unreachable!("expected a line");
694        };
695        assert_eq!(line.content, "foo");
696        assert!(buf.consume_next().is_none());
697    }
698
699    // -----------------------------------------------------------------------
700    // offset and Pos tracking
701    // -----------------------------------------------------------------------
702
703    #[test]
704    fn offset_is_byte_offset_of_content_start() {
705        let mut buf = LineBuffer::new("foo\nbar\n");
706        let Some(first) = buf.consume_next() else {
707            unreachable!("expected first");
708        };
709        assert_eq!(first.offset, 0);
710        let Some(second) = buf.consume_next() else {
711            unreachable!("expected second");
712        };
713        assert_eq!(second.offset, 4); // "foo\n" = 4 bytes
714    }
715
716    #[test]
717    fn offset_and_pos_byte_offset_agree() {
718        let mut buf = LineBuffer::new("foo\nbar");
719        while let Some(line) = buf.consume_next() {
720            assert_eq!(line.offset, line.pos.byte_offset);
721        }
722    }
723
724    #[test]
725    fn pos_line_number_increments_per_line() {
726        let mut buf = LineBuffer::new("a\nb\nc");
727        let lines: Vec<Line<'_>> = (0..3).filter_map(|_| buf.consume_next()).collect();
728        assert_eq!(lines.len(), 3, "expected 3 lines");
729        assert_eq!(lines.first().map(|l| l.pos.line), Some(1));
730        assert_eq!(lines.get(1).map(|l| l.pos.line), Some(2));
731        assert_eq!(lines.get(2).map(|l| l.pos.line), Some(3));
732    }
733
734    #[test]
735    fn pos_column_is_zero_at_start_of_each_line() {
736        let mut buf = LineBuffer::new("a\nb");
737        while let Some(line) = buf.consume_next() {
738            assert_eq!(line.pos.column, 0);
739        }
740    }
741
742    #[test]
743    fn pos_line_increments_after_bare_cr() {
744        // Bare \r is a line terminator: the next line must start on line 2.
745        let mut buf = LineBuffer::new("a\rb");
746        let Some(first) = buf.consume_next() else {
747            unreachable!("expected first");
748        };
749        assert_eq!(first.pos.line, 1);
750        let Some(second) = buf.consume_next() else {
751            unreachable!("expected second");
752        };
753        assert_eq!(second.pos.line, 2);
754        assert_eq!(second.pos.column, 0);
755    }
756
757    #[test]
758    fn pos_column_resets_after_bare_cr() {
759        // After consuming a line that ends with bare \r, the next line's
760        // column must be 0, not the column that followed the last content char.
761        let mut buf = LineBuffer::new("abc\rd");
762        buf.consume_next(); // consume "abc"
763        let Some(second) = buf.consume_next() else {
764            unreachable!("expected second");
765        };
766        assert_eq!(second.pos.column, 0);
767    }
768
769    #[test]
770    fn pos_line_increments_after_crlf() {
771        // CRLF is a line terminator: the next line must start on line 2.
772        let mut buf = LineBuffer::new("a\r\nb");
773        let Some(first) = buf.consume_next() else {
774            unreachable!("expected first");
775        };
776        assert_eq!(first.pos.line, 1);
777        let Some(second) = buf.consume_next() else {
778            unreachable!("expected second");
779        };
780        assert_eq!(second.pos.line, 2);
781        assert_eq!(second.pos.column, 0);
782    }
783
784    #[test]
785    fn pos_after_mixed_endings_tracks_lines_correctly() {
786        // Input has four lines with three different terminator types.
787        let mut buf = LineBuffer::new("a\nb\r\nc\rd");
788        let lines: Vec<Line<'_>> = (0..4).filter_map(|_| buf.consume_next()).collect();
789        assert_eq!(lines.len(), 4, "expected 4 lines");
790        let line_nums: Vec<usize> = lines.iter().map(|l| l.pos.line).collect();
791        assert_eq!(line_nums, [1, 2, 3, 4]);
792        for line in &lines {
793            assert_eq!(
794                line.pos.column, 0,
795                "line {} should start at column 0",
796                line.pos.line
797            );
798        }
799    }
800
801    #[test]
802    fn multibyte_content_byte_offset_is_byte_based_not_char_based() {
803        // '中' is 3 UTF-8 bytes
804        let mut buf = LineBuffer::new("中\nfoo");
805        let Some(first) = buf.consume_next() else {
806            unreachable!("expected first");
807        };
808        assert_eq!(first.offset, 0);
809        assert_eq!(first.content, "中");
810        let Some(second) = buf.consume_next() else {
811            unreachable!("expected second");
812        };
813        // 3 bytes for '中' + 1 byte for '\n' = 4
814        assert_eq!(second.offset, 4);
815    }
816
817    // -----------------------------------------------------------------------
818    // BOM handling
819    // -----------------------------------------------------------------------
820
821    #[test]
822    fn bom_is_stripped_from_content_of_first_line() {
823        let input = "\u{FEFF}foo\n";
824        let buf = LineBuffer::new(input);
825        let Some(line) = buf.peek_next() else {
826            unreachable!("expected a line");
827        };
828        assert_eq!(line.content, "foo");
829    }
830
831    #[test]
832    fn bom_stripped_line_offset_starts_after_bom_bytes() {
833        let input = "\u{FEFF}foo\n";
834        let buf = LineBuffer::new(input);
835        let Some(line) = buf.peek_next() else {
836            unreachable!("expected a line");
837        };
838        // BOM is U+FEFF = 3 bytes in UTF-8
839        assert_eq!(line.offset, 3);
840        assert_eq!(line.pos.byte_offset, 3);
841    }
842
843    #[test]
844    fn bom_only_stripped_from_first_line() {
845        // A BOM in a non-first line is preserved as data (the lexer will
846        // report it as an error).
847        let input = "foo\n\u{FEFF}bar\n";
848        let mut buf = LineBuffer::new(input);
849        buf.consume_next(); // consume "foo"
850        let Some(second) = buf.consume_next() else {
851            unreachable!("expected second");
852        };
853        assert_eq!(second.content, "\u{FEFF}bar");
854    }
855
856    // -----------------------------------------------------------------------
857    // indent counting
858    // -----------------------------------------------------------------------
859
860    #[test]
861    fn indent_counts_only_leading_spaces() {
862        let buf = LineBuffer::new("   foo");
863        let Some(line) = buf.peek_next() else {
864            unreachable!("expected a line");
865        };
866        assert_eq!(line.indent, 3);
867    }
868
869    #[test]
870    fn indent_is_zero_for_no_leading_spaces() {
871        let buf = LineBuffer::new("foo");
872        let Some(line) = buf.peek_next() else {
873            unreachable!("expected a line");
874        };
875        assert_eq!(line.indent, 0);
876    }
877
878    #[test]
879    fn leading_tab_does_not_count_toward_indent() {
880        let buf = LineBuffer::new("\tfoo");
881        let Some(line) = buf.peek_next() else {
882            unreachable!("expected a line");
883        };
884        assert_eq!(line.indent, 0);
885    }
886
887    #[test]
888    fn tab_after_spaces_does_not_count() {
889        let buf = LineBuffer::new("  \tfoo");
890        let Some(line) = buf.peek_next() else {
891            unreachable!("expected a line");
892        };
893        assert_eq!(line.indent, 2);
894    }
895
896    #[test]
897    fn indent_of_blank_line_is_zero() {
898        let buf = LineBuffer::new("\n");
899        let Some(line) = buf.peek_next() else {
900            unreachable!("expected a line");
901        };
902        assert_eq!(line.indent, 0);
903    }
904
905    #[test]
906    fn indent_of_spaces_only_line_equals_space_count() {
907        let buf = LineBuffer::new("   \n");
908        let Some(line) = buf.peek_next() else {
909            unreachable!("expected a line");
910        };
911        assert_eq!(line.indent, 3);
912        assert_eq!(line.content, "   ");
913    }
914
915    // -----------------------------------------------------------------------
916    // peek_next_indent
917    // -----------------------------------------------------------------------
918
919    #[test]
920    fn peek_next_indent_returns_indent_of_next_line() {
921        let buf = LineBuffer::new("   foo");
922        assert_eq!(buf.peek_next_indent(), Some(3));
923    }
924
925    #[test]
926    fn peek_next_indent_returns_none_at_eof() {
927        let buf = LineBuffer::new("");
928        assert_eq!(buf.peek_next_indent(), None);
929    }
930
931    #[test]
932    fn peek_next_indent_does_not_consume() {
933        let mut buf = LineBuffer::new("  foo");
934        assert_eq!(buf.peek_next_indent(), Some(2));
935        assert_eq!(buf.peek_next_indent(), Some(2));
936        let Some(line) = buf.consume_next() else {
937            unreachable!("expected a line");
938        };
939        assert_eq!(line.content, "  foo");
940    }
941
942    // -----------------------------------------------------------------------
943    // peek_until_dedent
944    // -----------------------------------------------------------------------
945
946    #[test]
947    fn peek_until_dedent_empty_input_returns_empty_slice() {
948        let mut buf = LineBuffer::new("");
949        assert!(buf.peek_until_dedent(0).is_empty());
950    }
951
952    #[test]
953    fn peek_until_dedent_returns_lines_until_indent_le_base() {
954        let mut buf = LineBuffer::new("  a\n  b\nc\n");
955        let lines = buf.peek_until_dedent(1);
956        assert_eq!(lines.len(), 2);
957        assert_eq!(lines.first().map(|l| l.content), Some("  a"));
958        assert_eq!(lines.get(1).map(|l| l.content), Some("  b"));
959    }
960
961    #[test]
962    fn peek_until_dedent_does_not_consume_lines() {
963        let mut buf = LineBuffer::new("  a\n  b\nc\n");
964        let _ = buf.peek_until_dedent(1);
965        let Some(first) = buf.consume_next() else {
966            unreachable!("expected first");
967        };
968        assert_eq!(first.content, "  a");
969    }
970
971    #[test]
972    fn peek_until_dedent_includes_all_lines_when_no_dedent_occurs() {
973        let mut buf = LineBuffer::new("  a\n  b\n  c");
974        let lines = buf.peek_until_dedent(1);
975        assert_eq!(lines.len(), 3);
976    }
977
978    #[test]
979    fn peek_until_dedent_returns_empty_slice_when_first_line_already_dedented() {
980        let mut buf = LineBuffer::new("a\n  b\n");
981        let lines = buf.peek_until_dedent(1);
982        // "a" has indent 0 <= 1, so stop immediately
983        assert!(lines.is_empty());
984    }
985
986    #[test]
987    fn peek_until_dedent_second_call_returns_same_slice() {
988        let mut buf = LineBuffer::new("  a\n  b\nc");
989        let first_call: Vec<String> = buf
990            .peek_until_dedent(1)
991            .iter()
992            .map(|l| l.content.to_owned())
993            .collect();
994        let second_call: Vec<String> = buf
995            .peek_until_dedent(1)
996            .iter()
997            .map(|l| l.content.to_owned())
998            .collect();
999        assert_eq!(first_call, second_call);
1000        assert_eq!(first_call, ["  a", "  b"]);
1001    }
1002
1003    #[test]
1004    fn peek_until_dedent_base_zero_stops_at_non_indented_lines() {
1005        // base_indent=0: stop at lines with indent <= 0 (i.e., indent == 0).
1006        // Both lines here have indent > 0, so all are included.
1007        let mut buf = LineBuffer::new("  a\n  b\n");
1008        let lines = buf.peek_until_dedent(0);
1009        assert_eq!(lines.len(), 2);
1010    }
1011
1012    #[test]
1013    fn peek_until_dedent_blank_lines_are_transparent() {
1014        // Blank lines (empty content) are transparent: they are included in
1015        // the result and do not halt the scan.
1016        // "  a" (indent 2 > 1) -> included
1017        // ""    (blank)         -> transparent, included
1018        // "  b" (indent 2 > 1) -> included
1019        // "c"   (indent 0 <= 1) -> stop
1020        let mut buf = LineBuffer::new("  a\n\n  b\nc");
1021        let lines = buf.peek_until_dedent(1);
1022        assert_eq!(lines.len(), 3);
1023        assert_eq!(lines.first().map(|l| l.content), Some("  a"));
1024        assert_eq!(lines.get(1).map(|l| l.content), Some(""));
1025        assert_eq!(lines.get(2).map(|l| l.content), Some("  b"));
1026    }
1027
1028    // -----------------------------------------------------------------------
1029    // pos_after_line
1030    // -----------------------------------------------------------------------
1031
1032    #[test]
1033    fn pos_after_line_lf_ascii() {
1034        let line = Line {
1035            content: "hello",
1036            offset: 0,
1037            indent: 0,
1038            break_type: BreakType::Lf,
1039            pos: Pos {
1040                byte_offset: 0,
1041                line: 1,
1042                column: 0,
1043            },
1044        };
1045        let result = pos_after_line(&line);
1046        assert_eq!(result.byte_offset, 6);
1047        assert_eq!(result.line, 2);
1048        assert_eq!(result.column, 0);
1049    }
1050
1051    #[test]
1052    fn pos_after_line_lf_empty_content() {
1053        let line = Line {
1054            content: "",
1055            offset: 10,
1056            indent: 0,
1057            break_type: BreakType::Lf,
1058            pos: Pos {
1059                byte_offset: 10,
1060                line: 3,
1061                column: 0,
1062            },
1063        };
1064        let result = pos_after_line(&line);
1065        assert_eq!(result.byte_offset, 11);
1066        assert_eq!(result.line, 4);
1067        assert_eq!(result.column, 0);
1068    }
1069
1070    #[test]
1071    fn pos_after_line_lf_multibyte() {
1072        let line = Line {
1073            content: "日本",
1074            offset: 0,
1075            indent: 0,
1076            break_type: BreakType::Lf,
1077            pos: Pos {
1078                byte_offset: 0,
1079                line: 1,
1080                column: 0,
1081            },
1082        };
1083        let result = pos_after_line(&line);
1084        assert_eq!(result.byte_offset, 7); // 6 bytes + 1 for \n
1085        assert_eq!(result.line, 2);
1086        assert_eq!(result.column, 0);
1087    }
1088
1089    #[test]
1090    fn pos_after_line_cr_ascii() {
1091        let line = Line {
1092            content: "abc",
1093            offset: 0,
1094            indent: 0,
1095            break_type: BreakType::Cr,
1096            pos: Pos {
1097                byte_offset: 0,
1098                line: 1,
1099                column: 0,
1100            },
1101        };
1102        let result = pos_after_line(&line);
1103        assert_eq!(result.byte_offset, 4);
1104        assert_eq!(result.line, 2);
1105        assert_eq!(result.column, 0);
1106    }
1107
1108    #[test]
1109    fn pos_after_line_cr_empty_content() {
1110        let line = Line {
1111            content: "",
1112            offset: 5,
1113            indent: 0,
1114            break_type: BreakType::Cr,
1115            pos: Pos {
1116                byte_offset: 5,
1117                line: 2,
1118                column: 0,
1119            },
1120        };
1121        let result = pos_after_line(&line);
1122        assert_eq!(result.byte_offset, 6);
1123        assert_eq!(result.line, 3);
1124        assert_eq!(result.column, 0);
1125    }
1126
1127    #[test]
1128    fn pos_after_line_crlf_ascii() {
1129        let line = Line {
1130            content: "key: val",
1131            offset: 0,
1132            indent: 0,
1133            break_type: BreakType::CrLf,
1134            pos: Pos {
1135                byte_offset: 0,
1136                line: 1,
1137                column: 0,
1138            },
1139        };
1140        let result = pos_after_line(&line);
1141        assert_eq!(result.byte_offset, 10);
1142        assert_eq!(result.line, 2);
1143        assert_eq!(result.column, 0);
1144    }
1145
1146    #[test]
1147    fn pos_after_line_crlf_empty_content() {
1148        let line = Line {
1149            content: "",
1150            offset: 0,
1151            indent: 0,
1152            break_type: BreakType::CrLf,
1153            pos: Pos {
1154                byte_offset: 0,
1155                line: 1,
1156                column: 0,
1157            },
1158        };
1159        let result = pos_after_line(&line);
1160        assert_eq!(result.byte_offset, 2);
1161        assert_eq!(result.line, 2);
1162        assert_eq!(result.column, 0);
1163    }
1164
1165    #[test]
1166    fn pos_after_line_eof_empty_content() {
1167        let line = Line {
1168            content: "",
1169            offset: 20,
1170            indent: 0,
1171            break_type: BreakType::Eof,
1172            pos: Pos {
1173                byte_offset: 20,
1174                line: 5,
1175                column: 0,
1176            },
1177        };
1178        let result = pos_after_line(&line);
1179        assert_eq!(result.byte_offset, 20);
1180        assert_eq!(result.line, 5);
1181        assert_eq!(result.column, 0);
1182    }
1183
1184    #[test]
1185    fn pos_after_line_eof_ascii() {
1186        let line = Line {
1187            content: "last",
1188            offset: 10,
1189            indent: 0,
1190            break_type: BreakType::Eof,
1191            pos: Pos {
1192                byte_offset: 10,
1193                line: 3,
1194                column: 0,
1195            },
1196        };
1197        let result = pos_after_line(&line);
1198        assert_eq!(result.byte_offset, 14);
1199        assert_eq!(result.line, 3);
1200        assert_eq!(result.column, 4);
1201    }
1202
1203    #[test]
1204    fn pos_after_line_eof_ascii_nonzero_start_column() {
1205        let line = Line {
1206            content: "end",
1207            offset: 7,
1208            indent: 0,
1209            break_type: BreakType::Eof,
1210            pos: Pos {
1211                byte_offset: 7,
1212                line: 2,
1213                column: 5,
1214            },
1215        };
1216        let result = pos_after_line(&line);
1217        assert_eq!(result.byte_offset, 10);
1218        assert_eq!(result.line, 2);
1219        assert_eq!(result.column, 8);
1220    }
1221
1222    #[test]
1223    fn pos_after_line_eof_multibyte() {
1224        let line = Line {
1225            content: "日本語",
1226            offset: 0,
1227            indent: 0,
1228            break_type: BreakType::Eof,
1229            pos: Pos {
1230                byte_offset: 0,
1231                line: 1,
1232                column: 0,
1233            },
1234        };
1235        let result = pos_after_line(&line);
1236        assert_eq!(result.byte_offset, 9);
1237        assert_eq!(result.line, 1);
1238        assert_eq!(result.column, 3);
1239    }
1240
1241    #[test]
1242    fn pos_after_line_eof_mixed_content() {
1243        let line = Line {
1244            content: "ab日",
1245            offset: 0,
1246            indent: 0,
1247            break_type: BreakType::Eof,
1248            pos: Pos {
1249                byte_offset: 0,
1250                line: 1,
1251                column: 0,
1252            },
1253        };
1254        let result = pos_after_line(&line);
1255        assert_eq!(result.byte_offset, 5);
1256        assert_eq!(result.line, 1);
1257        assert_eq!(result.column, 3);
1258    }
1259}