Skip to main content

rlsp_yaml_parser/
lines.rs

1// SPDX-License-Identifier: MIT
2
3//! Line-at-a-time buffer with one-line lookahead for the streaming parser.
4//!
5//! `LineBuffer` wraps an `&'input str` and yields one [`Line`] at a time,
6//! always keeping the *next* line primed in an internal slot so callers can
7//! peek at the next line's indent without consuming it.  It never scans the
8//! full input up front, giving O(1) first-event latency.
9
10use std::collections::VecDeque;
11
12use crate::pos::Pos;
13
14// ---------------------------------------------------------------------------
15// Public types
16// ---------------------------------------------------------------------------
17
/// How a [`Line`] is terminated in the source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BreakType {
    /// A single line feed, `\n`.
    Lf,
    /// A lone carriage return, `\r`, that is *not* followed by `\n`.
    Cr,
    /// The two-character sequence `\r\n`, treated as one terminator.
    CrLf,
    /// No terminator at all: the line runs up to the end of the input.
    Eof,
}
30
31impl BreakType {
32    /// Advance `pos` past this line break.
33    ///
34    /// Each break type requires distinct logic because `Pos::advance(char)`
35    /// operates on individual characters and cannot distinguish bare `\r`
36    /// from `\r\n`.
37    #[must_use]
38    pub const fn advance(self, mut pos: Pos) -> Pos {
39        match self {
40            Self::Lf => pos.advance('\n'),
41            Self::CrLf => {
42                pos.byte_offset += '\r'.len_utf8();
43                pos.char_offset += 1;
44                pos.advance('\n')
45            }
46            Self::Cr => {
47                pos.byte_offset += '\r'.len_utf8();
48                pos.char_offset += 1;
49                pos.line += 1;
50                pos.column = 0;
51                pos
52            }
53            Self::Eof => pos,
54        }
55    }
56}
57
58/// A single logical line extracted from the input.
59#[derive(Debug, Clone, PartialEq, Eq)]
60pub struct Line<'input> {
61    /// The line content slice, **excluding** the terminator.
62    pub content: &'input str,
63    /// Byte offset of `content` within the original input string.
64    pub offset: usize,
65    /// Number of leading `SPACE` (`\x20`) characters.  Leading tabs do not
66    /// contribute to indent — they are a YAML syntax error in indentation
67    /// context and are reported by the lexer, not here.
68    pub indent: usize,
69    /// The terminator that ends this line.
70    pub break_type: BreakType,
71    /// Position of the first byte of this line (after BOM stripping on line 1).
72    pub pos: Pos,
73}
74
75// ---------------------------------------------------------------------------
76// Internal helpers
77// ---------------------------------------------------------------------------
78
79/// Detect the line break at the start of `s` and return `(BreakType, rest)`.
80///
81/// CRLF is checked first so that `\r\n` is consumed as a unit rather than
82/// treating `\r` as a bare CR.
83fn detect_break(s: &str) -> (BreakType, &str) {
84    if let Some(rest) = s.strip_prefix("\r\n") {
85        return (BreakType::CrLf, rest);
86    }
87    if let Some(rest) = s.strip_prefix('\r') {
88        return (BreakType::Cr, rest);
89    }
90    if let Some(rest) = s.strip_prefix('\n') {
91        return (BreakType::Lf, rest);
92    }
93    (BreakType::Eof, s)
94}
95
96/// Scan one line from `remaining`, starting at `pos`.
97///
98/// `is_first` controls BOM stripping: if `true` and the slice starts with
99/// U+FEFF (UTF-8 BOM, 3 bytes), the BOM is skipped before content begins.
100///
101/// Returns `Some((line, rest))` or `None` if `remaining` is empty.
102fn scan_line(remaining: &str, pos: Pos, is_first: bool) -> Option<(Line<'_>, &str)> {
103    if remaining.is_empty() {
104        return None;
105    }
106
107    // Strip BOM on first line only.
108    let (content_start, pos) = if is_first && remaining.starts_with('\u{FEFF}') {
109        let bom_len = '\u{FEFF}'.len_utf8(); // 3 bytes
110        (
111            &remaining[bom_len..],
112            Pos {
113                byte_offset: pos.byte_offset + bom_len,
114                char_offset: pos.char_offset + 1,
115                ..pos
116            },
117        )
118    } else {
119        (remaining, pos)
120    };
121
122    // Find the end of line content (position of the first \n or \r).
123    let line_end = content_start
124        .find(['\n', '\r'])
125        .unwrap_or(content_start.len());
126
127    let content = &content_start[..line_end];
128    let after_content = &content_start[line_end..];
129
130    // Determine break type and advance past the terminator.
131    // Try CRLF first (must be checked before bare CR).
132    let (break_type, after_break) = detect_break(after_content);
133
134    // Count leading SPACE characters only (tabs do not count).
135    let indent = content.chars().take_while(|&ch| ch == ' ').count();
136
137    // `offset` is the byte offset of `content` within the *original* input.
138    // `pos` already reflects the position after any BOM skip.
139    let offset = pos.byte_offset;
140
141    let line = Line {
142        content,
143        offset,
144        indent,
145        break_type,
146        pos,
147    };
148
149    Some((line, after_break))
150}
151
152// ---------------------------------------------------------------------------
153// LineBuffer
154// ---------------------------------------------------------------------------
155
156/// A one-line-lookahead buffer over a `&'input str`.
157///
158/// Always holds the *next* line pre-parsed.  Callers use [`Self::peek_next`]
159/// to inspect without consuming and [`Self::consume_next`] to advance.
160pub struct LineBuffer<'input> {
161    /// Remaining unparsed input (past the next line's terminator).
162    remaining: &'input str,
163    /// Synthetic lines prepended by the caller (e.g. inline content extracted
164    /// from a sequence- or mapping-entry line).  Drained front-first before
165    /// `next`.  A `VecDeque` supports multiple pending prepends when parsing
166    /// implicit mapping entries that need to inject both key and value lines.
167    prepend: VecDeque<Line<'input>>,
168    /// The pre-parsed next line, if any.
169    next: Option<Line<'input>>,
170    /// Position at the start of `remaining`.
171    remaining_pos: Pos,
172    /// Whether the next line to be parsed from `remaining` is the first line
173    /// of input (used for BOM detection after the initial prime).
174    remaining_is_first: bool,
175    /// Lookahead buffer for [`Self::peek_until_dedent`].
176    lookahead: Vec<Line<'input>>,
177}
178
179impl<'input> LineBuffer<'input> {
180    /// Construct a new `LineBuffer` and prime the next-line slot.
181    #[must_use]
182    pub fn new(input: &'input str) -> Self {
183        let mut buf = Self {
184            remaining: input,
185            prepend: VecDeque::new(),
186            next: None,
187            remaining_pos: Pos::ORIGIN,
188            remaining_is_first: true,
189            lookahead: Vec::new(),
190        };
191        buf.prime();
192        buf
193    }
194
195    /// Prepend a synthetic line that will be returned by the next call to
196    /// [`Self::peek_next`] / [`Self::consume_next`], ahead of any real lines.
197    ///
198    /// Used to re-present inline content extracted from a sequence- or
199    /// mapping-entry line as if it were a separate line.  Multiple prepends
200    /// are supported: each call pushes to the front of the queue, so the last
201    /// prepended line is returned first (LIFO order).  Callers that need FIFO
202    /// order (key before value) should prepend value first, then key.
203    pub fn prepend_line(&mut self, line: Line<'input>) {
204        self.lookahead.clear();
205        self.prepend.push_front(line);
206    }
207
208    /// Look at the next line without consuming it.
209    ///
210    /// Returns the frontmost prepended synthetic line first (if any), then the
211    /// normally buffered next line.
212    #[must_use]
213    pub fn peek_next(&self) -> Option<&Line<'input>> {
214        self.prepend.front().or(self.next.as_ref())
215    }
216
217    /// Returns `true` if the next line comes from the prepend queue (synthetic),
218    /// rather than from the original input stream.
219    #[must_use]
220    pub fn is_next_synthetic(&self) -> bool {
221        !self.prepend.is_empty()
222    }
223
224    /// Convenience: the indent of the next line, without consuming it.
225    #[must_use]
226    pub fn peek_next_indent(&self) -> Option<usize> {
227        self.peek_next().map(|l| l.indent)
228    }
229
230    /// Peek at the second upcoming line without consuming either.
231    ///
232    /// Handles the prepend queue: the second line may come from the prepend
233    /// queue or from the primed `next` slot or from `remaining`.
234    #[must_use]
235    pub fn peek_second(&self) -> Option<Line<'input>> {
236        // Determine where the "first" line comes from, then find the "second".
237        if !self.prepend.is_empty() {
238            // First line is prepend[0]. Second is prepend[1] if it exists,
239            // else self.next.
240            if self.prepend.len() >= 2 {
241                return self.prepend.get(1).cloned();
242            }
243            return self.next.clone();
244        }
245        // First line is self.next. Second is the first line from `remaining`.
246        self.next.as_ref()?; // ensure first exists
247        scan_line(self.remaining, self.remaining_pos, self.remaining_is_first).map(|(line, _)| line)
248    }
249
250    /// Advance: return the currently primed next line and prime the following
251    /// one from the remaining input.  Returns `None` when no lines remain.
252    ///
253    /// Drains prepended synthetic lines (front-first) before the real buffer.
254    pub fn consume_next(&mut self) -> Option<Line<'input>> {
255        // Drain prepend queue front-first.
256        if let Some(line) = self.prepend.pop_front() {
257            return Some(line);
258        }
259        // Clear any cached lookahead — it was based on the old position.
260        self.lookahead.clear();
261        let line = self.next.take()?;
262        self.prime();
263        Some(line)
264    }
265
266    /// True when no more lines are available (buffer is empty, no prepend, and
267    /// input is exhausted).
268    #[must_use]
269    pub fn at_eof(&self) -> bool {
270        self.prepend.is_empty() && self.next.is_none()
271    }
272
273    /// Scan forward without consuming to collect all lines with
274    /// `indent > base_indent`, stopping at the first line with
275    /// `indent <= base_indent`.  Blank lines (empty content) are transparent
276    /// to the scan and are included in the result regardless of their indent.
277    ///
278    /// Returns a slice of the buffered lookahead lines.  Calling this method
279    /// repeatedly (without consuming) returns the same slice.
280    ///
281    /// Note: trailing blank lines in the returned slice are **not** part of
282    /// the block scalar content — per YAML chomping rules, trailing blank
283    /// lines are stripped, clipped, or kept based on the chomping indicator.
284    /// The consumer (lexer, Task 8) is responsible for trimming them.
285    pub fn peek_until_dedent(&mut self, base_indent: usize) -> &[Line<'input>] {
286        // Rebuild the lookahead starting from the next line.
287        self.lookahead.clear();
288
289        // We need to scan from the next primed line plus additional lines
290        // from `remaining`.  Use a local cursor.
291        let mut cursor_remaining = self.remaining;
292        let mut cursor_pos = self.remaining_pos;
293        let mut cursor_is_first = self.remaining_is_first;
294
295        // The first line in the lookahead is `self.next` (if any).
296        // We include it if it is blank or its indent > base_indent.
297        let start_line = match self.next.as_ref() {
298            None => return &self.lookahead,
299            Some(l) => l.clone(),
300        };
301
302        // Process lines in order: start with `self.next`, then scan from
303        // `remaining`.
304        let mut scanning_next = Some(start_line);
305
306        loop {
307            let line = match scanning_next.take() {
308                Some(l) => l,
309                None => {
310                    // Fetch from remaining input.
311                    match scan_line(cursor_remaining, cursor_pos, cursor_is_first) {
312                        None => break,
313                        Some((l, rest)) => {
314                            cursor_pos = advance_pos_past_line(&l);
315                            cursor_remaining = rest;
316                            cursor_is_first = false;
317                            l
318                        }
319                    }
320                }
321            };
322
323            // Blank lines (empty content) are transparent: include them and
324            // keep scanning.
325            if line.content.is_empty() {
326                self.lookahead.push(line);
327                continue;
328            }
329
330            // Stop before the first non-blank line that is dedented.
331            // base_indent == usize::MAX is the "root level" sentinel meaning
332            // no indent threshold — include all non-blank lines.
333            if base_indent != usize::MAX && line.indent <= base_indent {
334                break;
335            }
336
337            self.lookahead.push(line);
338        }
339
340        &self.lookahead
341    }
342
343    // -----------------------------------------------------------------------
344    // Private helpers
345    // -----------------------------------------------------------------------
346
347    /// Parse one more line from `remaining` into `self.next`.
348    fn prime(&mut self) {
349        match scan_line(self.remaining, self.remaining_pos, self.remaining_is_first) {
350            None => {
351                self.next = None;
352            }
353            Some((line, rest)) => {
354                // Advance `remaining_pos` past the line we just parsed.
355                let new_pos = advance_pos_past_line(&line);
356                self.remaining_pos = new_pos;
357                self.remaining = rest;
358                self.remaining_is_first = false;
359                self.next = Some(line);
360            }
361        }
362    }
363}
364
365/// Compute the `Pos` after the terminator of `line`.
366///
367/// Walks the content characters then advances past the terminator.
368///
369/// Each break type requires distinct `Pos` update logic because
370/// `Pos::advance(char)` operates on individual characters and cannot
371/// distinguish a bare `\r` (which ends a line) from a `\r` that is part of
372/// a `\r\n` pair.  This function knows the `BreakType` and updates
373/// `line`/`column` accordingly:
374///
375/// - `Lf`: delegate to `Pos::advance('\n')` — it already bumps `line`.
376/// - `CrLf`: bump `byte_offset`/`char_offset` for the `\r`, then delegate
377///   to `Pos::advance('\n')` for the `\n` (which bumps `line`).
378/// - `Cr`: bump `byte_offset`/`char_offset` and increment `line`/reset
379///   `column` directly — `Pos::advance('\r')` would not bump `line`.
380/// - `Eof`: no bytes to advance.
381fn advance_pos_past_line(line: &Line<'_>) -> Pos {
382    let mut pos = line.pos;
383    for ch in line.content.chars() {
384        pos = pos.advance(ch);
385    }
386    line.break_type.advance(pos)
387}
388
389// ---------------------------------------------------------------------------
390// Tests
391// ---------------------------------------------------------------------------
392
393#[cfg(test)]
394mod tests {
395    use super::*;
396
397    // -----------------------------------------------------------------------
398    // BreakType::advance
399    // -----------------------------------------------------------------------
400
401    #[test]
402    fn break_type_advance_lf() {
403        let pos = Pos::ORIGIN;
404        let after = BreakType::Lf.advance(pos);
405        assert_eq!(after.byte_offset, 1);
406        assert_eq!(after.char_offset, 1);
407        assert_eq!(after.line, 2);
408        assert_eq!(after.column, 0);
409    }
410
411    #[test]
412    fn break_type_advance_crlf() {
413        let pos = Pos::ORIGIN;
414        let after = BreakType::CrLf.advance(pos);
415        // \r = 1 byte, \n = 1 byte → 2 bytes total
416        assert_eq!(after.byte_offset, 2);
417        assert_eq!(after.char_offset, 2);
418        assert_eq!(after.line, 2);
419        assert_eq!(after.column, 0);
420    }
421
422    #[test]
423    fn break_type_advance_cr_increments_line() {
424        let pos = Pos::ORIGIN;
425        let after = BreakType::Cr.advance(pos);
426        assert_eq!(after.line, 2);
427    }
428
429    #[test]
430    fn break_type_advance_cr_resets_column() {
431        let pos = Pos {
432            byte_offset: 3,
433            char_offset: 3,
434            line: 1,
435            column: 3,
436        };
437        let after = BreakType::Cr.advance(pos);
438        assert_eq!(after.column, 0);
439        assert_eq!(after.byte_offset, 4); // \r = 1 byte
440        assert_eq!(after.line, 2);
441    }
442
443    #[test]
444    fn break_type_advance_lf_at_non_origin_pos() {
445        let pos = Pos {
446            byte_offset: 5,
447            char_offset: 5,
448            line: 2,
449            column: 3,
450        };
451        let after = BreakType::Lf.advance(pos);
452        assert_eq!(after.byte_offset, 6);
453        assert_eq!(after.char_offset, 6);
454        assert_eq!(after.line, 3);
455        assert_eq!(after.column, 0);
456    }
457
458    #[test]
459    fn break_type_advance_crlf_at_non_origin_pos() {
460        let pos = Pos {
461            byte_offset: 5,
462            char_offset: 5,
463            line: 2,
464            column: 3,
465        };
466        let after = BreakType::CrLf.advance(pos);
467        assert_eq!(after.byte_offset, 7); // \r (1) + \n (1) = +2
468        assert_eq!(after.char_offset, 7);
469        assert_eq!(after.line, 3);
470        assert_eq!(after.column, 0);
471    }
472
473    #[test]
474    fn break_type_advance_eof_is_noop() {
475        let pos = Pos {
476            byte_offset: 5,
477            char_offset: 4,
478            line: 3,
479            column: 2,
480        };
481        let after = BreakType::Eof.advance(pos);
482        assert_eq!(after, pos);
483    }
484
485    // -----------------------------------------------------------------------
486    // new and initial state
487    // -----------------------------------------------------------------------
488
489    #[test]
490    fn new_empty_input_at_eof_immediately() {
491        let buf = LineBuffer::new("");
492        assert!(buf.peek_next().is_none());
493        assert!(buf.at_eof());
494    }
495
496    #[test]
497    fn new_single_line_no_newline_primes_eof_line() {
498        let buf = LineBuffer::new("foo");
499        let Some(line) = buf.peek_next() else {
500            unreachable!("expected a line");
501        };
502        assert_eq!(line.content, "foo");
503        assert_eq!(line.break_type, BreakType::Eof);
504        assert_eq!(line.offset, 0);
505    }
506
507    #[test]
508    fn new_single_line_with_lf_primes_first_line() {
509        let buf = LineBuffer::new("foo\n");
510        let Some(line) = buf.peek_next() else {
511            unreachable!("expected a line");
512        };
513        assert_eq!(line.content, "foo");
514        assert_eq!(line.break_type, BreakType::Lf);
515    }
516
517    #[test]
518    fn new_input_with_only_lf_primes_empty_line() {
519        let buf = LineBuffer::new("\n");
520        let Some(line) = buf.peek_next() else {
521            unreachable!("expected a line");
522        };
523        assert_eq!(line.content, "");
524        assert_eq!(line.break_type, BreakType::Lf);
525    }
526
527    // -----------------------------------------------------------------------
528    // consume_next sequencing
529    // -----------------------------------------------------------------------
530
531    #[test]
532    fn consume_returns_primed_line_and_advances() {
533        let mut buf = LineBuffer::new("a\nb\n");
534        let Some(first) = buf.consume_next() else {
535            unreachable!("expected first line");
536        };
537        assert_eq!(first.content, "a");
538        assert_eq!(first.break_type, BreakType::Lf);
539        let Some(second) = buf.consume_next() else {
540            unreachable!("expected second line");
541        };
542        assert_eq!(second.content, "b");
543        assert_eq!(second.break_type, BreakType::Lf);
544    }
545
546    #[test]
547    fn consume_after_last_line_returns_none() {
548        let mut buf = LineBuffer::new("foo");
549        assert!(buf.consume_next().is_some());
550        assert!(buf.consume_next().is_none());
551    }
552
553    #[test]
554    fn at_eof_false_before_consuming_last_and_true_after() {
555        let mut buf = LineBuffer::new("foo");
556        assert!(!buf.at_eof());
557        buf.consume_next();
558        assert!(buf.at_eof());
559    }
560
561    #[test]
562    fn consume_all_lines_then_peek_returns_none() {
563        let mut buf = LineBuffer::new("a\nb");
564        buf.consume_next();
565        buf.consume_next();
566        assert!(buf.peek_next().is_none());
567    }
568
569    // -----------------------------------------------------------------------
570    // line terminator types
571    // -----------------------------------------------------------------------
572
573    #[test]
574    fn lf_terminator_produces_lf_break_type() {
575        let mut buf = LineBuffer::new("a\n");
576        let Some(line) = buf.consume_next() else {
577            unreachable!("expected a line");
578        };
579        assert_eq!(line.break_type, BreakType::Lf);
580    }
581
582    #[test]
583    fn crlf_terminator_produces_crlf_break_type_not_two_lines() {
584        let mut buf = LineBuffer::new("a\r\nb");
585        let Some(first) = buf.consume_next() else {
586            unreachable!("expected first");
587        };
588        assert_eq!(first.content, "a");
589        assert_eq!(first.break_type, BreakType::CrLf);
590        let Some(second) = buf.consume_next() else {
591            unreachable!("expected second");
592        };
593        assert_eq!(second.content, "b");
594        assert_eq!(second.break_type, BreakType::Eof);
595        assert!(buf.consume_next().is_none());
596    }
597
598    #[test]
599    fn bare_cr_terminator_produces_cr_break_type() {
600        let mut buf = LineBuffer::new("a\rb");
601        let Some(first) = buf.consume_next() else {
602            unreachable!("expected first");
603        };
604        assert_eq!(first.content, "a");
605        assert_eq!(first.break_type, BreakType::Cr);
606        let Some(second) = buf.consume_next() else {
607            unreachable!("expected second");
608        };
609        assert_eq!(second.content, "b");
610        assert_eq!(second.break_type, BreakType::Eof);
611    }
612
613    #[test]
614    fn no_terminator_on_last_line_produces_eof_break_type() {
615        let mut buf = LineBuffer::new("a\nb");
616        buf.consume_next();
617        let Some(second) = buf.consume_next() else {
618            unreachable!("expected second");
619        };
620        assert_eq!(second.content, "b");
621        assert_eq!(second.break_type, BreakType::Eof);
622    }
623
624    #[test]
625    fn mixed_line_endings_each_line_has_correct_break_type() {
626        let mut buf = LineBuffer::new("a\nb\r\nc\rd");
627        let types: Vec<BreakType> = (0..4)
628            .filter_map(|_| buf.consume_next().map(|l| l.break_type))
629            .collect();
630        assert_eq!(
631            types,
632            [
633                BreakType::Lf,
634                BreakType::CrLf,
635                BreakType::Cr,
636                BreakType::Eof
637            ]
638        );
639    }
640
641    #[test]
642    fn only_crlf_produces_one_empty_line_not_two() {
643        let mut buf = LineBuffer::new("\r\n");
644        let Some(line) = buf.consume_next() else {
645            unreachable!("expected a line");
646        };
647        assert_eq!(line.content, "");
648        assert_eq!(line.break_type, BreakType::CrLf);
649        assert!(buf.consume_next().is_none());
650    }
651
652    #[test]
653    fn only_cr_produces_one_empty_line() {
654        let mut buf = LineBuffer::new("\r");
655        let Some(line) = buf.consume_next() else {
656            unreachable!("expected a line");
657        };
658        assert_eq!(line.content, "");
659        assert_eq!(line.break_type, BreakType::Cr);
660        assert!(buf.consume_next().is_none());
661    }
662
663    #[test]
664    fn only_lf_produces_one_empty_line() {
665        let mut buf = LineBuffer::new("\n");
666        let Some(line) = buf.consume_next() else {
667            unreachable!("expected a line");
668        };
669        assert_eq!(line.content, "");
670        assert_eq!(line.break_type, BreakType::Lf);
671        assert!(buf.consume_next().is_none());
672    }
673
674    #[test]
675    fn two_consecutive_lf_produce_two_empty_lines() {
676        let mut buf = LineBuffer::new("\n\n");
677        let Some(first) = buf.consume_next() else {
678            unreachable!("expected first");
679        };
680        assert_eq!(first.content, "");
681        assert_eq!(first.break_type, BreakType::Lf);
682        let Some(second) = buf.consume_next() else {
683            unreachable!("expected second");
684        };
685        assert_eq!(second.content, "");
686        assert_eq!(second.break_type, BreakType::Lf);
687        assert!(buf.consume_next().is_none());
688    }
689
690    #[test]
691    fn trailing_lf_does_not_produce_extra_empty_line() {
692        // A trailing newline terminates the last line; it does not introduce
693        // a new empty line.
694        let mut buf = LineBuffer::new("foo\n");
695        let Some(line) = buf.consume_next() else {
696            unreachable!("expected a line");
697        };
698        assert_eq!(line.content, "foo");
699        assert!(buf.consume_next().is_none());
700    }
701
702    // -----------------------------------------------------------------------
703    // offset and Pos tracking
704    // -----------------------------------------------------------------------
705
706    #[test]
707    fn offset_is_byte_offset_of_content_start() {
708        let mut buf = LineBuffer::new("foo\nbar\n");
709        let Some(first) = buf.consume_next() else {
710            unreachable!("expected first");
711        };
712        assert_eq!(first.offset, 0);
713        let Some(second) = buf.consume_next() else {
714            unreachable!("expected second");
715        };
716        assert_eq!(second.offset, 4); // "foo\n" = 4 bytes
717    }
718
719    #[test]
720    fn offset_and_pos_byte_offset_agree() {
721        let mut buf = LineBuffer::new("foo\nbar");
722        while let Some(line) = buf.consume_next() {
723            assert_eq!(line.offset, line.pos.byte_offset);
724        }
725    }
726
727    #[test]
728    fn pos_line_number_increments_per_line() {
729        let mut buf = LineBuffer::new("a\nb\nc");
730        let lines: Vec<Line<'_>> = (0..3).filter_map(|_| buf.consume_next()).collect();
731        assert_eq!(lines.len(), 3, "expected 3 lines");
732        assert_eq!(lines.first().map(|l| l.pos.line), Some(1));
733        assert_eq!(lines.get(1).map(|l| l.pos.line), Some(2));
734        assert_eq!(lines.get(2).map(|l| l.pos.line), Some(3));
735    }
736
737    #[test]
738    fn pos_column_is_zero_at_start_of_each_line() {
739        let mut buf = LineBuffer::new("a\nb");
740        while let Some(line) = buf.consume_next() {
741            assert_eq!(line.pos.column, 0);
742        }
743    }
744
745    #[test]
746    fn pos_line_increments_after_bare_cr() {
747        // Bare \r is a line terminator: the next line must start on line 2.
748        let mut buf = LineBuffer::new("a\rb");
749        let Some(first) = buf.consume_next() else {
750            unreachable!("expected first");
751        };
752        assert_eq!(first.pos.line, 1);
753        let Some(second) = buf.consume_next() else {
754            unreachable!("expected second");
755        };
756        assert_eq!(second.pos.line, 2);
757        assert_eq!(second.pos.column, 0);
758    }
759
760    #[test]
761    fn pos_column_resets_after_bare_cr() {
762        // After consuming a line that ends with bare \r, the next line's
763        // column must be 0, not the column that followed the last content char.
764        let mut buf = LineBuffer::new("abc\rd");
765        buf.consume_next(); // consume "abc"
766        let Some(second) = buf.consume_next() else {
767            unreachable!("expected second");
768        };
769        assert_eq!(second.pos.column, 0);
770    }
771
772    #[test]
773    fn pos_line_increments_after_crlf() {
774        // CRLF is a line terminator: the next line must start on line 2.
775        let mut buf = LineBuffer::new("a\r\nb");
776        let Some(first) = buf.consume_next() else {
777            unreachable!("expected first");
778        };
779        assert_eq!(first.pos.line, 1);
780        let Some(second) = buf.consume_next() else {
781            unreachable!("expected second");
782        };
783        assert_eq!(second.pos.line, 2);
784        assert_eq!(second.pos.column, 0);
785    }
786
787    #[test]
788    fn pos_after_mixed_endings_tracks_lines_correctly() {
789        // Input has four lines with three different terminator types.
790        let mut buf = LineBuffer::new("a\nb\r\nc\rd");
791        let lines: Vec<Line<'_>> = (0..4).filter_map(|_| buf.consume_next()).collect();
792        assert_eq!(lines.len(), 4, "expected 4 lines");
793        let line_nums: Vec<usize> = lines.iter().map(|l| l.pos.line).collect();
794        assert_eq!(line_nums, [1, 2, 3, 4]);
795        for line in &lines {
796            assert_eq!(
797                line.pos.column, 0,
798                "line {} should start at column 0",
799                line.pos.line
800            );
801        }
802    }
803
804    #[test]
805    fn multibyte_content_byte_offset_is_byte_based_not_char_based() {
806        // '中' is 3 UTF-8 bytes
807        let mut buf = LineBuffer::new("中\nfoo");
808        let Some(first) = buf.consume_next() else {
809            unreachable!("expected first");
810        };
811        assert_eq!(first.offset, 0);
812        assert_eq!(first.content, "中");
813        let Some(second) = buf.consume_next() else {
814            unreachable!("expected second");
815        };
816        // 3 bytes for '中' + 1 byte for '\n' = 4
817        assert_eq!(second.offset, 4);
818    }
819
820    // -----------------------------------------------------------------------
821    // BOM handling
822    // -----------------------------------------------------------------------
823
824    #[test]
825    fn bom_is_stripped_from_content_of_first_line() {
826        let input = "\u{FEFF}foo\n";
827        let buf = LineBuffer::new(input);
828        let Some(line) = buf.peek_next() else {
829            unreachable!("expected a line");
830        };
831        assert_eq!(line.content, "foo");
832    }
833
834    #[test]
835    fn bom_stripped_line_offset_starts_after_bom_bytes() {
836        let input = "\u{FEFF}foo\n";
837        let buf = LineBuffer::new(input);
838        let Some(line) = buf.peek_next() else {
839            unreachable!("expected a line");
840        };
841        // BOM is U+FEFF = 3 bytes in UTF-8
842        assert_eq!(line.offset, 3);
843        assert_eq!(line.pos.byte_offset, 3);
844    }
845
846    #[test]
847    fn bom_only_stripped_from_first_line() {
848        // A BOM in a non-first line is preserved as data (the lexer will
849        // report it as an error).
850        let input = "foo\n\u{FEFF}bar\n";
851        let mut buf = LineBuffer::new(input);
852        buf.consume_next(); // consume "foo"
853        let Some(second) = buf.consume_next() else {
854            unreachable!("expected second");
855        };
856        assert_eq!(second.content, "\u{FEFF}bar");
857    }
858
859    // -----------------------------------------------------------------------
860    // indent counting
861    // -----------------------------------------------------------------------
862
863    #[test]
864    fn indent_counts_only_leading_spaces() {
865        let buf = LineBuffer::new("   foo");
866        let Some(line) = buf.peek_next() else {
867            unreachable!("expected a line");
868        };
869        assert_eq!(line.indent, 3);
870    }
871
872    #[test]
873    fn indent_is_zero_for_no_leading_spaces() {
874        let buf = LineBuffer::new("foo");
875        let Some(line) = buf.peek_next() else {
876            unreachable!("expected a line");
877        };
878        assert_eq!(line.indent, 0);
879    }
880
881    #[test]
882    fn leading_tab_does_not_count_toward_indent() {
883        let buf = LineBuffer::new("\tfoo");
884        let Some(line) = buf.peek_next() else {
885            unreachable!("expected a line");
886        };
887        assert_eq!(line.indent, 0);
888    }
889
890    #[test]
891    fn tab_after_spaces_does_not_count() {
892        let buf = LineBuffer::new("  \tfoo");
893        let Some(line) = buf.peek_next() else {
894            unreachable!("expected a line");
895        };
896        assert_eq!(line.indent, 2);
897    }
898
899    #[test]
900    fn indent_of_blank_line_is_zero() {
901        let buf = LineBuffer::new("\n");
902        let Some(line) = buf.peek_next() else {
903            unreachable!("expected a line");
904        };
905        assert_eq!(line.indent, 0);
906    }
907
908    #[test]
909    fn indent_of_spaces_only_line_equals_space_count() {
910        let buf = LineBuffer::new("   \n");
911        let Some(line) = buf.peek_next() else {
912            unreachable!("expected a line");
913        };
914        assert_eq!(line.indent, 3);
915        assert_eq!(line.content, "   ");
916    }
917
918    // -----------------------------------------------------------------------
919    // peek_next_indent
920    // -----------------------------------------------------------------------
921
922    #[test]
923    fn peek_next_indent_returns_indent_of_next_line() {
924        let buf = LineBuffer::new("   foo");
925        assert_eq!(buf.peek_next_indent(), Some(3));
926    }
927
928    #[test]
929    fn peek_next_indent_returns_none_at_eof() {
930        let buf = LineBuffer::new("");
931        assert_eq!(buf.peek_next_indent(), None);
932    }
933
934    #[test]
935    fn peek_next_indent_does_not_consume() {
936        let mut buf = LineBuffer::new("  foo");
937        assert_eq!(buf.peek_next_indent(), Some(2));
938        assert_eq!(buf.peek_next_indent(), Some(2));
939        let Some(line) = buf.consume_next() else {
940            unreachable!("expected a line");
941        };
942        assert_eq!(line.content, "  foo");
943    }
944
945    // -----------------------------------------------------------------------
946    // peek_until_dedent
947    // -----------------------------------------------------------------------
948
949    #[test]
950    fn peek_until_dedent_empty_input_returns_empty_slice() {
951        let mut buf = LineBuffer::new("");
952        assert!(buf.peek_until_dedent(0).is_empty());
953    }
954
955    #[test]
956    fn peek_until_dedent_returns_lines_until_indent_le_base() {
957        let mut buf = LineBuffer::new("  a\n  b\nc\n");
958        let lines = buf.peek_until_dedent(1);
959        assert_eq!(lines.len(), 2);
960        assert_eq!(lines.first().map(|l| l.content), Some("  a"));
961        assert_eq!(lines.get(1).map(|l| l.content), Some("  b"));
962    }
963
964    #[test]
965    fn peek_until_dedent_does_not_consume_lines() {
966        let mut buf = LineBuffer::new("  a\n  b\nc\n");
967        let _ = buf.peek_until_dedent(1);
968        let Some(first) = buf.consume_next() else {
969            unreachable!("expected first");
970        };
971        assert_eq!(first.content, "  a");
972    }
973
974    #[test]
975    fn peek_until_dedent_includes_all_lines_when_no_dedent_occurs() {
976        let mut buf = LineBuffer::new("  a\n  b\n  c");
977        let lines = buf.peek_until_dedent(1);
978        assert_eq!(lines.len(), 3);
979    }
980
981    #[test]
982    fn peek_until_dedent_returns_empty_slice_when_first_line_already_dedented() {
983        let mut buf = LineBuffer::new("a\n  b\n");
984        let lines = buf.peek_until_dedent(1);
985        // "a" has indent 0 <= 1, so stop immediately
986        assert!(lines.is_empty());
987    }
988
989    #[test]
990    fn peek_until_dedent_second_call_returns_same_slice() {
991        let mut buf = LineBuffer::new("  a\n  b\nc");
992        let first_call: Vec<String> = buf
993            .peek_until_dedent(1)
994            .iter()
995            .map(|l| l.content.to_owned())
996            .collect();
997        let second_call: Vec<String> = buf
998            .peek_until_dedent(1)
999            .iter()
1000            .map(|l| l.content.to_owned())
1001            .collect();
1002        assert_eq!(first_call, second_call);
1003        assert_eq!(first_call, ["  a", "  b"]);
1004    }
1005
1006    #[test]
1007    fn peek_until_dedent_base_zero_stops_at_non_indented_lines() {
1008        // base_indent=0: stop at lines with indent <= 0 (i.e., indent == 0).
1009        // Both lines here have indent > 0, so all are included.
1010        let mut buf = LineBuffer::new("  a\n  b\n");
1011        let lines = buf.peek_until_dedent(0);
1012        assert_eq!(lines.len(), 2);
1013    }
1014
1015    #[test]
1016    fn peek_until_dedent_blank_lines_are_transparent() {
1017        // Blank lines (empty content) are transparent: they are included in
1018        // the result and do not halt the scan.
1019        // "  a" (indent 2 > 1) -> included
1020        // ""    (blank)         -> transparent, included
1021        // "  b" (indent 2 > 1) -> included
1022        // "c"   (indent 0 <= 1) -> stop
1023        let mut buf = LineBuffer::new("  a\n\n  b\nc");
1024        let lines = buf.peek_until_dedent(1);
1025        assert_eq!(lines.len(), 3);
1026        assert_eq!(lines.first().map(|l| l.content), Some("  a"));
1027        assert_eq!(lines.get(1).map(|l| l.content), Some(""));
1028        assert_eq!(lines.get(2).map(|l| l.content), Some("  b"));
1029    }
1030}