use std::collections::VecDeque;
use crate::pos::Pos;
/// The kind of terminator that ended a scanned line.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BreakType {
    /// A single line feed (`"\n"`).
    Lf,
    /// A carriage return (`"\r"`) that is not followed by a line feed.
    Cr,
    /// A carriage return immediately followed by a line feed (`"\r\n"`).
    CrLf,
    /// No terminator: the line ran up to the end of the input.
    Eof,
}
impl BreakType {
    /// Returns how many bytes this terminator occupies in the input.
    ///
    /// `Eof` is not backed by any bytes and therefore reports `0`.
    #[must_use]
    pub const fn byte_len(self) -> usize {
        match self {
            Self::Eof => 0,
            Self::Lf | Self::Cr => 1,
            Self::CrLf => 2,
        }
    }

    /// Returns `pos` moved past this terminator.
    ///
    /// * `Lf` defers entirely to `Pos::advance('\n')`.
    /// * `CrLf` steps over the `'\r'` byte and then defers to
    ///   `Pos::advance('\n')`.
    /// * `Cr` steps over the `'\r'` byte, bumps the line number and resets
    ///   the column to `0`.
    /// * `Eof` leaves `pos` unchanged.
    #[must_use]
    pub const fn advance(self, mut pos: Pos) -> Pos {
        // Both CR-bearing terminators begin by stepping over the '\r' byte;
        // only the byte offset moves at this point.
        if matches!(self, Self::Cr | Self::CrLf) {
            pos.byte_offset += '\r'.len_utf8();
        }
        match self {
            Self::Eof => pos,
            // For CrLf the '\r' was already accounted for above.
            Self::Lf | Self::CrLf => pos.advance('\n'),
            Self::Cr => {
                pos.line += 1;
                pos.column = 0;
                pos
            }
        }
    }
}
/// One line of input as produced by `scan_line`, borrowing its text from the
/// original input string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Line<'input> {
    /// The line's text, excluding the terminator. For the first line of the
    /// input a leading U+FEFF byte-order mark is excluded as well.
    pub content: &'input str,
    /// Byte offset at which `content` starts; `scan_line` sets this to
    /// `pos.byte_offset`.
    pub offset: usize,
    /// Number of leading `' '` characters in `content`. Tabs are not counted.
    pub indent: usize,
    /// The terminator that ended this line.
    pub break_type: BreakType,
    /// Position of the start of `content`.
    pub pos: Pos,
}
/// Classifies the line terminator at the very start of `s`.
///
/// Returns the terminator kind together with the remainder of `s` after it.
/// `"\r\n"` is recognised as a single `CrLf` before a lone `'\r'` or `'\n'`
/// is considered. When `s` does not start with a terminator the result is
/// `(BreakType::Eof, s)` with `s` untouched.
fn detect_break(s: &str) -> (BreakType, &str) {
    let (kind, width) = match s.as_bytes() {
        [b'\r', b'\n', ..] => (BreakType::CrLf, 2),
        [b'\r', ..] => (BreakType::Cr, 1),
        [b'\n', ..] => (BreakType::Lf, 1),
        _ => (BreakType::Eof, 0),
    };
    // The skipped bytes are ASCII, so `width` always lands on a char boundary.
    (kind, &s[width..])
}
/// Scans a single line from the front of `remaining`.
///
/// `pos` is the position of the first byte of `remaining`. When `is_first`
/// is set and `remaining` begins with a U+FEFF byte-order mark, the mark is
/// skipped: it is excluded from the line's content and the line's byte offset
/// is moved past it (line and column are left as given).
///
/// Returns `None` when `remaining` is empty; otherwise returns the scanned
/// line and the input that follows its terminator.
fn scan_line(remaining: &str, pos: Pos, is_first: bool) -> Option<(Line<'_>, &str)> {
    if remaining.is_empty() {
        return None;
    }

    // Only the very first line of the input may have a BOM stripped.
    let (body, pos) = match remaining.strip_prefix('\u{FEFF}') {
        Some(after_bom) if is_first => {
            let start = Pos {
                byte_offset: pos.byte_offset + '\u{FEFF}'.len_utf8(),
                ..pos
            };
            (after_bom, start)
        }
        _ => (remaining, pos),
    };

    // The content runs up to the first '\n' or '\r', or to the end of input.
    let content_len = body.find(['\n', '\r']).unwrap_or(body.len());
    let (content, tail) = body.split_at(content_len);
    let (break_type, rest) = detect_break(tail);

    // A space is a single ASCII byte, so counting bytes equals counting chars.
    let indent = content.bytes().take_while(|&byte| byte == b' ').count();

    let line = Line {
        content,
        offset: pos.byte_offset,
        indent,
        break_type,
        pos,
    };
    Some((line, rest))
}
/// A buffer that hands out the lines of an input string one at a time, with
/// the upcoming line always scanned in advance and support for lines pushed
/// back in front of it.
pub struct LineBuffer<'input> {
    /// Input that has not been scanned yet, i.e. everything after `next`.
    remaining: &'input str,
    /// Lines added through `prepend_line`; they are served before `next`.
    prepend: VecDeque<Line<'input>>,
    /// The already-scanned upcoming line from the input, or `None` once the
    /// input is exhausted.
    next: Option<Line<'input>>,
    /// Position of the first byte of `remaining`.
    remaining_pos: Pos,
    /// `true` until the first line has been scanned; passed to `scan_line`
    /// as its `is_first` flag.
    remaining_is_first: bool,
    /// Storage behind the slice returned by `peek_until_dedent`.
    lookahead: Vec<Line<'input>>,
}
impl<'input> LineBuffer<'input> {
    /// Creates a buffer over `input` and immediately scans its first line so
    /// that the peek methods have something to return.
    #[must_use]
    pub fn new(input: &'input str) -> Self {
        let mut buffer = Self {
            remaining: input,
            remaining_pos: Pos::ORIGIN,
            remaining_is_first: true,
            next: None,
            prepend: VecDeque::new(),
            lookahead: Vec::new(),
        };
        buffer.prime();
        buffer
    }

    /// Pushes `line` to the front of the buffer so that it becomes the next
    /// line returned. Any stored lookahead is discarded.
    pub fn prepend_line(&mut self, line: Line<'input>) {
        self.prepend.push_front(line);
        self.lookahead.clear();
    }

    /// Returns the upcoming line without consuming it: the front of the
    /// prepend queue if there is one, otherwise the primed input line.
    #[must_use]
    pub fn peek_next(&self) -> Option<&Line<'input>> {
        let queued = self.prepend.front();
        if queued.is_some() {
            queued
        } else {
            self.next.as_ref()
        }
    }

    /// Returns `true` when the upcoming line comes from the prepend queue
    /// rather than from the input.
    #[must_use]
    pub fn is_next_synthetic(&self) -> bool {
        self.prepend.front().is_some()
    }

    /// Returns the indent of the upcoming line, or `None` at end of input.
    #[must_use]
    pub fn peek_next_indent(&self) -> Option<usize> {
        Some(self.peek_next()?.indent)
    }

    /// Returns a copy of the line that follows the upcoming one, without
    /// consuming anything.
    ///
    /// Prepended lines are taken into account: with two or more queued lines
    /// the second queued line is returned, with exactly one the primed input
    /// line is returned, and with none the line after the primed one is
    /// scanned from the remaining input.
    #[must_use]
    pub fn peek_second(&self) -> Option<Line<'input>> {
        match self.prepend.len() {
            0 => {
                // Without a primed line there cannot be a line after it.
                self.next.as_ref()?;
                let (following, _) =
                    scan_line(self.remaining, self.remaining_pos, self.remaining_is_first)?;
                Some(following)
            }
            1 => self.next.clone(),
            _ => self.prepend.get(1).cloned(),
        }
    }

    /// Removes and returns the upcoming line, or `None` at end of input.
    ///
    /// Prepended lines are drained first. When the primed input line is
    /// taken, the stored lookahead is discarded and the following line is
    /// scanned in its place.
    pub fn consume_next(&mut self) -> Option<Line<'input>> {
        let synthetic = self.prepend.pop_front();
        if synthetic.is_some() {
            return synthetic;
        }
        self.lookahead.clear();
        let current = self.next.take()?;
        self.prime();
        Some(current)
    }

    /// Returns `true` when neither a prepended nor a primed line is left.
    #[must_use]
    pub fn at_eof(&self) -> bool {
        self.peek_next().is_none()
    }

    /// Collects, without consuming, the run of lines starting at the primed
    /// input line whose indent is greater than `base_indent`.
    ///
    /// Lines with empty content are always included and never end the run.
    /// The run ends at the first non-empty line with `indent <= base_indent`
    /// (that line is not included) or at end of input. Passing `usize::MAX`
    /// disables the indent check, so every remaining line is collected.
    ///
    /// The scan starts from the primed input line; lines in the prepend
    /// queue are not examined.
    pub fn peek_until_dedent(&mut self, base_indent: usize) -> &[Line<'input>] {
        self.lookahead.clear();
        let Some(first) = self.next.clone() else {
            return &self.lookahead;
        };

        let unlimited = base_indent == usize::MAX;
        // Private scan cursor, so the buffer's own state stays untouched.
        let mut rest = self.remaining;
        let mut rest_pos = self.remaining_pos;
        let mut rest_is_first = self.remaining_is_first;

        let mut line = first;
        loop {
            let is_blank = line.content.is_empty();
            if !is_blank && !unlimited && line.indent <= base_indent {
                break;
            }
            self.lookahead.push(line);

            let Some((scanned, tail)) = scan_line(rest, rest_pos, rest_is_first) else {
                break;
            };
            rest_pos = pos_after_line(&scanned);
            rest = tail;
            rest_is_first = false;
            line = scanned;
        }
        &self.lookahead
    }

    /// Scans the next line from the remaining input into `next` and moves
    /// the remaining-input cursor past it; sets `next` to `None` when the
    /// input is exhausted.
    fn prime(&mut self) {
        let Some((line, rest)) =
            scan_line(self.remaining, self.remaining_pos, self.remaining_is_first)
        else {
            self.next = None;
            return;
        };
        self.remaining_pos = pos_after_line(&line);
        self.remaining = rest;
        self.remaining_is_first = false;
        self.next = Some(line);
    }
}
/// Returns the position immediately after `line`, terminator included.
///
/// The byte offset is the line's offset plus the length of its content and
/// of its terminator. For a line that ended at end of input the line number
/// is unchanged and the column is advanced by the width of the content as
/// reported by `crate::pos::column_at`; for every real terminator the line
/// number is incremented and the column reset to `0`.
pub fn pos_after_line(line: &Line<'_>) -> Pos {
    let (next_line, next_column) = match line.break_type {
        BreakType::Eof => {
            let width = crate::pos::column_at(line.content, line.content.len());
            (line.pos.line, line.pos.column + width)
        }
        BreakType::Lf | BreakType::Cr | BreakType::CrLf => (line.pos.line + 1, 0),
    };
    Pos {
        byte_offset: line.offset + line.content.len() + line.break_type.byte_len(),
        line: next_line,
        column: next_column,
    }
}
/// Unit tests for line scanning, position tracking and the `LineBuffer` API.
///
/// Indentation-sensitive fixtures spell out their leading spaces so that each
/// input matches the indent its assertions expect.
#[cfg(test)]
mod tests {
    use rstest::rstest;

    use super::*;

    // ----- BreakType::advance -----

    #[rstest]
    #[case::break_type_advance_lf(BreakType::Lf, Pos::ORIGIN, 1, 2, 0)]
    #[case::break_type_advance_crlf(BreakType::CrLf, Pos::ORIGIN, 2, 2, 0)]
    #[case::break_type_advance_lf_at_non_origin_pos(BreakType::Lf, Pos { byte_offset: 5, line: 2, column: 3 }, 6, 3, 0)]
    #[case::break_type_advance_crlf_at_non_origin_pos(BreakType::CrLf, Pos { byte_offset: 5, line: 2, column: 3 }, 7, 3, 0)]
    #[case::break_type_advance_cr_resets_column(BreakType::Cr, Pos { byte_offset: 3, line: 1, column: 3 }, 4, 2, 0)]
    fn break_type_advance_all_fields(
        #[case] break_type: BreakType,
        #[case] input: Pos,
        #[case] expected_byte_offset: usize,
        #[case] expected_line: usize,
        #[case] expected_column: usize,
    ) {
        let after = break_type.advance(input);
        assert_eq!(after.byte_offset, expected_byte_offset);
        assert_eq!(after.line, expected_line);
        assert_eq!(after.column, expected_column);
    }

    #[test]
    fn break_type_advance_cr_increments_line() {
        let pos = Pos::ORIGIN;
        let after = BreakType::Cr.advance(pos);
        assert_eq!(after.line, 2);
    }

    #[test]
    fn break_type_advance_eof_is_noop() {
        let pos = Pos {
            byte_offset: 5,
            line: 3,
            column: 2,
        };
        let after = BreakType::Eof.advance(pos);
        assert_eq!(after, pos);
    }

    // ----- LineBuffer::new / peek_next / consume_next / at_eof -----

    #[rstest]
    #[case::new_single_line_with_lf_primes_first_line("foo\n", "foo", BreakType::Lf)]
    #[case::new_input_with_only_lf_primes_empty_line("\n", "", BreakType::Lf)]
    fn new_single_line_peek(
        #[case] input: &str,
        #[case] expected_content: &str,
        #[case] expected_break: BreakType,
    ) {
        let buf = LineBuffer::new(input);
        let Some(line) = buf.peek_next() else {
            unreachable!("expected a line");
        };
        assert_eq!(line.content, expected_content);
        assert_eq!(line.break_type, expected_break);
    }

    #[test]
    fn new_empty_input_at_eof_immediately() {
        let buf = LineBuffer::new("");
        assert!(buf.peek_next().is_none());
        assert!(buf.at_eof());
    }

    #[test]
    fn new_single_line_no_newline_primes_eof_line() {
        let buf = LineBuffer::new("foo");
        let Some(line) = buf.peek_next() else {
            unreachable!("expected a line");
        };
        assert_eq!(line.content, "foo");
        assert_eq!(line.break_type, BreakType::Eof);
        assert_eq!(line.offset, 0);
    }

    #[test]
    fn consume_returns_primed_line_and_advances() {
        let mut buf = LineBuffer::new("a\nb\n");
        let Some(first) = buf.consume_next() else {
            unreachable!("expected first line");
        };
        assert_eq!(first.content, "a");
        assert_eq!(first.break_type, BreakType::Lf);
        let Some(second) = buf.consume_next() else {
            unreachable!("expected second line");
        };
        assert_eq!(second.content, "b");
        assert_eq!(second.break_type, BreakType::Lf);
    }

    #[test]
    fn consume_after_last_line_returns_none() {
        let mut buf = LineBuffer::new("foo");
        assert!(buf.consume_next().is_some());
        assert!(buf.consume_next().is_none());
    }

    #[test]
    fn at_eof_false_before_consuming_last_and_true_after() {
        let mut buf = LineBuffer::new("foo");
        assert!(!buf.at_eof());
        buf.consume_next();
        assert!(buf.at_eof());
    }

    #[test]
    fn consume_all_lines_then_peek_returns_none() {
        let mut buf = LineBuffer::new("a\nb");
        buf.consume_next();
        buf.consume_next();
        assert!(buf.peek_next().is_none());
    }

    // ----- line terminators -----

    #[rstest]
    #[case::only_lf_produces_one_empty_line("\n", BreakType::Lf)]
    #[case::only_cr_produces_one_empty_line("\r", BreakType::Cr)]
    #[case::only_crlf_produces_one_empty_line_not_two("\r\n", BreakType::CrLf)]
    fn single_terminator_produces_empty_line(
        #[case] input: &str,
        #[case] expected_break: BreakType,
    ) {
        let mut buf = LineBuffer::new(input);
        let Some(line) = buf.consume_next() else {
            unreachable!("expected a line");
        };
        assert_eq!(line.content, "");
        assert_eq!(line.break_type, expected_break);
        assert!(buf.consume_next().is_none());
    }

    #[test]
    fn lf_terminator_produces_lf_break_type() {
        let mut buf = LineBuffer::new("a\n");
        let Some(line) = buf.consume_next() else {
            unreachable!("expected a line");
        };
        assert_eq!(line.break_type, BreakType::Lf);
    }

    #[test]
    fn crlf_terminator_produces_crlf_break_type_not_two_lines() {
        let mut buf = LineBuffer::new("a\r\nb");
        let Some(first) = buf.consume_next() else {
            unreachable!("expected first");
        };
        assert_eq!(first.content, "a");
        assert_eq!(first.break_type, BreakType::CrLf);
        let Some(second) = buf.consume_next() else {
            unreachable!("expected second");
        };
        assert_eq!(second.content, "b");
        assert_eq!(second.break_type, BreakType::Eof);
        assert!(buf.consume_next().is_none());
    }

    #[test]
    fn bare_cr_terminator_produces_cr_break_type() {
        let mut buf = LineBuffer::new("a\rb");
        let Some(first) = buf.consume_next() else {
            unreachable!("expected first");
        };
        assert_eq!(first.content, "a");
        assert_eq!(first.break_type, BreakType::Cr);
        let Some(second) = buf.consume_next() else {
            unreachable!("expected second");
        };
        assert_eq!(second.content, "b");
        assert_eq!(second.break_type, BreakType::Eof);
    }

    #[test]
    fn no_terminator_on_last_line_produces_eof_break_type() {
        let mut buf = LineBuffer::new("a\nb");
        buf.consume_next();
        let Some(second) = buf.consume_next() else {
            unreachable!("expected second");
        };
        assert_eq!(second.content, "b");
        assert_eq!(second.break_type, BreakType::Eof);
    }

    #[test]
    fn mixed_line_endings_each_line_has_correct_break_type() {
        let mut buf = LineBuffer::new("a\nb\r\nc\rd");
        let types: Vec<BreakType> = (0..4)
            .filter_map(|_| buf.consume_next().map(|l| l.break_type))
            .collect();
        assert_eq!(
            types,
            [
                BreakType::Lf,
                BreakType::CrLf,
                BreakType::Cr,
                BreakType::Eof
            ]
        );
    }

    #[test]
    fn two_consecutive_lf_produce_two_empty_lines() {
        let mut buf = LineBuffer::new("\n\n");
        let Some(first) = buf.consume_next() else {
            unreachable!("expected first");
        };
        assert_eq!(first.content, "");
        assert_eq!(first.break_type, BreakType::Lf);
        let Some(second) = buf.consume_next() else {
            unreachable!("expected second");
        };
        assert_eq!(second.content, "");
        assert_eq!(second.break_type, BreakType::Lf);
        assert!(buf.consume_next().is_none());
    }

    #[test]
    fn trailing_lf_does_not_produce_extra_empty_line() {
        let mut buf = LineBuffer::new("foo\n");
        let Some(line) = buf.consume_next() else {
            unreachable!("expected a line");
        };
        assert_eq!(line.content, "foo");
        assert!(buf.consume_next().is_none());
    }

    // ----- offsets and positions -----

    #[rstest]
    #[case::pos_line_increments_after_bare_cr("a\rb")]
    #[case::pos_line_increments_after_crlf("a\r\nb")]
    fn pos_line_increments_after_terminator(#[case] input: &str) {
        let mut buf = LineBuffer::new(input);
        let Some(first) = buf.consume_next() else {
            unreachable!("expected first");
        };
        assert_eq!(first.pos.line, 1);
        let Some(second) = buf.consume_next() else {
            unreachable!("expected second");
        };
        assert_eq!(second.pos.line, 2);
        assert_eq!(second.pos.column, 0);
    }

    #[test]
    fn offset_is_byte_offset_of_content_start() {
        let mut buf = LineBuffer::new("foo\nbar\n");
        let Some(first) = buf.consume_next() else {
            unreachable!("expected first");
        };
        assert_eq!(first.offset, 0);
        let Some(second) = buf.consume_next() else {
            unreachable!("expected second");
        };
        // "foo" (3 bytes) plus the '\n' terminator (1 byte).
        assert_eq!(second.offset, 4);
    }

    #[test]
    fn offset_and_pos_byte_offset_agree() {
        let mut buf = LineBuffer::new("foo\nbar");
        while let Some(line) = buf.consume_next() {
            assert_eq!(line.offset, line.pos.byte_offset);
        }
    }

    #[test]
    fn pos_line_number_increments_per_line() {
        let mut buf = LineBuffer::new("a\nb\nc");
        let lines: Vec<Line<'_>> = (0..3).filter_map(|_| buf.consume_next()).collect();
        assert_eq!(lines.len(), 3, "expected 3 lines");
        assert_eq!(lines.first().map(|l| l.pos.line), Some(1));
        assert_eq!(lines.get(1).map(|l| l.pos.line), Some(2));
        assert_eq!(lines.get(2).map(|l| l.pos.line), Some(3));
    }

    #[test]
    fn pos_column_is_zero_at_start_of_each_line() {
        let mut buf = LineBuffer::new("a\nb");
        while let Some(line) = buf.consume_next() {
            assert_eq!(line.pos.column, 0);
        }
    }

    #[test]
    fn pos_column_resets_after_bare_cr() {
        let mut buf = LineBuffer::new("abc\rd");
        // Discard the first line; only the second line's position matters.
        buf.consume_next();
        let Some(second) = buf.consume_next() else {
            unreachable!("expected second");
        };
        assert_eq!(second.pos.column, 0);
    }

    #[test]
    fn pos_after_mixed_endings_tracks_lines_correctly() {
        let mut buf = LineBuffer::new("a\nb\r\nc\rd");
        let lines: Vec<Line<'_>> = (0..4).filter_map(|_| buf.consume_next()).collect();
        assert_eq!(lines.len(), 4, "expected 4 lines");
        let line_nums: Vec<usize> = lines.iter().map(|l| l.pos.line).collect();
        assert_eq!(line_nums, [1, 2, 3, 4]);
        for line in &lines {
            assert_eq!(
                line.pos.column, 0,
                "line {} should start at column 0",
                line.pos.line
            );
        }
    }

    #[test]
    fn multibyte_content_byte_offset_is_byte_based_not_char_based() {
        let mut buf = LineBuffer::new("中\nfoo");
        let Some(first) = buf.consume_next() else {
            unreachable!("expected first");
        };
        assert_eq!(first.offset, 0);
        assert_eq!(first.content, "中");
        let Some(second) = buf.consume_next() else {
            unreachable!("expected second");
        };
        // "中" is 3 bytes in UTF-8, plus 1 byte for '\n'.
        assert_eq!(second.offset, 4);
    }

    // ----- byte-order mark handling -----

    #[test]
    fn bom_is_stripped_from_content_of_first_line() {
        let input = "\u{FEFF}foo\n";
        let buf = LineBuffer::new(input);
        let Some(line) = buf.peek_next() else {
            unreachable!("expected a line");
        };
        assert_eq!(line.content, "foo");
    }

    #[test]
    fn bom_stripped_line_offset_starts_after_bom_bytes() {
        let input = "\u{FEFF}foo\n";
        let buf = LineBuffer::new(input);
        let Some(line) = buf.peek_next() else {
            unreachable!("expected a line");
        };
        // U+FEFF is 3 bytes in UTF-8.
        assert_eq!(line.offset, 3);
        assert_eq!(line.pos.byte_offset, 3);
    }

    #[test]
    fn bom_only_stripped_from_first_line() {
        let input = "foo\n\u{FEFF}bar\n";
        let mut buf = LineBuffer::new(input);
        // Discard the first line; the BOM under test is on the second.
        buf.consume_next();
        let Some(second) = buf.consume_next() else {
            unreachable!("expected second");
        };
        assert_eq!(second.content, "\u{FEFF}bar");
    }

    // ----- indent -----

    #[rstest]
    #[case::indent_counts_only_leading_spaces("   foo", 3)]
    #[case::indent_is_zero_for_no_leading_spaces("foo", 0)]
    #[case::leading_tab_does_not_count_toward_indent("\tfoo", 0)]
    #[case::tab_after_spaces_does_not_count("  \tfoo", 2)]
    #[case::indent_of_blank_line_is_zero("\n", 0)]
    fn indent_value(#[case] input: &str, #[case] expected: usize) {
        let buf = LineBuffer::new(input);
        let Some(line) = buf.peek_next() else {
            unreachable!("expected a line");
        };
        assert_eq!(line.indent, expected);
    }

    #[test]
    fn indent_of_spaces_only_line_equals_space_count() {
        let buf = LineBuffer::new("   \n");
        let Some(line) = buf.peek_next() else {
            unreachable!("expected a line");
        };
        assert_eq!(line.indent, 3);
        assert_eq!(line.content, "   ");
    }

    #[rstest]
    #[case::peek_next_indent_returns_indent_of_next_line("   foo", Some(3))]
    #[case::peek_next_indent_returns_none_at_eof("", None)]
    fn peek_next_indent_returns(#[case] input: &str, #[case] expected: Option<usize>) {
        let buf = LineBuffer::new(input);
        assert_eq!(buf.peek_next_indent(), expected);
    }

    #[test]
    fn peek_next_indent_does_not_consume() {
        let mut buf = LineBuffer::new("  foo");
        assert_eq!(buf.peek_next_indent(), Some(2));
        assert_eq!(buf.peek_next_indent(), Some(2));
        let Some(line) = buf.consume_next() else {
            unreachable!("expected a line");
        };
        assert_eq!(line.content, "  foo");
    }

    // ----- peek_until_dedent -----

    #[test]
    fn peek_until_dedent_empty_input_returns_empty_slice() {
        let mut buf = LineBuffer::new("");
        assert!(buf.peek_until_dedent(0).is_empty());
    }

    #[test]
    fn peek_until_dedent_returns_lines_until_indent_le_base() {
        let mut buf = LineBuffer::new("  a\n  b\nc\n");
        let lines = buf.peek_until_dedent(1);
        assert_eq!(lines.len(), 2);
        assert_eq!(lines.first().map(|l| l.content), Some("  a"));
        assert_eq!(lines.get(1).map(|l| l.content), Some("  b"));
    }

    #[test]
    fn peek_until_dedent_does_not_consume_lines() {
        let mut buf = LineBuffer::new("  a\n  b\nc\n");
        let _ = buf.peek_until_dedent(1);
        let Some(first) = buf.consume_next() else {
            unreachable!("expected first");
        };
        assert_eq!(first.content, "  a");
    }

    #[test]
    fn peek_until_dedent_includes_all_lines_when_no_dedent_occurs() {
        let mut buf = LineBuffer::new("  a\n  b\n  c");
        let lines = buf.peek_until_dedent(1);
        assert_eq!(lines.len(), 3);
    }

    #[test]
    fn peek_until_dedent_returns_empty_slice_when_first_line_already_dedented() {
        let mut buf = LineBuffer::new("a\n b\n");
        let lines = buf.peek_until_dedent(1);
        assert!(lines.is_empty());
    }

    #[test]
    fn peek_until_dedent_second_call_returns_same_slice() {
        let mut buf = LineBuffer::new("  a\n  b\nc");
        let first_call: Vec<String> = buf
            .peek_until_dedent(1)
            .iter()
            .map(|l| l.content.to_owned())
            .collect();
        let second_call: Vec<String> = buf
            .peek_until_dedent(1)
            .iter()
            .map(|l| l.content.to_owned())
            .collect();
        assert_eq!(first_call, second_call);
        assert_eq!(first_call, ["  a", "  b"]);
    }

    #[test]
    fn peek_until_dedent_base_zero_stops_at_non_indented_lines() {
        let mut buf = LineBuffer::new(" a\n b\n");
        let lines = buf.peek_until_dedent(0);
        assert_eq!(lines.len(), 2);
    }

    #[test]
    fn peek_until_dedent_blank_lines_are_transparent() {
        let mut buf = LineBuffer::new("  a\n\n  b\nc");
        let lines = buf.peek_until_dedent(1);
        assert_eq!(lines.len(), 3);
        assert_eq!(lines.first().map(|l| l.content), Some("  a"));
        assert_eq!(lines.get(1).map(|l| l.content), Some(""));
        assert_eq!(lines.get(2).map(|l| l.content), Some("  b"));
    }

    // ----- pos_after_line -----

    #[rstest]
    #[case::pos_after_line_lf_ascii(Line { content: "hello", offset: 0, indent: 0, break_type: BreakType::Lf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 6, 2, 0)]
    #[case::pos_after_line_lf_empty_content(Line { content: "", offset: 10, indent: 0, break_type: BreakType::Lf, pos: Pos { byte_offset: 10, line: 3, column: 0 } }, 11, 4, 0)]
    #[case::pos_after_line_lf_multibyte(Line { content: "日本", offset: 0, indent: 0, break_type: BreakType::Lf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 7, 2, 0)]
    #[case::pos_after_line_cr_ascii(Line { content: "abc", offset: 0, indent: 0, break_type: BreakType::Cr, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 4, 2, 0)]
    #[case::pos_after_line_cr_empty_content(Line { content: "", offset: 5, indent: 0, break_type: BreakType::Cr, pos: Pos { byte_offset: 5, line: 2, column: 0 } }, 6, 3, 0)]
    #[case::pos_after_line_crlf_ascii(Line { content: "key: val", offset: 0, indent: 0, break_type: BreakType::CrLf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 10, 2, 0)]
    #[case::pos_after_line_crlf_empty_content(Line { content: "", offset: 0, indent: 0, break_type: BreakType::CrLf, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 2, 2, 0)]
    #[case::pos_after_line_eof_empty_content(Line { content: "", offset: 20, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 20, line: 5, column: 0 } }, 20, 5, 0)]
    #[case::pos_after_line_eof_ascii(Line { content: "last", offset: 10, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 10, line: 3, column: 0 } }, 14, 3, 4)]
    #[case::pos_after_line_eof_ascii_nonzero_start_column(Line { content: "end", offset: 7, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 7, line: 2, column: 5 } }, 10, 2, 8)]
    #[case::pos_after_line_eof_multibyte(Line { content: "日本語", offset: 0, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 9, 1, 3)]
    #[case::pos_after_line_eof_mixed_content(Line { content: "ab日", offset: 0, indent: 0, break_type: BreakType::Eof, pos: Pos { byte_offset: 0, line: 1, column: 0 } }, 5, 1, 3)]
    fn pos_after_line_cases(
        #[case] line: Line<'static>,
        #[case] expected_byte_offset: usize,
        #[case] expected_line: usize,
        #[case] expected_column: usize,
    ) {
        let result = pos_after_line(&line);
        assert_eq!(result.byte_offset, expected_byte_offset);
        assert_eq!(result.line, expected_line);
        assert_eq!(result.column, expected_column);
    }
}