/// A cursor location inside a source document.
///
/// `byte_offset` accumulates UTF-8 byte lengths, `line` starts at 1 (see
/// [`Pos::ORIGIN`]), and `column` counts `char`s consumed since the last `'\n'`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Pos {
    /// Sum of the UTF-8 byte lengths of every character consumed so far.
    pub byte_offset: usize,
    /// Line number; incremented each time a `'\n'` is consumed.
    pub line: usize,
    /// Characters (not bytes) consumed on the current line.
    pub column: usize,
}

impl Pos {
    /// The position before any input has been consumed: byte 0, line 1, column 0.
    pub const ORIGIN: Self = Self {
        byte_offset: 0,
        line: 1,
        column: 0,
    };

    /// Returns the position reached after consuming `ch`.
    ///
    /// The byte offset always grows by `ch.len_utf8()`. Only `'\n'` starts a
    /// new line (line + 1, column reset to 0); every other character, `'\r'`
    /// included, moves one column to the right.
    #[must_use]
    pub const fn advance(self, ch: char) -> Self {
        let byte_offset = self.byte_offset + ch.len_utf8();
        let (line, column) = match ch {
            '\n' => (self.line + 1, 0),
            _ => (self.line, self.column + 1),
        };
        Self {
            byte_offset,
            line,
            column,
        }
    }
}
/// Returns the column, counted in `char`s, of `byte_offset_in_line` within
/// `line_content`.
///
/// The column is the number of characters in
/// `line_content[..byte_offset_in_line]`. When that prefix is pure ASCII the
/// byte length already equals the character count, so no decoding is done.
///
/// # Panics
///
/// Panics if `byte_offset_in_line` is past the end of `line_content` or does
/// not fall on a UTF-8 character boundary.
pub fn column_at(line_content: &str, byte_offset_in_line: usize) -> usize {
    let head = &line_content[..byte_offset_in_line];
    if !head.is_ascii() {
        // Multi-byte characters present: bytes and columns diverge.
        return head.chars().count();
    }
    head.len()
}
/// Moves `pos` forward across all of `content` in a single step.
///
/// The byte offset grows by `content.len()` and the column by the number of
/// characters in `content`. The line number is carried over unchanged, so
/// `content` is expected not to contain a line break; this is not checked.
pub fn advance_within_line(pos: Pos, content: &str) -> Pos {
    let consumed_bytes = content.len();
    let consumed_columns = column_at(content, consumed_bytes);
    Pos {
        byte_offset: pos.byte_offset + consumed_bytes,
        column: pos.column + consumed_columns,
        ..pos
    }
}
/// Lookup table that turns byte offsets into `(line, column)` pairs.
///
/// Built once from the source text; each lookup is a binary search over the
/// recorded line terminators.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LineIndex {
    /// Byte offset of every line terminator, in ascending order. A `"\r\n"`
    /// pair is recorded once, at the offset of its `'\r'`.
    newlines: Vec<u32>,
    /// Owned copy of the indexed text, used to count characters for columns.
    source: String,
}

impl LineIndex {
    /// Scans `source` and records where each line terminator starts.
    ///
    /// `"\n"`, a bare `"\r"`, and `"\r\n"` each count as one terminator.
    /// Offsets are stored as `u32`, so inputs are assumed to be below 4 GB.
    #[must_use]
    pub fn new(source: &str) -> Self {
        let mut newlines = Vec::new();
        // `\r` and `\n` are ASCII and never occur inside a multi-byte UTF-8
        // sequence, so a byte scan finds exactly the same offsets as a char scan.
        let mut previous_was_cr = false;
        for (at, byte) in source.bytes().enumerate() {
            let completes_crlf = previous_was_cr && byte == b'\n';
            previous_was_cr = byte == b'\r';
            // The LF of a CRLF pair belongs to the terminator already recorded.
            if completes_crlf || !matches!(byte, b'\r' | b'\n') {
                continue;
            }
            #[expect(
                clippy::cast_possible_truncation,
                reason = "YAML files <= 4 GB; u32 offset is sufficient"
            )]
            newlines.push(at as u32);
        }
        Self {
            newlines,
            source: source.to_owned(),
        }
    }

    /// Returns the `(line, column)` of the byte at `offset`.
    ///
    /// The line is 1-based; the column is 0-based and counted in `char`s from
    /// the start of the line. An offset that points at a terminator itself is
    /// reported on the line that terminator ends.
    ///
    /// If the text between the line start and `offset` cannot be sliced
    /// (offset past the end of the source, not on a character boundary, or
    /// pointing at the `'\n'` of a `"\r\n"` pair), the column is reported as 0.
    #[must_use]
    pub fn line_column(&self, offset: u32) -> (u32, u32) {
        // Number of terminators that start strictly before `offset`.
        let terminators_before = self.newlines.partition_point(|&nl| nl < offset);
        #[expect(
            clippy::cast_possible_truncation,
            reason = "line count fits u32 for any realistic document"
        )]
        let line = (terminators_before as u32) + 1;

        // The line begins right after the closest preceding terminator, which
        // is two bytes wide for "\r\n" and one byte wide otherwise.
        let line_start = terminators_before
            .checked_sub(1)
            .and_then(|last| self.newlines.get(last))
            .map_or(0, |&terminator| {
                let at = terminator as usize;
                let is_crlf = self
                    .source
                    .get(at..)
                    .is_some_and(|rest| rest.starts_with("\r\n"));
                if is_crlf {
                    at + 2
                } else {
                    at + 1
                }
            });

        let segment = self
            .source
            .get(line_start..offset as usize)
            .unwrap_or_default();
        // For pure ASCII the byte length is the character count.
        let width = if segment.is_ascii() {
            segment.len()
        } else {
            segment.chars().count()
        };
        #[expect(
            clippy::cast_possible_truncation,
            reason = "column fits u32 for any realistic line"
        )]
        let column = width as u32;
        (line, column)
    }
}
/// A pair of byte offsets into a source document, stored as `u32` so the
/// whole value occupies eight bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Byte offset where the span begins.
    pub start: u32,
    /// Byte offset where the span ends.
    pub end: u32,
}

impl Span {
    /// Builds a span from the byte offsets of two positions.
    ///
    /// Only `byte_offset` is read from each [`Pos`]; it is narrowed to `u32`
    /// with an `as` cast, so offsets above `u32::MAX` would wrap.
    #[must_use]
    #[expect(
        clippy::cast_possible_truncation,
        reason = "YAML files <= 4 GB; u32 offset is sufficient"
    )]
    pub(crate) const fn from_pos(start: Pos, end: Pos) -> Self {
        let start = start.byte_offset as u32;
        let end = end.byte_offset as u32;
        Self { start, end }
    }

    /// Resolves the span's start offset to `(line, column)` using `index`.
    #[must_use]
    pub fn start_line_column(&self, index: &LineIndex) -> (u32, u32) {
        let Self { start, .. } = *self;
        index.line_column(start)
    }

    /// Resolves the span's end offset to `(line, column)` using `index`.
    #[must_use]
    pub fn end_line_column(&self, index: &LineIndex) -> (u32, u32) {
        let Self { end, .. } = *self;
        index.line_column(end)
    }
}
// Unit and property tests for `Pos`, `column_at`, `advance_within_line`,
// `LineIndex`, and `Span`.
#[cfg(test)]
mod tests {
use proptest::prelude::*;
use rstest::rstest;
use super::*;
// `Pos::ORIGIN` is byte 0, line 1, column 0.
#[test]
fn pos_origin_is_start_of_document() {
let pos = Pos::ORIGIN;
assert_eq!(pos.byte_offset, 0);
assert_eq!(pos.line, 1);
assert_eq!(pos.column, 0);
}
// All three fields are public and can be set and read directly.
#[test]
fn pos_fields_are_accessible() {
let pos = Pos {
byte_offset: 10,
line: 3,
column: 4,
};
assert_eq!(pos.byte_offset, 10);
assert_eq!(pos.line, 3);
assert_eq!(pos.column, 4);
}
// Compile-time check: `pos` is still usable after being assigned to `pos2`,
// which only works because `Pos` is `Copy`.
#[test]
fn pos_is_copy() {
let pos = Pos::ORIGIN;
let pos2 = pos;
let _ = pos.byte_offset;
let _ = pos2.byte_offset;
}
// One `advance` step from `Pos::ORIGIN`.
// Case columns: char, expected byte offset, expected line, expected column.
// The CJK case shows a 3-byte char moving the column by only 1.
#[rstest]
#[case::ascii_char('a', 1, 1, 1)]
#[case::newline('\n', 1, 2, 0)]
#[case::multibyte_cjk('中', 3, 1, 1)]
fn advance_basic(
#[case] ch: char,
#[case] expected_byte_offset: usize,
#[case] expected_line: usize,
#[case] expected_column: usize,
) {
let pos = Pos::ORIGIN.advance(ch);
assert_eq!(pos.byte_offset, expected_byte_offset);
assert_eq!(pos.line, expected_line);
assert_eq!(pos.column, expected_column);
}
// Two newlines put the cursor on line 3; the trailing 'c' gives column 1.
#[test]
fn advance_multiple_lines() {
let pos = Pos::ORIGIN
.advance('a')
.advance('\n')
.advance('b')
.advance('\n')
.advance('c');
assert_eq!(pos.line, 3);
assert_eq!(pos.column, 1);
}
// `Span` is two `u32` fields and must stay at 8 bytes. Checked at run time
// here and again at compile time by the const assertion below.
#[test]
fn span_size_is_eight_bytes() {
assert_eq!(std::mem::size_of::<Span>(), 8);
}
const _: () = assert!(
std::mem::size_of::<Span>() == 8,
"Span must be exactly 8 bytes"
);
// Compile-time check that `Span` is `Copy` (same technique as `pos_is_copy`).
#[test]
fn span_is_copy() {
let span = Span { start: 0, end: 0 };
let span2 = span;
let _ = span.start;
let _ = span2.start;
}
// `column_at` counts chars, not bytes, in the prefix before the offset.
// Case columns: line content, byte offset into it, expected column.
#[rstest]
#[case::empty_prefix("hello", 0, 0)]
#[case::ascii_mid_line("hello world", 5, 5)]
#[case::ascii_full_line("abc", 3, 3)]
#[case::multibyte_only_prefix("日本語xyz", 9, 3)]
#[case::ascii_then_multibyte("ab日本", 8, 4)]
#[case::multibyte_then_ascii("日ab", 5, 3)]
#[case::full_multibyte_line("日本語", 9, 3)]
fn column_at_cases(
#[case] line_content: &str,
#[case] byte_offset: usize,
#[case] expected: usize,
) {
assert_eq!(column_at(line_content, byte_offset), expected);
}
// `advance_within_line` adds the content's byte length to the offset and its
// char count to the column, leaving the line untouched.
// Case columns: start position, content, expected byte offset, line, column.
#[rstest]
#[case::empty_content(Pos { byte_offset: 5, line: 2, column: 3 }, "", 5, 2, 3)]
#[case::ascii_from_origin(Pos::ORIGIN, "hello", 5, 1, 5)]
#[case::ascii_mid_line(Pos { byte_offset: 10, line: 3, column: 4 }, "abc", 13, 3, 7)]
#[case::multibyte_from_origin(Pos::ORIGIN, "日本語", 9, 1, 3)]
#[case::multibyte_mid_line(Pos { byte_offset: 4, line: 1, column: 2 }, "日本語", 13, 1, 5)]
#[case::mixed_ascii_then_multibyte(Pos::ORIGIN, "ab日", 5, 1, 3)]
fn advance_within_line_fields(
#[case] start: Pos,
#[case] content: &str,
#[case] expected_byte_offset: usize,
#[case] expected_line: usize,
#[case] expected_column: usize,
) {
let result = advance_within_line(start, content);
assert_eq!(result.byte_offset, expected_byte_offset);
assert_eq!(result.line, expected_line);
assert_eq!(result.column, expected_column);
}
// The line number of the input position is passed through unchanged.
#[test]
fn advance_within_line_line_field_is_preserved() {
let pos = Pos {
byte_offset: 0,
line: 7,
column: 0,
};
let result = advance_within_line(pos, "xyz");
assert_eq!(result.line, 7);
}
// The bulk helper must agree with folding `Pos::advance` over each char
// (ASCII content).
#[test]
fn advance_within_line_matches_advance_loop_ascii() {
let pos = Pos {
byte_offset: 2,
line: 1,
column: 2,
};
let content = "abc";
let expected = content.chars().fold(pos, super::Pos::advance);
assert_eq!(advance_within_line(pos, content), expected);
}
// Same equivalence check with multi-byte characters in the content.
#[test]
fn advance_within_line_matches_advance_loop_multibyte() {
let pos = Pos {
byte_offset: 0,
line: 1,
column: 0,
};
let content = "日本語xyz";
let expected = content.chars().fold(pos, super::Pos::advance);
assert_eq!(advance_within_line(pos, content), expected);
}
// An empty source has no recorded terminators; offset 0 is line 1, column 0.
#[test]
fn line_index_empty_string_produces_no_newlines() {
let idx = LineIndex::new("");
assert!(idx.newlines.is_empty());
assert_eq!(idx.line_column(0), (1, 0));
}
// Without any terminator every offset, including the end offset 5, is on line 1.
#[test]
fn line_index_single_line_no_newline() {
let idx = LineIndex::new("hello");
assert_eq!(idx.line_column(0), (1, 0));
assert_eq!(idx.line_column(4), (1, 4));
assert_eq!(idx.line_column(5), (1, 5));
}
// Offset 5 is the '\n' itself and is still reported on line 1; offset 6 (just
// past it) is the start of line 2.
#[test]
fn line_index_single_newline_at_end() {
let idx = LineIndex::new("hello\n");
assert_eq!(idx.line_column(0), (1, 0));
assert_eq!(idx.line_column(4), (1, 4));
assert_eq!(idx.line_column(5), (1, 5)); assert_eq!(idx.line_column(6), (2, 0)); }
// Offsets 0, 2 and 4 are the first bytes of lines 1, 2 and 3.
#[test]
fn line_index_multiple_lines_line_numbers_correct() {
let idx = LineIndex::new("a\nb\nc");
assert_eq!(idx.line_column(0), (1, 0));
assert_eq!(idx.line_column(2), (2, 0));
assert_eq!(idx.line_column(4), (3, 0));
}
// Three 3-byte chars occupy bytes 0..9, so the '\n' at byte 9 is column 3,
// not column 9. Line 2 starts at byte 10.
#[test]
fn line_index_column_is_codepoint_count_not_byte_count() {
let idx = LineIndex::new("日本語\nfoo");
assert_eq!(idx.line_column(9), (1, 3)); assert_eq!(idx.line_column(10), (2, 0));
assert_eq!(idx.line_column(11), (2, 1));
}
// Pure-ASCII lookup on the second line: byte 5 is 'y', i.e. line 2, column 1.
#[test]
fn line_index_ascii_fast_path_matches_general_path() {
let idx = LineIndex::new("abc\nxyz");
assert_eq!(idx.line_column(5), (2, 1)); }
// '日' occupies bytes 2..5, so byte 5 ('x') is column 3, byte 6 is column 4,
// byte 8 (the '\n') is column 6, and byte 9 starts line 2.
#[test]
fn line_index_multibyte_mid_line() {
let idx = LineIndex::new("ab日xyz\nok");
assert_eq!(idx.line_column(2), (1, 2));
assert_eq!(idx.line_column(5), (1, 3)); assert_eq!(idx.line_column(6), (1, 4)); assert_eq!(idx.line_column(8), (1, 6)); assert_eq!(idx.line_column(9), (2, 0)); }
// "\r\n" is a single line break: 'b' at byte 3 is line 2, column 0.
#[test]
fn line_index_crlf_line_endings() {
let idx = LineIndex::new("a\r\nb");
assert_eq!(idx.line_column(0), (1, 0));
assert_eq!(idx.line_column(3), (2, 0));
}
// A lone '\r' is also a line break: 'b' at byte 2 is line 2, column 0.
#[test]
fn line_index_bare_cr_line_endings() {
let idx = LineIndex::new("a\rb");
assert_eq!(idx.line_column(0), (1, 0));
assert_eq!(idx.line_column(2), (2, 0));
}
// Test oracle: walks `source` with `Pos::advance` until the cursor's byte
// offset reaches `offset`, then returns its (line, column).
// `Pos::advance` only breaks lines on '\n', so the oracle is comparable with
// `LineIndex` only for inputs without '\r' (the property inputs below
// contain none).
fn eager_line_column(source: &str, offset: usize) -> (u32, u32) {
let mut pos = Pos::ORIGIN;
for ch in source.chars() {
if pos.byte_offset >= offset {
break;
}
pos = pos.advance(ch);
}
#[expect(
clippy::cast_possible_truncation,
reason = "test oracle: values fit u32 in realistic inputs"
)]
(pos.line as u32, pos.column as u32)
}
proptest! {
// Property: for every char-start offset of a random `[a-z\n]` string of up
// to 50 chars, and for the end offset, `LineIndex::line_column` agrees with
// the oracle.
#[test]
#[expect(
clippy::unwrap_used,
clippy::cast_possible_truncation,
reason = "proptest: regex strategy is valid; offset fits u32 for any string of length <= 50"
)]
fn line_index_line_column_matches_advance_loop_ascii(
input in proptest::string::string_regex("[a-z\n]{0,50}").unwrap()
) {
let idx = LineIndex::new(&input);
let mut offset = 0usize;
for ch in input.chars() {
let expected = eager_line_column(&input, offset);
let got = idx.line_column(offset as u32);
prop_assert_eq!(
got, expected,
"mismatch at offset {} in {:?}", offset, input
);
offset += ch.len_utf8();
}
// NOTE(review): after the loop `offset` equals `input.len()`, so this
// condition always holds; the body is the end-of-input check.
if offset <= input.len() {
let expected = eager_line_column(&input, offset);
let got = idx.line_column(offset as u32);
prop_assert_eq!(got, expected, "mismatch at end offset {}", offset);
}
}
// Same property with code points U+4E00..=U+4E10 mixed in, for strings of up
// to 30 chars, so byte offsets and columns diverge.
// NOTE(review): unlike the ASCII property above, the end-of-input offset is
// not checked here.
#[test]
#[expect(
clippy::unwrap_used,
clippy::cast_possible_truncation,
reason = "proptest: regex strategy is valid; offset fits u32 for any string of length <= 30"
)]
fn line_index_line_column_matches_advance_loop_multibyte(
input in proptest::string::string_regex("[a-z\n\u{4E00}-\u{4E10}]{0,30}").unwrap()
) {
let idx = LineIndex::new(&input);
let mut offset = 0usize;
for ch in input.chars() {
let expected = eager_line_column(&input, offset);
let got = idx.line_column(offset as u32);
prop_assert_eq!(
got, expected,
"mismatch at offset {} in {:?}", offset, input
);
offset += ch.len_utf8();
}
}
}
}