use std::fmt;
#[derive(Debug, Clone)]
pub struct LineIndex {
line_starts: Vec<u32>,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum LineIndexError {
OffsetOutOfBounds { byte: usize, source_len: usize },
OffsetTooLarge { byte: usize },
NotUtf8Boundary { byte: usize },
}
impl fmt::Display for LineIndexError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::OffsetOutOfBounds { byte, source_len } => {
write!(f, "byte offset {byte} past source length {source_len}")
}
Self::OffsetTooLarge { byte } => write!(f, "byte offset {byte} exceeds u32 range"),
Self::NotUtf8Boundary { byte } => write!(f, "byte {byte} not on a UTF-8 boundary"),
}
}
}
impl std::error::Error for LineIndexError {}
impl LineIndex {
#[must_use]
pub fn new(source: &str) -> Self {
let mut line_starts: Vec<u32> = Vec::with_capacity(source.len() / 40);
line_starts.push(0);
for (i, b) in source.bytes().enumerate() {
if b == b'\n' {
let next = u32::try_from(i.saturating_add(1)).unwrap_or(u32::MAX);
line_starts.push(next);
}
}
Self { line_starts }
}
pub fn locate(&self, source: &str, byte: usize) -> Result<(usize, usize), LineIndexError> {
if byte > source.len() {
return Err(LineIndexError::OffsetOutOfBounds {
byte,
source_len: source.len(),
});
}
let byte_u32 = u32::try_from(byte).map_err(|_| LineIndexError::OffsetTooLarge { byte })?;
let idx = match self.line_starts.binary_search(&byte_u32) {
Ok(i) => i,
Err(i) => i.saturating_sub(1),
};
let line_start = self.line_starts.get(idx).copied().unwrap_or(0) as usize;
let prefix = source
.get(line_start..byte)
.ok_or(LineIndexError::NotUtf8Boundary { byte })?;
let column = prefix.chars().count().saturating_add(1);
Ok((idx.saturating_add(1), column))
}
#[must_use]
pub fn byte_of_position_0based(&self, source: &str, line: usize, column: usize) -> Option<usize> {
let line_start = self.line_starts.get(line).copied()? as usize;
let line_end = self
.line_starts
.get(line.saturating_add(1))
.copied()
.map_or(source.len(), |n| n as usize);
let mut visible_end = line_end;
if visible_end > line_start && source.as_bytes().get(visible_end.saturating_sub(1)) == Some(&b'\n') {
visible_end = visible_end.saturating_sub(1);
if visible_end > line_start && source.as_bytes().get(visible_end.saturating_sub(1)) == Some(&b'\r') {
visible_end = visible_end.saturating_sub(1);
}
}
let line_text = source.get(line_start..visible_end)?;
let mut cursor = line_start;
let mut remaining = column;
for ch in line_text.chars() {
if remaining == 0 {
return Some(cursor);
}
cursor = cursor.saturating_add(ch.len_utf8());
remaining = remaining.saturating_sub(1);
}
(remaining == 0).then_some(visible_end)
}
#[must_use]
pub fn line_bounds(&self, source: &str, byte: usize) -> Option<std::ops::Range<usize>> {
if byte > source.len() {
return None;
}
let byte_u32 = u32::try_from(byte).ok()?;
let idx = match self.line_starts.binary_search(&byte_u32) {
Ok(i) => i,
Err(i) => i.saturating_sub(1),
};
let start = self.line_starts.get(idx).copied()? as usize;
let raw_end = self
.line_starts
.get(idx.saturating_add(1))
.copied()
.map_or(source.len(), |n| n as usize);
let mut end = raw_end;
if end > start && source.as_bytes().get(end.saturating_sub(1)) == Some(&b'\n') {
end = end.saturating_sub(1);
if end > start && source.as_bytes().get(end.saturating_sub(1)) == Some(&b'\r') {
end = end.saturating_sub(1);
}
}
Some(start..end)
}
}
#[cfg(test)]
mod tests {
use super::LineIndex;
fn locate(idx: &LineIndex, source: &str, byte: usize) -> Result<(usize, usize), String> {
idx.locate(source, byte).map_err(|err| err.to_string())
}
#[test]
fn locate_start_of_first_line() -> Result<(), String> {
let src = "hello\nworld\n";
let idx = LineIndex::new(src);
assert_eq!(locate(&idx, src, 0)?, (1, 1));
Ok(())
}
#[test]
fn locate_after_newline() -> Result<(), String> {
let src = "hello\nworld\n";
let idx = LineIndex::new(src);
assert_eq!(locate(&idx, src, 6)?, (2, 1));
Ok(())
}
#[test]
fn locate_codepoint_column() -> Result<(), String> {
let src = "αβγ\n";
let idx = LineIndex::new(src);
assert_eq!(locate(&idx, src, 4)?, (1, 3));
Ok(())
}
#[test]
fn rejects_out_of_range() {
let src = "hi\n";
let idx = LineIndex::new(src);
assert!(idx.locate(src, 99).is_err());
}
#[test]
fn byte_of_position_0based_basic() {
let src = "hello\nworld\n";
let idx = LineIndex::new(src);
assert_eq!(idx.byte_of_position_0based(src, 0, 0), Some(0));
assert_eq!(idx.byte_of_position_0based(src, 0, 5), Some(5));
assert_eq!(idx.byte_of_position_0based(src, 1, 0), Some(6));
assert_eq!(idx.byte_of_position_0based(src, 1, 5), Some(11));
}
#[test]
fn byte_of_position_0based_codepoints() {
let src = "αβγ\n";
let idx = LineIndex::new(src);
assert_eq!(idx.byte_of_position_0based(src, 0, 0), Some(0));
assert_eq!(idx.byte_of_position_0based(src, 0, 1), Some(2));
assert_eq!(idx.byte_of_position_0based(src, 0, 2), Some(4));
assert_eq!(idx.byte_of_position_0based(src, 0, 3), Some(6));
}
#[test]
fn byte_of_position_0based_out_of_range() {
let src = "hi\n";
let idx = LineIndex::new(src);
assert!(idx.byte_of_position_0based(src, 5, 0).is_none());
assert!(idx.byte_of_position_0based(src, 0, 99).is_none());
}
}