use std::{fmt, ops::Range};
use derive_more::Add;
use serde::Serialize;
use wide::{CmpEq as _, CmpLt as _, i8x32};
use zerocopy::transmute_ref;
/// Number of `i8` lanes in one [`i8x32`] vector; this is the chunk size used
/// when scanning ASCII text.
const SIMD_LANES: usize = i8x32::LANES as usize;
/// A range delimited by a start and an end [`TextIndex`].
pub type TextRange = Range<TextIndex>;
/// A position inside a piece of source text, tracked in several coordinate
/// systems at once.
///
/// Note that the `Ord` implementation for this type compares `utf8` only,
/// while the derived `PartialEq`/`Eq` compare every field.
#[derive(Default, Hash, Copy, Clone, PartialEq, Eq, Debug, Serialize, Add)]
pub struct TextIndex {
/// Offset measured in UTF-8 bytes.
pub utf8: usize,
/// Offset measured in UTF-16 code units.
pub utf16: usize,
/// Zero-based line number (`Display` prints it one-based).
pub line: usize,
/// Zero-based column within the current line, advanced once per character
/// rather than per byte (`Display` prints it one-based).
pub column: usize,
}
impl TextIndex {
pub const ZERO: TextIndex = TextIndex {
utf8: 0,
utf16: 0,
line: 0,
column: 0,
};
#[inline]
pub fn advance(&mut self, c: char, next: Option<&char>) {
if c.is_ascii() {
self.utf8 += 1;
self.utf16 += 1;
match (c, next) {
('\r', Some(&'\n')) => {
}
('\n' | '\r', _) => {
self.line += 1;
self.column = 0;
}
_ => {
self.column += 1;
}
}
} else {
self.utf8 += c.len_utf8();
self.utf16 += c.len_utf16();
match c {
'\u{2028}' | '\u{2029}' => {
self.line += 1;
self.column = 0;
}
_ => {
self.column += 1;
}
}
}
}
#[inline]
fn advance_unicode(&mut self, c: char) {
debug_assert!(!c.is_ascii());
self.utf8 += c.len_utf8();
self.utf16 += c.len_utf16();
match c {
'\u{2028}' | '\u{2029}' => {
self.line += 1;
self.column = 0;
}
_ => {
self.column += 1;
}
}
}
#[inline]
fn advance_by(&mut self, advance: &Advance) {
self.utf8 += advance.bytes;
self.utf16 += advance.bytes;
self.line += advance.lines;
match advance.column {
Column::Increment(n) => self.column += n,
Column::Set(n) => self.column = n,
}
}
}
impl fmt::Display for TextIndex {
    /// Formats the position as `line:column`, both converted to one-based
    /// numbers.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let line = self.line + 1;
        let column = self.column + 1;
        write!(f, "{line}:{column}")
    }
}
impl PartialOrd for TextIndex {
    /// Always yields `Some`: the ordering is total and is delegated to the
    /// `Ord` implementation.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(Ord::cmp(self, other))
    }
}
impl Ord for TextIndex {
    /// Orders positions by their UTF-8 byte offset alone; `utf16`, `line` and
    /// `column` do not take part in the comparison.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let (lhs, rhs) = (self.utf8, other.utf8);
        lhs.cmp(&rhs)
    }
}
/// How a scanned run of bytes changes the column of a [`TextIndex`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Column {
/// Add the given amount to the current column (the run contained no line
/// feed).
Increment(usize),
/// Replace the column with the given value (the run contained at least one
/// line feed, so the column is counted from the last one).
Set(usize),
}
/// Summary of a scanned run of leading ASCII bytes from one SIMD chunk.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Advance {
/// Number of ASCII bytes in the run; added to both the UTF-8 and the UTF-16
/// offsets when applied.
bytes: usize,
/// Number of line feeds (`\n`) found in the run.
lines: usize,
/// Column update to apply after the run.
column: Column,
}
impl Advance {
#[inline]
#[must_use]
fn scan(slice: &[i8], start: usize, next_offset: usize) -> Self {
let bytes = &slice[start..next_offset];
let arr: [i8; SIMD_LANES] = bytes.first_chunk().copied().unwrap_or_else(|| {
let mut arr = [-1; SIMD_LANES];
arr[0..bytes.len()].copy_from_slice(bytes);
arr
});
Self::from(arr)
}
}
impl From<[i8; SIMD_LANES]> for Advance {
    /// Summarises the run of ASCII bytes at the start of `chunk`.
    ///
    /// The run ends at the first negative byte (a byte with the high bit set,
    /// i.e. part of a non-ASCII character, or padding). Within the run, line
    /// feeds are counted as lines and carriage returns are excluded from the
    /// column count; a carriage return never starts a new line here.
    #[inline]
    fn from(chunk: [i8; SIMD_LANES]) -> Self {
        let lanes = i8x32::new(chunk);

        // Bit `i` of a bitmask corresponds to byte `i` of the chunk, so the
        // number of trailing zeros of the "negative byte" mask is the length
        // of the leading ASCII run.
        let ascii_len = lanes.simd_lt(i8x32::ZERO).to_bitmask().trailing_zeros() as usize;
        if ascii_len == 0 {
            return Advance {
                bytes: 0,
                lines: 0,
                column: Column::Increment(0),
            };
        }

        #[expect(clippy::cast_possible_wrap)]
        let lf_lanes = i8x32::splat(b'\n' as i8);
        #[expect(clippy::cast_possible_wrap)]
        let cr_lanes = i8x32::splat(b'\r' as i8);

        // Shift the masks left so that the last byte of the ASCII run sits in
        // the most significant bit; anything after the run is shifted out.
        let discarded = SIMD_LANES - ascii_len;
        let lf_bits = lanes.simd_eq(lf_lanes).to_bitmask() << discarded;
        let cr_bits = lanes.simd_eq(cr_lanes).to_bitmask() << discarded;

        if lf_bits == 0 {
            // No line break: every byte except carriage returns is a column.
            return Advance {
                bytes: ascii_len,
                lines: 0,
                column: Column::Increment(ascii_len - cr_bits.count_ones() as usize),
            };
        }

        let lines = lf_bits.count_ones() as usize;
        // Leading zeros = number of bytes in the run after its last line feed.
        let tail_len = lf_bits.leading_zeros() as usize;
        let column = if tail_len == 0 {
            // The run ends with a line feed.
            Column::Set(0)
        } else {
            // Keep only the carriage returns located after the last line feed.
            let tail_cr = (cr_bits >> (SIMD_LANES - tail_len)).count_ones() as usize;
            Column::Set(tail_len - tail_cr)
        };

        Advance {
            bytes: ascii_len,
            lines,
            column,
        }
    }
}
/// Computes the [`TextIndex`] of each requested UTF-8 byte offset in `source`.
///
/// `offsets` is expected to be sorted in ascending order and to lie within
/// `0..=source.len()`. The result contains one entry per distinct position, in
/// order:
///
/// * repeated offsets yield a single entry;
/// * an offset that falls inside a multi-byte character is reported at the end
///   of that character, and all offsets covered by the same character collapse
///   into that one entry.
///
/// Runs of ASCII bytes are processed in SIMD chunks, runs of non-ASCII
/// characters one `char` at a time.
///
/// # Panics
///
/// Panics if `source` is empty. Offsets that are out of order or past the end
/// of `source` are not supported and may cause an out-of-range panic.
pub fn compute_indices(source: &str, offsets: &[usize]) -> Vec<TextIndex> {
    assert!(!source.is_empty(), "source cannot be empty");

    let mut text_indices = Vec::with_capacity(offsets.len());
    let mut current = TextIndex::ZERO;

    let mut ofs_iter = offsets.iter();
    let Some(mut next_offset) = ofs_iter.next() else {
        return text_indices;
    };

    // View the bytes as signed so that "non-ASCII" is simply "negative".
    let bytes: &[i8] = transmute_ref!(source.as_bytes());

    'outer: loop {
        // ASCII fast path: advance chunk by chunk, never past `next_offset`.
        loop {
            let advance = Advance::scan(bytes, current.utf8, *next_offset);
            current.advance_by(&advance);

            if current.utf8 == *next_offset {
                text_indices.push(current);
                // Skip repetitions of the offset that was just recorded.
                next_offset = match ofs_iter.find(|o| **o != *next_offset) {
                    Some(o) => o,
                    None => break 'outer,
                };
            }

            // A negative byte starts a non-ASCII character.
            if bytes[current.utf8] < 0 {
                break;
            }
        }

        // Slow path: consume non-ASCII characters until ASCII resumes.
        let remaining_source = &source[current.utf8..];
        let mut char_iter = remaining_source.chars().peekable();
        while let Some(c) = char_iter.next() {
            debug_assert!(
                *next_offset >= current.utf8,
                "next offset {next_offset} is smaller than current {}",
                current.utf8
            );

            current.advance_unicode(c);

            // `>=` because the requested offset may lie inside the character.
            if current.utf8 >= *next_offset {
                text_indices.push(current);
                // Skip every offset already covered by this character.
                next_offset = match ofs_iter.find(|o| **o > current.utf8) {
                    Some(o) => o,
                    None => break 'outer,
                };
            }

            if char_iter.peek().is_some_and(char::is_ascii) {
                break;
            }
        }

        // Stop once the source is exhausted. The comparison uses the full
        // length (not `len - 1`): a single ASCII byte left after a non-ASCII
        // character must still be scanned, otherwise an offset equal to
        // `source.len()` would be silently dropped. A pending offset beyond
        // the end of the source can never be reached, so stop for it as well.
        if current.utf8 >= bytes.len() || *next_offset > bytes.len() {
            break;
        }
    }

    text_indices
}
#[cfg(test)]
#[expect(clippy::cast_possible_wrap)]
mod tests {
    use similar_asserts::assert_eq;

    use super::*;

    /// Reinterprets raw bytes as the signed bytes the scanner operates on.
    fn signed(text: &[u8]) -> Vec<i8> {
        text.iter().map(|b| *b as i8).collect()
    }

    /// Builds one full chunk; panics unless `text` is exactly 32 bytes long.
    fn chunk_of(text: &[u8]) -> [i8; 32] {
        signed(text).as_slice().try_into().unwrap()
    }

    /// Shorthand constructor for an expected [`Advance`].
    fn advance_of(bytes: usize, lines: usize, column: Column) -> Advance {
        Advance {
            bytes,
            lines,
            column,
        }
    }

    /// Shorthand constructor for an expected [`TextIndex`].
    fn index_at(utf8: usize, utf16: usize, line: usize, column: usize) -> TextIndex {
        TextIndex {
            utf8,
            utf16,
            line,
            column,
        }
    }

    /// Asserts that the leading entries of `actual` equal `expected`, one by
    /// one. The total length is deliberately not checked here.
    fn assert_leading(actual: &[TextIndex], expected: &[TextIndex]) {
        for (position, want) in expected.iter().enumerate() {
            assert_eq!(actual[position], *want);
        }
    }

    #[test]
    fn test_advance_simple() {
        let advance = Advance::from(chunk_of(b"abcdabcdabcdabcdabcdabcdabcdabcd"));
        assert_eq!(advance, advance_of(32, 0, Column::Increment(32)));
    }

    #[test]
    fn test_advance_newline() {
        let advance = Advance::from(chunk_of(b"abcdabcdabcdabcdabcdabcdabc\nabcd"));
        assert_eq!(advance, advance_of(32, 1, Column::Set(4)));
    }

    #[test]
    fn test_advance_multiple_newlines() {
        // The trailing `\r` neither counts as a column nor starts a line.
        let advance = Advance::from(chunk_of(b"abcdabcdabc\nabcdabcdabcdab\r\nabc\r"));
        assert_eq!(advance, advance_of(32, 2, Column::Set(3)));
    }

    #[test]
    fn test_advance_unicode() {
        // The scan stops in front of the four-byte crab.
        let advance = Advance::from(chunk_of("abcdabcdabcdabcdabcdabcdabcd🦀".as_bytes()));
        assert_eq!(advance, advance_of(28, 0, Column::Increment(28)));
    }

    #[test]
    fn test_advance_unicode_newlines() {
        let advance = Advance::from(chunk_of("abcdabcdabc\nabcdabcdabc\nabcd🦀".as_bytes()));
        assert_eq!(advance, advance_of(28, 2, Column::Set(4)));
    }

    #[test]
    fn test_advance_next_offset() {
        let text = signed(b"abcdabcdabcdabcdabcdabcdabcdabcd");
        let advance = Advance::scan(text.as_slice(), 0, 28);
        assert_eq!(advance, advance_of(28, 0, Column::Increment(28)));
    }

    #[test]
    fn test_advance_next_offset_newline() {
        // The limit of 28 ends the scan right after the second `\n`.
        let text = signed(b"abcdabcdabcdabc\nabcdabcdabc\nabcd");
        let advance = Advance::scan(text.as_slice(), 0, 28);
        assert_eq!(advance, advance_of(28, 2, Column::Set(0)));
    }

    #[test]
    fn test_compute_indices_simple() {
        let result = compute_indices("hello world", &[0, 5, 6, 10]);

        assert_eq!(result.len(), 4);
        assert_eq!(result[0], TextIndex::ZERO);
        assert_eq!(result[1], index_at(5, 5, 0, 5));
        assert_eq!(result[2], index_at(6, 6, 0, 6));
        assert_eq!(result[3], index_at(10, 10, 0, 10));
    }

    #[test]
    fn test_compute_indices_with_newlines() {
        let result = compute_indices("hello\nworld\ntest", &[0, 5, 6, 12, 15]);

        assert_eq!(result.len(), 5);
        assert_leading(
            &result,
            &[
                index_at(0, 0, 0, 0),
                index_at(5, 5, 0, 5),
                index_at(6, 6, 1, 0),
                index_at(12, 12, 2, 0),
                index_at(15, 15, 2, 3),
            ],
        );
    }

    #[test]
    fn test_compute_indices_with_unicode() {
        let result = compute_indices("hel🦀lo", &[0, 3, 7]);

        assert_leading(
            &result,
            &[
                index_at(0, 0, 0, 0),
                index_at(3, 3, 0, 3),
                // The crab is 4 UTF-8 bytes, 2 UTF-16 units and 1 column.
                index_at(7, 5, 0, 4),
            ],
        );
    }

    #[test]
    fn test_compute_indices_with_carriage_return() {
        let result = compute_indices(
            "padding_hello\r\nworld_padding_padding",
            &[8, 13, 14, 16, 36],
        );

        assert_leading(
            &result,
            &[
                index_at(8, 8, 0, 8),
                index_at(13, 13, 0, 13),
                // Past the `\r`: the byte offsets move, the column does not.
                index_at(14, 14, 0, 13),
                index_at(16, 16, 1, 1),
                index_at(36, 36, 1, 21),
            ],
        );
    }

    #[test]
    fn test_compute_indices_duplicate_offsets() {
        let result = compute_indices("hello", &[0, 0, 2, 2, 4]);

        // Repeated offsets are reported once.
        assert_eq!(result.len(), 3);
        assert_leading(
            &result,
            &[
                index_at(0, 0, 0, 0),
                index_at(2, 2, 0, 2),
                index_at(4, 4, 0, 4),
            ],
        );
    }

    #[test]
    fn test_compute_indices_unicode_line_separators() {
        let result = compute_indices("hello\u{2028}world\u{2029}test", &[0, 5, 8, 16]);

        assert_eq!(result[0], TextIndex::ZERO);
        assert_eq!(result[1], index_at(5, 5, 0, 5));
        assert_eq!(result[2], index_at(8, 6, 1, 0));
        assert_eq!(result[3], index_at(16, 12, 2, 0));
    }

    #[test]
    #[should_panic(expected = "source cannot be empty")]
    fn test_compute_indices_empty_source() {
        compute_indices("", &[0]);
    }
}