use memchr::{memchr, memchr2, memchr3, memmem};
use crate::common::Position;
/// A snapshot of where a [`Cursor`] is within its input.
///
/// Equality **and** ordering are both defined by `byte_index` alone, so that
/// `a == b` holds exactly when `a.cmp(&b)` is `Ordering::Equal`, as the `Ord`
/// and `PartialOrd` contracts require. The remaining fields are bookkeeping
/// that the cursor maintains while it moves.
#[derive(Clone, Copy, Eq, Debug)]
pub struct CursorPosition {
    /// Offset into the input, in bytes.
    pub byte_index: usize,
    /// Offset into the input, in `char`s (Unicode scalar values).
    pub char_index: usize,
    /// Value of `char_index` immediately after the most recent `\n` that the
    /// cursor moved past; `0` until a `\n` has been passed.
    last_line_offset: u32,
    /// Number of `\n` characters the cursor has moved past.
    last_line_number: u32,
}

impl PartialEq for CursorPosition {
    /// Two positions are equal when they refer to the same byte offset.
    fn eq(&self, other: &Self) -> bool {
        self.byte_index == other.byte_index
    }
}

impl PartialOrd for CursorPosition {
    /// Delegates to [`Ord::cmp`], so the result is always `Some`.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for CursorPosition {
    /// Orders positions by byte offset only, mirroring [`PartialEq::eq`].
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.byte_index.cmp(&other.byte_index)
    }
}
impl CursorPosition {
    /// Builds a [`Position`] from this cursor position.
    ///
    /// The four arguments handed to `Position::new` are, in order: the byte
    /// offset, the count of `\n` characters passed, the number of chars since
    /// the most recent `\n`, and the char offset. The `usize` offsets are
    /// narrowed to `u32` with `as`, so values above `u32::MAX` are truncated.
    #[inline(always)]
    pub fn to_position(&self) -> Position {
        let byte_offset = self.byte_index as u32;
        let line_number = self.last_line_number;
        let chars_since_line_start = self.char_index as u32 - self.last_line_offset;
        let char_offset = self.char_index as u32;

        Position::new(byte_offset, line_number, chars_since_line_start, char_offset)
    }
}
/// A cursor over a borrowed UTF-8 string that tracks byte offset, char offset
/// and `\n`-based line information as it moves forward.
pub struct Cursor<'a> {
/// The text being traversed.
pub input: &'a str,
/// The same text as raw bytes (`Cursor::new` sets this to `input.as_bytes()`).
pub bytes: &'a [u8],
/// Length of the text in bytes (`Cursor::new` sets this to `input.len()`).
pub len: usize,
/// Current location of the cursor within `input`.
pub position: CursorPosition,
}
impl<'a> Cursor<'a> {
    /// Creates a cursor positioned at the very start of `input`.
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            bytes: input.as_bytes(),
            len: input.len(),
            position: CursorPosition {
                byte_index: 0,
                char_index: 0,
                last_line_offset: 0,
                last_line_number: 0,
            },
        }
    }

    /// Replaces the current position (all four fields) with `pos`.
    ///
    /// No validation is performed; `pos` is taken as-is.
    #[inline(always)]
    pub fn override_position(&mut self, pos: CursorPosition) {
        // `CursorPosition` is `Copy`, so one assignment copies every field.
        self.position = pos;
    }

    /// Returns the byte under the cursor, or the last byte of the input when
    /// the cursor is at or past the end.
    ///
    /// # Panics
    ///
    /// Panics when the input is empty, since there is no byte to return.
    #[inline(always)]
    pub fn current_byte(&self) -> u8 {
        let s = if self.position.byte_index >= self.len {
            // Clamp to the final byte; underflows (and panics) on empty input.
            self.len - 1
        } else {
            self.position.byte_index
        };
        self.bytes[s]
    }

    /// Returns the byte `offset` bytes after the cursor.
    ///
    /// # Panics
    ///
    /// Panics if `byte_index + offset` is not inside the input.
    pub fn byte_ahead(&self, offset: usize) -> u8 {
        debug_assert!(self.position.byte_index + offset < self.len);
        self.bytes[self.position.byte_index + offset]
    }

    /// Returns the byte immediately after the one under the cursor.
    ///
    /// # Panics
    ///
    /// Panics if there is no such byte.
    #[inline(always)]
    pub fn next_byte(&self) -> u8 {
        self.bytes[self.position.byte_index + 1]
    }

    /// Moves the cursor forward by exactly one `char`.
    ///
    /// Does nothing when the cursor has already reached the end. Passing a
    /// `\n` bumps the line counter and records the char offset that follows it.
    ///
    /// # Panics
    ///
    /// Panics if `byte_index` is not on a UTF-8 char boundary.
    #[inline(always)]
    pub fn increment(&mut self) {
        if self.ended() {
            return;
        }
        let s = &self.input[self.position.byte_index..];
        // Not ended, so the remaining text holds at least one char.
        let c = s.chars().next().unwrap();
        let char_len = c.len_utf8();
        self.position.byte_index += char_len;
        self.position.char_index += 1;
        if c == '\n' {
            self.position.last_line_number += 1;
            self.position.last_line_offset = self.position.char_index as u32;
        }
    }

    /// Moves the cursor forward by `len` **bytes**, updating the char offset
    /// and the `\n`-based line bookkeeping for everything that was skipped.
    ///
    /// # Panics
    ///
    /// Panics if the target offset is past the end of the input, or if the
    /// skipped segment contains non-ASCII bytes and does not start and end on
    /// UTF-8 char boundaries.
    pub fn advance(&mut self, len: usize) {
        let start = self.position.byte_index;
        let new_byte = start + len;
        debug_assert!(new_byte <= self.len, "Advance past end of input");
        let seg = &self.bytes[start..new_byte];

        // In an all-ASCII segment every byte is one char, so no decoding is needed.
        let seg_is_ascii = seg.is_ascii();
        let delta_ch = if seg_is_ascii {
            len
        } else {
            self.input[start..new_byte].chars().count()
        };

        // One pass over the newlines: count them and remember the last one.
        // Only the final newline determines `last_line_offset`, so the char
        // distance is measured once rather than once per newline.
        let mut newline_count = 0u32;
        let mut last_newline = None;
        for nl_offset in memchr::memchr_iter(b'\n', seg) {
            newline_count += 1;
            last_newline = Some(nl_offset);
        }
        if let Some(nl_offset) = last_newline {
            // Number of chars from `start` up to and including the newline.
            let chars_to_nl = if seg_is_ascii {
                nl_offset + 1
            } else {
                self.input[start..=start + nl_offset].chars().count()
            };
            self.position.last_line_number += newline_count;
            self.position.last_line_offset = (self.position.char_index + chars_to_nl) as u32;
        }

        self.position.byte_index = new_byte;
        self.position.char_index += delta_ch;
    }

    /// Advances to the end of the input and returns the resulting position.
    #[inline(always)]
    pub fn to_end(&mut self) -> CursorPosition {
        self.advance(self.len - self.position.byte_index);
        self.position
    }

    /// Returns `true` once the cursor is at or past the end of the input.
    #[inline(always)]
    pub fn ended(&self) -> bool {
        self.position.byte_index >= self.len
    }

    /// Returns the bytes from the cursor to the end of the input.
    ///
    /// # Panics
    ///
    /// Panics if `byte_index` is greater than the input length.
    #[inline(always)]
    pub fn remaining(&self) -> &[u8] {
        &self.bytes[self.position.byte_index..]
    }

    /// Finds the first occurrence of `byte1` or `byte2` in the remaining
    /// bytes and returns its offset **relative to the cursor**.
    ///
    /// This is a raw byte search; a match may land inside a multi-byte UTF-8
    /// sequence.
    pub fn search2(&self, byte1: u8, byte2: u8) -> Option<usize> {
        let haystack = self.remaining();
        if byte1 == byte2 {
            // Identical needles: a single-byte search is sufficient.
            memchr(byte1, haystack)
        } else {
            memchr2(byte1, byte2, haystack)
        }
    }

    /// Finds the first occurrence of `byte1`, `byte2` or `byte3` in the
    /// remaining bytes and returns its offset **relative to the cursor**.
    pub fn search3(&self, byte1: u8, byte2: u8, byte3: u8) -> Option<usize> {
        let haystack = self.remaining();
        memchr3(byte1, byte2, byte3, haystack)
    }

    /// Returns `true` when the remaining bytes begin with `needle`.
    #[inline(always)]
    pub fn next_bytes_equal(&self, needle: &[u8]) -> bool {
        self.remaining().starts_with(needle)
    }
}
/// Returns the byte offset of the first occurrence of `needle` within
/// `haystack`, or `None` when `needle` does not occur.
///
/// Note the argument order: `needle` comes first here, which is the reverse
/// of the `memmem::find(haystack, needle)` call this delegates to.
#[inline(always)]
pub fn find_subslice(needle: &[u8], haystack: &[u8]) -> Option<usize> {
memmem::find(haystack, needle)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::cursor::cursor::Cursor;

    // Every non-ASCII fixture below is spelled with `\u{..}` escapes so that
    // the tests do not depend on how this source file is encoded or
    // transcoded by editors and tooling.

    /// Returns `true` when `b` is not a UTF-8 continuation byte (`10xxxxxx`).
    #[inline(always)]
    fn is_lead(b: u8) -> bool {
        (b & 0xC0) != 0x80
    }

    /// Steps through `input` one char at a time and records every position,
    /// including the initial one.
    fn step_all(input: &str) -> Vec<CursorPosition> {
        let mut c = Cursor::new(input);
        let mut out = Vec::new();
        out.push(c.position);
        while !c.ended() {
            c.increment();
            out.push(c.position);
        }
        out
    }

    /// After `to_end`, byte and char offsets must match the std counts.
    fn assert_end_indices(input: &str) {
        let mut c = Cursor::new(input);
        let end = c.to_end();
        assert!(c.ended());
        assert_eq!(
            end.byte_index,
            input.len(),
            "byte_index must equal input.len()"
        );
        assert_eq!(
            end.char_index,
            input.chars().count(),
            "char_index must equal chars().count()"
        );
    }

    /// `increment` must advance by exactly one char and its UTF-8 width.
    fn assert_step_matches_chars(input: &str) {
        let positions = step_all(input);
        assert_eq!(
            positions.len(),
            input.chars().count() + 1,
            "one position per char, plus initial"
        );
        let mut cumulative = 0usize;
        assert_eq!(positions[0].byte_index, 0);
        assert_eq!(positions[0].char_index, 0);
        for (i, ch) in input.chars().enumerate() {
            cumulative += ch.len_utf8();
            let p = positions[i + 1];
            assert_eq!(p.char_index, i + 1, "char_index increments by 1 per char");
            assert_eq!(
                p.byte_index, cumulative,
                "byte_index increments by UTF-8 width"
            );
        }
    }

    /// Line bookkeeping must change on `\n` and on nothing else.
    fn assert_line_tracking_lf_only(input: &str) {
        let mut c = Cursor::new(input);
        let mut expected_line = 0u32;
        let mut expected_last_line_offset = 0u32;
        assert_eq!(c.position.last_line_number, expected_line);
        assert_eq!(c.position.last_line_offset, expected_last_line_offset);
        while !c.ended() {
            let b = c.current_byte();
            c.increment();
            if b == b'\n' {
                expected_line += 1;
                expected_last_line_offset = c.position.char_index as u32;
            }
            assert_eq!(
                c.position.last_line_number, expected_line,
                "line number should track LF only"
            );
            assert_eq!(
                c.position.last_line_offset, expected_last_line_offset,
                "last_line_offset tracks char_index at start of current line"
            );
        }
    }

    /// `remaining` must always be the byte suffix starting at `byte_index`.
    fn assert_remaining_is_suffix(input: &str) {
        let mut c = Cursor::new(input);
        while !c.ended() {
            let bi = c.position.byte_index;
            assert_eq!(c.remaining(), &input.as_bytes()[bi..]);
            c.increment();
        }
        assert_eq!(c.remaining(), &input.as_bytes()[input.len()..]);
    }

    /// While stepping, `byte_index` must never land on a continuation byte.
    fn assert_byte_index_always_aligned(input: &str) {
        let mut c = Cursor::new(input);
        while !c.ended() {
            let bi = c.position.byte_index;
            if bi < input.len() {
                let b = input.as_bytes()[bi];
                assert!(
                    is_lead(b),
                    "byte_index must always point to UTF-8 lead byte boundary"
                );
            }
            c.increment();
        }
    }

    #[test]
    fn test_ascii_text() {
        let input = "hello world";
        assert_step_matches_chars(input);
        assert_end_indices(input);
        assert_remaining_is_suffix(input);
        assert_byte_index_always_aligned(input);
    }

    #[test]
    fn test_empty_string() {
        let input = "";
        assert_step_matches_chars(input);
        assert_end_indices(input);
        assert_remaining_is_suffix(input);
    }

    #[test]
    fn test_unicode_chars() {
        // "Hello", two CJK ideographs (3 bytes each) and one emoji (4 bytes).
        let input = "Hello \u{4E16}\u{754C} \u{1F30D}";
        assert_step_matches_chars(input);
        assert_end_indices(input);
        assert_remaining_is_suffix(input);
        assert_byte_index_always_aligned(input);
    }

    #[test]
    fn test_newline_tracking() {
        let input = "line1\nline2\nline3";
        assert_line_tracking_lf_only(input);
        assert_step_matches_chars(input);
    }

    #[test]
    fn test_vue_interpolation() {
        let input = "{{ message }}";
        assert_step_matches_chars(input);
        assert_end_indices(input);
        assert_remaining_is_suffix(input);
    }

    #[test]
    fn test_multiline_vue() {
        let input = "<template>\n <div>{{ text }}</div>\n</template>";
        assert_line_tracking_lf_only(input);
        assert_step_matches_chars(input);
        assert_end_indices(input);
    }

    #[test]
    fn empty_string_end() {
        assert_end_indices("");
        assert_step_matches_chars("");
    }

    #[test]
    fn ascii_simple() {
        let s = "abc";
        assert_step_matches_chars(s);
        assert_end_indices(s);
        assert_remaining_is_suffix(s);
        assert_byte_index_always_aligned(s);
    }

    #[test]
    fn ascii_with_newlines_lf() {
        let s = "a\nb\n\nc";
        assert_step_matches_chars(s);
        assert_end_indices(s);
        assert_line_tracking_lf_only(s);
        let mut c = Cursor::new(s);
        // Skip "a\n": the cursor now sits at the start of the second line.
        c.advance(2);
        assert_eq!(c.position.last_line_number, 1);
        assert_eq!(c.position.last_line_offset, c.position.char_index as u32);
    }

    #[test]
    fn ascii_crlf_is_two_chars_and_only_lf_counts() {
        let s = "a\r\nb";
        assert_step_matches_chars(s);
        assert_line_tracking_lf_only(s);
        let mut c = Cursor::new(s);
        // Skip "a\r".
        c.advance(2);
        assert_eq!(
            c.position.last_line_number, 0,
            "CR alone should not increment line counter"
        );
        // Step over "\n".
        c.increment();
        assert_eq!(
            c.position.last_line_number, 1,
            "LF increments line counter even in CRLF"
        );
    }

    #[test]
    fn override_position_roundtrip() {
        let s = "hello\nworld";
        let mut c1 = Cursor::new(s);
        c1.advance(3);
        let saved = c1.position;
        let mut c2 = Cursor::new(s);
        c2.override_position(saved);
        assert_eq!(c2.position, saved);
        assert_eq!(c2.remaining(), &s.as_bytes()[saved.byte_index..]);
    }

    #[test]
    fn unicode_two_byte_chars() {
        // Three precomposed Latin letters, two UTF-8 bytes each.
        let s = "\u{E9}\u{E8}\u{EA}";
        assert_step_matches_chars(s);
        assert_end_indices(s);
        assert_byte_index_always_aligned(s);
    }

    #[test]
    fn unicode_three_byte_chars() {
        // Two CJK ideographs, three UTF-8 bytes each.
        let s = "\u{6C49}\u{5B57}";
        assert_step_matches_chars(s);
        assert_end_indices(s);
        assert_byte_index_always_aligned(s);
    }

    #[test]
    fn unicode_four_byte_chars_emoji() {
        // Two emoji, four UTF-8 bytes each.
        let s = "\u{1F600}\u{1F680}";
        assert_step_matches_chars(s);
        assert_end_indices(s);
        assert_byte_index_always_aligned(s);
    }

    #[test]
    fn mixed_ascii_and_unicode() {
        // 1-, 2-, 3- and 4-byte chars followed by ASCII and a newline.
        let s = "a\u{E9}\u{6C49}\u{1F600}z\n";
        assert_step_matches_chars(s);
        assert_end_indices(s);
        assert_line_tracking_lf_only(s);
        assert_byte_index_always_aligned(s);
    }

    #[test]
    fn combining_mark_sequence_counts_as_two_chars() {
        // 'e' followed by U+0301 COMBINING ACUTE ACCENT.
        let s = "e\u{0301}";
        assert_eq!(s.chars().count(), 2);
        assert_step_matches_chars(s);
        assert_end_indices(s);
        assert_byte_index_always_aligned(s);
    }

    #[test]
    fn precomposed_vs_decomposed_have_different_char_counts() {
        // Precomposed U+00E9 versus 'e' + U+0301.
        let nfc = "\u{E9}";
        let nfd = "e\u{0301}";
        assert_eq!(nfc.chars().count(), 1);
        assert_eq!(nfd.chars().count(), 2);
        assert_step_matches_chars(nfc);
        assert_step_matches_chars(nfd);
    }

    #[test]
    fn multiple_combining_marks() {
        let s = "a\u{0301}\u{0327}\u{0308}";
        assert_step_matches_chars(s);
        assert_end_indices(s);
    }

    #[test]
    fn zwj_sequence_multiple_chars() {
        // Emoji + ZERO WIDTH JOINER + emoji.
        let s = "\u{1F469}\u{200D}\u{1F4BB}";
        assert!(s.chars().count() >= 3);
        assert_step_matches_chars(s);
        assert_end_indices(s);
    }

    #[test]
    fn emoji_with_skin_tone_modifier() {
        // Base emoji followed by a Fitzpatrick skin-tone modifier.
        let s = "\u{1F44D}\u{1F3FD}";
        assert_eq!(s.chars().count(), 2);
        assert_step_matches_chars(s);
        assert_end_indices(s);
    }

    #[test]
    fn flag_is_two_regional_indicators() {
        // REGIONAL INDICATOR SYMBOL LETTER P + LETTER T.
        let s = "\u{1F1F5}\u{1F1F9}";
        assert_eq!(s.chars().count(), 2);
        assert_step_matches_chars(s);
        assert_end_indices(s);
    }

    #[test]
    fn variation_selector() {
        // A BMP symbol followed by U+FE0F VARIATION SELECTOR-16.
        let s = "\u{2714}\u{FE0F}";
        assert_eq!(s.chars().count(), 2);
        assert_step_matches_chars(s);
        assert_end_indices(s);
    }

    #[test]
    fn rtl_text_and_bidi_marks() {
        // ASCII, RIGHT-TO-LEFT MARK, five Arabic letters, LEFT-TO-RIGHT MARK, ASCII.
        let s = "abc\u{200F}\u{0645}\u{0631}\u{062D}\u{0628}\u{0627}\u{200E}xyz";
        assert_step_matches_chars(s);
        assert_end_indices(s);
        assert_byte_index_always_aligned(s);
    }

    #[test]
    fn zero_width_space_and_joiners() {
        let s = "a\u{200B}b\u{200C}c\u{200D}d";
        assert_step_matches_chars(s);
        assert_end_indices(s);
    }

    #[test]
    fn unicode_line_separator_does_not_increment_line() {
        let s = "a\u{2028}b";
        assert_step_matches_chars(s);
        let mut c = Cursor::new(s);
        // 'a' is 1 byte, U+2028 is 3 bytes.
        c.advance(1 + 3);
        assert_eq!(
            c.position.last_line_number, 0,
            "U+2028 should not affect line counting under current rules"
        );
    }

    #[test]
    fn nel_does_not_increment_line() {
        let s = "a\u{0085}b";
        assert_step_matches_chars(s);
        let mut c = Cursor::new(s);
        // 'a' is 1 byte, U+0085 is 2 bytes.
        c.advance(1 + 2);
        assert_eq!(
            c.position.last_line_number, 0,
            "NEL should not affect line counting under current rules"
        );
    }

    #[test]
    fn search2_finds_ascii_bytes() {
        let s = "abc:def;ghi";
        let c = Cursor::new(s);
        let pos = c.search2(b':', b';').expect("should find ':' or ';'");
        assert_eq!(s.as_bytes()[pos], b':');
    }

    #[test]
    fn search3_finds_one_of_three() {
        let s = "a,b;c";
        let c = Cursor::new(s);
        let pos = c.search3(b':', b';', b',').unwrap();
        assert_eq!(s.as_bytes()[pos], b',');
    }

    #[test]
    fn next_bytes_equal_matches_prefix() {
        let s = "hello world";
        let c = Cursor::new(s);
        assert!(c.next_bytes_equal(b"hell"));
        assert!(!c.next_bytes_equal(b"world"));
    }

    #[test]
    fn byte_search_can_match_inside_utf8_sequence() {
        // U+00E9 encodes as the two bytes 0xC3 0xA9.
        let s = "\u{E9}";
        let c = Cursor::new(s);
        let idx = c.search2(0xA9, b'x').unwrap();
        assert_eq!(idx, 1);
        assert_eq!(s.as_bytes()[idx], 0xA9);
    }

    #[test]
    fn find_subslice_unicode_bytes() {
        let hay = "a\u{E9}\u{6C49}\u{1F600}z";
        let needle = "\u{6C49}".as_bytes();
        let at = find_subslice(needle, hay.as_bytes()).unwrap();
        assert_eq!(&hay.as_bytes()[at..at + needle.len()], needle);
    }

    #[test]
    fn find_subslice_empty_needle_is_zero() {
        assert_eq!(find_subslice(b"", b"abc"), Some(0));
    }

    #[test]
    fn find_subslice_not_found() {
        assert_eq!(find_subslice(b"xyz", b"abc"), None);
    }

    #[test]
    #[should_panic]
    fn current_byte_panics_at_end() {
        let c = Cursor::new("");
        let _ = c.current_byte();
    }

    #[test]
    #[should_panic]
    fn next_byte_panics_when_no_next() {
        let c = Cursor::new("a");
        let _ = c.next_byte();
    }

    #[test]
    fn increment_is_noop_at_end() {
        let mut c = Cursor::new("a");
        c.to_end();
        let p = c.position;
        c.increment();
        assert_eq!(c.position, p);
    }

    #[test]
    fn property_end_indices_match_std_counts_various_strings() {
        let cases = [
            "",
            "ascii",
            "a\nb\nc",
            "\u{E9}",
            "e\u{0301}",
            "\u{6C49}\u{5B57}",
            "\u{1F600}",
            "\u{1F469}\u{200D}\u{1F4BB}",
            "abc\u{200F}\u{0645}\u{0631}\u{062D}\u{0628}\u{0627}\u{200E}xyz",
            "a\u{2028}b",
            "a\r\nb",
        ];
        for s in cases {
            assert_end_indices(s);
            assert_step_matches_chars(s);
            assert_remaining_is_suffix(s);
            assert_byte_index_always_aligned(s);
        }
    }

    #[test]
    fn property_line_tracking_matches_count_of_lf() {
        let s = "a\n\u{E9}\n\u{6C49}\u{1F600}\n\nx";
        let mut c = Cursor::new(s);
        c.to_end();
        let lf_count = s.as_bytes().iter().filter(|&&b| b == b'\n').count() as u32;
        assert_eq!(c.position.last_line_number, lf_count);
    }

    #[test]
    fn to_position_matches_internal_state() {
        // Smoke test: converting at every step must not panic.
        let s = "a\n\u{E9}\u{6C49}\u{1F600}\nzz";
        let mut c = Cursor::new(s);
        while !c.ended() {
            let _pos = c.position.to_position();
            c.increment();
        }
    }

    #[test]
    fn unicode_to_source() {
        let s = "\u{1F600}\u{1F680}";
        let mut c = Cursor::new(s);
        let start = c.position.to_position();
        let end = c.to_end().to_position();
        let loc = crate::common::SourceLocation::from_source(s, start, end);
        assert_eq!(loc.source, s);
    }
}