use serde::{Deserialize, Serialize};
use std::ops::Range;
pub use crate::{ByteOffset, CharOffset};
/// A `start..end` pair of [`CharOffset`] positions.
///
/// `len`, `is_empty` and `as_range` all treat the pair as half-open (`end`
/// exclusive). Nothing in this type enforces `start <= end`; a reversed pair
/// simply reports a length of zero.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct CharRange {
    /// First offset covered by the range.
    pub start: CharOffset,
    /// Offset one past the last covered position.
    pub end: CharOffset,
}

impl CharRange {
    /// Builds a range from two already-typed offsets.
    #[must_use]
    pub const fn new(start: CharOffset, end: CharOffset) -> Self {
        Self { start, end }
    }

    /// Builds a range from plain `usize` values, wrapping each in a
    /// [`CharOffset`].
    #[must_use]
    pub const fn from_raw(start: usize, end: usize) -> Self {
        Self::new(CharOffset::new(start), CharOffset::new(end))
    }

    /// Number of positions covered, or `0` when `end` does not exceed `start`.
    #[must_use]
    pub const fn len(&self) -> usize {
        let lo = self.start.get();
        let hi = self.end.get();
        hi.saturating_sub(lo)
    }

    /// `true` when the range covers nothing (`start >= end`).
    #[must_use]
    pub const fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// The same bounds as a plain `Range<usize>`.
    #[must_use]
    pub const fn as_range(&self) -> Range<usize> {
        Range {
            start: self.start.get(),
            end: self.end.get(),
        }
    }
}
/// Converts a `(start, end)` tuple of raw indices via `CharRange::from_raw`.
impl From<(usize, usize)> for CharRange {
    fn from(bounds: (usize, usize)) -> Self {
        let (start, end) = bounds;
        Self::from_raw(start, end)
    }
}
/// A `start..end` pair of [`ByteOffset`] positions.
///
/// `len`, `is_empty` and `as_range` all treat the pair as half-open (`end`
/// exclusive). Nothing in this type enforces `start <= end`; a reversed pair
/// simply reports a length of zero.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ByteRange {
    /// First offset covered by the range.
    pub start: ByteOffset,
    /// Offset one past the last covered position.
    pub end: ByteOffset,
}

impl ByteRange {
    /// Builds a range from two already-typed offsets.
    #[must_use]
    pub const fn new(start: ByteOffset, end: ByteOffset) -> Self {
        Self { start, end }
    }

    /// Builds a range from plain `usize` values, wrapping each in a
    /// [`ByteOffset`].
    #[must_use]
    pub const fn from_raw(start: usize, end: usize) -> Self {
        Self::new(ByteOffset::new(start), ByteOffset::new(end))
    }

    /// Number of positions covered, or `0` when `end` does not exceed `start`.
    #[must_use]
    pub const fn len(&self) -> usize {
        let lo = self.start.get();
        let hi = self.end.get();
        hi.saturating_sub(lo)
    }

    /// `true` when the range covers nothing (`start >= end`).
    #[must_use]
    pub const fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// The same bounds as a plain `Range<usize>`.
    #[must_use]
    pub const fn as_range(&self) -> Range<usize> {
        Range {
            start: self.start.get(),
            end: self.end.get(),
        }
    }
}
/// Converts a `(start, end)` tuple of raw indices via `ByteRange::from_raw`.
impl From<(usize, usize)> for ByteRange {
    fn from(bounds: (usize, usize)) -> Self {
        let (start, end) = bounds;
        Self::from_raw(start, end)
    }
}
/// A region of text recorded in both byte offsets and character offsets.
///
/// All four fields are public and are not cross-checked against each other or
/// against any particular text; the constructors below are what keep the two
/// coordinate systems in sync.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TextSpan {
    /// Start of the region as a byte index.
    pub byte_start: usize,
    /// End of the region as a byte index (exclusive in `byte_range`).
    pub byte_end: usize,
    /// Start of the region as a character index.
    pub char_start: usize,
    /// End of the region as a character index (exclusive in `char_range`).
    pub char_end: usize,
}

impl TextSpan {
    /// Builds a span from byte offsets, deriving the character offsets by
    /// scanning `text` with `bytes_to_chars`.
    #[must_use]
    pub fn from_bytes(text: &str, byte_start: usize, byte_end: usize) -> Self {
        let chars = bytes_to_chars(text, byte_start, byte_end);
        Self {
            byte_start,
            byte_end,
            char_start: chars.0,
            char_end: chars.1,
        }
    }

    /// Builds a span from character offsets, deriving the byte offsets by
    /// scanning `text` with `chars_to_bytes`.
    #[must_use]
    pub fn from_chars(text: &str, char_start: usize, char_end: usize) -> Self {
        let bytes = chars_to_bytes(text, char_start, char_end);
        Self {
            byte_start: bytes.0,
            byte_end: bytes.1,
            char_start,
            char_end,
        }
    }

    /// Builds a span whose byte and character offsets are identical.
    ///
    /// No text is consulted, so this is only meaningful when every character
    /// before `end` occupies exactly one byte.
    #[must_use]
    pub const fn ascii(start: usize, end: usize) -> Self {
        Self {
            byte_start: start,
            char_start: start,
            byte_end: end,
            char_end: end,
        }
    }

    /// The byte offsets as a `Range<usize>`.
    #[must_use]
    pub const fn byte_range(&self) -> Range<usize> {
        Range {
            start: self.byte_start,
            end: self.byte_end,
        }
    }

    /// The character offsets as a `Range<usize>`.
    #[must_use]
    pub const fn char_range(&self) -> Range<usize> {
        Range {
            start: self.char_start,
            end: self.char_end,
        }
    }

    /// Length in bytes, or `0` when `byte_end` does not exceed `byte_start`.
    #[must_use]
    pub const fn byte_len(&self) -> usize {
        self.byte_end.saturating_sub(self.byte_start)
    }

    /// Length in characters, or `0` when `char_end` does not exceed
    /// `char_start`.
    #[must_use]
    pub const fn char_len(&self) -> usize {
        self.char_end.saturating_sub(self.char_start)
    }

    /// `true` when the byte range covers nothing; the character offsets are
    /// not consulted.
    #[must_use]
    pub const fn is_empty(&self) -> bool {
        self.byte_len() == 0
    }

    /// `true` when the byte and character offsets coincide at both ends.
    ///
    /// This compares the stored offsets only; it never inspects any text.
    #[must_use]
    pub const fn is_ascii(&self) -> bool {
        let same_start = self.byte_start == self.char_start;
        let same_end = self.byte_end == self.char_end;
        same_start && same_end
    }

    /// Returns the slice of `text` addressed by the byte range.
    ///
    /// Yields `""` instead of panicking when the range is out of bounds,
    /// reversed, or does not fall on UTF-8 character boundaries.
    #[must_use]
    pub fn extract<'a>(&self, text: &'a str) -> &'a str {
        text.get(self.byte_range()).unwrap_or_default()
    }
}
/// Treats a plain range as a span whose byte and character offsets coincide
/// (delegates to `TextSpan::ascii`).
impl From<Range<usize>> for TextSpan {
    fn from(range: Range<usize>) -> Self {
        let Range { start, end } = range;
        Self::ascii(start, end)
    }
}
/// A run of tokens (`start..end`) paired with the text region it corresponds
/// to.
///
/// The token indices and the embedded [`TextSpan`] are stored as given; this
/// type does not verify that they describe the same region.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TokenSpan {
    /// Index of the first token.
    pub start: usize,
    /// Index one past the last token.
    pub end: usize,
    /// Byte/character offsets associated with the token run.
    pub text_span: TextSpan,
}

impl TokenSpan {
    /// Bundles token indices with their text span.
    #[must_use]
    pub const fn new(start: usize, end: usize, text_span: TextSpan) -> Self {
        Self {
            text_span,
            start,
            end,
        }
    }

    /// Number of tokens covered, or `0` when `end` does not exceed `start`.
    #[must_use]
    pub const fn len(&self) -> usize {
        self.end.saturating_sub(self.start)
    }

    /// `true` when no tokens are covered (`start >= end`).
    #[must_use]
    pub const fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// The token indices as a `Range<usize>`.
    #[must_use]
    pub const fn token_range(&self) -> Range<usize> {
        Range {
            start: self.start,
            end: self.end,
        }
    }
}
/// Per-token `(start, end)` character offsets, indexed by token position.
///
/// Entries equal to `(0, 0)` are treated as covering no text: they can never
/// overlap a character span and are skipped when computing the character
/// extent of a token run. (Presumably these are tokenizer special tokens such
/// as padding or sentinels; confirm with whatever produces the offsets.)
#[derive(Debug, Clone)]
pub struct OffsetMapping {
    offsets: Vec<(usize, usize)>,
}

impl OffsetMapping {
    /// Wraps a list of per-token `(start, end)` offsets.
    #[must_use]
    pub fn new(offsets: Vec<(usize, usize)>) -> Self {
        Self { offsets }
    }

    /// Offsets of the token at `token_idx`, or `None` if out of range.
    #[must_use]
    pub fn get(&self, token_idx: usize) -> Option<(usize, usize)> {
        self.offsets.get(token_idx).copied()
    }

    /// Finds the tokens that overlap the character span
    /// `char_start..char_end`.
    ///
    /// Returns `(first, last + 1)`, where `first` and `last` are the lowest
    /// and highest indices of overlapping tokens, or `None` when no token
    /// overlaps. A token overlaps when `tok_end > char_start` and
    /// `tok_start < char_end`.
    #[must_use]
    pub fn char_span_to_tokens(
        &self,
        char_start: usize,
        char_end: usize,
    ) -> Option<(usize, usize)> {
        // A `(0, 0)` entry needs no special handling here: its end is 0, and
        // `0 > char_start` is false for every `usize`, so it never overlaps.
        let overlaps = |&(tok_start, tok_end): &(usize, usize)| {
            tok_end > char_start && tok_start < char_end
        };
        let first = self.offsets.iter().position(overlaps)?;
        let last = self.offsets.iter().rposition(overlaps)?;
        Some((first, last + 1))
    }

    /// Character extent of the token run `token_start..token_end`.
    ///
    /// The start comes from the first entry in the run that is not `(0, 0)`
    /// and the end from the last such entry. Returns `None` when the run is
    /// empty or reversed, reaches past the last token, or consists solely of
    /// `(0, 0)` entries.
    #[must_use]
    pub fn tokens_to_char_span(
        &self,
        token_start: usize,
        token_end: usize,
    ) -> Option<(usize, usize)> {
        // `get` rejects reversed and out-of-bounds runs; an empty run yields
        // an empty slice, for which the searches below find nothing.
        let run = self.offsets.get(token_start..token_end)?;
        let carries_text = |entry: &&(usize, usize)| **entry != (0, 0);
        let (char_start, _) = *run.iter().find(carries_text)?;
        let (_, char_end) = *run.iter().rfind(carries_text)?;
        Some((char_start, char_end))
    }

    /// Number of tokens in the mapping.
    #[must_use]
    pub fn len(&self) -> usize {
        self.offsets.len()
    }

    /// `true` when the mapping holds no tokens.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.offsets.is_empty()
    }
}
/// Converts a byte range within `text` into the corresponding character
/// range.
///
/// Each bound is resolved independently in a single pass over `text`:
///
/// * `byte_start` maps to the index of the character that contains it
///   (a position in the middle of a multi-byte character rounds down).
/// * `byte_end` maps to the index of the character starting at that byte; a
///   position in the middle of a multi-byte character rounds up so the
///   partially covered character is included.
/// * A bound at or beyond `text.len()` maps to the total character count.
///
/// Empty `text` therefore yields `(0, 0)`. A reversed input
/// (`byte_start > byte_end`) yields a reversed character pair rather than a
/// start of `0`, so byte-based and char-based lengths agree on "empty".
#[must_use]
pub fn bytes_to_chars(text: &str, byte_start: usize, byte_end: usize) -> (usize, usize) {
    let mut char_start = None;
    let mut char_end = None;
    let mut char_count = 0;

    for (char_idx, (byte_idx, ch)) in text.char_indices().enumerate() {
        let next_byte_idx = byte_idx + ch.len_utf8();

        // Characters are visited in order, so the first one whose end lies
        // past a bound is the character containing that bound.
        if char_start.is_none() && byte_start < next_byte_idx {
            char_start = Some(char_idx);
        }
        if char_end.is_none() && byte_end < next_byte_idx {
            let on_boundary = byte_end == byte_idx;
            char_end = Some(if on_boundary { char_idx } else { char_idx + 1 });
        }
        if char_start.is_some() && char_end.is_some() {
            break;
        }
        char_count = char_idx + 1;
    }

    // A bound still unresolved here lies at or past the end of `text`; in that
    // case the loop ran to completion and `char_count` is the full count.
    (
        char_start.unwrap_or(char_count),
        char_end.unwrap_or(char_count),
    )
}
/// Converts a character range within `text` into the corresponding byte
/// range.
///
/// Each bound maps to the byte index at which that character starts; a bound
/// at or beyond the number of characters in `text` maps to `text.len()`.
/// The two bounds are resolved independently, so a reversed input
/// (`char_start > char_end`) yields a reversed byte pair rather than a start
/// of `0`.
#[must_use]
pub fn chars_to_bytes(text: &str, char_start: usize, char_end: usize) -> (usize, usize) {
    let mut byte_start = None;
    let mut byte_end = None;

    // No character beyond the larger bound can matter, so stop there.
    let last_needed = char_start.max(char_end);
    let scan = text
        .char_indices()
        .enumerate()
        .take(last_needed.saturating_add(1));

    for (char_idx, (byte_idx, _)) in scan {
        if char_idx == char_start {
            byte_start = Some(byte_idx);
        }
        if char_idx == char_end {
            byte_end = Some(byte_idx);
        }
    }

    let text_len = text.len();
    (
        byte_start.unwrap_or(text_len),
        byte_end.unwrap_or(text_len),
    )
}
/// Builds a lookup table from byte index to character index for `text`.
///
/// The table has `text.len() + 1` entries. Every byte of a multi-byte
/// character maps to that character's index, and the final entry (at
/// `text.len()`) holds the total character count.
#[must_use]
pub fn build_byte_to_char_map(text: &str) -> Vec<usize> {
    let mut table = Vec::with_capacity(text.len() + 1);
    let mut total_chars = 0;
    for (char_idx, ch) in text.chars().enumerate() {
        // Append one slot per byte of this character, all pointing at it.
        table.resize(table.len() + ch.len_utf8(), char_idx);
        total_chars = char_idx + 1;
    }
    table.push(total_chars);
    table
}
/// Builds a lookup table from character index to byte index for `text`.
///
/// The table has one entry per character giving the byte at which it starts,
/// plus a final entry (at the character count) holding `text.len()`.
#[must_use]
pub fn build_char_to_byte_map(text: &str) -> Vec<usize> {
    let char_starts = text.char_indices().map(|(byte_idx, _)| byte_idx);
    char_starts.chain(std::iter::once(text.len())).collect()
}
/// Returns `true` when every byte of `text` is in the ASCII range (including
/// when `text` is empty).
#[must_use]
pub fn is_ascii(text: &str) -> bool {
    text.as_bytes().is_ascii()
}
/// Precomputed byte/character index tables for one piece of text, for
/// converting many offsets without rescanning the text each time.
///
/// For all-ASCII text no tables are built and every conversion is the
/// identity (without any bounds clamping). For other text, out-of-range
/// indices are clamped to the last table entry.
#[derive(Debug, Clone)]
pub struct SpanConverter {
    byte_to_char: Vec<usize>,
    char_to_byte: Vec<usize>,
    is_ascii: bool,
}

impl SpanConverter {
    /// Builds a converter for `text`, skipping table construction when the
    /// text is pure ASCII.
    #[must_use]
    pub fn new(text: &str) -> Self {
        if is_ascii(text) {
            return Self {
                byte_to_char: Vec::new(),
                char_to_byte: Vec::new(),
                is_ascii: true,
            };
        }
        Self {
            byte_to_char: build_byte_to_char_map(text),
            char_to_byte: build_char_to_byte_map(text),
            is_ascii: false,
        }
    }

    /// Looks up `idx` in `table`, falling back to the last entry (or `0` for
    /// an empty table) when `idx` is out of range.
    ///
    /// In debug builds an index more than one past the last valid slot
    /// triggers an assertion failure; release builds clamp silently.
    fn lookup_clamped(table: &[usize], idx: usize, label: &str) -> usize {
        table.get(idx).copied().unwrap_or_else(|| {
            let max_valid = table.len().saturating_sub(1);
            debug_assert!(
                idx <= max_valid + 1,
                "{} {} out of bounds (max valid: {}, map len: {})",
                label,
                idx,
                max_valid,
                table.len()
            );
            table.last().copied().unwrap_or(0)
        })
    }

    /// Character index of the character containing byte `byte_idx`
    /// (positions inside a multi-byte character round down).
    ///
    /// Out-of-range indices clamp to the final table entry; see the type-level
    /// notes for the ASCII case and the debug-build assertion.
    #[must_use]
    pub fn byte_to_char(&self, byte_idx: usize) -> usize {
        if self.is_ascii {
            return byte_idx;
        }
        Self::lookup_clamped(&self.byte_to_char, byte_idx, "byte_idx")
    }

    /// Like [`byte_to_char`](Self::byte_to_char), but a position strictly
    /// inside a multi-byte character rounds up to the next character index.
    #[must_use]
    pub fn byte_to_char_ceil(&self, byte_idx: usize) -> usize {
        if self.is_ascii {
            return byte_idx;
        }
        let floor = self.byte_to_char(byte_idx);
        // `byte_idx` is mid-character exactly when it is a valid table slot
        // and the preceding byte belongs to the same character.
        let mid_char = byte_idx > 0
            && byte_idx < self.byte_to_char.len()
            && self.byte_to_char.get(byte_idx - 1) == Some(&floor);
        if mid_char {
            floor + 1
        } else {
            floor
        }
    }

    /// Byte index at which character `char_idx` starts.
    ///
    /// Out-of-range indices clamp to the final table entry; see the type-level
    /// notes for the ASCII case and the debug-build assertion.
    #[must_use]
    pub fn char_to_byte(&self, char_idx: usize) -> usize {
        if self.is_ascii {
            return char_idx;
        }
        Self::lookup_clamped(&self.char_to_byte, char_idx, "char_idx")
    }

    /// Builds a [`TextSpan`] from byte offsets.
    ///
    /// Both ends are converted with [`byte_to_char`](Self::byte_to_char), so
    /// an end offset inside a multi-byte character rounds down.
    /// NOTE(review): the free function `bytes_to_chars` rounds such an end
    /// offset up instead; confirm which behavior callers expect.
    #[must_use]
    pub fn from_bytes(&self, byte_start: usize, byte_end: usize) -> TextSpan {
        let char_start = self.byte_to_char(byte_start);
        let char_end = self.byte_to_char(byte_end);
        TextSpan {
            byte_start,
            byte_end,
            char_start,
            char_end,
        }
    }

    /// Builds a [`TextSpan`] from character offsets.
    #[must_use]
    pub fn from_chars(&self, char_start: usize, char_end: usize) -> TextSpan {
        let byte_start = self.char_to_byte(char_start);
        let byte_end = self.char_to_byte(char_end);
        TextSpan {
            byte_start,
            byte_end,
            char_start,
            char_end,
        }
    }

    /// Whether the text this converter was built from was pure ASCII.
    #[must_use]
    pub const fn is_ascii(&self) -> bool {
        self.is_ascii
    }
}
// Unit tests for span construction, the offset converter, and the
// offset/range newtypes, using fixed example strings.
#[cfg(test)]
mod tests {
use super::*;
// Pure ASCII: byte and char offsets are identical.
#[test]
fn test_ascii_text() {
let text = "Hello World";
let span = TextSpan::from_bytes(text, 0, 5);
assert_eq!(span.byte_start, 0);
assert_eq!(span.byte_end, 5);
assert_eq!(span.char_start, 0);
assert_eq!(span.char_end, 5);
assert!(span.is_ascii());
assert_eq!(span.extract(text), "Hello");
}
// '€' is 3 bytes in UTF-8, so bytes 6..11 correspond to chars 6..9.
#[test]
fn test_euro_symbol() {
let text = "Price €50";
let span = TextSpan::from_bytes(text, 6, 11);
assert_eq!(span.byte_start, 6);
assert_eq!(span.byte_end, 11);
assert_eq!(span.char_start, 6);
assert_eq!(span.char_end, 9);
assert!(!span.is_ascii());
assert_eq!(span.extract(text), "€50");
}
// '£' is 2 bytes in UTF-8, so bytes 5..9 correspond to chars 5..8.
#[test]
fn test_pound_symbol() {
let text = "Fee: £25";
let span = TextSpan::from_bytes(text, 5, 9);
assert_eq!(span.byte_start, 5);
assert_eq!(span.byte_end, 9);
assert_eq!(span.char_start, 5);
assert_eq!(span.char_end, 8);
assert_eq!(span.extract(text), "£25");
}
// A 4-byte emoji before the span shifts byte offsets by 3 relative to chars.
#[test]
fn test_emoji() {
let text = "Hello 👋 World";
let span = TextSpan::from_bytes(text, 11, 16);
assert_eq!(span.char_start, 8);
assert_eq!(span.char_end, 13);
assert_eq!(span.extract(text), "World");
}
// Three 3-byte CJK characters precede the ASCII word.
#[test]
fn test_cjk() {
let text = "日本語 test";
let span = TextSpan::from_bytes(text, 10, 14);
assert_eq!(span.char_start, 4);
assert_eq!(span.char_end, 8);
assert_eq!(span.extract(text), "test");
}
// Reverse direction: char offsets in, byte offsets derived.
#[test]
fn test_from_chars() {
let text = "Price €50";
let span = TextSpan::from_chars(text, 6, 9);
assert_eq!(span.char_start, 6);
assert_eq!(span.char_end, 9);
assert_eq!(span.byte_start, 6);
assert_eq!(span.byte_end, 11);
assert_eq!(span.extract(text), "€50");
}
// ASCII text: the converter is the identity in both directions.
#[test]
fn test_converter_ascii() {
let text = "Hello World";
let conv = SpanConverter::new(text);
assert!(conv.is_ascii());
assert_eq!(conv.byte_to_char(5), 5);
assert_eq!(conv.char_to_byte(5), 5);
}
// Non-ASCII text: table lookups, including the end-of-text sentinel entries
// (byte 11 -> char 9, char 9 -> byte 11).
#[test]
fn test_converter_unicode() {
let text = "Price €50";
let conv = SpanConverter::new(text);
assert!(!conv.is_ascii());
assert_eq!(conv.byte_to_char(6), 6);
assert_eq!(conv.byte_to_char(9), 7);
assert_eq!(conv.byte_to_char(11), 9);
assert_eq!(conv.char_to_byte(6), 6);
assert_eq!(conv.char_to_byte(9), 11);
}
// start == end yields an empty span with zero byte and char length.
#[test]
fn test_empty_span() {
let text = "test";
let span = TextSpan::from_bytes(text, 2, 2);
assert!(span.is_empty());
assert_eq!(span.byte_len(), 0);
assert_eq!(span.char_len(), 0);
}
// A span ending exactly at text.len() maps to the total character count.
#[test]
fn test_full_text_span() {
let text = "日本語";
let span = TextSpan::from_bytes(text, 0, text.len());
assert_eq!(span.char_start, 0);
assert_eq!(span.char_end, 3);
assert_eq!(span.byte_len(), 9);
assert_eq!(span.char_len(), 3);
}
// CharOffset round-trips through `new`/`get` and the usize conversions.
#[test]
fn test_char_offset_newtype() {
let offset = CharOffset::new(5);
assert_eq!(offset.get(), 5);
let from_usize: CharOffset = 10.into();
assert_eq!(from_usize.get(), 10);
let back_to_usize: usize = CharOffset::new(15).into();
assert_eq!(back_to_usize, 15);
}
// ByteOffset round-trips through `new`/`get` and the usize conversions.
#[test]
fn test_byte_offset_newtype() {
let offset = ByteOffset::new(5);
assert_eq!(offset.get(), 5);
let from_usize: ByteOffset = 10.into();
assert_eq!(from_usize.get(), 10);
let back_to_usize: usize = ByteOffset::new(15).into();
assert_eq!(back_to_usize, 15);
}
// CharRange: length, emptiness, Range conversion, and all three constructors.
#[test]
fn test_char_range() {
let range = CharRange::new(CharOffset::new(5), CharOffset::new(10));
assert_eq!(range.len(), 5);
assert!(!range.is_empty());
assert_eq!(range.as_range(), 5..10);
let from_raw = CharRange::from_raw(0, 5);
assert_eq!(from_raw.start.get(), 0);
assert_eq!(from_raw.end.get(), 5);
let from_tuple: CharRange = (2, 7).into();
assert_eq!(from_tuple.len(), 5);
}
// ByteRange: length, emptiness (start == end), and Range conversion.
#[test]
fn test_byte_range() {
let range = ByteRange::new(ByteOffset::new(5), ByteOffset::new(10));
assert_eq!(range.len(), 5);
assert!(!range.is_empty());
assert_eq!(range.as_range(), 5..10);
let empty_range = ByteRange::from_raw(5, 5);
assert!(empty_range.is_empty());
}
// CharOffset compares by its wrapped value.
#[test]
fn test_char_offset_ordering() {
let a = CharOffset::new(5);
let b = CharOffset::new(10);
let c = CharOffset::new(5);
assert!(a < b);
assert!(b > a);
assert_eq!(a, c);
}
// ByteOffset compares by its wrapped value.
#[test]
fn test_byte_offset_ordering() {
let a = ByteOffset::new(5);
let b = ByteOffset::new(10);
let c = ByteOffset::new(5);
assert!(a < b);
assert!(b > a);
assert_eq!(a, c);
}
}
// Property-based tests (proptest) over generated strings and integers.
#[cfg(test)]
mod proptests {
use super::*;
use proptest::prelude::*;
proptest! {
// Converting the whole-text byte range to chars and back must return the
// original byte range. Empty strings are skipped.
#[test]
fn roundtrip_bytes_chars_bytes(text in ".{0,100}") {
if text.is_empty() {
return Ok(());
}
let byte_end = text.len();
let (char_start, char_end) = bytes_to_chars(&text, 0, byte_end);
let (byte_start2, byte_end2) = chars_to_bytes(&text, char_start, char_end);
prop_assert_eq!(byte_start2, 0);
prop_assert_eq!(byte_end2, byte_end);
}
// A span covering the whole text must extract the whole text.
#[test]
fn textspan_extract_valid(text in ".{1,50}") {
let span = TextSpan::from_bytes(&text, 0, text.len());
let extracted = span.extract(&text);
prop_assert_eq!(extracted, &text);
}
// SpanConverter and the direct scan must agree on the whole-text span.
// Only the 0..text.len() range is compared here.
#[test]
fn converter_matches_direct(text in ".{1,50}") {
let conv = SpanConverter::new(&text);
let span_direct = TextSpan::from_bytes(&text, 0, text.len());
let span_conv = conv.from_bytes(0, text.len());
prop_assert_eq!(span_direct.char_start, span_conv.char_start);
prop_assert_eq!(span_direct.char_end, span_conv.char_end);
}
// Strings drawn from an ASCII-only alphabet must be detected as ASCII.
#[test]
fn ascii_detection(text in "[a-zA-Z0-9 ]{0,50}") {
prop_assert!(is_ascii(&text));
}
// CharOffset preserves its value through new/get and From/Into.
#[test]
fn char_offset_roundtrip(val in 0usize..1_000_000) {
let offset = CharOffset::new(val);
prop_assert_eq!(offset.get(), val);
let from_into: usize = CharOffset::from(val).into();
prop_assert_eq!(from_into, val);
}
// ByteOffset preserves its value through new/get and From/Into.
#[test]
fn byte_offset_roundtrip(val in 0usize..1_000_000) {
let offset = ByteOffset::new(val);
prop_assert_eq!(offset.get(), val);
let from_into: usize = ByteOffset::from(val).into();
prop_assert_eq!(from_into, val);
}
// For start <= end, CharRange::len is end - start and is_empty iff len == 0.
#[test]
fn char_range_length(start in 0usize..1000, len in 0usize..1000) {
let end = start + len;
let range = CharRange::from_raw(start, end);
prop_assert_eq!(range.len(), len);
prop_assert_eq!(range.is_empty(), len == 0);
}
// For start <= end, ByteRange::len is end - start and is_empty iff len == 0.
#[test]
fn byte_range_length(start in 0usize..1000, len in 0usize..1000) {
let end = start + len;
let range = ByteRange::from_raw(start, end);
prop_assert_eq!(range.len(), len);
prop_assert_eq!(range.is_empty(), len == 0);
}
}
}