anno/offset.rs
1//! Unified byte/character/token offset handling.
2//!
3//! # The Three Coordinate Systems
4//!
5//! When working with text, different tools use different ways to count positions.
6//! This causes bugs when tools disagree on where an entity starts and ends.
7//!
8//! ```text
9//! ┌──────────────────────────────────────────────────────────────────────────┐
10//! │ THE OFFSET ALIGNMENT PROBLEM │
11//! ├──────────────────────────────────────────────────────────────────────────┤
12//! │ │
13//! │ Text: "The café costs €50" │
14//! │ │
15//! │ ┌─────────────────────────────────────────────────────────────────────┐ │
16//! │ │ BYTE INDEX (what regex/file I/O returns) │ │
17//! │ │ │ │
18//! │ │ T h e c a f [ é ] c o s t s │ │
19//! │ │ 0 1 2 3 4 5 6 7-8 9 10 11 12 13 14 15 16 │ │
20//! │ │ └─2 bytes─┘ │ │
21//! │ │ │ │
//! │  │   [  €  ]  5  0                                                     │ │
//! │  │   16-17-18 19 20                                                    │ │
24//! │ │ └─3 bytes──┘ │ │
25//! │ └─────────────────────────────────────────────────────────────────────┘ │
26//! │ │
27//! │ ┌─────────────────────────────────────────────────────────────────────┐ │
28//! │ │ CHAR INDEX (what humans count, what eval tools expect) │ │
29//! │ │ │ │
//! │  │   T  h  e     c  a  f  é     c  o  s  t  s     €  5  0             │ │
//! │  │   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17             │ │
32//! │ │ └─1 char─┘ └─1 char─┘ │ │
33//! │ └─────────────────────────────────────────────────────────────────────┘ │
34//! │ │
35//! │ ┌─────────────────────────────────────────────────────────────────────┐ │
36//! │ │ TOKEN INDEX (what BERT/transformers return) │ │
37//! │ │ │ │
38//! │ │ [CLS] The café costs € 50 [SEP] │ │
39//! │ │ 0 1 2 3 4 5 6 │ │
40//! │ │ │ │
41//! │ │ But wait! "café" might be split: │ │
42//! │ │ [CLS] The ca ##fe costs € 50 [SEP] │ │
43//! │ │ 0 1 2 3 4 5 6 7 │ │
44//! │ └─────────────────────────────────────────────────────────────────────┘ │
45//! │ │
46//! │ THE PROBLEM: │
//! │  • Regex finds "€50" at byte positions (16, 21)                         │
48//! │ • Evaluation tool expects char positions (15, 18) │
49//! │ • BERT returns token positions (5, 6) │
50//! │ │
51//! │ Without conversion, your F1 score will be WRONG. │
52//! └──────────────────────────────────────────────────────────────────────────┘
53//! ```
54//!
55//! # The Subword Problem
56//!
57//! Transformer models split words into subword tokens. This breaks NER labels:
58//!
59//! ```text
60//! Text: "playing"
61//!
62//! Tokenizer: WordPiece splits unknown words
63//! "playing" → ["play", "##ing"]
64//!
65//! Problem: Which token gets the NER label?
66//!
67//! ┌────────────────────────────────────────────────────┐
68//! │ OPTION 1: First-only │
69//! │ │
70//! │ Tokens: ["play", "##ing"] │
71//! │ Labels: [B-PER, O ] ← "##ing" ignored! │
72//! │ │
73//! │ Problem: Model never learns "##ing" is part of │
74//! │ the entity. Loses signal. │
75//! ├────────────────────────────────────────────────────┤
76//! │ OPTION 2: All tokens │
77//! │ │
78//! │ Tokens: ["play", "##ing"] │
79//! │ Labels: [B-PER, I-PER ] ← Continuation! │
80//! │ │
81//! │ Better, but requires propagating labels during │
82//! │ both training AND inference. │
83//! └────────────────────────────────────────────────────┘
84//! ```
85//!
86//! # Solution: Dual Representations
87//!
88//! ```text
89//! ┌────────────────────────────────────────────────────┐
90//! │ Use TextSpan at boundaries, TokenSpan for models │
91//! ├────────────────────────────────────────────────────┤
92//! │ │
93//! │ Entity: "John" in "Hello John!" │
94//! │ │
95//! │ TextSpan { │
96//! │ byte_start: 6, byte_end: 10, │
97//! │ char_start: 6, char_end: 10, // ASCII: same│
98//! │ } │
99//! │ │
100//! │ TokenSpan { │
101//! │ token_start: 2, // [CLS] Hello John [SEP] │
102//! │ token_end: 3, // 0 1 2 3 │
103//! │ } │
104//! │ │
105//! │ Store BOTH. Convert at boundaries. │
106//! └────────────────────────────────────────────────────┘
107//! ```
108//!
109//! This module provides:
110//! - [`TextSpan`]: Stores both byte and char offsets together
111//! - [`TokenSpan`]: Stores subword token indices
112//! - [`OffsetMapping`]: Maps between token ↔ character positions
113//! - [`CharOffset`]: Newtype wrapper for character offsets (type safety)
114//! - [`ByteOffset`]: Newtype wrapper for byte offsets (type safety)
115//!
116//! # API Boundary Conventions
117//!
118//! Anno uses **character offsets** as the canonical representation at API boundaries:
119//!
120//! | Type | Offset Convention | Notes |
121//! |------|-------------------|-------|
122//! | `Entity.start/end` | Character | Public API, evaluation, serialization |
123//! | `Signal` with `Location::Text` | Character | Grounded document model |
124//! | `Span::Text` | Character | Entity span representation |
125//! | Backend internals | Often byte | Regex, JSON parsing, byte slicing |
126//! | Token indices | Token | BERT/transformer models |
127//!
128//! **Rule of thumb**: Convert to character offsets as early as possible (at the
129//! backend boundary), and use the newtype wrappers (`CharOffset`, `ByteOffset`)
130//! when you need to be explicit about which you're working with.
131//!
132//! # Type Safety with Newtypes
133//!
134//! The most common source of Unicode bugs is accidentally mixing byte and character
135//! offsets. Use the newtype wrappers to make this impossible at compile time:
136//!
137//! ```rust
138//! use anno::offset::{CharOffset, ByteOffset};
139//!
140//! fn process_span(start: CharOffset, end: CharOffset) {
141//! // Can only receive CharOffset, not ByteOffset
142//! }
143//!
144//! let char_pos = CharOffset(5);
145//! let byte_pos = ByteOffset(10);
146//!
147//! process_span(char_pos, CharOffset(10)); // OK
148//! // process_span(byte_pos, CharOffset(10)); // Compile error!
149//! ```
150
151use serde::{Deserialize, Serialize};
152use std::ops::Range;
153
154// =============================================================================
155// Newtype Wrappers for Type Safety
156// =============================================================================
157
/// A character offset (Unicode scalar value index).
///
/// Use this newtype to prevent accidentally passing byte offsets where
/// character offsets are expected. This is the most common source of
/// Unicode-related bugs in NLP code.
///
/// Comparison, ordering, and hashing all delegate to the wrapped `usize`,
/// so `CharOffset` can be used directly as a sort key or map key.
///
/// # Example
///
/// ```rust
/// use anno::offset::CharOffset;
///
/// let text = "日本語"; // 3 chars, 9 bytes
/// let pos = CharOffset(1); // Second character (本)
/// assert_eq!(pos.0, 1);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[repr(transparent)]
pub struct CharOffset(pub usize);
176
177impl CharOffset {
178 /// Create a new character offset.
179 #[must_use]
180 pub const fn new(offset: usize) -> Self {
181 Self(offset)
182 }
183
184 /// Get the raw value.
185 #[must_use]
186 pub const fn get(self) -> usize {
187 self.0
188 }
189}
190
/// Convert a raw `usize` into a `CharOffset` (enables `.into()` at call sites).
impl From<usize> for CharOffset {
    fn from(offset: usize) -> Self {
        Self(offset)
    }
}
196
/// Unwrap a `CharOffset` back into a raw `usize`.
impl From<CharOffset> for usize {
    fn from(offset: CharOffset) -> Self {
        offset.0
    }
}
202
/// A byte offset (raw byte index into UTF-8 string).
///
/// Use this newtype to prevent accidentally passing character offsets where
/// byte offsets are expected. Byte offsets are what Rust's `str::get()` and
/// regex libraries return.
///
/// Comparison, ordering, and hashing all delegate to the wrapped `usize`,
/// so `ByteOffset` can be used directly as a sort key or map key.
///
/// # Example
///
/// ```rust
/// use anno::offset::ByteOffset;
///
/// let text = "日本語"; // 3 chars, 9 bytes
/// let pos = ByteOffset(3); // Start of second character (本)
/// assert_eq!(pos.0, 3);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[repr(transparent)]
pub struct ByteOffset(pub usize);
221
222impl ByteOffset {
223 /// Create a new byte offset.
224 #[must_use]
225 pub const fn new(offset: usize) -> Self {
226 Self(offset)
227 }
228
229 /// Get the raw value.
230 #[must_use]
231 pub const fn get(self) -> usize {
232 self.0
233 }
234}
235
/// Convert a raw `usize` into a `ByteOffset` (enables `.into()` at call sites).
impl From<usize> for ByteOffset {
    fn from(offset: usize) -> Self {
        Self(offset)
    }
}
241
/// Unwrap a `ByteOffset` back into a raw `usize`.
impl From<ByteOffset> for usize {
    fn from(offset: ByteOffset) -> Self {
        offset.0
    }
}
247
/// A character range (start and end as character offsets).
///
/// This is a convenience type for APIs that need to express spans
/// in character coordinates with compile-time type safety.
/// The range is half-open: `start` is inclusive, `end` is exclusive.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct CharRange {
    /// Start offset (inclusive)
    pub start: CharOffset,
    /// End offset (exclusive)
    pub end: CharOffset,
}
259
260impl CharRange {
261 /// Create a new character range.
262 #[must_use]
263 pub const fn new(start: CharOffset, end: CharOffset) -> Self {
264 Self { start, end }
265 }
266
267 /// Create from raw usize values.
268 #[must_use]
269 pub const fn from_raw(start: usize, end: usize) -> Self {
270 Self {
271 start: CharOffset(start),
272 end: CharOffset(end),
273 }
274 }
275
276 /// Length in characters.
277 #[must_use]
278 pub const fn len(&self) -> usize {
279 self.end.0.saturating_sub(self.start.0)
280 }
281
282 /// Check if empty.
283 #[must_use]
284 pub const fn is_empty(&self) -> bool {
285 self.start.0 >= self.end.0
286 }
287
288 /// Convert to a standard Range.
289 #[must_use]
290 pub const fn as_range(&self) -> Range<usize> {
291 self.start.0..self.end.0
292 }
293}
294
/// Build a `CharRange` from a raw `(start, end)` tuple.
impl From<(usize, usize)> for CharRange {
    fn from((start, end): (usize, usize)) -> Self {
        Self::from_raw(start, end)
    }
}
300
/// A byte range (start and end as byte offsets).
///
/// Use for APIs that work with raw byte positions (regex, file I/O).
/// The range is half-open: `start` is inclusive, `end` is exclusive.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ByteRange {
    /// Start offset (inclusive)
    pub start: ByteOffset,
    /// End offset (exclusive)
    pub end: ByteOffset,
}
311
312impl ByteRange {
313 /// Create a new byte range.
314 #[must_use]
315 pub const fn new(start: ByteOffset, end: ByteOffset) -> Self {
316 Self { start, end }
317 }
318
319 /// Create from raw usize values.
320 #[must_use]
321 pub const fn from_raw(start: usize, end: usize) -> Self {
322 Self {
323 start: ByteOffset(start),
324 end: ByteOffset(end),
325 }
326 }
327
328 /// Length in bytes.
329 #[must_use]
330 pub const fn len(&self) -> usize {
331 self.end.0.saturating_sub(self.start.0)
332 }
333
334 /// Check if empty.
335 #[must_use]
336 pub const fn is_empty(&self) -> bool {
337 self.start.0 >= self.end.0
338 }
339
340 /// Convert to a standard Range.
341 #[must_use]
342 pub const fn as_range(&self) -> Range<usize> {
343 self.start.0..self.end.0
344 }
345}
346
/// Build a `ByteRange` from a raw `(start, end)` tuple.
impl From<(usize, usize)> for ByteRange {
    fn from((start, end): (usize, usize)) -> Self {
        Self::from_raw(start, end)
    }
}
352
/// A text span with both byte and character offsets.
///
/// This is the canonical representation for entity positions.
/// Store both to avoid repeated conversion.
///
/// The two offset pairs are expected to describe the same half-open region
/// of the same text. The constructors maintain this, but the fields are
/// public, so the invariant is not enforced by the type system.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TextSpan {
    /// Byte offset (start, inclusive)
    pub byte_start: usize,
    /// Byte offset (end, exclusive)
    pub byte_end: usize,
    /// Character offset (start, inclusive)
    pub char_start: usize,
    /// Character offset (end, exclusive)
    pub char_end: usize,
}
368
369impl TextSpan {
370 /// Create a span from byte offsets, computing char offsets from text.
371 ///
372 /// # Arguments
373 /// * `text` - The full text (needed to compute char offsets)
374 /// * `byte_start` - Byte offset start (inclusive)
375 /// * `byte_end` - Byte offset end (exclusive)
376 ///
377 /// # Example
378 /// ```
379 /// use anno::offset::TextSpan;
380 ///
381 /// let text = "Price €50";
382 /// // "Price " = 6 bytes, € = 3 bytes, "50" = 2 bytes = 11 total
383 /// let span = TextSpan::from_bytes(text, 6, 11); // "€50"
384 /// assert_eq!(span.char_start, 6);
385 /// assert_eq!(span.char_end, 9); // € is 1 char but 3 bytes
386 /// ```
387 #[must_use]
388 pub fn from_bytes(text: &str, byte_start: usize, byte_end: usize) -> Self {
389 let (char_start, char_end) = bytes_to_chars(text, byte_start, byte_end);
390 Self {
391 byte_start,
392 byte_end,
393 char_start,
394 char_end,
395 }
396 }
397
398 /// Create a span from character offsets, computing byte offsets from text.
399 ///
400 /// # Arguments
401 /// * `text` - The full text (needed to compute byte offsets)
402 /// * `char_start` - Character offset start (inclusive)
403 /// * `char_end` - Character offset end (exclusive)
404 #[must_use]
405 pub fn from_chars(text: &str, char_start: usize, char_end: usize) -> Self {
406 let (byte_start, byte_end) = chars_to_bytes(text, char_start, char_end);
407 Self {
408 byte_start,
409 byte_end,
410 char_start,
411 char_end,
412 }
413 }
414
415 /// Create a span for ASCII text where byte == char offsets.
416 ///
417 /// This is a fast path for ASCII-only text.
418 #[must_use]
419 pub const fn ascii(start: usize, end: usize) -> Self {
420 Self {
421 byte_start: start,
422 byte_end: end,
423 char_start: start,
424 char_end: end,
425 }
426 }
427
428 /// Get byte range.
429 #[must_use]
430 pub const fn byte_range(&self) -> Range<usize> {
431 self.byte_start..self.byte_end
432 }
433
434 /// Get character range.
435 #[must_use]
436 pub const fn char_range(&self) -> Range<usize> {
437 self.char_start..self.char_end
438 }
439
440 /// Byte length.
441 #[must_use]
442 pub const fn byte_len(&self) -> usize {
443 self.byte_end.saturating_sub(self.byte_start)
444 }
445
446 /// Character length.
447 #[must_use]
448 pub const fn char_len(&self) -> usize {
449 self.char_end.saturating_sub(self.char_start)
450 }
451
452 /// Check if this span is empty.
453 #[must_use]
454 pub const fn is_empty(&self) -> bool {
455 self.byte_start >= self.byte_end
456 }
457
458 /// Check if this is ASCII (byte == char offsets).
459 #[must_use]
460 pub const fn is_ascii(&self) -> bool {
461 self.byte_start == self.char_start && self.byte_end == self.char_end
462 }
463
464 /// Extract the text for this span.
465 #[must_use]
466 pub fn extract<'a>(&self, text: &'a str) -> &'a str {
467 text.get(self.byte_start..self.byte_end).unwrap_or("")
468 }
469}
470
/// Create from a byte range, assuming ASCII text (byte == char offsets).
///
/// NOTE(review): only correct for ASCII input; for non-ASCII text use
/// `TextSpan::from_bytes` so the char offsets are derived properly.
impl From<Range<usize>> for TextSpan {
    /// Create from byte range (assumes ASCII).
    fn from(range: Range<usize>) -> Self {
        Self::ascii(range.start, range.end)
    }
}
477
478// =============================================================================
479// Token Span (Subword-Level)
480// =============================================================================
481
/// Span in subword token space.
///
/// # Research Context (BERT for NER, NAACL 2019)
///
/// Transformer models operate on subword tokens, not characters.
/// Entity boundaries often split mid-token:
///
/// ```text
/// Text: "New York City"
/// Tokens: ["New", "York", "City"] <- clean split
/// Token IDs: [2739, 1816, 2103]
/// TokenSpan: (0, 3) for "New York City"
///
/// Text: "playing"
/// Tokens: ["play", "##ing"] <- mid-word split
/// Token IDs: [2377, 2075]
/// TokenSpan: (0, 2) for "playing"
/// ```
///
/// Key insight: When propagating BIO labels to continuation tokens (##),
/// use I- prefix to avoid treating them as separate entities.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TokenSpan {
    /// Token index (start, inclusive)
    pub start: usize,
    /// Token index (end, exclusive)
    pub end: usize,
    /// Original text span (byte and char coordinates), kept so the surface
    /// text can be recovered without re-running the tokenizer
    pub text_span: TextSpan,
}
512
513impl TokenSpan {
514 /// Create a token span with its corresponding text span.
515 #[must_use]
516 pub const fn new(start: usize, end: usize, text_span: TextSpan) -> Self {
517 Self {
518 start,
519 end,
520 text_span,
521 }
522 }
523
524 /// Number of tokens in this span.
525 #[must_use]
526 pub const fn len(&self) -> usize {
527 self.end.saturating_sub(self.start)
528 }
529
530 /// Check if empty.
531 #[must_use]
532 pub const fn is_empty(&self) -> bool {
533 self.start >= self.end
534 }
535
536 /// Token range.
537 #[must_use]
538 pub const fn token_range(&self) -> Range<usize> {
539 self.start..self.end
540 }
541}
542
/// Offset mapping from tokenizer.
///
/// Maps each token to its character span in the original text.
/// Used to convert between token indices and character positions.
///
/// # Research Note (HuggingFace Tokenizers)
///
/// The `offset_mapping` from HuggingFace tokenizers is a list of
/// `(char_start, char_end)` for each token. Special tokens like
/// `[CLS]` and `[SEP]` have offset `(0, 0)`.
#[derive(Debug, Clone)]
pub struct OffsetMapping {
    /// Character spans for each token: `[(char_start, char_end), ...]`
    offsets: Vec<(usize, usize)>,
}

impl OffsetMapping {
    /// Create from tokenizer output.
    ///
    /// # Arguments
    /// * `offsets` - List of `(char_start, char_end)` for each token.
    ///   Special tokens (e.g. `[CLS]`, `[SEP]`) are conventionally `(0, 0)`.
    #[must_use]
    pub fn new(offsets: Vec<(usize, usize)>) -> Self {
        Self { offsets }
    }

    /// Get the character span for a token, or `None` if the index is out of bounds.
    #[must_use]
    pub fn get(&self, token_idx: usize) -> Option<(usize, usize)> {
        self.offsets.get(token_idx).copied()
    }

    /// Find tokens that overlap with a character span.
    ///
    /// Returns `(first_token, last_token_exclusive)`, or `None` when no
    /// token overlaps `[char_start, char_end)`.
    ///
    /// # Note on Label Propagation
    ///
    /// For entity "playing" tokenized as `["play", "##ing"]`:
    /// - Assign B-PER to "play" (first token)
    /// - Assign I-PER to "##ing" (continuation)
    #[must_use]
    pub fn char_span_to_tokens(
        &self,
        char_start: usize,
        char_end: usize,
    ) -> Option<(usize, usize)> {
        let mut first_token = None;
        let mut last_token = 0;

        for (idx, &(tok_start, tok_end)) in self.offsets.iter().enumerate() {
            // Half-open interval overlap test. Special tokens with offset
            // (0, 0) can never pass it (`tok_end > char_start` requires
            // tok_end >= 1), so the former explicit skip was dead code.
            if tok_end > char_start && tok_start < char_end {
                if first_token.is_none() {
                    first_token = Some(idx);
                }
                last_token = idx + 1;
            }
        }

        first_token.map(|first| (first, last_token))
    }

    /// Convert a token span to a character span.
    ///
    /// Special tokens `(0, 0)` inside the span are skipped. Returns `None`
    /// for an empty/inverted span, an out-of-bounds end, or a span that
    /// contains only special tokens.
    #[must_use]
    pub fn tokens_to_char_span(
        &self,
        token_start: usize,
        token_end: usize,
    ) -> Option<(usize, usize)> {
        if token_start >= token_end || token_end > self.offsets.len() {
            return None;
        }

        let window = &self.offsets[token_start..token_end];
        let is_real = |&&(s, e): &&(usize, usize)| !(s == 0 && e == 0);

        // First non-special token supplies the start; last one supplies the
        // end. If every token in the window is special, both lookups yield
        // None. (The previous fallback that substituted the raw start offset
        // was unreachable in effect: char_end would fail first anyway.)
        let char_start = window.iter().find(is_real).map(|&(s, _)| s)?;
        let char_end = window.iter().rev().find(is_real).map(|&(_, e)| e)?;

        Some((char_start, char_end))
    }

    /// Number of tokens.
    #[must_use]
    pub fn len(&self) -> usize {
        self.offsets.len()
    }

    /// Check if empty.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.offsets.is_empty()
    }
}
670
671// =============================================================================
672// Conversion Functions
673// =============================================================================
674
/// Convert byte offsets to character offsets.
///
/// Handles byte offsets that fall in the middle of multi-byte UTF-8
/// characters by widening to the containing character.
///
/// # Arguments
///
/// * `text` - The source text
/// * `byte_start` - Byte offset for the start (may be mid-character)
/// * `byte_end` - Byte offset for the end (may be mid-character); expected
///   to be `>= byte_start`
///
/// # Returns
///
/// A tuple `(char_start, char_end)` where:
/// - `char_start` is the index of the character containing `byte_start`
/// - `char_end` is exclusive: one past the character containing `byte_end - 1`
///
/// # Behavior
///
/// - A `byte_start` inside a multi-byte character maps to that character's index
/// - A `byte_end` inside a multi-byte character maps past that character
///   (spans are widened, never truncated)
/// - Offsets beyond the text length clamp to the total character count
#[must_use]
pub fn bytes_to_chars(text: &str, byte_start: usize, byte_end: usize) -> (usize, usize) {
    // char_start = number of characters that END at or before byte_start,
    // i.e. the index of the character containing byte_start (or the total
    // char count when byte_start is at/past the end of the text).
    //
    // char_end = number of characters that START before byte_end, i.e. an
    // exclusive end that includes the character containing byte_end - 1.
    //
    // This replaces a hand-rolled scan that carried an unreachable
    // `byte_idx > byte_end` branch (chars are contiguous from byte 0, so a
    // passed byte_end is always caught earlier) and a redundant
    // end-of-string fallback.
    let mut char_start = 0;
    let mut char_end = 0;

    for (byte_idx, ch) in text.char_indices() {
        if byte_idx + ch.len_utf8() <= byte_start {
            char_start += 1;
        }
        if byte_idx < byte_end {
            char_end += 1;
        }
    }

    (char_start, char_end)
}
766
/// Convert character offsets to byte offsets.
///
/// Returns `(byte_start, byte_end)` for the half-open character span
/// `[char_start, char_end)`. Offsets at or past the end of the text clamp
/// to `text.len()`.
#[must_use]
pub fn chars_to_bytes(text: &str, char_start: usize, char_end: usize) -> (usize, usize) {
    let mut start_byte = 0;
    let mut start_found = false;

    // Single pass: record where char_start begins, and return as soon as
    // char_end is reached.
    for (char_idx, (byte_idx, _)) in text.char_indices().enumerate() {
        if char_idx == char_start {
            start_byte = byte_idx;
            start_found = true;
        }
        if char_idx == char_end {
            return (start_byte, byte_idx);
        }
    }

    // char_end (and possibly char_start) lie at or past the end of text.
    if start_found {
        (start_byte, text.len())
    } else {
        (text.len(), text.len())
    }
}
791
/// Build an offset mapping table for efficient repeated conversions.
///
/// Returns a vec of length `text.len() + 1` where `mapping[byte_idx]` gives
/// the index of the character containing that byte, and the final entry
/// `mapping[text.len()]` is the total character count (the exclusive end
/// position). Useful when converting many spans from the same text.
#[must_use]
pub fn build_byte_to_char_map(text: &str) -> Vec<usize> {
    let mut map = vec![0usize; text.len() + 1];
    let mut char_count = 0;

    for (char_idx, (byte_idx, ch)) in text.char_indices().enumerate() {
        // Every byte of a multi-byte character maps to the same char index.
        // byte_idx + len_utf8() <= text.len() always holds, so the former
        // per-byte `< map.len()` guard was unreachable and has been removed.
        for b in byte_idx..byte_idx + ch.len_utf8() {
            map[b] = char_idx;
        }
        char_count = char_idx + 1;
    }

    // Exclusive end position. Tracking the count in the loop avoids the
    // former extra `chars().count()` pass over the text.
    map[text.len()] = char_count;
    map
}
817
/// Build an offset mapping table from char to byte.
///
/// Returns a vec where `mapping[char_idx]` gives the byte index; the final
/// entry `mapping[char_count]` is `text.len()` (the exclusive end position).
#[must_use]
pub fn build_char_to_byte_map(text: &str) -> Vec<usize> {
    // Each character's starting byte, in order, followed by the end sentinel.
    let mut map: Vec<usize> = text.char_indices().map(|(byte_idx, _)| byte_idx).collect();
    map.push(text.len());
    map
}
837
/// Fast check if text is ASCII-only.
///
/// For ASCII text, byte and char offsets coincide, so no conversion
/// tables are needed.
#[must_use]
pub fn is_ascii(text: &str) -> bool {
    text.bytes().all(|b| b.is_ascii())
}
843
844// =============================================================================
845// Span Converter (batch operations)
846// =============================================================================
847
/// Converter for efficiently handling many spans from the same text.
///
/// Pre-computes mapping tables so each conversion is O(1).
/// For ASCII text no tables are built and conversions are the identity.
pub struct SpanConverter {
    // byte index -> char index table (left empty when `is_ascii`)
    byte_to_char: Vec<usize>,
    // char index -> byte index table (left empty when `is_ascii`)
    char_to_byte: Vec<usize>,
    // Fast-path flag: for pure-ASCII text the two coordinate systems coincide
    is_ascii: bool,
}
856
857impl SpanConverter {
858 /// Create a converter for the given text.
859 #[must_use]
860 pub fn new(text: &str) -> Self {
861 let is_ascii = is_ascii(text);
862 if is_ascii {
863 // For ASCII, mappings are identity
864 Self {
865 byte_to_char: Vec::new(),
866 char_to_byte: Vec::new(),
867 is_ascii: true,
868 }
869 } else {
870 Self {
871 byte_to_char: build_byte_to_char_map(text),
872 char_to_byte: build_char_to_byte_map(text),
873 is_ascii: false,
874 }
875 }
876 }
877
878 /// Convert byte offset to char offset.
879 ///
880 /// # Arguments
881 ///
882 /// * `byte_idx` - Byte offset in the text
883 ///
884 /// # Returns
885 ///
886 /// Character offset corresponding to the byte offset. If `byte_idx` is out of bounds,
887 /// returns the last valid character index (or 0 if the map is empty).
888 ///
889 /// # Panics
890 ///
891 /// In debug mode, panics if `byte_idx` exceeds the text length by more than 1
892 /// (allowing for the exclusive end position).
893 #[must_use]
894 pub fn byte_to_char(&self, byte_idx: usize) -> usize {
895 if self.is_ascii {
896 byte_idx
897 } else {
898 self.byte_to_char.get(byte_idx).copied().unwrap_or_else(|| {
899 // Bounds check: byte_idx should be <= text.len() (inclusive end position)
900 // The map has length text.len() + 1 to include the end position
901 #[cfg(debug_assertions)]
902 {
903 let max_valid = self.byte_to_char.len().saturating_sub(1);
904 if byte_idx > max_valid {
905 debug_assert!(
906 byte_idx <= max_valid + 1,
907 "byte_idx {} out of bounds (max valid: {}, map len: {})",
908 byte_idx,
909 max_valid,
910 self.byte_to_char.len()
911 );
912 }
913 }
914 self.byte_to_char.last().copied().unwrap_or(0)
915 })
916 }
917 }
918
919 /// Convert char offset to byte offset.
920 ///
921 /// # Arguments
922 ///
923 /// * `char_idx` - Character offset in the text
924 ///
925 /// # Returns
926 ///
927 /// Byte offset corresponding to the character offset. If `char_idx` is out of bounds,
928 /// returns the last valid byte index (or 0 if the map is empty).
929 ///
930 /// # Panics
931 ///
932 /// In debug mode, panics if `char_idx` exceeds the character count by more than 1
933 /// (allowing for the exclusive end position).
934 #[must_use]
935 pub fn char_to_byte(&self, char_idx: usize) -> usize {
936 if self.is_ascii {
937 char_idx
938 } else {
939 self.char_to_byte.get(char_idx).copied().unwrap_or_else(|| {
940 // Bounds check: char_idx should be <= char_count (inclusive end position)
941 // The map has length char_count + 1 to include the end position
942 #[cfg(debug_assertions)]
943 {
944 let max_valid = self.char_to_byte.len().saturating_sub(1);
945 if char_idx > max_valid {
946 debug_assert!(
947 char_idx <= max_valid + 1,
948 "char_idx {} out of bounds (max valid: {}, map len: {})",
949 char_idx,
950 max_valid,
951 self.char_to_byte.len()
952 );
953 }
954 }
955 self.char_to_byte.last().copied().unwrap_or(0)
956 })
957 }
958 }
959
960 /// Convert byte span to TextSpan.
961 #[must_use]
962 pub fn from_bytes(&self, byte_start: usize, byte_end: usize) -> TextSpan {
963 TextSpan {
964 byte_start,
965 byte_end,
966 char_start: self.byte_to_char(byte_start),
967 char_end: self.byte_to_char(byte_end),
968 }
969 }
970
971 /// Convert char span to TextSpan.
972 #[must_use]
973 pub fn from_chars(&self, char_start: usize, char_end: usize) -> TextSpan {
974 TextSpan {
975 byte_start: self.char_to_byte(char_start),
976 byte_end: self.char_to_byte(char_end),
977 char_start,
978 char_end,
979 }
980 }
981
982 /// Check if this text is ASCII.
983 #[must_use]
984 pub const fn is_ascii(&self) -> bool {
985 self.is_ascii
986 }
987}
988
989// =============================================================================
990// Tests
991// =============================================================================
992
993#[cfg(test)]
994mod tests {
995 use super::*;
996
    // ASCII fast path: byte and char coordinates must coincide exactly.
    #[test]
    fn test_ascii_text() {
        let text = "Hello World";
        let span = TextSpan::from_bytes(text, 0, 5);

        assert_eq!(span.byte_start, 0);
        assert_eq!(span.byte_end, 5);
        assert_eq!(span.char_start, 0);
        assert_eq!(span.char_end, 5);
        assert!(span.is_ascii());
        assert_eq!(span.extract(text), "Hello");
    }
1009
    // 3-byte UTF-8 symbol (€): byte and char coordinates diverge.
    #[test]
    fn test_euro_symbol() {
        let text = "Price €50";
        // "Price " = 6 bytes, 6 chars
        // € = 3 bytes (E2 82 AC), 1 char
        // "50" = 2 bytes, 2 chars
        // Total: 11 bytes, 9 chars
        //
        // "€50" starts at byte 6, ends at byte 11
        // "€50" starts at char 6, ends at char 9

        let span = TextSpan::from_bytes(text, 6, 11);

        assert_eq!(span.byte_start, 6);
        assert_eq!(span.byte_end, 11);
        assert_eq!(span.char_start, 6);
        assert_eq!(span.char_end, 9);
        assert!(!span.is_ascii());
        assert_eq!(span.extract(text), "€50");
    }
1030
    // 2-byte UTF-8 symbol (£): a smaller byte/char divergence.
    #[test]
    fn test_pound_symbol() {
        let text = "Fee: £25";
        // "Fee: " = 5 bytes, 5 chars
        // £ = 2 bytes (C2 A3), 1 char
        // "25" = 2 bytes, 2 chars
        // Total: 9 bytes, 8 chars
        //
        // "£25" starts at byte 5, ends at byte 9
        // "£25" starts at char 5, ends at char 8

        let span = TextSpan::from_bytes(text, 5, 9);

        assert_eq!(span.byte_start, 5);
        assert_eq!(span.byte_end, 9);
        assert_eq!(span.char_start, 5);
        assert_eq!(span.char_end, 8);
        assert_eq!(span.extract(text), "£25");
    }
1050
    // 4-byte emoji: everything after it shifts by 3 in byte space.
    #[test]
    fn test_emoji() {
        let text = "Hello 👋 World";
        // "Hello " = 6 bytes, 6 chars
        // 👋 = 4 bytes, 1 char
        // " World" = 6 bytes, 6 chars
        // Total: 16 bytes, 13 chars
        //
        // "World" starts at byte 11, ends at byte 16
        // "World" starts at char 8, ends at char 13

        let span = TextSpan::from_bytes(text, 11, 16);

        assert_eq!(span.char_start, 8);
        assert_eq!(span.char_end, 13);
        assert_eq!(span.extract(text), "World");
    }
1068
    // CJK: each ideograph is 3 bytes but a single char.
    #[test]
    fn test_cjk() {
        let text = "日本語 test";
        // 日 = 3 bytes, 1 char
        // 本 = 3 bytes, 1 char
        // 語 = 3 bytes, 1 char
        // " " = 1 byte, 1 char
        // "test" = 4 bytes, 4 chars
        // Total: 14 bytes, 8 chars
        //
        // "test" starts at byte 10, ends at byte 14
        // "test" starts at char 4, ends at char 8

        let span = TextSpan::from_bytes(text, 10, 14);

        assert_eq!(span.char_start, 4);
        assert_eq!(span.char_end, 8);
        assert_eq!(span.extract(text), "test");
    }
1088
1089 #[test]
1090 fn test_from_chars() {
1091 let text = "Price €50";
1092 // "€50" is chars 6..9
1093
1094 let span = TextSpan::from_chars(text, 6, 9);
1095
1096 assert_eq!(span.char_start, 6);
1097 assert_eq!(span.char_end, 9);
1098 assert_eq!(span.byte_start, 6);
1099 assert_eq!(span.byte_end, 11);
1100 assert_eq!(span.extract(text), "€50");
1101 }
1102
1103 #[test]
1104 fn test_converter_ascii() {
1105 let text = "Hello World";
1106 let conv = SpanConverter::new(text);
1107
1108 assert!(conv.is_ascii());
1109 assert_eq!(conv.byte_to_char(5), 5);
1110 assert_eq!(conv.char_to_byte(5), 5);
1111 }
1112
1113 #[test]
1114 fn test_converter_unicode() {
1115 let text = "Price €50";
1116 let conv = SpanConverter::new(text);
1117
1118 assert!(!conv.is_ascii());
1119
1120 // Byte 6 -> Char 6 (start of €)
1121 assert_eq!(conv.byte_to_char(6), 6);
1122 // Byte 9 -> Char 7 (end of €, which spans bytes 6-8)
1123 assert_eq!(conv.byte_to_char(9), 7);
1124 // Byte 11 -> Char 9 (end of string)
1125 assert_eq!(conv.byte_to_char(11), 9);
1126
1127 // Char 6 -> Byte 6
1128 assert_eq!(conv.char_to_byte(6), 6);
1129 // Char 9 -> Byte 11
1130 assert_eq!(conv.char_to_byte(9), 11);
1131 }
1132
1133 #[test]
1134 fn test_empty_span() {
1135 let text = "test";
1136 let span = TextSpan::from_bytes(text, 2, 2);
1137
1138 assert!(span.is_empty());
1139 assert_eq!(span.byte_len(), 0);
1140 assert_eq!(span.char_len(), 0);
1141 }
1142
1143 #[test]
1144 fn test_full_text_span() {
1145 let text = "日本語";
1146 let span = TextSpan::from_bytes(text, 0, text.len());
1147
1148 assert_eq!(span.char_start, 0);
1149 assert_eq!(span.char_end, 3);
1150 assert_eq!(span.byte_len(), 9);
1151 assert_eq!(span.char_len(), 3);
1152 }
1153
1154 // =========================================================================
1155 // Newtype wrapper tests
1156 // =========================================================================
1157
1158 #[test]
1159 fn test_char_offset_newtype() {
1160 let offset = CharOffset::new(5);
1161 assert_eq!(offset.get(), 5);
1162 assert_eq!(offset.0, 5);
1163
1164 let from_usize: CharOffset = 10.into();
1165 assert_eq!(from_usize.get(), 10);
1166
1167 let back_to_usize: usize = CharOffset(15).into();
1168 assert_eq!(back_to_usize, 15);
1169 }
1170
1171 #[test]
1172 fn test_byte_offset_newtype() {
1173 let offset = ByteOffset::new(5);
1174 assert_eq!(offset.get(), 5);
1175 assert_eq!(offset.0, 5);
1176
1177 let from_usize: ByteOffset = 10.into();
1178 assert_eq!(from_usize.get(), 10);
1179
1180 let back_to_usize: usize = ByteOffset(15).into();
1181 assert_eq!(back_to_usize, 15);
1182 }
1183
1184 #[test]
1185 fn test_char_range() {
1186 let range = CharRange::new(CharOffset(5), CharOffset(10));
1187 assert_eq!(range.len(), 5);
1188 assert!(!range.is_empty());
1189 assert_eq!(range.as_range(), 5..10);
1190
1191 let from_raw = CharRange::from_raw(0, 5);
1192 assert_eq!(from_raw.start.0, 0);
1193 assert_eq!(from_raw.end.0, 5);
1194
1195 let from_tuple: CharRange = (2, 7).into();
1196 assert_eq!(from_tuple.len(), 5);
1197 }
1198
1199 #[test]
1200 fn test_byte_range() {
1201 let range = ByteRange::new(ByteOffset(5), ByteOffset(10));
1202 assert_eq!(range.len(), 5);
1203 assert!(!range.is_empty());
1204 assert_eq!(range.as_range(), 5..10);
1205
1206 let empty_range = ByteRange::from_raw(5, 5);
1207 assert!(empty_range.is_empty());
1208 }
1209
1210 #[test]
1211 fn test_char_offset_ordering() {
1212 let a = CharOffset(5);
1213 let b = CharOffset(10);
1214 let c = CharOffset(5);
1215
1216 assert!(a < b);
1217 assert!(b > a);
1218 assert_eq!(a, c);
1219 }
1220
1221 #[test]
1222 fn test_byte_offset_ordering() {
1223 let a = ByteOffset(5);
1224 let b = ByteOffset(10);
1225 let c = ByteOffset(5);
1226
1227 assert!(a < b);
1228 assert!(b > a);
1229 assert_eq!(a, c);
1230 }
1231}
1232
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    proptest! {
        /// Converting a full-text byte span to chars and back must land on
        /// the original byte offsets.
        #[test]
        fn roundtrip_bytes_chars_bytes(text in ".{0,100}") {
            if text.is_empty() {
                return Ok(());
            }

            let total_bytes = text.len();
            let (cs, ce) = bytes_to_chars(&text, 0, total_bytes);
            let (bs, be) = chars_to_bytes(&text, cs, ce);

            prop_assert_eq!(bs, 0);
            prop_assert_eq!(be, total_bytes);
        }

        /// Extracting a whole-text span must reproduce the text itself.
        #[test]
        fn textspan_extract_valid(text in ".{1,50}") {
            let whole = TextSpan::from_bytes(&text, 0, text.len());
            prop_assert_eq!(whole.extract(&text), &text);
        }

        /// SpanConverter must agree with direct TextSpan construction.
        #[test]
        fn converter_matches_direct(text in ".{1,50}") {
            let conv = SpanConverter::new(&text);

            let direct = TextSpan::from_bytes(&text, 0, text.len());
            let via_conv = conv.from_bytes(0, text.len());

            prop_assert_eq!(direct.char_start, via_conv.char_start);
            prop_assert_eq!(direct.char_end, via_conv.char_end);
        }

        /// Strings drawn from an ASCII-only alphabet must be detected as ASCII.
        #[test]
        fn ascii_detection(text in "[a-zA-Z0-9 ]{0,50}") {
            prop_assert!(is_ascii(&text));
        }

        /// CharOffset never alters the wrapped value across conversions.
        #[test]
        fn char_offset_roundtrip(val in 0usize..1_000_000) {
            prop_assert_eq!(CharOffset::new(val).get(), val);

            let round_tripped: usize = CharOffset::from(val).into();
            prop_assert_eq!(round_tripped, val);
        }

        /// ByteOffset never alters the wrapped value across conversions.
        #[test]
        fn byte_offset_roundtrip(val in 0usize..1_000_000) {
            prop_assert_eq!(ByteOffset::new(val).get(), val);

            let round_tripped: usize = ByteOffset::from(val).into();
            prop_assert_eq!(round_tripped, val);
        }

        /// CharRange::len is exactly end - start; emptiness matches len == 0.
        #[test]
        fn char_range_length(start in 0usize..1000, len in 0usize..1000) {
            let range = CharRange::from_raw(start, start + len);
            prop_assert_eq!(range.len(), len);
            prop_assert_eq!(range.is_empty(), len == 0);
        }

        /// ByteRange::len is exactly end - start; emptiness matches len == 0.
        #[test]
        fn byte_range_length(start in 0usize..1000, len in 0usize..1000) {
            let range = ByteRange::from_raw(start, start + len);
            prop_assert_eq!(range.len(), len);
            prop_assert_eq!(range.is_empty(), len == 0);
        }
    }
}