anno/offset.rs
1//! Unified byte/character/token offset handling.
2//!
3//! # The Three Coordinate Systems
4//!
5//! When working with text, different tools use different ways to count positions.
6//! This causes bugs when tools disagree on where an entity starts and ends.
7//!
8//! ```text
9//! ┌──────────────────────────────────────────────────────────────────────────┐
10//! │ THE OFFSET ALIGNMENT PROBLEM │
11//! ├──────────────────────────────────────────────────────────────────────────┤
12//! │ │
13//! │ Text: "The café costs €50" │
14//! │ │
15//! │ ┌─────────────────────────────────────────────────────────────────────┐ │
16//! │ │ BYTE INDEX (what regex/file I/O returns) │ │
17//! │ │ │ │
18//! │ │ T h e c a f [ é ] c o s t s │ │
19//! │ │ 0 1 2 3 4 5 6 7-8 9 10 11 12 13 14 15 16 │ │
20//! │ │ └─2 bytes─┘ │ │
21//! │ │ │ │
//! │  │   [  €  ]  5  0                                                     │ │
//! │  │   16-17-18 19 20                                                    │ │
24//! │ │ └─3 bytes──┘ │ │
25//! │ └─────────────────────────────────────────────────────────────────────┘ │
26//! │ │
27//! │ ┌─────────────────────────────────────────────────────────────────────┐ │
28//! │ │ CHAR INDEX (what humans count, what eval tools expect) │ │
29//! │ │ │ │
//! │  │   T  h  e     c  a  f  é     c  o  s  t  s     €  5  0             │ │
//! │  │   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17             │ │
32//! │ │ └─1 char─┘ └─1 char─┘ │ │
33//! │ └─────────────────────────────────────────────────────────────────────┘ │
34//! │ │
35//! │ ┌─────────────────────────────────────────────────────────────────────┐ │
36//! │ │ TOKEN INDEX (what BERT/transformers return) │ │
37//! │ │ │ │
38//! │ │ [CLS] The café costs € 50 [SEP] │ │
39//! │ │ 0 1 2 3 4 5 6 │ │
40//! │ │ │ │
41//! │ │ But wait! "café" might be split: │ │
42//! │ │ [CLS] The ca ##fe costs € 50 [SEP] │ │
43//! │ │ 0 1 2 3 4 5 6 7 │ │
44//! │ └─────────────────────────────────────────────────────────────────────┘ │
45//! │ │
46//! │ THE PROBLEM: │
//! │  • Regex finds "€50" at byte positions (16, 21)                         │
48//! │ • Evaluation tool expects char positions (15, 18) │
49//! │ • BERT returns token positions (5, 6) │
50//! │ │
51//! │ Without conversion, your F1 score will be WRONG. │
52//! └──────────────────────────────────────────────────────────────────────────┘
53//! ```
54//!
55//! # The Subword Problem
56//!
57//! Transformer models split words into subword tokens. This breaks NER labels:
58//!
59//! ```text
60//! Text: "playing"
61//!
62//! Tokenizer: WordPiece splits unknown words
63//! "playing" → ["play", "##ing"]
64//!
65//! Problem: Which token gets the NER label?
66//!
67//! ┌────────────────────────────────────────────────────┐
68//! │ OPTION 1: First-only │
69//! │ │
70//! │ Tokens: ["play", "##ing"] │
71//! │ Labels: [B-PER, O ] ← "##ing" ignored! │
72//! │ │
73//! │ Problem: Model never learns "##ing" is part of │
74//! │ the entity. Loses signal. │
75//! ├────────────────────────────────────────────────────┤
76//! │ OPTION 2: All tokens │
77//! │ │
78//! │ Tokens: ["play", "##ing"] │
79//! │ Labels: [B-PER, I-PER ] ← Continuation! │
80//! │ │
81//! │ Better, but requires propagating labels during │
82//! │ both training AND inference. │
83//! └────────────────────────────────────────────────────┘
84//! ```
85//!
86//! # Solution: Dual Representations
87//!
88//! ```text
89//! ┌────────────────────────────────────────────────────┐
90//! │ Use TextSpan at boundaries, TokenSpan for models │
91//! ├────────────────────────────────────────────────────┤
92//! │ │
93//! │ Entity: "John" in "Hello John!" │
94//! │ │
95//! │ TextSpan { │
96//! │ byte_start: 6, byte_end: 10, │
97//! │ char_start: 6, char_end: 10, // ASCII: same│
98//! │ } │
99//! │ │
100//! │ TokenSpan { │
101//! │ token_start: 2, // [CLS] Hello John [SEP] │
102//! │ token_end: 3, // 0 1 2 3 │
103//! │ } │
104//! │ │
105//! │ Store BOTH. Convert at boundaries. │
106//! └────────────────────────────────────────────────────┘
107//! ```
108//!
109//! This module provides:
110//! - [`TextSpan`]: Stores both byte and char offsets together
111//! - [`TokenSpan`]: Stores subword token indices
112//! - [`OffsetMapping`]: Maps between token ↔ character positions
113//! - [`CharOffset`]: Newtype wrapper for character offsets (type safety)
114//! - [`ByteOffset`]: Newtype wrapper for byte offsets (type safety)
115//!
116//! # API Boundary Conventions
117//!
118//! Anno uses **character offsets** as the canonical representation at API boundaries:
119//!
120//! | Type | Offset Convention | Notes |
121//! |------|-------------------|-------|
122//! | `Entity.start/end` | Character | Public API, evaluation, serialization |
123//! | `Signal` with `Location::Text` | Character | Grounded document model |
124//! | `Span::Text` | Character | Entity span representation |
125//! | Backend internals | Often byte | Regex, JSON parsing, byte slicing |
126//! | Token indices | Token | BERT/transformer models |
127//!
128//! **Rule of thumb**: Convert to character offsets as early as possible (at the
129//! backend boundary), and use the newtype wrappers (`CharOffset`, `ByteOffset`)
130//! when you need to be explicit about which you're working with.
131//!
132//! # Type Safety with Newtypes
133//!
134//! The most common source of Unicode bugs is accidentally mixing byte and character
135//! offsets. Use the newtype wrappers to make this impossible at compile time:
136//!
137//! ```rust
138//! use anno::offset::{CharOffset, ByteOffset};
139//!
140//! fn process_span(start: CharOffset, end: CharOffset) {
141//! // Can only receive CharOffset, not ByteOffset
142//! }
143//!
144//! let char_pos = CharOffset(5);
145//! let byte_pos = ByteOffset(10);
146//!
147//! process_span(char_pos, CharOffset(10)); // OK
148//! // process_span(byte_pos, CharOffset(10)); // Compile error!
149//! ```
150
151use serde::{Deserialize, Serialize};
152use std::ops::Range;
153
154// =============================================================================
155// Newtype Wrappers for Type Safety
156// =============================================================================
157
/// A character offset (Unicode scalar value index).
///
/// Use this newtype to prevent accidentally passing byte offsets where
/// character offsets are expected. This is the most common source of
/// Unicode-related bugs in NLP code.
///
/// Comparison, ordering, and hashing all delegate to the wrapped `usize`,
/// so `CharOffset` can be used directly as a sort key or map key.
///
/// # Example
///
/// ```rust
/// use anno::offset::CharOffset;
///
/// let text = "日本語"; // 3 chars, 9 bytes
/// let pos = CharOffset(1); // Second character (本)
/// assert_eq!(pos.0, 1);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[repr(transparent)]
pub struct CharOffset(pub usize);
176
177impl CharOffset {
178 /// Create a new character offset.
179 #[must_use]
180 pub const fn new(offset: usize) -> Self {
181 Self(offset)
182 }
183
184 /// Get the raw value.
185 #[must_use]
186 pub const fn get(self) -> usize {
187 self.0
188 }
189}
190
/// Convert a raw `usize` into a `CharOffset` (enables `.into()` at call sites).
impl From<usize> for CharOffset {
    fn from(offset: usize) -> Self {
        Self(offset)
    }
}
196
/// Unwrap a `CharOffset` back into a raw `usize`.
impl From<CharOffset> for usize {
    fn from(offset: CharOffset) -> Self {
        offset.0
    }
}
202
/// A byte offset (raw byte index into UTF-8 string).
///
/// Use this newtype to prevent accidentally passing character offsets where
/// byte offsets are expected. Byte offsets are what Rust's `str::get()` and
/// regex libraries return.
///
/// Comparison, ordering, and hashing all delegate to the wrapped `usize`,
/// so `ByteOffset` can be used directly as a sort key or map key.
///
/// # Example
///
/// ```rust
/// use anno::offset::ByteOffset;
///
/// let text = "日本語"; // 3 chars, 9 bytes
/// let pos = ByteOffset(3); // Start of second character (本)
/// assert_eq!(pos.0, 3);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[repr(transparent)]
pub struct ByteOffset(pub usize);
221
222impl ByteOffset {
223 /// Create a new byte offset.
224 #[must_use]
225 pub const fn new(offset: usize) -> Self {
226 Self(offset)
227 }
228
229 /// Get the raw value.
230 #[must_use]
231 pub const fn get(self) -> usize {
232 self.0
233 }
234}
235
/// Convert a raw `usize` into a `ByteOffset` (enables `.into()` at call sites).
impl From<usize> for ByteOffset {
    fn from(offset: usize) -> Self {
        Self(offset)
    }
}
241
/// Unwrap a `ByteOffset` back into a raw `usize`.
impl From<ByteOffset> for usize {
    fn from(offset: ByteOffset) -> Self {
        offset.0
    }
}
247
/// A character range (start and end as character offsets).
///
/// This is a convenience type for APIs that need to express spans
/// in character coordinates with compile-time type safety.
/// The range is half-open: `start` is inclusive, `end` is exclusive.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct CharRange {
    /// Start offset (inclusive)
    pub start: CharOffset,
    /// End offset (exclusive)
    pub end: CharOffset,
}
259
260impl CharRange {
261 /// Create a new character range.
262 #[must_use]
263 pub const fn new(start: CharOffset, end: CharOffset) -> Self {
264 Self { start, end }
265 }
266
267 /// Create from raw usize values.
268 #[must_use]
269 pub const fn from_raw(start: usize, end: usize) -> Self {
270 Self {
271 start: CharOffset(start),
272 end: CharOffset(end),
273 }
274 }
275
276 /// Length in characters.
277 #[must_use]
278 pub const fn len(&self) -> usize {
279 self.end.0.saturating_sub(self.start.0)
280 }
281
282 /// Check if empty.
283 #[must_use]
284 pub const fn is_empty(&self) -> bool {
285 self.start.0 >= self.end.0
286 }
287
288 /// Convert to a standard Range.
289 #[must_use]
290 pub const fn as_range(&self) -> Range<usize> {
291 self.start.0..self.end.0
292 }
293}
294
/// Build a `CharRange` from a raw `(start, end)` tuple.
impl From<(usize, usize)> for CharRange {
    fn from((start, end): (usize, usize)) -> Self {
        Self::from_raw(start, end)
    }
}
300
/// A byte range (start and end as byte offsets).
///
/// Use for APIs that work with raw byte positions (regex, file I/O).
/// The range is half-open: `start` is inclusive, `end` is exclusive.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ByteRange {
    /// Start offset (inclusive)
    pub start: ByteOffset,
    /// End offset (exclusive)
    pub end: ByteOffset,
}
311
312impl ByteRange {
313 /// Create a new byte range.
314 #[must_use]
315 pub const fn new(start: ByteOffset, end: ByteOffset) -> Self {
316 Self { start, end }
317 }
318
319 /// Create from raw usize values.
320 #[must_use]
321 pub const fn from_raw(start: usize, end: usize) -> Self {
322 Self {
323 start: ByteOffset(start),
324 end: ByteOffset(end),
325 }
326 }
327
328 /// Length in bytes.
329 #[must_use]
330 pub const fn len(&self) -> usize {
331 self.end.0.saturating_sub(self.start.0)
332 }
333
334 /// Check if empty.
335 #[must_use]
336 pub const fn is_empty(&self) -> bool {
337 self.start.0 >= self.end.0
338 }
339
340 /// Convert to a standard Range.
341 #[must_use]
342 pub const fn as_range(&self) -> Range<usize> {
343 self.start.0..self.end.0
344 }
345}
346
/// Build a `ByteRange` from a raw `(start, end)` tuple.
impl From<(usize, usize)> for ByteRange {
    fn from((start, end): (usize, usize)) -> Self {
        Self::from_raw(start, end)
    }
}
352
/// A text span with both byte and character offsets.
///
/// This is the canonical representation for entity positions.
/// Store both to avoid repeated conversion.
///
/// The two offset pairs are expected to describe the same half-open region
/// of the same text. The constructors maintain this, but the fields are
/// public, so the invariant is not enforced by the type system.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TextSpan {
    /// Byte offset (start, inclusive)
    pub byte_start: usize,
    /// Byte offset (end, exclusive)
    pub byte_end: usize,
    /// Character offset (start, inclusive)
    pub char_start: usize,
    /// Character offset (end, exclusive)
    pub char_end: usize,
}
368
369impl TextSpan {
370 /// Create a span from byte offsets, computing char offsets from text.
371 ///
372 /// # Arguments
373 /// * `text` - The full text (needed to compute char offsets)
374 /// * `byte_start` - Byte offset start (inclusive)
375 /// * `byte_end` - Byte offset end (exclusive)
376 ///
377 /// # Example
378 /// ```
379 /// use anno::offset::TextSpan;
380 ///
381 /// let text = "Price €50";
382 /// // "Price " = 6 bytes, € = 3 bytes, "50" = 2 bytes = 11 total
383 /// let span = TextSpan::from_bytes(text, 6, 11); // "€50"
384 /// assert_eq!(span.char_start, 6);
385 /// assert_eq!(span.char_end, 9); // € is 1 char but 3 bytes
386 /// ```
387 #[must_use]
388 pub fn from_bytes(text: &str, byte_start: usize, byte_end: usize) -> Self {
389 let (char_start, char_end) = bytes_to_chars(text, byte_start, byte_end);
390 Self {
391 byte_start,
392 byte_end,
393 char_start,
394 char_end,
395 }
396 }
397
398 /// Create a span from character offsets, computing byte offsets from text.
399 ///
400 /// # Arguments
401 /// * `text` - The full text (needed to compute byte offsets)
402 /// * `char_start` - Character offset start (inclusive)
403 /// * `char_end` - Character offset end (exclusive)
404 #[must_use]
405 pub fn from_chars(text: &str, char_start: usize, char_end: usize) -> Self {
406 let (byte_start, byte_end) = chars_to_bytes(text, char_start, char_end);
407 Self {
408 byte_start,
409 byte_end,
410 char_start,
411 char_end,
412 }
413 }
414
415 /// Create a span for ASCII text where byte == char offsets.
416 ///
417 /// This is a fast path for ASCII-only text.
418 #[must_use]
419 pub const fn ascii(start: usize, end: usize) -> Self {
420 Self {
421 byte_start: start,
422 byte_end: end,
423 char_start: start,
424 char_end: end,
425 }
426 }
427
428 /// Get byte range.
429 #[must_use]
430 pub const fn byte_range(&self) -> Range<usize> {
431 self.byte_start..self.byte_end
432 }
433
434 /// Get character range.
435 #[must_use]
436 pub const fn char_range(&self) -> Range<usize> {
437 self.char_start..self.char_end
438 }
439
440 /// Byte length.
441 #[must_use]
442 pub const fn byte_len(&self) -> usize {
443 self.byte_end.saturating_sub(self.byte_start)
444 }
445
446 /// Character length.
447 #[must_use]
448 pub const fn char_len(&self) -> usize {
449 self.char_end.saturating_sub(self.char_start)
450 }
451
452 /// Check if this span is empty.
453 #[must_use]
454 pub const fn is_empty(&self) -> bool {
455 self.byte_start >= self.byte_end
456 }
457
458 /// Check if this is ASCII (byte == char offsets).
459 #[must_use]
460 pub const fn is_ascii(&self) -> bool {
461 self.byte_start == self.char_start && self.byte_end == self.char_end
462 }
463
464 /// Extract the text for this span.
465 #[must_use]
466 pub fn extract<'a>(&self, text: &'a str) -> &'a str {
467 text.get(self.byte_start..self.byte_end).unwrap_or("")
468 }
469}
470
/// Create from a byte range, assuming ASCII text (byte == char offsets).
///
/// NOTE(review): only correct for ASCII input; for non-ASCII text use
/// `TextSpan::from_bytes` so the char offsets are derived properly.
impl From<Range<usize>> for TextSpan {
    /// Create from byte range (assumes ASCII).
    fn from(range: Range<usize>) -> Self {
        Self::ascii(range.start, range.end)
    }
}
477
478// =============================================================================
479// Token Span (Subword-Level)
480// =============================================================================
481
/// Span in subword token space.
///
/// # Research Context (BERT for NER, NAACL 2019)
///
/// Transformer models operate on subword tokens, not characters.
/// Entity boundaries often split mid-token:
///
/// ```text
/// Text: "New York City"
/// Tokens: ["New", "York", "City"] <- clean split
/// Token IDs: [2739, 1816, 2103]
/// TokenSpan: (0, 3) for "New York City"
///
/// Text: "playing"
/// Tokens: ["play", "##ing"] <- mid-word split
/// Token IDs: [2377, 2075]
/// TokenSpan: (0, 2) for "playing"
/// ```
///
/// Key insight: When propagating BIO labels to continuation tokens (##),
/// use I- prefix to avoid treating them as separate entities.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TokenSpan {
    /// Token index (start, inclusive)
    pub start: usize,
    /// Token index (end, exclusive)
    pub end: usize,
    /// Original text span (byte and char coordinates), kept so the surface
    /// text can be recovered without re-running the tokenizer
    pub text_span: TextSpan,
}
512
513impl TokenSpan {
514 /// Create a token span with its corresponding text span.
515 #[must_use]
516 pub const fn new(start: usize, end: usize, text_span: TextSpan) -> Self {
517 Self {
518 start,
519 end,
520 text_span,
521 }
522 }
523
524 /// Number of tokens in this span.
525 #[must_use]
526 pub const fn len(&self) -> usize {
527 self.end.saturating_sub(self.start)
528 }
529
530 /// Check if empty.
531 #[must_use]
532 pub const fn is_empty(&self) -> bool {
533 self.start >= self.end
534 }
535
536 /// Token range.
537 #[must_use]
538 pub const fn token_range(&self) -> Range<usize> {
539 self.start..self.end
540 }
541}
542
/// Offset mapping from tokenizer.
///
/// Maps each token to its character span in the original text.
/// Used to convert between token indices and character positions.
///
/// # Research Note (HuggingFace Tokenizers)
///
/// The `offset_mapping` from HuggingFace tokenizers is a list of
/// `(char_start, char_end)` for each token. Special tokens like
/// `[CLS]` and `[SEP]` have offset `(0, 0)`.
#[derive(Debug, Clone)]
pub struct OffsetMapping {
    /// Character spans for each token: `[(char_start, char_end), ...]`
    offsets: Vec<(usize, usize)>,
}

impl OffsetMapping {
    /// Create from tokenizer output.
    ///
    /// # Arguments
    /// * `offsets` - List of `(char_start, char_end)` for each token.
    ///   Special tokens (e.g. `[CLS]`, `[SEP]`) are conventionally `(0, 0)`.
    #[must_use]
    pub fn new(offsets: Vec<(usize, usize)>) -> Self {
        Self { offsets }
    }

    /// Get the character span for a token, or `None` if the index is out of bounds.
    #[must_use]
    pub fn get(&self, token_idx: usize) -> Option<(usize, usize)> {
        self.offsets.get(token_idx).copied()
    }

    /// Find tokens that overlap with a character span.
    ///
    /// Returns `(first_token, last_token_exclusive)`, or `None` when no
    /// token overlaps `[char_start, char_end)`.
    ///
    /// # Note on Label Propagation
    ///
    /// For entity "playing" tokenized as `["play", "##ing"]`:
    /// - Assign B-PER to "play" (first token)
    /// - Assign I-PER to "##ing" (continuation)
    #[must_use]
    pub fn char_span_to_tokens(
        &self,
        char_start: usize,
        char_end: usize,
    ) -> Option<(usize, usize)> {
        let mut first_token = None;
        let mut last_token = 0;

        for (idx, &(tok_start, tok_end)) in self.offsets.iter().enumerate() {
            // Half-open interval overlap test. Special tokens with offset
            // (0, 0) can never pass it (`tok_end > char_start` requires
            // tok_end >= 1), so the former explicit skip was dead code.
            if tok_end > char_start && tok_start < char_end {
                if first_token.is_none() {
                    first_token = Some(idx);
                }
                last_token = idx + 1;
            }
        }

        first_token.map(|first| (first, last_token))
    }

    /// Convert a token span to a character span.
    ///
    /// Special tokens `(0, 0)` inside the span are skipped. Returns `None`
    /// for an empty/inverted span, an out-of-bounds end, or a span that
    /// contains only special tokens.
    #[must_use]
    pub fn tokens_to_char_span(
        &self,
        token_start: usize,
        token_end: usize,
    ) -> Option<(usize, usize)> {
        if token_start >= token_end || token_end > self.offsets.len() {
            return None;
        }

        let window = &self.offsets[token_start..token_end];
        let is_real = |&&(s, e): &&(usize, usize)| !(s == 0 && e == 0);

        // First non-special token supplies the start; last one supplies the
        // end. If every token in the window is special, both lookups yield
        // None. (The previous fallback that substituted the raw start offset
        // was unreachable in effect: char_end would fail first anyway.)
        let char_start = window.iter().find(is_real).map(|&(s, _)| s)?;
        let char_end = window.iter().rev().find(is_real).map(|&(_, e)| e)?;

        Some((char_start, char_end))
    }

    /// Number of tokens.
    #[must_use]
    pub fn len(&self) -> usize {
        self.offsets.len()
    }

    /// Check if empty.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.offsets.is_empty()
    }
}
670
671// =============================================================================
672// Conversion Functions
673// =============================================================================
674
/// Convert byte offsets to character offsets.
///
/// Handles byte offsets that fall in the middle of multi-byte UTF-8
/// characters by widening to the containing character.
///
/// # Arguments
///
/// * `text` - The source text
/// * `byte_start` - Byte offset for the start (may be mid-character)
/// * `byte_end` - Byte offset for the end (may be mid-character); expected
///   to be `>= byte_start`
///
/// # Returns
///
/// A tuple `(char_start, char_end)` where:
/// - `char_start` is the index of the character containing `byte_start`
/// - `char_end` is exclusive: one past the character containing `byte_end - 1`
///
/// # Behavior
///
/// - A `byte_start` inside a multi-byte character maps to that character's index
/// - A `byte_end` inside a multi-byte character maps past that character
///   (spans are widened, never truncated)
/// - Offsets beyond the text length clamp to the total character count
#[must_use]
pub fn bytes_to_chars(text: &str, byte_start: usize, byte_end: usize) -> (usize, usize) {
    // char_start = number of characters that END at or before byte_start,
    // i.e. the index of the character containing byte_start (or the total
    // char count when byte_start is at/past the end of the text).
    //
    // char_end = number of characters that START before byte_end, i.e. an
    // exclusive end that includes the character containing byte_end - 1.
    //
    // This replaces a hand-rolled scan that carried an unreachable
    // `byte_idx > byte_end` branch (chars are contiguous from byte 0, so a
    // passed byte_end is always caught earlier) and a redundant
    // end-of-string fallback.
    let mut char_start = 0;
    let mut char_end = 0;

    for (byte_idx, ch) in text.char_indices() {
        if byte_idx + ch.len_utf8() <= byte_start {
            char_start += 1;
        }
        if byte_idx < byte_end {
            char_end += 1;
        }
    }

    (char_start, char_end)
}
766
/// Convert character offsets to byte offsets.
///
/// Returns `(byte_start, byte_end)` for the half-open character span
/// `[char_start, char_end)`. Offsets at or past the end of the text clamp
/// to `text.len()`.
#[must_use]
pub fn chars_to_bytes(text: &str, char_start: usize, char_end: usize) -> (usize, usize) {
    let mut start_byte = 0;
    let mut start_found = false;

    // Single pass: record where char_start begins, and return as soon as
    // char_end is reached.
    for (char_idx, (byte_idx, _)) in text.char_indices().enumerate() {
        if char_idx == char_start {
            start_byte = byte_idx;
            start_found = true;
        }
        if char_idx == char_end {
            return (start_byte, byte_idx);
        }
    }

    // char_end (and possibly char_start) lie at or past the end of text.
    if start_found {
        (start_byte, text.len())
    } else {
        (text.len(), text.len())
    }
}
791
/// Build an offset mapping table for efficient repeated conversions.
///
/// Returns a vec of length `text.len() + 1` where `mapping[byte_idx]` gives
/// the index of the character containing that byte, and the final entry
/// `mapping[text.len()]` is the total character count (the exclusive end
/// position). Useful when converting many spans from the same text.
#[must_use]
pub fn build_byte_to_char_map(text: &str) -> Vec<usize> {
    let mut map = vec![0usize; text.len() + 1];
    let mut char_count = 0;

    for (char_idx, (byte_idx, ch)) in text.char_indices().enumerate() {
        // Every byte of a multi-byte character maps to the same char index.
        // byte_idx + len_utf8() <= text.len() always holds, so the former
        // per-byte `< map.len()` guard was unreachable and has been removed.
        for b in byte_idx..byte_idx + ch.len_utf8() {
            map[b] = char_idx;
        }
        char_count = char_idx + 1;
    }

    // Exclusive end position. Tracking the count in the loop avoids the
    // former extra `chars().count()` pass over the text.
    map[text.len()] = char_count;
    map
}
817
/// Build an offset mapping table from char to byte.
///
/// Returns a vec where `mapping[char_idx]` gives the byte index; the final
/// entry `mapping[char_count]` is `text.len()` (the exclusive end position).
#[must_use]
pub fn build_char_to_byte_map(text: &str) -> Vec<usize> {
    // Each character's starting byte, in order, followed by the end sentinel.
    let mut map: Vec<usize> = text.char_indices().map(|(byte_idx, _)| byte_idx).collect();
    map.push(text.len());
    map
}
837
/// Fast check if text is ASCII-only.
///
/// For ASCII text, byte and char offsets coincide, so no conversion
/// tables are needed.
#[must_use]
pub fn is_ascii(text: &str) -> bool {
    text.bytes().all(|b| b.is_ascii())
}
843
844// =============================================================================
845// Span Converter (batch operations)
846// =============================================================================
847
/// Converter for efficiently handling many spans from the same text.
///
/// Pre-computes mapping tables so each conversion is O(1).
/// For ASCII text no tables are built and conversions are the identity.
pub struct SpanConverter {
    // byte index -> char index table (left empty when `is_ascii`)
    byte_to_char: Vec<usize>,
    // char index -> byte index table (left empty when `is_ascii`)
    char_to_byte: Vec<usize>,
    // Fast-path flag: for pure-ASCII text the two coordinate systems coincide
    is_ascii: bool,
}
856
857impl SpanConverter {
858 /// Create a converter for the given text.
859 #[must_use]
860 pub fn new(text: &str) -> Self {
861 let is_ascii = is_ascii(text);
862 if is_ascii {
863 // For ASCII, mappings are identity
864 Self {
865 byte_to_char: Vec::new(),
866 char_to_byte: Vec::new(),
867 is_ascii: true,
868 }
869 } else {
870 Self {
871 byte_to_char: build_byte_to_char_map(text),
872 char_to_byte: build_char_to_byte_map(text),
873 is_ascii: false,
874 }
875 }
876 }
877
878 /// Convert byte offset to char offset.
879 ///
880 /// # Arguments
881 ///
882 /// * `byte_idx` - Byte offset in the text
883 ///
884 /// # Returns
885 ///
886 /// Character offset corresponding to the byte offset. If `byte_idx` is out of bounds,
887 /// returns the last valid character index (or 0 if the map is empty).
888 ///
889 /// # Panics
890 ///
891 /// In debug mode, panics if `byte_idx` exceeds the text length by more than 1
892 /// (allowing for the exclusive end position).
893 #[must_use]
894 pub fn byte_to_char(&self, byte_idx: usize) -> usize {
895 if self.is_ascii {
896 byte_idx
897 } else {
898 self.byte_to_char.get(byte_idx).copied().unwrap_or_else(|| {
899 // Bounds check: byte_idx should be <= text.len() (inclusive end position)
900 // The map has length text.len() + 1 to include the end position
901 #[cfg(debug_assertions)]
902 {
903 let max_valid = self.byte_to_char.len().saturating_sub(1);
904 if byte_idx > max_valid {
905 debug_assert!(
906 byte_idx <= max_valid + 1,
907 "byte_idx {} out of bounds (max valid: {}, map len: {})",
908 byte_idx,
909 max_valid,
910 self.byte_to_char.len()
911 );
912 }
913 }
914 self.byte_to_char.last().copied().unwrap_or(0)
915 })
916 }
917 }
918
919 /// Convert char offset to byte offset.
920 ///
921 /// # Arguments
922 ///
923 /// * `char_idx` - Character offset in the text
924 ///
925 /// # Returns
926 ///
927 /// Byte offset corresponding to the character offset. If `char_idx` is out of bounds,
928 /// returns the last valid byte index (or 0 if the map is empty).
929 ///
930 /// # Panics
931 ///
932 /// In debug mode, panics if `char_idx` exceeds the character count by more than 1
933 /// (allowing for the exclusive end position).
934 #[must_use]
935 pub fn char_to_byte(&self, char_idx: usize) -> usize {
936 if self.is_ascii {
937 char_idx
938 } else {
939 self.char_to_byte.get(char_idx).copied().unwrap_or_else(|| {
940 // Bounds check: char_idx should be <= char_count (inclusive end position)
941 // The map has length char_count + 1 to include the end position
942 #[cfg(debug_assertions)]
943 {
944 let max_valid = self.char_to_byte.len().saturating_sub(1);
945 if char_idx > max_valid {
946 debug_assert!(
947 char_idx <= max_valid + 1,
948 "char_idx {} out of bounds (max valid: {}, map len: {})",
949 char_idx,
950 max_valid,
951 self.char_to_byte.len()
952 );
953 }
954 }
955 self.char_to_byte.last().copied().unwrap_or(0)
956 })
957 }
958 }
959
960 /// Convert byte span to TextSpan.
961 #[must_use]
962 pub fn from_bytes(&self, byte_start: usize, byte_end: usize) -> TextSpan {
963 TextSpan {
964 byte_start,
965 byte_end,
966 char_start: self.byte_to_char(byte_start),
967 char_end: self.byte_to_char(byte_end),
968 }
969 }
970
971 /// Convert char span to TextSpan.
972 #[must_use]
973 pub fn from_chars(&self, char_start: usize, char_end: usize) -> TextSpan {
974 TextSpan {
975 byte_start: self.char_to_byte(char_start),
976 byte_end: self.char_to_byte(char_end),
977 char_start,
978 char_end,
979 }
980 }
981
982 /// Check if this text is ASCII.
983 #[must_use]
984 pub const fn is_ascii(&self) -> bool {
985 self.is_ascii
986 }
987}
988
989// =============================================================================
990// Tests
991// =============================================================================
992
993#[cfg(test)]
994mod tests {
995 use super::*;
996
    // ASCII fast path: byte and char coordinates must coincide exactly.
    #[test]
    fn test_ascii_text() {
        let text = "Hello World";
        let span = TextSpan::from_bytes(text, 0, 5);

        assert_eq!(span.byte_start, 0);
        assert_eq!(span.byte_end, 5);
        assert_eq!(span.char_start, 0);
        assert_eq!(span.char_end, 5);
        assert!(span.is_ascii());
        assert_eq!(span.extract(text), "Hello");
    }
1009
    // 3-byte UTF-8 symbol (€): byte and char coordinates diverge.
    #[test]
    fn test_euro_symbol() {
        let text = "Price €50";
        // "Price " = 6 bytes, 6 chars
        // € = 3 bytes (E2 82 AC), 1 char
        // "50" = 2 bytes, 2 chars
        // Total: 11 bytes, 9 chars
        //
        // "€50" starts at byte 6, ends at byte 11
        // "€50" starts at char 6, ends at char 9

        let span = TextSpan::from_bytes(text, 6, 11);

        assert_eq!(span.byte_start, 6);
        assert_eq!(span.byte_end, 11);
        assert_eq!(span.char_start, 6);
        assert_eq!(span.char_end, 9);
        assert!(!span.is_ascii());
        assert_eq!(span.extract(text), "€50");
    }
1030
    // 2-byte UTF-8 symbol (£): a smaller byte/char divergence.
    #[test]
    fn test_pound_symbol() {
        let text = "Fee: £25";
        // "Fee: " = 5 bytes, 5 chars
        // £ = 2 bytes (C2 A3), 1 char
        // "25" = 2 bytes, 2 chars
        // Total: 9 bytes, 8 chars
        //
        // "£25" starts at byte 5, ends at byte 9
        // "£25" starts at char 5, ends at char 8

        let span = TextSpan::from_bytes(text, 5, 9);

        assert_eq!(span.byte_start, 5);
        assert_eq!(span.byte_end, 9);
        assert_eq!(span.char_start, 5);
        assert_eq!(span.char_end, 8);
        assert_eq!(span.extract(text), "£25");
    }
1050
    // 4-byte emoji: everything after it shifts by 3 in byte space.
    #[test]
    fn test_emoji() {
        let text = "Hello 👋 World";
        // "Hello " = 6 bytes, 6 chars
        // 👋 = 4 bytes, 1 char
        // " World" = 6 bytes, 6 chars
        // Total: 16 bytes, 13 chars
        //
        // "World" starts at byte 11, ends at byte 16
        // "World" starts at char 8, ends at char 13

        let span = TextSpan::from_bytes(text, 11, 16);

        assert_eq!(span.char_start, 8);
        assert_eq!(span.char_end, 13);
        assert_eq!(span.extract(text), "World");
    }
1068
    // CJK: each ideograph is 3 bytes but a single char.
    #[test]
    fn test_cjk() {
        let text = "日本語 test";
        // 日 = 3 bytes, 1 char
        // 本 = 3 bytes, 1 char
        // 語 = 3 bytes, 1 char
        // " " = 1 byte, 1 char
        // "test" = 4 bytes, 4 chars
        // Total: 14 bytes, 8 chars
        //
        // "test" starts at byte 10, ends at byte 14
        // "test" starts at char 4, ends at char 8

        let span = TextSpan::from_bytes(text, 10, 14);

        assert_eq!(span.char_start, 4);
        assert_eq!(span.char_end, 8);
        assert_eq!(span.extract(text), "test");
    }
1088
1089 #[test]
1090 fn test_from_chars() {
1091 let text = "Price €50";
1092 // "€50" is chars 6..9
1093
1094 let span = TextSpan::from_chars(text, 6, 9);
1095
1096 assert_eq!(span.char_start, 6);
1097 assert_eq!(span.char_end, 9);
1098 assert_eq!(span.byte_start, 6);
1099 assert_eq!(span.byte_end, 11);
1100 assert_eq!(span.extract(text), "€50");
1101 }
1102
1103 #[test]
1104 fn test_converter_ascii() {
1105 let text = "Hello World";
1106 let conv = SpanConverter::new(text);
1107
1108 assert!(conv.is_ascii());
1109 assert_eq!(conv.byte_to_char(5), 5);
1110 assert_eq!(conv.char_to_byte(5), 5);
1111 }
1112
1113 #[test]
1114 fn test_converter_unicode() {
1115 let text = "Price €50";
1116 let conv = SpanConverter::new(text);
1117
1118 assert!(!conv.is_ascii());
1119
1120 // Byte 6 -> Char 6 (start of €)
1121 assert_eq!(conv.byte_to_char(6), 6);
1122 // Byte 9 -> Char 7 (end of €, which spans bytes 6-8)
1123 assert_eq!(conv.byte_to_char(9), 7);
1124 // Byte 11 -> Char 9 (end of string)
1125 assert_eq!(conv.byte_to_char(11), 9);
1126
1127 // Char 6 -> Byte 6
1128 assert_eq!(conv.char_to_byte(6), 6);
1129 // Char 9 -> Byte 11
1130 assert_eq!(conv.char_to_byte(9), 11);
1131 }
1132
1133 #[test]
1134 fn test_empty_span() {
1135 let text = "test";
1136 let span = TextSpan::from_bytes(text, 2, 2);
1137
1138 assert!(span.is_empty());
1139 assert_eq!(span.byte_len(), 0);
1140 assert_eq!(span.char_len(), 0);
1141 }
1142
1143 #[test]
1144 fn test_full_text_span() {
1145 let text = "日本語";
1146 let span = TextSpan::from_bytes(text, 0, text.len());
1147
1148 assert_eq!(span.char_start, 0);
1149 assert_eq!(span.char_end, 3);
1150 assert_eq!(span.byte_len(), 9);
1151 assert_eq!(span.char_len(), 3);
1152 }
1153
1154 // =========================================================================
1155 // Newtype wrapper tests
1156 // =========================================================================
1157
1158 #[test]
1159 fn test_char_offset_newtype() {
1160 let offset = CharOffset::new(5);
1161 assert_eq!(offset.get(), 5);
1162 assert_eq!(offset.0, 5);
1163
1164 let from_usize: CharOffset = 10.into();
1165 assert_eq!(from_usize.get(), 10);
1166
1167 let back_to_usize: usize = CharOffset(15).into();
1168 assert_eq!(back_to_usize, 15);
1169 }
1170
1171 #[test]
1172 fn test_byte_offset_newtype() {
1173 let offset = ByteOffset::new(5);
1174 assert_eq!(offset.get(), 5);
1175 assert_eq!(offset.0, 5);
1176
1177 let from_usize: ByteOffset = 10.into();
1178 assert_eq!(from_usize.get(), 10);
1179
1180 let back_to_usize: usize = ByteOffset(15).into();
1181 assert_eq!(back_to_usize, 15);
1182 }
1183
1184 #[test]
1185 fn test_char_range() {
1186 let range = CharRange::new(CharOffset(5), CharOffset(10));
1187 assert_eq!(range.len(), 5);
1188 assert!(!range.is_empty());
1189 assert_eq!(range.as_range(), 5..10);
1190
1191 let from_raw = CharRange::from_raw(0, 5);
1192 assert_eq!(from_raw.start.0, 0);
1193 assert_eq!(from_raw.end.0, 5);
1194
1195 let from_tuple: CharRange = (2, 7).into();
1196 assert_eq!(from_tuple.len(), 5);
1197 }
1198
1199 #[test]
1200 fn test_byte_range() {
1201 let range = ByteRange::new(ByteOffset(5), ByteOffset(10));
1202 assert_eq!(range.len(), 5);
1203 assert!(!range.is_empty());
1204 assert_eq!(range.as_range(), 5..10);
1205
1206 let empty_range = ByteRange::from_raw(5, 5);
1207 assert!(empty_range.is_empty());
1208 }
1209
1210 #[test]
1211 fn test_char_offset_ordering() {
1212 let a = CharOffset(5);
1213 let b = CharOffset(10);
1214 let c = CharOffset(5);
1215
1216 assert!(a < b);
1217 assert!(b > a);
1218 assert_eq!(a, c);
1219 }
1220
1221 #[test]
1222 fn test_byte_offset_ordering() {
1223 let a = ByteOffset(5);
1224 let b = ByteOffset(10);
1225 let c = ByteOffset(5);
1226
1227 assert!(a < b);
1228 assert!(b > a);
1229 assert_eq!(a, c);
1230 }
1231}
1232
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    proptest! {
        /// Converting a full-text byte span to chars and back must land on
        /// the original byte offsets.
        #[test]
        fn roundtrip_bytes_chars_bytes(text in ".{0,100}") {
            if text.is_empty() {
                return Ok(());
            }

            let total_bytes = text.len();
            let (cs, ce) = bytes_to_chars(&text, 0, total_bytes);
            let (bs, be) = chars_to_bytes(&text, cs, ce);

            prop_assert_eq!(bs, 0);
            prop_assert_eq!(be, total_bytes);
        }

        /// Extracting a whole-text span must reproduce the text itself.
        #[test]
        fn textspan_extract_valid(text in ".{1,50}") {
            let whole = TextSpan::from_bytes(&text, 0, text.len());
            prop_assert_eq!(whole.extract(&text), &text);
        }

        /// SpanConverter must agree with direct TextSpan construction.
        #[test]
        fn converter_matches_direct(text in ".{1,50}") {
            let conv = SpanConverter::new(&text);

            let direct = TextSpan::from_bytes(&text, 0, text.len());
            let via_conv = conv.from_bytes(0, text.len());

            prop_assert_eq!(direct.char_start, via_conv.char_start);
            prop_assert_eq!(direct.char_end, via_conv.char_end);
        }

        /// Strings drawn from an ASCII-only alphabet must be detected as ASCII.
        #[test]
        fn ascii_detection(text in "[a-zA-Z0-9 ]{0,50}") {
            prop_assert!(is_ascii(&text));
        }

        /// CharOffset never alters the wrapped value across conversions.
        #[test]
        fn char_offset_roundtrip(val in 0usize..1_000_000) {
            prop_assert_eq!(CharOffset::new(val).get(), val);

            let round_tripped: usize = CharOffset::from(val).into();
            prop_assert_eq!(round_tripped, val);
        }

        /// ByteOffset never alters the wrapped value across conversions.
        #[test]
        fn byte_offset_roundtrip(val in 0usize..1_000_000) {
            prop_assert_eq!(ByteOffset::new(val).get(), val);

            let round_tripped: usize = ByteOffset::from(val).into();
            prop_assert_eq!(round_tripped, val);
        }

        /// CharRange::len is exactly end - start; emptiness matches len == 0.
        #[test]
        fn char_range_length(start in 0usize..1000, len in 0usize..1000) {
            let range = CharRange::from_raw(start, start + len);
            prop_assert_eq!(range.len(), len);
            prop_assert_eq!(range.is_empty(), len == 0);
        }

        /// ByteRange::len is exactly end - start; emptiness matches len == 0.
        #[test]
        fn byte_range_length(start in 0usize..1000, len in 0usize..1000) {
            let range = ByteRange::from_raw(start, start + len);
            prop_assert_eq!(range.len(), len);
            prop_assert_eq!(range.is_empty(), len == 0);
        }
    }
}