code_chunker/
slab.rs

1//! The Slab type: a chunk of text with position metadata.
2
/// A chunk of text together with its position in the original document.
///
/// The name "slab" evokes a physical slice of material—concrete, wood, stone.
/// Each slab is a self-contained piece that can be embedded, indexed, and
/// retrieved independently.
///
/// ## Offsets
///
/// Primary offsets (`start`/`end`) are byte offsets into the original text,
/// matching Rust's string slicing semantics:
///
/// ```rust
/// use code_chunker::Slab;
///
/// let text = "Hello, world!";
/// let slab = Slab::new("world", 7, 12, 0);
///
/// // The offsets let you recover the original position
/// assert_eq!(&text[slab.start..slab.end], "world");
/// ```
///
/// Character offsets (`char_start`/`char_end`) are automatically populated
/// when using [`Chunker::chunk`](crate::Chunker::chunk). They count Unicode
/// scalar values (`char`s), useful for NLP systems that index by character
/// position. Only `None` when using [`Chunker::chunk_bytes`](crate::Chunker::chunk_bytes)
/// directly.
///
/// ## Overlap Handling
///
/// When chunks overlap, adjacent slabs share some text. The `index` field
/// identifies each slab's position in the sequence:
///
/// ```text
/// Original: "The quick brown fox"
/// Slab 0:   "The quick b"             [0..11]
/// Slab 1:           "k brown fox"     [8..19]  <- overlaps with slab 0
/// Overlap:          "k b"             [8..11]
/// ```
///
/// ## Equality and hashing
///
/// Two slabs are equal when every field matches. `Hash` is derived alongside
/// `Eq`, so slabs can be deduplicated in a `HashSet` or used as `HashMap` keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Slab {
    /// The chunk text.
    pub text: String,
    /// Byte offset where this chunk starts in the original document.
    pub start: usize,
    /// Byte offset where this chunk ends (exclusive) in the original document.
    pub end: usize,
    /// Character offset where this chunk starts (Unicode scalar values).
    /// `None` until [`with_char_offsets`](Slab::with_char_offsets) or
    /// [`compute_char_offsets`] is called.
    pub char_start: Option<usize>,
    /// Character offset where this chunk ends (exclusive, Unicode scalar values).
    /// Set together with `char_start` by the same two functions.
    pub char_end: Option<usize>,
    /// Zero-based index of this chunk in the sequence.
    pub index: usize,
}
60
61impl Slab {
62    /// Create a new slab (byte offsets only; char offsets unset).
63    #[must_use]
64    pub fn new(text: impl Into<String>, start: usize, end: usize, index: usize) -> Self {
65        debug_assert!(
66            start <= end,
67            "Slab start ({start}) must not exceed end ({end})"
68        );
69        Self {
70            text: text.into(),
71            start,
72            end,
73            char_start: None,
74            char_end: None,
75            index,
76        }
77    }
78
79    /// Set character offsets on this slab.
80    #[must_use]
81    pub fn with_char_offsets(mut self, char_start: usize, char_end: usize) -> Self {
82        self.char_start = Some(char_start);
83        self.char_end = Some(char_end);
84        self
85    }
86
87    /// The length of this chunk in bytes.
88    #[must_use]
89    pub fn len(&self) -> usize {
90        self.text.len()
91    }
92
93    /// The length of this chunk in characters (Unicode scalar values).
94    #[must_use]
95    pub fn char_len(&self) -> usize {
96        self.text.chars().count()
97    }
98
99    /// Whether this chunk is empty.
100    #[must_use]
101    pub fn is_empty(&self) -> bool {
102        self.text.is_empty()
103    }
104
105    /// The byte span of this chunk in the original document.
106    #[must_use]
107    pub fn span(&self) -> std::ops::Range<usize> {
108        self.start..self.end
109    }
110
111    /// The character span, if computed.
112    #[must_use]
113    pub fn char_span(&self) -> Option<std::ops::Range<usize>> {
114        match (self.char_start, self.char_end) {
115            (Some(s), Some(e)) => Some(s..e),
116            _ => None,
117        }
118    }
119}
120
121/// Compute character offsets for a batch of slabs from the same document.
122///
123/// Builds a byte-to-char mapping in a single O(n) pass over the source text,
124/// then fills `char_start`/`char_end` on each slab. This is faster than
125/// per-slab computation when there are many slabs.
126///
127/// # Example
128///
129/// ```rust
130/// use code_chunker::{compute_char_offsets, Slab};
131///
132/// let text = "Hello 日本語 world";
133/// let mut slabs = vec![
134///     Slab::new("Hello ", 0, 6, 0),
135///     Slab::new("日本語", 6, 15, 1),
136/// ];
137/// compute_char_offsets(text, &mut slabs);
138///
139/// assert_eq!(slabs[0].char_start, Some(0));
140/// assert_eq!(slabs[1].char_start, Some(6));
141/// assert_eq!(slabs[1].char_end, Some(9));
142/// ```
143pub fn compute_char_offsets(text: &str, slabs: &mut [Slab]) {
144    if slabs.is_empty() {
145        return;
146    }
147
148    // Build byte->char index in one pass.
149    // byte_to_char[byte_offset] = char_offset for each char boundary.
150    // For non-boundary bytes, the value is undefined (we only look up boundaries).
151    let mut byte_to_char = vec![0usize; text.len() + 1];
152    for (char_idx, (byte_idx, _)) in text.char_indices().enumerate() {
153        byte_to_char[byte_idx] = char_idx;
154    }
155    // Sentinel: byte offset == text.len() maps to total char count.
156    byte_to_char[text.len()] = text.chars().count();
157
158    for slab in slabs.iter_mut() {
159        slab.char_start = Some(byte_to_char[slab.start]);
160        slab.char_end = Some(byte_to_char[slab.end]);
161    }
162}
163
164impl std::fmt::Display for Slab {
165    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
166        if let (Some(cs), Some(ce)) = (self.char_start, self.char_end) {
167            write!(
168                f,
169                "Slab {{ index: {}, bytes: {}..{}, chars: {}..{}, len: {} }}",
170                self.index,
171                self.start,
172                self.end,
173                cs,
174                ce,
175                self.len()
176            )
177        } else {
178            write!(
179                f,
180                "Slab {{ index: {}, span: {}..{}, len: {} }}",
181                self.index,
182                self.start,
183                self.end,
184                self.len()
185            )
186        }
187    }
188}
code_chunker/slab.rs

code_chunker/
slab.rs