code_chunker/slab.rs
1//! The Slab type: a chunk of text with position metadata.
2
/// A chunk of text with its position in the original document.
///
/// The name "slab" evokes a physical slice of material—concrete, wood, stone.
/// Each slab is a self-contained piece that can be embedded, indexed, and
/// retrieved independently.
///
/// ## Offsets
///
/// Primary offsets (`start`/`end`) are byte offsets into the original text,
/// matching Rust's string slicing semantics:
///
/// ```rust
/// use code_chunker::Slab;
///
/// let text = "Hello, world!";
/// let slab = Slab::new("world", 7, 12, 0);
///
/// // The offsets let you recover the original position
/// assert_eq!(&text[slab.start..slab.end], "world");
/// ```
///
/// Character offsets (`char_start`/`char_end`) are automatically populated
/// when using [`Chunker::chunk`](crate::Chunker::chunk). They count Unicode
/// scalar values (`char`s), useful for NLP systems that index by character
/// position. Only `None` when using [`Chunker::chunk_bytes`](crate::Chunker::chunk_bytes)
/// directly.
///
/// ## Overlap Handling
///
/// When chunks overlap, adjacent slabs share some text. The `index` field
/// identifies each slab's position in the sequence:
///
/// ```text
/// Original: "The quick brown fox"
/// Slab 0:   "The quick b"          [0..11]
/// Slab 1:           "k brown fox"  [8..19]  <- overlaps with slab 0
///                    ^^^
///                    overlap region [8..11]
/// ```
///
/// ## Equality and hashing
///
/// Two slabs are equal (and hash identically) only when every field matches:
/// text, byte offsets, character offsets, and index.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Slab {
    /// The chunk text.
    pub text: String,
    /// Byte offset where this chunk starts in the original document.
    pub start: usize,
    /// Byte offset where this chunk ends (exclusive) in the original document.
    pub end: usize,
    /// Character offset where this chunk starts (Unicode scalar values).
    /// `None` until [`with_char_offsets`](Slab::with_char_offsets) or
    /// [`compute_char_offsets`] is called.
    pub char_start: Option<usize>,
    /// Character offset where this chunk ends (exclusive, Unicode scalar values).
    /// `None` under the same conditions as `char_start`.
    pub char_end: Option<usize>,
    /// Zero-based index of this chunk in the sequence.
    pub index: usize,
}
60
61impl Slab {
62 /// Create a new slab (byte offsets only; char offsets unset).
63 #[must_use]
64 pub fn new(text: impl Into<String>, start: usize, end: usize, index: usize) -> Self {
65 debug_assert!(
66 start <= end,
67 "Slab start ({start}) must not exceed end ({end})"
68 );
69 Self {
70 text: text.into(),
71 start,
72 end,
73 char_start: None,
74 char_end: None,
75 index,
76 }
77 }
78
79 /// Set character offsets on this slab.
80 #[must_use]
81 pub fn with_char_offsets(mut self, char_start: usize, char_end: usize) -> Self {
82 self.char_start = Some(char_start);
83 self.char_end = Some(char_end);
84 self
85 }
86
87 /// The length of this chunk in bytes.
88 #[must_use]
89 pub fn len(&self) -> usize {
90 self.text.len()
91 }
92
93 /// The length of this chunk in characters (Unicode scalar values).
94 #[must_use]
95 pub fn char_len(&self) -> usize {
96 self.text.chars().count()
97 }
98
99 /// Whether this chunk is empty.
100 #[must_use]
101 pub fn is_empty(&self) -> bool {
102 self.text.is_empty()
103 }
104
105 /// The byte span of this chunk in the original document.
106 #[must_use]
107 pub fn span(&self) -> std::ops::Range<usize> {
108 self.start..self.end
109 }
110
111 /// The character span, if computed.
112 #[must_use]
113 pub fn char_span(&self) -> Option<std::ops::Range<usize>> {
114 match (self.char_start, self.char_end) {
115 (Some(s), Some(e)) => Some(s..e),
116 _ => None,
117 }
118 }
119}
120
121/// Compute character offsets for a batch of slabs from the same document.
122///
123/// Builds a byte-to-char mapping in a single O(n) pass over the source text,
124/// then fills `char_start`/`char_end` on each slab. This is faster than
125/// per-slab computation when there are many slabs.
126///
127/// # Example
128///
129/// ```rust
130/// use code_chunker::{compute_char_offsets, Slab};
131///
132/// let text = "Hello 日本語 world";
133/// let mut slabs = vec![
134/// Slab::new("Hello ", 0, 6, 0),
135/// Slab::new("日本語", 6, 15, 1),
136/// ];
137/// compute_char_offsets(text, &mut slabs);
138///
139/// assert_eq!(slabs[0].char_start, Some(0));
140/// assert_eq!(slabs[1].char_start, Some(6));
141/// assert_eq!(slabs[1].char_end, Some(9));
142/// ```
143pub fn compute_char_offsets(text: &str, slabs: &mut [Slab]) {
144 if slabs.is_empty() {
145 return;
146 }
147
148 // Build byte->char index in one pass.
149 // byte_to_char[byte_offset] = char_offset for each char boundary.
150 // For non-boundary bytes, the value is undefined (we only look up boundaries).
151 let mut byte_to_char = vec![0usize; text.len() + 1];
152 for (char_idx, (byte_idx, _)) in text.char_indices().enumerate() {
153 byte_to_char[byte_idx] = char_idx;
154 }
155 // Sentinel: byte offset == text.len() maps to total char count.
156 byte_to_char[text.len()] = text.chars().count();
157
158 for slab in slabs.iter_mut() {
159 slab.char_start = Some(byte_to_char[slab.start]);
160 slab.char_end = Some(byte_to_char[slab.end]);
161 }
162}
163
164impl std::fmt::Display for Slab {
165 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
166 if let (Some(cs), Some(ce)) = (self.char_start, self.char_end) {
167 write!(
168 f,
169 "Slab {{ index: {}, bytes: {}..{}, chars: {}..{}, len: {} }}",
170 self.index,
171 self.start,
172 self.end,
173 cs,
174 ce,
175 self.len()
176 )
177 } else {
178 write!(
179 f,
180 "Slab {{ index: {}, span: {}..{}, len: {} }}",
181 self.index,
182 self.start,
183 self.end,
184 self.len()
185 )
186 }
187 }
188}