// rlm_rs/core/chunk.rs

1//! Chunk representation for RLM-RS.
2//!
3//! Chunks are segments of buffer content created by chunking strategies.
4//! Each chunk maintains its position within the original buffer and
5//! metadata for tracking and processing.
6
7use crate::io::{current_timestamp, find_char_boundary};
8use serde::{Deserialize, Serialize};
9use std::ops::Range;
10
/// Estimates token count for a text string with improved accuracy.
///
/// A heuristic that weighs word boundaries, punctuation, and character
/// types, giving better estimates than a flat chars/4 rule.
///
/// # Algorithm
///
/// 1. Count whitespace-separated ASCII words (~1.3 tokens each on average)
/// 2. Count ASCII punctuation/operators (often separate tokens)
/// 3. Count non-ASCII characters (CJK uses ~1-2 chars/token)
/// 4. Clamp to a minimum of 1 token for non-empty input
#[must_use]
pub fn estimate_tokens_for_text(text: &str) -> usize {
    if text.is_empty() {
        return 0;
    }

    // Single pass over the chars, carrying (words, punct, non-ascii, in_word).
    // Classification order matters: whitespace first (covers non-ASCII
    // whitespace), then ASCII punctuation, then any other non-ASCII char;
    // remaining ASCII chars open a word only when not already inside one.
    let (word_count, punct_count, non_ascii_chars, _) = text.chars().fold(
        (0usize, 0usize, 0usize, false),
        |(words, punct, wide, in_word), ch| {
            if ch.is_whitespace() {
                (words, punct, wide, false)
            } else if ch.is_ascii_punctuation() {
                (words, punct + 1, wide, false)
            } else if !ch.is_ascii() {
                (words, punct, wide + 1, false)
            } else if in_word {
                (words, punct, wide, true)
            } else {
                (words + 1, punct, wide, true)
            }
        },
    );

    // Heuristics based on tokenizer behavior:
    // - Average English word: ~1.3 tokens (subword tokenization)
    // - Punctuation: ~0.5 tokens (often merged with adjacent tokens)
    // - Non-ASCII: ~1.5 tokens per character (CJK, emoji, etc.)
    let word_tokens = (word_count * 13) / 10;
    let punct_tokens = punct_count.div_ceil(2);
    let non_ascii_tokens = (non_ascii_chars * 3) / 2;

    // Non-empty text is never 0 tokens.
    (word_tokens + punct_tokens + non_ascii_tokens).max(1)
}
59
/// Represents a chunk of text from a buffer.
///
/// Chunks are created by chunking strategies and contain a portion of
/// buffer content along with metadata about their position and origin.
///
/// # Examples
///
/// ```
/// use rlm_rs::core::Chunk;
///
/// let chunk = Chunk::new(
///     1,
///     "Hello, world!".to_string(),
///     0..13,
///     0,
/// );
/// assert_eq!(chunk.size(), 13);
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Chunk {
    /// Unique identifier (assigned by storage layer).
    /// `None` until the chunk has been persisted.
    pub id: Option<i64>,

    /// ID of the buffer this chunk belongs to.
    pub buffer_id: i64,

    /// Chunk content.
    pub content: String,

    /// Byte range in the original buffer (half-open: start inclusive,
    /// end exclusive).
    pub byte_range: Range<usize>,

    /// Sequential index within the buffer (0-based).
    pub index: usize,

    /// Chunk metadata.
    pub metadata: ChunkMetadata,
}
98
/// Metadata associated with a chunk.
///
/// `Chunk::new` fills in `created_at` and leaves every other field at
/// its `Default` value; a bare `ChunkMetadata::default()` has
/// `created_at == 0`.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkMetadata {
    /// Chunking strategy that created this chunk.
    pub strategy: Option<String>,

    /// Token count estimate (if available).
    pub token_count: Option<usize>,

    /// Line range in the original buffer (if computed).
    pub line_range: Option<Range<usize>>,

    /// Unix timestamp when chunk was created.
    pub created_at: i64,

    /// Content hash for deduplication.
    /// Set by `Chunk::compute_hash`; format is 16 hex digits.
    pub content_hash: Option<String>,

    /// Whether this chunk overlaps with the previous chunk.
    pub has_overlap: bool,

    /// Custom metadata as JSON string.
    pub custom: Option<String>,
}
123
124impl Chunk {
125    /// Creates a new chunk.
126    ///
127    /// # Arguments
128    ///
129    /// * `buffer_id` - ID of the parent buffer.
130    /// * `content` - Chunk content.
131    /// * `byte_range` - Byte range in the original buffer.
132    /// * `index` - Sequential index within the buffer.
133    ///
134    /// # Examples
135    ///
136    /// ```
137    /// use rlm_rs::core::Chunk;
138    ///
139    /// let chunk = Chunk::new(1, "content".to_string(), 0..7, 0);
140    /// assert_eq!(chunk.buffer_id, 1);
141    /// assert_eq!(chunk.index, 0);
142    /// ```
143    #[must_use]
144    pub fn new(buffer_id: i64, content: String, byte_range: Range<usize>, index: usize) -> Self {
145        Self {
146            id: None,
147            buffer_id,
148            content,
149            byte_range,
150            index,
151            metadata: ChunkMetadata {
152                created_at: current_timestamp(),
153                ..Default::default()
154            },
155        }
156    }
157
158    /// Creates a chunk with a specific strategy name.
159    ///
160    /// # Arguments
161    ///
162    /// * `buffer_id` - ID of the parent buffer.
163    /// * `content` - Chunk content.
164    /// * `byte_range` - Byte range in the original buffer.
165    /// * `index` - Sequential index within the buffer.
166    /// * `strategy` - Name of the chunking strategy.
167    #[must_use]
168    pub fn with_strategy(
169        buffer_id: i64,
170        content: String,
171        byte_range: Range<usize>,
172        index: usize,
173        strategy: &str,
174    ) -> Self {
175        let mut chunk = Self::new(buffer_id, content, byte_range, index);
176        chunk.metadata.strategy = Some(strategy.to_string());
177        chunk
178    }
179
180    /// Returns the size of the chunk in bytes.
181    #[must_use]
182    pub const fn size(&self) -> usize {
183        self.content.len()
184    }
185
186    /// Returns the byte range size.
187    #[must_use]
188    pub const fn range_size(&self) -> usize {
189        self.byte_range.end - self.byte_range.start
190    }
191
192    /// Checks if the chunk is empty.
193    #[must_use]
194    pub const fn is_empty(&self) -> bool {
195        self.content.is_empty()
196    }
197
198    /// Returns the start byte offset in the original buffer.
199    #[must_use]
200    pub const fn start(&self) -> usize {
201        self.byte_range.start
202    }
203
204    /// Returns the end byte offset in the original buffer.
205    #[must_use]
206    pub const fn end(&self) -> usize {
207        self.byte_range.end
208    }
209
210    /// Sets the token count estimate.
211    pub const fn set_token_count(&mut self, count: usize) {
212        self.metadata.token_count = Some(count);
213    }
214
215    /// Estimates token count using a simple heuristic.
216    ///
217    /// Uses the approximation of ~4 characters per token for ASCII text.
218    /// For a more accurate estimate, use [`Self::estimate_tokens_accurate`].
219    ///
220    /// # Accuracy
221    ///
222    /// This simple method is typically accurate within 20-30% for English text
223    /// and code. It tends to undercount for text with many short words and
224    /// overcount for text with long technical terms.
225    #[must_use]
226    pub const fn estimate_tokens(&self) -> usize {
227        // Common approximation: ~4 chars per token
228        self.content.len().div_ceil(4)
229    }
230
231    /// Estimates token count with improved accuracy.
232    ///
233    /// Uses a more sophisticated heuristic that accounts for:
234    /// - Word boundaries (whitespace-separated tokens)
235    /// - Punctuation and operators (often separate tokens)
236    /// - Non-ASCII characters (typically 1-2 chars per token)
237    ///
238    /// # Accuracy
239    ///
240    /// This method is typically accurate within 10-15% for mixed content.
241    /// For production use requiring exact counts, consider integrating
242    /// a proper tokenizer like `tiktoken-rs`.
243    ///
244    /// # Performance
245    ///
246    /// This method iterates over the content string, so it's O(n) where
247    /// n is the content length. For very large chunks, the simple
248    /// [`Self::estimate_tokens`] method may be preferred.
249    #[must_use]
250    pub fn estimate_tokens_accurate(&self) -> usize {
251        estimate_tokens_for_text(&self.content)
252    }
253
254    /// Sets the line range in the original buffer.
255    pub const fn set_line_range(&mut self, start_line: usize, end_line: usize) {
256        self.metadata.line_range = Some(start_line..end_line);
257    }
258
259    /// Marks this chunk as having overlap with the previous chunk.
260    pub const fn set_has_overlap(&mut self, has_overlap: bool) {
261        self.metadata.has_overlap = has_overlap;
262    }
263
264    /// Computes and sets the content hash.
265    pub fn compute_hash(&mut self) {
266        use std::collections::hash_map::DefaultHasher;
267        use std::hash::{Hash, Hasher};
268
269        let mut hasher = DefaultHasher::new();
270        self.content.hash(&mut hasher);
271        self.metadata.content_hash = Some(format!("{:016x}", hasher.finish()));
272    }
273
274    /// Returns a preview of the chunk content (first N characters).
275    ///
276    /// # Arguments
277    ///
278    /// * `max_len` - Maximum number of characters to include.
279    #[must_use]
280    pub fn preview(&self, max_len: usize) -> &str {
281        if self.content.len() <= max_len {
282            &self.content
283        } else {
284            let end = find_char_boundary(&self.content, max_len);
285            &self.content[..end]
286        }
287    }
288
289    /// Checks if this chunk's byte range overlaps with another range.
290    #[must_use]
291    pub const fn overlaps_with(&self, other_range: &Range<usize>) -> bool {
292        self.byte_range.start < other_range.end && other_range.start < self.byte_range.end
293    }
294
295    /// Checks if this chunk's byte range contains a specific byte offset.
296    #[must_use]
297    pub fn contains_offset(&self, offset: usize) -> bool {
298        self.byte_range.contains(&offset)
299    }
300}
301
/// Builder for creating chunks with fluent API.
///
/// All fields are optional; see [`ChunkBuilder::build`] for the defaults
/// applied to anything left unset.
#[derive(Debug, Default)]
pub struct ChunkBuilder {
    // Target buffer ID (defaults to 0 at build time).
    buffer_id: Option<i64>,
    // Chunk content (defaults to an empty string).
    content: Option<String>,
    // Byte range in the buffer (defaults to 0..content.len()).
    byte_range: Option<Range<usize>>,
    // Sequential index within the buffer (defaults to 0).
    index: Option<usize>,
    // Optional strategy name stored in metadata.
    strategy: Option<String>,
    // Optional token count stored in metadata.
    token_count: Option<usize>,
    // Optional line range stored in metadata.
    line_range: Option<Range<usize>>,
    // Overlap flag stored in metadata (defaults to false).
    has_overlap: bool,
}
314
315impl ChunkBuilder {
316    /// Creates a new chunk builder.
317    #[must_use]
318    pub fn new() -> Self {
319        Self::default()
320    }
321
322    /// Sets the buffer ID.
323    #[must_use]
324    pub const fn buffer_id(mut self, id: i64) -> Self {
325        self.buffer_id = Some(id);
326        self
327    }
328
329    /// Sets the content.
330    #[must_use]
331    pub fn content(mut self, content: String) -> Self {
332        self.content = Some(content);
333        self
334    }
335
336    /// Sets the byte range.
337    #[must_use]
338    pub const fn byte_range(mut self, range: Range<usize>) -> Self {
339        self.byte_range = Some(range);
340        self
341    }
342
343    /// Sets the index.
344    #[must_use]
345    pub const fn index(mut self, index: usize) -> Self {
346        self.index = Some(index);
347        self
348    }
349
350    /// Sets the strategy name.
351    #[must_use]
352    pub fn strategy(mut self, strategy: &str) -> Self {
353        self.strategy = Some(strategy.to_string());
354        self
355    }
356
357    /// Sets the token count.
358    #[must_use]
359    pub const fn token_count(mut self, count: usize) -> Self {
360        self.token_count = Some(count);
361        self
362    }
363
364    /// Sets the line range.
365    #[must_use]
366    pub const fn line_range(mut self, range: Range<usize>) -> Self {
367        self.line_range = Some(range);
368        self
369    }
370
371    /// Sets whether this chunk has overlap.
372    #[must_use]
373    pub const fn has_overlap(mut self, has_overlap: bool) -> Self {
374        self.has_overlap = has_overlap;
375        self
376    }
377
378    /// Builds the chunk.
379    ///
380    /// # Default Behavior
381    ///
382    /// If optional fields are not set, defaults are applied:
383    /// - `buffer_id`: 0
384    /// - `content`: empty string
385    /// - `byte_range`: `0..content.len()`
386    /// - `index`: 0
387    #[must_use]
388    pub fn build(self) -> Chunk {
389        let buffer_id = self.buffer_id.unwrap_or(0);
390        let content = self.content.unwrap_or_default();
391        let byte_range = self.byte_range.unwrap_or(0..content.len());
392        let index = self.index.unwrap_or(0);
393
394        let mut chunk = Chunk::new(buffer_id, content, byte_range, index);
395
396        if let Some(strategy) = self.strategy {
397            chunk.metadata.strategy = Some(strategy);
398        }
399        if let Some(count) = self.token_count {
400            chunk.metadata.token_count = Some(count);
401        }
402        if let Some(range) = self.line_range {
403            chunk.metadata.line_range = Some(range);
404        }
405        chunk.metadata.has_overlap = self.has_overlap;
406
407        chunk
408    }
409}
410
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_chunk_new() {
        let chunk = Chunk::new(1, "Hello".to_string(), 0..5, 0);
        assert_eq!(chunk.buffer_id, 1);
        assert_eq!(chunk.content, "Hello");
        assert_eq!(chunk.byte_range, 0..5);
        assert_eq!(chunk.index, 0);
        assert!(chunk.id.is_none());
    }

    #[test]
    fn test_chunk_with_strategy() {
        let chunk = Chunk::with_strategy(1, "content".to_string(), 0..7, 0, "semantic");
        assert_eq!(chunk.metadata.strategy, Some("semantic".to_string()));
    }

    #[test]
    fn test_chunk_size() {
        let chunk = Chunk::new(1, "Hello, world!".to_string(), 0..13, 0);
        assert_eq!(chunk.size(), 13);
        assert_eq!(chunk.range_size(), 13);
    }

    #[test]
    fn test_chunk_offsets() {
        let chunk = Chunk::new(1, "world".to_string(), 7..12, 1);
        assert_eq!(chunk.start(), 7);
        assert_eq!(chunk.end(), 12);
    }

    #[test]
    fn test_chunk_estimate_tokens() {
        let chunk = Chunk::new(1, "Hello, world!".to_string(), 0..13, 0);
        // 13 chars / 4 ≈ 3-4 tokens
        assert!(chunk.estimate_tokens() >= 3);
        assert!(chunk.estimate_tokens() <= 4);
    }

    #[test]
    fn test_chunk_estimate_tokens_accurate() {
        // Simple English text: 2 words + 2 punctuation
        let chunk = Chunk::new(1, "Hello, world!".to_string(), 0..13, 0);
        let accurate = chunk.estimate_tokens_accurate();
        // Should be around 3-4 tokens (2 words * 1.3 + 2 punct * 0.5)
        assert!(accurate >= 2, "Expected >= 2, got {accurate}");
        assert!(accurate <= 5, "Expected <= 5, got {accurate}");
    }

    #[test]
    fn test_estimate_tokens_for_text() {
        // Empty text
        assert_eq!(estimate_tokens_for_text(""), 0);

        // Single word
        let single = estimate_tokens_for_text("hello");
        assert!(single >= 1);

        // Multiple words
        let words = estimate_tokens_for_text("the quick brown fox");
        assert!(words >= 4, "Expected >= 4 for 4 words, got {words}");

        // Code-like content with punctuation
        let code = estimate_tokens_for_text("fn main() { println!(\"hello\"); }");
        assert!(code >= 5, "Expected >= 5 for code, got {code}");

        // Non-ASCII (CJK characters)
        let cjk = estimate_tokens_for_text("你好世界");
        assert!(cjk >= 4, "Expected >= 4 for 4 CJK chars, got {cjk}");
    }

    #[test]
    fn test_chunk_preview() {
        let chunk = Chunk::new(1, "Hello, world!".to_string(), 0..13, 0);
        assert_eq!(chunk.preview(5), "Hello");
        assert_eq!(chunk.preview(100), "Hello, world!");
    }

    #[test]
    fn test_chunk_overlaps_with() {
        let chunk = Chunk::new(1, "test".to_string(), 10..20, 0);
        assert!(chunk.overlaps_with(&(15..25)));
        assert!(chunk.overlaps_with(&(5..15)));
        assert!(!chunk.overlaps_with(&(20..30)));
        assert!(!chunk.overlaps_with(&(0..10)));
    }

    #[test]
    fn test_chunk_contains_offset() {
        let chunk = Chunk::new(1, "test".to_string(), 10..20, 0);
        assert!(chunk.contains_offset(10));
        assert!(chunk.contains_offset(15));
        assert!(!chunk.contains_offset(20));
        assert!(!chunk.contains_offset(5));
    }

    #[test]
    fn test_chunk_hash() {
        // Identical content must hash identically regardless of buffer ID.
        let mut chunk1 = Chunk::new(1, "Hello".to_string(), 0..5, 0);
        let mut chunk2 = Chunk::new(2, "Hello".to_string(), 0..5, 0);
        chunk1.compute_hash();
        chunk2.compute_hash();
        assert_eq!(chunk1.metadata.content_hash, chunk2.metadata.content_hash);
    }

    #[test]
    fn test_chunk_builder() {
        let chunk = ChunkBuilder::new()
            .buffer_id(1)
            .content("test".to_string())
            .byte_range(0..4)
            .index(0)
            .strategy("fixed")
            .token_count(1)
            .line_range(0..1)
            .has_overlap(true)
            .build();

        assert_eq!(chunk.buffer_id, 1);
        assert_eq!(chunk.content, "test");
        assert_eq!(chunk.metadata.strategy, Some("fixed".to_string()));
        assert_eq!(chunk.metadata.token_count, Some(1));
        assert_eq!(chunk.metadata.line_range, Some(0..1));
        assert!(chunk.metadata.has_overlap);
    }

    #[test]
    fn test_chunk_serialization() {
        let chunk = Chunk::new(1, "test".to_string(), 0..4, 0);
        let json = serde_json::to_string(&chunk);
        assert!(json.is_ok());

        let deserialized: Result<Chunk, _> = serde_json::from_str(&json.unwrap());
        assert!(deserialized.is_ok());
        assert_eq!(deserialized.unwrap().content, "test");
    }

    #[test]
    fn test_chunk_empty() {
        let chunk = Chunk::new(1, String::new(), 0..0, 0);
        assert!(chunk.is_empty());
        assert_eq!(chunk.size(), 0);
    }

    #[test]
    fn test_chunk_set_line_range() {
        // set_line_range should record the given half-open range in metadata.
        let mut chunk = Chunk::new(1, "test".to_string(), 0..4, 0);
        chunk.set_line_range(5, 10);
        assert_eq!(chunk.metadata.line_range, Some(5..10));
    }

    #[test]
    fn test_find_char_boundary_at_end() {
        // When pos >= s.len(), find_char_boundary clamps to the string length.
        let s = "hello";
        assert_eq!(find_char_boundary(s, 10), 5);
        assert_eq!(find_char_boundary(s, 5), 5);
    }

    #[test]
    fn test_find_char_boundary_in_multibyte() {
        // Inside a multibyte char, find_char_boundary backs up to the
        // previous valid char boundary.
        let s = "Hello 世界!";
        // '世' is at bytes 6-8, '界' is at bytes 9-11
        assert_eq!(find_char_boundary(s, 7), 6); // Middle of '世', should go back
        assert_eq!(find_char_boundary(s, 8), 6); // Still middle of '世'
        assert_eq!(find_char_boundary(s, 9), 9); // Start of '界'
    }
}