1use crate::io::{current_timestamp, find_char_boundary};
8use serde::{Deserialize, Serialize};
9use std::ops::Range;
10
/// Estimates how many tokens `text` would occupy using a character-class
/// heuristic.
///
/// Every character falls into exactly one class, checked in this order:
///
/// 1. whitespace (ASCII or Unicode) contributes nothing and ends a word;
/// 2. ASCII punctuation is counted and ends a word;
/// 3. any other non-ASCII character is counted individually and ends a word;
/// 4. everything else (ASCII letters, digits, control characters) is part of
///    a word; each maximal run of such characters counts as one word.
///
/// The estimate is `words * 13 / 10 + ceil(punctuation / 2) +
/// non_ascii * 3 / 2`, using integer arithmetic.
///
/// # Returns
///
/// `0` for an empty string, otherwise at least `1` (so whitespace-only input
/// yields `1`).
#[must_use]
pub fn estimate_tokens_for_text(text: &str) -> usize {
    if text.is_empty() {
        return 0;
    }

    let (mut words, mut puncts, mut wide_chars) = (0_usize, 0_usize, 0_usize);
    let mut inside_word = false;

    for c in text.chars() {
        // Classify the character; only the last arm keeps a word going.
        let is_word_char = match c {
            space if space.is_whitespace() => false,
            punct if punct.is_ascii_punctuation() => {
                puncts += 1;
                false
            }
            wide if !wide.is_ascii() => {
                wide_chars += 1;
                false
            }
            _ => true,
        };

        // A word is counted once, on the transition into it.
        if is_word_char && !inside_word {
            words += 1;
        }
        inside_word = is_word_char;
    }

    let from_words = (words * 13) / 10;
    let from_puncts = puncts.div_ceil(2);
    let from_wide = (wide_chars * 3) / 2;

    // Non-empty input never reports zero tokens.
    (from_words + from_puncts + from_wide).max(1)
}
59
/// A piece of text together with its byte range, ordinal index and metadata.
///
/// Serializable and deserializable through serde.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Chunk {
    /// Identifier of this chunk. `Chunk::new` leaves it `None`.
    /// NOTE(review): presumably filled in once the chunk is persisted — confirm.
    pub id: Option<i64>,

    /// Identifier of the buffer this chunk is associated with.
    pub buffer_id: i64,

    /// The chunk's text.
    pub content: String,

    /// Half-open byte range (`start..end`) attached to this chunk.
    /// NOTE(review): presumably offsets into the owning buffer; nothing here
    /// checks that its length matches `content.len()` — confirm.
    pub byte_range: Range<usize>,

    /// Ordinal index of this chunk.
    /// NOTE(review): presumably its position within the buffer's chunk
    /// sequence — confirm.
    pub index: usize,

    /// Additional, mostly optional, information about the chunk.
    pub metadata: ChunkMetadata,
}
98
/// Auxiliary information attached to a [`Chunk`].
///
/// `Default` yields all options `None`, `created_at == 0` and
/// `has_overlap == false`.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkMetadata {
    /// Name of the chunking strategy, as recorded by `Chunk::with_strategy`
    /// or `ChunkBuilder::strategy`.
    pub strategy: Option<String>,

    /// Token count, when one has been recorded (see `Chunk::set_token_count`).
    pub token_count: Option<usize>,

    /// Line range stored as `start_line..end_line` (see
    /// `Chunk::set_line_range`).
    pub line_range: Option<Range<usize>>,

    /// Creation timestamp; `Chunk::new` fills it from `current_timestamp()`.
    /// NOTE(review): unit and epoch are defined by `current_timestamp`, which
    /// is not visible here — confirm.
    pub created_at: i64,

    /// Hash of the chunk content as written by `Chunk::compute_hash`
    /// (16 lowercase hexadecimal digits); `None` until computed.
    pub content_hash: Option<String>,

    /// Overlap flag (see `Chunk::set_has_overlap`).
    /// NOTE(review): presumably marks a chunk that shares text with a
    /// neighbouring chunk — confirm.
    pub has_overlap: bool,

    /// Optional free-form string; nothing in this file reads or writes it.
    pub custom: Option<String>,
}
123
124impl Chunk {
125 #[must_use]
144 pub fn new(buffer_id: i64, content: String, byte_range: Range<usize>, index: usize) -> Self {
145 Self {
146 id: None,
147 buffer_id,
148 content,
149 byte_range,
150 index,
151 metadata: ChunkMetadata {
152 created_at: current_timestamp(),
153 ..Default::default()
154 },
155 }
156 }
157
158 #[must_use]
168 pub fn with_strategy(
169 buffer_id: i64,
170 content: String,
171 byte_range: Range<usize>,
172 index: usize,
173 strategy: &str,
174 ) -> Self {
175 let mut chunk = Self::new(buffer_id, content, byte_range, index);
176 chunk.metadata.strategy = Some(strategy.to_string());
177 chunk
178 }
179
180 #[must_use]
182 pub const fn size(&self) -> usize {
183 self.content.len()
184 }
185
186 #[must_use]
188 pub const fn range_size(&self) -> usize {
189 self.byte_range.end - self.byte_range.start
190 }
191
192 #[must_use]
194 pub const fn is_empty(&self) -> bool {
195 self.content.is_empty()
196 }
197
198 #[must_use]
200 pub const fn start(&self) -> usize {
201 self.byte_range.start
202 }
203
204 #[must_use]
206 pub const fn end(&self) -> usize {
207 self.byte_range.end
208 }
209
210 pub const fn set_token_count(&mut self, count: usize) {
212 self.metadata.token_count = Some(count);
213 }
214
215 #[must_use]
226 pub const fn estimate_tokens(&self) -> usize {
227 self.content.len().div_ceil(4)
229 }
230
231 #[must_use]
250 pub fn estimate_tokens_accurate(&self) -> usize {
251 estimate_tokens_for_text(&self.content)
252 }
253
254 pub const fn set_line_range(&mut self, start_line: usize, end_line: usize) {
256 self.metadata.line_range = Some(start_line..end_line);
257 }
258
259 pub const fn set_has_overlap(&mut self, has_overlap: bool) {
261 self.metadata.has_overlap = has_overlap;
262 }
263
264 pub fn compute_hash(&mut self) {
266 use std::collections::hash_map::DefaultHasher;
267 use std::hash::{Hash, Hasher};
268
269 let mut hasher = DefaultHasher::new();
270 self.content.hash(&mut hasher);
271 self.metadata.content_hash = Some(format!("{:016x}", hasher.finish()));
272 }
273
274 #[must_use]
280 pub fn preview(&self, max_len: usize) -> &str {
281 if self.content.len() <= max_len {
282 &self.content
283 } else {
284 let end = find_char_boundary(&self.content, max_len);
285 &self.content[..end]
286 }
287 }
288
289 #[must_use]
291 pub const fn overlaps_with(&self, other_range: &Range<usize>) -> bool {
292 self.byte_range.start < other_range.end && other_range.start < self.byte_range.end
293 }
294
295 #[must_use]
297 pub fn contains_offset(&self, offset: usize) -> bool {
298 self.byte_range.contains(&offset)
299 }
300}
301
/// Step-by-step builder for [`Chunk`].
///
/// Every field starts unset (`None` / `false`); `build` substitutes defaults
/// for anything that was never supplied.
#[derive(Debug, Default)]
pub struct ChunkBuilder {
    // Becomes `Chunk::buffer_id`; `build` falls back to 0.
    buffer_id: Option<i64>,
    // Becomes `Chunk::content`; `build` falls back to the empty string.
    content: Option<String>,
    // Becomes `Chunk::byte_range`; `build` falls back to `0..content.len()`.
    byte_range: Option<Range<usize>>,
    // Becomes `Chunk::index`; `build` falls back to 0.
    index: Option<usize>,
    // Copied into `metadata.strategy` when set.
    strategy: Option<String>,
    // Copied into `metadata.token_count` when set.
    token_count: Option<usize>,
    // Copied into `metadata.line_range` when set.
    line_range: Option<Range<usize>>,
    // Always copied into `metadata.has_overlap`.
    has_overlap: bool,
}
314
315impl ChunkBuilder {
316 #[must_use]
318 pub fn new() -> Self {
319 Self::default()
320 }
321
322 #[must_use]
324 pub const fn buffer_id(mut self, id: i64) -> Self {
325 self.buffer_id = Some(id);
326 self
327 }
328
329 #[must_use]
331 pub fn content(mut self, content: String) -> Self {
332 self.content = Some(content);
333 self
334 }
335
336 #[must_use]
338 pub const fn byte_range(mut self, range: Range<usize>) -> Self {
339 self.byte_range = Some(range);
340 self
341 }
342
343 #[must_use]
345 pub const fn index(mut self, index: usize) -> Self {
346 self.index = Some(index);
347 self
348 }
349
350 #[must_use]
352 pub fn strategy(mut self, strategy: &str) -> Self {
353 self.strategy = Some(strategy.to_string());
354 self
355 }
356
357 #[must_use]
359 pub const fn token_count(mut self, count: usize) -> Self {
360 self.token_count = Some(count);
361 self
362 }
363
364 #[must_use]
366 pub const fn line_range(mut self, range: Range<usize>) -> Self {
367 self.line_range = Some(range);
368 self
369 }
370
371 #[must_use]
373 pub const fn has_overlap(mut self, has_overlap: bool) -> Self {
374 self.has_overlap = has_overlap;
375 self
376 }
377
378 #[must_use]
388 pub fn build(self) -> Chunk {
389 let buffer_id = self.buffer_id.unwrap_or(0);
390 let content = self.content.unwrap_or_default();
391 let byte_range = self.byte_range.unwrap_or(0..content.len());
392 let index = self.index.unwrap_or(0);
393
394 let mut chunk = Chunk::new(buffer_id, content, byte_range, index);
395
396 if let Some(strategy) = self.strategy {
397 chunk.metadata.strategy = Some(strategy);
398 }
399 if let Some(count) = self.token_count {
400 chunk.metadata.token_count = Some(count);
401 }
402 if let Some(range) = self.line_range {
403 chunk.metadata.line_range = Some(range);
404 }
405 chunk.metadata.has_overlap = self.has_overlap;
406
407 chunk
408 }
409}
410
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared fixture: a chunk holding "Hello, world!" at bytes `0..13`.
    fn greeting() -> Chunk {
        Chunk::new(1, "Hello, world!".to_string(), 0..13, 0)
    }

    /// `Chunk::new` stores its arguments and leaves `id` unset.
    #[test]
    fn test_chunk_new() {
        let made = Chunk::new(1, "Hello".to_string(), 0..5, 0);
        assert_eq!((made.buffer_id, made.content.as_str(), made.index), (1, "Hello", 0));
        assert_eq!(made.byte_range, 0..5);
        assert_eq!(made.id, None);
    }

    /// `with_strategy` records the strategy name in the metadata.
    #[test]
    fn test_chunk_with_strategy() {
        let made = Chunk::with_strategy(1, "content".to_string(), 0..7, 0, "semantic");
        assert_eq!(made.metadata.strategy.as_deref(), Some("semantic"));
    }

    /// Content size and range size agree for a matching range.
    #[test]
    fn test_chunk_size() {
        let hello = greeting();
        assert_eq!(hello.size(), 13);
        assert_eq!(hello.range_size(), 13);
    }

    /// `start` / `end` expose the byte range bounds.
    #[test]
    fn test_chunk_offsets() {
        let made = Chunk::new(1, "world".to_string(), 7..12, 1);
        assert_eq!((made.start(), made.end()), (7, 12));
    }

    /// The cheap estimate for 13 bytes lands between 3 and 4 tokens.
    #[test]
    fn test_chunk_estimate_tokens() {
        let estimate = greeting().estimate_tokens();
        assert!(estimate >= 3);
        assert!(estimate <= 4);
    }

    /// The heuristic estimate for a short greeting stays in a sane band.
    #[test]
    fn test_chunk_estimate_tokens_accurate() {
        let accurate = greeting().estimate_tokens_accurate();
        assert!(accurate >= 2, "Expected >= 2, got {accurate}");
        assert!(accurate <= 5, "Expected <= 5, got {accurate}");
    }

    /// Lower bounds of the free-standing estimator across input kinds.
    #[test]
    fn test_estimate_tokens_for_text() {
        assert_eq!(estimate_tokens_for_text(""), 0);

        let single = estimate_tokens_for_text("hello");
        assert!(single >= 1);

        let words = estimate_tokens_for_text("the quick brown fox");
        assert!(words >= 4, "Expected >= 4 for 4 words, got {words}");

        let code = estimate_tokens_for_text("fn main() { println!(\"hello\"); }");
        assert!(code >= 5, "Expected >= 5 for code, got {code}");

        let cjk = estimate_tokens_for_text("你好世界");
        assert!(cjk >= 4, "Expected >= 4 for 4 CJK chars, got {cjk}");
    }

    /// `preview` truncates long content and returns short content whole.
    #[test]
    fn test_chunk_preview() {
        let hello = greeting();
        for (limit, expected) in [(5, "Hello"), (100, "Hello, world!")] {
            assert_eq!(hello.preview(limit), expected);
        }
    }

    /// Overlap is half-open: touching ranges do not overlap.
    #[test]
    fn test_chunk_overlaps_with() {
        let made = Chunk::new(1, "test".to_string(), 10..20, 0);
        let cases = [(15..25, true), (5..15, true), (20..30, false), (0..10, false)];
        for (other, expected) in cases {
            assert_eq!(made.overlaps_with(&other), expected);
        }
    }

    /// Offsets are contained for `start <= offset < end`.
    #[test]
    fn test_chunk_contains_offset() {
        let made = Chunk::new(1, "test".to_string(), 10..20, 0);
        for (offset, expected) in [(10, true), (15, true), (20, false), (5, false)] {
            assert_eq!(made.contains_offset(offset), expected);
        }
    }

    /// Equal content hashes equally even when the buffers differ.
    #[test]
    fn test_chunk_hash() {
        let mut first = Chunk::new(1, "Hello".to_string(), 0..5, 0);
        let mut second = Chunk::new(2, "Hello".to_string(), 0..5, 0);
        first.compute_hash();
        second.compute_hash();
        assert_eq!(first.metadata.content_hash, second.metadata.content_hash);
    }

    /// Every builder setter reaches the built chunk.
    #[test]
    fn test_chunk_builder() {
        let built = ChunkBuilder::new()
            .buffer_id(1)
            .content("test".to_string())
            .byte_range(0..4)
            .index(0)
            .strategy("fixed")
            .token_count(1)
            .line_range(0..1)
            .has_overlap(true)
            .build();

        assert_eq!(built.buffer_id, 1);
        assert_eq!(built.content, "test");

        let meta = &built.metadata;
        assert_eq!(meta.strategy.as_deref(), Some("fixed"));
        assert_eq!(meta.token_count, Some(1));
        assert_eq!(meta.line_range, Some(0..1));
        assert!(meta.has_overlap);
    }

    /// A chunk survives a JSON round trip.
    #[test]
    fn test_chunk_serialization() {
        let source = Chunk::new(1, "test".to_string(), 0..4, 0);
        let encoded = serde_json::to_string(&source);
        assert!(encoded.is_ok());

        let decoded = serde_json::from_str::<Chunk>(&encoded.unwrap());
        assert!(decoded.is_ok());
        assert_eq!(decoded.unwrap().content, "test");
    }

    /// An empty chunk reports itself as empty with size zero.
    #[test]
    fn test_chunk_empty() {
        let blank = Chunk::new(1, String::new(), 0..0, 0);
        assert!(blank.is_empty());
        assert_eq!(blank.size(), 0);
    }

    /// `set_line_range` stores `start..end` in the metadata.
    #[test]
    fn test_chunk_set_line_range() {
        let mut made = Chunk::new(1, "test".to_string(), 0..4, 0);
        made.set_line_range(5, 10);
        assert_eq!(made.metadata.line_range, Some(5..10));
    }

    /// Positions at or past the end clamp to the string length.
    #[test]
    fn test_find_char_boundary_at_end() {
        let text = "hello";
        for (pos, expected) in [(10, 5), (5, 5)] {
            assert_eq!(find_char_boundary(text, pos), expected);
        }
    }

    /// Positions inside a multi-byte character round down to its start.
    #[test]
    fn test_find_char_boundary_in_multibyte() {
        let text = "Hello 世界!";
        for (pos, expected) in [(7, 6), (8, 6), (9, 9)] {
            assert_eq!(find_char_boundary(text, pos), expected);
        }
    }
}