use crate::io::{current_timestamp, find_char_boundary};
use serde::{Deserialize, Serialize};
use std::ops::Range;
/// Estimates the number of tokens in `text` with a character-class heuristic.
///
/// Every character is classified, in this order, as whitespace, ASCII
/// punctuation, non-ASCII, or "other" (ASCII letters, digits, control
/// characters). Each maximal run of "other" characters counts as one word.
/// The estimate is the sum of:
///
/// - `words * 13 / 10` (integer division),
/// - half the ASCII punctuation count, rounded up,
/// - `non_ascii * 3 / 2` (integer division).
///
/// Returns `0` for an empty string and at least `1` for any non-empty one.
#[must_use]
pub fn estimate_tokens_for_text(text: &str) -> usize {
    if text.is_empty() {
        return 0;
    }

    let mut words = 0_usize;
    let mut punctuation = 0_usize;
    let mut non_ascii = 0_usize;
    // True while the previously seen character was part of a word run.
    let mut inside_word = false;

    for c in text.chars() {
        // Guard order matters: non-ASCII whitespace (e.g. U+3000) must be
        // treated as whitespace, not counted as a non-ASCII character.
        inside_word = match c {
            _ if c.is_whitespace() => false,
            _ if c.is_ascii_punctuation() => {
                punctuation += 1;
                false
            }
            _ if !c.is_ascii() => {
                non_ascii += 1;
                false
            }
            _ => {
                // Only the first character of a run starts a new word.
                if !inside_word {
                    words += 1;
                }
                true
            }
        };
    }

    let estimate = words * 13 / 10 + punctuation.div_ceil(2) + non_ascii * 3 / 2;
    // Non-empty input (even whitespace only) never reports zero tokens.
    estimate.max(1)
}
/// A piece of text tied to a buffer, together with its byte range, index and metadata.
///
/// Nothing in this type enforces that `byte_range` and `content` agree in length;
/// `Chunk::size` and `Chunk::range_size` report the two independently.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Chunk {
/// Identifier of the chunk itself; `Chunk::new` always leaves it `None`.
/// NOTE(review): presumably filled in once the chunk is persisted — confirm.
pub id: Option<i64>,
/// Identifier of the buffer this chunk is associated with.
pub buffer_id: i64,
/// The chunk's text.
pub content: String,
/// Byte range associated with `content`.
/// NOTE(review): presumably offsets into the originating buffer — confirm.
pub byte_range: Range<usize>,
/// Index supplied by the creator of the chunk.
/// NOTE(review): presumably the chunk's ordinal position within its buffer — confirm.
pub index: usize,
/// Optional descriptive data (strategy, token count, line range, hash, ...).
pub metadata: ChunkMetadata,
}
/// Optional, descriptive information attached to a [`Chunk`].
///
/// The derived `Default` yields `None`, `0` or `false` for every field.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkMetadata {
/// Name of the strategy recorded for the chunk (set by `Chunk::with_strategy`
/// or `ChunkBuilder::strategy`); `None` otherwise.
pub strategy: Option<String>,
/// Token count recorded via `Chunk::set_token_count` or the builder; `None` until set.
pub token_count: Option<usize>,
/// Line range recorded via `Chunk::set_line_range` as `start_line..end_line`.
/// NOTE(review): whether lines are 0- or 1-based is not visible here — confirm.
pub line_range: Option<Range<usize>>,
/// Creation time; `Chunk::new` fills it from `current_timestamp()`.
/// NOTE(review): unit and epoch are defined by `crate::io::current_timestamp` — confirm.
pub created_at: i64,
/// Content hash written by `Chunk::compute_hash` as 16 lowercase hex digits;
/// `None` until that method is called.
pub content_hash: Option<String>,
/// Overlap flag; `false` by default.
/// NOTE(review): presumably marks chunks that share text with a neighbour — confirm.
pub has_overlap: bool,
/// Free-form extra data; not written by any code visible in this file.
pub custom: Option<String>,
}
impl Chunk {
    /// Creates a chunk with no `id` and default metadata, except for
    /// `created_at`, which is taken from `current_timestamp()`.
    ///
    /// `byte_range` is stored as given; it is not checked against `content`.
    #[must_use]
    pub fn new(buffer_id: i64, content: String, byte_range: Range<usize>, index: usize) -> Self {
        Self {
            id: None,
            buffer_id,
            content,
            byte_range,
            index,
            metadata: ChunkMetadata {
                created_at: current_timestamp(),
                ..Default::default()
            },
        }
    }

    /// Same as [`Chunk::new`], additionally recording `strategy` in the metadata.
    #[must_use]
    pub fn with_strategy(
        buffer_id: i64,
        content: String,
        byte_range: Range<usize>,
        index: usize,
        strategy: &str,
    ) -> Self {
        let mut chunk = Self::new(buffer_id, content, byte_range, index);
        chunk.metadata.strategy = Some(strategy.to_string());
        chunk
    }

    /// Returns the length of `content` in bytes.
    #[must_use]
    pub const fn size(&self) -> usize {
        self.content.len()
    }

    /// Returns the length of `byte_range` in bytes.
    ///
    /// An inverted range (`start > end`) is empty and therefore reports `0`.
    /// `byte_range` is a public, deserializable field, so such a range can
    /// reach this method; plain subtraction would overflow on it.
    #[must_use]
    pub const fn range_size(&self) -> usize {
        self.byte_range.end.saturating_sub(self.byte_range.start)
    }

    /// Returns `true` when `content` is the empty string.
    #[must_use]
    pub const fn is_empty(&self) -> bool {
        self.content.is_empty()
    }

    /// Returns the start of `byte_range` (inclusive).
    #[must_use]
    pub const fn start(&self) -> usize {
        self.byte_range.start
    }

    /// Returns the end of `byte_range` (exclusive).
    #[must_use]
    pub const fn end(&self) -> usize {
        self.byte_range.end
    }

    /// Records `count` as the chunk's token count in the metadata.
    pub const fn set_token_count(&mut self, count: usize) {
        self.metadata.token_count = Some(count);
    }

    /// Cheap token estimate: the content's byte length divided by 4, rounded up.
    #[must_use]
    pub const fn estimate_tokens(&self) -> usize {
        self.content.len().div_ceil(4)
    }

    /// Token estimate based on [`estimate_tokens_for_text`] applied to `content`.
    #[must_use]
    pub fn estimate_tokens_accurate(&self) -> usize {
        estimate_tokens_for_text(&self.content)
    }

    /// Records `start_line..end_line` as the chunk's line range in the metadata.
    pub const fn set_line_range(&mut self, start_line: usize, end_line: usize) {
        self.metadata.line_range = Some(start_line..end_line);
    }

    /// Sets the metadata's overlap flag.
    pub const fn set_has_overlap(&mut self, has_overlap: bool) {
        self.metadata.has_overlap = has_overlap;
    }

    /// Hashes `content` and stores the result in `metadata.content_hash` as
    /// 16 zero-padded lowercase hex digits.
    ///
    /// Only `content` is hashed, so chunks with equal text get equal hashes
    /// regardless of buffer, range or index.
    ///
    /// The hash comes from the standard library's `DefaultHasher`, whose
    /// algorithm is documented as unspecified and subject to change between
    /// Rust releases; do not rely on stored values staying comparable across
    /// toolchain upgrades.
    pub fn compute_hash(&mut self) {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        self.content.hash(&mut hasher);
        self.metadata.content_hash = Some(format!("{:016x}", hasher.finish()));
    }

    /// Returns a prefix of `content` for display.
    ///
    /// The whole content is returned when it is at most `max_len` bytes long.
    /// Otherwise the content is cut at the index `find_char_boundary` returns
    /// for `max_len`, so the slice never splits a multi-byte character (the
    /// tests in this file show that helper rounding down to a boundary).
    #[must_use]
    pub fn preview(&self, max_len: usize) -> &str {
        if self.content.len() <= max_len {
            &self.content
        } else {
            let end = find_char_boundary(&self.content, max_len);
            &self.content[..end]
        }
    }

    /// Returns `true` when `byte_range` and `other_range` intersect.
    ///
    /// Both ranges are half-open, so ranges that merely touch
    /// (e.g. `10..20` and `20..30`) do not overlap.
    #[must_use]
    pub const fn overlaps_with(&self, other_range: &Range<usize>) -> bool {
        self.byte_range.start < other_range.end && other_range.start < self.byte_range.end
    }

    /// Returns `true` when `offset` lies inside `byte_range`
    /// (start inclusive, end exclusive).
    #[must_use]
    pub fn contains_offset(&self, offset: usize) -> bool {
        self.byte_range.contains(&offset)
    }
}
/// Step-by-step constructor for [`Chunk`].
///
/// Every field starts unset (`None` / `false`); `ChunkBuilder::build` substitutes
/// defaults for whatever was never provided.
#[derive(Debug, Default)]
pub struct ChunkBuilder {
/// Buffer identifier; `build` falls back to `0`.
buffer_id: Option<i64>,
/// Chunk text; `build` falls back to the empty string.
content: Option<String>,
/// Byte range; `build` falls back to `0..content.len()`.
byte_range: Option<Range<usize>>,
/// Chunk index; `build` falls back to `0`.
index: Option<usize>,
/// Strategy name copied into the metadata when present.
strategy: Option<String>,
/// Token count copied into the metadata when present.
token_count: Option<usize>,
/// Line range copied into the metadata when present.
line_range: Option<Range<usize>>,
/// Overlap flag copied into the metadata unconditionally.
has_overlap: bool,
}
impl ChunkBuilder {
    /// Returns a builder with nothing set.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the identifier of the buffer the chunk belongs to.
    #[must_use]
    pub const fn buffer_id(mut self, id: i64) -> Self {
        self.buffer_id = Some(id);
        self
    }

    /// Sets the chunk's text.
    #[must_use]
    pub fn content(mut self, content: String) -> Self {
        self.content = Some(content);
        self
    }

    /// Sets the chunk's byte range.
    #[must_use]
    pub const fn byte_range(mut self, range: Range<usize>) -> Self {
        self.byte_range = Some(range);
        self
    }

    /// Sets the chunk's index.
    #[must_use]
    pub const fn index(mut self, index: usize) -> Self {
        self.index = Some(index);
        self
    }

    /// Sets the strategy name stored in the metadata.
    #[must_use]
    pub fn strategy(mut self, strategy: &str) -> Self {
        self.strategy = Some(String::from(strategy));
        self
    }

    /// Sets the token count stored in the metadata.
    #[must_use]
    pub const fn token_count(mut self, count: usize) -> Self {
        self.token_count = Some(count);
        self
    }

    /// Sets the line range stored in the metadata.
    #[must_use]
    pub const fn line_range(mut self, range: Range<usize>) -> Self {
        self.line_range = Some(range);
        self
    }

    /// Sets the overlap flag stored in the metadata.
    #[must_use]
    pub const fn has_overlap(mut self, has_overlap: bool) -> Self {
        self.has_overlap = has_overlap;
        self
    }

    /// Consumes the builder and produces the [`Chunk`].
    ///
    /// Missing values default to buffer id `0`, empty content, index `0` and a
    /// byte range of `0..content.len()`.
    #[must_use]
    pub fn build(self) -> Chunk {
        let Self {
            buffer_id,
            content,
            byte_range,
            index,
            strategy,
            token_count,
            line_range,
            has_overlap,
        } = self;

        let text = content.unwrap_or_default();
        // The fallback range needs the text length, so resolve it before `text` is moved.
        let range = byte_range.unwrap_or(0..text.len());

        let mut chunk = Chunk::new(buffer_id.unwrap_or(0), text, range, index.unwrap_or(0));

        // `Chunk::new` leaves these metadata fields at `None` / `false`, so a
        // plain assignment is the same as copying only the values that were set.
        let metadata = &mut chunk.metadata;
        metadata.strategy = strategy;
        metadata.token_count = token_count;
        metadata.line_range = line_range;
        metadata.has_overlap = has_overlap;

        chunk
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The "Hello, world!" chunk shared by several tests.
    fn hello_world() -> Chunk {
        Chunk::new(1, "Hello, world!".to_string(), 0..13, 0)
    }

    /// A chunk covering bytes `10..20`, used by the range tests.
    fn ranged_chunk() -> Chunk {
        Chunk::new(1, "test".to_string(), 10..20, 0)
    }

    #[test]
    fn test_chunk_new() {
        let chunk = Chunk::new(1, "Hello".to_string(), 0..5, 0);
        assert_eq!(chunk.buffer_id, 1);
        assert_eq!(chunk.content, "Hello");
        assert_eq!(chunk.byte_range, 0..5);
        assert_eq!(chunk.index, 0);
        assert_eq!(chunk.id, None);
    }

    #[test]
    fn test_chunk_with_strategy() {
        let chunk = Chunk::with_strategy(1, "content".to_string(), 0..7, 0, "semantic");
        assert_eq!(chunk.metadata.strategy.as_deref(), Some("semantic"));
    }

    #[test]
    fn test_chunk_size() {
        let chunk = hello_world();
        assert_eq!(chunk.size(), 13);
        assert_eq!(chunk.range_size(), 13);
    }

    #[test]
    fn test_chunk_offsets() {
        let chunk = Chunk::new(1, "world".to_string(), 7..12, 1);
        assert_eq!(chunk.start(), 7);
        assert_eq!(chunk.end(), 12);
    }

    #[test]
    fn test_chunk_estimate_tokens() {
        let estimate = hello_world().estimate_tokens();
        assert!(estimate >= 3);
        assert!(estimate <= 4);
    }

    #[test]
    fn test_chunk_estimate_tokens_accurate() {
        let accurate = hello_world().estimate_tokens_accurate();
        assert!(accurate >= 2, "Expected >= 2, got {accurate}");
        assert!(accurate <= 5, "Expected <= 5, got {accurate}");
    }

    #[test]
    fn test_estimate_tokens_for_text() {
        // Empty input is the only case that yields zero.
        assert_eq!(estimate_tokens_for_text(""), 0);

        let single = estimate_tokens_for_text("hello");
        assert!(single >= 1);

        let words = estimate_tokens_for_text("the quick brown fox");
        assert!(words >= 4, "Expected >= 4 for 4 words, got {words}");

        let code = estimate_tokens_for_text("fn main() { println!(\"hello\"); }");
        assert!(code >= 5, "Expected >= 5 for code, got {code}");

        let cjk = estimate_tokens_for_text("你好世界");
        assert!(cjk >= 4, "Expected >= 4 for 4 CJK chars, got {cjk}");
    }

    #[test]
    fn test_chunk_preview() {
        let chunk = hello_world();
        assert_eq!(chunk.preview(5), "Hello");
        assert_eq!(chunk.preview(100), "Hello, world!");
    }

    #[test]
    fn test_chunk_overlaps_with() {
        let chunk = ranged_chunk();
        for overlapping in [15..25, 5..15] {
            assert!(chunk.overlaps_with(&overlapping));
        }
        // Ranges that only touch the chunk's ends do not overlap.
        for disjoint in [20..30, 0..10] {
            assert!(!chunk.overlaps_with(&disjoint));
        }
    }

    #[test]
    fn test_chunk_contains_offset() {
        let chunk = ranged_chunk();
        for inside in [10, 15] {
            assert!(chunk.contains_offset(inside));
        }
        // The end bound is exclusive.
        for outside in [20, 5] {
            assert!(!chunk.contains_offset(outside));
        }
    }

    #[test]
    fn test_chunk_hash() {
        // Same text in different buffers must hash identically.
        let mut first = Chunk::new(1, "Hello".to_string(), 0..5, 0);
        let mut second = Chunk::new(2, "Hello".to_string(), 0..5, 0);
        first.compute_hash();
        second.compute_hash();
        assert_eq!(first.metadata.content_hash, second.metadata.content_hash);
    }

    #[test]
    fn test_chunk_builder() {
        let chunk = ChunkBuilder::new()
            .buffer_id(1)
            .content("test".to_string())
            .byte_range(0..4)
            .index(0)
            .strategy("fixed")
            .token_count(1)
            .line_range(0..1)
            .has_overlap(true)
            .build();

        assert_eq!(chunk.buffer_id, 1);
        assert_eq!(chunk.content, "test");

        let metadata = &chunk.metadata;
        assert_eq!(metadata.strategy.as_deref(), Some("fixed"));
        assert_eq!(metadata.token_count, Some(1));
        assert_eq!(metadata.line_range, Some(0..1));
        assert!(metadata.has_overlap);
    }

    #[test]
    fn test_chunk_serialization() {
        let original = Chunk::new(1, "test".to_string(), 0..4, 0);

        let encoded = serde_json::to_string(&original);
        assert!(encoded.is_ok());

        let decoded: Result<Chunk, _> = serde_json::from_str(&encoded.unwrap());
        assert!(decoded.is_ok());
        assert_eq!(decoded.unwrap().content, "test");
    }

    #[test]
    fn test_chunk_empty() {
        let chunk = Chunk::new(1, String::new(), 0..0, 0);
        assert!(chunk.is_empty());
        assert_eq!(chunk.size(), 0);
    }

    #[test]
    fn test_chunk_set_line_range() {
        let mut chunk = Chunk::new(1, "test".to_string(), 0..4, 0);
        chunk.set_line_range(5, 10);
        assert_eq!(chunk.metadata.line_range, Some(5..10));
    }

    #[test]
    fn test_find_char_boundary_at_end() {
        let s = "hello";
        assert_eq!(find_char_boundary(s, 10), 5);
        assert_eq!(find_char_boundary(s, 5), 5);
    }

    #[test]
    fn test_find_char_boundary_in_multibyte() {
        // "世" occupies bytes 6..9, so indices 7 and 8 fall inside it.
        let s = "Hello 世界!";
        assert_eq!(find_char_boundary(s, 7), 6);
        assert_eq!(find_char_boundary(s, 8), 6);
        assert_eq!(find_char_boundary(s, 9), 9);
    }
}