use async_trait::async_trait;
use super::{TextSplitter, TextSplitterError};
/// Configuration for a character/separator based text splitter.
///
/// Sizes are compared against `String::len`, so they are measured in bytes
/// of UTF-8 rather than in characters.
#[derive(Debug, Clone)]
pub struct CharacterTextSplitterOptions {
/// Upper bound on the length of a chunk assembled from separated parts.
pub chunk_size: usize,
/// How much of the end of one chunk is repeated at the start of the next.
pub chunk_overlap: usize,
/// Text the input is split on; when empty the input is cut into
/// fixed-size pieces instead.
pub separator: String,
/// Whether surrounding whitespace is trimmed from each chunk (chunks that
/// end up empty are dropped).
pub trim_chunks: bool,
}
impl Default for CharacterTextSplitterOptions {
fn default() -> Self {
Self::new()
}
}
impl CharacterTextSplitterOptions {
    /// Builds the stock configuration: chunk size 1000, overlap 200,
    /// a single space as separator, and trimming switched on.
    pub fn new() -> Self {
        let separator = String::from(" ");
        Self {
            chunk_size: 1000,
            chunk_overlap: 200,
            separator,
            trim_chunks: true,
        }
    }

    /// Returns these options with `chunk_size` replaced.
    pub fn with_chunk_size(self, chunk_size: usize) -> Self {
        Self { chunk_size, ..self }
    }

    /// Returns these options with `chunk_overlap` replaced.
    pub fn with_chunk_overlap(self, chunk_overlap: usize) -> Self {
        Self {
            chunk_overlap,
            ..self
        }
    }

    /// Returns these options with `separator` replaced; accepts anything
    /// convertible into a `String`.
    pub fn with_separator<S: Into<String>>(self, separator: S) -> Self {
        Self {
            separator: separator.into(),
            ..self
        }
    }

    /// Returns these options with `trim_chunks` replaced.
    pub fn with_trim_chunks(self, trim_chunks: bool) -> Self {
        Self {
            trim_chunks,
            ..self
        }
    }
}
/// Text splitter that breaks input on a separator string (or into
/// fixed-size pieces when the separator is empty), driven by a
/// [`CharacterTextSplitterOptions`] value.
///
/// Derives `Debug` and `Clone` so the splitter can be logged and shared
/// like its options; both are available because the only field already
/// implements them.
#[derive(Debug, Clone)]
pub struct CharacterTextSplitter {
    /// Chunking configuration consulted by every split operation.
    options: CharacterTextSplitterOptions,
}
impl Default for CharacterTextSplitter {
fn default() -> Self {
Self::new()
}
}
impl CharacterTextSplitter {
    /// Creates a splitter configured with `CharacterTextSplitterOptions::default()`.
    pub fn new() -> Self {
        Self::with_options(CharacterTextSplitterOptions::default())
    }

    /// Creates a splitter from an explicit set of options.
    pub fn with_options(options: CharacterTextSplitterOptions) -> Self {
        Self { options }
    }

    /// Creates a splitter with default options except for `chunk_size`.
    pub fn with_chunk_size(chunk_size: usize) -> Self {
        Self::new().with_chunk_size_option(chunk_size)
    }

    /// Builder-style setter for the maximum chunk size (in bytes).
    pub fn with_chunk_size_option(mut self, chunk_size: usize) -> Self {
        self.options.chunk_size = chunk_size;
        self
    }

    /// Builder-style setter for the overlap (in bytes) between neighbouring chunks.
    pub fn with_chunk_overlap(mut self, chunk_overlap: usize) -> Self {
        self.options.chunk_overlap = chunk_overlap;
        self
    }

    /// Builder-style setter for the separator the text is split on.
    pub fn with_separator<S: Into<String>>(mut self, separator: S) -> Self {
        self.options.separator = separator.into();
        self
    }

    /// Applies the `trim_chunks` option to a candidate chunk and returns it
    /// as an owned string, or `None` when nothing is left to emit.
    fn finalize_chunk(&self, chunk: &str) -> Option<String> {
        let chunk = if self.options.trim_chunks {
            chunk.trim()
        } else {
            chunk
        };
        (!chunk.is_empty()).then(|| chunk.to_string())
    }

    /// Largest char boundary of `text` that is `<= index` (clamped to `text.len()`).
    fn floor_boundary(text: &str, mut index: usize) -> usize {
        if index >= text.len() {
            return text.len();
        }
        // Index 0 is always a boundary, so this cannot underflow.
        while !text.is_char_boundary(index) {
            index -= 1;
        }
        index
    }

    /// Smallest char boundary of `text` that is `>= index` (clamped to `text.len()`).
    fn ceil_boundary(text: &str, mut index: usize) -> usize {
        if index >= text.len() {
            return text.len();
        }
        // `text.len()` is always a boundary, so this terminates.
        while !text.is_char_boundary(index) {
            index += 1;
        }
        index
    }

    /// Splits `text` on the configured separator and greedily packs the
    /// parts into chunks of at most `chunk_size` bytes, then stitches the
    /// configured overlap between neighbouring chunks.
    ///
    /// An empty separator delegates to [`Self::split_by_characters`]. A
    /// single part longer than `chunk_size` is cut into fixed-size pieces
    /// *without* internal overlap, because `apply_overlap` adds the overlap
    /// for every chunk afterwards; overlapping twice would duplicate text.
    fn split_by_separator(&self, text: &str) -> Vec<String> {
        if text.is_empty() {
            return Vec::new();
        }
        let separator = self.options.separator.as_str();
        if separator.is_empty() {
            return self.split_by_characters(text);
        }

        let chunk_size = self.options.chunk_size;
        let mut chunks = Vec::new();
        let mut current = String::new();

        for (i, part) in text.split(separator).enumerate() {
            // Every part except the first carries the separator that preceded it.
            let sep = if i > 0 { separator } else { "" };

            // Length arithmetic instead of building a throw-away candidate string.
            if current.len() + sep.len() + part.len() <= chunk_size {
                current.push_str(sep);
                current.push_str(part);
                continue;
            }

            // The part does not fit: flush what has been accumulated so far.
            chunks.extend(self.finalize_chunk(&current));
            current.clear();

            if part.len() > chunk_size {
                chunks.extend(self.split_with_overlap(part, 0));
            } else {
                current.push_str(sep);
                current.push_str(part);
            }
        }

        chunks.extend(self.finalize_chunk(&current));
        self.apply_overlap(chunks)
    }

    /// Cuts `text` into consecutive windows of at most `chunk_size` bytes,
    /// each window starting `chunk_overlap` bytes before the previous one ended.
    fn split_by_characters(&self, text: &str) -> Vec<String> {
        self.split_with_overlap(text, self.options.chunk_overlap)
    }

    /// Fixed-size windowing shared by the character and separator paths.
    ///
    /// Window edges are snapped to UTF-8 char boundaries so slicing never
    /// panics, and every iteration is guaranteed to advance: the loop stops
    /// once a window reaches the end of `text`, and an overlap that would
    /// not move the window forward (overlap >= window length) is ignored
    /// for that step.
    fn split_with_overlap(&self, text: &str, overlap: usize) -> Vec<String> {
        // A zero chunk size could never make progress; treat it as one byte.
        let chunk_size = self.options.chunk_size.max(1);
        let mut chunks = Vec::new();
        let mut start = 0;

        while start < text.len() {
            let mut end = Self::floor_boundary(text, start.saturating_add(chunk_size));
            if end <= start {
                // The next character is wider than chunk_size: emit it whole.
                end = Self::ceil_boundary(text, start + 1);
            }

            chunks.extend(self.finalize_chunk(&text[start..end]));

            if end == text.len() {
                // Last window emitted; stepping back by the overlap here
                // would re-emit the tail forever.
                break;
            }

            let next = Self::ceil_boundary(text, end.saturating_sub(overlap));
            start = if next > start { next } else { end };
        }

        chunks
    }

    /// Prefixes every chunk after the first with the last `chunk_overlap`
    /// bytes of the chunk before it, joined by the separator.
    ///
    /// The overlap start is moved forward to the next char boundary when it
    /// would fall inside a multi-byte character, so the prefix is never
    /// longer than `chunk_overlap` bytes and slicing cannot panic.
    fn apply_overlap(&self, chunks: Vec<String>) -> Vec<String> {
        let overlap = self.options.chunk_overlap;
        if overlap == 0 || chunks.len() <= 1 {
            return chunks;
        }

        let separator = self.options.separator.as_str();
        let mut overlapped = Vec::with_capacity(chunks.len());
        overlapped.push(chunks[0].clone());

        for pair in chunks.windows(2) {
            let (prev, chunk) = (&pair[0], &pair[1]);
            let tail_start = Self::ceil_boundary(prev, prev.len().saturating_sub(overlap));
            let tail = &prev[tail_start..];

            let mut merged = String::with_capacity(tail.len() + separator.len() + chunk.len());
            if !tail.is_empty() {
                merged.push_str(tail);
                merged.push_str(separator);
            }
            merged.push_str(chunk);
            overlapped.push(merged);
        }

        overlapped
    }
}
#[async_trait]
impl TextSplitter for CharacterTextSplitter {
    /// Splits `text` into chunks according to the splitter's options.
    ///
    /// Empty input yields an empty list regardless of configuration.
    ///
    /// # Errors
    ///
    /// Returns `TextSplitterError::InvalidSplitterOptions` when the text is
    /// non-empty and the configured chunk size is zero.
    async fn split_text(&self, text: &str) -> Result<Vec<String>, TextSplitterError> {
        // The emptiness check deliberately wins over the zero-size check.
        match (text.is_empty(), self.options.chunk_size) {
            (true, _) => Ok(Vec::new()),
            (false, 0) => Err(TextSplitterError::InvalidSplitterOptions),
            (false, _) => Ok(self.split_by_separator(text)),
        }
    }
}