use async_trait::async_trait;
use super::{TextSplitter, TextSplitterError};
/// Configuration for [`CodeSplitter`].
///
/// When text is split, the three fields are forwarded to
/// `text_splitter::ChunkConfig` (`new`, `with_overlap` and `with_trim`
/// respectively).
///
/// The type is plain data, so it can be compared with `==`, which is handy
/// for checking whether a configuration differs from the defaults.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodeSplitterOptions {
    /// Chunk capacity handed to `ChunkConfig::new`.
    ///
    /// `split_text` rejects a value of `0` with
    /// `TextSplitterError::InvalidSplitterOptions`.
    pub chunk_size: usize,
    /// Overlap setting handed to `ChunkConfig::with_overlap`.
    ///
    /// If the chunk configuration rejects the value, `split_text` reports
    /// `TextSplitterError::InvalidSplitterOptions`.
    pub chunk_overlap: usize,
    /// Trim flag handed to `ChunkConfig::with_trim`.
    pub trim_chunks: bool,
}
impl Default for CodeSplitterOptions {
fn default() -> Self {
Self::new()
}
}
impl CodeSplitterOptions {
    /// Builds the stock configuration: a chunk size of 1000, an overlap of
    /// 200, and trimming switched on.
    pub fn new() -> Self {
        let chunk_size = 1000;
        let chunk_overlap = 200;
        let trim_chunks = true;
        Self {
            chunk_size,
            chunk_overlap,
            trim_chunks,
        }
    }

    /// Returns the options with `chunk_size` replaced; other fields are kept.
    pub fn with_chunk_size(self, chunk_size: usize) -> Self {
        Self { chunk_size, ..self }
    }

    /// Returns the options with `chunk_overlap` replaced; other fields are kept.
    pub fn with_chunk_overlap(self, chunk_overlap: usize) -> Self {
        Self {
            chunk_overlap,
            ..self
        }
    }

    /// Returns the options with `trim_chunks` replaced; other fields are kept.
    pub fn with_trim_chunks(self, trim_chunks: bool) -> Self {
        Self {
            trim_chunks,
            ..self
        }
    }
}
/// Splits source code into chunks using a tree-sitter grammar.
///
/// This is the variant compiled when the `tree-sitter` feature is enabled.
/// It stores the grammar to parse with and the chunking options; both are
/// handed to `text_splitter::CodeSplitter` each time `split_text` runs.
#[cfg(feature = "tree-sitter")]
#[derive(Debug, Clone)]
pub struct CodeSplitter {
    /// Grammar handed to `text_splitter::CodeSplitter::new`.
    language: tree_sitter::Language,
    /// Chunk size, overlap and trimming settings.
    options: CodeSplitterOptions,
}
/// Fallback `CodeSplitter` compiled when the `tree-sitter` feature is off.
///
/// It only carries the options; `split_text` on this variant returns an
/// error for any non-empty input that passes option validation, because no
/// parser is available.
#[cfg(not(feature = "tree-sitter"))]
#[derive(Debug, Clone)]
pub struct CodeSplitter {
    /// Chunk size, overlap and trimming settings.
    options: CodeSplitterOptions,
}
impl CodeSplitter {
    // ---- Constructors available with the `tree-sitter` feature ----

    /// Creates a splitter for `language` with `CodeSplitterOptions::default()`.
    #[cfg(feature = "tree-sitter")]
    pub fn new(language: tree_sitter::Language) -> Self {
        Self::with_options(language, CodeSplitterOptions::default())
    }

    /// Creates a splitter for `language` with explicit `options`.
    #[cfg(feature = "tree-sitter")]
    pub fn with_options(language: tree_sitter::Language, options: CodeSplitterOptions) -> Self {
        Self { language, options }
    }

    /// Creates a splitter for `language` with default options, except for
    /// the given `chunk_size`.
    #[cfg(feature = "tree-sitter")]
    pub fn with_chunk_size(language: tree_sitter::Language, chunk_size: usize) -> Self {
        Self::new(language).with_chunk_size_option(chunk_size)
    }

    // ---- Constructors available without the `tree-sitter` feature ----

    /// Creates a splitter with `CodeSplitterOptions::default()`.
    #[cfg(not(feature = "tree-sitter"))]
    pub fn new() -> Self {
        Self::with_options(CodeSplitterOptions::default())
    }

    /// Creates a splitter with explicit `options`.
    #[cfg(not(feature = "tree-sitter"))]
    pub fn with_options(options: CodeSplitterOptions) -> Self {
        Self { options }
    }

    /// Creates a splitter with default options, except for the given
    /// `chunk_size`.
    ///
    /// `_language` is ignored; the unit placeholder presumably exists so the
    /// call shape mirrors the `tree-sitter` build.
    #[cfg(not(feature = "tree-sitter"))]
    pub fn with_chunk_size(_language: (), chunk_size: usize) -> Self {
        Self::new().with_chunk_size_option(chunk_size)
    }

    // ---- Builders shared by both builds ----
    //
    // These only touch `options`, which exists in both struct variants, so
    // they are defined once instead of being duplicated per feature flag.

    /// Returns the splitter with `options.chunk_size` replaced.
    pub fn with_chunk_size_option(mut self, chunk_size: usize) -> Self {
        self.options.chunk_size = chunk_size;
        self
    }

    /// Returns the splitter with `options.chunk_overlap` replaced.
    pub fn with_chunk_overlap(mut self, chunk_overlap: usize) -> Self {
        self.options.chunk_overlap = chunk_overlap;
        self
    }

    /// Returns the splitter with `options.trim_chunks` replaced.
    ///
    /// Mirrors `CodeSplitterOptions::with_trim_chunks`, so every option can
    /// be adjusted directly on the splitter.
    pub fn with_trim_chunks(mut self, trim_chunks: bool) -> Self {
        self.options.trim_chunks = trim_chunks;
        self
    }
}
#[async_trait]
impl TextSplitter for CodeSplitter {
    /// Splits `text` into chunks according to the stored options.
    ///
    /// Empty input short-circuits to an empty vector before any option is
    /// validated, regardless of whether the `tree-sitter` feature is on.
    ///
    /// # Errors
    ///
    /// * `TextSplitterError::InvalidSplitterOptions` when `chunk_size` is `0`
    ///   or when the chunk configuration rejects the overlap setting.
    /// * `TextSplitterError::CodeParseError` when the underlying
    ///   `text_splitter::CodeSplitter` cannot be constructed.
    /// * `TextSplitterError::OtherError` when the crate was built without
    ///   the `tree-sitter` feature.
    async fn split_text(&self, text: &str) -> Result<Vec<String>, TextSplitterError> {
        // Nothing to split.
        if text.is_empty() {
            return Ok(Vec::new());
        }

        // A zero-sized chunk can never hold any content.
        if self.options.chunk_size == 0 {
            return Err(TextSplitterError::InvalidSplitterOptions);
        }

        #[cfg(feature = "tree-sitter")]
        {
            let config = text_splitter::ChunkConfig::new(self.options.chunk_size)
                .with_trim(self.options.trim_chunks)
                .with_overlap(self.options.chunk_overlap)
                .map_err(|_| TextSplitterError::InvalidSplitterOptions)?;

            let code_splitter = text_splitter::CodeSplitter::new(self.language.clone(), config)
                .map_err(|err| TextSplitterError::CodeParseError(err.to_string()))?;

            // The splitter yields borrowed slices; callers get owned strings.
            Ok(code_splitter
                .chunks(text)
                .map(String::from)
                .collect::<Vec<String>>())
        }

        #[cfg(not(feature = "tree-sitter"))]
        {
            let message = "CodeSplitter requires the 'tree-sitter' feature to be enabled.";
            Err(TextSplitterError::OtherError(String::from(message)))
        }
    }
}