use crate::document::Document;
use super::{recursive::RecursiveCharSplitter, TextSplitter};
/// Programming languages for which [`CodeLanguage::separators`] supplies a
/// separator list.
///
/// `Hash` is derived together with the equality traits so that a language can
/// be used as a `HashMap` / `HashSet` key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CodeLanguage {
    /// Rust source code.
    Rust,
    /// Python source code.
    Python,
    /// JavaScript source code.
    JavaScript,
    /// Go source code.
    Go,
    /// Java source code.
    Java,
    /// C++ source code.
    Cpp,
    /// No language-specific separators: only blank lines, newlines, spaces
    /// and the empty string.
    Generic,
}
impl CodeLanguage {
    /// Returns the separator strings for this language, in list order.
    ///
    /// Every list starts with the language's newline-prefixed declaration
    /// keywords and ends with the same four generic entries: `"\n\n"`,
    /// `"\n"`, `" "` and `""`. [`CodeLanguage::Generic`] yields only those
    /// four.
    pub fn separators(&self) -> Vec<&'static str> {
        // Tail shared by every language, appended after the keyword entries.
        const COMMON_TAIL: &[&str] = &["\n\n", "\n", " ", ""];

        let keywords: &[&'static str] = match self {
            Self::Rust => &[
                "\nimpl ",
                "\nfn ",
                "\nstruct ",
                "\nenum ",
                "\ntrait ",
                "\nmod ",
            ],
            Self::Python => &["\nclass ", "\ndef ", "\nasync def "],
            Self::JavaScript => &["\nfunction ", "\nclass ", "\nconst ", "\nlet ", "\nvar "],
            Self::Go => &["\nfunc ", "\ntype "],
            Self::Java => &[
                "\npublic class ",
                "\nclass ",
                "\npublic ",
                "\nprivate ",
                "\nprotected ",
            ],
            Self::Cpp => &["\nclass ", "\nstruct ", "\nvoid "],
            Self::Generic => &[],
        };

        keywords.iter().chain(COMMON_TAIL.iter()).copied().collect()
    }
}
/// Splitter for source code.
///
/// Wraps a [`RecursiveCharSplitter`] that [`CodeSplitter::new`] configures
/// with the separators of a [`CodeLanguage`].
pub struct CodeSplitter {
    /// Underlying splitter that all configuration and splitting is forwarded to.
    inner: RecursiveCharSplitter,
}
impl CodeSplitter {
    /// Creates a splitter for `language`.
    ///
    /// The inner [`RecursiveCharSplitter`] is built with
    /// `RecursiveCharSplitter::new()` and then given the list returned by
    /// [`CodeLanguage::separators`]; all other settings keep whatever that
    /// constructor chooses.
    pub fn new(language: CodeLanguage) -> Self {
        let separators = language.separators();
        let inner = RecursiveCharSplitter::new().with_separators(separators);
        Self { inner }
    }

    /// Consumes the splitter and returns one whose inner splitter has had
    /// `with_chunk_size(n)` applied.
    pub fn with_chunk_size(self, n: usize) -> Self {
        Self {
            inner: self.inner.with_chunk_size(n),
        }
    }

    /// Consumes the splitter and returns one whose inner splitter has had
    /// `with_overlap(n)` applied.
    pub fn with_overlap(self, n: usize) -> Self {
        Self {
            inner: self.inner.with_overlap(n),
        }
    }
}
/// [`TextSplitter`] implementation that forwards to the wrapped splitter.
impl TextSplitter for CodeSplitter {
/// Splits `doc` by calling `split` on the inner [`RecursiveCharSplitter`]
/// and returns its result unchanged.
fn split(&self, doc: &Document) -> Vec<Document> {
self.inner.split(doc)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Three small Rust functions, a chunk size of 15 and no overlap: the
    /// result must hold at least two chunks, one of which contains `fn a`.
    #[test]
    fn rust_splits_at_fn_boundary() {
        let source = "fn a() { 1 }\n\nfn b() { 2 }\n\nfn c() { 3 }\n";
        let document = Document::new(source);
        let splitter = CodeSplitter::new(CodeLanguage::Rust)
            .with_chunk_size(15)
            .with_overlap(0);

        let pieces = splitter.split(&document);

        assert!(pieces.len() >= 2);
        assert!(pieces.iter().any(|piece| piece.content.contains("fn a")));
    }

    /// Smoke test: two Python functions split with a chunk size of 20 must
    /// produce at least one chunk.
    #[test]
    fn python_splits_at_def_boundary() {
        let source = "def a():\n return 1\n\ndef b():\n return 2\n";
        let document = Document::new(source);
        let splitter = CodeSplitter::new(CodeLanguage::Python).with_chunk_size(20);

        let pieces = splitter.split(&document);

        assert!(!pieces.is_empty());
    }
}