#[cfg(feature = "semantic-chunking")]
use crate::indexer::ast_chunker::AstChunker;
use crate::indexer::language::{detect_language_from_str, Language};
use crate::indexer::regex_chunker::RegexChunker;
/// Classification of the source construct held by a [`Chunk`].
///
/// Most variants mirror the unit kinds reported by the semantic chunkers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ChunkType {
    Function,
    Class,
    Method,
    Struct,
    Enum,
    Interface,
    Module,
    Impl,
    Trait,
    Import,
    Constant,
    Variable,
    Type,
    /// Assigned to a chunk produced by merging adjacent chunks.
    Mixed,
    /// Default value; also used for chunks produced by plain line-based chunking.
    #[default]
    Unknown,
}

impl ChunkType {
    /// Returns the lowercase, static name of this variant (for example `"function"`).
    #[allow(dead_code)]
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Function => "function",
            Self::Class => "class",
            Self::Method => "method",
            Self::Struct => "struct",
            Self::Enum => "enum",
            Self::Interface => "interface",
            Self::Module => "module",
            Self::Impl => "impl",
            Self::Trait => "trait",
            Self::Import => "import",
            Self::Constant => "constant",
            Self::Variable => "variable",
            Self::Type => "type",
            Self::Mixed => "mixed",
            Self::Unknown => "unknown",
        }
    }
}
/// A contiguous piece of a source text together with its location and metadata.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Text of the chunk.
    pub content: String,
    /// Line number at which the chunk starts.
    pub start_line: usize,
    /// Line number at which the chunk ends.
    pub end_line: usize,
    /// Kind of construct the chunk holds.
    #[allow(dead_code)]
    pub chunk_type: ChunkType,
    /// Name of the symbol the chunk was derived from, when one is known.
    #[allow(dead_code)]
    pub symbol_name: Option<String>,
    /// Name of the language of the chunk, when one is known.
    #[allow(dead_code)]
    pub language: Option<String>,
}

impl Chunk {
    /// Builds a chunk without metadata: the type is [`ChunkType::Unknown`] and both
    /// the symbol name and the language are `None`.
    #[allow(dead_code)]
    pub fn new(content: String, start_line: usize, end_line: usize) -> Self {
        Self::with_metadata(
            content,
            start_line,
            end_line,
            ChunkType::Unknown,
            None,
            None,
        )
    }

    /// Builds a chunk from every field, storing each argument as given.
    pub fn with_metadata(
        content: String,
        start_line: usize,
        end_line: usize,
        chunk_type: ChunkType,
        symbol_name: Option<String>,
        language: Option<String>,
    ) -> Self {
        Chunk {
            content,
            start_line,
            end_line,
            chunk_type,
            symbol_name,
            language,
        }
    }
}
/// Configuration for splitting text into [`Chunk`]s.
#[derive(Clone)]
pub struct Chunker {
// Byte budget for a chunk in line-based chunking; also the upper bound used when
// deciding whether two small chunks may be merged.
chunk_size: usize,
// Number of trailing bytes of a finished chunk to carry over into the next one.
overlap: usize,
// Size limit handed to the AST and regex chunkers when they are constructed.
max_chunk_size: usize,
// When true, `chunk_with_path` tries language-aware chunking before falling back
// to line-based chunking.
semantic_chunking: bool,
}
/// Returns the largest char boundary of `s` that is not greater than `index`.
///
/// Any `index` at or beyond the end of `s` yields `s.len()`.
fn floor_char_boundary(s: &str, index: usize) -> usize {
    let len = s.len();
    if index >= len {
        return len;
    }
    // Offset 0 is always a char boundary, so the search cannot come up empty;
    // the fallback only exists to avoid an `unwrap`.
    (0..=index)
        .rev()
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(0)
}
/// Returns the smallest char boundary of `s` that is not less than `index`.
///
/// Any `index` at or beyond the end of `s` yields `s.len()`.
#[allow(dead_code)]
fn ceil_char_boundary(s: &str, index: usize) -> usize {
    let len = s.len();
    // The range is empty when `index >= len`, which falls through to `len`.
    (index..len)
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(len)
}
impl Chunker {
#[allow(dead_code)]
pub fn new(chunk_size: usize, overlap: usize) -> Self {
Self {
chunk_size,
overlap,
max_chunk_size: chunk_size * 3, semantic_chunking: true,
}
}
pub fn with_options(
chunk_size: usize,
overlap: usize,
max_chunk_size: usize,
semantic_chunking: bool,
) -> Self {
Self {
chunk_size,
overlap,
max_chunk_size,
semantic_chunking,
}
}
#[allow(dead_code)]
pub fn set_semantic_chunking(&mut self, enabled: bool) {
self.semantic_chunking = enabled;
}
#[allow(dead_code)]
pub fn set_max_chunk_size(&mut self, size: usize) {
self.max_chunk_size = size;
}
#[allow(dead_code)]
pub fn chunk(&self, content: &str) -> Vec<Chunk> {
self.chunk_with_path(content, None)
}
pub fn chunk_with_path(&self, content: &str, file_path: Option<&str>) -> Vec<Chunk> {
if content.is_empty() {
return vec![];
}
if self.semantic_chunking {
if let Some(path) = file_path {
let language = detect_language_from_str(path);
if language != Language::Unknown {
#[cfg(feature = "semantic-chunking")]
if language.has_tree_sitter_grammar() {
if let Ok(chunks) = self.chunk_with_ast(content, language) {
return chunks;
}
}
if let Ok(chunks) = self.chunk_with_regex(content, language) {
return chunks;
}
}
}
}
self.chunk_by_characters(content, file_path)
}
#[cfg(feature = "semantic-chunking")]
fn chunk_with_ast(&self, content: &str, language: Language) -> Result<Vec<Chunk>, ()> {
let ast_chunker = AstChunker::new(self.max_chunk_size, self.overlap);
let units = ast_chunker
.extract_semantic_units(content, language)
.map_err(|_| ())?;
if units.is_empty() {
return Err(());
}
let lang_str = language.as_str().to_string();
let mut chunks = Vec::new();
for unit in units {
let split_units = ast_chunker.split_large_unit(&unit);
for split_unit in split_units {
chunks.push(Chunk::with_metadata(
split_unit.content,
split_unit.start_line,
split_unit.end_line,
convert_semantic_unit_type(split_unit.unit_type),
split_unit.symbol_name,
Some(lang_str.clone()),
));
}
}
let chunks = self.merge_small_chunks(chunks);
Ok(chunks)
}
fn chunk_with_regex(&self, content: &str, language: Language) -> Result<Vec<Chunk>, ()> {
let regex_chunker = RegexChunker::new(self.max_chunk_size, self.overlap);
let units = regex_chunker
.extract_boundaries(content, language)
.map_err(|_| ())?;
if units.is_empty() {
return Err(());
}
let lang_str = language.as_str().to_string();
let mut chunks = Vec::new();
for unit in units {
let split_units = regex_chunker.split_large_unit(&unit);
for split_unit in split_units {
chunks.push(Chunk::with_metadata(
split_unit.content,
split_unit.start_line,
split_unit.end_line,
convert_semantic_unit_type(split_unit.unit_type),
split_unit.symbol_name,
Some(lang_str.clone()),
));
}
}
Ok(chunks)
}
fn chunk_by_characters(&self, content: &str, file_path: Option<&str>) -> Vec<Chunk> {
let lines: Vec<&str> = content.lines().collect();
if lines.is_empty() {
return vec![];
}
let language = file_path.map(|p| detect_language_from_str(p).as_str().to_string());
let mut chunks = Vec::new();
let mut current_chunk = String::new();
let mut chunk_start_line = 1;
let mut current_line = 1;
for line in &lines {
let line_with_newline = format!("{}\n", line);
if !current_chunk.is_empty()
&& current_chunk.len() + line_with_newline.len() > self.chunk_size
{
chunks.push(Chunk::with_metadata(
current_chunk.trim_end().to_string(),
chunk_start_line,
current_line - 1,
ChunkType::Unknown,
None,
language.clone(),
));
let overlap_start = self.find_overlap_start(¤t_chunk);
current_chunk = current_chunk[overlap_start..].to_string();
let overlap_lines = current_chunk.lines().count();
chunk_start_line = current_line.saturating_sub(overlap_lines);
}
current_chunk.push_str(&line_with_newline);
current_line += 1;
}
if !current_chunk.trim().is_empty() {
chunks.push(Chunk::with_metadata(
current_chunk.trim_end().to_string(),
chunk_start_line,
lines.len(),
ChunkType::Unknown,
None,
language,
));
}
chunks
}
fn merge_small_chunks(&self, chunks: Vec<Chunk>) -> Vec<Chunk> {
if chunks.len() <= 1 {
return chunks;
}
let min_chunk_size = self.chunk_size / 4; let mut result = Vec::new();
let mut pending: Option<Chunk> = None;
for chunk in chunks {
if let Some(mut prev) = pending.take() {
let both_small =
prev.content.len() < min_chunk_size && chunk.content.len() < min_chunk_size;
let both_other =
prev.chunk_type == ChunkType::Unknown && chunk.chunk_type == ChunkType::Unknown;
let combined_fits = prev.content.len() + chunk.content.len() < self.chunk_size;
if (both_small || both_other) && combined_fits {
prev.content = format!("{}\n{}", prev.content, chunk.content);
prev.end_line = chunk.end_line;
prev.chunk_type = ChunkType::Mixed;
pending = Some(prev);
} else {
result.push(prev);
pending = Some(chunk);
}
} else {
pending = Some(chunk);
}
}
if let Some(last) = pending {
result.push(last);
}
result
}
fn find_overlap_start(&self, content: &str) -> usize {
if content.len() <= self.overlap {
return 0;
}
let target_start = content.len() - self.overlap;
let target_start = floor_char_boundary(content, target_start);
let search_start = floor_char_boundary(content, target_start.saturating_sub(100));
if let Some(pos) = content[search_start..target_start].rfind('\n') {
return search_start + pos + 1;
}
target_start
}
#[allow(dead_code)]
pub fn chunk_code(&self, content: &str, language: Option<&str>) -> Vec<Chunk> {
let fake_path = language.map(|lang| match lang {
"rust" => "file.rs",
"python" => "file.py",
"javascript" => "file.js",
"typescript" => "file.ts",
"go" => "file.go",
"java" => "file.java",
"c" => "file.c",
"cpp" => "file.cpp",
_ => "file.txt",
});
self.chunk_with_path(content, fake_path)
}
}
/// Maps a semantic unit kind onto the corresponding [`ChunkType`].
///
/// Every kind maps to the variant of the same name, except `Other`, which becomes
/// [`ChunkType::Unknown`].
fn convert_semantic_unit_type(
    unit_type: crate::indexer::ast_chunker::SemanticUnitType,
) -> ChunkType {
    use crate::indexer::ast_chunker::SemanticUnitType as Unit;

    match unit_type {
        Unit::Function => ChunkType::Function,
        Unit::Class => ChunkType::Class,
        Unit::Method => ChunkType::Method,
        Unit::Struct => ChunkType::Struct,
        Unit::Enum => ChunkType::Enum,
        Unit::Interface => ChunkType::Interface,
        Unit::Module => ChunkType::Module,
        Unit::Impl => ChunkType::Impl,
        Unit::Trait => ChunkType::Trait,
        Unit::Import => ChunkType::Import,
        Unit::Constant => ChunkType::Constant,
        Unit::Variable => ChunkType::Variable,
        Unit::Type => ChunkType::Type,
        Unit::Other => ChunkType::Unknown,
    }
}
/// Unit tests for the chunker. Tests that call `Chunker::chunk` pass no file path,
/// so they exercise the line-based chunking path.
#[cfg(test)]
mod tests {
    use super::*;

    /// Short multi-line input yields at least one chunk starting at line 1.
    #[test]
    fn test_basic_chunking() {
        let chunker = Chunker::new(100, 20);
        let content = "line1\nline2\nline3\nline4\nline5\n";
        let chunks = chunker.chunk(content);
        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].start_line, 1);
    }

    /// Empty input yields no chunks.
    #[test]
    fn test_empty_content() {
        let chunker = Chunker::new(100, 20);
        let chunks = chunker.chunk("");
        assert!(chunks.is_empty());
    }

    /// Input far below the chunk size is returned as a single, unchanged chunk.
    #[test]
    fn test_small_content() {
        let chunker = Chunker::new(1000, 200);
        let content = "small file";
        let chunks = chunker.chunk(content);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "small file");
    }

    /// Consecutive chunks leave no gap between their line ranges.
    #[test]
    fn test_overlap() {
        let chunker = Chunker::new(50, 10);
        let content = (0..20)
            .map(|i| format!("line{}", i))
            .collect::<Vec<_>>()
            .join("\n");
        let chunks = chunker.chunk(&content);
        if chunks.len() >= 2 {
            assert!(chunks[1].start_line <= chunks[0].end_line + 1);
        }
    }

    /// Whitespace-only input produces either nothing or only blank chunks.
    #[test]
    fn test_whitespace_only_content() {
        let chunker = Chunker::new(100, 20);
        let chunks = chunker.chunk(" \n\n \n");
        assert!(chunks.is_empty() || chunks.iter().all(|c| c.content.trim().is_empty()));
    }

    /// A single line without a trailing newline spans exactly line 1.
    #[test]
    fn test_single_line() {
        let chunker = Chunker::new(100, 20);
        let content = "single line content";
        let chunks = chunker.chunk(content);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_line, 1);
        assert_eq!(chunks[0].end_line, 1);
    }

    /// A line whose length equals the chunk size stays in one chunk.
    #[test]
    fn test_exact_chunk_size() {
        let chunker = Chunker::new(20, 5);
        let content = "12345678901234567890";
        let chunks = chunker.chunk(content);
        assert_eq!(chunks.len(), 1);
    }

    /// Start and end lines are 1-based and inclusive.
    #[test]
    fn test_line_numbers_are_correct() {
        let chunker = Chunker::new(1000, 100);
        let content = "line 1\nline 2\nline 3\nline 4\nline 5";
        let chunks = chunker.chunk(content);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_line, 1);
        assert_eq!(chunks[0].end_line, 5);
    }

    /// Input well above the chunk size is split, and the last chunk reaches the end.
    #[test]
    fn test_large_file_produces_multiple_chunks() {
        let chunker = Chunker::new(100, 20);
        let content = (0..100)
            .map(|i| format!("line number {}", i))
            .collect::<Vec<_>>()
            .join("\n");
        let chunks = chunker.chunk(&content);
        assert!(
            chunks.len() > 1,
            "Large content should produce multiple chunks"
        );
        let total_lines = content.lines().count();
        assert!(chunks.last().unwrap().end_line >= total_lines - 1);
    }

    /// Chunk text still contains the original source fragments.
    #[test]
    fn test_chunk_content_preserved() {
        let chunker = Chunker::new(1000, 100);
        let content = "fn main() {\n println!(\"Hello\");\n}";
        let chunks = chunker.chunk(content);
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].content.contains("fn main()"));
        assert!(chunks[0].content.contains("println!"));
    }

    /// Multi-byte text (CJK, Hangul, emoji) is chunked without panicking.
    #[test]
    fn test_unicode_content() {
        let chunker = Chunker::new(100, 20);
        let content = "日本語テスト\n中文测试\n한국어 테스트\nemoji: 🎉🚀";
        let chunks = chunker.chunk(content);
        assert!(!chunks.is_empty());
        assert!(chunks[0].content.contains("日本語"));
    }

    /// A single line longer than the chunk size still produces output.
    #[test]
    fn test_very_long_line() {
        let chunker = Chunker::new(50, 10);
        let long_line = "a".repeat(200);
        let chunks = chunker.chunk(&long_line);
        assert!(!chunks.is_empty());
    }

    /// The first and the last input line both appear somewhere in the chunks.
    #[test]
    fn test_chunks_dont_lose_content() {
        let chunker = Chunker::new(100, 20);
        let original_lines: Vec<_> = (0..50).map(|i| format!("line{}", i)).collect();
        let content = original_lines.join("\n");
        let chunks = chunker.chunk(&content);
        let all_chunk_content: String = chunks
            .iter()
            .map(|c| c.content.as_str())
            .collect::<Vec<_>>()
            .join("\n");
        assert!(all_chunk_content.contains("line0"));
        assert!(all_chunk_content.contains("line49"));
    }

    /// A `.rs` path tags every chunk with the language "rust".
    #[test]
    fn test_chunk_with_path_rust() {
        let chunker = Chunker::new(1000, 100);
        let content = r#"
fn hello() {
println!("hello");
}
fn world() {
println!("world");
}
"#;
        let chunks = chunker.chunk_with_path(content, Some("main.rs"));
        assert!(chunks.iter().all(|c| c.language.as_deref() == Some("rust")));
    }

    /// A `.py` path tags every chunk with the language "python".
    #[test]
    fn test_chunk_with_path_python() {
        let chunker = Chunker::new(1000, 100);
        let content = r#"
def hello():
print("hello")
class MyClass:
pass
"#;
        let chunks = chunker.chunk_with_path(content, Some("script.py"));
        assert!(chunks
            .iter()
            .all(|c| c.language.as_deref() == Some("python")));
    }

    /// `Chunk::with_metadata` stores the metadata exactly as given.
    #[test]
    fn test_chunk_metadata() {
        let chunk = Chunk::with_metadata(
            "fn test() {}".to_string(),
            1,
            1,
            ChunkType::Function,
            Some("test".to_string()),
            Some("rust".to_string()),
        );
        assert_eq!(chunk.chunk_type, ChunkType::Function);
        assert_eq!(chunk.symbol_name.as_deref(), Some("test"));
        assert_eq!(chunk.language.as_deref(), Some("rust"));
    }

    /// The first chunk always starts at line 1, even when later chunks overlap.
    #[test]
    fn test_no_overlap_at_content_start() {
        let chunker = Chunker::new(100, 20);
        let content = (0..50)
            .map(|i| format!("line{}", i))
            .collect::<Vec<_>>()
            .join("\n");
        let chunks = chunker.chunk(&content);
        assert_eq!(chunks[0].start_line, 1);
    }

    /// With semantic chunking off, every chunk is typed `Unknown` despite the path.
    #[test]
    fn test_semantic_chunking_disabled() {
        let mut chunker = Chunker::new(100, 20);
        chunker.set_semantic_chunking(false);
        let content = "fn test() { }\nfn other() { }";
        let chunks = chunker.chunk_with_path(content, Some("test.rs"));
        for chunk in &chunks {
            assert_eq!(chunk.chunk_type, ChunkType::Unknown);
        }
    }

    /// `ChunkType::as_str` returns the lowercase variant name.
    #[test]
    fn test_chunk_type_as_str() {
        assert_eq!(ChunkType::Function.as_str(), "function");
        assert_eq!(ChunkType::Class.as_str(), "class");
        assert_eq!(ChunkType::Unknown.as_str(), "unknown");
    }
}