use std::collections::HashMap;
/// Settings that control how a file is split into chunks and which marker
/// lines are prepended to each chunk's text.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Upper bound on the number of whitespace-separated tokens placed in a
    /// single chunk.
    pub max_tokens: usize,

    /// When `true`, every chunk's text starts with a `// LANGUAGE: <name>` line.
    pub include_language_markers: bool,

    /// When `true` and cross-language references were supplied, every chunk's
    /// text carries one `// REFERENCES[<lang>]: ...` line per referenced language.
    pub include_cross_file_refs: bool,

    /// Requested number of tokens to share between neighbouring chunks.
    pub chunk_overlap: usize,
}
impl Default for ChunkConfig {
fn default() -> Self {
Self {
max_tokens: 512,
include_language_markers: true,
include_cross_file_refs: true,
chunk_overlap: 64,
}
}
}
/// One piece of a chunked file, together with the metadata needed to relate
/// it back to the file it came from.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Chunk text, including any marker lines that were prepended to it.
    pub content: String,

    /// Language name the file was chunked as.
    pub language: String,

    /// Path of the file this chunk belongs to.
    pub file_path: String,

    /// `(start, end)` byte offsets recorded for this chunk by its producer.
    pub byte_range: (usize, usize),

    /// Cross-language references attached to the file, keyed by the
    /// referenced language; each value lists the associated symbol labels.
    pub cross_lang_refs: HashMap<String, Vec<String>>,

    /// Zero-based position of this chunk within its file.
    pub chunk_index: usize,

    /// Number of chunks the file was split into.
    pub total_chunks: usize,
}
/// Splits file content into [`SemanticChunk`]s according to a [`ChunkConfig`].
pub struct CrossLanguageChunker {
    /// Settings applied to every file this chunker processes.
    config: ChunkConfig,
}
impl CrossLanguageChunker {
    /// Creates a chunker that uses the given configuration.
    pub fn new(config: ChunkConfig) -> Self {
        Self { config }
    }

    /// Creates a chunker that uses [`ChunkConfig::default`].
    // Kept as an inherent method so existing `CrossLanguageChunker::default()`
    // call sites resolve exactly as before; it simply delegates to `new`.
    #[allow(clippy::should_implement_trait)]
    pub fn default() -> Self {
        Self::new(ChunkConfig::default())
    }

    /// Splits `content` into chunks of at most `max_tokens` whitespace-separated
    /// tokens, with consecutive chunks sharing `chunk_overlap` tokens.
    ///
    /// Every chunk's text starts with the configured marker lines (language
    /// marker, then one `REFERENCES` line per referenced language, sorted by
    /// language name so the output is deterministic).
    ///
    /// * If the file fits in one chunk, the chunk holds `content` unchanged and
    ///   its `byte_range` is `(0, content.len())`.
    /// * Otherwise each chunk additionally gets a `// CHUNK i/n` line, holds the
    ///   original text from its first token to its last token (inner whitespace
    ///   and newlines preserved), and `byte_range` gives that span's byte
    ///   offsets in `content`.
    ///
    /// Configuration values that would make windowing impossible are clamped:
    /// `max_tokens == 0` is treated as `1`, and `chunk_overlap` is limited to
    /// `max_tokens - 1` so each window advances by at least one token.
    ///
    /// `cross_lang_refs` is stored on every returned chunk.
    pub fn chunk_file(
        &self,
        content: &str,
        language: &str,
        file_path: &str,
        cross_lang_refs: HashMap<String, Vec<String>>,
    ) -> Vec<SemanticChunk> {
        let header = self.build_header(language, &cross_lang_refs);

        // Clamp so a zero-sized window can never occur and the stride is >= 1.
        let window = self.config.max_tokens.max(1);
        let overlap = self.config.chunk_overlap.min(window - 1);
        let stride = window - overlap;

        let spans = Self::token_spans(content);

        if spans.len() <= window {
            return vec![SemanticChunk {
                content: format!("{}{}", header, content),
                language: language.to_string(),
                file_path: file_path.to_string(),
                byte_range: (0, content.len()),
                cross_lang_refs,
                chunk_index: 0,
                total_chunks: 1,
            }];
        }

        // Token-index windows `[first, last)`; the final window is the first one
        // that reaches the end of the token list.
        let mut windows: Vec<(usize, usize)> = Vec::new();
        let mut first = 0;
        loop {
            let last = (first + window).min(spans.len());
            windows.push((first, last));
            if last == spans.len() {
                break;
            }
            first += stride;
        }

        // Counting the windows directly avoids any floating-point rounding.
        let total_chunks = windows.len();

        windows
            .into_iter()
            .enumerate()
            .map(|(chunk_index, (first, last))| {
                // Every window is non-empty and within `spans`, so both indices
                // are valid; the offsets come from `char_indices` and therefore
                // lie on character boundaries.
                let byte_start = spans[first].0;
                let byte_end = spans[last - 1].1;
                SemanticChunk {
                    content: format!(
                        "{}// CHUNK {}/{}\n{}",
                        header,
                        chunk_index + 1,
                        total_chunks,
                        &content[byte_start..byte_end]
                    ),
                    language: language.to_string(),
                    file_path: file_path.to_string(),
                    byte_range: (byte_start, byte_end),
                    cross_lang_refs: cross_lang_refs.clone(),
                    chunk_index,
                    total_chunks,
                }
            })
            .collect()
    }

    /// Scans `content` for fixed substrings that hint at embedded code in
    /// another language and returns the hits keyed by that language.
    ///
    /// This is a plain substring heuristic, not a parser:
    ///
    /// * `"python"`: `SELECT` / `INSERT` / `UPDATE` records `sql` →
    ///   `database_query`; `render_template` / `<html` records `html` → `template`.
    /// * `"javascript"` / `"typescript"`: `SELECT` / `query` records `sql` →
    ///   `database_query`; `<div` / `ReactDOM` records `html` → `ui_component`.
    /// * Any other language yields an empty map.
    pub fn extract_cross_lang_references(
        &self,
        content: &str,
        language: &str,
    ) -> HashMap<String, Vec<String>> {
        let mut refs: HashMap<String, Vec<String>> = HashMap::new();
        match language {
            "python" => {
                if Self::contains_any(content, &["SELECT", "INSERT", "UPDATE"]) {
                    Self::record(&mut refs, "sql", "database_query");
                }
                if Self::contains_any(content, &["render_template", "<html"]) {
                    Self::record(&mut refs, "html", "template");
                }
            }
            "javascript" | "typescript" => {
                if Self::contains_any(content, &["SELECT", "query"]) {
                    Self::record(&mut refs, "sql", "database_query");
                }
                if Self::contains_any(content, &["<div", "ReactDOM"]) {
                    Self::record(&mut refs, "html", "ui_component");
                }
            }
            _ => {}
        }
        refs
    }

    /// Builds the marker lines that are prepended to every chunk's text.
    fn build_header(
        &self,
        language: &str,
        cross_lang_refs: &HashMap<String, Vec<String>>,
    ) -> String {
        let mut header = String::new();
        if self.config.include_language_markers {
            header.push_str(&format!("// LANGUAGE: {}\n", language));
        }
        if self.config.include_cross_file_refs && !cross_lang_refs.is_empty() {
            // HashMap iteration order is unspecified; sort so identical input
            // always yields identical chunk text.
            let mut entries: Vec<(&String, &Vec<String>)> = cross_lang_refs.iter().collect();
            entries.sort_unstable_by(|a, b| a.0.cmp(b.0));
            for (lang, symbols) in entries {
                header.push_str(&format!(
                    "// REFERENCES[{}]: {}\n",
                    lang,
                    symbols.join(", ")
                ));
            }
        }
        header
    }

    /// Returns the `(start, end)` byte span of every whitespace-separated token.
    fn token_spans(content: &str) -> Vec<(usize, usize)> {
        let mut spans = Vec::new();
        let mut token_start: Option<usize> = None;
        for (offset, ch) in content.char_indices() {
            if ch.is_whitespace() {
                if let Some(start) = token_start.take() {
                    spans.push((start, offset));
                }
            } else if token_start.is_none() {
                token_start = Some(offset);
            }
        }
        if let Some(start) = token_start {
            spans.push((start, content.len()));
        }
        spans
    }

    /// Returns `true` when `content` contains at least one of `markers`.
    fn contains_any(content: &str, markers: &[&str]) -> bool {
        markers.iter().any(|marker| content.contains(*marker))
    }

    /// Appends `symbol` to the list stored under `target_lang`.
    fn record(refs: &mut HashMap<String, Vec<String>>, target_lang: &str, symbol: &str) {
        refs.entry(target_lang.to_string())
            .or_default()
            .push(symbol.to_string());
    }
}
impl Default for CrossLanguageChunker {
    /// Builds a chunker from the default [`ChunkConfig`].
    fn default() -> Self {
        let config = ChunkConfig::default();
        Self::new(config)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A default chunker carries the default token budget.
    #[test]
    fn test_chunker_creation() {
        let subject = CrossLanguageChunker::default();
        assert_eq!(subject.config.max_tokens, 512);
    }

    /// Input below the token budget yields one chunk tagged with its language.
    #[test]
    fn test_single_chunk_small_file() {
        let subject = CrossLanguageChunker::default();
        let source = "fn hello() { println!(\"world\"); }";

        let produced = subject.chunk_file(source, "rust", "test.rs", HashMap::new());

        assert_eq!(produced.len(), 1);
        let only = &produced[0];
        assert!(only.content.contains("LANGUAGE: rust"));
    }

    /// An SQL keyword inside Python source is reported as an `sql` reference.
    #[test]
    fn test_cross_lang_reference_extraction_python() {
        let subject = CrossLanguageChunker::default();
        let source = "def query_user(): return execute(\"SELECT * FROM users\")";

        let found = subject.extract_cross_lang_references(source, "python");

        assert!(found.contains_key("sql"));
    }

    /// A `<div` tag inside JavaScript source is reported as an `html` reference.
    #[test]
    fn test_cross_lang_reference_extraction_javascript() {
        let subject = CrossLanguageChunker::default();
        let source = "const App = () => { return <div>Hello</div>; }";

        let found = subject.extract_cross_lang_references(source, "javascript");

        assert!(found.contains_key("html"));
    }
}