use crate::chunker::{Chunk, ChunkerImpl};
use crate::codeparse::{code_symbol_node_id, Symbol};
use crate::config::SymbolAwareChunkerConfig;
use crate::sources::Document;
use serde_json::json;
/// Chunker that emits one chunk per top-level code symbol of a document and
/// delegates to sentence-aware chunking when symbol extraction is not possible.
pub struct SymbolAwareChunker {
/// Chunker settings; `project_id` is read when building symbol node ids.
cfg: SymbolAwareChunkerConfig,
}
impl SymbolAwareChunker {
pub fn new(cfg: SymbolAwareChunkerConfig) -> Self {
Self { cfg }
}
pub fn chunk(&self, doc: &Document) -> Vec<Chunk> {
let Some(language) = detect_language(doc) else {
return self.fallback(doc, "language_undetected");
};
let content = doc.content.as_str();
#[cfg(feature = "code-aware-python")]
if language == "python"
&& crate::codeparse::langs::python::has_syntax_errors(content)
{
return self.fallback(doc, "python_syntax_error");
}
let file_path = doc_file_path(doc);
let symbols = extract_symbols_for_language(&language, &file_path, content);
if symbols.is_empty() {
return self.fallback(doc, "no_symbols");
}
let project_id = self
.cfg
.project_id
.clone()
.unwrap_or_else(|| "default".to_string());
let top_level: Vec<&Symbol> = symbols.iter().filter(|s| s.parent_name.is_none()).collect();
if top_level.is_empty() {
return self.fallback(doc, "no_symbols");
}
let mut chunks: Vec<Chunk> = Vec::with_capacity(top_level.len());
let lines: Vec<&str> = content.lines().collect();
for (seq_idx, sym) in top_level.iter().enumerate() {
let source_slice = slice_lines(&lines, sym.line_start, sym.line_end);
let node_id = code_symbol_node_id(&project_id, &language, &file_path, &sym.fqn);
let metadata = json!({
"strategy": "symbol_aware",
"fqn": sym.fqn,
"node_id": node_id,
"language": language,
"parent_name": sym.parent_name,
"symbol_name": sym.name,
"symbol_type": sym.symbol_type,
"line_start": sym.line_start,
"line_end": sym.line_end,
});
chunks.push(Chunk {
doc_id: doc.id.clone(),
seq_num: seq_idx,
original_content: source_slice.clone(),
embedded_content: source_slice,
metadata,
});
}
chunks
}
fn fallback(&self, doc: &Document, reason: &str) -> Vec<Chunk> {
use crate::chunker::SentenceAwareChunker;
use crate::config::SentenceAwareChunkerConfig;
let inner = SentenceAwareChunker::new(SentenceAwareChunkerConfig {
doc_type: "prose".to_string(),
max_chars: 2000,
min_chars: 200,
if_oversize: None,
});
let mut chunks = inner.chunk(doc);
for c in &mut chunks {
if let Some(obj) = c.metadata.as_object_mut() {
obj.insert("strategy".to_string(), json!("symbol_aware_fallback"));
obj.insert("fallback_reason".to_string(), json!(reason));
}
}
chunks
}
}
impl ChunkerImpl for SymbolAwareChunker {
fn chunk(&self, doc: &Document) -> Vec<Chunk> {
Self::chunk(self, doc)
}
}
fn detect_language(doc: &Document) -> Option<String> {
for key in ["path", "source_path"] {
if let Some(val) = doc.metadata.get(key).and_then(|v| v.as_str()) {
if let Some(lang) = lang_from_extension(val) {
return Some(lang);
}
}
}
lang_from_extension(&doc.id)
}
fn doc_file_path(doc: &Document) -> String {
for key in ["path", "source_path"] {
if let Some(val) = doc.metadata.get(key).and_then(|v| v.as_str()) {
if !val.is_empty() {
return val.to_string();
}
}
}
doc.id.clone()
}
/// Maps a path's file extension to the name of a supported language.
///
/// Returns `Some("python")` for `py` and `Some("java")` for `java`. The
/// comparison is ASCII case-insensitive, so `SCRIPT.PY` and `Main.Java` are
/// recognized too. Returns `None` when the path has no extension, the
/// extension is not valid UTF-8, or the extension is not a supported one.
fn lang_from_extension(path: &str) -> Option<String> {
    let ext = std::path::Path::new(path).extension()?.to_str()?;
    // Normalize case first: extensions routinely differ only in case on
    // case-insensitive file systems.
    let language = match ext.to_ascii_lowercase().as_str() {
        "py" => "python",
        "java" => "java",
        _ => return None,
    };
    Some(language.to_string())
}
/// Runs the symbol extractor for `language` over `source`.
///
/// Python and Java extraction are only compiled in when the
/// `code-aware-python` / `code-aware-java` features are enabled. Any other
/// language, or a language whose feature is disabled, yields an empty list.
fn extract_symbols_for_language(language: &str, file_path: &str, source: &str) -> Vec<Symbol> {
    #[cfg(feature = "code-aware-python")]
    if language == "python" {
        return crate::codeparse::langs::python::extract_symbols(file_path, source);
    }

    #[cfg(feature = "code-aware-java")]
    if language == "java" {
        return crate::codeparse::langs::java::extract_symbols(file_path, source);
    }

    // Keeps the parameters "used" when every language feature is disabled.
    let _ = (language, file_path, source);
    Vec::new()
}
/// Joins the lines in the 1-based inclusive range `start_1based..=end_1based`
/// with `"\n"`.
///
/// A start of `0` is treated like `1`, and the end is clamped to the number
/// of available lines. An empty or inverted range yields an empty string.
fn slice_lines(lines: &[&str], start_1based: u32, end_1based: u32) -> String {
    let first = start_1based.saturating_sub(1) as usize;
    let past_last = lines.len().min(end_1based as usize);
    // `get` returns `None` for an inverted range; an empty range joins to "".
    match lines.get(first..past_last) {
        Some(window) => window.join("\n"),
        None => String::new(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a document with empty metadata, so language detection has to
    /// rely on the extension of `id`.
    fn make_doc(id: &str, content: &str) -> Document {
        Document {
            id: id.to_string(),
            content: content.to_string(),
            title: None,
            metadata: serde_json::json!({}),
            fingerprint: None,
        }
    }

    /// Reads a string metadata field from a chunk, or `""` when the field is
    /// absent or not a string.
    fn meta_str<'a>(chunk: &'a Chunk, key: &str) -> &'a str {
        chunk
            .metadata
            .get(key)
            .and_then(|value| value.as_str())
            .unwrap_or("")
    }

    /// Asserts that the first chunk is tagged as a fallback with `expected_reason`.
    fn assert_fallback(chunks: &[Chunk], expected_reason: &str) {
        assert_eq!(meta_str(&chunks[0], "strategy"), "symbol_aware_fallback");
        assert_eq!(meta_str(&chunks[0], "fallback_reason"), expected_reason);
    }

    #[cfg(feature = "code-aware-python")]
    #[test]
    fn chunks_python_at_symbol_boundaries() {
        // The method body must be indented deeper than its `def` line;
        // otherwise the fixture is not valid Python and would be routed to
        // the syntax-error fallback instead of symbol chunking.
        let doc = make_doc(
            "test.py",
            "def hello():\n    pass\n\nclass Foo:\n    def bar(self):\n        pass\n",
        );
        let chunker = SymbolAwareChunker::new(SymbolAwareChunkerConfig::default());
        let chunks = chunker.chunk(&doc);

        // Only `hello` and `Foo` are top-level; `bar` is nested under `Foo`.
        assert_eq!(
            chunks.len(),
            2,
            "expected 2 top-level symbols, got {}: {:?}",
            chunks.len(),
            chunks
                .iter()
                .map(|c| c.metadata.get("symbol_name").cloned())
                .collect::<Vec<_>>()
        );

        let names: Vec<&str> = chunks
            .iter()
            .filter_map(|c| c.metadata.get("symbol_name").and_then(|v| v.as_str()))
            .collect();
        assert!(names.contains(&"hello"), "missing hello: {:?}", names);
        assert!(names.contains(&"Foo"), "missing Foo: {:?}", names);

        for c in &chunks {
            assert_eq!(
                c.metadata.get("strategy").and_then(|s| s.as_str()),
                Some("symbol_aware")
            );
            assert!(c.metadata.get("fqn").and_then(|s| s.as_str()).is_some());
            assert!(c.metadata.get("node_id").and_then(|s| s.as_str()).is_some());
        }
    }

    #[test]
    fn unknown_language_falls_back_to_sentence_aware() {
        let doc = make_doc("mystery.xyz", "some random content here that should still chunk\n");
        let chunker = SymbolAwareChunker::new(SymbolAwareChunkerConfig::default());
        let chunks = chunker.chunk(&doc);

        assert!(
            !chunks.is_empty(),
            "unknown language should still produce chunks via sentence_aware"
        );
        assert_fallback(&chunks, "language_undetected");
    }

    #[cfg(feature = "code-aware-python")]
    #[test]
    fn python_syntax_error_falls_back() {
        let doc = make_doc(
            "broken.py",
            "def hello(\n # missing close paren\n",
        );
        let chunker = SymbolAwareChunker::new(SymbolAwareChunkerConfig::default());
        let chunks = chunker.chunk(&doc);

        assert!(!chunks.is_empty(), "syntax error should fall back, not produce empty");
        assert_fallback(&chunks, "python_syntax_error");
    }
}