use super::document::chunk_document;
use super::parsers::{parse_with_cached, ParserKind};
use super::types::{chunk_text, RawChunk};
use crate::core::chunker::walk::{build_line_offsets, split_oversized, walk_for_chunks};
use crate::core::entity::{extract_entities, RawEntity};
/// Resolves a file path to the language label and parser used for AST chunking.
///
/// Only the file extension is consulted, and it is compared after ASCII
/// lowercasing, so `Foo.RS` and `foo.rs` resolve identically.
///
/// Returns `None` when the path has no extension or the extension is not one
/// of those listed in the table below.
fn language_for(file: &str) -> Option<(&'static str, ParserKind)> {
    // A missing extension can never match an arm, so bail out right away.
    let extension = std::path::Path::new(file)
        .extension()
        .and_then(|os_ext| os_ext.to_str())?
        .to_ascii_lowercase();

    match extension.as_str() {
        "rs" => Some(("rust", ParserKind::Rust)),
        "py" => Some(("python", ParserKind::Python)),
        "js" | "mjs" | "cjs" | "jsx" => Some(("javascript", ParserKind::Javascript)),
        // `.ts` and `.tsx` share a language label but use different parser kinds.
        "ts" => Some(("typescript", ParserKind::Typescript)),
        "tsx" => Some(("typescript", ParserKind::Tsx)),
        "go" => Some(("go", ParserKind::Go)),
        "java" => Some(("java", ParserKind::Java)),
        // `.h` headers are treated as C, not C++.
        "c" | "h" => Some(("c", ParserKind::C)),
        "cpp" | "cc" | "cxx" | "hpp" | "hh" | "hxx" => Some(("cpp", ParserKind::Cpp)),
        "rb" => Some(("ruby", ParserKind::Ruby)),
        "php" => Some(("php", ParserKind::Php)),
        "scala" => Some(("scala", ParserKind::Scala)),
        "cs" => Some(("csharp", ParserKind::Csharp)),
        "kt" | "kts" => Some(("kotlin", ParserKind::Kotlin)),
        "swift" => Some(("swift", ParserKind::Swift)),
        _ => None,
    }
}
/// Splits `content` into chunks and extracts entities, using a syntax tree
/// when the file's language is recognised.
///
/// Returns `(chunks, entities)`. The paths, in order:
///
/// 1. `language_for(file)` yields nothing: the chunks come from
///    `chunk_document` if it produces any, otherwise from
///    `chunk_text(file, content, 150, 50)`. No entities are returned.
/// 2. `parse_with_cached` yields no tree: same `chunk_text` fallback, no
///    entities.
/// 3. Otherwise the tree is walked with `walk_for_chunks`. If that collects
///    nothing, a single generic chunk spanning line 1 through the last line
///    and tagged with the language is used instead. The chunks then go through
///    `split_oversized`, and entities come from `extract_entities`.
pub fn chunk_ast(file: &str, content: &str) -> (Vec<RawChunk>, Vec<RawEntity>) {
    // Shared plain-text fallback. The 150/50 arguments are presumably a window
    // size and an overlap -- confirm against `chunk_text`.
    let plain_text = || chunk_text(file, content, 150, 50);

    let (lang, kind) = match language_for(file) {
        Some(known) => known,
        None => {
            let chunks = chunk_document(file, content).unwrap_or_else(|| plain_text());
            return (chunks, Vec::new());
        }
    };

    let src = content.as_bytes();
    let tree = match parse_with_cached(kind, src) {
        Some(parsed) => parsed,
        None => return (plain_text(), Vec::new()),
    };

    let line_offsets = build_line_offsets(src);
    let mut collected: Vec<RawChunk> = Vec::new();
    walk_for_chunks(
        tree.root_node(),
        src,
        file,
        lang,
        &line_offsets,
        0,
        &mut collected,
    );

    if collected.is_empty() {
        // The walk produced nothing, so represent the whole file as one chunk.
        // `max(1)` keeps the end line at 1 even when `content` is empty.
        let total_lines = content.lines().count().max(1);
        let mut whole_file = RawChunk::generic(
            format!("{file}:1:{total_lines}"),
            file.to_string(),
            1,
            total_lines,
            content.to_string(),
        );
        whole_file.language = Some(lang.to_string());
        collected.push(whole_file);
    }

    let chunks = split_oversized(collected);
    let entities = extract_entities(&tree, src, file, lang);
    (chunks, entities)
}