mod classify;
mod inherits;
mod walk;
use std::cell::RefCell;
use serde::{Deserialize, Serialize};
use tree_sitter::{Language, Parser, Tree};
use crate::core::entity::{extract_entities, RawEntity};
use self::walk::{build_line_offsets, split_oversized, walk_for_chunks};
// Declares one `thread_local!` static per `NAME => language-expression` pair.
//
// Each static holds a `RefCell<Parser>` whose parser has the given grammar
// installed via `set_language`. The language expression is converted with
// `.into()` into a `tree_sitter::Language` first. Construction panics with
// "tree-sitter grammar load" if `set_language` returns an error.
macro_rules! ts_parser_thread_locals {
($($name:ident => $lang_expr:expr),* $(,)?) => {
$(
thread_local! {
static $name: RefCell<Parser> = RefCell::new({
let mut p = Parser::new();
let lang: Language = $lang_expr.into();
p.set_language(&lang).expect("tree-sitter grammar load");
p
});
}
)*
};
}
// One cached parser per supported grammar. Every entry here has a matching
// `ParserKind` variant and a matching arm in `parse_with_cached`.
ts_parser_thread_locals! {
PARSER_RUST => tree_sitter_rust::LANGUAGE,
PARSER_PYTHON => tree_sitter_python::LANGUAGE,
PARSER_JAVASCRIPT => tree_sitter_javascript::LANGUAGE,
PARSER_TYPESCRIPT => tree_sitter_typescript::LANGUAGE_TYPESCRIPT,
PARSER_TSX => tree_sitter_typescript::LANGUAGE_TSX,
PARSER_GO => tree_sitter_go::LANGUAGE,
PARSER_JAVA => tree_sitter_java::LANGUAGE,
PARSER_C => tree_sitter_c::LANGUAGE,
PARSER_CPP => tree_sitter_cpp::LANGUAGE,
PARSER_RUBY => tree_sitter_ruby::LANGUAGE,
PARSER_PHP => tree_sitter_php::LANGUAGE_PHP,
PARSER_SCALA => tree_sitter_scala::LANGUAGE,
PARSER_CSHARP => tree_sitter_c_sharp::LANGUAGE,
PARSER_KOTLIN => tree_sitter_kotlin_ng::LANGUAGE,
PARSER_SWIFT => tree_sitter_swift::LANGUAGE,
}
/// Selects which thread-local tree-sitter parser `parse_with_cached` uses.
///
/// `language_for` maps file extensions to these variants. TypeScript and TSX
/// are separate variants because they use different grammars, even though
/// `language_for` reports both as "typescript".
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParserKind {
Rust,
Python,
Javascript,
Typescript,
Tsx,
Go,
Java,
C,
Cpp,
Ruby,
Php,
Scala,
Csharp,
Kotlin,
Swift,
}
fn parse_with_cached(kind: ParserKind, src: &[u8]) -> Option<Tree> {
match kind {
ParserKind::Rust => PARSER_RUST.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Python => PARSER_PYTHON.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Javascript => PARSER_JAVASCRIPT.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Typescript => PARSER_TYPESCRIPT.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Tsx => PARSER_TSX.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Go => PARSER_GO.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Java => PARSER_JAVA.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::C => PARSER_C.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Cpp => PARSER_CPP.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Ruby => PARSER_RUBY.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Php => PARSER_PHP.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Scala => PARSER_SCALA.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Csharp => PARSER_CSHARP.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Kotlin => PARSER_KOTLIN.with(|p| p.borrow_mut().parse(src, None)),
ParserKind::Swift => PARSER_SWIFT.with(|p| p.borrow_mut().parse(src, None)),
}
}
/// Classification attached to every `RawChunk`.
///
/// Within this file the document chunkers use `Docstring` for markdown
/// sections, `Constant` for YAML/TOML/JSON sections, `Class` for XML child
/// elements and `Code` for plain-text paragraphs and sliding-window chunks.
/// The remaining variants are assigned by the AST walker in the `walk`
/// submodule (not visible here).
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub enum ChunkType {
// Value produced by `ChunkType::default()`.
#[default]
Unknown,
Function,
Method,
Class,
Struct,
Impl,
Module,
Trait,
Enum,
Test,
Constant,
TypeAlias,
Docstring,
FreeCode,
Code,
}
impl ChunkType {
fn as_str(&self) -> &'static str {
match self {
Self::Unknown => "Unknown",
Self::Function => "Function",
Self::Method => "Method",
Self::Class => "Class",
Self::Struct => "Struct",
Self::Impl => "Impl",
Self::Module => "Module",
Self::Trait => "Trait",
Self::Enum => "Enum",
Self::Test => "Test",
Self::Constant => "Constant",
Self::TypeAlias => "TypeAlias",
Self::Docstring => "Docstring",
Self::FreeCode => "FreeCode",
Self::Code => "Code",
}
}
}
/// One contiguous slice of a source file plus the metadata extracted for it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawChunk {
/// Chunk identifier. The chunkers in this file use `{file}:{start}:{end}`
/// for unnamed chunks and `{file}::{type}::{name}::{start_line}` for named
/// document chunks.
pub id: String,
/// File the chunk was cut from, as passed in by the caller.
pub file: String,
/// 1-based first line of the chunk.
pub start_line: usize,
/// 1-based last line of the chunk (inclusive).
pub end_line: usize,
/// Text of the chunk.
pub content: String,
/// Name attached to the chunk, if any: a heading, key, table or tag name for
/// document chunks; a symbol name such as `Foo::bar` for code chunks.
pub function_name: Option<String>,
/// Language label such as "rust" or "markdown"; `None` for sliding-window
/// text chunks.
pub language: Option<String>,
/// Classification of the chunk.
pub chunk_type: ChunkType,
/// Names called from within the chunk; filled by the AST walker, empty for
/// document and text chunks.
pub calls: Vec<String>,
/// Parent type names (extends / implements / with); empty for document and
/// text chunks.
pub inherits_from: Vec<String>,
/// Always 0 for chunks built in this file.
/// NOTE(review): presumably the nesting depth assigned by the walker — confirm.
pub chunk_depth: usize,
/// Id of the chunk this one was split from, if any.
pub parent_chunk_id: Option<String>,
/// Ids of the sub-chunks produced when this chunk was split.
pub child_chunk_ids: Vec<String>,
/// Keywords taken from documentation comments (see the `nlp_keywords` test).
pub nlp_keywords: Vec<String>,
/// Code identifiers referenced in documentation comments, e.g. names in
/// backticks (see the `nlp_code_refs` test).
pub nlp_code_refs: Vec<String>,
/// Defaults to empty when the field is missing from serialized data.
/// NOTE(review): meaning is not visible in this file — confirm with the consumer.
#[serde(default)]
pub virtual_terms: Vec<String>,
}
impl RawChunk {
fn generic(
id: String,
file: String,
start_line: usize,
end_line: usize,
content: String,
) -> Self {
Self {
id,
file,
start_line,
end_line,
content,
function_name: None,
language: None,
chunk_type: ChunkType::Code,
calls: Vec::new(),
inherits_from: Vec::new(),
chunk_depth: 0,
parent_chunk_id: None,
child_chunk_ids: Vec::new(),
nlp_keywords: Vec::new(),
nlp_code_refs: Vec::new(),
virtual_terms: Vec::new(),
}
}
}
/// Splits `content` into overlapping line windows.
///
/// Each chunk covers up to `window` lines and consecutive chunks start
/// `stride` lines apart. The last chunk always ends on the final line. Chunk
/// ids have the form `{file}:{start}:{end}` with 1-based, inclusive line
/// numbers. Empty `content` yields no chunks.
///
/// A `window` or `stride` of zero is treated as one. Without this, a zero
/// stride would never advance and the loop would run forever, and a zero
/// window would emit only empty chunks.
pub fn chunk_text(file: &str, content: &str, window: usize, stride: usize) -> Vec<RawChunk> {
    let window = window.max(1);
    let stride = stride.max(1);
    let lines: Vec<&str> = content.lines().collect();
    let mut chunks = Vec::new();
    let mut start = 0usize;
    while start < lines.len() {
        // Saturating add: a huge `window` simply means "to the end of the file".
        let end = start.saturating_add(window).min(lines.len());
        let text = lines[start..end].join("\n");
        chunks.push(RawChunk::generic(
            format!("{}:{}:{}", file, start + 1, end),
            file.to_string(),
            start + 1,
            end,
            text,
        ));
        if end == lines.len() {
            break;
        }
        start = start.saturating_add(stride);
    }
    chunks
}
/// Maps a file's extension (case-insensitively) to a language label and the
/// parser to use for it.
///
/// Returns `None` when the file has no extension or the extension is not one
/// of the supported source languages.
fn language_for(file: &str) -> Option<(&'static str, ParserKind)> {
    let ext = std::path::Path::new(file)
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();
    match ext.as_str() {
        "rs" => Some(("rust", ParserKind::Rust)),
        "py" => Some(("python", ParserKind::Python)),
        "js" | "mjs" | "cjs" | "jsx" => Some(("javascript", ParserKind::Javascript)),
        "ts" => Some(("typescript", ParserKind::Typescript)),
        // TSX keeps the "typescript" label but needs its own grammar.
        "tsx" => Some(("typescript", ParserKind::Tsx)),
        "go" => Some(("go", ParserKind::Go)),
        "java" => Some(("java", ParserKind::Java)),
        "c" | "h" => Some(("c", ParserKind::C)),
        "cpp" | "cc" | "cxx" | "hpp" | "hh" | "hxx" => Some(("cpp", ParserKind::Cpp)),
        "rb" => Some(("ruby", ParserKind::Ruby)),
        "php" => Some(("php", ParserKind::Php)),
        "scala" => Some(("scala", ParserKind::Scala)),
        "cs" => Some(("csharp", ParserKind::Csharp)),
        "kt" | "kts" => Some(("kotlin", ParserKind::Kotlin)),
        "swift" => Some(("swift", ParserKind::Swift)),
        _ => None,
    }
}
// JSON files with this many lines or more produce no chunks (see `chunk_json`).
const JSON_MAX_LINES: usize = 500;
// Upper bound on the number of lines in one plain-text chunk; longer
// paragraphs are cut into consecutive pieces (see `chunk_plaintext`).
const PLAINTEXT_MAX_LINES: usize = 50;
/// Chunks a non-source document according to its file extension.
///
/// Handles markdown (`md`, `mdx`), YAML (`yaml`, `yml`), TOML, JSON, plain
/// text (`txt`, `log`) and XML; the extension is compared case-insensitively.
/// Returns `None` for any other extension so the caller can fall back to
/// another strategy.
pub fn chunk_document(file: &str, content: &str) -> Option<Vec<RawChunk>> {
    let ext = std::path::Path::new(file)
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();
    match ext.as_str() {
        "md" | "mdx" => Some(chunk_markdown(file, content)),
        "yaml" | "yml" => Some(chunk_yaml(file, content)),
        "toml" => Some(chunk_toml(file, content)),
        // `chunk_json` already returns an `Option`; pass it through unchanged.
        "json" => chunk_json(file, content),
        "txt" | "log" => Some(chunk_plaintext(file, content)),
        "xml" => Some(chunk_xml(file, content)),
        _ => None,
    }
}
/// Builds a `RawChunk` for a section of a document file.
///
/// The id is `{file}::{chunk_type}::{name}::{start_line}` when a non-empty
/// `function_name` is given and `{file}:{start_line}:{end_line}` otherwise.
/// Call, inheritance, hierarchy and NLP fields are left empty.
fn document_chunk(
    file: &str,
    start_line: usize,
    end_line: usize,
    content: String,
    function_name: Option<String>,
    language: &str,
    chunk_type: ChunkType,
) -> RawChunk {
    let id = function_name
        .as_deref()
        .filter(|name| !name.is_empty())
        .map_or_else(
            || format!("{file}:{start_line}:{end_line}"),
            |name| format!("{file}::{}::{name}::{start_line}", chunk_type.as_str()),
        );
    RawChunk {
        id,
        file: file.to_string(),
        start_line,
        end_line,
        content,
        function_name,
        language: Some(language.to_string()),
        chunk_type,
        chunk_depth: 0,
        parent_chunk_id: None,
        child_chunk_ids: Vec::new(),
        calls: Vec::new(),
        inherits_from: Vec::new(),
        nlp_keywords: Vec::new(),
        nlp_code_refs: Vec::new(),
        virtual_terms: Vec::new(),
    }
}
/// Splits markdown into one `Docstring` chunk per heading-delimited section.
///
/// A section starts at an ATX heading line (one to six `#` characters followed
/// by a space, a tab or the end of the line) and runs up to the next heading.
/// Text before the first heading becomes an unnamed chunk. The heading text
/// (without the leading `#`s) is stored as the chunk's `function_name`.
///
/// Lines inside fenced code blocks are never treated as headings. A fence
/// opened with backticks is closed only by a backtick fence, and a fence
/// opened with tildes only by a tilde fence.
///
/// Whitespace-only sections are dropped. If that leaves nothing, the whole
/// file is returned as a single unnamed chunk. Empty input yields no chunks.
fn chunk_markdown(file: &str, content: &str) -> Vec<RawChunk> {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }
    let mut out: Vec<RawChunk> = Vec::new();
    let mut section_start = 0usize;
    let mut section_heading: Option<String> = None;
    // `Some(marker)` while inside a fenced block opened with that character.
    let mut open_fence: Option<char> = None;
    // Emits lines[start..end] as one chunk unless the range is empty or blank.
    let flush = |out: &mut Vec<RawChunk>,
                 start: usize,
                 end: usize,
                 heading: &Option<String>,
                 lines: &[&str]| {
        if start >= end {
            return;
        }
        let text = lines[start..end].join("\n");
        if text.trim().is_empty() {
            return;
        }
        out.push(document_chunk(
            file,
            start + 1,
            end,
            text,
            heading.clone(),
            "markdown",
            ChunkType::Docstring,
        ));
    };
    for (i, line) in lines.iter().enumerate() {
        let trimmed = line.trim_start();
        let fence_marker = if trimmed.starts_with("```") {
            Some('`')
        } else if trimmed.starts_with("~~~") {
            Some('~')
        } else {
            None
        };
        if let Some(marker) = fence_marker {
            match open_fence {
                None => open_fence = Some(marker),
                Some(opened_with) if opened_with == marker => open_fence = None,
                // A different fence character inside an open block is content.
                Some(_) => {}
            }
            continue;
        }
        if open_fence.is_some() {
            continue;
        }
        let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
        let after_hashes = &trimmed[hashes..];
        let is_heading = (1..=6).contains(&hashes)
            && (after_hashes.is_empty()
                || after_hashes.starts_with(|c: char| c == ' ' || c == '\t'));
        if is_heading {
            flush(&mut out, section_start, i, &section_heading, &lines);
            let heading = after_hashes.trim().to_string();
            section_heading = if heading.is_empty() {
                None
            } else {
                Some(heading)
            };
            section_start = i;
        }
    }
    flush(
        &mut out,
        section_start,
        lines.len(),
        &section_heading,
        &lines,
    );
    if out.is_empty() {
        out.push(document_chunk(
            file,
            1,
            lines.len(),
            content.to_string(),
            None,
            "markdown",
            ChunkType::Docstring,
        ));
    }
    out
}
/// Chunks YAML by top-level key.
///
/// A line starts a new section when it begins in column 0 with something
/// other than whitespace or `-`, is not a comment, and has a non-empty,
/// space-free key before its first `:`.
fn chunk_yaml(file: &str, content: &str) -> Vec<RawChunk> {
    chunk_by_top_level_key(file, content, "yaml", |line| {
        let text = line.trim_end();
        if text.is_empty() || text.starts_with('#') {
            return None;
        }
        // Indented lines and list items belong to the enclosing key.
        let first = line.chars().next()?;
        if first.is_whitespace() || first == '-' {
            return None;
        }
        let (before_colon, _) = text.split_once(':')?;
        let key = before_colon.trim();
        (!key.is_empty() && !key.contains(' ')).then(|| key.to_string())
    })
}
/// Chunks TOML by table header.
///
/// A line starts a new section when, ignoring trailing whitespace, it begins
/// with `[` and ends with `]`. This covers both `[table]` and
/// `[[array-of-tables]]`. The name is the text between the brackets, trimmed.
fn chunk_toml(file: &str, content: &str) -> Vec<RawChunk> {
    chunk_by_top_level_key(file, content, "toml", |line| {
        let candidate = line.trim_end();
        let bracketed = candidate.starts_with('[') && candidate.ends_with(']');
        if !bracketed {
            return None;
        }
        // Each trim strips every repeated bracket, so `[[bin]]` becomes `bin`.
        let table = candidate
            .trim_start_matches('[')
            .trim_end_matches(']')
            .trim();
        (!table.is_empty()).then(|| table.to_string())
    })
}
/// Splits `content` into sections that each begin at a header line.
///
/// `header_of` is called on every line. When it returns a name, the section
/// collected so far is emitted and a new section with that name starts on the
/// current line. Lines before the first header form an unnamed section.
///
/// Every section becomes a `ChunkType::Constant` chunk tagged with `language`.
/// Whitespace-only sections are dropped. If that leaves nothing, the whole
/// file is returned as a single unnamed chunk. Empty input yields no chunks.
fn chunk_by_top_level_key(
    file: &str,
    content: &str,
    language: &str,
    header_of: impl Fn(&str) -> Option<String>,
) -> Vec<RawChunk> {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }
    let mut out: Vec<RawChunk> = Vec::new();
    let mut section_start = 0usize;
    let mut section_name: Option<String> = None;
    // Emits lines[start..end] as one chunk unless the range is empty or blank.
    let flush = |out: &mut Vec<RawChunk>,
                 start: usize,
                 end: usize,
                 name: &Option<String>,
                 lines: &[&str]| {
        if start >= end {
            return;
        }
        let text = lines[start..end].join("\n");
        if text.trim().is_empty() {
            return;
        }
        out.push(document_chunk(
            file,
            start + 1,
            end,
            text,
            name.clone(),
            language,
            ChunkType::Constant,
        ));
    };
    for (i, line) in lines.iter().enumerate() {
        if let Some(name) = header_of(line) {
            flush(&mut out, section_start, i, &section_name, &lines);
            section_name = Some(name);
            section_start = i;
        }
    }
    flush(&mut out, section_start, lines.len(), &section_name, &lines);
    if out.is_empty() {
        out.push(document_chunk(
            file,
            1,
            lines.len(),
            content.to_string(),
            None,
            language,
            ChunkType::Constant,
        ));
    }
    out
}
/// Returns a JSON file as a single `Constant` chunk.
///
/// Empty files and files with `JSON_MAX_LINES` lines or more yield an empty
/// list. The result is always `Some`.
fn chunk_json(file: &str, content: &str) -> Option<Vec<RawChunk>> {
    let chunks = match content.lines().count() {
        0 => Vec::new(),
        // Too large to be useful as one chunk: skip the file entirely.
        n if n >= JSON_MAX_LINES => Vec::new(),
        n => vec![document_chunk(
            file,
            1,
            n,
            content.to_string(),
            None,
            "json",
            ChunkType::Constant,
        )],
    };
    Some(chunks)
}
/// Chunks plain text by paragraph.
///
/// A paragraph is a run of non-blank lines. Paragraphs longer than
/// `PLAINTEXT_MAX_LINES` are cut into consecutive pieces of at most that many
/// lines. Chunks are labelled "log" for `.log` files (case-insensitive) and
/// "text" otherwise. A non-empty file that has no paragraphs (only blank
/// lines) is returned as one chunk. Empty input yields no chunks.
fn chunk_plaintext(file: &str, content: &str) -> Vec<RawChunk> {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }
    let is_log = std::path::Path::new(file)
        .extension()
        .and_then(|e| e.to_str())
        .map_or(false, |e| e.eq_ignore_ascii_case("log"));
    let lang = if is_log { "log" } else { "text" };

    // Pass 1: find the half-open line range of every paragraph.
    let mut paragraphs: Vec<(usize, usize)> = Vec::new();
    let mut open_at: Option<usize> = None;
    for (idx, line) in lines.iter().enumerate() {
        let blank = line.trim().is_empty();
        match (blank, open_at) {
            (true, Some(first)) => {
                paragraphs.push((first, idx));
                open_at = None;
            }
            (false, None) => open_at = Some(idx),
            _ => {}
        }
    }
    if let Some(first) = open_at {
        paragraphs.push((first, lines.len()));
    }

    // Pass 2: emit each paragraph, capped at PLAINTEXT_MAX_LINES per chunk.
    let mut out: Vec<RawChunk> = Vec::new();
    for (first, end) in paragraphs {
        let mut piece_start = first;
        while piece_start < end {
            let piece_end = (piece_start + PLAINTEXT_MAX_LINES).min(end);
            let text = lines[piece_start..piece_end].join("\n");
            if !text.trim().is_empty() {
                out.push(document_chunk(
                    file,
                    piece_start + 1,
                    piece_end,
                    text,
                    None,
                    lang,
                    ChunkType::Code,
                ));
            }
            piece_start = piece_end;
        }
    }
    if out.is_empty() {
        out.push(document_chunk(
            file,
            1,
            lines.len(),
            content.to_string(),
            None,
            lang,
            ChunkType::Code,
        ));
    }
    out
}
/// Chunks XML into one `Class` chunk per multi-line child of the root element.
///
/// This is a line-based heuristic, not a real XML parser: nesting depth is
/// tracked from the per-line counts of opening and closing tags. A child
/// starts on a line seen at depth 1 that opens more tags than it closes, and
/// ends on the line where the depth drops back to 1 or below. Children that
/// open and close on a single line therefore never start a chunk. If no child
/// chunk is produced, the whole file is returned as one unnamed chunk. Empty
/// input yields no chunks.
fn chunk_xml(file: &str, content: &str) -> Vec<RawChunk> {
let lines: Vec<&str> = content.lines().collect();
if lines.is_empty() {
return Vec::new();
}
let mut out: Vec<RawChunk> = Vec::new();
// Signed so that stray closing tags can push it below zero without wrapping.
let mut depth: i32 = 0;
let mut child_start: Option<usize> = None;
let mut child_name: Option<String> = None;
for (i, line) in lines.iter().enumerate() {
let opens = count_xml_opens(line);
let closes = count_xml_closes(line);
// Depth 1 means "directly inside the root element".
if depth == 1 && child_start.is_none() && opens > closes {
child_start = Some(i);
child_name = first_xml_tag_name(line);
}
let prev_depth = depth;
depth += opens as i32;
depth -= closes as i32;
if let Some(start) = child_start {
// The child is complete once the depth has fallen back to the root level.
if depth <= 1 && prev_depth >= 1 && i >= start {
let text = lines[start..=i].join("\n");
if !text.trim().is_empty() {
out.push(document_chunk(
file,
start + 1,
i + 1,
text,
child_name.clone(),
"xml",
ChunkType::Class,
));
}
child_start = None;
child_name = None;
}
}
}
if out.is_empty() {
out.push(document_chunk(
file,
1,
lines.len(),
content.to_string(),
None,
"xml",
ChunkType::Class,
));
}
out
}
/// Counts the opening element tags on a single line.
///
/// Declarations (`<?…`), comments and other `<!…` markup, and closing tags
/// (`</…`) are not counted, nor are self-closing tags (`<…/>`). A `<` with no
/// `>` after it on the same line is ignored.
fn count_xml_opens(line: &str) -> usize {
    let mut opened = 0usize;
    let mut rest = line;
    while let Some(lt) = rest.find('<') {
        let tail = &rest[lt..];
        let is_non_element = ["<?", "<!", "</"]
            .iter()
            .any(|prefix| tail.starts_with(prefix));
        if is_non_element {
            // Step over just the '<' and keep scanning.
            rest = &tail[1..];
            continue;
        }
        match tail.find('>') {
            Some(gt) => {
                if !tail[..=gt].ends_with("/>") {
                    opened += 1;
                }
                rest = &tail[gt + 1..];
            }
            None => rest = &tail[1..],
        }
    }
    opened
}
/// Counts the occurrences of `</` (closing-tag starts) on a single line.
fn count_xml_closes(line: &str) -> usize {
    // Both bytes are ASCII, so a byte-pair scan can never match inside a
    // multi-byte character.
    line.as_bytes()
        .windows(2)
        .filter(|pair| pair[0] == b'<' && pair[1] == b'/')
        .count()
}
/// Returns the name of the first opening element tag on the line.
///
/// Declarations (`<?…`), comments and other `<!…` markup, and closing tags
/// (`</…`) are skipped rather than ending the search. This matches what
/// `count_xml_opens` counts, so a line such as `<!-- note --><book>` yields
/// `book`. The name ends at the first whitespace, `>` or `/`. Returns `None`
/// when the line has no opening tag with a non-empty name.
fn first_xml_tag_name(line: &str) -> Option<String> {
    let mut rest = line;
    while let Some(lt) = rest.find('<') {
        let after = &rest[lt + 1..];
        // Whatever happens, continue the search after this '<'.
        rest = after;
        if after.starts_with(|c: char| c == '?' || c == '!' || c == '/') {
            continue;
        }
        let end = after
            .find(|c: char| c.is_whitespace() || c == '>' || c == '/')
            .unwrap_or(after.len());
        let name = &after[..end];
        if !name.is_empty() {
            return Some(name.to_string());
        }
    }
    None
}
/// Chunks a file and extracts its entities.
///
/// Files whose extension maps to a supported language are parsed with
/// tree-sitter and walked for chunks; oversized chunks are then split and
/// entities are extracted from the same tree. If the walk yields nothing, the
/// whole file becomes one generic chunk tagged with the language.
///
/// Other files go through `chunk_document`. When that does not handle the
/// extension, or when parsing produces no tree, the file is cut into sliding
/// windows of 150 lines with a stride of 50. Entities are empty on every
/// non-AST path.
pub fn chunk_ast(file: &str, content: &str) -> (Vec<RawChunk>, Vec<RawEntity>) {
    // Last-resort strategy shared by the two fallback paths.
    let sliding_window =
        || -> (Vec<RawChunk>, Vec<RawEntity>) { (chunk_text(file, content, 150, 50), Vec::new()) };

    let (lang, kind) = match language_for(file) {
        Some(found) => found,
        None => {
            return match chunk_document(file, content) {
                Some(doc_chunks) => (doc_chunks, Vec::new()),
                None => sliding_window(),
            };
        }
    };

    let src = content.as_bytes();
    let tree = match parse_with_cached(kind, src) {
        Some(tree) => tree,
        None => return sliding_window(),
    };

    let line_offsets = build_line_offsets(src);
    let mut chunks: Vec<RawChunk> = Vec::new();
    walk_for_chunks(
        tree.root_node(),
        src,
        file,
        lang,
        &line_offsets,
        0,
        &mut chunks,
    );

    if chunks.is_empty() {
        // Nothing chunkable was found: keep the file searchable as one chunk.
        let total_lines = content.lines().count().max(1);
        let mut whole_file = RawChunk::generic(
            format!("{file}:1:{total_lines}"),
            file.to_string(),
            1,
            total_lines,
            content.to_string(),
        );
        whole_file.language = Some(lang.to_string());
        chunks.push(whole_file);
    }

    let split = split_oversized(chunks);
    let entities = extract_entities(&tree, src, file, lang);
    (split, entities)
}
#[cfg(test)]
mod tests {
use super::*;
// 200 lines with window 150 / stride 50: the second window starts at line 51.
#[test]
fn test_overlapping_chunks() {
    let numbered: Vec<String> = (1..=200).map(|i| format!("line {i}")).collect();
    let text = numbered.join("\n");
    let windows = chunk_text("test.txt", &text, 150, 50);
    assert!(windows.len() >= 2);
    let (first, second) = (&windows[0], &windows[1]);
    assert_eq!(first.start_line, 1);
    assert_eq!(second.start_line, 51);
}
// Sliding-window chunk ids start with the file path followed by a colon.
#[test]
fn test_chunk_id_format() {
    let text = "line1\nline2\nline3";
    let windows = chunk_text("src/main.txt", text, 150, 50);
    let first_id = windows[0].id.as_str();
    assert!(first_id.starts_with("src/main.txt:"));
}
// Three free Rust functions must produce three `Function` chunks named after them.
#[test]
fn test_rust_function_chunking() {
let src = r#"
fn alpha() {}
fn beta() -> i32 { 1 }
fn gamma(x: i32) -> i32 { x + 1 }
"#;
let (chunks, _ents) = chunk_ast("a.rs", src);
let fns: Vec<&RawChunk> = chunks
.iter()
.filter(|c| c.chunk_type == ChunkType::Function)
.collect();
assert_eq!(fns.len(), 3, "expected 3 function chunks, got {fns:?}");
let names: Vec<_> = fns
.iter()
.map(|c| c.function_name.clone().unwrap_or_default())
.collect();
assert!(names.contains(&"alpha".to_string()));
assert!(names.contains(&"beta".to_string()));
assert!(names.contains(&"gamma".to_string()));
}
// A method inside `impl Foo` is chunked as `Method` and named `Foo::bar`.
#[test]
fn test_rust_impl_method_qualified_name() {
let src = r#"
struct Foo;
impl Foo {
fn bar(&self) {}
}
"#;
let (chunks, _) = chunk_ast("foo.rs", src);
let method = chunks
.iter()
.find(|c| c.chunk_type == ChunkType::Method)
.expect("expected at least one Method chunk");
assert_eq!(method.function_name.as_deref(), Some("Foo::bar"));
}
// Callee names found in a function body are recorded in the chunk's `calls`.
#[test]
fn test_rust_calls_extraction() {
let src = r#"
fn main() {
foo();
bar(1, 2);
}
fn foo() {}
fn bar(_a: i32, _b: i32) {}
"#;
let (chunks, _) = chunk_ast("m.rs", src);
let main_chunk = chunks
.iter()
.find(|c| c.function_name.as_deref() == Some("main"))
.expect("main chunk");
assert!(
main_chunk.calls.contains(&"foo".to_string()),
"calls={:?}",
main_chunk.calls
);
assert!(
main_chunk.calls.contains(&"bar".to_string()),
"calls={:?}",
main_chunk.calls
);
}
// Every type name in a nested generic annotation is reported as a `NamedType` entity.
#[test]
fn test_rust_entity_named_types() {
let src = r#"
use std::sync::Arc;
fn f() {
let _x: Arc<Vec<String>> = Arc::new(Vec::new());
}
"#;
let (_chunks, entities) = chunk_ast("t.rs", src);
let named: Vec<&str> = entities
.iter()
.filter(|e| e.entity_type == crate::core::entity::EntityType::NamedType)
.map(|e| e.text.as_str())
.collect();
assert!(named.contains(&"Arc"), "named_types={named:?}");
assert!(named.contains(&"Vec"), "named_types={named:?}");
assert!(named.contains(&"String"), "named_types={named:?}");
}
// A 250-line function is split into sub-chunks that point at a retained
// parent, and the parent lists its children.
#[test]
fn test_large_function_splits() {
let mut body = String::new();
for i in 0..250 {
body.push_str(&format!(" let _v{i} = {i};\n"));
}
let src = format!("fn huge() {{\n{body}}}\n");
let (chunks, _) = chunk_ast("h.rs", &src);
let subs: Vec<&RawChunk> = chunks
.iter()
.filter(|c| c.parent_chunk_id.is_some())
.collect();
assert!(
!subs.is_empty(),
"expected sub-chunks for 250-line fn, got {chunks:#?}"
);
let parent_id = subs[0].parent_chunk_id.clone().unwrap();
let parent = chunks
.iter()
.find(|c| c.id == parent_id)
.expect("parent retained");
assert!(!parent.child_chunk_ids.is_empty());
}
// An unrecognised extension falls back to one sliding-window `Code` chunk
// with no entities.
#[test]
fn test_unknown_language_fallback() {
    let text = "hello world\nfoo bar\nbaz";
    let (chunks, entities) = chunk_ast("notes.unknownext", text);
    assert!(entities.is_empty());
    assert_eq!(chunks.len(), 1);
    let only = &chunks[0];
    assert_eq!(only.chunk_type, ChunkType::Code);
}
// Markdown is split at headings; each section is a `Docstring` chunk named
// after its heading.
#[test]
fn test_chunk_markdown_sections() {
let content = "# Title\n\nintro\n\n## Section A\n\nbody a\n\n## Section B\n\nbody b\n";
let chunks = chunk_markdown("doc.md", content);
assert!(
chunks.len() >= 2,
"expected multiple sections, got {chunks:#?}"
);
let names: Vec<_> = chunks
.iter()
.filter_map(|c| c.function_name.clone())
.collect();
assert!(names.iter().any(|n| n == "Section A"), "names={names:?}");
assert!(names.iter().any(|n| n == "Section B"), "names={names:?}");
for c in &chunks {
assert_eq!(c.language.as_deref(), Some("markdown"));
assert_eq!(c.chunk_type, ChunkType::Docstring);
}
}
// A `#` line inside a fenced code block must not start a new section.
#[test]
fn test_chunk_markdown_ignores_hash_in_code_fence() {
let content = "# Real Heading\n\nintro\n\n```\n## not a heading\n```\n\n## Next\n\nx\n";
let chunks = chunk_markdown("doc.md", content);
let names: Vec<_> = chunks
.iter()
.filter_map(|c| c.function_name.clone())
.collect();
assert!(names.iter().any(|n| n == "Real Heading"));
assert!(names.iter().any(|n| n == "Next"));
assert!(
!names.iter().any(|n| n == "not a heading"),
"should not split on # inside fenced code block: {names:?}"
);
}
// Top-level YAML keys become section names; nested keys and list items do not.
#[test]
fn test_chunk_yaml_top_level_keys() {
let content = "name: foo\nversion: 1.0\n\ndeps:\n - a\n - b\n\nscripts:\n build: x\n";
let chunks = chunk_yaml("conf.yaml", content);
let names: Vec<_> = chunks
.iter()
.filter_map(|c| c.function_name.clone())
.collect();
assert!(names.iter().any(|n| n == "name"), "names={names:?}");
assert!(names.iter().any(|n| n == "deps"), "names={names:?}");
assert!(names.iter().any(|n| n == "scripts"), "names={names:?}");
for c in &chunks {
assert_eq!(c.language.as_deref(), Some("yaml"));
}
}
// Both `[table]` and `[[array-of-tables]]` headers start a named section.
#[test]
fn test_chunk_toml_sections() {
let content = "[package]\nname = \"foo\"\nversion = \"1.0\"\n\n[dependencies]\nserde = \"1\"\n\n[[bin]]\nname = \"x\"\n";
let chunks = chunk_toml("Cargo.toml", content);
let names: Vec<_> = chunks
.iter()
.filter_map(|c| c.function_name.clone())
.collect();
assert!(names.iter().any(|n| n == "package"), "names={names:?}");
assert!(names.iter().any(|n| n == "dependencies"), "names={names:?}");
assert!(names.iter().any(|n| n == "bin"), "names={names:?}");
}
// A small JSON file is returned as exactly one chunk labelled "json".
#[test]
fn test_chunk_json_small_file_single_chunk() {
let content = "{\n \"name\": \"foo\",\n \"version\": \"1.0\"\n}\n";
let chunks = chunk_json("a.json", content).expect("Some result");
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].language.as_deref(), Some("json"));
}
// A JSON file above the line limit (602 lines here) yields `Some` with no chunks.
#[test]
fn test_chunk_json_large_file_skipped() {
let big = (0..600)
.map(|i| format!(" \"k{i}\": {i},"))
.collect::<Vec<_>>()
.join("\n");
let content = format!("{{\n{big}\n}}\n");
let chunks = chunk_json("big.json", &content).expect("Some result");
assert!(chunks.is_empty(), "expected large JSON to be skipped");
}
// Blank lines separate paragraphs; each paragraph becomes one "text" chunk.
#[test]
fn test_chunk_plaintext_paragraphs() {
let content = "First paragraph line 1.\nFirst paragraph line 2.\n\nSecond paragraph line 1.\nSecond paragraph line 2.\n\nThird paragraph.\n";
let chunks = chunk_plaintext("note.txt", content);
assert_eq!(
chunks.len(),
3,
"expected one chunk per paragraph, got {chunks:#?}"
);
for c in &chunks {
assert_eq!(c.language.as_deref(), Some("text"));
}
}
// A single 130-line paragraph in a `.log` file is cut into pieces of at most
// 50 lines, all labelled "log".
#[test]
fn test_chunk_plaintext_caps_at_50_lines() {
let content = (1..=130)
.map(|i| format!("line {i}"))
.collect::<Vec<_>>()
.join("\n");
let chunks = chunk_plaintext("big.log", &content);
assert!(
chunks.len() >= 3,
"expected at least 3 chunks for 130-line paragraph, got {}",
chunks.len()
);
for c in &chunks {
let line_count = c.end_line.saturating_sub(c.start_line) + 1;
assert!(line_count <= 50, "chunk too large: {line_count} lines");
assert_eq!(c.language.as_deref(), Some("log"));
}
}
// Each multi-line child of the root element becomes a chunk named after its tag.
#[test]
fn test_chunk_xml_top_level_children() {
let content = "<?xml version=\"1.0\"?>\n<library>\n <book id=\"1\">\n <title>A</title>\n </book>\n <book id=\"2\">\n <title>B</title>\n </book>\n <magazine>\n <title>C</title>\n </magazine>\n</library>\n";
let chunks = chunk_xml("data.xml", content);
let names: Vec<_> = chunks
.iter()
.filter_map(|c| c.function_name.clone())
.collect();
assert!(
names.iter().filter(|n| *n == "book").count() >= 2,
"names={names:?}"
);
assert!(names.iter().any(|n| n == "magazine"), "names={names:?}");
for c in &chunks {
assert_eq!(c.language.as_deref(), Some("xml"));
}
}
// `chunk_ast` routes markdown, YAML and TOML files to the document chunkers,
// as shown by the language label on the resulting chunks.
#[test]
fn test_chunk_document_dispatch() {
let md_content = "# Hello\n\nworld\n";
let (md_chunks, _) = chunk_ast("readme.md", md_content);
assert!(md_chunks
.iter()
.any(|c| c.language.as_deref() == Some("markdown")));
let yaml_content = "key: value\n";
let (yaml_chunks, _) = chunk_ast("conf.yml", yaml_content);
assert!(yaml_chunks
.iter()
.any(|c| c.language.as_deref() == Some("yaml")));
let toml_content = "[section]\nx = 1\n";
let (toml_chunks, _) = chunk_ast("a.toml", toml_content);
assert!(toml_chunks
.iter()
.any(|c| c.language.as_deref() == Some("toml")));
}
// An identifier written in backticks in a doc comment ends up in `nlp_code_refs`.
#[test]
fn test_nlp_code_refs() {
let src = r#"
/// Wraps the `CodeIndexer` to expose hybrid search.
fn make() {}
"#;
let (chunks, _) = chunk_ast("d.rs", src);
let f = chunks
.iter()
.find(|c| c.function_name.as_deref() == Some("make"))
.unwrap();
assert!(
f.nlp_code_refs.iter().any(|k| k == "CodeIndexer"),
"code_refs={:?}",
f.nlp_code_refs
);
}
// A `use` of a third-party crate is reported as an `ExternalCrate` entity.
#[test]
fn test_entity_external_crate() {
let src = r#"
use usearch::Index;
fn f() {}
"#;
let (_chunks, ents) = chunk_ast("u.rs", src);
let exts: Vec<&str> = ents
.iter()
.filter(|e| e.entity_type == crate::core::entity::EntityType::ExternalCrate)
.map(|e| e.text.as_str())
.collect();
assert!(exts.contains(&"usearch"), "external_crates={exts:?}");
}
// An `anyhow::bail!` call yields at least one `ErrorVariant` entity.
#[test]
fn test_entity_error_variant() {
let src = r#"
fn f() -> Result<(), anyhow::Error> {
anyhow::bail!("index not found");
}
"#;
let (_chunks, ents) = chunk_ast("e.rs", src);
let any_err = ents
.iter()
.any(|e| e.entity_type == crate::core::entity::EntityType::ErrorVariant);
assert!(
any_err,
"expected at least one ErrorVariant entity, got {ents:#?}"
);
}
// C#: classes map to `Class`, interfaces to `Trait`, and methods to `Method`
// with both plain and `this.` calls recorded.
#[test]
fn test_csharp_chunking() {
let src = r#"
namespace MyApp {
class Foo {
public void Bar() { Baz(); this.Qux(); }
public Foo() {}
}
interface IThing { void Do(); }
}
"#;
let (chunks, _) = chunk_ast("a.cs", src);
let classes: Vec<&RawChunk> = chunks
.iter()
.filter(|c| c.chunk_type == ChunkType::Class)
.collect();
assert!(
classes
.iter()
.any(|c| c.function_name.as_deref() == Some("Foo")),
"expected class Foo, got {chunks:#?}"
);
let traits: Vec<&RawChunk> = chunks
.iter()
.filter(|c| c.chunk_type == ChunkType::Trait)
.collect();
assert!(
traits
.iter()
.any(|c| c.function_name.as_deref() == Some("IThing")),
"expected interface IThing as Trait"
);
let bar = chunks
.iter()
.find(|c| c.function_name.as_deref() == Some("Bar"))
.expect("Bar method chunk");
assert_eq!(bar.chunk_type, ChunkType::Method);
assert!(
bar.calls.contains(&"Baz".to_string()),
"calls={:?}",
bar.calls
);
assert!(
bar.calls.contains(&"Qux".to_string()),
"calls={:?}",
bar.calls
);
}
// Kotlin: a class maps to `Class` and its member function to `Method`, with
// both plain and `this.` calls recorded.
#[test]
fn test_kotlin_chunking() {
let src = r#"
class Foo {
fun bar() { baz(); this.qux() }
}
object Singleton {
fun run() { other() }
}
"#;
let (chunks, _) = chunk_ast("a.kt", src);
assert!(
chunks
.iter()
.any(|c| c.function_name.as_deref() == Some("Foo")
&& c.chunk_type == ChunkType::Class),
"expected class Foo, got {chunks:#?}"
);
let bar = chunks
.iter()
.find(|c| c.function_name.as_deref() == Some("bar"))
.expect("bar method chunk");
assert_eq!(bar.chunk_type, ChunkType::Method);
assert!(
bar.calls.contains(&"baz".to_string()),
"calls={:?}",
bar.calls
);
assert!(
bar.calls.contains(&"qux".to_string()),
"calls={:?}",
bar.calls
);
}
// Swift: class -> `Class`, struct -> `Struct`, enum -> `Enum`, protocol ->
// `Trait`, extension -> `Module`; method calls (plain and `self.`) are recorded.
#[test]
fn test_swift_chunking() {
let src = r#"
class Foo {
func bar() { baz(); self.qux() }
init() {}
}
struct S {}
enum E { case a }
protocol P { func d() }
extension Foo { func ext() {} }
"#;
let (chunks, _) = chunk_ast("a.swift", src);
assert!(
chunks
.iter()
.any(|c| c.function_name.as_deref() == Some("Foo")
&& c.chunk_type == ChunkType::Class),
"expected class Foo, got {chunks:#?}"
);
assert!(
chunks
.iter()
.any(|c| c.function_name.as_deref() == Some("S")
&& c.chunk_type == ChunkType::Struct),
"expected struct S"
);
assert!(
chunks.iter().any(|c| c.function_name.as_deref() == Some("E")
&& c.chunk_type == ChunkType::Enum),
"expected enum E"
);
assert!(
chunks.iter().any(
|c| c.function_name.as_deref() == Some("P") && c.chunk_type == ChunkType::Trait
),
"expected protocol P as Trait"
);
assert!(
chunks
.iter()
.any(|c| c.chunk_type == ChunkType::Module
&& c.function_name.as_deref() == Some("Foo")),
"expected extension Foo as Module"
);
let bar = chunks
.iter()
.find(|c| c.function_name.as_deref() == Some("bar"))
.expect("bar method chunk");
assert!(
bar.calls.contains(&"baz".to_string()),
"calls={:?}",
bar.calls
);
assert!(
bar.calls.contains(&"qux".to_string()),
"calls={:?}",
bar.calls
);
}
// Words from a doc comment (here "RRF" and "Implements") end up in `nlp_keywords`.
#[test]
fn test_nlp_keywords_from_doc_comments() {
let src = r#"
/// Implements the RRF fusion algorithm.
fn fuse() {}
"#;
let (chunks, _) = chunk_ast("d.rs", src);
let f = chunks
.iter()
.find(|c| c.function_name.as_deref() == Some("fuse"))
.unwrap();
assert!(
f.nlp_keywords.iter().any(|k| k == "RRF"),
"keywords={:?}",
f.nlp_keywords
);
assert!(
f.nlp_keywords.iter().any(|k| k == "Implements"),
"keywords={:?}",
f.nlp_keywords
);
}
// Scala: methods inside a class or object get `Owner::name` and type `Method`;
// a top-level `def` stays an unqualified `Function`.
#[test]
fn test_scala_method_qualified_name() {
    let src = r#"
class Foo extends Bar with Mixin {
def bar(): Unit = baz()
}
object O {
def run(): Unit = other()
}
def freefn(): Unit = ()
"#;
    let (chunks, _) = chunk_ast("a.scala", src);
    // `expect` takes a plain string and would print "{chunks:#?}" literally;
    // `panic!` actually formats the chunk dump into the failure message.
    let bar = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("Foo::bar"))
        .unwrap_or_else(|| panic!("expected qualified method Foo::bar, got: {chunks:#?}"));
    assert_eq!(bar.chunk_type, ChunkType::Method);
    let run = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("O::run"))
        .expect("expected qualified method O::run");
    assert_eq!(run.chunk_type, ChunkType::Method);
    assert!(
        chunks
            .iter()
            .any(|c| c.function_name.as_deref() == Some("freefn")
                && c.chunk_type == ChunkType::Function),
        "expected unqualified Function freefn, got {chunks:#?}"
    );
}
// Scala: calls made inside `Foo::bar` (plain and `this.`) are attached to that
// method's chunk.
#[test]
fn test_scala_caller_scoped_call_edges() {
let src = r#"
class Foo {
def bar(): Unit = {
baz()
this.qux()
}
}
"#;
let (chunks, _) = chunk_ast("a.scala", src);
let bar = chunks
.iter()
.find(|c| c.function_name.as_deref() == Some("Foo::bar"))
.expect("Foo::bar chunk");
assert!(
bar.calls.contains(&"baz".to_string()),
"calls={:?}",
bar.calls
);
assert!(
bar.calls.contains(&"qux".to_string()),
"calls={:?}",
bar.calls
);
}
// Scala: the `extends` parent and every `with` mixin appear in `inherits_from`.
#[test]
fn test_scala_extends_and_with_emit_inherits() {
let src = r#"
class Foo extends Bar with Mixin with Other {
def m(): Unit = ()
}
"#;
let (chunks, _) = chunk_ast("a.scala", src);
let foo = chunks
.iter()
.find(|c| c.function_name.as_deref() == Some("Foo") && c.chunk_type == ChunkType::Class)
.expect("Foo class chunk");
for parent in ["Bar", "Mixin", "Other"] {
assert!(
foo.inherits_from.iter().any(|p| p == parent),
"expected parent {parent} in inherits_from={:?}",
foo.inherits_from
);
}
}
// End-to-end: a symbol graph built from Scala chunks resolves `Foo::bar` as a
// caller of `baz`.
#[test]
fn test_scala_symbol_graph_resolves_caller() {
use crate::core::symbol_graph::SymbolGraph;
let src = r#"
class Foo {
def bar(): Unit = baz()
}
def baz(): Unit = ()
"#;
let (chunks, _) = chunk_ast("s.scala", src);
let tuples: Vec<_> = chunks
.iter()
.map(|c| {
(
c.id.clone(),
c.file.clone(),
c.function_name.clone(),
c.calls.clone(),
c.inherits_from.clone(),
c.chunk_type.clone(),
)
})
.collect();
let g = SymbolGraph::build_from_chunks(&tuples);
let callers = g.callers_of("baz", 1);
assert!(
callers.iter().any(|(s, _)| s == "Foo::bar"),
"callers={callers:?}"
);
}
// PHP: a class method gets `Class::name` and type `Method`; a top-level
// function stays an unqualified `Function`.
#[test]
fn test_php_method_qualified_name() {
    let src = r#"<?php
class Foo extends Bar implements I1, I2 {
public function doIt(): void {
$this->helper();
}
}
function freefn(): void {}
"#;
    let (chunks, _) = chunk_ast("a.php", src);
    // `expect` takes a plain string and would print "{chunks:#?}" literally;
    // `panic!` actually formats the chunk dump into the failure message.
    let doit = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("Foo::doIt"))
        .unwrap_or_else(|| panic!("expected qualified Foo::doIt, got: {chunks:#?}"));
    assert_eq!(doit.chunk_type, ChunkType::Method);
    assert!(
        chunks
            .iter()
            .any(|c| c.function_name.as_deref() == Some("freefn")
                && c.chunk_type == ChunkType::Function),
        "expected unqualified Function freefn"
    );
}
// PHP: instance (`$this->`), static (`Foo::`) and plain function calls inside
// a method are all recorded on that method's chunk.
#[test]
fn test_php_caller_scoped_call_edges() {
let src = r#"<?php
class Foo {
public function doIt(): void {
$this->helper();
Foo::staticCall();
regularFunc();
}
}
"#;
let (chunks, _) = chunk_ast("a.php", src);
let doit = chunks
.iter()
.find(|c| c.function_name.as_deref() == Some("Foo::doIt"))
.expect("Foo::doIt chunk");
for callee in ["helper", "staticCall", "regularFunc"] {
assert!(
doit.calls.iter().any(|c| c == callee),
"expected callee {callee} in calls={:?}",
doit.calls
);
}
}
// PHP: the `extends` parent and every `implements` interface appear in
// `inherits_from`.
#[test]
fn test_php_implements_and_extends_emit_inherits() {
let src = r#"<?php
class Foo extends Bar implements I1, I2 {}
"#;
let (chunks, _) = chunk_ast("a.php", src);
let foo = chunks
.iter()
.find(|c| c.function_name.as_deref() == Some("Foo") && c.chunk_type == ChunkType::Class)
.expect("Foo class chunk");
for parent in ["Bar", "I1", "I2"] {
assert!(
foo.inherits_from.iter().any(|p| p == parent),
"expected parent {parent} in inherits_from={:?}",
foo.inherits_from
);
}
}
// PHP: an interface is chunked as `Trait`, and the interfaces it extends
// appear in `inherits_from`.
#[test]
fn test_php_interface_extends_emits_inherits() {
let src = r#"<?php
interface Child extends P1, P2 {}
"#;
let (chunks, _) = chunk_ast("a.php", src);
let child = chunks
.iter()
.find(|c| {
c.function_name.as_deref() == Some("Child") && c.chunk_type == ChunkType::Trait
})
.expect("Child interface (chunked as Trait)");
for parent in ["P1", "P2"] {
assert!(
child.inherits_from.iter().any(|p| p == parent),
"expected parent {parent} in inherits_from={:?}",
child.inherits_from
);
}
}
// End-to-end: a symbol graph built from PHP chunks resolves `Foo::doIt` as a
// caller of `Foo::helper`.
#[test]
fn test_php_symbol_graph_resolves_caller() {
use crate::core::symbol_graph::SymbolGraph;
let src = r#"<?php
class Foo {
public function doIt(): void {
$this->helper();
}
public function helper(): void {}
}
"#;
let (chunks, _) = chunk_ast("p.php", src);
let tuples: Vec<_> = chunks
.iter()
.map(|c| {
(
c.id.clone(),
c.file.clone(),
c.function_name.clone(),
c.calls.clone(),
c.inherits_from.clone(),
c.chunk_type.clone(),
)
})
.collect();
let g = SymbolGraph::build_from_chunks(&tuples);
let callers = g.callers_of("Foo::helper", 1);
assert!(
callers.iter().any(|(s, _)| s == "Foo::doIt"),
"callers={callers:?}"
);
}
/// Rust: a file with eight `pub const` items must yield exactly eight
/// `Constant` chunks, each with a name; two of the names are spot-checked.
#[test]
fn test_rust_pub_const_chunking_produces_n_constant_chunks() {
    let src = r#"
pub const ALPHA: u32 = 1;
pub const BRUSILOV_EPOCH: u64 = 1_000_000;
pub const MAX_BATCH_SIZE: usize = 256;
pub const KIKUCHI_MAX_DEPTH: usize = 8;
pub const HNSW_EF_CONSTRUCTION: usize = 200;
pub const DEFAULT_TOP_K: usize = 10;
pub const BM25_K1: f32 = 1.5;
pub const BM25_B: f32 = 0.75;
"#;
    let chunks = chunk_ast("constants.rs", src).0;
    let const_chunks = chunks
        .iter()
        .filter(|chunk| chunk.chunk_type == ChunkType::Constant)
        .collect::<Vec<_>>();
    assert_eq!(
        const_chunks.len(),
        8,
        "expected 8 Constant chunks (one per pub const), got {}: {:#?}",
        const_chunks.len(),
        chunks
    );
    // Every constant chunk must be named before the names are collected.
    const_chunks.iter().for_each(|c| {
        assert!(
            c.function_name.is_some(),
            "expected non-null function_name for constant chunk {c:#?}"
        )
    });
    let mut names = Vec::new();
    for chunk in const_chunks.iter() {
        if let Some(name) = chunk.function_name.as_deref() {
            names.push(name);
        }
    }
    let has_epoch = names.contains(&"BRUSILOV_EPOCH");
    assert!(has_epoch, "expected BRUSILOV_EPOCH in names: {names:?}");
    let has_batch = names.contains(&"MAX_BATCH_SIZE");
    assert!(has_batch, "expected MAX_BATCH_SIZE in names: {names:?}");
}
/// Rust: with `pub const` items placed before and after a `pub fn`, the file
/// must produce two `Constant` chunks (`FOO`, `BAR`) and exactly one
/// `Function` chunk named `do_something`.
#[test]
fn test_rust_mixed_const_and_fn_chunking() {
    let src = r#"
pub const FOO: u32 = 42;
pub fn do_something() -> u32 {
FOO + 1
}
pub const BAR: &str = "hello";
"#;
    let chunks = chunk_ast("mixed.rs", src).0;

    // Constants.
    let const_chunks = chunks
        .iter()
        .filter(|chunk| chunk.chunk_type == ChunkType::Constant)
        .collect::<Vec<_>>();
    assert_eq!(
        const_chunks.len(),
        2,
        "expected 2 Constant chunks, got {const_chunks:#?}"
    );
    let mut const_names = Vec::new();
    for chunk in const_chunks.iter() {
        if let Some(name) = chunk.function_name.as_deref() {
            const_names.push(name);
        }
    }
    let has_foo = const_names.contains(&"FOO");
    assert!(has_foo, "expected FOO: {const_names:?}");
    let has_bar = const_names.contains(&"BAR");
    assert!(has_bar, "expected BAR: {const_names:?}");

    // Functions.
    let fn_chunks = chunks
        .iter()
        .filter(|chunk| chunk.chunk_type == ChunkType::Function)
        .collect::<Vec<_>>();
    assert_eq!(
        fn_chunks.len(),
        1,
        "expected 1 Function chunk, got {fn_chunks:#?}"
    );
    // The length check above guarantees `first()` is `Some`.
    let only_fn_name = fn_chunks
        .first()
        .and_then(|chunk| chunk.function_name.as_deref());
    assert_eq!(only_fn_name, Some("do_something"), "fn chunk name mismatch");
}
/// Rust: `pub static` items must be chunked as `Constant`, one chunk per
/// item, each carrying the item's name.
#[test]
fn test_rust_pub_static_treated_as_constant() {
    let src = r#"
pub static GREETING: &str = "hello";
pub static MAX_RETRIES: u32 = 3;
"#;
    let chunks = chunk_ast("statics.rs", src).0;
    let const_chunks = chunks
        .iter()
        .filter(|chunk| chunk.chunk_type == ChunkType::Constant)
        .collect::<Vec<_>>();
    assert_eq!(
        const_chunks.len(),
        2,
        "expected 2 Constant chunks for pub static, got {const_chunks:#?}"
    );
    let mut names = Vec::new();
    for chunk in const_chunks.iter() {
        if let Some(name) = chunk.function_name.as_deref() {
            names.push(name);
        }
    }
    let has_greeting = names.contains(&"GREETING");
    assert!(has_greeting, "expected GREETING: {names:?}");
    let has_retries = names.contains(&"MAX_RETRIES");
    assert!(has_retries, "expected MAX_RETRIES: {names:?}");
}
/// Rust: consts without `pub` must not produce any `Constant` chunk.
#[test]
fn test_rust_private_const_does_not_get_constant_chunk() {
    let src = r#"
const INTERNAL_LIMIT: usize = 100;
const PRIVATE_KEY: &str = "secret";
"#;
    let chunks = chunk_ast("private.rs", src).0;
    let const_chunks = chunks
        .iter()
        .filter(|chunk| chunk.chunk_type == ChunkType::Constant)
        .collect::<Vec<_>>();
    let none_emitted = const_chunks.is_empty();
    assert!(
        none_emitted,
        "expected no Constant chunks for private consts, got {const_chunks:#?}"
    );
}
/// Rust: a file holding only functions must yield one `Function` chunk per
/// function and no `Constant` chunks at all.
#[test]
fn test_rust_no_const_regression_on_function_chunks() {
    let src = r#"
fn alpha() {}
fn beta() -> i32 { 1 }
fn gamma(x: i32) -> i32 { x + 1 }
"#;
    let chunks = chunk_ast("no_const.rs", src).0;
    let fns = chunks
        .iter()
        .filter(|chunk| chunk.chunk_type == ChunkType::Function)
        .collect::<Vec<_>>();
    assert_eq!(fns.len(), 3, "expected 3 Function chunks, got {fns:#?}");
    let has_constant = chunks
        .iter()
        .any(|chunk| chunk.chunk_type == ChunkType::Constant);
    assert!(
        !has_constant,
        "unexpected Constant chunk in function-only file: {chunks:#?}"
    );
}
/// Rust end-to-end: for `alpha` calling `beta`, the `SymbolGraph` built from
/// the chunks must hold at least two nodes and report `alpha` from
/// `callers_of("beta", 1)`.
#[test]
fn test_rust_symbol_graph_resolves_caller() {
    use crate::core::symbol_graph::SymbolGraph;
    let src = "fn alpha() { beta(); }\nfn beta() {}\n";
    let chunks = chunk_ast("a.rs", src).0;
    // Project every chunk onto the tuple shape that `build_from_chunks` takes.
    let mut tuples = Vec::new();
    for chunk in chunks.iter() {
        tuples.push((
            chunk.id.clone(),
            chunk.file.clone(),
            chunk.function_name.clone(),
            chunk.calls.clone(),
            chunk.inherits_from.clone(),
            chunk.chunk_type.clone(),
        ));
    }
    let g = SymbolGraph::build_from_chunks(&tuples);
    // The diagnostic arguments stay inside the macro so they are only
    // evaluated when the assertion fails.
    assert!(
        g.node_count() >= 2,
        "expected >= 2 symbol nodes for alpha+beta, got {} (chunks={:#?})",
        g.node_count(),
        chunks
            .iter()
            .map(|chunk| (chunk.function_name.clone(), chunk.calls.clone()))
            .collect::<Vec<_>>(),
    );
    let callers = g.callers_of("beta", 1);
    let alpha_found = callers.iter().any(|(s, _)| s == "alpha");
    assert!(
        alpha_found,
        "expected alpha among callers of beta, got {callers:?}"
    );
}
}