use std::collections::HashSet;
use serde::{Deserialize, Serialize};
use tree_sitter::{Language, Node, Parser};
use crate::core::entity::{extract_entities, RawEntity};
/// Category assigned to every chunk produced by the chunkers in this file.
///
/// `Unknown` is the `Default` value. The AST chunker assigns the code-oriented
/// variants through `classify_node`; the document chunkers reuse a few of them
/// for non-code content, as noted on the individual variants.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub enum ChunkType {
#[default]
Unknown,
Function,
Method,
/// Also used for XML child elements emitted by `chunk_xml`.
Class,
Struct,
Impl,
Module,
Trait,
Enum,
Test,
/// Also used for YAML/TOML sections and whole JSON files.
Constant,
TypeAlias,
/// Used for Markdown sections emitted by `chunk_markdown`.
Docstring,
FreeCode,
/// Used for sliding-window, plain-text and whole-file fallback chunks.
Code,
}
impl ChunkType {
fn as_str(&self) -> &'static str {
match self {
Self::Unknown => "Unknown",
Self::Function => "Function",
Self::Method => "Method",
Self::Class => "Class",
Self::Struct => "Struct",
Self::Impl => "Impl",
Self::Module => "Module",
Self::Trait => "Trait",
Self::Enum => "Enum",
Self::Test => "Test",
Self::Constant => "Constant",
Self::TypeAlias => "TypeAlias",
Self::Docstring => "Docstring",
Self::FreeCode => "FreeCode",
Self::Code => "Code",
}
}
}
/// A contiguous span of a source or document file, plus the metadata the
/// chunkers in this file attach to it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawChunk {
/// Identifier: `file:start:end` for unnamed chunks, or
/// `file::Type::name::start` for named ones; sub-chunks append `::sub::N`.
pub id: String,
/// File path exactly as passed to the chunker.
pub file: String,
/// First line of the chunk (1-based).
pub start_line: usize,
/// Last line of the chunk (1-based, inclusive).
pub end_line: usize,
/// Text of the chunk.
pub content: String,
/// Name of the construct (or document section heading/key), when one was found.
pub function_name: Option<String>,
/// Language tag such as `"rust"` or `"markdown"`; `None` for sliding-window chunks.
pub language: Option<String>,
pub chunk_type: ChunkType,
/// Sorted, de-duplicated simple callee names found inside the chunk.
pub calls: Vec<String>,
/// Names of implemented traits / base types, where the language is supported.
pub inherits_from: Vec<String>,
/// Nesting depth among classified AST nodes (0 for top-level and document chunks).
pub chunk_depth: usize,
/// Set on sub-chunks produced by splitting an oversized chunk.
pub parent_chunk_id: Option<String>,
/// Set on an oversized parent chunk: ids of its sub-chunks.
pub child_chunk_ids: Vec<String>,
/// Capitalised / all-caps words taken from preceding doc comments.
pub nlp_keywords: Vec<String>,
/// Backtick-quoted snippets taken from preceding doc comments.
pub nlp_code_refs: Vec<String>,
// Never populated in this file; defaults to empty when missing from serialized input.
#[serde(default)]
pub virtual_terms: Vec<String>,
}
impl RawChunk {
fn generic(
id: String,
file: String,
start_line: usize,
end_line: usize,
content: String,
) -> Self {
Self {
id,
file,
start_line,
end_line,
content,
function_name: None,
language: None,
chunk_type: ChunkType::Code,
calls: Vec::new(),
inherits_from: Vec::new(),
chunk_depth: 0,
parent_chunk_id: None,
child_chunk_ids: Vec::new(),
nlp_keywords: Vec::new(),
nlp_code_refs: Vec::new(),
virtual_terms: Vec::new(),
}
}
}
// Chunks spanning more lines than this are split into overlapping sub-chunks.
const MAX_CHUNK_LINES: usize = 200;
// Number of lines in each sub-chunk of an oversized chunk.
const SUB_CHUNK_WINDOW: usize = 100;
// Line distance between the starts of consecutive sub-chunks.
const SUB_CHUNK_STRIDE: usize = 50;
/// Splits `content` into overlapping sliding windows of lines.
///
/// Each window covers up to `window` lines and consecutive windows start
/// `stride` lines apart. The last window always ends on the final line.
/// Chunk ids have the form `file:start_line:end_line` with 1-based,
/// inclusive line numbers. Empty content yields no chunks.
///
/// A `window` or `stride` of zero is treated as one: a zero stride would
/// otherwise never advance (an endless loop), and a zero window would emit
/// empty chunks whose end line precedes their start line.
pub fn chunk_text(file: &str, content: &str, window: usize, stride: usize) -> Vec<RawChunk> {
    let window = window.max(1);
    let stride = stride.max(1);
    let lines: Vec<&str> = content.lines().collect();
    let mut chunks = Vec::new();
    for start in (0..lines.len()).step_by(stride) {
        // Saturate so a huge window cannot overflow before being clamped.
        let end = start.saturating_add(window).min(lines.len());
        let text = lines[start..end].join("\n");
        chunks.push(RawChunk::generic(
            format!("{}:{}:{}", file, start + 1, end),
            file.to_string(),
            start + 1,
            end,
            text,
        ));
        // Once a window reaches the last line, later windows would only
        // repeat a suffix of it.
        if end == lines.len() {
            break;
        }
    }
    chunks
}
/// Maps a file path to a language tag and tree-sitter grammar.
///
/// The decision is based solely on the lower-cased file extension. Returns
/// `None` when the extension is missing, not valid UTF-8, or not one of the
/// supported source-code extensions.
fn language_for(file: &str) -> Option<(&'static str, Language)> {
    let extension = std::path::Path::new(file)
        .extension()
        .and_then(std::ffi::OsStr::to_str)
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();

    let entry: (&'static str, tree_sitter_language::LanguageFn) = match extension.as_str() {
        "rs" => ("rust", tree_sitter_rust::LANGUAGE),
        "py" => ("python", tree_sitter_python::LANGUAGE),
        "js" | "mjs" | "cjs" | "jsx" => ("javascript", tree_sitter_javascript::LANGUAGE),
        // Both TypeScript flavours share one tag but use different grammars.
        "ts" => ("typescript", tree_sitter_typescript::LANGUAGE_TYPESCRIPT),
        "tsx" => ("typescript", tree_sitter_typescript::LANGUAGE_TSX),
        "go" => ("go", tree_sitter_go::LANGUAGE),
        "java" => ("java", tree_sitter_java::LANGUAGE),
        "c" | "h" => ("c", tree_sitter_c::LANGUAGE),
        "cpp" | "cc" | "cxx" | "hpp" | "hh" | "hxx" => ("cpp", tree_sitter_cpp::LANGUAGE),
        "rb" => ("ruby", tree_sitter_ruby::LANGUAGE),
        "php" => ("php", tree_sitter_php::LANGUAGE_PHP),
        "scala" => ("scala", tree_sitter_scala::LANGUAGE),
        "cs" => ("csharp", tree_sitter_c_sharp::LANGUAGE),
        "kt" | "kts" => ("kotlin", tree_sitter_kotlin_ng::LANGUAGE),
        "swift" => ("swift", tree_sitter_swift::LANGUAGE),
        _ => return None,
    };

    let (tag, grammar) = entry;
    Some((tag, grammar.into()))
}
/// Converts a byte offset into a 1-based line number.
///
/// `line_offsets` must hold the strictly increasing byte offsets at which
/// each line starts. The answer is the number of line starts at or before
/// `byte`, never less than 1 (also when `line_offsets` is empty).
fn line_for_byte(line_offsets: &[usize], byte: usize) -> usize {
    line_offsets
        .partition_point(|&line_start| line_start <= byte)
        .max(1)
}
/// Returns the byte offset at which every line of `src` begins.
///
/// The first entry is always `0`; each `\n` byte contributes the offset of
/// the byte that follows it.
fn build_line_offsets(src: &[u8]) -> Vec<usize> {
    let after_newlines = src
        .iter()
        .enumerate()
        .filter(|(_, byte)| **byte == b'\n')
        .map(|(idx, _)| idx + 1);
    std::iter::once(0usize).chain(after_newlines).collect()
}
/// Builds the identifier for an AST chunk.
///
/// Unnamed chunks are identified by their line span (`file:start:end`);
/// named chunks by type, name and start line (`file::Type::name::start`).
fn make_chunk_id(
    file: &str,
    chunk_type: &ChunkType,
    name: &str,
    start_line: usize,
    end_line: usize,
) -> String {
    match name {
        "" => format!("{file}:{start_line}:{end_line}"),
        _ => format!("{file}::{}::{name}::{start_line}", chunk_type.as_str()),
    }
}
/// Returns the source text of the node's `name` field.
///
/// Yields an empty string when the node has no `name` field or the bytes are
/// not valid UTF-8.
fn name_of(node: Node<'_>, src: &[u8]) -> String {
    node.child_by_field_name("name")
        .map(|ident| {
            std::str::from_utf8(&src[ident.start_byte()..ident.end_byte()])
                .unwrap_or("")
                .to_string()
        })
        .unwrap_or_default()
}
/// Collects the simple names of everything called inside `node`.
///
/// The subtree is walked iteratively. Function-like nodes nested below the
/// root are skipped, so their calls are not attributed to the enclosing
/// chunk. For each call node the callee is taken from the `function` field,
/// else the `name` field, else the first child; only the part after the last
/// `.` or `:` is kept. The result is de-duplicated and sorted.
fn collect_calls(node: Node<'_>, src: &[u8], lang: &str) -> Vec<String> {
    let mut seen: HashSet<String> = HashSet::new();
    let mut pending: Vec<Node> = vec![node];

    while let Some(current) = pending.pop() {
        let kind = current.kind();

        // Function-like node kinds, grouped by kind name.
        let defines_function = matches!(
            (lang, kind),
            ("rust", "function_item")
                | (
                    "python" | "c" | "cpp" | "php" | "scala",
                    "function_definition"
                )
                | (
                    "javascript" | "typescript" | "go" | "kotlin" | "swift",
                    "function_declaration"
                )
                | ("java" | "php" | "csharp", "method_declaration")
                | ("ruby", "method" | "singleton_method")
                | ("csharp", "constructor_declaration")
                | ("kotlin", "secondary_constructor")
                | ("swift", "init_declaration" | "protocol_function_declaration")
        );
        // A nested definition owns its own calls; the root itself is walked.
        if defines_function && current.id() != node.id() {
            continue;
        }

        // Call node kinds, grouped by kind name.
        let invokes = matches!(
            (lang, kind),
            (
                "rust" | "javascript" | "typescript" | "go" | "c" | "cpp" | "scala" | "kotlin"
                    | "swift",
                "call_expression"
            ) | ("python" | "ruby", "call")
                | ("java", "method_invocation")
                | (
                    "php",
                    "function_call_expression"
                        | "member_call_expression"
                        | "scoped_call_expression"
                        | "nullsafe_member_call_expression"
                )
                | ("csharp", "invocation_expression")
        );
        if invokes {
            let callee = current
                .child_by_field_name("function")
                .or_else(|| current.child_by_field_name("name"))
                .or_else(|| current.child(0));
            if let Some(target) = callee {
                let text = std::str::from_utf8(&src[target.start_byte()..target.end_byte()])
                    .unwrap_or("");
                // Keep only the last path/member segment: `a::b.c` -> `c`.
                let simple = text.rsplit(['.', ':']).next().unwrap_or(text).trim();
                if !simple.is_empty() {
                    seen.insert(simple.to_string());
                }
            }
        }

        let mut cursor = current.walk();
        pending.extend(current.children(&mut cursor));
    }

    let mut names: Vec<String> = seen.into_iter().collect();
    names.sort();
    names
}
fn collect_inherits(node: Node<'_>, src: &[u8], lang: &str) -> Vec<String> {
let mut out = Vec::new();
if lang == "rust" && node.kind() == "impl_item" {
if let Some(t) = node.child_by_field_name("trait") {
out.push(
std::str::from_utf8(&src[t.start_byte()..t.end_byte()])
.unwrap_or("")
.to_string(),
);
}
} else if lang == "python" && node.kind() == "class_definition" {
if let Some(s) = node.child_by_field_name("superclasses") {
let txt = std::str::from_utf8(&src[s.start_byte()..s.end_byte()])
.unwrap_or("")
.trim_matches(|c: char| c == '(' || c == ')')
.to_string();
for part in txt.split(',') {
let p = part.trim();
if !p.is_empty() {
out.push(p.to_string());
}
}
}
} else if lang == "scala"
&& matches!(
node.kind(),
"class_definition" | "object_definition" | "trait_definition"
)
{
let mut cur = node.walk();
for child in node.children(&mut cur) {
if child.kind() != "extends_clause" {
continue;
}
let mut cur2 = child.walk();
for sub in child.children(&mut cur2) {
if sub.kind() == "type_identifier" {
let t = std::str::from_utf8(&src[sub.start_byte()..sub.end_byte()])
.unwrap_or("")
.to_string();
if !t.is_empty() {
out.push(t);
}
}
}
}
} else if lang == "php"
&& matches!(
node.kind(),
"class_declaration" | "interface_declaration" | "trait_declaration"
)
{
let mut cur = node.walk();
for child in node.children(&mut cur) {
if !matches!(child.kind(), "base_clause" | "class_interface_clause") {
continue;
}
let mut cur2 = child.walk();
for sub in child.children(&mut cur2) {
if sub.kind() == "name" || sub.kind() == "qualified_name" {
let t = std::str::from_utf8(&src[sub.start_byte()..sub.end_byte()])
.unwrap_or("")
.to_string();
if !t.is_empty() {
out.push(t);
}
}
}
}
}
out.retain(|s| !s.is_empty());
out
}
/// Gathers the doc comments that sit directly before `node`.
///
/// Walks backwards over the run of `line_comment` / `block_comment` siblings
/// and keeps those starting with `///`, `//!` or `/**`. The kept comments are
/// returned in source order, each preceded by a newline. The walk stops at
/// the first sibling that is not a comment.
fn preceding_doc_comments(node: Node<'_>, src: &[u8]) -> String {
    // Collected nearest-first; reversed when assembling the result.
    let mut nearest_first: Vec<&str> = Vec::new();
    let mut sibling = node.prev_sibling();
    while let Some(current) = sibling {
        if !matches!(current.kind(), "line_comment" | "block_comment") {
            break;
        }
        let text =
            std::str::from_utf8(&src[current.start_byte()..current.end_byte()]).unwrap_or("");
        let is_doc = ["///", "//!", "/**"]
            .iter()
            .any(|prefix| text.starts_with(*prefix));
        if is_doc {
            nearest_first.push(text);
        }
        sibling = current.prev_sibling();
    }

    let mut doc = String::new();
    for text in nearest_first.into_iter().rev() {
        doc.push('\n');
        doc.push_str(text);
    }
    doc
}
/// Extracts lightweight search hints from a documentation string.
///
/// Returns `(keywords, code_refs)`, each sorted and de-duplicated:
///
/// * `code_refs` — every non-empty span enclosed by a pair of backticks. An
///   opening backtick without a closing one contributes nothing.
/// * `keywords` — words of at least three bytes (split on anything that is
///   neither alphanumeric nor `_`) that are either made up solely of ASCII
///   uppercase letters and digits, or start with an ASCII uppercase letter
///   and contain an ASCII lowercase letter afterwards.
///
/// Words inside backticks are also eligible as keywords. The earlier version
/// tried to skip them by looking for a backtick inside each word, but the
/// backtick is itself a split delimiter, so that check could never fire; it
/// has been removed without changing the output.
fn nlp_from_doc(doc: &str) -> (Vec<String>, Vec<String>) {
    // Pass 1: backtick-delimited code references.
    let mut code_refs: Vec<String> = Vec::new();
    let mut in_backticks = false;
    let mut span = String::new();
    for ch in doc.chars() {
        if ch == '`' {
            // Only a closing backtick completes a reference.
            if in_backticks && !span.is_empty() {
                code_refs.push(span.clone());
            }
            span.clear();
            in_backticks = !in_backticks;
        } else if in_backticks {
            span.push(ch);
        }
    }

    // Pass 2: capitalised or all-caps words.
    let mut keywords: Vec<String> = doc
        .split(|c: char| !c.is_alphanumeric() && c != '_')
        .filter(|word| word.len() >= 3)
        .filter(|word| {
            let all_upper = word
                .chars()
                .all(|c| c.is_ascii_uppercase() || c.is_ascii_digit());
            let title = word.starts_with(|c: char| c.is_ascii_uppercase())
                && word.chars().skip(1).any(|c| c.is_ascii_lowercase());
            all_upper || title
        })
        .map(str::to_string)
        .collect();

    keywords.sort();
    keywords.dedup();
    code_refs.sort();
    code_refs.dedup();
    (keywords, code_refs)
}
/// Refines a Swift `class_declaration` node by the kind of its first child.
///
/// `struct` maps to `Struct`, `enum` to `Enum`, `extension` to `Module`;
/// anything else, including a node without children, is a `Class`.
fn swift_class_decl_kind(node: Node<'_>) -> ChunkType {
    match node.child(0).map(|keyword| keyword.kind()) {
        Some("struct") => ChunkType::Struct,
        Some("enum") => ChunkType::Enum,
        Some("extension") => ChunkType::Module,
        _ => ChunkType::Class,
    }
}
/// Decides whether `node` should become a chunk and, if so, of which type.
///
/// Returns `None` for node kinds that are not chunked for `lang` (including
/// Python `decorated_definition` wrappers). For several languages a function
/// node is reported as `Method` when it is nested in a type-like ancestor and
/// as `Function` otherwise.
///
/// Rust functions count as methods only when an `impl_item` or `trait_item`
/// encloses them. The parent kind `declaration_list` alone is not enough: it
/// is also the body of an inline `mod`, so free functions inside modules
/// (for example `mod tests { fn helper() {} }`) used to be mislabelled as
/// methods.
fn classify_node(lang: &str, node: Node<'_>) -> Option<ChunkType> {
    let kind = node.kind();
    // `Method` when any of `owners` encloses the node, `Function` otherwise.
    let method_if_inside = |owners: &[&str]| {
        if owners.iter().any(|owner| ancestor_kind(node, owner).is_some()) {
            ChunkType::Method
        } else {
            ChunkType::Function
        }
    };
    Some(match (lang, kind) {
        ("rust", "function_item") => method_if_inside(&["impl_item", "trait_item"]),
        ("rust", "impl_item") => ChunkType::Impl,
        ("rust", "struct_item") => ChunkType::Class,
        ("rust", "trait_item") => ChunkType::Trait,
        ("rust", "enum_item") => ChunkType::Enum,
        ("rust", "mod_item") => ChunkType::Module,
        ("python", "function_definition") => method_if_inside(&["class_definition"]),
        ("python", "class_definition") => ChunkType::Class,
        // The wrapped definition is chunked instead of the decorator wrapper.
        ("python", "decorated_definition") => return None,
        ("javascript" | "typescript", "function_declaration") => ChunkType::Function,
        ("javascript" | "typescript", "class_declaration") => ChunkType::Class,
        ("javascript" | "typescript", "method_definition") => ChunkType::Method,
        ("go", "function_declaration") => ChunkType::Function,
        ("go", "method_declaration") => ChunkType::Method,
        ("go", "type_declaration") => ChunkType::Class,
        ("java", "method_declaration") => ChunkType::Method,
        ("java", "class_declaration") => ChunkType::Class,
        ("java", "interface_declaration") => ChunkType::Trait,
        ("c" | "cpp", "function_definition") => ChunkType::Function,
        ("cpp", "class_specifier") => ChunkType::Class,
        ("c" | "cpp", "struct_specifier") => ChunkType::Class,
        ("ruby", "method") => ChunkType::Function,
        ("ruby", "singleton_method") => ChunkType::Method,
        ("ruby", "module") => ChunkType::Module,
        ("ruby", "class") => ChunkType::Class,
        ("php", "function_definition") => ChunkType::Function,
        ("php", "method_declaration") => ChunkType::Method,
        ("php", "class_declaration") => ChunkType::Class,
        ("php", "interface_declaration") => ChunkType::Trait,
        ("php", "trait_declaration") => ChunkType::Trait,
        ("php", "namespace_definition") => ChunkType::Module,
        ("scala", "function_definition") => method_if_inside(&[
            "class_definition",
            "object_definition",
            "trait_definition",
        ]),
        ("scala", "class_definition") => ChunkType::Class,
        ("scala", "object_definition") => ChunkType::Class,
        ("scala", "trait_definition") => ChunkType::Trait,
        ("csharp", "method_declaration") => method_if_inside(&[
            "class_declaration",
            "interface_declaration",
            "struct_declaration",
        ]),
        ("csharp", "constructor_declaration") => ChunkType::Method,
        ("csharp", "class_declaration") => ChunkType::Class,
        ("csharp", "interface_declaration") => ChunkType::Trait,
        ("csharp", "struct_declaration") => ChunkType::Class,
        ("csharp", "namespace_declaration") => ChunkType::Module,
        ("csharp", "enum_declaration") => ChunkType::Enum,
        ("kotlin", "function_declaration") => {
            method_if_inside(&["class_declaration", "object_declaration"])
        }
        ("kotlin", "secondary_constructor") => ChunkType::Method,
        ("kotlin", "class_declaration") => ChunkType::Class,
        ("kotlin", "object_declaration") => ChunkType::Class,
        ("kotlin", "companion_object") => ChunkType::Class,
        ("kotlin", "interface_declaration") => ChunkType::Trait,
        ("swift", "class_declaration") => swift_class_decl_kind(node),
        ("swift", "protocol_declaration") => ChunkType::Trait,
        ("swift", "function_declaration") | ("swift", "protocol_function_declaration") => {
            method_if_inside(&["class_declaration", "protocol_declaration"])
        }
        ("swift", "init_declaration") => ChunkType::Method,
        _ => return None,
    })
}
fn ancestor_kind<'a>(node: Node<'a>, kind: &str) -> Option<Node<'a>> {
let mut cur = node.parent();
while let Some(c) = cur {
if c.kind() == kind {
return Some(c);
}
cur = c.parent();
}
None
}
/// Returns the text of the `type` field of the `impl` block enclosing `node`.
///
/// `None` when there is no enclosing `impl_item` or it has no `type` field;
/// the text is empty when the bytes are not valid UTF-8.
fn rust_impl_type_name(node: Node<'_>, src: &[u8]) -> Option<String> {
    ancestor_kind(node, "impl_item")
        .and_then(|impl_block| impl_block.child_by_field_name("type"))
        .map(|ty| {
            std::str::from_utf8(&src[ty.start_byte()..ty.end_byte()])
                .unwrap_or_default()
                .to_owned()
        })
}
/// Returns the name of the nearest Scala class, object or trait around `node`.
///
/// `None` when no such ancestor exists or it has no `name` field; the text is
/// empty when the bytes are not valid UTF-8.
fn scala_enclosing_class_name(node: Node<'_>, src: &[u8]) -> Option<String> {
    const OWNER_KINDS: [&str; 3] = ["class_definition", "object_definition", "trait_definition"];
    ancestor_kind_any(node, &OWNER_KINDS)
        .and_then(|owner| owner.child_by_field_name("name"))
        .map(|ident| {
            std::str::from_utf8(&src[ident.start_byte()..ident.end_byte()])
                .unwrap_or_default()
                .to_owned()
        })
}
/// Returns the name of the nearest PHP class, interface or trait around `node`.
///
/// `None` when no such ancestor exists or it has no `name` field; the text is
/// empty when the bytes are not valid UTF-8.
fn php_enclosing_class_name(node: Node<'_>, src: &[u8]) -> Option<String> {
    const OWNER_KINDS: [&str; 3] = [
        "class_declaration",
        "interface_declaration",
        "trait_declaration",
    ];
    ancestor_kind_any(node, &OWNER_KINDS)
        .and_then(|owner| owner.child_by_field_name("name"))
        .map(|ident| {
            std::str::from_utf8(&src[ident.start_byte()..ident.end_byte()])
                .unwrap_or_default()
                .to_owned()
        })
}
fn ancestor_kind_any<'a>(node: Node<'a>, kinds: &[&str]) -> Option<Node<'a>> {
let mut cur = node.parent();
while let Some(c) = cur {
if kinds.contains(&c.kind()) {
return Some(c);
}
cur = c.parent();
}
None
}
// JSON files with this many lines or more are skipped (no chunks are produced).
const JSON_MAX_LINES: usize = 500;
// Upper bound on the number of lines in a single plain-text chunk.
const PLAINTEXT_MAX_LINES: usize = 50;
/// Chunks a non-code document according to its (lower-cased) file extension.
///
/// Handles Markdown, YAML, TOML, JSON, plain text / logs and XML. Returns
/// `None` when the extension is not a supported document type, or when the
/// JSON chunker itself returns `None`.
pub fn chunk_document(file: &str, content: &str) -> Option<Vec<RawChunk>> {
    let extension = std::path::Path::new(file)
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();

    match extension.as_str() {
        "md" | "mdx" => Some(chunk_markdown(file, content)),
        "yaml" | "yml" => Some(chunk_yaml(file, content)),
        "toml" => Some(chunk_toml(file, content)),
        "json" => chunk_json(file, content),
        "txt" | "log" => Some(chunk_plaintext(file, content)),
        "xml" => Some(chunk_xml(file, content)),
        _ => None,
    }
}
/// Assembles a chunk for a document (non-AST) section.
///
/// The id is `file::Type::name::start_line` when a non-empty name is given
/// and `file:start_line:end_line` otherwise. Call, inheritance, hierarchy
/// and NLP metadata are left empty and the depth is zero.
fn document_chunk(
    file: &str,
    start_line: usize,
    end_line: usize,
    content: String,
    function_name: Option<String>,
    language: &str,
    chunk_type: ChunkType,
) -> RawChunk {
    let id = match function_name.as_deref() {
        None | Some("") => format!("{file}:{start_line}:{end_line}"),
        Some(name) => format!("{file}::{}::{name}::{start_line}", chunk_type.as_str()),
    };

    RawChunk {
        id,
        file: file.to_owned(),
        start_line,
        end_line,
        content,
        function_name,
        language: Some(language.to_owned()),
        chunk_type,
        chunk_depth: 0,
        parent_chunk_id: None,
        child_chunk_ids: vec![],
        calls: vec![],
        inherits_from: vec![],
        nlp_keywords: vec![],
        nlp_code_refs: vec![],
        virtual_terms: vec![],
    }
}
/// Splits a Markdown document into one chunk per heading-delimited section.
///
/// A section starts at a line whose first non-blank character is `#` and runs
/// until the line before the next such line. Text before the first heading
/// forms an unnamed section. Lines inside fenced code blocks (delimited by
/// lines starting with three backticks or `~~~`) are never treated as
/// headings. Sections that contain only whitespace are dropped.
///
/// Every chunk is tagged `markdown` / [`ChunkType::Docstring`] and named
/// after its heading text (without the leading `#` characters). If no section
/// survives, the whole content is returned as a single unnamed chunk. Empty
/// content yields no chunks.
fn chunk_markdown(file: &str, content: &str) -> Vec<RawChunk> {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }
    let mut out: Vec<RawChunk> = Vec::new();
    let mut section_start = 0usize;
    let mut section_heading: Option<String> = None;
    let mut in_code_fence = false;

    // Emits lines[start..end] as one section unless it is empty or blank.
    let flush = |out: &mut Vec<RawChunk>,
                 start: usize,
                 end: usize,
                 heading: &Option<String>,
                 lines: &[&str]| {
        if start >= end {
            return;
        }
        let text = lines[start..end].join("\n");
        if text.trim().is_empty() {
            return;
        }
        out.push(document_chunk(
            file,
            start + 1,
            end,
            text,
            heading.clone(),
            "markdown",
            ChunkType::Docstring,
        ));
    };

    for (i, line) in lines.iter().enumerate() {
        let trimmed = line.trim_start();
        // Fence markers toggle code mode and are never headings themselves.
        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
            in_code_fence = !in_code_fence;
            continue;
        }
        if in_code_fence {
            continue;
        }
        if trimmed.starts_with('#') {
            // Close the section that ends just before this heading.
            flush(&mut out, section_start, i, &section_heading, &lines);
            let heading = trimmed.trim_start_matches('#').trim().to_string();
            section_heading = if heading.is_empty() {
                None
            } else {
                Some(heading)
            };
            section_start = i;
        }
    }
    // Close the trailing section.
    flush(
        &mut out,
        section_start,
        lines.len(),
        &section_heading,
        &lines,
    );

    if out.is_empty() {
        out.push(document_chunk(
            file,
            1,
            lines.len(),
            content.to_string(),
            None,
            "markdown",
            ChunkType::Docstring,
        ));
    }
    out
}
/// Splits a YAML document into one chunk per top-level key.
///
/// A line is a section header when it is not blank, not a comment, does not
/// start with whitespace or `-`, and has a non-empty, space-free key before
/// its first `:`.
fn chunk_yaml(file: &str, content: &str) -> Vec<RawChunk> {
    chunk_by_top_level_key(file, content, "yaml", |line| {
        let body = line.trim_end();
        if body.is_empty() || body.starts_with('#') {
            return None;
        }
        // Indented lines and list items belong to the current section.
        if line.starts_with(|c: char| c.is_whitespace() || c == '-') {
            return None;
        }
        let (before_colon, _) = body.split_once(':')?;
        let key = before_colon.trim();
        if key.is_empty() || key.contains(' ') {
            None
        } else {
            Some(key.to_string())
        }
    })
}
/// Splits a TOML document into one chunk per table header.
///
/// A line is a section header when, ignoring trailing whitespace, it starts
/// with `[` and ends with `]`. The section name is the text between the
/// brackets, so `[[bin]]` and `[bin]` both yield `bin`.
fn chunk_toml(file: &str, content: &str) -> Vec<RawChunk> {
    chunk_by_top_level_key(file, content, "toml", |line| {
        let row = line.trim_end();
        if !(row.starts_with('[') && row.ends_with(']')) {
            return None;
        }
        // Each call strips every repeated bracket on its side, which covers
        // both single and double-bracket headers.
        let inner = row.trim_start_matches('[').trim_end_matches(']').trim();
        if inner.is_empty() {
            None
        } else {
            Some(inner.to_string())
        }
    })
}
/// Splits a key/section-oriented document into one chunk per section.
///
/// `header_of` is called for every line; when it returns a name, that line
/// starts a new section with that name. Lines before the first header form
/// an unnamed section. Sections that contain only whitespace are dropped.
///
/// Every chunk is tagged with `language` and [`ChunkType::Constant`]. If no
/// section survives, the whole content is returned as a single unnamed chunk.
/// Empty content yields no chunks.
fn chunk_by_top_level_key(
    file: &str,
    content: &str,
    language: &str,
    header_of: impl Fn(&str) -> Option<String>,
) -> Vec<RawChunk> {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }
    let mut out: Vec<RawChunk> = Vec::new();
    let mut section_start = 0usize;
    let mut section_name: Option<String> = None;

    // Emits lines[start..end] as one section unless it is empty or blank.
    let flush = |out: &mut Vec<RawChunk>,
                 start: usize,
                 end: usize,
                 name: &Option<String>,
                 lines: &[&str]| {
        if start >= end {
            return;
        }
        let text = lines[start..end].join("\n");
        if text.trim().is_empty() {
            return;
        }
        out.push(document_chunk(
            file,
            start + 1,
            end,
            text,
            name.clone(),
            language,
            ChunkType::Constant,
        ));
    };

    for (i, line) in lines.iter().enumerate() {
        if let Some(name) = header_of(line) {
            // Close the section that ends just before this header.
            flush(&mut out, section_start, i, &section_name, &lines);
            section_name = Some(name);
            section_start = i;
        }
    }
    // Close the trailing section.
    flush(&mut out, section_start, lines.len(), &section_name, &lines);

    if out.is_empty() {
        out.push(document_chunk(
            file,
            1,
            lines.len(),
            content.to_string(),
            None,
            language,
            ChunkType::Constant,
        ));
    }
    out
}
/// Chunks a JSON file as a single unit.
///
/// Returns one chunk spanning the whole file, or no chunks when the file is
/// empty or has `JSON_MAX_LINES` lines or more. The result is always `Some`.
fn chunk_json(file: &str, content: &str) -> Option<Vec<RawChunk>> {
    let line_count = content.lines().count();
    let skip = line_count == 0 || line_count >= JSON_MAX_LINES;
    let chunks = if skip {
        Vec::new()
    } else {
        vec![document_chunk(
            file,
            1,
            line_count,
            content.to_string(),
            None,
            "json",
            ChunkType::Constant,
        )]
    };
    Some(chunks)
}
/// Splits plain text into paragraph chunks.
///
/// Paragraphs are runs of consecutive non-blank lines. A paragraph longer
/// than `PLAINTEXT_MAX_LINES` is cut into consecutive pieces of at most that
/// many lines. Chunks are tagged `log` for `.log` files and `text`
/// otherwise, with type [`ChunkType::Code`]. If nothing is emitted (all
/// lines blank), the whole content becomes a single chunk. Empty content
/// yields no chunks.
fn chunk_plaintext(file: &str, content: &str) -> Vec<RawChunk> {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }

    let extension = std::path::Path::new(file)
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("")
        .to_ascii_lowercase();
    let lang = if extension == "log" { "log" } else { "text" };

    let mut out: Vec<RawChunk> = Vec::new();

    // Emits the paragraph lines[first..past_last], cut into bounded pieces.
    let emit = |out: &mut Vec<RawChunk>, first: usize, past_last: usize| {
        let pieces = lines[first..past_last].chunks(PLAINTEXT_MAX_LINES);
        for (piece_idx, piece) in pieces.enumerate() {
            let text = piece.join("\n");
            if text.trim().is_empty() {
                continue;
            }
            let from = first + piece_idx * PLAINTEXT_MAX_LINES;
            out.push(document_chunk(
                file,
                from + 1,
                from + piece.len(),
                text,
                None,
                lang,
                ChunkType::Code,
            ));
        }
    };

    // Index of the first line of the paragraph currently being read, if any.
    let mut run_start: Option<usize> = None;
    for (i, line) in lines.iter().enumerate() {
        match (line.trim().is_empty(), run_start) {
            // A blank line ends the open paragraph.
            (true, Some(first)) => {
                emit(&mut out, first, i);
                run_start = None;
            }
            // A non-blank line opens a paragraph when none is open.
            (false, None) => run_start = Some(i),
            _ => {}
        }
    }
    if let Some(first) = run_start {
        emit(&mut out, first, lines.len());
    }

    if out.is_empty() {
        out.push(document_chunk(
            file,
            1,
            lines.len(),
            content.to_string(),
            None,
            lang,
            ChunkType::Code,
        ));
    }
    out
}
// Splits an XML document into one chunk per direct child of the root element.
//
// This is a line-based heuristic, not an XML parser: nesting depth is tracked
// by counting opening and closing tags on each line. A child chunk starts on a
// line seen at depth 1 that opens more tags than it closes, and ends on the
// line after which the depth is back at 1 or below. Each chunk is named after
// the first tag on its starting line and tagged `xml` / `ChunkType::Class`.
// If no child is emitted, the whole content becomes a single unnamed chunk.
// Empty content yields no chunks.
fn chunk_xml(file: &str, content: &str) -> Vec<RawChunk> {
let lines: Vec<&str> = content.lines().collect();
if lines.is_empty() {
return Vec::new();
}
let mut out: Vec<RawChunk> = Vec::new();
// Nesting depth before the current line: 0 outside the root, 1 directly inside it.
let mut depth: i32 = 0;
// Start line index and tag name of the child element currently being collected.
let mut child_start: Option<usize> = None;
let mut child_name: Option<String> = None;
for (i, line) in lines.iter().enumerate() {
let opens = count_xml_opens(line);
let closes = count_xml_closes(line);
// Only a line that leaves an element open starts a child; an element that
// opens and closes on the same line (opens == closes) does not.
if depth == 1 && child_start.is_none() && opens > closes {
child_start = Some(i);
child_name = first_xml_tag_name(line);
}
// The depth is updated after the start check and before the end check below.
let prev_depth = depth;
depth += opens as i32;
depth -= closes as i32;
if let Some(start) = child_start {
// The child ends on the line where the depth returns to the root level.
if depth <= 1 && prev_depth >= 1 && i >= start {
let text = lines[start..=i].join("\n");
if !text.trim().is_empty() {
out.push(document_chunk(
file,
start + 1,
i + 1,
text,
child_name.clone(),
"xml",
ChunkType::Class,
));
}
child_start = None;
child_name = None;
}
}
}
// Fallback: nothing was split out, so keep the document as one chunk.
if out.is_empty() {
out.push(document_chunk(
file,
1,
lines.len(),
content.to_string(),
None,
"xml",
ChunkType::Class,
));
}
out
}
/// Counts the opening tags on a line.
///
/// A `<` that begins `<?`, `<!` (which includes `<!--`) or `</` is not an
/// opening tag and is stepped over one byte at a time, so tags inside a
/// comment on the same line are still counted. A tag whose text up to the
/// next `>` ends with `/>` is self-closing and not counted. A `<` with no
/// `>` after it on the line is ignored.
fn count_xml_opens(line: &str) -> usize {
    let mut opened = 0usize;
    let mut cursor = 0usize;
    while let Some(offset) = line[cursor..].find('<') {
        let at = cursor + offset;
        let tail = &line[at..];
        let not_an_opener = ["<?", "<!--", "<!", "</"]
            .iter()
            .any(|prefix| tail.starts_with(*prefix));
        if not_an_opener {
            cursor = at + 1;
            continue;
        }
        match tail.find('>') {
            Some(close) => {
                if !tail[..=close].ends_with("/>") {
                    opened += 1;
                }
                // Resume right after the tag's closing bracket.
                cursor = at + close + 1;
            }
            None => cursor = at + 1,
        }
    }
    opened
}
/// Counts the closing-tag openers (`</`) on a line.
fn count_xml_closes(line: &str) -> usize {
    line.match_indices("</").count()
}
/// Returns the name of the tag opened by the first `<` on the line.
///
/// Yields `None` when the line has no `<`, when that `<` begins a `<?`, `<!`
/// or `</` construct, or when no name follows it. The name runs up to the
/// first whitespace, `>` or `/`.
fn first_xml_tag_name(line: &str) -> Option<String> {
    let (_, after_bracket) = line.split_once('<')?;
    if matches!(after_bracket.chars().next(), Some('?' | '!' | '/')) {
        return None;
    }
    let name = after_bracket
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()
        .unwrap_or(after_bracket)
        .trim();
    if name.is_empty() {
        return None;
    }
    Some(name.to_string())
}
/// Chunks a file, using its syntax tree when the language is supported.
///
/// * Supported source language: the file is parsed with tree-sitter, chunks
///   are taken from classified nodes, oversized chunks are split, and
///   entities are extracted from the tree. If no node is classified, the
///   whole file becomes one generic chunk tagged with the language.
/// * Unsupported extension: the document chunker is tried first; failing
///   that, the file is cut into sliding windows of 150 lines with a stride
///   of 50.
/// * Grammar or parse failure: sliding windows as above.
///
/// Entities are only returned on the syntax-tree path.
pub fn chunk_ast(file: &str, content: &str) -> (Vec<RawChunk>, Vec<RawEntity>) {
    // Shared last-resort fallback.
    let sliding_window =
        || -> (Vec<RawChunk>, Vec<RawEntity>) { (chunk_text(file, content, 150, 50), Vec::new()) };

    let (lang, language) = match language_for(file) {
        Some(found) => found,
        None => {
            return match chunk_document(file, content) {
                Some(chunks) => (chunks, Vec::new()),
                None => sliding_window(),
            };
        }
    };

    let mut parser = Parser::new();
    if parser.set_language(&language).is_err() {
        tracing::warn!(
            "failed to set tree-sitter language for {file}; falling back to sliding-window"
        );
        return sliding_window();
    }

    let src = content.as_bytes();
    let tree = match parser.parse(src, None) {
        Some(tree) => tree,
        None => return sliding_window(),
    };

    let line_offsets = build_line_offsets(src);
    let mut chunks: Vec<RawChunk> = Vec::new();
    walk_for_chunks(
        tree.root_node(),
        src,
        file,
        lang,
        &line_offsets,
        0,
        &mut chunks,
    );

    if chunks.is_empty() {
        // Nothing was classified: keep the file as one language-tagged chunk.
        let total_lines = content.lines().count().max(1);
        let mut whole_file = RawChunk::generic(
            format!("{file}:1:{total_lines}"),
            file.to_string(),
            1,
            total_lines,
            content.to_string(),
        );
        whole_file.language = Some(lang.to_string());
        chunks.push(whole_file);
    }

    let split = split_oversized(chunks);
    let entities = extract_entities(&tree, src, file, lang);
    (split, entities)
}
/// Recursively turns classified syntax nodes into chunks, appended to `out`.
///
/// Unclassified nodes are transparent: their children are visited at the
/// same `depth`. A classified node produces one chunk (with calls,
/// inheritance and doc-comment metadata) and its children are visited at
/// `depth + 1`, unless the node is one of the function-like kinds listed
/// below, whose bodies are not descended into.
///
/// Method names are qualified as `Owner::name` for Rust (enclosing `impl`
/// type), Scala and PHP (enclosing class-like definition).
fn walk_for_chunks(
    node: Node<'_>,
    src: &[u8],
    file: &str,
    lang: &str,
    line_offsets: &[usize],
    depth: usize,
    out: &mut Vec<RawChunk>,
) {
    let Some(chunk_type) = classify_node(lang, node) else {
        let mut cursor = node.walk();
        for child in node.children(&mut cursor) {
            walk_for_chunks(child, src, file, lang, line_offsets, depth, out);
        }
        return;
    };

    let (start_byte, end_byte) = (node.start_byte(), node.end_byte());
    let start_line = line_for_byte(line_offsets, start_byte);
    // `end_byte` is exclusive, so the last line is that of the byte before it.
    let end_line = line_for_byte(line_offsets, end_byte.saturating_sub(1));
    let content = std::str::from_utf8(&src[start_byte..end_byte])
        .unwrap_or("")
        .to_string();

    let mut name = name_of(node, src);
    if chunk_type == ChunkType::Method && !name.is_empty() {
        let owner = match lang {
            // For Rust the owner text is used as-is, even when empty.
            "rust" => rust_impl_type_name(node, src),
            "scala" => scala_enclosing_class_name(node, src).filter(|ty| !ty.is_empty()),
            "php" => php_enclosing_class_name(node, src).filter(|ty| !ty.is_empty()),
            _ => None,
        };
        if let Some(ty) = owner {
            name = format!("{ty}::{name}");
        }
    }

    let calls = collect_calls(node, src, lang);
    let inherits_from = collect_inherits(node, src, lang);
    let doc = preceding_doc_comments(node, src);
    let (nlp_keywords, nlp_code_refs) = nlp_from_doc(&doc);
    let id = make_chunk_id(file, &chunk_type, &name, start_line, end_line);
    let function_name = if name.is_empty() { None } else { Some(name) };

    out.push(RawChunk {
        id,
        file: file.to_string(),
        start_line,
        end_line,
        content,
        function_name,
        language: Some(lang.to_string()),
        chunk_type,
        calls,
        inherits_from,
        chunk_depth: depth,
        parent_chunk_id: None,
        child_chunk_ids: Vec::new(),
        nlp_keywords,
        nlp_code_refs,
        virtual_terms: Vec::new(),
    });

    // Bodies of these function-like kinds are not searched for nested chunks.
    let is_leaf = matches!(
        node.kind(),
        "function_item"
            | "function_declaration"
            | "function_definition"
            | "method_declaration"
            | "method_definition"
            | "constructor_declaration"
            | "secondary_constructor"
            | "init_declaration"
            | "protocol_function_declaration"
    );
    if is_leaf {
        return;
    }
    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        walk_for_chunks(child, src, file, lang, line_offsets, depth + 1, out);
    }
}
/// Breaks chunks longer than `MAX_CHUNK_LINES` into overlapping sub-chunks.
///
/// Chunks within the limit pass through unchanged. For an oversized chunk,
/// sub-chunks of `SUB_CHUNK_WINDOW` lines are emitted every
/// `SUB_CHUNK_STRIDE` lines, with ids `<parent>::sub::<n>` and
/// `parent_chunk_id` set. The original chunk is kept and emitted after its
/// sub-chunks, with `child_chunk_ids` listing them. Sub-chunks inherit the
/// parent's name, language, type and depth, but no call, inheritance or NLP
/// metadata.
fn split_oversized(chunks: Vec<RawChunk>) -> Vec<RawChunk> {
    let mut result: Vec<RawChunk> = Vec::with_capacity(chunks.len());

    for mut parent in chunks {
        let span = parent.end_line.saturating_sub(parent.start_line) + 1;
        if span <= MAX_CHUNK_LINES {
            result.push(parent);
            continue;
        }

        let mut sub_ids: Vec<String> = Vec::new();
        {
            let body: Vec<&str> = parent.content.lines().collect();
            let window_starts = (0..body.len()).step_by(SUB_CHUNK_STRIDE);
            for (sub_idx, first) in window_starts.enumerate() {
                let past_last = (first + SUB_CHUNK_WINDOW).min(body.len());
                let sub_id = format!("{}::sub::{sub_idx}", parent.id);
                sub_ids.push(sub_id.clone());
                result.push(RawChunk {
                    id: sub_id,
                    file: parent.file.clone(),
                    start_line: parent.start_line + first,
                    end_line: parent.start_line + past_last - 1,
                    content: body[first..past_last].join("\n"),
                    function_name: parent.function_name.clone(),
                    language: parent.language.clone(),
                    chunk_type: parent.chunk_type.clone(),
                    calls: Vec::new(),
                    inherits_from: Vec::new(),
                    chunk_depth: parent.chunk_depth,
                    parent_chunk_id: Some(parent.id.clone()),
                    child_chunk_ids: Vec::new(),
                    nlp_keywords: Vec::new(),
                    nlp_code_refs: Vec::new(),
                    virtual_terms: Vec::new(),
                });
                // The window reached the last line; later windows would
                // only repeat its tail.
                if past_last == body.len() {
                    break;
                }
            }
        }

        parent.child_chunk_ids = sub_ids;
        result.push(parent);
    }

    result
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_overlapping_chunks() {
let content = (1..=200)
.map(|i| format!("line {i}"))
.collect::<Vec<_>>()
.join("\n");
let chunks = chunk_text("test.txt", &content, 150, 50);
assert!(chunks.len() >= 2);
assert_eq!(chunks[0].start_line, 1);
assert_eq!(chunks[1].start_line, 51);
}
#[test]
fn test_chunk_id_format() {
let chunks = chunk_text("src/main.txt", "line1\nline2\nline3", 150, 50);
assert!(chunks[0].id.starts_with("src/main.txt:"));
}
#[test]
fn test_rust_function_chunking() {
let src = r#"
fn alpha() {}
fn beta() -> i32 { 1 }
fn gamma(x: i32) -> i32 { x + 1 }
"#;
let (chunks, _ents) = chunk_ast("a.rs", src);
let fns: Vec<&RawChunk> = chunks
.iter()
.filter(|c| c.chunk_type == ChunkType::Function)
.collect();
assert_eq!(fns.len(), 3, "expected 3 function chunks, got {fns:?}");
let names: Vec<_> = fns
.iter()
.map(|c| c.function_name.clone().unwrap_or_default())
.collect();
assert!(names.contains(&"alpha".to_string()));
assert!(names.contains(&"beta".to_string()));
assert!(names.contains(&"gamma".to_string()));
}
#[test]
fn test_rust_impl_method_qualified_name() {
let src = r#"
struct Foo;
impl Foo {
fn bar(&self) {}
}
"#;
let (chunks, _) = chunk_ast("foo.rs", src);
let method = chunks
.iter()
.find(|c| c.chunk_type == ChunkType::Method)
.expect("expected at least one Method chunk");
assert_eq!(method.function_name.as_deref(), Some("Foo::bar"));
}
#[test]
fn test_rust_calls_extraction() {
let src = r#"
fn main() {
foo();
bar(1, 2);
}
fn foo() {}
fn bar(_a: i32, _b: i32) {}
"#;
let (chunks, _) = chunk_ast("m.rs", src);
let main_chunk = chunks
.iter()
.find(|c| c.function_name.as_deref() == Some("main"))
.expect("main chunk");
assert!(
main_chunk.calls.contains(&"foo".to_string()),
"calls={:?}",
main_chunk.calls
);
assert!(
main_chunk.calls.contains(&"bar".to_string()),
"calls={:?}",
main_chunk.calls
);
}
#[test]
fn test_rust_entity_named_types() {
let src = r#"
use std::sync::Arc;
fn f() {
let _x: Arc<Vec<String>> = Arc::new(Vec::new());
}
"#;
let (_chunks, entities) = chunk_ast("t.rs", src);
let named: Vec<&str> = entities
.iter()
.filter(|e| e.entity_type == crate::core::entity::EntityType::NamedType)
.map(|e| e.text.as_str())
.collect();
assert!(named.contains(&"Arc"), "named_types={named:?}");
assert!(named.contains(&"Vec"), "named_types={named:?}");
assert!(named.contains(&"String"), "named_types={named:?}");
}
#[test]
fn test_large_function_splits() {
let mut body = String::new();
for i in 0..250 {
body.push_str(&format!(" let _v{i} = {i};\n"));
}
let src = format!("fn huge() {{\n{body}}}\n");
let (chunks, _) = chunk_ast("h.rs", &src);
let subs: Vec<&RawChunk> = chunks
.iter()
.filter(|c| c.parent_chunk_id.is_some())
.collect();
assert!(
!subs.is_empty(),
"expected sub-chunks for 250-line fn, got {chunks:#?}"
);
let parent_id = subs[0].parent_chunk_id.clone().unwrap();
let parent = chunks
.iter()
.find(|c| c.id == parent_id)
.expect("parent retained");
assert!(!parent.child_chunk_ids.is_empty());
}
#[test]
fn test_unknown_language_fallback() {
let content = "hello world\nfoo bar\nbaz";
let (chunks, entities) = chunk_ast("notes.unknownext", content);
assert!(entities.is_empty());
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].chunk_type, ChunkType::Code);
}
#[test]
fn test_chunk_markdown_sections() {
let content = "# Title\n\nintro\n\n## Section A\n\nbody a\n\n## Section B\n\nbody b\n";
let chunks = chunk_markdown("doc.md", content);
assert!(
chunks.len() >= 2,
"expected multiple sections, got {chunks:#?}"
);
let names: Vec<_> = chunks
.iter()
.filter_map(|c| c.function_name.clone())
.collect();
assert!(names.iter().any(|n| n == "Section A"), "names={names:?}");
assert!(names.iter().any(|n| n == "Section B"), "names={names:?}");
for c in &chunks {
assert_eq!(c.language.as_deref(), Some("markdown"));
assert_eq!(c.chunk_type, ChunkType::Docstring);
}
}
/// A `##` line inside a fenced code block must not be treated as a heading,
/// while the real headings around it are still picked up.
#[test]
fn test_chunk_markdown_ignores_hash_in_code_fence() {
    let chunks = chunk_markdown(
        "doc.md",
        "# Real Heading\n\nintro\n\n```\n## not a heading\n```\n\n## Next\n\nx\n",
    );
    let names: Vec<&str> = chunks
        .iter()
        .filter_map(|c| c.function_name.as_deref())
        .collect();

    assert!(names.contains(&"Real Heading"));
    assert!(names.contains(&"Next"));
    let split_inside_fence = names.contains(&"not a heading");
    assert!(
        !split_inside_fence,
        "should not split on # inside fenced code block: {names:?}"
    );
}
/// Chunk names from a YAML document include its top-level keys, and every
/// chunk is tagged with the `yaml` language.
#[test]
fn test_chunk_yaml_top_level_keys() {
    let chunks = chunk_yaml(
        "conf.yaml",
        "name: foo\nversion: 1.0\n\ndeps:\n - a\n - b\n\nscripts:\n build: x\n",
    );
    let names: Vec<&str> = chunks
        .iter()
        .filter_map(|c| c.function_name.as_deref())
        .collect();

    for key in ["name", "deps", "scripts"] {
        assert!(names.contains(&key), "names={names:?}");
    }
    for chunk in chunks.iter() {
        assert_eq!(chunk.language.as_deref(), Some("yaml"));
    }
}
/// Both `[table]` and `[[array-of-tables]]` headers produce chunks named after
/// the section.
#[test]
fn test_chunk_toml_sections() {
    let chunks = chunk_toml(
        "Cargo.toml",
        "[package]\nname = \"foo\"\nversion = \"1.0\"\n\n[dependencies]\nserde = \"1\"\n\n[[bin]]\nname = \"x\"\n",
    );
    let names: Vec<&str> = chunks
        .iter()
        .filter_map(|c| c.function_name.as_deref())
        .collect();

    for section in ["package", "dependencies", "bin"] {
        assert!(names.contains(&section), "names={names:?}");
    }
}
/// A small JSON document comes back as a single chunk tagged `json`.
#[test]
fn test_chunk_json_small_file_single_chunk() {
    let source = "{\n \"name\": \"foo\",\n \"version\": \"1.0\"\n}\n";
    let chunks = chunk_json("a.json", source).expect("Some result");
    assert_eq!(chunks.len(), 1);
    let language = chunks[0].language.as_deref();
    assert_eq!(language, Some("json"));
}
/// A JSON input with 600 entry lines still returns `Some`, but with no chunks.
#[test]
fn test_chunk_json_large_file_skipped() {
    let entries: Vec<String> = (0..600).map(|i| format!(" \"k{i}\": {i},")).collect();
    let content = format!("{{\n{}\n}}\n", entries.join("\n"));
    let chunks = chunk_json("big.json", &content).expect("Some result");
    assert!(chunks.is_empty(), "expected large JSON to be skipped");
}
/// Three blank-line-separated paragraphs in a `.txt` file become three chunks,
/// each tagged with the `text` language.
#[test]
fn test_chunk_plaintext_paragraphs() {
    let chunks = chunk_plaintext(
        "note.txt",
        "First paragraph line 1.\nFirst paragraph line 2.\n\nSecond paragraph line 1.\nSecond paragraph line 2.\n\nThird paragraph.\n",
    );
    let produced = chunks.len();
    assert_eq!(
        produced, 3,
        "expected one chunk per paragraph, got {chunks:#?}"
    );
    for chunk in chunks.iter() {
        let language = chunk.language.as_deref();
        assert_eq!(language, Some("text"));
    }
}
/// A single 130-line paragraph in a `.log` file is broken into at least three
/// chunks, none longer than 50 lines, each tagged with the `log` language.
#[test]
fn test_chunk_plaintext_caps_at_50_lines() {
    let lines: Vec<String> = (1..=130).map(|i| format!("line {i}")).collect();
    let content = lines.join("\n");
    let chunks = chunk_plaintext("big.log", &content);

    let total = chunks.len();
    assert!(
        total >= 3,
        "expected at least 3 chunks for 130-line paragraph, got {total}"
    );
    for chunk in chunks.iter() {
        // Line ranges are inclusive on both ends, hence the `+ 1`.
        let line_count = chunk.end_line.saturating_sub(chunk.start_line) + 1;
        assert!(line_count <= 50, "chunk too large: {line_count} lines");
        assert_eq!(chunk.language.as_deref(), Some("log"));
    }
}
/// Children of the XML root element become chunks named after their tag:
/// two `book` elements and one `magazine`, all tagged with the `xml` language.
#[test]
fn test_chunk_xml_top_level_children() {
    let chunks = chunk_xml(
        "data.xml",
        "<?xml version=\"1.0\"?>\n<library>\n <book id=\"1\">\n <title>A</title>\n </book>\n <book id=\"2\">\n <title>B</title>\n </book>\n <magazine>\n <title>C</title>\n </magazine>\n</library>\n",
    );
    let names: Vec<&str> = chunks
        .iter()
        .filter_map(|c| c.function_name.as_deref())
        .collect();

    let book_count = names.iter().filter(|n| **n == "book").count();
    assert!(book_count >= 2, "names={names:?}");
    assert!(names.contains(&"magazine"), "names={names:?}");

    for chunk in chunks.iter() {
        assert_eq!(chunk.language.as_deref(), Some("xml"));
    }
}
/// `chunk_ast` routes `.md`, `.yml` and `.toml` files to chunkers that tag
/// their output with the matching language.
#[test]
fn test_chunk_document_dispatch() {
    // (file name, content, language expected on at least one chunk)
    let cases = [
        ("readme.md", "# Hello\n\nworld\n", "markdown"),
        ("conf.yml", "key: value\n", "yaml"),
        ("a.toml", "[section]\nx = 1\n", "toml"),
    ];
    for (file, content, language) in cases {
        let (chunks, _) = chunk_ast(file, content);
        assert!(chunks
            .iter()
            .any(|c| c.language.as_deref() == Some(language)));
    }
}
/// The identifier quoted in backticks in the doc comment must show up in the
/// function chunk's `nlp_code_refs`.
#[test]
fn test_nlp_code_refs() {
    let src = r#"
/// Wraps the `CodeIndexer` to expose hybrid search.
fn make() {}
"#;
    let (chunks, _) = chunk_ast("d.rs", src);
    // Dump the chunks on a miss so the failure explains itself, instead of the
    // context-free message a bare `unwrap()` would give.
    let f = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("make"))
        .unwrap_or_else(|| panic!("expected a chunk for fn make, got {chunks:#?}"));
    assert!(
        f.nlp_code_refs.iter().any(|k| k == "CodeIndexer"),
        "code_refs={:?}",
        f.nlp_code_refs
    );
}
/// A `use usearch::Index;` import must surface `usearch` as an `ExternalCrate`
/// entity.
#[test]
fn test_entity_external_crate() {
    let src = r#"
use usearch::Index;
fn f() {}
"#;
    let (_, entities) = chunk_ast("u.rs", src);
    let external: Vec<&str> = entities
        .iter()
        .filter_map(|e| {
            (e.entity_type == crate::core::entity::EntityType::ExternalCrate)
                .then(|| e.text.as_str())
        })
        .collect();
    assert!(
        external.iter().any(|name| *name == "usearch"),
        "external_crates={external:?}"
    );
}
/// A function that bails with `anyhow::bail!` must yield at least one
/// `ErrorVariant` entity.
#[test]
fn test_entity_error_variant() {
    let src = r#"
fn f() -> Result<(), anyhow::Error> {
anyhow::bail!("index not found");
}
"#;
    let (_, ents) = chunk_ast("e.rs", src);
    let has_error_variant = ents
        .iter()
        .map(|e| &e.entity_type)
        .any(|kind| *kind == crate::core::entity::EntityType::ErrorVariant);
    assert!(
        has_error_variant,
        "expected at least one ErrorVariant entity, got {ents:#?}"
    );
}
/// C#: a class is chunked as `Class`, an interface as `Trait`, and a method as
/// `Method` whose recorded calls include both a bare call and a `this.` call.
#[test]
fn test_csharp_chunking() {
    let src = r#"
namespace MyApp {
class Foo {
public void Bar() { Baz(); this.Qux(); }
public Foo() {}
}
interface IThing { void Do(); }
}
"#;
    let (chunks, _) = chunk_ast("a.cs", src);

    // True when some chunk has the given type and name.
    let has_chunk = |kind: ChunkType, name: &str| {
        chunks
            .iter()
            .any(|c| c.chunk_type == kind && c.function_name.as_deref() == Some(name))
    };
    assert!(
        has_chunk(ChunkType::Class, "Foo"),
        "expected class Foo, got {chunks:#?}"
    );
    assert!(
        has_chunk(ChunkType::Trait, "IThing"),
        "expected interface IThing as Trait"
    );

    let bar = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("Bar"))
        .expect("Bar method chunk");
    assert_eq!(bar.chunk_type, ChunkType::Method);
    for callee in ["Baz", "Qux"] {
        assert!(
            bar.calls.iter().any(|c| c == callee),
            "calls={:?}",
            bar.calls
        );
    }
}
/// Kotlin: a class is chunked as `Class` and its member function as `Method`,
/// with both the bare call and the `this.` call recorded.
#[test]
fn test_kotlin_chunking() {
    let src = r#"
class Foo {
fun bar() { baz(); this.qux() }
}
object Singleton {
fun run() { other() }
}
"#;
    let (chunks, _) = chunk_ast("a.kt", src);

    let found_class_foo = chunks
        .iter()
        .filter(|c| c.chunk_type == ChunkType::Class)
        .any(|c| c.function_name.as_deref() == Some("Foo"));
    assert!(found_class_foo, "expected class Foo, got {chunks:#?}");

    let bar = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("bar"))
        .expect("bar method chunk");
    assert_eq!(bar.chunk_type, ChunkType::Method);
    for callee in ["baz", "qux"] {
        assert!(
            bar.calls.iter().any(|c| c == callee),
            "calls={:?}",
            bar.calls
        );
    }
}
/// Swift: class, struct, enum, protocol and extension declarations map to
/// `Class`, `Struct`, `Enum`, `Trait` and `Module` chunks respectively, and a
/// method records both its bare call and its `self.` call.
#[test]
fn test_swift_chunking() {
    let src = r#"
class Foo {
func bar() { baz(); self.qux() }
init() {}
}
struct S {}
enum E { case a }
protocol P { func d() }
extension Foo { func ext() {} }
"#;
    let (chunks, _) = chunk_ast("a.swift", src);

    // True when some chunk has the given name and type.
    let has_chunk = |name: &str, kind: ChunkType| {
        chunks
            .iter()
            .any(|c| c.function_name.as_deref() == Some(name) && c.chunk_type == kind)
    };
    assert!(
        has_chunk("Foo", ChunkType::Class),
        "expected class Foo, got {chunks:#?}"
    );
    let declarations = [
        ("S", ChunkType::Struct, "expected struct S"),
        ("E", ChunkType::Enum, "expected enum E"),
        ("P", ChunkType::Trait, "expected protocol P as Trait"),
        ("Foo", ChunkType::Module, "expected extension Foo as Module"),
    ];
    for (name, kind, message) in declarations {
        assert!(has_chunk(name, kind), "{message}");
    }

    let bar = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("bar"))
        .expect("bar method chunk");
    for callee in ["baz", "qux"] {
        assert!(
            bar.calls.iter().any(|c| c == callee),
            "calls={:?}",
            bar.calls
        );
    }
}
/// Words from a function's doc comment ("RRF", "Implements") must show up in
/// the function chunk's `nlp_keywords`.
#[test]
fn test_nlp_keywords_from_doc_comments() {
    let src = r#"
/// Implements the RRF fusion algorithm.
fn fuse() {}
"#;
    let (chunks, _) = chunk_ast("d.rs", src);
    // Dump the chunks on a miss so the failure explains itself, instead of the
    // context-free message a bare `unwrap()` would give.
    let f = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("fuse"))
        .unwrap_or_else(|| panic!("expected a chunk for fn fuse, got {chunks:#?}"));
    for keyword in ["RRF", "Implements"] {
        assert!(
            f.nlp_keywords.iter().any(|k| k == keyword),
            "keywords={:?}",
            f.nlp_keywords
        );
    }
}
/// Scala: methods declared inside a class or object are named
/// `Owner::method` and typed `Method`; a top-level `def` keeps its bare name
/// and is typed `Function`.
#[test]
fn test_scala_method_qualified_name() {
    let src = r#"
class Foo extends Bar with Mixin {
def bar(): Unit = baz()
}
object O {
def run(): Unit = other()
}
def freefn(): Unit = ()
"#;
    let (chunks, _) = chunk_ast("a.scala", src);
    // `expect` takes a plain `&str` and would print `{chunks:#?}` literally,
    // so panic through a formatting macro to actually dump the chunks.
    let bar = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("Foo::bar"))
        .unwrap_or_else(|| panic!("expected qualified method Foo::bar, got: {chunks:#?}"));
    assert_eq!(bar.chunk_type, ChunkType::Method);
    let run = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("O::run"))
        .expect("expected qualified method O::run");
    assert_eq!(run.chunk_type, ChunkType::Method);
    assert!(
        chunks
            .iter()
            .any(|c| c.function_name.as_deref() == Some("freefn")
                && c.chunk_type == ChunkType::Function),
        "expected unqualified Function freefn, got {chunks:#?}"
    );
}
/// Scala: calls made inside `Foo::bar` (a bare call and a `this.` call) are
/// recorded on that method's chunk.
#[test]
fn test_scala_caller_scoped_call_edges() {
    let src = r#"
class Foo {
def bar(): Unit = {
baz()
this.qux()
}
}
"#;
    let (chunks, _) = chunk_ast("a.scala", src);
    let method = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("Foo::bar"))
        .expect("Foo::bar chunk");
    for callee in ["baz", "qux"] {
        assert!(
            method.calls.iter().any(|c| c == callee),
            "calls={:?}",
            method.calls
        );
    }
}
/// Scala: the `extends` target and every `with` mixin appear in the class
/// chunk's `inherits_from`.
#[test]
fn test_scala_extends_and_with_emit_inherits() {
    let src = r#"
class Foo extends Bar with Mixin with Other {
def m(): Unit = ()
}
"#;
    let (chunks, _) = chunk_ast("a.scala", src);
    let class_chunk = chunks
        .iter()
        .filter(|c| c.chunk_type == ChunkType::Class)
        .find(|c| c.function_name.as_deref() == Some("Foo"))
        .expect("Foo class chunk");
    let parents = &class_chunk.inherits_from;
    for expected in ["Bar", "Mixin", "Other"] {
        assert!(
            parents.iter().map(String::as_str).any(|p| p == expected),
            "expected parent {expected} in inherits_from={parents:?}"
        );
    }
}
/// Scala: a symbol graph built from the chunks reports `Foo::bar` as a caller
/// of the top-level `baz`.
#[test]
fn test_scala_symbol_graph_resolves_caller() {
    use crate::core::symbol_graph::SymbolGraph;
    let src = r#"
class Foo {
def bar(): Unit = baz()
}
def baz(): Unit = ()
"#;
    let (chunks, _) = chunk_ast("s.scala", src);

    // Flatten each chunk into the tuple shape `build_from_chunks` consumes.
    let mut tuples = Vec::with_capacity(chunks.len());
    for chunk in &chunks {
        tuples.push((
            chunk.id.clone(),
            chunk.file.clone(),
            chunk.function_name.clone(),
            chunk.calls.clone(),
            chunk.inherits_from.clone(),
            chunk.chunk_type.clone(),
        ));
    }

    let graph = SymbolGraph::build_from_chunks(&tuples);
    let callers = graph.callers_of("baz", 1);
    let found = callers.iter().any(|(s, _)| s == "Foo::bar");
    assert!(found, "callers={callers:?}");
}
/// PHP: a class method is named `Class::method` and typed `Method`; a
/// top-level function keeps its bare name and is typed `Function`.
#[test]
fn test_php_method_qualified_name() {
    let src = r#"<?php
class Foo extends Bar implements I1, I2 {
public function doIt(): void {
$this->helper();
}
}
function freefn(): void {}
"#;
    let (chunks, _) = chunk_ast("a.php", src);
    // `expect` takes a plain `&str` and would print `{chunks:#?}` literally,
    // so panic through a formatting macro to actually dump the chunks.
    let doit = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("Foo::doIt"))
        .unwrap_or_else(|| panic!("expected qualified Foo::doIt, got: {chunks:#?}"));
    assert_eq!(doit.chunk_type, ChunkType::Method);
    assert!(
        chunks
            .iter()
            .any(|c| c.function_name.as_deref() == Some("freefn")
                && c.chunk_type == ChunkType::Function),
        "expected unqualified Function freefn"
    );
}
/// PHP: instance calls (`$this->`), static calls (`Foo::`) and plain function
/// calls made inside `Foo::doIt` are all recorded on that method's chunk.
#[test]
fn test_php_caller_scoped_call_edges() {
    let src = r#"<?php
class Foo {
public function doIt(): void {
$this->helper();
Foo::staticCall();
regularFunc();
}
}
"#;
    let (chunks, _) = chunk_ast("a.php", src);
    let method = chunks
        .iter()
        .find(|c| c.function_name.as_deref() == Some("Foo::doIt"))
        .expect("Foo::doIt chunk");
    let calls = &method.calls;
    for wanted in ["helper", "staticCall", "regularFunc"] {
        assert!(
            calls.iter().map(String::as_str).any(|c| c == wanted),
            "expected callee {wanted} in calls={calls:?}"
        );
    }
}
/// PHP: the `extends` target and every `implements` interface appear in the
/// class chunk's `inherits_from`.
#[test]
fn test_php_implements_and_extends_emit_inherits() {
    let src = r#"<?php
class Foo extends Bar implements I1, I2 {}
"#;
    let (chunks, _) = chunk_ast("a.php", src);
    let class_chunk = chunks
        .iter()
        .filter(|c| c.chunk_type == ChunkType::Class)
        .find(|c| c.function_name.as_deref() == Some("Foo"))
        .expect("Foo class chunk");
    let parents = &class_chunk.inherits_from;
    for expected in ["Bar", "I1", "I2"] {
        assert!(
            parents.iter().map(String::as_str).any(|p| p == expected),
            "expected parent {expected} in inherits_from={parents:?}"
        );
    }
}
/// PHP: an interface is chunked as `Trait`, and every interface it extends
/// appears in its `inherits_from`.
#[test]
fn test_php_interface_extends_emits_inherits() {
    let src = r#"<?php
interface Child extends P1, P2 {}
"#;
    let (chunks, _) = chunk_ast("a.php", src);
    let interface_chunk = chunks
        .iter()
        .filter(|c| c.function_name.as_deref() == Some("Child"))
        .find(|c| c.chunk_type == ChunkType::Trait)
        .expect("Child interface (chunked as Trait)");
    let parents = &interface_chunk.inherits_from;
    for expected in ["P1", "P2"] {
        assert!(
            parents.iter().map(String::as_str).any(|p| p == expected),
            "expected parent {expected} in inherits_from={parents:?}"
        );
    }
}
/// PHP: a symbol graph built from the chunks reports `Foo::doIt` as a caller
/// of `Foo::helper`, which it invokes through `$this->helper()`.
#[test]
fn test_php_symbol_graph_resolves_caller() {
    use crate::core::symbol_graph::SymbolGraph;
    let src = r#"<?php
class Foo {
public function doIt(): void {
$this->helper();
}
public function helper(): void {}
}
"#;
    let (chunks, _) = chunk_ast("p.php", src);

    // Flatten each chunk into the tuple shape `build_from_chunks` consumes.
    let mut tuples = Vec::with_capacity(chunks.len());
    for chunk in &chunks {
        tuples.push((
            chunk.id.clone(),
            chunk.file.clone(),
            chunk.function_name.clone(),
            chunk.calls.clone(),
            chunk.inherits_from.clone(),
            chunk.chunk_type.clone(),
        ));
    }

    let graph = SymbolGraph::build_from_chunks(&tuples);
    let callers = graph.callers_of("Foo::helper", 1);
    let found = callers.iter().any(|(s, _)| s == "Foo::doIt");
    assert!(found, "callers={callers:?}");
}
}