use std::path::Path;
use tree_sitter::{Node, Parser};
use codescout_embed::chunker::RawChunk;
/// Per-language settings that drive AST-based chunking.
pub struct LanguageSpec {
/// Tree-sitter node kinds that `extract_ast_nodes` accepts when they appear
/// as direct children of the root node.
pub node_types: &'static [&'static str],
/// Line prefixes that mark a line as a documentation comment; they are
/// handed to `is_doc_line`.
pub doc_prefixes: &'static [&'static str],
/// Tree-sitter node kinds searched for inside an extracted node so that a
/// container can be split into per-member chunks. An empty list disables
/// that inner pass.
pub inner_node_types: &'static [&'static str],
}
/// One row of `LANGUAGE_REGISTRY`: a language name paired with its spec.
struct RegistryEntry {
/// Lowercase language name that `get_language_spec` compares against.
name: &'static str,
/// Chunking settings for that language.
spec: LanguageSpec,
}
/// Table of the languages that have a dedicated `LanguageSpec`.
///
/// Entries are looked up by `name` through `get_language_spec`, which
/// lowercases the requested name first, so every `name` here is lowercase.
static LANGUAGE_REGISTRY: &[RegistryEntry] = &[
RegistryEntry {
name: "rust",
spec: LanguageSpec {
node_types: &[
"function_item",
"struct_item",
"enum_item",
"trait_item",
"impl_item",
"mod_item",
"type_item",
"const_item",
"static_item",
"macro_definition",
],
doc_prefixes: &["///", "//!"],
inner_node_types: &["function_item", "const_item", "type_item", "impl_item"],
},
},
RegistryEntry {
name: "python",
spec: LanguageSpec {
node_types: &[
"function_definition",
"class_definition",
"decorated_definition",
"async_function_definition",
],
doc_prefixes: &["#"],
inner_node_types: &[
"function_definition",
"decorated_definition",
"async_function_definition",
],
},
},
RegistryEntry {
name: "go",
spec: LanguageSpec {
node_types: &[
"function_declaration",
"method_declaration",
"type_declaration",
"var_declaration",
"const_declaration",
],
doc_prefixes: &["//"],
// No inner pass for Go: extracted nodes are never split into members.
inner_node_types: &[],
},
},
RegistryEntry {
name: "typescript",
spec: LanguageSpec {
node_types: &[
"function_declaration",
"class_declaration",
"method_definition",
"export_statement",
"interface_declaration",
"type_alias_declaration",
],
doc_prefixes: &["/**", " *", "//"],
inner_node_types: &["method_definition"],
},
},
RegistryEntry {
name: "javascript",
spec: LanguageSpec {
node_types: &[
"function_declaration",
"class_declaration",
"method_definition",
"export_statement",
],
doc_prefixes: &["/**", " *", "//"],
inner_node_types: &["method_definition"],
},
},
// Same node kinds and prefixes as the "typescript" entry.
RegistryEntry {
name: "tsx",
spec: LanguageSpec {
node_types: &[
"function_declaration",
"class_declaration",
"method_definition",
"export_statement",
"interface_declaration",
"type_alias_declaration",
],
doc_prefixes: &["/**", " *", "//"],
inner_node_types: &["method_definition"],
},
},
// Same node kinds and prefixes as the "javascript" entry.
RegistryEntry {
name: "jsx",
spec: LanguageSpec {
node_types: &[
"function_declaration",
"class_declaration",
"method_definition",
"export_statement",
],
doc_prefixes: &["/**", " *", "//"],
inner_node_types: &["method_definition"],
},
},
RegistryEntry {
name: "java",
spec: LanguageSpec {
node_types: &[
"method_declaration",
"class_declaration",
"interface_declaration",
"constructor_declaration",
"enum_declaration",
],
doc_prefixes: &["/**", " *"],
inner_node_types: &[
"method_declaration",
"constructor_declaration",
"field_declaration",
],
},
},
RegistryEntry {
name: "kotlin",
spec: LanguageSpec {
node_types: &[
"function_declaration",
"class_declaration",
"object_declaration",
"property_declaration",
],
doc_prefixes: &["/**", " *"],
inner_node_types: &["function_declaration", "property_declaration"],
},
},
RegistryEntry {
name: "bash",
spec: LanguageSpec {
node_types: &["function_definition"],
doc_prefixes: &["#"],
// No inner pass for Bash.
inner_node_types: &[],
},
},
];
/// A syntax node selected for chunking, reduced to what the chunker needs.
#[derive(Debug, Clone)]
pub(crate) struct AstNode {
/// Row on which the node starts, taken from tree-sitter's
/// `start_position().row` and used as a zero-based index into the source
/// lines. Inner nodes have the container's start line added so the value
/// stays file-relative.
pub(crate) start_line: usize,
/// Row on which the node ends (inclusive), from `end_position().row`,
/// offset the same way as `start_line`.
pub(crate) end_line: usize,
/// Tree-sitter node kind, e.g. `"function_item"`.
pub(crate) kind: String,
/// Text of the node's `name` field, when the node has one.
pub(crate) name: Option<String>,
}
/// Looks up the chunking spec registered for `lang`.
///
/// The requested name is lowercased before it is compared with the registry
/// names, so the match is case-insensitive. Returns `None` when no registry
/// entry carries that name.
pub fn get_language_spec(lang: &str) -> Option<&'static LanguageSpec> {
    let wanted = lang.to_lowercase();
    for entry in LANGUAGE_REGISTRY {
        if entry.name == wanted {
            return Some(&entry.spec);
        }
    }
    None
}
/// Returns `true` when `path` has a `md` or `markdown` extension, compared
/// case-insensitively. Paths without an extension, or whose extension is not
/// valid UTF-8, are not markdown.
fn is_markdown(path: &Path) -> bool {
    match path.extension().and_then(|raw| raw.to_str()) {
        Some(ext) => matches!(ext.to_lowercase().as_str(), "md" | "markdown"),
        None => false,
    }
}
/// Returns the text of `node`'s `name` field as an owned string.
///
/// Yields `None` when the node has no `name` field or when its byte range in
/// `source` is not valid UTF-8.
fn extract_node_name(node: &tree_sitter::Node, source: &str) -> Option<String> {
    let name_node = node.child_by_field_name("name")?;
    let text = name_node.utf8_text(source.as_bytes()).ok()?;
    Some(text.to_owned())
}
/// Parses `source` with `ts_lang` and returns the chunkable nodes among the
/// root's direct children.
///
/// With a `spec`, a child is kept when its kind is listed in
/// `spec.node_types`. Without one, a generic heuristic applies: the child
/// must be named, its end row must be at least two rows after its start row,
/// and it must have at least one named child.
///
/// # Errors
///
/// Fails when the language cannot be set on the parser or when tree-sitter
/// returns no tree.
pub(crate) fn extract_ast_nodes(
    source: &str,
    ts_lang: &tree_sitter::Language,
    spec: Option<&LanguageSpec>,
) -> anyhow::Result<Vec<AstNode>> {
    let mut parser = Parser::new();
    parser.set_language(ts_lang)?;
    let tree = match parser.parse(source, None) {
        Some(tree) => tree,
        None => return Err(anyhow::anyhow!("tree-sitter parse failed")),
    };

    let root = tree.root_node();
    let mut walker = root.walk();
    let nodes: Vec<AstNode> = root
        .children(&mut walker)
        .filter(|child| match spec {
            Some(spec) => spec.node_types.contains(&child.kind()),
            None => {
                let row_span = child
                    .end_position()
                    .row
                    .saturating_sub(child.start_position().row);
                child.is_named() && row_span >= 2 && has_named_child(*child)
            }
        })
        .map(|child| AstNode {
            start_line: child.start_position().row,
            end_line: child.end_position().row,
            kind: child.kind().to_string(),
            name: extract_node_name(&child, source),
        })
        .collect();
    Ok(nodes)
}
/// Reports whether any direct child of `node` is a named node.
fn has_named_child(node: Node) -> bool {
    let mut walker = node.walk();
    for child in node.children(&mut walker) {
        if child.is_named() {
            return true;
        }
    }
    false
}
/// Walks the subtree under `node` depth-first and appends every descendant
/// whose kind is listed in `inner_types` to `result`.
///
/// A matching descendant is recorded and not descended into; a non-matching
/// one is searched recursively. Rows are shifted by `line_offset` so the
/// recorded lines are relative to the enclosing file, not to the re-parsed
/// snippet. `source` is the snippet text the tree was parsed from and is used
/// to read node names. `source_lines` and `doc_prefixes` are currently unused.
fn collect_inner_nodes(
    node: tree_sitter::Node,
    inner_types: &[&str],
    source_lines: &[&str],
    doc_prefixes: &[&str],
    line_offset: usize,
    source: &str,
    result: &mut Vec<AstNode>,
) {
    // Accepted for signature stability; not consulted here.
    let _ = (source_lines, doc_prefixes);

    let mut walker = node.walk();
    for child in node.children(&mut walker) {
        let kind = child.kind();
        if !inner_types.contains(&kind) {
            collect_inner_nodes(
                child,
                inner_types,
                source_lines,
                doc_prefixes,
                line_offset,
                source,
                result,
            );
            continue;
        }
        let first_row = child.start_position().row;
        let last_row = child.end_position().row;
        result.push(AstNode {
            start_line: line_offset + first_row,
            end_line: line_offset + last_row,
            kind: kind.to_string(),
            name: extract_node_name(&child, source),
        });
    }
}
/// Re-parses the lines covered by `node` on their own and collects the
/// member nodes (kinds listed in `inner_types`) found inside it.
///
/// The returned nodes carry file-relative line numbers: rows from the snippet
/// parse are offset by `node.start_line`.
///
/// Returns `None` when the node's line range lies outside `source_lines`,
/// when the language cannot be set or the parse yields no tree, or when no
/// member node is found.
fn try_extract_inner_nodes(
    source_lines: &[&str],
    node: &AstNode,
    ts_lang: &tree_sitter::Language,
    inner_types: &[&str],
    doc_prefixes: &[&str],
) -> Option<Vec<AstNode>> {
    // Clamp the inclusive end row to the last available line. The chunk
    // builder clamps the same value before slicing; without this an end row
    // past the final line would panic here.
    let last_line = source_lines.len().checked_sub(1)?;
    let end_line = node.end_line.min(last_line);
    let node_source = source_lines.get(node.start_line..=end_line)?.join("\n");
    let node_lines: Vec<&str> = node_source.lines().collect();

    let mut parser = Parser::new();
    parser.set_language(ts_lang).ok()?;
    let tree = parser.parse(&node_source, None)?;

    // The snippet normally parses to a root with the container as its first
    // named child; fall back to the root itself when there is none.
    let root = tree.root_node();
    let container = root.named_child(0).unwrap_or(root);

    let mut result = Vec::new();
    collect_inner_nodes(
        container,
        inner_types,
        &node_lines,
        doc_prefixes,
        node.start_line,
        &node_source,
        &mut result,
    );

    if result.is_empty() {
        None
    } else {
        Some(result)
    }
}
/// Builds the header chunk for a container node occupying `lines[start..end]`.
///
/// The header is the run of leading doc-comment lines followed by up to three
/// signature lines; the signature scan stops early after a line that contains
/// `{` or `=>`, or that ends with `:`. At least one line is kept whenever the
/// range is non-empty. The chunk's line numbers are 1-based and its metadata
/// is left unset.
fn extract_container_header(
    lines: &[&str],
    start: usize,
    end: usize,
    doc_prefixes: &[&str],
) -> RawChunk {
    let node_lines = &lines[start..end];

    let doc_len = node_lines
        .iter()
        .take_while(|&&line| is_doc_line(line, doc_prefixes))
        .count();

    let mut header_len = doc_len;
    for line in node_lines[doc_len..].iter().take(3) {
        header_len += 1;
        let text = line.trim();
        if text.contains('{') || text.ends_with(':') || text.contains("=>") {
            break;
        }
    }
    let header_len = header_len.max(1).min(node_lines.len());

    RawChunk {
        content: node_lines[..header_len].join("\n"),
        start_line: start + 1,
        end_line: start + header_len,
        metadata: None,
    }
}
/// Derives a short signature string from the first line of a node.
///
/// Trailing whitespace is removed, then the line is cut at the earliest of:
/// the first `{`, the first `=>`, or the last `:` when that colon sits within
/// the final two bytes of the line. The result is trimmed at the end again
/// and limited to 100 characters. Leading whitespace is kept.
fn extract_signature(first_line: &str) -> String {
    const MAX_LEN: usize = 100;

    let line = first_line.trim_end();
    let brace_at = line.find('{');
    let arrow_at = line.find("=>");
    // Only a colon at (or right next to) the end of the line counts as a
    // body opener; colons earlier in the line are part of the signature.
    let colon_at = line.rfind(':').filter(|&idx| idx + 2 >= line.len());

    let cut = [brace_at, arrow_at, colon_at]
        .iter()
        .copied()
        .flatten()
        .min()
        .unwrap_or(line.len());

    line[..cut].trim_end().chars().take(MAX_LEN).collect()
}
/// Maps a `(language, tree-sitter node kind)` pair to the short keyword used
/// when describing the node in a metadata header (for example `"fn"` for a
/// Rust `function_item`).
///
/// `lang` is matched exactly (lowercase names). Returns `None` for any
/// language or node kind that has no entry.
fn kind_keyword_for_node(lang: &str, node_kind: &str) -> Option<&'static str> {
    let keyword = match lang {
        "rust" => match node_kind {
            "function_item" => "fn",
            "struct_item" => "struct",
            "enum_item" => "enum",
            "trait_item" => "trait",
            "impl_item" => "impl",
            "mod_item" => "mod",
            "type_item" => "type",
            "const_item" => "const",
            "static_item" => "static",
            "macro_definition" => "macro",
            _ => return None,
        },
        "python" => match node_kind {
            "function_definition" | "decorated_definition" => "def",
            "async_function_definition" => "async def",
            "class_definition" => "class",
            _ => return None,
        },
        "typescript" | "tsx" => match node_kind {
            "function_declaration" => "function",
            "class_declaration" => "class",
            "method_definition" => "method",
            "interface_declaration" => "interface",
            "type_alias_declaration" => "type",
            "export_statement" => "export",
            _ => return None,
        },
        "javascript" | "jsx" => match node_kind {
            "function_declaration" => "function",
            "class_declaration" => "class",
            "method_definition" => "method",
            "export_statement" => "export",
            _ => return None,
        },
        "java" => match node_kind {
            "method_declaration" => "method",
            "class_declaration" => "class",
            "interface_declaration" => "interface",
            "constructor_declaration" => "constructor",
            "enum_declaration" => "enum",
            _ => return None,
        },
        "kotlin" => match node_kind {
            "function_declaration" => "fun",
            "class_declaration" => "class",
            "object_declaration" => "object",
            "property_declaration" => "property",
            _ => return None,
        },
        "go" => match node_kind {
            "function_declaration" => "func",
            "method_declaration" => "method",
            "type_declaration" => "type",
            "var_declaration" => "var",
            "const_declaration" => "const",
            _ => return None,
        },
        "bash" => match node_kind {
            "function_definition" => "function",
            _ => return None,
        },
        _ => return None,
    };
    Some(keyword)
}
/// Builds the `" :: "`-separated metadata header for a chunk.
///
/// The header starts with `file_path`, continues with each entry of
/// `container_path`, and ends with a description of the node itself:
/// `signature` when given, otherwise `"{kind} {name}"`, otherwise just
/// `name`. When neither a signature nor a name is available the node part is
/// left out. Returns `None` when `file_path` is empty.
fn build_metadata_header(
    file_path: &str,
    container_path: &[&str],
    kind: Option<&str>,
    name: Option<&str>,
    signature: Option<&str>,
) -> Option<String> {
    if file_path.is_empty() {
        return None;
    }

    let node_part: Option<String> = if let Some(sig) = signature {
        Some(sig.to_owned())
    } else {
        name.map(|n| match kind {
            Some(k) => format!("{k} {n}"),
            None => n.to_owned(),
        })
    };

    let mut header = String::from(file_path);
    for segment in container_path.iter().copied().chain(node_part.as_deref()) {
        header.push_str(" :: ");
        header.push_str(segment);
    }
    Some(header)
}
/// Converts the extracted `nodes` of `source` into chunks that together
/// cover the file.
///
/// For every node, in order:
/// * non-blank text between the previous node and this one becomes a "gap"
///   chunk whose metadata is the bare `file_path`;
/// * the node is widened upwards to include its doc comment;
/// * when `spec` and `ts_lang` are both given and member nodes are found
///   inside it, the node is emitted as a header chunk followed by one chunk
///   per member, with the container description appended to
///   `container_path`;
/// * otherwise the node becomes one chunk, or is sub-split when its text is
///   longer than `chunk_size` bytes.
///
/// Non-blank text after the last node is emitted as a final gap chunk.
/// Chunk line numbers are 1-based and inclusive.
#[allow(clippy::too_many_arguments)] // signature is fixed by existing callers
fn nodes_to_chunks(
    source: &str,
    nodes: &[AstNode],
    chunk_size: usize,
    doc_prefixes: &[&str],
    ts_lang: Option<&tree_sitter::Language>,
    spec: Option<&LanguageSpec>,
    lang: &str,
    file_path: &str,
    container_path: &[String],
) -> Vec<RawChunk> {
    let lines: Vec<&str> = source.lines().collect();
    nodes_to_chunks_in_range(
        &lines,
        0,
        lines.len(),
        nodes,
        chunk_size,
        doc_prefixes,
        ts_lang,
        spec,
        lang,
        file_path,
        container_path,
    )
}

/// Emits `lines[from..to]` as gap chunk(s) tagged with `file_path`.
///
/// Nothing is emitted for an empty or whitespace-only range. Text longer than
/// `chunk_size` bytes is split, and the resulting line numbers are shifted by
/// `from` so they stay file-relative.
fn push_gap_chunks(
    chunks: &mut Vec<RawChunk>,
    lines: &[&str],
    from: usize,
    to: usize,
    chunk_size: usize,
    file_path: &str,
) {
    if to <= from {
        return;
    }
    let gap_content = lines[from..to].join("\n");
    if gap_content.trim().is_empty() {
        return;
    }
    if gap_content.len() > chunk_size {
        for mut sc in codescout_embed::chunker::split(&gap_content, chunk_size, 0) {
            sc.start_line += from;
            sc.end_line += from;
            sc.metadata = Some(file_path.to_string());
            chunks.push(sc);
        }
    } else {
        chunks.push(RawChunk {
            content: gap_content,
            start_line: from + 1,
            end_line: to,
            metadata: Some(file_path.to_string()),
        });
    }
}

/// Worker behind `nodes_to_chunks`, restricted to `lines[range_start..range_end]`.
///
/// Gap chunks are only produced for text inside that range. The top-level
/// call passes the whole file; the recursive call for a container passes the
/// lines after the container's header up to the container's end, so text
/// outside the container is never emitted a second time.
#[allow(clippy::too_many_arguments)] // mirrors `nodes_to_chunks` plus the range
fn nodes_to_chunks_in_range(
    lines: &[&str],
    range_start: usize,
    range_end: usize,
    nodes: &[AstNode],
    chunk_size: usize,
    doc_prefixes: &[&str],
    ts_lang: Option<&tree_sitter::Language>,
    spec: Option<&LanguageSpec>,
    lang: &str,
    file_path: &str,
    container_path: &[String],
) -> Vec<RawChunk> {
    let range_end = range_end.min(lines.len());
    let container_refs: Vec<&str> = container_path.iter().map(|s| s.as_str()).collect();

    let mut chunks = Vec::new();
    // Zero-based index of the first line not yet covered by a chunk.
    let mut prev_end = range_start.min(range_end);

    for node in nodes {
        let expanded_start = expand_doc_comment_start(lines, node.start_line, doc_prefixes);
        push_gap_chunks(
            &mut chunks,
            lines,
            prev_end,
            expanded_start,
            chunk_size,
            file_path,
        );

        // Exclusive end, clamped in case the end row is past the last line.
        let node_end = (node.end_line + 1).min(lines.len());
        let kind_kw = kind_keyword_for_node(lang, &node.kind);
        let first_line = lines.get(node.start_line).copied().unwrap_or("");
        let sig_str = extract_signature(first_line);
        let sig_opt: Option<&str> = if sig_str.is_empty() {
            None
        } else {
            Some(sig_str.as_str())
        };

        // The inner pass needs both a spec that lists member kinds and a
        // grammar to re-parse with; recursive calls pass neither.
        let inner_nodes = spec
            .filter(|s| !s.inner_node_types.is_empty())
            .zip(ts_lang)
            .and_then(|(s, ts)| {
                try_extract_inner_nodes(lines, node, ts, s.inner_node_types, doc_prefixes)
            });

        if let Some(inner) = inner_nodes {
            let container_desc = if let Some(sig) = sig_opt {
                sig.to_string()
            } else if let (Some(k), Some(n)) = (kind_kw, node.name.as_deref()) {
                format!("{k} {n}")
            } else if let Some(n) = node.name.as_deref() {
                n.to_string()
            } else {
                node.kind.clone()
            };

            let mut header =
                extract_container_header(lines, expanded_start, node_end, doc_prefixes);
            // The header's 1-based inclusive end equals the 0-based index of
            // the first line after it: that is where member gaps may begin.
            let members_start = header.end_line;
            if !header.content.trim().is_empty() {
                header.metadata = build_metadata_header(
                    file_path,
                    &container_refs,
                    None,
                    Some(&container_desc),
                    None,
                );
                chunks.push(header);
            }

            let mut inner_container = container_path.to_vec();
            inner_container.push(container_desc);
            chunks.extend(nodes_to_chunks_in_range(
                lines,
                members_start,
                node_end,
                &inner,
                chunk_size,
                doc_prefixes,
                None,
                None,
                lang,
                file_path,
                &inner_container,
            ));
        } else {
            let metadata = build_metadata_header(
                file_path,
                &container_refs,
                kind_kw,
                node.name.as_deref(),
                sig_opt,
            );
            let content = lines[expanded_start..node_end].join("\n");
            if content.len() <= chunk_size {
                chunks.push(RawChunk {
                    content,
                    start_line: expanded_start + 1,
                    end_line: node_end,
                    metadata,
                });
            } else {
                let mut sub =
                    sub_split_node(lines, expanded_start, node_end, chunk_size, doc_prefixes);
                for c in &mut sub {
                    c.metadata = metadata.clone();
                }
                chunks.extend(sub);
            }
        }

        prev_end = node_end;
    }

    push_gap_chunks(
        &mut chunks,
        lines,
        prev_end,
        range_end,
        chunk_size,
        file_path,
    );
    chunks
}
/// Splits an oversized node occupying `lines[start..end]` into several
/// chunks that each repeat the node's prefix.
///
/// The prefix is the run of leading doc-comment lines plus up to three
/// signature lines (the scan stops after a line containing `{` or `=>`, or
/// ending with `:`). The remaining body is split with a size budget reduced
/// by the prefix and continuation marker; when the budget would not be
/// positive, half of `chunk_size` is used instead. The first chunk is
/// `prefix + body part`; later chunks insert a continuation marker line
/// between the two. When nothing follows the prefix the whole node is
/// returned as a single chunk. Metadata is left unset on every chunk.
fn sub_split_node(
    lines: &[&str],
    start: usize,
    end: usize,
    chunk_size: usize,
    doc_prefixes: &[&str],
) -> Vec<RawChunk> {
    const MAX_SIG_LINES: usize = 3;
    const CONTINUED_MARKER: &str = " // ... (continued)";

    let node_lines = &lines[start..end];

    let doc_len = node_lines
        .iter()
        .take_while(|&&line| is_doc_line(line, doc_prefixes))
        .count();
    let mut sig_end = doc_len;
    for line in node_lines[doc_len..].iter().take(MAX_SIG_LINES) {
        sig_end += 1;
        let text = line.trim();
        if text.contains('{') || text.ends_with(':') || text.contains("=>") {
            break;
        }
    }

    let (prefix_lines, body_lines) = node_lines.split_at(sig_end);
    if body_lines.is_empty() {
        return vec![RawChunk {
            content: node_lines.join("\n"),
            start_line: start + 1,
            end_line: end,
            metadata: None,
        }];
    }

    let prefix = prefix_lines.join("\n");
    // Prefix, marker, and the two newlines that join them to the body.
    let overhead = prefix.len() + CONTINUED_MARKER.len() + 2;
    let body_chunk_size = match chunk_size.checked_sub(overhead) {
        Some(budget) if budget > 0 => budget,
        _ => chunk_size / 2,
    };

    // Zero-based index of the first body line within the file.
    let body_offset = start + sig_end;
    let body_text = body_lines.join("\n");

    let mut out = Vec::new();
    for (idx, piece) in codescout_embed::chunker::split(&body_text, body_chunk_size, 0)
        .into_iter()
        .enumerate()
    {
        let is_first = idx == 0;

        let mut content = String::new();
        content.push_str(&prefix);
        content.push('\n');
        if !is_first {
            content.push_str(CONTINUED_MARKER);
            content.push('\n');
        }
        content.push_str(&piece.content);

        // The first chunk starts at the node itself (it carries the real
        // prefix lines); later chunks start at their body part.
        let start_line = if is_first {
            start + 1
        } else {
            body_offset + piece.start_line
        };
        out.push(RawChunk {
            content,
            start_line,
            end_line: body_offset + piece.end_line,
            metadata: None,
        });
    }
    out
}
/// Upper bound on the chunk size used by `split_file`: the caller's
/// `chunk_size` is capped at this value. It is compared against
/// `content.len()`, so the unit is bytes.
pub const AST_CHUNK_TARGET: usize = 3000;
/// Splits a source file into chunks.
///
/// * Empty `source` yields no chunks.
/// * Files with a markdown extension go through the markdown splitter.
/// * When a tree-sitter grammar is available for `lang` and the parse yields
///   at least one chunkable node, the AST-based splitter is used (with `//`
///   as the doc prefix when the language has no registered spec).
/// * Everything else falls back to the plain splitter.
///
/// The effective size limit is `chunk_size` capped at `AST_CHUNK_TARGET`, and
/// every result passes through `enforce_max_chunk_size` with that limit.
pub fn split_file(source: &str, lang: &str, path: &Path, chunk_size: usize) -> Vec<RawChunk> {
    if source.is_empty() {
        return Vec::new();
    }
    let target = chunk_size.min(AST_CHUNK_TARGET);

    if is_markdown(path) {
        let chunks = codescout_embed::chunker::split_markdown(source, target, 0);
        return enforce_max_chunk_size(chunks, target);
    }

    // `None` from this chain means "no usable AST": unknown grammar, failed
    // parse, or a parse without any chunkable node.
    let ast_chunks = crate::ast::get_ts_language(lang).and_then(|ts_lang| {
        let spec = get_language_spec(lang);
        let nodes = extract_ast_nodes(source, &ts_lang, spec).ok()?;
        if nodes.is_empty() {
            return None;
        }
        let doc_prefixes: &[&str] = match spec {
            Some(s) => s.doc_prefixes,
            None => &["//"],
        };
        let file_path_str = path.to_string_lossy();
        Some(nodes_to_chunks(
            source,
            &nodes,
            target,
            doc_prefixes,
            Some(&ts_lang),
            spec,
            lang,
            &file_path_str,
            &[],
        ))
    });

    let chunks =
        ast_chunks.unwrap_or_else(|| codescout_embed::chunker::split(source, target, 0));
    enforce_max_chunk_size(chunks, target)
}
/// Final safety pass: guarantees that no chunk's content is longer than
/// `target` bytes.
///
/// Chunks already within the limit pass through untouched. An oversized chunk
/// is re-split with the plain splitter; any piece that is still too long is
/// cut on UTF-8 character boundaries. Every resulting piece
/// * inherits the parent chunk's metadata, so the context header is not lost
///   by the re-split, and
/// * has its line numbers shifted to the parent's position and clamped to the
///   parent's span. The clamp matters for chunks whose content does not map
///   one-to-one onto file lines, such as sub-split chunks that repeat a
///   prefix.
///
/// Pieces produced by the character-boundary cut all share the line range of
/// the piece they were cut from.
fn enforce_max_chunk_size(chunks: Vec<RawChunk>, target: usize) -> Vec<RawChunk> {
    let mut out = Vec::with_capacity(chunks.len());
    for chunk in chunks {
        if chunk.content.len() <= target {
            out.push(chunk);
            continue;
        }

        let parent_offset = chunk.start_line.saturating_sub(1);
        // Never report a line beyond the parent's own last line.
        let line_ceiling = chunk.end_line.max(chunk.start_line);

        for sub in codescout_embed::chunker::split(&chunk.content, target, 0) {
            let start_line = (sub.start_line + parent_offset).min(line_ceiling);
            let end_line = (sub.end_line + parent_offset).min(line_ceiling);

            if sub.content.len() <= target {
                out.push(RawChunk {
                    content: sub.content,
                    start_line,
                    end_line,
                    metadata: chunk.metadata.clone(),
                });
                continue;
            }

            // A single line longer than `target`: cut it into byte-bounded
            // pieces without splitting a character.
            for piece in slice_on_char_boundary(&sub.content, target) {
                out.push(RawChunk {
                    content: piece,
                    start_line,
                    end_line,
                    metadata: chunk.metadata.clone(),
                });
            }
        }
    }
    out
}
/// Cuts `s` into consecutive pieces of at most `max_bytes` bytes without
/// splitting a UTF-8 character.
///
/// A character wider than `max_bytes` is emitted whole as its own oversized
/// piece, so the pieces always concatenate back to `s`. When `max_bytes` is
/// zero or `s` is empty, the result is a single piece equal to `s`.
fn slice_on_char_boundary(s: &str, max_bytes: usize) -> Vec<String> {
    if max_bytes == 0 || s.is_empty() {
        return vec![s.to_string()];
    }

    let mut pieces = Vec::new();
    let mut rest = s;
    while !rest.is_empty() {
        let limit = max_bytes.min(rest.len());
        // Prefer the largest boundary within the limit; if the first
        // character alone exceeds it, take the first boundary at or beyond
        // the limit.
        let cut = match (1..=limit).rev().find(|&idx| rest.is_char_boundary(idx)) {
            Some(idx) => idx,
            None => (limit..=rest.len())
                .find(|&idx| rest.is_char_boundary(idx))
                .unwrap_or(rest.len()),
        };
        let (head, tail) = rest.split_at(cut);
        pieces.push(head.to_string());
        rest = tail;
    }
    pieces
}
/// Reports whether `line` looks like a documentation-comment line.
///
/// A line qualifies when, ignoring leading whitespace, it
/// * starts with `*/` (end of a block comment), or
/// * starts with one of `doc_prefixes`.
///
/// Prefixes may carry leading whitespace (for example `" *"` for
/// block-comment continuation lines). Such a prefix matches either the raw
/// line verbatim or, with its own leading whitespace removed, the trimmed
/// line, so continuation lines are recognised at any indentation depth.
/// Blank lines are never doc lines.
pub fn is_doc_line(line: &str, doc_prefixes: &[&str]) -> bool {
    let trimmed = line.trim_start();
    if trimmed.is_empty() {
        return false;
    }
    if trimmed.starts_with("*/") {
        return true;
    }
    doc_prefixes.iter().any(|prefix| {
        if line.starts_with(*prefix) {
            return true;
        }
        // Compare indentation-insensitively. A whitespace-only prefix has
        // nothing left to compare and must not match every line.
        let bare = prefix.trim_start();
        !bare.is_empty() && trimmed.starts_with(bare)
    })
}
pub fn expand_doc_comment_start(
lines: &[&str],
node_start_line: usize,
doc_prefixes: &[&str],
) -> usize {
if node_start_line == 0 {
return 0;
}
let mut cursor = node_start_line;
let mut blank_count = 0;
while cursor > 0 && lines[cursor - 1].trim().is_empty() && blank_count < 2 {
cursor -= 1;
blank_count += 1;
}
if cursor == 0 && lines[0].trim().is_empty() {
return node_start_line;
}
if cursor == 0 || !is_doc_line(lines[cursor - 1], doc_prefixes) {
return node_start_line;
}
let mut doc_start = cursor - 1;
while doc_start > 0 && is_doc_line(lines[doc_start - 1], doc_prefixes) {
doc_start -= 1;
}
doc_start
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn registry_lookup_all_languages() {
let languages = [
"rust",
"python",
"go",
"typescript",
"javascript",
"tsx",
"jsx",
"java",
"kotlin",
"bash",
];
for lang in &languages {
let spec = get_language_spec(lang);
assert!(
spec.is_some(),
"expected LanguageSpec for '{}', got None",
lang
);
let spec = spec.unwrap();
assert!(
!spec.node_types.is_empty(),
"'{}' should have node_types",
lang
);
assert!(
!spec.doc_prefixes.is_empty(),
"'{}' should have doc_prefixes",
lang
);
}
}
#[test]
fn registry_lookup_case_insensitive() {
assert!(get_language_spec("Rust").is_some());
assert!(get_language_spec("PYTHON").is_some());
assert!(get_language_spec("TypeScript").is_some());
}
#[test]
fn registry_returns_none_for_unknown() {
assert!(get_language_spec("haskell").is_none());
assert!(get_language_spec("brainfuck").is_none());
assert!(get_language_spec("").is_none());
}
#[test]
fn split_file_empty_source() {
let chunks = split_file("", "rust", Path::new("main.rs"), 4000);
assert!(chunks.is_empty());
}
#[test]
fn split_file_markdown_delegates_to_markdown_splitter() {
let source = "# Heading\n\nIntro.\n\n## Section\n\nBody text.\n";
let chunks = split_file(source, "markdown", Path::new("README.md"), 4000);
assert!(!chunks.is_empty());
assert!(
chunks.len() >= 2,
"expected markdown heading split, got {} chunks",
chunks.len()
);
assert!(chunks[0].content.contains("Heading"));
assert!(chunks.iter().any(|c| c.content.contains("Section")));
}
#[test]
fn split_file_markdown_uppercase_extension() {
let source = "# Title\n\nText.\n\n## Part Two\n\nMore text.\n";
let chunks = split_file(source, "markdown", Path::new("NOTES.MD"), 4000);
assert!(chunks.len() >= 2, "should recognise .MD as markdown");
}
#[test]
fn split_file_unknown_lang_falls_through_to_plain_split() {
let source = "line 1\nline 2\nline 3\n";
let chunks = split_file(source, "unknown_lang", Path::new("file.xyz"), 4000);
assert!(!chunks.is_empty());
assert_eq!(chunks[0].start_line, 1);
}
#[test]
fn split_file_enforces_max_chunk_size_on_oversized_node() {
let mut body = String::from("pub fn huge() -> i32 {\n");
for i in 0..200 {
body.push_str(&format!(" let v{i} = {i};\n"));
}
body.push_str(" 0\n}\n");
let chunks = split_file(&body, "rust", Path::new("lib.rs"), 800);
assert!(!chunks.is_empty());
for c in &chunks {
assert!(
c.content.len() <= 800,
"chunk len {} exceeds target 800; chunks={}",
c.content.len(),
chunks.len()
);
}
assert!(
chunks.len() >= 2,
"expected post-pass split, got {} chunks",
chunks.len()
);
}
#[test]
fn split_file_post_pass_preserves_absolute_line_numbers() {
let mut src = String::new();
for i in 0..50 {
src.push_str(&format!("// preamble {i}\n"));
}
src.push_str("pub fn big() {\n");
for i in 0..200 {
src.push_str(&format!(" let v{i} = {i};\n"));
}
src.push_str("}\n");
let chunks = split_file(&src, "rust", Path::new("lib.rs"), 800);
assert!(chunks.len() >= 2);
let total_lines = src.lines().count();
for c in &chunks {
assert!(
c.start_line >= 1 && c.end_line <= total_lines,
"chunk lines {}-{} out of file range 1-{}",
c.start_line,
c.end_line,
total_lines
);
assert!(
c.start_line <= c.end_line,
"chunk start {} > end {}",
c.start_line,
c.end_line
);
}
}
#[test]
fn split_file_post_pass_applies_to_plain_text_path() {
let src = "x".repeat(5000) + "\n";
let chunks = split_file(&src, "unknown_xyz", Path::new("a.xyz"), 1000);
for c in &chunks {
assert!(
c.content.len() <= 1000,
"plain-text chunk len {} exceeds target",
c.content.len()
);
}
}
#[test]
fn enforce_max_chunk_size_is_noop_for_small_chunks() {
let small = vec![
RawChunk {
content: "fn a() {}".into(),
start_line: 1,
end_line: 1,
metadata: None,
},
RawChunk {
content: "fn b() {}".into(),
start_line: 3,
end_line: 3,
metadata: None,
},
];
let out = enforce_max_chunk_size(small.clone(), 1000);
assert_eq!(out.len(), 2);
assert_eq!(out[0].content, small[0].content);
assert_eq!(out[0].start_line, 1);
assert_eq!(out[1].start_line, 3);
}
#[test]
fn slice_on_char_boundary_respects_utf8() {
let s = "🦀🦀🦀🦀";
let pieces = slice_on_char_boundary(s, 5);
assert!(!pieces.is_empty());
let total: String = pieces.join("");
assert_eq!(total, s, "round-trip must equal original");
for p in &pieces {
assert!(!p.is_empty());
assert!(
p.len() <= 5 || p.chars().count() == 1,
"piece {p:?} ({} bytes) violates cap",
p.len()
);
}
}
#[test]
fn slice_on_char_boundary_ascii_round_trip() {
let s = "hello world this is a long string";
let pieces = slice_on_char_boundary(s, 7);
assert_eq!(pieces.join(""), s);
for p in &pieces {
assert!(p.len() <= 7);
}
}
#[test]
fn split_file_known_lang_uses_ast_split() {
let source = "fn main() {\n println!(\"hello\");\n}\n";
let chunks = split_file(source, "rust", Path::new("main.rs"), 4000);
assert_eq!(chunks.len(), 1);
assert!(chunks[0].content.contains("fn main"));
}
#[test]
fn expand_doc_comments_rust() {
let source =
"use std::io;\n\n/// Adds two numbers.\n/// Returns the sum.\nfn add(a: i32, b: i32) -> i32 {\n a + b\n}\n";
let lines: Vec<&str> = source.lines().collect();
let expanded = expand_doc_comment_start(&lines, 4, &["///", "//!"]);
assert_eq!(expanded, 2, "should expand to include both /// lines");
}
#[test]
fn expand_doc_comments_java_block() {
let source = "import foo;\n\n/**\n * Does something.\n */\npublic void doIt() {\n}\n";
let lines: Vec<&str> = source.lines().collect();
let expanded = expand_doc_comment_start(&lines, 5, &["/**", " *", " */"]);
assert_eq!(expanded, 2, "should expand to include /** block");
}
#[test]
fn expand_doc_comments_none() {
let source = "use std::io;\n\nfn add(a: i32, b: i32) -> i32 {\n a + b\n}\n";
let lines: Vec<&str> = source.lines().collect();
let expanded = expand_doc_comment_start(&lines, 2, &["///", "//!"]);
assert_eq!(expanded, 2, "no doc comment — should not expand");
}
#[test]
fn expand_skips_blank_lines_between_doc_and_node() {
let source = "/// Documented.\n\nfn foo() {}\n";
let lines: Vec<&str> = source.lines().collect();
let expanded = expand_doc_comment_start(&lines, 2, &["///"]);
assert_eq!(expanded, 0, "should cross blank line to find doc comment");
}
#[test]
fn expand_at_line_zero() {
let source = "fn foo() {}\n";
let lines: Vec<&str> = source.lines().collect();
let expanded = expand_doc_comment_start(&lines, 0, &["///"]);
assert_eq!(expanded, 0, "already at start — no expansion possible");
}
#[test]
fn is_doc_line_matches_prefix() {
assert!(is_doc_line("/// hello", &["///"]));
assert!(is_doc_line(" /// indented", &["///"]));
assert!(is_doc_line(" * middle of block", &[" *"]));
assert!(is_doc_line(" */", &[" *"]));
assert!(is_doc_line("*/", &[]), "closing */ always matches");
}
#[test]
fn is_doc_line_rejects_non_doc() {
assert!(!is_doc_line("fn foo() {}", &["///"]));
assert!(!is_doc_line("// regular comment", &["///"]));
assert!(!is_doc_line("", &["///"]));
}
#[test]
fn ast_split_rust_two_functions() {
let source = "use std::io;\n\n/// Adds two numbers.\nfn add(a: i32, b: i32) -> i32 {\n a + b\n}\n\n/// Subtracts b from a.\nfn sub(a: i32, b: i32) -> i32 {\n a - b\n}\n";
let chunks = split_file(source, "rust", Path::new("test.rs"), 4000);
assert!(chunks.len() >= 2, "got {} chunks", chunks.len());
let add_chunk = chunks
.iter()
.find(|c| c.content.contains("fn add"))
.expect("add chunk");
assert!(
add_chunk.content.contains("/// Adds two numbers"),
"add chunk should include doc"
);
let sub_chunk = chunks
.iter()
.find(|c| c.content.contains("fn sub"))
.expect("sub chunk");
assert!(
sub_chunk.content.contains("/// Subtracts"),
"sub chunk should include doc"
);
assert!(
!add_chunk.content.contains("fn sub"),
"add chunk should not contain sub"
);
}
#[test]
fn ast_split_python_function_with_comment() {
let source = "import os\n\n# Helper to greet.\ndef greet(name):\n return f'Hello {name}'\n\nclass Greeter:\n def __init__(self, name):\n self.name = name\n";
let chunks = split_file(source, "python", Path::new("test.py"), 4000);
assert!(
chunks.len() >= 2,
"should split into function + class, got {}",
chunks.len()
);
let greet_chunk = chunks
.iter()
.find(|c| c.content.contains("def greet"))
.expect("greet chunk");
assert!(
greet_chunk.content.contains("# Helper"),
"greet should include doc comment"
);
}
#[test]
fn ast_split_preserves_line_numbers() {
let source = "/// First.\nfn first() {}\n\n/// Second.\nfn second() {}\n";
let chunks = split_file(source, "rust", Path::new("test.rs"), 4000);
let first = chunks
.iter()
.find(|c| c.content.contains("fn first"))
.unwrap();
assert_eq!(
first.start_line, 1,
"first fn starts at line 1 (includes doc)"
);
let second = chunks
.iter()
.find(|c| c.content.contains("fn second"))
.unwrap();
assert_eq!(
second.start_line, 4,
"second fn starts at line 4 (includes doc)"
);
}
#[test]
fn ast_split_captures_gap_text() {
let source = "use std::io;\nuse std::fmt;\n\nfn foo() {}\n";
let chunks = split_file(source, "rust", Path::new("test.rs"), 4000);
let has_use = chunks.iter().any(|c| c.content.contains("use std::io"));
let has_fn = chunks.iter().any(|c| c.content.contains("fn foo"));
assert!(has_use, "should capture use statements as gap chunk");
assert!(has_fn, "should capture function");
}
#[test]
fn oversized_node_is_sub_split_with_prefix() {
let mut source = String::from("/// Important function.\nfn big() {\n");
for i in 0..50 {
source.push_str(&format!(" let x{} = {};\n", i, i));
}
source.push_str("}\n");
let chunks = split_file(&source, "rust", Path::new("test.rs"), 300);
assert!(
chunks.len() > 1,
"oversized fn should be sub-split, got {}",
chunks.len()
);
for (i, chunk) in chunks.iter().enumerate() {
assert!(
chunk.content.contains("/// Important function"),
"sub-chunk {} missing doc prefix",
i
);
assert!(
chunk.content.contains("fn big()"),
"sub-chunk {} missing signature prefix",
i
);
}
assert!(!chunks[0].content.contains("(continued)"));
if chunks.len() > 1 {
assert!(chunks[1].content.contains("(continued)"));
}
}
#[test]
fn sub_split_covers_all_body_lines() {
let mut source = String::from("fn big() {\n");
let body_lines: Vec<String> = (0..40)
.map(|i| format!(" let x{} = {};", i, i))
.collect();
source.push_str(&body_lines.join("\n"));
source.push_str("\n}\n");
let chunks = split_file(&source, "rust", Path::new("test.rs"), 200);
for (i, body_line) in body_lines.iter().enumerate() {
let covered = chunks
.iter()
.any(|c| c.content.contains(body_line.as_str()));
assert!(covered, "body line {} not covered: {}", i, body_line);
}
}
#[test]
fn generic_heuristic_extracts_multiline_named_nodes() {
let source =
"fn hello() {\n println!(\"hi\");\n}\n\nfn world() {\n println!(\"world\");\n}\n";
let ts_lang: tree_sitter::Language = tree_sitter_rust::LANGUAGE.into();
let nodes = extract_ast_nodes(source, &ts_lang, None).unwrap();
assert_eq!(nodes.len(), 2, "generic heuristic should find 2 functions");
}
#[test]
fn generic_heuristic_ignores_single_line_nodes() {
let source =
"use std::io;\nuse std::fmt;\n\nfn multi_line() {\n let x = 1;\n let y = 2;\n}\n";
let ts_lang: tree_sitter::Language = tree_sitter_rust::LANGUAGE.into();
let nodes = extract_ast_nodes(source, &ts_lang, None).unwrap();
assert_eq!(nodes.len(), 1, "should only find multi_line fn");
}
#[test]
fn broken_syntax_falls_back_to_line_splitting() {
let source = "fn broken( { {{ missing close\n let x = 1;\n let y = 2;\n";
let chunks = split_file(source, "rust", Path::new("test.rs"), 200);
assert!(
!chunks.is_empty(),
"broken syntax should still produce chunks via fallback"
);
}
#[test]
fn ast_with_no_extractable_nodes_falls_back() {
let source = "// just a comment\n// another comment\n\n";
let chunks = split_file(source, "rust", Path::new("test.rs"), 200);
assert!(
!chunks.is_empty(),
"should fall back to line-based for comment-only files"
);
}
#[test]
fn ast_split_go_function_with_doc() {
let source = "package main\n\nimport \"fmt\"\n\n// Greet prints a greeting.\nfunc Greet(name string) {\n\tfmt.Println(\"Hello\", name)\n}\n\n// Add returns the sum.\nfunc Add(a, b int) int {\n\treturn a + b\n}\n";
let chunks = split_file(source, "go", Path::new("main.go"), 4000);
assert!(
chunks.len() >= 2,
"Go should split into 2+ chunks, got {}",
chunks.len()
);
let greet = chunks
.iter()
.find(|c| c.content.contains("func Greet"))
.expect("Greet chunk");
assert!(
greet.content.contains("// Greet prints"),
"Go func should include doc comment"
);
let add = chunks
.iter()
.find(|c| c.content.contains("func Add"))
.expect("Add chunk");
assert!(
!greet.content.contains("func Add"),
"Greet chunk should not contain Add"
);
assert!(
add.content.contains("// Add returns"),
"Add should include doc comment"
);
}
#[test]
fn ast_split_typescript_with_jsdoc() {
let source = "import { foo } from 'bar';\n\n/**\n * Adds two numbers.\n * @param a first\n * @param b second\n */\nfunction add(a: number, b: number): number {\n return a + b;\n}\n\nfunction sub(a: number, b: number): number {\n return a - b;\n}\n";
let chunks = split_file(source, "typescript", Path::new("math.ts"), 4000);
assert!(chunks.len() >= 2, "TS should split, got {}", chunks.len());
let add_chunk = chunks
.iter()
.find(|c| c.content.contains("function add"))
.expect("add chunk");
assert!(
add_chunk.content.contains("Adds two numbers"),
"TS func should include JSDoc"
);
}
#[test]
fn ast_split_trailing_gap_captured() {
let source = "fn foo() {\n 1\n}\n\n// trailing comment\nconst X: i32 = 42;\n";
let chunks = split_file(source, "rust", Path::new("test.rs"), 4000);
let has_trailing = chunks
.iter()
.any(|c| c.content.contains("trailing comment"));
assert!(has_trailing, "trailing gap text should be captured");
}
/// A `///` line separated from the `fn` by four blank lines is not treated as
/// that function's doc: the start index stays at the `fn` line (index 5).
#[test]
fn expand_doc_does_not_bridge_many_blank_lines() {
    let text = "/// Orphaned doc.\n\n\n\n\nfn foo() {}\n";
    let text_lines = text.lines().collect::<Vec<&str>>();
    let doc_prefixes = ["///"];
    let start = expand_doc_comment_start(&text_lines, 5, &doc_prefixes);
    assert_eq!(start, 5, "should not bridge 4 blank lines");
}
/// With a size argument of 200, an `impl` holding three documented methods is
/// expected to come back as a header chunk plus one chunk per method; each
/// method chunk keeps its own `///` doc, excludes its neighbours, and reports
/// a file-level start line.
#[test]
fn recursive_impl_block_extracts_methods() {
    // Assemble the impl: three methods, six `let` lines in each body.
    let mut source = String::from("struct Foo;\n\nimpl Foo {\n");
    source.push_str(" /// First method.\n fn method1(&self) -> i32 {\n");
    source.extend((0..6).map(|i| format!(" let _x{} = {};\n", i, i)));
    source.push_str(" }\n");
    source.push_str(" /// Second method.\n fn method2(&self) -> i32 {\n");
    source.extend((0..6).map(|i| format!(" let _y{} = {};\n", i, i)));
    source.push_str(" }\n");
    source.push_str(" /// Third method.\n fn method3(&self) -> i32 {\n");
    source.extend((0..6).map(|i| format!(" let _z{} = {};\n", i, i)));
    source.push_str(" }\n}\n");

    let chunks = split_file(&source, "rust", Path::new("test.rs"), 200);
    // First chunk whose text contains `needle`, if any.
    let chunk_with = |needle: &str| chunks.iter().find(|c| c.content.contains(needle));

    let header = chunk_with("impl Foo").expect("should have an impl Foo header chunk");
    assert!(
        !header.content.contains("fn method1"),
        "header chunk should not include method bodies"
    );

    let m1 = chunk_with("fn method1").expect("method1 chunk");
    let m2 = chunk_with("fn method2").expect("method2 chunk");
    let m3 = chunk_with("fn method3").expect("method3 chunk");

    // Each method chunk must not spill into a neighbouring method.
    for (chunk, foreign, why) in [
        (m1, "fn method2", "m1 should not contain m2"),
        (m2, "fn method3", "m2 should not contain m3"),
        (m3, "fn method1", "m3 should not contain m1"),
    ] {
        assert!(!chunk.content.contains(foreign), "{}", why);
    }

    // Each method chunk must carry its own doc comment.
    for (chunk, doc, why) in [
        (m1, "/// First method", "m1 should include its doc"),
        (m2, "/// Second method", "m2 should include its doc"),
        (m3, "/// Third method", "m3 should include its doc"),
    ] {
        assert!(chunk.content.contains(doc), "{}", why);
    }

    assert!(
        m1.start_line > 3,
        "method1 start_line {} should be > 3 (file-level, not impl-relative)",
        m1.start_line
    );
}
/// Methods of a Python class are expected to be extracted as separate chunks,
/// each carrying the `#` comment that precedes it.
#[test]
fn recursive_class_extracts_methods_python() {
    // Build a class with two commented methods, eight assignment lines each.
    let mut source = String::from("import os\n\nclass MyService:\n");
    source.push_str(" # Process items.\n def process(self, items):\n");
    for i in 0..8 {
        source.push_str(&format!(" item_{} = items[{}]\n", i, i));
    }
    source.push_str(" return items\n");
    source.push_str(" # Validate input.\n def validate(self, data):\n");
    for i in 0..8 {
        source.push_str(&format!(" val_{} = data.get('{}')\n", i, i));
    }
    source.push_str(" return True\n");

    let chunks = split_file(&source, "python", Path::new("service.py"), 200);
    // `expect` replaces the former `assert!(x.is_some())` + `unwrap()` pair,
    // matching how the sibling tests look up chunks.
    let process = chunks
        .iter()
        .find(|c| c.content.contains("def process"))
        .expect("should have a process chunk");
    let validate = chunks
        .iter()
        .find(|c| c.content.contains("def validate"))
        .expect("should have a validate chunk");

    assert!(
        !process.content.contains("def validate"),
        "process chunk should not contain validate"
    );
    assert!(
        process.content.contains("# Process items"),
        "process chunk should include its comment"
    );
    assert!(
        validate.content.contains("# Validate input"),
        "validate chunk should include its comment"
    );
}
/// A 60-statement Go function split with a size argument of 300 must produce
/// several chunks, and more than one of them must contain the
/// `func BigFunc` text (the signature is repeated on sub-chunks).
#[test]
fn recursive_falls_back_when_no_inner_types() {
    let mut source = String::from("package main\n\n// BigFunc does a lot.\nfunc BigFunc() {\n");
    source.extend((0..60).map(|i| format!("\tx{} := {}\n", i, i)));
    source.push_str("}\n");

    let chunks = split_file(&source, "go", Path::new("big.go"), 300);
    let total = chunks.len();
    assert!(
        total > 1,
        "large Go func should be sub-split, got {} chunks",
        total
    );

    let with_signature = chunks
        .iter()
        .filter(|c| c.content.contains("func BigFunc"))
        .count();
    assert!(
        with_signature > 1,
        "large Go func should produce multiple sub-chunks with signature prefix, got {}",
        with_signature
    );
}
/// When methods are pulled out of an `impl`, every line of a multi-line `///`
/// doc comment must stay attached to its method's chunk.
#[test]
fn recursive_impl_inner_doc_comments_included() {
    // Two methods with twelve filler statements each; `add` has a two-line doc.
    let mut source = String::from("impl Calculator {\n");
    source.push_str(" /// Adds a and b.\n /// Returns the sum.\n");
    source.push_str(" fn add(&self, a: i32, b: i32) -> i32 {\n");
    source.extend((0..12).map(|i| format!(" let _step{} = {};\n", i, i)));
    source.push_str(" a + b\n }\n");
    source.push_str(" /// Subtracts b from a.\n");
    source.push_str(" fn sub(&self, a: i32, b: i32) -> i32 {\n");
    source.extend((0..12).map(|i| format!(" let _step{} = {};\n", i, i)));
    source.push_str(" a - b\n }\n}\n");

    let chunks = split_file(&source, "rust", Path::new("calc.rs"), 250);
    // First chunk whose text contains `needle`, if any.
    let chunk_with = |needle: &str| chunks.iter().find(|c| c.content.contains(needle));

    let add_chunk = chunk_with("fn add").expect("add chunk");
    assert!(
        add_chunk.content.contains("/// Adds a and b"),
        "add chunk should include first doc line"
    );
    assert!(
        add_chunk.content.contains("/// Returns the sum"),
        "add chunk should include second doc line"
    );

    let sub_chunk = chunk_with("fn sub").expect("sub chunk");
    assert!(
        sub_chunk.content.contains("/// Subtracts b from a"),
        "sub chunk should include its doc"
    );
}
/// The leading `use` lines must occur in exactly one chunk, i.e. no chunk
/// repeats them as overlap.
#[test]
fn chunk_overlap_removed_from_ast_paths() {
    let source = concat!(
        "use std::io;\n",
        "use std::fmt;\n",
        "\n",
        "fn foo() {\n let x = 1;\n}\n",
        "\n",
        "fn bar() {\n let y = 2;\n}\n",
    );
    let chunks = split_file(source, "rust", Path::new("test.rs"), 4000);
    let chunks_with_use = chunks
        .iter()
        .filter(|c| c.content.contains("use std::io"))
        .count();
    assert_eq!(
        chunks_with_use,
        1,
        "use statements should appear in exactly one chunk, not duplicated by overlap"
    );
}
/// A bash script with two functions must split into at least two chunks, and
/// the text of both functions must be present somewhere in the output.
#[test]
fn ast_split_bash_two_functions() {
    let source = "foo() {\n echo foo\n}\n\nbar() {\n echo bar\n}\n";
    let chunks = split_file(source, "bash", Path::new("script.sh"), 4000);
    assert!(
        chunks.len() >= 2,
        "expected at least 2 chunks for a 2-function bash script, got {}",
        chunks.len()
    );
    // Failure messages name the missing function so a regression is easy to
    // read, in line with the other tests in this module.
    assert!(
        chunks.iter().any(|c| c.content.contains("foo")),
        "no chunk contains the text of function foo"
    );
    assert!(
        chunks.iter().any(|c| c.content.contains("bar")),
        "no chunk contains the text of function bar"
    );
}
/// The trailing ` {` that opens a Rust function body is stripped from the
/// signature.
#[test]
fn extract_signature_rust_fn() {
    let declaration = "pub fn foo(x: i32) -> Result<String> {";
    let signature = extract_signature(declaration);
    assert_eq!(signature, "pub fn foo(x: i32) -> Result<String>");
}
/// The trailing `:` of a Python `def` line is stripped from the signature.
#[test]
fn extract_signature_python_def() {
    let declaration = "def bar(self, token: str) -> bool:";
    let signature = extract_signature(declaration);
    assert_eq!(signature, "def bar(self, token: str) -> bool");
}
/// For an arrow function the signature stops before the ` => {` part.
#[test]
fn extract_signature_arrow_fn() {
    let declaration = "const foo = (x) => {";
    let signature = extract_signature(declaration);
    assert_eq!(signature, "const foo = (x)");
}
/// A very long declaration must yield a signature of at most 100 characters.
#[test]
fn extract_signature_truncates_at_100_chars() {
    let oversized = "pub fn a_very_long_name_with_lots_of_generic_parameters<T: Clone + Send + Sync + Debug + Display + PartialEq>(x: T) -> Result<T> {";
    let s = extract_signature(oversized);
    // Count characters, not bytes.
    let char_count = s.chars().count();
    assert!(
        char_count <= 100,
        "expected <=100 chars, got {}: {s}",
        char_count
    );
}
/// A declaration with no block opener is returned unchanged.
#[test]
fn extract_signature_no_block_start() {
    let declaration = "pub const X: i32 = 5;";
    let signature = extract_signature(declaration);
    assert_eq!(signature, "pub const X: i32 = 5;");
}
/// Rust `function_item` nodes map to the keyword `fn`.
#[test]
fn kind_keyword_rust_function() {
    let keyword = kind_keyword_for_node("rust", "function_item");
    assert_eq!(keyword, Some("fn"));
}
/// Rust `struct_item` nodes map to the keyword `struct`.
#[test]
fn kind_keyword_rust_struct() {
    let keyword = kind_keyword_for_node("rust", "struct_item");
    assert_eq!(keyword, Some("struct"));
}
/// Rust `impl_item` nodes map to the keyword `impl`.
#[test]
fn kind_keyword_rust_impl() {
    let keyword = kind_keyword_for_node("rust", "impl_item");
    assert_eq!(keyword, Some("impl"));
}
/// Python `class_definition` nodes map to the keyword `class`.
#[test]
fn kind_keyword_python_class() {
    let keyword = kind_keyword_for_node("python", "class_definition");
    assert_eq!(keyword, Some("class"));
}
/// Python `async_function_definition` nodes map to the two-word keyword
/// `async def`.
#[test]
fn kind_keyword_python_async() {
    let keyword = kind_keyword_for_node("python", "async_function_definition");
    assert_eq!(keyword, Some("async def"));
}
/// TypeScript `method_definition` nodes map to the keyword `method`.
#[test]
fn kind_keyword_typescript_method() {
    let keyword = kind_keyword_for_node("typescript", "method_definition");
    assert_eq!(keyword, Some("method"));
}
/// An unrecognised node type, or an unrecognised language, yields `None`.
#[test]
fn kind_keyword_unknown_returns_none() {
    let unknown_pairs = [("rust", "weird_node"), ("klingon", "function_item")];
    for (language, node_kind) in unknown_pairs {
        assert_eq!(kind_keyword_for_node(language, node_kind), None);
    }
}
/// With no containers, the header is the path and the signature joined by
/// ` :: `.
#[test]
fn metadata_header_top_level_rust_fn() {
    let header = build_metadata_header(
        "src/foo.rs",
        &[],
        Some("fn"),
        Some("foo"),
        Some("fn foo(x: i32)"),
    );
    let expected = "src/foo.rs :: fn foo(x: i32)";
    assert_eq!(header.as_deref(), Some(expected));
}
/// A method inside an `impl` gets the container inserted between the path and
/// the signature.
#[test]
fn metadata_header_rust_method_in_impl() {
    let containers = ["impl IndexStore"];
    let header = build_metadata_header(
        "src/embed/index.rs",
        &containers,
        Some("fn"),
        Some("build_index"),
        Some("fn build_index(force: bool)"),
    );
    let expected = "src/embed/index.rs :: impl IndexStore :: fn build_index(force: bool)";
    assert_eq!(header.as_deref(), Some(expected));
}
/// Without a signature, the header falls back to `<kind> <name>`.
#[test]
fn metadata_header_struct_no_signature() {
    let header = build_metadata_header("src/foo.rs", &[], Some("struct"), Some("Bar"), None);
    let expected = "src/foo.rs :: struct Bar";
    assert_eq!(header.as_deref(), Some(expected));
}
/// With no containers, kind, name or signature, the header is just the path.
#[test]
fn metadata_header_gap_file_only() {
    let header = build_metadata_header("src/foo.rs", &[], None, None, None);
    let expected = "src/foo.rs";
    assert_eq!(header.as_deref(), Some(expected));
}
/// With only a container supplied, the header is the path followed by that
/// container.
#[test]
fn metadata_header_container_only() {
    let containers = ["impl Bar"];
    let header = build_metadata_header("src/foo.rs", &containers, None, None, None);
    let expected = "src/foo.rs :: impl Bar";
    assert_eq!(header.as_deref(), Some(expected));
}
/// When kind and name are given but the signature is absent, the header ends
/// with `<kind> <name>`.
#[test]
fn metadata_header_kind_without_signature_uses_name() {
    let header = build_metadata_header("src/foo.rs", &[], Some("fn"), Some("bar"), None);
    let expected = "src/foo.rs :: fn bar";
    assert_eq!(header.as_deref(), Some(expected));
}
/// A bare name with neither kind nor signature is appended to the path as-is.
#[test]
fn metadata_header_name_only_no_kind_no_sig() {
    let header = build_metadata_header("src/foo.rs", &[], None, Some("orphan_name"), None);
    let expected = "src/foo.rs :: orphan_name";
    assert_eq!(header.as_deref(), Some(expected));
}
/// Multiple containers are emitted in the order given, each separated by
/// ` :: `, ahead of the signature.
#[test]
fn metadata_header_nested_container() {
    let containers = ["mod inner", "impl Foo"];
    let header = build_metadata_header(
        "src/x.rs",
        &containers,
        Some("fn"),
        Some("baz"),
        Some("fn baz()"),
    );
    let expected = "src/x.rs :: mod inner :: impl Foo :: fn baz()";
    assert_eq!(header.as_deref(), Some(expected));
}
/// Chunks from `split_file` on Rust source carry a metadata header: a
/// top-level function's header mentions the file path, `fn` and the function
/// name, and a method's header additionally names its enclosing `impl`.
#[test]
fn split_file_rust_populates_metadata_headers() {
    // `Path` is already in scope for this module (sibling tests call
    // `Path::new` directly), so no function-local import is needed.
    let src = r#"
pub fn top_level() {
println!("hi");
}
pub struct MyStore;
impl MyStore {
pub fn build(&self) {
// body
}
}
"#;
    let chunks = split_file(src, "rust", Path::new("src/mystore.rs"), 4000);

    // Top-level function: path, kind and name must all be present.
    let top = chunks
        .iter()
        .find(|c| c.content.contains("top_level"))
        .expect("top_level chunk");
    let meta = top.metadata.as_deref().expect("top_level has metadata");
    assert!(meta.contains("src/mystore.rs"), "meta missing path: {meta}");
    assert!(meta.contains("fn"), "meta missing kind fn: {meta}");
    assert!(meta.contains("top_level"), "meta missing name: {meta}");

    // Method: the enclosing impl must appear as a container.
    let build = chunks
        .iter()
        .find(|c| c.content.contains("fn build"))
        .expect("build chunk");
    let bmeta = build.metadata.as_deref().expect("build has metadata");
    assert!(
        bmeta.contains("impl MyStore"),
        "build metadata missing impl container: {bmeta}"
    );
    assert!(
        bmeta.contains("fn") && bmeta.contains("build"),
        "build metadata incomplete: {bmeta}"
    );
}
/// The metadata header of a documented top-level function names the function
/// but contains none of its `///` doc comment text.
#[test]
fn split_file_signature_skips_doc_comments() {
    // `Path` is already in scope for this module (sibling tests call
    // `Path::new` directly), so no function-local import is needed.
    let src = r#"
/// Compute the answer.
/// A second line of documentation.
pub fn compute(x: i32) -> i32 {
x + 1
}
"#;
    let chunks = split_file(src, "rust", Path::new("src/math.rs"), 4000);
    let chunk = chunks
        .iter()
        .find(|c| c.content.contains("pub fn compute"))
        .expect("compute chunk");
    let meta = chunk.metadata.as_deref().expect("metadata present");
    assert!(
        meta.contains("compute"),
        "signature missing symbol name: {meta}"
    );
    // Neither the comment marker nor the comment body may leak in.
    assert!(
        !meta.contains("///"),
        "doc comment leaked into signature: {meta}"
    );
    assert!(
        !meta.contains("Compute the answer"),
        "doc body leaked into signature: {meta}"
    );
}
/// The metadata header of a documented method inside an `impl` names the
/// method but contains none of its `///` doc comment text.
#[test]
fn inner_method_signature_skips_doc_comments() {
    // `Path` is already in scope for this module (sibling tests call
    // `Path::new` directly), so no function-local import is needed.
    let src = r#"
pub struct Foo;
impl Foo {
/// Compute the answer.
/// A second line of documentation.
pub fn compute(&self, x: i32) -> i32 {
x + 1
}
}
"#;
    let chunks = split_file(src, "rust", Path::new("src/foo.rs"), 4000);
    let chunk = chunks
        .iter()
        .find(|c| c.content.contains("pub fn compute"))
        .expect("compute chunk");
    let meta = chunk.metadata.as_deref().expect("metadata present");
    assert!(
        meta.contains("compute"),
        "signature missing symbol name: {meta}"
    );
    // Neither the comment marker nor the comment body may leak in.
    assert!(
        !meta.contains("///"),
        "doc comment leaked into inner-method signature: {meta}"
    );
    assert!(
        !meta.contains("Compute the answer"),
        "doc body leaked into inner-method signature: {meta}"
    );
}
}