use std::collections::HashSet;
use tree_sitter::Node;
use super::classify::{
classify_node, name_of, php_enclosing_class_name, rust_impl_type_name,
scala_enclosing_class_name,
};
use super::inherits::collect_inherits;
use super::{ChunkType, RawChunk};
/// Largest line span a chunk may cover before `split_oversized` breaks it
/// into overlapping sub-chunks.
const MAX_CHUNK_LINES: usize = 200;
/// Maximum number of content lines placed in a single sub-chunk window.
const SUB_CHUNK_WINDOW: usize = 100;
/// Number of lines the window start advances between consecutive sub-chunks.
/// Being smaller than `SUB_CHUNK_WINDOW`, it makes neighbouring windows overlap.
const SUB_CHUNK_STRIDE: usize = 50;
/// Converts a byte offset into a 1-based line number.
///
/// `line_offsets` is binary-searched, so it is expected to hold the starting
/// byte offset of every line in ascending order (the shape produced by
/// `build_line_offsets`).
///
/// * An exact hit on a line start at index `i` yields line `i + 1`.
/// * Otherwise the insertion point equals the 1-based number of the line that
///   contains `byte`; it is clamped to at least 1 so an empty table (or a byte
///   before the first recorded offset) still reports line 1.
pub(super) fn line_for_byte(line_offsets: &[usize], byte: usize) -> usize {
    line_offsets
        .binary_search(&byte)
        .map_or_else(|insert_at| insert_at.max(1), |exact| exact + 1)
}
/// Builds the table of line-start byte offsets for `src`.
///
/// The first entry is always `0` (start of the first line); every `\n` byte
/// at index `i` contributes `i + 1`, the offset of the line that follows it.
/// The result is therefore strictly increasing and never empty.
pub(super) fn build_line_offsets(src: &[u8]) -> Vec<usize> {
    let starts_after_newlines = src
        .iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == b'\n')
        .map(|(idx, _)| idx + 1);
    std::iter::once(0usize).chain(starts_after_newlines).collect()
}
/// Builds the identifier string for a chunk.
///
/// * Named chunks: `"{file}::{chunk_type}::{name}::{start_line}"`, where the
///   chunk type is rendered through `ChunkType::as_str`.
/// * Anonymous chunks (empty `name`): `"{file}:{start_line}:{end_line}"`.
///
/// `end_line` only participates in the anonymous form.
pub(super) fn make_chunk_id(
    file: &str,
    chunk_type: &ChunkType,
    name: &str,
    start_line: usize,
    end_line: usize,
) -> String {
    if !name.is_empty() {
        let kind = chunk_type.as_str();
        return format!("{file}::{kind}::{name}::{start_line}");
    }
    format!("{file}:{start_line}:{end_line}")
}
/// Collects the simple names of everything called inside `node`.
///
/// The subtree rooted at `node` is traversed with an explicit stack. For each
/// node whose kind is a call expression in `lang`, the callee is taken from
/// the `function` field, else the `name` field, else the first child; its
/// source text is reduced to the part after the last `.` or `:` and trimmed.
/// Empty results (including callee text that is not valid UTF-8) are dropped.
///
/// Function-like definitions nested inside `node` are not descended into, so
/// their calls are not attributed to the enclosing chunk. `node` itself is
/// always traversed, even when it is a function-like definition.
///
/// Returns the distinct names in ascending order.
pub(super) fn collect_calls(node: Node<'_>, src: &[u8], lang: &str) -> Vec<String> {
    let root_id = node.id();
    let mut seen: HashSet<String> = HashSet::new();
    let mut pending: Vec<Node> = vec![node];

    while let Some(current) = pending.pop() {
        let kind = current.kind();

        let defines_function = matches!(
            (lang, kind),
            ("rust", "function_item")
                | ("python", "function_definition")
                | ("javascript", "function_declaration")
                | ("typescript", "function_declaration")
                | ("go", "function_declaration")
                | ("java", "method_declaration")
                | ("c" | "cpp", "function_definition")
                | ("ruby", "method")
                | ("ruby", "singleton_method")
                | ("php", "function_definition")
                | ("php", "method_declaration")
                | ("scala", "function_definition")
                | ("csharp", "method_declaration")
                | ("csharp", "constructor_declaration")
                | ("kotlin", "function_declaration")
                | ("kotlin", "secondary_constructor")
                | ("swift", "function_declaration")
                | ("swift", "init_declaration")
                | ("swift", "protocol_function_declaration")
        );
        // A nested definition owns its own calls; only the root may be a function.
        if defines_function && current.id() != root_id {
            continue;
        }

        let invokes = matches!(
            (lang, kind),
            ("rust", "call_expression")
                | ("python", "call")
                | ("javascript" | "typescript", "call_expression")
                | ("go", "call_expression")
                | ("java", "method_invocation")
                | ("c" | "cpp", "call_expression")
                | ("ruby", "call")
                | ("php", "function_call_expression")
                | ("php", "member_call_expression")
                | ("php", "scoped_call_expression")
                | ("php", "nullsafe_member_call_expression")
                | ("scala", "call_expression")
                | ("csharp", "invocation_expression")
                | ("kotlin", "call_expression")
                | ("swift", "call_expression")
        );
        if invokes {
            let target = current
                .child_by_field_name("function")
                .or_else(|| current.child_by_field_name("name"))
                .or_else(|| current.child(0));
            if let Some(target) = target {
                let text =
                    std::str::from_utf8(&src[target.start_byte()..target.end_byte()]).unwrap_or("");
                // Keep only the last path/member segment, e.g. `a.b::c` -> `c`.
                let last_segment = text.rsplit(['.', ':']).next().unwrap_or(text).trim();
                if !last_segment.is_empty() {
                    seen.insert(last_segment.to_string());
                }
            }
        }

        let mut cursor = current.walk();
        pending.extend(current.children(&mut cursor));
    }

    let mut calls: Vec<String> = seen.into_iter().collect();
    calls.sort();
    calls
}
/// Gathers the doc comments that sit directly before `node`.
///
/// Walks backwards over the previous siblings for as long as they are of kind
/// `line_comment` or `block_comment`, stopping at the first sibling of any
/// other kind. Of those comments only the ones whose text starts with `///`,
/// `//!` or `/**` are kept; other comments are skipped without ending the walk.
///
/// The kept comments are returned in source order, each one preceded by a
/// `'\n'`. The result is empty when nothing qualifies.
pub(super) fn preceding_doc_comments(node: Node<'_>, src: &[u8]) -> String {
    const DOC_PREFIXES: [&str; 3] = ["///", "//!", "/**"];

    // Collected nearest-first; reversed below to restore source order.
    let mut found: Vec<&str> = Vec::new();
    let mut sibling = node.prev_sibling();
    while let Some(comment) = sibling {
        if !matches!(comment.kind(), "line_comment" | "block_comment") {
            break;
        }
        let text = std::str::from_utf8(&src[comment.start_byte()..comment.end_byte()]).unwrap_or("");
        if DOC_PREFIXES.iter().any(|prefix| text.starts_with(*prefix)) {
            found.push(text);
        }
        sibling = comment.prev_sibling();
    }

    let mut doc = String::new();
    for text in found.into_iter().rev() {
        doc.push('\n');
        doc.push_str(text);
    }
    doc
}
/// Extracts lightweight search hints from a documentation string.
///
/// Returns `(keywords, code_refs)`, both sorted ascending and de-duplicated:
///
/// * `code_refs` — the non-empty text of every *closed* backtick span
///   (`` `like_this` ``). Text after an unmatched opening backtick is dropped.
/// * `keywords` — words (maximal runs of alphanumerics and `_`) of at least
///   three bytes that are either made up solely of ASCII uppercase letters and
///   digits, or start with an ASCII uppercase letter and contain an ASCII
///   lowercase letter later on. Words inside backtick spans are eligible too.
pub(super) fn nlp_from_doc(doc: &str) -> (Vec<String>, Vec<String>) {
    // Pass 1: backtick-delimited code references.
    let mut code_refs: Vec<String> = Vec::new();
    let mut in_backticks = false;
    let mut span = String::new();
    for ch in doc.chars() {
        if ch == '`' {
            // A backtick seen while inside a span closes it.
            if in_backticks && !span.is_empty() {
                code_refs.push(std::mem::take(&mut span));
            }
            in_backticks = !in_backticks;
        } else if in_backticks {
            span.push(ch);
        }
    }

    // Pass 2: acronym-like and capitalised words. The split predicate already
    // treats '`' as a separator, so no word can ever contain a backtick.
    let mut keywords: Vec<String> = Vec::new();
    for word in doc.split(|c: char| !c.is_alphanumeric() && c != '_') {
        if word.len() < 3 {
            continue;
        }
        let Some(first) = word.chars().next() else {
            continue;
        };
        let all_upper = word
            .chars()
            .all(|c| c.is_ascii_uppercase() || c.is_ascii_digit());
        let title =
            first.is_ascii_uppercase() && word.chars().skip(1).any(|c| c.is_ascii_lowercase());
        if all_upper || title {
            keywords.push(word.to_string());
        }
    }

    keywords.sort();
    keywords.dedup();
    code_refs.sort();
    code_refs.dedup();
    (keywords, code_refs)
}
/// Prefixes a method name with the name of its enclosing type.
///
/// Only applies when `chunk_type` is `ChunkType::Method` and `name` is
/// non-empty; otherwise `name` is returned untouched. The enclosing type is
/// looked up per language:
///
/// * `"rust"`  — `rust_impl_type_name`
/// * `"scala"` — `scala_enclosing_class_name` (empty results ignored)
/// * `"php"`   — `php_enclosing_class_name` (empty results ignored)
///
/// When a type name is found the result is `"{type}::{name}"`; for any other
/// language, or when no type is found, `name` is returned as is.
fn qualify_method_name(
    lang: &str,
    chunk_type: &ChunkType,
    node: Node<'_>,
    src: &[u8],
    name: String,
) -> String {
    let is_named_method = *chunk_type == ChunkType::Method && !name.is_empty();
    if !is_named_method {
        return name;
    }

    let qualified = match lang {
        "rust" => rust_impl_type_name(node, src).map(|ty| format!("{ty}::{name}")),
        "scala" => scala_enclosing_class_name(node, src)
            .filter(|ty| !ty.is_empty())
            .map(|ty| format!("{ty}::{name}")),
        "php" => php_enclosing_class_name(node, src)
            .filter(|ty| !ty.is_empty())
            .map(|ty| format!("{ty}::{name}")),
        _ => None,
    };
    qualified.unwrap_or(name)
}
/// Reports whether `kind` is one of the function-like node kinds listed
/// below (functions, methods, constructors and initialisers).
///
/// `walk_for_chunks` uses this to decide whether to stop descending after
/// emitting a chunk for a node.
fn is_function_body_node(kind: &str) -> bool {
    const FUNCTION_LIKE_KINDS: [&str; 9] = [
        "function_item",
        "function_declaration",
        "function_definition",
        "method_declaration",
        "method_definition",
        "constructor_declaration",
        "secondary_constructor",
        "init_declaration",
        "protocol_function_declaration",
    ];
    FUNCTION_LIKE_KINDS.iter().any(|known| *known == kind)
}
/// Recursively walks the syntax tree under `node`, appending a `RawChunk` to
/// `out` for every node that `classify_node` recognises for `lang`.
///
/// For a recognised node the chunk records its 1-based line span, its source
/// text (empty if the byte range is not valid UTF-8), its possibly qualified
/// name, the calls and inheritance collected from it, and the keywords / code
/// references extracted from the doc comments preceding it. The chunk is
/// pushed before any of its descendants are visited.
///
/// Descent rules:
/// * recognised function-like node (`is_function_body_node`) — children are
///   not visited;
/// * any other recognised node — children are visited at `depth + 1`;
/// * unrecognised node — children are visited at the same `depth`.
///
/// `parent_chunk_id`, `child_chunk_ids` and `virtual_terms` are left empty here.
pub(super) fn walk_for_chunks(
    node: Node<'_>,
    src: &[u8],
    file: &str,
    lang: &str,
    line_offsets: &[usize],
    depth: usize,
    out: &mut Vec<RawChunk>,
) {
    let child_depth = match classify_node(lang, node) {
        None => depth,
        Some(chunk_type) => {
            let (first_byte, past_last_byte) = (node.start_byte(), node.end_byte());
            let start_line = line_for_byte(line_offsets, first_byte);
            // `end_byte` is exclusive; step back one byte to land on the last line.
            let end_line = line_for_byte(line_offsets, past_last_byte.saturating_sub(1));
            let content = std::str::from_utf8(&src[first_byte..past_last_byte])
                .unwrap_or("")
                .to_string();

            let name = qualify_method_name(lang, &chunk_type, node, src, name_of(node, src));
            let calls = collect_calls(node, src, lang);
            let inherits_from = collect_inherits(node, src, lang);
            let doc = preceding_doc_comments(node, src);
            let (nlp_keywords, nlp_code_refs) = nlp_from_doc(&doc);

            let id = make_chunk_id(file, &chunk_type, &name, start_line, end_line);
            let function_name = (!name.is_empty()).then_some(name);

            out.push(RawChunk {
                id,
                file: file.to_string(),
                start_line,
                end_line,
                content,
                function_name,
                language: Some(lang.to_string()),
                chunk_type,
                calls,
                inherits_from,
                chunk_depth: depth,
                parent_chunk_id: None,
                child_chunk_ids: Vec::new(),
                nlp_keywords,
                nlp_code_refs,
                virtual_terms: Vec::new(),
            });

            // Function-like chunks are leaves: nothing inside them is chunked.
            if is_function_body_node(node.kind()) {
                return;
            }
            depth + 1
        }
    };

    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        walk_for_chunks(child, src, file, lang, line_offsets, child_depth, out);
    }
}
/// Breaks chunks spanning more than `MAX_CHUNK_LINES` lines into overlapping
/// sub-chunks, keeping the original chunk as their parent.
///
/// Chunks within the limit are passed through unchanged. For an oversized
/// chunk, its content lines are covered by windows of up to
/// `SUB_CHUNK_WINDOW` lines whose starts are `SUB_CHUNK_STRIDE` lines apart;
/// windowing stops with the first window that reaches the last line. Each
/// sub-chunk:
///
/// * has the id `"{parent_id}::sub::{index}"` and `parent_chunk_id` set to
///   the parent's id;
/// * copies the parent's file, function name, language, chunk type and depth;
/// * has empty calls, inheritance, NLP and virtual-term lists.
///
/// The sub-chunks are emitted first, followed by the parent itself with
/// `child_chunk_ids` set to the sub-chunk ids. Relative order of the input
/// chunks is preserved.
pub(super) fn split_oversized(chunks: Vec<RawChunk>) -> Vec<RawChunk> {
    let mut result: Vec<RawChunk> = Vec::with_capacity(chunks.len());

    for chunk in chunks {
        let span = chunk.end_line.saturating_sub(chunk.start_line) + 1;
        if span <= MAX_CHUNK_LINES {
            result.push(chunk);
            continue;
        }

        let content_lines: Vec<&str> = chunk.content.lines().collect();
        let total = content_lines.len();
        let mut sub_ids: Vec<String> = Vec::new();

        for (sub_idx, window_start) in (0..total).step_by(SUB_CHUNK_STRIDE).enumerate() {
            let window_end = total.min(window_start + SUB_CHUNK_WINDOW);
            let sub_id = format!("{}::sub::{sub_idx}", chunk.id);
            sub_ids.push(sub_id.clone());

            result.push(RawChunk {
                id: sub_id,
                file: chunk.file.clone(),
                start_line: chunk.start_line + window_start,
                end_line: chunk.start_line + window_end - 1,
                content: content_lines[window_start..window_end].join("\n"),
                function_name: chunk.function_name.clone(),
                language: chunk.language.clone(),
                chunk_type: chunk.chunk_type.clone(),
                calls: Vec::new(),
                inherits_from: Vec::new(),
                chunk_depth: chunk.chunk_depth,
                parent_chunk_id: Some(chunk.id.clone()),
                child_chunk_ids: Vec::new(),
                nlp_keywords: Vec::new(),
                nlp_code_refs: Vec::new(),
                virtual_terms: Vec::new(),
            });

            // This window already reached the final line; later ones would
            // only repeat its tail.
            if window_end == total {
                break;
            }
        }

        result.push(RawChunk {
            child_chunk_ids: sub_ids,
            ..chunk
        });
    }

    result
}