use std::path::Path;
use std::sync::atomic::{AtomicU64, Ordering};
use tree_sitter::Node;
use crate::error::ParserError;
use crate::parser::treesitter::SupportedLanguage;
use crate::parser::{ChunkKind, CodeChunk, ParseResult};
static CHUNK_ID_COUNTER: AtomicU64 = AtomicU64::new(1);
fn next_chunk_id() -> u64 {
CHUNK_ID_COUNTER.fetch_add(1, Ordering::Relaxed)
}
const MIN_CHUNK_LINES: usize = 2;
const FALLBACK_CHUNK_SIZE: usize = 50;
pub fn chunk_file(
path: &Path,
source: &str,
lang: SupportedLanguage,
) -> Result<ParseResult, ParserError> {
let tree = crate::parser::treesitter::parse_source(source, lang)?;
let root = tree.root_node();
let chunk_kinds = lang.chunk_node_kinds();
let mut chunks = Vec::new();
if chunk_kinds.is_empty() {
if source.lines().count() <= FALLBACK_CHUNK_SIZE * 2 {
chunks.push(CodeChunk {
id: next_chunk_id(),
file_path: path.to_path_buf(),
language: lang.name().to_string(),
kind: ChunkKind::Block,
name: path.file_name().and_then(|f| f.to_str()).map(String::from),
signature: None,
doc_comment: None,
body: source.to_string(),
byte_range: 0..source.len(),
line_range: 0..source.lines().count(),
});
}
} else {
extract_chunks_recursive(
&root,
source,
path,
lang,
chunk_kinds,
&mut chunks,
);
if chunks.is_empty() {
chunks = fallback_line_chunks(path, source, lang);
}
}
Ok(ParseResult {
chunks,
language: lang.name().to_string(),
})
}
fn extract_chunks_recursive(
node: &Node,
source: &str,
file_path: &Path,
lang: SupportedLanguage,
chunk_kinds: &[&str],
chunks: &mut Vec<CodeChunk>,
) {
let kind = node.kind();
if chunk_kinds.contains(&kind) {
if let Some(chunk) = node_to_chunk(node, source, file_path, lang) {
let line_count = chunk.line_range.end - chunk.line_range.start;
if line_count >= MIN_CHUNK_LINES {
chunks.push(chunk);
}
}
if should_recurse_into(kind) {
let mut cursor = node.walk();
if cursor.goto_first_child() {
loop {
let child = cursor.node();
extract_chunks_recursive(
&child, source, file_path, lang, chunk_kinds, chunks,
);
if !cursor.goto_next_sibling() {
break;
}
}
}
}
} else {
let mut cursor = node.walk();
if cursor.goto_first_child() {
loop {
let child = cursor.node();
extract_chunks_recursive(
&child, source, file_path, lang, chunk_kinds, chunks,
);
if !cursor.goto_next_sibling() {
break;
}
}
}
}
}
fn should_recurse_into(kind: &str) -> bool {
matches!(
kind,
"impl_item"
| "class_declaration"
| "class_definition"
| "class_specifier"
| "interface_declaration"
| "namespace_definition"
| "module"
| "mod_item"
| "export_statement"
| "decorated_definition"
)
}
fn node_to_chunk(
node: &Node,
source: &str,
file_path: &Path,
lang: SupportedLanguage,
) -> Option<CodeChunk> {
let start_byte = node.start_byte();
let end_byte = node.end_byte();
if end_byte <= start_byte || end_byte > source.len() {
return None;
}
let body = source[start_byte..end_byte].to_string();
let start_line = node.start_position().row;
let end_line = node.end_position().row + 1;
let kind = classify_node_kind(node.kind(), lang);
let name = extract_node_name(node, source);
let signature = extract_signature(node, source, lang);
let doc_comment = extract_doc_comment(node, source, start_line);
Some(CodeChunk {
id: next_chunk_id(),
file_path: file_path.to_path_buf(),
language: lang.name().to_string(),
kind,
name,
signature,
doc_comment,
body,
byte_range: start_byte..end_byte,
line_range: start_line..end_line,
})
}
fn classify_node_kind(ts_kind: &str, _lang: SupportedLanguage) -> ChunkKind {
match ts_kind {
"function_item" | "function_definition" | "function_declaration" | "arrow_function" => {
ChunkKind::Function
}
"method_definition" | "method_declaration" | "method" | "singleton_method"
| "constructor_declaration" => ChunkKind::Method,
"class_declaration" | "class_definition" | "class_specifier" => ChunkKind::Class,
"struct_item" | "struct_specifier" => ChunkKind::Struct,
"enum_item" | "enum_declaration" | "enum_specifier" => ChunkKind::Enum,
"interface_declaration" | "trait_item" => ChunkKind::Interface,
"mod_item" | "namespace_definition" | "module" => ChunkKind::Module,
"impl_item" => ChunkKind::Module,
_ => ChunkKind::Block,
}
}
fn extract_node_name(node: &Node, source: &str) -> Option<String> {
for field_name in &["name", "declarator"] {
if let Some(name_node) = node.child_by_field_name(field_name) {
let name = &source[name_node.start_byte()..name_node.end_byte()];
return Some(name.to_string());
}
}
let mut cursor = node.walk();
if cursor.goto_first_child() {
loop {
let child = cursor.node();
if child.kind() == "identifier" || child.kind() == "type_identifier" {
let name = &source[child.start_byte()..child.end_byte()];
return Some(name.to_string());
}
if !cursor.goto_next_sibling() {
break;
}
}
}
None
}
fn extract_signature(node: &Node, source: &str, _lang: SupportedLanguage) -> Option<String> {
let body = &source[node.start_byte()..node.end_byte()];
if let Some(pos) = body.find('{') {
let sig = body[..pos].trim();
if !sig.is_empty() {
return Some(sig.to_string());
}
}
if let Some(pos) = body.find(':') {
let before_colon = &body[..pos];
if before_colon.contains("def ") || before_colon.contains("class ") {
let sig = body[..=pos].trim();
if !sig.is_empty() {
return Some(sig.to_string());
}
}
}
let first_line = body.lines().next().map(|l| l.trim().to_string());
first_line.filter(|l| !l.is_empty())
}
fn extract_doc_comment(
node: &Node,
source: &str,
_node_start_line: usize,
) -> Option<String> {
let mut prev = node.prev_sibling();
let mut comments = Vec::new();
while let Some(sibling) = prev {
let kind = sibling.kind();
if kind == "line_comment" || kind == "comment" || kind == "block_comment" {
let text = &source[sibling.start_byte()..sibling.end_byte()];
comments.push(text.to_string());
prev = sibling.prev_sibling();
} else {
break;
}
}
if comments.is_empty() {
return None;
}
comments.reverse();
let combined = comments.join("\n");
let cleaned: String = combined
.lines()
.map(|line| {
let trimmed = line.trim();
if let Some(stripped) = trimmed.strip_prefix("///") {
stripped.trim().to_string()
} else if let Some(stripped) = trimmed.strip_prefix("//!") {
stripped.trim().to_string()
} else if let Some(stripped) = trimmed.strip_prefix("//") {
stripped.trim().to_string()
} else if let Some(stripped) = trimmed.strip_prefix('#') {
stripped.trim().to_string()
} else {
trimmed.to_string()
}
})
.collect::<Vec<_>>()
.join("\n");
if cleaned.trim().is_empty() {
None
} else {
Some(cleaned)
}
}
fn fallback_line_chunks(
file_path: &Path,
source: &str,
lang: SupportedLanguage,
) -> Vec<CodeChunk> {
let lines: Vec<&str> = source.lines().collect();
let total_lines = lines.len();
if total_lines == 0 {
return Vec::new();
}
let mut chunks = Vec::new();
let mut offset = 0;
for chunk_start in (0..total_lines).step_by(FALLBACK_CHUNK_SIZE) {
let chunk_end = (chunk_start + FALLBACK_CHUNK_SIZE).min(total_lines);
let chunk_lines = &lines[chunk_start..chunk_end];
let body = chunk_lines.join("\n");
let byte_start = offset;
let byte_end = offset + body.len();
offset = byte_end + 1;
chunks.push(CodeChunk {
id: next_chunk_id(),
file_path: file_path.to_path_buf(),
language: lang.name().to_string(),
kind: ChunkKind::Block,
name: Some(format!(
"{}:L{}-L{}",
file_path.file_name().unwrap_or_default().to_string_lossy(),
chunk_start + 1,
chunk_end
)),
signature: None,
doc_comment: None,
body,
byte_range: byte_start..byte_end,
line_range: chunk_start..chunk_end,
});
}
chunks
}
pub fn chunk_file_from_path(path: &Path) -> Result<Option<ParseResult>, ParserError> {
let lang = match SupportedLanguage::from_path(path) {
Some(l) => l,
None => return Ok(None), };
let source = std::fs::read_to_string(path).map_err(ParserError::Io)?;
if crate::scanner::filter::is_binary_content(source.as_bytes()) {
return Ok(None);
}
let result = chunk_file(path, &source, lang)?;
Ok(Some(result))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_chunk_rust_file() {
let source = r#"
/// A greeting function.
fn greet(name: &str) -> String {
format!("Hello, {}!", name)
}
/// A struct.
struct User {
name: String,
age: u32,
}
impl User {
fn new(name: String, age: u32) -> Self {
Self { name, age }
}
fn display(&self) -> String {
format!("{} ({})", self.name, self.age)
}
}
"#;
let result = chunk_file(
Path::new("test.rs"),
source,
SupportedLanguage::Rust,
)
.unwrap();
assert_eq!(result.language, "rust");
assert!(!result.chunks.is_empty(), "Should find at least some chunks");
let greet = result.chunks.iter().find(|c| c.name.as_deref() == Some("greet"));
assert!(greet.is_some(), "Should find greet function");
let greet = greet.unwrap();
assert_eq!(greet.kind, ChunkKind::Function);
assert!(greet.doc_comment.is_some(), "Should extract doc comment");
assert!(greet.signature.is_some(), "Should extract signature");
}
#[test]
fn test_chunk_python_file() {
let source = r#"
class Calculator:
"""A simple calculator."""
def add(self, a: int, b: int) -> int:
"""Add two numbers."""
return a + b
def subtract(self, a: int, b: int) -> int:
return a - b
def standalone_function(x: str) -> bool:
return len(x) > 0
"#;
let result = chunk_file(
Path::new("calc.py"),
source,
SupportedLanguage::Python,
)
.unwrap();
assert_eq!(result.language, "python");
assert!(!result.chunks.is_empty());
}
#[test]
fn test_chunk_javascript_file() {
let source = r#"
function fetchData(url) {
return fetch(url).then(r => r.json());
}
class EventEmitter {
constructor() {
this.listeners = {};
}
on(event, callback) {
this.listeners[event] = callback;
}
}
"#;
let result = chunk_file(
Path::new("app.js"),
source,
SupportedLanguage::JavaScript,
)
.unwrap();
assert_eq!(result.language, "javascript");
assert!(!result.chunks.is_empty());
}
#[test]
fn test_fallback_chunking() {
let source = "#!/bin/bash\necho 'hello'\necho 'world'\n".repeat(30);
let result = chunk_file(
Path::new("script.sh"),
&source,
SupportedLanguage::Bash,
)
.unwrap();
assert!(!result.chunks.is_empty());
}
}