use super::*;
use ast_grep_core::tree_sitter::LanguageExt;
use std::collections::HashSet;
/// Test helper: chunks `source` as though it were a file named `test.<ext>`.
///
/// Resolves the language for `ext` through `AstGrepEngine`, extracts symbols from the
/// source, parses it, and passes all of that to `chunk_file` together with `config`.
///
/// # Panics
///
/// Panics when the engine's `find_language(ext)` lookup does not yield a language.
fn parse_and_chunk(source: &str, ext: &str, config: &ChunkConfig) -> Vec<CodeChunk> {
    let file_name = format!("test.{ext}");
    let ast_engine = crate::index::engine::AstGrepEngine::new();
    let language = ast_engine
        .find_language(ext)
        .expect("extractor for extension");
    let extracted_symbols = ast_engine.extract_symbols(language, source, &file_name);
    let syntax_tree = language.lang.ast_grep(source);
    chunk_file(&syntax_tree, source, &file_name, &extracted_symbols, config)
}
/// `ChunkConfig::default()` must report a max chunk size of 1500 and a min of 50.
#[test]
fn test_chunk_config_default() {
    let defaults = ChunkConfig::default();
    let (max_size, min_size) = (defaults.max_chunk_size, defaults.min_chunk_size);
    assert_eq!(max_size, 1500);
    assert_eq!(min_size, 50);
}
/// A one-line Rust function chunked with default settings yields exactly one chunk:
/// index 0, file path `test.rs`, and a positive non-whitespace character count.
#[test]
fn small_file_produces_single_chunk() {
    let produced = parse_and_chunk(
        r#"pub fn hello() { println!("hello"); }"#,
        "rs",
        &ChunkConfig::default(),
    );
    assert_eq!(produced.len(), 1, "A small file should produce one chunk");

    let only = &produced[0];
    assert_eq!(only.index, 0);
    assert_eq!(only.file_path, "test.rs");
    assert!(only.non_ws_chars > 0);
}
/// Fifty generated functions chunked with a 200-char maximum must split into more than
/// one chunk, and no chunk may exceed the configured maximum of non-whitespace chars.
#[test]
fn large_file_produces_multiple_chunks() {
    let source: String = (0..50)
        .map(|i| {
            format!(
                "pub fn function_{i}(x: i32) -> i32 {{\n let result = x * {i} + 1;\n result\n}}\n\n"
            )
        })
        .collect();
    let limits = ChunkConfig {
        max_chunk_size: 200,
        min_chunk_size: 20,
        ..Default::default()
    };

    let produced = parse_and_chunk(&source, "rs", &limits);
    assert!(
        produced.len() > 1,
        "A large file with small max_chunk_size should produce multiple chunks, got {}",
        produced.len()
    );

    produced.iter().for_each(|piece| {
        assert!(
            piece.non_ws_chars <= limits.max_chunk_size,
            "Chunk {} has {} non-ws chars, exceeding max {}",
            piece.index,
            piece.non_ws_chars,
            limits.max_chunk_size,
        );
    });
}
/// Chunking a struct plus its `impl` with a tiny size limit must yield at least one
/// chunk whose `parent_symbol` is set.
#[test]
fn parent_symbol_resolution() {
    const SOURCE: &str = r#"
pub struct Config {
pub debug: bool,
}
impl Config {
pub fn new() -> Self {
Config { debug: false }
}
pub fn enable_debug(&mut self) {
self.debug = true;
}
}
"#;
    let limits = ChunkConfig {
        max_chunk_size: 50,
        min_chunk_size: 10,
        ..Default::default()
    };

    let produced = parse_and_chunk(SOURCE, "rs", &limits);
    let any_parent_resolved = produced.iter().any(|piece| piece.parent_symbol.is_some());
    assert!(
        any_parent_resolved,
        "At least one chunk should have a resolved parent symbol"
    );
}
/// Four levels of nested `if` inside one function, chunked with a 30-char maximum,
/// must still produce a non-empty chunk list.
#[test]
fn deep_nesting_still_produces_chunks() {
    const SOURCE: &str = r#"
pub fn outer() {
if true {
if true {
if true {
if true {
println!("deeply nested");
}
}
}
}
}
"#;
    let limits = ChunkConfig {
        max_chunk_size: 30,
        min_chunk_size: 5,
        ..Default::default()
    };

    let produced = parse_and_chunk(SOURCE, "rs", &limits);
    assert!(
        !produced.is_empty(),
        "Should produce at least one chunk from deeply nested code"
    );
}
/// With a 5000-char maximum, three tiny functions must end up in a single chunk.
#[test]
fn custom_chunk_config() {
    let roomy = ChunkConfig {
        max_chunk_size: 5000,
        min_chunk_size: 1,
        ..Default::default()
    };
    let produced = parse_and_chunk(
        "pub fn a() {}\npub fn b() {}\npub fn c() {}\n",
        "rs",
        &roomy,
    );
    assert_eq!(
        produced.len(),
        1,
        "With large max_chunk_size, file should be a single chunk"
    );
}
/// An empty source string must produce no chunks under the default configuration.
#[test]
fn empty_file_produces_zero_chunks() {
    let produced = parse_and_chunk("", "rs", &ChunkConfig::default());
    assert!(produced.is_empty(), "Empty file should produce zero chunks");
}
/// A source made only of spaces, tabs and newlines must produce no chunks under the
/// default configuration.
#[test]
fn whitespace_only_file_produces_zero_chunks() {
    let blank = " \n\n \t \n";
    let produced = parse_and_chunk(blank, "rs", &ChunkConfig::default());
    assert!(
        produced.is_empty(),
        "Whitespace-only file should produce zero chunks"
    );
}
/// A small Rust file (function, struct, impl) must produce at least one chunk, and every
/// chunk must carry the `test.rs` file path.
#[test]
fn multi_language_rust() {
    const SOURCE: &str = r#"
pub fn greet(name: &str) -> String {
format!("Hello, {}!", name)
}
pub struct Greeter {
pub prefix: String,
}
impl Greeter {
pub fn new(prefix: &str) -> Self {
Self { prefix: prefix.to_string() }
}
}
"#;
    let produced = parse_and_chunk(SOURCE, "rs", &ChunkConfig::default());
    assert!(!produced.is_empty(), "Rust file should produce chunks");
    produced
        .iter()
        .for_each(|piece| assert_eq!(piece.file_path, "test.rs"));
}
/// A small TypeScript file (function and class) must produce at least one chunk, and
/// every chunk must carry the `test.ts` file path.
#[test]
fn multi_language_typescript() {
    const SOURCE: &str = r#"
export function greet(name: string): string {
return `Hello, ${name}!`;
}
export class Greeter {
prefix: string;
constructor(prefix: string) {
this.prefix = prefix;
}
}
"#;
    let produced = parse_and_chunk(SOURCE, "ts", &ChunkConfig::default());
    assert!(!produced.is_empty(), "TypeScript file should produce chunks");
    produced
        .iter()
        .for_each(|piece| assert_eq!(piece.file_path, "test.ts"));
}
/// Each chunk's `index` must equal its position in the returned list (0, 1, 2, ...).
#[test]
fn chunks_are_contiguous_indices() {
    const SOURCE: &str = r#"
pub fn a() { }
pub fn b() { }
pub fn c() { }
pub fn d() { }
"#;
    let limits = ChunkConfig {
        max_chunk_size: 20,
        min_chunk_size: 5,
        ..Default::default()
    };

    let produced = parse_and_chunk(SOURCE, "rs", &limits);
    produced.iter().enumerate().for_each(|(position, piece)| {
        assert_eq!(
            piece.index, position,
            "Chunk indices should be contiguous starting from 0"
        );
    });
}
/// For every chunk, the end line must not precede the start line and the end byte must
/// not precede the start byte.
#[test]
fn chunk_line_ranges_are_valid() {
    const SOURCE: &str = r#"
pub fn first() { }
pub fn second() { }
pub fn third() { }
"#;
    let produced = parse_and_chunk(SOURCE, "rs", &ChunkConfig::default());
    produced.iter().for_each(|piece| {
        assert!(
            piece.line_start <= piece.line_end,
            "line_end ({}) should be >= line_start ({})",
            piece.line_end,
            piece.line_start
        );
        assert!(
            piece.byte_start <= piece.byte_end,
            "byte_end ({}) should be >= byte_start ({})",
            piece.byte_end,
            piece.byte_start
        );
    });
}
/// `classify_node` must map each listed node kind to its expected `SemanticCategory`.
#[test]
fn classify_node_categories() {
    // (node kind, expected category), checked in the order listed.
    let expectations = [
        ("use_declaration", SemanticCategory::Import),
        ("import_statement", SemanticCategory::Import),
        ("line_comment", SemanticCategory::Comment),
        ("block_comment", SemanticCategory::Comment),
        ("function_item", SemanticCategory::Declaration),
        ("impl_item", SemanticCategory::Declaration),
        ("class_declaration", SemanticCategory::Declaration),
        ("struct_item", SemanticCategory::Declaration),
        ("expression_statement", SemanticCategory::Other),
        ("if_expression", SemanticCategory::Other),
    ];
    for (kind, expected) in expectations {
        assert_eq!(classify_node(kind), expected, "node kind: {kind}");
    }
}
/// `is_semantic_boundary` must accept the listed declaration kinds and reject the listed
/// import, expression and comment kinds.
#[test]
fn is_semantic_boundary_detects_declarations() {
    let boundaries = [
        "function_item",
        "method_definition",
        "class_declaration",
        "impl_item",
    ];
    for kind in boundaries {
        assert!(is_semantic_boundary(kind), "node kind: {kind}");
    }

    let non_boundaries = ["use_declaration", "if_expression", "line_comment"];
    for kind in non_boundaries {
        assert!(!is_semantic_boundary(kind), "node kind: {kind}");
    }
}
/// An `impl` with three methods, chunked with a 100-char maximum, must yield at least
/// three chunks whose text contains `fn method_`.
#[test]
fn impl_block_splits_at_method_boundaries() {
    const SOURCE: &str = r#"
impl MyStruct {
pub fn method_a(&self) -> String {
let x = "hello world".to_string();
let y = x.repeat(10);
let z = y.trim().to_uppercase();
format!("{z} done")
}
pub fn method_b(&self) -> i32 {
let a = 42;
let b = a * 2;
let c = b + a;
c * 3
}
pub fn method_c(&self) -> bool {
let flag = true;
let result = !flag;
result || flag
}
}
"#;
    let limits = ChunkConfig {
        max_chunk_size: 100,
        min_chunk_size: 10,
        ..Default::default()
    };

    let produced = parse_and_chunk(SOURCE, "rs", &limits);
    let method_chunk_count = produced
        .iter()
        .filter(|piece| piece.text.contains("fn method_"))
        .count();
    assert!(
        method_chunk_count >= 3,
        "Each method should be a separate chunk, got {} method chunks from {} total",
        method_chunk_count,
        produced.len()
    );
}
/// Chunks a `use` line followed by a tiny function with a high `min_chunk_size`.
///
/// The result must be non-empty. Only when two or more chunks come back does the test
/// go on to require that some chunk contains `use std` and some chunk contains `fn tiny`.
#[test]
fn imports_not_merged_with_functions() {
    const SOURCE: &str = r#"
use std::collections::HashMap;
pub fn tiny() -> i32 { 1 }
"#;
    let limits = ChunkConfig {
        max_chunk_size: 500,
        min_chunk_size: 100,
        ..Default::default()
    };

    let produced = parse_and_chunk(SOURCE, "rs", &limits);
    assert!(!produced.is_empty());
    if produced.len() < 2 {
        return;
    }

    let some_chunk_contains =
        |needle: &str| produced.iter().any(|piece| piece.text.contains(needle));
    assert!(some_chunk_contains("use std"), "Should have an import chunk");
    assert!(some_chunk_contains("fn tiny"), "Should have a function chunk");
}
/// Splitting one oversized function should inject its signature as context.
///
/// Builds `big_function` with 30 `let` statements and chunks it with a 150-char maximum.
/// When more than one chunk results, at least one chunk's text must contain both the
/// `[context:` marker and the name `big_function`.
#[test]
fn signature_injected_for_inner_chunks() {
    let mut body = String::new();
    for i in 0..30 {
        body.push_str(&format!(" let var_{i} = {i} * 2 + 1;\n"));
    }
    let source = format!("pub fn big_function(x: i32, y: i32) -> i32 {{\n{body} x + y\n}}");
    let config = ChunkConfig {
        max_chunk_size: 150,
        min_chunk_size: 20,
        ..Default::default()
    };
    let chunks = parse_and_chunk(&source, "rs", &config);
    if chunks.len() > 1 {
        let has_injected_signature = chunks
            .iter()
            .any(|c| c.text.contains("[context:") && c.text.contains("big_function"));
        assert!(
            has_injected_signature,
            "Inner chunks should have signature context injected. Chunks: {:?}",
            // Preview by characters rather than bytes: slicing a `str` at a fixed byte
            // offset panics when the offset falls inside a multi-byte character, which
            // would replace this diagnostic with an unrelated slicing panic.
            chunks
                .iter()
                .map(|c| c.text.chars().take(60).collect::<String>())
                .collect::<Vec<_>>()
        );
    }
}
/// A signature shorter than the limit must come back unchanged.
#[test]
fn truncate_signature_short() {
    let signature = "fn foo()";
    assert_eq!(truncate_signature(signature, 120), signature);
}
/// A signature longer than the limit must be cut to at most 60 bytes while keeping its
/// leading `pub fn very_long_function_name` prefix.
#[test]
fn truncate_signature_long() {
    let long_signature = "pub fn very_long_function_name(param1: String, param2: HashMap<String, Vec<i32>>, param3: Option<Box<dyn Trait>>) -> Result<String, Error>";
    let shortened = truncate_signature(long_signature, 60);
    assert!(shortened.len() <= 60);
    assert!(shortened.starts_with("pub fn very_long_function_name"));
}
/// For a multi-line signature that fits within the limit, only the first line is returned.
#[test]
fn truncate_signature_multiline() {
    let multiline = "pub fn foo(\n x: i32,\n y: i32,\n) -> i32";
    let first_line_only = truncate_signature(multiline, 120);
    assert_eq!(first_line_only, "pub fn foo(");
}
/// Two chunks of the same category (import, declaration, or other) must be mergeable.
#[test]
fn categories_mergeable_same() {
    let same_category_pairs = [
        (SemanticCategory::Import, SemanticCategory::Import),
        (SemanticCategory::Declaration, SemanticCategory::Declaration),
        (SemanticCategory::Other, SemanticCategory::Other),
    ];
    for (left, right) in same_category_pairs {
        assert!(categories_mergeable(left, right));
    }
}
/// A comment must be mergeable with an import or a declaration, on either side of the pair.
#[test]
fn categories_mergeable_comment_with_anything() {
    let comment_pairs = [
        (SemanticCategory::Comment, SemanticCategory::Import),
        (SemanticCategory::Comment, SemanticCategory::Declaration),
        (SemanticCategory::Declaration, SemanticCategory::Comment),
    ];
    for (left, right) in comment_pairs {
        assert!(categories_mergeable(left, right));
    }
}
/// Imports must not be mergeable with declarations (in either order) or with `Other`.
#[test]
fn categories_not_mergeable_different() {
    let mismatched_pairs = [
        (SemanticCategory::Import, SemanticCategory::Declaration),
        (SemanticCategory::Declaration, SemanticCategory::Import),
        (SemanticCategory::Import, SemanticCategory::Other),
    ];
    for (left, right) in mismatched_pairs {
        assert!(!categories_mergeable(left, right));
    }
}
/// A TypeScript class with a constructor and two methods, chunked with a 120-char
/// maximum, must produce at least two chunks.
#[test]
fn typescript_class_splits_at_methods() {
    const SOURCE: &str = r#"
export class MyService {
private data: Map<string, number> = new Map();
constructor(private name: string) {
this.data.set("init", 0);
this.data.set("count", 0);
this.data.set("total", 0);
}
public getData(): Map<string, number> {
const copy = new Map(this.data);
copy.set("accessed", Date.now());
return copy;
}
public processItem(item: string): number {
const len = item.length;
const hash = len * 31;
this.data.set(item, hash);
return hash;
}
}
"#;
    let limits = ChunkConfig {
        max_chunk_size: 120,
        min_chunk_size: 10,
        ..Default::default()
    };

    let produced = parse_and_chunk(SOURCE, "ts", &limits);
    let total = produced.len();
    assert!(
        total >= 2,
        "TypeScript class should split into multiple chunks at method boundaries, got {}",
        total
    );
}
/// A Python file with two imports and three functions, chunked with a 100-char maximum,
/// must yield at least two chunks whose text contains `def `.
#[test]
fn python_file_splits_at_function_boundaries() {
    const SOURCE: &str = r#"
import os
import sys
def function_one(x):
result = x * 2
extra = result + 1
final = extra * 3
return final
def function_two(y):
value = y + 10
adjusted = value - 5
computed = adjusted * 2
return computed
def function_three(z):
base = z ** 2
modified = base + z
output = modified * 4
return output
"#;
    let limits = ChunkConfig {
        max_chunk_size: 100,
        min_chunk_size: 10,
        ..Default::default()
    };

    let produced = parse_and_chunk(SOURCE, "py", &limits);
    let def_chunk_count = produced
        .iter()
        .filter(|piece| piece.text.contains("def "))
        .count();
    assert!(
        def_chunk_count >= 2,
        "Python functions should be separate chunks, got {} function chunks",
        def_chunk_count
    );
}
/// A single function whose body is one long `0 + 1 + ... + 99` expression, chunked with
/// a 50-char maximum, must still produce at least one chunk.
#[test]
fn single_large_function_no_children_emits_chunk() {
    let operands: Vec<String> = (0..100).map(|i| i.to_string()).collect();
    let long_expr = operands.join(" + ");
    let source = format!("pub fn huge() -> i32 {{ {long_expr} }}");
    let limits = ChunkConfig {
        max_chunk_size: 50,
        min_chunk_size: 10,
        ..Default::default()
    };

    let produced = parse_and_chunk(&source, "rs", &limits);
    assert!(
        !produced.is_empty(),
        "Should produce at least one chunk even for oversized nodes"
    );
}
/// With `overlap_lines: 1`, every chunk after the first must either span more than one
/// line or have non-empty text.
#[test]
fn overlap_preserved_with_semantic_splitting() {
    const SOURCE: &str = r#"
pub fn a() -> i32 { 1 }
pub fn b() -> i32 { 2 }
pub fn c() -> i32 { 3 }
"#;
    let limits = ChunkConfig {
        max_chunk_size: 30,
        min_chunk_size: 5,
        overlap_lines: 1,
    };

    let produced = parse_and_chunk(SOURCE, "rs", &limits);
    // `skip(1)` yields nothing for zero or one chunk, so no length guard is needed.
    for piece in produced.iter().skip(1) {
        assert!(
            piece.line_start < piece.line_end || !piece.text.is_empty(),
            "Overlapping chunks should have content"
        );
    }
}
/// Splitting an `impl` at method boundaries must not emit two chunks with identical
/// text: the number of distinct texts must equal the number of chunks.
#[test]
fn no_duplicate_chunks_from_boundary_splitting() {
    const SOURCE: &str = r#"
impl Foo {
fn bar(&self) -> i32 { 42 }
fn baz(&self) -> i32 { 99 }
}
"#;
    let limits = ChunkConfig {
        max_chunk_size: 60,
        min_chunk_size: 5,
        ..Default::default()
    };

    let produced = parse_and_chunk(SOURCE, "rs", &limits);
    let distinct_texts = produced
        .iter()
        .map(|piece| &piece.text)
        .collect::<HashSet<_>>();
    assert_eq!(
        distinct_texts.len(),
        produced.len(),
        "Should not produce duplicate chunks"
    );
}