use std::cmp;
/// A piece of text cut out of a larger document, together with the line range
/// it was taken from.
///
/// Line numbers are 1-based and the range is inclusive on both ends.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkInfo {
    /// The text of the chunk; multiple source lines are joined with `\n`.
    pub text: String,
    /// 1-based number of the first source line covered by the chunk.
    pub start_line: usize,
    /// 1-based number of the last source line covered by the chunk (inclusive).
    pub end_line: usize,
}
/// Splits `content` into paragraphs separated by blank lines.
///
/// A line counts as blank when it is empty after trimming whitespace. Runs of
/// consecutive non-blank lines form one chunk: the lines are joined with `\n`
/// and the joined text is trimmed at both ends. Blank lines never appear in a
/// chunk and never produce a chunk on their own.
///
/// The returned line numbers are 1-based and inclusive.
pub fn chunk_by_paragraphs(content: &str) -> Vec<ChunkInfo> {
    let mut paragraphs = Vec::new();
    // Non-blank lines collected since the last blank line.
    let mut pending: Vec<&str> = Vec::new();
    // Line number the next paragraph would start on.
    let mut pending_start = 1;
    // Number of lines consumed so far (i.e. the number of the last line seen).
    let mut lines_seen = 0;

    for (index, line) in content.lines().enumerate() {
        let line_number = index + 1;
        lines_seen = line_number;

        if !line.trim().is_empty() {
            pending.push(line);
            continue;
        }

        // A blank line closes the paragraph in progress, if there is one.
        if !pending.is_empty() {
            paragraphs.push(ChunkInfo {
                text: pending.join("\n").trim().to_string(),
                start_line: pending_start,
                end_line: line_number - 1,
            });
            pending.clear();
        }
        // Whatever follows a blank line starts on the next line at the earliest.
        pending_start = line_number + 1;
    }

    // The content may end without a trailing blank line.
    if !pending.is_empty() {
        paragraphs.push(ChunkInfo {
            text: pending.join("\n").trim().to_string(),
            start_line: pending_start,
            end_line: cmp::max(pending_start, lines_seen),
        });
    }

    paragraphs
}
/// Returns the number of leading whitespace characters in `line`.
///
/// Every whitespace character counts as one, so a tab contributes `1`, not a
/// tab-stop width. A line made up entirely of whitespace yields its full
/// character count.
fn calculate_indentation(line: &str) -> usize {
    line.chars()
        .position(|ch| !ch.is_whitespace())
        // No non-whitespace character at all: the whole line is indentation.
        .unwrap_or_else(|| line.chars().count())
}
/// Turns the buffered lines into a chunk, appends it to `chunks` when it holds
/// real content, and empties the buffer.
///
/// * `chunks` - destination for the finished chunk.
/// * `current_chunk_lines` - the buffered lines; always cleared before returning.
/// * `start_line` - 1-based line number of the first buffered line.
/// * `potential_end_line` - upper bound for the reported end line.
///
/// Trailing blank lines are dropped from both the text and the line range. A
/// chunk is only emitted if at least one remaining line is neither blank nor
/// starts (after trimming) with `//` or `#`.
fn finalize_chunk(
    chunks: &mut Vec<ChunkInfo>,
    current_chunk_lines: &mut Vec<&str>,
    start_line: usize,
    potential_end_line: usize,
) {
    // Index of the last buffered line that still carries text, if any.
    let last_non_empty_index = current_chunk_lines
        .iter()
        .rposition(|line| !line.trim().is_empty());

    if let Some(last_idx) = last_non_empty_index {
        let lines_to_join = &current_chunk_lines[..=last_idx];

        let contains_code = lines_to_join.iter().any(|line| {
            let trimmed = line.trim();
            !trimmed.is_empty() && !trimmed.starts_with("//") && !trimmed.starts_with("#")
        });

        if contains_code {
            chunks.push(ChunkInfo {
                text: lines_to_join.join("\n"),
                start_line,
                // Never report an end line beyond what the caller allows.
                end_line: (start_line + last_idx).min(potential_end_line),
            });
        }
    }

    current_chunk_lines.clear();
}
/// Splits source code into chunks that each begin at a likely top-level
/// definition.
///
/// A line opens a new chunk when it has no leading whitespace and begins with
/// one of a fixed set of keyword prefixes (Rust, JavaScript/TypeScript and
/// Python style definitions). Every following line is added to the open chunk
/// until the next such line or the end of the input. Lines that come before the
/// first opening line are ignored.
///
/// Each chunk is passed through `finalize_chunk` before being stored. Line
/// numbers are 1-based and inclusive.
pub fn chunk_by_indentation(content: &str) -> Vec<ChunkInfo> {
    // Prefixes that mark an unindented line as the start of a definition.
    const TOP_LEVEL_PREFIXES: &[&str] = &[
        "fn ",
        "pub fn",
        "struct ",
        "enum ",
        "impl ",
        "trait ",
        "mod ",
        "function ",
        "async function",
        "class ",
        "interface ",
        "export ",
        "const ",
        "let ",
        "var ",
        "def ",
        "async def",
    ];

    let all_lines: Vec<&str> = content.lines().collect();
    let mut chunks = Vec::new();
    let mut pending_lines: Vec<&str> = Vec::new();
    // Start line of the chunk being collected; `None` until the first definition.
    let mut pending_start: Option<usize> = None;

    for (index, &line) in all_lines.iter().enumerate() {
        let line_number = index + 1;
        let trimmed = line.trim();

        let opens_chunk = calculate_indentation(line) == 0
            && !trimmed.is_empty()
            && !trimmed.starts_with("//")
            && !trimmed.starts_with("#")
            && TOP_LEVEL_PREFIXES
                .iter()
                .any(|&prefix| trimmed.starts_with(prefix));

        match (pending_start, opens_chunk) {
            // Still in the preamble before any definition: skip the line.
            (None, false) => {}
            // First definition of the input.
            (None, true) => {
                pending_start = Some(line_number);
                pending_lines.push(line);
            }
            // A new definition closes the chunk that ends on the previous line.
            (Some(start), true) => {
                finalize_chunk(&mut chunks, &mut pending_lines, start, line_number - 1);
                pending_lines.push(line);
                pending_start = Some(line_number);
            }
            // Body, comment or blank line belonging to the open chunk.
            (Some(_), false) => pending_lines.push(line),
        }
    }

    // Flush the chunk that runs to the end of the input.
    if let Some(start) = pending_start {
        finalize_chunk(&mut chunks, &mut pending_lines, start, all_lines.len());
    }

    chunks
}
/// Splits `content` into fixed-size windows of lines, optionally overlapping.
///
/// * `chunk_size` - maximum number of lines per chunk.
/// * `overlap` - number of lines each window shares with the previous one; the
///   window start advances by `chunk_size - overlap` lines each step.
///
/// Windows start at lines `1`, `1 + step`, `1 + 2 * step`, ... for as long as
/// the start lies inside the content, so the last windows may be shorter than
/// `chunk_size`. Windows whose text is only whitespace are skipped. Lines are
/// joined with `\n`; line numbers are 1-based and inclusive.
///
/// Returns an empty vector when `chunk_size` is `0`, when `overlap` is greater
/// than or equal to `chunk_size` (the window could not advance), or when
/// `content` has no lines.
pub fn chunk_by_lines(content: &str, chunk_size: usize, overlap: usize) -> Vec<ChunkInfo> {
    // A zero step covers both `chunk_size == 0` and `overlap >= chunk_size`.
    let step_size = chunk_size.saturating_sub(overlap);
    if step_size == 0 {
        return Vec::new();
    }

    let lines: Vec<&str> = content.lines().collect();
    let total_lines = lines.len();

    (0..total_lines)
        .step_by(step_size)
        .filter_map(|start_index| {
            // Saturate so a very large `chunk_size` cannot overflow the index
            // arithmetic; the window is clamped to the content anyway.
            let end_index = cmp::min(start_index.saturating_add(chunk_size), total_lines);
            let text = lines[start_index..end_index].join("\n");
            if text.trim().is_empty() {
                None
            } else {
                Some(ChunkInfo {
                    text,
                    start_line: start_index + 1,
                    end_line: end_index,
                })
            }
        })
        .collect()
}
// Unit tests for `chunk_by_paragraphs`.
#[cfg(test)]
mod tests {
    use super::*;

    // Checks one chunk's text and its inclusive, 1-based line range.
    macro_rules! assert_chunk {
        ($chunk:expr, $text:expr, $start:expr, $end:expr) => {{
            assert_eq!($chunk.text, $text);
            assert_eq!($chunk.start_line, $start);
            assert_eq!($chunk.end_line, $end);
        }};
    }

    // Paragraphs separated by single blank lines are split apart.
    #[test]
    fn test_simple_paragraphs() {
        let chunks =
            chunk_by_paragraphs("First paragraph.\nLine two.\n\nSecond paragraph.\n\nThird.");
        assert_eq!(chunks.len(), 3);
        assert_chunk!(chunks[0], "First paragraph.\nLine two.", 1, 2);
        assert_chunk!(chunks[1], "Second paragraph.", 4, 4);
        assert_chunk!(chunks[2], "Third.", 6, 6);
    }

    // Blank lines at either end, or repeated between paragraphs, yield nothing.
    #[test]
    fn test_leading_trailing_empty_lines() {
        let chunks = chunk_by_paragraphs("\n\nFirst paragraph.\n\n\nSecond paragraph.\n\n");
        assert_eq!(chunks.len(), 2);
        assert_chunk!(chunks[0], "First paragraph.", 3, 3);
        assert_chunk!(chunks[1], "Second paragraph.", 6, 6);
    }

    // Without any blank line the whole input is one paragraph.
    #[test]
    fn test_no_empty_lines() {
        let chunks = chunk_by_paragraphs("Single line one.\nSingle line two.");
        assert_eq!(chunks.len(), 1);
        assert_chunk!(chunks[0], "Single line one.\nSingle line two.", 1, 2);
    }

    #[test]
    fn test_empty_content() {
        assert!(chunk_by_paragraphs("").is_empty());
    }

    #[test]
    fn test_only_empty_lines() {
        assert!(chunk_by_paragraphs("\n\n\n").is_empty());
    }

    #[test]
    fn test_single_line_content() {
        let chunks = chunk_by_paragraphs("Just one line.");
        assert_eq!(chunks.len(), 1);
        assert_chunk!(chunks[0], "Just one line.", 1, 1);
    }
}
// Unit tests for `chunk_by_lines`.
#[cfg(test)]
mod line_chunking_tests {
    use super::*;

    // Checks one chunk's text and its inclusive, 1-based line range.
    macro_rules! assert_chunk {
        ($chunk:expr, $text:expr, $start:expr, $end:expr) => {{
            assert_eq!($chunk.text, $text);
            assert_eq!($chunk.start_line, $start);
            assert_eq!($chunk.end_line, $end);
        }};
    }

    // Windows of three lines that advance by two lines each step.
    #[test]
    fn test_lines_simple() {
        let chunks = chunk_by_lines("Line 1\nLine 2\nLine 3\nLine 4\nLine 5", 3, 1);
        assert_eq!(chunks.len(), 3, "Expected 3 chunks for this input and parameters");
        assert_chunk!(chunks[0], "Line 1\nLine 2\nLine 3", 1, 3);
        assert_chunk!(chunks[1], "Line 3\nLine 4\nLine 5", 3, 5);
        assert_chunk!(chunks[2], "Line 5", 5, 5);
    }

    // With no overlap the windows tile the input exactly.
    #[test]
    fn test_lines_no_overlap() {
        let chunks = chunk_by_lines("L1\nL2\nL3\nL4\nL5\nL6", 2, 0);
        assert_eq!(chunks.len(), 3);
        assert_chunk!(chunks[0], "L1\nL2", 1, 2);
        assert_chunk!(chunks[1], "L3\nL4", 3, 4);
        assert_chunk!(chunks[2], "L5\nL6", 5, 6);
    }

    // A window larger than the input yields the whole input once.
    #[test]
    fn test_lines_larger_than_content() {
        let chunks = chunk_by_lines("Line 1\nLine 2", 10, 2);
        assert_eq!(chunks.len(), 1);
        assert_chunk!(chunks[0], "Line 1\nLine 2", 1, 2);
    }

    // Blank lines are kept inside a window and count towards its size.
    #[test]
    fn test_lines_with_empty_lines() {
        let chunks = chunk_by_lines("Line 1\n\nLine 3\nLine 4\n\nLine 6", 3, 1);
        assert_eq!(chunks.len(), 3);
        assert_chunk!(chunks[0], "Line 1\n\nLine 3", 1, 3);
        assert_chunk!(chunks[1], "Line 3\nLine 4\n", 3, 5);
        assert_chunk!(chunks[2], "\nLine 6", 5, 6);
    }

    #[test]
    fn test_lines_empty_content() {
        assert!(chunk_by_lines("", 10, 2).is_empty());
    }

    #[test]
    fn test_lines_whitespace_only_content() {
        let chunks = chunk_by_lines(" \n\t\n ", 2, 1);
        assert!(chunks.is_empty(), "Chunking whitespace-only content should result in empty vec");
    }

    #[test]
    fn test_lines_zero_chunk_size() {
        assert!(chunk_by_lines("Line 1", 0, 0).is_empty());
    }

    // An overlap at least as large as the window leaves no room to advance.
    #[test]
    fn test_lines_zero_step_size() {
        let content = "Line 1\nLine 2";
        assert!(chunk_by_lines(content, 2, 2).is_empty());
        assert!(chunk_by_lines(content, 2, 3).is_empty());
    }
}
// Unit tests for `chunk_by_indentation`.
//
// The fixtures are raw string literals and are whitespace-sensitive: the
// asserted line numbers count every line of the literal (including the empty
// first line after `r#"` and any blank lines), and chunk detection depends on
// whether a line has leading whitespace.
// NOTE(review): several expected line numbers below only line up if the
// literals contain blank lines and indentation; confirm the fixtures'
// whitespace is intact before trusting a failure here.
#[cfg(test)]
mod indentation_chunking_tests {
use super::*;
// Two Rust items at column 0 are expected to become two chunks.
// NOTE(review): `struct Point` is asserted to start on line 6, which presumes
// a blank line between the two items in the fixture - verify.
#[test]
fn test_indentation_simple_rust() {
let content = r#"
fn main() {
println!("Hello");
}
struct Point {
x: i32,
y: i32,
}
"#;
let chunks = chunk_by_indentation(content);
assert_eq!(chunks.len(), 2);
assert!(chunks[0].text.contains("fn main()"));
assert_eq!(chunks[0].start_line, 2);
assert_eq!(chunks[0].end_line, 4);
assert!(chunks[1].text.contains("struct Point"));
assert_eq!(chunks[1].start_line, 6);
assert_eq!(chunks[1].end_line, 9);
}
// Leading comments are skipped, and a definition that is indented does not
// open a new chunk, so a single chunk is expected.
// NOTE(review): the fixture's own inline text says the `pub fn second_func`
// line has indentation > 0, and the asserted range is 5..=12; both presume
// leading whitespace and blank lines inside the literal - verify.
#[test]
fn test_indentation_leading_whitespace_and_comments() {
let content = r#"
// A comment
fn first_func() {
// ...
}
// Another comment
pub fn second_func() { // This line has indentation > 0
// ...
}
"#;
let chunks = chunk_by_indentation(content);
assert_eq!(chunks.len(), 1);
assert!(chunks[0].text.contains("fn first_func()"));
assert_eq!(chunks[0].start_line, 5);
assert_eq!(chunks[0].end_line, 12);
}
// Python-style input: a class, a function and an async function are expected
// to become three chunks.
// NOTE(review): three chunks are only possible if `def __init__` is indented
// in the fixture, and the asserted line numbers presume blank lines between
// the definitions - verify.
#[test]
fn test_indentation_python_mixed() {
let content = r#"
class MyClass:
def __init__(self):
pass
def top_level_func():
print("hello")
# Indented comment...
async def another_func(): # Line 11
pass # Line 12
"#;
let chunks = chunk_by_indentation(content);
assert_eq!(chunks.len(), 3);
assert!(chunks[0].text.contains("class MyClass"));
assert_eq!(chunks[0].start_line, 2);
assert_eq!(chunks[0].end_line, 4);
assert!(chunks[1].text.contains("def top_level_func()"));
assert_eq!(chunks[1].start_line, 6);
assert_eq!(chunks[1].end_line, 9);
assert!(chunks[2].text.contains("async def another_func()"));
assert_eq!(chunks[2].start_line, 11);
assert_eq!(chunks[2].end_line, 12);
}
// JavaScript-style input: `export ` and `class ` lines each open a chunk.
// NOTE(review): the asserted line numbers (4 and 8) presume blank lines
// between the definitions in the fixture - verify.
#[test]
fn test_indentation_javascript_export() {
let content = r#"
export const foo = 1;
export function bar() {
return 2;
}
class Baz {
method() {}
}
"#;
let chunks = chunk_by_indentation(content);
assert_eq!(chunks.len(), 3);
assert!(chunks[0].text.contains("export const foo"));
assert_eq!(chunks[0].start_line, 2);
assert_eq!(chunks[0].end_line, 2);
assert!(chunks[1].text.contains("export function bar"));
assert_eq!(chunks[1].start_line, 4);
assert_eq!(chunks[1].end_line, 6);
assert!(chunks[2].text.contains("class Baz"));
assert_eq!(chunks[2].start_line, 8);
assert_eq!(chunks[2].end_line, 10);
}
// Lines that match none of the definition prefixes never open a chunk.
#[test]
fn test_indentation_no_top_level() {
let content = r#"
line 1
line 2
line 3
"#;
let chunks = chunk_by_indentation(content);
assert_eq!(chunks.len(), 0);
}
// Empty input produces no chunks.
#[test]
fn test_indentation_empty_content() {
let content = "";
let chunks = chunk_by_indentation(content);
assert!(chunks.is_empty());
}
// Input made only of whitespace produces no chunks.
#[test]
fn test_indentation_whitespace_only_content() {
let content = " \n \t\n";
let chunks = chunk_by_indentation(content);
assert!(chunks.is_empty());
}
// A single definition on one line yields one chunk covering just that line.
#[test]
fn test_indentation_single_top_level() {
let content = "fn main() {}\n";
let chunks = chunk_by_indentation(content);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].start_line, 1);
assert_eq!(chunks[0].end_line, 1);
}
// Comment lines alone never open a chunk, so nothing is produced.
#[test]
fn test_indentation_only_comments_or_empty() {
let content = r#"
// comment 1
// comment 2
"#;
let chunks = chunk_by_indentation(content);
assert!(chunks.is_empty());
}
}
// Additional edge-case tests for `chunk_by_lines`.
#[cfg(test)]
mod line_chunking_tests_extended {
    use super::*;

    // Checks one chunk's text and its inclusive, 1-based line range.
    macro_rules! assert_chunk {
        ($chunk:expr, $text:expr, $start:expr, $end:expr) => {{
            assert_eq!($chunk.text, $text);
            assert_eq!($chunk.start_line, $start);
            assert_eq!($chunk.end_line, $end);
        }};
    }

    // Six lines, window of three, step of two: the last window is shorter.
    #[test]
    fn test_lines_exact_multiple() {
        let chunks = chunk_by_lines("1\n2\n3\n4\n5\n6", 3, 1);
        assert_eq!(chunks.len(), 3);
        assert_chunk!(chunks[0], "1\n2\n3", 1, 3);
        assert_chunk!(chunks[1], "3\n4\n5", 3, 5);
        assert_chunk!(chunks[2], "5\n6", 5, 6);
    }

    #[test]
    fn test_lines_overlap_equals_chunk_size() {
        assert!(chunk_by_lines("a\nb\nc", 2, 2).is_empty());
    }

    #[test]
    fn test_lines_overlap_greater_than_chunk_size() {
        assert!(chunk_by_lines("a\nb\nc", 2, 3).is_empty());
    }

    // Windows that contain nothing but whitespace are dropped.
    #[test]
    fn test_lines_only_whitespace_lines() {
        assert!(chunk_by_lines(" \n\t \n \n ", 2, 1).is_empty());
    }

    // Whitespace-only lines inside a window are preserved verbatim.
    #[test]
    fn test_lines_mixed_whitespace_and_content() {
        let chunks = chunk_by_lines("line 1\n \nline 3\n\t\nline 5", 3, 1);
        assert_eq!(chunks.len(), 3);
        assert_chunk!(chunks[0], "line 1\n \nline 3", 1, 3);
        assert_chunk!(chunks[1], "line 3\n\t\nline 5", 3, 5);
        assert_chunk!(chunks[2], "line 5", 5, 5);
    }

    // A window of one line turns every line into its own chunk.
    #[test]
    fn test_lines_chunk_size_one() {
        let chunks = chunk_by_lines("a\nb\nc", 1, 0);
        assert_eq!(chunks.len(), 3);
        assert_chunk!(chunks[0], "a", 1, 1);
        assert_chunk!(chunks[1], "b", 2, 2);
        assert_chunk!(chunks[2], "c", 3, 3);
    }

    // An overlap equal to a one-line window cannot advance; zero overlap can.
    #[test]
    fn test_lines_chunk_size_one_with_overlap() {
        let content = "a\nb\nc";
        assert!(chunk_by_lines(content, 1, 1).is_empty());
        let chunks_overlap_0 = chunk_by_lines(content, 1, 0);
        assert_eq!(chunks_overlap_0.len(), 3);
    }
}