Skip to main content

ripvec_core/
chunk.rs

//! Tree-sitter based code chunking with sliding-window fallback.
//!
//! Parses source files into ASTs and extracts semantic chunks at
//! function, class, and method boundaries. Files without recognized
//! semantic structure, and individual definitions larger than the
//! configured maximum chunk size, are split into overlapping sliding
//! windows so that embedding inputs stay uniformly sized.
7
8use std::path::Path;
9use streaming_iterator::StreamingIterator;
10use tree_sitter::{Parser, QueryCursor};
11
/// Tunable size limits for the chunking pipeline.
///
/// Every field is measured in bytes. Hand a reference to [`chunk_file`]
/// or [`chunk_text`] to adjust chunk sizing at runtime instead of
/// recompiling.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Largest chunk, in bytes, that is kept in one piece.
    /// Anything longer is cut into windows, even when tree-sitter reported
    /// it as a single definition (for example a 500-line function).
    pub max_chunk_bytes: usize,
    /// Desired size, in bytes, of each window produced by the
    /// sliding-window chunker. The original sizing note estimates ~2KB of
    /// source at roughly 128-256 BPE tokens, comfortably under the
    /// 512-token model limit.
    pub window_size: usize,
    /// Number of bytes shared by two neighbouring windows, so that a
    /// definition straddling a window edge still appears whole in at least
    /// one of them. The default is one quarter of the default `window_size`.
    pub window_overlap: usize,
}

impl Default for ChunkConfig {
    /// Returns the stock sizing: 4096-byte chunks split into 2048-byte
    /// windows that overlap by 512 bytes.
    fn default() -> Self {
        const DEFAULT_WINDOW: usize = 2048;
        ChunkConfig {
            max_chunk_bytes: 2 * DEFAULT_WINDOW,
            window_size: DEFAULT_WINDOW,
            window_overlap: DEFAULT_WINDOW / 4,
        }
    }
}
41
42/// A semantic chunk extracted from a source file.
43#[derive(
44    Debug,
45    Clone,
46    rkyv::Archive,
47    rkyv::Serialize,
48    rkyv::Deserialize,
49    bitcode::Encode,
50    bitcode::Decode,
51)]
52pub struct CodeChunk {
53    /// Path to the source file.
54    pub file_path: String,
55    /// Name of the definition (function name, class name, etc.).
56    pub name: String,
57    /// Kind of syntax node (e.g., `function_item`, `class_definition`).
58    pub kind: String,
59    /// 1-based start line number.
60    pub start_line: usize,
61    /// 1-based end line number.
62    pub end_line: usize,
63    /// Source text of the chunk (raw code for display).
64    pub content: String,
65    /// Enriched content with scope chain and signature metadata for embedding.
66    /// Falls back to `content` if metadata would exceed chunk size limits.
67    pub enriched_content: String,
68}
69
70/// Walk up the AST parent chain collecting structural container names.
71///
72/// Produces a scope chain like `"impl_item Foo > fn forward"` by
73/// identifying structural containers (impl blocks, classes, modules, namespaces)
74/// and extracting their names. Tries the `name` field first, then `type`
75/// (for Rust `impl_item` which uses `type` instead of `name`).
76#[must_use]
77pub fn build_scope_chain(node: tree_sitter::Node<'_>, source: &str) -> String {
78    /// Node kinds that represent structural containers, by language.
79    const CONTAINER_KINDS: &[&str] = &[
80        // Rust
81        "impl_item",
82        "trait_item",
83        "mod_item",
84        // Python
85        "class_definition",
86        "module",
87        // JS/TS
88        "class_declaration",
89        // Java
90        // "class_declaration" already covered above
91        // Go
92        "type_declaration",
93        // C++
94        "namespace_definition",
95        "class_specifier",
96    ];
97
98    /// Field names to try when extracting the container name.
99    /// `impl_item` uses `type` instead of `name`; Go `type_declaration`
100    /// has no fields, so we fall back to the node kind.
101    const NAME_FIELDS: &[&str] = &["name", "type"];
102
103    let mut parts = Vec::new();
104    let mut current = node.parent();
105    while let Some(parent) = current {
106        let kind = parent.kind();
107        if CONTAINER_KINDS.contains(&kind) {
108            let name = NAME_FIELDS
109                .iter()
110                .find_map(|field| parent.child_by_field_name(field))
111                .map_or(kind, |n| &source[n.start_byte()..n.end_byte()]);
112            parts.push(format!("{kind} {name}"));
113        }
114        current = parent.parent();
115    }
116    parts.reverse();
117    parts.join(" > ")
118}
119
120/// Extract the function/method signature from a definition node.
121///
122/// Returns the text from the function name to the start of the body,
123/// which captures the parameter list and return type (if any).
124/// Returns `None` if the node has no `name` or `body`/`block` field.
125#[must_use]
126pub fn extract_signature(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
127    let name_node = node.child_by_field_name("name")?;
128    let body_node = node
129        .child_by_field_name("body")
130        .or_else(|| node.child_by_field_name("block"))?;
131    let start = name_node.start_byte();
132    let end = body_node.start_byte();
133    if start >= end {
134        return None;
135    }
136    let sig = source[start..end].trim();
137    if sig.is_empty() {
138        None
139    } else {
140        Some(sig.to_string())
141    }
142}
143
/// Reduce indentation waste for embedding by normalizing whitespace.
///
/// For each non-blank line:
/// - The leading run of spaces and tabs is measured as an indent width,
///   counting a space as 1 and a tab as 2. That width is halved, rounding
///   up, and emitted as spaces (4 spaces → 2, 8 spaces → 4, 3 spaces → 2,
///   1 tab → 1 space).
/// - Trailing whitespace is stripped.
///
/// Every run of consecutive blank (empty or whitespace-only) lines is
/// collapsed to a single empty line. Line endings are emitted as `\n`, and
/// the result ends with a newline only if `source` did.
///
/// This lowers the number of whitespace tokens spent inside the embedding
/// window while keeping relative nesting visible.
#[must_use]
pub fn minify_whitespace(source: &str) -> String {
    let mut result = String::with_capacity(source.len());
    let mut in_blank_run = false;

    for line in source.lines() {
        let rest = line.trim_start();

        if rest.is_empty() {
            // Emit only the first blank line of a run; suppress the rest.
            if !in_blank_run {
                result.push('\n');
                in_blank_run = true;
            }
            continue;
        }
        in_blank_run = false;

        // Indent width of the leading space/tab run (tab counts as 2).
        let width: usize = line
            .chars()
            .take_while(|c| *c == ' ' || *c == '\t')
            .map(|c| if c == '\t' { 2 } else { 1 })
            .sum();

        // Halve the indent, rounding up so a single space survives.
        result.extend(std::iter::repeat(' ').take(width.div_ceil(2)));
        result.push_str(rest.trim_end());
        result.push('\n');
    }

    // The loop terminates every emitted line with '\n'; drop the final one
    // when the input itself was not newline-terminated.
    if !source.ends_with('\n') && result.ends_with('\n') {
        result.pop();
    }

    result
}
194
195/// Build the enriched content header for a code chunk.
196///
197/// Prepends scope chain and signature metadata as a comment line.
198/// If the header + content would exceed `max_bytes`, returns `content` unchanged.
199fn build_enriched_content(
200    path: &Path,
201    node: tree_sitter::Node<'_>,
202    source: &str,
203    content: &str,
204    max_bytes: usize,
205) -> String {
206    let scope = build_scope_chain(node, source);
207    let sig = extract_signature(node, source).unwrap_or_default();
208    let rel_path = path.display().to_string();
209
210    let header = if scope.is_empty() && sig.is_empty() {
211        format!("// {rel_path}\n")
212    } else if scope.is_empty() {
213        format!("// {rel_path} | defines: {sig}\n")
214    } else if sig.is_empty() {
215        format!("// {rel_path} | {scope}\n")
216    } else {
217        format!("// {rel_path} | {scope} | defines: {sig}\n")
218    };
219
220    // Minify whitespace for the embedding content to reduce token waste.
221    // The raw `content` field is kept as-is for display.
222    let minified = minify_whitespace(content);
223
224    if header.len() + minified.len() > max_bytes {
225        minified
226    } else {
227        format!("{header}{minified}")
228    }
229}
230
231/// Extract semantic chunks from a source file.
232///
233/// Uses tree-sitter to parse the file and extract definitions matching
234/// the language's query patterns. For files with no semantic matches,
235/// falls back to overlapping sliding windows. Large individual chunks
236/// are also split into windows.
237///
238/// Pass a [`ChunkConfig`] to control chunk sizing at runtime.
239#[must_use]
240pub fn chunk_file(
241    path: &Path,
242    source: &str,
243    config: &crate::languages::LangConfig,
244    chunk_config: &ChunkConfig,
245) -> Vec<CodeChunk> {
246    let mut parser = Parser::new();
247    if parser.set_language(&config.language).is_err() {
248        return sliding_windows(path, source, chunk_config);
249    }
250
251    let Some(tree) = parser.parse(source, None) else {
252        return sliding_windows(path, source, chunk_config);
253    };
254
255    let mut cursor = QueryCursor::new();
256    let mut chunks = Vec::new();
257    let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
258
259    while let Some(m) = matches.next() {
260        let mut name = String::new();
261        let mut def_node = None;
262        for cap in m.captures {
263            let cap_name = &config.query.capture_names()[cap.index as usize];
264            if *cap_name == "name" {
265                name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
266            } else if *cap_name == "def" {
267                def_node = Some(cap.node);
268            }
269        }
270        if let Some(node) = def_node {
271            let content = &source[node.start_byte()..node.end_byte()];
272            let start_line = node.start_position().row + 1;
273
274            // Split oversized chunks into windows
275            if content.len() > chunk_config.max_chunk_bytes {
276                chunks.extend(sliding_windows_with_name(
277                    path,
278                    content,
279                    &name,
280                    start_line,
281                    chunk_config,
282                ));
283            } else {
284                let enriched = build_enriched_content(
285                    path,
286                    node,
287                    source,
288                    content,
289                    chunk_config.max_chunk_bytes,
290                );
291                chunks.push(CodeChunk {
292                    file_path: path.display().to_string(),
293                    name,
294                    kind: node.kind().to_string(),
295                    start_line,
296                    end_line: node.end_position().row + 1,
297                    enriched_content: enriched,
298                    content: content.to_string(),
299                });
300            }
301        }
302    }
303
304    // Fallback: sliding windows if no semantic matches
305    if chunks.is_empty() && !source.trim().is_empty() {
306        return sliding_windows(path, source, chunk_config);
307    }
308
309    chunks
310}
311
312/// Split source text into overlapping sliding windows.
313///
314/// Each window is `chunk_config.window_size` bytes with `chunk_config.window_overlap` bytes of
315/// overlap. Window boundaries are adjusted to line breaks to avoid
316/// splitting mid-line.
317///
318/// This is used as the fallback for files without tree-sitter support
319/// (plain text, unknown extensions) and for large semantic chunks that
320/// exceed `max_chunk_bytes`.
321#[must_use]
322pub fn chunk_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
323    sliding_windows(path, source, chunk_config)
324}
325
326/// Internal sliding-window implementation.
327fn sliding_windows(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
328    if source.trim().is_empty() {
329        return vec![];
330    }
331
332    // Small enough for a single chunk
333    if source.len() <= chunk_config.max_chunk_bytes {
334        let content = source.to_string();
335        return vec![CodeChunk {
336            file_path: path.display().to_string(),
337            name: path
338                .file_name()
339                .unwrap_or_default()
340                .to_string_lossy()
341                .to_string(),
342            kind: "file".to_string(),
343            start_line: 1,
344            end_line: source.lines().count(),
345            enriched_content: content.clone(),
346            content,
347        }];
348    }
349
350    let file_name = path
351        .file_name()
352        .unwrap_or_default()
353        .to_string_lossy()
354        .to_string();
355    sliding_window_chunks(source, path, &file_name, 1, chunk_config)
356}
357
358/// Split a named definition into overlapping windows.
359///
360/// Used when a single tree-sitter match (e.g., a large function) exceeds
361/// `chunk_config.max_chunk_bytes`. Windows carry the definition name for search context.
362fn sliding_windows_with_name(
363    path: &Path,
364    content: &str,
365    name: &str,
366    base_line: usize,
367    chunk_config: &ChunkConfig,
368) -> Vec<CodeChunk> {
369    sliding_window_chunks(content, path, name, base_line, chunk_config)
370}
371
372/// Shared sliding-window loop used by both [`sliding_windows`] and
373/// [`sliding_windows_with_name`].
374///
375/// Splits `source` into overlapping windows of `chunk_config.window_size` bytes,
376/// snapping boundaries to line breaks. Each chunk is tagged with `name_prefix`
377/// and an index suffix (e.g., `"main[0]"`, `"main[1]"`).
378fn sliding_window_chunks(
379    source: &str,
380    file_path: &Path,
381    name_prefix: &str,
382    base_line: usize,
383    chunk_config: &ChunkConfig,
384) -> Vec<CodeChunk> {
385    let step = chunk_config
386        .window_size
387        .saturating_sub(chunk_config.window_overlap)
388        .max(1);
389    let bytes = source.as_bytes();
390    let mut chunks = Vec::new();
391    let mut offset = 0;
392    let mut window_idx = 0;
393
394    while offset < bytes.len() {
395        let raw_end = (offset + chunk_config.window_size).min(bytes.len());
396
397        // Snap end to a line boundary (don't split mid-line)
398        let end = if raw_end < bytes.len() {
399            match bytes[offset..raw_end].iter().rposition(|&b| b == b'\n') {
400                Some(pos) => offset + pos + 1,
401                None => raw_end, // no newline found, use raw end
402            }
403        } else {
404            raw_end
405        };
406
407        // Extract window as str (skip invalid UTF-8)
408        if let Ok(window) = std::str::from_utf8(&bytes[offset..end])
409            && !window.trim().is_empty()
410        {
411            let start_line = base_line + source[..offset].matches('\n').count();
412            let content_lines = window.lines().count().max(1);
413            let end_line = start_line + content_lines - 1;
414            let content = window.to_string();
415            chunks.push(CodeChunk {
416                file_path: file_path.display().to_string(),
417                name: format!("{name_prefix}[{window_idx}]"),
418                kind: "window".to_string(),
419                start_line,
420                end_line,
421                enriched_content: content.clone(),
422                content,
423            });
424            window_idx += 1;
425        }
426
427        offset += step;
428    }
429
430    chunks
431}
432
/// Unit tests for the chunker: tree-sitter extraction, the sliding-window
/// fallback, header enrichment, and whitespace minification.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fmt::Write as _;
    use std::path::Path;

    #[test]
    fn chunks_rust_functions_and_structs() {
        let source = "fn hello() { println!(\"hi\"); }\nfn world() {}\nstruct Foo { x: i32 }";
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("test.rs"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {}",
            chunks.len()
        );
        assert!(chunks.iter().any(|c| c.name == "hello"));
        assert!(chunks.iter().any(|c| c.name == "world"));
    }

    #[test]
    fn chunks_python_functions_and_classes() {
        let source = "def greet(name):\n    pass\n\nclass Foo:\n    pass\n";
        let config = crate::languages::config_for_extension("py").unwrap();
        let chunks = chunk_file(
            Path::new("test.py"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.name == "greet"));
        assert!(chunks.iter().any(|c| c.name == "Foo"));
    }

    #[test]
    fn fallback_small_file_single_chunk() {
        // No definitions to match, and small enough to stay in one piece.
        let source = "let x = 42;\nconsole.log(x);\n";
        let config = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(
            Path::new("script.js"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].kind, "file");
    }

    #[test]
    fn fallback_large_file_produces_windows() {
        // Create a file larger than default max_chunk_bytes with no function declarations
        let line = "console.log('hello world, this is a long line of javascript code');\n";
        let source: String = line.repeat(200); // ~13KB
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(Path::new("big.js"), &source, &config, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected multiple windows, got {}",
            chunks.len()
        );
        assert!(chunks.iter().all(|c| c.kind == "window"));
        assert!(chunks[0].name.contains("[0]"));
    }

    #[test]
    fn large_definition_is_windowed() {
        // A Rust function larger than default max_chunk_bytes
        let mut source = String::from("fn big_function() {\n");
        for i in 0..200 {
            writeln!(source, "    let var_{i} = {i} * 2 + 1; // some computation").unwrap();
        }
        source.push_str("}\n");
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("test.rs"), &source, &config, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected windowed chunks, got {}",
            chunks.len()
        );
        assert!(chunks[0].name.starts_with("big_function["));
    }

    #[test]
    fn empty_file_produces_no_chunks() {
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("empty.rs"), "", &config, &ChunkConfig::default());
        assert!(chunks.is_empty());
    }

    // --- T1 enrichment tests ---

    /// Helper: parse `source` with the tree-sitter language registered for
    /// `ext` and return the syntax tree together with that language's
    /// config, so the caller can run `config.query` over the tree.
    fn parse_source(
        source: &str,
        ext: &str,
    ) -> (
        tree_sitter::Tree,
        std::sync::Arc<crate::languages::LangConfig>,
    ) {
        let config = crate::languages::config_for_extension(ext).unwrap();
        let mut parser = Parser::new();
        parser.set_language(&config.language).unwrap();
        let tree = parser.parse(source, None).unwrap();
        (tree, config)
    }

    #[test]
    fn scope_chain_rust_impl_method() {
        let source = "impl Foo {\n    fn bar(&self) {}\n}";
        let (tree, config) = parse_source(source, "rs");
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        // Keep the last `@def` capture seen across all matches.
        let mut def_node = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" {
                    def_node = Some(cap.node);
                }
            }
        }
        let node = def_node.expect("should find a @def node");
        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("impl_item"),
            "scope should contain impl_item, got: {scope}"
        );
        assert!(
            scope.contains("Foo"),
            "scope should contain 'Foo', got: {scope}"
        );
    }

    #[test]
    fn scope_chain_python_class_method() {
        let source = "class Greeter:\n    def say_hello(self):\n        pass\n";
        let (tree, config) = parse_source(source, "py");
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        // Find the function_definition @def (say_hello), not the class @def
        let mut fn_node = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" && cap.node.kind() == "function_definition" {
                    fn_node = Some(cap.node);
                }
            }
        }
        let node = fn_node.expect("should find say_hello @def node");
        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("class_definition"),
            "scope should contain class_definition, got: {scope}"
        );
        assert!(
            scope.contains("Greeter"),
            "scope should contain 'Greeter', got: {scope}"
        );
    }

    #[test]
    fn extract_signature_rust_function() {
        let source = "fn greet(name: &str) -> String { name.to_string() }";
        let (tree, config) = parse_source(source, "rs");
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        // Keep the last `@def` capture seen across all matches.
        let mut def_node = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" {
                    def_node = Some(cap.node);
                }
            }
        }
        let node = def_node.expect("should find @def node");
        let sig = extract_signature(node, source).expect("should extract signature");
        assert!(
            sig.contains("greet"),
            "signature should contain 'greet', got: {sig}"
        );
        assert!(
            sig.contains("name: &str"),
            "signature should contain parameter, got: {sig}"
        );
        assert!(
            sig.contains("-> String"),
            "signature should contain return type, got: {sig}"
        );
    }

    #[test]
    fn enriched_content_has_header() {
        let source = "fn hello() { println!(\"hi\"); }";
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("src/main.rs"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            chunk.enriched_content.starts_with("//"),
            "enriched_content should start with '//' header, got: {}",
            &chunk.enriched_content[..chunk.enriched_content.len().min(80)]
        );
        assert!(
            chunk.enriched_content.contains("src/main.rs"),
            "enriched_content should contain file path"
        );
        // Raw content should NOT have the header
        assert!(
            !chunk.content.starts_with("//"),
            "raw content should not start with header"
        );
    }

    #[test]
    fn sliding_window_enriched_equals_content() {
        let source = "let x = 42;\nconsole.log(x);\n";
        let chunks = chunk_text(Path::new("test.txt"), source, &ChunkConfig::default());
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert_eq!(
                chunk.enriched_content, chunk.content,
                "sliding window chunks should have enriched_content == content"
            );
        }
    }

    #[test]
    fn header_dropped_when_exceeding_max_bytes() {
        // Use a limit that the definition itself fits under, but that
        // header + definition together exceed.
        let tiny_config = ChunkConfig {
            max_chunk_bytes: 60,
            window_size: 30,
            window_overlap: 10,
        };
        // The source alone is within max_chunk_bytes, so it is not windowed.
        let source = "fn f() { let x = 42; return x; }";
        assert!(source.len() <= tiny_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("long/path/to/file.rs"),
            source,
            &config,
            &tiny_config,
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        // Header ("// long/path/to/file.rs | defines: ...") + minified content > 60 bytes.
        // So enriched_content should fall back to minified content (no header),
        // and raw content is preserved as-is.
        assert!(
            !chunk.enriched_content.starts_with("//"),
            "header should be dropped when it would exceed max_chunk_bytes"
        );
        assert_eq!(chunk.content, source, "raw content should be unchanged");
    }

    #[test]
    fn minify_whitespace_normalizes_indent_and_strips_trailing() {
        // 8-space indent → 4-space (halved)
        let source = "fn foo() {\n        let x = 1;\n        let y = 2;\n}\n";
        let result = minify_whitespace(source);
        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(
            lines[1], "    let x = 1;",
            "8-space indent should become 4-space"
        );
        assert_eq!(
            lines[2], "    let y = 2;",
            "8-space indent should become 4-space"
        );

        // Trailing whitespace removed
        let with_trailing = "fn bar()   \n    return 1;   \n";
        let result2 = minify_whitespace(with_trailing);
        assert!(
            result2.lines().all(|l| !l.ends_with(' ')),
            "trailing whitespace should be stripped"
        );

        // A run of consecutive blank lines collapses to exactly one
        let with_blanks = "a\n\n\n\nb\n";
        let result3 = minify_whitespace(with_blanks);
        assert_eq!(
            result3, "a\n\nb\n",
            "blank-line run should collapse to a single blank line"
        );
    }
}