Skip to main content

ripvec_core/
chunk.rs

//! Tree-sitter based code chunking with sliding-window fallback.
//!
//! Parses source files into ASTs and extracts semantic chunks at
//! function, class, and method boundaries. Files without recognized
//! semantic structure, and individual definitions that exceed the configured
//! maximum chunk size, are split into overlapping sliding windows instead.
7
8use std::path::Path;
9use streaming_iterator::StreamingIterator;
10use tree_sitter::{Parser, QueryCursor};
11
/// Size limits that steer the chunking pipeline at runtime.
///
/// Every field is measured in bytes. Hand an instance to [`chunk_file`]
/// (or [`chunk_text`]) to tune chunk sizes without rebuilding.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Largest chunk, in bytes, that is kept whole.
    ///
    /// Anything longer is cut into windows, even when tree-sitter reported
    /// it as one definition (for example a 500-line function).
    pub max_chunk_bytes: usize,
    /// Target size in bytes of each window produced by the sliding-window
    /// chunker.
    ///
    /// Roughly 2KB of source text is about 128-256 tokens after BPE, which
    /// stays well inside a 512-token model limit.
    pub window_size: usize,
    /// Number of bytes shared by two neighbouring windows.
    ///
    /// The overlap lets a definition that straddles a window edge show up
    /// intact in at least one window. The default is 25% of `window_size`.
    pub window_overlap: usize,
}
31
32impl Default for ChunkConfig {
33    fn default() -> Self {
34        Self {
35            max_chunk_bytes: 4096,
36            window_size: 2048,
37            window_overlap: 512,
38        }
39    }
40}
41
42/// A semantic chunk extracted from a source file.
43#[derive(Debug, Clone, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
44pub struct CodeChunk {
45    /// Path to the source file.
46    pub file_path: String,
47    /// Name of the definition (function name, class name, etc.).
48    pub name: String,
49    /// Kind of syntax node (e.g., `function_item`, `class_definition`).
50    pub kind: String,
51    /// 1-based start line number.
52    pub start_line: usize,
53    /// 1-based end line number.
54    pub end_line: usize,
55    /// Source text of the chunk (raw code for display).
56    pub content: String,
57    /// Enriched content with scope chain and signature metadata for embedding.
58    /// Falls back to `content` if metadata would exceed chunk size limits.
59    pub enriched_content: String,
60}
61
62/// Walk up the AST parent chain collecting structural container names.
63///
64/// Produces a scope chain like `"impl_item Foo > fn forward"` by
65/// identifying structural containers (impl blocks, classes, modules, namespaces)
66/// and extracting their names. Tries the `name` field first, then `type`
67/// (for Rust `impl_item` which uses `type` instead of `name`).
68#[must_use]
69pub fn build_scope_chain(node: tree_sitter::Node<'_>, source: &str) -> String {
70    /// Node kinds that represent structural containers, by language.
71    const CONTAINER_KINDS: &[&str] = &[
72        // Rust
73        "impl_item",
74        "trait_item",
75        "mod_item",
76        // Python
77        "class_definition",
78        "module",
79        // JS/TS
80        "class_declaration",
81        // Java
82        // "class_declaration" already covered above
83        // Go
84        "type_declaration",
85        // C++
86        "namespace_definition",
87        "class_specifier",
88    ];
89
90    /// Field names to try when extracting the container name.
91    /// `impl_item` uses `type` instead of `name`; Go `type_declaration`
92    /// has no fields, so we fall back to the node kind.
93    const NAME_FIELDS: &[&str] = &["name", "type"];
94
95    let mut parts = Vec::new();
96    let mut current = node.parent();
97    while let Some(parent) = current {
98        let kind = parent.kind();
99        if CONTAINER_KINDS.contains(&kind) {
100            let name = NAME_FIELDS
101                .iter()
102                .find_map(|field| parent.child_by_field_name(field))
103                .map_or(kind, |n| &source[n.start_byte()..n.end_byte()]);
104            parts.push(format!("{kind} {name}"));
105        }
106        current = parent.parent();
107    }
108    parts.reverse();
109    parts.join(" > ")
110}
111
112/// Extract the function/method signature from a definition node.
113///
114/// Returns the text from the function name to the start of the body,
115/// which captures the parameter list and return type (if any).
116/// Returns `None` if the node has no `name` or `body`/`block` field.
117#[must_use]
118pub fn extract_signature(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
119    let name_node = node.child_by_field_name("name")?;
120    let body_node = node
121        .child_by_field_name("body")
122        .or_else(|| node.child_by_field_name("block"))?;
123    let start = name_node.start_byte();
124    let end = body_node.start_byte();
125    if start >= end {
126        return None;
127    }
128    let sig = source[start..end].trim();
129    if sig.is_empty() {
130        None
131    } else {
132        Some(sig.to_string())
133    }
134}
135
/// Shrink whitespace so source text spends fewer tokens in the embedding
/// window.
///
/// Per line:
/// - The leading run of spaces and tabs is measured (a space counts 1, a
///   tab counts 2) and replaced by half that many spaces, rounded up:
///   4 spaces → 2, 8 spaces → 4, 3 spaces → 2, 1 tab → 1 space.
/// - Trailing whitespace is removed.
///
/// Across lines, every run of consecutive blank (whitespace-only) lines is
/// reduced to a single empty line. Line endings are emitted as `\n`. A final
/// newline is kept only when `source` itself ends with one.
#[must_use]
pub fn minify_whitespace(source: &str) -> String {
    let mut out = String::with_capacity(source.len());
    let mut in_blank_run = false;

    for raw in source.lines() {
        let body = raw.trim_start();

        if body.is_empty() {
            // Only the first line of a blank run produces output.
            if !in_blank_run {
                out.push('\n');
                in_blank_run = true;
            }
            continue;
        }
        in_blank_run = false;

        // Width of the leading space/tab run, with tabs weighted as two.
        let width: usize = raw
            .chars()
            .map_while(|ch| match ch {
                ' ' => Some(1),
                '\t' => Some(2),
                _ => None,
            })
            .sum();

        // Halve the indent, rounding up so a single space is not lost.
        out.extend(std::iter::repeat(' ').take(width.div_ceil(2)));
        out.push_str(body.trim_end());
        out.push('\n');
    }

    // Every emitted line got a '\n'; drop the last one if the input had no
    // trailing newline.
    if out.ends_with('\n') && !source.ends_with('\n') {
        out.pop();
    }

    out
}
186
187/// Build the enriched content header for a code chunk.
188///
189/// Prepends scope chain and signature metadata as a comment line.
190/// If the header + content would exceed `max_bytes`, returns `content` unchanged.
191fn build_enriched_content(
192    path: &Path,
193    node: tree_sitter::Node<'_>,
194    source: &str,
195    content: &str,
196    max_bytes: usize,
197) -> String {
198    let scope = build_scope_chain(node, source);
199    let sig = extract_signature(node, source).unwrap_or_default();
200    let rel_path = path.display().to_string();
201
202    let header = if scope.is_empty() && sig.is_empty() {
203        format!("// {rel_path}\n")
204    } else if scope.is_empty() {
205        format!("// {rel_path} | defines: {sig}\n")
206    } else if sig.is_empty() {
207        format!("// {rel_path} | {scope}\n")
208    } else {
209        format!("// {rel_path} | {scope} | defines: {sig}\n")
210    };
211
212    // Minify whitespace for the embedding content to reduce token waste.
213    // The raw `content` field is kept as-is for display.
214    let minified = minify_whitespace(content);
215
216    if header.len() + minified.len() > max_bytes {
217        minified
218    } else {
219        format!("{header}{minified}")
220    }
221}
222
223/// Extract semantic chunks from a source file.
224///
225/// Uses tree-sitter to parse the file and extract definitions matching
226/// the language's query patterns. For files with no semantic matches,
227/// falls back to overlapping sliding windows. Large individual chunks
228/// are also split into windows.
229///
230/// Pass a [`ChunkConfig`] to control chunk sizing at runtime.
231#[must_use]
232pub fn chunk_file(
233    path: &Path,
234    source: &str,
235    config: &crate::languages::LangConfig,
236    chunk_config: &ChunkConfig,
237) -> Vec<CodeChunk> {
238    let mut parser = Parser::new();
239    if parser.set_language(&config.language).is_err() {
240        return sliding_windows(path, source, chunk_config);
241    }
242
243    let Some(tree) = parser.parse(source, None) else {
244        return sliding_windows(path, source, chunk_config);
245    };
246
247    let mut cursor = QueryCursor::new();
248    let mut chunks = Vec::new();
249    let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
250
251    while let Some(m) = matches.next() {
252        let mut name = String::new();
253        let mut def_node = None;
254        for cap in m.captures {
255            let cap_name = &config.query.capture_names()[cap.index as usize];
256            if *cap_name == "name" {
257                name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
258            } else if *cap_name == "def" {
259                def_node = Some(cap.node);
260            }
261        }
262        if let Some(node) = def_node {
263            let content = &source[node.start_byte()..node.end_byte()];
264            let start_line = node.start_position().row + 1;
265
266            // Split oversized chunks into windows
267            if content.len() > chunk_config.max_chunk_bytes {
268                chunks.extend(sliding_windows_with_name(
269                    path,
270                    content,
271                    &name,
272                    start_line,
273                    chunk_config,
274                ));
275            } else {
276                let enriched = build_enriched_content(
277                    path,
278                    node,
279                    source,
280                    content,
281                    chunk_config.max_chunk_bytes,
282                );
283                chunks.push(CodeChunk {
284                    file_path: path.display().to_string(),
285                    name,
286                    kind: node.kind().to_string(),
287                    start_line,
288                    end_line: node.end_position().row + 1,
289                    enriched_content: enriched,
290                    content: content.to_string(),
291                });
292            }
293        }
294    }
295
296    // Fallback: sliding windows if no semantic matches
297    if chunks.is_empty() && !source.trim().is_empty() {
298        return sliding_windows(path, source, chunk_config);
299    }
300
301    chunks
302}
303
304/// Split source text into overlapping sliding windows.
305///
306/// Each window is `chunk_config.window_size` bytes with `chunk_config.window_overlap` bytes of
307/// overlap. Window boundaries are adjusted to line breaks to avoid
308/// splitting mid-line.
309///
310/// This is used as the fallback for files without tree-sitter support
311/// (plain text, unknown extensions) and for large semantic chunks that
312/// exceed `max_chunk_bytes`.
313#[must_use]
314pub fn chunk_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
315    sliding_windows(path, source, chunk_config)
316}
317
318/// Internal sliding-window implementation.
319fn sliding_windows(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
320    if source.trim().is_empty() {
321        return vec![];
322    }
323
324    // Small enough for a single chunk
325    if source.len() <= chunk_config.max_chunk_bytes {
326        let content = source.to_string();
327        return vec![CodeChunk {
328            file_path: path.display().to_string(),
329            name: path
330                .file_name()
331                .unwrap_or_default()
332                .to_string_lossy()
333                .to_string(),
334            kind: "file".to_string(),
335            start_line: 1,
336            end_line: source.lines().count(),
337            enriched_content: content.clone(),
338            content,
339        }];
340    }
341
342    let file_name = path
343        .file_name()
344        .unwrap_or_default()
345        .to_string_lossy()
346        .to_string();
347    sliding_window_chunks(source, path, &file_name, 1, chunk_config)
348}
349
350/// Split a named definition into overlapping windows.
351///
352/// Used when a single tree-sitter match (e.g., a large function) exceeds
353/// `chunk_config.max_chunk_bytes`. Windows carry the definition name for search context.
354fn sliding_windows_with_name(
355    path: &Path,
356    content: &str,
357    name: &str,
358    base_line: usize,
359    chunk_config: &ChunkConfig,
360) -> Vec<CodeChunk> {
361    sliding_window_chunks(content, path, name, base_line, chunk_config)
362}
363
364/// Shared sliding-window loop used by both [`sliding_windows`] and
365/// [`sliding_windows_with_name`].
366///
367/// Splits `source` into overlapping windows of `chunk_config.window_size` bytes,
368/// snapping boundaries to line breaks. Each chunk is tagged with `name_prefix`
369/// and an index suffix (e.g., `"main[0]"`, `"main[1]"`).
370fn sliding_window_chunks(
371    source: &str,
372    file_path: &Path,
373    name_prefix: &str,
374    base_line: usize,
375    chunk_config: &ChunkConfig,
376) -> Vec<CodeChunk> {
377    let step = chunk_config
378        .window_size
379        .saturating_sub(chunk_config.window_overlap)
380        .max(1);
381    let bytes = source.as_bytes();
382    let mut chunks = Vec::new();
383    let mut offset = 0;
384    let mut window_idx = 0;
385
386    while offset < bytes.len() {
387        let raw_end = (offset + chunk_config.window_size).min(bytes.len());
388
389        // Snap end to a line boundary (don't split mid-line)
390        let end = if raw_end < bytes.len() {
391            match bytes[offset..raw_end].iter().rposition(|&b| b == b'\n') {
392                Some(pos) => offset + pos + 1,
393                None => raw_end, // no newline found, use raw end
394            }
395        } else {
396            raw_end
397        };
398
399        // Extract window as str (skip invalid UTF-8)
400        if let Ok(window) = std::str::from_utf8(&bytes[offset..end])
401            && !window.trim().is_empty()
402        {
403            let start_line = base_line + source[..offset].matches('\n').count();
404            let content_lines = window.lines().count().max(1);
405            let end_line = start_line + content_lines - 1;
406            let content = window.to_string();
407            chunks.push(CodeChunk {
408                file_path: file_path.display().to_string(),
409                name: format!("{name_prefix}[{window_idx}]"),
410                kind: "window".to_string(),
411                start_line,
412                end_line,
413                enriched_content: content.clone(),
414                content,
415            });
416            window_idx += 1;
417        }
418
419        offset += step;
420    }
421
422    chunks
423}
424
#[cfg(test)]
mod tests {
    use super::*;
    use std::fmt::Write as _;
    use std::path::Path;

    #[test]
    fn chunks_rust_functions_and_structs() {
        let source = "fn hello() { println!(\"hi\"); }\nfn world() {}\nstruct Foo { x: i32 }";
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("test.rs"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {}",
            chunks.len()
        );
        assert!(chunks.iter().any(|c| c.name == "hello"));
        assert!(chunks.iter().any(|c| c.name == "world"));
    }

    #[test]
    fn chunks_python_functions_and_classes() {
        let source = "def greet(name):\n    pass\n\nclass Foo:\n    pass\n";
        let config = crate::languages::config_for_extension("py").unwrap();
        let chunks = chunk_file(
            Path::new("test.py"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.name == "greet"));
        assert!(chunks.iter().any(|c| c.name == "Foo"));
    }

    #[test]
    fn fallback_small_file_single_chunk() {
        let source = "let x = 42;\nconsole.log(x);\n";
        let config = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(
            Path::new("script.js"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].kind, "file");
    }

    #[test]
    fn fallback_large_file_produces_windows() {
        // Create a file larger than default max_chunk_bytes with no function declarations
        let line = "console.log('hello world, this is a long line of javascript code');\n";
        let source: String = line.repeat(200); // ~13KB
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(Path::new("big.js"), &source, &config, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected multiple windows, got {}",
            chunks.len()
        );
        assert!(chunks.iter().all(|c| c.kind == "window"));
        assert!(chunks[0].name.contains("[0]"));
    }

    #[test]
    fn large_definition_is_windowed() {
        // A Rust function larger than default max_chunk_bytes
        let mut source = String::from("fn big_function() {\n");
        for i in 0..200 {
            writeln!(source, "    let var_{i} = {i} * 2 + 1; // some computation").unwrap();
        }
        source.push_str("}\n");
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("test.rs"), &source, &config, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected windowed chunks, got {}",
            chunks.len()
        );
        assert!(chunks[0].name.starts_with("big_function["));
    }

    #[test]
    fn empty_file_produces_no_chunks() {
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("empty.rs"), "", &config, &ChunkConfig::default());
        assert!(chunks.is_empty());
    }

    // --- T1 enrichment tests ---

    /// Helper: parse `source` with the grammar registered for extension
    /// `ext`, returning the syntax tree together with the language config
    /// (whose query the caller runs over the tree).
    fn parse_with_config(
        source: &str,
        ext: &str,
    ) -> (
        tree_sitter::Tree,
        std::sync::Arc<crate::languages::LangConfig>,
    ) {
        let config = crate::languages::config_for_extension(ext).unwrap();
        let mut parser = Parser::new();
        parser.set_language(&config.language).unwrap();
        let tree = parser.parse(source, None).unwrap();
        (tree, config)
    }

    /// Helper: run the language query over `tree` and return the last
    /// `@def` capture accepted by `keep`, or `None` if there is none.
    fn last_def_node<'tree>(
        tree: &'tree tree_sitter::Tree,
        config: &crate::languages::LangConfig,
        source: &str,
        keep: impl Fn(&tree_sitter::Node<'tree>) -> bool,
    ) -> Option<tree_sitter::Node<'tree>> {
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        let mut found = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" && keep(&cap.node) {
                    found = Some(cap.node);
                }
            }
        }
        found
    }

    #[test]
    fn scope_chain_rust_impl_method() {
        let source = "impl Foo {\n    fn bar(&self) {}\n}";
        let (tree, config) = parse_with_config(source, "rs");

        let node = last_def_node(&tree, &config, source, |_| true)
            .expect("should find a @def node");
        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("impl_item"),
            "scope should contain impl_item, got: {scope}"
        );
        assert!(
            scope.contains("Foo"),
            "scope should contain 'Foo', got: {scope}"
        );
    }

    #[test]
    fn scope_chain_python_class_method() {
        let source = "class Greeter:\n    def say_hello(self):\n        pass\n";
        let (tree, config) = parse_with_config(source, "py");

        // Find the function_definition @def (say_hello), not the class @def
        let node = last_def_node(&tree, &config, source, |n| {
            n.kind() == "function_definition"
        })
        .expect("should find say_hello @def node");
        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("class_definition"),
            "scope should contain class_definition, got: {scope}"
        );
        assert!(
            scope.contains("Greeter"),
            "scope should contain 'Greeter', got: {scope}"
        );
    }

    #[test]
    fn extract_signature_rust_function() {
        let source = "fn greet(name: &str) -> String { name.to_string() }";
        let (tree, config) = parse_with_config(source, "rs");

        let node =
            last_def_node(&tree, &config, source, |_| true).expect("should find @def node");
        let sig = extract_signature(node, source).expect("should extract signature");
        assert!(
            sig.contains("greet"),
            "signature should contain 'greet', got: {sig}"
        );
        assert!(
            sig.contains("name: &str"),
            "signature should contain parameter, got: {sig}"
        );
        assert!(
            sig.contains("-> String"),
            "signature should contain return type, got: {sig}"
        );
    }

    #[test]
    fn enriched_content_has_header() {
        let source = "fn hello() { println!(\"hi\"); }";
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("src/main.rs"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            chunk.enriched_content.starts_with("//"),
            "enriched_content should start with '//' header, got: {}",
            &chunk.enriched_content[..chunk.enriched_content.len().min(80)]
        );
        assert!(
            chunk.enriched_content.contains("src/main.rs"),
            "enriched_content should contain file path"
        );
        // Raw content should NOT have the header
        assert!(
            !chunk.content.starts_with("//"),
            "raw content should not start with header"
        );
    }

    #[test]
    fn sliding_window_enriched_equals_content() {
        let source = "let x = 42;\nconsole.log(x);\n";
        let chunks = chunk_text(Path::new("test.txt"), source, &ChunkConfig::default());
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert_eq!(
                chunk.enriched_content, chunk.content,
                "sliding window chunks should have enriched_content == content"
            );
        }
    }

    #[test]
    fn header_dropped_when_exceeding_max_bytes() {
        // Use a limit small enough that the chunk fits on its own but
        // header + chunk does not.
        let tiny_config = ChunkConfig {
            max_chunk_bytes: 60,
            window_size: 30,
            window_overlap: 10,
        };
        // The source alone is within max_chunk_bytes, so it is not windowed.
        let source = "fn f() { let x = 42; return x; }";
        assert!(source.len() <= tiny_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("long/path/to/file.rs"),
            source,
            &config,
            &tiny_config,
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        // Header ("// long/path/to/file.rs | defines: ...") + minified content > 60 bytes.
        // So enriched_content should fall back to minified content (no header),
        // and raw content is preserved as-is.
        assert!(
            !chunk.enriched_content.starts_with("//"),
            "header should be dropped when it would exceed max_chunk_bytes"
        );
        assert_eq!(chunk.content, source, "raw content should be unchanged");
    }

    #[test]
    fn minify_whitespace_normalizes_indent_and_strips_trailing() {
        // 8-space indent → 4-space (halved)
        let source = "fn foo() {\n        let x = 1;\n        let y = 2;\n}\n";
        let result = minify_whitespace(source);
        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(
            lines[1], "    let x = 1;",
            "8-space indent should become 4-space"
        );
        assert_eq!(
            lines[2], "    let y = 2;",
            "8-space indent should become 4-space"
        );

        // Trailing whitespace removed
        let with_trailing = "fn bar()   \n    return 1;   \n";
        let result2 = minify_whitespace(with_trailing);
        assert!(
            result2.lines().all(|l| !l.ends_with(' ')),
            "trailing whitespace should be stripped"
        );

        // A run of consecutive blank lines is collapsed to 1
        let with_blanks = "a\n\n\n\nb\n";
        let result3 = minify_whitespace(with_blanks);
        // Lengths of the blank-line runs that are followed by a non-blank line
        let blank_runs: Vec<usize> = {
            let mut runs = Vec::new();
            let mut count = 0usize;
            for line in result3.lines() {
                if line.is_empty() {
                    count += 1;
                } else {
                    if count > 0 {
                        runs.push(count);
                    }
                    count = 0;
                }
            }
            runs
        };
        assert!(
            blank_runs.iter().all(|&n| n <= 1),
            "3+ blank lines should collapse to 1, got runs: {blank_runs:?}"
        );
    }
}