// ripvec_core/chunk.rs

//! Tree-sitter based code chunking with sliding-window fallback.
//!
//! Parses source files into ASTs and extracts semantic chunks at
//! function, class, and method boundaries. Files without recognized
//! semantic structure, and individual definitions that exceed the
//! configured size limit, are split into overlapping sliding windows
//! so that embedding inputs stay uniformly sized.

8use std::path::Path;
9use streaming_iterator::StreamingIterator;
10use tree_sitter::{Parser, QueryCursor};
11
/// Runtime configuration for the chunking pipeline.
///
/// All size values are in bytes. Pass to [`chunk_file`] to control
/// chunk sizing without recompilation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkConfig {
    /// Maximum chunk content length in bytes before splitting into windows.
    /// Chunks larger than this are split even if tree-sitter found them as
    /// a single definition (e.g., a 500-line function).
    pub max_chunk_bytes: usize,
    /// Target window size in bytes for the sliding-window fallback chunker.
    /// ~2KB of source text ≈ 128-256 tokens after BPE, well within the
    /// 512-token model limit and optimal for embedding quality.
    pub window_size: usize,
    /// Overlap between adjacent windows in bytes.
    /// Ensures definitions spanning a window boundary are captured in at
    /// least one window. Defaults to 25% of `window_size`.
    pub window_overlap: usize,
}

32impl Default for ChunkConfig {
33    fn default() -> Self {
34        Self {
35            max_chunk_bytes: 4096,
36            window_size: 2048,
37            window_overlap: 512,
38        }
39    }
40}
41
42/// A semantic chunk extracted from a source file.
43#[derive(
44    Debug,
45    Clone,
46    rkyv::Archive,
47    rkyv::Serialize,
48    rkyv::Deserialize,
49    bitcode::Encode,
50    bitcode::Decode,
51)]
52pub struct CodeChunk {
53    /// Path to the source file.
54    pub file_path: String,
55    /// Name of the definition (function name, class name, etc.).
56    pub name: String,
57    /// Kind of syntax node (e.g., `function_item`, `class_definition`).
58    pub kind: String,
59    /// 1-based start line number.
60    pub start_line: usize,
61    /// 1-based end line number.
62    pub end_line: usize,
63    /// Source text of the chunk (raw code for display).
64    pub content: String,
65    /// Enriched content with scope chain and signature metadata for embedding.
66    /// Falls back to `content` if metadata would exceed chunk size limits.
67    pub enriched_content: String,
68}
69
70/// Walk up the AST parent chain collecting structural container names.
71///
72/// Produces a scope chain like `"impl_item Foo > fn forward"` by
73/// identifying structural containers (impl blocks, classes, modules, namespaces)
74/// and extracting their names. Tries the `name` field first, then `type`
75/// (for Rust `impl_item` which uses `type` instead of `name`).
76#[must_use]
77pub fn build_scope_chain(node: tree_sitter::Node<'_>, source: &str) -> String {
78    /// Node kinds that represent structural containers, by language.
79    const CONTAINER_KINDS: &[&str] = &[
80        // Rust
81        "impl_item",
82        "trait_item",
83        "mod_item",
84        // Python
85        "class_definition",
86        "module",
87        // JS/TS
88        "class_declaration",
89        // Java
90        // "class_declaration" already covered above
91        // Go
92        "type_declaration",
93        // C++
94        "namespace_definition",
95        "class_specifier",
96    ];
97
98    /// Field names to try when extracting the container name.
99    /// `impl_item` uses `type` instead of `name`; Go `type_declaration`
100    /// has no fields, so we fall back to the node kind.
101    const NAME_FIELDS: &[&str] = &["name", "type"];
102
103    let mut parts = Vec::new();
104    let mut current = node.parent();
105    while let Some(parent) = current {
106        let kind = parent.kind();
107        if CONTAINER_KINDS.contains(&kind) {
108            let name = NAME_FIELDS
109                .iter()
110                .find_map(|field| parent.child_by_field_name(field))
111                .map_or(kind, |n| &source[n.start_byte()..n.end_byte()]);
112            parts.push(format!("{kind} {name}"));
113        }
114        current = parent.parent();
115    }
116    parts.reverse();
117    parts.join(" > ")
118}
119
120/// Extract the function/method signature from a definition node.
121///
122/// Returns the text from the function name to the start of the body,
123/// which captures the parameter list and return type (if any).
124/// Returns `None` if the node has no `name` or `body`/`block` field.
125#[must_use]
126pub fn extract_signature(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
127    let name_node = node.child_by_field_name("name")?;
128    let body_node = node
129        .child_by_field_name("body")
130        .or_else(|| node.child_by_field_name("block"))?;
131    let start = name_node.start_byte();
132    let end = body_node.start_byte();
133    if start >= end {
134        return None;
135    }
136    let sig = source[start..end].trim();
137    if sig.is_empty() {
138        None
139    } else {
140        Some(sig.to_string())
141    }
142}
143
/// Reduce indentation waste for embedding by normalizing whitespace.
///
/// For each non-blank line:
/// - The leading run of spaces and tabs is measured (a space counts as 1,
///   a tab as 2) and halved, rounding up, to give the number of output
///   spaces: 4 spaces → 2, 8 spaces → 4, 3 spaces → 2, 1 tab → 1 space.
/// - Trailing whitespace is stripped.
///
/// Any run of consecutive blank (whitespace-only) lines is collapsed to a
/// single empty line. Every emitted line is terminated with `\n`; the
/// terminator added after the last line is removed again when `source`
/// itself does not end with a newline.
///
/// This reduces the number of whitespace tokens consumed in the embedding
/// window while keeping the relative nesting visible.
#[must_use]
pub fn minify_whitespace(source: &str) -> String {
    let mut result = String::with_capacity(source.len());
    let mut previous_was_blank = false;

    for line in source.lines() {
        let body = line.trim();

        if body.is_empty() {
            // Emit only the first blank line of a run.
            if !previous_was_blank {
                result.push('\n');
            }
            previous_was_blank = true;
            continue;
        }
        previous_was_blank = false;

        // Indent width in "columns": spaces count 1, tabs count 2.
        let indent_width: usize = line
            .chars()
            .take_while(|&c| c == ' ' || c == '\t')
            .map(|c| if c == '\t' { 2 } else { 1 })
            .sum();

        // Halve the indent, rounding up so a single space is never lost.
        result.extend(std::iter::repeat(' ').take(indent_width.div_ceil(2)));
        result.push_str(body);
        result.push('\n');
    }

    // Drop the newline added for the final line if the input had none.
    if !source.ends_with('\n') && result.ends_with('\n') {
        result.pop();
    }

    result
}

195/// Build the enriched content header for a code chunk.
196///
197/// Prepends scope chain and signature metadata as a comment line.
198/// If the header + content would exceed `max_bytes`, returns `content` unchanged.
199fn build_enriched_content(
200    path: &Path,
201    node: tree_sitter::Node<'_>,
202    source: &str,
203    content: &str,
204    max_bytes: usize,
205) -> String {
206    let scope = build_scope_chain(node, source);
207    let sig = extract_signature(node, source).unwrap_or_default();
208    let rel_path = path.display().to_string();
209
210    let header = if scope.is_empty() && sig.is_empty() {
211        format!("// {rel_path}\n")
212    } else if scope.is_empty() {
213        format!("// {rel_path} | defines: {sig}\n")
214    } else if sig.is_empty() {
215        format!("// {rel_path} | {scope}\n")
216    } else {
217        format!("// {rel_path} | {scope} | defines: {sig}\n")
218    };
219
220    // Minify whitespace for the embedding content to reduce token waste.
221    // The raw `content` field is kept as-is for display.
222    let minified = minify_whitespace(content);
223
224    if header.len() + minified.len() > max_bytes {
225        minified
226    } else {
227        format!("{header}{minified}")
228    }
229}
230
231/// Extract semantic chunks from a source file.
232///
233/// Uses tree-sitter to parse the file and extract definitions matching
234/// the language's query patterns. For files with no semantic matches,
235/// falls back to overlapping sliding windows. Large individual chunks
236/// are also split into windows.
237///
238/// Pass a [`ChunkConfig`] to control chunk sizing at runtime.
239#[must_use]
240pub fn chunk_file(
241    path: &Path,
242    source: &str,
243    config: &crate::languages::LangConfig,
244    chunk_config: &ChunkConfig,
245) -> Vec<CodeChunk> {
246    let mut parser = Parser::new();
247    if parser.set_language(&config.language).is_err() {
248        return sliding_windows(path, source, chunk_config);
249    }
250
251    let Some(tree) = parser.parse(source, None) else {
252        return sliding_windows(path, source, chunk_config);
253    };
254
255    let mut cursor = QueryCursor::new();
256    let mut chunks = Vec::new();
257    let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
258
259    while let Some(m) = matches.next() {
260        let mut name = String::new();
261        let mut def_node = None;
262        for cap in m.captures {
263            let cap_name = &config.query.capture_names()[cap.index as usize];
264            if *cap_name == "name" {
265                name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
266            } else if *cap_name == "def" {
267                def_node = Some(cap.node);
268            }
269        }
270        if let Some(node) = def_node {
271            let content = &source[node.start_byte()..node.end_byte()];
272            let start_line = node.start_position().row + 1;
273
274            // Split oversized chunks into windows
275            if content.len() > chunk_config.max_chunk_bytes {
276                chunks.extend(sliding_windows_with_name(
277                    path,
278                    content,
279                    &name,
280                    start_line,
281                    chunk_config,
282                ));
283            } else {
284                let enriched = build_enriched_content(
285                    path,
286                    node,
287                    source,
288                    content,
289                    chunk_config.max_chunk_bytes,
290                );
291                chunks.push(CodeChunk {
292                    file_path: path.display().to_string(),
293                    name,
294                    kind: node.kind().to_string(),
295                    start_line,
296                    end_line: node.end_position().row + 1,
297                    enriched_content: enriched,
298                    content: content.to_string(),
299                });
300            }
301        }
302    }
303
304    // Fallback: sliding windows if no semantic matches
305    if chunks.is_empty() && !source.trim().is_empty() {
306        return sliding_windows(path, source, chunk_config);
307    }
308
309    chunks
310}
311
312/// Split source text into overlapping sliding windows.
313///
314/// Each window is `chunk_config.window_size` bytes with `chunk_config.window_overlap` bytes of
315/// overlap. Window boundaries are adjusted to line breaks to avoid
316/// splitting mid-line.
317///
318/// This is used as the fallback for files without tree-sitter support
319/// (plain text, unknown extensions) and for large semantic chunks that
320/// exceed `max_chunk_bytes`.
321#[must_use]
322pub fn chunk_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
323    sliding_windows(path, source, chunk_config)
324}
325
326/// Internal sliding-window implementation.
327fn sliding_windows(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
328    if source.trim().is_empty() {
329        return vec![];
330    }
331
332    // Small enough for a single chunk
333    if source.len() <= chunk_config.max_chunk_bytes {
334        let content = source.to_string();
335        return vec![CodeChunk {
336            file_path: path.display().to_string(),
337            name: path
338                .file_name()
339                .unwrap_or_default()
340                .to_string_lossy()
341                .to_string(),
342            kind: "file".to_string(),
343            start_line: 1,
344            end_line: source.lines().count(),
345            enriched_content: content.clone(),
346            content,
347        }];
348    }
349
350    let file_name = path
351        .file_name()
352        .unwrap_or_default()
353        .to_string_lossy()
354        .to_string();
355    sliding_window_chunks(source, path, &file_name, 1, chunk_config)
356}
357
358/// Split a named definition into overlapping windows.
359///
360/// Used when a single tree-sitter match (e.g., a large function) exceeds
361/// `chunk_config.max_chunk_bytes`. Windows carry the definition name for search context.
362fn sliding_windows_with_name(
363    path: &Path,
364    content: &str,
365    name: &str,
366    base_line: usize,
367    chunk_config: &ChunkConfig,
368) -> Vec<CodeChunk> {
369    sliding_window_chunks(content, path, name, base_line, chunk_config)
370}
371
372/// Shared sliding-window loop used by both [`sliding_windows`] and
373/// [`sliding_windows_with_name`].
374///
375/// Splits `source` into overlapping windows of `chunk_config.window_size` bytes,
376/// snapping boundaries to line breaks. Each chunk is tagged with `name_prefix`
377/// and an index suffix (e.g., `"main[0]"`, `"main[1]"`).
378fn sliding_window_chunks(
379    source: &str,
380    file_path: &Path,
381    name_prefix: &str,
382    base_line: usize,
383    chunk_config: &ChunkConfig,
384) -> Vec<CodeChunk> {
385    let step = chunk_config
386        .window_size
387        .saturating_sub(chunk_config.window_overlap)
388        .max(1);
389    let bytes = source.as_bytes();
390    let mut chunks = Vec::new();
391    let mut offset = 0;
392    let mut window_idx = 0;
393
394    while offset < bytes.len() {
395        let raw_end = (offset + chunk_config.window_size).min(bytes.len());
396
397        // Snap end to a line boundary (don't split mid-line)
398        let end = if raw_end < bytes.len() {
399            match bytes[offset..raw_end].iter().rposition(|&b| b == b'\n') {
400                Some(pos) => offset + pos + 1,
401                None => raw_end, // no newline found, use raw end
402            }
403        } else {
404            raw_end
405        };
406
407        // Extract window as str (skip invalid UTF-8)
408        if let Ok(window) = std::str::from_utf8(&bytes[offset..end])
409            && !window.trim().is_empty()
410        {
411            let start_line = base_line + source[..offset].matches('\n').count();
412            let content_lines = window.lines().count().max(1);
413            let end_line = start_line + content_lines - 1;
414            let content = window.to_string();
415            chunks.push(CodeChunk {
416                file_path: file_path.display().to_string(),
417                name: format!("{name_prefix}[{window_idx}]"),
418                kind: "window".to_string(),
419                start_line,
420                end_line,
421                enriched_content: content.clone(),
422                content,
423            });
424            window_idx += 1;
425        }
426
427        offset += step;
428    }
429
430    chunks
431}
432
#[cfg(test)]
mod tests {
    use super::*;
    use std::fmt::Write as _;
    use std::path::Path;

    #[test]
    fn chunks_rust_functions_and_structs() {
        let source = "fn hello() { println!(\"hi\"); }\nfn world() {}\nstruct Foo { x: i32 }";
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("test.rs"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {}",
            chunks.len()
        );
        assert!(chunks.iter().any(|c| c.name == "hello"));
        assert!(chunks.iter().any(|c| c.name == "world"));
    }

    #[test]
    fn chunks_python_functions_and_classes() {
        let source = "def greet(name):\n    pass\n\nclass Foo:\n    pass\n";
        let config = crate::languages::config_for_extension("py").unwrap();
        let chunks = chunk_file(
            Path::new("test.py"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.name == "greet"));
        assert!(chunks.iter().any(|c| c.name == "Foo"));
    }

    #[test]
    fn fallback_small_file_single_chunk() {
        // With enriched queries, `let x = 42` matches variable_declarator.
        // Use a source with NO tree-sitter captures to test the plaintext fallback.
        let source = "// just a comment\n// and another\n";
        let config = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(
            Path::new("script.js"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].kind, "file");
    }

    #[test]
    fn fallback_large_file_produces_windows() {
        // Create a file larger than default max_chunk_bytes with no function declarations
        let line = "console.log('hello world, this is a long line of javascript code');\n";
        let source: String = line.repeat(200); // ~13KB
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(Path::new("big.js"), &source, &config, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected multiple windows, got {}",
            chunks.len()
        );
        assert!(chunks.iter().all(|c| c.kind == "window"));
        assert!(chunks[0].name.contains("[0]"));
    }

    #[test]
    fn large_definition_is_windowed() {
        // A Rust function larger than default max_chunk_bytes
        let mut source = String::from("fn big_function() {\n");
        for i in 0..200 {
            writeln!(source, "    let var_{i} = {i} * 2 + 1; // some computation").unwrap();
        }
        source.push_str("}\n");
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("test.rs"), &source, &config, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected windowed chunks, got {}",
            chunks.len()
        );
        assert!(chunks[0].name.starts_with("big_function["));
    }

    #[test]
    fn empty_file_produces_no_chunks() {
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("empty.rs"), "", &config, &ChunkConfig::default());
        assert!(chunks.is_empty());
    }

    // --- T1 enrichment tests ---

    /// Helper: parse `source` with the tree-sitter grammar registered for
    /// `ext` and return the syntax tree together with the language config.
    /// Callers run the config's query over the tree to locate `@def` nodes.
    fn parse_source(
        source: &str,
        ext: &str,
    ) -> (
        tree_sitter::Tree,
        std::sync::Arc<crate::languages::LangConfig>,
    ) {
        let config = crate::languages::config_for_extension(ext).unwrap();
        let mut parser = Parser::new();
        parser.set_language(&config.language).unwrap();
        let tree = parser.parse(source, None).unwrap();
        (tree, config)
    }

    #[test]
    fn scope_chain_rust_impl_method() {
        let source = "impl Foo {\n    fn bar(&self) {}\n}";
        let (tree, config) = parse_source(source, "rs");
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        // Keeps the last `@def` capture seen across all matches.
        let mut def_node = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" {
                    def_node = Some(cap.node);
                }
            }
        }
        let node = def_node.expect("should find a @def node");
        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("impl_item"),
            "scope should contain impl_item, got: {scope}"
        );
        assert!(
            scope.contains("Foo"),
            "scope should contain 'Foo', got: {scope}"
        );
    }

    #[test]
    fn scope_chain_python_class_method() {
        let source = "class Greeter:\n    def say_hello(self):\n        pass\n";
        let (tree, config) = parse_source(source, "py");
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        // Find the function_definition @def (say_hello), not the class @def
        let mut fn_node = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" && cap.node.kind() == "function_definition" {
                    fn_node = Some(cap.node);
                }
            }
        }
        let node = fn_node.expect("should find say_hello @def node");
        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("class_definition"),
            "scope should contain class_definition, got: {scope}"
        );
        assert!(
            scope.contains("Greeter"),
            "scope should contain 'Greeter', got: {scope}"
        );
    }

    #[test]
    fn extract_signature_rust_function() {
        let source = "fn greet(name: &str) -> String { name.to_string() }";
        let (tree, config) = parse_source(source, "rs");
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        // Keeps the last `@def` capture seen across all matches.
        let mut def_node = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" {
                    def_node = Some(cap.node);
                }
            }
        }
        let node = def_node.expect("should find @def node");
        let sig = extract_signature(node, source).expect("should extract signature");
        assert!(
            sig.contains("greet"),
            "signature should contain 'greet', got: {sig}"
        );
        assert!(
            sig.contains("name: &str"),
            "signature should contain parameter, got: {sig}"
        );
        assert!(
            sig.contains("-> String"),
            "signature should contain return type, got: {sig}"
        );
    }

    #[test]
    fn enriched_content_has_header() {
        let source = "fn hello() { println!(\"hi\"); }";
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("src/main.rs"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            chunk.enriched_content.starts_with("//"),
            "enriched_content should start with '//' header, got: {}",
            &chunk.enriched_content[..chunk.enriched_content.len().min(80)]
        );
        assert!(
            chunk.enriched_content.contains("src/main.rs"),
            "enriched_content should contain file path"
        );
        // Raw content should NOT have the header
        assert!(
            !chunk.content.starts_with("//"),
            "raw content should not start with header"
        );
    }

    #[test]
    fn sliding_window_enriched_equals_content() {
        let source = "let x = 42;\nconsole.log(x);\n";
        let chunks = chunk_text(Path::new("test.txt"), source, &ChunkConfig::default());
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert_eq!(
                chunk.enriched_content, chunk.content,
                "sliding window chunks should have enriched_content == content"
            );
        }
    }

    #[test]
    fn header_dropped_when_exceeding_max_bytes() {
        // Use a limit that the source fits under on its own, but that
        // header + source together exceed.
        let tiny_config = ChunkConfig {
            max_chunk_bytes: 60,
            window_size: 30,
            window_overlap: 10,
        };
        // The source alone is within max_chunk_bytes, so it is emitted as a
        // single semantic chunk rather than being windowed.
        let source = "fn f() { let x = 42; return x; }";
        assert!(source.len() <= tiny_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("long/path/to/file.rs"),
            source,
            &config,
            &tiny_config,
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        // Header ("// long/path/to/file.rs | defines: ...") + minified content > 60 bytes.
        // So enriched_content should fall back to minified content (no header),
        // and raw content is preserved as-is.
        assert!(
            !chunk.enriched_content.starts_with("//"),
            "header should be dropped when it would exceed max_chunk_bytes"
        );
        assert_eq!(chunk.content, source, "raw content should be unchanged");
    }

    #[test]
    fn minify_whitespace_normalizes_indent_and_strips_trailing() {
        // 8-space indent → 4-space (halved)
        let source = "fn foo() {\n        let x = 1;\n        let y = 2;\n}\n";
        let result = minify_whitespace(source);
        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(
            lines[1], "    let x = 1;",
            "8-space indent should become 4-space"
        );
        assert_eq!(
            lines[2], "    let y = 2;",
            "8-space indent should become 4-space"
        );

        // Trailing whitespace removed
        let with_trailing = "fn bar()   \n    return 1;   \n";
        let result2 = minify_whitespace(with_trailing);
        assert!(
            result2.lines().all(|l| !l.ends_with(' ')),
            "trailing whitespace should be stripped"
        );

        // 3+ consecutive blank lines collapsed to 1
        let with_blanks = "a\n\n\n\nb\n";
        let result3 = minify_whitespace(with_blanks);
        // Should have at most 1 blank line between a and b
        let blank_runs: Vec<usize> = {
            let mut runs = Vec::new();
            let mut count = 0usize;
            for line in result3.lines() {
                if line.is_empty() {
                    count += 1;
                } else {
                    if count > 0 {
                        runs.push(count);
                    }
                    count = 0;
                }
            }
            runs
        };
        assert!(
            blank_runs.iter().all(|&n| n <= 1),
            "3+ blank lines should collapse to 1, got runs: {blank_runs:?}"
        );
    }
}