// ck_chunk — text/code chunking library (lib.rs)
1use anyhow::Result;
2use ck_core::Span;
3use serde::{Deserialize, Serialize};
4
5mod query_chunker;
6
7/// Import token estimation from ck-embed
8pub use ck_embed::TokenEstimator;
9
/// Fallback to estimation if precise tokenization fails
///
/// Thin wrapper over [`TokenEstimator::estimate_tokens`] (re-exported from
/// ck-embed above) so the rest of this crate has a single local entry point
/// for token counting.
fn estimate_tokens(text: &str) -> usize {
    TokenEstimator::estimate_tokens(text)
}
14
/// Get model-specific chunk configuration as `(target_tokens, overlap_tokens)`.
///
/// Balanced for precision vs context: small models get tighter chunks for
/// better precision, while large-context models can afford bigger chunks.
/// Both tiers use roughly 20% overlap. Unknown model names fall back to the
/// large configuration because nomic-v1.5 is the default model.
pub fn get_model_chunk_config(model_name: Option<&str>) -> (usize, usize) {
    // Small-context tier: 400-token target, 80-token overlap (~20%).
    const SMALL: (usize, usize) = (400, 80);
    // Large-context tier: 1024-token target, 200-token overlap (~20%) —
    // enough context to be meaningful, small enough to stay precise.
    const LARGE: (usize, usize) = (1024, 200);

    match model_name.unwrap_or("nomic-embed-text-v1.5") {
        // Small models and BGE variants: keep chunks smaller for precision.
        "BAAI/bge-small-en-v1.5"
        | "sentence-transformers/all-MiniLM-L6-v2"
        | "BAAI/bge-base-en-v1.5"
        | "BAAI/bge-large-en-v1.5" => SMALL,

        // nomic and jina code models handle long inputs; every unrecognized
        // model also lands here, matching the nomic-v1.5 default.
        _ => LARGE,
    }
}
41
/// Information about chunk striding for large chunks that exceed token limits
///
/// When an oversized chunk is split into overlapping "strides", each resulting
/// chunk carries one of these so consumers can relate strides back to the
/// original chunk and reason about the overlapping regions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StrideInfo {
    /// Unique ID for the original chunk before striding
    pub original_chunk_id: String,
    /// Index of this stride (0-based)
    pub stride_index: usize,
    /// Total number of strides for the original chunk
    pub total_strides: usize,
    /// Byte offset where overlap with previous stride begins
    pub overlap_start: usize,
    /// Byte offset where overlap with next stride ends
    pub overlap_end: usize,
}
56
/// Derived metadata attached to every [`Chunk`].
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ChunkMetadata {
    /// Enclosing scope names for code chunks; empty for generic/text chunks.
    pub ancestry: Vec<String>,
    /// `ancestry` joined with `"::"`, or `None` when ancestry is empty.
    pub breadcrumb: Option<String>,
    /// Trivia lines (e.g. comments) gathered before the chunk by the
    /// tree-sitter chunkers — see `build_chunk`.
    pub leading_trivia: Vec<String>,
    /// Trivia lines gathered after the chunk by the tree-sitter chunkers.
    pub trailing_trivia: Vec<String>,
    /// Length of the chunk text in bytes.
    pub byte_length: usize,
    /// Approximate token count of the chunk text (see `estimate_tokens`).
    pub estimated_tokens: usize,
}
66
67impl ChunkMetadata {
68    fn from_context(
69        text: &str,
70        ancestry: Vec<String>,
71        leading_trivia: Vec<String>,
72        trailing_trivia: Vec<String>,
73    ) -> Self {
74        let breadcrumb = if ancestry.is_empty() {
75            None
76        } else {
77            Some(ancestry.join("::"))
78        };
79
80        Self {
81            ancestry,
82            breadcrumb,
83            leading_trivia,
84            trailing_trivia,
85            byte_length: text.len(),
86            estimated_tokens: estimate_tokens(text),
87        }
88    }
89
90    fn from_text(text: &str) -> Self {
91        Self {
92            ancestry: Vec::new(),
93            breadcrumb: None,
94            leading_trivia: Vec::new(),
95            trailing_trivia: Vec::new(),
96            byte_length: text.len(),
97            estimated_tokens: estimate_tokens(text),
98        }
99    }
100
101    fn with_updated_text(&self, text: &str) -> Self {
102        let mut cloned = self.clone();
103        cloned.byte_length = text.len();
104        cloned.estimated_tokens = estimate_tokens(text);
105        cloned
106    }
107}
108
/// A contiguous piece of a source file plus metadata for indexing/search.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// Byte and line range of this chunk within the original text.
    pub span: Span,
    /// The chunk's raw text.
    pub text: String,
    /// Structural classification (function, class, plain text, ...).
    pub chunk_type: ChunkType,
    /// Stride information if this chunk was created by striding a larger chunk
    pub stride_info: Option<StrideInfo>,
    /// Derived metadata (ancestry, trivia, size/token estimates).
    pub metadata: ChunkMetadata,
}
118
/// Structural classification of a chunk, derived from the AST node kind
/// (see `classify_chunk_kind`); `Text` is used for generic and gap chunks.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkType {
    /// Non-structural content: generic line-based chunks and gap fill.
    Text,
    /// Function-like definitions.
    Function,
    /// Class-like definitions (classes, structs, enums, ...).
    Class,
    /// Method definitions.
    Method,
    /// Module-level constructs (modules, impls, traits, declarations, ...).
    Module,
}
127
/// Languages with a tree-sitter grammar wired up for structural chunking.
///
/// Conversion from the broader `ck_core::Language` is provided via `TryFrom`;
/// languages outside this set fall back to generic line-based chunking.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ParseableLanguage {
    Python,
    TypeScript,
    JavaScript,
    Haskell,
    Rust,
    Ruby,
    Go,
    CSharp,
    Zig,
    Dart,
    Elixir,
}
144
145impl std::fmt::Display for ParseableLanguage {
146    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
147        let name = match self {
148            ParseableLanguage::Python => "python",
149            ParseableLanguage::TypeScript => "typescript",
150            ParseableLanguage::JavaScript => "javascript",
151            ParseableLanguage::Haskell => "haskell",
152            ParseableLanguage::Rust => "rust",
153            ParseableLanguage::Ruby => "ruby",
154            ParseableLanguage::Go => "go",
155            ParseableLanguage::CSharp => "csharp",
156            ParseableLanguage::Zig => "zig",
157
158            ParseableLanguage::Dart => "dart",
159
160            ParseableLanguage::Elixir => "elixir",
161        };
162        write!(f, "{}", name)
163    }
164}
165
166impl TryFrom<ck_core::Language> for ParseableLanguage {
167    type Error = anyhow::Error;
168
169    fn try_from(lang: ck_core::Language) -> Result<Self, Self::Error> {
170        match lang {
171            ck_core::Language::Python => Ok(ParseableLanguage::Python),
172            ck_core::Language::TypeScript => Ok(ParseableLanguage::TypeScript),
173            ck_core::Language::JavaScript => Ok(ParseableLanguage::JavaScript),
174            ck_core::Language::Haskell => Ok(ParseableLanguage::Haskell),
175            ck_core::Language::Rust => Ok(ParseableLanguage::Rust),
176            ck_core::Language::Ruby => Ok(ParseableLanguage::Ruby),
177            ck_core::Language::Go => Ok(ParseableLanguage::Go),
178            ck_core::Language::CSharp => Ok(ParseableLanguage::CSharp),
179            ck_core::Language::Zig => Ok(ParseableLanguage::Zig),
180
181            ck_core::Language::Dart => Ok(ParseableLanguage::Dart),
182
183            ck_core::Language::Elixir => Ok(ParseableLanguage::Elixir),
184
185            _ => Err(anyhow::anyhow!(
186                "Language {:?} is not supported for parsing",
187                lang
188            )),
189        }
190    }
191}
192
/// Chunk `text` using the default [`ChunkConfig`].
///
/// Convenience wrapper over [`chunk_text_with_config`].
pub fn chunk_text(text: &str, language: Option<ck_core::Language>) -> Result<Vec<Chunk>> {
    chunk_text_with_config(text, language, &ChunkConfig::default())
}
196
/// Configuration for chunking behavior
///
/// Controls how oversized chunks are split into overlapping strides;
/// see [`ChunkConfig::default`] for the stock values.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Maximum tokens per chunk (for striding)
    pub max_tokens: usize,
    /// Overlap size for striding (in tokens)
    pub stride_overlap: usize,
    /// Enable striding for chunks that exceed max_tokens
    pub enable_striding: bool,
}
207
impl Default for ChunkConfig {
    /// Stock configuration: striding enabled with the Nomic model's token
    /// limit and a 12.5% stride overlap.
    fn default() -> Self {
        Self {
            max_tokens: 8192,     // Default to Nomic model limit
            stride_overlap: 1024, // 12.5% overlap
            enable_striding: true,
        }
    }
}
217
218/// New function that accepts model name for model-specific chunking
219pub fn chunk_text_with_model(
220    text: &str,
221    language: Option<ck_core::Language>,
222    model_name: Option<&str>,
223) -> Result<Vec<Chunk>> {
224    let (target_tokens, overlap_tokens) = get_model_chunk_config(model_name);
225
226    // Create a config based on model-specific parameters
227    let config = ChunkConfig {
228        max_tokens: target_tokens,
229        stride_overlap: overlap_tokens,
230        enable_striding: true,
231    };
232
233    chunk_text_with_config_and_model(text, language, &config, model_name)
234}
235
/// Chunk `text` with an explicit [`ChunkConfig`] and no model-specific tuning.
pub fn chunk_text_with_config(
    text: &str,
    language: Option<ck_core::Language>,
    config: &ChunkConfig,
) -> Result<Vec<Chunk>> {
    chunk_text_with_config_and_model(text, language, config, None)
}
243
244fn chunk_text_with_config_and_model(
245    text: &str,
246    language: Option<ck_core::Language>,
247    config: &ChunkConfig,
248    model_name: Option<&str>,
249) -> Result<Vec<Chunk>> {
250    tracing::debug!(
251        "Chunking text with language: {:?}, length: {} chars, config: {:?}",
252        language,
253        text.len(),
254        config
255    );
256
257    let result = match language.map(ParseableLanguage::try_from) {
258        Some(Ok(lang)) => {
259            tracing::debug!("Using {} tree-sitter parser", lang);
260            chunk_language_with_model(text, lang, model_name)
261        }
262        Some(Err(_)) => {
263            tracing::debug!("Language not supported for parsing, using generic chunking strategy");
264            chunk_generic_with_token_config(text, model_name)
265        }
266        None => {
267            tracing::debug!("Using generic chunking strategy");
268            chunk_generic_with_token_config(text, model_name)
269        }
270    };
271
272    let mut chunks = result?;
273
274    // Apply striding if enabled and necessary
275    if config.enable_striding {
276        chunks = apply_striding(chunks, config)?;
277    }
278
279    tracing::debug!("Successfully created {} final chunks", chunks.len());
280    Ok(chunks)
281}
282
/// Generic line-based chunking with the default (no-model) token configuration.
fn chunk_generic(text: &str) -> Result<Vec<Chunk>> {
    chunk_generic_with_token_config(text, None)
}
286
287fn chunk_generic_with_token_config(text: &str, model_name: Option<&str>) -> Result<Vec<Chunk>> {
288    let mut chunks = Vec::new();
289    let lines: Vec<&str> = text.lines().collect();
290
291    // Get model-specific optimal chunk size in tokens
292    let (target_tokens, overlap_tokens) = get_model_chunk_config(model_name);
293
294    // Convert token targets to approximate line counts
295    // This is a rough heuristic - we'll validate with actual token counting
296    let avg_tokens_per_line = 10.0; // Rough estimate for code
297    let target_lines = ((target_tokens as f32) / avg_tokens_per_line) as usize;
298    let overlap_lines = ((overlap_tokens as f32) / avg_tokens_per_line) as usize;
299
300    let chunk_size = target_lines.max(5); // Minimum 5 lines
301    let overlap = overlap_lines.max(1); // Minimum 1 line overlap
302
303    // Pre-compute cumulative byte offsets for O(1) lookup, accounting for different line endings
304    let mut line_byte_offsets = Vec::with_capacity(lines.len() + 1);
305    line_byte_offsets.push(0);
306    let mut cumulative_offset = 0;
307    let mut byte_pos = 0;
308
309    for line in lines.iter() {
310        cumulative_offset += line.len();
311
312        // Find the actual line ending length in the original text
313        let line_end_pos = byte_pos + line.len();
314        let newline_len = if line_end_pos < text.len() && text.as_bytes()[line_end_pos] == b'\r' {
315            if line_end_pos + 1 < text.len() && text.as_bytes()[line_end_pos + 1] == b'\n' {
316                2 // CRLF
317            } else {
318                1 // CR only (old Mac)
319            }
320        } else if line_end_pos < text.len() && text.as_bytes()[line_end_pos] == b'\n' {
321            1 // LF only (Unix)
322        } else {
323            0 // No newline at this position (could be last line without newline)
324        };
325
326        cumulative_offset += newline_len;
327        byte_pos = cumulative_offset;
328        line_byte_offsets.push(cumulative_offset);
329    }
330
331    let mut i = 0;
332    while i < lines.len() {
333        let end = (i + chunk_size).min(lines.len());
334        let chunk_lines = &lines[i..end];
335        let chunk_text = chunk_lines.join("\n");
336        let byte_start = line_byte_offsets[i];
337        let byte_end = line_byte_offsets[end];
338        let metadata = ChunkMetadata::from_text(&chunk_text);
339
340        chunks.push(Chunk {
341            span: Span {
342                byte_start,
343                byte_end,
344                line_start: i + 1,
345                line_end: end,
346            },
347            text: chunk_text,
348            chunk_type: ChunkType::Text,
349            stride_info: None,
350            metadata,
351        });
352
353        i += chunk_size - overlap;
354        if i >= lines.len() {
355            break;
356        }
357    }
358
359    Ok(chunks)
360}
361
/// Resolve the tree-sitter grammar for a [`ParseableLanguage`].
pub(crate) fn tree_sitter_language(language: ParseableLanguage) -> Result<tree_sitter::Language> {
    // tree-sitter-dart v0.0.4 uses an older API that returns Language directly,
    // while newer bindings (v0.24+) require calling .into() on a factory struct.
    if language == ParseableLanguage::Dart {
        return Ok(tree_sitter_dart::language());
    }

    let ts_language = match language {
        ParseableLanguage::Python => tree_sitter_python::LANGUAGE,
        // JavaScript is parsed with the TypeScript grammar (a superset of JS).
        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
            tree_sitter_typescript::LANGUAGE_TYPESCRIPT
        }
        ParseableLanguage::Haskell => tree_sitter_haskell::LANGUAGE,
        ParseableLanguage::Rust => tree_sitter_rust::LANGUAGE,
        ParseableLanguage::Ruby => tree_sitter_ruby::LANGUAGE,
        ParseableLanguage::Go => tree_sitter_go::LANGUAGE,
        ParseableLanguage::CSharp => tree_sitter_c_sharp::LANGUAGE,
        ParseableLanguage::Zig => tree_sitter_zig::LANGUAGE,

        ParseableLanguage::Dart => unreachable!("Handled above via early return"),

        ParseableLanguage::Elixir => tree_sitter_elixir::LANGUAGE,
    };

    Ok(ts_language.into())
}
388
389fn chunk_language(text: &str, language: ParseableLanguage) -> Result<Vec<Chunk>> {
390    let mut parser = tree_sitter::Parser::new();
391    let ts_language = tree_sitter_language(language)?;
392    parser.set_language(&ts_language)?;
393
394    let tree = parser
395        .parse(text, None)
396        .ok_or_else(|| anyhow::anyhow!("Failed to parse {} code", language))?;
397
398    let mut chunks = match query_chunker::chunk_with_queries(language, ts_language, &tree, text)? {
399        Some(query_chunks) if !query_chunks.is_empty() => query_chunks,
400        _ => {
401            let mut legacy_chunks = Vec::new();
402            let mut cursor = tree.walk();
403            extract_code_chunks(&mut cursor, text, &mut legacy_chunks, language);
404            legacy_chunks
405        }
406    };
407
408    if chunks.is_empty() {
409        return chunk_generic(text);
410    }
411
412    // Post-process Haskell chunks to merge function equations
413    if language == ParseableLanguage::Haskell {
414        chunks = merge_haskell_functions(chunks, text);
415    }
416
417    // Fill gaps between chunks with remainder content
418    chunks = fill_gaps(chunks, text);
419
420    Ok(chunks)
421}
422
423/// Fill gaps between chunks with remainder content
424/// This ensures that leading imports, trailing code, and content between functions gets indexed
425/// Combines contiguous gaps into single chunks (excluding standalone blank lines)
426fn fill_gaps(mut chunks: Vec<Chunk>, text: &str) -> Vec<Chunk> {
427    if chunks.is_empty() {
428        return chunks;
429    }
430
431    // Sort chunks by byte position to identify gaps
432    chunks.sort_by_key(|c| c.span.byte_start);
433
434    let mut result = Vec::new();
435    let mut last_end = 0;
436
437    // Collect all gaps, splitting on blank lines
438    let mut gaps = Vec::new();
439
440    for chunk in &chunks {
441        if last_end < chunk.span.byte_start {
442            // Split this gap by blank lines - use split to make it simple
443            let gap_start = last_end;
444            let gap_text = &text[gap_start..chunk.span.byte_start];
445
446            // Split on sequences of blank lines
447            let mut current_byte = gap_start;
448            let mut segment_start = gap_start;
449
450            for line in gap_text.split('\n') {
451                let line_start_in_gap = current_byte - gap_start;
452                let _line_end_in_gap = line_start_in_gap + line.len();
453
454                if line.trim().is_empty() {
455                    // Found a blank line - save segment before it if it has content
456                    if segment_start < current_byte {
457                        let segment_text = &text[segment_start..current_byte];
458                        if !segment_text.trim().is_empty() {
459                            gaps.push((segment_start, current_byte));
460                        }
461                    }
462                    // Next segment starts after this blank line and its newline
463                    segment_start = current_byte + line.len() + 1;
464                }
465
466                current_byte += line.len() + 1; // +1 for the \n
467            }
468
469            // Handle final segment (after last newline or if no newlines)
470            if segment_start < chunk.span.byte_start {
471                let remaining = &text[segment_start..chunk.span.byte_start];
472                if !remaining.trim().is_empty() {
473                    gaps.push((segment_start, chunk.span.byte_start));
474                }
475            }
476        }
477        last_end = chunk.span.byte_end;
478    }
479
480    // Handle trailing content
481    if last_end < text.len() {
482        let gap_text = &text[last_end..];
483        if !gap_text.trim().is_empty() {
484            gaps.push((last_end, text.len()));
485        }
486    }
487
488    let combined_gaps = gaps;
489
490    // Now interleave chunks and combined gap chunks
491    let mut gap_idx = 0;
492
493    for chunk in chunks {
494        // Add any gap chunks that come before this structural chunk
495        while gap_idx < combined_gaps.len() && combined_gaps[gap_idx].1 <= chunk.span.byte_start {
496            let (gap_start, gap_end) = combined_gaps[gap_idx];
497            let gap_text = &text[gap_start..gap_end];
498
499            // Calculate line numbers by counting newlines before each position
500            let line_start = text[..gap_start].matches('\n').count() + 1;
501            // For line_end, count newlines in the text including the gap
502            // This gives us the line number of the last line with gap content
503            let newlines_up_to_end = text[..gap_end].matches('\n').count();
504            let line_end = if newlines_up_to_end >= line_start - 1 {
505                newlines_up_to_end.max(line_start)
506            } else {
507                line_start
508            };
509
510            let gap_chunk = Chunk {
511                text: gap_text.to_string(),
512                span: Span {
513                    byte_start: gap_start,
514                    byte_end: gap_end,
515                    line_start,
516                    line_end,
517                },
518                chunk_type: ChunkType::Text,
519                metadata: ChunkMetadata::from_text(gap_text),
520                stride_info: None,
521            };
522            result.push(gap_chunk);
523            gap_idx += 1;
524        }
525
526        result.push(chunk.clone());
527    }
528
529    // Add any remaining gap chunks after the last structural chunk
530    while gap_idx < combined_gaps.len() {
531        let (gap_start, gap_end) = combined_gaps[gap_idx];
532        let gap_text = &text[gap_start..gap_end];
533
534        // Calculate line numbers by counting newlines before each position
535        let line_start = text[..gap_start].matches('\n').count() + 1;
536        // For line_end, count newlines in the text including the gap
537        let newlines_up_to_end = text[..gap_end].matches('\n').count();
538        let line_end = if newlines_up_to_end >= line_start - 1 {
539            newlines_up_to_end.max(line_start)
540        } else {
541            line_start
542        };
543
544        let gap_chunk = Chunk {
545            text: gap_text.to_string(),
546            span: Span {
547                byte_start: gap_start,
548                byte_end: gap_end,
549                line_start,
550                line_end,
551            },
552            chunk_type: ChunkType::Text,
553            metadata: ChunkMetadata::from_text(gap_text),
554            stride_info: None,
555        };
556        result.push(gap_chunk);
557        gap_idx += 1;
558    }
559
560    result
561}
562
563/// Merge Haskell function equations that belong to the same function definition
564fn merge_haskell_functions(chunks: Vec<Chunk>, source: &str) -> Vec<Chunk> {
565    if chunks.is_empty() {
566        return chunks;
567    }
568
569    let mut merged = Vec::new();
570    let mut i = 0;
571
572    while i < chunks.len() {
573        let chunk = &chunks[i];
574
575        // Skip chunks that are just fragments or comments
576        let trimmed = chunk.text.trim();
577        if trimmed.is_empty()
578            || trimmed.starts_with("--")
579            || trimmed.starts_with("{-")
580            || !chunk.text.contains(|c: char| c.is_alphanumeric())
581        {
582            i += 1;
583            continue;
584        }
585
586        // Extract function name from the chunk text
587        // Check if it's a signature first (contains ::)
588        let is_signature = chunk.text.contains("::");
589        let function_name = if is_signature {
590            // For signatures like "factorial :: Integer -> Integer", extract "factorial"
591            chunk
592                .text
593                .split("::")
594                .next()
595                .and_then(|s| s.split_whitespace().next())
596                .map(|s| s.to_string())
597        } else {
598            extract_haskell_function_name(&chunk.text)
599        };
600
601        if function_name.is_none() {
602            // Not a function (might be data, newtype, etc.), keep as-is
603            merged.push(chunk.clone());
604            i += 1;
605            continue;
606        }
607
608        let name = function_name.unwrap();
609        let group_start = chunk.span.byte_start;
610        let mut group_end = chunk.span.byte_end;
611        let line_start = chunk.span.line_start;
612        let mut line_end = chunk.span.line_end;
613        let mut trailing_trivia = chunk.metadata.trailing_trivia.clone();
614
615        // Look ahead for function equations with the same name
616        let mut j = i + 1;
617        while j < chunks.len() {
618            let next_chunk = &chunks[j];
619
620            // Skip comments
621            let next_trimmed = next_chunk.text.trim();
622            if next_trimmed.starts_with("--") || next_trimmed.starts_with("{-") {
623                j += 1;
624                continue;
625            }
626
627            let next_is_signature = next_chunk.text.contains("::");
628            let next_name = if next_is_signature {
629                next_chunk
630                    .text
631                    .split("::")
632                    .next()
633                    .and_then(|s| s.split_whitespace().next())
634                    .map(|s| s.to_string())
635            } else {
636                extract_haskell_function_name(&next_chunk.text)
637            };
638
639            if next_name == Some(name.clone()) {
640                // Extend the group to include this equation
641                group_end = next_chunk.span.byte_end;
642                line_end = next_chunk.span.line_end;
643                trailing_trivia = next_chunk.metadata.trailing_trivia.clone();
644                j += 1;
645            } else {
646                break;
647            }
648        }
649
650        // Create merged chunk
651        let merged_text = source.get(group_start..group_end).unwrap_or("").to_string();
652        let mut metadata = chunk.metadata.with_updated_text(&merged_text);
653        metadata.trailing_trivia = trailing_trivia;
654
655        merged.push(Chunk {
656            span: Span {
657                byte_start: group_start,
658                byte_end: group_end,
659                line_start,
660                line_end,
661            },
662            text: merged_text,
663            chunk_type: ChunkType::Function,
664            stride_info: None,
665            metadata,
666        });
667
668        i = j; // Skip past all merged chunks
669    }
670
671    merged
672}
673
/// Extract the function name from a Haskell function equation.
///
/// Equations start with the function name followed by patterns or `=`
/// (e.g. `factorial 0 = 1`, `map f [] = []`). Returns `None` when the first
/// token is not a valid Haskell identifier (which must begin with a lowercase
/// letter or underscore), so data/type/class declarations are rejected.
fn extract_haskell_function_name(text: &str) -> Option<String> {
    // First whitespace-separated token, with trailing non-identifier
    // characters (anything but alphanumerics, '_' and '\'') stripped.
    let candidate = text
        .split_whitespace()
        .next()?
        .trim_end_matches(|c: char| !c.is_alphanumeric() && c != '_' && c != '\'');

    // An empty candidate has no first char, so `?` rejects it here too.
    let first_char = candidate.chars().next()?;
    (first_char.is_lowercase() || first_char == '_').then(|| candidate.to_string())
}
698
/// Model-aware entry point for tree-sitter chunking.
fn chunk_language_with_model(
    text: &str,
    language: ParseableLanguage,
    _model_name: Option<&str>,
) -> Result<Vec<Chunk>> {
    // For now, language-based chunking doesn't need model-specific behavior
    // since it's based on semantic code boundaries rather than token counts.
    // We could potentially optimize this in the future by validating chunk token counts.
    chunk_language(text, language)
}
709
/// Legacy fallback chunker: recursively walk the tree-sitter CST under
/// `cursor`, collecting a chunk for each chunk-worthy node (per
/// `chunk_type_for_node` / `build_chunk`) and deduplicating by byte span.
fn extract_code_chunks(
    cursor: &mut tree_sitter::TreeCursor,
    source: &str,
    chunks: &mut Vec<Chunk>,
    language: ParseableLanguage,
) {
    let node = cursor.node();

    // For Haskell: skip "function" nodes that are nested anywhere inside "signature" nodes
    // (these are type expressions, not actual function definitions)
    let should_skip = if language == ParseableLanguage::Haskell && node.kind() == "function" {
        // Walk up parent chain to check if we're inside a signature.
        // NOTE: the early `return` skips the node AND its entire subtree.
        let mut current = node.parent();
        while let Some(parent) = current {
            if parent.kind() == "signature" {
                return; // Skip this node and don't recurse
            }
            current = parent.parent();
        }
        false
    } else {
        false
    };

    if !should_skip
        && let Some(initial_chunk_type) = chunk_type_for_node(language, &node)
        && let Some(chunk) = build_chunk(node, source, initial_chunk_type, language)
    {
        // Dedupe on exact byte span: nested visits can yield the same chunk.
        let is_duplicate = chunks.iter().any(|existing| {
            existing.span.byte_start == chunk.span.byte_start
                && existing.span.byte_end == chunk.span.byte_end
        });

        if !is_duplicate {
            chunks.push(chunk);
        }
    }

    // For Haskell signatures: don't recurse into children (they're just type expressions)
    let should_recurse = !(language == ParseableLanguage::Haskell && node.kind() == "signature");

    // Depth-first traversal: descend, visit each sibling, then restore the
    // cursor to this node so the caller's iteration continues correctly.
    if should_recurse && cursor.goto_first_child() {
        loop {
            extract_code_chunks(cursor, source, chunks, language);
            if !cursor.goto_next_sibling() {
                break;
            }
        }
        cursor.goto_parent();
    }
}
761
/// Decide whether a CST node should become a chunk, and of what [`ChunkType`].
///
/// Returns `None` for node kinds that are not chunk-worthy in `language`, and
/// for a few language-specific exclusions (Go locals, C# non-field variables).
fn chunk_type_for_node(
    language: ParseableLanguage,
    node: &tree_sitter::Node<'_>,
) -> Option<ChunkType> {
    let kind = node.kind();

    // Per-language allowlist of chunk-worthy node kinds.
    let supported = match language {
        ParseableLanguage::Python => matches!(kind, "function_definition" | "class_definition"),
        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => matches!(
            kind,
            "function_declaration" | "class_declaration" | "method_definition" | "arrow_function"
        ),
        ParseableLanguage::Haskell => matches!(
            kind,
            "function" // Capture function equations
                | "signature" // Capture type signatures (will be merged with functions)
                | "data_type"
                | "newtype"
                | "type_synonym"
                | "type_family"
                | "class"
                | "instance"
        ),
        ParseableLanguage::Rust => matches!(
            kind,
            "function_item" | "impl_item" | "struct_item" | "enum_item" | "trait_item" | "mod_item"
        ),
        ParseableLanguage::Ruby => {
            matches!(kind, "method" | "class" | "module" | "singleton_method")
        }
        ParseableLanguage::Go => matches!(
            kind,
            "function_declaration"
                | "method_declaration"
                | "type_declaration"
                | "var_declaration"
                | "const_declaration"
        ),
        ParseableLanguage::CSharp => matches!(
            kind,
            "method_declaration"
                | "class_declaration"
                | "interface_declaration"
                | "variable_declaration"
        ),
        ParseableLanguage::Dart => matches!(
            kind,
            "class_definition"
                | "class_declaration"
                | "mixin_declaration"
                | "enum_declaration"
                | "function_declaration"
                | "method_declaration"
                | "constructor_declaration"
                | "variable_declaration"
                | "local_variable_declaration"
                | "lambda_expression"
                | "class_member_definition"
        ),
        ParseableLanguage::Zig => matches!(
            kind,
            "function_declaration"
                | "test_declaration"
                | "variable_declaration"
                | "struct_declaration"
                | "enum_declaration"
                | "union_declaration"
                | "opaque_declaration"
                | "error_set_declaration"
                | "comptime_declaration"
        ),
        // Elixir uses "call" nodes for defmodule, def, defp, etc.
        // We handle this specially via query-based chunking
        ParseableLanguage::Elixir => matches!(kind, "call" | "do_block"),
    };

    if !supported {
        return None;
    }

    // Language-specific exclusions for otherwise-supported kinds.
    match language {
        // Go: skip var/const declarations local to a block (function body).
        ParseableLanguage::Go
            if matches!(node.kind(), "var_declaration" | "const_declaration")
                && node.parent().is_some_and(|p| p.kind() == "block") =>
        {
            return None;
        }
        // C#: only keep variable declarations that look like fields.
        ParseableLanguage::CSharp if node.kind() == "variable_declaration" => {
            if !is_csharp_field_like(*node) {
                return None;
            }
        }
        _ => {}
    }

    Some(classify_chunk_kind(kind))
}
859
/// Map a CST node-kind string onto a [`ChunkType`].
///
/// The kind lists cover every language in [`ParseableLanguage`]; unknown
/// kinds fall back to [`ChunkType::Text`].
fn classify_chunk_kind(kind: &str) -> ChunkType {
    match kind {
        "function_definition"
        | "function_declaration"
        | "arrow_function"
        | "function"
        | "function_item"
        | "def"
        | "defp"
        | "defn"
        | "defn-"
        | "method"
        | "singleton_method" => ChunkType::Function,
        "signature" => ChunkType::Function, // Haskell type signatures will be merged with functions
        "class_definition"
        | "class_declaration"
        | "instance_declaration"
        | "class"
        | "instance"
        | "struct_item"
        | "enum_item"
        | "defstruct"
        | "defrecord"
        | "deftype"
        | "type_declaration"
        | "struct_declaration"
        | "enum_declaration"
        | "union_declaration"
        | "opaque_declaration"
        | "error_set_declaration" => ChunkType::Class,
        "method_definition" | "method_declaration" | "defmacro" => ChunkType::Method,
        "data_type"
        | "newtype"
        | "type_synonym"
        | "type_family"
        | "impl_item"
        | "trait_item"
        | "mod_item"
        | "defmodule"
        | "module"
        | "defprotocol"
        | "interface_declaration"
        | "ns"
        | "var_declaration"
        | "const_declaration"
        | "variable_declaration"
        | "test_declaration"
        | "comptime_declaration" => ChunkType::Module,
        // Anything unrecognized is treated as plain text.
        _ => ChunkType::Text,
    }
}
911
/// Assemble a `Chunk` for `node`, widening it to include leading trivia
/// (doc comments, attributes, decorators) and recording trailing comments
/// in metadata.
///
/// Returns `None` when the resolved byte range is degenerate/out of bounds,
/// not on UTF-8 boundaries, or the covered text is pure whitespace.
pub(crate) fn build_chunk(
    node: tree_sitter::Node<'_>,
    source: &str,
    initial_type: ChunkType,
    language: ParseableLanguage,
) -> Option<Chunk> {
    // Language-specific widening (e.g. TS/JS arrow functions grab their binding).
    let target_node = adjust_node_for_language(node, language);
    // Leading trivia moves the chunk's start; trailing trivia is only recorded
    // in metadata and does not extend the byte span.
    let (byte_start, start_row, leading_segments) =
        extend_with_leading_trivia(target_node, language, source);
    let trailing_segments = collect_trailing_trivia(target_node, language, source);

    let byte_end = target_node.end_byte();
    let end_pos = target_node.end_position();

    // Guard against empty, reversed, or out-of-range spans.
    if byte_start >= byte_end || byte_end > source.len() {
        return None;
    }

    // Checked slice: bail out instead of panicking on a non-boundary offset.
    let text = source.get(byte_start..byte_end)?.to_string();

    if text.trim().is_empty() {
        return None;
    }

    // Function chunks nested in class-like containers become Method chunks.
    let chunk_type = adjust_chunk_type_for_context(target_node, initial_type, language);
    let ancestry = collect_ancestry(target_node, language, source);
    let leading_trivia = segments_to_strings(&leading_segments, source);
    let trailing_trivia = segments_to_strings(&trailing_segments, source);
    let metadata = ChunkMetadata::from_context(&text, ancestry, leading_trivia, trailing_trivia);

    Some(Chunk {
        span: Span {
            byte_start,
            byte_end,
            // tree-sitter rows are 0-based; spans are 1-based.
            line_start: start_row + 1,
            line_end: end_pos.row + 1,
        },
        text,
        chunk_type,
        stride_info: None,
        metadata,
    })
}
955
956fn adjust_node_for_language(
957    node: tree_sitter::Node<'_>,
958    language: ParseableLanguage,
959) -> tree_sitter::Node<'_> {
960    match language {
961        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
962            if node.kind() == "arrow_function" {
963                return expand_arrow_function_context(node);
964            }
965            node
966        }
967        _ => node,
968    }
969}
970
971fn expand_arrow_function_context(mut node: tree_sitter::Node<'_>) -> tree_sitter::Node<'_> {
972    const PARENTS: &[&str] = &[
973        "parenthesized_expression",
974        "variable_declarator",
975        "variable_declaration",
976        "lexical_declaration",
977        "assignment_expression",
978        "expression_statement",
979        "public_field_definition",
980        "export_statement",
981    ];
982
983    while let Some(parent) = node.parent() {
984        let kind = parent.kind();
985        if PARENTS.contains(&kind) {
986            node = parent;
987            continue;
988        }
989        break;
990    }
991
992    node
993}
994
/// Byte range of a piece of attached trivia (comment, attribute, decorator)
/// within the source text.
#[derive(Clone, Copy)]
struct TriviaSegment {
    // Inclusive start byte offset into the source.
    start_byte: usize,
    // Exclusive end byte offset into the source.
    end_byte: usize,
}
1000
1001fn extend_with_leading_trivia(
1002    node: tree_sitter::Node<'_>,
1003    language: ParseableLanguage,
1004    source: &str,
1005) -> (usize, usize, Vec<TriviaSegment>) {
1006    let mut start_byte = node.start_byte();
1007    let mut start_row = node.start_position().row;
1008    let mut current = node;
1009    let mut segments = Vec::new();
1010
1011    while let Some(prev) = current.prev_sibling() {
1012        if should_attach_leading_trivia(language, &prev)
1013            && only_whitespace_between(source, prev.end_byte(), start_byte)
1014        {
1015            start_byte = prev.start_byte();
1016            start_row = prev.start_position().row;
1017            segments.push(TriviaSegment {
1018                start_byte: prev.start_byte(),
1019                end_byte: prev.end_byte(),
1020            });
1021            current = prev;
1022            continue;
1023        }
1024        break;
1025    }
1026
1027    segments.reverse();
1028    (start_byte, start_row, segments)
1029}
1030
1031fn should_attach_leading_trivia(language: ParseableLanguage, node: &tree_sitter::Node<'_>) -> bool {
1032    let kind = node.kind();
1033    if kind == "comment" {
1034        return true;
1035    }
1036
1037    match language {
1038        ParseableLanguage::Rust => {
1039            matches!(kind, "line_comment" | "block_comment" | "attribute_item")
1040        }
1041        ParseableLanguage::Python => kind == "decorator",
1042        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => kind == "decorator",
1043        ParseableLanguage::CSharp => matches!(kind, "attribute_list" | "attribute"),
1044        _ => false,
1045    }
1046}
1047
1048fn collect_trailing_trivia(
1049    node: tree_sitter::Node<'_>,
1050    language: ParseableLanguage,
1051    source: &str,
1052) -> Vec<TriviaSegment> {
1053    let mut segments = Vec::new();
1054    let mut current = node;
1055    let mut previous_end = node.end_byte();
1056
1057    while let Some(next) = current.next_sibling() {
1058        if should_attach_trailing_trivia(language, &next)
1059            && only_whitespace_between(source, previous_end, next.start_byte())
1060        {
1061            segments.push(TriviaSegment {
1062                start_byte: next.start_byte(),
1063                end_byte: next.end_byte(),
1064            });
1065            previous_end = next.end_byte();
1066            current = next;
1067            continue;
1068        }
1069        break;
1070    }
1071
1072    segments
1073}
1074
1075fn should_attach_trailing_trivia(
1076    _language: ParseableLanguage,
1077    node: &tree_sitter::Node<'_>,
1078) -> bool {
1079    node.kind() == "comment"
1080}
1081
1082fn segments_to_strings(segments: &[TriviaSegment], source: &str) -> Vec<String> {
1083    let mut result = Vec::new();
1084
1085    for segment in segments {
1086        if let Some(text) = source
1087            .get(segment.start_byte..segment.end_byte)
1088            .map(|s| s.to_string())
1089        {
1090            result.push(text);
1091        }
1092    }
1093
1094    result
1095}
1096
1097fn collect_ancestry(
1098    mut node: tree_sitter::Node<'_>,
1099    language: ParseableLanguage,
1100    source: &str,
1101) -> Vec<String> {
1102    let mut parts = Vec::new();
1103
1104    while let Some(parent) = node.parent() {
1105        if let Some(parent_chunk_type) = chunk_type_for_node(language, &parent)
1106            && let Some(name) = display_name_for_node(parent, language, source, parent_chunk_type)
1107        {
1108            parts.push(name);
1109        }
1110        node = parent;
1111    }
1112
1113    parts.reverse();
1114    parts
1115}
1116
1117fn display_name_for_node(
1118    node: tree_sitter::Node<'_>,
1119    language: ParseableLanguage,
1120    source: &str,
1121    chunk_type: ChunkType,
1122) -> Option<String> {
1123    if let Some(name_node) = node.child_by_field_name("name") {
1124        return text_for_node(name_node, source);
1125    }
1126
1127    match language {
1128        ParseableLanguage::Rust => rust_display_name(node, source, chunk_type),
1129        ParseableLanguage::Python => find_identifier(node, source, &["identifier"]),
1130        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => find_identifier(
1131            node,
1132            source,
1133            &["identifier", "type_identifier", "property_identifier"],
1134        ),
1135        ParseableLanguage::Haskell => {
1136            find_identifier(node, source, &["identifier", "type_identifier", "variable"])
1137                .or_else(|| first_word_of_node(node, source))
1138        }
1139        ParseableLanguage::Ruby => find_identifier(node, source, &["identifier"]),
1140        ParseableLanguage::Go => find_identifier(node, source, &["identifier", "type_identifier"]),
1141        ParseableLanguage::CSharp => find_identifier(node, source, &["identifier"]),
1142        ParseableLanguage::Zig => find_identifier(node, source, &["identifier"]),
1143
1144        ParseableLanguage::Dart => {
1145            find_identifier(node, source, &["identifier", "type_identifier"])
1146        }
1147        ParseableLanguage::Elixir => {
1148            // Elixir names can be aliases (module names) or atoms/identifiers
1149            find_identifier(node, source, &["alias", "identifier", "atom"])
1150        }
1151    }
1152}
1153
1154fn rust_display_name(
1155    node: tree_sitter::Node<'_>,
1156    source: &str,
1157    chunk_type: ChunkType,
1158) -> Option<String> {
1159    match node.kind() {
1160        "impl_item" => {
1161            let mut parts = Vec::new();
1162            if let Some(ty) = node.child_by_field_name("type")
1163                && let Some(text) = text_for_node(ty, source)
1164            {
1165                parts.push(text);
1166            }
1167            if let Some(trait_node) = node.child_by_field_name("trait")
1168                && let Some(text) = text_for_node(trait_node, source)
1169            {
1170                if let Some(last) = parts.first() {
1171                    parts[0] = format!("{} (impl {})", last, text.trim());
1172                } else {
1173                    parts.push(format!("impl {}", text.trim()));
1174                }
1175            }
1176            if parts.is_empty() {
1177                find_identifier(node, source, &["identifier"])
1178            } else {
1179                Some(parts.remove(0))
1180            }
1181        }
1182        "mod_item" if chunk_type == ChunkType::Module => {
1183            find_identifier(node, source, &["identifier"])
1184        }
1185        _ => find_identifier(node, source, &["identifier", "type_identifier"]),
1186    }
1187}
1188
1189fn find_identifier(
1190    node: tree_sitter::Node<'_>,
1191    source: &str,
1192    candidate_kinds: &[&str],
1193) -> Option<String> {
1194    let mut cursor = node.walk();
1195    for child in node.children(&mut cursor) {
1196        if candidate_kinds.contains(&child.kind())
1197            && let Some(text) = text_for_node(child, source)
1198        {
1199            return Some(text.trim().to_string());
1200        }
1201    }
1202    None
1203}
1204
1205fn first_word_of_node(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
1206    let text = text_for_node(node, source)?;
1207    text.split_whitespace().next().map(|s| {
1208        s.trim_end_matches(|c: char| !c.is_alphanumeric() && c != '_')
1209            .to_string()
1210    })
1211}
1212
1213fn text_for_node(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
1214    node.utf8_text(source.as_bytes())
1215        .ok()
1216        .map(|s| s.to_string())
1217}
1218
/// True when the byte range `start..end` of `source` contains only
/// whitespace. Degenerate ranges (empty, reversed, or past the end) count as
/// whitespace so trivia attachment stays permissive.
///
/// Fix: slice with checked `get` instead of `source[start..end]`, which
/// panics when either offset is not a UTF-8 character boundary; such a range
/// is now treated like the other invalid ranges.
fn only_whitespace_between(source: &str, start: usize, end: usize) -> bool {
    if start >= end || end > source.len() {
        return true;
    }

    source
        .get(start..end)
        .map_or(true, |gap| gap.chars().all(char::is_whitespace))
}
1226
1227fn adjust_chunk_type_for_context(
1228    node: tree_sitter::Node<'_>,
1229    chunk_type: ChunkType,
1230    language: ParseableLanguage,
1231) -> ChunkType {
1232    if chunk_type != ChunkType::Function {
1233        return chunk_type;
1234    }
1235
1236    if is_method_context(node, language) {
1237        ChunkType::Method
1238    } else {
1239        chunk_type
1240    }
1241}
1242
1243fn is_method_context(node: tree_sitter::Node<'_>, language: ParseableLanguage) -> bool {
1244    const PYTHON_CONTAINERS: &[&str] = &["class_definition"];
1245    const TYPESCRIPT_CONTAINERS: &[&str] = &["class_body", "class_declaration"];
1246    const RUBY_CONTAINERS: &[&str] = &["class", "module"];
1247    const RUST_CONTAINERS: &[&str] = &["impl_item", "trait_item"];
1248    const DART_CONTAINERS: &[&str] = &[
1249        "class_definition",
1250        "class_declaration",
1251        "mixin_declaration",
1252        "enum_declaration",
1253    ];
1254
1255    match language {
1256        ParseableLanguage::Python => ancestor_has_kind(node, PYTHON_CONTAINERS),
1257        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
1258            ancestor_has_kind(node, TYPESCRIPT_CONTAINERS)
1259        }
1260        ParseableLanguage::Ruby => ancestor_has_kind(node, RUBY_CONTAINERS),
1261        ParseableLanguage::Rust => ancestor_has_kind(node, RUST_CONTAINERS),
1262        ParseableLanguage::Go => false,
1263        ParseableLanguage::CSharp => false,
1264        ParseableLanguage::Haskell => false,
1265        ParseableLanguage::Zig => false,
1266
1267        ParseableLanguage::Dart => ancestor_has_kind(node, DART_CONTAINERS),
1268
1269        ParseableLanguage::Elixir => false, // Elixir doesn't have class-based methods
1270    }
1271}
1272
1273fn ancestor_has_kind(node: tree_sitter::Node<'_>, kinds: &[&str]) -> bool {
1274    let mut current = node;
1275    while let Some(parent) = current.parent() {
1276        if kinds.contains(&parent.kind()) {
1277            return true;
1278        }
1279        current = parent;
1280    }
1281    false
1282}
1283
1284fn is_csharp_field_like(node: tree_sitter::Node<'_>) -> bool {
1285    if let Some(parent) = node.parent() {
1286        return matches!(
1287            parent.kind(),
1288            "field_declaration" | "event_field_declaration"
1289        );
1290    }
1291    false
1292}
1293
1294/// Apply striding to chunks that exceed the token limit
1295fn apply_striding(chunks: Vec<Chunk>, config: &ChunkConfig) -> Result<Vec<Chunk>> {
1296    let mut result = Vec::new();
1297
1298    for chunk in chunks {
1299        let estimated_tokens = estimate_tokens(&chunk.text);
1300
1301        if estimated_tokens <= config.max_tokens {
1302            // Chunk fits within limit, no striding needed
1303            result.push(chunk);
1304        } else {
1305            // Chunk exceeds limit, apply striding
1306            tracing::debug!(
1307                "Chunk with {} tokens exceeds limit of {}, applying striding",
1308                estimated_tokens,
1309                config.max_tokens
1310            );
1311
1312            let strided_chunks = stride_large_chunk(chunk, config)?;
1313            result.extend(strided_chunks);
1314        }
1315    }
1316
1317    Ok(result)
1318}
1319
1320/// Create strided chunks from a large chunk that exceeds token limits
1321fn stride_large_chunk(chunk: Chunk, config: &ChunkConfig) -> Result<Vec<Chunk>> {
1322    let text = &chunk.text;
1323
1324    // Early return for empty chunks to avoid divide-by-zero
1325    if text.is_empty() {
1326        return Ok(vec![chunk]);
1327    }
1328
1329    // Calculate stride parameters in characters (not bytes!)
1330    // Use a conservative estimate to ensure we stay under token limits
1331    let char_count = text.chars().count();
1332    let estimated_tokens = estimate_tokens(text);
1333    // Guard against zero token estimate to prevent divide-by-zero panic
1334    let chars_per_token = if estimated_tokens == 0 {
1335        4.5 // Use default average if estimation fails
1336    } else {
1337        char_count as f32 / estimated_tokens as f32
1338    };
1339    let window_chars = ((config.max_tokens as f32 * 0.9) * chars_per_token) as usize; // 10% buffer
1340    let overlap_chars = (config.stride_overlap as f32 * chars_per_token) as usize;
1341    let stride_chars = window_chars.saturating_sub(overlap_chars);
1342
1343    if stride_chars == 0 {
1344        return Err(anyhow::anyhow!("Stride size is too small"));
1345    }
1346
1347    // Build char to byte index mapping to handle UTF-8 safely
1348    let char_byte_indices: Vec<(usize, char)> = text.char_indices().collect();
1349    // Note: char_count is already calculated above, just reference it here
1350
1351    let mut strided_chunks = Vec::new();
1352    let original_chunk_id = format!("{}:{}", chunk.span.byte_start, chunk.span.byte_end);
1353    let mut start_char_idx = 0;
1354    let mut stride_index = 0;
1355
1356    // Calculate total number of strides
1357    let total_strides = if char_count <= window_chars {
1358        1
1359    } else {
1360        ((char_count - overlap_chars) as f32 / stride_chars as f32).ceil() as usize
1361    };
1362
1363    while start_char_idx < char_count {
1364        let end_char_idx = (start_char_idx + window_chars).min(char_count);
1365
1366        // Get byte positions from char indices
1367        let start_byte_pos = char_byte_indices[start_char_idx].0;
1368        let end_byte_pos = if end_char_idx < char_count {
1369            char_byte_indices[end_char_idx].0
1370        } else {
1371            text.len()
1372        };
1373
1374        let stride_text = &text[start_byte_pos..end_byte_pos];
1375
1376        // Calculate overlap information
1377        let overlap_start = if stride_index > 0 { overlap_chars } else { 0 };
1378        let overlap_end = if end_char_idx < char_count {
1379            overlap_chars
1380        } else {
1381            0
1382        };
1383
1384        // Calculate span for this stride
1385        let byte_offset_start = chunk.span.byte_start + start_byte_pos;
1386        let byte_offset_end = chunk.span.byte_start + end_byte_pos;
1387
1388        // Estimate line numbers (approximate)
1389        let text_before_start = &text[..start_byte_pos];
1390        let line_offset_start = text_before_start.lines().count().saturating_sub(1);
1391        let stride_lines = stride_text.lines().count();
1392        let metadata = chunk.metadata.with_updated_text(stride_text);
1393
1394        let stride_chunk = Chunk {
1395            span: Span {
1396                byte_start: byte_offset_start,
1397                byte_end: byte_offset_end,
1398                line_start: chunk.span.line_start + line_offset_start,
1399                // Fix: subtract 1 since stride_lines is a count but line_end should be inclusive
1400                line_end: chunk.span.line_start
1401                    + line_offset_start
1402                    + stride_lines.saturating_sub(1),
1403            },
1404            text: stride_text.to_string(),
1405            chunk_type: chunk.chunk_type.clone(),
1406            stride_info: Some(StrideInfo {
1407                original_chunk_id: original_chunk_id.clone(),
1408                stride_index,
1409                total_strides,
1410                overlap_start,
1411                overlap_end,
1412            }),
1413            metadata,
1414        };
1415
1416        strided_chunks.push(stride_chunk);
1417
1418        // Move to next stride
1419        if end_char_idx >= char_count {
1420            break;
1421        }
1422
1423        start_char_idx += stride_chars;
1424        stride_index += 1;
1425    }
1426
1427    tracing::debug!(
1428        "Created {} strides from chunk of {} tokens",
1429        strided_chunks.len(),
1430        estimate_tokens(text)
1431    );
1432
1433    Ok(strided_chunks)
1434}
1435
1436// Removed duplicate estimate_tokens function - using the one from ck-embed via TokenEstimator
1437
1438#[cfg(test)]
1439mod tests {
1440    use super::*;
1441
1442    fn canonicalize_spans(
1443        mut spans: Vec<(usize, usize, ChunkType)>,
1444    ) -> Vec<(usize, usize, ChunkType)> {
1445        fn chunk_type_order(chunk_type: &ChunkType) -> u8 {
1446            match chunk_type {
1447                ChunkType::Text => 0,
1448                ChunkType::Function => 1,
1449                ChunkType::Class => 2,
1450                ChunkType::Method => 3,
1451                ChunkType::Module => 4,
1452            }
1453        }
1454
1455        spans.sort_by(|a, b| {
1456            let order_a = chunk_type_order(&a.2);
1457            let order_b = chunk_type_order(&b.2);
1458            order_a
1459                .cmp(&order_b)
1460                .then_with(|| a.0.cmp(&b.0))
1461                .then_with(|| a.1.cmp(&b.1))
1462        });
1463
1464        let mut result: Vec<(usize, usize, ChunkType)> = Vec::new();
1465        for (start, end, ty) in spans {
1466            if let Some(last) = result.last_mut()
1467                && last.0 == start
1468                && last.2 == ty
1469            {
1470                if end > last.1 {
1471                    last.1 = end;
1472                }
1473                continue;
1474            }
1475            result.push((start, end, ty));
1476        }
1477
1478        result
1479    }
1480
1481    fn assert_query_parity(language: ParseableLanguage, source: &str) {
1482        let mut parser = tree_sitter::Parser::new();
1483        let ts_language = tree_sitter_language(language).expect("language");
1484        parser.set_language(&ts_language).expect("set language");
1485        let tree = parser.parse(source, None).expect("parse source");
1486
1487        let query_chunks = query_chunker::chunk_with_queries(language, ts_language, &tree, source)
1488            .expect("query execution")
1489            .expect("queries available");
1490
1491        let mut legacy_chunks = Vec::new();
1492        let mut cursor = tree.walk();
1493        extract_code_chunks(&mut cursor, source, &mut legacy_chunks, language);
1494
1495        let query_spans = canonicalize_spans(
1496            query_chunks
1497                .iter()
1498                .map(|chunk| {
1499                    (
1500                        chunk.span.byte_start,
1501                        chunk.span.byte_end,
1502                        chunk.chunk_type.clone(),
1503                    )
1504                })
1505                .collect(),
1506        );
1507        let legacy_spans = canonicalize_spans(
1508            legacy_chunks
1509                .iter()
1510                .map(|chunk| {
1511                    (
1512                        chunk.span.byte_start,
1513                        chunk.span.byte_end,
1514                        chunk.chunk_type.clone(),
1515                    )
1516                })
1517                .collect(),
1518        );
1519
1520        assert_eq!(query_spans, legacy_spans);
1521    }
1522
1523    #[test]
1524    fn test_chunk_generic_byte_offsets() {
1525        // Test that byte offsets are calculated correctly using O(n) algorithm
1526        let text = "line 1\nline 2\nline 3\nline 4\nline 5";
1527        let chunks = chunk_generic(text).unwrap();
1528
1529        assert!(!chunks.is_empty());
1530
1531        // First chunk should start at byte 0
1532        assert_eq!(chunks[0].span.byte_start, 0);
1533
1534        // Each chunk's byte_end should match the actual text length
1535        for chunk in &chunks {
1536            let expected_len = chunk.text.len();
1537            let actual_len = chunk.span.byte_end - chunk.span.byte_start;
1538            assert_eq!(actual_len, expected_len);
1539        }
1540    }
1541
1542    #[test]
1543    fn test_chunk_generic_large_file_performance() {
1544        // Create a large text to ensure O(n) performance
1545        let lines: Vec<String> = (0..1000)
1546            .map(|i| format!("Line {}: Some content here", i))
1547            .collect();
1548        let text = lines.join("\n");
1549
1550        let start = std::time::Instant::now();
1551        let chunks = chunk_generic(&text).unwrap();
1552        let duration = start.elapsed();
1553
1554        // Should complete quickly even for 1000 lines
1555        assert!(
1556            duration.as_millis() < 100,
1557            "Chunking took too long: {:?}",
1558            duration
1559        );
1560        assert!(!chunks.is_empty());
1561
1562        // Verify chunks have correct line numbers
1563        for chunk in &chunks {
1564            assert!(chunk.span.line_start > 0);
1565            assert!(chunk.span.line_end >= chunk.span.line_start);
1566        }
1567    }
1568
    // End-to-end Rust chunking over a small fixture: a struct, an impl with
    // two methods, a free function, and a module must all be represented in
    // the resulting chunk types.
    #[test]
    fn test_chunk_rust() {
        let rust_code = r#"
pub struct Calculator {
    memory: f64,
}

impl Calculator {
    pub fn new() -> Self {
        Calculator { memory: 0.0 }
    }
    
    pub fn add(&mut self, a: f64, b: f64) -> f64 {
        a + b
    }
}

fn main() {
    let calc = Calculator::new();
}

pub mod utils {
    pub fn helper() {}
}
"#;

        let chunks = chunk_language(rust_code, ParseableLanguage::Rust).unwrap();
        assert!(!chunks.is_empty());

        // Should find struct, impl, functions, and module
        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        assert!(chunk_types.contains(&&ChunkType::Class)); // struct
        assert!(chunk_types.contains(&&ChunkType::Module)); // impl and mod
        assert!(chunk_types.contains(&&ChunkType::Function)); // functions
    }
1604
    // Leading `///` doc comments must be pulled into the chunk of the item
    // they document (leading-trivia attachment for Rust).
    #[test]
    fn test_rust_doc_comments_attached() {
        let rust_code = r#"
/// Doc comment
pub struct Foo {}
"#;
        let chunks = chunk_language(rust_code, ParseableLanguage::Rust).unwrap();
        let struct_chunk = chunks
            .iter()
            .find(|c| c.text.contains("struct Foo"))
            .unwrap();
        assert!(
            struct_chunk.text.contains("/// Doc comment"),
            "Doc comment should be attached"
        );
    }
1621
    // Query-based and legacy chunkers must agree on Rust sources with nested
    // modules, impls, and free functions.
    #[test]
    fn test_rust_query_matches_legacy() {
        let source = r#"
            mod sample {
                struct Thing;

                impl Thing {
                    fn new() -> Self { Self }
                    fn helper(&self) {}
                }
            }

            fn util() {}
        "#;

        assert_query_parity(ParseableLanguage::Rust, source);
    }
1639
    // Query-based and legacy chunkers must agree on Python sources covering
    // classes, decorated classmethods, and sync/async functions.
    #[test]
    fn test_python_query_matches_legacy() {
        let source = r#"
class Example:
    @classmethod
    def build(cls):
        return cls()


def helper():
    return 1


async def async_helper():
    return 2
"#;

        assert_query_parity(ParseableLanguage::Python, source);
    }
1659
    // End-to-end Ruby chunking: classes, modules, instance/class/private
    // methods, and a top-level method must all yield chunks of the expected
    // types.
    #[test]
    fn test_chunk_ruby() {
        let ruby_code = r#"
class Calculator
  def initialize
    @memory = 0.0
  end

  def add(a, b)
    a + b
  end

  def self.class_method
    "class method"
  end

  private

  def private_method
    "private"
  end
end

module Utils
  def self.helper
    "helper"
  end
end

def main
  calc = Calculator.new
end
"#;

        let chunks = chunk_language(ruby_code, ParseableLanguage::Ruby).unwrap();
        assert!(!chunks.is_empty());

        // Should find class, module, and methods
        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        assert!(chunk_types.contains(&&ChunkType::Class)); // class
        assert!(chunk_types.contains(&&ChunkType::Module)); // module
        assert!(chunk_types.contains(&&ChunkType::Function)); // methods
    }
1703
1704    #[test]
1705    fn test_language_detection_fallback() {
1706        // Test that unknown languages fall back to generic chunking
1707        let generic_text = "Some text\nwith multiple lines\nto chunk generically";
1708
1709        let chunks_unknown = chunk_text(generic_text, None).unwrap();
1710        let chunks_generic = chunk_generic(generic_text).unwrap();
1711
1712        // Should produce the same result
1713        assert_eq!(chunks_unknown.len(), chunks_generic.len());
1714        assert_eq!(chunks_unknown[0].text, chunks_generic[0].text);
1715    }
1716
    // End-to-end Go chunking: const/var declarations map to Module, struct
    // and interface types to Class, free functions to Function, and receiver
    // functions to Method.
    #[test]
    fn test_chunk_go() {
        let go_code = r#"
package main

import "fmt"

const Pi = 3.14159

var memory float64

type Calculator struct {
    memory float64
}

type Operation interface {
    Calculate(a, b float64) float64
}

func NewCalculator() *Calculator {
    return &Calculator{memory: 0.0}
}

func (c *Calculator) Add(a, b float64) float64 {
    return a + b
}

func main() {
    calc := NewCalculator()
}
"#;

        let chunks = chunk_language(go_code, ParseableLanguage::Go).unwrap();
        assert!(!chunks.is_empty());

        // Should find const, var, type declarations, functions, and methods
        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        assert!(chunk_types.contains(&&ChunkType::Module)); // const and var
        assert!(chunk_types.contains(&&ChunkType::Class)); // struct and interface
        assert!(chunk_types.contains(&&ChunkType::Function)); // functions
        assert!(chunk_types.contains(&&ChunkType::Method)); // methods
    }
1759
    // Arrow functions must be widened to their enclosing declaration (so the
    // binding name and leading comments are included), and class-field arrows
    // must classify as methods. Currently ignored pending alignment with the
    // query-based chunker's behavior.
    #[test]
    #[ignore] // TODO: Update test to match query-based chunking behavior
    fn test_chunk_typescript_arrow_context() {
        let ts_code = r#"
// Utility function
export const util = () => {
    // comment about util
    return 42;
};

export class Example {
    // leading comment for method
    constructor() {}

    // Another comment
    run = () => {
        return util();
    };
}

const compute = (x: number) => x * 2;
"#;

        let chunks = chunk_language(ts_code, ParseableLanguage::TypeScript).unwrap();

        let util_chunk = chunks
            .iter()
            .find(|chunk| chunk.text.contains("export const util"))
            .expect("Expected chunk for util arrow function");
        assert_eq!(util_chunk.chunk_type, ChunkType::Function);
        assert!(
            util_chunk.text.contains("// Utility function"),
            "expected leading comment to be included"
        );
        assert!(util_chunk.text.contains("export const util ="));

        // The class field arrow function should be classified as a method and include its comment
        let method_chunk = chunks
            .iter()
            .find(|chunk| {
                chunk.chunk_type == ChunkType::Method && chunk.text.contains("run = () =>")
            })
            .expect("Expected chunk for class field arrow function");

        assert_eq!(method_chunk.chunk_type, ChunkType::Method);
        assert!(
            method_chunk.text.contains("// Another comment"),
            "expected inline comment to be included"
        );

        let compute_chunk = chunks
            .iter()
            .find(|chunk| chunk.text.contains("const compute"))
            .expect("Expected chunk for compute arrow function");
        assert_eq!(compute_chunk.chunk_type, ChunkType::Function);
        assert!(
            compute_chunk
                .text
                .contains("const compute = (x: number) => x * 2;")
        );

        // Ensure we don't create bare arrow-expression chunks without context
        assert!(
            chunks
                .iter()
                .all(|chunk| !chunk.text.trim_start().starts_with("() =>"))
        );
        assert!(
            chunks
                .iter()
                .all(|chunk| !chunk.text.trim_start().starts_with("(x: number) =>"))
        );
    }
1833
1834    // TODO: Query-based chunking is more accurate than legacy for TypeScript
1835    // and finds additional method chunks. This is the correct behavior.
1836    // Legacy parity tests are disabled until legacy chunking is updated.
1837    #[test]
1838    #[ignore]
1839    fn test_typescript_query_matches_legacy() {
1840        let source = r#"
1841export const util = () => {
1842    return 42;
1843};
1844
1845export class Example {
1846    run = () => {
1847        return util();
1848    };
1849}
1850
1851const compute = (x: number) => x * 2;
1852"#;
1853
1854        assert_query_parity(ParseableLanguage::TypeScript, source);
1855    }
1856
1857    #[test]
1858    fn test_ruby_query_matches_legacy() {
1859        let source = r#"
1860class Calculator
1861  def initialize
1862    @memory = 0.0
1863  end
1864
1865  def add(a, b)
1866    a + b
1867  end
1868
1869  def self.class_method
1870    "class method"
1871  end
1872end
1873"#;
1874
1875        assert_query_parity(ParseableLanguage::Ruby, source);
1876    }
1877
1878    #[test]
1879    fn test_go_query_matches_legacy() {
1880        let source = r#"
1881package main
1882
1883import "fmt"
1884
1885const Pi = 3.14159
1886
1887var memory float64
1888
1889type Calculator struct {
1890    memory float64
1891}
1892
1893func (c *Calculator) Add(a, b float64) float64 {
1894    return a + b
1895}
1896
1897func Helper() {}
1898"#;
1899
1900        assert_query_parity(ParseableLanguage::Go, source);
1901    }
1902
1903    #[test]
1904    fn test_haskell_query_matches_legacy() {
1905        let source = r#"
1906module Example where
1907
1908data Shape
1909  = Circle Float
1910  | Square Float
1911
1912type family Area a
1913
1914class Printable a where
1915    printValue :: a -> String
1916
1917instance Printable Shape where
1918    printValue (Circle _) = "circle"
1919    printValue (Square _) = "square"
1920
1921shapeDescription :: Shape -> String
1922shapeDescription (Circle r) = "circle of radius " ++ show r
1923shapeDescription (Square s) = "square of side " ++ show s
1924"#;
1925
1926        assert_query_parity(ParseableLanguage::Haskell, source);
1927    }
1928
1929    #[test]
1930    fn test_csharp_query_matches_legacy() {
1931        let source = r#"
1932namespace Calculator;
1933
1934public interface ICalculator 
1935{
1936    double Add(double x, double y);
1937}
1938
1939public class Calculator 
1940{
1941    public static double PI = 3.14159;
1942    private double _memory;
1943
1944    public Calculator() 
1945    {
1946        _memory = 0.0;
1947    }
1948
1949    public double Add(double x, double y) 
1950    {
1951        return x + y;
1952    }
1953}
1954"#;
1955
1956        assert_query_parity(ParseableLanguage::CSharp, source);
1957    }
1958
1959    #[test]
1960    fn test_zig_query_matches_legacy() {
1961        let source = r#"
1962const std = @import("std");
1963
1964const Calculator = struct {
1965    memory: f64,
1966
1967    pub fn init() Calculator {
1968        return Calculator{ .memory = 0.0 };
1969    }
1970
1971    pub fn add(self: *Calculator, a: f64, b: f64) f64 {
1972        return a + b;
1973    }
1974};
1975
1976test "calculator addition" {
1977    var calc = Calculator.init();
1978    const result = calc.add(2.0, 3.0);
1979    try std.testing.expect(result == 5.0);
1980}
1981"#;
1982
1983        assert_query_parity(ParseableLanguage::Zig, source);
1984    }
1985
1986    #[test]
1987    fn test_chunk_zig() {
1988        let zig_code = r#"
1989const std = @import("std");
1990
1991const Calculator = struct {
1992    memory: f64,
1993
1994    pub fn init() Calculator {
1995        return Calculator{ .memory = 0.0 };
1996    }
1997
1998    pub fn add(self: *Calculator, a: f64, b: f64) f64 {
1999        const result = a + b;
2000        self.memory = result;
2001        return result;
2002    }
2003};
2004
2005const Color = enum {
2006    Red,
2007    Green,
2008    Blue,
2009};
2010
2011const Value = union(enum) {
2012    int: i32,
2013    float: f64,
2014};
2015
2016const Handle = opaque {};
2017
2018const MathError = error{
2019    DivisionByZero,
2020    Overflow,
2021};
2022
2023pub fn multiply(a: i32, b: i32) i32 {
2024    return a * b;
2025}
2026
2027pub fn divide(a: i32, b: i32) MathError!i32 {
2028    if (b == 0) return error.DivisionByZero;
2029    return @divTrunc(a, b);
2030}
2031
2032comptime {
2033    @compileLog("Compile-time validation");
2034}
2035
2036pub fn main() !void {
2037    var calc = Calculator.init();
2038    const result = calc.add(2.0, 3.0);
2039    std.debug.print("Result: {}\n", .{result});
2040}
2041
2042test "calculator addition" {
2043    var calc = Calculator.init();
2044    const result = calc.add(2.0, 3.0);
2045    try std.testing.expect(result == 5.0);
2046}
2047
2048test "multiply function" {
2049    const result = multiply(3, 4);
2050    try std.testing.expect(result == 12);
2051}
2052"#;
2053
2054        let chunks = chunk_language(zig_code, ParseableLanguage::Zig).unwrap();
2055        assert!(!chunks.is_empty());
2056
2057        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
2058
2059        let class_count = chunk_types
2060            .iter()
2061            .filter(|&&t| t == &ChunkType::Class)
2062            .count();
2063        let function_count = chunk_types
2064            .iter()
2065            .filter(|&&t| t == &ChunkType::Function)
2066            .count();
2067        let module_count = chunk_types
2068            .iter()
2069            .filter(|&&t| t == &ChunkType::Module)
2070            .count();
2071
2072        assert!(
2073            class_count >= 5,
2074            "Expected at least 5 Class chunks (struct, enum, union, opaque, error set), found {}",
2075            class_count
2076        );
2077
2078        assert!(
2079            function_count >= 3,
2080            "Expected at least 3 functions (multiply, divide, main), found {}",
2081            function_count
2082        );
2083
2084        assert!(
2085            module_count >= 4,
2086            "Expected at least 4 module-type chunks (const std, comptime, 2 tests), found {}",
2087            module_count
2088        );
2089
2090        assert!(
2091            chunk_types.contains(&&ChunkType::Class),
2092            "Expected to find Class chunks"
2093        );
2094        assert!(
2095            chunk_types.contains(&&ChunkType::Function),
2096            "Expected to find Function chunks"
2097        );
2098        assert!(
2099            chunk_types.contains(&&ChunkType::Module),
2100            "Expected to find Module chunks"
2101        );
2102    }
2103
2104    #[test]
2105    fn test_chunk_csharp() {
2106        let csharp_code = r#"
2107namespace Calculator;
2108
2109public interface ICalculator 
2110{
2111    double Add(double x, double y);
2112}
2113
2114public class Calculator 
2115{
2116    public static const double PI = 3.14159;
2117    private double _memory;
2118
2119    public Calculator() 
2120    {
2121        _memory = 0.0;
2122    }
2123
2124    public double Add(double x, double y) 
2125    {
2126        return x + y;
2127    }
2128
2129    public static void Main(string[] args)
2130    {
2131        var calc = new Calculator();
2132    }
2133}
2134"#;
2135
2136        let chunks = chunk_language(csharp_code, ParseableLanguage::CSharp).unwrap();
2137        assert!(!chunks.is_empty());
2138
2139        // Should find variable, class, method and interface declarations
2140        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
2141        assert!(chunk_types.contains(&&ChunkType::Module)); // var, interface
2142        assert!(chunk_types.contains(&&ChunkType::Class)); // class
2143        assert!(chunk_types.contains(&&ChunkType::Method)); // methods
2144    }
2145
2146    #[test]
2147    fn test_stride_large_chunk_empty_text() {
2148        // Regression test for divide-by-zero bug in stride_large_chunk
2149        let empty_chunk = Chunk {
2150            span: Span {
2151                byte_start: 0,
2152                byte_end: 0,
2153                line_start: 1,
2154                line_end: 1,
2155            },
2156            text: String::new(), // Empty text should not panic
2157            chunk_type: ChunkType::Text,
2158            stride_info: None,
2159            metadata: ChunkMetadata::from_text(""),
2160        };
2161
2162        let config = ChunkConfig::default();
2163        let result = stride_large_chunk(empty_chunk.clone(), &config);
2164
2165        // Should not panic and return the original chunk
2166        assert!(result.is_ok());
2167        let chunks = result.unwrap();
2168        assert_eq!(chunks.len(), 1);
2169        assert_eq!(chunks[0].text, "");
2170    }
2171
2172    #[test]
2173    fn test_stride_large_chunk_zero_token_estimate() {
2174        // Regression test for zero token estimate causing divide-by-zero
2175        let chunk = Chunk {
2176            span: Span {
2177                byte_start: 0,
2178                byte_end: 5,
2179                line_start: 1,
2180                line_end: 1,
2181            },
2182            text: "     ".to_string(), // Whitespace that might return 0 tokens
2183            chunk_type: ChunkType::Text,
2184            stride_info: None,
2185            metadata: ChunkMetadata::from_text("     "),
2186        };
2187
2188        let config = ChunkConfig::default();
2189        let result = stride_large_chunk(chunk, &config);
2190
2191        // Should not panic and handle gracefully
2192        assert!(result.is_ok());
2193    }
2194
2195    #[test]
2196    fn test_strided_chunk_line_calculation() {
2197        // Regression test for line_end calculation in strided chunks
2198        // Create a chunk large enough to force striding
2199        let long_text = (1..=50).map(|i| format!("This is a longer line {} with more content to ensure token count is high enough", i)).collect::<Vec<_>>().join("\n");
2200
2201        let metadata = ChunkMetadata::from_text(&long_text);
2202        let chunk = Chunk {
2203            span: Span {
2204                byte_start: 0,
2205                byte_end: long_text.len(),
2206                line_start: 1,
2207                line_end: 50,
2208            },
2209            text: long_text,
2210            chunk_type: ChunkType::Text,
2211            stride_info: None,
2212            metadata,
2213        };
2214
2215        let config = ChunkConfig {
2216            max_tokens: 100,    // Force striding with reasonable limit
2217            stride_overlap: 10, // Small overlap for testing
2218            ..Default::default()
2219        };
2220
2221        let result = stride_large_chunk(chunk, &config);
2222        if let Err(e) = &result {
2223            eprintln!("Stride error: {}", e);
2224        }
2225        assert!(result.is_ok());
2226
2227        let chunks = result.unwrap();
2228        assert!(
2229            chunks.len() > 1,
2230            "Should create multiple chunks when striding"
2231        );
2232
2233        for chunk in chunks {
2234            // Verify line_end is not off by one
2235            // line_end should be inclusive and not exceed the actual content
2236            assert!(chunk.span.line_end >= chunk.span.line_start);
2237
2238            // Check that line span makes sense for the content
2239            let line_count = chunk.text.lines().count();
2240            if line_count > 0 {
2241                let calculated_line_span = chunk.span.line_end - chunk.span.line_start + 1;
2242
2243                // Allow some tolerance for striding logic
2244                assert!(
2245                    calculated_line_span <= line_count + 1,
2246                    "Line span {} should not exceed content lines {} by more than 1",
2247                    calculated_line_span,
2248                    line_count
2249                );
2250            }
2251        }
2252    }
2253
    #[test]
    fn test_gap_filling_coverage() {
        // Test that all non-whitespace content gets chunked
        // Each fixture mixes imports, standalone comments, a constant, a
        // function, and trailing comments after the last definition, so a
        // gap-filling regression shows up as uncovered non-whitespace bytes.
        let test_cases = vec![
            (
                ParseableLanguage::Rust,
                r#"// This is a test file with imports at the top
use std::collections::HashMap;
use std::sync::Arc;

// A comment between imports and code
const VERSION: &str = "1.0.0";

// Main function
fn main() {
    println!("Hello, world!");
}

// Some trailing content
// that should be indexed
"#,
            ),
            (
                ParseableLanguage::Python,
                r#"# Imports at the top
import os
import sys

# Some constant
VERSION = "1.0.0"

# Main function
def main():
    print("Hello, world!")

# Trailing comment
# should be indexed
"#,
            ),
            (
                ParseableLanguage::TypeScript,
                r#"// Imports at the top
import { foo } from 'bar';

// Some constant
const VERSION = "1.0.0";

// Main function
function main() {
    console.log("Hello, world!");
}

// Trailing comment
// should be indexed
"#,
            ),
        ];

        for (language, code) in test_cases {
            eprintln!("\n=== Testing {} ===", language);
            let chunks = chunk_language(code, language).unwrap();

            // Verify all non-whitespace bytes are covered
            // Mark each byte inside every chunk span; `take`/`skip` clamp the
            // range, so an out-of-bounds span cannot panic here.
            let mut covered_bytes = vec![false; code.len()];
            for chunk in &chunks {
                for item in covered_bytes
                    .iter_mut()
                    .take(chunk.span.byte_end)
                    .skip(chunk.span.byte_start)
                {
                    *item = true;
                }
            }

            // Collect byte offsets that no chunk covered, ignoring whitespace
            // (blank lines between chunks are allowed to be uncovered).
            let uncovered_non_ws: Vec<usize> = covered_bytes
                .iter()
                .enumerate()
                .filter(|(i, covered)| !**covered && !code.as_bytes()[*i].is_ascii_whitespace())
                .map(|(i, _)| i)
                .collect();

            // Diagnostic dump: show surrounding context for the first few
            // uncovered positions plus the full chunk list, so failures are
            // debuggable from the test output alone.
            if !uncovered_non_ws.is_empty() {
                eprintln!("\n=== UNCOVERED NON-WHITESPACE for {} ===", language);
                eprintln!("Total bytes: {}", code.len());
                eprintln!("Uncovered non-whitespace: {}", uncovered_non_ws.len());

                // Show what's uncovered
                for &pos in uncovered_non_ws.iter().take(10) {
                    let context_start = pos.saturating_sub(20);
                    let context_end = (pos + 20).min(code.len());
                    eprintln!(
                        "Uncovered at byte {}: {:?}",
                        pos,
                        &code[context_start..context_end]
                    );
                }

                eprintln!("\n=== CHUNKS ===");
                for (i, chunk) in chunks.iter().enumerate() {
                    eprintln!(
                        "Chunk {}: {:?} bytes {}-{} (len {})",
                        i,
                        chunk.chunk_type,
                        chunk.span.byte_start,
                        chunk.span.byte_end,
                        chunk.span.byte_end - chunk.span.byte_start
                    );
                    eprintln!("  Text: {:?}", &chunk.text[..chunk.text.len().min(60)]);
                }
            }

            assert!(
                uncovered_non_ws.is_empty(),
                "{}: Expected all non-whitespace covered but found {} uncovered non-whitespace bytes",
                language,
                uncovered_non_ws.len()
            );
        }
    }
2373
2374    #[test]
2375    fn test_web_server_file_coverage() {
2376        // Test that all non-whitespace content in web_server.rs is covered
2377        let code = std::fs::read_to_string("../examples/code/web_server.rs")
2378            .expect("Failed to read web_server.rs");
2379
2380        let chunks = chunk_language(&code, ParseableLanguage::Rust).unwrap();
2381
2382        // Check coverage for non-whitespace content only
2383        let mut covered = vec![false; code.len()];
2384        for chunk in &chunks {
2385            for item in covered
2386                .iter_mut()
2387                .take(chunk.span.byte_end)
2388                .skip(chunk.span.byte_start)
2389            {
2390                *item = true;
2391            }
2392        }
2393
2394        // Find uncovered bytes that are NOT whitespace
2395        let uncovered_non_whitespace: Vec<(usize, char)> = covered
2396            .iter()
2397            .enumerate()
2398            .filter(|(i, covered)| !**covered && !code.as_bytes()[*i].is_ascii_whitespace())
2399            .map(|(i, _)| (i, code.chars().nth(i).unwrap_or('?')))
2400            .collect();
2401
2402        if !uncovered_non_whitespace.is_empty() {
2403            eprintln!("\n=== WEB_SERVER.RS UNCOVERED NON-WHITESPACE ===");
2404            eprintln!("File size: {} bytes", code.len());
2405            eprintln!("Total chunks: {}", chunks.len());
2406            eprintln!(
2407                "Uncovered non-whitespace: {}",
2408                uncovered_non_whitespace.len()
2409            );
2410
2411            for &(pos, ch) in uncovered_non_whitespace.iter().take(10) {
2412                let start = pos.saturating_sub(30);
2413                let end = (pos + 30).min(code.len());
2414                eprintln!(
2415                    "\nUncovered '{}' at byte {}: {:?}",
2416                    ch,
2417                    pos,
2418                    &code[start..end]
2419                );
2420            }
2421
2422            eprintln!("\n=== CHUNKS ===");
2423            for (i, chunk) in chunks.iter().enumerate().take(20) {
2424                eprintln!(
2425                    "Chunk {}: {:?} bytes {}-{} lines {}-{}",
2426                    i,
2427                    chunk.chunk_type,
2428                    chunk.span.byte_start,
2429                    chunk.span.byte_end,
2430                    chunk.span.line_start,
2431                    chunk.span.line_end
2432                );
2433            }
2434        }
2435
2436        assert!(
2437            uncovered_non_whitespace.is_empty(),
2438            "Expected all non-whitespace content covered but found {} uncovered non-whitespace bytes",
2439            uncovered_non_whitespace.len()
2440        );
2441    }
2442
    #[test]
    fn test_haskell_function_chunking() {
        // Verifies that a Haskell function's type signature and all of its
        // equations (separate AST nodes) end up together in a single chunk.
        let haskell_code = r#"
factorial :: Integer -> Integer
factorial 0 = 1
factorial n = n * factorial (n - 1)

fibonacci :: Integer -> Integer
fibonacci 0 = 0
fibonacci 1 = 1
fibonacci n = fibonacci (n - 1) + fibonacci (n - 2)
"#;

        let mut parser = tree_sitter::Parser::new();
        parser
            .set_language(&tree_sitter_haskell::LANGUAGE.into())
            .unwrap();
        let tree = parser.parse(haskell_code, None).unwrap();

        // Debug: print tree structure
        // Recursively prints each node's kind and 1-based line range
        // (tree-sitter rows are 0-based), indented by depth, so parse-shape
        // changes are visible in the test output when the test fails.
        fn walk(node: tree_sitter::Node, _src: &str, depth: usize) {
            let kind = node.kind();
            let start = node.start_position();
            let end = node.end_position();
            eprintln!(
                "{}{:30} L{}-{}",
                "  ".repeat(depth),
                kind,
                start.row + 1,
                end.row + 1
            );

            let mut cursor = node.walk();
            if cursor.goto_first_child() {
                loop {
                    walk(cursor.node(), _src, depth + 1);
                    if !cursor.goto_next_sibling() {
                        break;
                    }
                }
            }
        }

        eprintln!("\n=== TREE STRUCTURE ===");
        walk(tree.root_node(), haskell_code, 0);
        eprintln!("=== END TREE ===\n");

        let chunks = chunk_language(haskell_code, ParseableLanguage::Haskell).unwrap();

        // Dump the produced chunks alongside the tree for debugging.
        eprintln!("\n=== CHUNKS ===");
        for (i, chunk) in chunks.iter().enumerate() {
            eprintln!(
                "Chunk {}: {:?} L{}-{}",
                i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
            );
            eprintln!("  Text: {:?}", chunk.text);
        }
        eprintln!("=== END CHUNKS ===\n");

        assert!(!chunks.is_empty(), "Should find chunks in Haskell code");

        // Find factorial chunk and verify it includes both signature and implementation
        let factorial_chunk = chunks.iter().find(|c| c.text.contains("factorial 0 = 1"));
        assert!(
            factorial_chunk.is_some(),
            "Should find factorial function body"
        );

        let fac = factorial_chunk.unwrap();
        assert!(
            fac.text.contains("factorial :: Integer -> Integer"),
            "Should include type signature"
        );
        assert!(
            fac.text.contains("factorial 0 = 1"),
            "Should include base case"
        );
        assert!(
            fac.text.contains("factorial n = n * factorial (n - 1)"),
            "Should include recursive case"
        );
    }
2525
2526    #[test]
2527    fn test_chunk_elixir_basic() {
2528        let elixir_code = r#"
2529defmodule Calculator do
2530  @moduledoc "A simple calculator module"
2531
2532  def add(a, b) do
2533    a + b
2534  end
2535
2536  defp multiply(a, b) do
2537    a * b
2538  end
2539end
2540"#;
2541
2542        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
2543
2544        eprintln!("\n=== ELIXIR CHUNKS ===");
2545        for (i, chunk) in chunks.iter().enumerate() {
2546            eprintln!(
2547                "Chunk {}: {:?} L{}-{}",
2548                i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
2549            );
2550            eprintln!("  Text: {:?}", &chunk.text[..chunk.text.len().min(80)]);
2551        }
2552        eprintln!("=== END CHUNKS ===\n");
2553
2554        assert!(!chunks.is_empty(), "Should find chunks in Elixir code");
2555
2556        // Should have module and function chunks
2557        let has_module = chunks.iter().any(|c| c.chunk_type == ChunkType::Module);
2558        let has_function = chunks.iter().any(|c| c.chunk_type == ChunkType::Function);
2559
2560        assert!(has_module, "Should detect defmodule as Module");
2561        assert!(has_function, "Should detect def/defp as Function");
2562    }
2563
2564    #[test]
2565    fn test_chunk_elixir_protocol() {
2566        let elixir_code = r#"
2567defprotocol Stringable do
2568  @doc "Converts to string"
2569  def to_string(value)
2570end
2571
2572defimpl Stringable, for: Integer do
2573  def to_string(value), do: Integer.to_string(value)
2574end
2575"#;
2576
2577        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
2578
2579        eprintln!("\n=== ELIXIR PROTOCOL CHUNKS ===");
2580        for (i, chunk) in chunks.iter().enumerate() {
2581            eprintln!(
2582                "Chunk {}: {:?} L{}-{}",
2583                i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
2584            );
2585            eprintln!("  Text: {:?}", &chunk.text[..chunk.text.len().min(80)]);
2586        }
2587        eprintln!("=== END CHUNKS ===\n");
2588
2589        // Should detect protocol and implementation as modules
2590        let modules: Vec<_> = chunks
2591            .iter()
2592            .filter(|c| c.chunk_type == ChunkType::Module)
2593            .collect();
2594
2595        assert!(
2596            modules.len() >= 2,
2597            "Should detect defprotocol and defimpl as modules, found {}",
2598            modules.len()
2599        );
2600    }
2601
2602    #[test]
2603    fn test_chunk_elixir_genserver() {
2604        let elixir_code = r#"
2605defmodule MyServer do
2606  use GenServer
2607
2608  def start_link(opts) do
2609    GenServer.start_link(__MODULE__, opts, name: __MODULE__)
2610  end
2611
2612  def init(state) do
2613    {:ok, state}
2614  end
2615
2616  def handle_call(:get, _from, state) do
2617    {:reply, state, state}
2618  end
2619
2620  def handle_cast({:set, value}, _state) do
2621    {:noreply, value}
2622  end
2623end
2624"#;
2625
2626        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
2627
2628        // Should capture all GenServer callbacks as functions
2629        let functions: Vec<_> = chunks
2630            .iter()
2631            .filter(|c| c.chunk_type == ChunkType::Function)
2632            .collect();
2633
2634        assert!(
2635            functions.len() >= 4,
2636            "Should detect at least 4 functions (start_link, init, handle_call, handle_cast), found {}",
2637            functions.len()
2638        );
2639    }
2640
2641    #[test]
2642    fn test_elixir_extension_detection() {
2643        use ck_core::Language;
2644
2645        assert_eq!(Language::from_extension("ex"), Some(Language::Elixir));
2646        assert_eq!(Language::from_extension("exs"), Some(Language::Elixir));
2647        assert_eq!(Language::from_extension("EX"), Some(Language::Elixir));
2648        assert_eq!(Language::from_extension("EXS"), Some(Language::Elixir));
2649    }
2650
2651    #[test]
2652    fn test_chunk_elixir_macros() {
2653        let elixir_code = r#"
2654defmodule MyMacros do
2655  defmacro unless(condition, do: block) do
2656    quote do
2657      if !unquote(condition), do: unquote(block)
2658    end
2659  end
2660
2661  defmacrop private_macro(x) do
2662    quote do: unquote(x) * 2
2663  end
2664end
2665"#;
2666
2667        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
2668
2669        let functions: Vec<_> = chunks
2670            .iter()
2671            .filter(|c| c.chunk_type == ChunkType::Function)
2672            .collect();
2673
2674        assert!(
2675            functions.len() >= 2,
2676            "Should detect defmacro and defmacrop as functions, found {}",
2677            functions.len()
2678        );
2679    }
2680
2681    #[test]
2682    fn test_chunk_elixir_module_attributes() {
2683        let elixir_code = r#"
2684defmodule Calculator do
2685  @moduledoc "A calculator with type specs"
2686
2687  @behaviour GenServer
2688
2689  @type operation :: :add | :subtract | :multiply | :divide
2690  @typep internal_state :: %{history: list()}
2691  @opaque result :: {:ok, number()} | {:error, atom()}
2692
2693  @callback init(args :: term()) :: {:ok, state :: term()}
2694  @callback handle_call(request :: term(), from :: term(), state :: term()) :: {:reply, term(), term()}
2695
2696  @optional_callbacks [handle_info: 2]
2697
2698  @spec add(number(), number()) :: number()
2699  def add(a, b), do: a + b
2700
2701  @spec subtract(number(), number()) :: number()
2702  def subtract(a, b), do: a - b
2703end
2704"#;
2705
2706        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
2707
2708        eprintln!("\n=== ELIXIR MODULE ATTRIBUTES CHUNKS ===");
2709        for (i, chunk) in chunks.iter().enumerate() {
2710            eprintln!(
2711                "Chunk {}: {:?} L{}-{}",
2712                i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
2713            );
2714            eprintln!("  Text: {:?}", &chunk.text[..chunk.text.len().min(80)]);
2715        }
2716        eprintln!("=== END CHUNKS ===\n");
2717
2718        // Check for @behaviour
2719        let has_behaviour = chunks
2720            .iter()
2721            .any(|c| c.chunk_type == ChunkType::Text && c.text.contains("@behaviour GenServer"));
2722        assert!(has_behaviour, "Should capture @behaviour declaration");
2723
2724        // Check for @type definitions
2725        let type_chunks: Vec<_> = chunks
2726            .iter()
2727            .filter(|c| {
2728                c.chunk_type == ChunkType::Text
2729                    && (c.text.contains("@type")
2730                        || c.text.contains("@typep")
2731                        || c.text.contains("@opaque"))
2732            })
2733            .collect();
2734        assert!(
2735            type_chunks.len() >= 3,
2736            "Should capture @type, @typep, and @opaque, found {}",
2737            type_chunks.len()
2738        );
2739
2740        // Check for @callback definitions
2741        let callback_chunks: Vec<_> = chunks
2742            .iter()
2743            .filter(|c| c.chunk_type == ChunkType::Text && c.text.contains("@callback"))
2744            .collect();
2745        assert!(
2746            callback_chunks.len() >= 2,
2747            "Should capture @callback definitions, found {}",
2748            callback_chunks.len()
2749        );
2750
2751        // Check for @spec definitions
2752        let spec_chunks: Vec<_> = chunks
2753            .iter()
2754            .filter(|c| c.chunk_type == ChunkType::Text && c.text.contains("@spec"))
2755            .collect();
2756        assert!(
2757            spec_chunks.len() >= 2,
2758            "Should capture @spec definitions, found {}",
2759            spec_chunks.len()
2760        );
2761
2762        // Verify we still capture the functions
2763        let function_chunks: Vec<_> = chunks
2764            .iter()
2765            .filter(|c| c.chunk_type == ChunkType::Function)
2766            .collect();
2767        assert!(
2768            function_chunks.len() >= 2,
2769            "Should still capture def functions, found {}",
2770            function_chunks.len()
2771        );
2772    }
2773
2774    #[test]
2775    fn test_chunk_elixir_behavior_spelling() {
2776        // Test both British and American spellings
2777        let elixir_code = r#"
2778defmodule BritishModule do
2779  @behaviour GenServer
2780end
2781
2782defmodule AmericanModule do
2783  @behavior GenServer
2784end
2785"#;
2786
2787        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
2788
2789        let behaviour_chunks: Vec<_> = chunks
2790            .iter()
2791            .filter(|c| {
2792                c.chunk_type == ChunkType::Text
2793                    && (c.text.contains("@behaviour") || c.text.contains("@behavior"))
2794            })
2795            .collect();
2796
2797        assert!(
2798            behaviour_chunks.len() >= 2,
2799            "Should capture both @behaviour and @behavior spellings, found {}",
2800            behaviour_chunks.len()
2801        );
2802    }
2803}