Skip to main content

ck_chunk/
lib.rs

1use anyhow::Result;
2use ck_core::Span;
3use serde::{Deserialize, Serialize};
4
5mod query_chunker;
6
7/// Import token estimation from ck-embed
8pub use ck_embed::TokenEstimator;
9
/// Estimates the number of tokens in `text`.
///
/// Thin wrapper that delegates to `TokenEstimator::estimate_tokens`, which is
/// re-exported from `ck-embed` at the top of this file.
fn estimate_tokens(text: &str) -> usize {
    TokenEstimator::estimate_tokens(text)
}
14
/// Returns the chunking parameters `(target_tokens, overlap_tokens)` for an embedding model.
///
/// Two profiles exist, both with roughly 20% overlap:
/// - a compact profile `(400, 80)` for the BGE family and `all-MiniLM-L6-v2`, and
/// - a large-context profile `(1024, 200)` for everything else, including the
///   nomic/jina models, unknown model names, and `None`.
pub fn get_model_chunk_config(model_name: Option<&str>) -> (usize, usize) {
    // Smaller chunks favour retrieval precision.
    const COMPACT: (usize, usize) = (400, 80);
    // Enough context to be meaningful while still small enough to be precise.
    const LARGE_CONTEXT: (usize, usize) = (1024, 200);

    let wants_compact = matches!(
        model_name,
        Some(
            "BAAI/bge-small-en-v1.5"
                | "sentence-transformers/all-MiniLM-L6-v2"
                | "BAAI/bge-base-en-v1.5"
                | "BAAI/bge-large-en-v1.5"
        )
    );

    if wants_compact {
        COMPACT
    } else {
        // Covers the nomic/jina models, any unrecognised name, and the `None` default.
        LARGE_CONTEXT
    }
}
41
/// Information about chunk striding for large chunks that exceed token limits.
///
/// Stored in a chunk's `stride_info` field when that chunk was created by
/// striding a larger chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StrideInfo {
    /// Unique ID for the original chunk before striding
    pub original_chunk_id: String,
    /// Index of this stride (0-based)
    pub stride_index: usize,
    /// Total number of strides for the original chunk
    pub total_strides: usize,
    /// Byte offset where overlap with previous stride begins
    pub overlap_start: usize,
    /// Byte offset where overlap with next stride ends
    pub overlap_end: usize,
}
56
/// Descriptive metadata stored alongside a chunk's text.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ChunkMetadata {
    /// Names that are joined with `::` to build `breadcrumb`.
    /// NOTE(review): presumably the enclosing scopes, outermost first — confirm with the producers.
    pub ancestry: Vec<String>,
    /// `ancestry` joined with `::`, or `None` when `ancestry` is empty.
    pub breadcrumb: Option<String>,
    /// NOTE(review): presumably comments/attributes directly before the chunk — confirm with the producers.
    pub leading_trivia: Vec<String>,
    /// NOTE(review): presumably comments directly after the chunk — confirm with the producers.
    pub trailing_trivia: Vec<String>,
    /// Length of the chunk text in bytes.
    pub byte_length: usize,
    /// Token count of the chunk text as reported by `estimate_tokens`.
    pub estimated_tokens: usize,
}
66
67impl ChunkMetadata {
68    fn from_context(
69        text: &str,
70        ancestry: Vec<String>,
71        leading_trivia: Vec<String>,
72        trailing_trivia: Vec<String>,
73    ) -> Self {
74        let breadcrumb = if ancestry.is_empty() {
75            None
76        } else {
77            Some(ancestry.join("::"))
78        };
79
80        Self {
81            ancestry,
82            breadcrumb,
83            leading_trivia,
84            trailing_trivia,
85            byte_length: text.len(),
86            estimated_tokens: estimate_tokens(text),
87        }
88    }
89
90    fn from_text(text: &str) -> Self {
91        Self {
92            ancestry: Vec::new(),
93            breadcrumb: None,
94            leading_trivia: Vec::new(),
95            trailing_trivia: Vec::new(),
96            byte_length: text.len(),
97            estimated_tokens: estimate_tokens(text),
98        }
99    }
100
101    fn with_updated_text(&self, text: &str) -> Self {
102        let mut cloned = self.clone();
103        cloned.byte_length = text.len();
104        cloned.estimated_tokens = estimate_tokens(text);
105        cloned
106    }
107}
108
/// A piece of the input text together with its location, kind, and metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// Byte range and line range of this chunk within the input text.
    pub span: Span,
    /// The chunk's text content.
    pub text: String,
    /// Kind of construct this chunk represents.
    pub chunk_type: ChunkType,
    /// Stride information if this chunk was created by striding a larger chunk
    pub stride_info: Option<StrideInfo>,
    /// Ancestry, trivia, and size information for this chunk.
    pub metadata: ChunkMetadata,
}
118
/// Classification of a chunk.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkType {
    /// Plain text; used by generic line-based chunking and for gap-filling chunks.
    Text,
    /// A function definition.
    Function,
    /// A class-like definition.
    Class,
    /// A method definition.
    Method,
    /// A module-level construct.
    Module,
}
127
/// Languages for which a tree-sitter grammar is available in this crate.
///
/// Converted from `ck_core::Language` via `TryFrom`; languages without a
/// variant here are chunked with the generic line-based strategy instead.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ParseableLanguage {
    Python,
    TypeScript,
    JavaScript,
    Haskell,
    Rust,
    Ruby,
    Go,
    C,
    Cpp,
    CSharp,
    Zig,

    Dart,

    Elixir,

    Markdown,
}
148
149impl std::fmt::Display for ParseableLanguage {
150    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
151        let name = match self {
152            ParseableLanguage::Python => "python",
153            ParseableLanguage::TypeScript => "typescript",
154            ParseableLanguage::JavaScript => "javascript",
155            ParseableLanguage::Haskell => "haskell",
156            ParseableLanguage::Rust => "rust",
157            ParseableLanguage::Ruby => "ruby",
158            ParseableLanguage::Go => "go",
159            ParseableLanguage::C => "c",
160            ParseableLanguage::Cpp => "cpp",
161            ParseableLanguage::CSharp => "csharp",
162            ParseableLanguage::Zig => "zig",
163
164            ParseableLanguage::Dart => "dart",
165
166            ParseableLanguage::Elixir => "elixir",
167
168            ParseableLanguage::Markdown => "markdown",
169        };
170        write!(f, "{name}")
171    }
172}
173
174impl TryFrom<ck_core::Language> for ParseableLanguage {
175    type Error = anyhow::Error;
176
177    fn try_from(lang: ck_core::Language) -> Result<Self, Self::Error> {
178        match lang {
179            ck_core::Language::Python => Ok(ParseableLanguage::Python),
180            ck_core::Language::TypeScript => Ok(ParseableLanguage::TypeScript),
181            ck_core::Language::JavaScript => Ok(ParseableLanguage::JavaScript),
182            ck_core::Language::Haskell => Ok(ParseableLanguage::Haskell),
183            ck_core::Language::Rust => Ok(ParseableLanguage::Rust),
184            ck_core::Language::Ruby => Ok(ParseableLanguage::Ruby),
185            ck_core::Language::Go => Ok(ParseableLanguage::Go),
186            ck_core::Language::C => Ok(ParseableLanguage::C),
187            ck_core::Language::Cpp => Ok(ParseableLanguage::Cpp),
188            ck_core::Language::CSharp => Ok(ParseableLanguage::CSharp),
189            ck_core::Language::Zig => Ok(ParseableLanguage::Zig),
190
191            ck_core::Language::Dart => Ok(ParseableLanguage::Dart),
192
193            ck_core::Language::Elixir => Ok(ParseableLanguage::Elixir),
194
195            ck_core::Language::Markdown => Ok(ParseableLanguage::Markdown),
196
197            _ => Err(anyhow::anyhow!(
198                "Language {lang:?} is not supported for parsing"
199            )),
200        }
201    }
202}
203
/// Chunks `text` using the default [`ChunkConfig`].
///
/// `language` selects the parser-based strategy when it is supported;
/// otherwise (or when `None`) the generic line-based strategy is used.
///
/// # Errors
///
/// Propagates any error returned by [`chunk_text_with_config`].
pub fn chunk_text(text: &str, language: Option<ck_core::Language>) -> Result<Vec<Chunk>> {
    chunk_text_with_config(text, language, &ChunkConfig::default())
}
207
/// Configuration for chunking behavior
///
/// Controls the striding pass applied after the initial chunks are produced.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Maximum tokens per chunk (for striding)
    pub max_tokens: usize,
    /// Overlap size for striding (in tokens)
    pub stride_overlap: usize,
    /// Enable striding for chunks that exceed max_tokens
    pub enable_striding: bool,
}
218
impl Default for ChunkConfig {
    /// Returns the default configuration: 8192 max tokens, 1024 tokens of
    /// stride overlap, and striding enabled.
    fn default() -> Self {
        Self {
            max_tokens: 8192,     // Default to Nomic model limit
            stride_overlap: 1024, // 12.5% overlap
            enable_striding: true,
        }
    }
}
228
229/// New function that accepts model name for model-specific chunking
230pub fn chunk_text_with_model(
231    text: &str,
232    language: Option<ck_core::Language>,
233    model_name: Option<&str>,
234) -> Result<Vec<Chunk>> {
235    let (target_tokens, overlap_tokens) = get_model_chunk_config(model_name);
236
237    // Create a config based on model-specific parameters
238    let config = ChunkConfig {
239        max_tokens: target_tokens,
240        stride_overlap: overlap_tokens,
241        enable_striding: true,
242    };
243
244    chunk_text_with_config_and_model(text, language, &config, model_name)
245}
246
/// Chunks `text` with an explicit [`ChunkConfig`] and no model name.
///
/// Because no model is given, model-dependent sizing falls back to the
/// defaults of `get_model_chunk_config(None)`.
///
/// # Errors
///
/// Propagates any error from the underlying chunking or striding steps.
pub fn chunk_text_with_config(
    text: &str,
    language: Option<ck_core::Language>,
    config: &ChunkConfig,
) -> Result<Vec<Chunk>> {
    chunk_text_with_config_and_model(text, language, config, None)
}
254
255fn chunk_text_with_config_and_model(
256    text: &str,
257    language: Option<ck_core::Language>,
258    config: &ChunkConfig,
259    model_name: Option<&str>,
260) -> Result<Vec<Chunk>> {
261    tracing::debug!(
262        "Chunking text with language: {:?}, length: {} chars, config: {:?}",
263        language,
264        text.len(),
265        config
266    );
267
268    let result = match language.map(ParseableLanguage::try_from) {
269        Some(Ok(lang)) => {
270            tracing::debug!("Using {} tree-sitter parser", lang);
271            chunk_language_with_model(text, lang, model_name)
272        }
273        Some(Err(_)) => {
274            tracing::debug!("Language not supported for parsing, using generic chunking strategy");
275            chunk_generic_with_token_config(text, model_name)
276        }
277        None => {
278            tracing::debug!("Using generic chunking strategy");
279            chunk_generic_with_token_config(text, model_name)
280        }
281    };
282
283    let mut chunks = result?;
284
285    // Apply striding if enabled and necessary
286    if config.enable_striding {
287        chunks = apply_striding(chunks, config)?;
288    }
289
290    tracing::debug!("Successfully created {} final chunks", chunks.len());
291    Ok(chunks)
292}
293
/// Chunks `text` with the generic line-based strategy using the default model sizing.
fn chunk_generic(text: &str) -> Result<Vec<Chunk>> {
    chunk_generic_with_token_config(text, None)
}
297
298fn chunk_generic_with_token_config(text: &str, model_name: Option<&str>) -> Result<Vec<Chunk>> {
299    let mut chunks = Vec::new();
300    let lines: Vec<&str> = text.lines().collect();
301
302    // Get model-specific optimal chunk size in tokens
303    let (target_tokens, overlap_tokens) = get_model_chunk_config(model_name);
304
305    // Convert token targets to approximate line counts
306    // This is a rough heuristic - we'll validate with actual token counting
307    let avg_tokens_per_line = 10.0; // Rough estimate for code
308    let target_lines = ((target_tokens as f32) / avg_tokens_per_line) as usize;
309    let overlap_lines = ((overlap_tokens as f32) / avg_tokens_per_line) as usize;
310
311    let chunk_size = target_lines.max(5); // Minimum 5 lines
312    let overlap = overlap_lines.max(1); // Minimum 1 line overlap
313
314    // Pre-compute cumulative byte offsets for O(1) lookup, accounting for different line endings
315    let mut line_byte_offsets = Vec::with_capacity(lines.len() + 1);
316    line_byte_offsets.push(0);
317    let mut cumulative_offset = 0;
318    let mut byte_pos = 0;
319
320    for line in lines.iter() {
321        cumulative_offset += line.len();
322
323        // Find the actual line ending length in the original text
324        let line_end_pos = byte_pos + line.len();
325        let newline_len = if line_end_pos < text.len() && text.as_bytes()[line_end_pos] == b'\r' {
326            if line_end_pos + 1 < text.len() && text.as_bytes()[line_end_pos + 1] == b'\n' {
327                2 // CRLF
328            } else {
329                1 // CR only (old Mac)
330            }
331        } else if line_end_pos < text.len() && text.as_bytes()[line_end_pos] == b'\n' {
332            1 // LF only (Unix)
333        } else {
334            0 // No newline at this position (could be last line without newline)
335        };
336
337        cumulative_offset += newline_len;
338        byte_pos = cumulative_offset;
339        line_byte_offsets.push(cumulative_offset);
340    }
341
342    let mut i = 0;
343    while i < lines.len() {
344        let end = (i + chunk_size).min(lines.len());
345        let chunk_lines = &lines[i..end];
346        let chunk_text = chunk_lines.join("\n");
347        let byte_start = line_byte_offsets[i];
348        let byte_end = line_byte_offsets[end];
349        let metadata = ChunkMetadata::from_text(&chunk_text);
350
351        chunks.push(Chunk {
352            span: Span {
353                byte_start,
354                byte_end,
355                line_start: i + 1,
356                line_end: end,
357            },
358            text: chunk_text,
359            chunk_type: ChunkType::Text,
360            stride_info: None,
361            metadata,
362        });
363
364        i += chunk_size - overlap;
365        if i >= lines.len() {
366            break;
367        }
368    }
369
370    Ok(chunks)
371}
372
373pub(crate) fn tree_sitter_language(language: ParseableLanguage) -> Result<tree_sitter::Language> {
374    // tree-sitter-dart v0.0.4 uses an older API that returns Language directly,
375    // while newer bindings (v0.24+) require calling .into() on a factory struct.
376    if language == ParseableLanguage::Dart {
377        return Ok(tree_sitter_dart::language());
378    }
379
380    if language == ParseableLanguage::Markdown {
381        return Ok(tree_sitter_md::LANGUAGE.into());
382    }
383
384    let ts_language = match language {
385        ParseableLanguage::Python => tree_sitter_python::LANGUAGE,
386        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
387            tree_sitter_typescript::LANGUAGE_TYPESCRIPT
388        }
389        ParseableLanguage::Haskell => tree_sitter_haskell::LANGUAGE,
390        ParseableLanguage::Rust => tree_sitter_rust::LANGUAGE,
391        ParseableLanguage::Ruby => tree_sitter_ruby::LANGUAGE,
392        ParseableLanguage::Go => tree_sitter_go::LANGUAGE,
393        ParseableLanguage::C => tree_sitter_c::LANGUAGE,
394        ParseableLanguage::Cpp => tree_sitter_cpp::LANGUAGE,
395        ParseableLanguage::CSharp => tree_sitter_c_sharp::LANGUAGE,
396        ParseableLanguage::Zig => tree_sitter_zig::LANGUAGE,
397
398        ParseableLanguage::Dart => unreachable!("Handled above via early return"),
399
400        ParseableLanguage::Elixir => tree_sitter_elixir::LANGUAGE,
401
402        ParseableLanguage::Markdown => unreachable!("Handled above via early return"),
403    };
404
405    Ok(ts_language.into())
406}
407
408fn chunk_language(text: &str, language: ParseableLanguage) -> Result<Vec<Chunk>> {
409    let mut parser = tree_sitter::Parser::new();
410    let ts_language = tree_sitter_language(language)?;
411    parser.set_language(&ts_language)?;
412
413    let tree = parser
414        .parse(text, None)
415        .ok_or_else(|| anyhow::anyhow!("Failed to parse {language} code"))?;
416
417    let mut chunks = match query_chunker::chunk_with_queries(language, ts_language, &tree, text)? {
418        Some(query_chunks) if !query_chunks.is_empty() => query_chunks,
419        _ => {
420            let mut legacy_chunks = Vec::new();
421            let mut cursor = tree.walk();
422            extract_code_chunks(&mut cursor, text, &mut legacy_chunks, language);
423            legacy_chunks
424        }
425    };
426
427    if chunks.is_empty() {
428        return chunk_generic(text);
429    }
430
431    // Post-process Haskell chunks to merge function equations
432    if language == ParseableLanguage::Haskell {
433        chunks = merge_haskell_functions(chunks, text);
434    }
435
436    // Suppress text chunks fully contained by class/method/function chunks for C/C++
437    if matches!(language, ParseableLanguage::C | ParseableLanguage::Cpp) {
438        chunks = suppress_contained_text_chunks(chunks);
439    }
440
441    // Fill gaps between chunks with remainder content
442    chunks = fill_gaps(chunks, text);
443
444    // Merge template-prefix gap chunks into the following C++ definition chunk
445    if language == ParseableLanguage::Cpp {
446        chunks = merge_cpp_template_prefix_chunks(chunks, text);
447    }
448
449    // Merge small chunks if Markdown
450    if language == ParseableLanguage::Markdown {
451        let (target_tokens, _) = get_model_chunk_config(None);
452        chunks = merge_small_chunks(chunks, text, target_tokens);
453    }
454
455    Ok(chunks)
456}
457
458fn suppress_contained_text_chunks(chunks: Vec<Chunk>) -> Vec<Chunk> {
459    if chunks.is_empty() {
460        return chunks;
461    }
462
463    let mut containers: Vec<(usize, usize)> = chunks
464        .iter()
465        .filter(|chunk| {
466            matches!(
467                chunk.chunk_type,
468                ChunkType::Class | ChunkType::Method | ChunkType::Function
469            )
470        })
471        .map(|chunk| (chunk.span.byte_start, chunk.span.byte_end))
472        .collect();
473
474    if containers.is_empty() {
475        return chunks;
476    }
477
478    containers.sort_by_key(|(start, _)| *start);
479
480    chunks
481        .into_iter()
482        .filter(|chunk| {
483            if chunk.chunk_type != ChunkType::Text {
484                return true;
485            }
486
487            let start = chunk.span.byte_start;
488            let end = chunk.span.byte_end;
489            !containers
490                .iter()
491                .any(|(c_start, c_end)| *c_start <= start && end <= *c_end)
492        })
493        .collect()
494}
495
496fn merge_cpp_template_prefix_chunks(chunks: Vec<Chunk>, text: &str) -> Vec<Chunk> {
497    if chunks.len() < 2 {
498        return chunks;
499    }
500
501    let mut merged = Vec::with_capacity(chunks.len());
502    let mut idx = 0;
503
504    while idx < chunks.len() {
505        if idx + 1 < chunks.len() && is_template_prefix_chunk(&chunks[idx]) {
506            let template_chunk = &chunks[idx];
507            let mut next_chunk = chunks[idx + 1].clone();
508
509            if template_chunk.span.byte_end == next_chunk.span.byte_start
510                && template_chunk.span.byte_start < next_chunk.span.byte_end
511                && next_chunk.span.byte_end <= text.len()
512            {
513                let new_start = template_chunk.span.byte_start;
514                let new_end = next_chunk.span.byte_end;
515
516                if let Some(new_text) = text.get(new_start..new_end) {
517                    let (line_start, line_end) = line_range_for_span(text, new_start, new_end);
518
519                    next_chunk.span.byte_start = new_start;
520                    next_chunk.span.line_start = line_start;
521                    next_chunk.span.line_end = line_end;
522                    next_chunk.text = new_text.to_string();
523                    next_chunk.metadata = next_chunk.metadata.with_updated_text(new_text);
524
525                    merged.push(next_chunk);
526                    idx += 2;
527                    continue;
528                }
529            }
530        }
531
532        merged.push(chunks[idx].clone());
533        idx += 1;
534    }
535
536    merged
537}
538
539fn is_template_prefix_chunk(chunk: &Chunk) -> bool {
540    if chunk.chunk_type != ChunkType::Text {
541        return false;
542    }
543
544    let mut has_template = false;
545    for line in chunk.text.lines() {
546        let trimmed = line.trim();
547        if trimmed.is_empty() {
548            continue;
549        }
550        if trimmed.starts_with("template <") || trimmed.starts_with("template<") {
551            has_template = true;
552            continue;
553        }
554        return false;
555    }
556
557    has_template
558}
559
/// Computes the 1-based, inclusive line range covered by `byte_start..byte_end` in `text`.
///
/// `line_start` is the line containing `byte_start`. `line_end` is the line
/// holding the last byte of the span. A span that ends right after a newline
/// ends on the line that newline terminates, while a span that stops in the
/// middle of a line ends on that line. Empty or inverted spans yield
/// `line_end == line_start`.
///
/// Offsets are clamped to `text.len()` and newlines are counted on raw bytes,
/// so out-of-range or mid-character offsets do not panic.
fn line_range_for_span(text: &str, byte_start: usize, byte_end: usize) -> (usize, usize) {
    let bytes = text.as_bytes();
    let start = byte_start.min(bytes.len());
    let end = byte_end.clamp(start, bytes.len());

    let count_newlines = |slice: &[u8]| slice.iter().filter(|&&byte| byte == b'\n').count();

    let newlines_before_start = count_newlines(&bytes[..start]);
    let newlines_before_end = newlines_before_start + count_newlines(&bytes[start..end]);
    let line_start = newlines_before_start + 1;

    // A trailing newline belongs to the last line of the span; without one the
    // span stops partway through a line that has not been counted yet.
    let ends_with_newline = end > start && bytes[end - 1] == b'\n';
    let line_end = if ends_with_newline {
        newlines_before_end
    } else {
        newlines_before_end + 1
    };

    (line_start, line_end.max(line_start))
}
571
572/// Fill gaps between chunks with remainder content
573/// This ensures that leading imports, trailing code, and content between functions gets indexed
574/// Combines contiguous gaps into single chunks (excluding standalone blank lines)
575fn fill_gaps(mut chunks: Vec<Chunk>, text: &str) -> Vec<Chunk> {
576    if chunks.is_empty() {
577        return chunks;
578    }
579
580    // Sort chunks by byte position to identify gaps
581    chunks.sort_by_key(|c| c.span.byte_start);
582
583    let mut result = Vec::new();
584    let mut last_end = 0;
585
586    // Collect all gaps, splitting on blank lines
587    let mut gaps = Vec::new();
588
589    for chunk in &chunks {
590        if last_end < chunk.span.byte_start {
591            // Split this gap by blank lines - use split to make it simple
592            let gap_start = last_end;
593            let gap_text = &text[gap_start..chunk.span.byte_start];
594
595            // Split on sequences of blank lines
596            let mut current_byte = gap_start;
597            let mut segment_start = gap_start;
598
599            for line in gap_text.split('\n') {
600                let line_start_in_gap = current_byte - gap_start;
601                let _line_end_in_gap = line_start_in_gap + line.len();
602
603                if line.trim().is_empty() {
604                    // Found a blank line - save segment before it if it has content
605                    if segment_start < current_byte {
606                        let segment_text = &text[segment_start..current_byte];
607                        if !segment_text.trim().is_empty() {
608                            gaps.push((segment_start, current_byte));
609                        }
610                    }
611                    // Next segment starts after this blank line and its newline
612                    segment_start = current_byte + line.len() + 1;
613                }
614
615                current_byte += line.len() + 1; // +1 for the \n
616            }
617
618            // Handle final segment (after last newline or if no newlines)
619            if segment_start < chunk.span.byte_start {
620                let remaining = &text[segment_start..chunk.span.byte_start];
621                if !remaining.trim().is_empty() {
622                    gaps.push((segment_start, chunk.span.byte_start));
623                }
624            }
625        }
626        last_end = last_end.max(chunk.span.byte_end);
627    }
628
629    // Handle trailing content
630    if last_end < text.len() {
631        let gap_text = &text[last_end..];
632        if !gap_text.trim().is_empty() {
633            gaps.push((last_end, text.len()));
634        }
635    }
636
637    let combined_gaps = gaps;
638
639    // Now interleave chunks and combined gap chunks
640    let mut gap_idx = 0;
641
642    for chunk in chunks {
643        // Add any gap chunks that come before this structural chunk
644        while gap_idx < combined_gaps.len() && combined_gaps[gap_idx].1 <= chunk.span.byte_start {
645            let (gap_start, gap_end) = combined_gaps[gap_idx];
646            let gap_text = &text[gap_start..gap_end];
647
648            // Calculate line numbers by counting newlines before each position
649            let line_start = text[..gap_start].matches('\n').count() + 1;
650            // For line_end, count newlines in the text including the gap
651            // This gives us the line number of the last line with gap content
652            let newlines_up_to_end = text[..gap_end].matches('\n').count();
653            let line_end = if newlines_up_to_end >= line_start - 1 {
654                newlines_up_to_end.max(line_start)
655            } else {
656                line_start
657            };
658
659            let gap_chunk = Chunk {
660                text: gap_text.to_string(),
661                span: Span {
662                    byte_start: gap_start,
663                    byte_end: gap_end,
664                    line_start,
665                    line_end,
666                },
667                chunk_type: ChunkType::Text,
668                metadata: ChunkMetadata::from_text(gap_text),
669                stride_info: None,
670            };
671            result.push(gap_chunk);
672            gap_idx += 1;
673        }
674
675        result.push(chunk.clone());
676    }
677
678    // Add any remaining gap chunks after the last structural chunk
679    while gap_idx < combined_gaps.len() {
680        let (gap_start, gap_end) = combined_gaps[gap_idx];
681        let gap_text = &text[gap_start..gap_end];
682
683        // Calculate line numbers by counting newlines before each position
684        let line_start = text[..gap_start].matches('\n').count() + 1;
685        // For line_end, count newlines in the text including the gap
686        let newlines_up_to_end = text[..gap_end].matches('\n').count();
687        let line_end = if newlines_up_to_end >= line_start - 1 {
688            newlines_up_to_end.max(line_start)
689        } else {
690            line_start
691        };
692
693        let gap_chunk = Chunk {
694            text: gap_text.to_string(),
695            span: Span {
696                byte_start: gap_start,
697                byte_end: gap_end,
698                line_start,
699                line_end,
700            },
701            chunk_type: ChunkType::Text,
702            metadata: ChunkMetadata::from_text(gap_text),
703            stride_info: None,
704        };
705        result.push(gap_chunk);
706        gap_idx += 1;
707    }
708
709    result
710}
711
712/// Merge Haskell function equations that belong to the same function definition
713fn merge_haskell_functions(chunks: Vec<Chunk>, source: &str) -> Vec<Chunk> {
714    if chunks.is_empty() {
715        return chunks;
716    }
717
718    let mut merged = Vec::new();
719    let mut i = 0;
720
721    while i < chunks.len() {
722        let chunk = &chunks[i];
723
724        // Skip chunks that are just fragments or comments
725        let trimmed = chunk.text.trim();
726        if trimmed.is_empty()
727            || trimmed.starts_with("--")
728            || trimmed.starts_with("{-")
729            || !chunk.text.contains(|c: char| c.is_alphanumeric())
730        {
731            i += 1;
732            continue;
733        }
734
735        // Extract function name from the chunk text
736        // Check if it's a signature first (contains ::)
737        let is_signature = chunk.text.contains("::");
738        let function_name = if is_signature {
739            // For signatures like "factorial :: Integer -> Integer", extract "factorial"
740            chunk
741                .text
742                .split("::")
743                .next()
744                .and_then(|s| s.split_whitespace().next())
745                .map(std::string::ToString::to_string)
746        } else {
747            extract_haskell_function_name(&chunk.text)
748        };
749
750        if function_name.is_none() {
751            // Not a function (might be data, newtype, etc.), keep as-is
752            merged.push(chunk.clone());
753            i += 1;
754            continue;
755        }
756
757        let name = function_name.unwrap();
758        let group_start = chunk.span.byte_start;
759        let mut group_end = chunk.span.byte_end;
760        let line_start = chunk.span.line_start;
761        let mut line_end = chunk.span.line_end;
762        let mut trailing_trivia = chunk.metadata.trailing_trivia.clone();
763
764        // Look ahead for function equations with the same name
765        let mut j = i + 1;
766        while j < chunks.len() {
767            let next_chunk = &chunks[j];
768
769            // Skip comments
770            let next_trimmed = next_chunk.text.trim();
771            if next_trimmed.starts_with("--") || next_trimmed.starts_with("{-") {
772                j += 1;
773                continue;
774            }
775
776            let next_is_signature = next_chunk.text.contains("::");
777            let next_name = if next_is_signature {
778                next_chunk
779                    .text
780                    .split("::")
781                    .next()
782                    .and_then(|s| s.split_whitespace().next())
783                    .map(std::string::ToString::to_string)
784            } else {
785                extract_haskell_function_name(&next_chunk.text)
786            };
787
788            if next_name == Some(name.clone()) {
789                // Extend the group to include this equation
790                group_end = next_chunk.span.byte_end;
791                line_end = next_chunk.span.line_end;
792                trailing_trivia = next_chunk.metadata.trailing_trivia.clone();
793                j += 1;
794            } else {
795                break;
796            }
797        }
798
799        // Create merged chunk
800        let merged_text = source.get(group_start..group_end).unwrap_or("").to_string();
801        let mut metadata = chunk.metadata.with_updated_text(&merged_text);
802        metadata.trailing_trivia = trailing_trivia;
803
804        merged.push(Chunk {
805            span: Span {
806                byte_start: group_start,
807                byte_end: group_end,
808                line_start,
809                line_end,
810            },
811            text: merged_text,
812            chunk_type: ChunkType::Function,
813            stride_info: None,
814            metadata,
815        });
816
817        i = j; // Skip past all merged chunks
818    }
819
820    merged
821}
822
/// Extract the function name from a Haskell function equation
///
/// The name is the first whitespace-delimited word of `text`, with trailing
/// characters other than alphanumerics, `_`, and `'` stripped.
/// Examples: `"factorial 0 = 1"` gives `factorial`, `"map f [] = []"` gives `map`.
///
/// Returns `None` when there is no such word, when it does not start with a
/// lowercase letter or underscore, or when it is a Haskell reserved word
/// (`data`, `newtype`, `instance`, ...). Reserved words are lowercase, so
/// without this check declarations would be mistaken for function equations.
fn extract_haskell_function_name(text: &str) -> Option<String> {
    // Reserved identifiers of Haskell 2010; none of them can name a function.
    const RESERVED_WORDS: &[&str] = &[
        "case", "class", "data", "default", "deriving", "do", "else", "foreign", "if", "import",
        "in", "infix", "infixl", "infixr", "instance", "let", "module", "newtype", "of", "then",
        "type", "where",
    ];

    // Find the first word (candidate function name)
    let first_word = text
        .split_whitespace()
        .next()?
        .trim_end_matches(|c: char| !c.is_alphanumeric() && c != '_' && c != '\'');

    // An empty word has no first character and yields `None` here.
    let first_char = first_word.chars().next()?;
    let looks_like_variable = first_char.is_lowercase() || first_char == '_';

    if looks_like_variable && !RESERVED_WORDS.contains(&first_word) {
        Some(first_word.to_string())
    } else {
        None
    }
}
847
/// Chunks `text` with the tree-sitter strategy for `language`.
///
/// The model name is currently ignored; this simply forwards to
/// `chunk_language`.
fn chunk_language_with_model(
    text: &str,
    language: ParseableLanguage,
    _model_name: Option<&str>,
) -> Result<Vec<Chunk>> {
    // For now, language-based chunking doesn't need model-specific behavior
    // since it's based on semantic code boundaries rather than token counts
    // We could potentially optimize this in the future by validating chunk token counts
    chunk_language(text, language)
}
858
859fn extract_code_chunks(
860    cursor: &mut tree_sitter::TreeCursor,
861    source: &str,
862    chunks: &mut Vec<Chunk>,
863    language: ParseableLanguage,
864) {
865    let node = cursor.node();
866
867    // For Haskell: skip "function" nodes that are nested anywhere inside "signature" nodes
868    // (these are type expressions, not actual function definitions)
869    let should_skip = if language == ParseableLanguage::Haskell && node.kind() == "function" {
870        // Walk up parent chain to check if we're inside a signature
871        let mut current = node.parent();
872        while let Some(parent) = current {
873            if parent.kind() == "signature" {
874                return; // Skip this node and don't recurse
875            }
876            current = parent.parent();
877        }
878        false
879    } else {
880        false
881    };
882
883    if !should_skip
884        && let Some(initial_chunk_type) = chunk_type_for_node(language, &node)
885        && let Some(chunk) = build_chunk(node, source, initial_chunk_type, language)
886    {
887        let is_duplicate = chunks.iter().any(|existing| {
888            existing.span.byte_start == chunk.span.byte_start
889                && existing.span.byte_end == chunk.span.byte_end
890        });
891
892        if !is_duplicate {
893            chunks.push(chunk);
894        }
895    }
896
897    // For Haskell signatures: don't recurse into children (they're just type expressions)
898    let should_recurse = !(language == ParseableLanguage::Haskell && node.kind() == "signature");
899
900    if should_recurse && cursor.goto_first_child() {
901        loop {
902            extract_code_chunks(cursor, source, chunks, language);
903            if !cursor.goto_next_sibling() {
904                break;
905            }
906        }
907        cursor.goto_parent();
908    }
909}
910
911fn chunk_type_for_node(
912    language: ParseableLanguage,
913    node: &tree_sitter::Node<'_>,
914) -> Option<ChunkType> {
915    let kind = node.kind();
916
917    let supported = match language {
918        ParseableLanguage::Python => matches!(kind, "function_definition" | "class_definition"),
919        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => matches!(
920            kind,
921            "function_declaration" | "class_declaration" | "method_definition" | "arrow_function"
922        ),
923        ParseableLanguage::Haskell => matches!(
924            kind,
925            "function" // Capture function equations
926                | "signature" // Capture type signatures (will be merged with functions)
927                | "data_type"
928                | "newtype"
929                | "type_synonym"
930                | "type_family"
931                | "class"
932                | "instance"
933        ),
934        ParseableLanguage::Rust => matches!(
935            kind,
936            "function_item" | "impl_item" | "struct_item" | "enum_item" | "trait_item" | "mod_item"
937        ),
938        ParseableLanguage::Ruby => {
939            matches!(kind, "method" | "class" | "module" | "singleton_method")
940        }
941        ParseableLanguage::Go => matches!(
942            kind,
943            "function_declaration"
944                | "method_declaration"
945                | "type_declaration"
946                | "var_declaration"
947                | "const_declaration"
948        ),
949        ParseableLanguage::C => matches!(
950            kind,
951            "function_definition"
952                | "struct_specifier"
953                | "enum_specifier"
954                | "union_specifier"
955                | "type_definition"
956                | "declaration"
957                | "preproc_function_def"
958                | "preproc_def"
959        ),
960        ParseableLanguage::Cpp => matches!(
961            kind,
962            "function_definition"
963                | "class_specifier"
964                | "struct_specifier"
965                | "enum_specifier"
966                | "union_specifier"
967                | "namespace_definition"
968                | "template_declaration"
969                | "type_definition"
970                | "alias_declaration"
971                | "declaration"
972                | "preproc_function_def"
973                | "preproc_def"
974        ),
975        ParseableLanguage::CSharp => matches!(
976            kind,
977            "method_declaration"
978                | "class_declaration"
979                | "interface_declaration"
980                | "variable_declaration"
981        ),
982        ParseableLanguage::Dart => matches!(
983            kind,
984            "class_definition"
985                | "class_declaration"
986                | "mixin_declaration"
987                | "enum_declaration"
988                | "function_declaration"
989                | "method_declaration"
990                | "constructor_declaration"
991                | "variable_declaration"
992                | "local_variable_declaration"
993                | "lambda_expression"
994                | "class_member_definition"
995        ),
996        ParseableLanguage::Zig => matches!(
997            kind,
998            "function_declaration"
999                | "test_declaration"
1000                | "variable_declaration"
1001                | "struct_declaration"
1002                | "enum_declaration"
1003                | "union_declaration"
1004                | "opaque_declaration"
1005                | "error_set_declaration"
1006                | "comptime_declaration"
1007        ),
1008        // Elixir uses "call" nodes for defmodule, def, defp, etc.
1009        // We handle this specially via query-based chunking
1010        ParseableLanguage::Elixir => matches!(kind, "call" | "do_block"),
1011        ParseableLanguage::Markdown => matches!(
1012            kind,
1013            "atx_heading"
1014                | "setext_heading"
1015                | "heading"
1016                | "section"
1017                | "fenced_code_block"
1018                | "indented_code_block"
1019                | "block_quote"
1020                | "list"
1021                | "list_item"
1022                | "paragraph"
1023                | "thematic_break"
1024        ),
1025    };
1026
1027    if !supported {
1028        return None;
1029    }
1030
1031    match language {
1032        ParseableLanguage::Go
1033            if matches!(node.kind(), "var_declaration" | "const_declaration")
1034                && node.parent().is_some_and(|p| p.kind() == "block") =>
1035        {
1036            return None;
1037        }
1038        ParseableLanguage::CSharp if node.kind() == "variable_declaration" => {
1039            if !is_csharp_field_like(*node) {
1040                return None;
1041            }
1042        }
1043        _ => {}
1044    }
1045
1046    Some(classify_chunk_kind(kind))
1047}
1048
1049fn classify_chunk_kind(kind: &str) -> ChunkType {
1050    match kind {
1051        "function_definition"
1052        | "function_declaration"
1053        | "arrow_function"
1054        | "function"
1055        | "function_item"
1056        | "def"
1057        | "defp"
1058        | "defn"
1059        | "defn-"
1060        | "method"
1061        | "singleton_method"
1062        | "preproc_function_def" => ChunkType::Function,
1063        "signature" => ChunkType::Function, // Haskell type signatures will be merged with functions
1064        "class_definition"
1065        | "class_declaration"
1066        | "instance_declaration"
1067        | "class"
1068        | "instance"
1069        | "struct_item"
1070        | "enum_item"
1071        | "class_specifier"
1072        | "struct_specifier"
1073        | "enum_specifier"
1074        | "union_specifier"
1075        | "defstruct"
1076        | "defrecord"
1077        | "deftype"
1078        | "type_declaration"
1079        | "struct_declaration"
1080        | "enum_declaration"
1081        | "union_declaration"
1082        | "opaque_declaration"
1083        | "error_set_declaration" => ChunkType::Class,
1084        "method_definition" | "method_declaration" | "defmacro" => ChunkType::Method,
1085        "data_type"
1086        | "newtype"
1087        | "type_synonym"
1088        | "type_family"
1089        | "impl_item"
1090        | "trait_item"
1091        | "mod_item"
1092        | "namespace_definition"
1093        | "defmodule"
1094        | "module"
1095        | "defprotocol"
1096        | "interface_declaration"
1097        | "ns"
1098        | "var_declaration"
1099        | "const_declaration"
1100        | "variable_declaration"
1101        | "test_declaration"
1102        | "comptime_declaration"
1103        | "atx_heading"
1104        | "setext_heading"
1105        | "heading"
1106        | "section" => ChunkType::Module,
1107        _ => ChunkType::Text,
1108    }
1109}
1110
1111pub(crate) fn build_chunk(
1112    node: tree_sitter::Node<'_>,
1113    source: &str,
1114    initial_type: ChunkType,
1115    language: ParseableLanguage,
1116) -> Option<Chunk> {
1117    let target_node = adjust_node_for_language(node, language);
1118
1119    if matches!(language, ParseableLanguage::C | ParseableLanguage::Cpp)
1120        && matches!(initial_type, ChunkType::Class)
1121        && matches!(
1122            target_node.kind(),
1123            "struct_specifier" | "union_specifier" | "enum_specifier"
1124        )
1125        && !c_cpp_type_has_body_node(target_node)
1126    {
1127        return None;
1128    }
1129    let (byte_start, start_row, leading_segments) =
1130        extend_with_leading_trivia(target_node, language, source);
1131    let trailing_segments = collect_trailing_trivia(target_node, language, source);
1132
1133    let byte_end = target_node.end_byte();
1134    let end_pos = target_node.end_position();
1135
1136    if byte_start >= byte_end || byte_end > source.len() {
1137        return None;
1138    }
1139
1140    let chunk_type = adjust_chunk_type_for_context(target_node, initial_type, language);
1141    let mut text = source.get(byte_start..byte_end)?.to_string();
1142    if matches!(language, ParseableLanguage::C | ParseableLanguage::Cpp)
1143        && chunk_type == ChunkType::Class
1144    {
1145        text = strip_method_bodies_in_class_text(target_node, source, byte_start, byte_end);
1146    }
1147
1148    if text.trim().is_empty() {
1149        return None;
1150    }
1151    let ancestry = collect_ancestry(target_node, language, source);
1152    let leading_trivia = segments_to_strings(&leading_segments, source);
1153    let trailing_trivia = segments_to_strings(&trailing_segments, source);
1154    let mut metadata =
1155        ChunkMetadata::from_context(&text, ancestry, leading_trivia, trailing_trivia);
1156    if matches!(language, ParseableLanguage::C | ParseableLanguage::Cpp)
1157        && matches!(chunk_type, ChunkType::Function | ChunkType::Method)
1158        && let Some(full_name) = c_cpp_function_breadcrumb(target_node, language, source)
1159    {
1160        metadata.breadcrumb = Some(full_name);
1161    }
1162
1163    Some(Chunk {
1164        span: Span {
1165            byte_start,
1166            byte_end,
1167            line_start: start_row + 1,
1168            line_end: end_pos.row + 1,
1169        },
1170        text,
1171        chunk_type,
1172        stride_info: None,
1173        metadata,
1174    })
1175}
1176
1177fn c_cpp_type_has_body_node(node: tree_sitter::Node<'_>) -> bool {
1178    let mut cursor = node.walk();
1179
1180    match node.kind() {
1181        "struct_specifier" | "union_specifier" => node
1182            .children(&mut cursor)
1183            .any(|child| child.kind() == "field_declaration_list"),
1184        "enum_specifier" => node
1185            .children(&mut cursor)
1186            .any(|child| child.kind() == "enumerator_list"),
1187        _ => false,
1188    }
1189}
1190
1191fn c_cpp_function_breadcrumb(
1192    node: tree_sitter::Node<'_>,
1193    language: ParseableLanguage,
1194    source: &str,
1195) -> Option<String> {
1196    let name = display_name_for_node(node, language, source, ChunkType::Function)?;
1197    let context = collect_c_cpp_context_names(node, language, source);
1198    let context_path = context.join("::");
1199
1200    if name.contains("::") {
1201        if context_path.is_empty() || name.starts_with(&format!("{}::", context_path)) {
1202            Some(name)
1203        } else {
1204            Some(format!("{}::{}", context_path, name))
1205        }
1206    } else if context_path.is_empty() {
1207        Some(name)
1208    } else {
1209        Some(format!("{}::{}", context_path, name))
1210    }
1211}
1212
1213fn collect_c_cpp_context_names(
1214    mut node: tree_sitter::Node<'_>,
1215    language: ParseableLanguage,
1216    source: &str,
1217) -> Vec<String> {
1218    let mut parts = Vec::new();
1219
1220    while let Some(parent) = node.parent() {
1221        let kind = parent.kind();
1222        let include = match language {
1223            ParseableLanguage::Cpp => matches!(
1224                kind,
1225                "namespace_definition" | "class_specifier" | "struct_specifier"
1226            ),
1227            ParseableLanguage::C => matches!(kind, "struct_specifier"),
1228            _ => false,
1229        };
1230
1231        if include
1232            && let Some(name) = display_name_for_node(parent, language, source, ChunkType::Class)
1233        {
1234            parts.push(name);
1235        }
1236
1237        node = parent;
1238    }
1239
1240    parts.reverse();
1241    parts
1242}
1243
1244fn strip_method_bodies_in_class_text(
1245    class_node: tree_sitter::Node<'_>,
1246    source: &str,
1247    byte_start: usize,
1248    byte_end: usize,
1249) -> String {
1250    let mut replacements: Vec<(usize, usize, String)> = Vec::new();
1251    let mut stack = vec![class_node];
1252
1253    while let Some(node) = stack.pop() {
1254        if is_method_like_node(node.kind())
1255            && let Some(body) = find_method_body_node(node)
1256        {
1257            let start = body.start_byte();
1258            let end = body.end_byte();
1259            if start >= byte_start && end <= byte_end && start < end {
1260                let replacement = method_body_placeholder(body, source);
1261                replacements.push((start, end, replacement));
1262            }
1263        }
1264
1265        let child_count = node.child_count();
1266        for idx in (0..child_count).rev() {
1267            if let Some(child) = node.child(idx) {
1268                stack.push(child);
1269            }
1270        }
1271    }
1272
1273    if replacements.is_empty() {
1274        return source
1275            .get(byte_start..byte_end)
1276            .unwrap_or_default()
1277            .to_string();
1278    }
1279
1280    replacements.sort_by(|a, b| b.0.cmp(&a.0));
1281    let mut text = source
1282        .get(byte_start..byte_end)
1283        .unwrap_or_default()
1284        .to_string();
1285
1286    for (start, end, replacement) in replacements {
1287        if start < byte_start || end > byte_end || end <= start {
1288            continue;
1289        }
1290        let local_start = start - byte_start;
1291        let local_end = end - byte_start;
1292        if local_end <= text.len() {
1293            text.replace_range(local_start..local_end, &replacement);
1294        }
1295    }
1296
1297    text
1298}
1299
/// Returns `true` when `kind` names a function- or method-defining node whose body
/// `strip_method_bodies_in_class_text` should look for.
fn is_method_like_node(kind: &str) -> bool {
    const METHOD_LIKE_KINDS: [&str; 8] = [
        "function_definition",
        "method_definition",
        "method_declaration",
        "constructor_declaration",
        "destructor_declaration",
        "function_item",
        "method",
        "singleton_method",
    ];

    METHOD_LIKE_KINDS.contains(&kind)
}
1313
1314fn find_method_body_node(node: tree_sitter::Node<'_>) -> Option<tree_sitter::Node<'_>> {
1315    let body_kinds = [
1316        "compound_statement",
1317        "statement_block",
1318        "block",
1319        "body",
1320        "body_statement",
1321        "declaration_list",
1322    ];
1323
1324    for idx in 0..node.child_count() {
1325        if let Some(child) = node.child(idx)
1326            && body_kinds.contains(&child.kind())
1327        {
1328            return Some(child);
1329        }
1330    }
1331
1332    None
1333}
1334
1335fn method_body_placeholder(_body: tree_sitter::Node<'_>, _source: &str) -> String {
1336    ";".to_string()
1337}
1338
1339fn adjust_node_for_language(
1340    node: tree_sitter::Node<'_>,
1341    language: ParseableLanguage,
1342) -> tree_sitter::Node<'_> {
1343    match language {
1344        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
1345            if node.kind() == "arrow_function" {
1346                return expand_arrow_function_context(node);
1347            }
1348            node
1349        }
1350        _ => node,
1351    }
1352}
1353
1354fn expand_arrow_function_context(mut node: tree_sitter::Node<'_>) -> tree_sitter::Node<'_> {
1355    const PARENTS: &[&str] = &[
1356        "parenthesized_expression",
1357        "variable_declarator",
1358        "variable_declaration",
1359        "lexical_declaration",
1360        "assignment_expression",
1361        "expression_statement",
1362        "public_field_definition",
1363        "export_statement",
1364    ];
1365
1366    while let Some(parent) = node.parent() {
1367        let kind = parent.kind();
1368        if PARENTS.contains(&kind) {
1369            node = parent;
1370            continue;
1371        }
1372        break;
1373    }
1374
1375    node
1376}
1377
/// Byte range of one piece of trivia (such as a comment, attribute or decorator) attached to
/// a chunk. The range is half-open: `start_byte..end_byte` into the source text.
#[derive(Clone, Copy)]
struct TriviaSegment {
    /// Offset of the first byte of the segment.
    start_byte: usize,
    /// Offset one past the last byte of the segment.
    end_byte: usize,
}
1383
1384fn extend_with_leading_trivia(
1385    node: tree_sitter::Node<'_>,
1386    language: ParseableLanguage,
1387    source: &str,
1388) -> (usize, usize, Vec<TriviaSegment>) {
1389    let mut start_byte = node.start_byte();
1390    let mut start_row = node.start_position().row;
1391    let mut current = node;
1392    let mut segments = Vec::new();
1393
1394    while let Some(prev) = current.prev_sibling() {
1395        if should_attach_leading_trivia(language, &prev)
1396            && only_whitespace_between(source, prev.end_byte(), start_byte)
1397        {
1398            start_byte = prev.start_byte();
1399            start_row = prev.start_position().row;
1400            segments.push(TriviaSegment {
1401                start_byte: prev.start_byte(),
1402                end_byte: prev.end_byte(),
1403            });
1404            current = prev;
1405            continue;
1406        }
1407        break;
1408    }
1409
1410    segments.reverse();
1411    (start_byte, start_row, segments)
1412}
1413
1414fn should_attach_leading_trivia(language: ParseableLanguage, node: &tree_sitter::Node<'_>) -> bool {
1415    let kind = node.kind();
1416    if kind == "comment" {
1417        return true;
1418    }
1419
1420    match language {
1421        ParseableLanguage::Rust => {
1422            matches!(kind, "line_comment" | "block_comment" | "attribute_item")
1423        }
1424        ParseableLanguage::Python => kind == "decorator",
1425        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => kind == "decorator",
1426        ParseableLanguage::C | ParseableLanguage::Cpp | ParseableLanguage::Markdown => {
1427            kind == "comment"
1428        }
1429        ParseableLanguage::CSharp => matches!(kind, "attribute_list" | "attribute"),
1430        _ => false,
1431    }
1432}
1433
1434fn collect_trailing_trivia(
1435    node: tree_sitter::Node<'_>,
1436    language: ParseableLanguage,
1437    source: &str,
1438) -> Vec<TriviaSegment> {
1439    let mut segments = Vec::new();
1440    let mut current = node;
1441    let mut previous_end = node.end_byte();
1442
1443    while let Some(next) = current.next_sibling() {
1444        if should_attach_trailing_trivia(language, &next)
1445            && only_whitespace_between(source, previous_end, next.start_byte())
1446        {
1447            segments.push(TriviaSegment {
1448                start_byte: next.start_byte(),
1449                end_byte: next.end_byte(),
1450            });
1451            previous_end = next.end_byte();
1452            current = next;
1453            continue;
1454        }
1455        break;
1456    }
1457
1458    segments
1459}
1460
1461fn should_attach_trailing_trivia(
1462    _language: ParseableLanguage,
1463    node: &tree_sitter::Node<'_>,
1464) -> bool {
1465    node.kind() == "comment"
1466}
1467
1468fn segments_to_strings(segments: &[TriviaSegment], source: &str) -> Vec<String> {
1469    let mut result = Vec::new();
1470
1471    for segment in segments {
1472        if let Some(text) = source
1473            .get(segment.start_byte..segment.end_byte)
1474            .map(std::string::ToString::to_string)
1475        {
1476            result.push(text);
1477        }
1478    }
1479
1480    result
1481}
1482
1483fn collect_ancestry(
1484    mut node: tree_sitter::Node<'_>,
1485    language: ParseableLanguage,
1486    source: &str,
1487) -> Vec<String> {
1488    if language == ParseableLanguage::Markdown {
1489        return markdown_heading_ancestry(node, source);
1490    }
1491
1492    let mut parts = Vec::new();
1493
1494    while let Some(parent) = node.parent() {
1495        if let Some(parent_chunk_type) = chunk_type_for_node(language, &parent)
1496            && let Some(name) = display_name_for_node(parent, language, source, parent_chunk_type)
1497        {
1498            parts.push(name);
1499        }
1500        node = parent;
1501    }
1502
1503    parts.reverse();
1504    parts
1505}
1506
1507fn display_name_for_node(
1508    node: tree_sitter::Node<'_>,
1509    language: ParseableLanguage,
1510    source: &str,
1511    chunk_type: ChunkType,
1512) -> Option<String> {
1513    if let Some(name_node) = node.child_by_field_name("name") {
1514        return text_for_node(name_node, source);
1515    }
1516
1517    match language {
1518        ParseableLanguage::Rust => rust_display_name(node, source, chunk_type),
1519        ParseableLanguage::Python => find_identifier(node, source, &["identifier"]),
1520        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => find_identifier(
1521            node,
1522            source,
1523            &["identifier", "type_identifier", "property_identifier"],
1524        ),
1525        ParseableLanguage::Haskell => {
1526            find_identifier(node, source, &["identifier", "type_identifier", "variable"])
1527                .or_else(|| first_word_of_node(node, source))
1528        }
1529        ParseableLanguage::Ruby => find_identifier(node, source, &["identifier"]),
1530        ParseableLanguage::Go => find_identifier(node, source, &["identifier", "type_identifier"]),
1531        ParseableLanguage::C => c_display_name(node, source, chunk_type),
1532        ParseableLanguage::Cpp => cpp_display_name(node, source, chunk_type),
1533        ParseableLanguage::CSharp => find_identifier(node, source, &["identifier"]),
1534        ParseableLanguage::Zig => find_identifier(node, source, &["identifier"]),
1535
1536        ParseableLanguage::Markdown => markdown_display_name(node, source, chunk_type),
1537
1538        ParseableLanguage::Dart => {
1539            find_identifier(node, source, &["identifier", "type_identifier"])
1540        }
1541        ParseableLanguage::Elixir => {
1542            // Elixir names can be aliases (module names) or atoms/identifiers
1543            find_identifier(node, source, &["alias", "identifier", "atom"])
1544        }
1545    }
1546}
1547
1548fn markdown_display_name(
1549    node: tree_sitter::Node<'_>,
1550    source: &str,
1551    _chunk_type: ChunkType,
1552) -> Option<String> {
1553    if node.kind() == "section" {
1554        return markdown_section_heading(node, source);
1555    }
1556
1557    if markdown_heading_kind(node.kind()) {
1558        return markdown_heading_text(node, source);
1559    }
1560
1561    None
1562}
1563
1564fn markdown_section_heading(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
1565    let mut cursor = node.walk();
1566    for child in node.children(&mut cursor) {
1567        if markdown_heading_kind(child.kind()) {
1568            return markdown_heading_text(child, source);
1569        }
1570    }
1571    None
1572}
1573
/// Returns `true` when `kind` is one of the Markdown heading node kinds.
fn markdown_heading_kind(kind: &str) -> bool {
    ["atx_heading", "setext_heading", "heading"].contains(&kind)
}
1577
1578fn markdown_heading_text(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
1579    let text = text_for_node(node, source)?;
1580    let mut lines = text.lines();
1581    let first_line = lines.next().unwrap_or("");
1582
1583    if let Some((_, heading)) = parse_atx_heading_line(first_line) {
1584        return Some(heading);
1585    }
1586
1587    let second_line = lines.next().unwrap_or("");
1588    if parse_setext_level(second_line).is_some() {
1589        let trimmed = first_line.trim();
1590        if !trimmed.is_empty() {
1591            return Some(trimmed.to_string());
1592        }
1593    }
1594
1595    let trimmed = first_line.trim();
1596    if trimmed.is_empty() {
1597        None
1598    } else {
1599        Some(trimmed.to_string())
1600    }
1601}
1602
1603fn markdown_heading_ancestry(node: tree_sitter::Node<'_>, source: &str) -> Vec<String> {
1604    let mut target_row = node.start_position().row;
1605    if node.kind() == "section" || markdown_heading_kind(node.kind()) {
1606        target_row = target_row.saturating_sub(1);
1607    }
1608    let lines: Vec<&str> = source.lines().collect();
1609    let mut stack: Vec<(usize, String)> = Vec::new();
1610    let mut i = 0;
1611
1612    while i < lines.len() && i <= target_row {
1613        let line = lines[i];
1614
1615        if let Some((level, heading)) = parse_atx_heading_line(line) {
1616            update_markdown_heading_stack(&mut stack, level, heading);
1617            i += 1;
1618            continue;
1619        }
1620
1621        if i + 1 < lines.len() && i < target_row {
1622            let underline = lines[i + 1];
1623            if let Some(level) = parse_setext_level(underline) {
1624                let heading_text = line.trim();
1625                if !heading_text.is_empty() {
1626                    update_markdown_heading_stack(&mut stack, level, heading_text.to_string());
1627                }
1628                i += 2;
1629                continue;
1630            }
1631        }
1632
1633        i += 1;
1634    }
1635
1636    stack.into_iter().map(|(_, heading)| heading).collect()
1637}
1638
/// Pushes a heading onto the stack of currently open headings.
///
/// Trailing entries whose level is greater than or equal to `level` are removed first, so the
/// new heading ends up directly under the nearest remaining shallower heading.
fn update_markdown_heading_stack(stack: &mut Vec<(usize, String)>, level: usize, text: String) {
    // Index one past the last entry that is strictly shallower than the new heading.
    let keep = stack
        .iter()
        .rposition(|(existing_level, _)| *existing_level < level)
        .map_or(0, |idx| idx + 1);

    stack.truncate(keep);
    stack.push((level, text));
}
1648
/// Parses a single line as an ATX heading (`# Title`, `## Title ##`, ...).
///
/// Returns `(level, text)` where `level` is the number of leading `#` characters (1 to 6) and
/// `text` is the heading content without the optional closing `#` run. Returns `None` when
/// the line is not an ATX heading or the heading has no text.
///
/// Following CommonMark:
/// * the opening run must be followed by whitespace or the end of the line, so `#hashtag` is
///   not a heading;
/// * runs of more than six `#` are not headings;
/// * a trailing `#` run is removed only when whitespace precedes it, so `# C#` keeps the
///   text `C#`.
///
/// Leading indentation is ignored regardless of its width.
fn parse_atx_heading_line(line: &str) -> Option<(usize, String)> {
    let trimmed = line.trim_start();

    let level = trimmed.chars().take_while(|c| *c == '#').count();
    if !(1..=6).contains(&level) {
        return None;
    }

    // `#` is ASCII, so `level` is also the byte length of the opening run.
    let rest = &trimmed[level..];
    if !rest.is_empty() && !rest.starts_with(char::is_whitespace) {
        return None;
    }

    let content = rest.trim();
    let without_closer = content.trim_end_matches('#');
    let text = if without_closer.is_empty() || without_closer.ends_with(char::is_whitespace) {
        // Either the content is nothing but a closing run, or the run is properly separated.
        without_closer.trim_end()
    } else {
        // Trailing `#` characters glued to the text belong to the text.
        content
    };

    if text.is_empty() {
        return None;
    }

    Some((level, text.to_string()))
}
1668
/// Interprets `line` as a setext heading underline.
///
/// After trimming, a line made up solely of `=` yields level 1 and a line made up solely of
/// `-` yields level 2. Anything else, including a blank line, yields `None`.
fn parse_setext_level(line: &str) -> Option<usize> {
    let mut underline = line.trim().chars();

    let marker = underline.next()?;
    let level = match marker {
        '=' => 1,
        '-' => 2,
        _ => return None,
    };

    underline.all(|c| c == marker).then_some(level)
}
1683
1684fn rust_display_name(
1685    node: tree_sitter::Node<'_>,
1686    source: &str,
1687    chunk_type: ChunkType,
1688) -> Option<String> {
1689    match node.kind() {
1690        "impl_item" => {
1691            let mut parts = Vec::new();
1692            if let Some(ty) = node.child_by_field_name("type")
1693                && let Some(text) = text_for_node(ty, source)
1694            {
1695                parts.push(text);
1696            }
1697            if let Some(trait_node) = node.child_by_field_name("trait")
1698                && let Some(text) = text_for_node(trait_node, source)
1699            {
1700                if let Some(last) = parts.first() {
1701                    parts[0] = format!("{} (impl {})", last, text.trim());
1702                } else {
1703                    parts.push(format!("impl {}", text.trim()));
1704                }
1705            }
1706            if parts.is_empty() {
1707                find_identifier(node, source, &["identifier"])
1708            } else {
1709                Some(parts.remove(0))
1710            }
1711        }
1712        "mod_item" if chunk_type == ChunkType::Module => {
1713            find_identifier(node, source, &["identifier"])
1714        }
1715        _ => find_identifier(node, source, &["identifier", "type_identifier"]),
1716    }
1717}
1718
1719fn c_display_name(
1720    node: tree_sitter::Node<'_>,
1721    source: &str,
1722    _chunk_type: ChunkType,
1723) -> Option<String> {
1724    match node.kind() {
1725        "function_definition" => {
1726            // C function: look for the declarator, then the identifier inside it
1727            if let Some(declarator) = node.child_by_field_name("declarator") {
1728                return find_identifier_recursive(declarator, source, &["identifier"]);
1729            }
1730            None
1731        }
1732        "struct_specifier" | "enum_specifier" | "union_specifier" => {
1733            find_identifier(node, source, &["type_identifier", "identifier"])
1734        }
1735        "type_definition" => find_identifier(node, source, &["type_identifier", "identifier"]),
1736        "preproc_function_def" | "preproc_def" => find_identifier(node, source, &["identifier"]),
1737        _ => find_identifier(node, source, &["identifier", "type_identifier"]),
1738    }
1739}
1740
1741fn cpp_display_name(
1742    node: tree_sitter::Node<'_>,
1743    source: &str,
1744    _chunk_type: ChunkType,
1745) -> Option<String> {
1746    match node.kind() {
1747        "function_definition" => {
1748            if let Some(declarator) = node.child_by_field_name("declarator") {
1749                return find_identifier_recursive(
1750                    declarator,
1751                    source,
1752                    &[
1753                        "identifier",
1754                        "field_identifier",
1755                        "destructor_name",
1756                        "qualified_identifier",
1757                    ],
1758                );
1759            }
1760            None
1761        }
1762        "declaration" => {
1763            if let Some(declarator) = node.child_by_field_name("declarator") {
1764                return find_identifier_recursive(
1765                    declarator,
1766                    source,
1767                    &[
1768                        "identifier",
1769                        "field_identifier",
1770                        "destructor_name",
1771                        "qualified_identifier",
1772                    ],
1773                );
1774            }
1775            find_identifier(node, source, &["identifier", "type_identifier"])
1776        }
1777        "class_specifier" | "struct_specifier" | "enum_specifier" | "union_specifier" => {
1778            find_identifier(node, source, &["type_identifier", "identifier"])
1779        }
1780        "namespace_definition" => {
1781            find_identifier(node, source, &["identifier", "namespace_identifier"])
1782        }
1783        "alias_declaration" | "type_definition" => {
1784            find_identifier(node, source, &["type_identifier", "identifier"])
1785        }
1786        "template_declaration" => {
1787            let mut cursor = node.walk();
1788            for child in node.children(&mut cursor) {
1789                if matches!(
1790                    child.kind(),
1791                    "class_specifier"
1792                        | "struct_specifier"
1793                        | "enum_specifier"
1794                        | "union_specifier"
1795                        | "function_definition"
1796                        | "declaration"
1797                        | "alias_declaration"
1798                        | "type_definition"
1799                        | "concept_definition"
1800                ) {
1801                    return cpp_display_name(child, source, _chunk_type);
1802                }
1803            }
1804            find_identifier(node, source, &["type_identifier", "identifier"])
1805        }
1806        _ => find_identifier(node, source, &["identifier", "type_identifier"]),
1807    }
1808}
1809
1810/// Recursively search for an identifier in nested declarators (e.g., C function declarators)
1811fn find_identifier_recursive(
1812    node: tree_sitter::Node<'_>,
1813    source: &str,
1814    candidate_kinds: &[&str],
1815) -> Option<String> {
1816    if candidate_kinds.contains(&node.kind()) {
1817        return text_for_node(node, source).map(|s| s.trim().to_string());
1818    }
1819    let mut cursor = node.walk();
1820    for child in node.children(&mut cursor) {
1821        if let Some(result) = find_identifier_recursive(child, source, candidate_kinds) {
1822            return Some(result);
1823        }
1824    }
1825    None
1826}
1827
1828fn find_identifier(
1829    node: tree_sitter::Node<'_>,
1830    source: &str,
1831    candidate_kinds: &[&str],
1832) -> Option<String> {
1833    let mut cursor = node.walk();
1834    for child in node.children(&mut cursor) {
1835        if candidate_kinds.contains(&child.kind())
1836            && let Some(text) = text_for_node(child, source)
1837        {
1838            return Some(text.trim().to_string());
1839        }
1840    }
1841    None
1842}
1843
1844fn first_word_of_node(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
1845    let text = text_for_node(node, source)?;
1846    text.split_whitespace().next().map(|s| {
1847        s.trim_end_matches(|c: char| !c.is_alphanumeric() && c != '_')
1848            .to_string()
1849    })
1850}
1851
1852fn text_for_node(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
1853    node.utf8_text(source.as_bytes())
1854        .ok()
1855        .map(std::string::ToString::to_string)
1856}
1857
/// Report whether the byte range `start..end` of `source` contains nothing
/// but whitespace.
///
/// Empty, inverted, or past-the-end ranges are treated leniently and yield
/// `true`. A range whose bounds do not fall on UTF-8 character boundaries
/// yields `false` (it cuts through a character, so it cannot be shown to be
/// pure whitespace) instead of panicking.
fn only_whitespace_between(source: &str, start: usize, end: usize) -> bool {
    if start >= end || end > source.len() {
        return true;
    }

    // `str::get` returns `None` rather than panicking on a non-boundary offset.
    source
        .get(start..end)
        .is_some_and(|gap| gap.chars().all(char::is_whitespace))
}
1865
1866fn adjust_chunk_type_for_context(
1867    node: tree_sitter::Node<'_>,
1868    chunk_type: ChunkType,
1869    language: ParseableLanguage,
1870) -> ChunkType {
1871    if chunk_type != ChunkType::Function {
1872        return chunk_type;
1873    }
1874
1875    if is_method_context(node, language) {
1876        ChunkType::Method
1877    } else {
1878        chunk_type
1879    }
1880}
1881
1882fn is_method_context(node: tree_sitter::Node<'_>, language: ParseableLanguage) -> bool {
1883    const PYTHON_CONTAINERS: &[&str] = &["class_definition"];
1884    const TYPESCRIPT_CONTAINERS: &[&str] = &["class_body", "class_declaration"];
1885    const RUBY_CONTAINERS: &[&str] = &["class", "module"];
1886    const RUST_CONTAINERS: &[&str] = &["impl_item", "trait_item"];
1887    const DART_CONTAINERS: &[&str] = &[
1888        "class_definition",
1889        "class_declaration",
1890        "mixin_declaration",
1891        "enum_declaration",
1892    ];
1893
1894    match language {
1895        ParseableLanguage::Python => ancestor_has_kind(node, PYTHON_CONTAINERS),
1896        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
1897            ancestor_has_kind(node, TYPESCRIPT_CONTAINERS)
1898        }
1899        ParseableLanguage::Ruby => ancestor_has_kind(node, RUBY_CONTAINERS),
1900        ParseableLanguage::Rust => ancestor_has_kind(node, RUST_CONTAINERS),
1901        ParseableLanguage::Go => false,
1902        ParseableLanguage::C => ancestor_has_kind(node, &["struct_specifier"]),
1903        ParseableLanguage::Cpp => ancestor_has_kind(node, &["class_specifier", "struct_specifier"]),
1904        ParseableLanguage::CSharp => false,
1905        ParseableLanguage::Haskell => false,
1906        ParseableLanguage::Zig => false,
1907
1908        ParseableLanguage::Dart => ancestor_has_kind(node, DART_CONTAINERS),
1909
1910        ParseableLanguage::Elixir => false, // Elixir doesn't have class-based methods
1911        ParseableLanguage::Markdown => false,
1912    }
1913}
1914
1915fn ancestor_has_kind(node: tree_sitter::Node<'_>, kinds: &[&str]) -> bool {
1916    let mut current = node;
1917    while let Some(parent) = current.parent() {
1918        if kinds.contains(&parent.kind()) {
1919            return true;
1920        }
1921        current = parent;
1922    }
1923    false
1924}
1925
1926fn is_csharp_field_like(node: tree_sitter::Node<'_>) -> bool {
1927    if let Some(parent) = node.parent() {
1928        return matches!(
1929            parent.kind(),
1930            "field_declaration" | "event_field_declaration"
1931        );
1932    }
1933    false
1934}
1935
1936/// Apply striding to chunks that exceed the token limit
1937fn apply_striding(chunks: Vec<Chunk>, config: &ChunkConfig) -> Result<Vec<Chunk>> {
1938    let mut result = Vec::new();
1939
1940    for chunk in chunks {
1941        let estimated_tokens = estimate_tokens(&chunk.text);
1942
1943        if estimated_tokens <= config.max_tokens {
1944            // Chunk fits within limit, no striding needed
1945            result.push(chunk);
1946        } else {
1947            // Chunk exceeds limit, apply striding
1948            tracing::debug!(
1949                "Chunk with {} tokens exceeds limit of {}, applying striding",
1950                estimated_tokens,
1951                config.max_tokens
1952            );
1953
1954            let strided_chunks = stride_large_chunk(chunk, config)?;
1955            result.extend(strided_chunks);
1956        }
1957    }
1958
1959    Ok(result)
1960}
1961
1962/// Create strided chunks from a large chunk that exceeds token limits
1963fn stride_large_chunk(chunk: Chunk, config: &ChunkConfig) -> Result<Vec<Chunk>> {
1964    let text = &chunk.text;
1965
1966    // Early return for empty chunks to avoid divide-by-zero
1967    if text.is_empty() {
1968        return Ok(vec![chunk]);
1969    }
1970
1971    // Calculate stride parameters in characters (not bytes!)
1972    // Use a conservative estimate to ensure we stay under token limits
1973    let char_count = text.chars().count();
1974    let estimated_tokens = estimate_tokens(text);
1975    // Guard against zero token estimate to prevent divide-by-zero panic
1976    let chars_per_token = if estimated_tokens == 0 {
1977        4.5 // Use default average if estimation fails
1978    } else {
1979        char_count as f32 / estimated_tokens as f32
1980    };
1981    let window_chars = ((config.max_tokens as f32 * 0.9) * chars_per_token) as usize; // 10% buffer
1982    let overlap_chars = (config.stride_overlap as f32 * chars_per_token) as usize;
1983    let stride_chars = window_chars.saturating_sub(overlap_chars);
1984
1985    if stride_chars == 0 {
1986        return Err(anyhow::anyhow!("Stride size is too small"));
1987    }
1988
1989    // Build char to byte index mapping to handle UTF-8 safely
1990    let char_byte_indices: Vec<(usize, char)> = text.char_indices().collect();
1991    // Note: char_count is already calculated above, just reference it here
1992
1993    let mut strided_chunks = Vec::new();
1994    let original_chunk_id = format!("{}:{}", chunk.span.byte_start, chunk.span.byte_end);
1995    let mut start_char_idx = 0;
1996    let mut stride_index = 0;
1997
1998    // Calculate total number of strides
1999    let total_strides = if char_count <= window_chars {
2000        1
2001    } else {
2002        ((char_count - overlap_chars) as f32 / stride_chars as f32).ceil() as usize
2003    };
2004
2005    while start_char_idx < char_count {
2006        let end_char_idx = (start_char_idx + window_chars).min(char_count);
2007
2008        // Get byte positions from char indices
2009        let start_byte_pos = char_byte_indices[start_char_idx].0;
2010        let end_byte_pos = if end_char_idx < char_count {
2011            char_byte_indices[end_char_idx].0
2012        } else {
2013            text.len()
2014        };
2015
2016        let stride_text = &text[start_byte_pos..end_byte_pos];
2017
2018        // Calculate overlap information
2019        let overlap_start = if stride_index > 0 { overlap_chars } else { 0 };
2020        let overlap_end = if end_char_idx < char_count {
2021            overlap_chars
2022        } else {
2023            0
2024        };
2025
2026        // Calculate span for this stride
2027        let byte_offset_start = chunk.span.byte_start + start_byte_pos;
2028        let byte_offset_end = chunk.span.byte_start + end_byte_pos;
2029
2030        // Estimate line numbers (approximate)
2031        let text_before_start = &text[..start_byte_pos];
2032        let line_offset_start = text_before_start.lines().count().saturating_sub(1);
2033        let stride_lines = stride_text.lines().count();
2034        let metadata = chunk.metadata.with_updated_text(stride_text);
2035
2036        let stride_chunk = Chunk {
2037            span: Span {
2038                byte_start: byte_offset_start,
2039                byte_end: byte_offset_end,
2040                line_start: chunk.span.line_start + line_offset_start,
2041                // Fix: subtract 1 since stride_lines is a count but line_end should be inclusive
2042                line_end: chunk.span.line_start
2043                    + line_offset_start
2044                    + stride_lines.saturating_sub(1),
2045            },
2046            text: stride_text.to_string(),
2047            chunk_type: chunk.chunk_type.clone(),
2048            stride_info: Some(StrideInfo {
2049                original_chunk_id: original_chunk_id.clone(),
2050                stride_index,
2051                total_strides,
2052                overlap_start,
2053                overlap_end,
2054            }),
2055            metadata,
2056        };
2057
2058        strided_chunks.push(stride_chunk);
2059
2060        // Move to next stride
2061        if end_char_idx >= char_count {
2062            break;
2063        }
2064
2065        start_char_idx += stride_chars;
2066        stride_index += 1;
2067    }
2068
2069    tracing::debug!(
2070        "Created {} strides from chunk of {} tokens",
2071        strided_chunks.len(),
2072        estimate_tokens(text)
2073    );
2074
2075    Ok(strided_chunks)
2076}
2077
2078/// Merge adjacent Markdown chunks that are individually too small to be useful.
2079///
2080/// Markdown is often structured as many tiny logical units (a heading, a short
2081/// paragraph, a single list item).  Emitting each one as its own chunk would
2082/// pollute the embedding index with near-empty vectors that carry little
2083/// semantic signal.  This pass greedily accumulates adjacent chunks until the
2084/// running token count approaches `target_tokens`, then emits one merged chunk.
2085///
2086/// Side-effect: merged chunks lose their individual `ChunkType` labels (heading
2087/// becomes `ChunkType::Text`) because the merged span covers mixed content.
2088/// That is intentional — the important thing is that the *text* survives so
2089/// search can still match it.
2090fn merge_small_chunks(chunks: Vec<Chunk>, text: &str, target_tokens: usize) -> Vec<Chunk> {
2091    if chunks.is_empty() {
2092        return chunks;
2093    }
2094
2095    let mut result = Vec::new();
2096    let mut current_group: Vec<Chunk> = Vec::new();
2097    let mut current_tokens = 0;
2098
2099    for chunk in chunks {
2100        let chunk_tokens = chunk.metadata.estimated_tokens;
2101
2102        if current_tokens + chunk_tokens > target_tokens {
2103            // Flush
2104            if !current_group.is_empty() {
2105                result.push(merge_group(&current_group, text));
2106                current_group.clear();
2107                current_tokens = 0;
2108            }
2109        }
2110
2111        // If single chunk is huge
2112        if chunk_tokens > target_tokens {
2113            if !current_group.is_empty() {
2114                result.push(merge_group(&current_group, text));
2115                current_group.clear();
2116                current_tokens = 0;
2117            }
2118            result.push(chunk);
2119            continue;
2120        }
2121
2122        current_group.push(chunk);
2123        current_tokens += chunk_tokens;
2124    }
2125
2126    // Flush remaining
2127    if !current_group.is_empty() {
2128        result.push(merge_group(&current_group, text));
2129    }
2130
2131    result
2132}
2133
2134fn merge_group(group: &[Chunk], text: &str) -> Chunk {
2135    if group.len() == 1 {
2136        return group[0].clone();
2137    }
2138
2139    let first = &group[0];
2140    let last = &group[group.len() - 1];
2141
2142    // Calculate new span
2143    // Assuming sorted, which fill_gaps ensures
2144    let byte_start = first.span.byte_start;
2145    let byte_end = last.span.byte_end;
2146    let line_start = first.span.line_start;
2147    let line_end = last.span.line_end;
2148
2149    // Safely slice text
2150    let chunk_text = if byte_end <= text.len() {
2151        text[byte_start..byte_end].to_string()
2152    } else {
2153        // Fallback for safety, though existing chunks should be valid
2154        text.get(byte_start..).unwrap_or("").to_string()
2155    };
2156
2157    let metadata = ChunkMetadata::from_text(&chunk_text);
2158
2159    // If all chunks in the group share the same semantic type, preserve it.
2160    // Otherwise (the common case for Markdown, where headings, paragraphs, and
2161    // code blocks are merged together), fall back to ChunkType::Text.  This is
2162    // intentional: the merged chunk is a mixed-content blob and no single type
2163    // describes it accurately.  Callers should rely on chunk *text* content
2164    // rather than chunk type when working with merged Markdown output.
2165    let chunk_type = if group.iter().all(|c| c.chunk_type == first.chunk_type) {
2166        first.chunk_type.clone()
2167    } else {
2168        ChunkType::Text
2169    };
2170
2171    Chunk {
2172        span: Span {
2173            byte_start,
2174            byte_end,
2175            line_start,
2176            line_end,
2177        },
2178        text: chunk_text,
2179        chunk_type,
2180        stride_info: None,
2181        metadata,
2182    }
2183}
2184
2185// Removed duplicate estimate_tokens function - using the one from ck-embed via TokenEstimator
2186
2187#[cfg(test)]
2188mod tests {
2189    use super::*;
2190
2191    fn canonicalize_spans(
2192        mut spans: Vec<(usize, usize, ChunkType)>,
2193    ) -> Vec<(usize, usize, ChunkType)> {
2194        fn chunk_type_order(chunk_type: &ChunkType) -> u8 {
2195            match chunk_type {
2196                ChunkType::Text => 0,
2197                ChunkType::Function => 1,
2198                ChunkType::Class => 2,
2199                ChunkType::Method => 3,
2200                ChunkType::Module => 4,
2201            }
2202        }
2203
2204        spans.sort_by(|a, b| {
2205            let order_a = chunk_type_order(&a.2);
2206            let order_b = chunk_type_order(&b.2);
2207            order_a
2208                .cmp(&order_b)
2209                .then_with(|| a.0.cmp(&b.0))
2210                .then_with(|| a.1.cmp(&b.1))
2211        });
2212
2213        let mut result: Vec<(usize, usize, ChunkType)> = Vec::new();
2214        for (start, end, ty) in spans {
2215            if let Some(last) = result.last_mut()
2216                && last.0 == start
2217                && last.2 == ty
2218            {
2219                if end > last.1 {
2220                    last.1 = end;
2221                }
2222                continue;
2223            }
2224            result.push((start, end, ty));
2225        }
2226
2227        result
2228    }
2229
2230    fn assert_query_parity(language: ParseableLanguage, source: &str) {
2231        let mut parser = tree_sitter::Parser::new();
2232        let ts_language = tree_sitter_language(language).expect("language");
2233        parser.set_language(&ts_language).expect("set language");
2234        let tree = parser.parse(source, None).expect("parse source");
2235
2236        let query_chunks = query_chunker::chunk_with_queries(language, ts_language, &tree, source)
2237            .expect("query execution")
2238            .expect("queries available");
2239
2240        let mut legacy_chunks = Vec::new();
2241        let mut cursor = tree.walk();
2242        extract_code_chunks(&mut cursor, source, &mut legacy_chunks, language);
2243
2244        let query_spans = canonicalize_spans(
2245            query_chunks
2246                .iter()
2247                .map(|chunk| {
2248                    (
2249                        chunk.span.byte_start,
2250                        chunk.span.byte_end,
2251                        chunk.chunk_type.clone(),
2252                    )
2253                })
2254                .collect(),
2255        );
2256        let legacy_spans = canonicalize_spans(
2257            legacy_chunks
2258                .iter()
2259                .map(|chunk| {
2260                    (
2261                        chunk.span.byte_start,
2262                        chunk.span.byte_end,
2263                        chunk.chunk_type.clone(),
2264                    )
2265                })
2266                .collect(),
2267        );
2268
2269        assert_eq!(query_spans, legacy_spans);
2270    }
2271
2272    #[test]
2273    fn test_chunk_generic_byte_offsets() {
2274        // Test that byte offsets are calculated correctly using O(n) algorithm
2275        let text = "line 1\nline 2\nline 3\nline 4\nline 5";
2276        let chunks = chunk_generic(text).unwrap();
2277
2278        assert!(!chunks.is_empty());
2279
2280        // First chunk should start at byte 0
2281        assert_eq!(chunks[0].span.byte_start, 0);
2282
2283        // Each chunk's byte_end should match the actual text length
2284        for chunk in &chunks {
2285            let expected_len = chunk.text.len();
2286            let actual_len = chunk.span.byte_end - chunk.span.byte_start;
2287            assert_eq!(actual_len, expected_len);
2288        }
2289    }
2290
2291    #[test]
2292    fn test_chunk_generic_large_file_performance() {
2293        // Create a large text to ensure O(n) performance
2294        let lines: Vec<String> = (0..1000)
2295            .map(|i| format!("Line {i}: Some content here"))
2296            .collect();
2297        let text = lines.join("\n");
2298
2299        let start = std::time::Instant::now();
2300        let chunks = chunk_generic(&text).unwrap();
2301        let duration = start.elapsed();
2302
2303        // Should complete quickly even for 1000 lines
2304        assert!(
2305            duration.as_millis() < 100,
2306            "Chunking took too long: {duration:?}"
2307        );
2308        assert!(!chunks.is_empty());
2309
2310        // Verify chunks have correct line numbers
2311        for chunk in &chunks {
2312            assert!(chunk.span.line_start > 0);
2313            assert!(chunk.span.line_end >= chunk.span.line_start);
2314        }
2315    }
2316
2317    #[test]
2318    fn test_chunk_rust() {
2319        let rust_code = r"
2320pub struct Calculator {
2321    memory: f64,
2322}
2323
2324impl Calculator {
2325    pub fn new() -> Self {
2326        Calculator { memory: 0.0 }
2327    }
2328
2329    pub fn add(&mut self, a: f64, b: f64) -> f64 {
2330        a + b
2331    }
2332}
2333
2334fn main() {
2335    let calc = Calculator::new();
2336}
2337
2338pub mod utils {
2339    pub fn helper() {}
2340}
2341";
2342
2343        let chunks = chunk_language(rust_code, ParseableLanguage::Rust).unwrap();
2344        assert!(!chunks.is_empty());
2345
2346        // Should find struct, impl, functions, and module
2347        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
2348        assert!(chunk_types.contains(&&ChunkType::Class)); // struct
2349        assert!(chunk_types.contains(&&ChunkType::Module)); // impl and mod
2350        assert!(chunk_types.contains(&&ChunkType::Function)); // functions
2351    }
2352
2353    #[test]
2354    fn test_rust_doc_comments_attached() {
2355        let rust_code = r"
2356/// Doc comment
2357pub struct Foo {}
2358";
2359        let chunks = chunk_language(rust_code, ParseableLanguage::Rust).unwrap();
2360        let struct_chunk = chunks
2361            .iter()
2362            .find(|c| c.text.contains("struct Foo"))
2363            .unwrap();
2364        assert!(
2365            struct_chunk.text.contains("/// Doc comment"),
2366            "Doc comment should be attached"
2367        );
2368    }
2369
2370    #[test]
2371    fn test_rust_query_matches_legacy() {
2372        let source = r"
2373            mod sample {
2374                struct Thing;
2375
2376                impl Thing {
2377                    fn new() -> Self { Self }
2378                    fn helper(&self) {}
2379                }
2380            }
2381
2382            fn util() {}
2383        ";
2384
2385        assert_query_parity(ParseableLanguage::Rust, source);
2386    }
2387
2388    #[test]
2389    fn test_python_query_matches_legacy() {
2390        let source = r"
2391class Example:
2392    @classmethod
2393    def build(cls):
2394        return cls()
2395
2396
2397def helper():
2398    return 1
2399
2400
2401async def async_helper():
2402    return 2
2403";
2404
2405        assert_query_parity(ParseableLanguage::Python, source);
2406    }
2407
2408    #[test]
2409    fn test_chunk_ruby() {
2410        let ruby_code = r#"
2411class Calculator
2412  def initialize
2413    @memory = 0.0
2414  end
2415
2416  def add(a, b)
2417    a + b
2418  end
2419
2420  def self.class_method
2421    "class method"
2422  end
2423
2424  private
2425
2426  def private_method
2427    "private"
2428  end
2429end
2430
2431module Utils
2432  def self.helper
2433    "helper"
2434  end
2435end
2436
2437def main
2438  calc = Calculator.new
2439end
2440"#;
2441
2442        let chunks = chunk_language(ruby_code, ParseableLanguage::Ruby).unwrap();
2443        assert!(!chunks.is_empty());
2444
2445        // Should find class, module, and methods
2446        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
2447        assert!(chunk_types.contains(&&ChunkType::Class)); // class
2448        assert!(chunk_types.contains(&&ChunkType::Module)); // module
2449        assert!(chunk_types.contains(&&ChunkType::Function)); // methods
2450    }
2451
2452    #[test]
2453    fn test_language_detection_fallback() {
2454        // Test that unknown languages fall back to generic chunking
2455        let generic_text = "Some text\nwith multiple lines\nto chunk generically";
2456
2457        let chunks_unknown = chunk_text(generic_text, None).unwrap();
2458        let chunks_generic = chunk_generic(generic_text).unwrap();
2459
2460        // Should produce the same result
2461        assert_eq!(chunks_unknown.len(), chunks_generic.len());
2462        assert_eq!(chunks_unknown[0].text, chunks_generic[0].text);
2463    }
2464
2465    #[test]
2466    fn test_chunk_go() {
2467        let go_code = r#"
2468package main
2469
2470import "fmt"
2471
2472const Pi = 3.14159
2473
2474var memory float64
2475
2476type Calculator struct {
2477    memory float64
2478}
2479
2480type Operation interface {
2481    Calculate(a, b float64) float64
2482}
2483
2484func NewCalculator() *Calculator {
2485    return &Calculator{memory: 0.0}
2486}
2487
2488func (c *Calculator) Add(a, b float64) float64 {
2489    return a + b
2490}
2491
2492func main() {
2493    calc := NewCalculator()
2494}
2495"#;
2496
2497        let chunks = chunk_language(go_code, ParseableLanguage::Go).unwrap();
2498        assert!(!chunks.is_empty());
2499
2500        // Should find const, var, type declarations, functions, and methods
2501        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
2502        assert!(chunk_types.contains(&&ChunkType::Module)); // const and var
2503        assert!(chunk_types.contains(&&ChunkType::Class)); // struct and interface
2504        assert!(chunk_types.contains(&&ChunkType::Function)); // functions
2505        assert!(chunk_types.contains(&&ChunkType::Method)); // methods
2506    }
2507
2508    #[test]
2509    #[ignore] // TODO: Update test to match query-based chunking behavior
2510    fn test_chunk_typescript_arrow_context() {
2511        let ts_code = r"
2512// Utility function
2513export const util = () => {
2514    // comment about util
2515    return 42;
2516};
2517
2518export class Example {
2519    // leading comment for method
2520    constructor() {}
2521
2522    // Another comment
2523    run = () => {
2524        return util();
2525    };
2526}
2527
2528const compute = (x: number) => x * 2;
2529";
2530
2531        let chunks = chunk_language(ts_code, ParseableLanguage::TypeScript).unwrap();
2532
2533        let util_chunk = chunks
2534            .iter()
2535            .find(|chunk| chunk.text.contains("export const util"))
2536            .expect("Expected chunk for util arrow function");
2537        assert_eq!(util_chunk.chunk_type, ChunkType::Function);
2538        assert!(
2539            util_chunk.text.contains("// Utility function"),
2540            "expected leading comment to be included"
2541        );
2542        assert!(util_chunk.text.contains("export const util ="));
2543
2544        // The class field arrow function should be classified as a method and include its comment
2545        let method_chunk = chunks
2546            .iter()
2547            .find(|chunk| {
2548                chunk.chunk_type == ChunkType::Method && chunk.text.contains("run = () =>")
2549            })
2550            .expect("Expected chunk for class field arrow function");
2551
2552        assert_eq!(method_chunk.chunk_type, ChunkType::Method);
2553        assert!(
2554            method_chunk.text.contains("// Another comment"),
2555            "expected inline comment to be included"
2556        );
2557
2558        let compute_chunk = chunks
2559            .iter()
2560            .find(|chunk| chunk.text.contains("const compute"))
2561            .expect("Expected chunk for compute arrow function");
2562        assert_eq!(compute_chunk.chunk_type, ChunkType::Function);
2563        assert!(
2564            compute_chunk
2565                .text
2566                .contains("const compute = (x: number) => x * 2;")
2567        );
2568
2569        // Ensure we don't create bare arrow-expression chunks without context
2570        assert!(
2571            chunks
2572                .iter()
2573                .all(|chunk| !chunk.text.trim_start().starts_with("() =>"))
2574        );
2575        assert!(
2576            chunks
2577                .iter()
2578                .all(|chunk| !chunk.text.trim_start().starts_with("(x: number) =>"))
2579        );
2580    }
2581
2582    // TODO: Query-based chunking is more accurate than legacy for TypeScript
2583    // and finds additional method chunks. This is the correct behavior.
2584    // Legacy parity tests are disabled until legacy chunking is updated.
2585    #[test]
2586    #[ignore]
2587    fn test_typescript_query_matches_legacy() {
2588        let source = r"
2589export const util = () => {
2590    return 42;
2591};
2592
2593export class Example {
2594    run = () => {
2595        return util();
2596    };
2597}
2598
2599const compute = (x: number) => x * 2;
2600";
2601
2602        assert_query_parity(ParseableLanguage::TypeScript, source);
2603    }
2604
2605    #[test]
2606    fn test_ruby_query_matches_legacy() {
2607        let source = r#"
2608class Calculator
2609  def initialize
2610    @memory = 0.0
2611  end
2612
2613  def add(a, b)
2614    a + b
2615  end
2616
2617  def self.class_method
2618    "class method"
2619  end
2620end
2621"#;
2622
2623        assert_query_parity(ParseableLanguage::Ruby, source);
2624    }
2625
2626    #[test]
2627    fn test_go_query_matches_legacy() {
2628        let source = r#"
2629package main
2630
2631import "fmt"
2632
2633const Pi = 3.14159
2634
2635var memory float64
2636
2637type Calculator struct {
2638    memory float64
2639}
2640
2641func (c *Calculator) Add(a, b float64) float64 {
2642    return a + b
2643}
2644
2645func Helper() {}
2646"#;
2647
2648        assert_query_parity(ParseableLanguage::Go, source);
2649    }
2650
2651    #[test]
2652    fn test_chunk_c_corner_cases() {
2653        let c_code = r#"
2654#define MAX(a,b) ((a) > (b) ? (a) : (b))
2655#define VERSION 3
2656
2657typedef struct Node {
2658    int value;
2659    struct Node* next;
2660} Node;
2661
2662union Payload {
2663    int i;
2664    float f;
2665};
2666
2667enum Color {
2668    Red,
2669    Green,
2670    Blue,
2671};
2672
2673static inline int add(int a, int b) {
2674    return a + b;
2675}
2676
2677int main(void) {
2678    return MAX(add(1, 2), VERSION);
2679}
2680"#;
2681
2682        let chunks = chunk_language(c_code, ParseableLanguage::C).unwrap();
2683        assert!(!chunks.is_empty());
2684
2685        assert!(
2686            chunks
2687                .iter()
2688                .any(|c| { c.chunk_type == ChunkType::Function && c.text.contains("#define MAX") })
2689        );
2690        assert!(
2691            chunks
2692                .iter()
2693                .any(|c| { c.chunk_type == ChunkType::Text && c.text.contains("#define VERSION") })
2694        );
2695        assert!(
2696            chunks
2697                .iter()
2698                .any(|c| { c.chunk_type == ChunkType::Class && c.text.contains("struct Node") })
2699        );
2700        assert!(
2701            chunks
2702                .iter()
2703                .any(|c| { c.chunk_type == ChunkType::Class && c.text.contains("union Payload") })
2704        );
2705        assert!(
2706            chunks
2707                .iter()
2708                .any(|c| { c.chunk_type == ChunkType::Class && c.text.contains("enum Color") })
2709        );
2710        assert!(chunks.iter().any(|c| {
2711            c.chunk_type == ChunkType::Function && c.text.contains("static inline int add")
2712        }));
2713        assert!(
2714            chunks
2715                .iter()
2716                .any(|c| { c.chunk_type == ChunkType::Function && c.text.contains("int main") })
2717        );
2718    }
2719
2720    #[test]
2721    fn test_chunk_c_struct_declaration_without_body_stays_intact() {
2722        let c_code = r#"
2723#include <stdint.h>
2724
2725struct mtd_info_user meminfo;
2726struct foo forward;
2727"#;
2728
2729        let chunks = chunk_language(c_code, ParseableLanguage::C).unwrap();
2730
2731        assert!(
2732            chunks
2733                .iter()
2734                .any(|c| { c.text.contains("struct mtd_info_user meminfo;") })
2735        );
2736        assert!(
2737            chunks
2738                .iter()
2739                .any(|c| c.text.contains("struct foo forward;"))
2740        );
2741        assert!(
2742            !chunks
2743                .iter()
2744                .any(|c| c.text.trim() == "struct mtd_info_user")
2745        );
2746        assert!(!chunks.iter().any(|c| c.text.trim() == "struct foo"));
2747    }
2748
2749    #[test]
2750    fn test_chunk_cpp_corner_cases() {
2751        let cpp_code = r#"
2752#include <vector>
2753#define SQUARE(x) ((x) * (x))
2754
2755namespace math {
2756template <typename T>
2757T add(T a, T b) {
2758    return a + b;
2759}
2760
2761using Vec = std::vector<int>;
2762typedef unsigned long ulong_t;
2763
2764struct Point {
2765    int x;
2766    int y;
2767};
2768
2769class Calculator {
2770public:
2771    int add(int a, int b) { return a + b; }
2772};
2773
2774enum class Color { Red, Green, Blue };
2775} // namespace math
2776
2777int main() {
2778    return math::add(1, 2);
2779}
2780"#;
2781
2782        let chunks = chunk_language(cpp_code, ParseableLanguage::Cpp).unwrap();
2783        assert!(!chunks.is_empty());
2784
2785        assert!(
2786            chunks
2787                .iter()
2788                .any(|c| c.text.contains("template <typename T>"))
2789        );
2790        assert!(
2791            chunks
2792                .iter()
2793                .any(|c| { c.chunk_type == ChunkType::Text && c.text.contains("using Vec") })
2794        );
2795        assert!(chunks.iter().any(|c| {
2796            c.chunk_type == ChunkType::Text && c.text.contains("typedef unsigned long")
2797        }));
2798        assert!(
2799            chunks.iter().any(|c| {
2800                c.chunk_type == ChunkType::Function && c.text.contains("#define SQUARE")
2801            })
2802        );
2803        let calculator_chunk = chunks
2804            .iter()
2805            .find(|c| c.chunk_type == ChunkType::Class && c.text.contains("class Calculator"));
2806        assert!(calculator_chunk.is_some());
2807        let calculator_chunk = calculator_chunk.unwrap();
2808        assert!(calculator_chunk.text.contains("int add"));
2809        assert!(!calculator_chunk.text.contains("return a + b"));
2810
2811        assert!(
2812            chunks
2813                .iter()
2814                .any(|c| { c.chunk_type == ChunkType::Class && c.text.contains("struct Point") })
2815        );
2816        assert!(
2817            chunks.iter().any(|c| {
2818                c.chunk_type == ChunkType::Class && c.text.contains("enum class Color")
2819            })
2820        );
2821        assert!(
2822            chunks
2823                .iter()
2824                .any(|c| { c.chunk_type == ChunkType::Function && c.text.contains("int main") })
2825        );
2826        assert!(
2827            chunks
2828                .iter()
2829                .any(|c| { c.chunk_type == ChunkType::Function && c.text.contains("T add") })
2830        );
2831        assert!(chunks.iter().any(|c| {
2832            c.chunk_type == ChunkType::Method && c.text.contains("int add(int a, int b)")
2833        }));
2834    }
2835
2836    #[test]
2837    fn test_cpp_suppresses_contained_text_chunks() {
2838        let cpp_code = r#"
2839class Widget {
2840public:
2841    using Alias = int;
2842    int calc() { int local = 1; return local; }
2843};
2844
2845using TopLevel = double;
2846"#;
2847
2848        let chunks = chunk_language(cpp_code, ParseableLanguage::Cpp).unwrap();
2849
2850        assert!(
2851            !chunks
2852                .iter()
2853                .any(|c| { c.chunk_type == ChunkType::Text && c.text.contains("using Alias") })
2854        );
2855        assert!(
2856            !chunks
2857                .iter()
2858                .any(|c| { c.chunk_type == ChunkType::Text && c.text.contains("int local") })
2859        );
2860        assert!(
2861            chunks
2862                .iter()
2863                .any(|c| { c.chunk_type == ChunkType::Text && c.text.contains("using TopLevel") })
2864        );
2865        assert!(
2866            chunks
2867                .iter()
2868                .any(|c| { c.chunk_type == ChunkType::Method && c.text.contains("int calc") })
2869        );
2870    }
2871
2872    #[test]
2873    fn test_cpp_template_prefix_merges_with_definition() {
2874        let cpp_code = r#"
2875template <typename T>
2876struct Box {
2877    static int value;
2878};
2879
2880template <typename T>
2881int Box<T>::value = 0;
2882"#;
2883
2884        let chunks = chunk_language(cpp_code, ParseableLanguage::Cpp).unwrap();
2885
2886        let def_chunk = chunks
2887            .iter()
2888            .find(|c| c.text.contains("int Box<T>::value = 0;"))
2889            .expect("static member definition chunk present");
2890
2891        assert!(def_chunk.text.contains("template <typename T>"));
2892
2893        assert!(!chunks.iter().any(|c| {
2894            c.chunk_type == ChunkType::Text && c.text.trim() == "template <typename T>"
2895        }));
2896    }
2897
2898    #[test]
2899    fn test_cpp_template_method_breadcrumb_in_namespaces() {
2900        let cpp_code = r#"
2901namespace com {
2902namespace ford {
2903
2904template <typename T>
2905class Wrapper {
2906public:
2907    template <typename U>
2908    U convert(U value) { return value; }
2909};
2910
2911} // namespace ford
2912} // namespace com
2913"#;
2914
2915        let chunks = chunk_language(cpp_code, ParseableLanguage::Cpp).unwrap();
2916        let method_chunk = chunks
2917            .iter()
2918            .find(|c| c.chunk_type == ChunkType::Method && c.text.contains("convert"))
2919            .expect("convert method chunk present");
2920
2921        assert_eq!(
2922            method_chunk.metadata.breadcrumb.as_deref(),
2923            Some("com::ford::Wrapper::convert")
2924        );
2925    }
2926
2927    #[test]
2928    fn test_cpp_function_breadcrumb_qualification() {
2929        let cpp_code = r#"
2930namespace outer {
2931class A {
2932public:
2933    void m();
2934};
2935}
2936
2937void outer::A::m() {
2938    // body
2939}
2940"#;
2941
2942        let chunks = chunk_language(cpp_code, ParseableLanguage::Cpp).unwrap();
2943        let method_chunk = chunks
2944            .iter()
2945            .find(|c| c.chunk_type == ChunkType::Function && c.text.contains("outer::A::m"))
2946            .expect("method chunk should exist");
2947        assert_eq!(
2948            method_chunk.metadata.breadcrumb.as_deref(),
2949            Some("outer::A::m")
2950        );
2951    }
2952
2953    #[test]
2954    fn test_haskell_query_matches_legacy() {
2955        let source = r#"
2956module Example where
2957
2958data Shape
2959  = Circle Float
2960  | Square Float
2961
2962type family Area a
2963
2964class Printable a where
2965    printValue :: a -> String
2966
2967instance Printable Shape where
2968    printValue (Circle _) = "circle"
2969    printValue (Square _) = "square"
2970
2971shapeDescription :: Shape -> String
2972shapeDescription (Circle r) = "circle of radius " ++ show r
2973shapeDescription (Square s) = "square of side " ++ show s
2974"#;
2975
2976        assert_query_parity(ParseableLanguage::Haskell, source);
2977    }
2978
2979    #[test]
2980    fn test_markdown_real_file_breadcrumbs() {
2981        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
2982            .join("tests/fixtures/markdown_breadcrumbs.md");
2983        let source = std::fs::read_to_string(&path).expect("read markdown file");
2984
2985        let chunks =
2986            chunk_text(&source, Some(ck_core::Language::Markdown)).expect("chunk markdown");
2987
2988        // The fixture is a small document (well under the token target), so
2989        // merge_small_chunks collapses all individual heading / paragraph chunks
2990        // into a single merged chunk.  After merging the chunk_type is
2991        // ChunkType::Text (mixed content) — that is intentional; see
2992        // merge_small_chunks for the rationale.  What we verify here is that
2993        // the actual heading text survives in the merged output so that
2994        // full-text search can still find it.
2995        let all_text: String = chunks
2996            .iter()
2997            .map(|c| c.text.as_str())
2998            .collect::<Vec<_>>()
2999            .join("\n");
3000
3001        assert!(
3002            all_text.contains("Project Overview"),
3003            "expected top-level heading text to be present in merged chunk"
3004        );
3005        assert!(
3006            all_text.contains("## Usage"),
3007            "expected second-level heading text to be present in merged chunk"
3008        );
3009        assert!(
3010            all_text.contains("Setext Section"),
3011            "expected setext heading text to be present in merged chunk"
3012        );
3013    }
3014
3015    #[test]
3016    fn test_markdown_inline_fixtures_cover_blocks() {
3017        let source = r#"
3018# Title
3019
3020Intro paragraph with **bold** text.
3021
3022## Usage
3023
3024```rust
3025fn main() {
3026    println!("hi");
3027}
3028```
3029
3030> Blockquote with _emphasis_.
3031
3032- Item one
3033- Item two
3034
3035Setext Section
3036==============
3037
3038Trailing paragraph.
3039"#;
3040
3041        let chunks = chunk_text(source, Some(ck_core::Language::Markdown)).expect("chunk markdown");
3042
3043        // This source is a small synthetic document.  merge_small_chunks will
3044        // collapse all individual heading / paragraph / code-block chunks into
3045        // one (or a few) merged chunks whose chunk_type is ChunkType::Text.
3046        // That is intentional — tiny Markdown chunks produce weak embeddings;
3047        // see merge_small_chunks for the full rationale.  What matters is that
3048        // all block content is preserved in the output text so that full-text
3049        // and semantic search can still find it.
3050        let all_text: String = chunks
3051            .iter()
3052            .map(|c| c.text.as_str())
3053            .collect::<Vec<_>>()
3054            .join("\n");
3055
3056        assert!(
3057            all_text.contains("# Title") || all_text.contains("## Usage"),
3058            "expected heading text to be present after merging"
3059        );
3060        assert!(
3061            all_text.contains("```rust"),
3062            "expected markdown to include fenced code block"
3063        );
3064        assert!(
3065            all_text.contains("> Blockquote"),
3066            "expected markdown to include blockquote text"
3067        );
3068        assert!(
3069            all_text.contains("Setext Section"),
3070            "expected markdown to include Setext heading text"
3071        );
3072    }
3073
3074    #[test]
3075    fn test_csharp_query_matches_legacy() {
3076        let source = r"
3077namespace Calculator;
3078
3079public interface ICalculator
3080{
3081    double Add(double x, double y);
3082}
3083
3084public class Calculator
3085{
3086    public static double PI = 3.14159;
3087    private double _memory;
3088
3089    public Calculator()
3090    {
3091        _memory = 0.0;
3092    }
3093
3094    public double Add(double x, double y)
3095    {
3096        return x + y;
3097    }
3098}
3099";
3100
3101        assert_query_parity(ParseableLanguage::CSharp, source);
3102    }
3103
3104    #[test]
3105    fn test_zig_query_matches_legacy() {
3106        let source = r#"
3107const std = @import("std");
3108
3109const Calculator = struct {
3110    memory: f64,
3111
3112    pub fn init() Calculator {
3113        return Calculator{ .memory = 0.0 };
3114    }
3115
3116    pub fn add(self: *Calculator, a: f64, b: f64) f64 {
3117        return a + b;
3118    }
3119};
3120
3121test "calculator addition" {
3122    var calc = Calculator.init();
3123    const result = calc.add(2.0, 3.0);
3124    try std.testing.expect(result == 5.0);
3125}
3126"#;
3127
3128        assert_query_parity(ParseableLanguage::Zig, source);
3129    }
3130
3131    #[test]
3132    fn test_chunk_zig() {
3133        let zig_code = r#"
3134const std = @import("std");
3135
3136const Calculator = struct {
3137    memory: f64,
3138
3139    pub fn init() Calculator {
3140        return Calculator{ .memory = 0.0 };
3141    }
3142
3143    pub fn add(self: *Calculator, a: f64, b: f64) f64 {
3144        const result = a + b;
3145        self.memory = result;
3146        return result;
3147    }
3148};
3149
3150const Color = enum {
3151    Red,
3152    Green,
3153    Blue,
3154};
3155
3156const Value = union(enum) {
3157    int: i32,
3158    float: f64,
3159};
3160
3161const Handle = opaque {};
3162
3163const MathError = error{
3164    DivisionByZero,
3165    Overflow,
3166};
3167
3168pub fn multiply(a: i32, b: i32) i32 {
3169    return a * b;
3170}
3171
3172pub fn divide(a: i32, b: i32) MathError!i32 {
3173    if (b == 0) return error.DivisionByZero;
3174    return @divTrunc(a, b);
3175}
3176
3177comptime {
3178    @compileLog("Compile-time validation");
3179}
3180
3181pub fn main() !void {
3182    var calc = Calculator.init();
3183    const result = calc.add(2.0, 3.0);
3184    std.debug.print("Result: {}\n", .{result});
3185}
3186
3187test "calculator addition" {
3188    var calc = Calculator.init();
3189    const result = calc.add(2.0, 3.0);
3190    try std.testing.expect(result == 5.0);
3191}
3192
3193test "multiply function" {
3194    const result = multiply(3, 4);
3195    try std.testing.expect(result == 12);
3196}
3197"#;
3198
3199        let chunks = chunk_language(zig_code, ParseableLanguage::Zig).unwrap();
3200        assert!(!chunks.is_empty());
3201
3202        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
3203
3204        let class_count = chunk_types
3205            .iter()
3206            .filter(|&&t| t == &ChunkType::Class)
3207            .count();
3208        let function_count = chunk_types
3209            .iter()
3210            .filter(|&&t| t == &ChunkType::Function)
3211            .count();
3212        let module_count = chunk_types
3213            .iter()
3214            .filter(|&&t| t == &ChunkType::Module)
3215            .count();
3216
3217        assert!(
3218            class_count >= 5,
3219            "Expected at least 5 Class chunks (struct, enum, union, opaque, error set), found {class_count}"
3220        );
3221
3222        assert!(
3223            function_count >= 3,
3224            "Expected at least 3 functions (multiply, divide, main), found {function_count}"
3225        );
3226
3227        assert!(
3228            module_count >= 4,
3229            "Expected at least 4 module-type chunks (const std, comptime, 2 tests), found {module_count}"
3230        );
3231
3232        assert!(
3233            chunk_types.contains(&&ChunkType::Class),
3234            "Expected to find Class chunks"
3235        );
3236        assert!(
3237            chunk_types.contains(&&ChunkType::Function),
3238            "Expected to find Function chunks"
3239        );
3240        assert!(
3241            chunk_types.contains(&&ChunkType::Module),
3242            "Expected to find Module chunks"
3243        );
3244    }
3245
3246    #[test]
3247    fn test_chunk_csharp() {
3248        let csharp_code = r"
3249namespace Calculator;
3250
3251public interface ICalculator
3252{
3253    double Add(double x, double y);
3254}
3255
3256public class Calculator
3257{
3258    public static const double PI = 3.14159;
3259    private double _memory;
3260
3261    public Calculator()
3262    {
3263        _memory = 0.0;
3264    }
3265
3266    public double Add(double x, double y)
3267    {
3268        return x + y;
3269    }
3270
3271    public static void Main(string[] args)
3272    {
3273        var calc = new Calculator();
3274    }
3275}
3276";
3277
3278        let chunks = chunk_language(csharp_code, ParseableLanguage::CSharp).unwrap();
3279        assert!(!chunks.is_empty());
3280
3281        // Should find variable, class, method and interface declarations
3282        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
3283        assert!(chunk_types.contains(&&ChunkType::Module)); // var, interface
3284        assert!(chunk_types.contains(&&ChunkType::Class)); // class
3285        assert!(chunk_types.contains(&&ChunkType::Method)); // methods
3286    }
3287
3288    #[test]
3289    fn test_stride_large_chunk_empty_text() {
3290        // Regression test for divide-by-zero bug in stride_large_chunk
3291        let empty_chunk = Chunk {
3292            span: Span {
3293                byte_start: 0,
3294                byte_end: 0,
3295                line_start: 1,
3296                line_end: 1,
3297            },
3298            text: String::new(), // Empty text should not panic
3299            chunk_type: ChunkType::Text,
3300            stride_info: None,
3301            metadata: ChunkMetadata::from_text(""),
3302        };
3303
3304        let config = ChunkConfig::default();
3305        let result = stride_large_chunk(empty_chunk.clone(), &config);
3306
3307        // Should not panic and return the original chunk
3308        assert!(result.is_ok());
3309        let chunks = result.unwrap();
3310        assert_eq!(chunks.len(), 1);
3311        assert_eq!(chunks[0].text, "");
3312    }
3313
3314    #[test]
3315    fn test_stride_large_chunk_zero_token_estimate() {
3316        // Regression test for zero token estimate causing divide-by-zero
3317        let chunk = Chunk {
3318            span: Span {
3319                byte_start: 0,
3320                byte_end: 5,
3321                line_start: 1,
3322                line_end: 1,
3323            },
3324            text: "     ".to_string(), // Whitespace that might return 0 tokens
3325            chunk_type: ChunkType::Text,
3326            stride_info: None,
3327            metadata: ChunkMetadata::from_text("     "),
3328        };
3329
3330        let config = ChunkConfig::default();
3331        let result = stride_large_chunk(chunk, &config);
3332
3333        // Should not panic and handle gracefully
3334        assert!(result.is_ok());
3335    }
3336
3337    #[test]
3338    fn test_strided_chunk_line_calculation() {
3339        // Regression test for line_end calculation in strided chunks
3340        // Create a chunk large enough to force striding
3341        let long_text = (1..=50).map(|i| format!("This is a longer line {i} with more content to ensure token count is high enough")).collect::<Vec<_>>().join("\n");
3342
3343        let metadata = ChunkMetadata::from_text(&long_text);
3344        let chunk = Chunk {
3345            span: Span {
3346                byte_start: 0,
3347                byte_end: long_text.len(),
3348                line_start: 1,
3349                line_end: 50,
3350            },
3351            text: long_text,
3352            chunk_type: ChunkType::Text,
3353            stride_info: None,
3354            metadata,
3355        };
3356
3357        let config = ChunkConfig {
3358            max_tokens: 100,    // Force striding with reasonable limit
3359            stride_overlap: 10, // Small overlap for testing
3360            ..Default::default()
3361        };
3362
3363        let result = stride_large_chunk(chunk, &config);
3364        if let Err(e) = &result {
3365            eprintln!("Stride error: {e}");
3366        }
3367        assert!(result.is_ok());
3368
3369        let chunks = result.unwrap();
3370        assert!(
3371            chunks.len() > 1,
3372            "Should create multiple chunks when striding"
3373        );
3374
3375        for chunk in chunks {
3376            // Verify line_end is not off by one
3377            // line_end should be inclusive and not exceed the actual content
3378            assert!(chunk.span.line_end >= chunk.span.line_start);
3379
3380            // Check that line span makes sense for the content
3381            let line_count = chunk.text.lines().count();
3382            if line_count > 0 {
3383                let calculated_line_span = chunk.span.line_end - chunk.span.line_start + 1;
3384
3385                // Allow some tolerance for striding logic
3386                assert!(
3387                    calculated_line_span <= line_count + 1,
3388                    "Line span {calculated_line_span} should not exceed content lines {line_count} by more than 1"
3389                );
3390            }
3391        }
3392    }
3393
3394    #[test]
3395    fn test_gap_filling_coverage() {
3396        // Test that all non-whitespace content gets chunked
3397        let test_cases = vec![
3398            (
3399                ParseableLanguage::Rust,
3400                r#"// This is a test file with imports at the top
3401use std::collections::HashMap;
3402use std::sync::Arc;
3403
3404// A comment between imports and code
3405const VERSION: &str = "1.0.0";
3406
3407// Main function
3408fn main() {
3409    println!("Hello, world!");
3410}
3411
3412// Some trailing content
3413// that should be indexed
3414"#,
3415            ),
3416            (
3417                ParseableLanguage::Python,
3418                r#"# Imports at the top
3419import os
3420import sys
3421
3422# Some constant
3423VERSION = "1.0.0"
3424
3425# Main function
3426def main():
3427    print("Hello, world!")
3428
3429# Trailing comment
3430# should be indexed
3431"#,
3432            ),
3433            (
3434                ParseableLanguage::TypeScript,
3435                r#"// Imports at the top
3436import { foo } from 'bar';
3437
3438// Some constant
3439const VERSION = "1.0.0";
3440
3441// Main function
3442function main() {
3443    console.log("Hello, world!");
3444}
3445
3446// Trailing comment
3447// should be indexed
3448"#,
3449            ),
3450        ];
3451
3452        for (language, code) in test_cases {
3453            eprintln!("\n=== Testing {language} ===");
3454            let chunks = chunk_language(code, language).unwrap();
3455
3456            // Verify all non-whitespace bytes are covered
3457            let mut covered_bytes = vec![false; code.len()];
3458            for chunk in &chunks {
3459                for item in covered_bytes
3460                    .iter_mut()
3461                    .take(chunk.span.byte_end)
3462                    .skip(chunk.span.byte_start)
3463                {
3464                    *item = true;
3465                }
3466            }
3467
3468            let uncovered_non_ws: Vec<usize> = covered_bytes
3469                .iter()
3470                .enumerate()
3471                .filter(|(i, covered)| !**covered && !code.as_bytes()[*i].is_ascii_whitespace())
3472                .map(|(i, _)| i)
3473                .collect();
3474
3475            if !uncovered_non_ws.is_empty() {
3476                eprintln!("\n=== UNCOVERED NON-WHITESPACE for {language} ===");
3477                eprintln!("Total bytes: {}", code.len());
3478                eprintln!("Uncovered non-whitespace: {}", uncovered_non_ws.len());
3479
3480                // Show what's uncovered
3481                for &pos in uncovered_non_ws.iter().take(10) {
3482                    let context_start = pos.saturating_sub(20);
3483                    let context_end = (pos + 20).min(code.len());
3484                    eprintln!(
3485                        "Uncovered at byte {}: {:?}",
3486                        pos,
3487                        &code[context_start..context_end]
3488                    );
3489                }
3490
3491                eprintln!("\n=== CHUNKS ===");
3492                for (i, chunk) in chunks.iter().enumerate() {
3493                    eprintln!(
3494                        "Chunk {}: {:?} bytes {}-{} (len {})",
3495                        i,
3496                        chunk.chunk_type,
3497                        chunk.span.byte_start,
3498                        chunk.span.byte_end,
3499                        chunk.span.byte_end - chunk.span.byte_start
3500                    );
3501                    eprintln!("  Text: {:?}", &chunk.text[..chunk.text.len().min(60)]);
3502                }
3503            }
3504
3505            assert!(
3506                uncovered_non_ws.is_empty(),
3507                "{}: Expected all non-whitespace covered but found {} uncovered non-whitespace bytes",
3508                language,
3509                uncovered_non_ws.len()
3510            );
3511        }
3512    }
3513
3514    #[test]
3515    fn test_web_server_file_coverage() {
3516        // Test that all non-whitespace content in web_server.rs is covered
3517        let code = std::fs::read_to_string("../examples/code/web_server.rs")
3518            .expect("Failed to read web_server.rs");
3519
3520        let chunks = chunk_language(&code, ParseableLanguage::Rust).unwrap();
3521
3522        // Check coverage for non-whitespace content only
3523        let mut covered = vec![false; code.len()];
3524        for chunk in &chunks {
3525            for item in covered
3526                .iter_mut()
3527                .take(chunk.span.byte_end)
3528                .skip(chunk.span.byte_start)
3529            {
3530                *item = true;
3531            }
3532        }
3533
3534        // Find uncovered bytes that are NOT whitespace
3535        let uncovered_non_whitespace: Vec<(usize, char)> = covered
3536            .iter()
3537            .enumerate()
3538            .filter(|(i, covered)| !**covered && !code.as_bytes()[*i].is_ascii_whitespace())
3539            .map(|(i, _)| (i, code.chars().nth(i).unwrap_or('?')))
3540            .collect();
3541
3542        if !uncovered_non_whitespace.is_empty() {
3543            eprintln!("\n=== WEB_SERVER.RS UNCOVERED NON-WHITESPACE ===");
3544            eprintln!("File size: {} bytes", code.len());
3545            eprintln!("Total chunks: {}", chunks.len());
3546            eprintln!(
3547                "Uncovered non-whitespace: {}",
3548                uncovered_non_whitespace.len()
3549            );
3550
3551            for &(pos, ch) in uncovered_non_whitespace.iter().take(10) {
3552                let start = pos.saturating_sub(30);
3553                let end = (pos + 30).min(code.len());
3554                eprintln!(
3555                    "\nUncovered '{}' at byte {}: {:?}",
3556                    ch,
3557                    pos,
3558                    &code[start..end]
3559                );
3560            }
3561
3562            eprintln!("\n=== CHUNKS ===");
3563            for (i, chunk) in chunks.iter().enumerate().take(20) {
3564                eprintln!(
3565                    "Chunk {}: {:?} bytes {}-{} lines {}-{}",
3566                    i,
3567                    chunk.chunk_type,
3568                    chunk.span.byte_start,
3569                    chunk.span.byte_end,
3570                    chunk.span.line_start,
3571                    chunk.span.line_end
3572                );
3573            }
3574        }
3575
3576        assert!(
3577            uncovered_non_whitespace.is_empty(),
3578            "Expected all non-whitespace content covered but found {} uncovered non-whitespace bytes",
3579            uncovered_non_whitespace.len()
3580        );
3581    }
3582
3583    #[test]
3584    fn test_haskell_function_chunking() {
3585        let haskell_code = r"
3586factorial :: Integer -> Integer
3587factorial 0 = 1
3588factorial n = n * factorial (n - 1)
3589
3590fibonacci :: Integer -> Integer
3591fibonacci 0 = 0
3592fibonacci 1 = 1
3593fibonacci n = fibonacci (n - 1) + fibonacci (n - 2)
3594";
3595
3596        let mut parser = tree_sitter::Parser::new();
3597        parser
3598            .set_language(&tree_sitter_haskell::LANGUAGE.into())
3599            .unwrap();
3600        let tree = parser.parse(haskell_code, None).unwrap();
3601
3602        // Debug: print tree structure
3603        fn walk(node: tree_sitter::Node, _src: &str, depth: usize) {
3604            let kind = node.kind();
3605            let start = node.start_position();
3606            let end = node.end_position();
3607            eprintln!(
3608                "{}{:30} L{}-{}",
3609                "  ".repeat(depth),
3610                kind,
3611                start.row + 1,
3612                end.row + 1
3613            );
3614
3615            let mut cursor = node.walk();
3616            if cursor.goto_first_child() {
3617                loop {
3618                    walk(cursor.node(), _src, depth + 1);
3619                    if !cursor.goto_next_sibling() {
3620                        break;
3621                    }
3622                }
3623            }
3624        }
3625
3626        eprintln!("\n=== TREE STRUCTURE ===");
3627        walk(tree.root_node(), haskell_code, 0);
3628        eprintln!("=== END TREE ===\n");
3629
3630        let chunks = chunk_language(haskell_code, ParseableLanguage::Haskell).unwrap();
3631
3632        eprintln!("\n=== CHUNKS ===");
3633        for (i, chunk) in chunks.iter().enumerate() {
3634            eprintln!(
3635                "Chunk {}: {:?} L{}-{}",
3636                i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
3637            );
3638            eprintln!("  Text: {:?}", chunk.text);
3639        }
3640        eprintln!("=== END CHUNKS ===\n");
3641
3642        assert!(!chunks.is_empty(), "Should find chunks in Haskell code");
3643
3644        // Find factorial chunk and verify it includes both signature and implementation
3645        let factorial_chunk = chunks.iter().find(|c| c.text.contains("factorial 0 = 1"));
3646        assert!(
3647            factorial_chunk.is_some(),
3648            "Should find factorial function body"
3649        );
3650
3651        let fac = factorial_chunk.unwrap();
3652        assert!(
3653            fac.text.contains("factorial :: Integer -> Integer"),
3654            "Should include type signature"
3655        );
3656        assert!(
3657            fac.text.contains("factorial 0 = 1"),
3658            "Should include base case"
3659        );
3660        assert!(
3661            fac.text.contains("factorial n = n * factorial (n - 1)"),
3662            "Should include recursive case"
3663        );
3664    }
3665
3666    #[test]
3667    fn test_chunk_elixir_basic() {
3668        let elixir_code = r#"
3669defmodule Calculator do
3670  @moduledoc "A simple calculator module"
3671
3672  def add(a, b) do
3673    a + b
3674  end
3675
3676  defp multiply(a, b) do
3677    a * b
3678  end
3679end
3680"#;
3681
3682        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
3683
3684        eprintln!("\n=== ELIXIR CHUNKS ===");
3685        for (i, chunk) in chunks.iter().enumerate() {
3686            eprintln!(
3687                "Chunk {}: {:?} L{}-{}",
3688                i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
3689            );
3690            eprintln!("  Text: {:?}", &chunk.text[..chunk.text.len().min(80)]);
3691        }
3692        eprintln!("=== END CHUNKS ===\n");
3693
3694        assert!(!chunks.is_empty(), "Should find chunks in Elixir code");
3695
3696        // Should have module and function chunks
3697        let has_module = chunks.iter().any(|c| c.chunk_type == ChunkType::Module);
3698        let has_function = chunks.iter().any(|c| c.chunk_type == ChunkType::Function);
3699
3700        assert!(has_module, "Should detect defmodule as Module");
3701        assert!(has_function, "Should detect def/defp as Function");
3702    }
3703
3704    #[test]
3705    fn test_chunk_elixir_protocol() {
3706        let elixir_code = r#"
3707defprotocol Stringable do
3708  @doc "Converts to string"
3709  def to_string(value)
3710end
3711
3712defimpl Stringable, for: Integer do
3713  def to_string(value), do: Integer.to_string(value)
3714end
3715"#;
3716
3717        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
3718
3719        eprintln!("\n=== ELIXIR PROTOCOL CHUNKS ===");
3720        for (i, chunk) in chunks.iter().enumerate() {
3721            eprintln!(
3722                "Chunk {}: {:?} L{}-{}",
3723                i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
3724            );
3725            eprintln!("  Text: {:?}", &chunk.text[..chunk.text.len().min(80)]);
3726        }
3727        eprintln!("=== END CHUNKS ===\n");
3728
3729        // Should detect protocol and implementation as modules
3730        let modules: Vec<_> = chunks
3731            .iter()
3732            .filter(|c| c.chunk_type == ChunkType::Module)
3733            .collect();
3734
3735        assert!(
3736            modules.len() >= 2,
3737            "Should detect defprotocol and defimpl as modules, found {}",
3738            modules.len()
3739        );
3740    }
3741
3742    #[test]
3743    fn test_chunk_elixir_genserver() {
3744        let elixir_code = r"
3745defmodule MyServer do
3746  use GenServer
3747
3748  def start_link(opts) do
3749    GenServer.start_link(__MODULE__, opts, name: __MODULE__)
3750  end
3751
3752  def init(state) do
3753    {:ok, state}
3754  end
3755
3756  def handle_call(:get, _from, state) do
3757    {:reply, state, state}
3758  end
3759
3760  def handle_cast({:set, value}, _state) do
3761    {:noreply, value}
3762  end
3763end
3764";
3765
3766        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
3767
3768        // Should capture all GenServer callbacks as functions
3769        let functions: Vec<_> = chunks
3770            .iter()
3771            .filter(|c| c.chunk_type == ChunkType::Function)
3772            .collect();
3773
3774        assert!(
3775            functions.len() >= 4,
3776            "Should detect at least 4 functions (start_link, init, handle_call, handle_cast), found {}",
3777            functions.len()
3778        );
3779    }
3780
3781    #[test]
3782    fn test_elixir_extension_detection() {
3783        use ck_core::Language;
3784
3785        assert_eq!(Language::from_extension("ex"), Some(Language::Elixir));
3786        assert_eq!(Language::from_extension("exs"), Some(Language::Elixir));
3787        assert_eq!(Language::from_extension("EX"), Some(Language::Elixir));
3788        assert_eq!(Language::from_extension("EXS"), Some(Language::Elixir));
3789    }
3790
3791    #[test]
3792    fn test_chunk_elixir_macros() {
3793        let elixir_code = r"
3794defmodule MyMacros do
3795  defmacro unless(condition, do: block) do
3796    quote do
3797      if !unquote(condition), do: unquote(block)
3798    end
3799  end
3800
3801  defmacrop private_macro(x) do
3802    quote do: unquote(x) * 2
3803  end
3804end
3805";
3806
3807        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
3808
3809        let functions: Vec<_> = chunks
3810            .iter()
3811            .filter(|c| c.chunk_type == ChunkType::Function)
3812            .collect();
3813
3814        assert!(
3815            functions.len() >= 2,
3816            "Should detect defmacro and defmacrop as functions, found {}",
3817            functions.len()
3818        );
3819    }
3820
3821    #[test]
3822    fn test_chunk_elixir_module_attributes() {
3823        let elixir_code = r#"
3824defmodule Calculator do
3825  @moduledoc "A calculator with type specs"
3826
3827  @behaviour GenServer
3828
3829  @type operation :: :add | :subtract | :multiply | :divide
3830  @typep internal_state :: %{history: list()}
3831  @opaque result :: {:ok, number()} | {:error, atom()}
3832
3833  @callback init(args :: term()) :: {:ok, state :: term()}
3834  @callback handle_call(request :: term(), from :: term(), state :: term()) :: {:reply, term(), term()}
3835
3836  @optional_callbacks [handle_info: 2]
3837
3838  @spec add(number(), number()) :: number()
3839  def add(a, b), do: a + b
3840
3841  @spec subtract(number(), number()) :: number()
3842  def subtract(a, b), do: a - b
3843end
3844"#;
3845
3846        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
3847
3848        eprintln!("\n=== ELIXIR MODULE ATTRIBUTES CHUNKS ===");
3849        for (i, chunk) in chunks.iter().enumerate() {
3850            eprintln!(
3851                "Chunk {}: {:?} L{}-{}",
3852                i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
3853            );
3854            eprintln!("  Text: {:?}", &chunk.text[..chunk.text.len().min(80)]);
3855        }
3856        eprintln!("=== END CHUNKS ===\n");
3857
3858        // Check for @behaviour
3859        let has_behaviour = chunks
3860            .iter()
3861            .any(|c| c.chunk_type == ChunkType::Text && c.text.contains("@behaviour GenServer"));
3862        assert!(has_behaviour, "Should capture @behaviour declaration");
3863
3864        // Check for @type definitions
3865        let type_chunks: Vec<_> = chunks
3866            .iter()
3867            .filter(|c| {
3868                c.chunk_type == ChunkType::Text
3869                    && (c.text.contains("@type")
3870                        || c.text.contains("@typep")
3871                        || c.text.contains("@opaque"))
3872            })
3873            .collect();
3874        assert!(
3875            type_chunks.len() >= 3,
3876            "Should capture @type, @typep, and @opaque, found {}",
3877            type_chunks.len()
3878        );
3879
3880        // Check for @callback definitions
3881        let callback_chunks: Vec<_> = chunks
3882            .iter()
3883            .filter(|c| c.chunk_type == ChunkType::Text && c.text.contains("@callback"))
3884            .collect();
3885        assert!(
3886            callback_chunks.len() >= 2,
3887            "Should capture @callback definitions, found {}",
3888            callback_chunks.len()
3889        );
3890
3891        // Check for @spec definitions
3892        let spec_chunks: Vec<_> = chunks
3893            .iter()
3894            .filter(|c| c.chunk_type == ChunkType::Text && c.text.contains("@spec"))
3895            .collect();
3896        assert!(
3897            spec_chunks.len() >= 2,
3898            "Should capture @spec definitions, found {}",
3899            spec_chunks.len()
3900        );
3901
3902        // Verify we still capture the functions
3903        let function_chunks: Vec<_> = chunks
3904            .iter()
3905            .filter(|c| c.chunk_type == ChunkType::Function)
3906            .collect();
3907        assert!(
3908            function_chunks.len() >= 2,
3909            "Should still capture def functions, found {}",
3910            function_chunks.len()
3911        );
3912    }
3913
3914    #[test]
3915    fn test_chunk_elixir_behavior_spelling() {
3916        // Test both British and American spellings
3917        let elixir_code = r"
3918defmodule BritishModule do
3919  @behaviour GenServer
3920end
3921
3922defmodule AmericanModule do
3923  @behavior GenServer
3924end
3925";
3926
3927        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
3928
3929        let behaviour_chunks: Vec<_> = chunks
3930            .iter()
3931            .filter(|c| {
3932                c.chunk_type == ChunkType::Text
3933                    && (c.text.contains("@behaviour") || c.text.contains("@behavior"))
3934            })
3935            .collect();
3936
3937        assert!(
3938            behaviour_chunks.len() >= 2,
3939            "Should capture both @behaviour and @behavior spellings, found {}",
3940            behaviour_chunks.len()
3941        );
3942    }
3943}