1use anyhow::Result;
2use ck_core::Span;
3use serde::{Deserialize, Serialize};
4
5mod query_chunker;
6
7pub use ck_embed::TokenEstimator;
9
/// Estimate the token count of `text` via the shared `TokenEstimator`
/// from `ck_embed`, so chunk sizing and striding agree with the
/// embedding pipeline's token accounting.
fn estimate_tokens(text: &str) -> usize {
    TokenEstimator::estimate_tokens(text)
}
14
/// Return `(target_tokens, overlap_tokens)` chunking parameters for an
/// embedding model. Unspecified or unknown models fall back to the
/// large-context defaults used by the nomic / jina family.
pub fn get_model_chunk_config(model_name: Option<&str>) -> (usize, usize) {
    // Small-context encoder models get tighter windows.
    const SMALL: (usize, usize) = (400, 80);
    // Large-context models (nomic v1/v1.5, jina code) and the default.
    const LARGE: (usize, usize) = (1024, 200);

    match model_name.unwrap_or("nomic-embed-text-v1.5") {
        "BAAI/bge-small-en-v1.5"
        | "sentence-transformers/all-MiniLM-L6-v2"
        | "BAAI/bge-base-en-v1.5"
        | "BAAI/bge-large-en-v1.5" => SMALL,
        _ => LARGE,
    }
}
41
/// Provenance for a chunk produced by splitting an oversized chunk into
/// overlapping strides (see `apply_striding`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StrideInfo {
    /// Identifier of the chunk this stride was cut from.
    pub original_chunk_id: String,
    /// Zero-based position of this stride within the original chunk.
    pub stride_index: usize,
    /// Total number of strides the original chunk was split into.
    pub total_strides: usize,
    // NOTE(review): presumably the offsets of the region shared with a
    // neighboring stride — confirm against `stride_large_chunk`, which
    // is not fully visible here.
    pub overlap_start: usize,
    pub overlap_end: usize,
}
56
/// Derived, serializable details about a chunk's text and its syntactic
/// context within the source file.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ChunkMetadata {
    /// Names of enclosing scopes, outermost first (see `collect_ancestry`).
    pub ancestry: Vec<String>,
    /// `ancestry` joined with `"::"`; `None` when the ancestry is empty.
    pub breadcrumb: Option<String>,
    /// Comment/decorator/attribute text attached just before the chunk.
    pub leading_trivia: Vec<String>,
    /// Comment text attached just after the chunk.
    pub trailing_trivia: Vec<String>,
    /// Length of the chunk text in bytes.
    pub byte_length: usize,
    /// Token estimate for the chunk text (see `estimate_tokens`).
    pub estimated_tokens: usize,
}
66
67impl ChunkMetadata {
68 fn from_context(
69 text: &str,
70 ancestry: Vec<String>,
71 leading_trivia: Vec<String>,
72 trailing_trivia: Vec<String>,
73 ) -> Self {
74 let breadcrumb = if ancestry.is_empty() {
75 None
76 } else {
77 Some(ancestry.join("::"))
78 };
79
80 Self {
81 ancestry,
82 breadcrumb,
83 leading_trivia,
84 trailing_trivia,
85 byte_length: text.len(),
86 estimated_tokens: estimate_tokens(text),
87 }
88 }
89
90 fn from_text(text: &str) -> Self {
91 Self {
92 ancestry: Vec::new(),
93 breadcrumb: None,
94 leading_trivia: Vec::new(),
95 trailing_trivia: Vec::new(),
96 byte_length: text.len(),
97 estimated_tokens: estimate_tokens(text),
98 }
99 }
100
101 fn with_updated_text(&self, text: &str) -> Self {
102 let mut cloned = self.clone();
103 cloned.byte_length = text.len();
104 cloned.estimated_tokens = estimate_tokens(text);
105 cloned
106 }
107}
108
/// A contiguous region of source text produced by chunking, with its
/// byte/line span and derived metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// Byte and line range of the chunk within the original text.
    pub span: Span,
    /// The chunk's text content.
    pub text: String,
    /// Syntactic classification of the chunk.
    pub chunk_type: ChunkType,
    /// Present only when this chunk is one stride of a larger chunk.
    pub stride_info: Option<StrideInfo>,
    /// Context and size details for the chunk.
    pub metadata: ChunkMetadata,
}
118
/// Coarse syntactic category assigned to a chunk
/// (see `classify_chunk_kind`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkType {
    /// Plain text, or anything not recognized as a code construct.
    Text,
    Function,
    Class,
    Method,
    /// Module-level constructs (modules, impls, type aliases, globals).
    Module,
}
127
/// Languages with a tree-sitter grammar available for semantic
/// chunking. Conversion from `ck_core::Language` is fallible
/// (`TryFrom`) because not every language ck knows about has a parser
/// wired up here.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ParseableLanguage {
    Python,
    TypeScript,
    JavaScript,
    Haskell,
    Rust,
    Ruby,
    Go,
    CSharp,
    Zig,
    Dart,
    Elixir,
}
144
145impl std::fmt::Display for ParseableLanguage {
146 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
147 let name = match self {
148 ParseableLanguage::Python => "python",
149 ParseableLanguage::TypeScript => "typescript",
150 ParseableLanguage::JavaScript => "javascript",
151 ParseableLanguage::Haskell => "haskell",
152 ParseableLanguage::Rust => "rust",
153 ParseableLanguage::Ruby => "ruby",
154 ParseableLanguage::Go => "go",
155 ParseableLanguage::CSharp => "csharp",
156 ParseableLanguage::Zig => "zig",
157
158 ParseableLanguage::Dart => "dart",
159
160 ParseableLanguage::Elixir => "elixir",
161 };
162 write!(f, "{}", name)
163 }
164}
165
166impl TryFrom<ck_core::Language> for ParseableLanguage {
167 type Error = anyhow::Error;
168
169 fn try_from(lang: ck_core::Language) -> Result<Self, Self::Error> {
170 match lang {
171 ck_core::Language::Python => Ok(ParseableLanguage::Python),
172 ck_core::Language::TypeScript => Ok(ParseableLanguage::TypeScript),
173 ck_core::Language::JavaScript => Ok(ParseableLanguage::JavaScript),
174 ck_core::Language::Haskell => Ok(ParseableLanguage::Haskell),
175 ck_core::Language::Rust => Ok(ParseableLanguage::Rust),
176 ck_core::Language::Ruby => Ok(ParseableLanguage::Ruby),
177 ck_core::Language::Go => Ok(ParseableLanguage::Go),
178 ck_core::Language::CSharp => Ok(ParseableLanguage::CSharp),
179 ck_core::Language::Zig => Ok(ParseableLanguage::Zig),
180
181 ck_core::Language::Dart => Ok(ParseableLanguage::Dart),
182
183 ck_core::Language::Elixir => Ok(ParseableLanguage::Elixir),
184
185 _ => Err(anyhow::anyhow!(
186 "Language {:?} is not supported for parsing",
187 lang
188 )),
189 }
190 }
191}
192
193pub fn chunk_text(text: &str, language: Option<ck_core::Language>) -> Result<Vec<Chunk>> {
194 chunk_text_with_config(text, language, &ChunkConfig::default())
195}
196
/// Tunable limits for the chunking pipeline.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Maximum estimated tokens allowed in a single chunk.
    pub max_tokens: usize,
    /// Token overlap between consecutive strides of a split chunk.
    pub stride_overlap: usize,
    /// When true, oversized chunks are split into overlapping strides.
    pub enable_striding: bool,
}
207
impl Default for ChunkConfig {
    fn default() -> Self {
        Self {
            // 8192-token ceiling with a 1024-token overlap; striding on.
            max_tokens: 8192,
            stride_overlap: 1024,
            enable_striding: true,
        }
    }
}
217
218pub fn chunk_text_with_model(
220 text: &str,
221 language: Option<ck_core::Language>,
222 model_name: Option<&str>,
223) -> Result<Vec<Chunk>> {
224 let (target_tokens, overlap_tokens) = get_model_chunk_config(model_name);
225
226 let config = ChunkConfig {
228 max_tokens: target_tokens,
229 stride_overlap: overlap_tokens,
230 enable_striding: true,
231 };
232
233 chunk_text_with_config_and_model(text, language, &config, model_name)
234}
235
/// Chunk `text` with an explicit `ChunkConfig`, using the
/// model-agnostic defaults for any generic (non-tree-sitter) fallback.
pub fn chunk_text_with_config(
    text: &str,
    language: Option<ck_core::Language>,
    config: &ChunkConfig,
) -> Result<Vec<Chunk>> {
    chunk_text_with_config_and_model(text, language, config, None)
}
243
244fn chunk_text_with_config_and_model(
245 text: &str,
246 language: Option<ck_core::Language>,
247 config: &ChunkConfig,
248 model_name: Option<&str>,
249) -> Result<Vec<Chunk>> {
250 tracing::debug!(
251 "Chunking text with language: {:?}, length: {} chars, config: {:?}",
252 language,
253 text.len(),
254 config
255 );
256
257 let result = match language.map(ParseableLanguage::try_from) {
258 Some(Ok(lang)) => {
259 tracing::debug!("Using {} tree-sitter parser", lang);
260 chunk_language_with_model(text, lang, model_name)
261 }
262 Some(Err(_)) => {
263 tracing::debug!("Language not supported for parsing, using generic chunking strategy");
264 chunk_generic_with_token_config(text, model_name)
265 }
266 None => {
267 tracing::debug!("Using generic chunking strategy");
268 chunk_generic_with_token_config(text, model_name)
269 }
270 };
271
272 let mut chunks = result?;
273
274 if config.enable_striding {
276 chunks = apply_striding(chunks, config)?;
277 }
278
279 tracing::debug!("Successfully created {} final chunks", chunks.len());
280 Ok(chunks)
281}
282
/// Generic (non-tree-sitter) chunking with default token settings.
fn chunk_generic(text: &str) -> Result<Vec<Chunk>> {
    chunk_generic_with_token_config(text, None)
}
286
287fn chunk_generic_with_token_config(text: &str, model_name: Option<&str>) -> Result<Vec<Chunk>> {
288 let mut chunks = Vec::new();
289 let lines: Vec<&str> = text.lines().collect();
290
291 let (target_tokens, overlap_tokens) = get_model_chunk_config(model_name);
293
294 let avg_tokens_per_line = 10.0; let target_lines = ((target_tokens as f32) / avg_tokens_per_line) as usize;
298 let overlap_lines = ((overlap_tokens as f32) / avg_tokens_per_line) as usize;
299
300 let chunk_size = target_lines.max(5); let overlap = overlap_lines.max(1); let mut line_byte_offsets = Vec::with_capacity(lines.len() + 1);
305 line_byte_offsets.push(0);
306 let mut cumulative_offset = 0;
307 let mut byte_pos = 0;
308
309 for line in lines.iter() {
310 cumulative_offset += line.len();
311
312 let line_end_pos = byte_pos + line.len();
314 let newline_len = if line_end_pos < text.len() && text.as_bytes()[line_end_pos] == b'\r' {
315 if line_end_pos + 1 < text.len() && text.as_bytes()[line_end_pos + 1] == b'\n' {
316 2 } else {
318 1 }
320 } else if line_end_pos < text.len() && text.as_bytes()[line_end_pos] == b'\n' {
321 1 } else {
323 0 };
325
326 cumulative_offset += newline_len;
327 byte_pos = cumulative_offset;
328 line_byte_offsets.push(cumulative_offset);
329 }
330
331 let mut i = 0;
332 while i < lines.len() {
333 let end = (i + chunk_size).min(lines.len());
334 let chunk_lines = &lines[i..end];
335 let chunk_text = chunk_lines.join("\n");
336 let byte_start = line_byte_offsets[i];
337 let byte_end = line_byte_offsets[end];
338 let metadata = ChunkMetadata::from_text(&chunk_text);
339
340 chunks.push(Chunk {
341 span: Span {
342 byte_start,
343 byte_end,
344 line_start: i + 1,
345 line_end: end,
346 },
347 text: chunk_text,
348 chunk_type: ChunkType::Text,
349 stride_info: None,
350 metadata,
351 });
352
353 i += chunk_size - overlap;
354 if i >= lines.len() {
355 break;
356 }
357 }
358
359 Ok(chunks)
360}
361
362pub(crate) fn tree_sitter_language(language: ParseableLanguage) -> Result<tree_sitter::Language> {
363 if language == ParseableLanguage::Dart {
366 return Ok(tree_sitter_dart::language());
367 }
368
369 let ts_language = match language {
370 ParseableLanguage::Python => tree_sitter_python::LANGUAGE,
371 ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
372 tree_sitter_typescript::LANGUAGE_TYPESCRIPT
373 }
374 ParseableLanguage::Haskell => tree_sitter_haskell::LANGUAGE,
375 ParseableLanguage::Rust => tree_sitter_rust::LANGUAGE,
376 ParseableLanguage::Ruby => tree_sitter_ruby::LANGUAGE,
377 ParseableLanguage::Go => tree_sitter_go::LANGUAGE,
378 ParseableLanguage::CSharp => tree_sitter_c_sharp::LANGUAGE,
379 ParseableLanguage::Zig => tree_sitter_zig::LANGUAGE,
380
381 ParseableLanguage::Dart => unreachable!("Handled above via early return"),
382
383 ParseableLanguage::Elixir => tree_sitter_elixir::LANGUAGE,
384 };
385
386 Ok(ts_language.into())
387}
388
389fn chunk_language(text: &str, language: ParseableLanguage) -> Result<Vec<Chunk>> {
390 let mut parser = tree_sitter::Parser::new();
391 let ts_language = tree_sitter_language(language)?;
392 parser.set_language(&ts_language)?;
393
394 let tree = parser
395 .parse(text, None)
396 .ok_or_else(|| anyhow::anyhow!("Failed to parse {} code", language))?;
397
398 let mut chunks = match query_chunker::chunk_with_queries(language, ts_language, &tree, text)? {
399 Some(query_chunks) if !query_chunks.is_empty() => query_chunks,
400 _ => {
401 let mut legacy_chunks = Vec::new();
402 let mut cursor = tree.walk();
403 extract_code_chunks(&mut cursor, text, &mut legacy_chunks, language);
404 legacy_chunks
405 }
406 };
407
408 if chunks.is_empty() {
409 return chunk_generic(text);
410 }
411
412 if language == ParseableLanguage::Haskell {
414 chunks = merge_haskell_functions(chunks, text);
415 }
416
417 chunks = fill_gaps(chunks, text);
419
420 Ok(chunks)
421}
422
423fn fill_gaps(mut chunks: Vec<Chunk>, text: &str) -> Vec<Chunk> {
427 if chunks.is_empty() {
428 return chunks;
429 }
430
431 chunks.sort_by_key(|c| c.span.byte_start);
433
434 let mut result = Vec::new();
435 let mut last_end = 0;
436
437 let mut gaps = Vec::new();
439
440 for chunk in &chunks {
441 if last_end < chunk.span.byte_start {
442 let gap_start = last_end;
444 let gap_text = &text[gap_start..chunk.span.byte_start];
445
446 let mut current_byte = gap_start;
448 let mut segment_start = gap_start;
449
450 for line in gap_text.split('\n') {
451 let line_start_in_gap = current_byte - gap_start;
452 let _line_end_in_gap = line_start_in_gap + line.len();
453
454 if line.trim().is_empty() {
455 if segment_start < current_byte {
457 let segment_text = &text[segment_start..current_byte];
458 if !segment_text.trim().is_empty() {
459 gaps.push((segment_start, current_byte));
460 }
461 }
462 segment_start = current_byte + line.len() + 1;
464 }
465
466 current_byte += line.len() + 1; }
468
469 if segment_start < chunk.span.byte_start {
471 let remaining = &text[segment_start..chunk.span.byte_start];
472 if !remaining.trim().is_empty() {
473 gaps.push((segment_start, chunk.span.byte_start));
474 }
475 }
476 }
477 last_end = chunk.span.byte_end;
478 }
479
480 if last_end < text.len() {
482 let gap_text = &text[last_end..];
483 if !gap_text.trim().is_empty() {
484 gaps.push((last_end, text.len()));
485 }
486 }
487
488 let combined_gaps = gaps;
489
490 let mut gap_idx = 0;
492
493 for chunk in chunks {
494 while gap_idx < combined_gaps.len() && combined_gaps[gap_idx].1 <= chunk.span.byte_start {
496 let (gap_start, gap_end) = combined_gaps[gap_idx];
497 let gap_text = &text[gap_start..gap_end];
498
499 let line_start = text[..gap_start].matches('\n').count() + 1;
501 let newlines_up_to_end = text[..gap_end].matches('\n').count();
504 let line_end = if newlines_up_to_end >= line_start - 1 {
505 newlines_up_to_end.max(line_start)
506 } else {
507 line_start
508 };
509
510 let gap_chunk = Chunk {
511 text: gap_text.to_string(),
512 span: Span {
513 byte_start: gap_start,
514 byte_end: gap_end,
515 line_start,
516 line_end,
517 },
518 chunk_type: ChunkType::Text,
519 metadata: ChunkMetadata::from_text(gap_text),
520 stride_info: None,
521 };
522 result.push(gap_chunk);
523 gap_idx += 1;
524 }
525
526 result.push(chunk.clone());
527 }
528
529 while gap_idx < combined_gaps.len() {
531 let (gap_start, gap_end) = combined_gaps[gap_idx];
532 let gap_text = &text[gap_start..gap_end];
533
534 let line_start = text[..gap_start].matches('\n').count() + 1;
536 let newlines_up_to_end = text[..gap_end].matches('\n').count();
538 let line_end = if newlines_up_to_end >= line_start - 1 {
539 newlines_up_to_end.max(line_start)
540 } else {
541 line_start
542 };
543
544 let gap_chunk = Chunk {
545 text: gap_text.to_string(),
546 span: Span {
547 byte_start: gap_start,
548 byte_end: gap_end,
549 line_start,
550 line_end,
551 },
552 chunk_type: ChunkType::Text,
553 metadata: ChunkMetadata::from_text(gap_text),
554 stride_info: None,
555 };
556 result.push(gap_chunk);
557 gap_idx += 1;
558 }
559
560 result
561}
562
563fn merge_haskell_functions(chunks: Vec<Chunk>, source: &str) -> Vec<Chunk> {
565 if chunks.is_empty() {
566 return chunks;
567 }
568
569 let mut merged = Vec::new();
570 let mut i = 0;
571
572 while i < chunks.len() {
573 let chunk = &chunks[i];
574
575 let trimmed = chunk.text.trim();
577 if trimmed.is_empty()
578 || trimmed.starts_with("--")
579 || trimmed.starts_with("{-")
580 || !chunk.text.contains(|c: char| c.is_alphanumeric())
581 {
582 i += 1;
583 continue;
584 }
585
586 let is_signature = chunk.text.contains("::");
589 let function_name = if is_signature {
590 chunk
592 .text
593 .split("::")
594 .next()
595 .and_then(|s| s.split_whitespace().next())
596 .map(|s| s.to_string())
597 } else {
598 extract_haskell_function_name(&chunk.text)
599 };
600
601 if function_name.is_none() {
602 merged.push(chunk.clone());
604 i += 1;
605 continue;
606 }
607
608 let name = function_name.unwrap();
609 let group_start = chunk.span.byte_start;
610 let mut group_end = chunk.span.byte_end;
611 let line_start = chunk.span.line_start;
612 let mut line_end = chunk.span.line_end;
613 let mut trailing_trivia = chunk.metadata.trailing_trivia.clone();
614
615 let mut j = i + 1;
617 while j < chunks.len() {
618 let next_chunk = &chunks[j];
619
620 let next_trimmed = next_chunk.text.trim();
622 if next_trimmed.starts_with("--") || next_trimmed.starts_with("{-") {
623 j += 1;
624 continue;
625 }
626
627 let next_is_signature = next_chunk.text.contains("::");
628 let next_name = if next_is_signature {
629 next_chunk
630 .text
631 .split("::")
632 .next()
633 .and_then(|s| s.split_whitespace().next())
634 .map(|s| s.to_string())
635 } else {
636 extract_haskell_function_name(&next_chunk.text)
637 };
638
639 if next_name == Some(name.clone()) {
640 group_end = next_chunk.span.byte_end;
642 line_end = next_chunk.span.line_end;
643 trailing_trivia = next_chunk.metadata.trailing_trivia.clone();
644 j += 1;
645 } else {
646 break;
647 }
648 }
649
650 let merged_text = source.get(group_start..group_end).unwrap_or("").to_string();
652 let mut metadata = chunk.metadata.with_updated_text(&merged_text);
653 metadata.trailing_trivia = trailing_trivia;
654
655 merged.push(Chunk {
656 span: Span {
657 byte_start: group_start,
658 byte_end: group_end,
659 line_start,
660 line_end,
661 },
662 text: merged_text,
663 chunk_type: ChunkType::Function,
664 stride_info: None,
665 metadata,
666 });
667
668 i = j; }
670
671 merged
672}
673
/// Pull a plausible Haskell function name from the start of a chunk:
/// the first whitespace-delimited word, with trailing punctuation
/// stripped (keeping `_` and `'`), accepted only when it starts like a
/// value-level identifier (lowercase letter or underscore).
fn extract_haskell_function_name(text: &str) -> Option<String> {
    let first_word = text
        .trim()
        .split_whitespace()
        .next()?
        .trim_end_matches(|c: char| !c.is_alphanumeric() && c != '_' && c != '\'');

    match first_word.chars().next() {
        Some(c) if c.is_lowercase() || c == '_' => Some(first_word.to_string()),
        _ => None,
    }
}
698
/// Language-aware chunking entry point that accepts a model name for
/// signature symmetry with the generic path; the model is currently
/// unused here because tree-sitter chunk boundaries are structural.
fn chunk_language_with_model(
    text: &str,
    language: ParseableLanguage,
    _model_name: Option<&str>,
) -> Result<Vec<Chunk>> {
    chunk_language(text, language)
}
709
710fn extract_code_chunks(
711 cursor: &mut tree_sitter::TreeCursor,
712 source: &str,
713 chunks: &mut Vec<Chunk>,
714 language: ParseableLanguage,
715) {
716 let node = cursor.node();
717
718 let should_skip = if language == ParseableLanguage::Haskell && node.kind() == "function" {
721 let mut current = node.parent();
723 while let Some(parent) = current {
724 if parent.kind() == "signature" {
725 return; }
727 current = parent.parent();
728 }
729 false
730 } else {
731 false
732 };
733
734 if !should_skip
735 && let Some(initial_chunk_type) = chunk_type_for_node(language, &node)
736 && let Some(chunk) = build_chunk(node, source, initial_chunk_type, language)
737 {
738 let is_duplicate = chunks.iter().any(|existing| {
739 existing.span.byte_start == chunk.span.byte_start
740 && existing.span.byte_end == chunk.span.byte_end
741 });
742
743 if !is_duplicate {
744 chunks.push(chunk);
745 }
746 }
747
748 let should_recurse = !(language == ParseableLanguage::Haskell && node.kind() == "signature");
750
751 if should_recurse && cursor.goto_first_child() {
752 loop {
753 extract_code_chunks(cursor, source, chunks, language);
754 if !cursor.goto_next_sibling() {
755 break;
756 }
757 }
758 cursor.goto_parent();
759 }
760}
761
/// Decide whether `node` should become a chunk for `language`, and if
/// so with which preliminary `ChunkType` (later refined by
/// `adjust_chunk_type_for_context`). Returns `None` for node kinds
/// that are not chunk boundaries.
fn chunk_type_for_node(
    language: ParseableLanguage,
    node: &tree_sitter::Node<'_>,
) -> Option<ChunkType> {
    let kind = node.kind();

    // Per-language allow-list of chunk-worthy node kinds.
    let supported = match language {
        ParseableLanguage::Python => matches!(kind, "function_definition" | "class_definition"),
        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => matches!(
            kind,
            "function_declaration" | "class_declaration" | "method_definition" | "arrow_function"
        ),
        ParseableLanguage::Haskell => matches!(
            kind,
            "function"
                | "signature"
                | "data_type"
                | "newtype"
                | "type_synonym"
                | "type_family"
                | "class"
                | "instance"
        ),
        ParseableLanguage::Rust => matches!(
            kind,
            "function_item" | "impl_item" | "struct_item" | "enum_item" | "trait_item" | "mod_item"
        ),
        ParseableLanguage::Ruby => {
            matches!(kind, "method" | "class" | "module" | "singleton_method")
        }
        ParseableLanguage::Go => matches!(
            kind,
            "function_declaration"
                | "method_declaration"
                | "type_declaration"
                | "var_declaration"
                | "const_declaration"
        ),
        ParseableLanguage::CSharp => matches!(
            kind,
            "method_declaration"
                | "class_declaration"
                | "interface_declaration"
                | "variable_declaration"
        ),
        ParseableLanguage::Dart => matches!(
            kind,
            "class_definition"
                | "class_declaration"
                | "mixin_declaration"
                | "enum_declaration"
                | "function_declaration"
                | "method_declaration"
                | "constructor_declaration"
                | "variable_declaration"
                | "local_variable_declaration"
                | "lambda_expression"
                | "class_member_definition"
        ),
        ParseableLanguage::Zig => matches!(
            kind,
            "function_declaration"
                | "test_declaration"
                | "variable_declaration"
                | "struct_declaration"
                | "enum_declaration"
                | "union_declaration"
                | "opaque_declaration"
                | "error_set_declaration"
                | "comptime_declaration"
        ),
        ParseableLanguage::Elixir => matches!(kind, "call" | "do_block"),
    };

    if !supported {
        return None;
    }

    // Language-specific exclusions for otherwise-supported kinds.
    match language {
        // Go: var/const declarations local to a block are not chunks;
        // only package-level declarations are.
        ParseableLanguage::Go
            if matches!(node.kind(), "var_declaration" | "const_declaration")
                && node.parent().is_some_and(|p| p.kind() == "block") =>
        {
            return None;
        }
        // C#: only field-like variable declarations become chunks.
        ParseableLanguage::CSharp if node.kind() == "variable_declaration" => {
            if !is_csharp_field_like(*node) {
                return None;
            }
        }
        _ => {}
    }

    Some(classify_chunk_kind(kind))
}
859
860fn classify_chunk_kind(kind: &str) -> ChunkType {
861 match kind {
862 "function_definition"
863 | "function_declaration"
864 | "arrow_function"
865 | "function"
866 | "function_item"
867 | "def"
868 | "defp"
869 | "defn"
870 | "defn-"
871 | "method"
872 | "singleton_method" => ChunkType::Function,
873 "signature" => ChunkType::Function, "class_definition"
875 | "class_declaration"
876 | "instance_declaration"
877 | "class"
878 | "instance"
879 | "struct_item"
880 | "enum_item"
881 | "defstruct"
882 | "defrecord"
883 | "deftype"
884 | "type_declaration"
885 | "struct_declaration"
886 | "enum_declaration"
887 | "union_declaration"
888 | "opaque_declaration"
889 | "error_set_declaration" => ChunkType::Class,
890 "method_definition" | "method_declaration" | "defmacro" => ChunkType::Method,
891 "data_type"
892 | "newtype"
893 | "type_synonym"
894 | "type_family"
895 | "impl_item"
896 | "trait_item"
897 | "mod_item"
898 | "defmodule"
899 | "module"
900 | "defprotocol"
901 | "interface_declaration"
902 | "ns"
903 | "var_declaration"
904 | "const_declaration"
905 | "variable_declaration"
906 | "test_declaration"
907 | "comptime_declaration" => ChunkType::Module,
908 _ => ChunkType::Text,
909 }
910}
911
912pub(crate) fn build_chunk(
913 node: tree_sitter::Node<'_>,
914 source: &str,
915 initial_type: ChunkType,
916 language: ParseableLanguage,
917) -> Option<Chunk> {
918 let target_node = adjust_node_for_language(node, language);
919 let (byte_start, start_row, leading_segments) =
920 extend_with_leading_trivia(target_node, language, source);
921 let trailing_segments = collect_trailing_trivia(target_node, language, source);
922
923 let byte_end = target_node.end_byte();
924 let end_pos = target_node.end_position();
925
926 if byte_start >= byte_end || byte_end > source.len() {
927 return None;
928 }
929
930 let text = source.get(byte_start..byte_end)?.to_string();
931
932 if text.trim().is_empty() {
933 return None;
934 }
935
936 let chunk_type = adjust_chunk_type_for_context(target_node, initial_type, language);
937 let ancestry = collect_ancestry(target_node, language, source);
938 let leading_trivia = segments_to_strings(&leading_segments, source);
939 let trailing_trivia = segments_to_strings(&trailing_segments, source);
940 let metadata = ChunkMetadata::from_context(&text, ancestry, leading_trivia, trailing_trivia);
941
942 Some(Chunk {
943 span: Span {
944 byte_start,
945 byte_end,
946 line_start: start_row + 1,
947 line_end: end_pos.row + 1,
948 },
949 text,
950 chunk_type,
951 stride_info: None,
952 metadata,
953 })
954}
955
956fn adjust_node_for_language(
957 node: tree_sitter::Node<'_>,
958 language: ParseableLanguage,
959) -> tree_sitter::Node<'_> {
960 match language {
961 ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
962 if node.kind() == "arrow_function" {
963 return expand_arrow_function_context(node);
964 }
965 node
966 }
967 _ => node,
968 }
969}
970
971fn expand_arrow_function_context(mut node: tree_sitter::Node<'_>) -> tree_sitter::Node<'_> {
972 const PARENTS: &[&str] = &[
973 "parenthesized_expression",
974 "variable_declarator",
975 "variable_declaration",
976 "lexical_declaration",
977 "assignment_expression",
978 "expression_statement",
979 "public_field_definition",
980 "export_statement",
981 ];
982
983 while let Some(parent) = node.parent() {
984 let kind = parent.kind();
985 if PARENTS.contains(&kind) {
986 node = parent;
987 continue;
988 }
989 break;
990 }
991
992 node
993}
994
/// Byte range of one trivia item (comment, decorator, attribute)
/// adjacent to a chunked node.
#[derive(Clone, Copy)]
struct TriviaSegment {
    start_byte: usize,
    end_byte: usize,
}
1000
1001fn extend_with_leading_trivia(
1002 node: tree_sitter::Node<'_>,
1003 language: ParseableLanguage,
1004 source: &str,
1005) -> (usize, usize, Vec<TriviaSegment>) {
1006 let mut start_byte = node.start_byte();
1007 let mut start_row = node.start_position().row;
1008 let mut current = node;
1009 let mut segments = Vec::new();
1010
1011 while let Some(prev) = current.prev_sibling() {
1012 if should_attach_leading_trivia(language, &prev)
1013 && only_whitespace_between(source, prev.end_byte(), start_byte)
1014 {
1015 start_byte = prev.start_byte();
1016 start_row = prev.start_position().row;
1017 segments.push(TriviaSegment {
1018 start_byte: prev.start_byte(),
1019 end_byte: prev.end_byte(),
1020 });
1021 current = prev;
1022 continue;
1023 }
1024 break;
1025 }
1026
1027 segments.reverse();
1028 (start_byte, start_row, segments)
1029}
1030
1031fn should_attach_leading_trivia(language: ParseableLanguage, node: &tree_sitter::Node<'_>) -> bool {
1032 let kind = node.kind();
1033 if kind == "comment" {
1034 return true;
1035 }
1036
1037 match language {
1038 ParseableLanguage::Rust => {
1039 matches!(kind, "line_comment" | "block_comment" | "attribute_item")
1040 }
1041 ParseableLanguage::Python => kind == "decorator",
1042 ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => kind == "decorator",
1043 ParseableLanguage::CSharp => matches!(kind, "attribute_list" | "attribute"),
1044 _ => false,
1045 }
1046}
1047
1048fn collect_trailing_trivia(
1049 node: tree_sitter::Node<'_>,
1050 language: ParseableLanguage,
1051 source: &str,
1052) -> Vec<TriviaSegment> {
1053 let mut segments = Vec::new();
1054 let mut current = node;
1055 let mut previous_end = node.end_byte();
1056
1057 while let Some(next) = current.next_sibling() {
1058 if should_attach_trailing_trivia(language, &next)
1059 && only_whitespace_between(source, previous_end, next.start_byte())
1060 {
1061 segments.push(TriviaSegment {
1062 start_byte: next.start_byte(),
1063 end_byte: next.end_byte(),
1064 });
1065 previous_end = next.end_byte();
1066 current = next;
1067 continue;
1068 }
1069 break;
1070 }
1071
1072 segments
1073}
1074
/// Trailing trivia is limited to plain `comment` nodes for every
/// language (no language-specific cases yet; the parameter is kept for
/// symmetry with `should_attach_leading_trivia`).
fn should_attach_trailing_trivia(
    _language: ParseableLanguage,
    node: &tree_sitter::Node<'_>,
) -> bool {
    node.kind() == "comment"
}
1081
1082fn segments_to_strings(segments: &[TriviaSegment], source: &str) -> Vec<String> {
1083 let mut result = Vec::new();
1084
1085 for segment in segments {
1086 if let Some(text) = source
1087 .get(segment.start_byte..segment.end_byte)
1088 .map(|s| s.to_string())
1089 {
1090 result.push(text);
1091 }
1092 }
1093
1094 result
1095}
1096
1097fn collect_ancestry(
1098 mut node: tree_sitter::Node<'_>,
1099 language: ParseableLanguage,
1100 source: &str,
1101) -> Vec<String> {
1102 let mut parts = Vec::new();
1103
1104 while let Some(parent) = node.parent() {
1105 if let Some(parent_chunk_type) = chunk_type_for_node(language, &parent)
1106 && let Some(name) = display_name_for_node(parent, language, source, parent_chunk_type)
1107 {
1108 parts.push(name);
1109 }
1110 node = parent;
1111 }
1112
1113 parts.reverse();
1114 parts
1115}
1116
/// Best-effort human-readable name for `node`: prefer the grammar's
/// `name` field, then fall back to language-specific identifier
/// searches among the node's children.
fn display_name_for_node(
    node: tree_sitter::Node<'_>,
    language: ParseableLanguage,
    source: &str,
    chunk_type: ChunkType,
) -> Option<String> {
    if let Some(name_node) = node.child_by_field_name("name") {
        return text_for_node(name_node, source);
    }

    match language {
        ParseableLanguage::Rust => rust_display_name(node, source, chunk_type),
        ParseableLanguage::Python => find_identifier(node, source, &["identifier"]),
        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => find_identifier(
            node,
            source,
            &["identifier", "type_identifier", "property_identifier"],
        ),
        ParseableLanguage::Haskell => {
            // Haskell nodes often lack identifier children; fall back
            // to the first word of the node's own text.
            find_identifier(node, source, &["identifier", "type_identifier", "variable"])
                .or_else(|| first_word_of_node(node, source))
        }
        ParseableLanguage::Ruby => find_identifier(node, source, &["identifier"]),
        ParseableLanguage::Go => find_identifier(node, source, &["identifier", "type_identifier"]),
        ParseableLanguage::CSharp => find_identifier(node, source, &["identifier"]),
        ParseableLanguage::Zig => find_identifier(node, source, &["identifier"]),
        ParseableLanguage::Dart => {
            find_identifier(node, source, &["identifier", "type_identifier"])
        }
        ParseableLanguage::Elixir => {
            find_identifier(node, source, &["alias", "identifier", "atom"])
        }
    }
}
1153
1154fn rust_display_name(
1155 node: tree_sitter::Node<'_>,
1156 source: &str,
1157 chunk_type: ChunkType,
1158) -> Option<String> {
1159 match node.kind() {
1160 "impl_item" => {
1161 let mut parts = Vec::new();
1162 if let Some(ty) = node.child_by_field_name("type")
1163 && let Some(text) = text_for_node(ty, source)
1164 {
1165 parts.push(text);
1166 }
1167 if let Some(trait_node) = node.child_by_field_name("trait")
1168 && let Some(text) = text_for_node(trait_node, source)
1169 {
1170 if let Some(last) = parts.first() {
1171 parts[0] = format!("{} (impl {})", last, text.trim());
1172 } else {
1173 parts.push(format!("impl {}", text.trim()));
1174 }
1175 }
1176 if parts.is_empty() {
1177 find_identifier(node, source, &["identifier"])
1178 } else {
1179 Some(parts.remove(0))
1180 }
1181 }
1182 "mod_item" if chunk_type == ChunkType::Module => {
1183 find_identifier(node, source, &["identifier"])
1184 }
1185 _ => find_identifier(node, source, &["identifier", "type_identifier"]),
1186 }
1187}
1188
1189fn find_identifier(
1190 node: tree_sitter::Node<'_>,
1191 source: &str,
1192 candidate_kinds: &[&str],
1193) -> Option<String> {
1194 let mut cursor = node.walk();
1195 for child in node.children(&mut cursor) {
1196 if candidate_kinds.contains(&child.kind())
1197 && let Some(text) = text_for_node(child, source)
1198 {
1199 return Some(text.trim().to_string());
1200 }
1201 }
1202 None
1203}
1204
1205fn first_word_of_node(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
1206 let text = text_for_node(node, source)?;
1207 text.split_whitespace().next().map(|s| {
1208 s.trim_end_matches(|c: char| !c.is_alphanumeric() && c != '_')
1209 .to_string()
1210 })
1211}
1212
/// UTF-8 text of `node` within `source`, or `None` when the node's
/// byte range is not a valid UTF-8 slice of the source.
fn text_for_node(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
    node.utf8_text(source.as_bytes())
        .ok()
        .map(|s| s.to_string())
}
1218
/// True when `source[start..end]` contains nothing but whitespace.
/// Degenerate ranges (empty, inverted, or extending past the end of
/// the source) count as whitespace-only so trivia attachment stays
/// permissive.
fn only_whitespace_between(source: &str, start: usize, end: usize) -> bool {
    if start >= end || end > source.len() {
        return true;
    }

    !source[start..end].chars().any(|c| !c.is_whitespace())
}
1226
1227fn adjust_chunk_type_for_context(
1228 node: tree_sitter::Node<'_>,
1229 chunk_type: ChunkType,
1230 language: ParseableLanguage,
1231) -> ChunkType {
1232 if chunk_type != ChunkType::Function {
1233 return chunk_type;
1234 }
1235
1236 if is_method_context(node, language) {
1237 ChunkType::Method
1238 } else {
1239 chunk_type
1240 }
1241}
1242
1243fn is_method_context(node: tree_sitter::Node<'_>, language: ParseableLanguage) -> bool {
1244 const PYTHON_CONTAINERS: &[&str] = &["class_definition"];
1245 const TYPESCRIPT_CONTAINERS: &[&str] = &["class_body", "class_declaration"];
1246 const RUBY_CONTAINERS: &[&str] = &["class", "module"];
1247 const RUST_CONTAINERS: &[&str] = &["impl_item", "trait_item"];
1248 const DART_CONTAINERS: &[&str] = &[
1249 "class_definition",
1250 "class_declaration",
1251 "mixin_declaration",
1252 "enum_declaration",
1253 ];
1254
1255 match language {
1256 ParseableLanguage::Python => ancestor_has_kind(node, PYTHON_CONTAINERS),
1257 ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
1258 ancestor_has_kind(node, TYPESCRIPT_CONTAINERS)
1259 }
1260 ParseableLanguage::Ruby => ancestor_has_kind(node, RUBY_CONTAINERS),
1261 ParseableLanguage::Rust => ancestor_has_kind(node, RUST_CONTAINERS),
1262 ParseableLanguage::Go => false,
1263 ParseableLanguage::CSharp => false,
1264 ParseableLanguage::Haskell => false,
1265 ParseableLanguage::Zig => false,
1266
1267 ParseableLanguage::Dart => ancestor_has_kind(node, DART_CONTAINERS),
1268
1269 ParseableLanguage::Elixir => false, }
1271}
1272
1273fn ancestor_has_kind(node: tree_sitter::Node<'_>, kinds: &[&str]) -> bool {
1274 let mut current = node;
1275 while let Some(parent) = current.parent() {
1276 if kinds.contains(&parent.kind()) {
1277 return true;
1278 }
1279 current = parent;
1280 }
1281 false
1282}
1283
1284fn is_csharp_field_like(node: tree_sitter::Node<'_>) -> bool {
1285 if let Some(parent) = node.parent() {
1286 return matches!(
1287 parent.kind(),
1288 "field_declaration" | "event_field_declaration"
1289 );
1290 }
1291 false
1292}
1293
1294fn apply_striding(chunks: Vec<Chunk>, config: &ChunkConfig) -> Result<Vec<Chunk>> {
1296 let mut result = Vec::new();
1297
1298 for chunk in chunks {
1299 let estimated_tokens = estimate_tokens(&chunk.text);
1300
1301 if estimated_tokens <= config.max_tokens {
1302 result.push(chunk);
1304 } else {
1305 tracing::debug!(
1307 "Chunk with {} tokens exceeds limit of {}, applying striding",
1308 estimated_tokens,
1309 config.max_tokens
1310 );
1311
1312 let strided_chunks = stride_large_chunk(chunk, config)?;
1313 result.extend(strided_chunks);
1314 }
1315 }
1316
1317 Ok(result)
1318}
1319
1320fn stride_large_chunk(chunk: Chunk, config: &ChunkConfig) -> Result<Vec<Chunk>> {
1322 let text = &chunk.text;
1323
1324 if text.is_empty() {
1326 return Ok(vec![chunk]);
1327 }
1328
1329 let char_count = text.chars().count();
1332 let estimated_tokens = estimate_tokens(text);
1333 let chars_per_token = if estimated_tokens == 0 {
1335 4.5 } else {
1337 char_count as f32 / estimated_tokens as f32
1338 };
1339 let window_chars = ((config.max_tokens as f32 * 0.9) * chars_per_token) as usize; let overlap_chars = (config.stride_overlap as f32 * chars_per_token) as usize;
1341 let stride_chars = window_chars.saturating_sub(overlap_chars);
1342
1343 if stride_chars == 0 {
1344 return Err(anyhow::anyhow!("Stride size is too small"));
1345 }
1346
1347 let char_byte_indices: Vec<(usize, char)> = text.char_indices().collect();
1349 let mut strided_chunks = Vec::new();
1352 let original_chunk_id = format!("{}:{}", chunk.span.byte_start, chunk.span.byte_end);
1353 let mut start_char_idx = 0;
1354 let mut stride_index = 0;
1355
1356 let total_strides = if char_count <= window_chars {
1358 1
1359 } else {
1360 ((char_count - overlap_chars) as f32 / stride_chars as f32).ceil() as usize
1361 };
1362
1363 while start_char_idx < char_count {
1364 let end_char_idx = (start_char_idx + window_chars).min(char_count);
1365
1366 let start_byte_pos = char_byte_indices[start_char_idx].0;
1368 let end_byte_pos = if end_char_idx < char_count {
1369 char_byte_indices[end_char_idx].0
1370 } else {
1371 text.len()
1372 };
1373
1374 let stride_text = &text[start_byte_pos..end_byte_pos];
1375
1376 let overlap_start = if stride_index > 0 { overlap_chars } else { 0 };
1378 let overlap_end = if end_char_idx < char_count {
1379 overlap_chars
1380 } else {
1381 0
1382 };
1383
1384 let byte_offset_start = chunk.span.byte_start + start_byte_pos;
1386 let byte_offset_end = chunk.span.byte_start + end_byte_pos;
1387
1388 let text_before_start = &text[..start_byte_pos];
1390 let line_offset_start = text_before_start.lines().count().saturating_sub(1);
1391 let stride_lines = stride_text.lines().count();
1392 let metadata = chunk.metadata.with_updated_text(stride_text);
1393
1394 let stride_chunk = Chunk {
1395 span: Span {
1396 byte_start: byte_offset_start,
1397 byte_end: byte_offset_end,
1398 line_start: chunk.span.line_start + line_offset_start,
1399 line_end: chunk.span.line_start
1401 + line_offset_start
1402 + stride_lines.saturating_sub(1),
1403 },
1404 text: stride_text.to_string(),
1405 chunk_type: chunk.chunk_type.clone(),
1406 stride_info: Some(StrideInfo {
1407 original_chunk_id: original_chunk_id.clone(),
1408 stride_index,
1409 total_strides,
1410 overlap_start,
1411 overlap_end,
1412 }),
1413 metadata,
1414 };
1415
1416 strided_chunks.push(stride_chunk);
1417
1418 if end_char_idx >= char_count {
1420 break;
1421 }
1422
1423 start_char_idx += stride_chars;
1424 stride_index += 1;
1425 }
1426
1427 tracing::debug!(
1428 "Created {} strides from chunk of {} tokens",
1429 strided_chunks.len(),
1430 estimate_tokens(text)
1431 );
1432
1433 Ok(strided_chunks)
1434}
1435
1436#[cfg(test)]
1439mod tests {
1440 use super::*;
1441
1442 fn canonicalize_spans(
1443 mut spans: Vec<(usize, usize, ChunkType)>,
1444 ) -> Vec<(usize, usize, ChunkType)> {
1445 fn chunk_type_order(chunk_type: &ChunkType) -> u8 {
1446 match chunk_type {
1447 ChunkType::Text => 0,
1448 ChunkType::Function => 1,
1449 ChunkType::Class => 2,
1450 ChunkType::Method => 3,
1451 ChunkType::Module => 4,
1452 }
1453 }
1454
1455 spans.sort_by(|a, b| {
1456 let order_a = chunk_type_order(&a.2);
1457 let order_b = chunk_type_order(&b.2);
1458 order_a
1459 .cmp(&order_b)
1460 .then_with(|| a.0.cmp(&b.0))
1461 .then_with(|| a.1.cmp(&b.1))
1462 });
1463
1464 let mut result: Vec<(usize, usize, ChunkType)> = Vec::new();
1465 for (start, end, ty) in spans {
1466 if let Some(last) = result.last_mut()
1467 && last.0 == start
1468 && last.2 == ty
1469 {
1470 if end > last.1 {
1471 last.1 = end;
1472 }
1473 continue;
1474 }
1475 result.push((start, end, ty));
1476 }
1477
1478 result
1479 }
1480
1481 fn assert_query_parity(language: ParseableLanguage, source: &str) {
1482 let mut parser = tree_sitter::Parser::new();
1483 let ts_language = tree_sitter_language(language).expect("language");
1484 parser.set_language(&ts_language).expect("set language");
1485 let tree = parser.parse(source, None).expect("parse source");
1486
1487 let query_chunks = query_chunker::chunk_with_queries(language, ts_language, &tree, source)
1488 .expect("query execution")
1489 .expect("queries available");
1490
1491 let mut legacy_chunks = Vec::new();
1492 let mut cursor = tree.walk();
1493 extract_code_chunks(&mut cursor, source, &mut legacy_chunks, language);
1494
1495 let query_spans = canonicalize_spans(
1496 query_chunks
1497 .iter()
1498 .map(|chunk| {
1499 (
1500 chunk.span.byte_start,
1501 chunk.span.byte_end,
1502 chunk.chunk_type.clone(),
1503 )
1504 })
1505 .collect(),
1506 );
1507 let legacy_spans = canonicalize_spans(
1508 legacy_chunks
1509 .iter()
1510 .map(|chunk| {
1511 (
1512 chunk.span.byte_start,
1513 chunk.span.byte_end,
1514 chunk.chunk_type.clone(),
1515 )
1516 })
1517 .collect(),
1518 );
1519
1520 assert_eq!(query_spans, legacy_spans);
1521 }
1522
1523 #[test]
1524 fn test_chunk_generic_byte_offsets() {
1525 let text = "line 1\nline 2\nline 3\nline 4\nline 5";
1527 let chunks = chunk_generic(text).unwrap();
1528
1529 assert!(!chunks.is_empty());
1530
1531 assert_eq!(chunks[0].span.byte_start, 0);
1533
1534 for chunk in &chunks {
1536 let expected_len = chunk.text.len();
1537 let actual_len = chunk.span.byte_end - chunk.span.byte_start;
1538 assert_eq!(actual_len, expected_len);
1539 }
1540 }
1541
1542 #[test]
1543 fn test_chunk_generic_large_file_performance() {
1544 let lines: Vec<String> = (0..1000)
1546 .map(|i| format!("Line {}: Some content here", i))
1547 .collect();
1548 let text = lines.join("\n");
1549
1550 let start = std::time::Instant::now();
1551 let chunks = chunk_generic(&text).unwrap();
1552 let duration = start.elapsed();
1553
1554 assert!(
1556 duration.as_millis() < 100,
1557 "Chunking took too long: {:?}",
1558 duration
1559 );
1560 assert!(!chunks.is_empty());
1561
1562 for chunk in &chunks {
1564 assert!(chunk.span.line_start > 0);
1565 assert!(chunk.span.line_end >= chunk.span.line_start);
1566 }
1567 }
1568
    /// Rust sources must yield Class (struct/impl), Module (`mod`), and
    /// Function chunks.
    #[test]
    fn test_chunk_rust() {
        let rust_code = r#"
pub struct Calculator {
    memory: f64,
}

impl Calculator {
    pub fn new() -> Self {
        Calculator { memory: 0.0 }
    }

    pub fn add(&mut self, a: f64, b: f64) -> f64 {
        a + b
    }
}

fn main() {
    let calc = Calculator::new();
}

pub mod utils {
    pub fn helper() {}
}
"#;

        let chunks = chunk_language(rust_code, ParseableLanguage::Rust).unwrap();
        assert!(!chunks.is_empty());

        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        // struct/impl containers surface as Class chunks.
        assert!(chunk_types.contains(&&ChunkType::Class));
        // `pub mod utils` surfaces as a Module chunk.
        assert!(chunk_types.contains(&&ChunkType::Module));
        // Free functions surface as Function chunks.
        assert!(chunk_types.contains(&&ChunkType::Function));
    }

    /// A `///` doc comment directly above an item must be included in that
    /// item's chunk text.
    #[test]
    fn test_rust_doc_comments_attached() {
        let rust_code = r#"
/// Doc comment
pub struct Foo {}
"#;
        let chunks = chunk_language(rust_code, ParseableLanguage::Rust).unwrap();
        let struct_chunk = chunks
            .iter()
            .find(|c| c.text.contains("struct Foo"))
            .unwrap();
        assert!(
            struct_chunk.text.contains("/// Doc comment"),
            "Doc comment should be attached"
        );
    }

    /// Query-based and legacy chunkers must agree on Rust sources.
    #[test]
    fn test_rust_query_matches_legacy() {
        let source = r#"
        mod sample {
            struct Thing;

            impl Thing {
                fn new() -> Self { Self }
                fn helper(&self) {}
            }
        }

        fn util() {}
        "#;

        assert_query_parity(ParseableLanguage::Rust, source);
    }

    /// Query-based and legacy chunkers must agree on Python sources,
    /// including decorated and async functions.
    #[test]
    fn test_python_query_matches_legacy() {
        let source = r#"
class Example:
    @classmethod
    def build(cls):
        return cls()


def helper():
    return 1


async def async_helper():
    return 2
"#;

        assert_query_parity(ParseableLanguage::Python, source);
    }
1659
    /// Ruby class/module/def constructs must map to Class, Module, and
    /// Function chunks.
    #[test]
    fn test_chunk_ruby() {
        let ruby_code = r#"
class Calculator
  def initialize
    @memory = 0.0
  end

  def add(a, b)
    a + b
  end

  def self.class_method
    "class method"
  end

  private

  def private_method
    "private"
  end
end

module Utils
  def self.helper
    "helper"
  end
end

def main
  calc = Calculator.new
end
"#;

        let chunks = chunk_language(ruby_code, ParseableLanguage::Ruby).unwrap();
        assert!(!chunks.is_empty());

        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        // `class Calculator` → Class.
        assert!(chunk_types.contains(&&ChunkType::Class));
        // `module Utils` → Module.
        assert!(chunk_types.contains(&&ChunkType::Module));
        // Top-level `def main` → Function.
        assert!(chunk_types.contains(&&ChunkType::Function));
    }
1703
1704 #[test]
1705 fn test_language_detection_fallback() {
1706 let generic_text = "Some text\nwith multiple lines\nto chunk generically";
1708
1709 let chunks_unknown = chunk_text(generic_text, None).unwrap();
1710 let chunks_generic = chunk_generic(generic_text).unwrap();
1711
1712 assert_eq!(chunks_unknown.len(), chunks_generic.len());
1714 assert_eq!(chunks_unknown[0].text, chunks_generic[0].text);
1715 }
1716
    /// Go files must produce Module (package-level items), Class (type
    /// declarations), Function, and Method (receiver funcs) chunks.
    #[test]
    fn test_chunk_go() {
        let go_code = r#"
package main

import "fmt"

const Pi = 3.14159

var memory float64

type Calculator struct {
    memory float64
}

type Operation interface {
    Calculate(a, b float64) float64
}

func NewCalculator() *Calculator {
    return &Calculator{memory: 0.0}
}

func (c *Calculator) Add(a, b float64) float64 {
    return a + b
}

func main() {
    calc := NewCalculator()
}
"#;

        let chunks = chunk_language(go_code, ParseableLanguage::Go).unwrap();
        assert!(!chunks.is_empty());

        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        assert!(chunk_types.contains(&&ChunkType::Module));
        assert!(chunk_types.contains(&&ChunkType::Class));
        assert!(chunk_types.contains(&&ChunkType::Function));
        // `func (c *Calculator) Add` has a receiver → Method.
        assert!(chunk_types.contains(&&ChunkType::Method));
    }
1759
    /// TypeScript arrow functions should be chunked with their surrounding
    /// declaration and leading comments, and class-field arrows should be
    /// classified as methods.
    // NOTE(review): marked #[ignore]; the reason is not visible in this
    // part of the file — confirm why it is skipped before re-enabling.
    #[test]
    #[ignore]
    fn test_chunk_typescript_arrow_context() {
        let ts_code = r#"
// Utility function
export const util = () => {
    // comment about util
    return 42;
};

export class Example {
    // leading comment for method
    constructor() {}

    // Another comment
    run = () => {
        return util();
    };
}

const compute = (x: number) => x * 2;
"#;

        let chunks = chunk_language(ts_code, ParseableLanguage::TypeScript).unwrap();

        // Exported arrow assigned to a const → Function, with its leading
        // comment and the full declaration text.
        let util_chunk = chunks
            .iter()
            .find(|chunk| chunk.text.contains("export const util"))
            .expect("Expected chunk for util arrow function");
        assert_eq!(util_chunk.chunk_type, ChunkType::Function);
        assert!(
            util_chunk.text.contains("// Utility function"),
            "expected leading comment to be included"
        );
        assert!(util_chunk.text.contains("export const util ="));

        // Class-field arrow → Method, with its preceding comment.
        let method_chunk = chunks
            .iter()
            .find(|chunk| {
                chunk.chunk_type == ChunkType::Method && chunk.text.contains("run = () =>")
            })
            .expect("Expected chunk for class field arrow function");

        assert_eq!(method_chunk.chunk_type, ChunkType::Method);
        assert!(
            method_chunk.text.contains("// Another comment"),
            "expected inline comment to be included"
        );

        // One-line arrow → Function containing the whole statement.
        let compute_chunk = chunks
            .iter()
            .find(|chunk| chunk.text.contains("const compute"))
            .expect("Expected chunk for compute arrow function");
        assert_eq!(compute_chunk.chunk_type, ChunkType::Function);
        assert!(
            compute_chunk
                .text
                .contains("const compute = (x: number) => x * 2;")
        );

        // No chunk may start at the bare arrow expression — the declaration
        // context must always be included.
        assert!(
            chunks
                .iter()
                .all(|chunk| !chunk.text.trim_start().starts_with("() =>"))
        );
        assert!(
            chunks
                .iter()
                .all(|chunk| !chunk.text.trim_start().starts_with("(x: number) =>"))
        );
    }

    /// Query-based and legacy chunkers must agree on TypeScript sources.
    // NOTE(review): also #[ignore]d; reason not visible here — confirm.
    #[test]
    #[ignore]
    fn test_typescript_query_matches_legacy() {
        let source = r#"
export const util = () => {
    return 42;
};

export class Example {
    run = () => {
        return util();
    };
}

const compute = (x: number) => x * 2;
"#;

        assert_query_parity(ParseableLanguage::TypeScript, source);
    }

    /// Query-based and legacy chunkers must agree on Ruby sources.
    #[test]
    fn test_ruby_query_matches_legacy() {
        let source = r#"
class Calculator
  def initialize
    @memory = 0.0
  end

  def add(a, b)
    a + b
  end

  def self.class_method
    "class method"
  end
end
"#;

        assert_query_parity(ParseableLanguage::Ruby, source);
    }

    /// Query-based and legacy chunkers must agree on Go sources.
    #[test]
    fn test_go_query_matches_legacy() {
        let source = r#"
package main

import "fmt"

const Pi = 3.14159

var memory float64

type Calculator struct {
    memory float64
}

func (c *Calculator) Add(a, b float64) float64 {
    return a + b
}

func Helper() {}
"#;

        assert_query_parity(ParseableLanguage::Go, source);
    }
1902
    /// Query-based and legacy chunkers must agree on Haskell sources
    /// (data/type family/class/instance/function declarations).
    #[test]
    fn test_haskell_query_matches_legacy() {
        let source = r#"
module Example where

data Shape
  = Circle Float
  | Square Float

type family Area a

class Printable a where
  printValue :: a -> String

instance Printable Shape where
  printValue (Circle _) = "circle"
  printValue (Square _) = "square"

shapeDescription :: Shape -> String
shapeDescription (Circle r) = "circle of radius " ++ show r
shapeDescription (Square s) = "square of side " ++ show s
"#;

        assert_query_parity(ParseableLanguage::Haskell, source);
    }

    /// Query-based and legacy chunkers must agree on C# sources.
    #[test]
    fn test_csharp_query_matches_legacy() {
        let source = r#"
namespace Calculator;

public interface ICalculator
{
    double Add(double x, double y);
}

public class Calculator
{
    public static double PI = 3.14159;
    private double _memory;

    public Calculator()
    {
        _memory = 0.0;
    }

    public double Add(double x, double y)
    {
        return x + y;
    }
}
"#;

        assert_query_parity(ParseableLanguage::CSharp, source);
    }

    /// Query-based and legacy chunkers must agree on Zig sources.
    #[test]
    fn test_zig_query_matches_legacy() {
        let source = r#"
const std = @import("std");

const Calculator = struct {
    memory: f64,

    pub fn init() Calculator {
        return Calculator{ .memory = 0.0 };
    }

    pub fn add(self: *Calculator, a: f64, b: f64) f64 {
        return a + b;
    }
};

test "calculator addition" {
    var calc = Calculator.init();
    const result = calc.add(2.0, 3.0);
    try std.testing.expect(result == 5.0);
}
"#;

        assert_query_parity(ParseableLanguage::Zig, source);
    }
1985
    /// Zig container declarations (struct/enum/union/opaque/error set)
    /// must surface as Class chunks, functions as Function chunks, and
    /// top-level const/comptime/test blocks as Module chunks.
    #[test]
    fn test_chunk_zig() {
        let zig_code = r#"
const std = @import("std");

const Calculator = struct {
    memory: f64,

    pub fn init() Calculator {
        return Calculator{ .memory = 0.0 };
    }

    pub fn add(self: *Calculator, a: f64, b: f64) f64 {
        const result = a + b;
        self.memory = result;
        return result;
    }
};

const Color = enum {
    Red,
    Green,
    Blue,
};

const Value = union(enum) {
    int: i32,
    float: f64,
};

const Handle = opaque {};

const MathError = error{
    DivisionByZero,
    Overflow,
};

pub fn multiply(a: i32, b: i32) i32 {
    return a * b;
}

pub fn divide(a: i32, b: i32) MathError!i32 {
    if (b == 0) return error.DivisionByZero;
    return @divTrunc(a, b);
}

comptime {
    @compileLog("Compile-time validation");
}

pub fn main() !void {
    var calc = Calculator.init();
    const result = calc.add(2.0, 3.0);
    std.debug.print("Result: {}\n", .{result});
}

test "calculator addition" {
    var calc = Calculator.init();
    const result = calc.add(2.0, 3.0);
    try std.testing.expect(result == 5.0);
}

test "multiply function" {
    const result = multiply(3, 4);
    try std.testing.expect(result == 12);
}
"#;

        let chunks = chunk_language(zig_code, ParseableLanguage::Zig).unwrap();
        assert!(!chunks.is_empty());

        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();

        // Count each category to check minimum expected coverage.
        let class_count = chunk_types
            .iter()
            .filter(|&&t| t == &ChunkType::Class)
            .count();
        let function_count = chunk_types
            .iter()
            .filter(|&&t| t == &ChunkType::Function)
            .count();
        let module_count = chunk_types
            .iter()
            .filter(|&&t| t == &ChunkType::Module)
            .count();

        assert!(
            class_count >= 5,
            "Expected at least 5 Class chunks (struct, enum, union, opaque, error set), found {}",
            class_count
        );

        assert!(
            function_count >= 3,
            "Expected at least 3 functions (multiply, divide, main), found {}",
            function_count
        );

        assert!(
            module_count >= 4,
            "Expected at least 4 module-type chunks (const std, comptime, 2 tests), found {}",
            module_count
        );

        assert!(
            chunk_types.contains(&&ChunkType::Class),
            "Expected to find Class chunks"
        );
        assert!(
            chunk_types.contains(&&ChunkType::Function),
            "Expected to find Function chunks"
        );
        assert!(
            chunk_types.contains(&&ChunkType::Module),
            "Expected to find Module chunks"
        );
    }
2103
    /// C# namespace/interface/class/method constructs must map to Module,
    /// Class, and Method chunks.
    #[test]
    fn test_chunk_csharp() {
        let csharp_code = r#"
namespace Calculator;

public interface ICalculator
{
    double Add(double x, double y);
}

public class Calculator
{
    public static const double PI = 3.14159;
    private double _memory;

    public Calculator()
    {
        _memory = 0.0;
    }

    public double Add(double x, double y)
    {
        return x + y;
    }

    public static void Main(string[] args)
    {
        var calc = new Calculator();
    }
}
"#;

        let chunks = chunk_language(csharp_code, ParseableLanguage::CSharp).unwrap();
        assert!(!chunks.is_empty());

        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        // file-scoped `namespace` → Module.
        assert!(chunk_types.contains(&&ChunkType::Module));
        // interface/class declarations → Class.
        assert!(chunk_types.contains(&&ChunkType::Class));
        // member functions → Method.
        assert!(chunk_types.contains(&&ChunkType::Method));
    }
2145
2146 #[test]
2147 fn test_stride_large_chunk_empty_text() {
2148 let empty_chunk = Chunk {
2150 span: Span {
2151 byte_start: 0,
2152 byte_end: 0,
2153 line_start: 1,
2154 line_end: 1,
2155 },
2156 text: String::new(), chunk_type: ChunkType::Text,
2158 stride_info: None,
2159 metadata: ChunkMetadata::from_text(""),
2160 };
2161
2162 let config = ChunkConfig::default();
2163 let result = stride_large_chunk(empty_chunk.clone(), &config);
2164
2165 assert!(result.is_ok());
2167 let chunks = result.unwrap();
2168 assert_eq!(chunks.len(), 1);
2169 assert_eq!(chunks[0].text, "");
2170 }
2171
2172 #[test]
2173 fn test_stride_large_chunk_zero_token_estimate() {
2174 let chunk = Chunk {
2176 span: Span {
2177 byte_start: 0,
2178 byte_end: 5,
2179 line_start: 1,
2180 line_end: 1,
2181 },
2182 text: " ".to_string(), chunk_type: ChunkType::Text,
2184 stride_info: None,
2185 metadata: ChunkMetadata::from_text(" "),
2186 };
2187
2188 let config = ChunkConfig::default();
2189 let result = stride_large_chunk(chunk, &config);
2190
2191 assert!(result.is_ok());
2193 }
2194
    /// After striding a 50-line chunk, every stride's line span must be
    /// well-formed and consistent with the number of lines in its text.
    #[test]
    fn test_strided_chunk_line_calculation() {
        // 50 verbose lines so the token estimate comfortably exceeds the
        // 100-token limit configured below.
        let long_text = (1..=50)
            .map(|i| format!("This is a longer line {} with more content to ensure token count is high enough", i))
            .collect::<Vec<_>>()
            .join("\n");

        let metadata = ChunkMetadata::from_text(&long_text);
        let chunk = Chunk {
            span: Span {
                byte_start: 0,
                byte_end: long_text.len(),
                line_start: 1,
                line_end: 50,
            },
            text: long_text,
            chunk_type: ChunkType::Text,
            stride_info: None,
            metadata,
        };

        let config = ChunkConfig {
            max_tokens: 100,
            stride_overlap: 10,
            ..Default::default()
        };

        let result = stride_large_chunk(chunk, &config);
        if let Err(e) = &result {
            eprintln!("Stride error: {}", e);
        }
        assert!(result.is_ok());

        let chunks = result.unwrap();
        assert!(
            chunks.len() > 1,
            "Should create multiple chunks when striding"
        );

        for chunk in chunks {
            // Spans are never inverted.
            assert!(chunk.span.line_end >= chunk.span.line_start);

            // The claimed line span may exceed the text's line count by at
            // most one (a stride that ends mid-line).
            let line_count = chunk.text.lines().count();
            if line_count > 0 {
                let calculated_line_span = chunk.span.line_end - chunk.span.line_start + 1;

                assert!(
                    calculated_line_span <= line_count + 1,
                    "Line span {} should not exceed content lines {} by more than 1",
                    calculated_line_span,
                    line_count
                );
            }
        }
    }
2253
    /// For Rust, Python, and TypeScript samples, every non-whitespace byte
    /// of the source must be covered by at least one chunk span (gap
    /// filling must not drop comments or trailing content).
    #[test]
    fn test_gap_filling_coverage() {
        let test_cases = vec![
            (
                ParseableLanguage::Rust,
                r#"// This is a test file with imports at the top
use std::collections::HashMap;
use std::sync::Arc;

// A comment between imports and code
const VERSION: &str = "1.0.0";

// Main function
fn main() {
    println!("Hello, world!");
}

// Some trailing content
// that should be indexed
"#,
            ),
            (
                ParseableLanguage::Python,
                r#"# Imports at the top
import os
import sys

# Some constant
VERSION = "1.0.0"

# Main function
def main():
    print("Hello, world!")

# Trailing comment
# should be indexed
"#,
            ),
            (
                ParseableLanguage::TypeScript,
                r#"// Imports at the top
import { foo } from 'bar';

// Some constant
const VERSION = "1.0.0";

// Main function
function main() {
    console.log("Hello, world!");
}

// Trailing comment
// should be indexed
"#,
            ),
        ];

        for (language, code) in test_cases {
            eprintln!("\n=== Testing {} ===", language);
            let chunks = chunk_language(code, language).unwrap();

            // Mark every byte covered by at least one chunk span.
            let mut covered_bytes = vec![false; code.len()];
            for chunk in &chunks {
                for item in covered_bytes
                    .iter_mut()
                    .take(chunk.span.byte_end)
                    .skip(chunk.span.byte_start)
                {
                    *item = true;
                }
            }

            // Collect byte positions that are neither covered nor ASCII
            // whitespace — these indicate dropped content.
            let uncovered_non_ws: Vec<usize> = covered_bytes
                .iter()
                .enumerate()
                .filter(|(i, covered)| !**covered && !code.as_bytes()[*i].is_ascii_whitespace())
                .map(|(i, _)| i)
                .collect();

            // Diagnostic dump before the assertion so failures are debuggable.
            if !uncovered_non_ws.is_empty() {
                eprintln!("\n=== UNCOVERED NON-WHITESPACE for {} ===", language);
                eprintln!("Total bytes: {}", code.len());
                eprintln!("Uncovered non-whitespace: {}", uncovered_non_ws.len());

                for &pos in uncovered_non_ws.iter().take(10) {
                    let context_start = pos.saturating_sub(20);
                    let context_end = (pos + 20).min(code.len());
                    eprintln!(
                        "Uncovered at byte {}: {:?}",
                        pos,
                        &code[context_start..context_end]
                    );
                }

                eprintln!("\n=== CHUNKS ===");
                for (i, chunk) in chunks.iter().enumerate() {
                    eprintln!(
                        "Chunk {}: {:?} bytes {}-{} (len {})",
                        i,
                        chunk.chunk_type,
                        chunk.span.byte_start,
                        chunk.span.byte_end,
                        chunk.span.byte_end - chunk.span.byte_start
                    );
                    eprintln!("  Text: {:?}", &chunk.text[..chunk.text.len().min(60)]);
                }
            }

            assert!(
                uncovered_non_ws.is_empty(),
                "{}: Expected all non-whitespace covered but found {} uncovered non-whitespace bytes",
                language,
                uncovered_non_ws.len()
            );
        }
    }
2373
2374 #[test]
2375 fn test_web_server_file_coverage() {
2376 let code = std::fs::read_to_string("../examples/code/web_server.rs")
2378 .expect("Failed to read web_server.rs");
2379
2380 let chunks = chunk_language(&code, ParseableLanguage::Rust).unwrap();
2381
2382 let mut covered = vec![false; code.len()];
2384 for chunk in &chunks {
2385 for item in covered
2386 .iter_mut()
2387 .take(chunk.span.byte_end)
2388 .skip(chunk.span.byte_start)
2389 {
2390 *item = true;
2391 }
2392 }
2393
2394 let uncovered_non_whitespace: Vec<(usize, char)> = covered
2396 .iter()
2397 .enumerate()
2398 .filter(|(i, covered)| !**covered && !code.as_bytes()[*i].is_ascii_whitespace())
2399 .map(|(i, _)| (i, code.chars().nth(i).unwrap_or('?')))
2400 .collect();
2401
2402 if !uncovered_non_whitespace.is_empty() {
2403 eprintln!("\n=== WEB_SERVER.RS UNCOVERED NON-WHITESPACE ===");
2404 eprintln!("File size: {} bytes", code.len());
2405 eprintln!("Total chunks: {}", chunks.len());
2406 eprintln!(
2407 "Uncovered non-whitespace: {}",
2408 uncovered_non_whitespace.len()
2409 );
2410
2411 for &(pos, ch) in uncovered_non_whitespace.iter().take(10) {
2412 let start = pos.saturating_sub(30);
2413 let end = (pos + 30).min(code.len());
2414 eprintln!(
2415 "\nUncovered '{}' at byte {}: {:?}",
2416 ch,
2417 pos,
2418 &code[start..end]
2419 );
2420 }
2421
2422 eprintln!("\n=== CHUNKS ===");
2423 for (i, chunk) in chunks.iter().enumerate().take(20) {
2424 eprintln!(
2425 "Chunk {}: {:?} bytes {}-{} lines {}-{}",
2426 i,
2427 chunk.chunk_type,
2428 chunk.span.byte_start,
2429 chunk.span.byte_end,
2430 chunk.span.line_start,
2431 chunk.span.line_end
2432 );
2433 }
2434 }
2435
2436 assert!(
2437 uncovered_non_whitespace.is_empty(),
2438 "Expected all non-whitespace content covered but found {} uncovered non-whitespace bytes",
2439 uncovered_non_whitespace.len()
2440 );
2441 }
2442
    /// A Haskell function's chunk must gather its type signature together
    /// with all of its equations (base and recursive cases).
    #[test]
    fn test_haskell_function_chunking() {
        let haskell_code = r#"
factorial :: Integer -> Integer
factorial 0 = 1
factorial n = n * factorial (n - 1)

fibonacci :: Integer -> Integer
fibonacci 0 = 0
fibonacci 1 = 1
fibonacci n = fibonacci (n - 1) + fibonacci (n - 2)
"#;

        let mut parser = tree_sitter::Parser::new();
        parser
            .set_language(&tree_sitter_haskell::LANGUAGE.into())
            .unwrap();
        let tree = parser.parse(haskell_code, None).unwrap();

        // Debug helper: dump the parse tree (node kind + 1-based line span)
        // so grammar changes are easy to diagnose from test output.
        fn walk(node: tree_sitter::Node, _src: &str, depth: usize) {
            let kind = node.kind();
            let start = node.start_position();
            let end = node.end_position();
            eprintln!(
                "{}{:30} L{}-{}",
                "  ".repeat(depth),
                kind,
                start.row + 1,
                end.row + 1
            );

            let mut cursor = node.walk();
            if cursor.goto_first_child() {
                loop {
                    walk(cursor.node(), _src, depth + 1);
                    if !cursor.goto_next_sibling() {
                        break;
                    }
                }
            }
        }

        eprintln!("\n=== TREE STRUCTURE ===");
        walk(tree.root_node(), haskell_code, 0);
        eprintln!("=== END TREE ===\n");

        let chunks = chunk_language(haskell_code, ParseableLanguage::Haskell).unwrap();

        eprintln!("\n=== CHUNKS ===");
        for (i, chunk) in chunks.iter().enumerate() {
            eprintln!(
                "Chunk {}: {:?} L{}-{}",
                i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
            );
            eprintln!("  Text: {:?}", chunk.text);
        }
        eprintln!("=== END CHUNKS ===\n");

        assert!(!chunks.is_empty(), "Should find chunks in Haskell code");

        let factorial_chunk = chunks.iter().find(|c| c.text.contains("factorial 0 = 1"));
        assert!(
            factorial_chunk.is_some(),
            "Should find factorial function body"
        );

        // Signature and every equation must live in the same chunk.
        let fac = factorial_chunk.unwrap();
        assert!(
            fac.text.contains("factorial :: Integer -> Integer"),
            "Should include type signature"
        );
        assert!(
            fac.text.contains("factorial 0 = 1"),
            "Should include base case"
        );
        assert!(
            fac.text.contains("factorial n = n * factorial (n - 1)"),
            "Should include recursive case"
        );
    }
2525
    /// Elixir `defmodule` must be chunked as a Module and `def`/`defp`
    /// as Functions.
    #[test]
    fn test_chunk_elixir_basic() {
        let elixir_code = r#"
defmodule Calculator do
  @moduledoc "A simple calculator module"

  def add(a, b) do
    a + b
  end

  defp multiply(a, b) do
    a * b
  end
end
"#;

        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();

        // Diagnostic dump; runs regardless so failures show what was found.
        eprintln!("\n=== ELIXIR CHUNKS ===");
        for (i, chunk) in chunks.iter().enumerate() {
            eprintln!(
                "Chunk {}: {:?} L{}-{}",
                i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
            );
            eprintln!("  Text: {:?}", &chunk.text[..chunk.text.len().min(80)]);
        }
        eprintln!("=== END CHUNKS ===\n");

        assert!(!chunks.is_empty(), "Should find chunks in Elixir code");

        let has_module = chunks.iter().any(|c| c.chunk_type == ChunkType::Module);
        let has_function = chunks.iter().any(|c| c.chunk_type == ChunkType::Function);

        assert!(has_module, "Should detect defmodule as Module");
        assert!(has_function, "Should detect def/defp as Function");
    }

    /// Elixir `defprotocol` and `defimpl` must both be chunked as modules.
    #[test]
    fn test_chunk_elixir_protocol() {
        let elixir_code = r#"
defprotocol Stringable do
  @doc "Converts to string"
  def to_string(value)
end

defimpl Stringable, for: Integer do
  def to_string(value), do: Integer.to_string(value)
end
"#;

        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();

        eprintln!("\n=== ELIXIR PROTOCOL CHUNKS ===");
        for (i, chunk) in chunks.iter().enumerate() {
            eprintln!(
                "Chunk {}: {:?} L{}-{}",
                i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
            );
            eprintln!("  Text: {:?}", &chunk.text[..chunk.text.len().min(80)]);
        }
        eprintln!("=== END CHUNKS ===\n");

        let modules: Vec<_> = chunks
            .iter()
            .filter(|c| c.chunk_type == ChunkType::Module)
            .collect();

        assert!(
            modules.len() >= 2,
            "Should detect defprotocol and defimpl as modules, found {}",
            modules.len()
        );
    }

    /// GenServer callback definitions are ordinary `def`s and must all be
    /// detected as Function chunks.
    #[test]
    fn test_chunk_elixir_genserver() {
        let elixir_code = r#"
defmodule MyServer do
  use GenServer

  def start_link(opts) do
    GenServer.start_link(__MODULE__, opts, name: __MODULE__)
  end

  def init(state) do
    {:ok, state}
  end

  def handle_call(:get, _from, state) do
    {:reply, state, state}
  end

  def handle_cast({:set, value}, _state) do
    {:noreply, value}
  end
end
"#;

        let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();

        let functions: Vec<_> = chunks
            .iter()
            .filter(|c| c.chunk_type == ChunkType::Function)
            .collect();

        assert!(
            functions.len() >= 4,
            "Should detect at least 4 functions (start_link, init, handle_call, handle_cast), found {}",
            functions.len()
        );
    }
2640
2641 #[test]
2642 fn test_elixir_extension_detection() {
2643 use ck_core::Language;
2644
2645 assert_eq!(Language::from_extension("ex"), Some(Language::Elixir));
2646 assert_eq!(Language::from_extension("exs"), Some(Language::Elixir));
2647 assert_eq!(Language::from_extension("EX"), Some(Language::Elixir));
2648 assert_eq!(Language::from_extension("EXS"), Some(Language::Elixir));
2649 }
2650
2651 #[test]
2652 fn test_chunk_elixir_macros() {
2653 let elixir_code = r#"
2654defmodule MyMacros do
2655 defmacro unless(condition, do: block) do
2656 quote do
2657 if !unquote(condition), do: unquote(block)
2658 end
2659 end
2660
2661 defmacrop private_macro(x) do
2662 quote do: unquote(x) * 2
2663 end
2664end
2665"#;
2666
2667 let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
2668
2669 let functions: Vec<_> = chunks
2670 .iter()
2671 .filter(|c| c.chunk_type == ChunkType::Function)
2672 .collect();
2673
2674 assert!(
2675 functions.len() >= 2,
2676 "Should detect defmacro and defmacrop as functions, found {}",
2677 functions.len()
2678 );
2679 }
2680
2681 #[test]
2682 fn test_chunk_elixir_module_attributes() {
2683 let elixir_code = r#"
2684defmodule Calculator do
2685 @moduledoc "A calculator with type specs"
2686
2687 @behaviour GenServer
2688
2689 @type operation :: :add | :subtract | :multiply | :divide
2690 @typep internal_state :: %{history: list()}
2691 @opaque result :: {:ok, number()} | {:error, atom()}
2692
2693 @callback init(args :: term()) :: {:ok, state :: term()}
2694 @callback handle_call(request :: term(), from :: term(), state :: term()) :: {:reply, term(), term()}
2695
2696 @optional_callbacks [handle_info: 2]
2697
2698 @spec add(number(), number()) :: number()
2699 def add(a, b), do: a + b
2700
2701 @spec subtract(number(), number()) :: number()
2702 def subtract(a, b), do: a - b
2703end
2704"#;
2705
2706 let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
2707
2708 eprintln!("\n=== ELIXIR MODULE ATTRIBUTES CHUNKS ===");
2709 for (i, chunk) in chunks.iter().enumerate() {
2710 eprintln!(
2711 "Chunk {}: {:?} L{}-{}",
2712 i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
2713 );
2714 eprintln!(" Text: {:?}", &chunk.text[..chunk.text.len().min(80)]);
2715 }
2716 eprintln!("=== END CHUNKS ===\n");
2717
2718 let has_behaviour = chunks
2720 .iter()
2721 .any(|c| c.chunk_type == ChunkType::Text && c.text.contains("@behaviour GenServer"));
2722 assert!(has_behaviour, "Should capture @behaviour declaration");
2723
2724 let type_chunks: Vec<_> = chunks
2726 .iter()
2727 .filter(|c| {
2728 c.chunk_type == ChunkType::Text
2729 && (c.text.contains("@type")
2730 || c.text.contains("@typep")
2731 || c.text.contains("@opaque"))
2732 })
2733 .collect();
2734 assert!(
2735 type_chunks.len() >= 3,
2736 "Should capture @type, @typep, and @opaque, found {}",
2737 type_chunks.len()
2738 );
2739
2740 let callback_chunks: Vec<_> = chunks
2742 .iter()
2743 .filter(|c| c.chunk_type == ChunkType::Text && c.text.contains("@callback"))
2744 .collect();
2745 assert!(
2746 callback_chunks.len() >= 2,
2747 "Should capture @callback definitions, found {}",
2748 callback_chunks.len()
2749 );
2750
2751 let spec_chunks: Vec<_> = chunks
2753 .iter()
2754 .filter(|c| c.chunk_type == ChunkType::Text && c.text.contains("@spec"))
2755 .collect();
2756 assert!(
2757 spec_chunks.len() >= 2,
2758 "Should capture @spec definitions, found {}",
2759 spec_chunks.len()
2760 );
2761
2762 let function_chunks: Vec<_> = chunks
2764 .iter()
2765 .filter(|c| c.chunk_type == ChunkType::Function)
2766 .collect();
2767 assert!(
2768 function_chunks.len() >= 2,
2769 "Should still capture def functions, found {}",
2770 function_chunks.len()
2771 );
2772 }
2773
2774 #[test]
2775 fn test_chunk_elixir_behavior_spelling() {
2776 let elixir_code = r#"
2778defmodule BritishModule do
2779 @behaviour GenServer
2780end
2781
2782defmodule AmericanModule do
2783 @behavior GenServer
2784end
2785"#;
2786
2787 let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
2788
2789 let behaviour_chunks: Vec<_> = chunks
2790 .iter()
2791 .filter(|c| {
2792 c.chunk_type == ChunkType::Text
2793 && (c.text.contains("@behaviour") || c.text.contains("@behavior"))
2794 })
2795 .collect();
2796
2797 assert!(
2798 behaviour_chunks.len() >= 2,
2799 "Should capture both @behaviour and @behavior spellings, found {}",
2800 behaviour_chunks.len()
2801 );
2802 }
2803}