1use anyhow::Result;
2use ck_core::Span;
3use serde::{Deserialize, Serialize};
4
5mod query_chunker;
6
7pub use ck_embed::TokenEstimator;
9
/// Estimates how many tokens `text` contains.
///
/// Pure delegation to [`TokenEstimator::estimate_tokens`]; kept as a free function so
/// the metadata constructors in this file share a single call site.
fn estimate_tokens(text: &str) -> usize {
    TokenEstimator::estimate_tokens(text)
}
14
/// Returns the `(target_tokens, overlap_tokens)` chunking budget for an embedding model.
///
/// `None` is treated as `"nomic-embed-text-v1.5"`. The BGE and MiniLM models get the
/// small budget; every other name, known or not, gets the large one.
pub fn get_model_chunk_config(model_name: Option<&str>) -> (usize, usize) {
    const SMALL_BUDGET: (usize, usize) = (400, 80);
    const LARGE_BUDGET: (usize, usize) = (1024, 200);

    match model_name.unwrap_or("nomic-embed-text-v1.5") {
        "BAAI/bge-small-en-v1.5"
        | "sentence-transformers/all-MiniLM-L6-v2"
        | "BAAI/bge-base-en-v1.5"
        | "BAAI/bge-large-en-v1.5" => SMALL_BUDGET,
        // "nomic-embed-text-v1", "nomic-embed-text-v1.5" and
        // "jina-embeddings-v2-base-code" share the budget used for unknown models.
        _ => LARGE_BUDGET,
    }
}
41
/// Stride bookkeeping attached to a [`Chunk`] through its `stride_info` field.
///
/// NOTE(review): the code that fills this struct is not part of this section, so the
/// field descriptions below are inferred from the names only — confirm against the
/// striding implementation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StrideInfo {
    /// Presumably identifies the chunk this stride was cut from — TODO confirm.
    pub original_chunk_id: String,
    /// Presumably the position of this stride within its chunk — TODO confirm base (0 or 1).
    pub stride_index: usize,
    /// Presumably the number of strides the original chunk was split into — TODO confirm.
    pub total_strides: usize,
    /// Presumably where the overlap with a neighbouring stride starts — TODO confirm unit.
    pub overlap_start: usize,
    /// Presumably where the overlap with a neighbouring stride ends — TODO confirm unit.
    pub overlap_end: usize,
}
56
/// Descriptive data stored next to a chunk's text.
///
/// The default value has empty vectors, no breadcrumb and zero sizes.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ChunkMetadata {
    /// Names of the enclosing scopes; `from_context` joins them with `"::"` to form
    /// `breadcrumb`.
    pub ancestry: Vec<String>,
    /// `ancestry` joined with `"::"`, or `None` when `ancestry` is empty. `build_chunk`
    /// may overwrite it with a qualified name for C/C++ functions and methods.
    pub breadcrumb: Option<String>,
    /// Leading trivia segments collected for the chunk, as strings.
    pub leading_trivia: Vec<String>,
    /// Trailing trivia segments collected for the chunk, as strings.
    pub trailing_trivia: Vec<String>,
    /// Length of the chunk text in bytes (`str::len`).
    pub byte_length: usize,
    /// Token estimate for the chunk text, as returned by `estimate_tokens`.
    pub estimated_tokens: usize,
}
66
67impl ChunkMetadata {
68 fn from_context(
69 text: &str,
70 ancestry: Vec<String>,
71 leading_trivia: Vec<String>,
72 trailing_trivia: Vec<String>,
73 ) -> Self {
74 let breadcrumb = if ancestry.is_empty() {
75 None
76 } else {
77 Some(ancestry.join("::"))
78 };
79
80 Self {
81 ancestry,
82 breadcrumb,
83 leading_trivia,
84 trailing_trivia,
85 byte_length: text.len(),
86 estimated_tokens: estimate_tokens(text),
87 }
88 }
89
90 fn from_text(text: &str) -> Self {
91 Self {
92 ancestry: Vec::new(),
93 breadcrumb: None,
94 leading_trivia: Vec::new(),
95 trailing_trivia: Vec::new(),
96 byte_length: text.len(),
97 estimated_tokens: estimate_tokens(text),
98 }
99 }
100
101 fn with_updated_text(&self, text: &str) -> Self {
102 let mut cloned = self.clone();
103 cloned.byte_length = text.len();
104 cloned.estimated_tokens = estimate_tokens(text);
105 cloned
106 }
107}
108
/// One unit of chunked source text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// Byte range and line range this chunk covers in the source.
    pub span: Span,
    /// Text of the chunk. It is not always the exact source slice for `span`: generic
    /// chunks are re-joined with `"\n"`, and C/C++ class chunks have method bodies
    /// replaced by a placeholder.
    pub text: String,
    /// Structural classification of the chunk.
    pub chunk_type: ChunkType,
    /// Stride bookkeeping; every chunk constructed in this section sets it to `None`.
    pub stride_info: Option<StrideInfo>,
    /// Ancestry, trivia and size information for `text`.
    pub metadata: ChunkMetadata,
}
118
/// Structural category of a [`Chunk`].
///
/// `classify_chunk_kind` maps tree-sitter node kinds onto these variants.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkType {
    /// Fallback category; also used for generic line-based chunks and gap chunks.
    Text,
    /// Function-like node kinds (including Haskell signatures).
    Function,
    /// Class-, struct-, enum- and union-like node kinds.
    Class,
    /// Method-like node kinds.
    Method,
    /// Module-level kinds, which in `classify_chunk_kind` also cover impls, traits,
    /// namespaces, top-level variable declarations and Markdown headings/sections.
    Module,
}
127
/// Languages for which `tree_sitter_language` can supply a tree-sitter grammar.
///
/// Converted from `ck_core::Language` via `TryFrom`; languages without a mapping are
/// rejected there.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ParseableLanguage {
    Python,
    TypeScript,
    /// Parsed with the TypeScript grammar (see `tree_sitter_language`).
    JavaScript,
    Haskell,
    Rust,
    Ruby,
    Go,
    C,
    Cpp,
    CSharp,
    Zig,

    Dart,

    Elixir,

    Markdown,
}
148
149impl std::fmt::Display for ParseableLanguage {
150 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
151 let name = match self {
152 ParseableLanguage::Python => "python",
153 ParseableLanguage::TypeScript => "typescript",
154 ParseableLanguage::JavaScript => "javascript",
155 ParseableLanguage::Haskell => "haskell",
156 ParseableLanguage::Rust => "rust",
157 ParseableLanguage::Ruby => "ruby",
158 ParseableLanguage::Go => "go",
159 ParseableLanguage::C => "c",
160 ParseableLanguage::Cpp => "cpp",
161 ParseableLanguage::CSharp => "csharp",
162 ParseableLanguage::Zig => "zig",
163
164 ParseableLanguage::Dart => "dart",
165
166 ParseableLanguage::Elixir => "elixir",
167
168 ParseableLanguage::Markdown => "markdown",
169 };
170 write!(f, "{name}")
171 }
172}
173
174impl TryFrom<ck_core::Language> for ParseableLanguage {
175 type Error = anyhow::Error;
176
177 fn try_from(lang: ck_core::Language) -> Result<Self, Self::Error> {
178 match lang {
179 ck_core::Language::Python => Ok(ParseableLanguage::Python),
180 ck_core::Language::TypeScript => Ok(ParseableLanguage::TypeScript),
181 ck_core::Language::JavaScript => Ok(ParseableLanguage::JavaScript),
182 ck_core::Language::Haskell => Ok(ParseableLanguage::Haskell),
183 ck_core::Language::Rust => Ok(ParseableLanguage::Rust),
184 ck_core::Language::Ruby => Ok(ParseableLanguage::Ruby),
185 ck_core::Language::Go => Ok(ParseableLanguage::Go),
186 ck_core::Language::C => Ok(ParseableLanguage::C),
187 ck_core::Language::Cpp => Ok(ParseableLanguage::Cpp),
188 ck_core::Language::CSharp => Ok(ParseableLanguage::CSharp),
189 ck_core::Language::Zig => Ok(ParseableLanguage::Zig),
190
191 ck_core::Language::Dart => Ok(ParseableLanguage::Dart),
192
193 ck_core::Language::Elixir => Ok(ParseableLanguage::Elixir),
194
195 ck_core::Language::Markdown => Ok(ParseableLanguage::Markdown),
196
197 _ => Err(anyhow::anyhow!(
198 "Language {lang:?} is not supported for parsing"
199 )),
200 }
201 }
202}
203
204pub fn chunk_text(text: &str, language: Option<ck_core::Language>) -> Result<Vec<Chunk>> {
205 chunk_text_with_config(text, language, &ChunkConfig::default())
206}
207
/// Settings that control the striding step applied after chunking.
///
/// In this section the config is only consulted for `enable_striding` and then handed
/// to `apply_striding`.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Token budget per chunk. Presumably the size above which a chunk is split into
    /// strides — confirm against `apply_striding`.
    pub max_tokens: usize,
    /// Token overlap between strides. Presumably between consecutive strides of the
    /// same chunk — confirm against `apply_striding`.
    pub stride_overlap: usize,
    /// When `false`, chunks are returned without the striding pass.
    pub enable_striding: bool,
}
218
219impl Default for ChunkConfig {
220 fn default() -> Self {
221 Self {
222 max_tokens: 8192, stride_overlap: 1024, enable_striding: true,
225 }
226 }
227}
228
229pub fn chunk_text_with_model(
231 text: &str,
232 language: Option<ck_core::Language>,
233 model_name: Option<&str>,
234) -> Result<Vec<Chunk>> {
235 let (target_tokens, overlap_tokens) = get_model_chunk_config(model_name);
236
237 let config = ChunkConfig {
239 max_tokens: target_tokens,
240 stride_overlap: overlap_tokens,
241 enable_striding: true,
242 };
243
244 chunk_text_with_config_and_model(text, language, &config, model_name)
245}
246
247pub fn chunk_text_with_config(
248 text: &str,
249 language: Option<ck_core::Language>,
250 config: &ChunkConfig,
251) -> Result<Vec<Chunk>> {
252 chunk_text_with_config_and_model(text, language, config, None)
253}
254
255fn chunk_text_with_config_and_model(
256 text: &str,
257 language: Option<ck_core::Language>,
258 config: &ChunkConfig,
259 model_name: Option<&str>,
260) -> Result<Vec<Chunk>> {
261 tracing::debug!(
262 "Chunking text with language: {:?}, length: {} chars, config: {:?}",
263 language,
264 text.len(),
265 config
266 );
267
268 let result = match language.map(ParseableLanguage::try_from) {
269 Some(Ok(lang)) => {
270 tracing::debug!("Using {} tree-sitter parser", lang);
271 chunk_language_with_model(text, lang, model_name)
272 }
273 Some(Err(_)) => {
274 tracing::debug!("Language not supported for parsing, using generic chunking strategy");
275 chunk_generic_with_token_config(text, model_name)
276 }
277 None => {
278 tracing::debug!("Using generic chunking strategy");
279 chunk_generic_with_token_config(text, model_name)
280 }
281 };
282
283 let mut chunks = result?;
284
285 if config.enable_striding {
287 chunks = apply_striding(chunks, config)?;
288 }
289
290 tracing::debug!("Successfully created {} final chunks", chunks.len());
291 Ok(chunks)
292}
293
294fn chunk_generic(text: &str) -> Result<Vec<Chunk>> {
295 chunk_generic_with_token_config(text, None)
296}
297
298fn chunk_generic_with_token_config(text: &str, model_name: Option<&str>) -> Result<Vec<Chunk>> {
299 let mut chunks = Vec::new();
300 let lines: Vec<&str> = text.lines().collect();
301
302 let (target_tokens, overlap_tokens) = get_model_chunk_config(model_name);
304
305 let avg_tokens_per_line = 10.0; let target_lines = ((target_tokens as f32) / avg_tokens_per_line) as usize;
309 let overlap_lines = ((overlap_tokens as f32) / avg_tokens_per_line) as usize;
310
311 let chunk_size = target_lines.max(5); let overlap = overlap_lines.max(1); let mut line_byte_offsets = Vec::with_capacity(lines.len() + 1);
316 line_byte_offsets.push(0);
317 let mut cumulative_offset = 0;
318 let mut byte_pos = 0;
319
320 for line in lines.iter() {
321 cumulative_offset += line.len();
322
323 let line_end_pos = byte_pos + line.len();
325 let newline_len = if line_end_pos < text.len() && text.as_bytes()[line_end_pos] == b'\r' {
326 if line_end_pos + 1 < text.len() && text.as_bytes()[line_end_pos + 1] == b'\n' {
327 2 } else {
329 1 }
331 } else if line_end_pos < text.len() && text.as_bytes()[line_end_pos] == b'\n' {
332 1 } else {
334 0 };
336
337 cumulative_offset += newline_len;
338 byte_pos = cumulative_offset;
339 line_byte_offsets.push(cumulative_offset);
340 }
341
342 let mut i = 0;
343 while i < lines.len() {
344 let end = (i + chunk_size).min(lines.len());
345 let chunk_lines = &lines[i..end];
346 let chunk_text = chunk_lines.join("\n");
347 let byte_start = line_byte_offsets[i];
348 let byte_end = line_byte_offsets[end];
349 let metadata = ChunkMetadata::from_text(&chunk_text);
350
351 chunks.push(Chunk {
352 span: Span {
353 byte_start,
354 byte_end,
355 line_start: i + 1,
356 line_end: end,
357 },
358 text: chunk_text,
359 chunk_type: ChunkType::Text,
360 stride_info: None,
361 metadata,
362 });
363
364 i += chunk_size - overlap;
365 if i >= lines.len() {
366 break;
367 }
368 }
369
370 Ok(chunks)
371}
372
373pub(crate) fn tree_sitter_language(language: ParseableLanguage) -> Result<tree_sitter::Language> {
374 if language == ParseableLanguage::Dart {
377 return Ok(tree_sitter_dart::language());
378 }
379
380 if language == ParseableLanguage::Markdown {
381 return Ok(tree_sitter_md::LANGUAGE.into());
382 }
383
384 let ts_language = match language {
385 ParseableLanguage::Python => tree_sitter_python::LANGUAGE,
386 ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
387 tree_sitter_typescript::LANGUAGE_TYPESCRIPT
388 }
389 ParseableLanguage::Haskell => tree_sitter_haskell::LANGUAGE,
390 ParseableLanguage::Rust => tree_sitter_rust::LANGUAGE,
391 ParseableLanguage::Ruby => tree_sitter_ruby::LANGUAGE,
392 ParseableLanguage::Go => tree_sitter_go::LANGUAGE,
393 ParseableLanguage::C => tree_sitter_c::LANGUAGE,
394 ParseableLanguage::Cpp => tree_sitter_cpp::LANGUAGE,
395 ParseableLanguage::CSharp => tree_sitter_c_sharp::LANGUAGE,
396 ParseableLanguage::Zig => tree_sitter_zig::LANGUAGE,
397
398 ParseableLanguage::Dart => unreachable!("Handled above via early return"),
399
400 ParseableLanguage::Elixir => tree_sitter_elixir::LANGUAGE,
401
402 ParseableLanguage::Markdown => unreachable!("Handled above via early return"),
403 };
404
405 Ok(ts_language.into())
406}
407
408fn chunk_language(text: &str, language: ParseableLanguage) -> Result<Vec<Chunk>> {
409 let mut parser = tree_sitter::Parser::new();
410 let ts_language = tree_sitter_language(language)?;
411 parser.set_language(&ts_language)?;
412
413 let tree = parser
414 .parse(text, None)
415 .ok_or_else(|| anyhow::anyhow!("Failed to parse {language} code"))?;
416
417 let mut chunks = match query_chunker::chunk_with_queries(language, ts_language, &tree, text)? {
418 Some(query_chunks) if !query_chunks.is_empty() => query_chunks,
419 _ => {
420 let mut legacy_chunks = Vec::new();
421 let mut cursor = tree.walk();
422 extract_code_chunks(&mut cursor, text, &mut legacy_chunks, language);
423 legacy_chunks
424 }
425 };
426
427 if chunks.is_empty() {
428 return chunk_generic(text);
429 }
430
431 if language == ParseableLanguage::Haskell {
433 chunks = merge_haskell_functions(chunks, text);
434 }
435
436 if matches!(language, ParseableLanguage::C | ParseableLanguage::Cpp) {
438 chunks = suppress_contained_text_chunks(chunks);
439 }
440
441 chunks = fill_gaps(chunks, text);
443
444 if language == ParseableLanguage::Cpp {
446 chunks = merge_cpp_template_prefix_chunks(chunks, text);
447 }
448
449 if language == ParseableLanguage::Markdown {
451 let (target_tokens, _) = get_model_chunk_config(None);
452 chunks = merge_small_chunks(chunks, text, target_tokens);
453 }
454
455 Ok(chunks)
456}
457
458fn suppress_contained_text_chunks(chunks: Vec<Chunk>) -> Vec<Chunk> {
459 if chunks.is_empty() {
460 return chunks;
461 }
462
463 let mut containers: Vec<(usize, usize)> = chunks
464 .iter()
465 .filter(|chunk| {
466 matches!(
467 chunk.chunk_type,
468 ChunkType::Class | ChunkType::Method | ChunkType::Function
469 )
470 })
471 .map(|chunk| (chunk.span.byte_start, chunk.span.byte_end))
472 .collect();
473
474 if containers.is_empty() {
475 return chunks;
476 }
477
478 containers.sort_by_key(|(start, _)| *start);
479
480 chunks
481 .into_iter()
482 .filter(|chunk| {
483 if chunk.chunk_type != ChunkType::Text {
484 return true;
485 }
486
487 let start = chunk.span.byte_start;
488 let end = chunk.span.byte_end;
489 !containers
490 .iter()
491 .any(|(c_start, c_end)| *c_start <= start && end <= *c_end)
492 })
493 .collect()
494}
495
496fn merge_cpp_template_prefix_chunks(chunks: Vec<Chunk>, text: &str) -> Vec<Chunk> {
497 if chunks.len() < 2 {
498 return chunks;
499 }
500
501 let mut merged = Vec::with_capacity(chunks.len());
502 let mut idx = 0;
503
504 while idx < chunks.len() {
505 if idx + 1 < chunks.len() && is_template_prefix_chunk(&chunks[idx]) {
506 let template_chunk = &chunks[idx];
507 let mut next_chunk = chunks[idx + 1].clone();
508
509 if template_chunk.span.byte_end == next_chunk.span.byte_start
510 && template_chunk.span.byte_start < next_chunk.span.byte_end
511 && next_chunk.span.byte_end <= text.len()
512 {
513 let new_start = template_chunk.span.byte_start;
514 let new_end = next_chunk.span.byte_end;
515
516 if let Some(new_text) = text.get(new_start..new_end) {
517 let (line_start, line_end) = line_range_for_span(text, new_start, new_end);
518
519 next_chunk.span.byte_start = new_start;
520 next_chunk.span.line_start = line_start;
521 next_chunk.span.line_end = line_end;
522 next_chunk.text = new_text.to_string();
523 next_chunk.metadata = next_chunk.metadata.with_updated_text(new_text);
524
525 merged.push(next_chunk);
526 idx += 2;
527 continue;
528 }
529 }
530 }
531
532 merged.push(chunks[idx].clone());
533 idx += 1;
534 }
535
536 merged
537}
538
539fn is_template_prefix_chunk(chunk: &Chunk) -> bool {
540 if chunk.chunk_type != ChunkType::Text {
541 return false;
542 }
543
544 let mut has_template = false;
545 for line in chunk.text.lines() {
546 let trimmed = line.trim();
547 if trimmed.is_empty() {
548 continue;
549 }
550 if trimmed.starts_with("template <") || trimmed.starts_with("template<") {
551 has_template = true;
552 continue;
553 }
554 return false;
555 }
556
557 has_template
558}
559
/// Computes the 1-based `(line_start, line_end)` pair for a byte range of `text`.
///
/// `line_start` is one more than the number of `'\n'` bytes before `byte_start`.
/// `line_end` is the number of `'\n'` bytes before `byte_end`, but never less than
/// `line_start`.
///
/// # Panics
///
/// Panics if either offset is out of bounds or not on a UTF-8 character boundary.
fn line_range_for_span(text: &str, byte_start: usize, byte_end: usize) -> (usize, usize) {
    let newlines_before = |offset: usize| text[..offset].matches('\n').count();

    let first_line = newlines_before(byte_start) + 1;
    let last_line = newlines_before(byte_end).max(first_line);

    (first_line, last_line)
}
571
572fn fill_gaps(mut chunks: Vec<Chunk>, text: &str) -> Vec<Chunk> {
576 if chunks.is_empty() {
577 return chunks;
578 }
579
580 chunks.sort_by_key(|c| c.span.byte_start);
582
583 let mut result = Vec::new();
584 let mut last_end = 0;
585
586 let mut gaps = Vec::new();
588
589 for chunk in &chunks {
590 if last_end < chunk.span.byte_start {
591 let gap_start = last_end;
593 let gap_text = &text[gap_start..chunk.span.byte_start];
594
595 let mut current_byte = gap_start;
597 let mut segment_start = gap_start;
598
599 for line in gap_text.split('\n') {
600 let line_start_in_gap = current_byte - gap_start;
601 let _line_end_in_gap = line_start_in_gap + line.len();
602
603 if line.trim().is_empty() {
604 if segment_start < current_byte {
606 let segment_text = &text[segment_start..current_byte];
607 if !segment_text.trim().is_empty() {
608 gaps.push((segment_start, current_byte));
609 }
610 }
611 segment_start = current_byte + line.len() + 1;
613 }
614
615 current_byte += line.len() + 1; }
617
618 if segment_start < chunk.span.byte_start {
620 let remaining = &text[segment_start..chunk.span.byte_start];
621 if !remaining.trim().is_empty() {
622 gaps.push((segment_start, chunk.span.byte_start));
623 }
624 }
625 }
626 last_end = last_end.max(chunk.span.byte_end);
627 }
628
629 if last_end < text.len() {
631 let gap_text = &text[last_end..];
632 if !gap_text.trim().is_empty() {
633 gaps.push((last_end, text.len()));
634 }
635 }
636
637 let combined_gaps = gaps;
638
639 let mut gap_idx = 0;
641
642 for chunk in chunks {
643 while gap_idx < combined_gaps.len() && combined_gaps[gap_idx].1 <= chunk.span.byte_start {
645 let (gap_start, gap_end) = combined_gaps[gap_idx];
646 let gap_text = &text[gap_start..gap_end];
647
648 let line_start = text[..gap_start].matches('\n').count() + 1;
650 let newlines_up_to_end = text[..gap_end].matches('\n').count();
653 let line_end = if newlines_up_to_end >= line_start - 1 {
654 newlines_up_to_end.max(line_start)
655 } else {
656 line_start
657 };
658
659 let gap_chunk = Chunk {
660 text: gap_text.to_string(),
661 span: Span {
662 byte_start: gap_start,
663 byte_end: gap_end,
664 line_start,
665 line_end,
666 },
667 chunk_type: ChunkType::Text,
668 metadata: ChunkMetadata::from_text(gap_text),
669 stride_info: None,
670 };
671 result.push(gap_chunk);
672 gap_idx += 1;
673 }
674
675 result.push(chunk.clone());
676 }
677
678 while gap_idx < combined_gaps.len() {
680 let (gap_start, gap_end) = combined_gaps[gap_idx];
681 let gap_text = &text[gap_start..gap_end];
682
683 let line_start = text[..gap_start].matches('\n').count() + 1;
685 let newlines_up_to_end = text[..gap_end].matches('\n').count();
687 let line_end = if newlines_up_to_end >= line_start - 1 {
688 newlines_up_to_end.max(line_start)
689 } else {
690 line_start
691 };
692
693 let gap_chunk = Chunk {
694 text: gap_text.to_string(),
695 span: Span {
696 byte_start: gap_start,
697 byte_end: gap_end,
698 line_start,
699 line_end,
700 },
701 chunk_type: ChunkType::Text,
702 metadata: ChunkMetadata::from_text(gap_text),
703 stride_info: None,
704 };
705 result.push(gap_chunk);
706 gap_idx += 1;
707 }
708
709 result
710}
711
712fn merge_haskell_functions(chunks: Vec<Chunk>, source: &str) -> Vec<Chunk> {
714 if chunks.is_empty() {
715 return chunks;
716 }
717
718 let mut merged = Vec::new();
719 let mut i = 0;
720
721 while i < chunks.len() {
722 let chunk = &chunks[i];
723
724 let trimmed = chunk.text.trim();
726 if trimmed.is_empty()
727 || trimmed.starts_with("--")
728 || trimmed.starts_with("{-")
729 || !chunk.text.contains(|c: char| c.is_alphanumeric())
730 {
731 i += 1;
732 continue;
733 }
734
735 let is_signature = chunk.text.contains("::");
738 let function_name = if is_signature {
739 chunk
741 .text
742 .split("::")
743 .next()
744 .and_then(|s| s.split_whitespace().next())
745 .map(std::string::ToString::to_string)
746 } else {
747 extract_haskell_function_name(&chunk.text)
748 };
749
750 if function_name.is_none() {
751 merged.push(chunk.clone());
753 i += 1;
754 continue;
755 }
756
757 let name = function_name.unwrap();
758 let group_start = chunk.span.byte_start;
759 let mut group_end = chunk.span.byte_end;
760 let line_start = chunk.span.line_start;
761 let mut line_end = chunk.span.line_end;
762 let mut trailing_trivia = chunk.metadata.trailing_trivia.clone();
763
764 let mut j = i + 1;
766 while j < chunks.len() {
767 let next_chunk = &chunks[j];
768
769 let next_trimmed = next_chunk.text.trim();
771 if next_trimmed.starts_with("--") || next_trimmed.starts_with("{-") {
772 j += 1;
773 continue;
774 }
775
776 let next_is_signature = next_chunk.text.contains("::");
777 let next_name = if next_is_signature {
778 next_chunk
779 .text
780 .split("::")
781 .next()
782 .and_then(|s| s.split_whitespace().next())
783 .map(std::string::ToString::to_string)
784 } else {
785 extract_haskell_function_name(&next_chunk.text)
786 };
787
788 if next_name == Some(name.clone()) {
789 group_end = next_chunk.span.byte_end;
791 line_end = next_chunk.span.line_end;
792 trailing_trivia = next_chunk.metadata.trailing_trivia.clone();
793 j += 1;
794 } else {
795 break;
796 }
797 }
798
799 let merged_text = source.get(group_start..group_end).unwrap_or("").to_string();
801 let mut metadata = chunk.metadata.with_updated_text(&merged_text);
802 metadata.trailing_trivia = trailing_trivia;
803
804 merged.push(Chunk {
805 span: Span {
806 byte_start: group_start,
807 byte_end: group_end,
808 line_start,
809 line_end,
810 },
811 text: merged_text,
812 chunk_type: ChunkType::Function,
813 stride_info: None,
814 metadata,
815 });
816
817 i = j; }
819
820 merged
821}
822
/// Extracts the leading function name from a Haskell definition.
///
/// Takes the first whitespace-separated word and strips trailing characters that are
/// not alphanumeric, `_` or `'`. The word is accepted only if it starts with a
/// lowercase letter or `_`; otherwise, or when nothing is left, the result is `None`.
fn extract_haskell_function_name(text: &str) -> Option<String> {
    let is_identifier_char = |c: char| c.is_alphanumeric() || c == '_' || c == '\'';

    let candidate = text
        .split_whitespace()
        .next()?
        .trim_end_matches(|c: char| !is_identifier_char(c));

    // `?` also rejects a candidate that became empty after trimming.
    let leading = candidate.chars().next()?;
    (leading.is_lowercase() || leading == '_').then(|| candidate.to_string())
}
847
/// Model-aware entry point for tree-sitter chunking.
///
/// `_model_name` is accepted for signature parity with the generic strategy but is
/// currently ignored; the call is forwarded unchanged to [`chunk_language`].
fn chunk_language_with_model(
    text: &str,
    language: ParseableLanguage,
    _model_name: Option<&str>,
) -> Result<Vec<Chunk>> {
    chunk_language(text, language)
}
858
859fn extract_code_chunks(
860 cursor: &mut tree_sitter::TreeCursor,
861 source: &str,
862 chunks: &mut Vec<Chunk>,
863 language: ParseableLanguage,
864) {
865 let node = cursor.node();
866
867 let should_skip = if language == ParseableLanguage::Haskell && node.kind() == "function" {
870 let mut current = node.parent();
872 while let Some(parent) = current {
873 if parent.kind() == "signature" {
874 return; }
876 current = parent.parent();
877 }
878 false
879 } else {
880 false
881 };
882
883 if !should_skip
884 && let Some(initial_chunk_type) = chunk_type_for_node(language, &node)
885 && let Some(chunk) = build_chunk(node, source, initial_chunk_type, language)
886 {
887 let is_duplicate = chunks.iter().any(|existing| {
888 existing.span.byte_start == chunk.span.byte_start
889 && existing.span.byte_end == chunk.span.byte_end
890 });
891
892 if !is_duplicate {
893 chunks.push(chunk);
894 }
895 }
896
897 let should_recurse = !(language == ParseableLanguage::Haskell && node.kind() == "signature");
899
900 if should_recurse && cursor.goto_first_child() {
901 loop {
902 extract_code_chunks(cursor, source, chunks, language);
903 if !cursor.goto_next_sibling() {
904 break;
905 }
906 }
907 cursor.goto_parent();
908 }
909}
910
911fn chunk_type_for_node(
912 language: ParseableLanguage,
913 node: &tree_sitter::Node<'_>,
914) -> Option<ChunkType> {
915 let kind = node.kind();
916
917 let supported = match language {
918 ParseableLanguage::Python => matches!(kind, "function_definition" | "class_definition"),
919 ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => matches!(
920 kind,
921 "function_declaration" | "class_declaration" | "method_definition" | "arrow_function"
922 ),
923 ParseableLanguage::Haskell => matches!(
924 kind,
925 "function" | "signature" | "data_type"
928 | "newtype"
929 | "type_synonym"
930 | "type_family"
931 | "class"
932 | "instance"
933 ),
934 ParseableLanguage::Rust => matches!(
935 kind,
936 "function_item" | "impl_item" | "struct_item" | "enum_item" | "trait_item" | "mod_item"
937 ),
938 ParseableLanguage::Ruby => {
939 matches!(kind, "method" | "class" | "module" | "singleton_method")
940 }
941 ParseableLanguage::Go => matches!(
942 kind,
943 "function_declaration"
944 | "method_declaration"
945 | "type_declaration"
946 | "var_declaration"
947 | "const_declaration"
948 ),
949 ParseableLanguage::C => matches!(
950 kind,
951 "function_definition"
952 | "struct_specifier"
953 | "enum_specifier"
954 | "union_specifier"
955 | "type_definition"
956 | "declaration"
957 | "preproc_function_def"
958 | "preproc_def"
959 ),
960 ParseableLanguage::Cpp => matches!(
961 kind,
962 "function_definition"
963 | "class_specifier"
964 | "struct_specifier"
965 | "enum_specifier"
966 | "union_specifier"
967 | "namespace_definition"
968 | "template_declaration"
969 | "type_definition"
970 | "alias_declaration"
971 | "declaration"
972 | "preproc_function_def"
973 | "preproc_def"
974 ),
975 ParseableLanguage::CSharp => matches!(
976 kind,
977 "method_declaration"
978 | "class_declaration"
979 | "interface_declaration"
980 | "variable_declaration"
981 ),
982 ParseableLanguage::Dart => matches!(
983 kind,
984 "class_definition"
985 | "class_declaration"
986 | "mixin_declaration"
987 | "enum_declaration"
988 | "function_declaration"
989 | "method_declaration"
990 | "constructor_declaration"
991 | "variable_declaration"
992 | "local_variable_declaration"
993 | "lambda_expression"
994 | "class_member_definition"
995 ),
996 ParseableLanguage::Zig => matches!(
997 kind,
998 "function_declaration"
999 | "test_declaration"
1000 | "variable_declaration"
1001 | "struct_declaration"
1002 | "enum_declaration"
1003 | "union_declaration"
1004 | "opaque_declaration"
1005 | "error_set_declaration"
1006 | "comptime_declaration"
1007 ),
1008 ParseableLanguage::Elixir => matches!(kind, "call" | "do_block"),
1011 ParseableLanguage::Markdown => matches!(
1012 kind,
1013 "atx_heading"
1014 | "setext_heading"
1015 | "heading"
1016 | "section"
1017 | "fenced_code_block"
1018 | "indented_code_block"
1019 | "block_quote"
1020 | "list"
1021 | "list_item"
1022 | "paragraph"
1023 | "thematic_break"
1024 ),
1025 };
1026
1027 if !supported {
1028 return None;
1029 }
1030
1031 match language {
1032 ParseableLanguage::Go
1033 if matches!(node.kind(), "var_declaration" | "const_declaration")
1034 && node.parent().is_some_and(|p| p.kind() == "block") =>
1035 {
1036 return None;
1037 }
1038 ParseableLanguage::CSharp if node.kind() == "variable_declaration" => {
1039 if !is_csharp_field_like(*node) {
1040 return None;
1041 }
1042 }
1043 _ => {}
1044 }
1045
1046 Some(classify_chunk_kind(kind))
1047}
1048
1049fn classify_chunk_kind(kind: &str) -> ChunkType {
1050 match kind {
1051 "function_definition"
1052 | "function_declaration"
1053 | "arrow_function"
1054 | "function"
1055 | "function_item"
1056 | "def"
1057 | "defp"
1058 | "defn"
1059 | "defn-"
1060 | "method"
1061 | "singleton_method"
1062 | "preproc_function_def" => ChunkType::Function,
1063 "signature" => ChunkType::Function, "class_definition"
1065 | "class_declaration"
1066 | "instance_declaration"
1067 | "class"
1068 | "instance"
1069 | "struct_item"
1070 | "enum_item"
1071 | "class_specifier"
1072 | "struct_specifier"
1073 | "enum_specifier"
1074 | "union_specifier"
1075 | "defstruct"
1076 | "defrecord"
1077 | "deftype"
1078 | "type_declaration"
1079 | "struct_declaration"
1080 | "enum_declaration"
1081 | "union_declaration"
1082 | "opaque_declaration"
1083 | "error_set_declaration" => ChunkType::Class,
1084 "method_definition" | "method_declaration" | "defmacro" => ChunkType::Method,
1085 "data_type"
1086 | "newtype"
1087 | "type_synonym"
1088 | "type_family"
1089 | "impl_item"
1090 | "trait_item"
1091 | "mod_item"
1092 | "namespace_definition"
1093 | "defmodule"
1094 | "module"
1095 | "defprotocol"
1096 | "interface_declaration"
1097 | "ns"
1098 | "var_declaration"
1099 | "const_declaration"
1100 | "variable_declaration"
1101 | "test_declaration"
1102 | "comptime_declaration"
1103 | "atx_heading"
1104 | "setext_heading"
1105 | "heading"
1106 | "section" => ChunkType::Module,
1107 _ => ChunkType::Text,
1108 }
1109}
1110
1111pub(crate) fn build_chunk(
1112 node: tree_sitter::Node<'_>,
1113 source: &str,
1114 initial_type: ChunkType,
1115 language: ParseableLanguage,
1116) -> Option<Chunk> {
1117 let target_node = adjust_node_for_language(node, language);
1118
1119 if matches!(language, ParseableLanguage::C | ParseableLanguage::Cpp)
1120 && matches!(initial_type, ChunkType::Class)
1121 && matches!(
1122 target_node.kind(),
1123 "struct_specifier" | "union_specifier" | "enum_specifier"
1124 )
1125 && !c_cpp_type_has_body_node(target_node)
1126 {
1127 return None;
1128 }
1129 let (byte_start, start_row, leading_segments) =
1130 extend_with_leading_trivia(target_node, language, source);
1131 let trailing_segments = collect_trailing_trivia(target_node, language, source);
1132
1133 let byte_end = target_node.end_byte();
1134 let end_pos = target_node.end_position();
1135
1136 if byte_start >= byte_end || byte_end > source.len() {
1137 return None;
1138 }
1139
1140 let chunk_type = adjust_chunk_type_for_context(target_node, initial_type, language);
1141 let mut text = source.get(byte_start..byte_end)?.to_string();
1142 if matches!(language, ParseableLanguage::C | ParseableLanguage::Cpp)
1143 && chunk_type == ChunkType::Class
1144 {
1145 text = strip_method_bodies_in_class_text(target_node, source, byte_start, byte_end);
1146 }
1147
1148 if text.trim().is_empty() {
1149 return None;
1150 }
1151 let ancestry = collect_ancestry(target_node, language, source);
1152 let leading_trivia = segments_to_strings(&leading_segments, source);
1153 let trailing_trivia = segments_to_strings(&trailing_segments, source);
1154 let mut metadata =
1155 ChunkMetadata::from_context(&text, ancestry, leading_trivia, trailing_trivia);
1156 if matches!(language, ParseableLanguage::C | ParseableLanguage::Cpp)
1157 && matches!(chunk_type, ChunkType::Function | ChunkType::Method)
1158 && let Some(full_name) = c_cpp_function_breadcrumb(target_node, language, source)
1159 {
1160 metadata.breadcrumb = Some(full_name);
1161 }
1162
1163 Some(Chunk {
1164 span: Span {
1165 byte_start,
1166 byte_end,
1167 line_start: start_row + 1,
1168 line_end: end_pos.row + 1,
1169 },
1170 text,
1171 chunk_type,
1172 stride_info: None,
1173 metadata,
1174 })
1175}
1176
1177fn c_cpp_type_has_body_node(node: tree_sitter::Node<'_>) -> bool {
1178 let mut cursor = node.walk();
1179
1180 match node.kind() {
1181 "struct_specifier" | "union_specifier" => node
1182 .children(&mut cursor)
1183 .any(|child| child.kind() == "field_declaration_list"),
1184 "enum_specifier" => node
1185 .children(&mut cursor)
1186 .any(|child| child.kind() == "enumerator_list"),
1187 _ => false,
1188 }
1189}
1190
1191fn c_cpp_function_breadcrumb(
1192 node: tree_sitter::Node<'_>,
1193 language: ParseableLanguage,
1194 source: &str,
1195) -> Option<String> {
1196 let name = display_name_for_node(node, language, source, ChunkType::Function)?;
1197 let context = collect_c_cpp_context_names(node, language, source);
1198 let context_path = context.join("::");
1199
1200 if name.contains("::") {
1201 if context_path.is_empty() || name.starts_with(&format!("{}::", context_path)) {
1202 Some(name)
1203 } else {
1204 Some(format!("{}::{}", context_path, name))
1205 }
1206 } else if context_path.is_empty() {
1207 Some(name)
1208 } else {
1209 Some(format!("{}::{}", context_path, name))
1210 }
1211}
1212
1213fn collect_c_cpp_context_names(
1214 mut node: tree_sitter::Node<'_>,
1215 language: ParseableLanguage,
1216 source: &str,
1217) -> Vec<String> {
1218 let mut parts = Vec::new();
1219
1220 while let Some(parent) = node.parent() {
1221 let kind = parent.kind();
1222 let include = match language {
1223 ParseableLanguage::Cpp => matches!(
1224 kind,
1225 "namespace_definition" | "class_specifier" | "struct_specifier"
1226 ),
1227 ParseableLanguage::C => matches!(kind, "struct_specifier"),
1228 _ => false,
1229 };
1230
1231 if include
1232 && let Some(name) = display_name_for_node(parent, language, source, ChunkType::Class)
1233 {
1234 parts.push(name);
1235 }
1236
1237 node = parent;
1238 }
1239
1240 parts.reverse();
1241 parts
1242}
1243
1244fn strip_method_bodies_in_class_text(
1245 class_node: tree_sitter::Node<'_>,
1246 source: &str,
1247 byte_start: usize,
1248 byte_end: usize,
1249) -> String {
1250 let mut replacements: Vec<(usize, usize, String)> = Vec::new();
1251 let mut stack = vec![class_node];
1252
1253 while let Some(node) = stack.pop() {
1254 if is_method_like_node(node.kind())
1255 && let Some(body) = find_method_body_node(node)
1256 {
1257 let start = body.start_byte();
1258 let end = body.end_byte();
1259 if start >= byte_start && end <= byte_end && start < end {
1260 let replacement = method_body_placeholder(body, source);
1261 replacements.push((start, end, replacement));
1262 }
1263 }
1264
1265 let child_count = node.child_count();
1266 for idx in (0..child_count).rev() {
1267 if let Some(child) = node.child(idx) {
1268 stack.push(child);
1269 }
1270 }
1271 }
1272
1273 if replacements.is_empty() {
1274 return source
1275 .get(byte_start..byte_end)
1276 .unwrap_or_default()
1277 .to_string();
1278 }
1279
1280 replacements.sort_by(|a, b| b.0.cmp(&a.0));
1281 let mut text = source
1282 .get(byte_start..byte_end)
1283 .unwrap_or_default()
1284 .to_string();
1285
1286 for (start, end, replacement) in replacements {
1287 if start < byte_start || end > byte_end || end <= start {
1288 continue;
1289 }
1290 let local_start = start - byte_start;
1291 let local_end = end - byte_start;
1292 if local_end <= text.len() {
1293 text.replace_range(local_start..local_end, &replacement);
1294 }
1295 }
1296
1297 text
1298}
1299
/// Reports whether `kind` is one of the node kinds treated as a function or method
/// definition when stripping method bodies.
fn is_method_like_node(kind: &str) -> bool {
    const METHOD_LIKE_KINDS: [&str; 8] = [
        "function_definition",
        "method_definition",
        "method_declaration",
        "constructor_declaration",
        "destructor_declaration",
        "function_item",
        "method",
        "singleton_method",
    ];

    METHOD_LIKE_KINDS.contains(&kind)
}
1313
1314fn find_method_body_node(node: tree_sitter::Node<'_>) -> Option<tree_sitter::Node<'_>> {
1315 let body_kinds = [
1316 "compound_statement",
1317 "statement_block",
1318 "block",
1319 "body",
1320 "body_statement",
1321 "declaration_list",
1322 ];
1323
1324 for idx in 0..node.child_count() {
1325 if let Some(child) = node.child(idx)
1326 && body_kinds.contains(&child.kind())
1327 {
1328 return Some(child);
1329 }
1330 }
1331
1332 None
1333}
1334
1335fn method_body_placeholder(_body: tree_sitter::Node<'_>, _source: &str) -> String {
1336 ";".to_string()
1337}
1338
1339fn adjust_node_for_language(
1340 node: tree_sitter::Node<'_>,
1341 language: ParseableLanguage,
1342) -> tree_sitter::Node<'_> {
1343 match language {
1344 ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
1345 if node.kind() == "arrow_function" {
1346 return expand_arrow_function_context(node);
1347 }
1348 node
1349 }
1350 _ => node,
1351 }
1352}
1353
1354fn expand_arrow_function_context(mut node: tree_sitter::Node<'_>) -> tree_sitter::Node<'_> {
1355 const PARENTS: &[&str] = &[
1356 "parenthesized_expression",
1357 "variable_declarator",
1358 "variable_declaration",
1359 "lexical_declaration",
1360 "assignment_expression",
1361 "expression_statement",
1362 "public_field_definition",
1363 "export_statement",
1364 ];
1365
1366 while let Some(parent) = node.parent() {
1367 let kind = parent.kind();
1368 if PARENTS.contains(&kind) {
1369 node = parent;
1370 continue;
1371 }
1372 break;
1373 }
1374
1375 node
1376}
1377
/// Byte range of one piece of trivia (such as a comment) attached to a chunk.
#[derive(Clone, Copy)]
struct TriviaSegment {
    /// Byte offset in the source where the trivia node starts.
    start_byte: usize,
    /// Byte offset in the source where the trivia node ends.
    end_byte: usize,
}
1383
1384fn extend_with_leading_trivia(
1385 node: tree_sitter::Node<'_>,
1386 language: ParseableLanguage,
1387 source: &str,
1388) -> (usize, usize, Vec<TriviaSegment>) {
1389 let mut start_byte = node.start_byte();
1390 let mut start_row = node.start_position().row;
1391 let mut current = node;
1392 let mut segments = Vec::new();
1393
1394 while let Some(prev) = current.prev_sibling() {
1395 if should_attach_leading_trivia(language, &prev)
1396 && only_whitespace_between(source, prev.end_byte(), start_byte)
1397 {
1398 start_byte = prev.start_byte();
1399 start_row = prev.start_position().row;
1400 segments.push(TriviaSegment {
1401 start_byte: prev.start_byte(),
1402 end_byte: prev.end_byte(),
1403 });
1404 current = prev;
1405 continue;
1406 }
1407 break;
1408 }
1409
1410 segments.reverse();
1411 (start_byte, start_row, segments)
1412}
1413
1414fn should_attach_leading_trivia(language: ParseableLanguage, node: &tree_sitter::Node<'_>) -> bool {
1415 let kind = node.kind();
1416 if kind == "comment" {
1417 return true;
1418 }
1419
1420 match language {
1421 ParseableLanguage::Rust => {
1422 matches!(kind, "line_comment" | "block_comment" | "attribute_item")
1423 }
1424 ParseableLanguage::Python => kind == "decorator",
1425 ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => kind == "decorator",
1426 ParseableLanguage::C | ParseableLanguage::Cpp | ParseableLanguage::Markdown => {
1427 kind == "comment"
1428 }
1429 ParseableLanguage::CSharp => matches!(kind, "attribute_list" | "attribute"),
1430 _ => false,
1431 }
1432}
1433
1434fn collect_trailing_trivia(
1435 node: tree_sitter::Node<'_>,
1436 language: ParseableLanguage,
1437 source: &str,
1438) -> Vec<TriviaSegment> {
1439 let mut segments = Vec::new();
1440 let mut current = node;
1441 let mut previous_end = node.end_byte();
1442
1443 while let Some(next) = current.next_sibling() {
1444 if should_attach_trailing_trivia(language, &next)
1445 && only_whitespace_between(source, previous_end, next.start_byte())
1446 {
1447 segments.push(TriviaSegment {
1448 start_byte: next.start_byte(),
1449 end_byte: next.end_byte(),
1450 });
1451 previous_end = next.end_byte();
1452 current = next;
1453 continue;
1454 }
1455 break;
1456 }
1457
1458 segments
1459}
1460
1461fn should_attach_trailing_trivia(
1462 _language: ParseableLanguage,
1463 node: &tree_sitter::Node<'_>,
1464) -> bool {
1465 node.kind() == "comment"
1466}
1467
1468fn segments_to_strings(segments: &[TriviaSegment], source: &str) -> Vec<String> {
1469 let mut result = Vec::new();
1470
1471 for segment in segments {
1472 if let Some(text) = source
1473 .get(segment.start_byte..segment.end_byte)
1474 .map(std::string::ToString::to_string)
1475 {
1476 result.push(text);
1477 }
1478 }
1479
1480 result
1481}
1482
1483fn collect_ancestry(
1484 mut node: tree_sitter::Node<'_>,
1485 language: ParseableLanguage,
1486 source: &str,
1487) -> Vec<String> {
1488 if language == ParseableLanguage::Markdown {
1489 return markdown_heading_ancestry(node, source);
1490 }
1491
1492 let mut parts = Vec::new();
1493
1494 while let Some(parent) = node.parent() {
1495 if let Some(parent_chunk_type) = chunk_type_for_node(language, &parent)
1496 && let Some(name) = display_name_for_node(parent, language, source, parent_chunk_type)
1497 {
1498 parts.push(name);
1499 }
1500 node = parent;
1501 }
1502
1503 parts.reverse();
1504 parts
1505}
1506
1507fn display_name_for_node(
1508 node: tree_sitter::Node<'_>,
1509 language: ParseableLanguage,
1510 source: &str,
1511 chunk_type: ChunkType,
1512) -> Option<String> {
1513 if let Some(name_node) = node.child_by_field_name("name") {
1514 return text_for_node(name_node, source);
1515 }
1516
1517 match language {
1518 ParseableLanguage::Rust => rust_display_name(node, source, chunk_type),
1519 ParseableLanguage::Python => find_identifier(node, source, &["identifier"]),
1520 ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => find_identifier(
1521 node,
1522 source,
1523 &["identifier", "type_identifier", "property_identifier"],
1524 ),
1525 ParseableLanguage::Haskell => {
1526 find_identifier(node, source, &["identifier", "type_identifier", "variable"])
1527 .or_else(|| first_word_of_node(node, source))
1528 }
1529 ParseableLanguage::Ruby => find_identifier(node, source, &["identifier"]),
1530 ParseableLanguage::Go => find_identifier(node, source, &["identifier", "type_identifier"]),
1531 ParseableLanguage::C => c_display_name(node, source, chunk_type),
1532 ParseableLanguage::Cpp => cpp_display_name(node, source, chunk_type),
1533 ParseableLanguage::CSharp => find_identifier(node, source, &["identifier"]),
1534 ParseableLanguage::Zig => find_identifier(node, source, &["identifier"]),
1535
1536 ParseableLanguage::Markdown => markdown_display_name(node, source, chunk_type),
1537
1538 ParseableLanguage::Dart => {
1539 find_identifier(node, source, &["identifier", "type_identifier"])
1540 }
1541 ParseableLanguage::Elixir => {
1542 find_identifier(node, source, &["alias", "identifier", "atom"])
1544 }
1545 }
1546}
1547
1548fn markdown_display_name(
1549 node: tree_sitter::Node<'_>,
1550 source: &str,
1551 _chunk_type: ChunkType,
1552) -> Option<String> {
1553 if node.kind() == "section" {
1554 return markdown_section_heading(node, source);
1555 }
1556
1557 if markdown_heading_kind(node.kind()) {
1558 return markdown_heading_text(node, source);
1559 }
1560
1561 None
1562}
1563
1564fn markdown_section_heading(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
1565 let mut cursor = node.walk();
1566 for child in node.children(&mut cursor) {
1567 if markdown_heading_kind(child.kind()) {
1568 return markdown_heading_text(child, source);
1569 }
1570 }
1571 None
1572}
1573
/// Reports whether `kind` is one of the node kinds treated as a Markdown heading.
fn markdown_heading_kind(kind: &str) -> bool {
    ["atx_heading", "setext_heading", "heading"].contains(&kind)
}
1577
1578fn markdown_heading_text(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
1579 let text = text_for_node(node, source)?;
1580 let mut lines = text.lines();
1581 let first_line = lines.next().unwrap_or("");
1582
1583 if let Some((_, heading)) = parse_atx_heading_line(first_line) {
1584 return Some(heading);
1585 }
1586
1587 let second_line = lines.next().unwrap_or("");
1588 if parse_setext_level(second_line).is_some() {
1589 let trimmed = first_line.trim();
1590 if !trimmed.is_empty() {
1591 return Some(trimmed.to_string());
1592 }
1593 }
1594
1595 let trimmed = first_line.trim();
1596 if trimmed.is_empty() {
1597 None
1598 } else {
1599 Some(trimmed.to_string())
1600 }
1601}
1602
1603fn markdown_heading_ancestry(node: tree_sitter::Node<'_>, source: &str) -> Vec<String> {
1604 let mut target_row = node.start_position().row;
1605 if node.kind() == "section" || markdown_heading_kind(node.kind()) {
1606 target_row = target_row.saturating_sub(1);
1607 }
1608 let lines: Vec<&str> = source.lines().collect();
1609 let mut stack: Vec<(usize, String)> = Vec::new();
1610 let mut i = 0;
1611
1612 while i < lines.len() && i <= target_row {
1613 let line = lines[i];
1614
1615 if let Some((level, heading)) = parse_atx_heading_line(line) {
1616 update_markdown_heading_stack(&mut stack, level, heading);
1617 i += 1;
1618 continue;
1619 }
1620
1621 if i + 1 < lines.len() && i < target_row {
1622 let underline = lines[i + 1];
1623 if let Some(level) = parse_setext_level(underline) {
1624 let heading_text = line.trim();
1625 if !heading_text.is_empty() {
1626 update_markdown_heading_stack(&mut stack, level, heading_text.to_string());
1627 }
1628 i += 2;
1629 continue;
1630 }
1631 }
1632
1633 i += 1;
1634 }
1635
1636 stack.into_iter().map(|(_, heading)| heading).collect()
1637}
1638
/// Opens a heading of the given `level` on the heading stack.
///
/// Trailing entries whose level is greater than or equal to `level` are dropped
/// first, then the new `(level, text)` entry is pushed.
fn update_markdown_heading_stack(stack: &mut Vec<(usize, String)>, level: usize, text: String) {
    // Keep everything up to and including the last entry with a strictly smaller level.
    let keep = stack
        .iter()
        .rposition(|(existing_level, _)| *existing_level < level)
        .map_or(0, |idx| idx + 1);

    stack.truncate(keep);
    stack.push((level, text));
}
1648
/// Parses a single line as an ATX heading (`# Title`, `## Title ##`, ...).
///
/// Returns the heading level (1 to 6) and the heading text, or `None` when the line
/// is not an ATX heading or the heading has no text.
///
/// Following the CommonMark rules for ATX headings:
/// - the opening run must consist of 1 to 6 `#` characters,
/// - the opening run must be followed by whitespace (`#tag` is not a heading),
/// - a closing run of `#` is only removed when it is separated from the text by
///   whitespace, so `# C#` keeps its title `C#`.
///
/// Leading indentation of any width is tolerated.
fn parse_atx_heading_line(line: &str) -> Option<(usize, String)> {
    let trimmed = line.trim_start();

    // `#` is a single byte, so the count is also the byte length of the opening run.
    let level = trimmed.bytes().take_while(|byte| *byte == b'#').count();
    if !(1..=6).contains(&level) {
        return None;
    }

    let rest = &trimmed[level..];
    if !rest.starts_with(char::is_whitespace) {
        return None;
    }

    let content = rest.trim();
    let without_closer = content.trim_end_matches('#');
    let text = if without_closer.is_empty() || without_closer.ends_with(char::is_whitespace) {
        // Either nothing but `#` remains, or a proper closing sequence was present.
        without_closer.trim_end()
    } else {
        // The trailing `#` characters are glued to the text and belong to it.
        content
    };

    if text.is_empty() {
        None
    } else {
        Some((level, text.to_string()))
    }
}
1668
/// Interprets a line as a setext heading underline.
///
/// After trimming, a line made only of `=` yields level 1 and a line made only of
/// `-` yields level 2. Empty lines and anything else yield `None`.
fn parse_setext_level(line: &str) -> Option<usize> {
    let mut underline = line.trim().chars();

    let marker = underline.next()?;
    let level = match marker {
        '=' => 1,
        '-' => 2,
        _ => return None,
    };

    underline.all(|c| c == marker).then_some(level)
}
1683
1684fn rust_display_name(
1685 node: tree_sitter::Node<'_>,
1686 source: &str,
1687 chunk_type: ChunkType,
1688) -> Option<String> {
1689 match node.kind() {
1690 "impl_item" => {
1691 let mut parts = Vec::new();
1692 if let Some(ty) = node.child_by_field_name("type")
1693 && let Some(text) = text_for_node(ty, source)
1694 {
1695 parts.push(text);
1696 }
1697 if let Some(trait_node) = node.child_by_field_name("trait")
1698 && let Some(text) = text_for_node(trait_node, source)
1699 {
1700 if let Some(last) = parts.first() {
1701 parts[0] = format!("{} (impl {})", last, text.trim());
1702 } else {
1703 parts.push(format!("impl {}", text.trim()));
1704 }
1705 }
1706 if parts.is_empty() {
1707 find_identifier(node, source, &["identifier"])
1708 } else {
1709 Some(parts.remove(0))
1710 }
1711 }
1712 "mod_item" if chunk_type == ChunkType::Module => {
1713 find_identifier(node, source, &["identifier"])
1714 }
1715 _ => find_identifier(node, source, &["identifier", "type_identifier"]),
1716 }
1717}
1718
1719fn c_display_name(
1720 node: tree_sitter::Node<'_>,
1721 source: &str,
1722 _chunk_type: ChunkType,
1723) -> Option<String> {
1724 match node.kind() {
1725 "function_definition" => {
1726 if let Some(declarator) = node.child_by_field_name("declarator") {
1728 return find_identifier_recursive(declarator, source, &["identifier"]);
1729 }
1730 None
1731 }
1732 "struct_specifier" | "enum_specifier" | "union_specifier" => {
1733 find_identifier(node, source, &["type_identifier", "identifier"])
1734 }
1735 "type_definition" => find_identifier(node, source, &["type_identifier", "identifier"]),
1736 "preproc_function_def" | "preproc_def" => find_identifier(node, source, &["identifier"]),
1737 _ => find_identifier(node, source, &["identifier", "type_identifier"]),
1738 }
1739}
1740
1741fn cpp_display_name(
1742 node: tree_sitter::Node<'_>,
1743 source: &str,
1744 _chunk_type: ChunkType,
1745) -> Option<String> {
1746 match node.kind() {
1747 "function_definition" => {
1748 if let Some(declarator) = node.child_by_field_name("declarator") {
1749 return find_identifier_recursive(
1750 declarator,
1751 source,
1752 &[
1753 "identifier",
1754 "field_identifier",
1755 "destructor_name",
1756 "qualified_identifier",
1757 ],
1758 );
1759 }
1760 None
1761 }
1762 "declaration" => {
1763 if let Some(declarator) = node.child_by_field_name("declarator") {
1764 return find_identifier_recursive(
1765 declarator,
1766 source,
1767 &[
1768 "identifier",
1769 "field_identifier",
1770 "destructor_name",
1771 "qualified_identifier",
1772 ],
1773 );
1774 }
1775 find_identifier(node, source, &["identifier", "type_identifier"])
1776 }
1777 "class_specifier" | "struct_specifier" | "enum_specifier" | "union_specifier" => {
1778 find_identifier(node, source, &["type_identifier", "identifier"])
1779 }
1780 "namespace_definition" => {
1781 find_identifier(node, source, &["identifier", "namespace_identifier"])
1782 }
1783 "alias_declaration" | "type_definition" => {
1784 find_identifier(node, source, &["type_identifier", "identifier"])
1785 }
1786 "template_declaration" => {
1787 let mut cursor = node.walk();
1788 for child in node.children(&mut cursor) {
1789 if matches!(
1790 child.kind(),
1791 "class_specifier"
1792 | "struct_specifier"
1793 | "enum_specifier"
1794 | "union_specifier"
1795 | "function_definition"
1796 | "declaration"
1797 | "alias_declaration"
1798 | "type_definition"
1799 | "concept_definition"
1800 ) {
1801 return cpp_display_name(child, source, _chunk_type);
1802 }
1803 }
1804 find_identifier(node, source, &["type_identifier", "identifier"])
1805 }
1806 _ => find_identifier(node, source, &["identifier", "type_identifier"]),
1807 }
1808}
1809
1810fn find_identifier_recursive(
1812 node: tree_sitter::Node<'_>,
1813 source: &str,
1814 candidate_kinds: &[&str],
1815) -> Option<String> {
1816 if candidate_kinds.contains(&node.kind()) {
1817 return text_for_node(node, source).map(|s| s.trim().to_string());
1818 }
1819 let mut cursor = node.walk();
1820 for child in node.children(&mut cursor) {
1821 if let Some(result) = find_identifier_recursive(child, source, candidate_kinds) {
1822 return Some(result);
1823 }
1824 }
1825 None
1826}
1827
1828fn find_identifier(
1829 node: tree_sitter::Node<'_>,
1830 source: &str,
1831 candidate_kinds: &[&str],
1832) -> Option<String> {
1833 let mut cursor = node.walk();
1834 for child in node.children(&mut cursor) {
1835 if candidate_kinds.contains(&child.kind())
1836 && let Some(text) = text_for_node(child, source)
1837 {
1838 return Some(text.trim().to_string());
1839 }
1840 }
1841 None
1842}
1843
1844fn first_word_of_node(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
1845 let text = text_for_node(node, source)?;
1846 text.split_whitespace().next().map(|s| {
1847 s.trim_end_matches(|c: char| !c.is_alphanumeric() && c != '_')
1848 .to_string()
1849 })
1850}
1851
1852fn text_for_node(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
1853 node.utf8_text(source.as_bytes())
1854 .ok()
1855 .map(std::string::ToString::to_string)
1856}
1857
/// Reports whether `source[start..end]` consists solely of whitespace.
///
/// An empty or inverted range, and a range reaching past the end of `source`, are
/// both treated as "only whitespace".
fn only_whitespace_between(source: &str, start: usize, end: usize) -> bool {
    let is_checkable = start < end && end <= source.len();
    if !is_checkable {
        return true;
    }

    !source[start..end].chars().any(|c| !c.is_whitespace())
}
1865
1866fn adjust_chunk_type_for_context(
1867 node: tree_sitter::Node<'_>,
1868 chunk_type: ChunkType,
1869 language: ParseableLanguage,
1870) -> ChunkType {
1871 if chunk_type != ChunkType::Function {
1872 return chunk_type;
1873 }
1874
1875 if is_method_context(node, language) {
1876 ChunkType::Method
1877 } else {
1878 chunk_type
1879 }
1880}
1881
1882fn is_method_context(node: tree_sitter::Node<'_>, language: ParseableLanguage) -> bool {
1883 const PYTHON_CONTAINERS: &[&str] = &["class_definition"];
1884 const TYPESCRIPT_CONTAINERS: &[&str] = &["class_body", "class_declaration"];
1885 const RUBY_CONTAINERS: &[&str] = &["class", "module"];
1886 const RUST_CONTAINERS: &[&str] = &["impl_item", "trait_item"];
1887 const DART_CONTAINERS: &[&str] = &[
1888 "class_definition",
1889 "class_declaration",
1890 "mixin_declaration",
1891 "enum_declaration",
1892 ];
1893
1894 match language {
1895 ParseableLanguage::Python => ancestor_has_kind(node, PYTHON_CONTAINERS),
1896 ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
1897 ancestor_has_kind(node, TYPESCRIPT_CONTAINERS)
1898 }
1899 ParseableLanguage::Ruby => ancestor_has_kind(node, RUBY_CONTAINERS),
1900 ParseableLanguage::Rust => ancestor_has_kind(node, RUST_CONTAINERS),
1901 ParseableLanguage::Go => false,
1902 ParseableLanguage::C => ancestor_has_kind(node, &["struct_specifier"]),
1903 ParseableLanguage::Cpp => ancestor_has_kind(node, &["class_specifier", "struct_specifier"]),
1904 ParseableLanguage::CSharp => false,
1905 ParseableLanguage::Haskell => false,
1906 ParseableLanguage::Zig => false,
1907
1908 ParseableLanguage::Dart => ancestor_has_kind(node, DART_CONTAINERS),
1909
1910 ParseableLanguage::Elixir => false, ParseableLanguage::Markdown => false,
1912 }
1913}
1914
1915fn ancestor_has_kind(node: tree_sitter::Node<'_>, kinds: &[&str]) -> bool {
1916 let mut current = node;
1917 while let Some(parent) = current.parent() {
1918 if kinds.contains(&parent.kind()) {
1919 return true;
1920 }
1921 current = parent;
1922 }
1923 false
1924}
1925
1926fn is_csharp_field_like(node: tree_sitter::Node<'_>) -> bool {
1927 if let Some(parent) = node.parent() {
1928 return matches!(
1929 parent.kind(),
1930 "field_declaration" | "event_field_declaration"
1931 );
1932 }
1933 false
1934}
1935
1936fn apply_striding(chunks: Vec<Chunk>, config: &ChunkConfig) -> Result<Vec<Chunk>> {
1938 let mut result = Vec::new();
1939
1940 for chunk in chunks {
1941 let estimated_tokens = estimate_tokens(&chunk.text);
1942
1943 if estimated_tokens <= config.max_tokens {
1944 result.push(chunk);
1946 } else {
1947 tracing::debug!(
1949 "Chunk with {} tokens exceeds limit of {}, applying striding",
1950 estimated_tokens,
1951 config.max_tokens
1952 );
1953
1954 let strided_chunks = stride_large_chunk(chunk, config)?;
1955 result.extend(strided_chunks);
1956 }
1957 }
1958
1959 Ok(result)
1960}
1961
1962fn stride_large_chunk(chunk: Chunk, config: &ChunkConfig) -> Result<Vec<Chunk>> {
1964 let text = &chunk.text;
1965
1966 if text.is_empty() {
1968 return Ok(vec![chunk]);
1969 }
1970
1971 let char_count = text.chars().count();
1974 let estimated_tokens = estimate_tokens(text);
1975 let chars_per_token = if estimated_tokens == 0 {
1977 4.5 } else {
1979 char_count as f32 / estimated_tokens as f32
1980 };
1981 let window_chars = ((config.max_tokens as f32 * 0.9) * chars_per_token) as usize; let overlap_chars = (config.stride_overlap as f32 * chars_per_token) as usize;
1983 let stride_chars = window_chars.saturating_sub(overlap_chars);
1984
1985 if stride_chars == 0 {
1986 return Err(anyhow::anyhow!("Stride size is too small"));
1987 }
1988
1989 let char_byte_indices: Vec<(usize, char)> = text.char_indices().collect();
1991 let mut strided_chunks = Vec::new();
1994 let original_chunk_id = format!("{}:{}", chunk.span.byte_start, chunk.span.byte_end);
1995 let mut start_char_idx = 0;
1996 let mut stride_index = 0;
1997
1998 let total_strides = if char_count <= window_chars {
2000 1
2001 } else {
2002 ((char_count - overlap_chars) as f32 / stride_chars as f32).ceil() as usize
2003 };
2004
2005 while start_char_idx < char_count {
2006 let end_char_idx = (start_char_idx + window_chars).min(char_count);
2007
2008 let start_byte_pos = char_byte_indices[start_char_idx].0;
2010 let end_byte_pos = if end_char_idx < char_count {
2011 char_byte_indices[end_char_idx].0
2012 } else {
2013 text.len()
2014 };
2015
2016 let stride_text = &text[start_byte_pos..end_byte_pos];
2017
2018 let overlap_start = if stride_index > 0 { overlap_chars } else { 0 };
2020 let overlap_end = if end_char_idx < char_count {
2021 overlap_chars
2022 } else {
2023 0
2024 };
2025
2026 let byte_offset_start = chunk.span.byte_start + start_byte_pos;
2028 let byte_offset_end = chunk.span.byte_start + end_byte_pos;
2029
2030 let text_before_start = &text[..start_byte_pos];
2032 let line_offset_start = text_before_start.lines().count().saturating_sub(1);
2033 let stride_lines = stride_text.lines().count();
2034 let metadata = chunk.metadata.with_updated_text(stride_text);
2035
2036 let stride_chunk = Chunk {
2037 span: Span {
2038 byte_start: byte_offset_start,
2039 byte_end: byte_offset_end,
2040 line_start: chunk.span.line_start + line_offset_start,
2041 line_end: chunk.span.line_start
2043 + line_offset_start
2044 + stride_lines.saturating_sub(1),
2045 },
2046 text: stride_text.to_string(),
2047 chunk_type: chunk.chunk_type.clone(),
2048 stride_info: Some(StrideInfo {
2049 original_chunk_id: original_chunk_id.clone(),
2050 stride_index,
2051 total_strides,
2052 overlap_start,
2053 overlap_end,
2054 }),
2055 metadata,
2056 };
2057
2058 strided_chunks.push(stride_chunk);
2059
2060 if end_char_idx >= char_count {
2062 break;
2063 }
2064
2065 start_char_idx += stride_chars;
2066 stride_index += 1;
2067 }
2068
2069 tracing::debug!(
2070 "Created {} strides from chunk of {} tokens",
2071 strided_chunks.len(),
2072 estimate_tokens(text)
2073 );
2074
2075 Ok(strided_chunks)
2076}
2077
2078fn merge_small_chunks(chunks: Vec<Chunk>, text: &str, target_tokens: usize) -> Vec<Chunk> {
2091 if chunks.is_empty() {
2092 return chunks;
2093 }
2094
2095 let mut result = Vec::new();
2096 let mut current_group: Vec<Chunk> = Vec::new();
2097 let mut current_tokens = 0;
2098
2099 for chunk in chunks {
2100 let chunk_tokens = chunk.metadata.estimated_tokens;
2101
2102 if current_tokens + chunk_tokens > target_tokens {
2103 if !current_group.is_empty() {
2105 result.push(merge_group(¤t_group, text));
2106 current_group.clear();
2107 current_tokens = 0;
2108 }
2109 }
2110
2111 if chunk_tokens > target_tokens {
2113 if !current_group.is_empty() {
2114 result.push(merge_group(¤t_group, text));
2115 current_group.clear();
2116 current_tokens = 0;
2117 }
2118 result.push(chunk);
2119 continue;
2120 }
2121
2122 current_group.push(chunk);
2123 current_tokens += chunk_tokens;
2124 }
2125
2126 if !current_group.is_empty() {
2128 result.push(merge_group(¤t_group, text));
2129 }
2130
2131 result
2132}
2133
2134fn merge_group(group: &[Chunk], text: &str) -> Chunk {
2135 if group.len() == 1 {
2136 return group[0].clone();
2137 }
2138
2139 let first = &group[0];
2140 let last = &group[group.len() - 1];
2141
2142 let byte_start = first.span.byte_start;
2145 let byte_end = last.span.byte_end;
2146 let line_start = first.span.line_start;
2147 let line_end = last.span.line_end;
2148
2149 let chunk_text = if byte_end <= text.len() {
2151 text[byte_start..byte_end].to_string()
2152 } else {
2153 text.get(byte_start..).unwrap_or("").to_string()
2155 };
2156
2157 let metadata = ChunkMetadata::from_text(&chunk_text);
2158
2159 let chunk_type = if group.iter().all(|c| c.chunk_type == first.chunk_type) {
2166 first.chunk_type.clone()
2167 } else {
2168 ChunkType::Text
2169 };
2170
2171 Chunk {
2172 span: Span {
2173 byte_start,
2174 byte_end,
2175 line_start,
2176 line_end,
2177 },
2178 text: chunk_text,
2179 chunk_type,
2180 stride_info: None,
2181 metadata,
2182 }
2183}
2184
2185#[cfg(test)]
2188mod tests {
2189 use super::*;
2190
2191 fn canonicalize_spans(
2192 mut spans: Vec<(usize, usize, ChunkType)>,
2193 ) -> Vec<(usize, usize, ChunkType)> {
2194 fn chunk_type_order(chunk_type: &ChunkType) -> u8 {
2195 match chunk_type {
2196 ChunkType::Text => 0,
2197 ChunkType::Function => 1,
2198 ChunkType::Class => 2,
2199 ChunkType::Method => 3,
2200 ChunkType::Module => 4,
2201 }
2202 }
2203
2204 spans.sort_by(|a, b| {
2205 let order_a = chunk_type_order(&a.2);
2206 let order_b = chunk_type_order(&b.2);
2207 order_a
2208 .cmp(&order_b)
2209 .then_with(|| a.0.cmp(&b.0))
2210 .then_with(|| a.1.cmp(&b.1))
2211 });
2212
2213 let mut result: Vec<(usize, usize, ChunkType)> = Vec::new();
2214 for (start, end, ty) in spans {
2215 if let Some(last) = result.last_mut()
2216 && last.0 == start
2217 && last.2 == ty
2218 {
2219 if end > last.1 {
2220 last.1 = end;
2221 }
2222 continue;
2223 }
2224 result.push((start, end, ty));
2225 }
2226
2227 result
2228 }
2229
2230 fn assert_query_parity(language: ParseableLanguage, source: &str) {
2231 let mut parser = tree_sitter::Parser::new();
2232 let ts_language = tree_sitter_language(language).expect("language");
2233 parser.set_language(&ts_language).expect("set language");
2234 let tree = parser.parse(source, None).expect("parse source");
2235
2236 let query_chunks = query_chunker::chunk_with_queries(language, ts_language, &tree, source)
2237 .expect("query execution")
2238 .expect("queries available");
2239
2240 let mut legacy_chunks = Vec::new();
2241 let mut cursor = tree.walk();
2242 extract_code_chunks(&mut cursor, source, &mut legacy_chunks, language);
2243
2244 let query_spans = canonicalize_spans(
2245 query_chunks
2246 .iter()
2247 .map(|chunk| {
2248 (
2249 chunk.span.byte_start,
2250 chunk.span.byte_end,
2251 chunk.chunk_type.clone(),
2252 )
2253 })
2254 .collect(),
2255 );
2256 let legacy_spans = canonicalize_spans(
2257 legacy_chunks
2258 .iter()
2259 .map(|chunk| {
2260 (
2261 chunk.span.byte_start,
2262 chunk.span.byte_end,
2263 chunk.chunk_type.clone(),
2264 )
2265 })
2266 .collect(),
2267 );
2268
2269 assert_eq!(query_spans, legacy_spans);
2270 }
2271
2272 #[test]
2273 fn test_chunk_generic_byte_offsets() {
2274 let text = "line 1\nline 2\nline 3\nline 4\nline 5";
2276 let chunks = chunk_generic(text).unwrap();
2277
2278 assert!(!chunks.is_empty());
2279
2280 assert_eq!(chunks[0].span.byte_start, 0);
2282
2283 for chunk in &chunks {
2285 let expected_len = chunk.text.len();
2286 let actual_len = chunk.span.byte_end - chunk.span.byte_start;
2287 assert_eq!(actual_len, expected_len);
2288 }
2289 }
2290
2291 #[test]
2292 fn test_chunk_generic_large_file_performance() {
2293 let lines: Vec<String> = (0..1000)
2295 .map(|i| format!("Line {i}: Some content here"))
2296 .collect();
2297 let text = lines.join("\n");
2298
2299 let start = std::time::Instant::now();
2300 let chunks = chunk_generic(&text).unwrap();
2301 let duration = start.elapsed();
2302
2303 assert!(
2305 duration.as_millis() < 100,
2306 "Chunking took too long: {duration:?}"
2307 );
2308 assert!(!chunks.is_empty());
2309
2310 for chunk in &chunks {
2312 assert!(chunk.span.line_start > 0);
2313 assert!(chunk.span.line_end >= chunk.span.line_start);
2314 }
2315 }
2316
2317 #[test]
2318 fn test_chunk_rust() {
2319 let rust_code = r"
2320pub struct Calculator {
2321 memory: f64,
2322}
2323
2324impl Calculator {
2325 pub fn new() -> Self {
2326 Calculator { memory: 0.0 }
2327 }
2328
2329 pub fn add(&mut self, a: f64, b: f64) -> f64 {
2330 a + b
2331 }
2332}
2333
2334fn main() {
2335 let calc = Calculator::new();
2336}
2337
2338pub mod utils {
2339 pub fn helper() {}
2340}
2341";
2342
2343 let chunks = chunk_language(rust_code, ParseableLanguage::Rust).unwrap();
2344 assert!(!chunks.is_empty());
2345
2346 let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
2348 assert!(chunk_types.contains(&&ChunkType::Class)); assert!(chunk_types.contains(&&ChunkType::Module)); assert!(chunk_types.contains(&&ChunkType::Function)); }
2352
2353 #[test]
2354 fn test_rust_doc_comments_attached() {
2355 let rust_code = r"
2356/// Doc comment
2357pub struct Foo {}
2358";
2359 let chunks = chunk_language(rust_code, ParseableLanguage::Rust).unwrap();
2360 let struct_chunk = chunks
2361 .iter()
2362 .find(|c| c.text.contains("struct Foo"))
2363 .unwrap();
2364 assert!(
2365 struct_chunk.text.contains("/// Doc comment"),
2366 "Doc comment should be attached"
2367 );
2368 }
2369
2370 #[test]
2371 fn test_rust_query_matches_legacy() {
2372 let source = r"
2373 mod sample {
2374 struct Thing;
2375
2376 impl Thing {
2377 fn new() -> Self { Self }
2378 fn helper(&self) {}
2379 }
2380 }
2381
2382 fn util() {}
2383 ";
2384
2385 assert_query_parity(ParseableLanguage::Rust, source);
2386 }
2387
2388 #[test]
2389 fn test_python_query_matches_legacy() {
2390 let source = r"
2391class Example:
2392 @classmethod
2393 def build(cls):
2394 return cls()
2395
2396
2397def helper():
2398 return 1
2399
2400
2401async def async_helper():
2402 return 2
2403";
2404
2405 assert_query_parity(ParseableLanguage::Python, source);
2406 }
2407
2408 #[test]
2409 fn test_chunk_ruby() {
2410 let ruby_code = r#"
2411class Calculator
2412 def initialize
2413 @memory = 0.0
2414 end
2415
2416 def add(a, b)
2417 a + b
2418 end
2419
2420 def self.class_method
2421 "class method"
2422 end
2423
2424 private
2425
2426 def private_method
2427 "private"
2428 end
2429end
2430
2431module Utils
2432 def self.helper
2433 "helper"
2434 end
2435end
2436
2437def main
2438 calc = Calculator.new
2439end
2440"#;
2441
2442 let chunks = chunk_language(ruby_code, ParseableLanguage::Ruby).unwrap();
2443 assert!(!chunks.is_empty());
2444
2445 let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
2447 assert!(chunk_types.contains(&&ChunkType::Class)); assert!(chunk_types.contains(&&ChunkType::Module)); assert!(chunk_types.contains(&&ChunkType::Function)); }
2451
2452 #[test]
2453 fn test_language_detection_fallback() {
2454 let generic_text = "Some text\nwith multiple lines\nto chunk generically";
2456
2457 let chunks_unknown = chunk_text(generic_text, None).unwrap();
2458 let chunks_generic = chunk_generic(generic_text).unwrap();
2459
2460 assert_eq!(chunks_unknown.len(), chunks_generic.len());
2462 assert_eq!(chunks_unknown[0].text, chunks_generic[0].text);
2463 }
2464
2465 #[test]
2466 fn test_chunk_go() {
2467 let go_code = r#"
2468package main
2469
2470import "fmt"
2471
2472const Pi = 3.14159
2473
2474var memory float64
2475
2476type Calculator struct {
2477 memory float64
2478}
2479
2480type Operation interface {
2481 Calculate(a, b float64) float64
2482}
2483
2484func NewCalculator() *Calculator {
2485 return &Calculator{memory: 0.0}
2486}
2487
2488func (c *Calculator) Add(a, b float64) float64 {
2489 return a + b
2490}
2491
2492func main() {
2493 calc := NewCalculator()
2494}
2495"#;
2496
2497 let chunks = chunk_language(go_code, ParseableLanguage::Go).unwrap();
2498 assert!(!chunks.is_empty());
2499
2500 let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
2502 assert!(chunk_types.contains(&&ChunkType::Module)); assert!(chunk_types.contains(&&ChunkType::Class)); assert!(chunk_types.contains(&&ChunkType::Function)); assert!(chunk_types.contains(&&ChunkType::Method)); }
2507
2508 #[test]
2509 #[ignore] fn test_chunk_typescript_arrow_context() {
2511 let ts_code = r"
2512// Utility function
2513export const util = () => {
2514 // comment about util
2515 return 42;
2516};
2517
2518export class Example {
2519 // leading comment for method
2520 constructor() {}
2521
2522 // Another comment
2523 run = () => {
2524 return util();
2525 };
2526}
2527
2528const compute = (x: number) => x * 2;
2529";
2530
2531 let chunks = chunk_language(ts_code, ParseableLanguage::TypeScript).unwrap();
2532
2533 let util_chunk = chunks
2534 .iter()
2535 .find(|chunk| chunk.text.contains("export const util"))
2536 .expect("Expected chunk for util arrow function");
2537 assert_eq!(util_chunk.chunk_type, ChunkType::Function);
2538 assert!(
2539 util_chunk.text.contains("// Utility function"),
2540 "expected leading comment to be included"
2541 );
2542 assert!(util_chunk.text.contains("export const util ="));
2543
2544 let method_chunk = chunks
2546 .iter()
2547 .find(|chunk| {
2548 chunk.chunk_type == ChunkType::Method && chunk.text.contains("run = () =>")
2549 })
2550 .expect("Expected chunk for class field arrow function");
2551
2552 assert_eq!(method_chunk.chunk_type, ChunkType::Method);
2553 assert!(
2554 method_chunk.text.contains("// Another comment"),
2555 "expected inline comment to be included"
2556 );
2557
2558 let compute_chunk = chunks
2559 .iter()
2560 .find(|chunk| chunk.text.contains("const compute"))
2561 .expect("Expected chunk for compute arrow function");
2562 assert_eq!(compute_chunk.chunk_type, ChunkType::Function);
2563 assert!(
2564 compute_chunk
2565 .text
2566 .contains("const compute = (x: number) => x * 2;")
2567 );
2568
2569 assert!(
2571 chunks
2572 .iter()
2573 .all(|chunk| !chunk.text.trim_start().starts_with("() =>"))
2574 );
2575 assert!(
2576 chunks
2577 .iter()
2578 .all(|chunk| !chunk.text.trim_start().starts_with("(x: number) =>"))
2579 );
2580 }
2581
2582 #[test]
2586 #[ignore]
2587 fn test_typescript_query_matches_legacy() {
2588 let source = r"
2589export const util = () => {
2590 return 42;
2591};
2592
2593export class Example {
2594 run = () => {
2595 return util();
2596 };
2597}
2598
2599const compute = (x: number) => x * 2;
2600";
2601
2602 assert_query_parity(ParseableLanguage::TypeScript, source);
2603 }
2604
2605 #[test]
2606 fn test_ruby_query_matches_legacy() {
2607 let source = r#"
2608class Calculator
2609 def initialize
2610 @memory = 0.0
2611 end
2612
2613 def add(a, b)
2614 a + b
2615 end
2616
2617 def self.class_method
2618 "class method"
2619 end
2620end
2621"#;
2622
2623 assert_query_parity(ParseableLanguage::Ruby, source);
2624 }
2625
2626 #[test]
2627 fn test_go_query_matches_legacy() {
2628 let source = r#"
2629package main
2630
2631import "fmt"
2632
2633const Pi = 3.14159
2634
2635var memory float64
2636
2637type Calculator struct {
2638 memory float64
2639}
2640
2641func (c *Calculator) Add(a, b float64) float64 {
2642 return a + b
2643}
2644
2645func Helper() {}
2646"#;
2647
2648 assert_query_parity(ParseableLanguage::Go, source);
2649 }
2650
2651 #[test]
2652 fn test_chunk_c_corner_cases() {
2653 let c_code = r#"
2654#define MAX(a,b) ((a) > (b) ? (a) : (b))
2655#define VERSION 3
2656
2657typedef struct Node {
2658 int value;
2659 struct Node* next;
2660} Node;
2661
2662union Payload {
2663 int i;
2664 float f;
2665};
2666
2667enum Color {
2668 Red,
2669 Green,
2670 Blue,
2671};
2672
2673static inline int add(int a, int b) {
2674 return a + b;
2675}
2676
2677int main(void) {
2678 return MAX(add(1, 2), VERSION);
2679}
2680"#;
2681
2682 let chunks = chunk_language(c_code, ParseableLanguage::C).unwrap();
2683 assert!(!chunks.is_empty());
2684
2685 assert!(
2686 chunks
2687 .iter()
2688 .any(|c| { c.chunk_type == ChunkType::Function && c.text.contains("#define MAX") })
2689 );
2690 assert!(
2691 chunks
2692 .iter()
2693 .any(|c| { c.chunk_type == ChunkType::Text && c.text.contains("#define VERSION") })
2694 );
2695 assert!(
2696 chunks
2697 .iter()
2698 .any(|c| { c.chunk_type == ChunkType::Class && c.text.contains("struct Node") })
2699 );
2700 assert!(
2701 chunks
2702 .iter()
2703 .any(|c| { c.chunk_type == ChunkType::Class && c.text.contains("union Payload") })
2704 );
2705 assert!(
2706 chunks
2707 .iter()
2708 .any(|c| { c.chunk_type == ChunkType::Class && c.text.contains("enum Color") })
2709 );
2710 assert!(chunks.iter().any(|c| {
2711 c.chunk_type == ChunkType::Function && c.text.contains("static inline int add")
2712 }));
2713 assert!(
2714 chunks
2715 .iter()
2716 .any(|c| { c.chunk_type == ChunkType::Function && c.text.contains("int main") })
2717 );
2718 }
2719
2720 #[test]
2721 fn test_chunk_c_struct_declaration_without_body_stays_intact() {
2722 let c_code = r#"
2723#include <stdint.h>
2724
2725struct mtd_info_user meminfo;
2726struct foo forward;
2727"#;
2728
2729 let chunks = chunk_language(c_code, ParseableLanguage::C).unwrap();
2730
2731 assert!(
2732 chunks
2733 .iter()
2734 .any(|c| { c.text.contains("struct mtd_info_user meminfo;") })
2735 );
2736 assert!(
2737 chunks
2738 .iter()
2739 .any(|c| c.text.contains("struct foo forward;"))
2740 );
2741 assert!(
2742 !chunks
2743 .iter()
2744 .any(|c| c.text.trim() == "struct mtd_info_user")
2745 );
2746 assert!(!chunks.iter().any(|c| c.text.trim() == "struct foo"));
2747 }
2748
2749 #[test]
2750 fn test_chunk_cpp_corner_cases() {
2751 let cpp_code = r#"
2752#include <vector>
2753#define SQUARE(x) ((x) * (x))
2754
2755namespace math {
2756template <typename T>
2757T add(T a, T b) {
2758 return a + b;
2759}
2760
2761using Vec = std::vector<int>;
2762typedef unsigned long ulong_t;
2763
2764struct Point {
2765 int x;
2766 int y;
2767};
2768
2769class Calculator {
2770public:
2771 int add(int a, int b) { return a + b; }
2772};
2773
2774enum class Color { Red, Green, Blue };
2775} // namespace math
2776
2777int main() {
2778 return math::add(1, 2);
2779}
2780"#;
2781
2782 let chunks = chunk_language(cpp_code, ParseableLanguage::Cpp).unwrap();
2783 assert!(!chunks.is_empty());
2784
2785 assert!(
2786 chunks
2787 .iter()
2788 .any(|c| c.text.contains("template <typename T>"))
2789 );
2790 assert!(
2791 chunks
2792 .iter()
2793 .any(|c| { c.chunk_type == ChunkType::Text && c.text.contains("using Vec") })
2794 );
2795 assert!(chunks.iter().any(|c| {
2796 c.chunk_type == ChunkType::Text && c.text.contains("typedef unsigned long")
2797 }));
2798 assert!(
2799 chunks.iter().any(|c| {
2800 c.chunk_type == ChunkType::Function && c.text.contains("#define SQUARE")
2801 })
2802 );
2803 let calculator_chunk = chunks
2804 .iter()
2805 .find(|c| c.chunk_type == ChunkType::Class && c.text.contains("class Calculator"));
2806 assert!(calculator_chunk.is_some());
2807 let calculator_chunk = calculator_chunk.unwrap();
2808 assert!(calculator_chunk.text.contains("int add"));
2809 assert!(!calculator_chunk.text.contains("return a + b"));
2810
2811 assert!(
2812 chunks
2813 .iter()
2814 .any(|c| { c.chunk_type == ChunkType::Class && c.text.contains("struct Point") })
2815 );
2816 assert!(
2817 chunks.iter().any(|c| {
2818 c.chunk_type == ChunkType::Class && c.text.contains("enum class Color")
2819 })
2820 );
2821 assert!(
2822 chunks
2823 .iter()
2824 .any(|c| { c.chunk_type == ChunkType::Function && c.text.contains("int main") })
2825 );
2826 assert!(
2827 chunks
2828 .iter()
2829 .any(|c| { c.chunk_type == ChunkType::Function && c.text.contains("T add") })
2830 );
2831 assert!(chunks.iter().any(|c| {
2832 c.chunk_type == ChunkType::Method && c.text.contains("int add(int a, int b)")
2833 }));
2834 }
2835
2836 #[test]
2837 fn test_cpp_suppresses_contained_text_chunks() {
2838 let cpp_code = r#"
2839class Widget {
2840public:
2841 using Alias = int;
2842 int calc() { int local = 1; return local; }
2843};
2844
2845using TopLevel = double;
2846"#;
2847
2848 let chunks = chunk_language(cpp_code, ParseableLanguage::Cpp).unwrap();
2849
2850 assert!(
2851 !chunks
2852 .iter()
2853 .any(|c| { c.chunk_type == ChunkType::Text && c.text.contains("using Alias") })
2854 );
2855 assert!(
2856 !chunks
2857 .iter()
2858 .any(|c| { c.chunk_type == ChunkType::Text && c.text.contains("int local") })
2859 );
2860 assert!(
2861 chunks
2862 .iter()
2863 .any(|c| { c.chunk_type == ChunkType::Text && c.text.contains("using TopLevel") })
2864 );
2865 assert!(
2866 chunks
2867 .iter()
2868 .any(|c| { c.chunk_type == ChunkType::Method && c.text.contains("int calc") })
2869 );
2870 }
2871
2872 #[test]
2873 fn test_cpp_template_prefix_merges_with_definition() {
2874 let cpp_code = r#"
2875template <typename T>
2876struct Box {
2877 static int value;
2878};
2879
2880template <typename T>
2881int Box<T>::value = 0;
2882"#;
2883
2884 let chunks = chunk_language(cpp_code, ParseableLanguage::Cpp).unwrap();
2885
2886 let def_chunk = chunks
2887 .iter()
2888 .find(|c| c.text.contains("int Box<T>::value = 0;"))
2889 .expect("static member definition chunk present");
2890
2891 assert!(def_chunk.text.contains("template <typename T>"));
2892
2893 assert!(!chunks.iter().any(|c| {
2894 c.chunk_type == ChunkType::Text && c.text.trim() == "template <typename T>"
2895 }));
2896 }
2897
2898 #[test]
2899 fn test_cpp_template_method_breadcrumb_in_namespaces() {
2900 let cpp_code = r#"
2901namespace com {
2902namespace ford {
2903
2904template <typename T>
2905class Wrapper {
2906public:
2907 template <typename U>
2908 U convert(U value) { return value; }
2909};
2910
2911} // namespace ford
2912} // namespace com
2913"#;
2914
2915 let chunks = chunk_language(cpp_code, ParseableLanguage::Cpp).unwrap();
2916 let method_chunk = chunks
2917 .iter()
2918 .find(|c| c.chunk_type == ChunkType::Method && c.text.contains("convert"))
2919 .expect("convert method chunk present");
2920
2921 assert_eq!(
2922 method_chunk.metadata.breadcrumb.as_deref(),
2923 Some("com::ford::Wrapper::convert")
2924 );
2925 }
2926
2927 #[test]
2928 fn test_cpp_function_breadcrumb_qualification() {
2929 let cpp_code = r#"
2930namespace outer {
2931class A {
2932public:
2933 void m();
2934};
2935}
2936
2937void outer::A::m() {
2938 // body
2939}
2940"#;
2941
2942 let chunks = chunk_language(cpp_code, ParseableLanguage::Cpp).unwrap();
2943 let method_chunk = chunks
2944 .iter()
2945 .find(|c| c.chunk_type == ChunkType::Function && c.text.contains("outer::A::m"))
2946 .expect("method chunk should exist");
2947 assert_eq!(
2948 method_chunk.metadata.breadcrumb.as_deref(),
2949 Some("outer::A::m")
2950 );
2951 }
2952
2953 #[test]
2954 fn test_haskell_query_matches_legacy() {
2955 let source = r#"
2956module Example where
2957
2958data Shape
2959 = Circle Float
2960 | Square Float
2961
2962type family Area a
2963
2964class Printable a where
2965 printValue :: a -> String
2966
2967instance Printable Shape where
2968 printValue (Circle _) = "circle"
2969 printValue (Square _) = "square"
2970
2971shapeDescription :: Shape -> String
2972shapeDescription (Circle r) = "circle of radius " ++ show r
2973shapeDescription (Square s) = "square of side " ++ show s
2974"#;
2975
2976 assert_query_parity(ParseableLanguage::Haskell, source);
2977 }
2978
2979 #[test]
2980 fn test_markdown_real_file_breadcrumbs() {
2981 let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
2982 .join("tests/fixtures/markdown_breadcrumbs.md");
2983 let source = std::fs::read_to_string(&path).expect("read markdown file");
2984
2985 let chunks =
2986 chunk_text(&source, Some(ck_core::Language::Markdown)).expect("chunk markdown");
2987
2988 let all_text: String = chunks
2996 .iter()
2997 .map(|c| c.text.as_str())
2998 .collect::<Vec<_>>()
2999 .join("\n");
3000
3001 assert!(
3002 all_text.contains("Project Overview"),
3003 "expected top-level heading text to be present in merged chunk"
3004 );
3005 assert!(
3006 all_text.contains("## Usage"),
3007 "expected second-level heading text to be present in merged chunk"
3008 );
3009 assert!(
3010 all_text.contains("Setext Section"),
3011 "expected setext heading text to be present in merged chunk"
3012 );
3013 }
3014
3015 #[test]
3016 fn test_markdown_inline_fixtures_cover_blocks() {
3017 let source = r#"
3018# Title
3019
3020Intro paragraph with **bold** text.
3021
3022## Usage
3023
3024```rust
3025fn main() {
3026 println!("hi");
3027}
3028```
3029
3030> Blockquote with _emphasis_.
3031
3032- Item one
3033- Item two
3034
3035Setext Section
3036==============
3037
3038Trailing paragraph.
3039"#;
3040
3041 let chunks = chunk_text(source, Some(ck_core::Language::Markdown)).expect("chunk markdown");
3042
3043 let all_text: String = chunks
3051 .iter()
3052 .map(|c| c.text.as_str())
3053 .collect::<Vec<_>>()
3054 .join("\n");
3055
3056 assert!(
3057 all_text.contains("# Title") || all_text.contains("## Usage"),
3058 "expected heading text to be present after merging"
3059 );
3060 assert!(
3061 all_text.contains("```rust"),
3062 "expected markdown to include fenced code block"
3063 );
3064 assert!(
3065 all_text.contains("> Blockquote"),
3066 "expected markdown to include blockquote text"
3067 );
3068 assert!(
3069 all_text.contains("Setext Section"),
3070 "expected markdown to include Setext heading text"
3071 );
3072 }
3073
3074 #[test]
3075 fn test_csharp_query_matches_legacy() {
3076 let source = r"
3077namespace Calculator;
3078
3079public interface ICalculator
3080{
3081 double Add(double x, double y);
3082}
3083
3084public class Calculator
3085{
3086 public static double PI = 3.14159;
3087 private double _memory;
3088
3089 public Calculator()
3090 {
3091 _memory = 0.0;
3092 }
3093
3094 public double Add(double x, double y)
3095 {
3096 return x + y;
3097 }
3098}
3099";
3100
3101 assert_query_parity(ParseableLanguage::CSharp, source);
3102 }
3103
3104 #[test]
3105 fn test_zig_query_matches_legacy() {
3106 let source = r#"
3107const std = @import("std");
3108
3109const Calculator = struct {
3110 memory: f64,
3111
3112 pub fn init() Calculator {
3113 return Calculator{ .memory = 0.0 };
3114 }
3115
3116 pub fn add(self: *Calculator, a: f64, b: f64) f64 {
3117 return a + b;
3118 }
3119};
3120
3121test "calculator addition" {
3122 var calc = Calculator.init();
3123 const result = calc.add(2.0, 3.0);
3124 try std.testing.expect(result == 5.0);
3125}
3126"#;
3127
3128 assert_query_parity(ParseableLanguage::Zig, source);
3129 }
3130
3131 #[test]
3132 fn test_chunk_zig() {
3133 let zig_code = r#"
3134const std = @import("std");
3135
3136const Calculator = struct {
3137 memory: f64,
3138
3139 pub fn init() Calculator {
3140 return Calculator{ .memory = 0.0 };
3141 }
3142
3143 pub fn add(self: *Calculator, a: f64, b: f64) f64 {
3144 const result = a + b;
3145 self.memory = result;
3146 return result;
3147 }
3148};
3149
3150const Color = enum {
3151 Red,
3152 Green,
3153 Blue,
3154};
3155
3156const Value = union(enum) {
3157 int: i32,
3158 float: f64,
3159};
3160
3161const Handle = opaque {};
3162
3163const MathError = error{
3164 DivisionByZero,
3165 Overflow,
3166};
3167
3168pub fn multiply(a: i32, b: i32) i32 {
3169 return a * b;
3170}
3171
3172pub fn divide(a: i32, b: i32) MathError!i32 {
3173 if (b == 0) return error.DivisionByZero;
3174 return @divTrunc(a, b);
3175}
3176
3177comptime {
3178 @compileLog("Compile-time validation");
3179}
3180
3181pub fn main() !void {
3182 var calc = Calculator.init();
3183 const result = calc.add(2.0, 3.0);
3184 std.debug.print("Result: {}\n", .{result});
3185}
3186
3187test "calculator addition" {
3188 var calc = Calculator.init();
3189 const result = calc.add(2.0, 3.0);
3190 try std.testing.expect(result == 5.0);
3191}
3192
3193test "multiply function" {
3194 const result = multiply(3, 4);
3195 try std.testing.expect(result == 12);
3196}
3197"#;
3198
3199 let chunks = chunk_language(zig_code, ParseableLanguage::Zig).unwrap();
3200 assert!(!chunks.is_empty());
3201
3202 let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
3203
3204 let class_count = chunk_types
3205 .iter()
3206 .filter(|&&t| t == &ChunkType::Class)
3207 .count();
3208 let function_count = chunk_types
3209 .iter()
3210 .filter(|&&t| t == &ChunkType::Function)
3211 .count();
3212 let module_count = chunk_types
3213 .iter()
3214 .filter(|&&t| t == &ChunkType::Module)
3215 .count();
3216
3217 assert!(
3218 class_count >= 5,
3219 "Expected at least 5 Class chunks (struct, enum, union, opaque, error set), found {class_count}"
3220 );
3221
3222 assert!(
3223 function_count >= 3,
3224 "Expected at least 3 functions (multiply, divide, main), found {function_count}"
3225 );
3226
3227 assert!(
3228 module_count >= 4,
3229 "Expected at least 4 module-type chunks (const std, comptime, 2 tests), found {module_count}"
3230 );
3231
3232 assert!(
3233 chunk_types.contains(&&ChunkType::Class),
3234 "Expected to find Class chunks"
3235 );
3236 assert!(
3237 chunk_types.contains(&&ChunkType::Function),
3238 "Expected to find Function chunks"
3239 );
3240 assert!(
3241 chunk_types.contains(&&ChunkType::Module),
3242 "Expected to find Module chunks"
3243 );
3244 }
3245
3246 #[test]
3247 fn test_chunk_csharp() {
3248 let csharp_code = r"
3249namespace Calculator;
3250
3251public interface ICalculator
3252{
3253 double Add(double x, double y);
3254}
3255
3256public class Calculator
3257{
3258 public static const double PI = 3.14159;
3259 private double _memory;
3260
3261 public Calculator()
3262 {
3263 _memory = 0.0;
3264 }
3265
3266 public double Add(double x, double y)
3267 {
3268 return x + y;
3269 }
3270
3271 public static void Main(string[] args)
3272 {
3273 var calc = new Calculator();
3274 }
3275}
3276";
3277
3278 let chunks = chunk_language(csharp_code, ParseableLanguage::CSharp).unwrap();
3279 assert!(!chunks.is_empty());
3280
3281 let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
3283 assert!(chunk_types.contains(&&ChunkType::Module)); assert!(chunk_types.contains(&&ChunkType::Class)); assert!(chunk_types.contains(&&ChunkType::Method)); }
3287
3288 #[test]
3289 fn test_stride_large_chunk_empty_text() {
3290 let empty_chunk = Chunk {
3292 span: Span {
3293 byte_start: 0,
3294 byte_end: 0,
3295 line_start: 1,
3296 line_end: 1,
3297 },
3298 text: String::new(), chunk_type: ChunkType::Text,
3300 stride_info: None,
3301 metadata: ChunkMetadata::from_text(""),
3302 };
3303
3304 let config = ChunkConfig::default();
3305 let result = stride_large_chunk(empty_chunk.clone(), &config);
3306
3307 assert!(result.is_ok());
3309 let chunks = result.unwrap();
3310 assert_eq!(chunks.len(), 1);
3311 assert_eq!(chunks[0].text, "");
3312 }
3313
3314 #[test]
3315 fn test_stride_large_chunk_zero_token_estimate() {
3316 let chunk = Chunk {
3318 span: Span {
3319 byte_start: 0,
3320 byte_end: 5,
3321 line_start: 1,
3322 line_end: 1,
3323 },
3324 text: " ".to_string(), chunk_type: ChunkType::Text,
3326 stride_info: None,
3327 metadata: ChunkMetadata::from_text(" "),
3328 };
3329
3330 let config = ChunkConfig::default();
3331 let result = stride_large_chunk(chunk, &config);
3332
3333 assert!(result.is_ok());
3335 }
3336
3337 #[test]
3338 fn test_strided_chunk_line_calculation() {
3339 let long_text = (1..=50).map(|i| format!("This is a longer line {i} with more content to ensure token count is high enough")).collect::<Vec<_>>().join("\n");
3342
3343 let metadata = ChunkMetadata::from_text(&long_text);
3344 let chunk = Chunk {
3345 span: Span {
3346 byte_start: 0,
3347 byte_end: long_text.len(),
3348 line_start: 1,
3349 line_end: 50,
3350 },
3351 text: long_text,
3352 chunk_type: ChunkType::Text,
3353 stride_info: None,
3354 metadata,
3355 };
3356
3357 let config = ChunkConfig {
3358 max_tokens: 100, stride_overlap: 10, ..Default::default()
3361 };
3362
3363 let result = stride_large_chunk(chunk, &config);
3364 if let Err(e) = &result {
3365 eprintln!("Stride error: {e}");
3366 }
3367 assert!(result.is_ok());
3368
3369 let chunks = result.unwrap();
3370 assert!(
3371 chunks.len() > 1,
3372 "Should create multiple chunks when striding"
3373 );
3374
3375 for chunk in chunks {
3376 assert!(chunk.span.line_end >= chunk.span.line_start);
3379
3380 let line_count = chunk.text.lines().count();
3382 if line_count > 0 {
3383 let calculated_line_span = chunk.span.line_end - chunk.span.line_start + 1;
3384
3385 assert!(
3387 calculated_line_span <= line_count + 1,
3388 "Line span {calculated_line_span} should not exceed content lines {line_count} by more than 1"
3389 );
3390 }
3391 }
3392 }
3393
3394 #[test]
3395 fn test_gap_filling_coverage() {
3396 let test_cases = vec![
3398 (
3399 ParseableLanguage::Rust,
3400 r#"// This is a test file with imports at the top
3401use std::collections::HashMap;
3402use std::sync::Arc;
3403
3404// A comment between imports and code
3405const VERSION: &str = "1.0.0";
3406
3407// Main function
3408fn main() {
3409 println!("Hello, world!");
3410}
3411
3412// Some trailing content
3413// that should be indexed
3414"#,
3415 ),
3416 (
3417 ParseableLanguage::Python,
3418 r#"# Imports at the top
3419import os
3420import sys
3421
3422# Some constant
3423VERSION = "1.0.0"
3424
3425# Main function
3426def main():
3427 print("Hello, world!")
3428
3429# Trailing comment
3430# should be indexed
3431"#,
3432 ),
3433 (
3434 ParseableLanguage::TypeScript,
3435 r#"// Imports at the top
3436import { foo } from 'bar';
3437
3438// Some constant
3439const VERSION = "1.0.0";
3440
3441// Main function
3442function main() {
3443 console.log("Hello, world!");
3444}
3445
3446// Trailing comment
3447// should be indexed
3448"#,
3449 ),
3450 ];
3451
3452 for (language, code) in test_cases {
3453 eprintln!("\n=== Testing {language} ===");
3454 let chunks = chunk_language(code, language).unwrap();
3455
3456 let mut covered_bytes = vec![false; code.len()];
3458 for chunk in &chunks {
3459 for item in covered_bytes
3460 .iter_mut()
3461 .take(chunk.span.byte_end)
3462 .skip(chunk.span.byte_start)
3463 {
3464 *item = true;
3465 }
3466 }
3467
3468 let uncovered_non_ws: Vec<usize> = covered_bytes
3469 .iter()
3470 .enumerate()
3471 .filter(|(i, covered)| !**covered && !code.as_bytes()[*i].is_ascii_whitespace())
3472 .map(|(i, _)| i)
3473 .collect();
3474
3475 if !uncovered_non_ws.is_empty() {
3476 eprintln!("\n=== UNCOVERED NON-WHITESPACE for {language} ===");
3477 eprintln!("Total bytes: {}", code.len());
3478 eprintln!("Uncovered non-whitespace: {}", uncovered_non_ws.len());
3479
3480 for &pos in uncovered_non_ws.iter().take(10) {
3482 let context_start = pos.saturating_sub(20);
3483 let context_end = (pos + 20).min(code.len());
3484 eprintln!(
3485 "Uncovered at byte {}: {:?}",
3486 pos,
3487 &code[context_start..context_end]
3488 );
3489 }
3490
3491 eprintln!("\n=== CHUNKS ===");
3492 for (i, chunk) in chunks.iter().enumerate() {
3493 eprintln!(
3494 "Chunk {}: {:?} bytes {}-{} (len {})",
3495 i,
3496 chunk.chunk_type,
3497 chunk.span.byte_start,
3498 chunk.span.byte_end,
3499 chunk.span.byte_end - chunk.span.byte_start
3500 );
3501 eprintln!(" Text: {:?}", &chunk.text[..chunk.text.len().min(60)]);
3502 }
3503 }
3504
3505 assert!(
3506 uncovered_non_ws.is_empty(),
3507 "{}: Expected all non-whitespace covered but found {} uncovered non-whitespace bytes",
3508 language,
3509 uncovered_non_ws.len()
3510 );
3511 }
3512 }
3513
3514 #[test]
3515 fn test_web_server_file_coverage() {
3516 let code = std::fs::read_to_string("../examples/code/web_server.rs")
3518 .expect("Failed to read web_server.rs");
3519
3520 let chunks = chunk_language(&code, ParseableLanguage::Rust).unwrap();
3521
3522 let mut covered = vec![false; code.len()];
3524 for chunk in &chunks {
3525 for item in covered
3526 .iter_mut()
3527 .take(chunk.span.byte_end)
3528 .skip(chunk.span.byte_start)
3529 {
3530 *item = true;
3531 }
3532 }
3533
3534 let uncovered_non_whitespace: Vec<(usize, char)> = covered
3536 .iter()
3537 .enumerate()
3538 .filter(|(i, covered)| !**covered && !code.as_bytes()[*i].is_ascii_whitespace())
3539 .map(|(i, _)| (i, code.chars().nth(i).unwrap_or('?')))
3540 .collect();
3541
3542 if !uncovered_non_whitespace.is_empty() {
3543 eprintln!("\n=== WEB_SERVER.RS UNCOVERED NON-WHITESPACE ===");
3544 eprintln!("File size: {} bytes", code.len());
3545 eprintln!("Total chunks: {}", chunks.len());
3546 eprintln!(
3547 "Uncovered non-whitespace: {}",
3548 uncovered_non_whitespace.len()
3549 );
3550
3551 for &(pos, ch) in uncovered_non_whitespace.iter().take(10) {
3552 let start = pos.saturating_sub(30);
3553 let end = (pos + 30).min(code.len());
3554 eprintln!(
3555 "\nUncovered '{}' at byte {}: {:?}",
3556 ch,
3557 pos,
3558 &code[start..end]
3559 );
3560 }
3561
3562 eprintln!("\n=== CHUNKS ===");
3563 for (i, chunk) in chunks.iter().enumerate().take(20) {
3564 eprintln!(
3565 "Chunk {}: {:?} bytes {}-{} lines {}-{}",
3566 i,
3567 chunk.chunk_type,
3568 chunk.span.byte_start,
3569 chunk.span.byte_end,
3570 chunk.span.line_start,
3571 chunk.span.line_end
3572 );
3573 }
3574 }
3575
3576 assert!(
3577 uncovered_non_whitespace.is_empty(),
3578 "Expected all non-whitespace content covered but found {} uncovered non-whitespace bytes",
3579 uncovered_non_whitespace.len()
3580 );
3581 }
3582
3583 #[test]
3584 fn test_haskell_function_chunking() {
3585 let haskell_code = r"
3586factorial :: Integer -> Integer
3587factorial 0 = 1
3588factorial n = n * factorial (n - 1)
3589
3590fibonacci :: Integer -> Integer
3591fibonacci 0 = 0
3592fibonacci 1 = 1
3593fibonacci n = fibonacci (n - 1) + fibonacci (n - 2)
3594";
3595
3596 let mut parser = tree_sitter::Parser::new();
3597 parser
3598 .set_language(&tree_sitter_haskell::LANGUAGE.into())
3599 .unwrap();
3600 let tree = parser.parse(haskell_code, None).unwrap();
3601
3602 fn walk(node: tree_sitter::Node, _src: &str, depth: usize) {
3604 let kind = node.kind();
3605 let start = node.start_position();
3606 let end = node.end_position();
3607 eprintln!(
3608 "{}{:30} L{}-{}",
3609 " ".repeat(depth),
3610 kind,
3611 start.row + 1,
3612 end.row + 1
3613 );
3614
3615 let mut cursor = node.walk();
3616 if cursor.goto_first_child() {
3617 loop {
3618 walk(cursor.node(), _src, depth + 1);
3619 if !cursor.goto_next_sibling() {
3620 break;
3621 }
3622 }
3623 }
3624 }
3625
3626 eprintln!("\n=== TREE STRUCTURE ===");
3627 walk(tree.root_node(), haskell_code, 0);
3628 eprintln!("=== END TREE ===\n");
3629
3630 let chunks = chunk_language(haskell_code, ParseableLanguage::Haskell).unwrap();
3631
3632 eprintln!("\n=== CHUNKS ===");
3633 for (i, chunk) in chunks.iter().enumerate() {
3634 eprintln!(
3635 "Chunk {}: {:?} L{}-{}",
3636 i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
3637 );
3638 eprintln!(" Text: {:?}", chunk.text);
3639 }
3640 eprintln!("=== END CHUNKS ===\n");
3641
3642 assert!(!chunks.is_empty(), "Should find chunks in Haskell code");
3643
3644 let factorial_chunk = chunks.iter().find(|c| c.text.contains("factorial 0 = 1"));
3646 assert!(
3647 factorial_chunk.is_some(),
3648 "Should find factorial function body"
3649 );
3650
3651 let fac = factorial_chunk.unwrap();
3652 assert!(
3653 fac.text.contains("factorial :: Integer -> Integer"),
3654 "Should include type signature"
3655 );
3656 assert!(
3657 fac.text.contains("factorial 0 = 1"),
3658 "Should include base case"
3659 );
3660 assert!(
3661 fac.text.contains("factorial n = n * factorial (n - 1)"),
3662 "Should include recursive case"
3663 );
3664 }
3665
3666 #[test]
3667 fn test_chunk_elixir_basic() {
3668 let elixir_code = r#"
3669defmodule Calculator do
3670 @moduledoc "A simple calculator module"
3671
3672 def add(a, b) do
3673 a + b
3674 end
3675
3676 defp multiply(a, b) do
3677 a * b
3678 end
3679end
3680"#;
3681
3682 let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
3683
3684 eprintln!("\n=== ELIXIR CHUNKS ===");
3685 for (i, chunk) in chunks.iter().enumerate() {
3686 eprintln!(
3687 "Chunk {}: {:?} L{}-{}",
3688 i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
3689 );
3690 eprintln!(" Text: {:?}", &chunk.text[..chunk.text.len().min(80)]);
3691 }
3692 eprintln!("=== END CHUNKS ===\n");
3693
3694 assert!(!chunks.is_empty(), "Should find chunks in Elixir code");
3695
3696 let has_module = chunks.iter().any(|c| c.chunk_type == ChunkType::Module);
3698 let has_function = chunks.iter().any(|c| c.chunk_type == ChunkType::Function);
3699
3700 assert!(has_module, "Should detect defmodule as Module");
3701 assert!(has_function, "Should detect def/defp as Function");
3702 }
3703
3704 #[test]
3705 fn test_chunk_elixir_protocol() {
3706 let elixir_code = r#"
3707defprotocol Stringable do
3708 @doc "Converts to string"
3709 def to_string(value)
3710end
3711
3712defimpl Stringable, for: Integer do
3713 def to_string(value), do: Integer.to_string(value)
3714end
3715"#;
3716
3717 let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
3718
3719 eprintln!("\n=== ELIXIR PROTOCOL CHUNKS ===");
3720 for (i, chunk) in chunks.iter().enumerate() {
3721 eprintln!(
3722 "Chunk {}: {:?} L{}-{}",
3723 i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
3724 );
3725 eprintln!(" Text: {:?}", &chunk.text[..chunk.text.len().min(80)]);
3726 }
3727 eprintln!("=== END CHUNKS ===\n");
3728
3729 let modules: Vec<_> = chunks
3731 .iter()
3732 .filter(|c| c.chunk_type == ChunkType::Module)
3733 .collect();
3734
3735 assert!(
3736 modules.len() >= 2,
3737 "Should detect defprotocol and defimpl as modules, found {}",
3738 modules.len()
3739 );
3740 }
3741
3742 #[test]
3743 fn test_chunk_elixir_genserver() {
3744 let elixir_code = r"
3745defmodule MyServer do
3746 use GenServer
3747
3748 def start_link(opts) do
3749 GenServer.start_link(__MODULE__, opts, name: __MODULE__)
3750 end
3751
3752 def init(state) do
3753 {:ok, state}
3754 end
3755
3756 def handle_call(:get, _from, state) do
3757 {:reply, state, state}
3758 end
3759
3760 def handle_cast({:set, value}, _state) do
3761 {:noreply, value}
3762 end
3763end
3764";
3765
3766 let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
3767
3768 let functions: Vec<_> = chunks
3770 .iter()
3771 .filter(|c| c.chunk_type == ChunkType::Function)
3772 .collect();
3773
3774 assert!(
3775 functions.len() >= 4,
3776 "Should detect at least 4 functions (start_link, init, handle_call, handle_cast), found {}",
3777 functions.len()
3778 );
3779 }
3780
3781 #[test]
3782 fn test_elixir_extension_detection() {
3783 use ck_core::Language;
3784
3785 assert_eq!(Language::from_extension("ex"), Some(Language::Elixir));
3786 assert_eq!(Language::from_extension("exs"), Some(Language::Elixir));
3787 assert_eq!(Language::from_extension("EX"), Some(Language::Elixir));
3788 assert_eq!(Language::from_extension("EXS"), Some(Language::Elixir));
3789 }
3790
3791 #[test]
3792 fn test_chunk_elixir_macros() {
3793 let elixir_code = r"
3794defmodule MyMacros do
3795 defmacro unless(condition, do: block) do
3796 quote do
3797 if !unquote(condition), do: unquote(block)
3798 end
3799 end
3800
3801 defmacrop private_macro(x) do
3802 quote do: unquote(x) * 2
3803 end
3804end
3805";
3806
3807 let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
3808
3809 let functions: Vec<_> = chunks
3810 .iter()
3811 .filter(|c| c.chunk_type == ChunkType::Function)
3812 .collect();
3813
3814 assert!(
3815 functions.len() >= 2,
3816 "Should detect defmacro and defmacrop as functions, found {}",
3817 functions.len()
3818 );
3819 }
3820
3821 #[test]
3822 fn test_chunk_elixir_module_attributes() {
3823 let elixir_code = r#"
3824defmodule Calculator do
3825 @moduledoc "A calculator with type specs"
3826
3827 @behaviour GenServer
3828
3829 @type operation :: :add | :subtract | :multiply | :divide
3830 @typep internal_state :: %{history: list()}
3831 @opaque result :: {:ok, number()} | {:error, atom()}
3832
3833 @callback init(args :: term()) :: {:ok, state :: term()}
3834 @callback handle_call(request :: term(), from :: term(), state :: term()) :: {:reply, term(), term()}
3835
3836 @optional_callbacks [handle_info: 2]
3837
3838 @spec add(number(), number()) :: number()
3839 def add(a, b), do: a + b
3840
3841 @spec subtract(number(), number()) :: number()
3842 def subtract(a, b), do: a - b
3843end
3844"#;
3845
3846 let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
3847
3848 eprintln!("\n=== ELIXIR MODULE ATTRIBUTES CHUNKS ===");
3849 for (i, chunk) in chunks.iter().enumerate() {
3850 eprintln!(
3851 "Chunk {}: {:?} L{}-{}",
3852 i, chunk.chunk_type, chunk.span.line_start, chunk.span.line_end
3853 );
3854 eprintln!(" Text: {:?}", &chunk.text[..chunk.text.len().min(80)]);
3855 }
3856 eprintln!("=== END CHUNKS ===\n");
3857
3858 let has_behaviour = chunks
3860 .iter()
3861 .any(|c| c.chunk_type == ChunkType::Text && c.text.contains("@behaviour GenServer"));
3862 assert!(has_behaviour, "Should capture @behaviour declaration");
3863
3864 let type_chunks: Vec<_> = chunks
3866 .iter()
3867 .filter(|c| {
3868 c.chunk_type == ChunkType::Text
3869 && (c.text.contains("@type")
3870 || c.text.contains("@typep")
3871 || c.text.contains("@opaque"))
3872 })
3873 .collect();
3874 assert!(
3875 type_chunks.len() >= 3,
3876 "Should capture @type, @typep, and @opaque, found {}",
3877 type_chunks.len()
3878 );
3879
3880 let callback_chunks: Vec<_> = chunks
3882 .iter()
3883 .filter(|c| c.chunk_type == ChunkType::Text && c.text.contains("@callback"))
3884 .collect();
3885 assert!(
3886 callback_chunks.len() >= 2,
3887 "Should capture @callback definitions, found {}",
3888 callback_chunks.len()
3889 );
3890
3891 let spec_chunks: Vec<_> = chunks
3893 .iter()
3894 .filter(|c| c.chunk_type == ChunkType::Text && c.text.contains("@spec"))
3895 .collect();
3896 assert!(
3897 spec_chunks.len() >= 2,
3898 "Should capture @spec definitions, found {}",
3899 spec_chunks.len()
3900 );
3901
3902 let function_chunks: Vec<_> = chunks
3904 .iter()
3905 .filter(|c| c.chunk_type == ChunkType::Function)
3906 .collect();
3907 assert!(
3908 function_chunks.len() >= 2,
3909 "Should still capture def functions, found {}",
3910 function_chunks.len()
3911 );
3912 }
3913
3914 #[test]
3915 fn test_chunk_elixir_behavior_spelling() {
3916 let elixir_code = r"
3918defmodule BritishModule do
3919 @behaviour GenServer
3920end
3921
3922defmodule AmericanModule do
3923 @behavior GenServer
3924end
3925";
3926
3927 let chunks = chunk_language(elixir_code, ParseableLanguage::Elixir).unwrap();
3928
3929 let behaviour_chunks: Vec<_> = chunks
3930 .iter()
3931 .filter(|c| {
3932 c.chunk_type == ChunkType::Text
3933 && (c.text.contains("@behaviour") || c.text.contains("@behavior"))
3934 })
3935 .collect();
3936
3937 assert!(
3938 behaviour_chunks.len() >= 2,
3939 "Should capture both @behaviour and @behavior spellings, found {}",
3940 behaviour_chunks.len()
3941 );
3942 }
3943}