//! ck_chunk/lib.rs — text and code chunking for embedding.

1use anyhow::Result;
2use ck_core::Span;
3use serde::{Deserialize, Serialize};
4
5/// Import token estimation from ck-embed
6pub use ck_embed::TokenEstimator;
7
/// Fallback to estimation if precise tokenization fails
///
/// Thin wrapper around [`TokenEstimator::estimate_tokens`] (re-exported from
/// `ck_embed`) so this crate has a single local entry point for token counts.
fn estimate_tokens(text: &str) -> usize {
    TokenEstimator::estimate_tokens(text)
}
12
/// Get model-specific chunk configuration as `(target_tokens, overlap_tokens)`.
///
/// Balances precision against context: small-context models get tighter
/// chunks for better precision, large-context models get bigger chunks, and
/// both profiles use roughly 20% overlap.
pub fn get_model_chunk_config(model_name: Option<&str>) -> (usize, usize) {
    // Tight chunks for small-context models: 400-token target, 80-token overlap.
    const SMALL_MODEL_CONFIG: (usize, usize) = (400, 80);
    // Larger chunks for large-context models: 1024-token target, 200-token
    // overlap — enough context to be meaningful, small enough to be precise.
    const LARGE_MODEL_CONFIG: (usize, usize) = (1024, 200);

    match model_name.unwrap_or("nomic-embed-text-v1.5") {
        // Small models and BGE variants — keep chunks smaller for precision.
        "BAAI/bge-small-en-v1.5"
        | "sentence-transformers/all-MiniLM-L6-v2"
        | "BAAI/bge-base-en-v1.5"
        | "BAAI/bge-large-en-v1.5" => SMALL_MODEL_CONFIG,
        // Large-context models (nomic/jina families) and everything else:
        // nomic-v1.5 is the default model, so unknown names also get the
        // large profile.
        _ => LARGE_MODEL_CONFIG,
    }
}
39
/// Information about chunk striding for large chunks that exceed token limits
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StrideInfo {
    /// Unique ID for the original chunk before striding
    /// (built as "byte_start:byte_end" of the original chunk's span).
    pub original_chunk_id: String,
    /// Index of this stride (0-based)
    pub stride_index: usize,
    /// Total number of strides for the original chunk
    pub total_strides: usize,
    /// Byte offset where overlap with previous stride begins
    /// NOTE(review): `stride_large_chunk` currently stores the overlap
    /// *length in characters* here, not a byte offset — confirm intent.
    pub overlap_start: usize,
    /// Byte offset where overlap with next stride ends
    /// NOTE(review): same caveat — in practice this holds a character count.
    pub overlap_end: usize,
}
54
/// A single chunk of text or code plus its location within the source file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// Byte and line span of the chunk within the original document.
    pub span: Span,
    /// The chunk's raw text content.
    pub text: String,
    /// Coarse semantic category (function, class, plain text, ...).
    pub chunk_type: ChunkType,
    /// Stride information if this chunk was created by striding a larger chunk
    pub stride_info: Option<StrideInfo>,
}
63
/// Coarse semantic category assigned to a chunk by the tree-sitter walker
/// (see `extract_code_chunks`); generic line chunking always uses `Text`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChunkType {
    /// Plain text or an unrecognized code construct.
    Text,
    /// A free function or function-like definition.
    Function,
    /// A class, struct, enum, or similar type definition.
    Class,
    /// A method attached to a type.
    Method,
    /// Module-level constructs (also used for impl blocks, traits,
    /// interfaces, and top-level var/const declarations).
    Module,
}
72
/// Languages with tree-sitter parser support for semantic chunking.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParseableLanguage {
    Python,
    TypeScript,
    /// NOTE: JavaScript is parsed with the TypeScript grammar
    /// (see `chunk_language`).
    JavaScript,
    Haskell,
    Rust,
    Ruby,
    Go,
    CSharp,
}
84
85impl std::fmt::Display for ParseableLanguage {
86    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
87        let name = match self {
88            ParseableLanguage::Python => "python",
89            ParseableLanguage::TypeScript => "typescript",
90            ParseableLanguage::JavaScript => "javascript",
91            ParseableLanguage::Haskell => "haskell",
92            ParseableLanguage::Rust => "rust",
93            ParseableLanguage::Ruby => "ruby",
94            ParseableLanguage::Go => "go",
95            ParseableLanguage::CSharp => "csharp",
96        };
97        write!(f, "{}", name)
98    }
99}
100
101impl TryFrom<ck_core::Language> for ParseableLanguage {
102    type Error = anyhow::Error;
103
104    fn try_from(lang: ck_core::Language) -> Result<Self, Self::Error> {
105        match lang {
106            ck_core::Language::Python => Ok(ParseableLanguage::Python),
107            ck_core::Language::TypeScript => Ok(ParseableLanguage::TypeScript),
108            ck_core::Language::JavaScript => Ok(ParseableLanguage::JavaScript),
109            ck_core::Language::Haskell => Ok(ParseableLanguage::Haskell),
110            ck_core::Language::Rust => Ok(ParseableLanguage::Rust),
111            ck_core::Language::Ruby => Ok(ParseableLanguage::Ruby),
112            ck_core::Language::Go => Ok(ParseableLanguage::Go),
113            ck_core::Language::CSharp => Ok(ParseableLanguage::CSharp),
114            _ => Err(anyhow::anyhow!(
115                "Language {:?} is not supported for parsing",
116                lang
117            )),
118        }
119    }
120}
121
122pub fn chunk_text(text: &str, language: Option<ck_core::Language>) -> Result<Vec<Chunk>> {
123    chunk_text_with_config(text, language, &ChunkConfig::default())
124}
125
/// Configuration for chunking behavior.
///
/// Defaults (see the `Default` impl) are sized for the Nomic embedding
/// model: 8192-token window with 1024 tokens of overlap.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Maximum tokens per chunk (for striding)
    pub max_tokens: usize,
    /// Overlap size for striding (in tokens)
    pub stride_overlap: usize,
    /// Enable striding for chunks that exceed max_tokens
    pub enable_striding: bool,
}
136
137impl Default for ChunkConfig {
138    fn default() -> Self {
139        Self {
140            max_tokens: 8192,     // Default to Nomic model limit
141            stride_overlap: 1024, // 12.5% overlap
142            enable_striding: true,
143        }
144    }
145}
146
147/// New function that accepts model name for model-specific chunking
148pub fn chunk_text_with_model(
149    text: &str,
150    language: Option<ck_core::Language>,
151    model_name: Option<&str>,
152) -> Result<Vec<Chunk>> {
153    let (target_tokens, overlap_tokens) = get_model_chunk_config(model_name);
154
155    // Create a config based on model-specific parameters
156    let config = ChunkConfig {
157        max_tokens: target_tokens,
158        stride_overlap: overlap_tokens,
159        enable_striding: true,
160    };
161
162    chunk_text_with_config_and_model(text, language, &config, model_name)
163}
164
165pub fn chunk_text_with_config(
166    text: &str,
167    language: Option<ck_core::Language>,
168    config: &ChunkConfig,
169) -> Result<Vec<Chunk>> {
170    chunk_text_with_config_and_model(text, language, config, None)
171}
172
173fn chunk_text_with_config_and_model(
174    text: &str,
175    language: Option<ck_core::Language>,
176    config: &ChunkConfig,
177    model_name: Option<&str>,
178) -> Result<Vec<Chunk>> {
179    tracing::debug!(
180        "Chunking text with language: {:?}, length: {} chars, config: {:?}",
181        language,
182        text.len(),
183        config
184    );
185
186    let result = match language.map(ParseableLanguage::try_from) {
187        Some(Ok(lang)) => {
188            tracing::debug!("Using {} tree-sitter parser", lang);
189            chunk_language_with_model(text, lang, model_name)
190        }
191        Some(Err(_)) => {
192            tracing::debug!("Language not supported for parsing, using generic chunking strategy");
193            chunk_generic_with_token_config(text, model_name)
194        }
195        None => {
196            tracing::debug!("Using generic chunking strategy");
197            chunk_generic_with_token_config(text, model_name)
198        }
199    };
200
201    let mut chunks = result?;
202
203    // Apply striding if enabled and necessary
204    if config.enable_striding {
205        chunks = apply_striding(chunks, config)?;
206    }
207
208    tracing::debug!("Successfully created {} final chunks", chunks.len());
209    Ok(chunks)
210}
211
212fn chunk_generic(text: &str) -> Result<Vec<Chunk>> {
213    chunk_generic_with_token_config(text, None)
214}
215
216fn chunk_generic_with_token_config(text: &str, model_name: Option<&str>) -> Result<Vec<Chunk>> {
217    let mut chunks = Vec::new();
218    let lines: Vec<&str> = text.lines().collect();
219
220    // Get model-specific optimal chunk size in tokens
221    let (target_tokens, overlap_tokens) = get_model_chunk_config(model_name);
222
223    // Convert token targets to approximate line counts
224    // This is a rough heuristic - we'll validate with actual token counting
225    let avg_tokens_per_line = 10.0; // Rough estimate for code
226    let target_lines = ((target_tokens as f32) / avg_tokens_per_line) as usize;
227    let overlap_lines = ((overlap_tokens as f32) / avg_tokens_per_line) as usize;
228
229    let chunk_size = target_lines.max(5); // Minimum 5 lines
230    let overlap = overlap_lines.max(1); // Minimum 1 line overlap
231
232    // Pre-compute cumulative byte offsets for O(1) lookup, accounting for different line endings
233    let mut line_byte_offsets = Vec::with_capacity(lines.len() + 1);
234    line_byte_offsets.push(0);
235    let mut cumulative_offset = 0;
236    let mut byte_pos = 0;
237
238    for line in lines.iter() {
239        cumulative_offset += line.len();
240
241        // Find the actual line ending length in the original text
242        let line_end_pos = byte_pos + line.len();
243        let newline_len = if line_end_pos < text.len() && text.as_bytes()[line_end_pos] == b'\r' {
244            if line_end_pos + 1 < text.len() && text.as_bytes()[line_end_pos + 1] == b'\n' {
245                2 // CRLF
246            } else {
247                1 // CR only (old Mac)
248            }
249        } else if line_end_pos < text.len() && text.as_bytes()[line_end_pos] == b'\n' {
250            1 // LF only (Unix)
251        } else {
252            0 // No newline at this position (could be last line without newline)
253        };
254
255        cumulative_offset += newline_len;
256        byte_pos = cumulative_offset;
257        line_byte_offsets.push(cumulative_offset);
258    }
259
260    let mut i = 0;
261    while i < lines.len() {
262        let end = (i + chunk_size).min(lines.len());
263        let chunk_lines = &lines[i..end];
264        let chunk_text = chunk_lines.join("\n");
265
266        let byte_start = line_byte_offsets[i];
267        let byte_end = line_byte_offsets[end];
268
269        chunks.push(Chunk {
270            span: Span {
271                byte_start,
272                byte_end,
273                line_start: i + 1,
274                line_end: end,
275            },
276            text: chunk_text,
277            chunk_type: ChunkType::Text,
278            stride_info: None,
279        });
280
281        i += chunk_size - overlap;
282        if i >= lines.len() {
283            break;
284        }
285    }
286
287    Ok(chunks)
288}
289
290fn chunk_language(text: &str, language: ParseableLanguage) -> Result<Vec<Chunk>> {
291    let mut parser = tree_sitter::Parser::new();
292
293    match language {
294        ParseableLanguage::Python => parser.set_language(&tree_sitter_python::language())?,
295        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => {
296            parser.set_language(&tree_sitter_typescript::language_typescript())?
297        }
298        ParseableLanguage::Haskell => parser.set_language(&tree_sitter_haskell::language())?,
299        ParseableLanguage::Rust => parser.set_language(&tree_sitter_rust::language())?,
300        ParseableLanguage::Ruby => parser.set_language(&tree_sitter_ruby::language())?,
301        ParseableLanguage::Go => parser.set_language(&tree_sitter_go::language())?,
302        ParseableLanguage::CSharp => parser.set_language(&tree_sitter_c_sharp::language())?,
303    }
304
305    let tree = parser
306        .parse(text, None)
307        .ok_or_else(|| anyhow::anyhow!("Failed to parse {} code", language))?;
308
309    let mut chunks = Vec::new();
310    let mut cursor = tree.root_node().walk();
311
312    extract_code_chunks(&mut cursor, text, &mut chunks, language);
313
314    if chunks.is_empty() {
315        return chunk_generic(text);
316    }
317
318    Ok(chunks)
319}
320
/// Model-aware wrapper over [`chunk_language`].
///
/// Language chunking follows semantic code boundaries rather than token
/// counts, so the model name is currently unused; the parameter is kept so
/// chunk token counts could be validated per-model in the future.
fn chunk_language_with_model(
    text: &str,
    language: ParseableLanguage,
    _model_name: Option<&str>,
) -> Result<Vec<Chunk>> {
    chunk_language(text, language)
}
331
332fn extract_code_chunks(
333    cursor: &mut tree_sitter::TreeCursor,
334    source: &str,
335    chunks: &mut Vec<Chunk>,
336    language: ParseableLanguage,
337) {
338    let node = cursor.node();
339    let node_kind = node.kind();
340
341    let is_chunk = match language {
342        ParseableLanguage::Python => {
343            matches!(node_kind, "function_definition" | "class_definition")
344        }
345        ParseableLanguage::TypeScript | ParseableLanguage::JavaScript => matches!(
346            node_kind,
347            "function_declaration" | "class_declaration" | "method_definition" | "arrow_function"
348        ),
349        ParseableLanguage::Haskell => matches!(
350            node_kind,
351            "signature"
352                | "data_type"
353                | "newtype"
354                | "type_synonym"
355                | "type_family"
356                | "class"
357                | "instance"
358        ),
359        ParseableLanguage::Rust => matches!(
360            node_kind,
361            "function_item" | "impl_item" | "struct_item" | "enum_item" | "trait_item" | "mod_item"
362        ),
363        ParseableLanguage::Ruby => matches!(
364            node_kind,
365            "method" | "class" | "module" | "singleton_method"
366        ),
367        ParseableLanguage::Go => matches!(
368            node_kind,
369            "function_declaration"
370                | "method_declaration"
371                | "type_declaration"
372                | "var_declaration"
373                | "const_declaration"
374        ),
375        ParseableLanguage::CSharp => matches!(
376            node_kind,
377            "method_declaration"
378                | "class_declaration"
379                | "interface_declaration"
380                | "variable_declaration"
381        ),
382    };
383
384    if is_chunk {
385        let start_byte = node.start_byte();
386        let end_byte = node.end_byte();
387        let start_pos = node.start_position();
388        let end_pos = node.end_position();
389
390        let text = &source[start_byte..end_byte];
391
392        let chunk_type = match node_kind {
393            "function_definition"
394            | "function_declaration"
395            | "arrow_function"
396            | "function"
397            | "signature"
398            | "function_item"
399            | "def"
400            | "defp"
401            | "method"
402            | "singleton_method"
403            | "defn"
404            | "defn-" => ChunkType::Function,
405            "class_definition"
406            | "class_declaration"
407            | "instance_declaration"
408            | "class"
409            | "instance"
410            | "struct_item"
411            | "enum_item"
412            | "defstruct"
413            | "defrecord"
414            | "deftype"
415            | "type_declaration" => ChunkType::Class,
416            "method_definition" | "method_declaration" | "defmacro" => ChunkType::Method,
417            "data_type"
418            | "newtype"
419            | "type_synomym"
420            | "type_family"
421            | "impl_item"
422            | "trait_item"
423            | "mod_item"
424            | "defmodule"
425            | "module"
426            | "defprotocol"
427            | "interface_declaration"
428            | "ns"
429            | "var_declaration"
430            | "const_declaration"
431            | "variable_declaration" => ChunkType::Module,
432            _ => ChunkType::Text,
433        };
434
435        chunks.push(Chunk {
436            span: Span {
437                byte_start: start_byte,
438                byte_end: end_byte,
439                line_start: start_pos.row + 1,
440                line_end: end_pos.row + 1,
441            },
442            text: text.to_string(),
443            chunk_type,
444            stride_info: None,
445        });
446    }
447
448    if cursor.goto_first_child() {
449        loop {
450            extract_code_chunks(cursor, source, chunks, language);
451            if !cursor.goto_next_sibling() {
452                break;
453            }
454        }
455        cursor.goto_parent();
456    }
457}
458
459/// Apply striding to chunks that exceed the token limit
460fn apply_striding(chunks: Vec<Chunk>, config: &ChunkConfig) -> Result<Vec<Chunk>> {
461    let mut result = Vec::new();
462
463    for chunk in chunks {
464        let estimated_tokens = estimate_tokens(&chunk.text);
465
466        if estimated_tokens <= config.max_tokens {
467            // Chunk fits within limit, no striding needed
468            result.push(chunk);
469        } else {
470            // Chunk exceeds limit, apply striding
471            tracing::debug!(
472                "Chunk with {} tokens exceeds limit of {}, applying striding",
473                estimated_tokens,
474                config.max_tokens
475            );
476
477            let strided_chunks = stride_large_chunk(chunk, config)?;
478            result.extend(strided_chunks);
479        }
480    }
481
482    Ok(result)
483}
484
/// Create strided chunks from a large chunk that exceeds token limits.
///
/// Windows are sized in *characters*, derived from this chunk's measured
/// chars-per-token ratio with a 10% safety buffer. Consecutive windows
/// advance by `window - overlap` characters, so adjacent strides share
/// roughly `config.stride_overlap` tokens' worth of text.
fn stride_large_chunk(chunk: Chunk, config: &ChunkConfig) -> Result<Vec<Chunk>> {
    let text = &chunk.text;

    // Early return for empty chunks to avoid divide-by-zero
    if text.is_empty() {
        return Ok(vec![chunk]);
    }

    // Calculate stride parameters in characters (not bytes!)
    // Use a conservative estimate to ensure we stay under token limits
    let char_count = text.chars().count();
    let estimated_tokens = estimate_tokens(text);
    // Guard against zero token estimate to prevent divide-by-zero panic
    let chars_per_token = if estimated_tokens == 0 {
        4.5 // Use default average if estimation fails
    } else {
        char_count as f32 / estimated_tokens as f32
    };
    let window_chars = ((config.max_tokens as f32 * 0.9) * chars_per_token) as usize; // 10% buffer
    let overlap_chars = (config.stride_overlap as f32 * chars_per_token) as usize;
    let stride_chars = window_chars.saturating_sub(overlap_chars);

    // Overlap >= window would mean no forward progress; bail out instead.
    if stride_chars == 0 {
        return Err(anyhow::anyhow!("Stride size is too small"));
    }

    // Build char to byte index mapping to handle UTF-8 safely
    let char_byte_indices: Vec<(usize, char)> = text.char_indices().collect();
    // Note: char_count is already calculated above, just reference it here

    let mut strided_chunks = Vec::new();
    let original_chunk_id = format!("{}:{}", chunk.span.byte_start, chunk.span.byte_end);
    let mut start_char_idx = 0;
    let mut stride_index = 0;

    // Calculate total number of strides
    // NOTE(review): this ceil-based estimate may disagree with the actual
    // number of loop iterations in edge cases — confirm before relying on it.
    let total_strides = if char_count <= window_chars {
        1
    } else {
        ((char_count - overlap_chars) as f32 / stride_chars as f32).ceil() as usize
    };

    while start_char_idx < char_count {
        let end_char_idx = (start_char_idx + window_chars).min(char_count);

        // Get byte positions from char indices
        let start_byte_pos = char_byte_indices[start_char_idx].0;
        let end_byte_pos = if end_char_idx < char_count {
            char_byte_indices[end_char_idx].0
        } else {
            text.len()
        };

        let stride_text = &text[start_byte_pos..end_byte_pos];

        // Calculate overlap information
        // NOTE(review): these are overlap *lengths in characters*, although
        // the StrideInfo field docs describe byte offsets — confirm the
        // intended unit before consuming these downstream.
        let overlap_start = if stride_index > 0 { overlap_chars } else { 0 };
        let overlap_end = if end_char_idx < char_count {
            overlap_chars
        } else {
            0
        };

        // Calculate span for this stride
        let byte_offset_start = chunk.span.byte_start + start_byte_pos;
        let byte_offset_end = chunk.span.byte_start + end_byte_pos;

        // Estimate line numbers (approximate — counts '\n' boundaries in the
        // prefix; exactness is not required for stride spans)
        let text_before_start = &text[..start_byte_pos];
        let line_offset_start = text_before_start.lines().count().saturating_sub(1);
        let stride_lines = stride_text.lines().count();

        let stride_chunk = Chunk {
            span: Span {
                byte_start: byte_offset_start,
                byte_end: byte_offset_end,
                line_start: chunk.span.line_start + line_offset_start,
                // Fix: subtract 1 since stride_lines is a count but line_end should be inclusive
                line_end: chunk.span.line_start
                    + line_offset_start
                    + stride_lines.saturating_sub(1),
            },
            text: stride_text.to_string(),
            chunk_type: chunk.chunk_type.clone(),
            stride_info: Some(StrideInfo {
                original_chunk_id: original_chunk_id.clone(),
                stride_index,
                total_strides,
                overlap_start,
                overlap_end,
            }),
        };

        strided_chunks.push(stride_chunk);

        // Move to next stride
        if end_char_idx >= char_count {
            break;
        }

        start_char_idx += stride_chars;
        stride_index += 1;
    }

    tracing::debug!(
        "Created {} strides from chunk of {} tokens",
        strided_chunks.len(),
        estimate_tokens(text)
    );

    Ok(strided_chunks)
}
598
599// Removed duplicate estimate_tokens function - using the one from ck-embed via TokenEstimator
600
#[cfg(test)]
mod tests {
    // Unit tests: generic chunking spans and performance, tree-sitter
    // extraction per language, and striding regression cases.
    use super::*;

    #[test]
    fn test_chunk_generic_byte_offsets() {
        // Test that byte offsets are calculated correctly using O(n) algorithm
        let text = "line 1\nline 2\nline 3\nline 4\nline 5";
        let chunks = chunk_generic(text).unwrap();

        assert!(!chunks.is_empty());

        // First chunk should start at byte 0
        assert_eq!(chunks[0].span.byte_start, 0);

        // Each chunk's byte_end should match the actual text length
        // (holds here because the input has no trailing newline and fits in
        // a single chunk; mid-file chunk spans include the trailing newline)
        for chunk in &chunks {
            let expected_len = chunk.text.len();
            let actual_len = chunk.span.byte_end - chunk.span.byte_start;
            assert_eq!(actual_len, expected_len);
        }
    }

    #[test]
    fn test_chunk_generic_large_file_performance() {
        // Create a large text to ensure O(n) performance
        let lines: Vec<String> = (0..1000)
            .map(|i| format!("Line {}: Some content here", i))
            .collect();
        let text = lines.join("\n");

        let start = std::time::Instant::now();
        let chunks = chunk_generic(&text).unwrap();
        let duration = start.elapsed();

        // Should complete quickly even for 1000 lines
        assert!(
            duration.as_millis() < 100,
            "Chunking took too long: {:?}",
            duration
        );
        assert!(!chunks.is_empty());

        // Verify chunks have correct line numbers
        for chunk in &chunks {
            assert!(chunk.span.line_start > 0);
            assert!(chunk.span.line_end >= chunk.span.line_start);
        }
    }

    #[test]
    fn test_chunk_rust() {
        let rust_code = r#"
pub struct Calculator {
    memory: f64,
}

impl Calculator {
    pub fn new() -> Self {
        Calculator { memory: 0.0 }
    }
    
    pub fn add(&mut self, a: f64, b: f64) -> f64 {
        a + b
    }
}

fn main() {
    let calc = Calculator::new();
}

pub mod utils {
    pub fn helper() {}
}
"#;

        let chunks = chunk_language(rust_code, ParseableLanguage::Rust).unwrap();
        assert!(!chunks.is_empty());

        // Should find struct, impl, functions, and module
        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        assert!(chunk_types.contains(&&ChunkType::Class)); // struct
        assert!(chunk_types.contains(&&ChunkType::Module)); // impl and mod
        assert!(chunk_types.contains(&&ChunkType::Function)); // functions
    }

    #[test]
    fn test_chunk_ruby() {
        let ruby_code = r#"
class Calculator
  def initialize
    @memory = 0.0
  end

  def add(a, b)
    a + b
  end

  def self.class_method
    "class method"
  end

  private

  def private_method
    "private"
  end
end

module Utils
  def self.helper
    "helper"
  end
end

def main
  calc = Calculator.new
end
"#;

        let chunks = chunk_language(ruby_code, ParseableLanguage::Ruby).unwrap();
        assert!(!chunks.is_empty());

        // Should find class, module, and methods
        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        assert!(chunk_types.contains(&&ChunkType::Class)); // class
        assert!(chunk_types.contains(&&ChunkType::Module)); // module
        assert!(chunk_types.contains(&&ChunkType::Function)); // methods
    }

    #[test]
    fn test_language_detection_fallback() {
        // Test that unknown languages fall back to generic chunking
        let generic_text = "Some text\nwith multiple lines\nto chunk generically";

        let chunks_unknown = chunk_text(generic_text, None).unwrap();
        let chunks_generic = chunk_generic(generic_text).unwrap();

        // Should produce the same result
        assert_eq!(chunks_unknown.len(), chunks_generic.len());
        assert_eq!(chunks_unknown[0].text, chunks_generic[0].text);
    }

    #[test]
    fn test_chunk_go() {
        let go_code = r#"
package main

import "fmt"

const Pi = 3.14159

var memory float64

type Calculator struct {
    memory float64
}

type Operation interface {
    Calculate(a, b float64) float64
}

func NewCalculator() *Calculator {
    return &Calculator{memory: 0.0}
}

func (c *Calculator) Add(a, b float64) float64 {
    return a + b
}

func main() {
    calc := NewCalculator()
}
"#;

        let chunks = chunk_language(go_code, ParseableLanguage::Go).unwrap();
        assert!(!chunks.is_empty());

        // Should find const, var, type declarations, functions, and methods
        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        assert!(chunk_types.contains(&&ChunkType::Module)); // const and var
        assert!(chunk_types.contains(&&ChunkType::Class)); // struct and interface
        assert!(chunk_types.contains(&&ChunkType::Function)); // functions
        assert!(chunk_types.contains(&&ChunkType::Method)); // methods
    }

    #[test]
    fn test_chunk_csharp() {
        let csharp_code = r#"
namespace Calculator;

public interface ICalculator 
{
    double Add(double x, double y);
}

public class Calculator 
{
    public static const double PI = 3.14159;
    private double _memory;

    public Calculator() 
    {
        _memory = 0.0;
    }

    public double Add(double x, double y) 
    {
        return x + y;
    }

    public static void Main(string[] args)
    {
        var calc = new Calculator();
    }
}
"#;

        let chunks = chunk_language(csharp_code, ParseableLanguage::CSharp).unwrap();
        assert!(!chunks.is_empty());

        // Should find variable, class, method and interface declarations
        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        assert!(chunk_types.contains(&&ChunkType::Module)); // var, interface
        assert!(chunk_types.contains(&&ChunkType::Class)); // class
        assert!(chunk_types.contains(&&ChunkType::Method)); // methods
    }

    #[test]
    fn test_stride_large_chunk_empty_text() {
        // Regression test for divide-by-zero bug in stride_large_chunk
        let empty_chunk = Chunk {
            span: Span {
                byte_start: 0,
                byte_end: 0,
                line_start: 1,
                line_end: 1,
            },
            text: String::new(), // Empty text should not panic
            chunk_type: ChunkType::Text,
            stride_info: None,
        };

        let config = ChunkConfig::default();
        let result = stride_large_chunk(empty_chunk.clone(), &config);

        // Should not panic and return the original chunk
        assert!(result.is_ok());
        let chunks = result.unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "");
    }

    #[test]
    fn test_stride_large_chunk_zero_token_estimate() {
        // Regression test for zero token estimate causing divide-by-zero
        let chunk = Chunk {
            span: Span {
                byte_start: 0,
                byte_end: 5,
                line_start: 1,
                line_end: 1,
            },
            text: "     ".to_string(), // Whitespace that might return 0 tokens
            chunk_type: ChunkType::Text,
            stride_info: None,
        };

        let config = ChunkConfig::default();
        let result = stride_large_chunk(chunk, &config);

        // Should not panic and handle gracefully
        assert!(result.is_ok());
    }

    #[test]
    fn test_strided_chunk_line_calculation() {
        // Regression test for line_end calculation in strided chunks
        // Create a chunk large enough to force striding
        let long_text = (1..=50).map(|i| format!("This is a longer line {} with more content to ensure token count is high enough", i)).collect::<Vec<_>>().join("\n");

        let chunk = Chunk {
            span: Span {
                byte_start: 0,
                byte_end: long_text.len(),
                line_start: 1,
                line_end: 50,
            },
            text: long_text,
            chunk_type: ChunkType::Text,
            stride_info: None,
        };

        let config = ChunkConfig {
            max_tokens: 100,    // Force striding with reasonable limit
            stride_overlap: 10, // Small overlap for testing
            ..Default::default()
        };

        let result = stride_large_chunk(chunk, &config);
        if let Err(e) = &result {
            eprintln!("Stride error: {}", e);
        }
        assert!(result.is_ok());

        let chunks = result.unwrap();
        assert!(
            chunks.len() > 1,
            "Should create multiple chunks when striding"
        );

        for chunk in chunks {
            // Verify line_end is not off by one
            // line_end should be inclusive and not exceed the actual content
            assert!(chunk.span.line_end >= chunk.span.line_start);

            // Check that line span makes sense for the content
            let line_count = chunk.text.lines().count();
            if line_count > 0 {
                let calculated_line_span = chunk.span.line_end - chunk.span.line_start + 1;

                // Allow some tolerance for striding logic
                assert!(
                    calculated_line_span <= line_count + 1,
                    "Line span {} should not exceed content lines {} by more than 1",
                    calculated_line_span,
                    line_count
                );
            }
        }
    }
}