Skip to main content

search_semantically/
text_chunker.rs

1use std::sync::LazyLock;
2
/// Structural category of a chunk.
///
/// In this file `HeadingSection` is produced for Markdown, `TopLevelKey` for YAML/JSON/TOML,
/// `Paragraph` for plain text and `File` for whole-file chunks. The remaining variants are not
/// produced here (presumably they are used by source-code chunkers elsewhere — confirm).
///
/// Each variant has one canonical snake_case name, returned by [`ChunkKind::as_str`]. Both
/// `Display` and `FromStr` are derived from it, so the two cannot drift apart.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ChunkKind {
    Function,
    Method,
    Class,
    Interface,
    Struct,
    Enum,
    Impl,
    Export,
    TypeAlias,
    Module,
    HeadingSection,
    TopLevelKey,
    Paragraph,
    File,
}

impl ChunkKind {
    /// Every variant, in declaration order.
    ///
    /// `FromStr` searches this list, so a newly added variant must be listed here to be
    /// parseable.
    pub const ALL: [ChunkKind; 14] = [
        ChunkKind::Function,
        ChunkKind::Method,
        ChunkKind::Class,
        ChunkKind::Interface,
        ChunkKind::Struct,
        ChunkKind::Enum,
        ChunkKind::Impl,
        ChunkKind::Export,
        ChunkKind::TypeAlias,
        ChunkKind::Module,
        ChunkKind::HeadingSection,
        ChunkKind::TopLevelKey,
        ChunkKind::Paragraph,
        ChunkKind::File,
    ];

    /// Returns the canonical snake_case name of this kind.
    pub fn as_str(&self) -> &'static str {
        match self {
            ChunkKind::Function => "function",
            ChunkKind::Method => "method",
            ChunkKind::Class => "class",
            ChunkKind::Interface => "interface",
            ChunkKind::Struct => "struct",
            ChunkKind::Enum => "enum",
            ChunkKind::Impl => "impl",
            ChunkKind::Export => "export",
            ChunkKind::TypeAlias => "type_alias",
            ChunkKind::Module => "module",
            ChunkKind::HeadingSection => "heading_section",
            ChunkKind::TopLevelKey => "top_level_key",
            ChunkKind::Paragraph => "paragraph",
            ChunkKind::File => "file",
        }
    }
}

impl std::fmt::Display for ChunkKind {
    /// Writes the canonical name verbatim (width/padding flags are ignored, as before).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}

impl std::str::FromStr for ChunkKind {
    type Err = String;

    /// Parses a canonical name back into its variant.
    ///
    /// # Errors
    ///
    /// Returns `"Unknown chunk kind: <input>"` when `s` is not the exact name of any variant.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::ALL
            .iter()
            .find(|kind| kind.as_str() == s)
            .cloned()
            .ok_or_else(|| format!("Unknown chunk kind: {s}"))
    }
}
65
66#[derive(Debug, Clone)]
67pub struct TextChunk {
68    pub file_path: String,
69    pub start_line: usize,
70    pub end_line: usize,
71    pub kind: ChunkKind,
72    pub name: Option<String>,
73    pub content: String,
74}
75
/// Content shorter than this many bytes is never split; it is returned as one whole-file chunk.
const MIN_SPLIT_SIZE: usize = 500;
/// Chunks whose content exceeds this many bytes are split further by `enforce_max_size`.
const MAX_CHUNK_SIZE: usize = 8000;
/// Plaintext paragraph groups shorter than this many bytes are merged with a neighbour.
const MIN_PARAGRAPH_SIZE: usize = 200;
79
/// Line matcher for `#`-style headings (a plain function, despite the `_RE` suffix).
///
/// Leading whitespace is ignored. A line matches when it starts with one to six `#` characters
/// followed by non-empty text; the result is `(level, title)` where `title` has its leading
/// whitespace removed (trailing whitespace is kept). Whitespace between the hashes and the
/// title is not required.
#[allow(clippy::type_complexity)]
static HEADING_RE: LazyLock<fn(&str) -> Option<(usize, &str)>> = LazyLock::new(|| {
    |line: &str| {
        let candidate = line.trim_start();
        let after_hashes = candidate.trim_start_matches('#');
        // '#' is a single byte, so the byte difference is the number of hashes.
        let level = candidate.len() - after_hashes.len();
        if level == 0 || level > 6 {
            return None;
        }

        let title = after_hashes.trim_start();
        if title.is_empty() {
            None
        } else {
            Some((level, title))
        }
    }
});
96
/// Line matcher for an unindented YAML mapping key (a plain function, despite the `_RE` suffix).
///
/// A line matches when:
/// * its first character is alphabetic or `_` (so indented, list and comment lines never match);
/// * the text before the first `:` consists only of alphanumerics, `_`, `.` and `-`;
/// * the `:` is followed by nothing, or by a space or tab.
///
/// The last rule accepts both `key:` and `key: value` while rejecting text such as
/// `http://host`. Returns the key text.
static YAML_TOP_KEY_RE: LazyLock<fn(&str) -> Option<&str>> = LazyLock::new(|| {
    |line: &str| {
        let first_char = line.chars().next()?;
        if !first_char.is_alphabetic() && first_char != '_' {
            return None;
        }

        let colon_pos = line.find(':')?;
        let key = &line[..colon_pos];
        let key_is_plain = key
            .chars()
            .all(|c| c.is_alphanumeric() || c == '_' || c == '.' || c == '-');

        // Inspect the text after the colon *untrimmed*: trimming it first would make the
        // whitespace checks unreachable and reject every `key: value` line.
        let after_colon = &line[colon_pos + 1..];
        let colon_ends_key = after_colon.is_empty()
            || after_colon.starts_with(' ')
            || after_colon.starts_with('\t');

        if key_is_plain && colon_ends_key {
            Some(key)
        } else {
            None
        }
    }
});
121
/// Line matcher for TOML table headers (a plain function, despite the `_RE` suffix).
///
/// After trimming, a line matches when it starts with `[` and ends with `]`; this covers both
/// `[table]` and `[[array-of-tables]]`. All leading `[` and trailing `]` are removed and the
/// remaining text, trimmed, is returned as the name. Lines whose name would be empty do not
/// match.
static TOML_SECTION_RE: LazyLock<fn(&str) -> Option<&str>> = LazyLock::new(|| {
    |line: &str| {
        let candidate = line.trim();
        // `[[name]]` also starts with `[` and ends with `]`, so one bracket test covers both.
        if !candidate.starts_with('[') || !candidate.ends_with(']') {
            return None;
        }

        let name = candidate
            .trim_start_matches('[')
            .trim_end_matches(']')
            .trim();
        if name.is_empty() {
            None
        } else {
            Some(name)
        }
    }
});
137
/// Line matcher for an unindented TOML `key = value` pair (a plain function, despite the `_RE`
/// suffix).
///
/// A line matches when its first character is alphabetic or `_` and the text before the first
/// `=`, with surrounding whitespace removed, consists only of alphanumerics, `_`, `.` and `-`.
/// Returns the trimmed key.
static TOML_KV_RE: LazyLock<fn(&str) -> Option<&str>> = LazyLock::new(|| {
    |line: &str| {
        let first_char = line.chars().next()?;
        if !first_char.is_alphabetic() && first_char != '_' {
            return None;
        }

        let eq_pos = line.find('=')?;
        // Trim *before* validating: the usual `key = value` layout leaves a space between the
        // key and `=`, which would otherwise fail the character check.
        let key = line[..eq_pos].trim();
        let key_is_bare = !key.is_empty()
            && key
                .chars()
                .all(|c| c.is_alphanumeric() || c == '_' || c == '.' || c == '-');

        if key_is_bare {
            Some(key)
        } else {
            None
        }
    }
});
156
157pub fn chunk_text_file(content: &str, file_path: &str, file_type: &str) -> Vec<TextChunk> {
158    if content.len() < MIN_SPLIT_SIZE {
159        return vec![whole_file_chunk(content, file_path)];
160    }
161
162    let chunks = match file_type {
163        "markdown" => chunk_markdown(content, file_path),
164        "yaml" => chunk_yaml(content, file_path),
165        "json" => chunk_json(content, file_path),
166        "toml" => chunk_toml(content, file_path),
167        "plaintext" => chunk_plaintext(content, file_path),
168        _ => Vec::new(),
169    };
170
171    if chunks.is_empty() {
172        return vec![whole_file_chunk(content, file_path)];
173    }
174
175    chunks.into_iter().flat_map(enforce_max_size).collect()
176}
177
178fn whole_file_chunk(content: &str, file_path: &str) -> TextChunk {
179    let line_count = content.lines().count().max(1);
180    TextChunk {
181        file_path: file_path.to_string(),
182        start_line: 1,
183        end_line: line_count,
184        kind: ChunkKind::File,
185        name: None,
186        content: content.to_string(),
187    }
188}
189
190fn chunk_markdown(content: &str, file_path: &str) -> Vec<TextChunk> {
191    let lines: Vec<&str> = content.lines().collect();
192    let mut headings: Vec<(usize, usize, String)> = Vec::new();
193
194    for (i, line) in lines.iter().enumerate() {
195        if let Some((level, text)) = HEADING_RE(line) {
196            headings.push((i, level, text.to_string()));
197        }
198    }
199
200    if headings.is_empty() {
201        return chunk_plaintext(content, file_path);
202    }
203
204    let mut chunks = Vec::new();
205
206    if headings[0].0 > 0 {
207        let preamble: Vec<&str> = lines[..headings[0].0].to_vec();
208        let preamble_content = preamble.join("\n");
209        if !preamble_content.trim().is_empty() {
210            chunks.push(TextChunk {
211                file_path: file_path.to_string(),
212                start_line: 1,
213                end_line: headings[0].0,
214                kind: ChunkKind::HeadingSection,
215                name: None,
216                content: preamble_content,
217            });
218        }
219    }
220
221    for i in 0..headings.len() {
222        let start = headings[i].0;
223        let end = {
224            let mut next_same_or_higher = None;
225            for j in (i + 1)..headings.len() {
226                if headings[j].1 <= headings[i].1 {
227                    next_same_or_higher = Some(j);
228                    break;
229                }
230            }
231            match next_same_or_higher {
232                Some(j) => headings[j].0 - 1,
233                None => lines.len() - 1,
234            }
235        };
236
237        let section: Vec<&str> = lines[start..=end].to_vec();
238        chunks.push(TextChunk {
239            file_path: file_path.to_string(),
240            start_line: start + 1,
241            end_line: end + 1,
242            kind: ChunkKind::HeadingSection,
243            name: Some(headings[i].2.clone()),
244            content: section.join("\n"),
245        });
246    }
247
248    chunks
249}
250
251fn chunk_yaml(content: &str, file_path: &str) -> Vec<TextChunk> {
252    let lines: Vec<&str> = content.lines().collect();
253    let mut keys: Vec<(usize, String)> = Vec::new();
254
255    for (i, line) in lines.iter().enumerate() {
256        let trimmed = line.trim();
257        if trimmed.starts_with('#')
258            || trimmed.starts_with("---")
259            || trimmed.starts_with("...")
260            || trimmed.is_empty()
261        {
262            continue;
263        }
264        if let Some(key) = YAML_TOP_KEY_RE(line) {
265            keys.push((i, key.to_string()));
266        }
267    }
268
269    if keys.is_empty() {
270        return Vec::new();
271    }
272
273    let mut chunks = Vec::new();
274    for i in 0..keys.len() {
275        let start = if i == 0 { 0 } else { keys[i].0 };
276        let end = if i < keys.len() - 1 {
277            keys[i + 1].0 - 1
278        } else {
279            lines.len() - 1
280        };
281
282        let section: Vec<&str> = lines[start..=end].to_vec();
283        chunks.push(TextChunk {
284            file_path: file_path.to_string(),
285            start_line: start + 1,
286            end_line: end + 1,
287            kind: ChunkKind::TopLevelKey,
288            name: Some(keys[i].1.clone()),
289            content: section.join("\n"),
290        });
291    }
292
293    chunks
294}
295
296fn chunk_json(content: &str, file_path: &str) -> Vec<TextChunk> {
297    let parsed: Result<serde_json::Value, _> = serde_json::from_str(content);
298    let parsed = match parsed {
299        Ok(v) => v,
300        Err(_) => return Vec::new(),
301    };
302
303    let obj = match parsed.as_object() {
304        Some(o) => o,
305        None => return Vec::new(),
306    };
307
308    let top_keys: Vec<&String> = obj.keys().collect();
309    if top_keys.is_empty() {
310        return Vec::new();
311    }
312
313    let lines: Vec<&str> = content.lines().collect();
314
315    if lines.len() <= 1 {
316        let mut chunks = Vec::new();
317        for key in &top_keys {
318            let val = &obj[*key];
319            let serialized =
320                serde_json::to_string_pretty(&serde_json::json!({ (*key).clone(): val.clone() }))
321                    .expect("serialize json chunk");
322            chunks.push(TextChunk {
323                file_path: file_path.to_string(),
324                start_line: 1,
325                end_line: 1,
326                kind: ChunkKind::TopLevelKey,
327                name: Some(key.to_string()),
328                content: serialized,
329            });
330        }
331        return chunks;
332    }
333
334    let mut key_positions: Vec<(String, usize)> = Vec::new();
335    let mut depth = 0usize;
336    for (i, line) in lines.iter().enumerate() {
337        let chars: Vec<char> = line.chars().collect();
338        let mut c = 0usize;
339        while c < chars.len() {
340            match chars[c] {
341                '"' => {
342                    c += 1;
343                    while c < chars.len() && chars[c] != '"' {
344                        if chars[c] == '\\' {
345                            c += 1;
346                        }
347                        c += 1;
348                    }
349                    if depth == 1
350                        && let Some(rest) = line.get(c + 1..)
351                    {
352                        let rest_trimmed = rest.trim_start();
353                        if rest_trimmed.starts_with(':') {
354                            let key_text = &line[line.find('"').expect("should have quote")
355                                ..=line.rfind('"').expect("should have end quote")];
356                            let key_clean = key_text.trim_matches('"');
357                            if top_keys.iter().any(|k| k.as_str() == key_clean)
358                                && !key_positions.iter().any(|(k, _)| k == key_clean)
359                            {
360                                key_positions.push((key_clean.to_string(), i));
361                            }
362                        }
363                    }
364                }
365                '{' | '[' => depth += 1,
366                '}' | ']' => depth = depth.saturating_sub(1),
367                _ => {}
368            }
369            c += 1;
370        }
371    }
372
373    let mut chunks = Vec::new();
374    for i in 0..key_positions.len() {
375        let start = key_positions[i].1;
376        let end = if i < key_positions.len() - 1 {
377            key_positions[i + 1].1 - 1
378        } else {
379            lines.len() - 1
380        };
381
382        let mut real_end = end;
383        while real_end > start && lines[real_end].trim().is_empty() {
384            real_end -= 1;
385        }
386
387        chunks.push(TextChunk {
388            file_path: file_path.to_string(),
389            start_line: start + 1,
390            end_line: real_end + 1,
391            kind: ChunkKind::TopLevelKey,
392            name: Some(key_positions[i].0.clone()),
393            content: lines[start..=real_end].join("\n"),
394        });
395    }
396
397    chunks
398}
399
400fn chunk_toml(content: &str, file_path: &str) -> Vec<TextChunk> {
401    let lines: Vec<&str> = content.lines().collect();
402
403    let mut boundaries: Vec<(usize, String)> = Vec::new();
404    let mut first_section_line = lines.len();
405
406    for (i, line) in lines.iter().enumerate() {
407        if let Some(name) = TOML_SECTION_RE(line) {
408            if i < first_section_line {
409                first_section_line = i;
410            }
411            boundaries.push((i, name.to_string()));
412        }
413    }
414
415    for (i, &line) in lines.iter().enumerate().take(first_section_line) {
416        if line.trim().is_empty() || line.trim().starts_with('#') {
417            continue;
418        }
419        if let Some(key) = TOML_KV_RE(line) {
420            boundaries.push((i, key.to_string()));
421        }
422    }
423
424    boundaries.sort_by_key(|(line, _)| *line);
425
426    if boundaries.is_empty() {
427        return Vec::new();
428    }
429
430    let mut chunks = Vec::new();
431    for i in 0..boundaries.len() {
432        let start = if i == 0 { 0 } else { boundaries[i].0 };
433        let end = if i < boundaries.len() - 1 {
434            boundaries[i + 1].0 - 1
435        } else {
436            lines.len() - 1
437        };
438
439        chunks.push(TextChunk {
440            file_path: file_path.to_string(),
441            start_line: start + 1,
442            end_line: end + 1,
443            kind: ChunkKind::TopLevelKey,
444            name: Some(boundaries[i].1.clone()),
445            content: lines[start..=end].join("\n"),
446        });
447    }
448
449    chunks
450}
451
452fn chunk_plaintext(content: &str, file_path: &str) -> Vec<TextChunk> {
453    let lines: Vec<&str> = content.lines().collect();
454
455    let mut paragraphs: Vec<(usize, usize, String)> = Vec::new();
456    let mut para_start: Option<usize> = None;
457    let mut consecutive_blanks = 0usize;
458
459    for i in 0..lines.len() {
460        let is_blank = lines[i].trim().is_empty();
461
462        if is_blank {
463            consecutive_blanks += 1;
464            if consecutive_blanks >= 2
465                && let Some(start) = para_start
466            {
467                let para_end = i.saturating_sub(consecutive_blanks).max(start);
468                paragraphs.push((start, para_end, lines[start..=para_end].join("\n")));
469                para_start = None;
470            }
471        } else {
472            if para_start.is_none() {
473                para_start = Some(i);
474            }
475            consecutive_blanks = 0;
476        }
477    }
478
479    if let Some(start) = para_start {
480        let mut end = lines.len() - 1;
481        while end > start && lines[end].trim().is_empty() {
482            end -= 1;
483        }
484        paragraphs.push((start, end, lines[start..=end].join("\n")));
485    }
486
487    if paragraphs.is_empty() {
488        return Vec::new();
489    }
490
491    let mut chunks = Vec::new();
492    let mut group_start = paragraphs[0].0;
493    let mut group_end = paragraphs[0].1;
494    let mut group_content = paragraphs[0].2.clone();
495
496    for para in paragraphs.iter().skip(1) {
497        if group_content.len() < MIN_PARAGRAPH_SIZE {
498            group_end = para.1;
499            group_content = format!("{group_content}\n\n{}", para.2);
500        } else {
501            chunks.push(TextChunk {
502                file_path: file_path.to_string(),
503                start_line: group_start + 1,
504                end_line: group_end + 1,
505                kind: ChunkKind::Paragraph,
506                name: extract_paragraph_name(&group_content),
507                content: group_content,
508            });
509            group_start = para.0;
510            group_end = para.1;
511            group_content = para.2.clone();
512        }
513    }
514
515    if group_content.len() < MIN_PARAGRAPH_SIZE && !chunks.is_empty() {
516        let last = chunks.last_mut().expect("should have chunk");
517        last.end_line = group_end + 1;
518        last.content = format!("{}\n\n{group_content}", last.content);
519    } else {
520        chunks.push(TextChunk {
521            file_path: file_path.to_string(),
522            start_line: group_start + 1,
523            end_line: group_end + 1,
524            kind: ChunkKind::Paragraph,
525            name: extract_paragraph_name(&group_content),
526            content: group_content,
527        });
528    }
529
530    chunks
531}
532
533fn extract_paragraph_name(content: &str) -> Option<String> {
534    let first_line = content.lines().next()?.trim();
535    if first_line.is_empty() {
536        return None;
537    }
538    Some(crate::util::truncate_with_ellipsis(first_line, 60))
539}
540
541fn enforce_max_size(chunk: TextChunk) -> Vec<TextChunk> {
542    if chunk.content.len() <= MAX_CHUNK_SIZE {
543        return vec![chunk];
544    }
545
546    let lines: Vec<&str> = chunk.content.lines().collect();
547    let mut sub_chunks: Vec<TextChunk> = Vec::new();
548    let mut current_lines: Vec<&str> = Vec::new();
549    let mut current_size: usize = 0;
550    let mut chunk_start_line = chunk.start_line;
551    let mut part_index = 0;
552
553    for line in lines {
554        let line_size = line.len() + 1;
555
556        if current_size + line_size > MAX_CHUNK_SIZE && !current_lines.is_empty() {
557            let mut split_at = current_lines.len();
558            for j in (1..current_lines.len()).rev() {
559                if current_lines[j].trim().is_empty() {
560                    split_at = j;
561                    break;
562                }
563            }
564
565            let emit_lines: Vec<&str> = current_lines[..split_at].to_vec();
566            let emit_content = emit_lines.join("\n");
567            let emit_end_line = chunk_start_line + split_at - 1;
568
569            sub_chunks.push(TextChunk {
570                file_path: chunk.file_path.clone(),
571                start_line: chunk_start_line,
572                end_line: emit_end_line,
573                kind: chunk.kind.clone(),
574                name: if part_index == 0 {
575                    chunk.name.clone()
576                } else {
577                    chunk.name.as_ref().map(|n| format!("{n} (cont.)"))
578                },
579                content: emit_content,
580            });
581            part_index += 1;
582
583            let remaining: Vec<&str> = current_lines[split_at..].to_vec();
584            current_lines = remaining;
585            current_lines.push(line);
586            chunk_start_line = emit_end_line + 1;
587            current_size = current_lines.join("\n").len();
588        } else {
589            current_lines.push(line);
590            current_size += line_size;
591        }
592    }
593
594    if !current_lines.is_empty() {
595        sub_chunks.push(TextChunk {
596            file_path: chunk.file_path.clone(),
597            start_line: chunk_start_line,
598            end_line: chunk.end_line,
599            kind: chunk.kind.clone(),
600            name: if part_index == 0 {
601                chunk.name.clone()
602            } else {
603                chunk.name.as_ref().map(|n| format!("{n} (cont.)"))
604            },
605            content: current_lines.join("\n"),
606        });
607    }
608
609    if sub_chunks.is_empty() {
610        vec![chunk]
611    } else {
612        sub_chunks
613    }
614}
615
#[cfg(test)]
mod tests {
    use super::*;

    // Content below `MIN_SPLIT_SIZE` is never split, whatever the file type.
    #[test]
    fn small_file_returns_single_chunk() {
        let content = "short file";
        let chunks = chunk_text_file(content, "test.txt", "plaintext");
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].kind, ChunkKind::File);
    }

    #[test]
    fn markdown_splits_by_headings() {
        let mut content = String::from("# Title\n\n");
        content.push_str("Some intro text that is long enough to make this file exceed the minimum split size threshold for our text chunker implementation. We need at least five hundred characters in total for the chunker to activate its format-specific splitting logic rather than returning the entire file as a single chunk.\n\n");
        content.push_str("## Section 1\n\n");
        content.push_str("Content 1 that is long enough to be significant and meaningful and contains enough text to meet size requirements for chunking.\n\n");
        content.push_str("## Section 2\n\n");
        content.push_str("Content 2 that is long enough to be significant and meaningful and contains enough text to meet size requirements for chunking purposes.\n\n");
        let chunks = chunk_text_file(&content, "test.md", "markdown");
        assert!(
            chunks.len() >= 2,
            "Should have at least 2 heading sections, got: {:?}",
            chunks
        );

        let names: Vec<&str> = chunks.iter().filter_map(|c| c.name.as_deref()).collect();
        for expected in ["Title", "Section 1", "Section 2"] {
            assert!(
                names.contains(&expected),
                "Should contain {}, got: {:?}",
                expected,
                names
            );
        }
    }

    #[test]
    fn markdown_no_headings_falls_back_to_plaintext() {
        let content =
            "Line 1\n\nLine 2\n\nLine 3\n\nLine 4 is longer and has more content to be meaningful";
        let chunks = chunk_text_file(content, "test.md", "markdown");
        assert!(!chunks.is_empty());
    }

    #[test]
    fn yaml_splits_by_top_level_keys() {
        let mut content = String::new();
        content.push_str("server:\n  port: 8080\n  host: localhost\n  timeout: 30\n");
        content.push_str(
            "  max_connections: 100\n  enable_tls: true\n  cert_path: /etc/ssl/cert.pem\n",
        );
        content.push_str(
            "  key_path: /etc/ssl/key.pem\n  worker_threads: 4\n  max_body_size: 10485760\n",
        );
        content.push_str("  keep_alive_timeout: 75\n  client_header_timeout: 60\n");
        content.push_str("  client_body_timeout: 60\n  send_timeout: 30\n");
        content.push_str(
            "  access_log: /var/log/nginx/access.log\n  error_log: /var/log/nginx/error.log\n\n",
        );
        content.push_str("database:\n  url: postgres://localhost:5432/mydb\n  pool_size: 10\n");
        content
            .push_str("  timeout: 30\n  max_retries: 3\n  enable_ssl: true\n  ssl_mode: require\n");
        content
            .push_str("  connection_timeout: 5\n  statement_timeout: 30000\n  idle_timeout: 600\n");
        content.push_str("  max_lifetime: 1800\n");
        let chunks = chunk_text_file(&content, "config.yaml", "yaml");
        assert!(
            chunks.len() >= 2,
            "Should split on top-level keys, got: {:?}",
            chunks.len()
        );

        let names: Vec<&str> = chunks.iter().filter_map(|c| c.name.as_deref()).collect();
        for expected in ["server", "database"] {
            assert!(
                names.contains(&expected),
                "Should contain {}, got: {:?}",
                expected,
                names
            );
        }
    }

    #[test]
    fn json_splits_by_top_level_keys() {
        let content = r#"{
  "name": "test-project-with-a-long-name-for-testing",
  "version": "1.0.0",
  "description": "A test project with enough content to exceed the minimum split size threshold for our text chunker to activate format-specific splitting logic for JSON files in the test suite",
  "main": "src/main.rs",
  "license": "MIT",
  "repository": "https://github.com/example/test-project",
  "dependencies": {
    "serde": "1.0",
    "anyhow": "1.0",
    "tokio": "1.0"
  },
  "devDependencies": {
    "tempfile": "3.0",
    "insta": "1.0"
  }
}"#;
        let chunks = chunk_text_file(content, "package.json", "json");
        assert!(
            chunks.len() >= 2,
            "Should split on top-level keys, got: {:?}",
            chunks.len()
        );
    }

    #[test]
    fn toml_splits_by_sections_and_kv_pairs() {
        let mut content = String::new();
        content.push_str("name = \"test-project-with-long-name\"\n");
        content.push_str("version = \"1.0.0\"\n");
        content.push_str("edition = \"2024\"\n");
        content.push_str("description = \"A test project with enough content to exceed the minimum split size threshold for our text chunker implementation so that format-specific splitting is activated\"\n");
        content.push_str("license = \"MIT\"\n");
        content.push_str("repository = \"https://github.com/example/test-project\"\n");
        content.push_str("readme = \"README.md\"\n");
        content.push_str("keywords = [\"test\", \"project\", \"example\"]\n");
        content.push_str("categories = [\"development-tools\"]\n\n");
        content.push_str("[dependencies]\n");
        content.push_str("serde = \"1.0\"\n");
        content.push_str("anyhow = \"1.0\"\n");
        content.push_str("tokio = \"1.0\"\n");
        content.push_str("turso = \"0.5\"\n");
        content.push_str("reqwest = \"0.13\"\n");
        content.push_str("clap = \"4.6\"\n\n");
        content.push_str("[dev-dependencies]\n");
        content.push_str("tempfile = \"3.0\"\n");
        content.push_str("insta = \"1.0\"\n");
        let chunks = chunk_text_file(&content, "Cargo.toml", "toml");
        assert!(
            chunks.len() >= 2,
            "Should split on sections and KV pairs, got: {:?}",
            chunks.len()
        );
    }

    #[test]
    fn plaintext_splits_by_double_newlines() {
        let mut content = String::new();
        for i in 0..10 {
            content.push_str(&format!("This is paragraph {i} with enough text to be meaningful and exceed minimum size requirements for chunking in our system.\n\n"));
        }
        let chunks = chunk_text_file(&content, "test.txt", "plaintext");
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.content.trim().is_empty());
        }
    }

    // Every variant must survive a Display -> FromStr round trip, not just a sample, and
    // unknown names must be rejected with the documented message.
    #[test]
    fn chunk_kind_display_roundtrips() {
        let kinds = vec![
            ChunkKind::Function,
            ChunkKind::Method,
            ChunkKind::Class,
            ChunkKind::Interface,
            ChunkKind::Struct,
            ChunkKind::Enum,
            ChunkKind::Impl,
            ChunkKind::Export,
            ChunkKind::TypeAlias,
            ChunkKind::Module,
            ChunkKind::HeadingSection,
            ChunkKind::TopLevelKey,
            ChunkKind::Paragraph,
            ChunkKind::File,
        ];
        for kind in kinds {
            let s = kind.to_string();
            let parsed: ChunkKind = s.parse().expect("should parse");
            assert_eq!(kind, parsed);
        }

        assert_eq!(
            "not_a_kind".parse::<ChunkKind>(),
            Err("Unknown chunk kind: not_a_kind".to_string())
        );
    }

    #[test]
    fn oversized_chunk_is_split() {
        let mut content = String::new();
        for i in 0..500 {
            content.push_str(&format!("Line {i}: this is a somewhat long line of text\n"));
        }
        let chunk = TextChunk {
            file_path: "test.txt".to_string(),
            start_line: 1,
            end_line: 500,
            kind: ChunkKind::File,
            name: None,
            content: content.clone(),
        };
        let result = enforce_max_size(chunk);
        assert!(result.len() > 1, "Oversized chunk should be split");
        for sub in &result {
            assert!(sub.content.len() <= MAX_CHUNK_SIZE);
        }
    }

    #[test]
    fn invalid_json_returns_empty() {
        let content = "{not valid json at all";
        let chunks = chunk_json(content, "bad.json");
        assert!(chunks.is_empty());
    }

    // Only object roots are split; any other root yields nothing.
    #[test]
    fn json_array_returns_empty() {
        let content = "[1, 2, 3]";
        let chunks = chunk_json(content, "arr.json");
        assert!(chunks.is_empty());
    }

    #[test]
    fn extract_paragraph_name_multibyte_does_not_panic() {
        // Regression: multi-byte char straddling the 57-byte cut in extract_paragraph_name.
        let prefix = "a".repeat(56);
        let content =
            format!("{prefix}\u{2019}rest of a very long first line that exceeds sixty bytes");
        let name = extract_paragraph_name(&content);
        assert!(name.is_some());
        let name = name.expect("should have name");
        assert!(std::str::from_utf8(name.as_bytes()).is_ok());
        assert!(name.len() <= 60);
    }
}