1use std::sync::LazyLock;
2
/// The kind of unit a [`TextChunk`] represents.
///
/// Every variant has a stable snake_case label that the `Display` impl writes
/// and the `FromStr` impl accepts (for example `TypeAlias` <-> `"type_alias"`).
///
/// The text chunkers visible alongside this enum only ever produce
/// `HeadingSection`, `TopLevelKey`, `Paragraph` and `File`; the code-oriented
/// variants are presumably produced by other chunkers in the crate
/// (NOTE(review): not visible from here -- confirm).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ChunkKind {
    Function,
    Method,
    Class,
    Interface,
    Struct,
    Enum,
    Impl,
    Export,
    TypeAlias,
    Module,
    /// A Markdown heading together with the lines that belong to it.
    HeadingSection,
    /// A top-level key or section of a YAML, JSON or TOML document.
    TopLevelKey,
    /// One or more plain-text paragraphs grouped together.
    Paragraph,
    /// A file that was not split by any format-specific chunker.
    File,
}
20
21impl std::fmt::Display for ChunkKind {
22 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
23 match self {
24 ChunkKind::Function => write!(f, "function"),
25 ChunkKind::Method => write!(f, "method"),
26 ChunkKind::Class => write!(f, "class"),
27 ChunkKind::Interface => write!(f, "interface"),
28 ChunkKind::Struct => write!(f, "struct"),
29 ChunkKind::Enum => write!(f, "enum"),
30 ChunkKind::Impl => write!(f, "impl"),
31 ChunkKind::Export => write!(f, "export"),
32 ChunkKind::TypeAlias => write!(f, "type_alias"),
33 ChunkKind::Module => write!(f, "module"),
34 ChunkKind::HeadingSection => write!(f, "heading_section"),
35 ChunkKind::TopLevelKey => write!(f, "top_level_key"),
36 ChunkKind::Paragraph => write!(f, "paragraph"),
37 ChunkKind::File => write!(f, "file"),
38 }
39 }
40}
41
42impl std::str::FromStr for ChunkKind {
43 type Err = String;
44
45 fn from_str(s: &str) -> Result<Self, Self::Err> {
46 match s {
47 "function" => Ok(ChunkKind::Function),
48 "method" => Ok(ChunkKind::Method),
49 "class" => Ok(ChunkKind::Class),
50 "interface" => Ok(ChunkKind::Interface),
51 "struct" => Ok(ChunkKind::Struct),
52 "enum" => Ok(ChunkKind::Enum),
53 "impl" => Ok(ChunkKind::Impl),
54 "export" => Ok(ChunkKind::Export),
55 "type_alias" => Ok(ChunkKind::TypeAlias),
56 "module" => Ok(ChunkKind::Module),
57 "heading_section" => Ok(ChunkKind::HeadingSection),
58 "top_level_key" => Ok(ChunkKind::TopLevelKey),
59 "paragraph" => Ok(ChunkKind::Paragraph),
60 "file" => Ok(ChunkKind::File),
61 other => Err(format!("Unknown chunk kind: {other}")),
62 }
63 }
64}
65
/// A contiguous piece of a text file produced by the chunkers below.
#[derive(Debug, Clone)]
pub struct TextChunk {
    /// Path of the file the chunk came from, copied verbatim from the caller.
    pub file_path: String,
    /// 1-based number of the first line covered by the chunk.
    pub start_line: usize,
    /// 1-based number of the last line covered by the chunk (inclusive).
    pub end_line: usize,
    /// What kind of unit the chunk represents.
    pub kind: ChunkKind,
    /// Optional label: heading text, key/section name, or the truncated first
    /// line of a paragraph. `None` for whole-file and Markdown preamble chunks.
    pub name: Option<String>,
    /// The text of the chunk.
    pub content: String,
}
75
/// Content shorter than this many bytes is returned by `chunk_text_file` as a
/// single whole-file chunk without any format-specific splitting.
const MIN_SPLIT_SIZE: usize = 500;
/// Size limit, in bytes of `content`, above which `enforce_max_size` splits a
/// chunk into several pieces.
const MAX_CHUNK_SIZE: usize = 8000;
/// In `chunk_plaintext`, a paragraph group shorter than this many bytes is
/// merged with a neighbouring paragraph instead of being emitted on its own.
const MIN_PARAGRAPH_SIZE: usize = 200;
79
/// Recognises a Markdown ATX heading.
///
/// Returns `(level, text)` where `level` is the number of leading `#`
/// characters (1 to 6) and `text` is the remainder of the line with leading
/// whitespace removed. Returns `None` when the line is not a heading:
///
/// * it does not start (after optional indentation) with `#`,
/// * it has more than six `#` characters,
/// * the `#` run is not followed by a space or tab -- so `#hashtag`,
///   `#!/bin/sh` and `#123` are not mistaken for headings,
/// * nothing but whitespace follows the `#` run.
// The fn-pointer type is written out in full once, here.
#[allow(clippy::type_complexity)]
static HEADING_RE: LazyLock<fn(&str) -> Option<(usize, &str)>> = LazyLock::new(|| {
    |line: &str| {
        let trimmed = line.trim_start();
        let hash_count = trimmed.chars().take_while(|c| *c == '#').count();
        if hash_count == 0 || hash_count > 6 {
            return None;
        }

        // `#` is a single byte, so `hash_count` is also a valid byte offset.
        let after_hashes = &trimmed[hash_count..];
        if !after_hashes.starts_with([' ', '\t']) {
            return None;
        }

        let text = after_hashes.trim_start();
        if text.is_empty() {
            None
        } else {
            Some((hash_count, text))
        }
    }
});
96
/// Recognises a top-level (unindented) YAML mapping key.
///
/// Returns the key when the line starts in column 0 with a letter or `_`, the
/// text before the first `:` consists only of alphanumerics, `_`, `.` and `-`,
/// and the colon is followed by a space, a tab, or nothing but whitespace.
/// That covers both `key:` (nested block follows) and `key: value`.
///
/// The colon check is made on the untrimmed remainder: trimming first would
/// discard the very whitespace being tested for and reject every
/// `key: value` line.
static YAML_TOP_KEY_RE: LazyLock<fn(&str) -> Option<&str>> = LazyLock::new(|| {
    |line: &str| {
        let first_char = line.chars().next()?;
        if !first_char.is_alphabetic() && first_char != '_' {
            return None;
        }

        let (key, after_colon) = line.split_once(':')?;
        let key_is_plain = key
            .chars()
            .all(|c| c.is_alphanumeric() || matches!(c, '_' | '.' | '-'));
        // `url:http` style text (no separator after the colon) is not a mapping key.
        let colon_terminates_key =
            after_colon.trim().is_empty() || after_colon.starts_with([' ', '\t']);

        (key_is_plain && colon_terminates_key).then_some(key)
    }
});
121
/// Recognises a TOML table header (`[name]`) or array-of-tables header
/// (`[[name]]`) and returns the name with brackets and surrounding whitespace
/// removed. Returns `None` for any other line or when the name is empty.
static TOML_SECTION_RE: LazyLock<fn(&str) -> Option<&str>> = LazyLock::new(|| {
    |line: &str| {
        let candidate = line.trim();
        // A `[[...]]` header also starts with `[` and ends with `]`, so a
        // single bracket check covers both header forms.
        if !(candidate.starts_with('[') && candidate.ends_with(']')) {
            return None;
        }

        let name = candidate
            .trim_start_matches('[')
            .trim_end_matches(']')
            .trim();
        (!name.is_empty()).then_some(name)
    }
});
137
/// Recognises an unindented TOML `key = value` line and returns the key.
///
/// The line must start in column 0 with a letter or `_`, and the text before
/// the first `=` -- ignoring whitespace directly before the `=` -- must
/// consist only of alphanumerics, `_`, `.` and `-`.
///
/// The key is trimmed *before* it is validated; validating the untrimmed text
/// would reject the conventional `key = value` spelling because of the space
/// in front of the `=`.
static TOML_KV_RE: LazyLock<fn(&str) -> Option<&str>> = LazyLock::new(|| {
    |line: &str| {
        let first_char = line.chars().next()?;
        if !first_char.is_alphabetic() && first_char != '_' {
            return None;
        }

        let (raw_key, _value) = line.split_once('=')?;
        // The line starts with a key character, so only trailing whitespace can exist.
        let key = raw_key.trim_end();
        key.chars()
            .all(|c| c.is_alphanumeric() || matches!(c, '_' | '.' | '-'))
            .then_some(key)
    }
});
156
157pub fn chunk_text_file(content: &str, file_path: &str, file_type: &str) -> Vec<TextChunk> {
158 if content.len() < MIN_SPLIT_SIZE {
159 return vec![whole_file_chunk(content, file_path)];
160 }
161
162 let chunks = match file_type {
163 "markdown" => chunk_markdown(content, file_path),
164 "yaml" => chunk_yaml(content, file_path),
165 "json" => chunk_json(content, file_path),
166 "toml" => chunk_toml(content, file_path),
167 "plaintext" => chunk_plaintext(content, file_path),
168 _ => Vec::new(),
169 };
170
171 if chunks.is_empty() {
172 return vec![whole_file_chunk(content, file_path)];
173 }
174
175 chunks.into_iter().flat_map(enforce_max_size).collect()
176}
177
178fn whole_file_chunk(content: &str, file_path: &str) -> TextChunk {
179 let line_count = content.lines().count().max(1);
180 TextChunk {
181 file_path: file_path.to_string(),
182 start_line: 1,
183 end_line: line_count,
184 kind: ChunkKind::File,
185 name: None,
186 content: content.to_string(),
187 }
188}
189
190fn chunk_markdown(content: &str, file_path: &str) -> Vec<TextChunk> {
191 let lines: Vec<&str> = content.lines().collect();
192 let mut headings: Vec<(usize, usize, String)> = Vec::new();
193
194 for (i, line) in lines.iter().enumerate() {
195 if let Some((level, text)) = HEADING_RE(line) {
196 headings.push((i, level, text.to_string()));
197 }
198 }
199
200 if headings.is_empty() {
201 return chunk_plaintext(content, file_path);
202 }
203
204 let mut chunks = Vec::new();
205
206 if headings[0].0 > 0 {
207 let preamble: Vec<&str> = lines[..headings[0].0].to_vec();
208 let preamble_content = preamble.join("\n");
209 if !preamble_content.trim().is_empty() {
210 chunks.push(TextChunk {
211 file_path: file_path.to_string(),
212 start_line: 1,
213 end_line: headings[0].0,
214 kind: ChunkKind::HeadingSection,
215 name: None,
216 content: preamble_content,
217 });
218 }
219 }
220
221 for i in 0..headings.len() {
222 let start = headings[i].0;
223 let end = {
224 let mut next_same_or_higher = None;
225 for j in (i + 1)..headings.len() {
226 if headings[j].1 <= headings[i].1 {
227 next_same_or_higher = Some(j);
228 break;
229 }
230 }
231 match next_same_or_higher {
232 Some(j) => headings[j].0 - 1,
233 None => lines.len() - 1,
234 }
235 };
236
237 let section: Vec<&str> = lines[start..=end].to_vec();
238 chunks.push(TextChunk {
239 file_path: file_path.to_string(),
240 start_line: start + 1,
241 end_line: end + 1,
242 kind: ChunkKind::HeadingSection,
243 name: Some(headings[i].2.clone()),
244 content: section.join("\n"),
245 });
246 }
247
248 chunks
249}
250
251fn chunk_yaml(content: &str, file_path: &str) -> Vec<TextChunk> {
252 let lines: Vec<&str> = content.lines().collect();
253 let mut keys: Vec<(usize, String)> = Vec::new();
254
255 for (i, line) in lines.iter().enumerate() {
256 let trimmed = line.trim();
257 if trimmed.starts_with('#')
258 || trimmed.starts_with("---")
259 || trimmed.starts_with("...")
260 || trimmed.is_empty()
261 {
262 continue;
263 }
264 if let Some(key) = YAML_TOP_KEY_RE(line) {
265 keys.push((i, key.to_string()));
266 }
267 }
268
269 if keys.is_empty() {
270 return Vec::new();
271 }
272
273 let mut chunks = Vec::new();
274 for i in 0..keys.len() {
275 let start = if i == 0 { 0 } else { keys[i].0 };
276 let end = if i < keys.len() - 1 {
277 keys[i + 1].0 - 1
278 } else {
279 lines.len() - 1
280 };
281
282 let section: Vec<&str> = lines[start..=end].to_vec();
283 chunks.push(TextChunk {
284 file_path: file_path.to_string(),
285 start_line: start + 1,
286 end_line: end + 1,
287 kind: ChunkKind::TopLevelKey,
288 name: Some(keys[i].1.clone()),
289 content: section.join("\n"),
290 });
291 }
292
293 chunks
294}
295
296fn chunk_json(content: &str, file_path: &str) -> Vec<TextChunk> {
297 let parsed: Result<serde_json::Value, _> = serde_json::from_str(content);
298 let parsed = match parsed {
299 Ok(v) => v,
300 Err(_) => return Vec::new(),
301 };
302
303 let obj = match parsed.as_object() {
304 Some(o) => o,
305 None => return Vec::new(),
306 };
307
308 let top_keys: Vec<&String> = obj.keys().collect();
309 if top_keys.is_empty() {
310 return Vec::new();
311 }
312
313 let lines: Vec<&str> = content.lines().collect();
314
315 if lines.len() <= 1 {
316 let mut chunks = Vec::new();
317 for key in &top_keys {
318 let val = &obj[*key];
319 let serialized =
320 serde_json::to_string_pretty(&serde_json::json!({ (*key).clone(): val.clone() }))
321 .expect("serialize json chunk");
322 chunks.push(TextChunk {
323 file_path: file_path.to_string(),
324 start_line: 1,
325 end_line: 1,
326 kind: ChunkKind::TopLevelKey,
327 name: Some(key.to_string()),
328 content: serialized,
329 });
330 }
331 return chunks;
332 }
333
334 let mut key_positions: Vec<(String, usize)> = Vec::new();
335 let mut depth = 0usize;
336 for (i, line) in lines.iter().enumerate() {
337 let chars: Vec<char> = line.chars().collect();
338 let mut c = 0usize;
339 while c < chars.len() {
340 match chars[c] {
341 '"' => {
342 c += 1;
343 while c < chars.len() && chars[c] != '"' {
344 if chars[c] == '\\' {
345 c += 1;
346 }
347 c += 1;
348 }
349 if depth == 1
350 && let Some(rest) = line.get(c + 1..)
351 {
352 let rest_trimmed = rest.trim_start();
353 if rest_trimmed.starts_with(':') {
354 let key_text = &line[line.find('"').expect("should have quote")
355 ..=line.rfind('"').expect("should have end quote")];
356 let key_clean = key_text.trim_matches('"');
357 if top_keys.iter().any(|k| k.as_str() == key_clean)
358 && !key_positions.iter().any(|(k, _)| k == key_clean)
359 {
360 key_positions.push((key_clean.to_string(), i));
361 }
362 }
363 }
364 }
365 '{' | '[' => depth += 1,
366 '}' | ']' => depth = depth.saturating_sub(1),
367 _ => {}
368 }
369 c += 1;
370 }
371 }
372
373 let mut chunks = Vec::new();
374 for i in 0..key_positions.len() {
375 let start = key_positions[i].1;
376 let end = if i < key_positions.len() - 1 {
377 key_positions[i + 1].1 - 1
378 } else {
379 lines.len() - 1
380 };
381
382 let mut real_end = end;
383 while real_end > start && lines[real_end].trim().is_empty() {
384 real_end -= 1;
385 }
386
387 chunks.push(TextChunk {
388 file_path: file_path.to_string(),
389 start_line: start + 1,
390 end_line: real_end + 1,
391 kind: ChunkKind::TopLevelKey,
392 name: Some(key_positions[i].0.clone()),
393 content: lines[start..=real_end].join("\n"),
394 });
395 }
396
397 chunks
398}
399
400fn chunk_toml(content: &str, file_path: &str) -> Vec<TextChunk> {
401 let lines: Vec<&str> = content.lines().collect();
402
403 let mut boundaries: Vec<(usize, String)> = Vec::new();
404 let mut first_section_line = lines.len();
405
406 for (i, line) in lines.iter().enumerate() {
407 if let Some(name) = TOML_SECTION_RE(line) {
408 if i < first_section_line {
409 first_section_line = i;
410 }
411 boundaries.push((i, name.to_string()));
412 }
413 }
414
415 for (i, &line) in lines.iter().enumerate().take(first_section_line) {
416 if line.trim().is_empty() || line.trim().starts_with('#') {
417 continue;
418 }
419 if let Some(key) = TOML_KV_RE(line) {
420 boundaries.push((i, key.to_string()));
421 }
422 }
423
424 boundaries.sort_by_key(|(line, _)| *line);
425
426 if boundaries.is_empty() {
427 return Vec::new();
428 }
429
430 let mut chunks = Vec::new();
431 for i in 0..boundaries.len() {
432 let start = if i == 0 { 0 } else { boundaries[i].0 };
433 let end = if i < boundaries.len() - 1 {
434 boundaries[i + 1].0 - 1
435 } else {
436 lines.len() - 1
437 };
438
439 chunks.push(TextChunk {
440 file_path: file_path.to_string(),
441 start_line: start + 1,
442 end_line: end + 1,
443 kind: ChunkKind::TopLevelKey,
444 name: Some(boundaries[i].1.clone()),
445 content: lines[start..=end].join("\n"),
446 });
447 }
448
449 chunks
450}
451
452fn chunk_plaintext(content: &str, file_path: &str) -> Vec<TextChunk> {
453 let lines: Vec<&str> = content.lines().collect();
454
455 let mut paragraphs: Vec<(usize, usize, String)> = Vec::new();
456 let mut para_start: Option<usize> = None;
457 let mut consecutive_blanks = 0usize;
458
459 for i in 0..lines.len() {
460 let is_blank = lines[i].trim().is_empty();
461
462 if is_blank {
463 consecutive_blanks += 1;
464 if consecutive_blanks >= 2
465 && let Some(start) = para_start
466 {
467 let para_end = i.saturating_sub(consecutive_blanks).max(start);
468 paragraphs.push((start, para_end, lines[start..=para_end].join("\n")));
469 para_start = None;
470 }
471 } else {
472 if para_start.is_none() {
473 para_start = Some(i);
474 }
475 consecutive_blanks = 0;
476 }
477 }
478
479 if let Some(start) = para_start {
480 let mut end = lines.len() - 1;
481 while end > start && lines[end].trim().is_empty() {
482 end -= 1;
483 }
484 paragraphs.push((start, end, lines[start..=end].join("\n")));
485 }
486
487 if paragraphs.is_empty() {
488 return Vec::new();
489 }
490
491 let mut chunks = Vec::new();
492 let mut group_start = paragraphs[0].0;
493 let mut group_end = paragraphs[0].1;
494 let mut group_content = paragraphs[0].2.clone();
495
496 for para in paragraphs.iter().skip(1) {
497 if group_content.len() < MIN_PARAGRAPH_SIZE {
498 group_end = para.1;
499 group_content = format!("{group_content}\n\n{}", para.2);
500 } else {
501 chunks.push(TextChunk {
502 file_path: file_path.to_string(),
503 start_line: group_start + 1,
504 end_line: group_end + 1,
505 kind: ChunkKind::Paragraph,
506 name: extract_paragraph_name(&group_content),
507 content: group_content,
508 });
509 group_start = para.0;
510 group_end = para.1;
511 group_content = para.2.clone();
512 }
513 }
514
515 if group_content.len() < MIN_PARAGRAPH_SIZE && !chunks.is_empty() {
516 let last = chunks.last_mut().expect("should have chunk");
517 last.end_line = group_end + 1;
518 last.content = format!("{}\n\n{group_content}", last.content);
519 } else {
520 chunks.push(TextChunk {
521 file_path: file_path.to_string(),
522 start_line: group_start + 1,
523 end_line: group_end + 1,
524 kind: ChunkKind::Paragraph,
525 name: extract_paragraph_name(&group_content),
526 content: group_content,
527 });
528 }
529
530 chunks
531}
532
533fn extract_paragraph_name(content: &str) -> Option<String> {
534 let first_line = content.lines().next()?.trim();
535 if first_line.is_empty() {
536 return None;
537 }
538 Some(crate::util::truncate_with_ellipsis(first_line, 60))
539}
540
541fn enforce_max_size(chunk: TextChunk) -> Vec<TextChunk> {
542 if chunk.content.len() <= MAX_CHUNK_SIZE {
543 return vec![chunk];
544 }
545
546 let lines: Vec<&str> = chunk.content.lines().collect();
547 let mut sub_chunks: Vec<TextChunk> = Vec::new();
548 let mut current_lines: Vec<&str> = Vec::new();
549 let mut current_size: usize = 0;
550 let mut chunk_start_line = chunk.start_line;
551 let mut part_index = 0;
552
553 for line in lines {
554 let line_size = line.len() + 1;
555
556 if current_size + line_size > MAX_CHUNK_SIZE && !current_lines.is_empty() {
557 let mut split_at = current_lines.len();
558 for j in (1..current_lines.len()).rev() {
559 if current_lines[j].trim().is_empty() {
560 split_at = j;
561 break;
562 }
563 }
564
565 let emit_lines: Vec<&str> = current_lines[..split_at].to_vec();
566 let emit_content = emit_lines.join("\n");
567 let emit_end_line = chunk_start_line + split_at - 1;
568
569 sub_chunks.push(TextChunk {
570 file_path: chunk.file_path.clone(),
571 start_line: chunk_start_line,
572 end_line: emit_end_line,
573 kind: chunk.kind.clone(),
574 name: if part_index == 0 {
575 chunk.name.clone()
576 } else {
577 chunk.name.as_ref().map(|n| format!("{n} (cont.)"))
578 },
579 content: emit_content,
580 });
581 part_index += 1;
582
583 let remaining: Vec<&str> = current_lines[split_at..].to_vec();
584 current_lines = remaining;
585 current_lines.push(line);
586 chunk_start_line = emit_end_line + 1;
587 current_size = current_lines.join("\n").len();
588 } else {
589 current_lines.push(line);
590 current_size += line_size;
591 }
592 }
593
594 if !current_lines.is_empty() {
595 sub_chunks.push(TextChunk {
596 file_path: chunk.file_path.clone(),
597 start_line: chunk_start_line,
598 end_line: chunk.end_line,
599 kind: chunk.kind.clone(),
600 name: if part_index == 0 {
601 chunk.name.clone()
602 } else {
603 chunk.name.as_ref().map(|n| format!("{n} (cont.)"))
604 },
605 content: current_lines.join("\n"),
606 });
607 }
608
609 if sub_chunks.is_empty() {
610 vec![chunk]
611 } else {
612 sub_chunks
613 }
614}
615
// Unit tests for the text chunkers and the `ChunkKind` string conversions.
#[cfg(test)]
mod tests {
    use super::*;

    // Content shorter than MIN_SPLIT_SIZE comes back as one `File` chunk.
    #[test]
    fn small_file_returns_single_chunk() {
        let content = "short file";
        let chunks = chunk_text_file(content, "test.txt", "plaintext");
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].kind, ChunkKind::File);
    }

    // Markdown above the size threshold yields a named chunk for every heading.
    #[test]
    fn markdown_splits_by_headings() {
        let mut content = String::from("# Title\n\n");
        content.push_str("Some intro text that is long enough to make this file exceed the minimum split size threshold for our text chunker implementation. We need at least five hundred characters in total for the chunker to activate its format-specific splitting logic rather than returning the entire file as a single chunk.\n\n");
        content.push_str("## Section 1\n\n");
        content.push_str("Content 1 that is long enough to be significant and meaningful and contains enough text to meet size requirements for chunking.\n\n");
        content.push_str("## Section 2\n\n");
        content.push_str("Content 2 that is long enough to be significant and meaningful and contains enough text to meet size requirements for chunking purposes.\n\n");
        let chunks = chunk_text_file(&content, "test.md", "markdown");
        assert!(
            chunks.len() >= 2,
            "Should have at least 2 heading sections, got: {:?}",
            chunks
        );

        let names: Vec<&str> = chunks.iter().filter_map(|c| c.name.as_deref()).collect();
        assert!(
            names.contains(&"Title"),
            "Should contain Title, got: {:?}",
            names
        );
        assert!(
            names.contains(&"Section 1"),
            "Should contain Section 1, got: {:?}",
            names
        );
        assert!(
            names.contains(&"Section 2"),
            "Should contain Section 2, got: {:?}",
            names
        );
    }

    // NOTE(review): this content is shorter than MIN_SPLIT_SIZE, so
    // `chunk_text_file` returns the whole-file chunk before any Markdown or
    // plaintext handling runs; the fallback itself is not exercised here.
    #[test]
    fn markdown_no_headings_falls_back_to_plaintext() {
        let content =
            "Line 1\n\nLine 2\n\nLine 3\n\nLine 4 is longer and has more content to be meaningful";
        let chunks = chunk_text_file(content, "test.md", "markdown");
        assert!(!chunks.is_empty());
    }

    // Two unindented keys with nested settings produce chunks named after both keys.
    #[test]
    fn yaml_splits_by_top_level_keys() {
        let mut content = String::new();
        content.push_str("server:\n port: 8080\n host: localhost\n timeout: 30\n");
        content.push_str(
            " max_connections: 100\n enable_tls: true\n cert_path: /etc/ssl/cert.pem\n",
        );
        content.push_str(
            " key_path: /etc/ssl/key.pem\n worker_threads: 4\n max_body_size: 10485760\n",
        );
        content.push_str(" keep_alive_timeout: 75\n client_header_timeout: 60\n");
        content.push_str(" client_body_timeout: 60\n send_timeout: 30\n");
        content.push_str(
            " access_log: /var/log/nginx/access.log\n error_log: /var/log/nginx/error.log\n\n",
        );
        content.push_str("database:\n url: postgres://localhost:5432/mydb\n pool_size: 10\n");
        content
            .push_str(" timeout: 30\n max_retries: 3\n enable_ssl: true\n ssl_mode: require\n");
        content
            .push_str(" connection_timeout: 5\n statement_timeout: 30000\n idle_timeout: 600\n");
        content.push_str(" max_lifetime: 1800\n");
        let chunks = chunk_text_file(&content, "config.yaml", "yaml");
        assert!(
            chunks.len() >= 2,
            "Should split on top-level keys, got: {:?}",
            chunks.len()
        );

        let names: Vec<&str> = chunks.iter().filter_map(|c| c.name.as_deref()).collect();
        assert!(
            names.contains(&"server"),
            "Should contain server, got: {:?}",
            names
        );
        assert!(
            names.contains(&"database"),
            "Should contain database, got: {:?}",
            names
        );
    }

    // A multi-line JSON object is split into at least two chunks.
    #[test]
    fn json_splits_by_top_level_keys() {
        let content = r#"{
 "name": "test-project-with-a-long-name-for-testing",
 "version": "1.0.0",
 "description": "A test project with enough content to exceed the minimum split size threshold for our text chunker to activate format-specific splitting logic for JSON files in the test suite",
 "main": "src/main.rs",
 "license": "MIT",
 "repository": "https://github.com/example/test-project",
 "dependencies": {
 "serde": "1.0",
 "anyhow": "1.0",
 "tokio": "1.0"
 },
 "devDependencies": {
 "tempfile": "3.0",
 "insta": "1.0"
 }
}"#;
        let chunks = chunk_text_file(content, "package.json", "json");
        assert!(
            chunks.len() >= 2,
            "Should split on top-level keys, got: {:?}",
            chunks.len()
        );
    }

    // A manifest with root-level keys and two tables is split into at least two chunks.
    #[test]
    fn toml_splits_by_sections_and_kv_pairs() {
        let mut content = String::new();
        content.push_str("name = \"test-project-with-long-name\"\n");
        content.push_str("version = \"1.0.0\"\n");
        content.push_str("edition = \"2024\"\n");
        content.push_str("description = \"A test project with enough content to exceed the minimum split size threshold for our text chunker implementation so that format-specific splitting is activated\"\n");
        content.push_str("license = \"MIT\"\n");
        content.push_str("repository = \"https://github.com/example/test-project\"\n");
        content.push_str("readme = \"README.md\"\n");
        content.push_str("keywords = [\"test\", \"project\", \"example\"]\n");
        content.push_str("categories = [\"development-tools\"]\n\n");
        content.push_str("[dependencies]\n");
        content.push_str("serde = \"1.0\"\n");
        content.push_str("anyhow = \"1.0\"\n");
        content.push_str("tokio = \"1.0\"\n");
        content.push_str("turso = \"0.5\"\n");
        content.push_str("reqwest = \"0.13\"\n");
        content.push_str("clap = \"4.6\"\n\n");
        content.push_str("[dev-dependencies]\n");
        content.push_str("tempfile = \"3.0\"\n");
        content.push_str("insta = \"1.0\"\n");
        let chunks = chunk_text_file(&content, "Cargo.toml", "toml");
        assert!(
            chunks.len() >= 2,
            "Should split on sections and KV pairs, got: {:?}",
            chunks.len()
        );
    }

    // Ten paragraphs separated by blank lines: every produced chunk has non-blank content.
    #[test]
    fn plaintext_splits_by_double_newlines() {
        let mut content = String::new();
        for i in 0..10 {
            content.push_str(&format!("This is paragraph {i} with enough text to be meaningful and exceed minimum size requirements for chunking in our system.\n\n"));
        }
        let chunks = chunk_text_file(&content, "test.txt", "plaintext");
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.content.trim().is_empty());
        }
    }

    // `Display` output parses back to the same kind (sampled variants only).
    #[test]
    fn chunk_kind_display_roundtrips() {
        let kinds = vec![
            ChunkKind::Function,
            ChunkKind::HeadingSection,
            ChunkKind::TopLevelKey,
            ChunkKind::File,
        ];
        for kind in kinds {
            let s = kind.to_string();
            let parsed: ChunkKind = s.parse().expect("should parse");
            assert_eq!(kind, parsed);
        }
    }

    // 500 lines without blank lines exceed MAX_CHUNK_SIZE and must be split
    // into pieces that each respect the limit.
    #[test]
    fn oversized_chunk_is_split() {
        let mut content = String::new();
        for i in 0..500 {
            content.push_str(&format!("Line {i}: this is a somewhat long line of text\n"));
        }
        let chunk = TextChunk {
            file_path: "test.txt".to_string(),
            start_line: 1,
            end_line: 500,
            kind: ChunkKind::File,
            name: None,
            content: content.clone(),
        };
        let result = enforce_max_size(chunk);
        assert!(result.len() > 1, "Oversized chunk should be split");
        for sub in &result {
            assert!(sub.content.len() <= MAX_CHUNK_SIZE);
        }
    }

    // Unparseable JSON yields no chunks (the caller then falls back to the whole file).
    #[test]
    fn invalid_json_returns_empty() {
        let content = "{not valid json at all";
        let chunks = chunk_json(content, "bad.json");
        assert!(chunks.is_empty());
    }

    // A JSON document whose root is not an object yields no chunks.
    #[test]
    fn json_array_returns_empty() {
        let content = "[1, 2, 3]";
        let chunks = chunk_json(content, "arr.json");
        assert!(chunks.is_empty());
    }

    // The first line has 56 ASCII bytes followed by U+2019 (three bytes in
    // UTF-8), so the multi-byte character occupies bytes 56..59, just below
    // the limit of 60 passed to the truncation helper.
    #[test]
    fn extract_paragraph_name_multibyte_does_not_panic() {
        let prefix = "a".repeat(56);
        let content =
            format!("{prefix}\u{2019}rest of a very long first line that exceeds sixty bytes");
        let name = extract_paragraph_name(&content);
        assert!(name.is_some());
        let name = name.expect("should have name");
        assert!(std::str::from_utf8(name.as_bytes()).is_ok());
        assert!(name.len() <= 60);
    }
}