1use pulldown_cmark::{Event, HeadingLevel, Options, Parser, Tag, TagEnd};
33use serde::{Deserialize, Serialize};
34use std::collections::HashMap;
35use std::path::{Path, PathBuf};
36use tokio::fs;
37use tracing::{debug, instrument, warn};
38
39use crate::error::{Error, Result};
40
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MarkdownDocument {
    /// Filesystem path the document was read from.
    pub path: PathBuf,

    /// Document title: the front-matter `title` key if present, otherwise the
    /// first heading found in the body, otherwise `None`.
    pub title: Option<String>,

    /// Key/value pairs parsed from the YAML front-matter block (scalars are
    /// stringified; string sequences are joined with ", ").
    pub metadata: HashMap<String, String>,

    /// Ordered content chunks produced by parsing the body.
    pub chunks: Vec<MarkdownChunk>,
}
63
64impl MarkdownDocument {
65 pub fn new(path: impl Into<PathBuf>) -> Self {
67 Self {
68 path: path.into(),
69 title: None,
70 metadata: HashMap::new(),
71 chunks: Vec::new(),
72 }
73 }
74
75 pub fn full_text(&self) -> String {
77 self.chunks
78 .iter()
79 .map(|c| c.content.as_str())
80 .collect::<Vec<_>>()
81 .join("\n\n")
82 }
83
84 pub fn text_chunks(&self) -> impl Iterator<Item = &MarkdownChunk> {
86 self.chunks
87 .iter()
88 .filter(|c| c.chunk_type == ChunkType::Text)
89 }
90
91 pub fn code_chunks(&self) -> impl Iterator<Item = &MarkdownChunk> {
93 self.chunks
94 .iter()
95 .filter(|c| matches!(c.chunk_type, ChunkType::CodeBlock { .. }))
96 }
97}
98
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MarkdownChunk {
    /// The chunk's text content (whitespace-trimmed).
    pub content: String,

    /// Which markdown construct this chunk came from.
    pub chunk_type: ChunkType,

    /// Titles of the headings enclosing this chunk, outermost first
    /// (e.g. `["Guide", "Installation"]`).
    pub heading_hierarchy: Vec<String>,

    /// Approximate (start, end) source line numbers, 1-based. These are
    /// estimated by counting newlines during parsing, not exact positions.
    pub line_range: (usize, usize),
}
121
122impl MarkdownChunk {
123 pub fn text(
125 content: impl Into<String>,
126 heading_hierarchy: Vec<String>,
127 line_range: (usize, usize),
128 ) -> Self {
129 Self {
130 content: content.into(),
131 chunk_type: ChunkType::Text,
132 heading_hierarchy,
133 line_range,
134 }
135 }
136
137 pub fn code_block(
139 content: impl Into<String>,
140 language: Option<String>,
141 heading_hierarchy: Vec<String>,
142 line_range: (usize, usize),
143 ) -> Self {
144 Self {
145 content: content.into(),
146 chunk_type: ChunkType::CodeBlock { language },
147 heading_hierarchy,
148 line_range,
149 }
150 }
151
152 pub fn is_code(&self) -> bool {
154 matches!(self.chunk_type, ChunkType::CodeBlock { .. })
155 }
156
157 pub fn code_language(&self) -> Option<&str> {
159 match &self.chunk_type {
160 ChunkType::CodeBlock { language } => language.as_deref(),
161 _ => None,
162 }
163 }
164
165 pub fn context_string(&self) -> String {
167 if self.heading_hierarchy.is_empty() {
168 "Document root".to_string()
169 } else {
170 self.heading_hierarchy.join(" > ")
171 }
172 }
173}
174
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChunkType {
    /// Ordinary prose paragraphs.
    Text,

    /// A fenced or indented code block.
    CodeBlock {
        /// Language tag from the fence info string, if any
        /// (e.g. `Some("rust")` for a ```` ```rust ```` fence).
        language: Option<String>,
    },

    /// Content that originated inside a bullet or numbered list.
    List,

    /// Content that originated inside a table.
    Table,
}
193
#[derive(Debug, Clone)]
pub struct IngesterConfig {
    /// Chunks smaller than this (in bytes) are merged with a following chunk
    /// that shares the same heading hierarchy.
    pub min_chunk_size: usize,

    /// Chunks larger than this (in bytes) are split on paragraph, then
    /// sentence, boundaries.
    pub max_chunk_size: usize,

    /// When `true`, code blocks become standalone `ChunkType::CodeBlock`
    /// chunks; otherwise their fenced text is folded back into the
    /// surrounding prose chunk.
    pub preserve_code_blocks: bool,

    /// Whether to parse YAML front-matter into document metadata.
    /// NOTE(review): this flag is not currently consulted anywhere in this
    /// module — `ingest_file` always extracts front-matter. Verify intent.
    pub include_frontmatter: bool,

    /// File extensions (matched case-insensitively) treated as markdown.
    pub markdown_extensions: Vec<String>,
}
212
213impl Default for IngesterConfig {
214 fn default() -> Self {
215 Self {
216 min_chunk_size: 50,
217 max_chunk_size: 4000,
218 preserve_code_blocks: true,
219 include_frontmatter: true,
220 markdown_extensions: vec!["md".to_string(), "markdown".to_string(), "mdx".to_string()],
221 }
222 }
223}
224
/// Parses markdown files into structured [`MarkdownDocument`]s: extracts
/// YAML front-matter, tracks heading hierarchy, and chunks content for
/// downstream use.
#[derive(Debug, Clone)]
pub struct MarkdownIngester {
    // Tunable chunk-size limits and extension list; see `IngesterConfig`.
    config: IngesterConfig,
}
236
237impl Default for MarkdownIngester {
238 fn default() -> Self {
239 Self::new()
240 }
241}
242
243impl MarkdownIngester {
244 pub fn new() -> Self {
246 Self {
247 config: IngesterConfig::default(),
248 }
249 }
250
    /// Create an ingester with a caller-supplied configuration.
    pub fn with_config(config: IngesterConfig) -> Self {
        Self { config }
    }
255
256 #[instrument(skip(self), fields(path = %path.display()))]
270 pub async fn ingest_file(&self, path: &Path) -> Result<MarkdownDocument> {
271 debug!("Ingesting markdown file");
272
273 let content = fs::read_to_string(path).await.map_err(Error::Io)?;
274
275 let mut document = MarkdownDocument::new(path);
276
277 let (frontmatter, body) = Self::extract_frontmatter(&content);
279
280 if let Some(fm) = frontmatter {
281 document.metadata = fm.clone();
282 if let Some(title) = fm.get("title") {
284 document.title = Some(title.clone());
285 }
286 }
287
288 document.chunks = self.parse_markdown(body);
290
291 if document.title.is_none() {
293 document.title = document
294 .chunks
295 .iter()
296 .find(|c| !c.heading_hierarchy.is_empty())
297 .and_then(|c| c.heading_hierarchy.first().cloned());
298 }
299
300 debug!(chunks = document.chunks.len(), "Ingestion complete");
301
302 Ok(document)
303 }
304
305 #[instrument(skip(self), fields(dir = %dir.display(), recursive))]
317 pub async fn ingest_directory(
318 &self,
319 dir: &Path,
320 recursive: bool,
321 ) -> Result<Vec<MarkdownDocument>> {
322 debug!("Ingesting directory");
323
324 let mut documents = Vec::new();
325 let mut dirs_to_process = vec![dir.to_path_buf()];
326
327 while let Some(current_dir) = dirs_to_process.pop() {
328 let mut entries = fs::read_dir(¤t_dir).await.map_err(Error::Io)?;
329
330 while let Some(entry) = entries.next_entry().await.map_err(Error::Io)? {
331 let path = entry.path();
332 let file_type = entry.file_type().await.map_err(Error::Io)?;
333
334 if file_type.is_dir() {
335 if recursive {
336 dirs_to_process.push(path);
337 }
338 } else if file_type.is_file() && self.is_markdown_file(&path) {
339 match self.ingest_file(&path).await {
340 Ok(doc) => documents.push(doc),
341 Err(e) => {
342 warn!(path = %path.display(), error = %e, "Failed to ingest file");
343 }
344 }
345 }
346 }
347 }
348
349 debug!(count = documents.len(), "Directory ingestion complete");
350
351 Ok(documents)
352 }
353
    /// Split `content` into YAML front-matter metadata and the remaining body.
    ///
    /// Front-matter must start at the very first byte with `---` and end with
    /// a later line beginning `---`. Scalar values are stringified; sequences
    /// of strings are joined with `", "`; other value types are skipped.
    ///
    /// Returns `(None, content)` when there is no front-matter, the closing
    /// delimiter is missing, or the YAML fails to parse (logged as a warning).
    pub fn extract_frontmatter(content: &str) -> (Option<HashMap<String, String>>, &str) {
        if !content.starts_with("---") {
            return (None, content);
        }

        // Safe slice: starts_with("---") guarantees at least 3 bytes.
        let after_first_delimiter = &content[3..];
        // Closing delimiter = "---" at the start of a later line.
        let Some(end_pos) = after_first_delimiter.find("\n---") else {
            return (None, content);
        };

        let yaml_content = after_first_delimiter[..end_pos].trim();

        // 3 bytes for the opening "---" plus 4 for the "\n---" match; any
        // remaining leading newlines (incl. \r on Windows) are trimmed off.
        let body_start = 3 + end_pos + 4;
        let body = if body_start < content.len() {
            content[body_start..].trim_start_matches(['\n', '\r'])
        } else {
            ""
        };

        match serde_yaml::from_str::<serde_yaml::Value>(yaml_content) {
            Ok(yaml) => {
                let mut metadata = HashMap::new();

                // Only top-level string-keyed mappings are collected.
                if let serde_yaml::Value::Mapping(map) = yaml {
                    for (key, value) in map {
                        if let serde_yaml::Value::String(k) = key {
                            let v = match value {
                                serde_yaml::Value::String(s) => s,
                                serde_yaml::Value::Number(n) => n.to_string(),
                                serde_yaml::Value::Bool(b) => b.to_string(),
                                serde_yaml::Value::Sequence(seq) => {
                                    // Keep only string elements; join as CSV.
                                    seq.iter()
                                        .filter_map(|v| match v {
                                            serde_yaml::Value::String(s) => Some(s.as_str()),
                                            _ => None,
                                        })
                                        .collect::<Vec<_>>()
                                        .join(", ")
                                }
                                // Nested maps / null / tagged values: skip.
                                _ => continue,
                            };
                            metadata.insert(k, v);
                        }
                    }
                }

                (Some(metadata), body)
            }
            Err(e) => {
                // Malformed YAML: fall back to treating everything as body.
                warn!(error = %e, "Failed to parse YAML front-matter");
                (None, content)
            }
        }
    }
438
    /// Parse markdown `content` into a flat list of chunks.
    ///
    /// Walks pulldown-cmark's event stream, accumulating prose into
    /// `current_text` and flushing it into a chunk whenever a heading or code
    /// block starts. The heading hierarchy is maintained as a stack keyed by
    /// heading level. The raw chunk list is then merged/split by
    /// `post_process_chunks`.
    fn parse_markdown(&self, content: &str) -> Vec<MarkdownChunk> {
        let mut chunks = Vec::new();
        let mut current_text = String::new();
        // Stack of enclosing heading titles, outermost (H1) first.
        let mut heading_hierarchy: Vec<String> = Vec::new();
        let mut current_heading_text = String::new();
        let mut in_heading = false;
        let mut in_code_block = false;
        let mut code_block_content = String::new();
        let mut code_block_language: Option<String> = None;
        let mut in_list = false;
        let mut in_table = false;

        // Line numbers are approximate: advanced by counting '\n' in text
        // events and breaks rather than taken from the parser's offsets.
        let mut current_line = 1;
        let mut chunk_start_line = 1;

        let options = Options::all();
        let parser = Parser::new_ext(content, options);

        for event in parser {
            match event {
                Event::Start(Tag::Heading { level, .. }) => {
                    // A new heading terminates the chunk being accumulated.
                    if !current_text.trim().is_empty() {
                        let chunk_type = if in_list {
                            ChunkType::List
                        } else if in_table {
                            ChunkType::Table
                        } else {
                            ChunkType::Text
                        };

                        chunks.push(MarkdownChunk {
                            content: current_text.trim().to_string(),
                            chunk_type,
                            heading_hierarchy: heading_hierarchy.clone(),
                            line_range: (chunk_start_line, current_line),
                        });
                        current_text.clear();
                    }

                    in_heading = true;
                    current_heading_text.clear();

                    // Map the heading level to a 0-based stack depth.
                    let level_idx = match level {
                        HeadingLevel::H1 => 0,
                        HeadingLevel::H2 => 1,
                        HeadingLevel::H3 => 2,
                        HeadingLevel::H4 => 3,
                        HeadingLevel::H5 => 4,
                        HeadingLevel::H6 => 5,
                    };

                    // Pop headings at this level and deeper: e.g. a new H2
                    // discards previous H2-H6 entries, keeping only the H1.
                    heading_hierarchy.truncate(level_idx);
                    chunk_start_line = current_line;
                }

                Event::End(TagEnd::Heading(_)) => {
                    in_heading = false;
                    // Push the finished heading title onto the stack.
                    let heading_text = current_heading_text.trim().to_string();
                    if !heading_text.is_empty() {
                        heading_hierarchy.push(heading_text);
                    }
                    current_heading_text.clear();
                }

                Event::Start(Tag::CodeBlock(kind)) => {
                    // Flush accumulated prose before the code block starts.
                    if !current_text.trim().is_empty() {
                        chunks.push(MarkdownChunk {
                            content: current_text.trim().to_string(),
                            chunk_type: ChunkType::Text,
                            heading_hierarchy: heading_hierarchy.clone(),
                            line_range: (chunk_start_line, current_line),
                        });
                        current_text.clear();
                    }

                    in_code_block = true;
                    code_block_content.clear();
                    chunk_start_line = current_line;

                    // Fence info strings may carry attributes ("rust,ignore");
                    // keep only the part before the first comma.
                    code_block_language = match kind {
                        pulldown_cmark::CodeBlockKind::Fenced(lang) => {
                            let lang_str = lang.to_string();
                            if lang_str.is_empty() {
                                None
                            } else {
                                Some(lang_str.split(',').next().unwrap_or(&lang_str).to_string())
                            }
                        }
                        pulldown_cmark::CodeBlockKind::Indented => None,
                    };
                }

                Event::End(TagEnd::CodeBlock) => {
                    if self.config.preserve_code_blocks && !code_block_content.trim().is_empty() {
                        // Emit the code block as its own chunk.
                        chunks.push(MarkdownChunk {
                            content: code_block_content.trim().to_string(),
                            chunk_type: ChunkType::CodeBlock {
                                language: code_block_language.take(),
                            },
                            heading_hierarchy: heading_hierarchy.clone(),
                            line_range: (chunk_start_line, current_line),
                        });
                    } else if !code_block_content.is_empty() {
                        // Not preserving: re-wrap the code in fences and fold
                        // it back into the surrounding prose.
                        current_text.push_str("```");
                        if let Some(ref lang) = code_block_language {
                            current_text.push_str(lang);
                        }
                        current_text.push('\n');
                        current_text.push_str(&code_block_content);
                        current_text.push_str("```\n");
                    }

                    in_code_block = false;
                    code_block_content.clear();
                    code_block_language = None;
                    chunk_start_line = current_line;
                }

                Event::Start(Tag::List(_)) => {
                    in_list = true;
                }

                Event::End(TagEnd::List(_)) => {
                    in_list = false;
                }

                Event::Start(Tag::Table(_)) => {
                    in_table = true;
                }

                Event::End(TagEnd::Table) => {
                    in_table = false;
                }

                Event::Text(text) => {
                    // Advance the approximate line counter.
                    current_line += text.chars().filter(|c| *c == '\n').count();

                    // Route the text to whichever buffer is active.
                    if in_heading {
                        current_heading_text.push_str(&text);
                    } else if in_code_block {
                        code_block_content.push_str(&text);
                    } else {
                        current_text.push_str(&text);
                    }
                }

                Event::Code(code) => {
                    // Inline code spans: re-wrap in backticks for headings
                    // and prose; dropped while inside a code block (inline
                    // code is not expected there — NOTE(review): confirm).
                    if in_heading {
                        current_heading_text.push('`');
                        current_heading_text.push_str(&code);
                        current_heading_text.push('`');
                    } else if !in_code_block {
                        current_text.push('`');
                        current_text.push_str(&code);
                        current_text.push('`');
                    }
                }

                Event::SoftBreak | Event::HardBreak => {
                    current_line += 1;
                    // Breaks inside headings become spaces so the title stays
                    // on one line; elsewhere they stay as newlines.
                    if in_heading {
                        current_heading_text.push(' ');
                    } else if in_code_block {
                        code_block_content.push('\n');
                    } else {
                        current_text.push('\n');
                    }
                }

                Event::Html(html) => {
                    current_line += html.chars().filter(|c| *c == '\n').count();
                    // Raw HTML is kept verbatim in the prose buffer.
                    if !in_code_block && !in_heading {
                        current_text.push_str(&html);
                    }
                }

                // Other events (emphasis markers, footnotes, rules, ...) are
                // ignored; their inner Text events are still captured above.
                _ => {}
            }
        }

        // Flush whatever prose remains after the final event.
        if !current_text.trim().is_empty() {
            let chunk_type = if in_list {
                ChunkType::List
            } else if in_table {
                ChunkType::Table
            } else {
                ChunkType::Text
            };

            chunks.push(MarkdownChunk {
                content: current_text.trim().to_string(),
                chunk_type,
                heading_hierarchy: heading_hierarchy.clone(),
                line_range: (chunk_start_line, current_line),
            });
        }

        self.post_process_chunks(chunks)
    }
661
    /// Merge undersized text chunks with their successors and split oversized
    /// ones; code chunks are never merged (only split when too large).
    ///
    /// `pending` holds the most recent text chunk so it can absorb the next
    /// chunk when it is below `min_chunk_size` and shares the same heading
    /// hierarchy.
    fn post_process_chunks(&self, chunks: Vec<MarkdownChunk>) -> Vec<MarkdownChunk> {
        let mut result = Vec::new();
        let mut pending: Option<MarkdownChunk> = None;

        for chunk in chunks {
            if chunk.is_code() {
                // Code chunk: first flush any pending text chunk...
                if let Some(p) = pending.take() {
                    if p.content.len() > self.config.max_chunk_size {
                        result.extend(self.split_large_chunk(p));
                    } else {
                        result.push(p);
                    }
                }
                // ...then emit the code chunk itself (split only if huge).
                if chunk.content.len() > self.config.max_chunk_size {
                    result.extend(self.split_large_chunk(chunk));
                } else {
                    result.push(chunk);
                }
                continue;
            }

            match pending.take() {
                None => {
                    pending = Some(chunk);
                }
                Some(mut p) => {
                    if p.content.len() < self.config.min_chunk_size {
                        // Merge only within the same section so each chunk's
                        // context string stays accurate.
                        if p.heading_hierarchy == chunk.heading_hierarchy {
                            p.content.push_str("\n\n");
                            p.content.push_str(&chunk.content);
                            p.line_range.1 = chunk.line_range.1;
                            pending = Some(p);
                        } else {
                            // Small but from a different section: emit as-is.
                            result.push(p);
                            pending = Some(chunk);
                        }
                    } else {
                        // Pending chunk is big enough on its own; emit it
                        // (splitting if oversized) and start a new pending.
                        if p.content.len() > self.config.max_chunk_size {
                            result.extend(self.split_large_chunk(p));
                        } else {
                            result.push(p);
                        }
                        pending = Some(chunk);
                    }
                }
            }
        }

        // Flush the final pending chunk.
        if let Some(p) = pending {
            if p.content.len() > self.config.max_chunk_size {
                result.extend(self.split_large_chunk(p));
            } else {
                result.push(p);
            }
        }

        result
    }
729
    /// Split an oversized chunk into pieces no larger than `max_chunk_size`
    /// (best effort), preferring paragraph boundaries and falling back to
    /// sentence boundaries for single paragraphs that are themselves too big.
    ///
    /// Line ranges on the pieces are estimates derived from newline counts;
    /// sentence-level pieces get a nominal one-line range.
    fn split_large_chunk(&self, chunk: MarkdownChunk) -> Vec<MarkdownChunk> {
        let mut result = Vec::new();
        let content = &chunk.content;
        let max_size = self.config.max_chunk_size;

        let paragraphs: Vec<&str> = content.split("\n\n").collect();

        let mut current = String::new();
        let mut current_start = chunk.line_range.0;

        for para in paragraphs {
            if para.len() > max_size {
                // This single paragraph exceeds the limit. Flush whatever has
                // accumulated so far as its own piece first.
                if !current.is_empty() {
                    let lines_in_current = current.chars().filter(|c| *c == '\n').count() + 1;
                    result.push(MarkdownChunk {
                        content: current.clone(),
                        chunk_type: chunk.chunk_type.clone(),
                        heading_hierarchy: chunk.heading_hierarchy.clone(),
                        line_range: (current_start, current_start + lines_in_current),
                    });
                    current_start += lines_in_current;
                    current.clear();
                }

                // Fall back to sentence-level packing within the paragraph.
                // NOTE(review): reconstruction is approximate — a ". " is
                // re-appended to every piece without a trailing '.', so
                // whitespace may differ slightly from the source text.
                let mut para_chunk = String::new();
                for sentence in para.split(". ") {
                    let sentence_with_period = if sentence.ends_with('.') {
                        sentence.to_string()
                    } else {
                        format!("{}. ", sentence)
                    };

                    if para_chunk.len() + sentence_with_period.len() > max_size
                        && !para_chunk.is_empty()
                    {
                        result.push(MarkdownChunk {
                            content: para_chunk.trim().to_string(),
                            chunk_type: chunk.chunk_type.clone(),
                            heading_hierarchy: chunk.heading_hierarchy.clone(),
                            line_range: (current_start, current_start + 1),
                        });
                        para_chunk.clear();
                    }
                    para_chunk.push_str(&sentence_with_period);
                }
                if !para_chunk.is_empty() {
                    result.push(MarkdownChunk {
                        content: para_chunk.trim().to_string(),
                        chunk_type: chunk.chunk_type.clone(),
                        heading_hierarchy: chunk.heading_hierarchy.clone(),
                        line_range: (current_start, current_start + 1),
                    });
                }
                continue;
            }

            // Paragraph fits, but adding it would overflow the current piece
            // (+2 accounts for the "\n\n" separator): flush first.
            if current.len() + para.len() + 2 > max_size && !current.is_empty() {
                let lines_in_current = current.chars().filter(|c| *c == '\n').count() + 1;
                result.push(MarkdownChunk {
                    content: current.clone(),
                    chunk_type: chunk.chunk_type.clone(),
                    heading_hierarchy: chunk.heading_hierarchy.clone(),
                    line_range: (current_start, current_start + lines_in_current),
                });
                current_start += lines_in_current;
                current.clear();
            }

            if !current.is_empty() {
                current.push_str("\n\n");
            }
            current.push_str(para);
        }

        // Final piece inherits the original chunk's end line.
        if !current.is_empty() {
            result.push(MarkdownChunk {
                content: current,
                chunk_type: chunk.chunk_type,
                heading_hierarchy: chunk.heading_hierarchy,
                line_range: (current_start, chunk.line_range.1),
            });
        }

        result
    }
822
823 fn is_markdown_file(&self, path: &Path) -> bool {
825 path.extension()
826 .and_then(|e| e.to_str())
827 .map(|ext| {
828 self.config
829 .markdown_extensions
830 .iter()
831 .any(|m| m.eq_ignore_ascii_case(ext))
832 })
833 .unwrap_or(false)
834 }
835}
836
837#[cfg(test)]
838mod tests {
839 use super::*;
840 use tempfile::TempDir;
841 use tokio::fs::File;
842 use tokio::io::AsyncWriteExt;
843
844 async fn create_temp_file(dir: &TempDir, name: &str, content: &str) -> PathBuf {
846 let path = dir.path().join(name);
847 let mut file = File::create(&path).await.unwrap();
848 file.write_all(content.as_bytes()).await.unwrap();
849 path
850 }
851
    /// Scalar front-matter keys are extracted and the body starts right
    /// after the closing delimiter.
    #[test]
    fn test_extract_frontmatter_basic() {
        let content = r#"---
title: My Document
author: John Doe
date: 2024-01-15
---

# Hello World

This is the body."#;

        let (metadata, body) = MarkdownIngester::extract_frontmatter(content);

        assert!(metadata.is_some(), "Front-matter should be extracted");
        let metadata = metadata.unwrap();

        assert_eq!(metadata.get("title"), Some(&"My Document".to_string()));
        assert_eq!(metadata.get("author"), Some(&"John Doe".to_string()));
        assert_eq!(metadata.get("date"), Some(&"2024-01-15".to_string()));

        assert!(
            body.starts_with("# Hello World"),
            "Body should start with heading"
        );
    }
886
887 #[test]
894 fn test_extract_frontmatter_none() {
895 let content = "# Just a Heading\n\nSome content.";
896
897 let (metadata, body) = MarkdownIngester::extract_frontmatter(content);
898
899 assert!(metadata.is_none(), "No front-matter should be found");
900 assert_eq!(body, content, "Body should be the entire content");
901 }
902
    /// YAML string sequences are flattened into a comma-separated value.
    #[test]
    fn test_extract_frontmatter_arrays() {
        let content = r#"---
title: Tagged Post
tags:
  - rust
  - programming
  - web
---

Content here."#;

        let (metadata, _body) = MarkdownIngester::extract_frontmatter(content);

        let metadata = metadata.expect("Front-matter should be extracted");
        assert_eq!(
            metadata.get("tags"),
            Some(&"rust, programming, web".to_string())
        );
    }
929
    /// A missing closing `---` means no front-matter: the whole input is
    /// treated as body.
    #[test]
    fn test_extract_frontmatter_unclosed() {
        let content = r#"---
title: Broken
author: Nobody

# This has no closing delimiter"#;

        let (metadata, body) = MarkdownIngester::extract_frontmatter(content);

        assert!(metadata.is_none(), "Unclosed front-matter should not parse");
        assert_eq!(body, content, "Body should be entire content");
    }
949
    /// Chunks record the full stack of enclosing headings; a sibling H2
    /// replaces the previous H2 entry rather than nesting under it.
    #[tokio::test]
    async fn test_heading_hierarchy() {
        let temp_dir = TempDir::new().unwrap();
        let content = r#"# Main Title

Intro paragraph.

## Section One

Content in section one.

## Section Two

Content in section two.

### Subsection

Deep content.
"#;

        let path = create_temp_file(&temp_dir, "test.md", content).await;
        let ingester = MarkdownIngester::new();
        let doc = ingester.ingest_file(&path).await.unwrap();

        let section_one = doc
            .chunks
            .iter()
            .find(|c| c.content.to_lowercase().contains("content in section one"))
            .expect("Should find section one content");

        assert_eq!(
            section_one.heading_hierarchy,
            vec!["Main Title", "Section One"],
            "Section one should have correct hierarchy"
        );

        let subsection = doc
            .chunks
            .iter()
            .find(|c| c.content.to_lowercase().contains("deep content"))
            .expect("Should find subsection content");

        assert_eq!(
            subsection.heading_hierarchy,
            vec!["Main Title", "Section Two", "Subsection"],
            "Subsection should have full hierarchy"
        );
    }
1009
    /// With `preserve_code_blocks` (the default), fenced code becomes
    /// standalone chunks tagged with the fence language.
    #[tokio::test]
    async fn test_code_block_preservation() {
        let temp_dir = TempDir::new().unwrap();
        let content = r#"# Code Examples

Here's some Rust code:

```rust
fn main() {
    println!("Hello, world!");
}
```

And some Python:

```python
def hello():
    print("Hello, world!")
```
"#;

        let path = create_temp_file(&temp_dir, "test.md", content).await;
        let ingester = MarkdownIngester::new();
        let doc = ingester.ingest_file(&path).await.unwrap();

        let rust_block = doc.chunks.iter()
            .find(|c| matches!(&c.chunk_type, ChunkType::CodeBlock { language: Some(l) } if l == "rust"))
            .expect("Should find Rust code block");

        assert!(
            rust_block.content.contains("println!"),
            "Rust code should be preserved"
        );

        let python_block = doc.chunks.iter()
            .find(|c| matches!(&c.chunk_type, ChunkType::CodeBlock { language: Some(l) } if l == "python"))
            .expect("Should find Python code block");

        assert!(
            python_block.content.contains("def hello"),
            "Python code should be preserved"
        );
    }
1062
    /// A bare ``` fence yields a code chunk with `language: None`.
    #[tokio::test]
    async fn test_code_block_no_language() {
        let temp_dir = TempDir::new().unwrap();
        let content = r#"# Unlabeled Code

```
some generic code
```
"#;

        let path = create_temp_file(&temp_dir, "test.md", content).await;
        let ingester = MarkdownIngester::new();
        let doc = ingester.ingest_file(&path).await.unwrap();

        let code_block = doc
            .chunks
            .iter()
            .find(|c| matches!(&c.chunk_type, ChunkType::CodeBlock { language: None }))
            .expect("Should find code block without language");

        assert!(code_block.content.contains("generic code"));
    }
1091
    /// The front-matter `title` key takes precedence over the first heading.
    #[tokio::test]
    async fn test_title_from_frontmatter() {
        let temp_dir = TempDir::new().unwrap();
        let content = r#"---
title: Front-matter Title
---

# Heading Title

Content.
"#;

        let path = create_temp_file(&temp_dir, "test.md", content).await;
        let ingester = MarkdownIngester::new();
        let doc = ingester.ingest_file(&path).await.unwrap();

        assert_eq!(
            doc.title,
            Some("Front-matter Title".to_string()),
            "Title should come from front-matter"
        );
    }
1120
    /// Without front-matter, the title falls back to the first heading.
    #[tokio::test]
    async fn test_title_from_heading() {
        let temp_dir = TempDir::new().unwrap();
        let content = r#"# First Heading

Some content here.

## Second Section

More content.
"#;

        let path = create_temp_file(&temp_dir, "test.md", content).await;
        let ingester = MarkdownIngester::new();
        let doc = ingester.ingest_file(&path).await.unwrap();

        assert_eq!(
            doc.title,
            Some("First Heading".to_string()),
            "Title should come from first h1"
        );
    }
1148
1149 #[tokio::test]
1159 async fn test_directory_ingestion_recursive() {
1160 let temp_dir = TempDir::new().unwrap();
1161
1162 let subdir = temp_dir.path().join("subdir");
1164 fs::create_dir(&subdir).await.unwrap();
1165
1166 create_temp_file(&temp_dir, "root.md", "# Root\n\nRoot content.").await;
1167 create_temp_file(&temp_dir, "other.txt", "Not markdown").await;
1168
1169 let sub_path = subdir.join("nested.md");
1171 let mut file = File::create(&sub_path).await.unwrap();
1172 file.write_all(b"# Nested\n\nNested content.")
1173 .await
1174 .unwrap();
1175
1176 let ingester = MarkdownIngester::new();
1177 let docs = ingester
1178 .ingest_directory(temp_dir.path(), true)
1179 .await
1180 .unwrap();
1181
1182 assert_eq!(docs.len(), 2, "Should find 2 markdown files");
1183
1184 let titles: Vec<_> = docs.iter().filter_map(|d| d.title.as_ref()).collect();
1185 assert!(titles.contains(&&"Root".to_string()));
1186 assert!(titles.contains(&&"Nested".to_string()));
1187 }
1188
1189 #[tokio::test]
1196 async fn test_directory_ingestion_non_recursive() {
1197 let temp_dir = TempDir::new().unwrap();
1198
1199 let subdir = temp_dir.path().join("subdir");
1200 fs::create_dir(&subdir).await.unwrap();
1201
1202 create_temp_file(&temp_dir, "root.md", "# Root\n\nContent.").await;
1203
1204 let sub_path = subdir.join("nested.md");
1205 let mut file = File::create(&sub_path).await.unwrap();
1206 file.write_all(b"# Nested\n\nContent.").await.unwrap();
1207
1208 let ingester = MarkdownIngester::new();
1209 let docs = ingester
1210 .ingest_directory(temp_dir.path(), false)
1211 .await
1212 .unwrap();
1213
1214 assert_eq!(docs.len(), 1, "Should find only root markdown file");
1215 assert_eq!(docs[0].title, Some("Root".to_string()));
1216 }
1217
1218 #[tokio::test]
1227 async fn test_small_chunk_merging() {
1228 let temp_dir = TempDir::new().unwrap();
1229 let content = r#"# Section
1230
1231A.
1232
1233B.
1234
1235C.
1236"#;
1237
1238 let path = create_temp_file(&temp_dir, "test.md", content).await;
1239 let mut config = IngesterConfig::default();
1240 config.min_chunk_size = 100; let ingester = MarkdownIngester::with_config(config);
1243 let doc = ingester.ingest_file(&path).await.unwrap();
1244
1245 assert!(
1247 doc.chunks.len() <= 2, "Small chunks should be merged"
1249 );
1250 }
1251
1252 #[tokio::test]
1259 async fn test_large_chunk_splitting() {
1260 let temp_dir = TempDir::new().unwrap();
1261
1262 let long_paragraph = "This is a test paragraph. ".repeat(200);
1264 let content = format!(
1265 "# Large Document\n\n{}\n\n{}\n\n{}",
1266 long_paragraph, long_paragraph, long_paragraph
1267 );
1268
1269 let path = create_temp_file(&temp_dir, "test.md", &content).await;
1270 let mut config = IngesterConfig::default();
1271 config.max_chunk_size = 500;
1272 let max_chunk_size = config.max_chunk_size;
1273
1274 let ingester = MarkdownIngester::with_config(config);
1275 let doc = ingester.ingest_file(&path).await.unwrap();
1276
1277 for chunk in &doc.chunks {
1279 assert!(
1281 chunk.content.len() <= max_chunk_size + 200,
1282 "Chunk should not greatly exceed max size: {} > {}",
1283 chunk.content.len(),
1284 max_chunk_size
1285 );
1286 }
1287 }
1288
1289 #[tokio::test]
1298 async fn test_empty_file() {
1299 let temp_dir = TempDir::new().unwrap();
1300 let path = create_temp_file(&temp_dir, "empty.md", "").await;
1301
1302 let ingester = MarkdownIngester::new();
1303 let doc = ingester.ingest_file(&path).await.unwrap();
1304
1305 assert!(doc.chunks.is_empty(), "Empty file should have no chunks");
1306 assert!(doc.title.is_none(), "Empty file should have no title");
1307 }
1308
    /// A file containing only front-matter yields metadata but no chunks.
    #[tokio::test]
    async fn test_frontmatter_only() {
        let temp_dir = TempDir::new().unwrap();
        let content = r#"---
title: Metadata Only
author: Test
---
"#;
        let path = create_temp_file(&temp_dir, "meta.md", content).await;

        let ingester = MarkdownIngester::new();
        let doc = ingester.ingest_file(&path).await.unwrap();

        assert_eq!(doc.title, Some("Metadata Only".to_string()));
        assert_eq!(doc.metadata.get("author"), Some(&"Test".to_string()));
        assert!(doc.chunks.is_empty(), "Should have no content chunks");
    }
1332
    /// Inline code spans in headings keep their backticks in the hierarchy.
    #[tokio::test]
    async fn test_inline_code_in_heading() {
        let temp_dir = TempDir::new().unwrap();
        let content = r#"# Using `async/await` in Rust

Some explanation here.
"#;
        let path = create_temp_file(&temp_dir, "test.md", content).await;

        let ingester = MarkdownIngester::new();
        let doc = ingester.ingest_file(&path).await.unwrap();

        assert!(
            doc.chunks.iter().any(|c| c
                .heading_hierarchy
                .iter()
                .any(|h| h.contains("`async/await`"))),
            "Heading should preserve inline code"
        );
    }
1359
1360 #[test]
1367 fn test_context_string() {
1368 let chunk_with_hierarchy = MarkdownChunk {
1369 content: "Test".to_string(),
1370 chunk_type: ChunkType::Text,
1371 heading_hierarchy: vec!["Main".to_string(), "Section".to_string()],
1372 line_range: (1, 5),
1373 };
1374
1375 assert_eq!(chunk_with_hierarchy.context_string(), "Main > Section");
1376
1377 let chunk_no_hierarchy = MarkdownChunk {
1378 content: "Test".to_string(),
1379 chunk_type: ChunkType::Text,
1380 heading_hierarchy: vec![],
1381 line_range: (1, 5),
1382 };
1383
1384 assert_eq!(chunk_no_hierarchy.context_string(), "Document root");
1385 }
1386
1387 #[test]
1394 fn test_document_helpers() {
1395 let mut doc = MarkdownDocument::new("/test.md");
1396 doc.chunks = vec![
1397 MarkdownChunk::text("First text", vec![], (1, 2)),
1398 MarkdownChunk::code_block("let x = 1;", Some("rust".to_string()), vec![], (3, 5)),
1399 MarkdownChunk::text("Second text", vec![], (6, 7)),
1400 ];
1401
1402 let full = doc.full_text();
1403 assert!(full.contains("First text"));
1404 assert!(full.contains("let x = 1;"));
1405 assert!(full.contains("Second text"));
1406
1407 assert_eq!(doc.text_chunks().count(), 2);
1408 assert_eq!(doc.code_chunks().count(), 1);
1409 }
1410
    /// Fence attributes after a comma ("rust,ignore") are stripped from the
    /// detected language.
    #[tokio::test]
    async fn test_code_language_with_attributes() {
        let temp_dir = TempDir::new().unwrap();
        let content = r#"# Test

```rust,ignore
fn example() {}
```
"#;
        let path = create_temp_file(&temp_dir, "test.md", content).await;

        let ingester = MarkdownIngester::new();
        let doc = ingester.ingest_file(&path).await.unwrap();

        let code_chunk = doc.code_chunks().next().expect("Should have code chunk");
        assert_eq!(code_chunk.code_language(), Some("rust"));
    }
1433
1434 #[test]
1441 fn test_markdown_extension_recognition() {
1442 let ingester = MarkdownIngester::new();
1443
1444 assert!(ingester.is_markdown_file(Path::new("test.md")));
1445 assert!(ingester.is_markdown_file(Path::new("test.markdown")));
1446 assert!(ingester.is_markdown_file(Path::new("test.mdx")));
1447 assert!(ingester.is_markdown_file(Path::new("test.MD"))); assert!(!ingester.is_markdown_file(Path::new("test.txt")));
1450 assert!(!ingester.is_markdown_file(Path::new("test.rs")));
1451 assert!(!ingester.is_markdown_file(Path::new("noextension")));
1452 }
1453}