// converge_knowledge/ingest/markdown.rs
//! Markdown ingestion module for parsing and chunking markdown documents.
//!
//! This module provides functionality to:
//! - Parse markdown files using pulldown-cmark
//! - Extract YAML front-matter metadata
//! - Chunk content by headers (h1, h2, h3 sections)
//! - Preserve code blocks with language tags
//! - Return structured chunks ready for embedding
//!
//! # Example
//!
//! ```rust,no_run
//! use converge_knowledge::ingest::{MarkdownIngester, MarkdownDocument};
//! use std::path::Path;
//!
//! #[tokio::main]
//! async fn main() -> anyhow::Result<()> {
//!     let ingester = MarkdownIngester::new();
//!
//!     // Ingest a single file
//!     let doc = ingester.ingest_file(Path::new("README.md")).await?;
//!     println!("Found {} chunks", doc.chunks.len());
//!
//!     // Ingest a directory recursively
//!     let docs = ingester.ingest_directory(Path::new("docs"), true).await?;
//!     println!("Ingested {} documents", docs.len());
//!
//!     Ok(())
//! }
//! ```

use std::collections::HashMap;
use std::path::{Path, PathBuf};

use pulldown_cmark::{Event, HeadingLevel, Options, Parser, Tag, TagEnd};
use serde::{Deserialize, Serialize};
use tokio::fs;
use tracing::{debug, instrument, warn};

use crate::error::{Error, Result};
40
/// A parsed markdown document with extracted metadata and content chunks.
///
/// This structure represents a fully processed markdown file, including:
/// - The original file path for reference
/// - An optional title extracted from the first h1 heading or front-matter
/// - Metadata from YAML front-matter
/// - Content broken into semantic chunks for embedding
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MarkdownDocument {
    /// Path to the source markdown file.
    pub path: PathBuf,

    /// Document title, extracted from front-matter (`title:` key) or,
    /// failing that, the outermost heading of the first headed chunk.
    pub title: Option<String>,

    /// Metadata extracted from YAML front-matter, flattened to string values.
    /// Common keys include: author, date, tags, description, etc.
    pub metadata: HashMap<String, String>,

    /// Content chunks, each representing a semantic unit of the document,
    /// in source order.
    pub chunks: Vec<MarkdownChunk>,
}
63
64impl MarkdownDocument {
65    /// Create a new empty document for the given path.
66    pub fn new(path: impl Into<PathBuf>) -> Self {
67        Self {
68            path: path.into(),
69            title: None,
70            metadata: HashMap::new(),
71            chunks: Vec::new(),
72        }
73    }
74
75    /// Get all text content concatenated (useful for full-document embedding).
76    pub fn full_text(&self) -> String {
77        self.chunks
78            .iter()
79            .map(|c| c.content.as_str())
80            .collect::<Vec<_>>()
81            .join("\n\n")
82    }
83
84    /// Get only text chunks (excluding code blocks).
85    pub fn text_chunks(&self) -> impl Iterator<Item = &MarkdownChunk> {
86        self.chunks
87            .iter()
88            .filter(|c| c.chunk_type == ChunkType::Text)
89    }
90
91    /// Get only code block chunks.
92    pub fn code_chunks(&self) -> impl Iterator<Item = &MarkdownChunk> {
93        self.chunks
94            .iter()
95            .filter(|c| matches!(c.chunk_type, ChunkType::CodeBlock { .. }))
96    }
97}
98
/// A single chunk of content from a markdown document.
///
/// Chunks are created by splitting the document at heading boundaries.
/// Each chunk preserves its position in the document hierarchy and
/// the line range it covers in the source file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MarkdownChunk {
    /// The text content of this chunk (trimmed of surrounding whitespace).
    pub content: String,

    /// The type of content this chunk contains.
    pub chunk_type: ChunkType,

    /// The heading hierarchy leading to this chunk.
    /// For example, `["Introduction", "Getting Started"]` means this chunk
    /// is under an h1 "Introduction" and h2 "Getting Started".
    pub heading_hierarchy: Vec<String>,

    /// The line range in the source file (1-indexed, inclusive).
    /// `(start_line, end_line)` where both are inclusive.
    /// NOTE(review): line numbers are tracked heuristically during parsing
    /// (newline counting), so ranges are approximate — see `parse_markdown`.
    pub line_range: (usize, usize),
}
121
122impl MarkdownChunk {
123    /// Create a new text chunk.
124    pub fn text(
125        content: impl Into<String>,
126        heading_hierarchy: Vec<String>,
127        line_range: (usize, usize),
128    ) -> Self {
129        Self {
130            content: content.into(),
131            chunk_type: ChunkType::Text,
132            heading_hierarchy,
133            line_range,
134        }
135    }
136
137    /// Create a new code block chunk.
138    pub fn code_block(
139        content: impl Into<String>,
140        language: Option<String>,
141        heading_hierarchy: Vec<String>,
142        line_range: (usize, usize),
143    ) -> Self {
144        Self {
145            content: content.into(),
146            chunk_type: ChunkType::CodeBlock { language },
147            heading_hierarchy,
148            line_range,
149        }
150    }
151
152    /// Check if this chunk is a code block.
153    pub fn is_code(&self) -> bool {
154        matches!(self.chunk_type, ChunkType::CodeBlock { .. })
155    }
156
157    /// Get the language of a code block, if applicable.
158    pub fn code_language(&self) -> Option<&str> {
159        match &self.chunk_type {
160            ChunkType::CodeBlock { language } => language.as_deref(),
161            _ => None,
162        }
163    }
164
165    /// Get a context string describing where this chunk is in the document.
166    pub fn context_string(&self) -> String {
167        if self.heading_hierarchy.is_empty() {
168            "Document root".to_string()
169        } else {
170            self.heading_hierarchy.join(" > ")
171        }
172    }
173}
174
/// The type of content a chunk contains.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChunkType {
    /// Regular text content (paragraphs, inline code, etc.).
    Text,

    /// A fenced code block with optional language specification.
    CodeBlock {
        /// The programming language, if specified (e.g., "rust", "python").
        /// For fences like "rust,ignore" only the part before the comma is kept.
        language: Option<String>,
    },

    /// A list (ordered or unordered).
    List,

    /// A table.
    Table,
}
193
/// Configuration options for the markdown ingester.
#[derive(Debug, Clone)]
pub struct IngesterConfig {
    /// Minimum chunk size in characters. Adjacent smaller text chunks that
    /// share a heading context will be merged.
    pub min_chunk_size: usize,

    /// Maximum chunk size in characters. Larger chunks will be split at
    /// paragraph (and then sentence) boundaries.
    pub max_chunk_size: usize,

    /// Whether to preserve code blocks as separate chunks.
    /// When `false`, code is re-fenced and folded into the surrounding text.
    pub preserve_code_blocks: bool,

    /// Whether to include front-matter in the output.
    pub include_frontmatter: bool,

    /// File extensions (without the dot, matched case-insensitively)
    /// to consider as markdown.
    pub markdown_extensions: Vec<String>,
}
212
213impl Default for IngesterConfig {
214    fn default() -> Self {
215        Self {
216            min_chunk_size: 50,
217            max_chunk_size: 4000,
218            preserve_code_blocks: true,
219            include_frontmatter: true,
220            markdown_extensions: vec!["md".to_string(), "markdown".to_string(), "mdx".to_string()],
221        }
222    }
223}
224
/// Markdown file ingester that parses and chunks markdown documents.
///
/// The ingester handles:
/// - YAML front-matter extraction
/// - Content chunking by headings
/// - Code block preservation with language tags
/// - Recursive directory traversal
#[derive(Debug, Clone)]
pub struct MarkdownIngester {
    // Chunk-size limits, code-block handling, and recognised extensions.
    config: IngesterConfig,
}
236
237impl Default for MarkdownIngester {
238    fn default() -> Self {
239        Self::new()
240    }
241}
242
impl MarkdownIngester {
    /// Create a new markdown ingester with default configuration.
    pub fn new() -> Self {
        Self {
            config: IngesterConfig::default(),
        }
    }

    /// Create a new markdown ingester with custom configuration.
    pub fn with_config(config: IngesterConfig) -> Self {
        Self { config }
    }

    /// Ingest a single markdown file.
    ///
    /// This method reads the file, extracts front-matter, parses the markdown,
    /// and returns a structured document with chunks.
    ///
    /// # Arguments
    /// * `path` - Path to the markdown file
    ///
    /// # Returns
    /// A `MarkdownDocument` containing the parsed content and metadata
    ///
    /// # Errors
    /// Returns an error if the file cannot be read or is not valid UTF-8
    #[instrument(skip(self), fields(path = %path.display()))]
    pub async fn ingest_file(&self, path: &Path) -> Result<MarkdownDocument> {
        debug!("Ingesting markdown file");

        let content = fs::read_to_string(path).await.map_err(Error::Io)?;

        let mut document = MarkdownDocument::new(path);

        // Extract front-matter and get the remaining content
        let (frontmatter, body) = Self::extract_frontmatter(&content);

        if let Some(fm) = frontmatter {
            document.metadata = fm.clone();
            // Try to extract title from front-matter
            if let Some(title) = fm.get("title") {
                document.title = Some(title.clone());
            }
        }

        // Parse the markdown body and create chunks
        document.chunks = self.parse_markdown(body);

        // If no title from front-matter, try to get it from first h1
        // NOTE(review): this takes the OUTERMOST heading of the first headed
        // chunk; if the document starts below h1 the title may not actually
        // be an h1 — confirm this fallback is acceptable for callers.
        if document.title.is_none() {
            document.title = document
                .chunks
                .iter()
                .find(|c| !c.heading_hierarchy.is_empty())
                .and_then(|c| c.heading_hierarchy.first().cloned());
        }

        debug!(chunks = document.chunks.len(), "Ingestion complete");

        Ok(document)
    }

    /// Ingest all markdown files in a directory.
    ///
    /// Traversal is iterative (an explicit stack of directories) rather than
    /// recursive, so deep trees cannot overflow the stack and the async fn
    /// stays `Send`-friendly. Files that fail to ingest are logged via
    /// `warn!` and skipped rather than aborting the whole traversal.
    ///
    /// # Arguments
    /// * `dir` - Path to the directory
    /// * `recursive` - Whether to recursively traverse subdirectories
    ///
    /// # Returns
    /// A vector of `MarkdownDocument` for each markdown file found.
    /// NOTE(review): the stack is LIFO, so document order is neither sorted
    /// nor strictly breadth/depth-first — don't rely on ordering.
    ///
    /// # Errors
    /// Returns an error if the directory cannot be read
    #[instrument(skip(self), fields(dir = %dir.display(), recursive))]
    pub async fn ingest_directory(
        &self,
        dir: &Path,
        recursive: bool,
    ) -> Result<Vec<MarkdownDocument>> {
        debug!("Ingesting directory");

        let mut documents = Vec::new();
        let mut dirs_to_process = vec![dir.to_path_buf()];

        while let Some(current_dir) = dirs_to_process.pop() {
            let mut entries = fs::read_dir(&current_dir).await.map_err(Error::Io)?;

            while let Some(entry) = entries.next_entry().await.map_err(Error::Io)? {
                let path = entry.path();
                let file_type = entry.file_type().await.map_err(Error::Io)?;

                if file_type.is_dir() {
                    if recursive {
                        dirs_to_process.push(path);
                    }
                } else if file_type.is_file() && self.is_markdown_file(&path) {
                    match self.ingest_file(&path).await {
                        Ok(doc) => documents.push(doc),
                        Err(e) => {
                            // Best-effort: a single unreadable file should not
                            // abort the whole directory ingestion.
                            warn!(path = %path.display(), error = %e, "Failed to ingest file");
                        }
                    }
                }
            }
        }

        debug!(count = documents.len(), "Directory ingestion complete");

        Ok(documents)
    }

    /// Extract YAML front-matter from markdown content.
    ///
    /// Front-matter is expected to be at the very beginning of the file,
    /// delimited by `---` lines. For example:
    ///
    /// ```text
    /// ---
    /// title: My Document
    /// author: John Doe
    /// date: 2024-01-15
    /// ---
    ///
    /// # Document content starts here
    /// ```
    ///
    /// Scalar YAML values (strings, numbers, bools) are flattened to strings;
    /// string sequences become comma-separated strings; other value types
    /// (nested mappings, etc.) are silently dropped.
    ///
    /// # Arguments
    /// * `content` - The full markdown file content
    ///
    /// # Returns
    /// A tuple of (optional metadata HashMap, remaining content after front-matter).
    /// On any failure (no front-matter, unterminated delimiter, invalid YAML)
    /// the original content is returned unchanged with `None` metadata.
    pub fn extract_frontmatter(content: &str) -> (Option<HashMap<String, String>>, &str) {
        // Front-matter must start at the very beginning with ---
        // NOTE(review): this also accepts lines like `----` or `---foo` as the
        // opening delimiter since only the prefix is checked — confirm whether
        // stricter matching (exactly `---\n`) is wanted.
        if !content.starts_with("---") {
            return (None, content);
        }

        // Find the closing ---
        // Skip the first --- and find the next one
        // NOTE(review): `find("\n---")` matches the first subsequent line that
        // *starts with* `---`, so a YAML value containing such a line would
        // terminate front-matter early.
        let after_first_delimiter = &content[3..];
        let Some(end_pos) = after_first_delimiter.find("\n---") else {
            // No closing delimiter found
            return (None, content);
        };

        // Extract the YAML content (between the delimiters)
        let yaml_content = after_first_delimiter[..end_pos].trim();

        // Find where the body starts (after the closing ---)
        let body_start = 3 + end_pos + 4; // 3 for initial ---, +4 for \n---
        let body = if body_start < content.len() {
            // Skip any leading newlines after front-matter
            content[body_start..].trim_start_matches(['\n', '\r'])
        } else {
            ""
        };

        // Parse the YAML
        match serde_yaml::from_str::<serde_yaml::Value>(yaml_content) {
            Ok(yaml) => {
                let mut metadata = HashMap::new();

                // Convert YAML mapping to HashMap<String, String>
                if let serde_yaml::Value::Mapping(map) = yaml {
                    for (key, value) in map {
                        // Only string keys are kept; non-string keys are dropped.
                        if let serde_yaml::Value::String(k) = key {
                            let v = match value {
                                serde_yaml::Value::String(s) => s,
                                serde_yaml::Value::Number(n) => n.to_string(),
                                serde_yaml::Value::Bool(b) => b.to_string(),
                                serde_yaml::Value::Sequence(seq) => {
                                    // Convert arrays to comma-separated strings
                                    // (non-string elements are skipped)
                                    seq.iter()
                                        .filter_map(|v| match v {
                                            serde_yaml::Value::String(s) => Some(s.as_str()),
                                            _ => None,
                                        })
                                        .collect::<Vec<_>>()
                                        .join(", ")
                                }
                                // Nulls, mappings, etc. have no flat string form.
                                _ => continue,
                            };
                            metadata.insert(k, v);
                        }
                    }
                }

                (Some(metadata), body)
            }
            Err(e) => {
                // Malformed YAML: treat the whole file as body rather than
                // failing ingestion.
                warn!(error = %e, "Failed to parse YAML front-matter");
                (None, content)
            }
        }
    }

    /// Parse markdown content and return chunks.
    ///
    /// The parsing strategy:
    /// 1. Track heading hierarchy (h1, h2, h3)
    /// 2. Accumulate content until the next heading
    /// 3. Create separate chunks for code blocks if configured
    /// 4. Track line numbers for each chunk
    ///
    /// NOTE(review): pulldown-cmark events are consumed without source
    /// offsets here; `current_line` only advances on newline-bearing
    /// text/HTML events and soft/hard breaks, so chunk `line_range` values
    /// are approximations, not exact source positions.
    fn parse_markdown(&self, content: &str) -> Vec<MarkdownChunk> {
        let mut chunks = Vec::new();
        let mut current_text = String::new();
        let mut heading_hierarchy: Vec<String> = Vec::new();
        let mut current_heading_text = String::new();
        let mut in_heading = false;
        let mut in_code_block = false;
        let mut code_block_content = String::new();
        let mut code_block_language: Option<String> = None;
        let mut in_list = false;
        let mut in_table = false;

        // Track line numbers
        let mut current_line = 1;
        let mut chunk_start_line = 1;

        // Enable all parsing options for maximum fidelity
        let options = Options::all();
        let parser = Parser::new_ext(content, options);

        for event in parser {
            match event {
                Event::Start(Tag::Heading { level, .. }) => {
                    // Before starting a new heading, save any accumulated content
                    if !current_text.trim().is_empty() {
                        // Classify by whichever container we are still inside.
                        let chunk_type = if in_list {
                            ChunkType::List
                        } else if in_table {
                            ChunkType::Table
                        } else {
                            ChunkType::Text
                        };

                        chunks.push(MarkdownChunk {
                            content: current_text.trim().to_string(),
                            chunk_type,
                            heading_hierarchy: heading_hierarchy.clone(),
                            line_range: (chunk_start_line, current_line),
                        });
                        current_text.clear();
                    }

                    in_heading = true;
                    current_heading_text.clear();

                    // Adjust heading hierarchy based on level
                    // h1 = level 0, h2 = level 1, h3 = level 2
                    let level_idx = match level {
                        HeadingLevel::H1 => 0,
                        HeadingLevel::H2 => 1,
                        HeadingLevel::H3 => 2,
                        HeadingLevel::H4 => 3,
                        HeadingLevel::H5 => 4,
                        HeadingLevel::H6 => 5,
                    };

                    // Truncate hierarchy to this level
                    // (the heading text itself is pushed at TagEnd::Heading)
                    heading_hierarchy.truncate(level_idx);
                    chunk_start_line = current_line;
                }

                Event::End(TagEnd::Heading(_)) => {
                    in_heading = false;
                    // Add the heading text to hierarchy
                    let heading_text = current_heading_text.trim().to_string();
                    if !heading_text.is_empty() {
                        heading_hierarchy.push(heading_text);
                    }
                    current_heading_text.clear();
                }

                Event::Start(Tag::CodeBlock(kind)) => {
                    // Save any accumulated text before the code block
                    if !current_text.trim().is_empty() {
                        chunks.push(MarkdownChunk {
                            content: current_text.trim().to_string(),
                            chunk_type: ChunkType::Text,
                            heading_hierarchy: heading_hierarchy.clone(),
                            line_range: (chunk_start_line, current_line),
                        });
                        current_text.clear();
                    }

                    in_code_block = true;
                    code_block_content.clear();
                    chunk_start_line = current_line;

                    // Extract language from code fence
                    code_block_language = match kind {
                        pulldown_cmark::CodeBlockKind::Fenced(lang) => {
                            let lang_str = lang.to_string();
                            if lang_str.is_empty() {
                                None
                            } else {
                                // Handle cases like "rust,ignore" - take just the language
                                Some(lang_str.split(',').next().unwrap_or(&lang_str).to_string())
                            }
                        }
                        pulldown_cmark::CodeBlockKind::Indented => None,
                    };
                }

                Event::End(TagEnd::CodeBlock) => {
                    if self.config.preserve_code_blocks && !code_block_content.trim().is_empty() {
                        // Emit the code block as its own chunk.
                        chunks.push(MarkdownChunk {
                            content: code_block_content.trim().to_string(),
                            chunk_type: ChunkType::CodeBlock {
                                language: code_block_language.take(),
                            },
                            heading_hierarchy: heading_hierarchy.clone(),
                            line_range: (chunk_start_line, current_line),
                        });
                    } else if !code_block_content.is_empty() {
                        // Include code in regular text flow, re-fenced so the
                        // text chunk still reads as markdown.
                        current_text.push_str("```");
                        if let Some(ref lang) = code_block_language {
                            current_text.push_str(lang);
                        }
                        current_text.push('\n');
                        current_text.push_str(&code_block_content);
                        current_text.push_str("```\n");
                    }

                    in_code_block = false;
                    code_block_content.clear();
                    code_block_language = None;
                    chunk_start_line = current_line;
                }

                Event::Start(Tag::List(_)) => {
                    in_list = true;
                }

                Event::End(TagEnd::List(_)) => {
                    in_list = false;
                }

                Event::Start(Tag::Table(_)) => {
                    in_table = true;
                }

                Event::End(TagEnd::Table) => {
                    in_table = false;
                }

                Event::Text(text) => {
                    // Count newlines in the text for line tracking
                    current_line += text.chars().filter(|c| *c == '\n').count();

                    // Route text to whichever accumulator is active.
                    if in_heading {
                        current_heading_text.push_str(&text);
                    } else if in_code_block {
                        code_block_content.push_str(&text);
                    } else {
                        current_text.push_str(&text);
                    }
                }

                Event::Code(code) => {
                    // Inline code: re-wrap in backticks since the parser
                    // strips the delimiters.
                    if in_heading {
                        current_heading_text.push('`');
                        current_heading_text.push_str(&code);
                        current_heading_text.push('`');
                    } else if !in_code_block {
                        current_text.push('`');
                        current_text.push_str(&code);
                        current_text.push('`');
                    }
                }

                Event::SoftBreak | Event::HardBreak => {
                    current_line += 1;
                    if in_heading {
                        // Headings stay on one logical line.
                        current_heading_text.push(' ');
                    } else if in_code_block {
                        code_block_content.push('\n');
                    } else {
                        current_text.push('\n');
                    }
                }

                Event::Html(html) => {
                    // Include HTML as-is
                    current_line += html.chars().filter(|c| *c == '\n').count();
                    if !in_code_block && !in_heading {
                        current_text.push_str(&html);
                    }
                }

                // Other events (emphasis markers, footnotes, etc.) carry no
                // text of their own and are ignored.
                _ => {}
            }
        }

        // Don't forget any remaining content
        if !current_text.trim().is_empty() {
            let chunk_type = if in_list {
                ChunkType::List
            } else if in_table {
                ChunkType::Table
            } else {
                ChunkType::Text
            };

            chunks.push(MarkdownChunk {
                content: current_text.trim().to_string(),
                chunk_type,
                heading_hierarchy: heading_hierarchy.clone(),
                line_range: (chunk_start_line, current_line),
            });
        }

        // Post-process: merge small chunks and split large ones
        self.post_process_chunks(chunks)
    }

    /// Post-process chunks to respect size constraints.
    ///
    /// Adjacent non-code chunks are merged while the accumulated chunk is
    /// below `min_chunk_size` AND both share the same heading hierarchy;
    /// any chunk exceeding `max_chunk_size` is split via
    /// `split_large_chunk`. Code blocks are never merged.
    fn post_process_chunks(&self, chunks: Vec<MarkdownChunk>) -> Vec<MarkdownChunk> {
        let mut result = Vec::new();
        // Holds the current text chunk that may still absorb a neighbour.
        let mut pending: Option<MarkdownChunk> = None;

        for chunk in chunks {
            // Don't merge code blocks
            if chunk.is_code() {
                // Flush any pending text chunk
                if let Some(p) = pending.take() {
                    if p.content.len() > self.config.max_chunk_size {
                        result.extend(self.split_large_chunk(p));
                    } else {
                        result.push(p);
                    }
                }
                // Add code block as-is (or split if too large)
                if chunk.content.len() > self.config.max_chunk_size {
                    result.extend(self.split_large_chunk(chunk));
                } else {
                    result.push(chunk);
                }
                continue;
            }

            match pending.take() {
                None => {
                    pending = Some(chunk);
                }
                Some(mut p) => {
                    // If pending chunk is too small, try to merge
                    if p.content.len() < self.config.min_chunk_size {
                        // Only merge if they share the same heading context
                        if p.heading_hierarchy == chunk.heading_hierarchy {
                            p.content.push_str("\n\n");
                            p.content.push_str(&chunk.content);
                            // Extend the merged chunk's range to cover both.
                            p.line_range.1 = chunk.line_range.1;
                            pending = Some(p);
                        } else {
                            // Different context, keep small chunk as-is
                            result.push(p);
                            pending = Some(chunk);
                        }
                    } else {
                        // Pending chunk is big enough, push it
                        if p.content.len() > self.config.max_chunk_size {
                            result.extend(self.split_large_chunk(p));
                        } else {
                            result.push(p);
                        }
                        pending = Some(chunk);
                    }
                }
            }
        }

        // Don't forget the last pending chunk
        if let Some(p) = pending {
            if p.content.len() > self.config.max_chunk_size {
                result.extend(self.split_large_chunk(p));
            } else {
                result.push(p);
            }
        }

        result
    }

    /// Split a large chunk into smaller pieces.
    ///
    /// Splits at paragraph boundaries (`\n\n`) first; any single paragraph
    /// still larger than `max_chunk_size` is further split at `". "`
    /// sentence boundaries.
    ///
    /// NOTE(review): the sentence splitter re-appends ". " to fragments, so
    /// trailing whitespace/punctuation of oversized paragraphs may not be
    /// byte-identical to the source; line ranges of split pieces are
    /// estimates derived from newline counts. Confirm this is acceptable
    /// for downstream citation use.
    fn split_large_chunk(&self, chunk: MarkdownChunk) -> Vec<MarkdownChunk> {
        let mut result = Vec::new();
        let content = &chunk.content;
        let max_size = self.config.max_chunk_size;

        // Try to split at paragraph boundaries first
        let paragraphs: Vec<&str> = content.split("\n\n").collect();

        let mut current = String::new();
        let mut current_start = chunk.line_range.0;

        for para in paragraphs {
            // If a single paragraph exceeds max_size, split it at sentence boundaries
            if para.len() > max_size {
                // Save any accumulated content first
                if !current.is_empty() {
                    let lines_in_current = current.chars().filter(|c| *c == '\n').count() + 1;
                    result.push(MarkdownChunk {
                        content: current.clone(),
                        chunk_type: chunk.chunk_type.clone(),
                        heading_hierarchy: chunk.heading_hierarchy.clone(),
                        line_range: (current_start, current_start + lines_in_current),
                    });
                    current_start += lines_in_current;
                    current.clear();
                }

                // Split large paragraph at sentence boundaries (. followed by space)
                let mut para_chunk = String::new();
                for sentence in para.split(". ") {
                    let sentence_with_period = if sentence.ends_with('.') {
                        sentence.to_string()
                    } else {
                        format!("{}. ", sentence)
                    };

                    // Flush before overflowing max_size (never emit empty).
                    if para_chunk.len() + sentence_with_period.len() > max_size
                        && !para_chunk.is_empty()
                    {
                        result.push(MarkdownChunk {
                            content: para_chunk.trim().to_string(),
                            chunk_type: chunk.chunk_type.clone(),
                            heading_hierarchy: chunk.heading_hierarchy.clone(),
                            line_range: (current_start, current_start + 1),
                        });
                        para_chunk.clear();
                    }
                    para_chunk.push_str(&sentence_with_period);
                }
                if !para_chunk.is_empty() {
                    result.push(MarkdownChunk {
                        content: para_chunk.trim().to_string(),
                        chunk_type: chunk.chunk_type.clone(),
                        heading_hierarchy: chunk.heading_hierarchy.clone(),
                        line_range: (current_start, current_start + 1),
                    });
                }
                continue;
            }

            // +2 accounts for the "\n\n" separator that would be inserted.
            if current.len() + para.len() + 2 > max_size && !current.is_empty() {
                // Save current chunk
                let lines_in_current = current.chars().filter(|c| *c == '\n').count() + 1;
                result.push(MarkdownChunk {
                    content: current.clone(),
                    chunk_type: chunk.chunk_type.clone(),
                    heading_hierarchy: chunk.heading_hierarchy.clone(),
                    line_range: (current_start, current_start + lines_in_current),
                });
                current_start += lines_in_current;
                current.clear();
            }

            if !current.is_empty() {
                current.push_str("\n\n");
            }
            current.push_str(para);
        }

        // Don't forget the last piece
        if !current.is_empty() {
            result.push(MarkdownChunk {
                content: current,
                chunk_type: chunk.chunk_type,
                heading_hierarchy: chunk.heading_hierarchy,
                line_range: (current_start, chunk.line_range.1),
            });
        }

        result
    }

    /// Check if a path is a markdown file based on extension.
    ///
    /// Comparison against `config.markdown_extensions` is ASCII
    /// case-insensitive, so `README.MD` matches. Paths without an
    /// extension never match.
    fn is_markdown_file(&self, path: &Path) -> bool {
        path.extension()
            .and_then(|e| e.to_str())
            .map(|ext| {
                self.config
                    .markdown_extensions
                    .iter()
                    .any(|m| m.eq_ignore_ascii_case(ext))
            })
            .unwrap_or(false)
    }
}
836
837#[cfg(test)]
838mod tests {
839    use super::*;
840    use tempfile::TempDir;
841    use tokio::fs::File;
842    use tokio::io::AsyncWriteExt;
843
844    /// Helper to create a temp file with content
845    async fn create_temp_file(dir: &TempDir, name: &str, content: &str) -> PathBuf {
846        let path = dir.path().join(name);
847        let mut file = File::create(&path).await.unwrap();
848        file.write_all(content.as_bytes()).await.unwrap();
849        path
850    }
851
852    // ==================== Front-matter Extraction Tests ====================
853
854    /// Test: Front-matter is correctly extracted from a markdown file.
855    ///
856    /// What happens:
857    /// 1. The content starts with `---` followed by YAML
858    /// 2. The YAML is parsed into a HashMap<String, String>
859    /// 3. The body after the closing `---` is returned separately
860    #[test]
861    fn test_extract_frontmatter_basic() {
862        let content = r#"---
863title: My Document
864author: John Doe
865date: 2024-01-15
866---
867
868# Hello World
869
870This is the body."#;
871
872        let (metadata, body) = MarkdownIngester::extract_frontmatter(content);
873
874        assert!(metadata.is_some(), "Front-matter should be extracted");
875        let metadata = metadata.unwrap();
876
877        assert_eq!(metadata.get("title"), Some(&"My Document".to_string()));
878        assert_eq!(metadata.get("author"), Some(&"John Doe".to_string()));
879        assert_eq!(metadata.get("date"), Some(&"2024-01-15".to_string()));
880
881        assert!(
882            body.starts_with("# Hello World"),
883            "Body should start with heading"
884        );
885    }
886
887    /// Test: Files without front-matter return None for metadata.
888    ///
889    /// What happens:
890    /// 1. The content does not start with `---`
891    /// 2. No metadata is extracted
892    /// 3. The entire content is returned as the body
893    #[test]
894    fn test_extract_frontmatter_none() {
895        let content = "# Just a Heading\n\nSome content.";
896
897        let (metadata, body) = MarkdownIngester::extract_frontmatter(content);
898
899        assert!(metadata.is_none(), "No front-matter should be found");
900        assert_eq!(body, content, "Body should be the entire content");
901    }
902
903    /// Test: Arrays in front-matter are converted to comma-separated strings.
904    ///
905    /// What happens:
906    /// 1. YAML arrays like `tags: [rust, programming]`
907    /// 2. Are converted to `"rust, programming"` string
908    /// 3. This simplifies storage in HashMap<String, String>
909    #[test]
910    fn test_extract_frontmatter_arrays() {
911        let content = r#"---
912title: Tagged Post
913tags:
914  - rust
915  - programming
916  - web
917---
918
919Content here."#;
920
921        let (metadata, _body) = MarkdownIngester::extract_frontmatter(content);
922
923        let metadata = metadata.expect("Front-matter should be extracted");
924        assert_eq!(
925            metadata.get("tags"),
926            Some(&"rust, programming, web".to_string())
927        );
928    }
929
930    /// Test: Unclosed front-matter is treated as no front-matter.
931    ///
932    /// What happens:
933    /// 1. Content starts with `---` but has no closing `---`
934    /// 2. This is invalid front-matter
935    /// 3. The entire content is returned as body
936    #[test]
937    fn test_extract_frontmatter_unclosed() {
938        let content = r#"---
939title: Broken
940author: Nobody
941
942# This has no closing delimiter"#;
943
944        let (metadata, body) = MarkdownIngester::extract_frontmatter(content);
945
946        assert!(metadata.is_none(), "Unclosed front-matter should not parse");
947        assert_eq!(body, content, "Body should be entire content");
948    }
949
950    // ==================== Markdown Parsing Tests ====================
951
952    /// Test: Basic heading hierarchy is tracked correctly.
953    ///
954    /// What happens:
955    /// 1. Parser encounters h1 heading -> starts hierarchy at level 0
956    /// 2. Parser encounters h2 heading -> adds to hierarchy at level 1
957    /// 3. Content under each heading has correct heading_hierarchy
958    /// 4. New h2 replaces previous h2 in hierarchy
959    #[tokio::test]
960    async fn test_heading_hierarchy() {
961        let temp_dir = TempDir::new().unwrap();
962        let content = r#"# Main Title
963
964Intro paragraph.
965
966## Section One
967
968Content in section one.
969
970## Section Two
971
972Content in section two.
973
974### Subsection
975
976Deep content.
977"#;
978
979        let path = create_temp_file(&temp_dir, "test.md", content).await;
980        let ingester = MarkdownIngester::new();
981        let doc = ingester.ingest_file(&path).await.unwrap();
982
983        // Find chunk with "section one" content
984        let section_one = doc
985            .chunks
986            .iter()
987            .find(|c| c.content.to_lowercase().contains("content in section one"))
988            .expect("Should find section one content");
989
990        assert_eq!(
991            section_one.heading_hierarchy,
992            vec!["Main Title", "Section One"],
993            "Section one should have correct hierarchy"
994        );
995
996        // Find chunk with subsection content
997        let subsection = doc
998            .chunks
999            .iter()
1000            .find(|c| c.content.to_lowercase().contains("deep content"))
1001            .expect("Should find subsection content");
1002
1003        assert_eq!(
1004            subsection.heading_hierarchy,
1005            vec!["Main Title", "Section Two", "Subsection"],
1006            "Subsection should have full hierarchy"
1007        );
1008    }
1009
1010    /// Test: Code blocks are preserved with their language tags.
1011    ///
1012    /// What happens:
1013    /// 1. Parser encounters fenced code block (```)
1014    /// 2. Language is extracted from the fence (e.g., ```rust)
1015    /// 3. Code content is preserved exactly
1016    /// 4. ChunkType::CodeBlock is used with language field
1017    #[tokio::test]
1018    async fn test_code_block_preservation() {
1019        let temp_dir = TempDir::new().unwrap();
1020        let content = r#"# Code Examples
1021
1022Here's some Rust code:
1023
1024```rust
1025fn main() {
1026    println!("Hello, world!");
1027}
1028```
1029
1030And some Python:
1031
1032```python
1033def hello():
1034    print("Hello, world!")
1035```
1036"#;
1037
1038        let path = create_temp_file(&temp_dir, "test.md", content).await;
1039        let ingester = MarkdownIngester::new();
1040        let doc = ingester.ingest_file(&path).await.unwrap();
1041
1042        // Find Rust code block
1043        let rust_block = doc.chunks.iter()
1044            .find(|c| matches!(&c.chunk_type, ChunkType::CodeBlock { language: Some(l) } if l == "rust"))
1045            .expect("Should find Rust code block");
1046
1047        assert!(
1048            rust_block.content.contains("println!"),
1049            "Rust code should be preserved"
1050        );
1051
1052        // Find Python code block
1053        let python_block = doc.chunks.iter()
1054            .find(|c| matches!(&c.chunk_type, ChunkType::CodeBlock { language: Some(l) } if l == "python"))
1055            .expect("Should find Python code block");
1056
1057        assert!(
1058            python_block.content.contains("def hello"),
1059            "Python code should be preserved"
1060        );
1061    }
1062
1063    /// Test: Code blocks without language specification.
1064    ///
1065    /// What happens:
1066    /// 1. Fenced code block without language (just ```)
1067    /// 2. language field is None
1068    /// 3. Content is still preserved
1069    #[tokio::test]
1070    async fn test_code_block_no_language() {
1071        let temp_dir = TempDir::new().unwrap();
1072        let content = r#"# Unlabeled Code
1073
1074```
1075some generic code
1076```
1077"#;
1078
1079        let path = create_temp_file(&temp_dir, "test.md", content).await;
1080        let ingester = MarkdownIngester::new();
1081        let doc = ingester.ingest_file(&path).await.unwrap();
1082
1083        let code_block = doc
1084            .chunks
1085            .iter()
1086            .find(|c| matches!(&c.chunk_type, ChunkType::CodeBlock { language: None }))
1087            .expect("Should find code block without language");
1088
1089        assert!(code_block.content.contains("generic code"));
1090    }
1091
1092    /// Test: Title extraction from front-matter takes precedence.
1093    ///
1094    /// What happens:
1095    /// 1. Document has title in front-matter
1096    /// 2. Document also has h1 heading
1097    /// 3. Front-matter title is used as document title
1098    #[tokio::test]
1099    async fn test_title_from_frontmatter() {
1100        let temp_dir = TempDir::new().unwrap();
1101        let content = r#"---
1102title: Front-matter Title
1103---
1104
1105# Heading Title
1106
1107Content.
1108"#;
1109
1110        let path = create_temp_file(&temp_dir, "test.md", content).await;
1111        let ingester = MarkdownIngester::new();
1112        let doc = ingester.ingest_file(&path).await.unwrap();
1113
1114        assert_eq!(
1115            doc.title,
1116            Some("Front-matter Title".to_string()),
1117            "Title should come from front-matter"
1118        );
1119    }
1120
1121    /// Test: Title extracted from first h1 when no front-matter.
1122    ///
1123    /// What happens:
1124    /// 1. No front-matter in document
1125    /// 2. First h1 heading is used as title
1126    #[tokio::test]
1127    async fn test_title_from_heading() {
1128        let temp_dir = TempDir::new().unwrap();
1129        let content = r#"# First Heading
1130
1131Some content here.
1132
1133## Second Section
1134
1135More content.
1136"#;
1137
1138        let path = create_temp_file(&temp_dir, "test.md", content).await;
1139        let ingester = MarkdownIngester::new();
1140        let doc = ingester.ingest_file(&path).await.unwrap();
1141
1142        assert_eq!(
1143            doc.title,
1144            Some("First Heading".to_string()),
1145            "Title should come from first h1"
1146        );
1147    }
1148
1149    // ==================== Directory Ingestion Tests ====================
1150
1151    /// Test: Recursive directory ingestion finds all markdown files.
1152    ///
1153    /// What happens:
1154    /// 1. Directory structure with nested folders
1155    /// 2. recursive=true traverses all subdirectories
1156    /// 3. All .md files are ingested
1157    /// 4. Non-markdown files are ignored
1158    #[tokio::test]
1159    async fn test_directory_ingestion_recursive() {
1160        let temp_dir = TempDir::new().unwrap();
1161
1162        // Create directory structure
1163        let subdir = temp_dir.path().join("subdir");
1164        fs::create_dir(&subdir).await.unwrap();
1165
1166        create_temp_file(&temp_dir, "root.md", "# Root\n\nRoot content.").await;
1167        create_temp_file(&temp_dir, "other.txt", "Not markdown").await;
1168
1169        // Create file in subdirectory
1170        let sub_path = subdir.join("nested.md");
1171        let mut file = File::create(&sub_path).await.unwrap();
1172        file.write_all(b"# Nested\n\nNested content.")
1173            .await
1174            .unwrap();
1175
1176        let ingester = MarkdownIngester::new();
1177        let docs = ingester
1178            .ingest_directory(temp_dir.path(), true)
1179            .await
1180            .unwrap();
1181
1182        assert_eq!(docs.len(), 2, "Should find 2 markdown files");
1183
1184        let titles: Vec<_> = docs.iter().filter_map(|d| d.title.as_ref()).collect();
1185        assert!(titles.contains(&&"Root".to_string()));
1186        assert!(titles.contains(&&"Nested".to_string()));
1187    }
1188
1189    /// Test: Non-recursive ingestion stays in root directory.
1190    ///
1191    /// What happens:
1192    /// 1. recursive=false
1193    /// 2. Only files in the root directory are processed
1194    /// 3. Subdirectories are ignored
1195    #[tokio::test]
1196    async fn test_directory_ingestion_non_recursive() {
1197        let temp_dir = TempDir::new().unwrap();
1198
1199        let subdir = temp_dir.path().join("subdir");
1200        fs::create_dir(&subdir).await.unwrap();
1201
1202        create_temp_file(&temp_dir, "root.md", "# Root\n\nContent.").await;
1203
1204        let sub_path = subdir.join("nested.md");
1205        let mut file = File::create(&sub_path).await.unwrap();
1206        file.write_all(b"# Nested\n\nContent.").await.unwrap();
1207
1208        let ingester = MarkdownIngester::new();
1209        let docs = ingester
1210            .ingest_directory(temp_dir.path(), false)
1211            .await
1212            .unwrap();
1213
1214        assert_eq!(docs.len(), 1, "Should find only root markdown file");
1215        assert_eq!(docs[0].title, Some("Root".to_string()));
1216    }
1217
1218    // ==================== Chunk Processing Tests ====================
1219
1220    /// Test: Small chunks are merged when they share context.
1221    ///
1222    /// What happens:
1223    /// 1. Multiple small paragraphs under same heading
1224    /// 2. If combined size < max_chunk_size, they're merged
1225    /// 3. This improves embedding quality by providing more context
1226    #[tokio::test]
1227    async fn test_small_chunk_merging() {
1228        let temp_dir = TempDir::new().unwrap();
1229        let content = r#"# Section
1230
1231A.
1232
1233B.
1234
1235C.
1236"#;
1237
1238        let path = create_temp_file(&temp_dir, "test.md", content).await;
1239        let mut config = IngesterConfig::default();
1240        config.min_chunk_size = 100; // Force merging
1241
1242        let ingester = MarkdownIngester::with_config(config);
1243        let doc = ingester.ingest_file(&path).await.unwrap();
1244
1245        // The three small paragraphs should be merged into one chunk
1246        assert!(
1247            doc.chunks.len() <= 2, // May have 1-2 chunks depending on merging
1248            "Small chunks should be merged"
1249        );
1250    }
1251
1252    /// Test: Large chunks are split at paragraph boundaries.
1253    ///
1254    /// What happens:
1255    /// 1. Content exceeds max_chunk_size
1256    /// 2. Content is split at "\n\n" (paragraph) boundaries
1257    /// 3. Each resulting chunk is within size limits
1258    #[tokio::test]
1259    async fn test_large_chunk_splitting() {
1260        let temp_dir = TempDir::new().unwrap();
1261
1262        // Create content larger than max_chunk_size
1263        let long_paragraph = "This is a test paragraph. ".repeat(200);
1264        let content = format!(
1265            "# Large Document\n\n{}\n\n{}\n\n{}",
1266            long_paragraph, long_paragraph, long_paragraph
1267        );
1268
1269        let path = create_temp_file(&temp_dir, "test.md", &content).await;
1270        let mut config = IngesterConfig::default();
1271        config.max_chunk_size = 500;
1272        let max_chunk_size = config.max_chunk_size;
1273
1274        let ingester = MarkdownIngester::with_config(config);
1275        let doc = ingester.ingest_file(&path).await.unwrap();
1276
1277        // Verify no chunk exceeds max size (with some tolerance for implementation)
1278        for chunk in &doc.chunks {
1279            // Allow some tolerance since we split at paragraph boundaries
1280            assert!(
1281                chunk.content.len() <= max_chunk_size + 200,
1282                "Chunk should not greatly exceed max size: {} > {}",
1283                chunk.content.len(),
1284                max_chunk_size
1285            );
1286        }
1287    }
1288
1289    // ==================== Edge Cases ====================
1290
1291    /// Test: Empty markdown file.
1292    ///
1293    /// What happens:
1294    /// 1. File exists but is empty
1295    /// 2. Returns document with no chunks
1296    /// 3. No error is raised
1297    #[tokio::test]
1298    async fn test_empty_file() {
1299        let temp_dir = TempDir::new().unwrap();
1300        let path = create_temp_file(&temp_dir, "empty.md", "").await;
1301
1302        let ingester = MarkdownIngester::new();
1303        let doc = ingester.ingest_file(&path).await.unwrap();
1304
1305        assert!(doc.chunks.is_empty(), "Empty file should have no chunks");
1306        assert!(doc.title.is_none(), "Empty file should have no title");
1307    }
1308
1309    /// Test: File with only front-matter.
1310    ///
1311    /// What happens:
1312    /// 1. File has front-matter but no body content
1313    /// 2. Metadata is extracted
1314    /// 3. Chunks are empty
1315    #[tokio::test]
1316    async fn test_frontmatter_only() {
1317        let temp_dir = TempDir::new().unwrap();
1318        let content = r#"---
1319title: Metadata Only
1320author: Test
1321---
1322"#;
1323        let path = create_temp_file(&temp_dir, "meta.md", content).await;
1324
1325        let ingester = MarkdownIngester::new();
1326        let doc = ingester.ingest_file(&path).await.unwrap();
1327
1328        assert_eq!(doc.title, Some("Metadata Only".to_string()));
1329        assert_eq!(doc.metadata.get("author"), Some(&"Test".to_string()));
1330        assert!(doc.chunks.is_empty(), "Should have no content chunks");
1331    }
1332
1333    /// Test: Inline code within headings.
1334    ///
1335    /// What happens:
1336    /// 1. Heading contains inline code like `code`
1337    /// 2. The backticks are preserved in the heading text
1338    /// 3. Heading is correctly added to hierarchy
1339    #[tokio::test]
1340    async fn test_inline_code_in_heading() {
1341        let temp_dir = TempDir::new().unwrap();
1342        let content = r#"# Using `async/await` in Rust
1343
1344Some explanation here.
1345"#;
1346        let path = create_temp_file(&temp_dir, "test.md", content).await;
1347
1348        let ingester = MarkdownIngester::new();
1349        let doc = ingester.ingest_file(&path).await.unwrap();
1350
1351        assert!(
1352            doc.chunks.iter().any(|c| c
1353                .heading_hierarchy
1354                .iter()
1355                .any(|h| h.contains("`async/await`"))),
1356            "Heading should preserve inline code"
1357        );
1358    }
1359
1360    /// Test: Context string generation.
1361    ///
1362    /// What happens:
1363    /// 1. MarkdownChunk has heading_hierarchy
1364    /// 2. context_string() returns "H1 > H2 > H3" format
1365    /// 3. Empty hierarchy returns "Document root"
1366    #[test]
1367    fn test_context_string() {
1368        let chunk_with_hierarchy = MarkdownChunk {
1369            content: "Test".to_string(),
1370            chunk_type: ChunkType::Text,
1371            heading_hierarchy: vec!["Main".to_string(), "Section".to_string()],
1372            line_range: (1, 5),
1373        };
1374
1375        assert_eq!(chunk_with_hierarchy.context_string(), "Main > Section");
1376
1377        let chunk_no_hierarchy = MarkdownChunk {
1378            content: "Test".to_string(),
1379            chunk_type: ChunkType::Text,
1380            heading_hierarchy: vec![],
1381            line_range: (1, 5),
1382        };
1383
1384        assert_eq!(chunk_no_hierarchy.context_string(), "Document root");
1385    }
1386
1387    /// Test: Document helper methods.
1388    ///
1389    /// What happens:
1390    /// 1. full_text() concatenates all chunks
1391    /// 2. text_chunks() filters to only text
1392    /// 3. code_chunks() filters to only code
1393    #[test]
1394    fn test_document_helpers() {
1395        let mut doc = MarkdownDocument::new("/test.md");
1396        doc.chunks = vec![
1397            MarkdownChunk::text("First text", vec![], (1, 2)),
1398            MarkdownChunk::code_block("let x = 1;", Some("rust".to_string()), vec![], (3, 5)),
1399            MarkdownChunk::text("Second text", vec![], (6, 7)),
1400        ];
1401
1402        let full = doc.full_text();
1403        assert!(full.contains("First text"));
1404        assert!(full.contains("let x = 1;"));
1405        assert!(full.contains("Second text"));
1406
1407        assert_eq!(doc.text_chunks().count(), 2);
1408        assert_eq!(doc.code_chunks().count(), 1);
1409    }
1410
1411    /// Test: Code block language extraction handles edge cases.
1412    ///
1413    /// What happens:
1414    /// 1. Language like "rust,ignore" -> extracts just "rust"
1415    /// 2. This handles common markdown patterns
1416    #[tokio::test]
1417    async fn test_code_language_with_attributes() {
1418        let temp_dir = TempDir::new().unwrap();
1419        let content = r#"# Test
1420
1421```rust,ignore
1422fn example() {}
1423```
1424"#;
1425        let path = create_temp_file(&temp_dir, "test.md", content).await;
1426
1427        let ingester = MarkdownIngester::new();
1428        let doc = ingester.ingest_file(&path).await.unwrap();
1429
1430        let code_chunk = doc.code_chunks().next().expect("Should have code chunk");
1431        assert_eq!(code_chunk.code_language(), Some("rust"));
1432    }
1433
1434    /// Test: Different markdown extensions are recognized.
1435    ///
1436    /// What happens:
1437    /// 1. Default config recognizes .md, .markdown, .mdx
1438    /// 2. is_markdown_file() returns true for these
1439    /// 3. Other extensions return false
1440    #[test]
1441    fn test_markdown_extension_recognition() {
1442        let ingester = MarkdownIngester::new();
1443
1444        assert!(ingester.is_markdown_file(Path::new("test.md")));
1445        assert!(ingester.is_markdown_file(Path::new("test.markdown")));
1446        assert!(ingester.is_markdown_file(Path::new("test.mdx")));
1447        assert!(ingester.is_markdown_file(Path::new("test.MD"))); // Case insensitive
1448
1449        assert!(!ingester.is_markdown_file(Path::new("test.txt")));
1450        assert!(!ingester.is_markdown_file(Path::new("test.rs")));
1451        assert!(!ingester.is_markdown_file(Path::new("noextension")));
1452    }
1453}