use pulldown_cmark::{Event, HeadingLevel, Options, Parser, Tag, TagEnd};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use tokio::fs;
use tracing::{debug, instrument, warn};
use crate::error::{Error, Result};
/// A parsed markdown file: where it came from, its title, its front-matter
/// metadata and the content chunks extracted from its body.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MarkdownDocument {
/// Path the document was created with (see `MarkdownDocument::new`).
pub path: PathBuf,
/// Document title, if one is known. `ingest_file` fills it from the
/// front-matter `title` key, falling back to the first heading found in the chunks.
pub title: Option<String>,
/// Front-matter entries flattened to string key/value pairs.
pub metadata: HashMap<String, String>,
/// Content chunks in document order.
pub chunks: Vec<MarkdownChunk>,
}
impl MarkdownDocument {
    /// Creates an empty document for `path`: no title, no metadata, no chunks.
    pub fn new(path: impl Into<PathBuf>) -> Self {
        let path = path.into();
        Self {
            path,
            title: None,
            metadata: HashMap::new(),
            chunks: Vec::new(),
        }
    }

    /// Concatenates the content of every chunk, in order, separated by a blank line.
    pub fn full_text(&self) -> String {
        let mut text = String::new();
        for (index, chunk) in self.chunks.iter().enumerate() {
            if index > 0 {
                text.push_str("\n\n");
            }
            text.push_str(&chunk.content);
        }
        text
    }

    /// Iterates over the chunks whose type is exactly [`ChunkType::Text`].
    pub fn text_chunks(&self) -> impl Iterator<Item = &MarkdownChunk> {
        self.chunks
            .iter()
            .filter(|chunk| matches!(chunk.chunk_type, ChunkType::Text))
    }

    /// Iterates over the chunks that hold a code block, whatever its language.
    pub fn code_chunks(&self) -> impl Iterator<Item = &MarkdownChunk> {
        self.chunks.iter().filter(|chunk| chunk.is_code())
    }
}
/// One piece of a markdown document together with the context it appeared in.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MarkdownChunk {
/// The chunk's text (or code, for code-block chunks).
pub content: String,
/// What kind of content this chunk holds.
pub chunk_type: ChunkType,
/// Titles of the enclosing headings, outermost first; empty at document root.
pub heading_hierarchy: Vec<String>,
/// `(start, end)` line numbers as tracked by the parser.
/// NOTE(review): the parser counts lines from 1 within the text it is given
/// and only from newlines it sees in events, so treat these as estimates.
pub line_range: (usize, usize),
}
impl MarkdownChunk {
pub fn text(
content: impl Into<String>,
heading_hierarchy: Vec<String>,
line_range: (usize, usize),
) -> Self {
Self {
content: content.into(),
chunk_type: ChunkType::Text,
heading_hierarchy,
line_range,
}
}
pub fn code_block(
content: impl Into<String>,
language: Option<String>,
heading_hierarchy: Vec<String>,
line_range: (usize, usize),
) -> Self {
Self {
content: content.into(),
chunk_type: ChunkType::CodeBlock { language },
heading_hierarchy,
line_range,
}
}
pub fn is_code(&self) -> bool {
matches!(self.chunk_type, ChunkType::CodeBlock { .. })
}
pub fn code_language(&self) -> Option<&str> {
match &self.chunk_type {
ChunkType::CodeBlock { language } => language.as_deref(),
_ => None,
}
}
pub fn context_string(&self) -> String {
if self.heading_hierarchy.is_empty() {
"Document root".to_string()
} else {
self.heading_hierarchy.join(" > ")
}
}
}
/// The kind of content stored in a [`MarkdownChunk`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChunkType {
/// Plain prose.
Text,
/// A fenced or indented code block.
CodeBlock {
/// Language tag from the fence info string, if any.
language: Option<String>,
},
/// Text flushed while the parser was inside a list.
List,
/// Text flushed while the parser was inside a table.
Table,
}
/// Tunables for [`MarkdownIngester`].
#[derive(Debug, Clone)]
pub struct IngesterConfig {
/// Non-code chunks whose content is shorter than this many bytes are merged
/// with the following chunk when both share the same heading hierarchy.
pub min_chunk_size: usize,
/// Chunks whose content is longer than this many bytes are split.
pub max_chunk_size: usize,
/// When `true`, code blocks become their own `CodeBlock` chunks; when
/// `false`, they are re-fenced and appended to the surrounding text.
pub preserve_code_blocks: bool,
/// NOTE(review): not read anywhere in the code visible here; front-matter is
/// currently always extracted — confirm the intended meaning.
pub include_frontmatter: bool,
/// File extensions (without the dot) treated as markdown; compared
/// ASCII-case-insensitively.
pub markdown_extensions: Vec<String>,
}
impl Default for IngesterConfig {
fn default() -> Self {
Self {
min_chunk_size: 50,
max_chunk_size: 4000,
preserve_code_blocks: true,
include_frontmatter: true,
markdown_extensions: vec!["md".to_string(), "markdown".to_string(), "mdx".to_string()],
}
}
}
/// Reads markdown files and turns them into chunked [`MarkdownDocument`]s.
#[derive(Debug, Clone)]
pub struct MarkdownIngester {
/// Chunking and file-selection settings used by every ingest call.
config: IngesterConfig,
}
impl Default for MarkdownIngester {
fn default() -> Self {
Self::new()
}
}
impl MarkdownIngester {
pub fn new() -> Self {
Self {
config: IngesterConfig::default(),
}
}
pub fn with_config(config: IngesterConfig) -> Self {
Self { config }
}
#[instrument(skip(self), fields(path = %path.display()))]
pub async fn ingest_file(&self, path: &Path) -> Result<MarkdownDocument> {
debug!("Ingesting markdown file");
let content = fs::read_to_string(path).await.map_err(Error::Io)?;
let mut document = MarkdownDocument::new(path);
let (frontmatter, body) = Self::extract_frontmatter(&content);
if let Some(fm) = frontmatter {
document.metadata.clone_from(&fm);
if let Some(title) = fm.get("title") {
document.title = Some(title.clone());
}
}
document.chunks = self.parse_markdown(body);
if document.title.is_none() {
document.title = document
.chunks
.iter()
.find(|c| !c.heading_hierarchy.is_empty())
.and_then(|c| c.heading_hierarchy.first().cloned());
}
debug!(chunks = document.chunks.len(), "Ingestion complete");
Ok(document)
}
#[instrument(skip(self), fields(dir = %dir.display(), recursive))]
pub async fn ingest_directory(
&self,
dir: &Path,
recursive: bool,
) -> Result<Vec<MarkdownDocument>> {
debug!("Ingesting directory");
let mut documents = Vec::new();
let mut dirs_to_process = vec![dir.to_path_buf()];
while let Some(current_dir) = dirs_to_process.pop() {
let mut entries = fs::read_dir(¤t_dir).await.map_err(Error::Io)?;
while let Some(entry) = entries.next_entry().await.map_err(Error::Io)? {
let path = entry.path();
let file_type = entry.file_type().await.map_err(Error::Io)?;
if file_type.is_dir() {
if recursive {
dirs_to_process.push(path);
}
} else if file_type.is_file() && self.is_markdown_file(&path) {
match self.ingest_file(&path).await {
Ok(doc) => documents.push(doc),
Err(e) => {
warn!(path = %path.display(), error = %e, "Failed to ingest file");
}
}
}
}
}
debug!(count = documents.len(), "Directory ingestion complete");
Ok(documents)
}
/// Splits a leading YAML front-matter block off `content`.
///
/// Front-matter is recognised only when the first line of `content` is exactly
/// `---` and a later line is again exactly `---` (trailing whitespace is
/// tolerated on both). Lines such as `----` or `--- text` are not delimiters.
///
/// Returns `(Some(metadata), body)` when the block parses as a YAML mapping
/// (or is empty), where `body` is the text after the closing delimiter with
/// leading line breaks removed. Returns `(None, content)` — the input
/// untouched — when there is no front-matter, the block is not closed, the
/// YAML is invalid, or the YAML is not a mapping (so ordinary text that merely
/// sits between two `---` rules is never discarded).
///
/// Only string keys are kept. String, number and boolean values are rendered
/// as strings; sequences are rendered as their scalar items joined by `", "`;
/// any other value is skipped.
pub fn extract_frontmatter(content: &str) -> (Option<HashMap<String, String>>, &str) {
    const DELIMITER: &str = "---";

    // Renders a scalar YAML value as text; non-scalar values yield `None`.
    fn scalar_to_string(value: &serde_yaml::Value) -> Option<String> {
        match value {
            serde_yaml::Value::String(s) => Some(s.clone()),
            serde_yaml::Value::Number(n) => Some(n.to_string()),
            serde_yaml::Value::Bool(b) => Some(b.to_string()),
            _ => None,
        }
    }

    let Some(after_open) = content.strip_prefix(DELIMITER) else {
        return (None, content);
    };
    let Some(open_line_end) = after_open.find('\n') else {
        return (None, content);
    };
    // The opening delimiter must stand alone on the first line.
    if !after_open[..open_line_end].trim().is_empty() {
        return (None, content);
    }
    let yaml_start = DELIMITER.len() + open_line_end + 1;

    // Scan whole lines so the closing delimiter cannot match inside a longer line.
    let mut line_start = yaml_start;
    let mut closing = None;
    for line in content[yaml_start..].split_inclusive('\n') {
        if line.trim_end() == DELIMITER {
            closing = Some((line_start, line_start + line.len()));
            break;
        }
        line_start += line.len();
    }
    let Some((yaml_end, body_start)) = closing else {
        return (None, content);
    };

    let yaml_content = content[yaml_start..yaml_end].trim();
    let body = content[body_start..].trim_start_matches(['\n', '\r']);

    match serde_yaml::from_str::<serde_yaml::Value>(yaml_content) {
        Ok(serde_yaml::Value::Mapping(map)) => {
            let mut metadata = HashMap::new();
            for (key, value) in map {
                let serde_yaml::Value::String(k) = key else {
                    continue;
                };
                let v = match value {
                    serde_yaml::Value::Sequence(seq) => seq
                        .iter()
                        .filter_map(scalar_to_string)
                        .collect::<Vec<_>>()
                        .join(", "),
                    other => match scalar_to_string(&other) {
                        Some(rendered) => rendered,
                        None => continue,
                    },
                };
                metadata.insert(k, v);
            }
            (Some(metadata), body)
        }
        // An empty block is valid front-matter without any entries.
        Ok(serde_yaml::Value::Null) => (Some(HashMap::new()), body),
        // Anything else is ordinary document text framed by two `---` rules.
        Ok(_) => (None, content),
        Err(e) => {
            warn!(error = %e, "Failed to parse YAML front-matter");
            (None, content)
        }
    }
}
fn parse_markdown(&self, content: &str) -> Vec<MarkdownChunk> {
let mut chunks = Vec::new();
let mut current_text = String::new();
let mut heading_hierarchy: Vec<String> = Vec::new();
let mut current_heading_text = String::new();
let mut in_heading = false;
let mut in_code_block = false;
let mut code_block_content = String::new();
let mut code_block_language: Option<String> = None;
let mut in_list = false;
let mut in_table = false;
let mut current_line = 1;
let mut chunk_start_line = 1;
let options = Options::all();
let parser = Parser::new_ext(content, options);
for event in parser {
match event {
Event::Start(Tag::Heading { level, .. }) => {
if !current_text.trim().is_empty() {
let chunk_type = if in_list {
ChunkType::List
} else if in_table {
ChunkType::Table
} else {
ChunkType::Text
};
chunks.push(MarkdownChunk {
content: current_text.trim().to_string(),
chunk_type,
heading_hierarchy: heading_hierarchy.clone(),
line_range: (chunk_start_line, current_line),
});
current_text.clear();
}
in_heading = true;
current_heading_text.clear();
let level_idx = match level {
HeadingLevel::H1 => 0,
HeadingLevel::H2 => 1,
HeadingLevel::H3 => 2,
HeadingLevel::H4 => 3,
HeadingLevel::H5 => 4,
HeadingLevel::H6 => 5,
};
heading_hierarchy.truncate(level_idx);
chunk_start_line = current_line;
}
Event::End(TagEnd::Heading(_)) => {
in_heading = false;
let heading_text = current_heading_text.trim().to_string();
if !heading_text.is_empty() {
heading_hierarchy.push(heading_text);
}
current_heading_text.clear();
}
Event::Start(Tag::CodeBlock(kind)) => {
if !current_text.trim().is_empty() {
chunks.push(MarkdownChunk {
content: current_text.trim().to_string(),
chunk_type: ChunkType::Text,
heading_hierarchy: heading_hierarchy.clone(),
line_range: (chunk_start_line, current_line),
});
current_text.clear();
}
in_code_block = true;
code_block_content.clear();
chunk_start_line = current_line;
code_block_language = match kind {
pulldown_cmark::CodeBlockKind::Fenced(lang) => {
let lang_str = lang.to_string();
if lang_str.is_empty() {
None
} else {
Some(lang_str.split(',').next().unwrap_or(&lang_str).to_string())
}
}
pulldown_cmark::CodeBlockKind::Indented => None,
};
}
Event::End(TagEnd::CodeBlock) => {
if self.config.preserve_code_blocks && !code_block_content.trim().is_empty() {
chunks.push(MarkdownChunk {
content: code_block_content.trim().to_string(),
chunk_type: ChunkType::CodeBlock {
language: code_block_language.take(),
},
heading_hierarchy: heading_hierarchy.clone(),
line_range: (chunk_start_line, current_line),
});
} else if !code_block_content.is_empty() {
current_text.push_str("```");
if let Some(ref lang) = code_block_language {
current_text.push_str(lang);
}
current_text.push('\n');
current_text.push_str(&code_block_content);
current_text.push_str("```\n");
}
in_code_block = false;
code_block_content.clear();
code_block_language = None;
chunk_start_line = current_line;
}
Event::Start(Tag::List(_)) => {
in_list = true;
}
Event::End(TagEnd::List(_)) => {
in_list = false;
}
Event::Start(Tag::Table(_)) => {
in_table = true;
}
Event::End(TagEnd::Table) => {
in_table = false;
}
Event::Text(text) => {
current_line += text.chars().filter(|c| *c == '\n').count();
if in_heading {
current_heading_text.push_str(&text);
} else if in_code_block {
code_block_content.push_str(&text);
} else {
current_text.push_str(&text);
}
}
Event::Code(code) => {
if in_heading {
current_heading_text.push('`');
current_heading_text.push_str(&code);
current_heading_text.push('`');
} else if !in_code_block {
current_text.push('`');
current_text.push_str(&code);
current_text.push('`');
}
}
Event::SoftBreak | Event::HardBreak => {
current_line += 1;
if in_heading {
current_heading_text.push(' ');
} else if in_code_block {
code_block_content.push('\n');
} else {
current_text.push('\n');
}
}
Event::Html(html) => {
current_line += html.chars().filter(|c| *c == '\n').count();
if !in_code_block && !in_heading {
current_text.push_str(&html);
}
}
_ => {}
}
}
if !current_text.trim().is_empty() {
let chunk_type = if in_list {
ChunkType::List
} else if in_table {
ChunkType::Table
} else {
ChunkType::Text
};
chunks.push(MarkdownChunk {
content: current_text.trim().to_string(),
chunk_type,
heading_hierarchy: heading_hierarchy.clone(),
line_range: (chunk_start_line, current_line),
});
}
self.post_process_chunks(chunks)
}
/// Normalises chunk sizes.
///
/// Code chunks are never merged with their neighbours. A non-code chunk
/// shorter than `min_chunk_size` absorbs the following non-code chunk when
/// both sit under the same heading hierarchy. Chunks longer than
/// `max_chunk_size` are handed to `split_large_chunk` when they are emitted.
fn post_process_chunks(&self, chunks: Vec<MarkdownChunk>) -> Vec<MarkdownChunk> {
    let min_size = self.config.min_chunk_size;
    let max_size = self.config.max_chunk_size;

    // Emits a finished chunk, splitting it first when it is oversized.
    let emit = |finished: MarkdownChunk, output: &mut Vec<MarkdownChunk>| {
        if finished.content.len() > max_size {
            output.extend(self.split_large_chunk(finished));
        } else {
            output.push(finished);
        }
    };

    let mut output = Vec::new();
    let mut carried: Option<MarkdownChunk> = None;
    for chunk in chunks {
        if chunk.is_code() {
            // Code acts as a barrier: flush whatever was being accumulated.
            if let Some(previous) = carried.take() {
                emit(previous, &mut output);
            }
            emit(chunk, &mut output);
            continue;
        }

        carried = match carried.take() {
            None => Some(chunk),
            Some(mut previous) if previous.content.len() < min_size => {
                if previous.heading_hierarchy == chunk.heading_hierarchy {
                    previous.content.push_str("\n\n");
                    previous.content.push_str(&chunk.content);
                    previous.line_range.1 = chunk.line_range.1;
                    Some(previous)
                } else {
                    // A small chunk under a different heading is emitted as-is.
                    output.push(previous);
                    Some(chunk)
                }
            }
            Some(previous) => {
                emit(previous, &mut output);
                Some(chunk)
            }
        };
    }

    if let Some(previous) = carried {
        emit(previous, &mut output);
    }
    output
}
/// Splits an oversized chunk into pieces of at most `max_chunk_size` bytes
/// where possible.
///
/// The content is first divided at blank lines and the resulting paragraphs
/// are packed greedily. A paragraph that is itself too large is divided after
/// each `". "` and its sentences are packed the same way. The text is never
/// altered: pieces are cut out of the original content (surrounding whitespace
/// trimmed), so nothing is appended to sentences or code. A single sentence
/// longer than the limit is kept whole.
///
/// Every piece inherits the chunk type and heading hierarchy of `chunk`.
/// Line ranges of the pieces are estimates derived from newline counts.
fn split_large_chunk(&self, chunk: MarkdownChunk) -> Vec<MarkdownChunk> {
    let max_size = self.config.max_chunk_size;
    let MarkdownChunk {
        content,
        chunk_type,
        heading_hierarchy,
        line_range,
    } = chunk;
    let piece = |text: String, range: (usize, usize)| MarkdownChunk {
        content: text,
        chunk_type: chunk_type.clone(),
        heading_hierarchy: heading_hierarchy.clone(),
        line_range: range,
    };

    let mut result = Vec::new();
    let mut current = String::new();
    let mut current_start = line_range.0;
    for para in content.split("\n\n") {
        if para.len() > max_size {
            // Flush the paragraphs packed so far before handling the big one.
            if !current.is_empty() {
                let lines_in_current = current.matches('\n').count() + 1;
                result.push(piece(
                    std::mem::take(&mut current),
                    (current_start, current_start + lines_in_current),
                ));
                current_start += lines_in_current;
            }
            let mut sentences = String::new();
            // `split_inclusive` keeps each separator with its sentence, so the
            // pieces concatenate back to exactly the original paragraph.
            for sentence in para.split_inclusive(". ") {
                if !sentences.is_empty() && sentences.len() + sentence.len() > max_size {
                    let text = sentences.trim();
                    if !text.is_empty() {
                        result.push(piece(text.to_string(), (current_start, current_start + 1)));
                    }
                    sentences.clear();
                }
                sentences.push_str(sentence);
            }
            let text = sentences.trim();
            if !text.is_empty() {
                result.push(piece(text.to_string(), (current_start, current_start + 1)));
            }
            continue;
        }
        // +2 accounts for the blank-line separator added below.
        if !current.is_empty() && current.len() + para.len() + 2 > max_size {
            let lines_in_current = current.matches('\n').count() + 1;
            result.push(piece(
                std::mem::take(&mut current),
                (current_start, current_start + lines_in_current),
            ));
            current_start += lines_in_current;
        }
        if !current.is_empty() {
            current.push_str("\n\n");
        }
        current.push_str(para);
    }
    if !current.is_empty() {
        result.push(piece(current, (current_start, line_range.1)));
    }
    result
}
/// Returns `true` when the extension of `path` matches one of the configured
/// markdown extensions, ignoring ASCII case. Paths without an extension, or
/// with an extension that is not valid UTF-8, are not markdown.
fn is_markdown_file(&self, path: &Path) -> bool {
    let Some(extension) = path.extension().and_then(|raw| raw.to_str()) else {
        return false;
    };
    self.config
        .markdown_extensions
        .iter()
        .any(|known| known.eq_ignore_ascii_case(extension))
}
}
// Unit and integration-style tests for the markdown ingester. File-based tests
// write their fixtures into a temporary directory that is removed on drop.
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
// Writes `content` to `name` inside `dir`, syncs it to disk and returns the path.
async fn create_temp_file(dir: &TempDir, name: &str, content: &str) -> PathBuf {
let path = dir.path().join(name);
let mut file = File::create(&path).await.unwrap();
file.write_all(content.as_bytes()).await.unwrap();
file.sync_all().await.unwrap();
path
}
// Scalar front-matter values are extracted and the body starts after the block.
#[test]
fn test_extract_frontmatter_basic() {
let content = r"---
title: My Document
author: John Doe
date: 2024-01-15
---
# Hello World
This is the body.";
let (metadata, body) = MarkdownIngester::extract_frontmatter(content);
assert!(metadata.is_some(), "Front-matter should be extracted");
let metadata = metadata.unwrap();
assert_eq!(metadata.get("title"), Some(&"My Document".to_string()));
assert_eq!(metadata.get("author"), Some(&"John Doe".to_string()));
assert_eq!(metadata.get("date"), Some(&"2024-01-15".to_string()));
assert!(
body.starts_with("# Hello World"),
"Body should start with heading"
);
}
// Content without a leading delimiter is returned untouched.
#[test]
fn test_extract_frontmatter_none() {
let content = "# Just a Heading\n\nSome content.";
let (metadata, body) = MarkdownIngester::extract_frontmatter(content);
assert!(metadata.is_none(), "No front-matter should be found");
assert_eq!(body, content, "Body should be the entire content");
}
// YAML sequences of strings are flattened into a comma-separated string.
#[test]
fn test_extract_frontmatter_arrays() {
let content = r"---
title: Tagged Post
tags:
- rust
- programming
- web
---
Content here.";
let (metadata, _body) = MarkdownIngester::extract_frontmatter(content);
let metadata = metadata.expect("Front-matter should be extracted");
assert_eq!(
metadata.get("tags"),
Some(&"rust, programming, web".to_string())
);
}
// A block that is never closed is not treated as front-matter.
#[test]
fn test_extract_frontmatter_unclosed() {
let content = r"---
title: Broken
author: Nobody
# This has no closing delimiter";
let (metadata, body) = MarkdownIngester::extract_frontmatter(content);
assert!(metadata.is_none(), "Unclosed front-matter should not parse");
assert_eq!(body, content, "Body should be entire content");
}
// Chunks carry the trail of headings they appear under.
#[tokio::test]
async fn test_heading_hierarchy() {
let temp_dir = TempDir::new().unwrap();
let content = r"# Main Title
Intro paragraph.
## Section One
Content in section one.
## Section Two
Content in section two.
### Subsection
Deep content.
";
let path = create_temp_file(&temp_dir, "test.md", content).await;
let ingester = MarkdownIngester::new();
let doc = ingester.ingest_file(&path).await.unwrap();
let section_one = doc
.chunks
.iter()
.find(|c| c.content.to_lowercase().contains("content in section one"))
.expect("Should find section one content");
assert_eq!(
section_one.heading_hierarchy,
vec!["Main Title", "Section One"],
"Section one should have correct hierarchy"
);
let subsection = doc
.chunks
.iter()
.find(|c| c.content.to_lowercase().contains("deep content"))
.expect("Should find subsection content");
assert_eq!(
subsection.heading_hierarchy,
vec!["Main Title", "Section Two", "Subsection"],
"Subsection should have full hierarchy"
);
}
// Fenced code blocks become code chunks tagged with their language.
#[tokio::test]
async fn test_code_block_preservation() {
let temp_dir = TempDir::new().unwrap();
let content = r#"# Code Examples
Here's some Rust code:
```rust
fn main() {
println!("Hello, world!");
}
```
And some Python:
```python
def hello():
print("Hello, world!")
```
"#;
let path = create_temp_file(&temp_dir, "test.md", content).await;
let ingester = MarkdownIngester::new();
let doc = ingester.ingest_file(&path).await.unwrap();
let rust_block = doc.chunks.iter()
.find(|c| matches!(&c.chunk_type, ChunkType::CodeBlock { language: Some(l) } if l == "rust"))
.expect("Should find Rust code block");
assert!(
rust_block.content.contains("println!"),
"Rust code should be preserved"
);
let python_block = doc.chunks.iter()
.find(|c| matches!(&c.chunk_type, ChunkType::CodeBlock { language: Some(l) } if l == "python"))
.expect("Should find Python code block");
assert!(
python_block.content.contains("def hello"),
"Python code should be preserved"
);
}
// A fence without an info string yields a code chunk with no language.
#[tokio::test]
async fn test_code_block_no_language() {
let temp_dir = TempDir::new().unwrap();
let content = r"# Unlabeled Code
```
some generic code
```
";
let path = create_temp_file(&temp_dir, "test.md", content).await;
let ingester = MarkdownIngester::new();
let doc = ingester.ingest_file(&path).await.unwrap();
let code_block = doc
.chunks
.iter()
.find(|c| matches!(&c.chunk_type, ChunkType::CodeBlock { language: None }))
.expect("Should find code block without language");
assert!(code_block.content.contains("generic code"));
}
// The front-matter title wins over the first heading.
#[tokio::test]
async fn test_title_from_frontmatter() {
let temp_dir = TempDir::new().unwrap();
let content = r"---
title: Front-matter Title
---
# Heading Title
Content.
";
let path = create_temp_file(&temp_dir, "test.md", content).await;
let ingester = MarkdownIngester::new();
let doc = ingester.ingest_file(&path).await.unwrap();
assert_eq!(
doc.title,
Some("Front-matter Title".to_string()),
"Title should come from front-matter"
);
}
// Without front-matter the title falls back to the first heading.
#[tokio::test]
async fn test_title_from_heading() {
let temp_dir = TempDir::new().unwrap();
let content = r"# First Heading
Some content here.
## Second Section
More content.
";
let path = create_temp_file(&temp_dir, "test.md", content).await;
let ingester = MarkdownIngester::new();
let doc = ingester.ingest_file(&path).await.unwrap();
assert_eq!(
doc.title,
Some("First Heading".to_string()),
"Title should come from first h1"
);
}
// Recursive ingestion descends into sub-directories and skips non-markdown files.
#[tokio::test]
async fn test_directory_ingestion_recursive() {
let temp_dir = TempDir::new().unwrap();
let subdir = temp_dir.path().join("subdir");
fs::create_dir(&subdir).await.unwrap();
create_temp_file(&temp_dir, "root.md", "# Root\n\nRoot content.").await;
create_temp_file(&temp_dir, "other.txt", "Not markdown").await;
let sub_path = subdir.join("nested.md");
let mut file = File::create(&sub_path).await.unwrap();
file.write_all(b"# Nested\n\nNested content.")
.await
.unwrap();
let ingester = MarkdownIngester::new();
let docs = ingester
.ingest_directory(temp_dir.path(), true)
.await
.unwrap();
assert_eq!(docs.len(), 2, "Should find 2 markdown files");
let titles: Vec<_> = docs.iter().filter_map(|d| d.title.as_ref()).collect();
assert!(titles.contains(&&"Root".to_string()));
assert!(titles.contains(&&"Nested".to_string()));
}
// Non-recursive ingestion only looks at the top-level directory.
#[tokio::test]
async fn test_directory_ingestion_non_recursive() {
let temp_dir = TempDir::new().unwrap();
let subdir = temp_dir.path().join("subdir");
fs::create_dir(&subdir).await.unwrap();
create_temp_file(&temp_dir, "root.md", "# Root\n\nContent.").await;
let sub_path = subdir.join("nested.md");
let mut file = File::create(&sub_path).await.unwrap();
file.write_all(b"# Nested\n\nContent.").await.unwrap();
let ingester = MarkdownIngester::new();
let docs = ingester
.ingest_directory(temp_dir.path(), false)
.await
.unwrap();
assert_eq!(docs.len(), 1, "Should find only root markdown file");
assert_eq!(docs[0].title, Some("Root".to_string()));
}
// Very short content under one heading ends up in at most two chunks.
#[tokio::test]
async fn test_small_chunk_merging() {
let temp_dir = TempDir::new().unwrap();
let content = r"# Section
A.
B.
C.
";
let path = create_temp_file(&temp_dir, "test.md", content).await;
let config = IngesterConfig {
min_chunk_size: 100, ..IngesterConfig::default()
};
let ingester = MarkdownIngester::with_config(config);
let doc = ingester.ingest_file(&path).await.unwrap();
assert!(
doc.chunks.len() <= 2, "Small chunks should be merged"
);
}
// Oversized text is split so no chunk greatly exceeds the configured maximum.
#[tokio::test]
async fn test_large_chunk_splitting() {
let temp_dir = TempDir::new().unwrap();
let long_paragraph = "This is a test paragraph. ".repeat(200);
let content = format!(
"# Large Document\n\n{}\n\n{}\n\n{}",
long_paragraph, long_paragraph, long_paragraph
);
let path = create_temp_file(&temp_dir, "test.md", &content).await;
let config = IngesterConfig {
max_chunk_size: 500,
..IngesterConfig::default()
};
let max_chunk_size = config.max_chunk_size;
let ingester = MarkdownIngester::with_config(config);
let doc = ingester.ingest_file(&path).await.unwrap();
for chunk in &doc.chunks {
// The 200-byte slack tolerates a sentence that straddles the limit.
assert!(
chunk.content.len() <= max_chunk_size + 200,
"Chunk should not greatly exceed max size: {} > {}",
chunk.content.len(),
max_chunk_size
);
}
}
// An empty file produces a document with no chunks and no title.
#[tokio::test]
async fn test_empty_file() {
let temp_dir = TempDir::new().unwrap();
let path = create_temp_file(&temp_dir, "empty.md", "").await;
let ingester = MarkdownIngester::new();
let doc = ingester.ingest_file(&path).await.unwrap();
assert!(doc.chunks.is_empty(), "Empty file should have no chunks");
assert!(doc.title.is_none(), "Empty file should have no title");
}
// A file holding only front-matter yields metadata but no chunks.
#[tokio::test]
async fn test_frontmatter_only() {
let temp_dir = TempDir::new().unwrap();
let content = r"---
title: Metadata Only
author: Test
---
";
let path = create_temp_file(&temp_dir, "meta.md", content).await;
let ingester = MarkdownIngester::new();
let doc = ingester.ingest_file(&path).await.unwrap();
assert_eq!(doc.title, Some("Metadata Only".to_string()));
assert_eq!(doc.metadata.get("author"), Some(&"Test".to_string()));
assert!(doc.chunks.is_empty(), "Should have no content chunks");
}
// Inline code inside a heading keeps its backticks in the heading trail.
#[tokio::test]
async fn test_inline_code_in_heading() {
let temp_dir = TempDir::new().unwrap();
let content = r"# Using `async/await` in Rust
Some explanation here.
";
let path = create_temp_file(&temp_dir, "test.md", content).await;
let ingester = MarkdownIngester::new();
let doc = ingester.ingest_file(&path).await.unwrap();
assert!(
doc.chunks.iter().any(|c| c
.heading_hierarchy
.iter()
.any(|h| h.contains("`async/await`"))),
"Heading should preserve inline code"
);
}
// `context_string` joins the heading trail or reports the document root.
#[test]
fn test_context_string() {
let chunk_with_hierarchy = MarkdownChunk {
content: "Test".to_string(),
chunk_type: ChunkType::Text,
heading_hierarchy: vec!["Main".to_string(), "Section".to_string()],
line_range: (1, 5),
};
assert_eq!(chunk_with_hierarchy.context_string(), "Main > Section");
let chunk_no_hierarchy = MarkdownChunk {
content: "Test".to_string(),
chunk_type: ChunkType::Text,
heading_hierarchy: vec![],
line_range: (1, 5),
};
assert_eq!(chunk_no_hierarchy.context_string(), "Document root");
}
// `full_text`, `text_chunks` and `code_chunks` see the expected chunks.
#[test]
fn test_document_helpers() {
let mut doc = MarkdownDocument::new("/test.md");
doc.chunks = vec![
MarkdownChunk::text("First text", vec![], (1, 2)),
MarkdownChunk::code_block("let x = 1;", Some("rust".to_string()), vec![], (3, 5)),
MarkdownChunk::text("Second text", vec![], (6, 7)),
];
let full = doc.full_text();
assert!(full.contains("First text"));
assert!(full.contains("let x = 1;"));
assert!(full.contains("Second text"));
assert_eq!(doc.text_chunks().count(), 2);
assert_eq!(doc.code_chunks().count(), 1);
}
// Only the part of the info string before the first comma is the language.
#[tokio::test]
async fn test_code_language_with_attributes() {
let temp_dir = TempDir::new().unwrap();
let content = r"# Test
```rust,ignore
fn example() {}
```
";
let path = create_temp_file(&temp_dir, "test.md", content).await;
let ingester = MarkdownIngester::new();
let doc = ingester.ingest_file(&path).await.unwrap();
let code_chunk = doc.code_chunks().next().expect("Should have code chunk");
assert_eq!(code_chunk.code_language(), Some("rust"));
}
// Extension matching is case-insensitive and rejects unknown or missing extensions.
#[test]
fn test_markdown_extension_recognition() {
let ingester = MarkdownIngester::new();
assert!(ingester.is_markdown_file(Path::new("test.md")));
assert!(ingester.is_markdown_file(Path::new("test.markdown")));
assert!(ingester.is_markdown_file(Path::new("test.mdx")));
assert!(ingester.is_markdown_file(Path::new("test.MD")));
assert!(!ingester.is_markdown_file(Path::new("test.txt")));
assert!(!ingester.is_markdown_file(Path::new("test.rs")));
assert!(!ingester.is_markdown_file(Path::new("noextension")));
}
}