use super::*;
fn default_cfg() -> ChunkerConfig {
ChunkerConfig::default()
}
fn tiny_cfg() -> ChunkerConfig {
ChunkerConfig {
chunk_tokens: 50,
chunk_overlap: 0,
chunk_min_tokens: 1,
}
}
#[test]
fn test_build_heading_path() {
assert_eq!(build_heading_path(&[]), "");
assert_eq!(build_heading_path(&["Intro".to_string()]), "Intro");
assert_eq!(
build_heading_path(&["Intro".to_string(), "Deep".to_string()]),
"Intro > Deep"
);
}
#[test]
fn test_build_embedding_text() {
let text = build_embedding_text(
"My Title",
"notes/test.md",
&["Section".to_string()],
"body",
);
assert_eq!(
text,
"Title: My Title\nPath: notes/test.md\nHeadings: Section\n\nbody"
);
}
#[test]
fn test_make_chunk_hash() {
let hash = make_chunk_hash("hello world");
assert_eq!(hash.len(), 64);
assert_eq!(
make_chunk_hash("hello world"),
make_chunk_hash("hello world")
);
assert_ne!(make_chunk_hash("hello"), make_chunk_hash("world"));
}
#[test]
fn test_parser_fidelity_body_is_byte_faithful() {
let raw = "---\ntitle: Fidelity\nstatus: active\n---\n\n# Body\n\nContent here.\n";
let parsed = crate::text::frontmatter::parse_frontmatter(raw);
let expected_body = "\n\n# Body\n\nContent here.\n";
assert_eq!(
parsed.body, expected_body,
"body should be everything after the closing '---' marker"
);
assert_eq!(
format!("---\n{}---{}", parsed.frontmatter_raw, parsed.body),
raw,
"full file should round-trip via frontmatter_raw + body"
);
}
#[test]
fn test_frontmatter_excluded_from_chunks() {
let body = "\n# Filters Note\n\nThis is the body content.\n";
let chunks = chunk_markdown(body, "Filters Note", "Filters/Note.md", &default_cfg());
for chunk in &chunks {
assert!(
!chunk.text.contains("status:"),
"chunk text must not contain YAML key 'status:': {:?}",
chunk.text
);
assert!(
!chunk.text.contains("archived"),
"chunk text must not contain frontmatter value 'archived': {:?}",
chunk.text
);
}
}
#[test]
fn test_obsidian_inline_comment_stripped() {
let body = "Visible text. %%hidden comment%% More visible.";
let chunks = chunk_markdown(body, "T", "t.md", &tiny_cfg());
let all_text: String = chunks
.iter()
.map(|c| c.text.as_str())
.collect::<Vec<_>>()
.join(" ");
assert!(
!all_text.contains("%%"),
"inline Obsidian comment must be stripped: {all_text}"
);
assert!(
!all_text.contains("hidden comment"),
"comment content must not appear: {all_text}"
);
}
#[test]
fn test_obsidian_block_comment_stripped() {
let body = "Before.\n%%\nblock comment line 1\nblock comment line 2\n%%\nAfter.";
let chunks = chunk_markdown(body, "T", "t.md", &tiny_cfg());
let all_text: String = chunks
.iter()
.map(|c| c.text.as_str())
.collect::<Vec<_>>()
.join("\n");
assert!(
!all_text.contains("%%"),
"block Obsidian comment must be stripped"
);
assert!(
!all_text.contains("block comment line"),
"comment content must not appear"
);
assert!(
all_text.contains("Before"),
"content before comment must survive"
);
assert!(
all_text.contains("After"),
"content after comment must survive"
);
}
#[test]
fn test_callout_not_split() {
let body = "> [!note]\n> body line 1\n> body line 2\n\nRegular paragraph after callout.";
let chunks = chunk_markdown(body, "T", "t.md", &tiny_cfg());
let callout_chunk = chunks.iter().find(|c| c.text.contains("[!note]"));
assert!(callout_chunk.is_some(), "callout should produce a chunk");
if let Some(c) = callout_chunk {
assert!(
c.text.contains("body line 1") && c.text.contains("body line 2"),
"callout body lines should be in the same chunk: {:?}",
c.text
);
}
}
#[test]
fn test_math_block_not_split() {
let body = "Paragraph before.\n\n$$\n\\frac{a}{b} = c\n$$\n\nParagraph after.";
let chunks = chunk_markdown(body, "T", "t.md", &tiny_cfg());
let math_chunks: Vec<_> = chunks.iter().filter(|c| c.text.contains("frac")).collect();
for mc in &math_chunks {
assert!(
mc.text.contains("$$") || (mc.text.contains("frac") && !mc.text.contains("$$")),
"math block chunk should not be split mid-equation: {:?}",
mc.text
);
}
assert!(
chunks.iter().any(|c| c.text.contains("frac")),
"math equation content must survive in chunks"
);
}
#[test]
fn test_fenced_code_block_not_split() {
let body =
"Before code.\n\n```rust\nfn hello() {\n println!(\"hello\");\n}\n```\n\nAfter code.";
let chunks = chunk_markdown(body, "T", "t.md", &tiny_cfg());
for chunk in &chunks {
let fence_count = chunk.text.matches("```").count();
if fence_count > 0 {
assert!(
fence_count >= 2 || !chunk.text.trim_start().starts_with("```"),
"fenced code block must not be split mid-fence: {:?}",
chunk.text
);
}
}
assert!(
chunks.iter().any(|c| c.text.contains("println")),
"code block content must survive"
);
}
#[test]
fn test_block_id_preserved_inline() {
let body = "This is a paragraph with a block ID. ^my-block-id\n\nAnother paragraph.";
let chunks = chunk_markdown(body, "T", "t.md", &default_cfg());
let has_block_id = chunks.iter().any(|c| c.text.contains("^my-block-id"));
assert!(has_block_id, "block IDs should be preserved inside chunks");
}
#[test]
fn test_heading_only_chunk_skipped() {
let body = "# Just A Heading\n";
let chunks = chunk_markdown(body, "T", "t.md", &tiny_cfg());
assert!(
chunks.is_empty(),
"heading-only content should produce no chunks, got: {chunks:?}"
);
}
#[test]
fn test_separator_only_chunk_skipped() {
let body = "Some text.\n\n---\n\nMore text.";
let chunks = chunk_markdown(body, "T", "t.md", &tiny_cfg());
for chunk in &chunks {
assert_ne!(
chunk.text.trim(),
"---",
"separator-only chunk must be filtered"
);
}
}
#[test]
fn test_single_wikilink_chunk_skipped() {
let body = "[[Some Note]]\n\nThis is real content.";
let chunks = chunk_markdown(body, "T", "t.md", &tiny_cfg());
for chunk in &chunks {
assert_ne!(
chunk.text.trim(),
"[[Some Note]]",
"single wikilink chunk must be filtered"
);
}
}
#[test]
fn test_single_image_embed_chunk_skipped() {
let body = "![[image.png]]\n\nThis is real content.";
let chunks = chunk_markdown(body, "T", "t.md", &tiny_cfg());
for chunk in &chunks {
assert_ne!(
chunk.text.trim(),
"![[image.png]]",
"single image embed chunk must be filtered"
);
}
}
#[test]
fn test_chunk_min_tokens_filters_tiny_chunks() {
let body =
"Hi.\n\nThis is a much longer paragraph with many more words to exceed the threshold.";
let strict_cfg = ChunkerConfig {
chunk_tokens: 20,
chunk_overlap: 0,
chunk_min_tokens: 10,
};
let chunks = chunk_markdown(body, "T", "t.md", &strict_cfg);
for chunk in &chunks {
assert!(
chunk.token_estimate >= 10,
"chunk below min_tokens should have been filtered: {:?} (tokens: {})",
chunk.text,
chunk.token_estimate
);
}
}
#[test]
fn test_heading_context_tracked() {
let split_cfg = ChunkerConfig {
chunk_tokens: 15,
chunk_overlap: 0,
chunk_min_tokens: 1,
};
let section_one = "# Section One\n\nThis is the first paragraph under section one. It has enough words to require splitting.";
let section_two = "## Subsection\n\nThis is the second paragraph under the subsection. It also has enough words to force a split.";
let body = format!("{section_one}\n\n{section_two}");
let chunks = chunk_markdown(&body, "Doc", "doc.md", &split_cfg);
let has_section_one = chunks.iter().any(|c| c.heading_path == "Section One");
let has_subsection = chunks
.iter()
.any(|c| c.heading_path == "Section One > Subsection");
assert!(
has_section_one,
"first section heading path should appear; chunks: {:?}",
chunks.iter().map(|c| &c.heading_path).collect::<Vec<_>>()
);
assert!(
has_subsection,
"subsection heading path should appear; chunks: {:?}",
chunks.iter().map(|c| &c.heading_path).collect::<Vec<_>>()
);
}
#[test]
fn test_empty_body_produces_no_chunks() {
assert!(chunk_markdown("", "T", "t.md", &default_cfg()).is_empty());
}
#[test]
fn test_whitespace_only_body_produces_no_chunks() {
assert!(chunk_markdown(" \n\n ", "T", "t.md", &default_cfg()).is_empty());
}
#[test]
fn test_chunk_hash_is_stable() {
let body = "# Test\n\nSome stable content.";
let c1 = chunk_markdown(body, "Test", "test.md", &default_cfg());
let c2 = chunk_markdown(body, "Test", "test.md", &default_cfg());
assert_eq!(c1.len(), c2.len());
for (a, b) in c1.iter().zip(c2.iter()) {
assert_eq!(a.chunk_hash, b.chunk_hash);
}
}