pub mod code_blocks;
mod headings;
mod tables;
pub use code_blocks::{extract_fenced_blocks, FencedBlock};
use std::path::Path;
use std::sync::LazyLock;
use regex::Regex;
use super::types::{CallSite, Chunk, ChunkType, FunctionCalls, Language, ParserError};
use headings::{detect_heading_levels, extract_headings};
use tables::{extract_table_chunks, TableContext};
// Matches markdown inline links `[text](url)`.
// Capture 1 = link text, capture 2 = target URL (which may carry a `#fragment`).
// Image embeds `![alt](url)` also match; callers must check the preceding byte.
static LINK_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").expect("valid regex"));
// Matches backtick-quoted function references such as `mod.func(args)` or
// `Type::method()`. Capture 1 = the dotted/`::`-separated path, without arguments.
static FUNC_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"`([\w.:]+)\([^)]*\)`").expect("valid regex"));
/// Minimum line count for a section to stand alone as its own chunk;
/// shorter sections are merged into a neighbour.
///
/// Overridable through the `CQS_MD_MIN_SECTION_LINES` environment
/// variable; the value is read once and cached for the process lifetime.
/// Unset or unparseable values fall back to 30.
fn min_section_lines() -> usize {
    static CACHE: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
    *CACHE.get_or_init(|| match std::env::var("CQS_MD_MIN_SECTION_LINES") {
        Ok(raw) => raw.parse().unwrap_or(30),
        Err(_) => 30,
    })
}
/// Maximum line count before an oversized section gets split at
/// overflow-level headings.
///
/// Overridable through the `CQS_MD_MAX_SECTION_LINES` environment
/// variable; the value is read once and cached for the process lifetime.
/// Unset or unparseable values fall back to 150.
fn max_section_lines() -> usize {
    static CACHE: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
    *CACHE.get_or_init(|| match std::env::var("CQS_MD_MAX_SECTION_LINES") {
        Ok(raw) => raw.parse().unwrap_or(150),
        Err(_) => 150,
    })
}
/// Field bundle for constructing a markdown [`Chunk`]; keeps the
/// `make_markdown_chunk` call sites readable instead of passing nine
/// positional arguments.
struct ChunkFields<'a> {
/// Source file the chunk was extracted from.
path: &'a Path,
/// Stable chunk identifier: `path:line_start:hash_prefix`.
id: String,
/// Display name (heading text or file stem).
name: String,
/// Breadcrumb-style signature, e.g. "Title > Section".
signature: String,
/// Raw markdown text of the section.
content: String,
/// 1-based first line of the section in the source file.
line_start: u32,
/// 1-based last line (inclusive) of the section.
line_end: u32,
/// Full blake3 hex digest of `content`.
content_hash: String,
/// Optional id of an enclosing chunk (None for top-level sections).
parent_id: Option<String>,
}
fn make_markdown_chunk(fields: ChunkFields<'_>) -> Chunk {
Chunk {
id: fields.id,
file: fields.path.to_path_buf(),
language: Language::Markdown,
chunk_type: ChunkType::Section,
name: fields.name,
signature: fields.signature,
content: fields.content,
doc: None,
line_start: fields.line_start,
line_end: fields.line_end,
content_hash: fields.content_hash,
parent_id: fields.parent_id,
window_idx: None,
parent_type_name: None,
}
}
/// Split a markdown document into section-level [`Chunk`]s.
///
/// Strategy:
/// - No headings: the whole file becomes one chunk named after the file stem.
/// - Exactly one heading: the whole file becomes one chunk named after it.
/// - Otherwise: pick a primary heading level, build one section per primary
///   heading, split oversized sections at overflow-level headings, merge
///   undersized ones, then emit a chunk (plus table chunks) per section.
///
/// Line numbers in the produced chunks are 1-based and inclusive.
pub fn parse_markdown_chunks(source: &str, path: &Path) -> Result<Vec<Chunk>, ParserError> {
    let _span = tracing::debug_span!("parse_markdown_chunks", path = %path.display()).entered();
    let lines: Vec<&str> = source.lines().collect();
    let headings = extract_headings(&lines);
    // Degenerate documents (zero or one heading) are indexed as a single
    // whole-file chunk; only the chunk name differs between the two cases.
    if headings.is_empty() {
        let name = path
            .file_stem()
            .and_then(|s| s.to_str())
            .unwrap_or("untitled");
        return Ok(whole_file_chunks(source, &lines, path, name));
    }
    if headings.len() == 1 {
        return Ok(whole_file_chunks(source, &lines, path, &headings[0].text));
    }
    let (title_idx, primary_level, overflow_level) = detect_heading_levels(&headings);
    let mut sections = build_sections(&lines, &headings, title_idx, primary_level);
    if let Some(ovf) = overflow_level {
        sections = overflow_split(sections, &headings, ovf);
    }
    sections = merge_small_sections(sections);
    let title_text = title_idx.map(|i| headings[i].text.as_str()).unwrap_or("");
    let mut chunks = Vec::with_capacity(sections.len());
    for section in &sections {
        // Sections store 0-based [start, end) ranges; chunks use 1-based
        // inclusive line numbers.
        let line_start = section.line_start as u32 + 1;
        let line_end = section.line_end as u32;
        let content = lines[section.line_start..section.line_end].join("\n");
        let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string();
        let hash_prefix = content_hash.get(..8).unwrap_or(&content_hash);
        let id = format!("{}:{}:{}", path.display(), line_start, hash_prefix);
        let signature = build_breadcrumb(title_text, &section.heading_stack);
        chunks.push(make_markdown_chunk(ChunkFields {
            path,
            id: id.clone(),
            name: section.name.clone(),
            signature: signature.clone(),
            content,
            line_start,
            line_end,
            content_hash,
            parent_id: None,
        }));
        // Tables inside the section become their own child chunks.
        extract_table_chunks(
            &TableContext {
                lines: &lines,
                section_start: section.line_start,
                section_end: section.line_end,
                section_name: &section.name,
                signature: &signature,
                section_id: &id,
                path,
            },
            &mut chunks,
        );
    }
    Ok(chunks)
}

/// Build the single whole-file chunk (plus any table chunks) used when a
/// document has no heading structure worth splitting on.
fn whole_file_chunks(source: &str, lines: &[&str], path: &Path, name: &str) -> Vec<Chunk> {
    let content = source.to_string();
    let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string();
    let hash_prefix = content_hash.get(..8).unwrap_or(&content_hash);
    // Whole-file chunks always start at line 1.
    let id = format!("{}:1:{}", path.display(), hash_prefix);
    let mut chunks = vec![make_markdown_chunk(ChunkFields {
        path,
        id: id.clone(),
        name: name.to_string(),
        signature: name.to_string(),
        content,
        line_start: 1,
        line_end: lines.len() as u32,
        content_hash,
        parent_id: None,
    })];
    extract_table_chunks(
        &TableContext {
            lines,
            section_start: 0,
            section_end: lines.len(),
            section_name: name,
            signature: name,
            section_id: &id,
            path,
        },
        &mut chunks,
    );
    chunks
}
/// Extract cross-reference edges from a markdown document.
///
/// With no headings, the whole file is scanned and any references are
/// attributed to the file stem. Otherwise each heading's section is
/// scanned separately, and a "bridge" edge from the file stem to the first
/// heading's text is appended when those two names differ, so that links
/// targeting the file resolve through to its title section.
pub fn parse_markdown_references(
    source: &str,
    path: &Path,
) -> Result<Vec<FunctionCalls>, ParserError> {
    let _span = tracing::debug_span!("parse_markdown_references", path = %path.display()).entered();
    let lines: Vec<&str> = source.lines().collect();
    let headings = extract_headings(&lines);
    if headings.is_empty() {
        let calls = extract_references_from_text(source);
        return Ok(if calls.is_empty() {
            vec![]
        } else {
            let name = path
                .file_stem()
                .and_then(|s| s.to_str())
                .unwrap_or("untitled")
                .to_string();
            vec![FunctionCalls {
                name,
                line_start: 1,
                calls,
            }]
        });
    }
    let mut results = Vec::new();
    for (i, heading) in headings.iter().enumerate() {
        // Each section spans from its heading to the next heading (or EOF).
        let start = heading.line;
        let end = headings.get(i + 1).map_or(lines.len(), |next| next.line);
        let section_text = lines[start..end].join("\n");
        let calls = extract_references_from_text(&section_text);
        if !calls.is_empty() {
            results.push(FunctionCalls {
                name: heading.text.clone(),
                line_start: start as u32 + 1,
                calls,
            });
        }
    }
    // Bridge edge: file stem -> title text, skipped for trivial stems and
    // when the stem already equals the title.
    if let Some(title) = headings.first() {
        let file_stem = path
            .file_stem()
            .and_then(|s| s.to_str())
            .unwrap_or("")
            .to_string();
        if file_stem.len() > 1 && file_stem != title.text {
            results.push(FunctionCalls {
                name: file_stem,
                line_start: 1,
                calls: vec![CallSite {
                    callee_name: title.text.clone(),
                    line_number: 1,
                }],
            });
        }
    }
    Ok(results)
}
/// Extract reference call sites from one markdown chunk's content.
///
/// Thin wrapper over `extract_references_from_text`; line numbers in the
/// result are relative to the chunk content, not the original file.
pub fn extract_calls_from_markdown_chunk(chunk: &Chunk) -> Vec<CallSite> {
extract_references_from_text(&chunk.content)
}
/// A contiguous run of markdown lines destined to become one chunk.
#[derive(Debug)]
struct Section {
    /// Heading text the section is named after.
    name: String,
    /// Ancestor heading texts, outermost first, used for the breadcrumb.
    heading_stack: Vec<String>,
    /// 0-based inclusive start line within the document.
    line_start: usize,
    /// 0-based exclusive end line within the document.
    line_end: usize,
}
/// Partition the document into primary sections.
///
/// One section is produced per heading at `primary_level` (the title
/// heading, if any, is excluded); each runs from its heading to the next
/// primary heading or EOF. A leading section is added when non-heading
/// content sits between the title and the first primary heading. Headings
/// shallower than the primary level observed between primary headings are
/// accumulated into each section's `heading_stack` (outermost first).
fn build_sections(
lines: &[&str],
headings: &[headings::Heading],
title_idx: Option<usize>,
primary_level: u32,
) -> Vec<Section> {
// Primary headings in document order, skipping the title heading even
// when it happens to sit at the primary level.
let primary_headings: Vec<&headings::Heading> = headings
.iter()
.enumerate()
.filter(|(i, h)| h.level == primary_level && title_idx != Some(*i))
.map(|(_, h)| h)
.collect();
// No usable primary headings: fall back to one whole-file section named
// after the first heading.
if primary_headings.is_empty() {
let name = headings[0].text.clone();
return vec![Section {
name,
heading_stack: vec![],
line_start: 0,
line_end: lines.len(),
}];
}
let mut sections = Vec::new();
// Emit an intro section for content between the title and the first
// primary heading — but only when there is real (non-heading) text there.
if let Some(ti) = title_idx {
let first_primary_line = primary_headings[0].line;
if headings[ti].line < first_primary_line {
let content_start = headings[ti].line;
let has_content = lines[content_start..first_primary_line]
.iter()
.any(|l| !l.trim().is_empty() && !l.trim().starts_with('#'));
if has_content {
sections.push(Section {
name: headings[ti].text.clone(),
heading_stack: vec![],
line_start: content_start,
line_end: first_primary_line,
});
}
}
}
// Ancestor headings (shallower than primary level) carried across the
// loop so each section inherits the most recent ancestry seen so far.
let mut parent_stack: Vec<(u32, String)> = Vec::new();
for (i, ph) in primary_headings.iter().enumerate() {
let line_start = ph.line;
let line_end = if i + 1 < primary_headings.len() {
primary_headings[i + 1].line
} else {
lines.len()
};
// Only headings between the previous primary heading and this one can
// change the ancestry for this section.
let search_start = if i == 0 {
0
} else {
primary_headings[i - 1].line
};
for h in headings {
if h.line >= search_start && h.line < line_start && h.level < primary_level {
// A shallower heading evicts any deeper ancestors before being
// pushed, keeping the stack strictly increasing in level.
parent_stack.retain(|(lvl, _)| *lvl < h.level);
parent_stack.push((h.level, h.text.clone()));
}
}
let heading_stack: Vec<String> = parent_stack.iter().map(|(_, t)| t.clone()).collect();
sections.push(Section {
name: ph.text.clone(),
heading_stack,
line_start,
line_end,
});
}
sections
}
/// Split any section longer than the configured maximum at headings of
/// `overflow_level`, pushing the parent section's name onto each new
/// sub-section's heading stack. Oversized sections with no overflow
/// headings inside them are kept whole.
fn overflow_split(
    sections: Vec<Section>,
    headings: &[headings::Heading],
    overflow_level: u32,
) -> Vec<Section> {
    // Cached env-backed limit; hoisted out of the loop.
    let limit = max_section_lines();
    let mut out = Vec::new();
    for section in sections {
        if section.line_end - section.line_start <= limit {
            out.push(section);
            continue;
        }
        // Overflow-level headings strictly inside this section, in order.
        let subs: Vec<&headings::Heading> = headings
            .iter()
            .filter(|h| {
                h.level == overflow_level
                    && h.line > section.line_start
                    && h.line < section.line_end
            })
            .collect();
        match subs.first() {
            None => out.push(section),
            Some(first) => {
                // Preamble before the first sub-heading keeps the parent's
                // name and stack.
                if first.line > section.line_start {
                    out.push(Section {
                        name: section.name.clone(),
                        heading_stack: section.heading_stack.clone(),
                        line_start: section.line_start,
                        line_end: first.line,
                    });
                }
                for (i, sub) in subs.iter().enumerate() {
                    let end = subs.get(i + 1).map_or(section.line_end, |next| next.line);
                    let mut stack = section.heading_stack.clone();
                    stack.push(section.name.clone());
                    out.push(Section {
                        name: sub.text.clone(),
                        heading_stack: stack,
                        line_start: sub.line,
                        line_end: end,
                    });
                }
            }
        }
    }
    out
}
/// Merge runs of undersized sections into the next large section (or, when
/// they trail the document, into the preceding one). A document consisting
/// only of small sections collapses into a single "Document" section.
fn merge_small_sections(sections: Vec<Section>) -> Vec<Section> {
    if sections.len() <= 1 {
        return sections;
    }
    let threshold = min_section_lines();
    let mut merged: Vec<Section> = Vec::new();
    // (start, end) of the run of small sections accumulated so far.
    let mut pending: Option<(usize, usize)> = None;
    for mut section in sections {
        if section.line_end - section.line_start < threshold {
            // Extend the pending run, keeping its earliest start line.
            let start = pending.map_or(section.line_start, |(s, _)| s);
            pending = Some((start, section.line_end));
        } else {
            // A large section absorbs any small run that precedes it.
            if let Some((start, _)) = pending.take() {
                section.line_start = start;
            }
            merged.push(section);
        }
    }
    // Trailing small run: append to the last kept section, or synthesize a
    // catch-all section when nothing was large enough to keep.
    if let Some((start, end)) = pending {
        match merged.last_mut() {
            Some(last) => last.line_end = end,
            None => merged.push(Section {
                name: "Document".to_string(),
                heading_stack: vec![],
                line_start: start,
                line_end: end,
            }),
        }
    }
    merged
}
/// Join the document title and a section's heading ancestry into a
/// " > "-separated breadcrumb, skipping duplicate entries (e.g. when the
/// title also appears in the ancestry).
///
/// Returns an empty string when both the title and the stack are empty.
fn build_breadcrumb(title: &str, heading_stack: &[String]) -> String {
    let mut parts: Vec<String> = Vec::with_capacity(heading_stack.len() + 1);
    if !title.is_empty() {
        parts.push(title.to_string());
    }
    for h in heading_stack {
        // Linear scan is fine: stacks are at most a few headings deep.
        if !parts.contains(h) {
            parts.push(h.clone());
        }
    }
    // `join` on an empty Vec already yields "", so no special case needed.
    parts.join(" > ")
}
/// Derive a link target's file stem when the target is a relative
/// `.md`/`.mdx` file.
///
/// Returns `None` for external or protocol-relative URLs, absolute paths,
/// non-markdown targets, and stems shorter than two characters.
fn extract_md_file_stem(url: &str) -> Option<String> {
    // External URLs and absolute paths never resolve to local docs.
    let non_local = url.starts_with("http://")
        || url.starts_with("https://")
        || url.starts_with("//")
        || url.starts_with('/');
    if non_local {
        return None;
    }
    // Drop any fragment before inspecting the extension.
    let path_part = url.split('#').next().unwrap_or(url);
    if !(path_part.ends_with(".md") || path_part.ends_with(".mdx")) {
        return None;
    }
    // Last path component, tolerating both separator styles.
    let filename = path_part.rsplit(['/', '\\']).next().unwrap_or(path_part);
    let stem = filename
        .strip_suffix(".mdx")
        .or_else(|| filename.strip_suffix(".md"))?;
    // One-character stems are too ambiguous to be useful references.
    (stem.len() > 1).then(|| stem.to_string())
}
/// Return the non-empty fragment (text after `#`) of a link target, if any.
fn extract_anchor(url: &str) -> Option<String> {
    let (_, anchor) = url.split_once('#')?;
    (!anchor.is_empty()).then(|| anchor.to_string())
}
/// Scan markdown text for reference-like constructs:
/// - inline links `[text](url)` (image embeds `![...]` are skipped): emits
///   the link text, plus the target's markdown file stem and anchor when
///   present;
/// - backtick function references such as `` `mod.func()` ``.
///
/// Each callee name is emitted at most once, at the line (1-based,
/// relative to `text`) of its first occurrence.
fn extract_references_from_text(text: &str) -> Vec<CallSite> {
    let mut out: Vec<CallSite> = Vec::new();
    let mut seen = std::collections::HashSet::new();
    // 1-based line number of a byte offset within `text`.
    let line_of = |offset: usize| text[..offset].matches('\n').count() as u32 + 1;
    for cap in LINK_RE.captures_iter(text) {
        let Some(m) = cap.get(0) else {
            continue;
        };
        let start = m.start();
        // A '!' immediately before the match marks an image embed.
        if start > 0 && text.as_bytes()[start - 1] == b'!' {
            continue;
        }
        let line_number = line_of(start);
        let label = &cap[1];
        if !label.is_empty() && seen.insert(label.to_string()) {
            out.push(CallSite {
                callee_name: label.to_string(),
                line_number,
            });
        }
        // The URL can contribute up to two more names: file stem + anchor.
        let url = &cap[2];
        for extra in [extract_md_file_stem(url), extract_anchor(url)] {
            if let Some(name) = extra {
                if seen.insert(name.clone()) {
                    out.push(CallSite {
                        callee_name: name,
                        line_number,
                    });
                }
            }
        }
    }
    for cap in FUNC_RE.captures_iter(text) {
        let callee_name = cap[1].to_string();
        if callee_name.is_empty() || !seen.insert(callee_name.clone()) {
            continue;
        }
        let Some(m) = cap.get(0) else {
            continue;
        };
        out.push(CallSite {
            callee_name,
            line_number: line_of(m.start()),
        });
    }
    out
}
#[cfg(test)]
mod tests {
    //! Unit tests for markdown chunking and reference extraction.
    use super::*;
    use std::path::PathBuf;
    /// Common fixture path for tests that don't care about the filename.
    fn test_path() -> PathBuf {
        PathBuf::from("test.md")
    }
    #[test]
    fn test_no_headings_fallback() {
        // A heading-less file becomes one chunk named after the file stem.
        let source = "Just some text\nwith no headings\nat all.\n";
        let chunks = parse_markdown_chunks(source, &test_path()).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].name, "test");
        assert_eq!(chunks[0].chunk_type, ChunkType::Section);
        assert_eq!(chunks[0].signature, "test");
    }
    #[test]
    fn test_single_heading_fallback() {
        // A file with a lone heading becomes one chunk named after it.
        let source = "# Only Title\n\nSome content below.\n";
        let chunks = parse_markdown_chunks(source, &test_path()).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].name, "Only Title");
        assert_eq!(chunks[0].signature, "Only Title");
    }
    #[test]
    fn test_standard_hierarchy() {
        // Title + two ## sections, each large enough to survive merging.
        let mut source = String::from("# Title\n\nIntro text.\n\n## Section A\n\n");
        for i in 0..35 {
            source.push_str(&format!("Section A line {}.\n", i));
        }
        source.push_str("\n## Section B\n\n");
        for i in 0..35 {
            source.push_str(&format!("Section B line {}.\n", i));
        }
        let chunks = parse_markdown_chunks(&source, &test_path()).unwrap();
        assert!(
            chunks.len() >= 2,
            "got {} chunks: {:?}",
            chunks.len(),
            chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
        );
        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
        assert!(names.contains(&"Section A"));
        assert!(names.contains(&"Section B"));
        // Section signatures carry the document title as a breadcrumb.
        let sec_a = chunks.iter().find(|c| c.name == "Section A").unwrap();
        assert!(
            sec_a.signature.contains("Title"),
            "signature was: {}",
            sec_a.signature
        );
    }
    #[test]
    fn test_inverted_hierarchy() {
        // Title at ## with primary content at # — the detector must still
        // treat the # headings as primary and the ## as the title.
        let mut source = String::new();
        source.push_str("## AVEVA Historian Concepts\n\n");
        source.push_str("Introduction text.\n\n");
        source.push_str("# Process Data\n\n");
        for i in 0..80 {
            source.push_str(&format!("Line {} of process data content.\n", i));
        }
        source.push_str("\n# Data Acquisition\n\n");
        for i in 0..80 {
            source.push_str(&format!("Line {} of data acquisition content.\n", i));
        }
        let chunks = parse_markdown_chunks(&source, &test_path()).unwrap();
        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
        assert!(names.contains(&"Process Data"));
        assert!(names.contains(&"Data Acquisition"));
        let pd = chunks.iter().find(|c| c.name == "Process Data").unwrap();
        assert!(
            pd.signature.contains("AVEVA Historian Concepts"),
            "signature was: {}",
            pd.signature
        );
    }
    #[test]
    fn test_cross_references_extracted() {
        let source =
            "# Docs\n\n## API\n\nSee [TagRead](api.md) for details.\nUse `TagRead()` to read.\n";
        let refs = parse_markdown_references(source, &test_path()).unwrap();
        assert!(!refs.is_empty());
        let all_callees: Vec<&str> = refs
            .iter()
            .flat_map(|fc| fc.calls.iter().map(|c| c.callee_name.as_str()))
            .collect();
        assert!(all_callees.contains(&"TagRead"));
        assert!(
            all_callees.contains(&"api"),
            "Should extract file stem 'api' from api.md link: {:?}",
            all_callees
        );
    }
    #[test]
    fn test_image_links_not_extracted() {
        // The image embed must be skipped; the plain link must be kept.
        // (The source deliberately contains an actual `![...]` embed so the
        // negative assertion below is meaningful.)
        let source = "# Doc\n\n![screenshot](screenshot.png)\n\n[real link](other.md)\n";
        let refs = parse_markdown_references(source, &test_path()).unwrap();
        let all_callees: Vec<&str> = refs
            .iter()
            .flat_map(|fc| fc.calls.iter().map(|c| c.callee_name.as_str()))
            .collect();
        assert!(!all_callees.contains(&"screenshot"));
        assert!(all_callees.contains(&"real link"));
        assert!(
            all_callees.contains(&"other"),
            "Should extract file stem 'other': {:?}",
            all_callees
        );
    }
    #[test]
    fn test_backtick_function_refs() {
        let text = "Call `Module.func()` and `Class::method(arg)` for results.";
        let calls = extract_references_from_text(text);
        let names: Vec<&str> = calls.iter().map(|c| c.callee_name.as_str()).collect();
        assert!(names.contains(&"Module.func"));
        assert!(names.contains(&"Class::method"));
    }
    #[test]
    fn test_link_extracts_file_stem() {
        let text = "[Configuration Guide](config.md)";
        let calls = extract_references_from_text(text);
        let names: Vec<&str> = calls.iter().map(|c| c.callee_name.as_str()).collect();
        assert!(names.contains(&"Configuration Guide"));
        assert!(
            names.contains(&"config"),
            "Should extract file stem: {:?}",
            names
        );
    }
    #[test]
    fn test_link_extracts_anchor() {
        let text = "[Database Settings](config.md#db-settings)";
        let calls = extract_references_from_text(text);
        let names: Vec<&str> = calls.iter().map(|c| c.callee_name.as_str()).collect();
        assert!(names.contains(&"Database Settings"));
        assert!(
            names.contains(&"config"),
            "Should extract file stem: {:?}",
            names
        );
        assert!(
            names.contains(&"db-settings"),
            "Should extract anchor: {:?}",
            names
        );
    }
    #[test]
    fn test_link_extracts_both_stem_and_anchor() {
        let text = "[X](foo.md#bar)";
        let calls = extract_references_from_text(text);
        let names: Vec<&str> = calls.iter().map(|c| c.callee_name.as_str()).collect();
        assert!(names.contains(&"X"));
        assert!(names.contains(&"foo"));
        assert!(names.contains(&"bar"));
    }
    #[test]
    fn test_external_links_no_stem() {
        let text = "[Docs](https://example.com/page.md) and [API](http://api.com)";
        let calls = extract_references_from_text(text);
        let names: Vec<&str> = calls.iter().map(|c| c.callee_name.as_str()).collect();
        assert!(names.contains(&"Docs"));
        assert!(names.contains(&"API"));
        assert!(
            !names.contains(&"page"),
            "Should not extract stem from external URL: {:?}",
            names
        );
    }
    #[test]
    fn test_self_anchor_link() {
        let text = "[Jump to setup](#setup-instructions)";
        let calls = extract_references_from_text(text);
        let names: Vec<&str> = calls.iter().map(|c| c.callee_name.as_str()).collect();
        assert!(names.contains(&"Jump to setup"));
        assert!(
            names.contains(&"setup-instructions"),
            "Should extract self-anchor: {:?}",
            names
        );
    }
    #[test]
    fn test_link_with_directory_prefix() {
        let text = "[Setup](../guides/setup-guide.md)";
        let calls = extract_references_from_text(text);
        let names: Vec<&str> = calls.iter().map(|c| c.callee_name.as_str()).collect();
        assert!(names.contains(&"Setup"));
        assert!(
            names.contains(&"setup-guide"),
            "Should extract stem from last path component: {:?}",
            names
        );
        assert!(!names.contains(&"../guides/setup-guide"));
    }
    #[test]
    fn test_link_non_md_target() {
        let text = "[Source](main.rs) and [Schema](schema.sql)";
        let calls = extract_references_from_text(text);
        let names: Vec<&str> = calls.iter().map(|c| c.callee_name.as_str()).collect();
        assert!(names.contains(&"Source"));
        assert!(names.contains(&"Schema"));
        assert!(
            !names.contains(&"main"),
            "Should not extract stem from .rs: {:?}",
            names
        );
        assert!(
            !names.contains(&"schema"),
            "Should not extract stem from .sql: {:?}",
            names
        );
    }
    #[test]
    fn test_extract_md_file_stem_helper() {
        assert_eq!(
            extract_md_file_stem("config.md"),
            Some("config".to_string())
        );
        assert_eq!(extract_md_file_stem("page.mdx"), Some("page".to_string()));
        assert_eq!(
            extract_md_file_stem("dir/file.md"),
            Some("file".to_string())
        );
        assert_eq!(
            extract_md_file_stem("../other/doc.md"),
            Some("doc".to_string())
        );
        assert_eq!(
            extract_md_file_stem("config.md#anchor"),
            Some("config".to_string())
        );
        assert_eq!(extract_md_file_stem("https://example.com/page.md"), None);
        assert_eq!(extract_md_file_stem("http://foo.md"), None);
        assert_eq!(extract_md_file_stem("/absolute/path.md"), None);
        assert_eq!(extract_md_file_stem("code.rs"), None);
        assert_eq!(extract_md_file_stem(""), None);
        assert_eq!(extract_md_file_stem(".md"), None);
        assert_eq!(extract_md_file_stem("a.md"), None);
    }
    #[test]
    fn test_extract_anchor_helper() {
        assert_eq!(
            extract_anchor("file.md#section"),
            Some("section".to_string())
        );
        assert_eq!(
            extract_anchor("#local-anchor"),
            Some("local-anchor".to_string())
        );
        assert_eq!(extract_anchor("file.md"), None);
        assert_eq!(extract_anchor("file.md#"), None);
        assert_eq!(extract_anchor(""), None);
    }
    #[test]
    fn test_bridge_edge_emitted() {
        // File stem "config" differs from title — a bridge edge must link
        // the stem to the title text.
        let source = "# Configuration Guide\n\n## Database\n\nSome content.\n";
        let path = PathBuf::from("config.md");
        let refs = parse_markdown_references(source, &path).unwrap();
        let bridge = refs.iter().find(|fc| fc.name == "config");
        assert!(
            bridge.is_some(),
            "Should emit bridge edge for file stem 'config': {:?}",
            refs.iter().map(|fc| &fc.name).collect::<Vec<_>>()
        );
        let bridge = bridge.unwrap();
        assert_eq!(bridge.calls.len(), 1);
        assert_eq!(bridge.calls[0].callee_name, "Configuration Guide");
    }
    #[test]
    fn test_bridge_edge_skipped_when_stem_equals_title() {
        let source = "# overview\n\nContent here.\n";
        let path = PathBuf::from("overview.md");
        let refs = parse_markdown_references(source, &path).unwrap();
        let bridge = refs.iter().find(|fc| fc.name == "overview");
        assert!(
            bridge.is_none(),
            "Should not emit bridge when stem equals title: {:?}",
            refs.iter().map(|fc| &fc.name).collect::<Vec<_>>()
        );
    }
    #[test]
    fn test_bridge_edge_skipped_no_headings() {
        let source = "Just plain text with no headings at all.\n";
        let path = PathBuf::from("notes.md");
        let refs = parse_markdown_references(source, &path).unwrap();
        let bridge = refs.iter().find(|fc| fc.name == "notes");
        assert!(
            bridge.is_none(),
            "Should not emit bridge when no headings: {:?}",
            refs.iter().map(|fc| &fc.name).collect::<Vec<_>>()
        );
    }
    #[test]
    fn test_bridge_edge_with_directory_path() {
        let source = "# AVEVA System Platform\n\nContent.\n";
        let path = PathBuf::from("docs/aveva-system-platform.md");
        let refs = parse_markdown_references(source, &path).unwrap();
        let bridge = refs.iter().find(|fc| fc.name == "aveva-system-platform");
        assert!(
            bridge.is_some(),
            "Should emit bridge using file stem from full path: {:?}",
            refs.iter().map(|fc| &fc.name).collect::<Vec<_>>()
        );
        assert_eq!(
            bridge.unwrap().calls[0].callee_name,
            "AVEVA System Platform"
        );
    }
}