use anyhow::Result;
use cartog_core::{Symbol, SymbolKind};
use crate::ExtractionResult;
const MAX_CHUNK_BYTES: usize = 1500;
fn is_heading(line: &str) -> bool {
let trimmed = line.trim_start();
trimmed.starts_with('#') && {
let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
(1..=6).contains(&hashes)
&& trimmed
.as_bytes()
.get(hashes)
.map_or(true, |&b| b == b' ' || b == b'\t')
}
}
fn heading_text<'a>(line: &'a str, fallback: &'a str) -> &'a str {
let trimmed = line.trim_start();
let after_hashes = trimmed.trim_start_matches('#');
let text = after_hashes.trim();
if text.is_empty() {
fallback
} else {
text
}
}
struct Section<'a> {
heading: Option<&'a str>,
start_byte: usize,
end_byte: usize,
}
fn split_by_headings(source: &str) -> Vec<Section<'_>> {
let mut sections: Vec<Section<'_>> = Vec::new();
let mut current_heading: Option<&str> = None;
let mut section_start: usize = 0;
for (byte_offset, line) in LineByteOffsets::new(source) {
if is_heading(line) {
if byte_offset > section_start {
sections.push(Section {
heading: current_heading,
start_byte: section_start,
end_byte: byte_offset,
});
}
current_heading = Some(line);
section_start = byte_offset;
}
}
if section_start < source.len() {
sections.push(Section {
heading: current_heading,
start_byte: section_start,
end_byte: source.len(),
});
}
sections
}
struct LineByteOffsets<'a> {
source: &'a str,
offset: usize,
}
impl<'a> LineByteOffsets<'a> {
fn new(source: &'a str) -> Self {
Self { source, offset: 0 }
}
}
impl<'a> Iterator for LineByteOffsets<'a> {
type Item = (usize, &'a str);
fn next(&mut self) -> Option<Self::Item> {
if self.offset > self.source.len() {
return None;
}
let remaining = &self.source[self.offset..];
if remaining.is_empty() {
self.offset = self.source.len() + 1; return None;
}
let line_end = remaining.find('\n').unwrap_or(remaining.len());
let line = &remaining[..line_end];
let current_offset = self.offset;
self.offset += line_end + 1; Some((current_offset, line))
}
}
fn paragraph_breaks(slice: &str) -> Vec<usize> {
let bytes = slice.as_bytes();
let mut breaks = Vec::new();
for i in 1..bytes.len() {
if bytes[i] == b'\n' && bytes[i - 1] == b'\n' {
breaks.push(i + 1);
}
}
breaks
}
fn chunk_at_paragraphs(source: &str, start: usize, end: usize) -> Vec<(usize, usize)> {
let slice = &source[start..end];
if slice.trim().is_empty() {
return Vec::new();
}
if end - start <= MAX_CHUNK_BYTES {
return vec![(start, end)];
}
let breaks = paragraph_breaks(slice);
if breaks.is_empty() {
return vec![(start, end)];
}
let mut chunks = Vec::new();
let mut chunk_start = 0usize; let mut last_break = 0usize;
for &brk in &breaks {
if brk - chunk_start > MAX_CHUNK_BYTES && last_break > chunk_start {
chunks.push((start + chunk_start, start + last_break));
chunk_start = last_break;
}
last_break = brk;
}
if chunk_start < slice.len() {
let remaining = &slice[chunk_start..];
if !remaining.trim().is_empty() {
chunks.push((start + chunk_start, end));
}
}
chunks
}
fn line_number_at(source: &str, byte_offset: usize) -> u32 {
let end = byte_offset.min(source.len());
source.as_bytes()[..end]
.iter()
.filter(|&&b| b == b'\n')
.count() as u32
+ 1
}
#[derive(Default)]
pub struct MarkdownExtractor;
impl MarkdownExtractor {
pub fn new() -> Self {
Self
}
}
impl crate::Extractor for MarkdownExtractor {
fn extract(&mut self, source: &str, file_path: &str) -> Result<ExtractionResult> {
if source.trim().is_empty() {
return Ok(ExtractionResult::default());
}
let file_stem = std::path::Path::new(file_path)
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or(file_path);
let sections = split_by_headings(source);
let has_headings = sections.iter().any(|s| s.heading.is_some());
let mut symbols = Vec::new();
if has_headings {
for section in §ions {
let name = section
.heading
.map(|h| heading_text(h, file_stem))
.unwrap_or("preamble");
let sig = section.heading.map(|h| h.trim().to_string());
let chunks = chunk_at_paragraphs(source, section.start_byte, section.end_byte);
if chunks.len() <= 1 {
let content = &source[section.start_byte..section.end_byte];
if content.trim().is_empty() {
continue;
}
let start_line = line_number_at(source, section.start_byte);
let end_line = line_number_at(source, section.end_byte.saturating_sub(1));
let sym = Symbol::new(
name,
SymbolKind::Document,
file_path,
start_line,
end_line,
section.start_byte as u32,
section.end_byte as u32,
None,
)
.with_signature(sig);
symbols.push(sym);
} else {
for (i, &(cs, ce)) in chunks.iter().enumerate() {
let chunk_name = if i == 0 {
name.to_string()
} else {
format!("{name}_part_{}", i + 1)
};
let start_line = line_number_at(source, cs);
let end_line = line_number_at(source, ce.saturating_sub(1));
let sym = Symbol::new(
&chunk_name,
SymbolKind::Document,
file_path,
start_line,
end_line,
cs as u32,
ce as u32,
None,
)
.with_signature(sig.clone());
symbols.push(sym);
}
}
}
} else {
let chunks = chunk_at_paragraphs(source, 0, source.len());
for (i, &(cs, ce)) in chunks.iter().enumerate() {
let chunk_name = if chunks.len() == 1 {
file_stem.to_string()
} else {
format!("chunk_{}", i + 1)
};
let start_line = line_number_at(source, cs);
let end_line = line_number_at(source, ce.saturating_sub(1));
let sym = Symbol::new(
&chunk_name,
SymbolKind::Document,
file_path,
start_line,
end_line,
cs as u32,
ce as u32,
None,
);
symbols.push(sym);
}
}
Ok(ExtractionResult {
symbols,
edges: Vec::new(),
})
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::Extractor;
#[test]
fn test_heading_based_splitting() {
let source =
"# Title\n\nIntro text.\n\n## Section A\n\nContent A.\n\n## Section B\n\nContent B.\n";
let mut ext = MarkdownExtractor::new();
let result = ext.extract(source, "doc.md").unwrap();
let names: Vec<&str> = result.symbols.iter().map(|s| s.name.as_str()).collect();
assert_eq!(names, vec!["Title", "Section A", "Section B"]);
for sym in &result.symbols {
assert_eq!(sym.kind, SymbolKind::Document);
assert_eq!(sym.file_path, "doc.md");
let content = &source[sym.start_byte as usize..sym.end_byte as usize];
assert!(!content.trim().is_empty());
}
}
#[test]
fn test_mixed_heading_levels() {
let source = "# H1\n\nPara.\n\n### H3\n\nDeep.\n\n## H2\n\nMid.\n";
let mut ext = MarkdownExtractor::new();
let result = ext.extract(source, "doc.md").unwrap();
let names: Vec<&str> = result.symbols.iter().map(|s| s.name.as_str()).collect();
assert_eq!(names, vec!["H1", "H3", "H2"]);
}
#[test]
fn test_no_headings_fallback() {
let source = "Just some plain text.\n\nAnother paragraph.\n\nThird paragraph.\n";
let mut ext = MarkdownExtractor::new();
let result = ext.extract(source, "notes.md").unwrap();
assert!(!result.symbols.is_empty());
assert_eq!(result.symbols[0].name, "notes");
assert_eq!(result.symbols[0].kind, SymbolKind::Document);
}
#[test]
fn test_no_headings_large_file_chunks() {
let para = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. ".repeat(10);
let source = format!("{para}\n\n{para}\n\n{para}\n\n{para}\n\n{para}\n\n{para}\n\n{para}\n\n{para}\n\n{para}\n\n{para}\n");
assert!(source.len() > MAX_CHUNK_BYTES);
let mut ext = MarkdownExtractor::new();
let result = ext.extract(&source, "big.md").unwrap();
assert!(result.symbols.len() > 1, "should produce multiple chunks");
for sym in &result.symbols {
assert_eq!(sym.kind, SymbolKind::Document);
assert!(sym.name.starts_with("chunk_"));
}
}
#[test]
fn test_empty_file() {
let mut ext = MarkdownExtractor::new();
let result = ext.extract("", "empty.md").unwrap();
assert!(result.symbols.is_empty());
let result2 = ext.extract(" \n \n ", "blank.md").unwrap();
assert!(result2.symbols.is_empty());
}
#[test]
fn test_headings_only_no_body() {
let source = "# Title\n## Section\n### Sub\n";
let mut ext = MarkdownExtractor::new();
let result = ext.extract(source, "headers.md").unwrap();
assert!(!result.symbols.is_empty());
for sym in &result.symbols {
assert_eq!(sym.kind, SymbolKind::Document);
}
}
#[test]
fn test_byte_offsets_match_source() {
let source = "# First\n\nHello world.\n\n# Second\n\nGoodbye.\n";
let mut ext = MarkdownExtractor::new();
let result = ext.extract(source, "test.md").unwrap();
assert_eq!(result.symbols.len(), 2);
let first = &result.symbols[0];
let first_content = &source[first.start_byte as usize..first.end_byte as usize];
assert!(first_content.starts_with("# First"));
assert!(first_content.contains("Hello world."));
let second = &result.symbols[1];
let second_content = &source[second.start_byte as usize..second.end_byte as usize];
assert!(second_content.starts_with("# Second"));
assert!(second_content.contains("Goodbye."));
}
#[test]
fn test_signature_is_heading_line() {
let source = "## Authentication\n\nUse JWT tokens.\n";
let mut ext = MarkdownExtractor::new();
let result = ext.extract(source, "doc.md").unwrap();
assert_eq!(
result.symbols[0].signature.as_deref(),
Some("## Authentication")
);
}
#[test]
fn test_preamble_before_first_heading() {
let source = "Some intro text.\n\n# Title\n\nBody.\n";
let mut ext = MarkdownExtractor::new();
let result = ext.extract(source, "doc.md").unwrap();
let names: Vec<&str> = result.symbols.iter().map(|s| s.name.as_str()).collect();
assert_eq!(names, vec!["preamble", "Title"]);
}
#[test]
fn test_no_edges() {
let source = "# Title\n\nContent.\n";
let mut ext = MarkdownExtractor::new();
let result = ext.extract(source, "doc.md").unwrap();
assert!(result.edges.is_empty());
}
#[test]
fn test_is_heading() {
assert!(is_heading("# Title"));
assert!(is_heading("## Sub"));
assert!(is_heading("###### Deep"));
assert!(is_heading("# "));
assert!(!is_heading("####### Too deep")); assert!(!is_heading("#not a heading")); assert!(!is_heading("regular text"));
assert!(!is_heading(""));
}
#[test]
fn test_large_section_sub_chunking() {
let para = "This is a paragraph with enough text to matter. ".repeat(20);
let source = format!("# Big Section\n\n{para}\n\n{para}\n\n{para}\n\n{para}\n");
assert!(source.len() > MAX_CHUNK_BYTES);
let mut ext = MarkdownExtractor::new();
let result = ext.extract(&source, "doc.md").unwrap();
assert!(
result.symbols.len() > 1,
"large section should be sub-chunked, got {} symbols",
result.symbols.len()
);
assert_eq!(result.symbols[0].name, "Big Section");
assert!(result.symbols[1].name.contains("part_"));
}
#[test]
fn test_chunks_respect_max_size() {
let para = "A".repeat(400);
let source = format!("{para}\n\n{para}\n\n{para}\n\n{para}\n\n{para}\n\n{para}\n");
assert!(source.len() > MAX_CHUNK_BYTES);
let chunks = chunk_at_paragraphs(&source, 0, source.len());
assert!(chunks.len() > 1, "should produce multiple chunks");
for &(cs, ce) in &chunks[..chunks.len() - 1] {
assert!(
ce - cs <= MAX_CHUNK_BYTES + 500, "chunk size {} exceeds limit",
ce - cs
);
}
}
#[test]
fn test_empty_heading_uses_fallback() {
let source = "#\n\nSome content here.\n";
let mut ext = MarkdownExtractor::new();
let result = ext.extract(source, "notes.md").unwrap();
assert_eq!(result.symbols.len(), 1);
assert_eq!(result.symbols[0].name, "notes");
}
#[test]
fn test_line_byte_offsets_iterator() {
let source = "aaa\nbbb\nccc";
let offsets: Vec<_> = LineByteOffsets::new(source).collect();
assert_eq!(offsets, vec![(0, "aaa"), (4, "bbb"), (8, "ccc")]);
}
#[test]
fn test_paragraph_breaks() {
let source = "para1\n\npara2\n\npara3";
let breaks = paragraph_breaks(source);
assert_eq!(breaks.len(), 2);
assert_eq!(&source[breaks[0]..breaks[0] + 5], "para2");
assert_eq!(&source[breaks[1]..breaks[1] + 5], "para3");
}
#[test]
fn test_line_number_at_offset_inside_multibyte_char() {
let source = "a\n\u{200B}b";
assert_eq!(source.len(), 6);
assert_eq!(line_number_at(source, 2), 2);
assert_eq!(line_number_at(source, 3), 2);
assert_eq!(line_number_at(source, 4), 2);
assert_eq!(line_number_at(source, 9999), 2);
}
#[test]
fn test_extract_with_zero_width_space_at_section_end() {
let source = "# A\n\nfoo\u{200B}";
let mut ext = MarkdownExtractor::new();
let result = ext.extract(source, "note.md").unwrap();
let names: Vec<&str> = result.symbols.iter().map(|s| s.name.as_str()).collect();
assert_eq!(names, vec!["A"]);
}
}