use crate::normalize::normalize_document;
use crate::types::{
DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
read_error_category,
};
use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
use orbok_fs::ValidatedPath;
/// Extractor that turns Markdown files into heading, code-block and
/// paragraph segments.
pub struct MarkdownExtractor;

impl DocumentExtractor for MarkdownExtractor {
    /// Identifier recorded as `extractor_name` in every [`ExtractOutput`].
    fn name(&self) -> &'static str {
        "markdown"
    }

    /// Version tag recorded as `extractor_version` in every [`ExtractOutput`].
    fn version(&self) -> &'static str {
        "markdown-v1"
    }

    /// File extensions this extractor declares support for.
    fn supported_extensions(&self) -> &'static [&'static str] {
        const EXTENSIONS: &[&str] = &["md", "markdown"];
        EXTENSIONS
    }

    /// Reads the file at `path.canonical`, normalizes it and splits it into segments.
    ///
    /// # Errors
    ///
    /// Returns [`OrbokError::Extraction`] when the file cannot be read (the
    /// category comes from `read_error_category`) or when its bytes are not
    /// valid UTF-8 (`ErrorCategory::EncodingError`).
    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
        let bytes = std::fs::read(&path.canonical).map_err(|io_err| {
            let category = read_error_category(&io_err);
            OrbokError::Extraction {
                category,
                message: io_err.to_string(),
            }
        })?;

        let raw = match String::from_utf8(bytes) {
            Ok(text) => Ok(text),
            Err(_) => Err(OrbokError::Extraction {
                category: ErrorCategory::EncodingError,
                message: "file is not valid UTF-8".into(),
            }),
        }?;

        let normalized = normalize_document(&raw);
        // Counted on the normalized text, in Unicode scalar values rather than bytes.
        let char_count = normalized.chars().count() as u64;
        let segments = parse_markdown(&normalized);

        Ok(ExtractOutput {
            extractor_name: self.name().into(),
            extractor_version: self.version().into(),
            normalization_version: NORMALIZATION_VERSION.into(),
            char_count,
            segments,
        })
    }
}
/// Stack of `(level, title)` pairs describing the headings that enclose the
/// current position in the document.
struct HeadingStack(Vec<(u8, String)>);

impl HeadingStack {
    /// Records a new heading.
    ///
    /// Every stored heading whose level is greater than or equal to `level`
    /// is dropped first, then the new heading is appended.
    fn push(&mut self, level: u8, title: &str) {
        self.0.retain(|entry| entry.0 < level);
        self.0.push((level, String::from(title)));
    }

    /// Returns the stored titles joined with `" > "`, or `None` when no
    /// heading has been recorded.
    fn path(&self) -> Option<String> {
        let mut titles = self.0.iter().map(|(_, title)| title.as_str());
        let mut joined = String::from(titles.next()?);
        for title in titles {
            joined.push_str(" > ");
            joined.push_str(title);
        }
        Some(joined)
    }
}
/// Splits normalized Markdown text into heading, fenced-code-block and
/// paragraph segments, in document order.
///
/// Line numbers are 1-based and inclusive.
///
/// * **Headings** (ATX style, recognised by `parse_atx_heading`) produce a
///   one-line segment whose `heading_path` already includes the heading itself.
/// * **Fenced code blocks** start at a line accepted by `parse_fence` and run
///   up to the closing fence. The segment text is the body without the fence
///   lines; `line_start` is the opening fence and `line_end` is the closing
///   fence, or the last line of the document when the block is never closed.
/// * **Paragraphs** are runs of consecutive non-blank lines that are neither
///   headings nor fences, joined with `"\n"`.
///
/// A closing fence must use the same character as the opening fence, be at
/// least as long, and be followed only by whitespace. Shorter fences or fences
/// carrying an info string inside an open block are kept as body text, so a
/// four-backtick block can contain a three-backtick example.
fn parse_markdown(normalized: &str) -> Vec<ExtractedSegment> {
    // Number of leading `fence` characters on `line`, ignoring indentation.
    fn fence_run_len(line: &str, fence: char) -> usize {
        let trimmed = line.trim_start();
        let rest = trimmed.trim_start_matches(fence);
        (trimmed.len() - rest.len()) / fence.len_utf8()
    }

    // True when `line` can close a block opened with `open_len` `fence` characters.
    fn closes_fence(line: &str, fence: char, open_len: usize) -> bool {
        let rest = line.trim_start().trim_start_matches(fence);
        fence_run_len(line, fence) >= open_len && rest.trim().is_empty()
    }

    let lines: Vec<&str> = normalized.lines().collect();
    let total_lines = lines.len() as u32;
    let mut segments = Vec::new();
    let mut headings = HeadingStack(Vec::new());
    let mut paragraph: Vec<&str> = Vec::new();
    let mut paragraph_start = 0u32;
    let mut idx = 0usize;

    // Emits the pending paragraph (if any) ending at line `$end` and resets the
    // buffer. A macro rather than a closure so that `segments`, `paragraph` and
    // `headings` stay freely usable between invocations.
    macro_rules! flush_paragraph {
        ($end:expr) => {
            if !paragraph.is_empty() {
                segments.push(ExtractedSegment {
                    kind: SegmentKind::Paragraph,
                    text: paragraph.join("\n"),
                    line_start: paragraph_start,
                    line_end: $end,
                    heading_path: headings.path(),
                    location_quality: LocationQuality::Exact,
                });
                paragraph.clear();
            }
        };
    }

    while idx < lines.len() {
        let line = lines[idx];
        let line_no = idx as u32 + 1;

        if let Some((level, title)) = parse_atx_heading(line) {
            // The paragraph is flushed before the stack changes so it keeps
            // the path of the section it belongs to.
            flush_paragraph!(line_no - 1);
            headings.push(level, title);
            segments.push(ExtractedSegment {
                kind: SegmentKind::Heading,
                text: title.to_string(),
                line_start: line_no,
                line_end: line_no,
                heading_path: headings.path(),
                location_quality: LocationQuality::Exact,
            });
            idx += 1;
            continue;
        }

        if let Some(fence) = parse_fence(line) {
            flush_paragraph!(line_no - 1);
            let open_len = fence_run_len(line, fence);
            let body_start = idx + 1;
            idx = body_start;
            while idx < lines.len() && !closes_fence(lines[idx], fence, open_len) {
                idx += 1;
            }
            // `idx` is now the closing fence, or `lines.len()` when the block is
            // unterminated; the clamp maps the latter onto the last real line.
            let end = (idx as u32 + 1).min(total_lines);
            segments.push(ExtractedSegment {
                kind: SegmentKind::CodeBlock,
                text: lines[body_start..idx].join("\n"),
                line_start: line_no,
                line_end: end,
                heading_path: headings.path(),
                location_quality: LocationQuality::Exact,
            });
            // Step past the closing fence.
            idx += 1;
            continue;
        }

        if line.trim().is_empty() {
            flush_paragraph!(line_no - 1);
            idx += 1;
            continue;
        }

        if paragraph.is_empty() {
            paragraph_start = line_no;
        }
        paragraph.push(line);
        idx += 1;
    }

    flush_paragraph!(total_lines);
    segments
}
/// Parses an ATX heading line (`# Title`, `## Title ##`, ...).
///
/// Leading whitespace is ignored. The line must start with one to six `#`
/// characters followed by a space. Returns the heading level and the trimmed
/// title, or `None` when the line is not a heading or the title is empty.
///
/// A trailing run of `#` is treated as a closing sequence, and removed, only
/// when it is separated from the title by whitespace. `# C#` therefore keeps
/// the title `C#`, while `# Title ##` yields `Title`.
fn parse_atx_heading(line: &str) -> Option<(u8, &str)> {
    let trimmed = line.trim_start();
    let hashes = trimmed.bytes().take_while(|b| *b == b'#').count();
    if !(1..=6).contains(&hashes) {
        return None;
    }

    // `#` is a single byte, so slicing at `hashes` stays on a char boundary.
    let content = trimmed[hashes..].strip_prefix(' ')?.trim();

    let without_closing = content.trim_end_matches('#');
    let title = if without_closing.is_empty() || without_closing.ends_with(char::is_whitespace) {
        // Either the whole content was a closing sequence, or the trailing
        // hashes were set off by whitespace: drop them.
        without_closing.trim_end()
    } else {
        // Trailing hashes are glued to the text (or absent): part of the title.
        content
    };

    if title.is_empty() {
        None
    } else {
        Some((hashes as u8, title))
    }
}
/// Detects a code-fence line and returns its fence character (`` ` `` or `~`).
///
/// Leading whitespace is ignored. The line must start with at least three
/// identical fence characters. For backtick fences, the text after the fence
/// must not contain another backtick: a line such as `` ```code``` text `` is
/// an inline code span, not the start of a fenced block. Tilde fences have no
/// such restriction.
fn parse_fence(line: &str) -> Option<char> {
    let trimmed = line.trim_start();
    let fence_char = trimmed
        .chars()
        .next()
        .filter(|c| *c == '`' || *c == '~')?;

    let info = trimmed.trim_start_matches(fence_char);
    // Both fence characters are one byte long, so the byte difference is the run length.
    let run_len = trimmed.len() - info.len();
    if run_len < 3 {
        return None;
    }
    if fence_char == '`' && info.contains('`') {
        return None;
    }
    Some(fence_char)
}