use crate::normalize::normalize_document;
use crate::types::{
DocumentExtractor, ExtractContext, ExtractOutput, ExtractWarning, ExtractedSegment,
LocationKind, LocationQuality, SegmentKind, read_error_category,
};
use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
use orbok_fs::ValidatedPath;
/// Stateless extractor for Markdown documents.
///
/// Its `DocumentExtractor` implementation in this file reports the name
/// `"markdown"` and handles the `md` and `markdown` file extensions.
pub struct MarkdownExtractor;
impl DocumentExtractor for MarkdownExtractor {
fn name(&self) -> &'static str {
"markdown"
}
fn version(&self) -> &'static str {
"markdown-v1"
}
fn supported_extensions(&self) -> &'static [&'static str] {
&["md", "markdown"]
}
fn extract_with_context(
&self,
path: &ValidatedPath,
context: &ExtractContext,
) -> OrbokResult<ExtractOutput> {
let limits = &context.limits;
let mut warnings = Vec::new();
let meta = std::fs::metadata(&path.canonical).map_err(|e| OrbokError::Extraction {
category: read_error_category(&e),
message: e.to_string(),
})?;
if meta.len() > limits.max_file_bytes {
return Err(OrbokError::Extraction {
category: ErrorCategory::FileTooLarge,
message: format!(
"file is {} bytes, limit is {}",
meta.len(),
limits.max_file_bytes
),
});
}
let bytes = std::fs::read(&path.canonical).map_err(|e| OrbokError::Extraction {
category: read_error_category(&e),
message: e.to_string(),
})?;
let raw = String::from_utf8(bytes).map_err(|_| OrbokError::Extraction {
category: ErrorCategory::EncodingError,
message: "file is not valid UTF-8".into(),
})?;
let normalized = normalize_document(&raw);
let mut segments = parse_markdown(&normalized);
let mut char_count = normalized.chars().count() as u64;
if segments.len() > limits.max_segments {
segments.truncate(limits.max_segments);
warnings.push(ExtractWarning::SizeLimitReached {
limit_name: "max_segments".into(),
});
}
if char_count > limits.max_extracted_chars {
let mut kept = 0usize;
let mut kept_chars = 0u64;
for seg in &segments {
let n = seg.text.chars().count() as u64;
if kept_chars + n > limits.max_extracted_chars {
break;
}
kept_chars += n;
kept += 1;
}
segments.truncate(kept);
char_count = kept_chars;
warnings.push(ExtractWarning::SizeLimitReached {
limit_name: "max_extracted_chars".into(),
});
}
Ok(ExtractOutput {
extractor_name: self.name().into(),
extractor_version: self.version().into(),
normalization_version: NORMALIZATION_VERSION.into(),
char_count,
segments,
warnings,
})
}
fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
self.extract_with_context(path, &ExtractContext::default())
}
}
/// Headings that currently enclose the parse position, stored as
/// `(level, title)` pairs in the order they were pushed.
struct HeadingStack(Vec<(u8, String)>);

impl HeadingStack {
    /// Records a heading at `level`.
    ///
    /// Every stored heading whose level is greater than or equal to `level`
    /// is dropped first, so the new heading replaces its siblings and their
    /// descendants.
    fn push(&mut self, level: u8, title: &str) {
        let HeadingStack(entries) = self;
        entries.retain(|&(depth, _)| depth < level);
        entries.push((level, title.to_owned()));
    }

    /// Returns the stored titles joined with `" > "`, or `None` when no
    /// heading has been recorded.
    fn path(&self) -> Option<String> {
        if self.0.is_empty() {
            return None;
        }
        let mut joined = String::new();
        for (position, (_, title)) in self.0.iter().enumerate() {
            if position > 0 {
                joined.push_str(" > ");
            }
            joined.push_str(title);
        }
        Some(joined)
    }
}
/// Splits normalized Markdown text into heading, paragraph, and fenced code
/// block segments. Line numbers in the result are 1-based.
///
/// * A line accepted by `parse_atx_heading` becomes a `Heading` segment and
///   updates the heading path attached to every later segment.
/// * A line accepted by `parse_fence` opens a code block. The block runs until
///   a closing fence or the end of input; the segment text is the lines
///   between the fences, and its line range includes the fence lines.
///   A closing fence uses the same fence character, is at least as long as the
///   opening run, and is followed only by whitespace. Shorter runs or fence
///   lines carrying an info string (for example a nested "```rust") are kept
///   as code block content instead of ending the block early.
/// * A blank line ends the current paragraph; any other line is appended to it.
fn parse_markdown(normalized: &str) -> Vec<ExtractedSegment> {
    /// Number of consecutive `fence` characters at the start of `line`,
    /// ignoring leading whitespace.
    fn fence_run(line: &str, fence: char) -> usize {
        line.trim_start()
            .chars()
            .take_while(|c| *c == fence)
            .count()
    }

    /// Whether `line` closes a fenced block opened with `open_len` `fence`
    /// characters.
    fn closes_fence(line: &str, fence: char, open_len: usize) -> bool {
        let trimmed = line.trim_start();
        let run = fence_run(trimmed, fence);
        // `run` counts characters; scale by the encoded width to get a byte offset.
        run >= open_len && trimmed[run * fence.len_utf8()..].trim().is_empty()
    }

    let lines: Vec<&str> = normalized.lines().collect();
    let mut segments = Vec::new();
    let mut headings = HeadingStack(Vec::new());
    let mut paragraph: Vec<&str> = Vec::new();
    let mut paragraph_start = 0u32;
    let mut idx = 0usize;

    // Emits the pending paragraph (if any) ending at line `$end`. `$end` is
    // only evaluated when a paragraph exists, so `line_no - 1` on the first
    // line is never used as a line number.
    macro_rules! flush_paragraph {
        ($end:expr) => {
            if !paragraph.is_empty() {
                segments.push(ExtractedSegment {
                    kind: SegmentKind::Paragraph,
                    text: paragraph.join("\n"),
                    line_start: paragraph_start,
                    line_end: $end,
                    location_kind: LocationKind::Lines,
                    heading_path: headings.path(),
                    location_quality: LocationQuality::Exact,
                });
                paragraph.clear();
            }
        };
    }

    while idx < lines.len() {
        let line = lines[idx];
        let line_no = idx as u32 + 1;

        if let Some((level, title)) = parse_atx_heading(line) {
            flush_paragraph!(line_no - 1);
            headings.push(level, title);
            segments.push(ExtractedSegment {
                kind: SegmentKind::Heading,
                text: title.to_string(),
                line_start: line_no,
                line_end: line_no,
                location_kind: LocationKind::Lines,
                heading_path: headings.path(),
                location_quality: LocationQuality::Exact,
            });
            idx += 1;
            continue;
        }

        if let Some(fence) = parse_fence(line) {
            flush_paragraph!(line_no - 1);
            let open_len = fence_run(line, fence);
            let start = line_no;
            let mut body = Vec::new();
            idx += 1;
            while idx < lines.len() && !closes_fence(lines[idx], fence, open_len) {
                body.push(lines[idx]);
                idx += 1;
            }
            // `idx` now points at the closing fence, or one past the last line
            // when the block is unterminated; the clamp below handles the latter.
            let end = (idx as u32) + 1;
            idx += 1;
            segments.push(ExtractedSegment {
                kind: SegmentKind::CodeBlock,
                text: body.join("\n"),
                line_start: start,
                line_end: end.min(lines.len() as u32),
                location_kind: LocationKind::Lines,
                heading_path: headings.path(),
                location_quality: LocationQuality::Exact,
            });
            continue;
        }

        if line.trim().is_empty() {
            flush_paragraph!(line_no - 1);
            idx += 1;
            continue;
        }

        if paragraph.is_empty() {
            paragraph_start = line_no;
        }
        paragraph.push(line);
        idx += 1;
    }

    let last = lines.len() as u32;
    flush_paragraph!(last);
    segments
}
/// Parses an ATX heading line (`# Title`, `## Title ##`, ...).
///
/// Returns the heading level (1 to 6) and the trimmed title, or `None` when
/// the line is not a heading: no leading `#`, more than six `#`, no space
/// after the opening run, or an empty title.
///
/// An optional closing run of `#` is removed only when it is separated from
/// the title by whitespace (or when the title consists solely of `#`), so
/// hashes that belong to the title itself, as in `# C#`, are preserved.
fn parse_atx_heading(line: &str) -> Option<(u8, &str)> {
    let trimmed = line.trim_start();
    let hashes = trimmed.bytes().take_while(|b| *b == b'#').count();
    if !(1..=6).contains(&hashes) {
        return None;
    }

    // `#` is ASCII, so the run length is also a valid byte offset.
    let title = trimmed[hashes..].strip_prefix(' ')?.trim();

    let without_closing = title.trim_end_matches('#');
    let title = if without_closing.is_empty() || without_closing.ends_with(char::is_whitespace) {
        // A real closing sequence: drop it and the whitespace before it.
        without_closing.trim_end()
    } else {
        // Trailing hashes are glued to the text (or absent); keep the title as is.
        title
    };

    if title.is_empty() {
        None
    } else {
        Some((hashes as u8, title))
    }
}
/// Detects a code fence line and returns its fence character (`` ` `` or `~`).
///
/// A fence is a run of at least three identical fence characters at the start
/// of the line, ignoring leading whitespace. For backtick fences the text
/// after the run must not contain another backtick; otherwise the line is an
/// inline code span such as "```x``` text", not a fence, and `None` is
/// returned.
fn parse_fence(line: &str) -> Option<char> {
    let trimmed = line.trim_start();
    for fence_char in ['`', '~'] {
        let count = trimmed.chars().take_while(|c| *c == fence_char).count();
        if count < 3 {
            continue;
        }
        // Fence characters are ASCII, so `count` is also a valid byte offset.
        if fence_char == '`' && trimmed[count..].contains('`') {
            return None;
        }
        return Some(fence_char);
    }
    None
}