orbok-extract 0.18.0

orbok document extraction pipeline: extractor trait, normalization, text/markdown/pdf/docx/html extractors (RFC-005, RFC-044)
Documentation
//! Markdown extractor, `markdown-v1` (RFC-005 §5/§8: heading-aware,
//! fence-aware, exact line locations; RFC-044 §16.2 resource limits).
//!
//! Line-oriented by design: every segment maps to exact source lines so
//! search results can highlight the original file region. ATX headings
//! (`#`–`######`) maintain the heading path; fenced code blocks become
//! [`SegmentKind::CodeBlock`]; everything else groups into paragraphs.

use crate::normalize::normalize_document;
use crate::types::{
    DocumentExtractor, ExtractContext, ExtractOutput, ExtractWarning, ExtractedSegment,
    LocationKind, LocationQuality, SegmentKind, read_error_category,
};
use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
use orbok_fs::ValidatedPath;

/// Extractor for Markdown files (`md` / `markdown`), reported as `markdown-v1`.
///
/// Carries no state; all behavior lives in its [`DocumentExtractor`] impl.
pub struct MarkdownExtractor;

impl DocumentExtractor for MarkdownExtractor {
    fn name(&self) -> &'static str {
        "markdown"
    }

    fn version(&self) -> &'static str {
        "markdown-v1"
    }

    fn supported_extensions(&self) -> &'static [&'static str] {
        &["md", "markdown"]
    }

    fn extract_with_context(
        &self,
        path: &ValidatedPath,
        context: &ExtractContext,
    ) -> OrbokResult<ExtractOutput> {
        let limits = &context.limits;
        let mut warnings = Vec::new();

        // RFC-044 §9.5: check file size before reading.
        let meta = std::fs::metadata(&path.canonical).map_err(|e| OrbokError::Extraction {
            category: read_error_category(&e),
            message: e.to_string(),
        })?;
        if meta.len() > limits.max_file_bytes {
            return Err(OrbokError::Extraction {
                category: ErrorCategory::FileTooLarge,
                message: format!(
                    "file is {} bytes, limit is {}",
                    meta.len(),
                    limits.max_file_bytes
                ),
            });
        }

        let bytes = std::fs::read(&path.canonical).map_err(|e| OrbokError::Extraction {
            category: read_error_category(&e),
            message: e.to_string(),
        })?;
        let raw = String::from_utf8(bytes).map_err(|_| OrbokError::Extraction {
            category: ErrorCategory::EncodingError,
            message: "file is not valid UTF-8".into(),
        })?;

        let normalized = normalize_document(&raw);
        let mut segments = parse_markdown(&normalized);
        let mut char_count = normalized.chars().count() as u64;

        // RFC-044 §9.5: segment limit.
        if segments.len() > limits.max_segments {
            segments.truncate(limits.max_segments);
            warnings.push(ExtractWarning::SizeLimitReached {
                limit_name: "max_segments".into(),
            });
        }

        // RFC-044 §9.5: extracted char limit — truncate and warn.
        if char_count > limits.max_extracted_chars {
            let mut kept = 0usize;
            let mut kept_chars = 0u64;
            for seg in &segments {
                let n = seg.text.chars().count() as u64;
                if kept_chars + n > limits.max_extracted_chars {
                    break;
                }
                kept_chars += n;
                kept += 1;
            }
            segments.truncate(kept);
            char_count = kept_chars;
            warnings.push(ExtractWarning::SizeLimitReached {
                limit_name: "max_extracted_chars".into(),
            });
        }

        Ok(ExtractOutput {
            extractor_name: self.name().into(),
            extractor_version: self.version().into(),
            normalization_version: NORMALIZATION_VERSION.into(),
            char_count,
            segments,
            warnings,
        })
    }

    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
        self.extract_with_context(path, &ExtractContext::default())
    }
}

/// Currently open headings as `(level, title)` pairs, in the order entered.
struct HeadingStack(Vec<(u8, String)>);

impl HeadingStack {
    /// Enters a heading: discards every entry whose level is the same as or
    /// deeper than `level`, then appends `(level, title)`.
    fn push(&mut self, level: u8, title: &str) {
        let entries = &mut self.0;
        entries.retain(|entry| entry.0 < level);
        entries.push((level, title.to_owned()));
    }

    /// Joins the open heading titles with `" > "`.
    ///
    /// Returns `None` when no heading is open.
    fn path(&self) -> Option<String> {
        let mut titles = self.0.iter().map(|(_, title)| title.as_str());
        let first = titles.next()?;
        let mut joined = String::from(first);
        for title in titles {
            joined.push_str(" > ");
            joined.push_str(title);
        }
        Some(joined)
    }
}

/// Splits normalized Markdown into heading, code-block and paragraph segments.
///
/// The parser is line-oriented; every segment records the 1-based, inclusive
/// range of source lines it came from:
///
/// - an ATX heading becomes a one-line [`SegmentKind::Heading`] and updates
///   the heading path attached to the segments that follow it;
/// - a fenced code block becomes a [`SegmentKind::CodeBlock`] whose text is
///   the body between the fences and whose range includes the fence lines;
/// - the remaining non-blank lines are grouped into
///   [`SegmentKind::Paragraph`]s, ended by a blank line, heading, or fence.
///
/// A fence is closed only by a line consisting of the same fence character,
/// repeated at least as many times as in the opening fence, followed by
/// nothing but whitespace. Shorter runs and lines carrying an info string
/// (e.g. "```rust") stay part of the block body, so nested fences are kept
/// intact. An unclosed fence extends to the last line of the input.
fn parse_markdown(normalized: &str) -> Vec<ExtractedSegment> {
    // Length of the leading run of `fence` characters on `line` (after
    // indentation), together with whatever follows that run.
    fn fence_run(line: &str, fence: char) -> (usize, &str) {
        let trimmed = line.trim_start();
        let rest = trimmed.trim_start_matches(fence);
        ((trimmed.len() - rest.len()) / fence.len_utf8(), rest)
    }

    let lines: Vec<&str> = normalized.lines().collect();
    let mut segments = Vec::new();
    let mut headings = HeadingStack(Vec::new());
    let mut paragraph: Vec<&str> = Vec::new();
    let mut paragraph_start = 0u32;
    let mut idx = 0usize;

    // Emits the pending paragraph (if any) ending at line `$end`. `$end` is
    // only evaluated when a paragraph is pending, so `line_no - 1` at the
    // call sites cannot underflow on the first line.
    macro_rules! flush_paragraph {
        ($end:expr) => {
            if !paragraph.is_empty() {
                segments.push(ExtractedSegment {
                    kind: SegmentKind::Paragraph,
                    text: paragraph.join("\n"),
                    line_start: paragraph_start,
                    line_end: $end,
                    location_kind: LocationKind::Lines,
                    heading_path: headings.path(),
                    location_quality: LocationQuality::Exact,
                });
                paragraph.clear();
            }
        };
    }

    while idx < lines.len() {
        let line = lines[idx];
        let line_no = idx as u32 + 1;

        // ATX heading. The pending paragraph is flushed first so it keeps the
        // heading path that was in effect before this heading.
        if let Some((level, title)) = parse_atx_heading(line) {
            flush_paragraph!(line_no - 1);
            headings.push(level, title);
            segments.push(ExtractedSegment {
                kind: SegmentKind::Heading,
                text: title.to_string(),
                line_start: line_no,
                line_end: line_no,
                location_kind: LocationKind::Lines,
                heading_path: headings.path(),
                location_quality: LocationQuality::Exact,
            });
            idx += 1;
            continue;
        }

        // Fenced code block.
        if let Some(fence) = parse_fence(line) {
            flush_paragraph!(line_no - 1);
            let start = line_no;
            let (open_len, _) = fence_run(line, fence);
            let mut body = Vec::new();
            idx += 1;
            while idx < lines.len() {
                let (run_len, rest) = fence_run(lines[idx], fence);
                // Closing fence: same character, at least as long as the
                // opener, and no info string after it.
                if run_len >= open_len && rest.trim().is_empty() {
                    break;
                }
                body.push(lines[idx]);
                idx += 1;
            }
            // `idx` is the closing fence, or one past the end when the fence
            // was never closed; the `min` below clamps the latter case.
            let end = (idx as u32) + 1;
            idx += 1;
            segments.push(ExtractedSegment {
                kind: SegmentKind::CodeBlock,
                text: body.join("\n"),
                line_start: start,
                line_end: end.min(lines.len() as u32),
                location_kind: LocationKind::Lines,
                heading_path: headings.path(),
                location_quality: LocationQuality::Exact,
            });
            continue;
        }

        // Blank line ends a paragraph.
        if line.trim().is_empty() {
            flush_paragraph!(line_no - 1);
            idx += 1;
            continue;
        }

        if paragraph.is_empty() {
            paragraph_start = line_no;
        }
        paragraph.push(line);
        idx += 1;
    }
    let last = lines.len() as u32;
    flush_paragraph!(last);
    segments
}

/// Parses an ATX heading line into `(level, title)`.
///
/// A heading is one to six `#` characters (after optional leading
/// whitespace) followed by a space and a non-empty title. The title is
/// returned with surrounding whitespace removed.
///
/// An optional closing sequence of `#`s is dropped, but only when it is
/// separated from the title by whitespace: `# Title ##` yields `Title`, while
/// `# C#` keeps its trailing `#` because it is part of the title text.
///
/// Returns `None` when the line is not a heading, including when the title
/// is empty or consists only of `#` characters.
fn parse_atx_heading(line: &str) -> Option<(u8, &str)> {
    let trimmed = line.trim_start();
    let hashes = trimmed.bytes().take_while(|b| *b == b'#').count();
    if !(1..=6).contains(&hashes) {
        return None;
    }

    // `#` is ASCII, so the count is also a valid byte offset.
    let title = trimmed[hashes..].strip_prefix(' ')?.trim();

    // Trailing `#`s form a closing sequence only if whitespace precedes them
    // (or nothing else is left); otherwise they belong to the title.
    let without_closer = title.trim_end_matches('#');
    let title = if without_closer.is_empty() || without_closer.ends_with(char::is_whitespace) {
        without_closer.trim_end()
    } else {
        title
    };

    if title.is_empty() {
        None
    } else {
        Some((hashes as u8, title))
    }
}

/// Detects a code-fence line and reports which character forms the fence.
///
/// After leading whitespace is skipped, a line that begins with at least
/// three backticks yields `Some('`')` and one that begins with at least three
/// tildes yields `Some('~')`; any other line yields `None`. Text after the
/// fence run (such as an info string) is ignored.
fn parse_fence(line: &str) -> Option<char> {
    let candidate = line.trim_start();
    if candidate.starts_with("```") {
        Some('`')
    } else if candidate.starts_with("~~~") {
        Some('~')
    } else {
        None
    }
}