// kbolt-core 0.1.7
//
// Core engine for kbolt local-first retrieval.
// Documentation
use std::collections::HashMap;
use std::ops::Range;
use std::path::Path;

use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag, TagEnd};

use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
use crate::Result;

/// Stateless [`Extractor`] for Markdown sources.
///
/// Parses the input with `pulldown_cmark` and emits one [`ExtractedBlock`] per
/// heading, paragraph, list item, block quote, code block, table header/row
/// and HTML block.
pub struct MarkdownExtractor;

impl Extractor for MarkdownExtractor {
    /// Returns the identifiers this extractor reports as supported: the
    /// common Markdown file-extension spellings.
    // NOTE(review): presumably matched against a path's extension by the caller — confirm.
    fn supports(&self) -> &[&str] {
        &["md", "markdown", "mdown", "mkd"]
    }

    /// Returns the profile key for this extractor, `"md"`.
    fn profile_key(&self) -> &'static str {
        "md"
    }

    /// Returns the extractor version number.
    // NOTE(review): presumably bumped when extraction output changes so that
    // previously stored results can be detected as stale — confirm.
    fn version(&self) -> u32 {
        2
    }

    /// Splits a Markdown document into structural blocks.
    ///
    /// `bytes` must be valid UTF-8; `_path` is not consulted. Every emitted
    /// block carries the raw Markdown slice of its source span (minus the
    /// spans of nested blocks that were emitted separately), the byte offset
    /// where it starts, and the heading path that was active when it opened.
    /// The document title is the label of the first non-empty heading.
    /// `metadata` is always returned empty.
    ///
    /// # Errors
    ///
    /// Returns `KboltError::InvalidInput` when `bytes` is not valid UTF-8.
    fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
        let source = std::str::from_utf8(bytes).map_err(|err| {
            kbolt_types::KboltError::InvalidInput(format!("non-utf8 markdown input: {err}"))
        })?;

        let mut blocks = Vec::new();
        // Labels of the headings currently enclosing the parse position.
        let mut heading_stack: Vec<String> = Vec::new();
        // Blocks whose start event has been seen but whose end event has not.
        let mut open_blocks: Vec<OpenBlock> = Vec::new();
        let mut title: Option<String> = None;
        let parser = Parser::new_ext(source, Options::all());

        // The offset iterator pairs each event with its byte range in `source`.
        for (event, range) in parser.into_offset_iter() {
            match event {
                Event::Start(tag) => {
                    // Tags that do not map to an emitted block kind are ignored.
                    if let Some(open) =
                        open_block_for_tag(&tag, range.start, &heading_stack, &open_blocks)
                    {
                        open_blocks.push(open);
                    }
                }
                Event::End(tag_end) => {
                    // Close the most recently opened block that matches this end tag.
                    let Some(index) = open_blocks
                        .iter()
                        .rposition(|open| open.matches_end(&tag_end))
                    else {
                        continue;
                    };
                    let open = open_blocks.remove(index);
                    let exclude_end = range.end.min(source.len());
                    // The closed block is emitted on its own, so cut its span out of
                    // every still-open block that started at or before it. This runs
                    // before the early `continue`s below, so the span is excluded
                    // from parents even when the block itself ends up not emitted.
                    for parent in &mut open_blocks {
                        if parent.start <= open.start {
                            parent.excluded_ranges.push(open.start..exclude_end);
                        }
                    }

                    let span_end = trim_trailing_newlines(source, open.start, range.end);
                    if span_end <= open.start {
                        continue;
                    }

                    let text = block_text(source, open.start, span_end, &open.excluded_ranges);
                    if text.trim().is_empty() {
                        continue;
                    }

                    // The heading stack is updated only when the heading closes, so the
                    // heading block itself keeps the path captured when it opened
                    // (i.e. without its own label).
                    if let OpenKind::Heading(level) = open.kind {
                        let heading = extract_heading_label(text.as_str());
                        if !heading.is_empty() {
                            apply_heading(&mut heading_stack, level, heading.clone());
                            if title.is_none() {
                                title = Some(heading);
                            }
                        }
                    }

                    // `length` is the byte length of the emitted text, which can be
                    // shorter than the source span when nested ranges were excluded.
                    let length = text.len();
                    blocks.push(ExtractedBlock {
                        text,
                        offset: open.start,
                        length,
                        kind: open.block_kind,
                        heading_path: open.heading_path,
                        attrs: open.attrs,
                    });
                }
                _ => {}
            }
        }

        // Blocks are pushed in closing order (children before parents); restore
        // source order by start offset.
        blocks.sort_by_key(|block| block.offset);

        Ok(ExtractedDocument {
            blocks,
            metadata: HashMap::new(),
            title,
        })
    }
}

/// A block whose start tag has been seen and whose end tag is still pending.
#[derive(Debug, Clone)]
struct OpenBlock {
    /// Which end tag closes this block.
    kind: OpenKind,
    /// Kind recorded on the emitted `ExtractedBlock`.
    block_kind: BlockKind,
    /// Byte offset in the source where the block's start event begins.
    start: usize,
    /// Snapshot of the heading stack taken when the block was opened.
    heading_path: Vec<String>,
    /// Extra attributes for the emitted block (e.g. `language` for fenced code).
    attrs: HashMap<String, String>,
    /// Source byte ranges of nested blocks that closed while this one was open;
    /// they are cut out of this block's text.
    excluded_ranges: Vec<Range<usize>>,
}

impl OpenBlock {
    /// Reports whether `end` is the closing tag that corresponds to this open block.
    ///
    /// Headings additionally require the closing level to equal the level the
    /// block was opened with; every other kind matches on tag type alone.
    fn matches_end(&self, end: &TagEnd) -> bool {
        match self.kind {
            OpenKind::Heading(opened_level) => match end {
                TagEnd::Heading(closed_level) => heading_level(closed_level) == opened_level,
                _ => false,
            },
            OpenKind::Paragraph => matches!(end, TagEnd::Paragraph),
            OpenKind::ListItem => matches!(end, TagEnd::Item),
            OpenKind::BlockQuote => matches!(end, TagEnd::BlockQuote(_)),
            OpenKind::CodeFence => matches!(end, TagEnd::CodeBlock),
            OpenKind::TableHeader => matches!(end, TagEnd::TableHead),
            OpenKind::TableRow => matches!(end, TagEnd::TableRow),
            OpenKind::HtmlBlock => matches!(end, TagEnd::HtmlBlock),
        }
    }
}

/// Parser-side classification of an open block, used to pair start and end tags.
#[derive(Debug, Clone, Copy)]
enum OpenKind {
    /// Heading together with its numeric level (1 for H1 through 6 for H6).
    Heading(usize),
    /// Paragraph (only opened outside list items and block quotes).
    Paragraph,
    /// Single list item.
    ListItem,
    /// Block quote.
    BlockQuote,
    /// Code block; opened for every `Tag::CodeBlock`, fenced or not.
    CodeFence,
    /// Table header row.
    TableHeader,
    /// Table body row.
    TableRow,
    /// Raw HTML block.
    HtmlBlock,
}

/// Builds the [`OpenBlock`] for a start tag, or `None` when the tag does not
/// produce a block of its own.
///
/// * `tag` - the start tag reported by the parser.
/// * `start` - byte offset of the tag's start event in the source.
/// * `heading_path` - headings currently in effect; copied into the block.
/// * `open_blocks` - blocks that are still open, used to suppress paragraphs
///   nested inside list items or block quotes (their text stays with the
///   enclosing block).
///
/// Fenced code blocks get a `language` attribute holding the first
/// whitespace-separated word of the info string, when there is one.
fn open_block_for_tag(
    tag: &Tag<'_>,
    start: usize,
    heading_path: &[String],
    open_blocks: &[OpenBlock],
) -> Option<OpenBlock> {
    // Only code fences ever populate this; every other kind keeps it empty.
    let mut attrs = HashMap::new();

    let (kind, block_kind) = match tag {
        Tag::Heading { level, .. } => (OpenKind::Heading(heading_level(level)), BlockKind::Heading),
        Tag::Paragraph => {
            if inside_list_or_quote(open_blocks) {
                return None;
            }
            (OpenKind::Paragraph, BlockKind::Paragraph)
        }
        Tag::Item => (OpenKind::ListItem, BlockKind::ListItem),
        Tag::BlockQuote(_) => (OpenKind::BlockQuote, BlockKind::BlockQuote),
        Tag::CodeBlock(fence) => {
            if let CodeBlockKind::Fenced(info) = fence {
                // `split_whitespace` never yields an empty piece, so the first
                // piece (if any) is a usable language name.
                if let Some(language) = info.split_whitespace().next() {
                    attrs.insert("language".to_string(), language.to_string());
                }
            }
            (OpenKind::CodeFence, BlockKind::CodeFence)
        }
        Tag::TableHead => (OpenKind::TableHeader, BlockKind::TableHeader),
        Tag::TableRow => (OpenKind::TableRow, BlockKind::TableRow),
        Tag::HtmlBlock => (OpenKind::HtmlBlock, BlockKind::HtmlBlock),
        _ => return None,
    };

    Some(OpenBlock {
        kind,
        block_kind,
        start,
        heading_path: heading_path.to_vec(),
        attrs,
        excluded_ranges: Vec::new(),
    })
}

/// Returns `true` when any currently open block is a list item or a block quote.
fn inside_list_or_quote(open_blocks: &[OpenBlock]) -> bool {
    for open in open_blocks {
        if let OpenKind::ListItem | OpenKind::BlockQuote = open.kind {
            return true;
        }
    }
    false
}

/// Converts a parser heading level into its numeric depth (`H1` → 1 … `H6` → 6).
fn heading_level(level: &HeadingLevel) -> usize {
    match *level {
        HeadingLevel::H1 => 1,
        HeadingLevel::H2 => 2,
        HeadingLevel::H3 => 3,
        HeadingLevel::H4 => 4,
        HeadingLevel::H5 => 5,
        HeadingLevel::H6 => 6,
    }
}

/// Records `heading` at depth `level` on the heading stack.
///
/// Every entry at position `level - 1` or deeper is dropped first, so the new
/// heading replaces any previous heading of the same or a deeper level. When
/// the stack is shallower than `level - 1` nothing is dropped and the heading
/// is simply appended.
///
/// `level` is expected to be 1-based. A `level` of 0 is treated like 1 (the
/// stack is cleared before pushing) instead of spinning forever on an empty
/// stack.
///
/// NOTE: the stack stores labels only, not their levels, so depth is purely
/// positional; documents that skip heading levels get a compacted path.
fn apply_heading(stack: &mut Vec<String>, level: usize, heading: String) {
    // `truncate` is a no-op when the stack is already short enough, and
    // `saturating_sub` keeps level 0 from underflowing.
    stack.truncate(level.saturating_sub(1));
    stack.push(heading);
}

/// Derives the plain heading label from a heading's raw Markdown.
///
/// Only the first line is considered, which also covers setext headings whose
/// second line is the underline. Leading `#` markers are removed. A trailing
/// run of `#` is removed only when it is an ATX closing sequence, i.e. it is
/// preceded by a space or tab or makes up the whole remaining text; hashes
/// attached to the text (`# C#`, `# F# and C#`) are part of the label and are
/// kept.
///
/// Returns the trimmed first line unchanged when stripping would leave nothing
/// (for example a bare `#`), and an empty string for empty input.
fn extract_heading_label(raw_markdown: &str) -> String {
    let line = raw_markdown.lines().next().unwrap_or("").trim();
    let body = line.trim_start_matches('#').trim();

    let without_closing = body.trim_end_matches('#');
    // A closing sequence must be separated from the content by a space or tab;
    // otherwise the hashes belong to the heading text itself.
    let has_closing_sequence = without_closing.len() < body.len()
        && (without_closing.is_empty() || without_closing.ends_with([' ', '\t']));
    let label = if has_closing_sequence {
        without_closing.trim_end()
    } else {
        body
    };

    if label.is_empty() {
        line.to_string()
    } else {
        label.to_string()
    }
}

/// Returns `end` moved left past any trailing `\n` / `\r` bytes, never below `start`.
///
/// `end` is first clamped to the length of `source`. When the clamped end is
/// already at or before `start`, it is returned as is.
fn trim_trailing_newlines(source: &str, start: usize, end: usize) -> usize {
    let bytes = source.as_bytes();
    let clamped_end = end.min(bytes.len());
    if clamped_end <= start {
        return clamped_end;
    }

    let trailing_newlines = bytes[start..clamped_end]
        .iter()
        .rev()
        .take_while(|&&byte| byte == b'\n' || byte == b'\r')
        .count();
    clamped_end - trailing_newlines
}

/// Returns `source[start..end]` with every excluded range cut out.
///
/// With no exclusions the slice is returned verbatim. Otherwise each excluded
/// range is clamped to `start..end`, empty results are discarded, the
/// remaining (possibly overlapping) ranges are skipped in order of their
/// start, and trailing spaces, tabs and line endings are stripped from the
/// stitched text.
///
/// `start`, `end` and the range bounds are byte offsets and must fall on
/// UTF-8 character boundaries of `source`; slicing panics otherwise.
fn block_text(source: &str, start: usize, end: usize, excluded_ranges: &[Range<usize>]) -> String {
    if excluded_ranges.is_empty() {
        return source[start..end].to_string();
    }

    let confine = |offset: usize| offset.max(start).min(end);
    let mut holes: Vec<Range<usize>> = Vec::with_capacity(excluded_ranges.len());
    for excluded in excluded_ranges {
        let (hole_start, hole_end) = (confine(excluded.start), confine(excluded.end));
        if hole_start < hole_end {
            holes.push(hole_start..hole_end);
        }
    }
    holes.sort_by_key(|hole| hole.start);

    let mut kept = String::new();
    let mut resume_at = start;
    for hole in &holes {
        if resume_at < hole.start {
            kept.push_str(&source[resume_at..hole.start]);
        }
        // Overlapping holes may end before the point already reached.
        if hole.end > resume_at {
            resume_at = hole.end;
        }
    }
    if resume_at < end {
        kept.push_str(&source[resume_at..end]);
    }

    let trimmed_len = kept.trim_end_matches([' ', '\t', '\n', '\r']).len();
    kept.truncate(trimmed_len);
    kept
}

// Unit tests that drive `MarkdownExtractor::extract` on small inline documents.
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::extract::{BlockKind, Extractor};
    use crate::ingest::markdown::MarkdownExtractor;

    // The first heading becomes the title, and a paragraph under a nested
    // heading carries the full heading path (`Title` > `Details`).
    #[test]
    fn extracts_heading_paths_for_nested_sections() {
        let extractor = MarkdownExtractor;
        assert_eq!(extractor.profile_key(), "md");
        let markdown = br#"# Title
Intro paragraph.

## Details
More text.
"#;

        let doc = extractor
            .extract(Path::new("docs/readme.md"), markdown)
            .expect("extract markdown");

        assert_eq!(doc.title.as_deref(), Some("Title"));
        assert!(
            doc.blocks
                .iter()
                .any(|block| block.kind == BlockKind::Heading),
            "expected heading blocks"
        );
        assert!(
            doc.blocks.iter().any(|block| {
                block.kind == BlockKind::Paragraph
                    && block.heading_path == vec!["Title".to_string(), "Details".to_string()]
            }),
            "expected paragraph to carry nested heading path"
        );
    }

    // List items, block quotes and fenced code each produce their own block
    // kind, and the fence's info string is exposed as the `language` attribute.
    #[test]
    fn emits_list_quote_and_code_blocks_with_attrs() {
        let extractor = MarkdownExtractor;
        let markdown = br#"# Guide
- first item

> quoted text

```rust
fn main() {}
```
"#;

        let doc = extractor
            .extract(Path::new("docs/guide.md"), markdown)
            .expect("extract markdown");

        assert!(doc
            .blocks
            .iter()
            .any(|block| block.kind == BlockKind::ListItem));
        assert!(doc
            .blocks
            .iter()
            .any(|block| block.kind == BlockKind::BlockQuote));
        let code = doc
            .blocks
            .iter()
            .find(|block| block.kind == BlockKind::CodeFence)
            .expect("code fence block");
        assert_eq!(code.attrs.get("language").map(String::as_str), Some("rust"));
    }

    // A parent list item's text must not repeat the text of its nested child
    // item, and every block's `length` must equal the byte length of its text.
    #[test]
    fn nested_list_items_do_not_duplicate_child_text() {
        let extractor = MarkdownExtractor;
        let doc = extractor
            .extract(
                Path::new("docs/list.md"),
                br#"- parent listtarget
  - child nestedtarget
"#,
            )
            .expect("extract markdown");

        let list_items = doc
            .blocks
            .iter()
            .filter(|block| block.kind == BlockKind::ListItem)
            .map(|block| block.text.as_str())
            .collect::<Vec<_>>();
        assert_eq!(
            list_items,
            vec!["- parent listtarget", "- child nestedtarget"]
        );

        let canonical = list_items.join("\n\n");
        assert_eq!(canonical.matches("nestedtarget").count(), 1);
        assert!(!canonical.contains("listtarget\n  - child"));
        assert!(doc
            .blocks
            .iter()
            .all(|block| block.length == block.text.len()));
    }

    // Input that is not valid UTF-8 is rejected with a descriptive error
    // rather than being lossily decoded.
    #[test]
    fn rejects_non_utf8_markdown_bytes() {
        let extractor = MarkdownExtractor;
        let err = extractor
            .extract(Path::new("docs/readme.md"), &[0xff, 0xfe, 0xfd])
            .expect_err("invalid utf8 should fail");
        assert!(err.to_string().contains("non-utf8 markdown input"));
    }
}