use std::fmt;
use std::path::{Path, PathBuf};
use gobby_core::indexing::Chunk;
use serde_json::json;
use crate::frontmatter::{FrontmatterError, WikiFrontmatter, parse_frontmatter};
use crate::links::{WikiLink, extract_links};
/// An ATX heading found in a Markdown body, together with its byte ranges in the source text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownHeading {
/// Heading level, i.e. the number of leading `#` marks (1 through 6).
pub level: u8,
/// Heading text with surrounding whitespace and any closing `#` sequence removed.
pub title: String,
/// Titles of the headings this one is nested under, ending with its own title.
pub path: Vec<String>,
/// Byte offset of the start of the heading line.
pub byte_start: usize,
/// Byte offset just past the heading line's content, excluding the line terminator.
pub byte_end: usize,
/// Byte offset where the heading's section starts; set to the same value as `byte_start`.
pub section_byte_start: usize,
/// Byte offset where the section ends: the next heading's start, or the end of the text.
pub section_byte_end: usize,
}
/// Everything extracted from one Markdown document by `parse_markdown`.
#[derive(Debug, Clone, PartialEq)]
pub struct MarkdownDomainRecord {
/// Path the record was parsed for, as supplied by the caller.
pub path: PathBuf,
/// Metadata portion of the `parse_frontmatter` result.
pub frontmatter: WikiFrontmatter,
/// Byte offset reported by `parse_frontmatter`; headings and chunks are scanned from here.
pub body_start: usize,
/// ATX headings found at or after `body_start`, in document order.
pub headings: Vec<MarkdownHeading>,
/// Links returned by `extract_links` for the full Markdown text.
pub links: Vec<WikiLink>,
/// Non-blank byte ranges of the body, split at headings.
pub chunks: Vec<Chunk>,
}
/// Failure modes of `parse_markdown` and `parse_index_file`.
#[derive(Debug)]
pub enum MarkdownParseError {
/// Frontmatter parsing reported an error.
Frontmatter(FrontmatterError),
/// An I/O error occurred (for example while reading the file in `parse_index_file`).
Io(std::io::Error),
}
/// The opening fence of a fenced code block, remembered so the matching closing fence can be found.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct MarkdownFence {
/// Fence character as a byte: a backtick or `~`.
marker: u8,
/// Number of marker characters in the opening run (at least 3).
len: usize,
}
/// Recognizes the opening line of a fenced code block.
///
/// `line` must not include its line terminator. A fence opens when the line is
/// indented by at most three spaces and begins with a run of three or more
/// backticks or three or more tildes. Whatever follows the run is the info
/// string.
///
/// For backtick fences the info string must not contain another backtick.
/// Without this rule a line such as "```code``` text" (an inline code span)
/// would be mistaken for a fence opener and hide every heading after it.
///
/// Returns the fence marker and run length, or `None` when the line does not
/// open a fence.
pub(crate) fn markdown_fence_start(line: &str) -> Option<MarkdownFence> {
    let trimmed = line.trim_start_matches(' ');
    // Four or more leading spaces make the line indented code, not a fence.
    if line.len() - trimmed.len() > 3 {
        return None;
    }
    let marker = match trimmed.bytes().next()? {
        byte @ (b'`' | b'~') => byte,
        _ => return None,
    };
    let len = trimmed.bytes().take_while(|byte| *byte == marker).count();
    if len < 3 {
        return None;
    }
    // The marker is ASCII, so `len` is always a valid char boundary.
    if marker == b'`' && trimmed[len..].contains('`') {
        return None;
    }
    Some(MarkdownFence { marker, len })
}
/// Reports whether `line` closes the code block opened by `fence`.
///
/// The line closes the block when it is indented by at most three spaces,
/// starts with a run of the fence's marker at least as long as the opening
/// run, and has nothing but whitespace after that run.
pub(crate) fn markdown_fence_closes(line: &str, fence: MarkdownFence) -> bool {
    let candidate = line.trim_start_matches(' ');
    let indent = line.len() - candidate.len();
    if indent > 3 {
        return false;
    }
    // Length of the leading marker run: index of the first non-marker byte.
    let run = candidate
        .bytes()
        .position(|byte| byte != fence.marker)
        .unwrap_or(candidate.len());
    if run < fence.len {
        return false;
    }
    candidate[run..].chars().all(char::is_whitespace)
}
impl fmt::Display for MarkdownParseError {
    /// Writes the wrapped error's own message; no extra prefix is added.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let inner: &dyn fmt::Display = match self {
            Self::Frontmatter(error) => error,
            Self::Io(error) => error,
        };
        write!(f, "{inner}")
    }
}
// Uses the trait's default methods; `Display` already forwards the wrapped error's message.
impl std::error::Error for MarkdownParseError {}
impl From<FrontmatterError> for MarkdownParseError {
fn from(error: FrontmatterError) -> Self {
Self::Frontmatter(error)
}
}
impl From<std::io::Error> for MarkdownParseError {
fn from(error: std::io::Error) -> Self {
Self::Io(error)
}
}
/// Parses one Markdown document into a [`MarkdownDomainRecord`].
///
/// The frontmatter is parsed first. Links are then extracted from the full
/// `markdown` text using `known_targets`, while headings and chunks are
/// collected starting at the body offset reported by the frontmatter parser.
/// `path` is stored on the record and on every chunk; this function does not
/// touch the file system.
///
/// # Errors
///
/// Propagates any error returned by `parse_frontmatter`.
pub fn parse_markdown<I, S>(
    path: impl Into<PathBuf>,
    markdown: &str,
    known_targets: I,
) -> Result<MarkdownDomainRecord, MarkdownParseError>
where
    I: IntoIterator<Item = S>,
    S: AsRef<str>,
{
    let path: PathBuf = path.into();
    let parsed = parse_frontmatter(markdown)?;
    let body_start = parsed.body_start;

    let links = extract_links(markdown, known_targets);
    let headings = extract_headings(markdown, body_start);
    let chunks = build_chunks(&path, markdown, body_start, &headings);

    let record = MarkdownDomainRecord {
        path,
        frontmatter: parsed.metadata,
        body_start,
        headings,
        links,
        chunks,
    };
    Ok(record)
}
pub fn parse_index_file<I, S>(
path: impl AsRef<Path>,
known_targets: I,
) -> Result<MarkdownDomainRecord, MarkdownParseError>
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
let path = path.as_ref();
let markdown = std::fs::read_to_string(path)?;
parse_markdown(path.to_path_buf(), &markdown, known_targets)
}
/// Collects every ATX heading in `markdown` at or after `body_start`.
///
/// Lines are split on `\n`, and a trailing `\r` is not part of the line
/// content. Lines inside fenced code blocks are skipped, so a `#` there never
/// becomes a heading.
///
/// Each heading's `path` lists the titles of its ancestors followed by its own
/// title. An ancestor is the nearest earlier heading with a strictly smaller
/// level, so sibling headings never nest under each other, even when the
/// document skips levels or starts below level 1.
///
/// Each section runs from its heading line to the start of the next heading
/// (of any level), or to the end of the text for the last heading.
///
/// `body_start` must lie on a char boundary of `markdown`.
fn extract_headings(markdown: &str, body_start: usize) -> Vec<MarkdownHeading> {
    let mut headings: Vec<MarkdownHeading> = Vec::new();
    // Currently open ancestors as (level, title); levels strictly increase.
    let mut open_headings: Vec<(u8, String)> = Vec::new();
    let mut offset = body_start;
    let mut fence: Option<MarkdownFence> = None;
    while offset < markdown.len() {
        let line_end = markdown[offset..]
            .find('\n')
            .map_or(markdown.len(), |relative| offset + relative);
        // Strip the `\r` from the line itself so the content end can never
        // fall before `offset`.
        let raw_line = &markdown[offset..line_end];
        let line = raw_line.strip_suffix('\r').unwrap_or(raw_line);
        let line_content_end = offset + line.len();
        if let Some(active_fence) = fence {
            if markdown_fence_closes(line, active_fence) {
                fence = None;
            }
        } else if let Some(opening_fence) = markdown_fence_start(line) {
            fence = Some(opening_fence);
        } else if let Some((level, title)) = parse_atx_heading(line) {
            // Close every open heading that is not a strict ancestor.
            while open_headings
                .last()
                .is_some_and(|(open_level, _)| *open_level >= level)
            {
                open_headings.pop();
            }
            open_headings.push((level, title.clone()));
            // The previous section ends where this heading begins.
            if let Some(previous) = headings.last_mut() {
                previous.section_byte_end = offset;
            }
            headings.push(MarkdownHeading {
                level,
                title,
                path: open_headings
                    .iter()
                    .map(|(_, open_title)| open_title.clone())
                    .collect(),
                byte_start: offset,
                byte_end: line_content_end,
                section_byte_start: offset,
                // Provisional; shortened above if another heading follows.
                section_byte_end: markdown.len(),
            });
        }
        if line_end == markdown.len() {
            break;
        }
        offset = line_end + 1;
    }
    headings
}
/// Parses a single line as an ATX heading (`# Title`).
///
/// `line` must not include its line terminator. The line is a heading when it
/// is indented by at most three spaces, starts with one to six `#` marks, and
/// the marks are followed by whitespace or by the end of the line.
///
/// Returns the level and the title. The title is trimmed and has any closing
/// `#` sequence removed. Returns `None` when the line is not a heading.
pub(crate) fn parse_atx_heading(line: &str) -> Option<(u8, String)> {
    let unindented = line.trim_start_matches(' ');
    if line.len() - unindented.len() > 3 {
        return None;
    }

    let rest = unindented.trim_start_matches('#');
    let level = match unindented.len() - rest.len() {
        // The range check makes the narrowing cast lossless.
        marks @ 1..=6 => marks as u8,
        _ => return None,
    };

    // "#Title" is plain text; the marks need a separator unless the line ends.
    match rest.chars().next() {
        None => {}
        Some(first) if first.is_whitespace() => {}
        Some(_) => return None,
    }

    let title = strip_atx_closing_sequence(rest.trim());
    Some((level, title.to_string()))
}
/// Removes an optional ATX closing sequence (a trailing run of `#`) from a heading title.
///
/// `title` is expected to be the already-trimmed text that follows the opening
/// `#` marks. The trailing run is a closing sequence only when whitespace
/// precedes it, so "C#" keeps its hash while "Title ###" becomes "Title".
///
/// A title made up solely of `#` characters is also a closing sequence. The
/// whitespace that separated it from the opening marks was removed by the
/// caller's trim, so "# #" and "### ###" produce an empty title.
fn strip_atx_closing_sequence(title: &str) -> &str {
    let without_hashes = title.trim_end_matches('#');
    if without_hashes.len() == title.len() {
        // No trailing `#` at all.
        return title;
    }
    if without_hashes.is_empty() {
        // Nothing but hashes: the whole text is the closing sequence.
        return without_hashes;
    }
    if without_hashes.ends_with(char::is_whitespace) {
        without_hashes.trim_end()
    } else {
        // Hashes glued to the text (e.g. "C#") belong to the title.
        title
    }
}
/// Splits the body of `markdown` into chunks along heading boundaries.
///
/// Text between `body_start` and the first heading becomes a heading-less
/// chunk. Every heading contributes one chunk covering its section, labelled
/// with the heading's title and path. Text left between or after sections is
/// emitted as further heading-less chunks. Empty or whitespace-only ranges are
/// dropped by `push_chunk`.
fn build_chunks(
    path: &Path,
    markdown: &str,
    body_start: usize,
    headings: &[MarkdownHeading],
) -> Vec<Chunk> {
    let mut chunks = Vec::new();
    // Thread the "next uncovered byte" through the headings.
    let tail_start = headings.iter().fold(body_start, |gap_start, heading| {
        // Unlabelled text preceding this heading.
        push_chunk(
            &mut chunks,
            path,
            markdown,
            gap_start,
            heading.byte_start,
            None,
            Vec::new(),
        );
        // The heading's own section.
        push_chunk(
            &mut chunks,
            path,
            markdown,
            heading.section_byte_start,
            heading.section_byte_end,
            Some(heading.title.clone()),
            heading.path.clone(),
        );
        heading.section_byte_end
    });
    // Whatever remains after the last section (the whole body if no headings).
    push_chunk(
        &mut chunks,
        path,
        markdown,
        tail_start,
        markdown.len(),
        None,
        Vec::new(),
    );
    chunks
}
/// Appends a chunk for `markdown[byte_start..byte_end]` unless the range is empty or blank.
///
/// `heading` becomes the chunk's heading and `heading_path` is stored in the
/// chunk metadata under the `"heading_path"` key. Inverted, empty, and
/// whitespace-only ranges are ignored. The range must lie on char boundaries
/// within `markdown`.
fn push_chunk(
    chunks: &mut Vec<Chunk>,
    path: &Path,
    markdown: &str,
    byte_start: usize,
    byte_end: usize,
    heading: Option<String>,
    heading_path: Vec<String>,
) {
    // Check the bounds first so an inverted range is never sliced.
    if byte_start >= byte_end {
        return;
    }
    let is_blank = markdown[byte_start..byte_end].trim().is_empty();
    if is_blank {
        return;
    }
    let metadata = json!({ "heading_path": heading_path });
    chunks.push(Chunk {
        file_path: path.to_path_buf(),
        byte_start,
        byte_end,
        heading,
        metadata,
    });
}
#[cfg(test)]
mod tests {
// Unit tests for heading extraction, fence handling, chunking and file parsing.
use super::*;
use serde_json::Value;
// Two same-level headings after an intro paragraph: checks the body offset, the
// heading byte ranges, and that intro plus the two sections yield three chunks.
#[test]
fn extracts_heading_ranges() {
let markdown = concat!(
"---\n",
"title: Parser Test\n",
"---\n",
"Intro text.\n",
"\n",
"# Overview\n",
"Overview body with [[Known]].\n",
"\n",
"# Details\n",
"Details body.\n",
);
let overview_start = markdown.find("# Overview").expect("overview offset");
let details_start = markdown.find("# Details").expect("details offset");
let parsed =
parse_markdown("wiki/topics/parser.md", markdown, ["Known"]).expect("parse markdown");
assert_eq!(
parsed.body_start,
markdown.find("Intro text.").expect("body offset")
);
assert_eq!(parsed.headings.len(), 2);
assert_eq!(parsed.headings[0].title, "Overview");
assert_eq!(parsed.headings[0].path, vec!["Overview"]);
assert_eq!(parsed.headings[0].byte_start, overview_start);
assert_eq!(parsed.headings[0].section_byte_start, overview_start);
// A section ends where the next heading starts.
assert_eq!(parsed.headings[0].section_byte_end, details_start);
assert_eq!(parsed.headings[1].title, "Details");
assert_eq!(parsed.headings[1].section_byte_start, details_start);
assert_eq!(parsed.headings[1].section_byte_end, markdown.len());
assert_eq!(parsed.chunks.len(), 3);
// The first chunk is the intro text before any heading.
assert_eq!(parsed.chunks[0].heading, None);
assert_eq!(parsed.chunks[0].byte_start, parsed.body_start);
assert_eq!(parsed.chunks[0].byte_end, overview_start);
assert_eq!(parsed.chunks[1].heading.as_deref(), Some("Overview"));
assert_eq!(parsed.chunks[1].byte_start, overview_start);
assert_eq!(parsed.chunks[1].byte_end, details_start);
assert_eq!(
parsed.chunks[1]
.metadata
.get("heading_path")
.and_then(Value::as_array)
.and_then(|path| path.first())
.and_then(Value::as_str),
Some("Overview")
);
}
// A four-backtick fence is not closed by a three-backtick line, so both `#` lines
// inside it stay code and only the heading after the closing fence is reported.
#[test]
fn headings_ignore_code_until_matching_fence_length_closes() {
let markdown = "````md\n# Not Heading\n```\n# Still Not Heading\n````\n# Heading\n";
let parsed = parse_markdown(
"wiki/topics/fences.md",
markdown,
std::iter::empty::<&str>(),
)
.expect("parse markdown");
assert_eq!(parsed.headings.len(), 1);
assert_eq!(parsed.headings[0].title, "Heading");
}
// Parsing a file from disk must leave its contents untouched.
#[test]
fn index_parse_is_read_only() {
let tmp = tempfile::tempdir().expect("tempdir");
let page = tmp.path().join("wiki/topics/Page.md");
std::fs::create_dir_all(page.parent().expect("parent")).expect("create parent");
let markdown = "---\ntitle: Page\n---\n# Page\nSee [[Other Page]].\n";
std::fs::write(&page, markdown).expect("write page");
let parsed = parse_index_file(&page, ["Other Page"]).expect("parse index file");
assert_eq!(parsed.path, page);
assert_eq!(parsed.frontmatter.title.as_deref(), Some("Page"));
assert_eq!(parsed.links.len(), 1);
assert_eq!(
std::fs::read_to_string(&parsed.path).expect("read page"),
markdown
);
}
// A trailing `#` is a closing sequence only when whitespace precedes it.
#[test]
fn atx_heading_keeps_hash_without_preceding_space() {
assert_eq!(
parse_atx_heading("# C#").map(|(_, title)| title),
Some("C#".to_string())
);
assert_eq!(
parse_atx_heading("# Title ###").map(|(_, title)| title),
Some("Title".to_string())
);
}
}