ascent-research 0.4.2

//! Markdown → HTML rendering with three editorial conventions:
//!
//! 1. **Aside callout** — A blockquote whose first paragraph starts with
//!    `**aside:**` becomes `<p class="aside">…</p>`. Only the first such
//!    blockquote is extracted; subsequent ones stay as regular blockquotes
//!    and produce a warning.
//! 2. **Diagram inline** — An `![alt](diagrams/foo.svg)` image with a path
//!    under `<session_dir>/diagrams/` has its SVG contents inlined inside a
//!    `<div class="diagram">…<p class="caption">alt</p></div>` wrapper.
//!    Out-of-bounds paths are fatal; missing / oversize / non-SVG files
//!    render a styled "diagram pending" placeholder `<div>` instead and
//!    record a `diagram_fallback_img` warning.
//! 3. **Section numbering** — A heading whose text begins with a one- or
//!    two-digit number followed by ` · ` (e.g. `## 01 · WHY`) renders with
//!    the number pulled into a `<span class="section-num">` badge.
//!
//! Strategy: render with `pulldown-cmark` first, then post-process the HTML
//! with focused regex substitutions for each convention. Post-processing is
//! reliable because the input HTML is generated by a known renderer, not a
//! human.

use pulldown_cmark::{Options, Parser, html};
use regex::Regex;
use std::path::{Path, PathBuf};
use std::sync::OnceLock;

/// Max SVG file size (512 KiB) we'll inline — larger diagrams are reported
/// as `DiagramResolve::TooLarge` by `resolve_diagram` and are not inlined.
const SVG_MAX_BYTES: u64 = 512 * 1024;

/// Output of a render pass (`render_body` / `render_wiki_page`).
#[derive(Debug, Clone)]
pub struct RenderResult {
    /// Rendered HTML for the document body, after all post-processing steps
    /// of the producing function have been applied.
    pub body_html: String,
    /// First `**aside:**` blockquote, already wrapped in `<p class="aside">`.
    /// Empty string when absent (template substitution treats it as a skip).
    pub aside_html: String,
    /// Number of `diagrams/…svg` images whose SVG content was inlined.
    pub diagrams_inlined: u32,
    /// Non-fatal warning codes collected while rendering, e.g.
    /// `"aside_multiple"` or `"diagram_fallback_img"`.
    pub warnings: Vec<String>,
}

/// Fatal rendering failures. Anything recoverable is reported through
/// `RenderResult::warnings` instead.
#[derive(Debug, Clone)]
pub enum RenderError {
    /// A `diagrams/…` image resolved to a path outside `<session_dir>/diagrams/`.
    /// Carries the offending path for diagnostics.
    DiagramOutOfBounds(PathBuf),
}

impl std::error::Error for RenderError {}

impl std::fmt::Display for RenderError {
    /// Writes a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pull the printable payload out first so the message template is
        // stated exactly once.
        let offending = match self {
            Self::DiagramOutOfBounds(path) => path.display(),
        };
        write!(
            f,
            "diagram path '{}' resolves outside session_dir/diagrams/",
            offending
        )
    }
}

/// Render the body of a `session.md` document to HTML, applying the
/// editorial conventions (aside extraction, diagram inlining, section-number
/// badges).
///
/// `session_dir` is the absolute path to the session directory on disk; it is
/// used to resolve `diagrams/…` image paths and enforce path-containment.
///
/// # Errors
///
/// Propagates `RenderError::DiagramOutOfBounds` from diagram inlining.
pub fn render_body(md: &str, session_dir: &Path) -> Result<RenderResult, RenderError> {
    // The Sources block is rebuilt by the report builder from session.jsonl,
    // and the `# Research: …` H1 plus `## Objective` / `## Preset` scaffolding
    // is boilerplate — drop all of it so the body begins at `## Overview`.
    let trimmed_md = strip_scaffolding(md);
    let raw_html = markdown_to_html(&trimmed_md);

    let (without_aside, aside_html, aside_count) = extract_aside(&raw_html);
    let (with_diagrams, diagrams_inlined, mut warnings) =
        inline_diagrams(&without_aside, session_dir)?;

    // Only the first aside is lifted out; flag the document if there were more.
    if aside_count > 1 {
        warnings.push(String::from("aside_multiple"));
    }

    let body_html = apply_section_numbers(&with_diagrams);

    Ok(RenderResult {
        body_html,
        aside_html,
        diagrams_inlined,
        warnings,
    })
}

/// Render a wiki-page body: markdown → HTML plus diagram inlining only.
///
/// Unlike `render_body`, this performs **no** scaffolding strip, aside
/// extraction, or section-number styling. Wiki pages start with `# Slug`
/// followed by flowing prose, so they have no `## Overview` sentinel for
/// `strip_scaffolding` to anchor on — running them through `render_body`
/// would drop their entire body.
///
/// The returned `aside_html` is always empty.
///
/// # Errors
///
/// Propagates `RenderError::DiagramOutOfBounds` from diagram inlining.
pub fn render_wiki_page(md: &str, session_dir: &Path) -> Result<RenderResult, RenderError> {
    inline_diagrams(&markdown_to_html(md), session_dir).map(
        |(body_html, diagrams_inlined, warnings)| RenderResult {
            body_html,
            aside_html: String::new(),
            diagrams_inlined,
            warnings,
        },
    )
}

/// Convert markdown to HTML with `pulldown-cmark`, with the tables and
/// strikethrough extensions enabled.
fn markdown_to_html(md: &str) -> String {
    let mut enabled = Options::empty();
    enabled.insert(Options::ENABLE_TABLES);
    enabled.insert(Options::ENABLE_STRIKETHROUGH);

    // Pre-size the buffer at twice the input length as a starting estimate.
    let mut rendered = String::with_capacity(md.len() * 2);
    html::push_html(&mut rendered, Parser::new_ext(md, enabled));
    rendered
}

/// Remove the boilerplate that session.md always carries — the top H1 and the
/// Objective / Preset / Sources blocks — so the body starts at `## Overview`.
///
/// Sections are kept only when all of the following hold:
/// * they are at or after the first heading starting with `## Overview`
///   (if there is no such heading the result is empty);
/// * their heading does not start with `## Sources`;
/// * their body is not empty / pure HTML-comment placeholder. This covers the
///   default template's `## Findings` and `## Notes` when nothing has filled
///   them in, which would otherwise render as naked `<h2>` stubs.
fn strip_scaffolding(md: &str) -> String {
    split_on_headings(md)
        .into_iter()
        // Everything before the Overview heading is preamble.
        .skip_while(|section| !section.heading.trim().starts_with("## Overview"))
        .filter(|section| !section.heading.trim().starts_with("## Sources"))
        .filter(|section| !section_body_is_empty_or_placeholder(&section.body))
        .fold(String::with_capacity(md.len()), |mut kept, section| {
            if !section.heading.is_empty() {
                kept.push_str(&section.heading);
                kept.push('\n');
            }
            kept.push_str(&section.body);
            kept
        })
}

/// One `## …`-delimited chunk of a markdown document.
struct Section {
    /// The heading line exactly as written (empty for text that precedes the
    /// first `## ` heading).
    heading: String,
    /// Every line up to the next `## ` heading, each terminated with `\n`.
    body: String,
}

/// Split markdown into sections at every line whose first non-whitespace
/// characters are `## `. Deeper headings (`### …`) stay inside the body of
/// the enclosing section. Sections with neither heading nor body are never
/// emitted.
fn split_on_headings(md: &str) -> Vec<Section> {
    fn blank() -> Section {
        Section {
            heading: String::new(),
            body: String::new(),
        }
    }
    fn has_content(section: &Section) -> bool {
        !(section.heading.is_empty() && section.body.is_empty())
    }

    let mut sections = Vec::new();
    let mut open = blank();

    for line in md.lines() {
        if !line.trim_start().starts_with("## ") {
            open.body.push_str(line);
            open.body.push('\n');
            continue;
        }
        // A new heading closes whatever section was being accumulated.
        let finished = std::mem::replace(&mut open, blank());
        if has_content(&finished) {
            sections.push(finished);
        }
        open.heading.push_str(line);
    }

    if has_content(&open) {
        sections.push(open);
    }
    sections
}

/// Returns `true` when a section body carries no real content: every line is
/// either blank (after trimming) or a single-line HTML comment
/// (`<!-- … -->` opening and closing on the same line).
fn section_body_is_empty_or_placeholder(body: &str) -> bool {
    body.lines().map(str::trim).all(|line| {
        line.is_empty() || (line.starts_with("<!--") && line.ends_with("-->"))
    })
}

// ── Aside extraction ────────────────────────────────────────────────────────

/// Lazily compiled regex matching a `<blockquote>…</blockquote>` whose first
/// paragraph's first inline is `<strong>aside:</strong>`. The named group
/// `body` captures the paragraph content after that marker; trailing
/// whitespace after the closing tag is part of the match.
fn aside_re() -> &'static Regex {
    const PATTERN: &str = r"(?s)<blockquote>\s*<p>\s*<strong>aside:</strong>\s*(?P<body>.*?)</p>\s*</blockquote>\s*";
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| Regex::new(PATTERN).expect("aside regex must compile"))
}

/// Lift the first aside blockquote out of `html`.
///
/// Returns `(html_without_first_aside, aside_html, total_aside_count)`.
/// When no aside is present the HTML is returned unchanged, `aside_html` is
/// empty and the count is `0`. Later asides are left in place as ordinary
/// blockquotes; the count lets the caller warn about them.
fn extract_aside(html: &str) -> (String, String, usize) {
    let re = aside_re();
    let Some(first) = re.captures(html) else {
        return (html.to_string(), String::new(), 0);
    };
    let total = re.find_iter(html).count();

    // Group 0 always exists and `body` is a mandatory group of the pattern.
    let whole = first.get(0).unwrap();
    let body = first.name("body").unwrap().as_str().trim();

    // Stitch together everything around the first match.
    let rewritten = [&html[..whole.start()], &html[whole.end()..]].concat();
    let aside_html = format!("<p class=\"aside\">{body}</p>");
    (rewritten, aside_html, total)
}

// ── Diagram inlining ────────────────────────────────────────────────────────

/// Lazily compiled regex matching an `<img src="…" alt="…" [title="…"] />`
/// tag. Named groups: `src` (non-empty) and `alt` (possibly empty).
fn img_re() -> &'static Regex {
    const PATTERN: &str =
        r#"<img src="(?P<src>[^"]+)" alt="(?P<alt>[^"]*)"(?: title="[^"]*")?\s*/?>"#;
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| Regex::new(PATTERN).expect("img regex must compile"))
}

/// Replace `<img>` tags that point at `diagrams/…svg` with the SVG file's
/// contents wrapped in `<div class="diagram">…</div>`.
///
/// Returns `(rewritten_html, inlined_count, warnings)`.
///
/// * Images whose `src` does not start with `diagrams/` or does not end in
///   `.svg` (case-insensitive) are copied through untouched.
/// * Diagrams that are missing, oversize or not SVG become a
///   "diagram pending" placeholder and add a `diagram_fallback_img` warning.
/// * If the file resolves but cannot be read as a string, the original
///   `<img>` tag is kept and the same warning is added.
///
/// # Errors
///
/// Propagates `RenderError::DiagramOutOfBounds` from `resolve_diagram`.
fn inline_diagrams(
    html: &str,
    session_dir: &Path,
) -> Result<(String, u32, Vec<String>), RenderError> {
    let diagrams_root = session_dir.join("diagrams");
    let mut rendered = String::with_capacity(html.len());
    let mut warnings: Vec<String> = Vec::new();
    let mut inlined = 0u32;
    // Byte offset in `html` up to which output has already been produced.
    let mut cursor = 0usize;

    for caps in img_re().captures_iter(html) {
        let tag = caps.get(0).unwrap();
        let src = caps.name("src").unwrap().as_str();
        let alt = caps.name("alt").map_or("", |a| a.as_str());

        // Copy the untouched text between the previous tag and this one.
        rendered.push_str(&html[cursor..tag.start()]);
        cursor = tag.end();

        // Only intercept diagrams/…/*.svg paths.
        let is_session_svg =
            src.starts_with("diagrams/") && src.to_ascii_lowercase().ends_with(".svg");
        if !is_session_svg {
            rendered.push_str(tag.as_str());
            continue;
        }

        let svg_path = match resolve_diagram(src, &diagrams_root)? {
            DiagramResolve::Inlinable(path) => path,
            DiagramResolve::TooLarge | DiagramResolve::Missing | DiagramResolve::NotSvg => {
                // The author referenced `diagrams/<path>.svg` but no usable
                // file exists. A styled placeholder keeps the report
                // readable and makes the gap obvious, unlike a broken-image
                // `<img>`.
                let fname = src.strip_prefix("diagrams/").unwrap_or(src);
                let label = if alt.is_empty() { fname } else { alt };
                rendered.push_str(&format!(
                    r#"<div class="diagram diagram-missing"><p class="diagram-missing-label">diagram pending</p><p class="caption">{label} — <code>{src}</code></p></div>"#
                ));
                warnings.push("diagram_fallback_img".into());
                continue;
            }
        };

        if let Ok(svg) = std::fs::read_to_string(&svg_path) {
            rendered.push_str("<div class=\"diagram\">");
            rendered.push_str(&svg);
            if !alt.is_empty() {
                rendered.push_str("<p class=\"caption\">");
                rendered.push_str(alt);
                rendered.push_str("</p>");
            }
            rendered.push_str("</div>");
            inlined += 1;
        } else {
            // Race between resolve + read — fall back to the original tag.
            rendered.push_str(tag.as_str());
            warnings.push("diagram_fallback_img".into());
        }
    }

    rendered.push_str(&html[cursor..]);
    Ok((rendered, inlined, warnings))
}

/// Non-fatal outcome of resolving a `diagrams/…` image reference
/// (see `resolve_diagram`).
enum DiagramResolve {
    /// File exists inside the diagrams root, is within the size limit and has
    /// an `.svg` extension; carries its canonical path.
    Inlinable(PathBuf),
    /// File exists but is larger than `SVG_MAX_BYTES`.
    TooLarge,
    /// The path could not be canonicalized or its metadata could not be read.
    Missing,
    /// The canonical path's extension is not `svg` (case-insensitive).
    NotSvg,
}

/// Resolve an image `src` such as `diagrams/foo.svg` against `diagrams_root`
/// and classify it.
///
/// Returns:
/// * `Ok(DiagramResolve::Inlinable(path))` — canonical path of an SVG file
///   inside `diagrams_root` that is no larger than `SVG_MAX_BYTES`;
/// * `Ok(DiagramResolve::Missing)` — the path cannot be canonicalized or its
///   metadata cannot be read;
/// * `Ok(DiagramResolve::TooLarge)` / `Ok(DiagramResolve::NotSvg)` — the file
///   exists but is oversize / does not carry an `.svg` extension.
///
/// # Errors
///
/// `RenderError::DiagramOutOfBounds` when the reference contains a `..` path
/// component, or when its canonical location is not under the canonical
/// `diagrams_root` (for example via a symlink).
fn resolve_diagram(src: &str, diagrams_root: &Path) -> Result<DiagramResolve, RenderError> {
    // src is like "diagrams/foo.svg" — strip the prefix and join to root.
    let suffix = src.strip_prefix("diagrams/").unwrap_or(src);
    let candidate = diagrams_root.join(suffix);

    // Reject traversal even before canonicalize (for missing files where
    // canonicalize fails). Inspect path *components* rather than searching
    // for the substring "..", so that legitimate file names such as
    // `v1..2.svg` are not mistaken for a traversal attempt.
    let climbs_out = Path::new(suffix)
        .components()
        .any(|part| matches!(part, std::path::Component::ParentDir));
    if climbs_out {
        return Err(RenderError::DiagramOutOfBounds(candidate));
    }

    let Ok(canonical) = std::fs::canonicalize(&candidate) else {
        return Ok(DiagramResolve::Missing);
    };

    // Containment check on fully resolved paths; this is what catches
    // symlinks and absolute suffixes that escape the diagrams directory.
    let root_canonical =
        std::fs::canonicalize(diagrams_root).unwrap_or_else(|_| diagrams_root.to_path_buf());
    if !canonical.starts_with(&root_canonical) {
        return Err(RenderError::DiagramOutOfBounds(canonical));
    }

    let Ok(meta) = std::fs::metadata(&canonical) else {
        return Ok(DiagramResolve::Missing);
    };
    if meta.len() > SVG_MAX_BYTES {
        return Ok(DiagramResolve::TooLarge);
    }

    // Sanity check content-type via extension only — we don't want to read
    // the first bytes for a magic sniff; agents set extensions.
    let has_svg_extension = canonical
        .extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| ext.eq_ignore_ascii_case("svg"));
    if !has_svg_extension {
        return Ok(DiagramResolve::NotSvg);
    }

    Ok(DiagramResolve::Inlinable(canonical))
}

// ── Section numbering ───────────────────────────────────────────────────────

/// Lazily compiled regex matching `<hN>01 · WHY</hN>` style headings.
/// Named groups: `lvl` (heading level 1–6), `num` (one or two digits) and
/// `title` (the text after the ` · ` separator).
fn section_num_re() -> &'static Regex {
    const PATTERN: &str = r"<h(?P<lvl>[1-6])>(?P<num>\d{1,2})\s*·\s*(?P<title>.+?)</h[1-6]>";
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| Regex::new(PATTERN).expect("section num regex must compile"))
}

/// Rewrite every numbered heading (`<hN>01 · WHY</hN>`) so the number sits in
/// a `<span class="section-num">` badge and the title in its own `<span>`.
/// Headings that do not match the pattern are left untouched.
fn apply_section_numbers(html: &str) -> String {
    let rewritten = section_num_re().replace_all(html, |caps: &regex::Captures| {
        // All three groups are mandatory in the pattern, so indexing by name
        // cannot fail for a successful match.
        let (lvl, num, title) = (&caps["lvl"], &caps["num"], &caps["title"]);
        format!("<h{lvl}><span class=\"section-num\">{num}</span><span>{title}</span></h{lvl}>")
    });
    rewritten.into_owned()
}

/// Unit tests for the render pipeline. Each test renders a small markdown
/// snippet against a throw-away session directory.
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Render `md` with `render_body` against an empty temporary session dir.
    fn render(md: &str) -> RenderResult {
        let tmp = TempDir::new().unwrap();
        render_body(md, tmp.path()).unwrap()
    }

    #[test]
    fn aside_single_block_extracted() {
        let md = "## Overview\nbody\n\n> **aside:** The less you build, the more it works.\n\nmore body\n";
        let r = render(md);
        assert!(r.aside_html.contains("The less you build"));
        assert!(r.aside_html.starts_with("<p class=\"aside\">"));
        assert!(!r.body_html.contains("The less you build"));
        assert!(r.body_html.contains("more body"));
    }

    #[test]
    fn aside_absent_leaves_body_untouched() {
        let md = "## Overview\nno aside\n\n> just a plain quote\n";
        let r = render(md);
        assert!(r.aside_html.is_empty());
        assert!(r.body_html.contains("plain quote"));
    }

    #[test]
    fn multiple_asides_warn_and_keep_first() {
        let md = "## Overview\nx\n\n> **aside:** first\n\nmid\n\n> **aside:** second\n";
        let r = render(md);
        assert!(r.aside_html.contains("first"));
        assert!(!r.aside_html.contains("second"));
        assert!(r.warnings.iter().any(|w| w == "aside_multiple"));
        // The second aside survives in the body as a normal blockquote.
        assert!(r.body_html.contains("<strong>aside:</strong>"));
    }

    #[test]
    fn section_number_wrapped() {
        let md = "## Overview\ncontent\n\n## 01 · WHY\nintro\n";
        let r = render(md);
        assert!(
            r.body_html
                .contains("<span class=\"section-num\">01</span>")
        );
        assert!(r.body_html.contains("<span>WHY</span>"));
        assert!(!r.body_html.contains("<h2>01 ·"));
    }

    #[test]
    fn section_without_pattern_unchanged() {
        let md = "## Overview\n\n## Regular heading\nx\n";
        let r = render(md);
        assert!(r.body_html.contains("<h2>Regular heading</h2>"));
        assert!(!r.body_html.contains("section-num"));
    }

    #[test]
    fn diagram_inline_happy_path() {
        let tmp = TempDir::new().unwrap();
        std::fs::create_dir_all(tmp.path().join("diagrams")).unwrap();
        std::fs::write(
            tmp.path().join("diagrams/foo.svg"),
            "<svg xmlns=\"http://www.w3.org/2000/svg\"><circle r=\"5\"/></svg>",
        )
        .unwrap();

        let md = "## Overview\nx\n\n![Fig · demo](diagrams/foo.svg)\n";
        let r = render_body(md, tmp.path()).unwrap();

        assert_eq!(r.diagrams_inlined, 1);
        assert!(r.body_html.contains("<div class=\"diagram\">"));
        assert!(r.body_html.contains("<circle r=\"5\"/>"));
        assert!(r.body_html.contains("<p class=\"caption\">Fig · demo</p>"));
        assert!(!r.body_html.contains("<img"));
    }

    #[test]
    fn render_wiki_page_keeps_body_without_overview_heading() {
        // Regression for the wiki-render bug: `render_body` strips
        // everything before `## Overview`, so a wiki page that starts
        // with `# Slug` plus prose came out empty. `render_wiki_page`
        // keeps the body intact.
        let tmp = TempDir::new().unwrap();
        let md = "# Scheduler\n\nThe scheduler coordinates workers.\n\nSecond paragraph.\n";
        let r = render_wiki_page(md, tmp.path()).unwrap();
        assert!(r.body_html.contains("<h1>Scheduler</h1>"));
        assert!(r.body_html.contains("coordinates workers"));
        assert!(r.body_html.contains("Second paragraph"));
    }

    #[test]
    fn diagram_missing_renders_placeholder() {
        let tmp = TempDir::new().unwrap();
        std::fs::create_dir_all(tmp.path().join("diagrams")).unwrap();
        let md = "## Overview\nx\n\n![missing](diagrams/nope.svg)\n";
        let r = render_body(md, tmp.path()).unwrap();
        assert_eq!(r.diagrams_inlined, 0);
        // Do NOT leave a broken <img> tag — that renders a broken-
        // image icon in the browser. Emit a styled placeholder and
        // keep the warning so callers can surface it.
        assert!(!r.body_html.contains("<img src=\"diagrams/nope.svg\""));
        assert!(r.body_html.contains(r#"class="diagram diagram-missing""#));
        assert!(r.body_html.contains("diagram pending"));
        assert!(r.warnings.iter().any(|w| w == "diagram_fallback_img"));
    }

    #[test]
    fn diagram_out_of_bounds_is_fatal() {
        let tmp = TempDir::new().unwrap();
        std::fs::create_dir_all(tmp.path().join("diagrams")).unwrap();
        let md = "## Overview\nx\n\n![bad](diagrams/../../etc/passwd.svg)\n";
        let err = render_body(md, tmp.path()).unwrap_err();
        // `matches!` on its own only yields a bool; it must be wrapped in
        // `assert!` for a wrong error variant to actually fail the test.
        assert!(matches!(err, RenderError::DiagramOutOfBounds(_)));
    }

    #[test]
    fn diagram_too_large_falls_back() {
        let tmp = TempDir::new().unwrap();
        std::fs::create_dir_all(tmp.path().join("diagrams")).unwrap();
        let big = "<svg>".to_string() + &"x".repeat((SVG_MAX_BYTES + 1) as usize) + "</svg>";
        std::fs::write(tmp.path().join("diagrams/big.svg"), big).unwrap();

        let md = "## Overview\nx\n\n![huge](diagrams/big.svg)\n";
        let r = render_body(md, tmp.path()).unwrap();
        assert_eq!(r.diagrams_inlined, 0);
        assert!(r.warnings.iter().any(|w| w == "diagram_fallback_img"));
    }

    #[test]
    fn non_diagram_img_untouched() {
        let tmp = TempDir::new().unwrap();
        let md = "## Overview\nx\n\n![logo](https://example.com/pic.png)\n";
        let r = render_body(md, tmp.path()).unwrap();
        assert!(
            r.body_html
                .contains("<img src=\"https://example.com/pic.png\"")
        );
        assert_eq!(r.warnings.len(), 0);
    }

    #[test]
    fn strip_scaffolding_removes_preamble() {
        let md = "# Research: X\n\n## Objective\nfoo\n\n## Preset\ntech\n\n## Sources\n<!-- research:sources-start -->\n- foo\n<!-- research:sources-end -->\n\n## Overview\nthe real thing\n";
        let r = render(md);
        assert!(!r.body_html.contains("Research: X"));
        assert!(!r.body_html.contains("Preset"));
        assert!(!r.body_html.contains("research:sources-start"));
        assert!(r.body_html.contains("the real thing"));
    }
}