// sculblog 0.1.9 - Docs.rs

use regex::Regex;
use std::path::Path;
use std::process::Command;

/// Runs pandoc on `file_path`. If `lua_filter` is `Some` and that path exists as a file,
/// passes `--lua-filter`; otherwise converts without a Lua filter.
pub fn write_page_html(file_path: &str, lua_filter: Option<&str>, debug: bool) -> String {
    let expanded = expand_tilde_path(file_path);
    if debug {
        eprintln!("Debug: Passed target file '{}' to pandoc.", expanded);
    }
    // this is for running ssh in non-interactive mode. set a PANDOC_BIN in your ssh command 
    let pandoc = std::env::var("PANDOC_BIN").unwrap_or_else(|_| "pandoc".to_string());
    let mut cmd = Command::new(&pandoc);
    cmd.arg(&expanded)
        .arg("--to=html4")
        .arg("--from=markdown+fenced_divs-markdown_in_html_blocks")
        .arg("--mathml");
    if let Some(path) = lua_filter {
        let exp = expand_tilde_path(path);
        if Path::new(&exp).is_file() {
            cmd.arg(format!("--lua-filter={}", exp));
        }
    }
    let output = match cmd.output() {
        Ok(o) => o,
        Err(e) => {
            eprintln!("Failed to execute pandoc on '{}': {}", expanded, e);
            std::process::exit(1);
        }
    };

    if !output.status.success() {
        let err = String::from_utf8_lossy(&output.stderr);
        panic!("Pandoc failed: {}", err);
    }
    String::from_utf8(output.stdout).expect("Pandoc returned invalid UTF-8")
}

/// Expands a leading `~` in `path` using the `HOME` environment variable.
///
/// * `"~"` becomes the value of `HOME`.
/// * `"~/rest"` becomes `"<HOME>/rest"`.
/// * Anything else (including `~user/...`) is returned unchanged.
///
/// When `HOME` is not set the input is returned unchanged. A non-UTF-8 `HOME` is
/// converted lossily.
pub fn expand_tilde_path(path: &str) -> String {
    // Looked up lazily so paths without a tilde never touch the environment.
    let home_dir = || std::env::var_os("HOME").map(|h| h.to_string_lossy().into_owned());

    let expanded = if let Some(rest) = path.strip_prefix("~/") {
        home_dir().map(|home| format!("{}/{}", home, rest))
    } else if path == "~" {
        home_dir()
    } else {
        None
    };

    expanded.unwrap_or_else(|| path.to_string())
}

/// A link whose `<a>` wrapper was removed by `strip_and_extract_links`, located by
/// byte offsets into the stripped HTML string that function returns.
#[derive(Debug)]
pub struct Macrolink {
    /// Value captured from the anchor's quoted `href` attribute; empty when the tag
    /// had no quoted `href`.
    pub href: String,
    /// Byte offset, in the stripped HTML, of the first byte of the link's inner text.
    pub ref_start: usize,
    /// Byte offset, in the stripped HTML, of the last byte of the link's inner text
    /// (inclusive).
    pub ref_end: usize,
}

/// A same-page link (`href` beginning with `#`) that `classify_links` matched to an
/// element carrying the corresponding `id`. All offsets are byte offsets into the
/// HTML string passed to `classify_links`.
#[derive(Debug)]
pub struct Autolink {
    /// Offset just past the opening tag of the element whose `id` matched.
    pub def_start: usize,
    /// Offset of the byte immediately before the first following closing tag of the
    /// same name (inclusive); equals `def_start - 1` when nothing lies between them.
    pub def_end: usize,
    /// Offset of the first byte of the referencing link's text.
    pub ref_start: usize,
    /// Offset of the last byte of the referencing link's text (inclusive).
    pub ref_end: usize,
}

fn classify_links(links: Vec<Macrolink>, html: &str) -> (Vec<Macrolink>, Vec<Autolink>) {
    let header_re = Regex::new(r#"<(h[1-6]|section|div)[^>]*\bid=["']([^"']+)["'][^>]*>"#).unwrap();
    let tag_name_re = Regex::new(r#"<(\w+)"#).unwrap();
    
    let mut header_positions = std::collections::HashMap::new();
    for cap in header_re.captures_iter(html) {
        let anchor_id = cap.get(2).unwrap().as_str().to_string();
        let tag_end = cap.get(0).unwrap().end();
        let m_full = cap.get(0).unwrap().as_str();
        
        if let Some(tag_cap) = tag_name_re.captures(m_full) {
            let tag_name = tag_cap.get(1).unwrap().as_str();
            let close_re_str = format!(r"(?i)</{}\s*>", tag_name);
            if let Ok(close_re) = Regex::new(&close_re_str) {
                if let Some(close_m) = close_re.find(&html[tag_end..]) {
                    header_positions.insert(anchor_id, (tag_end, tag_end + close_m.start() - 1));
                }
            }
        }
    }

    let mut macrolinks = Vec::new();
    let mut autolinks = Vec::new();

    for link in links {
        if link.href.starts_with("#") {
            let anchor = &link.href[1..];
            if let Some(&(def_start, def_end)) = header_positions.get(anchor) {
                autolinks.push(Autolink {
                    def_start,
                    def_end,
                    ref_start: link.ref_start,
                    ref_end: link.ref_end,
                });
            }
        } else {
            macrolinks.push(link);
        }
    }

    (macrolinks, autolinks)
}

/// Removes the `<a>` wrapper from internal links in `html` and reports where the
/// link text ended up.
///
/// Anchors are handled as follows:
/// * tags with a `class` attribute are copied through untouched;
/// * tags whose `href` starts with `http://` or `https://` are copied through untouched;
/// * every other anchor is replaced by its inner text, and (when that text is
///   non-empty) recorded with its `href` and the inclusive byte range the text
///   occupies in the output string.
///
/// An opening `<a ...>` with no closing `</a>` ends the scan: the remainder of the
/// input is copied verbatim.
///
/// Returns the rewritten HTML together with the recorded links after they have been
/// split by `classify_links` (run against the rewritten HTML).
pub fn strip_and_extract_links(html: &str) -> (String, Vec<Macrolink>, Vec<Autolink>) {
    let a_open_re = Regex::new(r"(?i)<a\s[^>]*>").unwrap();
    let a_close_re = Regex::new(r"(?i)</a\s*>").unwrap();
    let class_re = Regex::new(r#"(?i)\bclass=["']"#).unwrap();
    let href_re = Regex::new(r#"(?i)href=["']([^"']*)["']"#).unwrap();

    let mut found = Vec::new();
    let mut out = String::new();
    let mut cursor = 0;

    while cursor < html.len() {
        // Next opening anchor tag, or flush the tail and stop.
        let open = match a_open_re.find(&html[cursor..]) {
            Some(m) => m,
            None => {
                out.push_str(&html[cursor..]);
                break;
            }
        };
        let open_start = cursor + open.start();
        let body_start = cursor + open.end();
        out.push_str(&html[cursor..open_start]);

        // Matching close tag, or copy the unterminated anchor and everything after it.
        let close = match a_close_re.find(&html[body_start..]) {
            Some(m) => m,
            None => {
                out.push_str(&html[open_start..]);
                break;
            }
        };
        let element_end = body_start + close.end();
        let open_tag = open.as_str();

        // `None` means the tag carries a class and must be kept as-is; otherwise the
        // captured href (empty when the attribute is missing).
        let href = if class_re.is_match(open_tag) {
            None
        } else {
            Some(
                href_re
                    .captures(open_tag)
                    .map(|c| c.get(1).unwrap().as_str().to_string())
                    .unwrap_or_default(),
            )
        };

        match href {
            Some(href) if !(href.starts_with("http://") || href.starts_with("https://")) => {
                // Internal link: keep only the text and remember where it landed.
                let text = &html[body_start..body_start + close.start()];
                let landed_at = out.len();
                out.push_str(text);
                if !text.is_empty() {
                    found.push(Macrolink {
                        href,
                        ref_start: landed_at,
                        ref_end: landed_at + text.len() - 1,
                    });
                }
            }
            // Classed or external anchor: keep the whole element.
            _ => out.push_str(&html[open_start..element_end]),
        }

        cursor = element_end;
    }

    let (macro_l, auto_l) = classify_links(found, &out);
    (out, macro_l, auto_l)
}