use regex::Regex;
use std::path::Path;
use std::process::Command;
pub fn write_page_html(file_path: &str, lua_filter: Option<&str>, debug: bool) -> String {
let expanded = expand_tilde_path(file_path);
if debug {
eprintln!("Debug: Passed target file '{}' to pandoc.", expanded);
}
let pandoc = std::env::var("PANDOC_BIN").unwrap_or_else(|_| "pandoc".to_string());
let mut cmd = Command::new(&pandoc);
cmd.arg(&expanded)
.arg("--to=html4")
.arg("--from=markdown+fenced_divs-markdown_in_html_blocks")
.arg("--mathml");
if let Some(path) = lua_filter {
let exp = expand_tilde_path(path);
if Path::new(&exp).is_file() {
cmd.arg(format!("--lua-filter={}", exp));
}
}
let output = match cmd.output() {
Ok(o) => o,
Err(e) => {
eprintln!("Failed to execute pandoc on '{}': {}", expanded, e);
std::process::exit(1);
}
};
if !output.status.success() {
let err = String::from_utf8_lossy(&output.stderr);
panic!("Pandoc failed: {}", err);
}
String::from_utf8(output.stdout).expect("Pandoc returned invalid UTF-8")
}
/// Expands a leading `~` in `path` to the value of the `HOME` environment variable.
///
/// Only the exact string `~` and paths beginning with `~/` are expanded; anything else
/// (including `~user/...`) is returned unchanged. The path is also returned unchanged
/// when `HOME` is not set. A non-UTF-8 `HOME` is converted lossily.
pub fn expand_tilde_path(path: &str) -> String {
    let bare_tilde = path == "~";
    if !bare_tilde && !path.starts_with("~/") {
        return path.to_string();
    }

    match std::env::var_os("HOME") {
        None => path.to_string(),
        Some(home) => {
            let home = home.to_string_lossy();
            if bare_tilde {
                home.into_owned()
            } else {
                // Skip the two-byte "~/" prefix and re-join with a single separator.
                format!("{}/{}", home, &path[2..])
            }
        }
    }
}
/// A link whose `<a>` wrapper was removed from the HTML, recorded by where its text
/// now sits in the stripped output.
///
/// Offsets are byte offsets (not character counts) into the stripped string.
#[derive(Debug)]
pub struct Macrolink {
/// Value of the tag's `href` attribute; empty when no quoted `href` was found.
pub href: String,
/// Byte offset of the first byte of the link text in the stripped output.
pub ref_start: usize,
/// Byte offset of the last byte of the link text (inclusive).
pub ref_end: usize,
}
/// A same-page link (`href="#id"`) paired with the element that defines that id.
///
/// All four fields are byte offsets into the stripped HTML string.
#[derive(Debug)]
pub struct Autolink {
/// Offset just past the opening tag of the element carrying the matching `id`.
pub def_start: usize,
/// Offset of the byte immediately before that element's closing tag (inclusive);
/// this is `def_start - 1` when the element is empty.
pub def_end: usize,
/// Offset of the first byte of the referring link text.
pub ref_start: usize,
/// Offset of the last byte of the referring link text (inclusive).
pub ref_end: usize,
}
fn classify_links(links: Vec<Macrolink>, html: &str) -> (Vec<Macrolink>, Vec<Autolink>) {
let header_re = Regex::new(r#"<(h[1-6]|section|div)[^>]*\bid=["']([^"']+)["'][^>]*>"#).unwrap();
let tag_name_re = Regex::new(r#"<(\w+)"#).unwrap();
let mut header_positions = std::collections::HashMap::new();
for cap in header_re.captures_iter(html) {
let anchor_id = cap.get(2).unwrap().as_str().to_string();
let tag_end = cap.get(0).unwrap().end();
let m_full = cap.get(0).unwrap().as_str();
if let Some(tag_cap) = tag_name_re.captures(m_full) {
let tag_name = tag_cap.get(1).unwrap().as_str();
let close_re_str = format!(r"(?i)</{}\s*>", tag_name);
if let Ok(close_re) = Regex::new(&close_re_str) {
if let Some(close_m) = close_re.find(&html[tag_end..]) {
header_positions.insert(anchor_id, (tag_end, tag_end + close_m.start() - 1));
}
}
}
}
let mut macrolinks = Vec::new();
let mut autolinks = Vec::new();
for link in links {
if link.href.starts_with("#") {
let anchor = &link.href[1..];
if let Some(&(def_start, def_end)) = header_positions.get(anchor) {
autolinks.push(Autolink {
def_start,
def_end,
ref_start: link.ref_start,
ref_end: link.ref_end,
});
}
} else {
macrolinks.push(link);
}
}
(macrolinks, autolinks)
}
/// Removes plain `<a>` wrappers from `html` and reports where their text ended up.
///
/// Returns the rewritten HTML together with the links classified by `classify_links`
/// (run against the rewritten HTML, so all offsets refer to the returned string).
///
/// An `<a ...>...</a>` element is copied through untouched when its opening tag has a
/// `class` attribute or its `href` starts with `http://` or `https://`. Every other
/// anchor is replaced by its inner content, and a [`Macrolink`] with inclusive byte
/// offsets is recorded unless that content is empty. An opening tag with no closing
/// `</a>` after it ends the scan: the remainder of the input is copied verbatim.
pub fn strip_and_extract_links(html: &str) -> (String, Vec<Macrolink>, Vec<Autolink>) {
    let anchor_open = Regex::new(r"(?i)<a\s[^>]*>").unwrap();
    let anchor_close = Regex::new(r"(?i)</a\s*>").unwrap();
    let class_attr = Regex::new(r#"(?i)\bclass=["']"#).unwrap();
    let href_attr = Regex::new(r#"(?i)href=["']([^"']*)["']"#).unwrap();

    let mut stripped = String::new();
    let mut found = Vec::new();
    let mut cursor = 0;

    while cursor < html.len() {
        // No further anchors: flush the tail and finish.
        let opening = match anchor_open.find(&html[cursor..]) {
            Some(m) => m,
            None => {
                stripped.push_str(&html[cursor..]);
                break;
            }
        };
        let tag = opening.as_str();
        let tag_start = cursor + opening.start();
        let body_start = tag_start + tag.len();
        stripped.push_str(&html[cursor..tag_start]);

        // Unterminated anchor: keep everything from the tag onwards as-is.
        let closing = match anchor_close.find(&html[body_start..]) {
            Some(m) => m,
            None => {
                stripped.push_str(&html[tag_start..]);
                break;
            }
        };
        let body_end = body_start + closing.start();
        let element_end = body_start + closing.end();
        cursor = element_end;

        if class_attr.is_match(tag) {
            stripped.push_str(&html[tag_start..element_end]);
            continue;
        }

        let href = match href_attr.captures(tag) {
            Some(caps) => caps.get(1).unwrap().as_str().to_string(),
            None => String::new(),
        };
        if href.starts_with("http://") || href.starts_with("https://") {
            stripped.push_str(&html[tag_start..element_end]);
            continue;
        }

        // Replace the element with its inner content and remember the byte span.
        let body = &html[body_start..body_end];
        let ref_start = stripped.len();
        stripped.push_str(body);
        if !body.is_empty() {
            found.push(Macrolink {
                href,
                ref_start,
                ref_end: ref_start + body.len() - 1,
            });
        }
    }

    let (macrolinks, autolinks) = classify_links(found, &stripped);
    (stripped, macrolinks, autolinks)
}