use quick_xml::events::Event;
use quick_xml::Reader;
pub fn xhtml_to_typst(xhtml: &str) -> String {
let mut reader = Reader::from_str(xhtml);
reader.config_mut().check_end_names = false;
let mut buf = Vec::new();
let mut blocks: Vec<String> = Vec::new();
let mut line = String::new();
let mut heading: Option<usize> = None;
let mut list_stack: Vec<char> = Vec::new();
let mut in_head = false;
fn flush(blocks: &mut Vec<String>, line: &mut String) {
let t = line.trim();
if !t.is_empty() {
blocks.push(t.to_string());
}
line.clear();
}
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(e)) => match local(e.name().as_ref()) {
b"head" => in_head = true,
b"h1" | b"h2" | b"h3" | b"h4" | b"h5" | b"h6" => {
flush(&mut blocks, &mut line);
heading = Some(heading_level(local(e.name().as_ref())));
}
b"p" | b"div" | b"blockquote" => flush(&mut blocks, &mut line),
b"em" | b"i" => line.push('_'),
b"strong" | b"b" => line.push('*'),
b"ul" => list_stack.push('-'),
b"ol" => list_stack.push('+'),
b"li" => {
flush(&mut blocks, &mut line);
let marker = list_stack.last().copied().unwrap_or('-');
line.push(marker);
line.push(' ');
}
_ => {}
},
Ok(Event::End(e)) => match local(e.name().as_ref()) {
b"head" => in_head = false,
b"h1" | b"h2" | b"h3" | b"h4" | b"h5" | b"h6" => {
if let Some(lvl) = heading.take() {
let t = line.trim();
if !t.is_empty() {
blocks.push(format!("{} {}", "=".repeat(lvl), t));
}
line.clear();
}
}
b"p" | b"div" | b"blockquote" | b"li" => flush(&mut blocks, &mut line),
b"em" | b"i" => line.push('_'),
b"strong" | b"b" => line.push('*'),
b"ul" | b"ol" => {
list_stack.pop();
}
_ => {}
},
Ok(Event::Empty(e)) => match local(e.name().as_ref()) {
b"br" => line.push(' '),
b"img" => {
if let Some(src) = attr(&e, b"src") {
line.push_str(&format!("#image(\"{src}\")"));
}
}
_ => {}
},
Ok(Event::Text(t)) => {
if !in_head {
let s = t.unescape().unwrap_or_default();
line.push_str(&escape_typst(&s));
}
}
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
buf.clear();
}
flush(&mut blocks, &mut line);
blocks.join("\n\n")
}
/// Maps a prefix-free heading tag name to its nesting level.
///
/// `h1` through `h5` yield 1 through 5; every other input (including `h6`)
/// yields 6.
fn heading_level(local: &[u8]) -> usize {
    if let [b'h', digit @ b'1'..=b'5'] = local {
        usize::from(*digit - b'0')
    } else {
        6
    }
}
/// Escapes plain text so Typst renders it literally in markup mode.
///
/// A backslash is inserted before:
/// - `\`, `#`, `*`, `_`, `` ` ``, `$`, `@`, `<`, `>` — code, emphasis, raw,
///   math, reference and label syntax;
/// - `~` — the non-breaking-space shorthand;
/// - `[` and `]` — content-block delimiters;
/// - a `/` that would open a comment (`//` or `/*`), because Typst would
///   otherwise discard the text that follows.
///
/// The `//` directly after `http:` or `https:` is left alone so Typst's
/// automatic link detection keeps working. Only `s` itself is inspected: a
/// `/` at the very end of `s` is not escaped even if the caller later
/// appends another `/`.
fn escape_typst(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(c) = chars.next() {
        let needs_escape = match c {
            '\\' | '#' | '*' | '_' | '`' | '$' | '@' | '<' | '>' | '~' | '[' | ']' => true,
            '/' => match chars.peek() {
                Some('*') => true,
                Some('/') => !(out.ends_with("http:") || out.ends_with("https:")),
                _ => false,
            },
            _ => false,
        };
        if needs_escape {
            out.push('\\');
        }
        out.push(c);
    }
    out
}
/// Returns the part of `name` after its last `:`, i.e. the element or
/// attribute name without a namespace prefix.
///
/// A name without any `:` is returned unchanged; a name ending in `:`
/// yields an empty slice.
fn local(name: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one segment, and the first one it
    // yields is whatever follows the final separator.
    name.rsplit(|&byte| byte == b':').next().unwrap_or(name)
}
/// Looks up the first attribute of `e` whose prefix-free name equals `key`.
///
/// Attributes that fail to parse are skipped. The value bytes are converted
/// with lossy UTF-8 decoding and returned as stored in the attribute; no
/// entity unescaping is performed here. Returns `None` when no attribute
/// matches.
fn attr(e: &quick_xml::events::BytesStart, key: &[u8]) -> Option<String> {
    e.attributes()
        .flatten()
        .find(|candidate| local(candidate.key.as_ref()) == key)
        .map(|found| String::from_utf8_lossy(&found.value).into_owned())
}
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;

    /// A heading, two paragraphs and inline emphasis are rendered as
    /// separate blocks with Typst markers.
    #[test]
    fn headings_paragraphs_and_inline() {
        let typ = xhtml_to_typst(
            "<html><body>\
             <h1>Chapter One</h1>\
             <p>The road was <strong>long</strong> and <em>cold</em>.</p>\
             <p>Second paragraph.</p>\
             </body></html>",
        );
        for expected in [
            "= Chapter One",
            "The road was *long* and _cold_.",
            "\n\nSecond paragraph.",
        ] {
            assert!(typ.contains(expected));
        }
    }

    /// Unordered list items get a `- ` marker and a self-closing `img`
    /// becomes an `#image(...)` call.
    #[test]
    fn lists_and_images() {
        let typ = xhtml_to_typst(
            "<body><ul><li>alpha</li><li>beta</li></ul>\
             <p>see <img src=\"img/x.png\"/> here</p></body>",
        );
        for expected in ["- alpha", "- beta", "#image(\"img/x.png\")"] {
            assert!(typ.contains(expected));
        }
    }

    /// Characters that are markup in Typst are backslash-escaped in text.
    #[test]
    fn text_is_typst_escaped() {
        let typ = xhtml_to_typst("<body><p>cost is #5 *not* a list_item</p></body>");
        for expected in ["\\#5", "\\*not\\*", "list\\_item"] {
            assert!(typ.contains(expected), "got: {typ}");
        }
    }

    /// Text inside `<head>` does not reach the output; body text does.
    #[test]
    fn head_content_is_dropped() {
        let typ = xhtml_to_typst(
            "<html><head><title>meta</title></head><body><p>body text</p></body></html>",
        );
        assert!(!typ.contains("meta"));
        assert!(typ.contains("body text"));
    }

    proptest! {
        /// Arbitrary printable input must never make the converter panic.
        #[test]
        fn never_panics(s in "\\PC{0,400}") {
            let _ = xhtml_to_typst(&s);
        }

        /// Random sequences of tags and stray markup characters, mostly
        /// ill-formed, must never make the converter panic.
        #[test]
        fn tag_salad_never_panics(
            toks in proptest::collection::vec(
                proptest::sample::select(vec![
                    "<p>", "</p>", "<h1>", "</h1>", "<strong>", "</strong>",
                    "<em>", "<ul>", "<li>", "</li>", "</ul>", "<br/>",
                    "<img src=\"x\"/>", "word", " ", "&", "<", ">",
                ]),
                0..200,
            ),
        ) {
            let _ = xhtml_to_typst(&toks.concat());
        }
    }
}