use crate::markdown::Token;
use super::ir::{Block, DefinitionEntry, FootnoteEntry, InlineRun, ListBullet, ListEntry, RunFlags};
use std::collections::HashMap;
/// Lowers a markdown token slice into block-level IR.
///
/// Inline tokens are accumulated into a pending paragraph; every block-level
/// token first flushes that paragraph and then emits its own [`Block`].
///
/// Footnote definitions are collected while walking and emitted as a single
/// trailing [`Block::FootnoteDefinitions`]. Definitions whose label is
/// referenced come first, ordered by reference number; definitions that are
/// never referenced follow, sorted by label and numbered after the highest
/// reference number so they cannot collide with a referenced label that has
/// no definition.
///
/// Block quotes are lowered by calling this function on their body, so the
/// footnote numbering is recomputed for that nested slice.
pub fn lower(tokens: &[Token]) -> Vec<Block> {
    let mut out = Vec::new();
    // Inline runs of the paragraph currently being assembled.
    let mut buffered_inline: Vec<InlineRun> = Vec::new();
    let footnote_numbers = collect_footnote_numbering(tokens);
    // First definition seen for each label; later duplicates are ignored.
    let mut footnote_definitions: HashMap<String, Vec<InlineRun>> = HashMap::new();
    // Depth of recognised inline HTML tags at the top level of this slice.
    // It is never reset, so an unclosed tag keeps styling later paragraphs.
    let mut root_html_depth = InlineHtmlDepth::default();

    /// Emits the buffered runs as a paragraph unless every run is
    /// whitespace-only, and leaves the buffer empty either way.
    fn flush_paragraph(out: &mut Vec<Block>, buffered: &mut Vec<InlineRun>) {
        if !buffered.iter().all(|r| r.text.trim().is_empty()) {
            out.push(Block::Paragraph {
                runs: std::mem::take(buffered),
            });
        }
        buffered.clear();
    }

    let mut i = 0;
    while i < tokens.len() {
        match &tokens[i] {
            Token::Newline => {
                // Count the whole run of consecutive newlines: two or more end
                // the paragraph, a single one is a soft break (one space).
                let mut run = 0usize;
                while i + run < tokens.len() && matches!(tokens[i + run], Token::Newline) {
                    run += 1;
                }
                if run >= 2 {
                    flush_paragraph(&mut out, &mut buffered_inline);
                } else if !buffered_inline.is_empty() {
                    push_text(&mut buffered_inline, " ", RunFlags::default(), None);
                }
                i += run;
            }
            Token::HardBreak => {
                flush_paragraph(&mut out, &mut buffered_inline);
                i += 1;
            }
            Token::Heading(content, level) => {
                flush_paragraph(&mut out, &mut buffered_inline);
                let runs = flatten_inline(content, RunFlags::default(), None, &footnote_numbers);
                out.push(Block::Heading {
                    level: (*level).clamp(1, 6) as u8,
                    runs,
                });
                i += 1;
            }
            Token::Code {
                content,
                block: true,
                ..
            } => {
                flush_paragraph(&mut out, &mut buffered_inline);
                let lines = content.split('\n').map(|s| s.to_string()).collect();
                out.push(Block::CodeBlock { lines });
                i += 1;
            }
            Token::HorizontalRule => {
                flush_paragraph(&mut out, &mut buffered_inline);
                out.push(Block::HorizontalRule);
                i += 1;
            }
            Token::HtmlBlock(content) => {
                flush_paragraph(&mut out, &mut buffered_inline);
                if is_pagebreak_marker(content) {
                    out.push(Block::PageBreak);
                } else if let Some(img) = parse_html_img_block(content) {
                    out.push(Block::Image {
                        path: std::path::PathBuf::from(&img.src),
                        alt: img.alt,
                        caption: img.title,
                    });
                } else if is_framing_only_html(content) {
                    // Framing tags (<p>, <div>, <center>) produce no block.
                } else if !is_only_html_comments(content) {
                    out.push(Block::HtmlBlock {
                        content: content.clone(),
                    });
                }
                i += 1;
            }
            Token::BlockQuote(body) => {
                flush_paragraph(&mut out, &mut buffered_inline);
                let nested = lower(body);
                out.push(Block::BlockQuote { body: nested });
                i += 1;
            }
            Token::DefinitionList { entries } => {
                flush_paragraph(&mut out, &mut buffered_inline);
                let ir_entries: Vec<DefinitionEntry> = entries
                    .iter()
                    .map(|e| DefinitionEntry {
                        term: flatten_inline(&e.term, RunFlags::default(), None, &footnote_numbers),
                        definitions: e
                            .definitions
                            .iter()
                            .map(|d| flatten_inline(d, RunFlags::default(), None, &footnote_numbers))
                            .collect(),
                    })
                    .collect();
                out.push(Block::DefinitionList { entries: ir_entries });
                i += 1;
            }
            Token::FootnoteDefinition { label, content } => {
                flush_paragraph(&mut out, &mut buffered_inline);
                let runs = flatten_inline(content, RunFlags::default(), None, &footnote_numbers);
                footnote_definitions
                    .entry(label.clone())
                    .or_insert(runs);
                i += 1;
            }
            Token::ListItem { .. } => {
                flush_paragraph(&mut out, &mut buffered_inline);
                // Gather the run of consecutive list items (newlines between
                // items are skipped) into a single list block.
                let mut entries = Vec::new();
                while i < tokens.len() {
                    let Token::ListItem {
                        content,
                        ordered,
                        number,
                        checked,
                        loose,
                        ..
                    } = &tokens[i]
                    else {
                        break;
                    };
                    entries.push(make_list_entry(
                        *ordered,
                        *number,
                        *checked,
                        *loose,
                        content,
                        &footnote_numbers,
                    ));
                    i += 1;
                    while i < tokens.len() && matches!(tokens[i], Token::Newline) {
                        i += 1;
                    }
                }
                out.push(Block::List { entries });
            }
            Token::Table {
                headers,
                aligns,
                rows,
            } => {
                flush_paragraph(&mut out, &mut buffered_inline);
                let head_runs: Vec<Vec<InlineRun>> = headers
                    .iter()
                    .map(|cell| flatten_inline(cell, RunFlags::default(), None, &footnote_numbers))
                    .collect();
                let row_runs: Vec<Vec<Vec<InlineRun>>> = rows
                    .iter()
                    .map(|row| {
                        row.iter()
                            .map(|cell| flatten_inline(cell, RunFlags::default(), None, &footnote_numbers))
                            .collect()
                    })
                    .collect();
                out.push(Block::Table {
                    headers: head_runs,
                    aligns: aligns.clone(),
                    rows: row_runs,
                });
                i += 1;
            }
            // An image with nothing buffered before it and nothing but a line
            // break (or the end of input) after it is a block-image candidate.
            Token::Image { alt, url, title }
                if buffered_inline.is_empty() && image_is_standalone(tokens, i) =>
            {
                let is_url = url.starts_with("http://") || url.starts_with("https://");
                let path = std::path::PathBuf::from(url);
                if is_url || path.exists() {
                    let alt_text = crate::markdown::Token::collect_all_text(alt);
                    out.push(Block::Image {
                        path,
                        alt: alt_text,
                        caption: title.clone(),
                    });
                    i += 1;
                    continue;
                }
                // Local file is missing: fall back to the inline handling of
                // the image token (its alt text).
                flatten_one(&tokens[i], RunFlags::default(), None, &mut buffered_inline, &footnote_numbers);
                i += 1;
            }
            _ => {
                // Recognised inline HTML tags only adjust the styling depth.
                if let Token::HtmlInline(tag) = &tokens[i] {
                    if let Some(parsed) = classify_inline_html_tag(tag) {
                        root_html_depth.handle(parsed);
                        i += 1;
                        continue;
                    }
                }
                let effective = root_html_depth.apply(RunFlags::default());
                flatten_one(&tokens[i], effective, None, &mut buffered_inline, &footnote_numbers);
                i += 1;
            }
        }
    }
    flush_paragraph(&mut out, &mut buffered_inline);

    if !footnote_definitions.is_empty() {
        let mut entries: Vec<FootnoteEntry> = Vec::with_capacity(footnote_definitions.len());

        // Referenced definitions, in reference-number order.
        let mut referenced: Vec<(String, usize)> = footnote_numbers
            .iter()
            .map(|(k, v)| (k.clone(), *v))
            .collect();
        referenced.sort_by_key(|(_, n)| *n);
        for (label, number) in referenced {
            if let Some(runs) = footnote_definitions.remove(&label) {
                entries.push(FootnoteEntry {
                    label,
                    number,
                    runs,
                });
            }
        }

        // Unreferenced definitions continue after the highest reference
        // number. Counting the emitted entries instead would reuse a number
        // whenever a referenced label has no definition.
        let mut next = footnote_numbers.values().copied().max().unwrap_or(0) + 1;
        let mut unused: Vec<(String, Vec<InlineRun>)> = footnote_definitions.into_iter().collect();
        unused.sort_by(|a, b| a.0.cmp(&b.0));
        for (label, runs) in unused {
            entries.push(FootnoteEntry {
                label,
                number: next,
                runs,
            });
            next += 1;
        }
        out.push(Block::FootnoteDefinitions { entries });
    }
    out
}
/// Returns `true` when `s` is, ignoring surrounding whitespace, a single HTML
/// comment whose trimmed body is the word `pagebreak` in any ASCII case.
fn is_pagebreak_marker(s: &str) -> bool {
    let Some(after_open) = s.trim().strip_prefix("<!--") else {
        return false;
    };
    match after_open.strip_suffix("-->") {
        Some(body) => body.trim().eq_ignore_ascii_case("pagebreak"),
        None => false,
    }
}
/// Attributes extracted from a raw HTML `<img>` tag by
/// `parse_html_img_block`.
struct HtmlImg {
    /// Value of the `src` attribute.
    src: String,
    /// Value of the `alt` attribute; empty when the attribute is absent.
    alt: String,
    /// Value of the `title` attribute, if present.
    title: Option<String>,
}
/// Parses an HTML block that consists of a single `<img ...>` tag.
///
/// HTML comments are removed first. Returns `None` when the remaining text
/// does not start with an `img` tag (ASCII case-insensitive), when anything
/// other than whitespace or `/` follows the first `>`, or when the tag has no
/// `src` attribute. `alt` defaults to the empty string and `title` is
/// optional.
fn parse_html_img_block(s: &str) -> Option<HtmlImg> {
    let stripped = strip_html_comments(s);
    let trimmed = stripped.trim();

    // `get` yields `None` for short input or a non-boundary index, so this
    // cannot panic on arbitrary text.
    let after_name = match trimmed.get(..4) {
        Some(prefix) if prefix.eq_ignore_ascii_case("<img") => &trimmed[4..],
        _ => return None,
    };
    // The tag name must end here; otherwise elements such as
    // `<img-comparison-slider>` would be mistaken for `<img>`.
    if !after_name.starts_with(|c: char| c.is_ascii_whitespace() || c == '/' || c == '>') {
        return None;
    }

    let end = trimmed.find('>')?;
    if !trimmed[end + 1..].trim_end_matches('/').trim().is_empty() {
        return None;
    }
    let inner = trimmed[4..end].trim_end_matches('/').trim();
    let attrs = parse_html_attrs(inner);

    // First attribute whose name matches, ASCII case-insensitively.
    let attr = |name: &str| {
        attrs
            .iter()
            .find(|(key, _)| key.eq_ignore_ascii_case(name))
            .map(|(_, value)| value.clone())
    };

    let src = attr("src")?;
    let alt = attr("alt").unwrap_or_default();
    let title = attr("title");
    Some(HtmlImg { src, alt, title })
}
/// Splits the attribute portion of an HTML tag into `(name, value)` pairs.
///
/// Values may be double-quoted, single-quoted or bare; a bare value ends at
/// ASCII whitespace or `>`. An attribute without `=` (or with nothing after
/// it) gets an empty value, and an unterminated quoted value runs to the end
/// of the input. Stray `=` and `/` characters are skipped.
fn parse_html_attrs(s: &str) -> Vec<(String, String)> {
    /// Advances `pos` past any ASCII whitespace.
    fn skip_whitespace(raw: &[u8], mut pos: usize) -> usize {
        while matches!(raw.get(pos), Some(b) if b.is_ascii_whitespace()) {
            pos += 1;
        }
        pos
    }

    let raw = s.as_bytes();
    let len = raw.len();
    let mut attrs = Vec::new();
    let mut pos = 0;

    loop {
        pos = skip_whitespace(raw, pos);
        if pos >= len {
            break;
        }

        // Attribute name: everything up to whitespace, '=' or '/'.
        let name_begin = pos;
        while pos < len && !matches!(raw[pos], b'=' | b'/') && !raw[pos].is_ascii_whitespace() {
            pos += 1;
        }
        if pos == name_begin {
            // Stray '=' or '/' with no name in front of it.
            pos += 1;
            continue;
        }
        let name = s[name_begin..pos].to_string();

        pos = skip_whitespace(raw, pos);
        if pos >= len || raw[pos] != b'=' {
            attrs.push((name, String::new()));
            continue;
        }

        pos = skip_whitespace(raw, pos + 1);
        if pos >= len {
            attrs.push((name, String::new()));
            break;
        }

        let value = match raw[pos] {
            quote @ (b'"' | b'\'') => {
                let begin = pos + 1;
                let close = raw[begin..]
                    .iter()
                    .position(|&b| b == quote)
                    .map_or(len, |offset| begin + offset);
                // Step over the closing quote when there is one.
                pos = if close < len { close + 1 } else { close };
                s[begin..close].to_string()
            }
            _ => {
                let begin = pos;
                while pos < len && !raw[pos].is_ascii_whitespace() && raw[pos] != b'>' {
                    pos += 1;
                }
                s[begin..pos].to_string()
            }
        };
        attrs.push((name, value));
    }
    attrs
}
/// Returns `s` with every `<!-- ... -->` comment removed.
///
/// An unterminated comment swallows everything from its `<!--` to the end of
/// the input.
fn strip_html_comments(s: &str) -> String {
    const OPEN: &str = "<!--";
    const CLOSE: &str = "-->";

    let mut kept = String::with_capacity(s.len());
    let mut remaining = s;
    loop {
        let Some(open_at) = remaining.find(OPEN) else {
            kept.push_str(remaining);
            return kept;
        };
        let (before, from_open) = remaining.split_at(open_at);
        kept.push_str(before);
        match from_open.find(CLOSE) {
            Some(close_at) => remaining = &from_open[close_at + CLOSE.len()..],
            None => return kept,
        }
    }
}
/// Reports whether an HTML block, once comments are stripped, looks like a
/// lone `p`, `div` or `center` tag (opening, closing or self-closing).
///
/// The text must start with `<` and end with `>`; the tag name is whatever
/// precedes the first ASCII whitespace between them, compared ASCII
/// case-insensitively.
// NOTE(review): a one-line block such as `<div class="x">text</div>` also
// satisfies this check, because only the outermost `<`/`>` are examined —
// confirm that treating such blocks as framing is intended.
fn is_framing_only_html(s: &str) -> bool {
    let without_comments = strip_html_comments(s);
    let candidate = without_comments.trim();
    let Some(body) = candidate
        .strip_prefix('<')
        .and_then(|rest| rest.strip_suffix('>'))
    else {
        return false;
    };
    let body = body.trim_start_matches('/').trim_end_matches('/').trim();
    let name = body
        .split(|c: char| c.is_ascii_whitespace())
        .next()
        .unwrap_or("");
    ["p", "div", "center"]
        .iter()
        .any(|framing| name.eq_ignore_ascii_case(framing))
}
/// Returns `true` when `s` is non-empty after trimming and consists solely of
/// HTML comments separated by optional whitespace.
///
/// Empty or whitespace-only input, text outside a comment, and an
/// unterminated comment all yield `false`.
fn is_only_html_comments(s: &str) -> bool {
    let mut remaining = s.trim();
    if remaining.is_empty() {
        return false;
    }
    loop {
        if !remaining.starts_with("<!--") {
            return false;
        }
        let Some(close_at) = remaining.find("-->") else {
            return false;
        };
        remaining = remaining[close_at + 3..].trim();
        if remaining.is_empty() {
            return true;
        }
    }
}
/// Assigns a 1-based number to every footnote label, in the order in which
/// its first reference is encountered during a depth-first walk of `tokens`.
///
/// The walk descends into headings, emphasis, strikethrough, block quotes,
/// list items, links, image alt text, footnote definitions, definition lists
/// and tables; every other token kind is ignored.
fn collect_footnote_numbering(tokens: &[Token]) -> HashMap<String, usize> {
    /// Visits each token of a slice in order.
    fn visit_all(tokens: &[Token], numbers: &mut HashMap<String, usize>) {
        for tok in tokens {
            visit(tok, numbers);
        }
    }

    /// Records a reference, or recurses into the token's children.
    fn visit(tok: &Token, numbers: &mut HashMap<String, usize>) {
        match tok {
            Token::FootnoteReference(label) => {
                // Only the first sighting of a label gets a number.
                if !numbers.contains_key(label) {
                    let assigned = numbers.len() + 1;
                    numbers.insert(label.clone(), assigned);
                }
            }
            Token::Heading(children, _)
            | Token::Emphasis { content: children, .. }
            | Token::StrongEmphasis(children)
            | Token::Strikethrough(children)
            | Token::BlockQuote(children)
            | Token::ListItem { content: children, .. }
            | Token::Link { content: children, .. }
            | Token::Image { alt: children, .. } => visit_all(children, numbers),
            Token::FootnoteDefinition { content, .. } => visit_all(content, numbers),
            Token::DefinitionList { entries } => {
                for entry in entries {
                    visit_all(&entry.term, numbers);
                    for definition in &entry.definitions {
                        visit_all(definition, numbers);
                    }
                }
            }
            Token::Table { headers, rows, .. } => {
                for header in headers {
                    visit_all(header, numbers);
                }
                for cell in rows.iter().flatten() {
                    visit_all(cell, numbers);
                }
            }
            _ => {}
        }
    }

    let mut numbers = HashMap::new();
    visit_all(tokens, &mut numbers);
    numbers
}
/// Returns `true` when the token at `idx` is the last one, or is directly
/// followed by a `Newline` or `HardBreak`.
///
/// Only the immediately following token is examined.
fn image_is_standalone(tokens: &[Token], idx: usize) -> bool {
    matches!(
        tokens.get(idx + 1),
        None | Some(Token::Newline | Token::HardBreak)
    )
}
/// Builds the IR entry for one list item.
///
/// The bullet is a task marker when `checked` is set; otherwise it is an
/// ordered bullet (defaulting to number 1 when `number` is absent) or an
/// unordered `-` bullet. `content` is split at the first nested list item,
/// heading, code block, horizontal rule, block quote or table: tokens before
/// it become the entry's inline runs, and that token together with everything
/// after it is lowered into child blocks.
fn make_list_entry(
    ordered: bool,
    number: Option<usize>,
    checked: Option<bool>,
    loose: bool,
    content: &[Token],
    footnotes: &HashMap<String, usize>,
) -> ListEntry {
    let bullet = match (checked, ordered) {
        (Some(true), _) => ListBullet::TaskChecked,
        (Some(false), _) => ListBullet::TaskUnchecked,
        (None, true) => ListBullet::Ordered(number.unwrap_or(1)),
        (None, false) => ListBullet::Unordered('-'),
    };

    // Index of the first block-level token, or the end when there is none.
    let split_at = content
        .iter()
        .position(|tok| {
            matches!(
                tok,
                Token::ListItem { .. }
                    | Token::Heading(..)
                    | Token::Code { block: true, .. }
                    | Token::HorizontalRule
                    | Token::BlockQuote(_)
                    | Token::Table { .. }
            )
        })
        .unwrap_or(content.len());
    let (inline_part, block_part) = content.split_at(split_at);

    let runs = flatten_inline(inline_part, RunFlags::default(), None, footnotes);
    let children = if block_part.is_empty() {
        Vec::new()
    } else {
        lower(block_part)
    };

    ListEntry {
        bullet,
        runs,
        children,
        loose,
    }
}
/// Flattens a sequence of inline tokens into styled runs.
///
/// Recognised inline HTML tags (see `classify_inline_html_tag`) emit nothing
/// themselves; they adjust a nesting depth, local to this call, that is
/// layered on top of `flags` for the tokens that follow.
fn flatten_inline(
    tokens: &[Token],
    flags: RunFlags,
    link: Option<&str>,
    footnotes: &HashMap<String, usize>,
) -> Vec<InlineRun> {
    let mut runs = Vec::new();
    let mut html_depth = InlineHtmlDepth::default();
    for tok in tokens {
        let recognised = match tok {
            Token::HtmlInline(tag) => classify_inline_html_tag(tag),
            _ => None,
        };
        match recognised {
            Some(tag) => html_depth.handle(tag),
            None => flatten_one(tok, html_depth.apply(flags), link, &mut runs, footnotes),
        }
    }
    runs
}
/// Inline HTML tags that are turned into run styling rather than emitted as
/// text. Each element has an opening and a closing variant.
enum InlineHtmlTag {
    /// `<sup>`
    SupOpen,
    /// `</sup>`
    SupClose,
    /// `<sub>`
    SubOpen,
    /// `</sub>`
    SubClose,
    /// `<u>`
    UnderlineOpen,
    /// `</u>`
    UnderlineClose,
    /// `<s>`, `<del>` or `<strike>`
    StrikeOpen,
    /// `</s>`, `</del>` or `</strike>`
    StrikeClose,
    /// `<small>`
    SmallOpen,
    /// `</small>`
    SmallClose,
    /// `<kbd>`
    KbdOpen,
    /// `</kbd>`
    KbdClose,
}
fn classify_inline_html_tag(raw: &str) -> Option<InlineHtmlTag> {
let s = raw.trim().to_ascii_lowercase();
match s.as_str() {
"<sup>" => Some(InlineHtmlTag::SupOpen),
"</sup>" => Some(InlineHtmlTag::SupClose),
"<sub>" => Some(InlineHtmlTag::SubOpen),
"</sub>" => Some(InlineHtmlTag::SubClose),
"<u>" => Some(InlineHtmlTag::UnderlineOpen),
"</u>" => Some(InlineHtmlTag::UnderlineClose),
"<s>" | "<del>" | "<strike>" => Some(InlineHtmlTag::StrikeOpen),
"</s>" | "</del>" | "</strike>" => Some(InlineHtmlTag::StrikeClose),
"<small>" => Some(InlineHtmlTag::SmallOpen),
"</small>" => Some(InlineHtmlTag::SmallClose),
"<kbd>" => Some(InlineHtmlTag::KbdOpen),
"</kbd>" => Some(InlineHtmlTag::KbdClose),
_ => None,
}
}
/// Open-tag nesting counters for the inline HTML elements that affect run
/// styling. A style is in effect while its counter is above zero.
#[derive(Default, Clone, Copy)]
struct InlineHtmlDepth {
    /// Open `<sup>` tags.
    sup: u32,
    /// Open `<sub>` tags.
    sub: u32,
    /// Open `<u>` tags.
    underline: u32,
    /// Open `<s>` / `<del>` / `<strike>` tags.
    strike: u32,
    /// Open `<small>` tags.
    small: u32,
    /// Open `<kbd>` tags.
    kbd: u32,
}
impl InlineHtmlDepth {
    /// Updates the counter belonging to `tag`: an opening tag increments it,
    /// a closing tag decrements it without going below zero.
    fn handle(&mut self, tag: InlineHtmlTag) {
        let (counter, is_open) = match tag {
            InlineHtmlTag::SupOpen => (&mut self.sup, true),
            InlineHtmlTag::SupClose => (&mut self.sup, false),
            InlineHtmlTag::SubOpen => (&mut self.sub, true),
            InlineHtmlTag::SubClose => (&mut self.sub, false),
            InlineHtmlTag::UnderlineOpen => (&mut self.underline, true),
            InlineHtmlTag::UnderlineClose => (&mut self.underline, false),
            InlineHtmlTag::StrikeOpen => (&mut self.strike, true),
            InlineHtmlTag::StrikeClose => (&mut self.strike, false),
            InlineHtmlTag::SmallOpen => (&mut self.small, true),
            InlineHtmlTag::SmallClose => (&mut self.small, false),
            InlineHtmlTag::KbdOpen => (&mut self.kbd, true),
            InlineHtmlTag::KbdClose => (&mut self.kbd, false),
        };
        *counter = if is_open {
            *counter + 1
        } else {
            counter.saturating_sub(1)
        };
    }

    /// Returns `flags` with the style of every currently open tag added, in
    /// the order superscript, subscript, underline, strikethrough, small,
    /// monospace.
    fn apply(&self, flags: RunFlags) -> RunFlags {
        let flags = if self.sup > 0 { flags.with_superscript() } else { flags };
        let flags = if self.sub > 0 { flags.with_subscript() } else { flags };
        let flags = if self.underline > 0 { flags.with_underline() } else { flags };
        let flags = if self.strike > 0 { flags.with_strikethrough() } else { flags };
        let flags = if self.small > 0 { flags.with_small() } else { flags };
        if self.kbd > 0 {
            flags.with_monospace()
        } else {
            flags
        }
    }
}
/// Appends the inline rendering of a single token to `out`.
///
/// Styling containers (emphasis, strong, strikethrough, links) recurse into
/// their children with extended flags; links also set the run's link target
/// and underline it. Block-like containers that reach this point are
/// flattened to their children's text, code becomes monospace text, and line
/// breaks (including `<br>`) become a single space. Footnote references are
/// pushed as their own superscript run. Footnote definitions, HTML comments,
/// recognised styling tags and any token kind not listed contribute nothing.
fn flatten_one(
    tok: &Token,
    flags: RunFlags,
    link: Option<&str>,
    out: &mut Vec<InlineRun>,
    footnotes: &HashMap<String, usize>,
) {
    match tok {
        Token::Text(text) => push_text(out, text, flags, link),
        Token::Newline | Token::HardBreak => push_text(out, " ", flags, link),
        // Inline and block code are rendered the same way here.
        Token::Code { content, .. } => push_text(out, content, flags.with_monospace(), link),
        Token::HtmlBlock(raw) => push_text(out, raw, flags, link),
        Token::FootnoteDefinition { .. } | Token::HtmlComment(_) => {}
        Token::Emphasis { level, content } => {
            let styled = match level {
                1 => flags.with_italic(),
                2 => flags.with_bold(),
                _ => flags.with_bold().with_italic(),
            };
            for child in content {
                flatten_one(child, styled, link, out, footnotes);
            }
        }
        Token::StrongEmphasis(content) => {
            let styled = flags.with_bold();
            for child in content {
                flatten_one(child, styled, link, out, footnotes);
            }
        }
        Token::Strikethrough(content) => {
            let styled = flags.with_strikethrough();
            for child in content {
                flatten_one(child, styled, link, out, footnotes);
            }
        }
        Token::Link { content, url, .. } => {
            let styled = flags.with_underline();
            let target = Some(url.as_str());
            for child in content {
                flatten_one(child, styled, target, out, footnotes);
            }
        }
        Token::FootnoteReference(label) => {
            // Known labels show their number and link to its anchor; unknown
            // labels fall back to the raw label with no link.
            let (text, anchor) = match footnotes.get(label) {
                Some(n) => (n.to_string(), Some(format!("#footnote-{}", n))),
                None => (label.clone(), None),
            };
            out.push(InlineRun {
                text,
                flags: flags.with_superscript(),
                link: anchor,
            });
        }
        // Recognised styling tags are consumed without output.
        Token::HtmlInline(tag) if classify_inline_html_tag(tag).is_some() => {}
        Token::HtmlInline(tag) => {
            let lowered = tag.to_ascii_lowercase();
            let is_break = lowered.starts_with("<br") || lowered.starts_with("</br");
            let is_comment = lowered.starts_with("<!--");
            if is_break {
                push_text(out, " ", flags, link);
            } else if !is_comment {
                push_text(out, tag, flags, link);
            }
        }
        Token::Heading(content, _)
        | Token::BlockQuote(content)
        | Token::ListItem { content, .. }
        | Token::Image { alt: content, .. } => {
            for child in content {
                flatten_one(child, flags, link, out, footnotes);
            }
        }
        _ => {}
    }
}
/// Appends `text` to `out`, merging it into the last run when that run has
/// the same flags and link target; otherwise a new run is started.
/// Empty text is ignored.
fn push_text(out: &mut Vec<InlineRun>, text: &str, flags: RunFlags, link: Option<&str>) {
    if text.is_empty() {
        return;
    }
    match out.last_mut() {
        Some(prev) if prev.flags == flags && prev.link.as_deref() == link => {
            prev.text.push_str(text);
        }
        _ => out.push(InlineRun {
            text: text.to_owned(),
            flags,
            link: link.map(str::to_owned),
        }),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::markdown::Token;

    /// A lone text token lowers to one paragraph holding one unstyled run.
    #[test]
    fn plain_text_to_paragraph() {
        let tokens = [Token::Text(String::from("hello world"))];
        let blocks = lower(&tokens);
        assert_eq!(blocks.len(), 1);
        let Block::Paragraph { runs } = &blocks[0] else {
            panic!("expected paragraph");
        };
        assert_eq!(runs.len(), 1);
        assert_eq!(runs[0].text, "hello world");
        assert!(!runs[0].flags.bold);
    }

    /// Heading tokens become heading blocks that keep their level and text.
    #[test]
    fn heading_lifts_to_block() {
        let tokens = [Token::Heading(vec![Token::Text("Hi".into())], 2)];
        let blocks = lower(&tokens);
        assert_eq!(blocks.len(), 1);
        let Block::Heading { level, runs } = &blocks[0] else {
            panic!("expected heading");
        };
        assert_eq!(*level, 2);
        assert_eq!(runs[0].text, "Hi");
    }

    /// Level-2 emphasis marks only its own run as bold.
    #[test]
    fn emphasis_propagates_flags() {
        let tokens = [
            Token::Text("a ".into()),
            Token::Emphasis {
                level: 2,
                content: vec![Token::Text("bold".into())],
            },
            Token::Text(" tail".into()),
        ];
        let blocks = lower(&tokens);
        let Block::Paragraph { runs } = &blocks[0] else {
            panic!("expected paragraph");
        };
        let bold: Vec<bool> = runs.iter().map(|run| run.flags.bold).collect();
        assert_eq!(bold, [false, true, false]);
    }

    /// Two consecutive newlines split the text into two paragraphs.
    #[test]
    fn double_newline_separates_paragraphs() {
        let tokens = [
            Token::Text("first".into()),
            Token::Newline,
            Token::Newline,
            Token::Text("second".into()),
        ];
        let blocks = lower(&tokens);
        assert_eq!(blocks.len(), 2);
        for block in &blocks {
            assert!(matches!(block, Block::Paragraph { .. }));
        }
    }

    /// Inline code is emitted as a monospace run inside the paragraph.
    #[test]
    fn inline_code_becomes_monospace_run() {
        let tokens = [
            Token::Text("see ".into()),
            Token::Code {
                language: String::new(),
                content: "foo".into(),
                block: false,
            },
        ];
        let blocks = lower(&tokens);
        let Block::Paragraph { runs } = &blocks[0] else {
            panic!();
        };
        let has_mono_foo = runs.iter().any(|run| run.flags.monospace && run.text == "foo");
        assert!(has_mono_foo);
    }

    /// Fenced code becomes a code block with one entry per source line.
    #[test]
    fn code_block_becomes_codeblock() {
        let tokens = [Token::Code {
            language: "rust".into(),
            content: "fn main()\n{}".into(),
            block: true,
        }];
        let blocks = lower(&tokens);
        assert_eq!(blocks.len(), 1);
        let Block::CodeBlock { lines } = &blocks[0] else {
            panic!();
        };
        assert_eq!(lines.as_slice(), ["fn main()", "{}"]);
    }
}