Skip to main content

typub_html/parse/
mod.rs

1//! HTML parsing into v2 semantic IR `Document`.
2//!
3//! Parser emits v2 semantic IR directly.
4
5mod blocks;
6mod code;
7mod inline;
8mod lists;
9mod spec;
10
11use anyhow::Result;
12use scraper::{ElementRef, Html, Node, Selector};
13use std::collections::BTreeMap;
14
15use typub_ir::{
16    AdmonitionKind, Asset, AssetId, AssetRef, AssetSource, Block, BlockAttrs, DocMeta, Document,
17    FootnoteDef, FootnoteId, ImageAttrs, Inline, InlineAttrs, ListKind, MathSource,
18    OrderedListMarker, RelativePath, TableHeaderScope, TextAlign, UnknownChild, Url,
19};
20
/// Mutable state accumulated during parsing.
///
/// `parse_html_document` creates one instance per call and moves `assets`
/// and `footnotes` into the resulting `Document`.
#[derive(Default)]
pub(crate) struct ParseCtx {
    /// Assets registered so far, keyed by their generated id.
    assets: BTreeMap<AssetId, Asset>,
    /// Footnote definitions collected so far, keyed by footnote id.
    footnotes: BTreeMap<FootnoteId, FootnoteDef>,
    /// Maps a trimmed image source string to the id already assigned to it,
    /// so a repeated source reuses the same asset.
    seen_assets: BTreeMap<String, AssetId>,
    /// Counter behind generated asset ids; `register_image` increments it
    /// before formatting, so ids start at `asset-000001`.
    next_asset_num: u64,
}
28
29impl ParseCtx {
30    pub(crate) fn register_image(
31        &mut self,
32        src: &str,
33        width: Option<u32>,
34        height: Option<u32>,
35    ) -> Option<AssetRef> {
36        let canonical_src = src.trim();
37        if canonical_src.starts_with("[[IMG:") && canonical_src.ends_with("]]") {
38            return None;
39        }
40
41        if let Some(id) = self.seen_assets.get(canonical_src) {
42            return Some(AssetRef(id.clone()));
43        }
44
45        let source = if canonical_src.starts_with("data:") {
46            AssetSource::DataUri {
47                uri: canonical_src.to_string(),
48            }
49        } else if canonical_src.contains("://") || canonical_src.starts_with("//") {
50            AssetSource::RemoteUrl {
51                url: Url(canonical_src.to_string()),
52            }
53        } else {
54            let path = RelativePath::new(canonical_src.to_string()).ok()?;
55            AssetSource::LocalPath { path }
56        };
57
58        self.next_asset_num += 1;
59        let id = AssetId(format!("asset-{:06}", self.next_asset_num));
60        let asset = Asset::Image(typub_ir::ImageAsset {
61            source,
62            meta: Some(typub_ir::ImageMeta {
63                width,
64                height,
65                format: None,
66                sha256: None,
67            }),
68            variants: Vec::new(),
69        });
70
71        self.assets.insert(id.clone(), asset);
72        self.seen_assets
73            .insert(canonical_src.to_string(), id.clone());
74        Some(AssetRef(id))
75    }
76}
77
78/// Parse HTML into v2 `Document`.
79pub fn parse_html_document(html: &str) -> Result<Document> {
80    let doc = Html::parse_document(html);
81    let body_selector = Selector::parse("body").ok();
82    let root = body_selector
83        .as_ref()
84        .and_then(|s| doc.select(s).next())
85        .unwrap_or_else(|| doc.root_element());
86
87    let mut ctx = ParseCtx::default();
88    let mut blocks = Vec::new();
89    let mut root_text = String::new();
90    for child in root.children() {
91        match child.value() {
92            Node::Element(_) => {
93                if let Some(text) = normalize_text_content(&root_text)
94                    && !text.trim().is_empty()
95                {
96                    blocks.push(Block::Paragraph {
97                        content: vec![Inline::Text(text)],
98                        attrs: BlockAttrs::default(),
99                    });
100                }
101                root_text.clear();
102
103                if let Some(el) = ElementRef::wrap(child) {
104                    if parse_footnote_container(el, &mut ctx)? {
105                        continue;
106                    }
107                    blocks::parse_element(el, &mut blocks, &mut ctx)?;
108                }
109            }
110            Node::Text(t) => root_text.push_str(t),
111            _ => {}
112        }
113    }
114    if let Some(text) = normalize_text_content(&root_text)
115        && !text.trim().is_empty()
116    {
117        blocks.push(Block::Paragraph {
118            content: vec![Inline::Text(text)],
119            attrs: BlockAttrs::default(),
120        });
121    }
122
123    Ok(Document {
124        blocks,
125        footnotes: ctx.footnotes,
126        assets: ctx.assets,
127        meta: DocMeta::default(),
128    })
129}
130
131pub(crate) fn parse_block_attrs(el: &ElementRef<'_>) -> BlockAttrs {
132    let mut passthrough = BTreeMap::new();
133    let mut classes = Vec::new();
134    let mut style = None;
135
136    for (k, v) in el.value().attrs() {
137        match k {
138            "class" => {
139                classes = v
140                    .split_whitespace()
141                    .filter(|s| !s.is_empty())
142                    .map(str::to_string)
143                    .collect();
144            }
145            "style" => style = Some(v.to_string()),
146            _ => {
147                passthrough.insert(k.to_string(), v.to_string());
148            }
149        }
150    }
151
152    BlockAttrs {
153        classes,
154        style,
155        passthrough,
156    }
157}
158
159pub(crate) fn parse_image_attrs(
160    el: &ElementRef<'_>,
161    width: Option<u32>,
162    height: Option<u32>,
163) -> ImageAttrs {
164    let mut passthrough = BTreeMap::new();
165    for (k, v) in el.value().attrs() {
166        match k {
167            "src" | "alt" | "title" | "align" => {}
168            _ => {
169                passthrough.insert(k.to_string(), v.to_string());
170            }
171        }
172    }
173
174    let align = match el.value().attr("align") {
175        Some("left") => Some(TextAlign::Left),
176        Some("center") => Some(TextAlign::Center),
177        Some("right") => Some(TextAlign::Right),
178        _ => el
179            .value()
180            .attr("style")
181            .and_then(parse_text_align_from_style),
182    };
183
184    ImageAttrs {
185        width,
186        height,
187        align,
188        passthrough,
189    }
190}
191
192pub(crate) fn parse_inline_attrs(el: &ElementRef<'_>) -> InlineAttrs {
193    let mut passthrough = BTreeMap::new();
194    let mut classes = Vec::new();
195    let mut style = None;
196
197    for (k, v) in el.value().attrs() {
198        match k {
199            "class" => {
200                classes = v
201                    .split_whitespace()
202                    .filter(|s| !s.is_empty())
203                    .map(str::to_string)
204                    .collect();
205            }
206            "style" => style = Some(v.to_string()),
207            _ => {
208                passthrough.insert(k.to_string(), v.to_string());
209            }
210        }
211    }
212
213    InlineAttrs {
214        classes,
215        style,
216        passthrough,
217    }
218}
219
220pub(crate) fn parse_math_source(el: ElementRef) -> Option<MathSource> {
221    if let Some(latex) = el.value().attr("data-latex-src") {
222        Some(MathSource::Latex(latex.to_string()))
223    } else {
224        el.value()
225            .attr("data-typst-src")
226            .map(|s| MathSource::Typst(s.to_string()))
227    }
228}
229
230pub(crate) fn detect_gfm_alert(text: &str) -> Option<(AdmonitionKind, &'static str)> {
231    let t = text.trim_start();
232    if t.starts_with("[!NOTE]") {
233        Some((AdmonitionKind::Note, "[!NOTE]"))
234    } else if t.starts_with("[!TIP]") {
235        Some((AdmonitionKind::Tip, "[!TIP]"))
236    } else if t.starts_with("[!WARNING]") {
237        Some((AdmonitionKind::Warning, "[!WARNING]"))
238    } else if t.starts_with("[!IMPORTANT]") {
239        Some((AdmonitionKind::Info, "[!IMPORTANT]"))
240    } else if t.starts_with("[!CAUTION]") {
241        Some((AdmonitionKind::Danger, "[!CAUTION]"))
242    } else {
243        None
244    }
245}
246
247pub(crate) fn parse_ordered_marker(raw: Option<&str>) -> Option<OrderedListMarker> {
248    match raw {
249        Some("a") => Some(OrderedListMarker::LowerAlpha),
250        Some("A") => Some(OrderedListMarker::UpperAlpha),
251        Some("i") => Some(OrderedListMarker::LowerRoman),
252        Some("I") => Some(OrderedListMarker::UpperRoman),
253        Some("1") => Some(OrderedListMarker::Decimal),
254        _ => None,
255    }
256}
257
258pub(crate) fn parse_header_scope(raw: &str) -> Option<TableHeaderScope> {
259    match raw {
260        "row" => Some(TableHeaderScope::Row),
261        "col" => Some(TableHeaderScope::Col),
262        "rowgroup" => Some(TableHeaderScope::RowGroup),
263        "colgroup" => Some(TableHeaderScope::ColGroup),
264        _ => None,
265    }
266}
267
268pub(crate) fn parse_text_align_from_style(style: &str) -> Option<TextAlign> {
269    let normalized = style.replace(' ', "").to_ascii_lowercase();
270    if normalized.contains("text-align:center") {
271        Some(TextAlign::Center)
272    } else if normalized.contains("text-align:left") {
273        Some(TextAlign::Left)
274    } else if normalized.contains("text-align:right") {
275        Some(TextAlign::Right)
276    } else {
277        None
278    }
279}
280
281pub(crate) fn is_admonition_wrapper(el: ElementRef) -> bool {
282    if let Some(class) = el.value().attr("class") {
283        class_has_keyword(class, "admonition")
284            || class_has_keyword(class, "callout")
285            || class_has_keyword(class, "notice")
286            || class_has_keyword(class, "warning")
287            || class_has_keyword(class, "tip")
288            || class_has_keyword(class, "note")
289            || class_has_keyword(class, "info")
290            || class_has_keyword(class, "danger")
291    } else {
292        false
293    }
294}
295
296pub(crate) fn class_has_keyword(class_attr: &str, keyword: &str) -> bool {
297    class_attr
298        .split_whitespace()
299        .any(|token| class_token_has_keyword(token, keyword))
300}
301
/// Reports whether a single class token matches `keyword`.
///
/// A token matches when it equals the keyword outright, or when one of its
/// non-empty segments (split on `-` and `_`) equals the keyword. Substring
/// hits inside a segment do not count: `footnote` does not match `note`.
fn class_token_has_keyword(token: &str, keyword: &str) -> bool {
    let is_separator = |c: char| matches!(c, '-' | '_');

    token == keyword
        || token
            .split(is_separator)
            .filter(|segment| !segment.is_empty())
            .any(|segment| segment == keyword)
}
308
309pub(crate) fn normalize_text_content(text: &str) -> Option<String> {
310    if text.is_empty() {
311        return None;
312    }
313    if text.trim().is_empty() {
314        return Some(" ".to_string());
315    }
316
317    let has_leading_space = text.starts_with(char::is_whitespace);
318    let has_trailing_space = text.ends_with(char::is_whitespace);
319    let normalized: String = text.split_whitespace().collect::<Vec<_>>().join(" ");
320
321    let mut result = String::new();
322    if has_leading_space {
323        result.push(' ');
324    }
325    result.push_str(&normalized);
326    if has_trailing_space && !normalized.is_empty() {
327        result.push(' ');
328    }
329
330    Some(result)
331}
332
/// Normalises the visible label of a footnote.
///
/// The text is trimmed; if it is wrapped in one pair of square brackets
/// (`[` first and `]` last) that pair is removed and the inside is trimmed
/// again. Returns `None` when nothing is left.
pub(crate) fn normalize_footnote_label(text: &str) -> Option<String> {
    let outer = text.trim();

    // Brackets are only removed as a matching pair; a lone `[` or `]` stays.
    let bracketed = outer
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'));
    let label = match bracketed {
        Some(inner) => inner.trim(),
        None => outer,
    };

    (!label.is_empty()).then(|| label.to_string())
}
350
351pub(crate) fn parse_footnote_container(el: ElementRef<'_>, ctx: &mut ParseCtx) -> Result<bool> {
352    let tag = el.value().name();
353    let class = el.value().attr("class").unwrap_or_default();
354    let role = el.value().attr("role").unwrap_or_default();
355
356    let is_doc_endnotes = role == "doc-endnotes";
357    let is_footnote_section =
358        ((tag == "section" || tag == "div") && class.contains("footnotes")) || is_doc_endnotes;
359    if is_footnote_section {
360        let mut extracted_any = false;
361        if let Ok(li_sel) = Selector::parse("li[id]") {
362            for li in el.select(&li_sel) {
363                if let Some(id_attr) = li.value().attr("id") {
364                    let fallback_id = id_attr.strip_prefix("fn-").unwrap_or(id_attr);
365                    if fallback_id.is_empty() {
366                        continue;
367                    }
368
369                    let mut blocks = blocks::parse_element_as_blocks(li, ctx)?;
370                    let footnote_id = if is_doc_endnotes {
371                        strip_doc_backlinks(&mut blocks);
372                        strip_whitespace_only_paragraphs(&mut blocks);
373                        find_doc_backlink_label(li).unwrap_or_else(|| fallback_id.to_string())
374                    } else {
375                        fallback_id.to_string()
376                    };
377                    if footnote_id.is_empty() {
378                        continue;
379                    }
380                    ctx.footnotes
381                        .insert(FootnoteId(footnote_id), FootnoteDef { blocks });
382                    extracted_any = true;
383                }
384            }
385        }
386        return Ok(extracted_any);
387    }
388
389    let is_single_footnote = tag == "div"
390        && class.contains("footnote")
391        && el
392            .value()
393            .attr("id")
394            .is_some_and(|id| id.starts_with("fn-"));
395    if is_single_footnote
396        && let Some(id_attr) = el.value().attr("id")
397        && let Some(id) = id_attr.strip_prefix("fn-")
398    {
399        let blocks = blocks::parse_child_blocks(el, ctx)?;
400        ctx.footnotes
401            .insert(FootnoteId(id.to_string()), FootnoteDef { blocks });
402        return Ok(true);
403    }
404
405    Ok(false)
406}
407
408fn find_doc_backlink_label(li: ElementRef<'_>) -> Option<String> {
409    let selector = Selector::parse(r#"a[role="doc-backlink"]"#).ok()?;
410    for link in li.select(&selector) {
411        let text = link.text().collect::<String>();
412        if let Some(label) = normalize_footnote_label(&text) {
413            return Some(label);
414        }
415    }
416    None
417}
418
419fn strip_doc_backlinks(blocks: &mut [Block]) {
420    for block in blocks {
421        strip_doc_backlinks_from_block(block);
422    }
423}
424
425fn strip_doc_backlinks_from_block(block: &mut Block) {
426    match block {
427        Block::Heading { content, .. } | Block::Paragraph { content, .. } => {
428            strip_doc_backlinks_from_inlines(content);
429        }
430        Block::Quote { blocks, .. }
431        | Block::Figure {
432            content: blocks, ..
433        }
434        | Block::Admonition { blocks, .. }
435        | Block::Details { blocks, .. } => strip_doc_backlinks(blocks),
436        Block::List { list, .. } => match &mut list.kind {
437            ListKind::Bullet { items } | ListKind::Numbered { items, .. } => {
438                for item in items {
439                    strip_doc_backlinks(&mut item.blocks);
440                }
441            }
442            ListKind::Task { items } => {
443                for item in items {
444                    strip_doc_backlinks(&mut item.blocks);
445                }
446            }
447            ListKind::Custom { items, .. } => {
448                for item in items {
449                    strip_doc_backlinks(&mut item.blocks);
450                }
451            }
452        },
453        Block::DefinitionList { items, .. } => {
454            for item in items {
455                for group in item.terms.iter_mut().chain(item.definitions.iter_mut()) {
456                    strip_doc_backlinks(group);
457                }
458            }
459        }
460        Block::Table { sections, .. } => {
461            for section in sections {
462                for row in &mut section.rows {
463                    for cell in &mut row.cells {
464                        strip_doc_backlinks(&mut cell.blocks);
465                    }
466                }
467            }
468        }
469        Block::UnknownBlock { children, .. } => {
470            for child in children {
471                match child {
472                    UnknownChild::Block(block) => strip_doc_backlinks_from_block(block),
473                    UnknownChild::Inline(inline) => strip_doc_backlinks_from_inline(inline),
474                }
475            }
476        }
477        Block::CodeBlock { .. }
478        | Block::Divider { .. }
479        | Block::MathBlock { .. }
480        | Block::SvgBlock { .. }
481        | Block::RawBlock { .. } => {}
482    }
483}
484
485fn strip_doc_backlinks_from_inline(inline: &mut Inline) {
486    match inline {
487        Inline::Styled { content, .. } | Inline::UnknownInline { content, .. } => {
488            strip_doc_backlinks_from_inlines(content);
489        }
490        Inline::Text(_)
491        | Inline::Code(_)
492        | Inline::SoftBreak
493        | Inline::HardBreak
494        | Inline::Link { .. }
495        | Inline::Image { .. }
496        | Inline::FootnoteRef(_)
497        | Inline::MathInline { .. }
498        | Inline::SvgInline { .. }
499        | Inline::RawInline { .. } => {}
500    }
501}
502
503fn strip_doc_backlinks_from_inlines(inlines: &mut Vec<Inline>) {
504    let mut kept = Vec::with_capacity(inlines.len());
505    for mut inline in std::mem::take(inlines) {
506        if is_doc_backlink_link(&inline) {
507            continue;
508        }
509        strip_doc_backlinks_from_inline(&mut inline);
510        let keep = match &inline {
511            Inline::Styled { content, .. } | Inline::UnknownInline { content, .. } => {
512                !content.is_empty()
513            }
514            _ => true,
515        };
516        if keep {
517            kept.push(inline);
518        }
519    }
520    *inlines = kept;
521}
522
523fn is_doc_backlink_link(inline: &Inline) -> bool {
524    match inline {
525        Inline::Link { attrs, .. } => attrs
526            .passthrough
527            .get("role")
528            .is_some_and(|role| role == "doc-backlink"),
529        _ => false,
530    }
531}
532
533fn strip_whitespace_only_paragraphs(blocks: &mut Vec<Block>) {
534    blocks.retain(|block| !is_whitespace_only_paragraph(block));
535}
536
537fn is_whitespace_only_paragraph(block: &Block) -> bool {
538    match block {
539        Block::Paragraph { content, .. } => {
540            !content.is_empty() && content.iter().all(inline_is_whitespace_only)
541        }
542        _ => false,
543    }
544}
545
546fn inline_is_whitespace_only(inline: &Inline) -> bool {
547    match inline {
548        Inline::Text(text) => text.trim().is_empty(),
549        Inline::SoftBreak | Inline::HardBreak => true,
550        Inline::Styled { content, .. } | Inline::UnknownInline { content, .. } => {
551            content.iter().all(inline_is_whitespace_only)
552        }
553        Inline::Code(_)
554        | Inline::Link { .. }
555        | Inline::Image { .. }
556        | Inline::FootnoteRef(_)
557        | Inline::MathInline { .. }
558        | Inline::SvgInline { .. }
559        | Inline::RawInline { .. } => false,
560    }
561}
562
563#[cfg(test)]
564#[allow(clippy::expect_used)]
565mod tests;