Skip to main content

typub_html/parse/
mod.rs

1//! HTML parsing into v2 semantic IR `Document`.
2//!
3//! Parser emits v2 semantic IR directly.
4
5mod blocks;
6mod code;
7mod inline;
8mod lists;
9mod spec;
10
11use anyhow::Result;
12use scraper::{ElementRef, Html, Node, Selector};
13use std::collections::BTreeMap;
14
15use typub_ir::{
16    AdmonitionKind, Asset, AssetId, AssetRef, AssetSource, Block, BlockAttrs, DocMeta, Document,
17    FootnoteDef, FootnoteId, ImageAttrs, Inline, InlineAttrs, ListKind, MathSource,
18    OrderedListMarker, RelativePath, TableHeaderScope, TextAlign, UnknownChild, Url,
19};
20
21#[derive(Default)]
22pub(crate) struct ParseCtx {
23    assets: BTreeMap<AssetId, Asset>,
24    footnotes: BTreeMap<FootnoteId, FootnoteDef>,
25    seen_assets: BTreeMap<String, AssetId>,
26    next_asset_num: u64,
27}
28
29impl ParseCtx {
30    pub(crate) fn register_image(
31        &mut self,
32        src: &str,
33        width: Option<u32>,
34        height: Option<u32>,
35    ) -> Option<AssetRef> {
36        let canonical_src = src.trim();
37        if canonical_src.starts_with("[[IMG:") && canonical_src.ends_with("]]") {
38            return None;
39        }
40
41        if let Some(id) = self.seen_assets.get(canonical_src) {
42            return Some(AssetRef(id.clone()));
43        }
44
45        let source = if canonical_src.starts_with("data:") {
46            AssetSource::DataUri {
47                uri: canonical_src.to_string(),
48            }
49        } else if canonical_src.contains("://") || canonical_src.starts_with("//") {
50            AssetSource::RemoteUrl {
51                url: Url(canonical_src.to_string()),
52            }
53        } else {
54            let path = RelativePath::new(canonical_src.to_string()).ok()?;
55            AssetSource::LocalPath { path }
56        };
57
58        self.next_asset_num += 1;
59        let id = AssetId(format!("asset-{:06}", self.next_asset_num));
60        let asset = Asset::Image(typub_ir::ImageAsset {
61            source,
62            meta: Some(typub_ir::ImageMeta {
63                width,
64                height,
65                format: None,
66                sha256: None,
67            }),
68            variants: Vec::new(),
69        });
70
71        self.assets.insert(id.clone(), asset);
72        self.seen_assets
73            .insert(canonical_src.to_string(), id.clone());
74        Some(AssetRef(id))
75    }
76}
77
78/// Parse HTML into v2 `Document`.
79pub fn parse_html_document(html: &str) -> Result<Document> {
80    let doc = Html::parse_document(html);
81    let body_selector = Selector::parse("body").ok();
82    let root = body_selector
83        .as_ref()
84        .and_then(|s| doc.select(s).next())
85        .unwrap_or_else(|| doc.root_element());
86
87    let mut ctx = ParseCtx::default();
88    let mut blocks = Vec::new();
89    let mut root_text = String::new();
90    for child in root.children() {
91        match child.value() {
92            Node::Element(_) => {
93                if let Some(text) = normalize_text_content(&root_text)
94                    && !text.trim().is_empty()
95                {
96                    blocks.push(Block::Paragraph {
97                        content: vec![Inline::Text(text)],
98                        attrs: BlockAttrs::default(),
99                    });
100                }
101                root_text.clear();
102
103                if let Some(el) = ElementRef::wrap(child) {
104                    if parse_footnote_container(el, &mut ctx)? {
105                        continue;
106                    }
107                    blocks::parse_element(el, &mut blocks, &mut ctx)?;
108                }
109            }
110            Node::Text(t) => root_text.push_str(t),
111            _ => {}
112        }
113    }
114    if let Some(text) = normalize_text_content(&root_text)
115        && !text.trim().is_empty()
116    {
117        blocks.push(Block::Paragraph {
118            content: vec![Inline::Text(text)],
119            attrs: BlockAttrs::default(),
120        });
121    }
122
123    Ok(Document {
124        blocks,
125        footnotes: ctx.footnotes,
126        assets: ctx.assets,
127        meta: DocMeta::default(),
128    })
129}
130
131pub(crate) fn parse_block_attrs(el: &ElementRef<'_>) -> BlockAttrs {
132    let mut passthrough = BTreeMap::new();
133    let mut classes = Vec::new();
134    let mut style = None;
135
136    for (k, v) in el.value().attrs() {
137        match k {
138            "class" => {
139                classes = v
140                    .split_whitespace()
141                    .filter(|s| !s.is_empty())
142                    .map(str::to_string)
143                    .collect();
144            }
145            "style" => style = Some(v.to_string()),
146            _ => {
147                passthrough.insert(k.to_string(), v.to_string());
148            }
149        }
150    }
151
152    BlockAttrs {
153        classes,
154        style,
155        passthrough,
156    }
157}
158
159pub(crate) fn parse_image_attrs(
160    el: &ElementRef<'_>,
161    width: Option<u32>,
162    height: Option<u32>,
163) -> ImageAttrs {
164    let mut passthrough = BTreeMap::new();
165    for (k, v) in el.value().attrs() {
166        match k {
167            "src" | "alt" | "title" | "align" => {}
168            _ => {
169                passthrough.insert(k.to_string(), v.to_string());
170            }
171        }
172    }
173
174    let align = match el.value().attr("align") {
175        Some("left") => Some(TextAlign::Left),
176        Some("center") => Some(TextAlign::Center),
177        Some("right") => Some(TextAlign::Right),
178        _ => el
179            .value()
180            .attr("style")
181            .and_then(parse_text_align_from_style),
182    };
183
184    ImageAttrs {
185        width,
186        height,
187        align,
188        passthrough,
189    }
190}
191
192pub(crate) fn parse_inline_attrs(el: &ElementRef<'_>) -> InlineAttrs {
193    let mut passthrough = BTreeMap::new();
194    let mut classes = Vec::new();
195    let mut style = None;
196
197    for (k, v) in el.value().attrs() {
198        match k {
199            "class" => {
200                classes = v
201                    .split_whitespace()
202                    .filter(|s| !s.is_empty())
203                    .map(str::to_string)
204                    .collect();
205            }
206            "style" => style = Some(v.to_string()),
207            _ => {
208                passthrough.insert(k.to_string(), v.to_string());
209            }
210        }
211    }
212
213    InlineAttrs {
214        classes,
215        style,
216        passthrough,
217    }
218}
219
220pub(crate) fn parse_math_source(el: ElementRef) -> Option<MathSource> {
221    if let Some(latex) = el.value().attr("data-latex-src") {
222        Some(MathSource::Latex(latex.to_string()))
223    } else {
224        el.value()
225            .attr("data-typst-src")
226            .map(|s| MathSource::Typst(s.to_string()))
227    }
228}
229
230pub(crate) fn detect_gfm_alert(text: &str) -> Option<(AdmonitionKind, &'static str)> {
231    let t = text.trim_start();
232    if t.starts_with("[!NOTE]") {
233        Some((AdmonitionKind::Note, "[!NOTE]"))
234    } else if t.starts_with("[!TIP]") {
235        Some((AdmonitionKind::Tip, "[!TIP]"))
236    } else if t.starts_with("[!WARNING]") {
237        Some((AdmonitionKind::Warning, "[!WARNING]"))
238    } else if t.starts_with("[!IMPORTANT]") {
239        Some((AdmonitionKind::Info, "[!IMPORTANT]"))
240    } else if t.starts_with("[!CAUTION]") {
241        Some((AdmonitionKind::Danger, "[!CAUTION]"))
242    } else {
243        None
244    }
245}
246
247pub(crate) fn parse_ordered_marker(raw: Option<&str>) -> Option<OrderedListMarker> {
248    match raw {
249        Some("a") => Some(OrderedListMarker::LowerAlpha),
250        Some("A") => Some(OrderedListMarker::UpperAlpha),
251        Some("i") => Some(OrderedListMarker::LowerRoman),
252        Some("I") => Some(OrderedListMarker::UpperRoman),
253        Some("1") => Some(OrderedListMarker::Decimal),
254        _ => None,
255    }
256}
257
258pub(crate) fn parse_header_scope(raw: &str) -> Option<TableHeaderScope> {
259    match raw {
260        "row" => Some(TableHeaderScope::Row),
261        "col" => Some(TableHeaderScope::Col),
262        "rowgroup" => Some(TableHeaderScope::RowGroup),
263        "colgroup" => Some(TableHeaderScope::ColGroup),
264        _ => None,
265    }
266}
267
268pub(crate) fn parse_text_align_from_style(style: &str) -> Option<TextAlign> {
269    let normalized = style.replace(' ', "").to_ascii_lowercase();
270    if normalized.contains("text-align:center") {
271        Some(TextAlign::Center)
272    } else if normalized.contains("text-align:left") {
273        Some(TextAlign::Left)
274    } else if normalized.contains("text-align:right") {
275        Some(TextAlign::Right)
276    } else {
277        None
278    }
279}
280
281pub(crate) fn is_admonition_wrapper(el: ElementRef) -> bool {
282    if let Some(class) = el.value().attr("class") {
283        class_has_keyword(class, "admonition")
284            || class_has_keyword(class, "callout")
285            || class_has_keyword(class, "notice")
286            || class_has_keyword(class, "warning")
287            || class_has_keyword(class, "tip")
288            || class_has_keyword(class, "note")
289            || class_has_keyword(class, "info")
290            || class_has_keyword(class, "danger")
291    } else {
292        false
293    }
294}
295
296pub(crate) fn class_has_keyword(class_attr: &str, keyword: &str) -> bool {
297    class_attr
298        .split_whitespace()
299        .any(|token| class_token_has_keyword(token, keyword))
300}
301
/// Reports whether a single class token matches `keyword`.
///
/// A token matches when it equals the keyword outright, or when one of its
/// non-empty `-` / `_` separated segments equals the keyword.
fn class_token_has_keyword(token: &str, keyword: &str) -> bool {
    if token == keyword {
        return true;
    }
    token
        .split(|c: char| c == '-' || c == '_')
        .filter(|segment| !segment.is_empty())
        .any(|segment| segment == keyword)
}
308
/// Collapses every run of whitespace in `text` into a single space.
///
/// * Empty input yields `None`.
/// * Whitespace-only input yields a single space.
/// * Otherwise the words are joined by single spaces, with one leading and/or
///   trailing space kept when the input began and/or ended with whitespace.
pub(crate) fn normalize_text_content(text: &str) -> Option<String> {
    if text.is_empty() {
        return None;
    }

    let mut words = text.split_whitespace();
    let Some(first_word) = words.next() else {
        // Non-empty but no words: the input was whitespace only.
        return Some(String::from(" "));
    };

    let mut collapsed = String::with_capacity(text.len());
    if text.starts_with(char::is_whitespace) {
        collapsed.push(' ');
    }
    collapsed.push_str(first_word);
    for word in words {
        collapsed.push(' ');
        collapsed.push_str(word);
    }
    if text.ends_with(char::is_whitespace) {
        collapsed.push(' ');
    }

    Some(collapsed)
}
332
/// Normalizes the visible text of a footnote label.
///
/// The text is trimmed; if it is wrapped in a matching `[` ... `]` pair, the
/// brackets are removed and the inside is trimmed again. Returns `None` when
/// nothing remains.
pub(crate) fn normalize_footnote_label(text: &str) -> Option<String> {
    let outer = text.trim();
    let bracketed = outer
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'));
    let label = match bracketed {
        Some(inner) => inner.trim(),
        // Not a complete bracket pair: keep the trimmed text as it is.
        None => outer,
    };

    (!label.is_empty()).then(|| label.to_string())
}
350
351pub(crate) fn parse_footnote_container(el: ElementRef<'_>, ctx: &mut ParseCtx) -> Result<bool> {
352    let tag = el.value().name();
353    let class = el.value().attr("class").unwrap_or_default();
354    let role = el.value().attr("role").unwrap_or_default();
355
356    let is_doc_endnotes = role == "doc-endnotes";
357    let is_footnote_section =
358        ((tag == "section" || tag == "div") && class.contains("footnotes")) || is_doc_endnotes;
359    if is_footnote_section {
360        let mut extracted_any = false;
361        if let Ok(li_sel) = Selector::parse("li[id]") {
362            for li in el.select(&li_sel) {
363                if let Some(id_attr) = li.value().attr("id") {
364                    let fallback_id = id_attr.strip_prefix("fn-").unwrap_or(id_attr);
365                    if fallback_id.is_empty() {
366                        continue;
367                    }
368
369                    let mut blocks = blocks::parse_element_as_blocks(li, ctx)?;
370                    let footnote_id = if is_doc_endnotes {
371                        strip_doc_backlinks(&mut blocks);
372                        strip_whitespace_only_paragraphs(&mut blocks);
373                        find_doc_backlink_label(li).unwrap_or_else(|| fallback_id.to_string())
374                    } else {
375                        fallback_id.to_string()
376                    };
377                    if footnote_id.is_empty() {
378                        continue;
379                    }
380                    let Some(id_num) = footnote_id.parse::<u64>().ok() else {
381                        continue;
382                    };
383                    ctx.footnotes
384                        .insert(FootnoteId(id_num), FootnoteDef { blocks });
385                    extracted_any = true;
386                }
387            }
388        }
389        return Ok(extracted_any);
390    }
391
392    let is_single_footnote = tag == "div"
393        && class.contains("footnote")
394        && el
395            .value()
396            .attr("id")
397            .is_some_and(|id| id.starts_with("fn-"));
398    if is_single_footnote
399        && let Some(id_attr) = el.value().attr("id")
400        && let Some(id) = id_attr.strip_prefix("fn-")
401    {
402        let blocks = blocks::parse_child_blocks(el, ctx)?;
403        let Some(id_num) = id.parse::<u64>().ok() else {
404            return Ok(false);
405        };
406        ctx.footnotes
407            .insert(FootnoteId(id_num), FootnoteDef { blocks });
408        return Ok(true);
409    }
410
411    Ok(false)
412}
413
414fn find_doc_backlink_label(li: ElementRef<'_>) -> Option<String> {
415    let selector = Selector::parse(r#"a[role="doc-backlink"]"#).ok()?;
416    for link in li.select(&selector) {
417        let text = link.text().collect::<String>();
418        if let Some(label) = normalize_footnote_label(&text) {
419            return Some(label);
420        }
421    }
422    None
423}
424
425fn strip_doc_backlinks(blocks: &mut [Block]) {
426    for block in blocks {
427        strip_doc_backlinks_from_block(block);
428    }
429}
430
431fn strip_doc_backlinks_from_block(block: &mut Block) {
432    match block {
433        Block::Heading { content, .. } | Block::Paragraph { content, .. } => {
434            strip_doc_backlinks_from_inlines(content);
435        }
436        Block::Quote { blocks, .. }
437        | Block::Figure {
438            content: blocks, ..
439        }
440        | Block::Admonition { blocks, .. }
441        | Block::Details { blocks, .. } => strip_doc_backlinks(blocks),
442        Block::List { list, .. } => match &mut list.kind {
443            ListKind::Bullet { items } | ListKind::Numbered { items, .. } => {
444                for item in items {
445                    strip_doc_backlinks(&mut item.blocks);
446                }
447            }
448            ListKind::Task { items } => {
449                for item in items {
450                    strip_doc_backlinks(&mut item.blocks);
451                }
452            }
453            ListKind::Custom { items, .. } => {
454                for item in items {
455                    strip_doc_backlinks(&mut item.blocks);
456                }
457            }
458        },
459        Block::DefinitionList { items, .. } => {
460            for item in items {
461                for group in item.terms.iter_mut().chain(item.definitions.iter_mut()) {
462                    strip_doc_backlinks(group);
463                }
464            }
465        }
466        Block::Table { sections, .. } => {
467            for section in sections {
468                for row in &mut section.rows {
469                    for cell in &mut row.cells {
470                        strip_doc_backlinks(&mut cell.blocks);
471                    }
472                }
473            }
474        }
475        Block::UnknownBlock { children, .. } => {
476            for child in children {
477                match child {
478                    UnknownChild::Block(block) => strip_doc_backlinks_from_block(block),
479                    UnknownChild::Inline(inline) => strip_doc_backlinks_from_inline(inline),
480                }
481            }
482        }
483        Block::CodeBlock { .. }
484        | Block::Divider { .. }
485        | Block::MathBlock { .. }
486        | Block::SvgBlock { .. }
487        | Block::RawBlock { .. } => {}
488    }
489}
490
491fn strip_doc_backlinks_from_inline(inline: &mut Inline) {
492    match inline {
493        Inline::Styled { content, .. } | Inline::UnknownInline { content, .. } => {
494            strip_doc_backlinks_from_inlines(content);
495        }
496        Inline::Text(_)
497        | Inline::Code(_)
498        | Inline::SoftBreak
499        | Inline::HardBreak
500        | Inline::Link { .. }
501        | Inline::Image { .. }
502        | Inline::FootnoteRef(_)
503        | Inline::MathInline { .. }
504        | Inline::SvgInline { .. }
505        | Inline::RawInline { .. } => {}
506    }
507}
508
509fn strip_doc_backlinks_from_inlines(inlines: &mut Vec<Inline>) {
510    let mut kept = Vec::with_capacity(inlines.len());
511    for mut inline in std::mem::take(inlines) {
512        if is_doc_backlink_link(&inline) {
513            continue;
514        }
515        strip_doc_backlinks_from_inline(&mut inline);
516        let keep = match &inline {
517            Inline::Styled { content, .. } | Inline::UnknownInline { content, .. } => {
518                !content.is_empty()
519            }
520            _ => true,
521        };
522        if keep {
523            kept.push(inline);
524        }
525    }
526    *inlines = kept;
527}
528
529fn is_doc_backlink_link(inline: &Inline) -> bool {
530    match inline {
531        Inline::Link { attrs, .. } => attrs
532            .passthrough
533            .get("role")
534            .is_some_and(|role| role == "doc-backlink"),
535        _ => false,
536    }
537}
538
539fn strip_whitespace_only_paragraphs(blocks: &mut Vec<Block>) {
540    blocks.retain(|block| !is_whitespace_only_paragraph(block));
541}
542
543fn is_whitespace_only_paragraph(block: &Block) -> bool {
544    match block {
545        Block::Paragraph { content, .. } => {
546            !content.is_empty() && content.iter().all(inline_is_whitespace_only)
547        }
548        _ => false,
549    }
550}
551
552fn inline_is_whitespace_only(inline: &Inline) -> bool {
553    match inline {
554        Inline::Text(text) => text.trim().is_empty(),
555        Inline::SoftBreak | Inline::HardBreak => true,
556        Inline::Styled { content, .. } | Inline::UnknownInline { content, .. } => {
557            content.iter().all(inline_is_whitespace_only)
558        }
559        Inline::Code(_)
560        | Inline::Link { .. }
561        | Inline::Image { .. }
562        | Inline::FootnoteRef(_)
563        | Inline::MathInline { .. }
564        | Inline::SvgInline { .. }
565        | Inline::RawInline { .. } => false,
566    }
567}
568
569#[cfg(test)]
570#[allow(clippy::expect_used)]
571mod tests;