1mod blocks;
6mod code;
7mod inline;
8mod lists;
9mod spec;
10
11use anyhow::Result;
12use scraper::{ElementRef, Html, Node, Selector};
13use std::collections::BTreeMap;
14
15use typub_ir::{
16 AdmonitionKind, Asset, AssetId, AssetRef, AssetSource, Block, BlockAttrs, DocMeta, Document,
17 FootnoteDef, FootnoteId, ImageAttrs, Inline, InlineAttrs, ListKind, MathSource,
18 OrderedListMarker, RelativePath, TableHeaderScope, TextAlign, UnknownChild, Url,
19};
20
21#[derive(Default)]
22pub(crate) struct ParseCtx {
23 assets: BTreeMap<AssetId, Asset>,
24 footnotes: BTreeMap<FootnoteId, FootnoteDef>,
25 seen_assets: BTreeMap<String, AssetId>,
26 next_asset_num: u64,
27}
28
29impl ParseCtx {
30 pub(crate) fn register_image(
31 &mut self,
32 src: &str,
33 width: Option<u32>,
34 height: Option<u32>,
35 ) -> Option<AssetRef> {
36 let canonical_src = src.trim();
37 if canonical_src.starts_with("[[IMG:") && canonical_src.ends_with("]]") {
38 return None;
39 }
40
41 if let Some(id) = self.seen_assets.get(canonical_src) {
42 return Some(AssetRef(id.clone()));
43 }
44
45 let source = if canonical_src.starts_with("data:") {
46 AssetSource::DataUri {
47 uri: canonical_src.to_string(),
48 }
49 } else if canonical_src.contains("://") || canonical_src.starts_with("//") {
50 AssetSource::RemoteUrl {
51 url: Url(canonical_src.to_string()),
52 }
53 } else {
54 let path = RelativePath::new(canonical_src.to_string()).ok()?;
55 AssetSource::LocalPath { path }
56 };
57
58 self.next_asset_num += 1;
59 let id = AssetId(format!("asset-{:06}", self.next_asset_num));
60 let asset = Asset::Image(typub_ir::ImageAsset {
61 source,
62 meta: Some(typub_ir::ImageMeta {
63 width,
64 height,
65 format: None,
66 sha256: None,
67 }),
68 variants: Vec::new(),
69 });
70
71 self.assets.insert(id.clone(), asset);
72 self.seen_assets
73 .insert(canonical_src.to_string(), id.clone());
74 Some(AssetRef(id))
75 }
76}
77
78pub fn parse_html_document(html: &str) -> Result<Document> {
80 let doc = Html::parse_document(html);
81 let body_selector = Selector::parse("body").ok();
82 let root = body_selector
83 .as_ref()
84 .and_then(|s| doc.select(s).next())
85 .unwrap_or_else(|| doc.root_element());
86
87 let mut ctx = ParseCtx::default();
88 let mut blocks = Vec::new();
89 let mut root_text = String::new();
90 for child in root.children() {
91 match child.value() {
92 Node::Element(_) => {
93 if let Some(text) = normalize_text_content(&root_text)
94 && !text.trim().is_empty()
95 {
96 blocks.push(Block::Paragraph {
97 content: vec![Inline::Text(text)],
98 attrs: BlockAttrs::default(),
99 });
100 }
101 root_text.clear();
102
103 if let Some(el) = ElementRef::wrap(child) {
104 if parse_footnote_container(el, &mut ctx)? {
105 continue;
106 }
107 blocks::parse_element(el, &mut blocks, &mut ctx)?;
108 }
109 }
110 Node::Text(t) => root_text.push_str(t),
111 _ => {}
112 }
113 }
114 if let Some(text) = normalize_text_content(&root_text)
115 && !text.trim().is_empty()
116 {
117 blocks.push(Block::Paragraph {
118 content: vec![Inline::Text(text)],
119 attrs: BlockAttrs::default(),
120 });
121 }
122
123 Ok(Document {
124 blocks,
125 footnotes: ctx.footnotes,
126 assets: ctx.assets,
127 meta: DocMeta::default(),
128 })
129}
130
131pub(crate) fn parse_block_attrs(el: &ElementRef<'_>) -> BlockAttrs {
132 let mut passthrough = BTreeMap::new();
133 let mut classes = Vec::new();
134 let mut style = None;
135
136 for (k, v) in el.value().attrs() {
137 match k {
138 "class" => {
139 classes = v
140 .split_whitespace()
141 .filter(|s| !s.is_empty())
142 .map(str::to_string)
143 .collect();
144 }
145 "style" => style = Some(v.to_string()),
146 _ => {
147 passthrough.insert(k.to_string(), v.to_string());
148 }
149 }
150 }
151
152 BlockAttrs {
153 classes,
154 style,
155 passthrough,
156 }
157}
158
159pub(crate) fn parse_image_attrs(
160 el: &ElementRef<'_>,
161 width: Option<u32>,
162 height: Option<u32>,
163) -> ImageAttrs {
164 let mut passthrough = BTreeMap::new();
165 for (k, v) in el.value().attrs() {
166 match k {
167 "src" | "alt" | "title" | "align" => {}
168 _ => {
169 passthrough.insert(k.to_string(), v.to_string());
170 }
171 }
172 }
173
174 let align = match el.value().attr("align") {
175 Some("left") => Some(TextAlign::Left),
176 Some("center") => Some(TextAlign::Center),
177 Some("right") => Some(TextAlign::Right),
178 _ => el
179 .value()
180 .attr("style")
181 .and_then(parse_text_align_from_style),
182 };
183
184 ImageAttrs {
185 width,
186 height,
187 align,
188 passthrough,
189 }
190}
191
192pub(crate) fn parse_inline_attrs(el: &ElementRef<'_>) -> InlineAttrs {
193 let mut passthrough = BTreeMap::new();
194 let mut classes = Vec::new();
195 let mut style = None;
196
197 for (k, v) in el.value().attrs() {
198 match k {
199 "class" => {
200 classes = v
201 .split_whitespace()
202 .filter(|s| !s.is_empty())
203 .map(str::to_string)
204 .collect();
205 }
206 "style" => style = Some(v.to_string()),
207 _ => {
208 passthrough.insert(k.to_string(), v.to_string());
209 }
210 }
211 }
212
213 InlineAttrs {
214 classes,
215 style,
216 passthrough,
217 }
218}
219
220pub(crate) fn parse_math_source(el: ElementRef) -> Option<MathSource> {
221 if let Some(latex) = el.value().attr("data-latex-src") {
222 Some(MathSource::Latex(latex.to_string()))
223 } else {
224 el.value()
225 .attr("data-typst-src")
226 .map(|s| MathSource::Typst(s.to_string()))
227 }
228}
229
230pub(crate) fn detect_gfm_alert(text: &str) -> Option<(AdmonitionKind, &'static str)> {
231 let t = text.trim_start();
232 if t.starts_with("[!NOTE]") {
233 Some((AdmonitionKind::Note, "[!NOTE]"))
234 } else if t.starts_with("[!TIP]") {
235 Some((AdmonitionKind::Tip, "[!TIP]"))
236 } else if t.starts_with("[!WARNING]") {
237 Some((AdmonitionKind::Warning, "[!WARNING]"))
238 } else if t.starts_with("[!IMPORTANT]") {
239 Some((AdmonitionKind::Info, "[!IMPORTANT]"))
240 } else if t.starts_with("[!CAUTION]") {
241 Some((AdmonitionKind::Danger, "[!CAUTION]"))
242 } else {
243 None
244 }
245}
246
247pub(crate) fn parse_ordered_marker(raw: Option<&str>) -> Option<OrderedListMarker> {
248 match raw {
249 Some("a") => Some(OrderedListMarker::LowerAlpha),
250 Some("A") => Some(OrderedListMarker::UpperAlpha),
251 Some("i") => Some(OrderedListMarker::LowerRoman),
252 Some("I") => Some(OrderedListMarker::UpperRoman),
253 Some("1") => Some(OrderedListMarker::Decimal),
254 _ => None,
255 }
256}
257
258pub(crate) fn parse_header_scope(raw: &str) -> Option<TableHeaderScope> {
259 match raw {
260 "row" => Some(TableHeaderScope::Row),
261 "col" => Some(TableHeaderScope::Col),
262 "rowgroup" => Some(TableHeaderScope::RowGroup),
263 "colgroup" => Some(TableHeaderScope::ColGroup),
264 _ => None,
265 }
266}
267
268pub(crate) fn parse_text_align_from_style(style: &str) -> Option<TextAlign> {
269 let normalized = style.replace(' ', "").to_ascii_lowercase();
270 if normalized.contains("text-align:center") {
271 Some(TextAlign::Center)
272 } else if normalized.contains("text-align:left") {
273 Some(TextAlign::Left)
274 } else if normalized.contains("text-align:right") {
275 Some(TextAlign::Right)
276 } else {
277 None
278 }
279}
280
281pub(crate) fn is_admonition_wrapper(el: ElementRef) -> bool {
282 if let Some(class) = el.value().attr("class") {
283 class_has_keyword(class, "admonition")
284 || class_has_keyword(class, "callout")
285 || class_has_keyword(class, "notice")
286 || class_has_keyword(class, "warning")
287 || class_has_keyword(class, "tip")
288 || class_has_keyword(class, "note")
289 || class_has_keyword(class, "info")
290 || class_has_keyword(class, "danger")
291 } else {
292 false
293 }
294}
295
296pub(crate) fn class_has_keyword(class_attr: &str, keyword: &str) -> bool {
297 class_attr
298 .split_whitespace()
299 .any(|token| class_token_has_keyword(token, keyword))
300}
301
/// Reports whether a single class token matches `keyword`.
///
/// A token matches when it equals the keyword outright, or when one of its
/// non-empty segments (split on `-` and `_`) equals the keyword. Substring
/// matches do not count: `footnote` does not match `note`.
fn class_token_has_keyword(token: &str, keyword: &str) -> bool {
    if token == keyword {
        return true;
    }

    token
        .split(['-', '_'])
        .filter(|segment| !segment.is_empty())
        .any(|segment| segment == keyword)
}
308
/// Collapses runs of whitespace in `text` into single spaces.
///
/// * Empty input yields `None`.
/// * Whitespace-only input yields a single space.
/// * Otherwise words are joined by one space, and a single leading and/or
///   trailing space is kept when the input started and/or ended with
///   whitespace.
pub(crate) fn normalize_text_content(text: &str) -> Option<String> {
    if text.is_empty() {
        return None;
    }

    let mut words = text.split_whitespace();
    let Some(first_word) = words.next() else {
        // Nothing but whitespace.
        return Some(" ".to_string());
    };

    let mut collapsed = String::with_capacity(text.len());
    if text.starts_with(char::is_whitespace) {
        collapsed.push(' ');
    }
    collapsed.push_str(first_word);
    for word in words {
        collapsed.push(' ');
        collapsed.push_str(word);
    }
    if text.ends_with(char::is_whitespace) {
        collapsed.push(' ');
    }

    Some(collapsed)
}
332
/// Normalizes the visible text of a footnote label.
///
/// Surrounding whitespace is trimmed, one enclosing `[...]` pair is removed
/// (only when both brackets are present), and the remainder is trimmed again.
/// Returns `None` when nothing is left.
pub(crate) fn normalize_footnote_label(text: &str) -> Option<String> {
    let label = text.trim();

    let bracketed = label
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'));
    let inner = match bracketed {
        Some(inside) => inside.trim(),
        None => label,
    };

    (!inner.is_empty()).then(|| inner.to_string())
}
350
351pub(crate) fn parse_footnote_container(el: ElementRef<'_>, ctx: &mut ParseCtx) -> Result<bool> {
352 let tag = el.value().name();
353 let class = el.value().attr("class").unwrap_or_default();
354 let role = el.value().attr("role").unwrap_or_default();
355
356 let is_doc_endnotes = role == "doc-endnotes";
357 let is_footnote_section =
358 ((tag == "section" || tag == "div") && class.contains("footnotes")) || is_doc_endnotes;
359 if is_footnote_section {
360 let mut extracted_any = false;
361 if let Ok(li_sel) = Selector::parse("li[id]") {
362 for li in el.select(&li_sel) {
363 if let Some(id_attr) = li.value().attr("id") {
364 let fallback_id = id_attr.strip_prefix("fn-").unwrap_or(id_attr);
365 if fallback_id.is_empty() {
366 continue;
367 }
368
369 let mut blocks = blocks::parse_element_as_blocks(li, ctx)?;
370 let footnote_id = if is_doc_endnotes {
371 strip_doc_backlinks(&mut blocks);
372 strip_whitespace_only_paragraphs(&mut blocks);
373 find_doc_backlink_label(li).unwrap_or_else(|| fallback_id.to_string())
374 } else {
375 fallback_id.to_string()
376 };
377 if footnote_id.is_empty() {
378 continue;
379 }
380 let Some(id_num) = footnote_id.parse::<u64>().ok() else {
381 continue;
382 };
383 ctx.footnotes
384 .insert(FootnoteId(id_num), FootnoteDef { blocks });
385 extracted_any = true;
386 }
387 }
388 }
389 return Ok(extracted_any);
390 }
391
392 let is_single_footnote = tag == "div"
393 && class.contains("footnote")
394 && el
395 .value()
396 .attr("id")
397 .is_some_and(|id| id.starts_with("fn-"));
398 if is_single_footnote
399 && let Some(id_attr) = el.value().attr("id")
400 && let Some(id) = id_attr.strip_prefix("fn-")
401 {
402 let blocks = blocks::parse_child_blocks(el, ctx)?;
403 let Some(id_num) = id.parse::<u64>().ok() else {
404 return Ok(false);
405 };
406 ctx.footnotes
407 .insert(FootnoteId(id_num), FootnoteDef { blocks });
408 return Ok(true);
409 }
410
411 Ok(false)
412}
413
414fn find_doc_backlink_label(li: ElementRef<'_>) -> Option<String> {
415 let selector = Selector::parse(r#"a[role="doc-backlink"]"#).ok()?;
416 for link in li.select(&selector) {
417 let text = link.text().collect::<String>();
418 if let Some(label) = normalize_footnote_label(&text) {
419 return Some(label);
420 }
421 }
422 None
423}
424
425fn strip_doc_backlinks(blocks: &mut [Block]) {
426 for block in blocks {
427 strip_doc_backlinks_from_block(block);
428 }
429}
430
431fn strip_doc_backlinks_from_block(block: &mut Block) {
432 match block {
433 Block::Heading { content, .. } | Block::Paragraph { content, .. } => {
434 strip_doc_backlinks_from_inlines(content);
435 }
436 Block::Quote { blocks, .. }
437 | Block::Figure {
438 content: blocks, ..
439 }
440 | Block::Admonition { blocks, .. }
441 | Block::Details { blocks, .. } => strip_doc_backlinks(blocks),
442 Block::List { list, .. } => match &mut list.kind {
443 ListKind::Bullet { items } | ListKind::Numbered { items, .. } => {
444 for item in items {
445 strip_doc_backlinks(&mut item.blocks);
446 }
447 }
448 ListKind::Task { items } => {
449 for item in items {
450 strip_doc_backlinks(&mut item.blocks);
451 }
452 }
453 ListKind::Custom { items, .. } => {
454 for item in items {
455 strip_doc_backlinks(&mut item.blocks);
456 }
457 }
458 },
459 Block::DefinitionList { items, .. } => {
460 for item in items {
461 for group in item.terms.iter_mut().chain(item.definitions.iter_mut()) {
462 strip_doc_backlinks(group);
463 }
464 }
465 }
466 Block::Table { sections, .. } => {
467 for section in sections {
468 for row in &mut section.rows {
469 for cell in &mut row.cells {
470 strip_doc_backlinks(&mut cell.blocks);
471 }
472 }
473 }
474 }
475 Block::UnknownBlock { children, .. } => {
476 for child in children {
477 match child {
478 UnknownChild::Block(block) => strip_doc_backlinks_from_block(block),
479 UnknownChild::Inline(inline) => strip_doc_backlinks_from_inline(inline),
480 }
481 }
482 }
483 Block::CodeBlock { .. }
484 | Block::Divider { .. }
485 | Block::MathBlock { .. }
486 | Block::SvgBlock { .. }
487 | Block::RawBlock { .. } => {}
488 }
489}
490
491fn strip_doc_backlinks_from_inline(inline: &mut Inline) {
492 match inline {
493 Inline::Styled { content, .. } | Inline::UnknownInline { content, .. } => {
494 strip_doc_backlinks_from_inlines(content);
495 }
496 Inline::Text(_)
497 | Inline::Code(_)
498 | Inline::SoftBreak
499 | Inline::HardBreak
500 | Inline::Link { .. }
501 | Inline::Image { .. }
502 | Inline::FootnoteRef(_)
503 | Inline::MathInline { .. }
504 | Inline::SvgInline { .. }
505 | Inline::RawInline { .. } => {}
506 }
507}
508
509fn strip_doc_backlinks_from_inlines(inlines: &mut Vec<Inline>) {
510 let mut kept = Vec::with_capacity(inlines.len());
511 for mut inline in std::mem::take(inlines) {
512 if is_doc_backlink_link(&inline) {
513 continue;
514 }
515 strip_doc_backlinks_from_inline(&mut inline);
516 let keep = match &inline {
517 Inline::Styled { content, .. } | Inline::UnknownInline { content, .. } => {
518 !content.is_empty()
519 }
520 _ => true,
521 };
522 if keep {
523 kept.push(inline);
524 }
525 }
526 *inlines = kept;
527}
528
529fn is_doc_backlink_link(inline: &Inline) -> bool {
530 match inline {
531 Inline::Link { attrs, .. } => attrs
532 .passthrough
533 .get("role")
534 .is_some_and(|role| role == "doc-backlink"),
535 _ => false,
536 }
537}
538
539fn strip_whitespace_only_paragraphs(blocks: &mut Vec<Block>) {
540 blocks.retain(|block| !is_whitespace_only_paragraph(block));
541}
542
543fn is_whitespace_only_paragraph(block: &Block) -> bool {
544 match block {
545 Block::Paragraph { content, .. } => {
546 !content.is_empty() && content.iter().all(inline_is_whitespace_only)
547 }
548 _ => false,
549 }
550}
551
552fn inline_is_whitespace_only(inline: &Inline) -> bool {
553 match inline {
554 Inline::Text(text) => text.trim().is_empty(),
555 Inline::SoftBreak | Inline::HardBreak => true,
556 Inline::Styled { content, .. } | Inline::UnknownInline { content, .. } => {
557 content.iter().all(inline_is_whitespace_only)
558 }
559 Inline::Code(_)
560 | Inline::Link { .. }
561 | Inline::Image { .. }
562 | Inline::FootnoteRef(_)
563 | Inline::MathInline { .. }
564 | Inline::SvgInline { .. }
565 | Inline::RawInline { .. } => false,
566 }
567}
568
569#[cfg(test)]
570#[allow(clippy::expect_used)]
571mod tests;