1mod blocks;
6mod code;
7mod inline;
8mod lists;
9mod spec;
10
11use anyhow::Result;
12use scraper::{ElementRef, Html, Node, Selector};
13use std::collections::BTreeMap;
14
15use typub_ir::{
16 AdmonitionKind, Asset, AssetId, AssetRef, AssetSource, Block, BlockAttrs, DocMeta, Document,
17 FootnoteDef, FootnoteId, ImageAttrs, Inline, InlineAttrs, ListKind, MathSource,
18 OrderedListMarker, RelativePath, TableHeaderScope, TextAlign, UnknownChild, Url,
19};
20
21#[derive(Default)]
22pub(crate) struct ParseCtx {
23 assets: BTreeMap<AssetId, Asset>,
24 footnotes: BTreeMap<FootnoteId, FootnoteDef>,
25 seen_assets: BTreeMap<String, AssetId>,
26 next_asset_num: u64,
27}
28
29impl ParseCtx {
30 pub(crate) fn register_image(
31 &mut self,
32 src: &str,
33 width: Option<u32>,
34 height: Option<u32>,
35 ) -> Option<AssetRef> {
36 let canonical_src = src.trim();
37 if canonical_src.starts_with("[[IMG:") && canonical_src.ends_with("]]") {
38 return None;
39 }
40
41 if let Some(id) = self.seen_assets.get(canonical_src) {
42 return Some(AssetRef(id.clone()));
43 }
44
45 let source = if canonical_src.starts_with("data:") {
46 AssetSource::DataUri {
47 uri: canonical_src.to_string(),
48 }
49 } else if canonical_src.contains("://") || canonical_src.starts_with("//") {
50 AssetSource::RemoteUrl {
51 url: Url(canonical_src.to_string()),
52 }
53 } else {
54 let path = RelativePath::new(canonical_src.to_string()).ok()?;
55 AssetSource::LocalPath { path }
56 };
57
58 self.next_asset_num += 1;
59 let id = AssetId(format!("asset-{:06}", self.next_asset_num));
60 let asset = Asset::Image(typub_ir::ImageAsset {
61 source,
62 meta: Some(typub_ir::ImageMeta {
63 width,
64 height,
65 format: None,
66 sha256: None,
67 }),
68 variants: Vec::new(),
69 });
70
71 self.assets.insert(id.clone(), asset);
72 self.seen_assets
73 .insert(canonical_src.to_string(), id.clone());
74 Some(AssetRef(id))
75 }
76}
77
78pub fn parse_html_document(html: &str) -> Result<Document> {
80 let doc = Html::parse_document(html);
81 let body_selector = Selector::parse("body").ok();
82 let root = body_selector
83 .as_ref()
84 .and_then(|s| doc.select(s).next())
85 .unwrap_or_else(|| doc.root_element());
86
87 let mut ctx = ParseCtx::default();
88 let mut blocks = Vec::new();
89 let mut root_text = String::new();
90 for child in root.children() {
91 match child.value() {
92 Node::Element(_) => {
93 if let Some(text) = normalize_text_content(&root_text)
94 && !text.trim().is_empty()
95 {
96 blocks.push(Block::Paragraph {
97 content: vec![Inline::Text(text)],
98 attrs: BlockAttrs::default(),
99 });
100 }
101 root_text.clear();
102
103 if let Some(el) = ElementRef::wrap(child) {
104 if parse_footnote_container(el, &mut ctx)? {
105 continue;
106 }
107 blocks::parse_element(el, &mut blocks, &mut ctx)?;
108 }
109 }
110 Node::Text(t) => root_text.push_str(t),
111 _ => {}
112 }
113 }
114 if let Some(text) = normalize_text_content(&root_text)
115 && !text.trim().is_empty()
116 {
117 blocks.push(Block::Paragraph {
118 content: vec![Inline::Text(text)],
119 attrs: BlockAttrs::default(),
120 });
121 }
122
123 Ok(Document {
124 blocks,
125 footnotes: ctx.footnotes,
126 assets: ctx.assets,
127 meta: DocMeta::default(),
128 })
129}
130
131pub(crate) fn parse_block_attrs(el: &ElementRef<'_>) -> BlockAttrs {
132 let mut passthrough = BTreeMap::new();
133 let mut classes = Vec::new();
134 let mut style = None;
135
136 for (k, v) in el.value().attrs() {
137 match k {
138 "class" => {
139 classes = v
140 .split_whitespace()
141 .filter(|s| !s.is_empty())
142 .map(str::to_string)
143 .collect();
144 }
145 "style" => style = Some(v.to_string()),
146 _ => {
147 passthrough.insert(k.to_string(), v.to_string());
148 }
149 }
150 }
151
152 BlockAttrs {
153 classes,
154 style,
155 passthrough,
156 }
157}
158
159pub(crate) fn parse_image_attrs(
160 el: &ElementRef<'_>,
161 width: Option<u32>,
162 height: Option<u32>,
163) -> ImageAttrs {
164 let mut passthrough = BTreeMap::new();
165 for (k, v) in el.value().attrs() {
166 match k {
167 "src" | "alt" | "title" | "align" => {}
168 _ => {
169 passthrough.insert(k.to_string(), v.to_string());
170 }
171 }
172 }
173
174 let align = match el.value().attr("align") {
175 Some("left") => Some(TextAlign::Left),
176 Some("center") => Some(TextAlign::Center),
177 Some("right") => Some(TextAlign::Right),
178 _ => el
179 .value()
180 .attr("style")
181 .and_then(parse_text_align_from_style),
182 };
183
184 ImageAttrs {
185 width,
186 height,
187 align,
188 passthrough,
189 }
190}
191
192pub(crate) fn parse_inline_attrs(el: &ElementRef<'_>) -> InlineAttrs {
193 let mut passthrough = BTreeMap::new();
194 let mut classes = Vec::new();
195 let mut style = None;
196
197 for (k, v) in el.value().attrs() {
198 match k {
199 "class" => {
200 classes = v
201 .split_whitespace()
202 .filter(|s| !s.is_empty())
203 .map(str::to_string)
204 .collect();
205 }
206 "style" => style = Some(v.to_string()),
207 _ => {
208 passthrough.insert(k.to_string(), v.to_string());
209 }
210 }
211 }
212
213 InlineAttrs {
214 classes,
215 style,
216 passthrough,
217 }
218}
219
220pub(crate) fn parse_math_source(el: ElementRef) -> Option<MathSource> {
221 if let Some(latex) = el.value().attr("data-latex-src") {
222 Some(MathSource::Latex(latex.to_string()))
223 } else {
224 el.value()
225 .attr("data-typst-src")
226 .map(|s| MathSource::Typst(s.to_string()))
227 }
228}
229
230pub(crate) fn detect_gfm_alert(text: &str) -> Option<(AdmonitionKind, &'static str)> {
231 let t = text.trim_start();
232 if t.starts_with("[!NOTE]") {
233 Some((AdmonitionKind::Note, "[!NOTE]"))
234 } else if t.starts_with("[!TIP]") {
235 Some((AdmonitionKind::Tip, "[!TIP]"))
236 } else if t.starts_with("[!WARNING]") {
237 Some((AdmonitionKind::Warning, "[!WARNING]"))
238 } else if t.starts_with("[!IMPORTANT]") {
239 Some((AdmonitionKind::Info, "[!IMPORTANT]"))
240 } else if t.starts_with("[!CAUTION]") {
241 Some((AdmonitionKind::Danger, "[!CAUTION]"))
242 } else {
243 None
244 }
245}
246
247pub(crate) fn parse_ordered_marker(raw: Option<&str>) -> Option<OrderedListMarker> {
248 match raw {
249 Some("a") => Some(OrderedListMarker::LowerAlpha),
250 Some("A") => Some(OrderedListMarker::UpperAlpha),
251 Some("i") => Some(OrderedListMarker::LowerRoman),
252 Some("I") => Some(OrderedListMarker::UpperRoman),
253 Some("1") => Some(OrderedListMarker::Decimal),
254 _ => None,
255 }
256}
257
258pub(crate) fn parse_header_scope(raw: &str) -> Option<TableHeaderScope> {
259 match raw {
260 "row" => Some(TableHeaderScope::Row),
261 "col" => Some(TableHeaderScope::Col),
262 "rowgroup" => Some(TableHeaderScope::RowGroup),
263 "colgroup" => Some(TableHeaderScope::ColGroup),
264 _ => None,
265 }
266}
267
268pub(crate) fn parse_text_align_from_style(style: &str) -> Option<TextAlign> {
269 let normalized = style.replace(' ', "").to_ascii_lowercase();
270 if normalized.contains("text-align:center") {
271 Some(TextAlign::Center)
272 } else if normalized.contains("text-align:left") {
273 Some(TextAlign::Left)
274 } else if normalized.contains("text-align:right") {
275 Some(TextAlign::Right)
276 } else {
277 None
278 }
279}
280
281pub(crate) fn is_admonition_wrapper(el: ElementRef) -> bool {
282 if let Some(class) = el.value().attr("class") {
283 class_has_keyword(class, "admonition")
284 || class_has_keyword(class, "callout")
285 || class_has_keyword(class, "notice")
286 || class_has_keyword(class, "warning")
287 || class_has_keyword(class, "tip")
288 || class_has_keyword(class, "note")
289 || class_has_keyword(class, "info")
290 || class_has_keyword(class, "danger")
291 } else {
292 false
293 }
294}
295
/// Reports whether any whitespace-separated class in `class_attr` carries
/// `keyword`, either as the whole class name or as one of its `-`/`_`
/// separated segments.
///
/// Matching is case-sensitive and never matches substrings inside a segment:
/// `tooltip` does not contain the keyword `tip`.
pub(crate) fn class_has_keyword(class_attr: &str, keyword: &str) -> bool {
    for token in class_attr.split_whitespace() {
        if class_token_has_keyword(token, keyword) {
            return true;
        }
    }
    false
}

/// Checks a single class token against `keyword`.
///
/// The token matches when it equals the keyword outright, or when one of its
/// non-empty segments (split on `-` and `_`) equals the keyword.
fn class_token_has_keyword(token: &str, keyword: &str) -> bool {
    if token == keyword {
        return true;
    }
    token
        .split(|c: char| c == '-' || c == '_')
        .any(|part| !part.is_empty() && part == keyword)
}
308
/// Collapses whitespace in a text node.
///
/// * An empty string yields `None`.
/// * A whitespace-only string yields a single space.
/// * Otherwise every run of whitespace between words becomes one space, and
///   a single leading and/or trailing space is kept when the input started
///   and/or ended with whitespace.
pub(crate) fn normalize_text_content(text: &str) -> Option<String> {
    if text.is_empty() {
        return None;
    }

    let mut words = text.split_whitespace();
    let first = match words.next() {
        Some(word) => word,
        // No words at all: the input was nothing but whitespace.
        None => return Some(String::from(" ")),
    };

    let mut out = String::with_capacity(text.len());
    if text.starts_with(char::is_whitespace) {
        out.push(' ');
    }
    out.push_str(first);
    for word in words {
        out.push(' ');
        out.push_str(word);
    }
    if text.ends_with(char::is_whitespace) {
        out.push(' ');
    }

    Some(out)
}
332
/// Normalises the visible label of a footnote.
///
/// Surrounding whitespace is trimmed. If the remainder is wrapped in one
/// pair of square brackets (`[` at the start and `]` at the end), that pair
/// is removed and the inside trimmed again. Returns `None` when nothing is
/// left.
pub(crate) fn normalize_footnote_label(text: &str) -> Option<String> {
    let outer = text.trim();

    // Brackets are removed only as a matching pair; a lone `[` or `]` stays.
    let label = match outer.strip_prefix('[') {
        Some(rest) => match rest.strip_suffix(']') {
            Some(inner) => inner.trim(),
            None => outer,
        },
        None => outer,
    };

    Some(label)
        .filter(|candidate| !candidate.is_empty())
        .map(str::to_owned)
}
350
351pub(crate) fn parse_footnote_container(el: ElementRef<'_>, ctx: &mut ParseCtx) -> Result<bool> {
352 let tag = el.value().name();
353 let class = el.value().attr("class").unwrap_or_default();
354 let role = el.value().attr("role").unwrap_or_default();
355
356 let is_doc_endnotes = role == "doc-endnotes";
357 let is_footnote_section =
358 ((tag == "section" || tag == "div") && class.contains("footnotes")) || is_doc_endnotes;
359 if is_footnote_section {
360 let mut extracted_any = false;
361 if let Ok(li_sel) = Selector::parse("li[id]") {
362 for li in el.select(&li_sel) {
363 if let Some(id_attr) = li.value().attr("id") {
364 let fallback_id = id_attr.strip_prefix("fn-").unwrap_or(id_attr);
365 if fallback_id.is_empty() {
366 continue;
367 }
368
369 let mut blocks = blocks::parse_element_as_blocks(li, ctx)?;
370 let footnote_id = if is_doc_endnotes {
371 strip_doc_backlinks(&mut blocks);
372 strip_whitespace_only_paragraphs(&mut blocks);
373 find_doc_backlink_label(li).unwrap_or_else(|| fallback_id.to_string())
374 } else {
375 fallback_id.to_string()
376 };
377 if footnote_id.is_empty() {
378 continue;
379 }
380 ctx.footnotes
381 .insert(FootnoteId(footnote_id), FootnoteDef { blocks });
382 extracted_any = true;
383 }
384 }
385 }
386 return Ok(extracted_any);
387 }
388
389 let is_single_footnote = tag == "div"
390 && class.contains("footnote")
391 && el
392 .value()
393 .attr("id")
394 .is_some_and(|id| id.starts_with("fn-"));
395 if is_single_footnote
396 && let Some(id_attr) = el.value().attr("id")
397 && let Some(id) = id_attr.strip_prefix("fn-")
398 {
399 let blocks = blocks::parse_child_blocks(el, ctx)?;
400 ctx.footnotes
401 .insert(FootnoteId(id.to_string()), FootnoteDef { blocks });
402 return Ok(true);
403 }
404
405 Ok(false)
406}
407
408fn find_doc_backlink_label(li: ElementRef<'_>) -> Option<String> {
409 let selector = Selector::parse(r#"a[role="doc-backlink"]"#).ok()?;
410 for link in li.select(&selector) {
411 let text = link.text().collect::<String>();
412 if let Some(label) = normalize_footnote_label(&text) {
413 return Some(label);
414 }
415 }
416 None
417}
418
419fn strip_doc_backlinks(blocks: &mut [Block]) {
420 for block in blocks {
421 strip_doc_backlinks_from_block(block);
422 }
423}
424
425fn strip_doc_backlinks_from_block(block: &mut Block) {
426 match block {
427 Block::Heading { content, .. } | Block::Paragraph { content, .. } => {
428 strip_doc_backlinks_from_inlines(content);
429 }
430 Block::Quote { blocks, .. }
431 | Block::Figure {
432 content: blocks, ..
433 }
434 | Block::Admonition { blocks, .. }
435 | Block::Details { blocks, .. } => strip_doc_backlinks(blocks),
436 Block::List { list, .. } => match &mut list.kind {
437 ListKind::Bullet { items } | ListKind::Numbered { items, .. } => {
438 for item in items {
439 strip_doc_backlinks(&mut item.blocks);
440 }
441 }
442 ListKind::Task { items } => {
443 for item in items {
444 strip_doc_backlinks(&mut item.blocks);
445 }
446 }
447 ListKind::Custom { items, .. } => {
448 for item in items {
449 strip_doc_backlinks(&mut item.blocks);
450 }
451 }
452 },
453 Block::DefinitionList { items, .. } => {
454 for item in items {
455 for group in item.terms.iter_mut().chain(item.definitions.iter_mut()) {
456 strip_doc_backlinks(group);
457 }
458 }
459 }
460 Block::Table { sections, .. } => {
461 for section in sections {
462 for row in &mut section.rows {
463 for cell in &mut row.cells {
464 strip_doc_backlinks(&mut cell.blocks);
465 }
466 }
467 }
468 }
469 Block::UnknownBlock { children, .. } => {
470 for child in children {
471 match child {
472 UnknownChild::Block(block) => strip_doc_backlinks_from_block(block),
473 UnknownChild::Inline(inline) => strip_doc_backlinks_from_inline(inline),
474 }
475 }
476 }
477 Block::CodeBlock { .. }
478 | Block::Divider { .. }
479 | Block::MathBlock { .. }
480 | Block::SvgBlock { .. }
481 | Block::RawBlock { .. } => {}
482 }
483}
484
485fn strip_doc_backlinks_from_inline(inline: &mut Inline) {
486 match inline {
487 Inline::Styled { content, .. } | Inline::UnknownInline { content, .. } => {
488 strip_doc_backlinks_from_inlines(content);
489 }
490 Inline::Text(_)
491 | Inline::Code(_)
492 | Inline::SoftBreak
493 | Inline::HardBreak
494 | Inline::Link { .. }
495 | Inline::Image { .. }
496 | Inline::FootnoteRef(_)
497 | Inline::MathInline { .. }
498 | Inline::SvgInline { .. }
499 | Inline::RawInline { .. } => {}
500 }
501}
502
503fn strip_doc_backlinks_from_inlines(inlines: &mut Vec<Inline>) {
504 let mut kept = Vec::with_capacity(inlines.len());
505 for mut inline in std::mem::take(inlines) {
506 if is_doc_backlink_link(&inline) {
507 continue;
508 }
509 strip_doc_backlinks_from_inline(&mut inline);
510 let keep = match &inline {
511 Inline::Styled { content, .. } | Inline::UnknownInline { content, .. } => {
512 !content.is_empty()
513 }
514 _ => true,
515 };
516 if keep {
517 kept.push(inline);
518 }
519 }
520 *inlines = kept;
521}
522
523fn is_doc_backlink_link(inline: &Inline) -> bool {
524 match inline {
525 Inline::Link { attrs, .. } => attrs
526 .passthrough
527 .get("role")
528 .is_some_and(|role| role == "doc-backlink"),
529 _ => false,
530 }
531}
532
533fn strip_whitespace_only_paragraphs(blocks: &mut Vec<Block>) {
534 blocks.retain(|block| !is_whitespace_only_paragraph(block));
535}
536
537fn is_whitespace_only_paragraph(block: &Block) -> bool {
538 match block {
539 Block::Paragraph { content, .. } => {
540 !content.is_empty() && content.iter().all(inline_is_whitespace_only)
541 }
542 _ => false,
543 }
544}
545
546fn inline_is_whitespace_only(inline: &Inline) -> bool {
547 match inline {
548 Inline::Text(text) => text.trim().is_empty(),
549 Inline::SoftBreak | Inline::HardBreak => true,
550 Inline::Styled { content, .. } | Inline::UnknownInline { content, .. } => {
551 content.iter().all(inline_is_whitespace_only)
552 }
553 Inline::Code(_)
554 | Inline::Link { .. }
555 | Inline::Image { .. }
556 | Inline::FootnoteRef(_)
557 | Inline::MathInline { .. }
558 | Inline::SvgInline { .. }
559 | Inline::RawInline { .. } => false,
560 }
561}
562
563#[cfg(test)]
564#[allow(clippy::expect_used)]
565mod tests;