Skip to main content

web_capture/
latex.rs

1//! LaTeX formula extraction module (R1).
2//!
3//! Extracts LaTeX formulas from HTML content, handling multiple sources:
4//! - Habr: `img.formula` elements with `source` attribute
5//! - `KaTeX`: `.katex` elements with `annotation[encoding="application/x-tex"]`
6//! - `MathJax`: `mjx-container` elements with `data-tex`/`data-latex` attributes
7//!
8//! Based on reference implementation from:
9//! <https://github.com/link-foundation/meta-theory/blob/main/scripts/download-article.mjs>
10
11use scraper::{ElementRef, Selector};
12
13/// Check if an element is a formula image (Habr-specific).
14///
15/// Habr renders formulas as SVG/PNG images with class `formula`.
16#[must_use]
17pub fn is_formula_image(element: &ElementRef) -> bool {
18    let value = element.value();
19    if value.name() != "img" {
20        return false;
21    }
22    let classes = value.attr("class").unwrap_or("");
23    classes.contains("formula") || value.attr("source").is_some()
24}
25
26/// Check if an element is a math element (`KaTeX`, `MathJax`, or generic math class).
27#[must_use]
28pub fn is_math_element(element: &ElementRef) -> bool {
29    let value = element.value();
30    let tag = value.name();
31    let classes = value.attr("class").unwrap_or("");
32    classes.contains("katex")
33        || classes.contains("math")
34        || classes.contains("MathJax")
35        || tag == "mjx-container"
36}
37
38/// Extract LaTeX source from a formula image element (Habr-specific).
39///
40/// Habr renders formulas as SVG/PNG images with class `formula`.
41/// The original LaTeX source is in the `source` attribute.
42#[must_use]
43pub fn extract_habr_formula(element: &ElementRef) -> Option<String> {
44    let value = element.value();
45    if let Some(source) = value.attr("source") {
46        let trimmed = source.trim();
47        if !trimmed.is_empty() {
48            return Some(trimmed.to_string());
49        }
50    }
51    if let Some(alt) = value.attr("alt") {
52        let trimmed = alt.trim();
53        if !trimmed.is_empty() {
54            return Some(trimmed.to_string());
55        }
56    }
57    None
58}
59
60/// Extract LaTeX from `KaTeX` elements.
61///
62/// `KaTeX` stores the TeX source in `annotation[encoding="application/x-tex"]`.
63#[must_use]
64pub fn extract_katex_formula(element: &ElementRef) -> Option<String> {
65    // Look for annotation element
66    if let Ok(sel) = Selector::parse(r#"annotation[encoding="application/x-tex"]"#) {
67        if let Some(annotation) = element.select(&sel).next() {
68            let text: String = annotation.text().collect();
69            let trimmed = text.trim();
70            if !trimmed.is_empty() {
71                return Some(trimmed.to_string());
72            }
73        }
74    }
75    // Fallback to data-tex or data-latex attributes
76    let value = element.value();
77    if let Some(tex) = value.attr("data-tex").or_else(|| value.attr("data-latex")) {
78        let trimmed = tex.trim();
79        if !trimmed.is_empty() {
80            return Some(trimmed.to_string());
81        }
82    }
83    None
84}
85
86/// Extract LaTeX from `MathJax` elements.
87///
88/// `MathJax` stores TeX in `data-tex` attribute or annotation elements.
89#[must_use]
90pub fn extract_mathjax_formula(element: &ElementRef) -> Option<String> {
91    let value = element.value();
92    // First try data-tex/data-latex attributes
93    if let Some(tex) = value.attr("data-tex").or_else(|| value.attr("data-latex")) {
94        let trimmed = tex.trim();
95        if !trimmed.is_empty() {
96            return Some(trimmed.to_string());
97        }
98    }
99    // Fallback to annotation element
100    if let Ok(sel) = Selector::parse(r#"annotation[encoding="application/x-tex"]"#) {
101        if let Some(annotation) = element.select(&sel).next() {
102            let text: String = annotation.text().collect();
103            let trimmed = text.trim();
104            if !trimmed.is_empty() {
105                return Some(trimmed.to_string());
106            }
107        }
108    }
109    None
110}
111
112/// Extract formula from any supported element type.
113#[must_use]
114pub fn extract_formula(element: &ElementRef) -> Option<String> {
115    if is_formula_image(element) {
116        return extract_habr_formula(element);
117    }
118    let tag = element.value().name();
119    if tag == "mjx-container" {
120        return extract_mathjax_formula(element);
121    }
122    if is_math_element(element) {
123        return extract_katex_formula(element);
124    }
125    None
126}