use scraper::{ElementRef, Selector};
/// Reports whether `element` is an `<img>` that carries formula markup.
///
/// An image qualifies when its `class` attribute contains the substring
/// `formula`, or when it has a `source` attribute at all (the attribute's
/// value is not inspected). Any element whose tag is not `img` yields `false`.
#[must_use]
pub fn is_formula_image(element: &ElementRef) -> bool {
    let node = element.value();
    node.name() == "img"
        && (node
            .attr("class")
            .map_or(false, |names| names.contains("formula"))
            || node.attr("source").is_some())
}
/// Reports whether `element` looks like rendered math markup.
///
/// Returns `true` when the tag is `mjx-container`, or when the `class`
/// attribute contains any of the substrings `katex`, `math` or `MathJax`.
/// The class check is a plain substring match on the whole attribute value,
/// not a match on individual class tokens.
#[must_use]
pub fn is_math_element(element: &ElementRef) -> bool {
    const CLASS_MARKERS: [&str; 3] = ["katex", "math", "MathJax"];

    let node = element.value();
    if node.name() == "mjx-container" {
        return true;
    }

    match node.attr("class") {
        Some(names) => CLASS_MARKERS.iter().any(|&marker| names.contains(marker)),
        None => false,
    }
}
/// Reads formula text from the attributes of `element`.
///
/// The `source` attribute is consulted first, then `alt`. Each value is
/// trimmed, and a value that is blank after trimming is skipped. Returns
/// `None` when neither attribute provides non-blank text.
#[must_use]
pub fn extract_habr_formula(element: &ElementRef) -> Option<String> {
    let node = element.value();
    // Array order is the lookup priority.
    ["source", "alt"]
        .iter()
        .filter_map(|&name| node.attr(name))
        .map(str::trim)
        .find(|candidate| !candidate.is_empty())
        .map(str::to_string)
}
/// Extracts TeX source from `element`, preferring an embedded annotation.
///
/// Candidates are tried in this order, each trimmed, and the first one that
/// is non-blank is returned:
///
/// 1. the text of the first descendant matching
///    `annotation[encoding="application/x-tex"]`,
/// 2. the element's own `data-tex` attribute,
/// 3. the element's own `data-latex` attribute.
///
/// A present-but-blank `data-tex` no longer hides a usable `data-latex`:
/// every attribute is checked independently for content.
///
/// Returns `None` when no candidate has content.
#[must_use]
pub fn extract_katex_formula(element: &ElementRef) -> Option<String> {
    // A selector that fails to parse is treated as "no annotation found"
    // rather than as an error, so the attribute fallback below still runs.
    if let Ok(selector) = Selector::parse(r#"annotation[encoding="application/x-tex"]"#) {
        if let Some(annotation) = element.select(&selector).next() {
            let text: String = annotation.text().collect();
            let tex = text.trim();
            if !tex.is_empty() {
                return Some(tex.to_string());
            }
        }
    }

    // Attribute fallback: array order is the lookup priority, and blank
    // values are skipped so the next attribute gets a chance.
    let node = element.value();
    let from_attributes = ["data-tex", "data-latex"].iter().find_map(|&name| {
        node.attr(name)
            .map(str::trim)
            .filter(|tex| !tex.is_empty())
    });
    from_attributes.map(str::to_string)
}
/// Extracts TeX source from `element`, preferring its own data attributes.
///
/// Candidates are tried in this order, each trimmed, and the first one that
/// is non-blank is returned:
///
/// 1. the element's own `data-tex` attribute,
/// 2. the element's own `data-latex` attribute,
/// 3. the text of the first descendant matching
///    `annotation[encoding="application/x-tex"]`.
///
/// A present-but-blank `data-tex` no longer hides a usable `data-latex`:
/// every attribute is checked independently for content.
///
/// Returns `None` when no candidate has content.
#[must_use]
pub fn extract_mathjax_formula(element: &ElementRef) -> Option<String> {
    // Array order is the lookup priority; blank values are skipped so the
    // next attribute gets a chance.
    let node = element.value();
    let from_attributes = ["data-tex", "data-latex"].iter().find_map(|&name| {
        node.attr(name)
            .map(str::trim)
            .filter(|tex| !tex.is_empty())
    });
    if let Some(tex) = from_attributes {
        return Some(tex.to_string());
    }

    // Annotation fallback. A selector that fails to parse is treated the
    // same as "no annotation found".
    let selector = Selector::parse(r#"annotation[encoding="application/x-tex"]"#).ok()?;
    let annotation = element.select(&selector).next()?;
    let text: String = annotation.text().collect();
    let tex = text.trim();
    if tex.is_empty() {
        None
    } else {
        Some(tex.to_string())
    }
}
/// Dispatches `element` to the matching formula extractor.
///
/// The checks run in a fixed order and the first match decides the result:
///
/// 1. [`is_formula_image`] → [`extract_habr_formula`]
/// 2. tag name `mjx-container` → [`extract_mathjax_formula`]
/// 3. [`is_math_element`] → [`extract_katex_formula`]
///
/// Returns `None` when no check matches, or when the chosen extractor finds
/// no formula text.
#[must_use]
pub fn extract_formula(element: &ElementRef) -> Option<String> {
    if is_formula_image(element) {
        extract_habr_formula(element)
    } else if element.value().name() == "mjx-container" {
        // Checked before `is_math_element`, which also accepts this tag, so
        // that `mjx-container` goes to the MathJax extractor.
        extract_mathjax_formula(element)
    } else if is_math_element(element) {
        extract_katex_formula(element)
    } else {
        None
    }
}