mod jsonld;
mod microdata;
mod rdfa;
pub use jsonld::JsonLdExtractor;
pub use microdata::MicrodataExtractor;
pub use rdfa::RdfaLiteExtractor;
use std::borrow::Cow;
use serde::{Deserialize, Serialize};
use crate::error::{ExtractionError, ExtractionWarning};
use crate::types::{SchemaNode, SchemaValue};
/// Nodes and warnings produced by a successful [`Extractor::extract`] call.
///
/// The type is `#[must_use]`, so the compiler warns when a function returns
/// it and the caller discards the value. `Default` yields two empty vectors.
#[must_use]
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
pub struct ExtractionOutput {
/// Schema nodes found by the extractor.
pub nodes: Vec<SchemaNode>,
/// Warnings reported alongside the nodes.
// NOTE(review): presumably non-fatal issues, since fatal ones would surface
// as `ExtractionError` — confirm against the `ExtractionWarning` definition.
pub warnings: Vec<ExtractionWarning>,
}
/// Common interface for types that extract [`ExtractionOutput`] from a
/// string of markup.
///
/// Implementors must be `Send + Sync`.
pub trait Extractor: Send + Sync {
/// Runs the extraction over `html` and returns the collected nodes and
/// warnings.
///
/// # Errors
///
/// Returns an [`ExtractionError`] when the extraction fails; the exact
/// conditions are defined by each implementor.
fn extract(&self, html: &str) -> Result<ExtractionOutput, ExtractionError>;
}
/// Prefixes that [`strip_schema_prefix`] removes, listed in the order they
/// are tried.
const SCHEMA_PREFIXES: &[&str] = &["https://schema.org/", "http://schema.org/", "schema:"];

/// Removes one leading schema.org prefix from `name`, if present.
///
/// The entries of [`SCHEMA_PREFIXES`] are tried in order and the first one
/// that matches is stripped; matching is case-sensitive and at most one
/// prefix is removed. A name without any of the prefixes is returned
/// unchanged. The result always borrows from `name`, so no allocation occurs.
pub(crate) fn strip_schema_prefix(name: &str) -> Cow<'_, str> {
    let local_name = SCHEMA_PREFIXES
        .iter()
        .find_map(|prefix| name.strip_prefix(*prefix))
        .unwrap_or(name);
    Cow::Borrowed(local_name)
}
/// Wraps a plain string in the [`SchemaValue`] variant that best fits it.
///
/// Classification, in priority order:
/// 1. `Url` when `s` starts with `http://`, `https://` or `mailto:`
///    (case-sensitive).
/// 2. `DateTime` when `s` starts with an ASCII digit and
///    [`is_iso_datetime`] accepts it.
/// 3. `Text` otherwise.
///
/// The input is copied into the returned value in every case.
pub(crate) fn classify_text_value(s: &str) -> SchemaValue {
    const URL_SCHEMES: [&str; 3] = ["http://", "https://", "mailto:"];

    let looks_like_url = URL_SCHEMES.iter().any(|scheme| s.starts_with(*scheme));
    if looks_like_url {
        SchemaValue::Url(s.to_string())
    } else if s.as_bytes().first().is_some_and(u8::is_ascii_digit) && is_iso_datetime(s) {
        // The digit test is a cheap filter that spares most non-date text
        // the full date check.
        SchemaValue::DateTime(s.to_string())
    } else {
        SchemaValue::Text(s.to_string())
    }
}
/// Cheap structural check for strings that begin with an ISO-8601 calendar
/// date (`YYYY-MM-DD`).
///
/// Returns `true` when all of the following hold:
/// - the first ten bytes are four ASCII digits, `-`, two digits, `-`, two
///   digits;
/// - the month is in `1..=12` and the day is in `1..=31` (the day is not
///   checked against the month's actual length);
/// - the string either ends after the date or the next byte is one of `T`,
///   `t`, `Z`, `+` or `-`.
///
/// Nothing after that eleventh byte is inspected, so this is a heuristic
/// rather than a full parser.
pub(crate) fn is_iso_datetime(s: &str) -> bool {
    // Destructure the fixed-width date part; inputs shorter than ten bytes
    // or with a hyphen out of place fail the pattern.
    let [y0, y1, y2, y3, b'-', m0, m1, b'-', d0, d1, tail @ ..] = s.as_bytes() else {
        return false;
    };

    let digit_positions = [y0, y1, y2, y3, m0, m1, d0, d1];
    if !digit_positions.iter().all(|b| b.is_ascii_digit()) {
        return false;
    }

    // Both bytes are known ASCII digits, so each value is at most 99 and
    // the arithmetic cannot overflow a `u8`.
    let month = (*m0 - b'0') * 10 + (*m1 - b'0');
    let day = (*d0 - b'0') * 10 + (*d1 - b'0');
    let in_calendar_range = (1..=12).contains(&month) && (1..=31).contains(&day);
    if !in_calendar_range {
        return false;
    }

    // A bare date is fine; otherwise only the byte right after it is checked.
    match tail.first().copied() {
        None => true,
        Some(next) => matches!(next, b'T' | b't' | b'Z' | b'+' | b'-'),
    }
}
#[cfg(test)]
mod common_tests {
    use super::*;

    /// Each known prefix is stripped, bare names pass through, and the
    /// result is always borrowed.
    #[test]
    fn strip_schema_prefixes() {
        let cases = [
            ("Product", "Product"),
            ("https://schema.org/Product", "Product"),
            ("http://schema.org/Product", "Product"),
            ("schema:Product", "Product"),
        ];
        for (input, expected) in cases {
            assert_eq!(strip_schema_prefix(input).as_ref(), expected);
        }

        for input in ["Product", "https://schema.org/Product"] {
            assert!(
                matches!(strip_schema_prefix(input), Cow::Borrowed(_)),
                "expected a borrowed result for {:?}",
                input
            );
        }
    }

    /// Plain text, URLs, bare dates and full timestamps each map to their
    /// own variant.
    #[test]
    fn classify_text_values() {
        let cases = [
            ("hello", SchemaValue::Text("hello".into())),
            (
                "https://example.com",
                SchemaValue::Url("https://example.com".into()),
            ),
            ("2024-01-15", SchemaValue::DateTime("2024-01-15".into())),
            (
                "2024-01-15T10:30:00Z",
                SchemaValue::DateTime("2024-01-15T10:30:00Z".into()),
            ),
        ];
        for (input, expected) in cases {
            assert_eq!(classify_text_value(input), expected);
        }
    }

    /// Covers accepted date/time shapes as well as malformed, out-of-range
    /// and trailing-garbage inputs.
    #[test]
    fn iso_datetime_detection() {
        let accepted = [
            "2024-01-15",
            "2024-01-15T10:30:00",
            "2024-01-15T10:30:00Z",
            "2024-01-01",
            "2024-12-31",
            "2024-01-15+02:00",
            "2024-01-15-05:00",
            "2024-01-15Z",
            "2024-01-15T",
        ];
        for input in accepted {
            assert!(is_iso_datetime(input), "expected {:?} to be accepted", input);
        }

        let rejected = [
            "hello",
            "2024",
            "not-a-date",
            "2024-13-15",
            "2024-00-15",
            "2024-01-00",
            "2024-01-32",
            "2024-01-15 10:30:00",
            "2024-01-15 is the deadline",
            "2024-01-15abc",
            "2024-01-15.",
        ];
        for input in rejected {
            assert!(!is_iso_datetime(input), "expected {:?} to be rejected", input);
        }
    }
}