use chrono::{Datelike, NaiveDate, Utc};
use once_cell::sync::Lazy;
use regex::Regex;
use serde::Serialize;
/// Matches a `"dateModified": "<value>"` string property (as written in JSON).
/// Capture group 1 is the raw, non-empty value between the quotes.
static DATE_MODIFIED_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#""dateModified"\s*:\s*"([^"]+)""#).unwrap());
/// Matches a `"datePublished": "<value>"` string property (as written in JSON).
/// Capture group 1 is the raw, non-empty value between the quotes.
static DATE_PUBLISHED_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#""datePublished"\s*:\s*"([^"]+)""#).unwrap());
/// Matches a standalone (word-bounded) four-digit year from 1900 through 2199.
static YEAR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\b(19\d{2}|20\d{2}|21\d{2})\b").unwrap());
/// Matches an opening `<time …>` tag that carries a quoted `datetime` attribute.
/// Capture group 1 is the attribute value. The match is case-sensitive.
static TIME_DATETIME_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"<time\b[^>]*\bdatetime\s*=\s*["']([^"']+)["']"#).unwrap()
});
/// Matches a visible freshness label followed by a date, case-insensitively.
///
/// The label is one of "last updated", "updated (on)", "last modified",
/// "modified (on)", "reviewed (on)", "fact-checked (on)", "posted" or
/// "published", followed by at least one of `:`, `.` or whitespace.
/// Capture group 1 is the date, in one of these shapes:
///
/// * month name with an optional day on either side and a four-digit year
///   ("March 5, 2024", "5 March 2024", "Mar. 2024"),
/// * `YYYY-MM-DD`,
/// * `D/M/YY` … `DD/MM/YYYY` style slashed dates,
/// * a bare four-digit year.
static VISIBLE_DATE_LABEL_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        // The day after the month is wrapped in an optional group. Writing it
        // as `\d{1,2}?` would only make the quantifier lazy (still requiring
        // one digit), which rejects "March 2024" and "5 March 2024".
        r"(?i)(?:last\s+updated|updated\s+on|updated|last\s+modified|modified\s+on|modified|reviewed\s+on|reviewed|fact[\s-]?checked\s+on|fact[\s-]?checked|posted|published)\s*[:.\s]+\s*((?:\d{1,2}\s+)?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+(?:\d{1,2},?\s*)?\d{4}|\d{4}-\d{2}-\d{2}|\d{1,2}/\d{1,2}/\d{2,4}|\d{4})",
    )
    .unwrap()
});
/// Freshness signals extracted from a page by [`analyze`].
#[derive(Debug, Clone, Serialize)]
pub struct Freshness {
    /// Raw value of the first `"dateModified"` property found in the HTML.
    pub date_modified: Option<String>,
    /// Raw value of the first `"datePublished"` property found in the HTML.
    pub date_published: Option<String>,
    /// Whole days from the schema date (modified, falling back to published)
    /// to the current UTC date; `None` when no schema date was found or it
    /// could not be parsed.
    pub days_since_modified: Option<i64>,
    /// Distinct four-digit years (1900–2199) mentioned in the body text,
    /// in ascending order.
    pub year_mentions: Vec<u16>,
    /// Year of the current UTC date at the time of analysis.
    pub current_year: i32,
    /// Value of the first `<time … datetime="…">` attribute in the HTML.
    pub time_datetime: Option<String>,
    /// First visible "updated / published …" label found in the body text,
    /// with runs of whitespace collapsed to single spaces.
    pub visible_updated_label: Option<String>,
    /// `true` when the schema `dateModified` disagrees with what the page
    /// visibly shows (see [`analyze`] for the exact rule).
    pub schema_vs_visible_mismatch: bool,
}
/// Collects freshness signals for a page and returns them as a [`Freshness`].
///
/// * `html` – raw markup; searched for `"dateModified"` / `"datePublished"`
///   string properties and for the first `<time … datetime="…">` attribute.
/// * `body_text` – visible text; searched for year mentions and for an
///   "updated / published …" date label, and used for the word count below.
/// * `_schema_types` – currently unused.
///
/// `days_since_modified` is computed from the first of `dateModified`,
/// `datePublished` that actually parses as a date, relative to today's UTC
/// date.
///
/// `schema_vs_visible_mismatch` is `true` when the parsed `dateModified` is
/// more than 180 days later than the visible date (the `<time>` value, or
/// failing that the year of the visible label), or when `dateModified`
/// parses, no visible date is available, and `body_text` has at least 400
/// whitespace-separated words.
pub fn analyze(html: &str, body_text: &str, _schema_types: &[String]) -> Freshness {
    /// Schema date may lead the visible date by this many days before it
    /// counts as a mismatch.
    const MAX_SCHEMA_LEAD_DAYS: i64 = 180;
    /// Minimum word count for a page with a schema date but no visible date
    /// to be reported as a mismatch.
    const LONG_PAGE_MIN_WORDS: usize = 400;

    let date_modified = DATE_MODIFIED_RE
        .captures(html)
        .and_then(|c| c.get(1))
        .map(|m| m.as_str().to_string());
    let date_published = DATE_PUBLISHED_RE
        .captures(html)
        .and_then(|c| c.get(1))
        .map(|m| m.as_str().to_string());

    let today = Utc::now().date_naive();

    // Parse `dateModified` once; it feeds both the age and the mismatch check.
    let schema_modified = date_modified.as_deref().and_then(parse_date);

    // Fall back to `datePublished` whenever `dateModified` is missing *or*
    // present but unparseable, so a malformed modified date does not hide a
    // usable published date.
    let days_since_modified = schema_modified
        .or_else(|| date_published.as_deref().and_then(parse_date))
        .map(|d| (today - d).num_days());

    let mut year_mentions: Vec<u16> = YEAR_RE
        .find_iter(body_text)
        .filter_map(|m| m.as_str().parse::<u16>().ok())
        .collect();
    year_mentions.sort_unstable();
    year_mentions.dedup();

    let time_datetime = TIME_DATETIME_RE
        .captures(html)
        .and_then(|c| c.get(1))
        .map(|m| m.as_str().to_string());

    // Whole match (label + date), with internal whitespace normalised.
    let visible_updated_label = VISIBLE_DATE_LABEL_RE
        .find(body_text)
        .map(|m| m.as_str().split_whitespace().collect::<Vec<_>>().join(" "));

    // Prefer the machine-readable `<time>` value; otherwise approximate the
    // visible label by 1 January of the latest year it mentions.
    // NOTE(review): pinning a year-only label to 1 January means a schema date
    // later than roughly June of that same year is reported as a mismatch —
    // confirm this is intended.
    let visible_date = time_datetime
        .as_deref()
        .and_then(parse_date)
        .or_else(|| {
            visible_updated_label
                .as_deref()
                .and_then(extract_year_from_label)
                .and_then(|y| NaiveDate::from_ymd_opt(i32::from(y), 1, 1))
        });

    let schema_vs_visible_mismatch = match (schema_modified, visible_date) {
        (Some(schema), Some(visible)) => (schema - visible).num_days() > MAX_SCHEMA_LEAD_DAYS,
        // Schema claims a modification date but a long page shows none.
        (Some(_), None) => body_text.split_whitespace().count() >= LONG_PAGE_MIN_WORDS,
        (None, _) => false,
    };

    Freshness {
        date_modified,
        date_published,
        days_since_modified,
        year_mentions,
        current_year: today.year(),
        time_datetime,
        visible_updated_label,
        schema_vs_visible_mismatch,
    }
}
/// Returns the largest year matched by `YEAR_RE` inside `label`, or `None`
/// when the label contains no such year.
fn extract_year_from_label(label: &str) -> Option<u16> {
    let mut latest: Option<u16> = None;
    for hit in YEAR_RE.find_iter(label) {
        if let Ok(year) = hit.as_str().parse::<u16>() {
            latest = Some(latest.map_or(year, |best| best.max(year)));
        }
    }
    latest
}
/// Parses `s` into a calendar date.
///
/// A full RFC 3339 timestamp is converted to UTC and its date is returned.
/// Otherwise the first ten bytes, if they are all ASCII, are parsed as
/// `YYYY-MM-DD`; anything after them is ignored. Returns `None` when neither
/// form applies.
fn parse_date(s: &str) -> Option<NaiveDate> {
    match chrono::DateTime::parse_from_rfc3339(s) {
        Ok(stamp) => Some(stamp.naive_utc().date()),
        Err(_) => s
            .get(..10)
            // An all-ASCII ten-byte prefix is exactly ten leading ASCII chars.
            .filter(|prefix| prefix.is_ascii())
            .and_then(|prefix| NaiveDate::parse_from_str(prefix, "%Y-%m-%d").ok()),
    }
}