// nako-metadata-scraper 0.1.0-alpha.2
//
// Official Nako metadata scraper Addon Sidecar.
// Documentation
use scraper::{Html, Selector};

use crate::{
    engine::av::{AvNumberSource, AvQueryFacts, facts_from_text},
    providers::rendered_av,
};

/// One candidate product link taken from a DMM search results page.
///
/// Field descriptions reflect how `parse_search_results` fills the struct.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(super) struct DmmSearchResult {
    /// Content id read from the `cid=` segment of the link's `href`.
    pub(super) cid: String,
    /// The link's `href` attribute, copied as found in the page.
    pub(super) url: String,
    /// Whitespace-normalized text of the link.
    pub(super) title: String,
    /// Copy of the queried AV number that this link was matched against.
    pub(super) number: String,
}

/// Metadata extracted from a single DMM product detail page.
///
/// Field descriptions reflect how `parse_detail_page` fills the struct.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(super) struct DmmDetailFacts {
    /// Content id parsed from the detail URL, or the search result's id when the URL has none.
    pub(super) cid: String,
    /// The detail page URL exactly as passed to the parser.
    pub(super) url: String,
    /// AV number facts resolved for this product.
    pub(super) av: AvQueryFacts,
    /// Product title: page heading, else `og:title`, else the search result title.
    pub(super) title: String,
    /// Text of the first non-empty overview element (`.story`, `.mg-b20.lh4`, `#comment`,
    /// `.comment`), if any.
    pub(super) overview: Option<String>,
    /// Release date string as found on the page (labeled value, else the first ISO-like
    /// date in the body text); `parse_detail_page` does not reformat it.
    pub(super) release_date: Option<String>,
    /// Four-digit year picked out of `release_date`.
    pub(super) release_year: Option<i32>,
    /// First integer found in the labeled runtime value.
    pub(super) runtime_minutes: Option<u32>,
    /// De-duplicated texts of actress/actor links, in page order.
    pub(super) actors: Vec<String>,
    /// De-duplicated texts of keyword/genre links, in page order.
    pub(super) tags: Vec<String>,
    /// Labeled maker / studio value.
    pub(super) maker: Option<String>,
    /// Labeled label / publisher value.
    pub(super) label: Option<String>,
    /// Labeled series value.
    pub(super) series: Option<String>,
    /// Labeled director value.
    pub(super) director: Option<String>,
    /// Review rating rescaled into the range `0..=1000`.
    pub(super) rating_milli: Option<u16>,
    /// `og:image` content, else the package image `src`; a leading `//` becomes `https://`.
    pub(super) poster_url: Option<String>,
    /// De-duplicated sample image `src` values; a leading `//` becomes `https://`.
    pub(super) backdrop_urls: Vec<String>,
}

/// Collects the product links on a DMM search results page that match `av`.
///
/// Every `<a>` whose `href` contains `cid=` is considered. A link is kept when a
/// content id can be read from its `href`, its text is non-empty, and either the
/// text or the content id matches the queried number. Results keep page order and
/// are not de-duplicated.
pub(super) fn parse_search_results(html: &str, av: &AvQueryFacts) -> Vec<DmmSearchResult> {
    let selector = match Selector::parse("a[href*=\"cid=\"]") {
        Ok(selector) => selector,
        Err(_) => return Vec::new(),
    };
    let document = Html::parse_document(html);

    document
        .select(&selector)
        .filter_map(|anchor| {
            let href = anchor.value().attr("href")?;
            let cid = cid_from_url(href)?;
            let title = normalize_whitespace(&anchor.text().collect::<Vec<_>>().join(" "));
            if title.is_empty() || !text_or_cid_matches_av(&title, &cid, av) {
                return None;
            }
            Some(DmmSearchResult {
                cid,
                url: href.to_owned(),
                title,
                number: av.number.clone(),
            })
        })
        .collect()
}

pub(super) fn parse_detail_page(
    html: &str,
    search_result: &DmmSearchResult,
    detail_url: &str,
    av: Option<AvQueryFacts>,
) -> Option<DmmDetailFacts> {
    let document = Html::parse_document(html);
    let body_text = element_text(&document, "body").unwrap_or_default();
    let info_text = element_text(&document, ".product-info, #mu, table, body")
        .unwrap_or_else(|| body_text.clone());
    let title = first_non_empty(&[
        element_text(&document, "h1#title, h1.product-title, h1").as_deref(),
        attr_value(&document, "meta[property=\"og:title\"]", "content").as_deref(),
        Some(search_result.title.as_str()),
    ])?;
    let overview = first_non_empty(&[
        element_text(&document, ".story").as_deref(),
        element_text(&document, ".mg-b20.lh4").as_deref(),
        element_text(&document, "#comment").as_deref(),
        element_text(&document, ".comment").as_deref(),
    ]);
    let release_date = dmm_labeled_value(
        &document,
        &info_text,
        &["発売日", "配信開始日", "Release Date"],
    )
    .or_else(|| first_iso_date(&body_text));
    let release_year = release_date.as_deref().and_then(first_year);
    let runtime_minutes = dmm_labeled_value(
        &document,
        &info_text,
        &["収録時間", "再生時間", "Duration", "Runtime"],
    )
    .and_then(|value| parse_minutes(&value));
    let maker = dmm_labeled_value(&document, &info_text, &["メーカー", "Maker", "Studio"]);
    let label = dmm_labeled_value(&document, &info_text, &["レーベル", "Label", "Publisher"]);
    let series = dmm_labeled_value(&document, &info_text, &["シリーズ", "Series"]);
    let director = dmm_labeled_value(&document, &info_text, &["監督", "Director"]);
    let product_number = dmm_labeled_value(
        &document,
        &info_text,
        &["品番", "商品番号", "Product ID", "Number"],
    )
    .or_else(|| facts_from_text(&body_text, AvNumberSource::ExternalId).map(|facts| facts.number));
    let actors = link_texts(
        &document,
        "a[href*=\"/actress/\"], a[href*=\"article=actress\"], a[href*=\"article=actor\"]",
    );
    let tags = link_texts(
        &document,
        "a[href*=\"article=keyword\"], a[href*=\"article=genre\"], a[href*=\"/genre/\"], a[href*=\"/keyword/\"]",
    );
    let rating_milli = element_text(
        &document,
        ".review-average, .d-review__average, [class*=\"review\"]",
    )
    .and_then(|value| parse_rating_milli(&value));
    let poster_url = attr_value(&document, "meta[property=\"og:image\"]", "content")
        .or_else(|| attr_value(&document, "#package-image img, .package-image img", "src"))
        .map(normalize_url);
    let backdrop_urls = image_urls(
        &document,
        ".sample-image-block img, a[name*=\"sample\"] img, .sample-image img",
    );

    let av = product_number
        .as_deref()
        .and_then(|value| facts_from_text(value, AvNumberSource::ExternalId))
        .or_else(|| facts_from_text(&search_result.number, AvNumberSource::ExternalId))
        .or(av)?;

    Some(DmmDetailFacts {
        cid: cid_from_url(detail_url).unwrap_or_else(|| search_result.cid.clone()),
        url: detail_url.to_owned(),
        av,
        title,
        overview,
        release_date,
        release_year,
        runtime_minutes,
        actors,
        tags,
        maker,
        label,
        series,
        director,
        rating_milli,
        poster_url,
        backdrop_urls: backdrop_urls.into_iter().map(normalize_url).collect(),
    })
}

/// Reads the content id that follows the first `cid=` in `url`.
///
/// The id runs up to the next `/`, `?`, `#` or `&`, or to the end of the string.
/// Returns `None` when `cid=` is absent or the id is empty.
pub(super) fn cid_from_url(url: &str) -> Option<String> {
    let (_, after_marker) = url.split_once("cid=")?;
    // `split` always yields at least one (possibly empty) segment.
    let cid = after_marker.split(['/', '?', '#', '&']).next()?;
    if cid.is_empty() {
        None
    } else {
        Some(cid.to_owned())
    }
}

/// Decides whether a search link refers to the queried AV number.
///
/// A link matches when AV facts parsed from its text or content id carry the same
/// number (ASCII case-insensitive). Failing that, it matches when the queried
/// number, reduced to uppercase ASCII alphanumerics, occurs inside the similarly
/// reduced text or content id.
fn text_or_cid_matches_av(text: &str, cid: &str, av: &AvQueryFacts) -> bool {
    let parsed_match = [text, cid]
        .into_iter()
        .filter_map(|value| facts_from_text(value, AvNumberSource::ExternalId))
        .any(|facts| facts.number.eq_ignore_ascii_case(&av.number));
    if parsed_match {
        return true;
    }

    let needle = comparable_number(&av.number);
    // An empty needle is contained in every string; without this guard a number with
    // no ASCII alphanumerics would accept every link on the page.
    if needle.is_empty() {
        return false;
    }
    [text, cid]
        .into_iter()
        .any(|value| comparable_number(value).contains(needle.as_str()))
}

/// Returns the whitespace-normalized text of the first element matching `selector`.
///
/// Only the first match is inspected. Returns `None` when the selector does not
/// parse, nothing matches, or that first element has no text.
fn element_text(document: &Html, selector: &str) -> Option<String> {
    let Ok(parsed) = Selector::parse(selector) else {
        return None;
    };
    let first_match = document.select(&parsed).next()?;
    let pieces: Vec<&str> = first_match.text().collect();
    let text = normalize_whitespace(&pieces.join(" "));
    if text.is_empty() {
        None
    } else {
        Some(text)
    }
}

/// Returns attribute `attr` of the first matching element that carries it.
///
/// Elements lacking the attribute are skipped, but the search stops at the first
/// element that has it. If that value is blank the result is `None`. The value is
/// returned untrimmed.
fn attr_value(document: &Html, selector: &str, attr: &str) -> Option<String> {
    let Ok(parsed) = Selector::parse(selector) else {
        return None;
    };
    for element in document.select(&parsed) {
        if let Some(raw) = element.value().attr(attr) {
            return if raw.trim().is_empty() {
                None
            } else {
                Some(raw.to_owned())
            };
        }
    }
    None
}

/// Collects the distinct, whitespace-normalized texts of all elements matching
/// `selector`, in the order they are first seen.
///
/// Elements with empty text are skipped. An unparsable selector yields an empty list.
fn link_texts(document: &Html, selector: &str) -> Vec<String> {
    let mut unique: Vec<String> = Vec::new();
    if let Ok(parsed) = Selector::parse(selector) {
        for element in document.select(&parsed) {
            let text = normalize_whitespace(&element.text().collect::<Vec<_>>().join(" "));
            if !text.is_empty() && !unique.contains(&text) {
                unique.push(text);
            }
        }
    }
    unique
}

/// Collects the distinct `src` attribute values of all elements matching `selector`,
/// in the order they are first seen.
///
/// Blank values are skipped. Kept values are stored untrimmed. An unparsable
/// selector yields an empty list.
fn image_urls(document: &Html, selector: &str) -> Vec<String> {
    let mut unique: Vec<String> = Vec::new();
    if let Ok(parsed) = Selector::parse(selector) {
        for element in document.select(&parsed) {
            let Some(src) = element.value().attr("src") else {
                continue;
            };
            if src.trim().is_empty() {
                continue;
            }
            let src = src.to_owned();
            if !unique.contains(&src) {
                unique.push(src);
            }
        }
    }
    unique
}

/// Every field label that `parse_detail_page` looks up, in Japanese and English.
///
/// Passed to `rendered_av::structured_or_labeled_value` alongside the labels of the
/// field being read. NOTE(review): presumably that helper uses the full list to tell
/// where one labeled value ends and the next label begins — confirm in `rendered_av`.
const DMM_LABELS: &[&str] = &[
    // Product number
    "品番",
    "商品番号",
    "Product ID",
    "Number",
    // Release date
    "発売日",
    "配信開始日",
    "Release Date",
    // Runtime
    "収録時間",
    "再生時間",
    "Duration",
    "Runtime",
    // Maker
    "メーカー",
    "Maker",
    "Studio",
    // Label
    "レーベル",
    "Label",
    "Publisher",
    // Series
    "シリーズ",
    "Series",
    // Director
    "監督",
    "Director",
];

/// CSS selector list handed to `rendered_av::structured_or_labeled_value` as the row
/// selector: direct `p`/`li` children and table rows of `.product-info` and `#mu`,
/// plus any table row.
const DMM_LABEL_ROW_SELECTOR: &str =
    ".product-info > p, .product-info > li, .product-info tr, #mu > p, #mu > li, #mu tr, table tr";

/// Looks up the value for any of `labels` on a DMM detail page.
///
/// Delegates to `rendered_av::structured_or_labeled_value`, supplying the DMM row
/// selector and the full DMM label list. `info_text` is forwarded unchanged.
/// NOTE(review): it is presumably the plain-text fallback used when no structured
/// row matches — confirm in `rendered_av`.
fn dmm_labeled_value(document: &Html, info_text: &str, labels: &[&str]) -> Option<String> {
    rendered_av::structured_or_labeled_value(
        document,
        DMM_LABEL_ROW_SELECTOR,
        info_text,
        labels,
        DMM_LABELS,
    )
}

/// Returns the first candidate that is still non-empty after whitespace
/// normalization, in normalized form. `None` entries are skipped.
fn first_non_empty(values: &[Option<&str>]) -> Option<String> {
    for candidate in values.iter().copied().flatten() {
        let cleaned = normalize_whitespace(candidate);
        if !cleaned.is_empty() {
            return Some(cleaned);
        }
    }
    None
}

/// Finds the first whitespace-separated token that starts with a `YYYY-MM-DD` date
/// and returns those ten characters.
///
/// Only the shape is checked (digits and the two hyphens), not whether the month
/// and day are valid. Trailing characters after the date, such as a time suffix,
/// are ignored.
fn first_iso_date(text: &str) -> Option<String> {
    text.split_whitespace().find_map(|token| {
        // Inspect bytes rather than slicing the `str` first: a multi-byte character
        // straddling offset 10 would make `&token[..10]` panic.
        let head = token.as_bytes().get(..10)?;
        let well_formed = head.iter().enumerate().all(|(index, byte)| match index {
            4 | 7 => *byte == b'-',
            _ => byte.is_ascii_digit(),
        });
        // All ten bytes are ASCII here, so offset 10 is a character boundary.
        well_formed.then(|| token[..10].to_owned())
    })
}

/// Returns the first run of exactly four ASCII digits in `text` whose value lies in
/// `1888..=2100`.
fn first_year(text: &str) -> Option<i32> {
    text.split(|character: char| !character.is_ascii_digit())
        .filter(|run| run.len() == 4)
        // Four ASCII digits always fit in an `i32`, so nothing is dropped here.
        .filter_map(|run| run.parse::<i32>().ok())
        .find(|year| (1888..=2100).contains(year))
}

/// Parses the first run of ASCII digits in `value` as a `u32`.
///
/// Returns `None` when there are no digits or when that first run does not fit in
/// a `u32`. Later runs are not tried.
fn parse_minutes(value: &str) -> Option<u32> {
    let start = value.find(|character: char| character.is_ascii_digit())?;
    let tail = &value[start..];
    let end = tail
        .find(|character: char| !character.is_ascii_digit())
        .unwrap_or(tail.len());
    tail[..end].parse::<u32>().ok()
}

/// Converts the first numeric token in `value` into a rating on a `0..=1000` scale.
///
/// Values up to `5.0` are treated as a five-point rating and multiplied by 200.
/// Larger values are multiplied by 100. The result is rounded and clamped to
/// `0..=1000`. Returns `None` when no whitespace-separated token is a finite number.
fn parse_rating_milli(value: &str) -> Option<u16> {
    let rating = value
        .split_whitespace()
        .filter_map(|token| token.parse::<f64>().ok())
        // `f64::from_str` also accepts "NaN", "inf" and "infinity"; such words in the
        // surrounding text are not ratings and must not shadow a real number.
        .find(|rating| rating.is_finite())?;
    let scaled = if rating <= 5.0 {
        rating * 200.0
    } else {
        rating * 100.0
    };
    // The clamp bounds the finite value to 0..=1000, so the cast cannot truncate.
    Some(scaled.round().clamp(0.0, 1000.0) as u16)
}

/// Reduces `value` to its ASCII letters and digits, upper-cased, so that numbers
/// can be compared regardless of separators and case.
fn comparable_number(value: &str) -> String {
    let mut key = String::with_capacity(value.len());
    for character in value.chars() {
        if character.is_ascii_alphanumeric() {
            key.push(character.to_ascii_uppercase());
        }
    }
    key
}

/// Turns a protocol-relative URL (`//host/path`) into an `https://` URL; any other
/// value is returned unchanged.
fn normalize_url(value: String) -> String {
    match value.strip_prefix("//") {
        Some(rest) => format!("https://{rest}"),
        None => value,
    }
}

/// Trims `value` and collapses every internal run of whitespace into one space.
fn normalize_whitespace(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first word.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}