nako-metadata-scraper 0.1.0-alpha.2

Official Nako metadata scraper Addon Sidecar.
Documentation
use scraper::{Html, Selector};

use crate::{engine::av::AvQueryFacts, providers::rendered_av};

/// One candidate movie link extracted from a JavDB search results page.
///
/// Instances are produced by `parse_search_results` and later handed to
/// `parse_detail_page`, which reuses the id, title and number as fallbacks.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(super) struct JavdbSearchResult {
    /// Path segment that follows `/v/` in the link's `href`.
    pub(super) movie_id: String,
    /// The link's `href` exactly as it appears in the markup (not resolved or normalized).
    pub(super) url: String,
    /// Whitespace-normalized text content of the link.
    pub(super) title: String,
    /// The query number the search was performed with, copied verbatim.
    pub(super) number: String,
}

/// Metadata extracted from a single JavDB movie detail page by `parse_detail_page`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(super) struct JavdbDetailFacts {
    /// Movie id copied from the originating search result.
    pub(super) movie_id: String,
    /// URL of the detail page, as supplied by the caller.
    pub(super) url: String,
    /// Number facts derived from the detail page's number, or the caller-supplied fallback.
    pub(super) av: AvQueryFacts,
    /// Whitespace-normalized title (page title elements first, search-result title last).
    pub(super) title: String,
    /// Release date text: the labelled value if present, otherwise the first
    /// `YYYY-MM-DD`-shaped token found in the page body.
    pub(super) release_date: Option<String>,
    /// First four-digit run in `release_date` that lies within 1888..=2100.
    pub(super) release_year: Option<i32>,
    /// First integer found in the labelled runtime value.
    pub(super) runtime_minutes: Option<u32>,
    /// De-duplicated texts of links whose `href` contains `/actors/`, in document order.
    pub(super) actors: Vec<String>,
    /// De-duplicated texts of links whose `href` contains `/tags/`, in document order.
    pub(super) tags: Vec<String>,
    /// Labelled studio / maker value.
    pub(super) studio: Option<String>,
    /// Labelled publisher / label value.
    pub(super) publisher: Option<String>,
    /// Labelled series value.
    pub(super) series: Option<String>,
    /// Labelled director value.
    pub(super) director: Option<String>,
    /// Rating rescaled into 0..=1000 (see `parse_rating_milli`).
    pub(super) rating_milli: Option<u16>,
    /// First integer found in the page's "wanted" counter text.
    pub(super) wanted_count: Option<u32>,
    /// Poster image URL (`og:image` first, cover `<img src>` second); a leading `//`
    /// is rewritten to `https://`.
    pub(super) poster_url: Option<String>,
    /// De-duplicated preview image `src` values; a leading `//` is rewritten to `https://`.
    pub(super) backdrop_urls: Vec<String>,
}

/// Extracts the movie links on a JavDB search results page that mention `query_number`.
///
/// A link is kept when its `href` contains a non-empty `/v/<id>` segment and its
/// text, reduced to upper-case ASCII alphanumerics, contains the query reduced the
/// same way (so `ssis-001` matches `SSIS 001 ...`).
///
/// Returns an empty vector when the query contains no ASCII alphanumerics at all:
/// an empty needle would otherwise match every link on the page and label each of
/// them with a meaningless number.
///
/// Results are returned in document order; `number` is always the caller's
/// `query_number`, copied verbatim.
pub(super) fn parse_search_results(html: &str, query_number: &str) -> Vec<JavdbSearchResult> {
    let normalized_query = comparable_number(query_number);
    if normalized_query.is_empty() {
        return Vec::new();
    }
    let Ok(link_selector) = Selector::parse("a.box[href], a[href*=\"/v/\"]") else {
        return Vec::new();
    };
    let document = Html::parse_document(html);

    document
        .select(&link_selector)
        .filter_map(|link| {
            let href = link.value().attr("href")?;
            let movie_id = movie_id_from_url(href)?;
            let title = normalize_whitespace(&link.text().collect::<Vec<_>>().join(" "));
            // An empty title reduces to an empty string, which can never contain the
            // (non-empty) query, so no separate emptiness check is needed.
            comparable_number(&title)
                .contains(&normalized_query)
                .then(|| JavdbSearchResult {
                    movie_id,
                    url: href.to_owned(),
                    title,
                    number: query_number.to_owned(),
                })
        })
        .collect()
}

/// Parses a JavDB movie detail page into [`JavdbDetailFacts`].
///
/// * `html` - raw HTML of the detail page.
/// * `search_result` - the search hit that led here; supplies `movie_id` and acts as
///   the last-resort source for the title and the number.
/// * `detail_url` - URL the page was fetched from; stored verbatim in the result.
/// * `av` - fallback number facts used when the number found on the page cannot be
///   turned into facts by `facts_from_text`.
///
/// Returns `None` when no non-empty title can be found, or when neither the page's
/// number nor the `av` fallback yields number facts.
pub(super) fn parse_detail_page(
    html: &str,
    search_result: &JavdbSearchResult,
    detail_url: &str,
    av: Option<AvQueryFacts>,
) -> Option<JavdbDetailFacts> {
    let document = Html::parse_document(html);
    let body_text = element_text(&document, "body").unwrap_or_default();
    // Without a dedicated info panel, labelled lookups run against the whole body;
    // reuse the text already extracted instead of walking the document again.
    let info_text = element_text(&document, ".movie-panel-info, .video-meta-panel")
        .unwrap_or_else(|| body_text.clone());
    let title = first_non_empty(&[
        element_text(&document, "strong.current-title").as_deref(),
        element_text(&document, "h2.title").as_deref(),
        Some(search_result.title.as_str()),
    ])?;
    let release_date = javdb_labeled_value(
        &document,
        &info_text,
        &["日期", "發行日期", "発売日", "Release Date"],
    )
    .or_else(|| first_iso_date(&body_text));
    let release_year = release_date.as_deref().and_then(first_year);
    let runtime_minutes = javdb_labeled_value(
        &document,
        &info_text,
        &["時長", "片長", "収録時間", "Runtime"],
    )
    .and_then(|value| parse_minutes(&value));
    let studio = javdb_labeled_value(
        &document,
        &info_text,
        &["片商", "製作商", "Studio", "Maker"],
    );
    let publisher = javdb_labeled_value(
        &document,
        &info_text,
        &["發行", "发行", "Publisher", "Label"],
    );
    let series = javdb_labeled_value(&document, &info_text, &["系列", "Series"]);
    let director = javdb_labeled_value(&document, &info_text, &["導演", "导演", "Director"]);
    let actors = link_texts(&document, "a[href*=\"/actors/\"]");
    let tags = link_texts(&document, "a[href*=\"/tags/\"]");
    // Number preference: copy-to-clipboard payload, then the labelled value, then the
    // search result's number (normalized), and finally that number as-is.
    let detail_number = first_non_empty(&[
        attr_value(
            &document,
            ".copy-to-clipboard[data-clipboard-text]",
            "data-clipboard-text",
        )
        .as_deref(),
        detail_number_from_info_text(&document, &info_text).as_deref(),
        Some(search_result.number.as_str()),
    ])
    .unwrap_or_else(|| search_result.number.clone());
    let rating_milli = element_text(&document, ".score, .rating, strong.score")
        .and_then(|value| parse_rating_milli(&value));
    // Parse each source separately so that a matching element without any digits
    // (for example a bare "want" button) does not suppress the labelled fallback.
    let wanted_count = element_text(&document, ".wanted, .want")
        .and_then(|value| parse_first_u32(&value))
        .or_else(|| {
            javdb_labeled_value(&document, &body_text, &["想看", "Wanted"])
                .and_then(|value| parse_first_u32(&value))
        });
    let poster_url = attr_value(&document, "meta[property=\"og:image\"]", "content")
        .or_else(|| attr_value(&document, ".cover img, .movie-cover img", "src"));
    let backdrop_urls = image_urls(
        &document,
        ".preview-images img, .tile-images img, a.tile-item img",
    );

    // Facts derived from the page's own number win; the caller's facts are the fallback.
    let av = crate::engine::av::facts_from_text(
        &detail_number,
        crate::engine::av::AvNumberSource::ExternalId,
    )
    .or(av)?;

    Some(JavdbDetailFacts {
        movie_id: search_result.movie_id.clone(),
        url: detail_url.to_owned(),
        av,
        title,
        release_date,
        release_year,
        runtime_minutes,
        actors,
        tags,
        studio,
        publisher,
        series,
        director,
        rating_milli,
        wanted_count,
        poster_url: poster_url.map(normalize_url),
        backdrop_urls: backdrop_urls.into_iter().map(normalize_url).collect(),
    })
}

/// Looks up the movie number on a detail page via its label.
///
/// Delegates to `javdb_labeled_value` with the Chinese, Japanese and English
/// spellings of the "number / ID" label; `text` is the flat info text passed through
/// to that lookup.
fn detail_number_from_info_text(document: &Html, text: &str) -> Option<String> {
    javdb_labeled_value(document, text, &["番號", "番号", "識別碼", "ID", "Number"])
}

/// Returns the whitespace-normalized text of the first element matching `selector`.
///
/// Yields `None` when the selector fails to parse, when nothing matches, or when the
/// first match has no visible text. Later matches are never consulted.
fn element_text(document: &Html, selector: &str) -> Option<String> {
    let Ok(parsed) = Selector::parse(selector) else {
        return None;
    };
    let first_match = document.select(&parsed).next()?;
    let fragments: Vec<&str> = first_match.text().collect();
    let text = normalize_whitespace(&fragments.join(" "));
    if text.is_empty() {
        None
    } else {
        Some(text)
    }
}

/// Returns attribute `attr` of the first matching element that carries it.
///
/// Yields `None` when the selector fails to parse, when no matching element has the
/// attribute, or when that first attribute value is blank. The value is returned
/// untrimmed.
fn attr_value(document: &Html, selector: &str, attr: &str) -> Option<String> {
    let parsed = Selector::parse(selector).ok()?;
    for element in document.select(&parsed) {
        if let Some(raw) = element.value().attr(attr) {
            // Only the first element carrying the attribute is considered; a blank
            // value there ends the search rather than moving on to later elements.
            return if raw.trim().is_empty() {
                None
            } else {
                Some(raw.to_owned())
            };
        }
    }
    None
}

/// Collects the distinct, non-empty, whitespace-normalized texts of every element
/// matching `selector`, in document order.
///
/// An unparsable selector yields an empty vector.
fn link_texts(document: &Html, selector: &str) -> Vec<String> {
    let mut unique = Vec::new();
    let Ok(parsed) = Selector::parse(selector) else {
        return unique;
    };
    for element in document.select(&parsed) {
        let label = normalize_whitespace(&element.text().collect::<Vec<_>>().join(" "));
        // Keep first occurrences only so the original ordering is preserved.
        if label.is_empty() || unique.contains(&label) {
            continue;
        }
        unique.push(label);
    }
    unique
}

/// Collects the distinct, non-blank `src` attribute values of every element matching
/// `selector`, in document order.
///
/// Values are returned as written in the markup (untrimmed, not resolved). An
/// unparsable selector yields an empty vector.
fn image_urls(document: &Html, selector: &str) -> Vec<String> {
    let mut unique = Vec::new();
    let Ok(parsed) = Selector::parse(selector) else {
        return unique;
    };
    for element in document.select(&parsed) {
        let Some(src) = element.value().attr("src") else {
            continue;
        };
        if src.trim().is_empty() {
            continue;
        }
        let src = src.to_owned();
        // Keep first occurrences only so the original ordering is preserved.
        if !unique.contains(&src) {
            unique.push(src);
        }
    }
    unique
}

/// Every label spelling that this file passes to `javdb_labeled_value`, grouped in
/// the same order as the lookups: release date, runtime, studio, publisher, series,
/// director, wanted count, and number/ID.
///
/// Forwarded unchanged as the last argument of
/// `rendered_av::structured_or_labeled_value`.
// NOTE(review): presumably the helper uses this full list to recognise where one
// labelled value ends and the next label begins in flat text — confirm against
// `rendered_av`.
const JAVDB_LABELS: &[&str] = &[
    "日期",
    "發行日期",
    "発売日",
    "Release Date",
    "時長",
    "片長",
    "収録時間",
    "Runtime",
    "片商",
    "製作商",
    "Studio",
    "Maker",
    "發行",
    "发行",
    "Publisher",
    "Label",
    "系列",
    "Series",
    "導演",
    "导演",
    "Director",
    "想看",
    "Wanted",
    "番號",
    "番号",
    "識別碼",
    "ID",
    "Number",
];

/// CSS selector list matching the `p`, `li` and `tr` descendants of the
/// `.movie-panel-info` and `.video-meta-panel` containers.
///
/// Passed to `rendered_av::structured_or_labeled_value` as the row selector.
// NOTE(review): presumably each matched element is treated as one "label: value"
// row — confirm against `rendered_av`.
const JAVDB_LABEL_ROW_SELECTOR: &str = ".movie-panel-info p, .movie-panel-info li, .movie-panel-info tr, .video-meta-panel p, .video-meta-panel li, .video-meta-panel tr";

/// Looks up the value associated with any of `labels` on a JavDB page.
///
/// Thin wrapper that calls `rendered_av::structured_or_labeled_value` with the
/// JavDB-specific row selector and the full JavDB label list, so call sites only
/// supply the document, the flat text to search, and the labels they care about.
// NOTE(review): judging by the helper's name and arguments, it tries structured rows
// first and falls back to scanning `info_text` — confirm against `rendered_av`.
fn javdb_labeled_value(document: &Html, info_text: &str, labels: &[&str]) -> Option<String> {
    rendered_av::structured_or_labeled_value(
        document,
        JAVDB_LABEL_ROW_SELECTOR,
        info_text,
        labels,
        JAVDB_LABELS,
    )
}

/// Returns the first candidate that is still non-empty after whitespace
/// normalization, in its normalized form.
///
/// `None` entries and entries consisting only of whitespace are skipped.
fn first_non_empty(values: &[Option<&str>]) -> Option<String> {
    for candidate in values {
        let Some(raw) = candidate else {
            continue;
        };
        let cleaned = normalize_whitespace(raw);
        if !cleaned.is_empty() {
            return Some(cleaned);
        }
    }
    None
}

/// Returns the first `YYYY-MM-DD`-shaped date that starts a whitespace-separated
/// token of `text`.
///
/// Only the shape is checked: four digits, `-`, two digits, `-`, two digits. Month
/// and day ranges are not validated. Anything after the tenth byte of the token
/// (for example a `T10:00` time suffix) is ignored.
fn first_iso_date(text: &str) -> Option<String> {
    text.split_whitespace().find_map(|token| {
        // Inspect bytes rather than slicing the string: `&token[..10]` panics when
        // byte 10 falls inside a multi-byte character (e.g. `SSIS-001中文字幕`).
        let prefix = token.as_bytes().get(..10)?;
        let well_formed = prefix.iter().enumerate().all(|(index, byte)| match index {
            4 | 7 => *byte == b'-',
            _ => byte.is_ascii_digit(),
        });
        if !well_formed {
            return None;
        }
        // The first ten bytes are all ASCII here, so byte 10 is a character boundary
        // and the checked slice always succeeds.
        token.get(..10).map(str::to_owned)
    })
}

/// Returns the first run of exactly four ASCII digits in `text` whose value lies in
/// 1888..=2100.
///
/// Digit runs of any other length are ignored, as are four-digit runs outside that
/// range.
fn first_year(text: &str) -> Option<i32> {
    text.split(|character: char| !character.is_ascii_digit())
        .filter(|run| run.len() == 4)
        // Four ASCII digits always fit in an `i32`, so parsing cannot fail here.
        .filter_map(|run| run.parse::<i32>().ok())
        .find(|year| (1888..=2100).contains(year))
}

/// Parses a runtime value into minutes.
///
/// Delegates to `parse_first_u32`, so the result is simply the first run of ASCII
/// digits in `value`; any unit text around it is ignored rather than interpreted.
fn parse_minutes(value: &str) -> Option<u32> {
    parse_first_u32(value)
}

/// Parses the first run of ASCII digits in `value` as a `u32`.
///
/// Returns `None` when `value` contains no digits or when that first run does not
/// fit in a `u32`; later digit runs are never tried.
fn parse_first_u32(value: &str) -> Option<u32> {
    let start = value.find(|character: char| character.is_ascii_digit())?;
    let tail = &value[start..];
    let end = tail
        .find(|character: char| !character.is_ascii_digit())
        .unwrap_or(tail.len());
    tail[..end].parse::<u32>().ok()
}

/// Converts a textual rating into an integer in 0..=1000.
///
/// The trimmed text must parse as a finite floating-point number. Values up to and
/// including 5.0 are multiplied by 200; larger values are multiplied by 100. The
/// result is rounded and clamped to 0..=1000.
///
/// Returns `None` for text that is not a number and for the non-finite spellings
/// `f64` accepts (`NaN`, `inf`, `infinity`), which would otherwise be silently
/// reported as a rating of 0 or 1000.
fn parse_rating_milli(value: &str) -> Option<u16> {
    let rating = value.trim().parse::<f64>().ok()?;
    if !rating.is_finite() {
        return None;
    }
    let scale = if rating <= 5.0 { 200.0 } else { 100.0 };
    let scaled = (rating * scale).round().clamp(0.0, 1000.0);
    // Integral and within 0.0..=1000.0 after the clamp, so the cast is lossless.
    Some(scaled as u16)
}

/// Extracts the movie id that follows the first `/v/` in `url`.
///
/// The id ends at the next `/`, `?` or `#`, or at the end of the string. Returns
/// `None` when `url` has no `/v/` segment or when the id would be empty.
fn movie_id_from_url(url: &str) -> Option<String> {
    let (_, tail) = url.split_once("/v/")?;
    // `split` always yields at least one (possibly empty) piece.
    let id = tail.split(['/', '?', '#']).next().unwrap_or(tail);
    if id.is_empty() {
        None
    } else {
        Some(id.to_owned())
    }
}

/// Reduces `value` to its ASCII letters and digits, upper-cased.
///
/// Separators, whitespace and all non-ASCII characters are dropped, so differently
/// punctuated spellings of the same number compare equal.
fn comparable_number(value: &str) -> String {
    let mut reduced = String::with_capacity(value.len());
    for character in value.chars() {
        if character.is_ascii_alphanumeric() {
            reduced.push(character.to_ascii_uppercase());
        }
    }
    reduced
}

/// Rewrites a protocol-relative URL (`//host/path`) to use `https://`.
///
/// Any other input is returned unchanged.
fn normalize_url(value: String) -> String {
    match value.strip_prefix("//") {
        Some(rest) => format!("https://{rest}"),
        None => value,
    }
}

/// Collapses every run of whitespace in `value` to a single space and strips
/// leading and trailing whitespace.
fn normalize_whitespace(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first word.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}