use scraper::{Html, Selector};
use crate::{engine::av::AvQueryFacts, providers::rendered_av};
/// Metadata extracted from a single FC2 article detail page.
///
/// Built by `parse_detail_page`. Optional fields are `None` when the page did
/// not yield a usable value for them.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(super) struct Fc2DetailFacts {
/// The part of the AV number that follows the `FC2-` prefix (ASCII digits only).
pub(super) article_id: String,
/// Detail page URL exactly as the caller passed it to the parser.
pub(super) url: String,
/// Query facts supplied by the caller for this lookup.
pub(super) av: AvQueryFacts,
/// Whitespace-normalized title; falls back to the AV number when the page offers none.
pub(super) title: String,
/// Text of the first article comment/description element, if any.
pub(super) overview: Option<String>,
/// Release date text: a labeled value from the page, else the first ISO-like date in the body.
pub(super) release_date: Option<String>,
/// First four-digit number in `release_date` that lies within 1888..=2100.
pub(super) release_year: Option<i32>,
/// First number found in the labeled runtime value — presumably minutes; TODO confirm units.
pub(super) runtime_minutes: Option<u32>,
/// Labeled seller/maker value from the page.
pub(super) seller: Option<String>,
/// De-duplicated link texts of genre/tag anchors, in document order.
pub(super) tags: Vec<String>,
/// Poster image URL; protocol-relative (`//host/...`) values are rewritten to `https://`.
pub(super) poster_url: Option<String>,
}
/// Parses an FC2 article detail page into [`Fc2DetailFacts`].
///
/// * `html` — markup of the detail page.
/// * `detail_url` — recorded verbatim as the `url` of the result.
/// * `av` — query facts; `av.number` must look like `FC2-<digits>` and is also
///   the last-resort title.
///
/// Returns `None` when `av.number` does not yield an article id, or when no
/// non-empty title candidate exists. Every other field is best-effort.
pub(super) fn parse_detail_page(
    html: &str,
    detail_url: &str,
    av: AvQueryFacts,
) -> Option<Fc2DetailFacts> {
    // Validate the number before paying for an HTML parse. The outcome is the
    // same as checking last, because nothing below has side effects.
    let article_id = article_id_from_av_number(&av.number)?;

    let document = Html::parse_document(html);
    let body_text = element_text(&document, "body").unwrap_or_default();
    let info_block = element_text(&document, ".items_article_info, .items_article_HeadInfo");
    // Without a dedicated info block, label lookups scan the whole body text.
    // Borrowing avoids cloning the (potentially large) body string.
    let info_text = info_block.as_deref().unwrap_or(body_text.as_str());

    let title = first_non_empty(&[
        element_text(&document, "h1").as_deref(),
        // `<meta>` elements carry no text nodes: the Open Graph title is stored
        // in the `content` attribute, so it must be read as an attribute.
        attr_value(&document, "meta[property=\"og:title\"]", "content").as_deref(),
        Some(av.number.as_str()),
    ])?;

    let release_date = fc2_labeled_value(
        &document,
        info_text,
        &["販売日", "配信開始日", "Release Date"],
    )
    .or_else(|| first_iso_date(&body_text));
    let release_year = release_date.as_deref().and_then(first_year);

    let runtime_minutes =
        fc2_labeled_value(&document, info_text, &["収録時間", "再生時間", "Runtime"])
            .and_then(|value| parse_minutes(&value));
    let seller = fc2_labeled_value(&document, info_text, &["販売者", "Seller", "メーカー"]);

    let overview = element_text(
        &document,
        ".items_article_Comment, .items_article_description, .comment",
    );
    let tags = link_texts(&document, "a[href*=\"/genre/\"], a[href*=\"/tag/\"]");

    // Prefer the Open Graph image; otherwise use the main item thumbnail's `src`.
    let poster_url = attr_value(&document, "meta[property=\"og:image\"]", "content")
        .or_else(|| {
            attr_value(
                &document,
                ".items_article_MainitemThumb, .items_article_Mainitem img",
                "src",
            )
        })
        .map(normalize_url);

    Some(Fc2DetailFacts {
        article_id,
        url: detail_url.to_owned(),
        av,
        title,
        overview,
        release_date,
        release_year,
        runtime_minutes,
        seller,
        tags,
        poster_url,
    })
}
/// Extracts the numeric article id from an AV number of the form `FC2-<digits>`.
///
/// Returns `None` when the `FC2-` prefix is missing, when nothing follows the
/// prefix, or when the remainder contains anything other than ASCII digits.
pub(super) fn article_id_from_av_number(number: &str) -> Option<String> {
    number
        .strip_prefix("FC2-")
        // `all` is vacuously true for an empty remainder, so reject it explicitly;
        // otherwise a bare "FC2-" would produce an empty article id.
        .filter(|digits| !digits.is_empty() && digits.bytes().all(|byte| byte.is_ascii_digit()))
        .map(str::to_owned)
}
/// Returns the whitespace-normalized text of the first element matching `selector`.
///
/// Yields `None` when the selector does not parse, when nothing matches, or
/// when the first match contains no visible text. Later matches are not tried.
fn element_text(document: &Html, selector: &str) -> Option<String> {
    let Ok(parsed) = Selector::parse(selector) else {
        return None;
    };
    let first_match = document.select(&parsed).next()?;

    // Join text nodes with spaces so adjacent inline elements do not run together.
    let fragments: Vec<&str> = first_match.text().collect();
    let text = normalize_whitespace(&fragments.join(" "));

    if text.is_empty() {
        None
    } else {
        Some(text)
    }
}
/// Returns the first non-blank value of attribute `attr` among the elements
/// matching `selector`, in document order.
///
/// Yields `None` when the selector does not parse or when no matching element
/// carries a non-blank value for the attribute. The value is returned as
/// written in the markup (not trimmed).
fn attr_value(document: &Html, selector: &str, attr: &str) -> Option<String> {
    let selector = Selector::parse(selector).ok()?;
    document
        .select(&selector)
        .filter_map(|element| element.value().attr(attr))
        // Skip blank values while searching, so an empty attribute on an early
        // match cannot hide a usable value on a later one.
        .find(|value| !value.trim().is_empty())
        .map(str::to_owned)
}
/// Collects the distinct, non-empty, whitespace-normalized texts of every
/// element matching `selector`, keeping first-seen document order.
///
/// An unparsable selector yields an empty list.
fn link_texts(document: &Html, selector: &str) -> Vec<String> {
    let mut unique = Vec::new();
    let Ok(parsed) = Selector::parse(selector) else {
        return unique;
    };

    for anchor in document.select(&parsed) {
        let fragments: Vec<&str> = anchor.text().collect();
        let label = normalize_whitespace(&fragments.join(" "));
        // Drop blank labels and repeats of ones already seen.
        if label.is_empty() || unique.contains(&label) {
            continue;
        }
        unique.push(label);
    }

    unique
}
/// Every field label this parser looks up (release date, runtime and seller
/// variants in Japanese and English).
///
/// Handed to `rendered_av::structured_or_labeled_value` alongside the labels
/// being requested — presumably so the helper can tell where one labeled value
/// ends and the next begins; confirm against that helper.
const FC2_LABELS: &[&str] = &[
"販売日",
"配信開始日",
"Release Date",
"収録時間",
"再生時間",
"Runtime",
"販売者",
"Seller",
"メーカー",
];
/// CSS selector for the paragraph, list-item and table-row elements nested in
/// the `.items_article_info` and `.items_article_HeadInfo` containers.
///
/// The trailing backslash continues the string literal onto the next line.
const FC2_LABEL_ROW_SELECTOR: &str = ".items_article_info p, .items_article_info li, .items_article_info tr, \
.items_article_HeadInfo p, .items_article_HeadInfo li, .items_article_HeadInfo tr";
/// Looks up the value for any of `labels` on an FC2 detail page.
///
/// Thin wrapper that forwards to `rendered_av::structured_or_labeled_value`
/// with the FC2-specific row selector and the full FC2 label list.
///
/// NOTE(review): judging by the helper's name it likely tries the structured
/// rows first and then falls back to scanning `info_text` — confirm against
/// `rendered_av` before relying on that order.
fn fc2_labeled_value(document: &Html, info_text: &str, labels: &[&str]) -> Option<String> {
rendered_av::structured_or_labeled_value(
document,
FC2_LABEL_ROW_SELECTOR,
info_text,
labels,
FC2_LABELS,
)
}
/// Returns the first candidate that is still non-empty after whitespace
/// normalization, in normalized form.
///
/// `None` entries and blank strings are skipped; `None` is returned when no
/// candidate qualifies.
fn first_non_empty(values: &[Option<&str>]) -> Option<String> {
    for candidate in values {
        let Some(raw) = candidate else {
            continue;
        };
        let cleaned = normalize_whitespace(raw);
        if !cleaned.is_empty() {
            return Some(cleaned);
        }
    }
    None
}
/// Finds the first whitespace-separated token that starts with a
/// `YYYY-MM-DD`-shaped date and returns those ten characters.
///
/// Only the shape is checked (four digits, `-`, two digits, `-`, two digits);
/// month and day ranges are not validated. Trailing characters after the date
/// (for example a `T10:00:00Z` time part) are ignored.
fn first_iso_date(text: &str) -> Option<String> {
    text.split_whitespace().find_map(|token| {
        // Inspect bytes rather than slicing the `str`: page text is frequently
        // multi-byte, and slicing at byte 10 would panic off a char boundary.
        let prefix = token.as_bytes().get(..10)?;
        let well_formed = prefix.iter().enumerate().all(|(index, byte)| match index {
            // Both separators must really be hyphens.
            4 | 7 => *byte == b'-',
            _ => byte.is_ascii_digit(),
        });
        if !well_formed {
            return None;
        }
        // The ten bytes are pure ASCII, so byte 10 is a char boundary here.
        token.get(..10).map(str::to_owned)
    })
}
/// Returns the first run of exactly four ASCII digits in `text` whose value
/// lies within `1888..=2100`.
///
/// Digit runs of any other length, and four-digit runs outside that range,
/// are skipped.
fn first_year(text: &str) -> Option<i32> {
    text.split(|character: char| !character.is_ascii_digit())
        .filter(|run| run.len() == 4)
        // Four ASCII digits always fit in an `i32`, so nothing is dropped here.
        .filter_map(|run| run.parse::<i32>().ok())
        .find(|year| (1888..=2100).contains(year))
}
/// Parses the first run of ASCII digits in `value` as a `u32`.
///
/// Returns `None` when there are no digits or when that first run does not
/// fit in a `u32`; later digit runs are never considered.
fn parse_minutes(value: &str) -> Option<u32> {
    let first_run = value
        .split(|character: char| !character.is_ascii_digit())
        .find(|run| !run.is_empty())?;
    first_run.parse::<u32>().ok()
}
/// Turns a protocol-relative URL (`//host/path`) into an `https://` URL.
///
/// Any value that does not start with `//` is returned unchanged.
fn normalize_url(value: String) -> String {
    match value.strip_prefix("//") {
        Some(rest) => format!("https://{rest}"),
        None => value,
    }
}
/// Collapses every run of whitespace in `value` to a single space and strips
/// leading and trailing whitespace.
fn normalize_whitespace(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        // Words are never empty, so a non-empty buffer means a separator is due.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}