use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::sync::LazyLock;
use chrono::DateTime;
use html_escape::decode_html_entities;
use log::debug;
use regex::Regex;
use scraper::{Html, Selector};
use crate::backends::{Backend, BackendError, ChapterListElem, ChapterOrderingFn};
use crate::utils::get;
use crate::Chapter;
/// CSS selector for the `<a>` element in the first cell of every `tr.chapter-row` of the
/// `table#chapters` table.
///
/// It is run against the fiction page to obtain the chapter titles (the anchors' inner HTML)
/// and the chapter links (their `href` attribute).
static CHAPTER_TITLE_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
// The selector is a literal, so `unwrap` can only fail if the literal itself is invalid.
Selector::parse("table#chapters tbody tr.chapter-row td:first-child a").unwrap()
});
/// CSS selector for the `<time>` element in the last cell of every `tr.chapter-row` of the
/// `table#chapters` table.
///
/// The `datetime` attribute of the matched element is parsed as an RFC 3339 timestamp when a
/// chapter is fetched.
static CHAPTER_CREATED_AT_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
// The selector is a literal, so `unwrap` can only fail if the literal itself is invalid.
Selector::parse("table#chapters tbody tr.chapter-row td:last-child time").unwrap()
});
/// CSS selector for the `<meta property='books:author'>` elements of the fiction page; the
/// author names are read from their `content` attribute.
static FICTION_AUTHORS_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("meta[property='books:author']").unwrap());
/// CSS selector for the `h1.font-white` heading inside the `fic-header` row of a chapter page;
/// its inner HTML is used as the chapter title.
static CHAPTER_PAGE_TITLE_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("div.row.fic-header div.row div h1.font-white").unwrap());
/// CSS selector for the `div.chapter-inner.chapter-content` element of a chapter page; its
/// inner HTML is used as the chapter content.
///
/// The selector spells out the full chain of ancestor containers, so it stops matching if any
/// of those containers is renamed or removed from the page.
static CHAPTER_PAGE_CONTENT: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("div.page-container div.page-content-wrapper div.page-content div.container.chapter-page div div div.portlet-body div.chapter-inner.chapter-content").unwrap()
});
/// CSS selector for the `h1.font-white` heading inside the `fic-title` block of the fiction
/// page; its inner HTML is used as the fiction title.
static FICTION_TITLE_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("div.row.fic-header div.fic-title div.col h1.font-white").unwrap()
});
/// CSS selector for the `<meta property='og:image'>` element of the fiction page; the cover
/// URL is read from its `content` attribute.
static FICTION_IMAGE_URL_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("meta[property='og:image']").unwrap());
/// Content of `known_anti-theft_sentences.txt`, embedded at compile time (one sentence per
/// line).
const ROYALROAD_ANTI_THEFT_TEXT: &str =
    include_str!("../../ressources/royalroad/known_anti-theft_sentences.txt");

/// Every non-empty line of [`ROYALROAD_ANTI_THEFT_TEXT`], wrapped in `<p>…</p>`.
///
/// These strings are removed verbatim from downloaded chapter pages.
// NOTE(review): presumably these are sentences the site injects into chapter text — confirm.
static ROYALROAD_ANTI_THEFT_TEXT_ARRAY: LazyLock<Vec<String>> = LazyLock::new(|| {
    let mut paragraphs = Vec::new();
    for sentence in ROYALROAD_ANTI_THEFT_TEXT.lines() {
        // Blank lines in the resource file are separators, not sentences.
        if sentence.is_empty() {
            continue;
        }
        paragraphs.push(format!("<p>{sentence}</p>"));
    }
    paragraphs
});
/// Matches an absolute RoyalRoad chapter URL (`http` or `https`, host `www.royalroad.com`).
///
/// Named groups: `fiction_id` and `chapter_id` (digits), `fiction_title_slug` and
/// `chapter_title_slug` (word characters and `-`). The pattern is not anchored, so it also
/// matches when the URL is embedded in a longer string.
static ROYALROAD_CHAPTER_URL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"https?://www\.royalroad\.com/fiction/(?<fiction_id>\d+)/(?<fiction_title_slug>[\w-]+)/chapter/(?<chapter_id>\d+)/(?<chapter_title_slug>[\w-]+)").unwrap()
});
/// Matches an opening `<p>` tag whose only attribute is `class`, e.g. `<p class="abc">`.
///
/// Used to replace such tags with a bare `<p>`.
///
/// The attribute value is matched with `[^"]*` rather than `.*`: a greedy `.*` would extend to
/// the *last* `">` on the same line, so a line holding several tags would have everything
/// between the first `<p class="` and that last `">` replaced — i.e. chapter text deleted.
static ROYALROAD_P_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"<p class="[^"]*">"#).unwrap());
/// Backend scraping a fiction hosted on `www.royalroad.com`.
///
/// The fiction page is downloaded and parsed once, in `new`; title, authors, cover and chapter
/// list are then read from that parsed document.
pub struct RoyalRoad {
/// URL of the fiction page, exactly as passed to `new` (empty for `Default`).
url: String,
/// Parsed HTML of the fiction page (an empty document for `Default`).
fiction_page: Html,
}
impl Default for RoyalRoad {
    /// Builds a placeholder backend: empty URL and an empty HTML document.
    ///
    /// No network access is performed.
    fn default() -> Self {
        let url = String::new();
        let fiction_page = Html::new_document();
        Self { url, fiction_page }
    }
}
impl Debug for RoyalRoad {
    /// Formats the backend showing only its URL; the parsed fiction page is left out.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Destructure exhaustively so that adding a field to `RoyalRoad` forces a decision
        // here about whether it should be printed.
        let Self {
            url,
            fiction_page: _,
        } = self;
        // NOTE(review): the printed label is spelled `Royalroad`, which differs from the type
        // name `RoyalRoad` — confirm whether that is intentional before changing it.
        f.debug_struct("Royalroad").field("url", url).finish()
    }
}
impl Backend for RoyalRoad {
fn get_backend_regexps() -> Vec<Regex> {
vec![Regex::new(
r"https?://www\.royalroad\.com/fiction/(?<fiction_id>\d+)/(?<fiction_title_slug>[\w\-]+)",
)
.unwrap()]
}
/// Returns the name identifying this backend.
fn get_backend_name() -> &'static str {
    const BACKEND_NAME: &str = "royalroad";
    BACKEND_NAME
}
/// Returns the comparison used to sort chapters of this backend: chapters are compared by the
/// value returned by `published_at()`.
fn get_ordering_function() -> ChapterOrderingFn {
    Box::new(|left: &Chapter, right: &Chapter| {
        let left_date = left.published_at();
        let right_date = right.published_at();
        left_date.cmp(right_date)
    })
}
/// Downloads the fiction page at `url` and parses it.
///
/// # Errors
/// * Propagates any error raised while sending the request or reading the response body.
/// * Returns `BackendError::RequestFailed` (with the status and the response body) when the
///   response status is not a success.
fn new(url: &str) -> Result<Self, BackendError> {
    let response = get(url)?;
    let status = response.status();
    if status.is_success() {
        let body = response.text()?;
        return Ok(Self {
            url: url.to_string(),
            fiction_page: Html::parse_document(&body),
        });
    }
    Err(BackendError::RequestFailed {
        message: format!("Could not get fiction URL {url}"),
        status,
        content: response.text()?,
    })
}
fn title(&self) -> Result<String, BackendError> {
let title = self
.fiction_page
.select(&FICTION_TITLE_SELECTOR)
.map(|selection| selection.inner_html())
.next();
debug!("Got title: {:?}", title);
if title.is_none() {
return Err(BackendError::ParseError(format!(
"Failed to get title from {}",
self.url
)));
}
Ok(title.unwrap())
}
/// Builds an identifier for the fiction from its URL, as `<fiction_title_slug>-<fiction_id>`.
///
/// # Errors
/// Returns `BackendError::ParseError` when the stored URL does not match the backend's
/// fiction URL pattern.
fn immutable_identifier(&self) -> Result<String, BackendError> {
    let patterns = Self::get_backend_regexps();
    let Some(captures) = patterns[0].captures(&self.url) else {
        return Err(BackendError::ParseError("Unable to parse URL".to_string()));
    };
    // Both groups are mandatory parts of the pattern, so they are present whenever it matches.
    let fiction_id = &captures["fiction_id"];
    let title_slug = &captures["fiction_title_slug"];
    Ok(format!("{title_slug}-{fiction_id}"))
}
/// Returns an owned copy of the fiction URL this backend was created with.
fn url(&self) -> String {
    String::from(self.url.as_str())
}
/// Returns the URL of the fiction cover, taken from the `content` attribute of the page's
/// `og:image` meta tag.
///
/// # Errors
/// Returns `BackendError::ParseError` when the meta tag is absent or has no `content`
/// attribute.
fn cover_url(&self) -> Result<String, BackendError> {
    // `ok_or_else` keeps the error strings from being allocated on the success path.
    let meta = self
        .fiction_page
        .select(&FICTION_IMAGE_URL_SELECTOR)
        .next()
        .ok_or_else(|| {
            BackendError::ParseError("Could not find fiction cover image url".to_string())
        })?;
    let img_url = meta.attr("content").ok_or_else(|| {
        BackendError::ParseError(
            "Could not find property \"content\" when searching for cover image".to_string(),
        )
    })?;
    Ok(img_url.to_string())
}
/// Returns the authors of the fiction, one entry per `books:author` meta tag of the fiction
/// page, in document order.
///
/// # Errors
/// Returns `BackendError::ParseError` when one of the meta tags has no `content` attribute,
/// or when no author is found at all.
fn get_authors(&self) -> Result<Vec<String>, BackendError> {
    let mut authors = Vec::new();
    for meta in self.fiction_page.select(&FICTION_AUTHORS_SELECTOR) {
        match meta.attr("content") {
            Some(name) => authors.push(name.to_string()),
            None => {
                // Stop at the first malformed tag and report it with the fiction URL as context.
                let cause = BackendError::ParseError("Failed to find 'content' attribute while looking at <meta property='books:author'>".to_string());
                return Err(BackendError::ParseError(format!(
                    "Failed to get authors from {}: {}",
                    self.url, cause
                )));
            }
        }
    }
    if authors.is_empty() {
        return Err(BackendError::ParseError(format!(
            "Failed to get authors from {}: Resulting author list is empty",
            self.url
        )));
    }
    Ok(authors)
}
/// Lists the chapters found in the chapter table of the fiction page.
///
/// Each entry pairs the 1-based position of the chapter in the table with its title (trimmed,
/// HTML entities decoded).
fn get_chapter_list(&self) -> Result<Vec<ChapterListElem>, BackendError> {
    let mut chapters: Vec<ChapterListElem> = Vec::new();
    for (position, link) in self
        .fiction_page
        .select(&CHAPTER_TITLE_SELECTOR)
        .enumerate()
    {
        let raw_title = link.inner_html();
        // `trim` strips every leading/trailing whitespace character, newlines included.
        let title = decode_html_entities(raw_title.trim()).to_string();
        chapters.push((position + 1, title));
    }
    Ok(chapters)
}
/// Downloads chapter number `chapter_number` (1-based position in the chapter table of the
/// fiction page) and returns it.
///
/// The returned chapter carries its index, title, URL, fiction URL, publication date (UTC),
/// content (inner HTML of the chapter container) and a metadata map holding the `chapter_id`
/// and `fiction_id` extracted from the chapter URL.
///
/// # Errors
/// * `BackendError::UnknownChapter` when `chapter_number` is 0 or past the end of the table.
/// * `BackendError::ParseError` when the chapter row lacks its `href`/`datetime` attribute,
///   when the chapter URL has an unexpected shape, or when the chapter page lacks its title
///   or content element.
/// * `BackendError::RequestFailed` when the chapter page answers with a non-success status.
/// * Any error propagated from parsing the publication date, sending the request or reading
///   the response body.
fn get_chapter(&self, chapter_number: usize) -> Result<Chapter, BackendError> {
    // Chapters are numbered from 1; 0 would underflow the row index below.
    if chapter_number == 0 {
        return Err(BackendError::UnknownChapter(chapter_number));
    }
    let row_index = chapter_number - 1;

    // Pick the row first and only then read its attribute, so that a malformed *other* row
    // cannot make this lookup fail, and so that a missing attribute is an error, not a panic.
    let chapter_path = self
        .fiction_page
        .select(&CHAPTER_TITLE_SELECTOR)
        .nth(row_index)
        .ok_or(BackendError::UnknownChapter(chapter_number))?
        .attr("href")
        .ok_or_else(|| {
            BackendError::ParseError(format!(
                "Chapter {chapter_number} of {} has no 'href' attribute",
                self.url
            ))
        })?;
    let published_at = self
        .fiction_page
        .select(&CHAPTER_CREATED_AT_SELECTOR)
        .nth(row_index)
        .ok_or(BackendError::UnknownChapter(chapter_number))?
        .attr("datetime")
        .ok_or_else(|| {
            BackendError::ParseError(format!(
                "Chapter {chapter_number} of {} has no 'datetime' attribute",
                self.url
            ))
        })?;
    // Validate the date before going to the network: no point downloading the chapter if the
    // row is unusable anyway.
    let chapter_date = DateTime::parse_from_rfc3339(published_at)?;

    // The table holds site-relative links.
    let chapter_url = format!("https://www.royalroad.com{}", chapter_path);
    let captures = ROYALROAD_CHAPTER_URL_REGEX
        .captures(&chapter_url)
        .ok_or_else(|| {
            BackendError::ParseError(format!("Unexpected chapter URL format: {chapter_url}"))
        })?;
    // Both groups are mandatory parts of the pattern, so they are present once it matched.
    let metadata = HashMap::from([
        ("chapter_id".to_string(), captures["chapter_id"].to_string()),
        ("fiction_id".to_string(), captures["fiction_id"].to_string()),
    ]);

    debug!("Attempting to get chapter {chapter_url}");
    let res = get(&chapter_url)?;
    if !res.status().is_success() {
        return Err(BackendError::RequestFailed {
            message: format!(
                "failed to get chapter {} from {}",
                chapter_number, &chapter_url,
            ),
            status: res.status(),
            content: res.text()?,
        });
    }

    let mut txt = res.text()?;
    // NOTE(review): the sentences are matched as bare `<p>…</p>` *before* class attributes are
    // stripped below, so a sentence wrapped in `<p class="…">` is not removed — confirm
    // whether that ordering is intended.
    for anti_theft_text in ROYALROAD_ANTI_THEFT_TEXT_ARRAY.iter() {
        // `replace` copies the whole page; skip it for the (many) sentences that are absent.
        if txt.contains(anti_theft_text.as_str()) {
            txt = txt.replace(anti_theft_text, "");
        }
    }
    let txt = ROYALROAD_P_REGEX.replace_all(&txt, "<p>");
    let chapter_page = Html::parse_document(&txt);

    let chapter_title = chapter_page
        .select(&CHAPTER_PAGE_TITLE_SELECTOR)
        .next()
        .map(|heading| {
            decode_html_entities(heading.inner_html().trim_matches(['\n', ' '])).to_string()
        })
        .ok_or_else(|| {
            BackendError::ParseError(format!("Could not find the chapter title in {chapter_url}"))
        })?;
    let chapter_content = chapter_page
        .select(&CHAPTER_PAGE_CONTENT)
        .next()
        .map(|container| container.inner_html())
        .ok_or_else(|| {
            BackendError::ParseError(format!(
                "Could not find the chapter content in {chapter_url}"
            ))
        })?;

    let mut chapter = Chapter::default();
    chapter.set_index(chapter_number);
    chapter.set_title(Some(chapter_title));
    chapter.set_chapter_url(chapter_url);
    chapter.set_fiction_url(self.url());
    chapter.set_published_at(Some(chapter_date.to_utc()));
    chapter.set_metadata(metadata);
    chapter.set_content(chapter_content);
    Ok(chapter)
}
/// Returns the number of chapters listed in the chapter table of the fiction page.
fn get_chapter_count(&self) -> Result<usize, BackendError> {
    // Counting the matched rows needs neither their `href` nor an intermediate `Vec`; reading
    // the attribute here would only add a way to panic on a malformed row.
    Ok(self.fiction_page.select(&CHAPTER_TITLE_SELECTOR).count())
}
}
#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use scraper::Html;
    use test_log::test;

    use crate::backends::RoyalRoad;
    use crate::{Backend, Chapter};

    /// Fiction used by the tests below; they download it, so they need network access.
    const TEST_URL: &str = "https://www.royalroad.com/fiction/21220/mother-of-learning";

    /// A chapter rendered with `to_string` and parsed back must keep every field.
    #[test]
    fn test_chapter_to_string_and_back() {
        let backend = RoyalRoad::new(TEST_URL).unwrap();
        let original = backend.get_chapter(1).unwrap();
        let serialized = original.to_string();
        let restored = Chapter::from_str(&serialized).unwrap();

        assert_eq!(original.index, restored.index);
        assert_eq!(original.title, restored.title);
        assert_eq!(original.chapter_url, restored.chapter_url);
        assert_eq!(original.fiction_url, restored.fiction_url);
        assert_eq!(original.published_at, restored.published_at);
        assert_eq!(original.metadata, restored.metadata);
        // Compare the parsed trees rather than the raw strings.
        let original_content = Html::parse_fragment(&original.content);
        let restored_content = Html::parse_fragment(&restored.content);
        assert_eq!(original_content, restored_content);
    }

    /// The title of a downloaded chapter must equal the title listed for it on the fiction page.
    #[test]
    fn test_chapter_equality() {
        let backend = RoyalRoad::new(TEST_URL).unwrap();
        let mut fetched: Vec<Chapter> = Vec::new();
        for number in 1..=2 {
            fetched.push(backend.get_chapter(number).unwrap());
        }
        let listing = backend.get_chapter_list().unwrap();
        for chapter in fetched {
            let (_, listed_title) = &listing[chapter.index - 1];
            assert_eq!(chapter.title(), &Some(listed_title.clone()));
        }
    }
}