libwebnovel 0.9.3

A Rust crate enabling users to get chapters of a webnovel, with multiple available backends.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::sync::LazyLock;

use chrono::DateTime;
use html_escape::decode_html_entities;
use log::debug;
use regex::Regex;
use scraper::{Html, Selector};

use crate::backends::{Backend, BackendError, ChapterListElem, ChapterOrderingFn};
use crate::utils::get;
use crate::Chapter;

/// Used to return the chapter's <a> in the fiction's chapter list
static CHAPTER_TITLE_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
    Selector::parse("table#chapters tbody tr.chapter-row td:first-child a").unwrap()
});
/// Used to return the date of the chapter in the fiction chapter's list
static CHAPTER_CREATED_AT_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
    Selector::parse("table#chapters tbody tr.chapter-row td:last-child time").unwrap()
});
/// Used to return the authors of the fiction
static FICTION_AUTHORS_SELECTOR: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("meta[property='books:author']").unwrap());
/// Used to return the chapter's title on the chapter page
static CHAPTER_PAGE_TITLE_SELECTOR: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("div.row.fic-header div.row div h1.font-white").unwrap());
/// Used to return the chapter content on the chapter page
static CHAPTER_PAGE_CONTENT: LazyLock<Selector> = LazyLock::new(|| {
    Selector::parse("div.page-container div.page-content-wrapper div.page-content div.container.chapter-page div div div.portlet-body div.chapter-inner.chapter-content").unwrap()
});
/// Used to get the fiction's title on the main fiction page
static FICTION_TITLE_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
    Selector::parse("div.row.fic-header div.fic-title div.col h1.font-white").unwrap()
});

/// Used to return the fiction's cover image
static FICTION_IMAGE_URL_SELECTOR: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("meta[property='og:image']").unwrap());

/// This is text added to RoyalRoad (RR) chapters when reading them outside of
/// RR's website (i guess). I think it is better to remove them since it
/// interrupts the flow of reading, and we know it's from RR, since we are
/// attempting to contact it directly, at the demand of the user. Maybe add a
/// disclaimer as a footing to remind the content is from RR and preserve the
/// spirit of these additions?
/// On other news, if find it nice from RR to put this, from an author
/// standpoint.
const ROYALROAD_ANTI_THEFT_TEXT: &str =
    include_str!("../../ressources/royalroad/known_anti-theft_sentences.txt");

static ROYALROAD_ANTI_THEFT_TEXT_ARRAY: LazyLock<Vec<String>> = LazyLock::new(|| {
    ROYALROAD_ANTI_THEFT_TEXT
        .lines()
        .filter(|s| !s.is_empty())
        .map(|t| format!(r#"<p>{}</p>"#, t))
        .collect()
});

/// Used to identify a chapter URL
static ROYALROAD_CHAPTER_URL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"https?://www\.royalroad\.com/fiction/(?<fiction_id>\d+)/(?<fiction_title_slug>[\w-]+)/chapter/(?<chapter_id>\d+)/(?<chapter_title_slug>[\w-]+)").unwrap()
});

/// Used to strip RR's weird paragraph CSS classes
static ROYALROAD_P_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"<p class=".*">"#).unwrap());

/// A [`Backend`] implementation for [RoyalRoad](https://royalroad.com)
pub struct RoyalRoad {
    url: String,
    fiction_page: Html,
}
impl Default for RoyalRoad {
    fn default() -> Self {
        Self {
            url: "".to_string(),
            fiction_page: Html::new_document(),
        }
    }
}

#[allow(unused_variables, dead_code)]
impl Debug for RoyalRoad {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        #[derive(Debug)]
        struct Royalroad<'a> {
            url: &'a String,
        }
        let Self {
            url,
            fiction_page: _,
        } = self;
        Debug::fmt(&Royalroad { url }, f)
    }
}

/// Builds a new RoyalRoad backend for a given URL.
/// ```
/// use libwebnovel::backends::RoyalRoad;
/// use libwebnovel::Backend;
/// let backend =
///     RoyalRoad::new("https://www.royalroad.com/fiction/21220/mother-of-learning").unwrap();
/// assert_eq!(backend.title().unwrap(), "Mother of Learning");
/// ```
impl Backend for RoyalRoad {
    fn get_backend_regexps() -> Vec<Regex> {
        vec![Regex::new(
            r"https?://www\.royalroad\.com/fiction/(?<fiction_id>\d+)/(?<fiction_title_slug>[\w\-]+)",
        )
        .unwrap()]
    }

    fn get_backend_name() -> &'static str {
        "royalroad"
    }

    /// Returns a function capable of comparing two chapters
    ///
    /// ```rust
    /// use libwebnovel::backends::RoyalRoad;
    /// use libwebnovel::Backend;
    /// let backend =
    ///     RoyalRoad::new("https://www.royalroad.com/fiction/21220/mother-of-learning").unwrap();
    /// let mut chapters = vec![
    ///     backend.get_chapter(2).unwrap(),
    ///     backend.get_chapter(1).unwrap(),
    ///     backend.get_chapter(4).unwrap(),
    ///     backend.get_chapter(3).unwrap(),
    /// ];
    /// chapters.sort_by(RoyalRoad::get_ordering_function());
    /// assert_eq!(
    ///     chapters[0].title(),
    ///     &Some("1. Good Morning Brother".to_string())
    /// );
    /// assert_eq!(
    ///     chapters[1].title(),
    ///     &Some("2. Life’s Little Problems".to_string())
    /// );
    /// assert_eq!(
    ///     chapters[2].title(),
    ///     &Some("3. The Bitter Truth".to_string())
    /// );
    /// assert_eq!(chapters[3].title(), &Some("4. Stars Fell".to_string()));
    /// ```
    fn get_ordering_function() -> ChapterOrderingFn {
        Box::new(|c1: &Chapter, c2: &Chapter| c1.published_at().cmp(c2.published_at()))
    }

    fn new(url: &str) -> Result<Self, BackendError> {
        let req = get(url)?;
        if !req.status().is_success() {
            return Err(BackendError::RequestFailed {
                message: format!("Could not get fiction URL {url}"),
                status: req.status(),
                content: req.text()?,
            });
        }
        Ok(Self {
            url: url.to_string(),
            fiction_page: Html::parse_document(&req.text()?),
        })
    }

    fn title(&self) -> Result<String, BackendError> {
        let title = self
            .fiction_page
            .select(&FICTION_TITLE_SELECTOR)
            .map(|selection| selection.inner_html())
            .next();
        debug!("Got title: {:?}", title);
        if title.is_none() {
            return Err(BackendError::ParseError(format!(
                "Failed to get title from {}",
                self.url
            )));
        }
        Ok(title.unwrap())
    }

    /// ```rust
    /// use libwebnovel::backends::RoyalRoad;
    /// use libwebnovel::Backend;
    /// let backend =
    ///     RoyalRoad::new("https://www.royalroad.com/fiction/21220/mother-of-learning").unwrap();
    /// assert_eq!(
    ///     backend.immutable_identifier().unwrap(),
    ///     "mother-of-learning-21220"
    /// );
    /// ```
    fn immutable_identifier(&self) -> Result<String, BackendError> {
        let regex = &Self::get_backend_regexps()[0];
        let matches = regex.captures(&self.url);
        if let Some(matches) = matches {
            let fiction_id = matches.name("fiction_id").unwrap();
            let fiction_title = matches.name("fiction_title_slug").unwrap();
            Ok(format!(
                "{}-{}",
                fiction_title.as_str(),
                fiction_id.as_str()
            ))
        } else {
            Err(BackendError::ParseError("Unable to parse URL".to_string()))
        }
    }

    fn url(&self) -> String {
        self.url.clone()
    }

    /// Returns the cover URL of the fiction.
    ///
    /// ```rust
    /// use libwebnovel::backends::RoyalRoad;
    /// use libwebnovel::Backend;
    /// let backend =
    ///     RoyalRoad::new("https://www.royalroad.com/fiction/21220/mother-of-learning").unwrap();
    /// let cover_url = backend.cover_url().unwrap();
    /// assert_eq!(cover_url, "https://www.royalroadcdn.com/public/covers-full/21220-mother-of-learning.jpg?time=1637247458");
    /// ```
    fn cover_url(&self) -> Result<String, BackendError> {
        let img_url = self
            .fiction_page
            .select(&FICTION_IMAGE_URL_SELECTOR)
            .next()
            .ok_or(BackendError::ParseError(
                "Could not find fiction cover image url".to_string(),
            ))?
            .attr("content")
            .ok_or(BackendError::ParseError(
                "Could not find property \"content\" when searching for cover image".to_string(),
            ))?;
        Ok(img_url.to_string())
    }

    fn get_authors(&self) -> Result<Vec<String>, BackendError> {
        let authors : Result<Vec<String>, BackendError>=
            self.fiction_page
                .select(&FICTION_AUTHORS_SELECTOR)
                .map(|selection| selection.attr("content").ok_or_else(|| BackendError::ParseError("Failed to find 'content' attribute while looking at <meta property='books:author'>".to_string())).map(|s| s.to_string())).collect();

        let authors = authors.map_err(|e| {
            BackendError::ParseError(format!("Failed to get authors from {}: {}", self.url, e))
        })?;
        if authors.is_empty() {
            return Err(BackendError::ParseError(format!(
                "Failed to get authors from {}: Resulting author list is empty",
                self.url
            )));
        }
        Ok(authors)
    }

    /// Returns the chapter list as available on the main fiction page
    /// ```rust
    /// use libwebnovel::backends::RoyalRoad;
    /// use libwebnovel::Backend;
    /// let backend =
    ///     RoyalRoad::new("https://www.royalroad.com/fiction/21220/mother-of-learning").unwrap();
    /// let chapter_lists = backend.get_chapter_list().unwrap();
    /// let expected_tuples: &[(usize, &str)] = &[
    ///     (1, "1. Good Morning Brother"),
    ///     (2, "2. Life’s Little Problems"),
    ///     (3, "3. The Bitter Truth"),
    /// ];
    /// for (expected_index, expected_title) in expected_tuples {
    ///     assert_eq!(chapter_lists[*expected_index - 1].0, *expected_index);
    ///     assert_eq!(&chapter_lists[*expected_index - 1].1, expected_title);
    /// }
    /// ```
    fn get_chapter_list(&self) -> Result<Vec<ChapterListElem>, BackendError> {
        let results = self
            .fiction_page
            .select(&CHAPTER_TITLE_SELECTOR)
            .enumerate()
            .map(|(index, elem)| {
                let title =
                    decode_html_entities(elem.inner_html().trim_matches('\n').trim()).to_string();
                (index + 1, title)
            })
            .collect();
        Ok(results)
    }

    fn get_chapter(&self, chapter_number: usize) -> Result<Chapter, BackendError> {
        if chapter_number == 0 {
            return Err(BackendError::UnknownChapter(chapter_number));
        }
        // Get che chapter URL
        let chapter_url = self
            .fiction_page
            .select(&CHAPTER_TITLE_SELECTOR)
            .map(|select| select.attr("href").unwrap().to_string())
            .nth(chapter_number - 1)
            .ok_or(BackendError::UnknownChapter(chapter_number))?;
        // Get the chapter publication date
        let chapter_date = self
            .fiction_page
            .select(&CHAPTER_CREATED_AT_SELECTOR)
            .map(|select| DateTime::parse_from_rfc3339(select.attr("datetime").unwrap()))
            .nth(chapter_number - 1)
            .ok_or(BackendError::UnknownChapter(chapter_number))?;
        let chapter_url = format!("https://www.royalroad.com{}", chapter_url);
        let matches = ROYALROAD_CHAPTER_URL_REGEX.captures(&chapter_url).unwrap();
        let metadata = HashMap::from([
            (
                "chapter_id".to_string(),
                matches.name("chapter_id").unwrap().as_str().to_string(),
            ),
            (
                "fiction_id".to_string(),
                matches.name("fiction_id").unwrap().as_str().to_string(),
            ),
        ]);

        debug!("Attempting to get chapter {chapter_url}");
        let res = get(&chapter_url)?;
        if !res.status().is_success() {
            return Err(BackendError::RequestFailed {
                message: format!(
                    "failed to get chapter {} from {}",
                    chapter_number, &chapter_url,
                ),
                status: res.status(),
                content: res.text()?,
            });
        }
        // A bit of text transformation to get rid of RR's anti-theft added text
        let mut txt = res.text()?;
        for anti_theft_text in ROYALROAD_ANTI_THEFT_TEXT_ARRAY.iter() {
            txt = txt.replace(anti_theft_text, "");
        }

        let txt = ROYALROAD_P_REGEX.replace_all(&txt, "<p>").to_string();

        let chapter_page = Html::parse_document(&txt);
        let chapter_title = decode_html_entities(
            chapter_page
                .select(&CHAPTER_PAGE_TITLE_SELECTOR)
                .next()
                .unwrap()
                .inner_html()
                .trim_matches(['\n', ' ']),
        )
        .to_string();
        let chapter_content = chapter_page
            .select(&CHAPTER_PAGE_CONTENT)
            .next()
            .unwrap()
            .inner_html()
            .to_string();
        let mut chapter = Chapter::default();
        chapter.set_index(chapter_number);
        chapter.set_title(Some(chapter_title));
        chapter.set_chapter_url(chapter_url);
        chapter.set_fiction_url(self.url().clone());
        chapter.set_published_at(Some(chapter_date?.to_utc()));
        chapter.set_metadata(metadata);
        chapter.set_content(chapter_content);
        Ok(chapter)
    }

    fn get_chapter_count(&self) -> Result<usize, BackendError> {
        let chapter_urls: Vec<String> = self
            .fiction_page
            .select(&CHAPTER_TITLE_SELECTOR)
            .map(|select| select.attr("href").unwrap().to_string())
            .collect();
        Ok(chapter_urls.len())
    }
}

#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use scraper::Html;
    use test_log::test;

    use crate::backends::RoyalRoad;
    use crate::{Backend, Chapter};

    const TEST_URL: &str = "https://www.royalroad.com/fiction/21220/mother-of-learning";
    #[test]
    fn test_chapter_to_string_and_back() {
        let b = RoyalRoad::new(TEST_URL).unwrap();
        let chapter = b.get_chapter(1).unwrap();
        let s = chapter.to_string();
        let chapter2 = Chapter::from_str(&s).unwrap();
        assert_eq!(chapter.index, chapter2.index);
        assert_eq!(chapter.title, chapter2.title);
        assert_eq!(chapter.chapter_url, chapter2.chapter_url);
        assert_eq!(chapter.fiction_url, chapter2.fiction_url);
        assert_eq!(chapter.published_at, chapter2.published_at);
        assert_eq!(chapter.metadata, chapter2.metadata);
        assert_eq!(
            Html::parse_fragment(&chapter.content),
            Html::parse_fragment(&chapter2.content)
        );
    }

    #[test]
    fn test_chapter_equality() {
        let b = RoyalRoad::new(TEST_URL).unwrap();
        let chapters: Vec<Chapter> = (1..3)
            .map(|index| b.get_chapter(index).unwrap())
            .collect::<Vec<_>>();
        let expected = b.get_chapter_list().unwrap();
        for chapter in chapters {
            assert_eq!(
                chapter.title(),
                &Some(expected[chapter.index - 1].1.clone())
            )
        }
    }
}