grscraper/
metadata_fetcher.rs

1use crate::errors::ScraperError;
2use chrono::{DateTime, Utc};
3use derive_new::new;
4use log::{error, warn};
5use regex::Regex;
6use reqwest::get;
7use scraper::{Html, Selector};
8use serde_json::Value;
9
10/// The primary data structure containing the metadata of a book.
11#[derive(Debug, new, PartialEq)]
12pub struct BookMetadata {
13    /// The main title of the book.
14    pub title: String,
15    /// An optional subtitle of the book.
16    pub subtitle: Option<String>,
17    /// An optional description or summary of the book.
18    pub description: Option<String>,
19    /// The publisher of the book, if available.
20    pub publisher: Option<String>,
21    /// The publication date of the book, represented as a UTC datetime.
22    pub publication_date: Option<DateTime<Utc>>,
23    /// The ISBN of the book, if available.
24    pub isbn: Option<String>,
25    /// A list of contributors to the book, each represented as a `BookContributor`.
26    pub contributors: Vec<BookContributor>,
27    /// A list of genres associated with the book.
28    pub genres: Vec<String>,
29    /// The series information, if the book is part of a series, represented as a `BookSeries`.
30    pub series: Option<BookSeries>,
31    /// The number of pages in the book, if available.
32    pub page_count: Option<i64>,
33    /// The language of the book, if available.
34    pub language: Option<String>,
35    /// A URL to an image of the book's cover, if available.
36    pub image_url: Option<String>,
37}
38
39/// Represents an individual who contributed to the book, such as an author or editor.
40#[derive(Debug, new, PartialEq)]
41pub struct BookContributor {
42    /// The name of the contributor.
43    pub name: String,
44    /// The role of the contributor, such as "Author" or "Illustrator".
45    pub role: String,
46}
47
48/// Represents series information for a book, including the series title and book's position within the series.
49#[derive(Debug, new, PartialEq)]
50pub struct BookSeries {
51    /// The title of the series.
52    pub title: String,
53    /// The position of the book within the series, represented as a float to accommodate cases like "1.5".
54    pub number: f32,
55}
56
57pub async fn fetch_metadata(goodreads_id: &str) -> Result<BookMetadata, ScraperError> {
58    let metadata = extract_book_metadata(goodreads_id).await?;
59    let amazon_id = extract_amazon_id(&metadata, goodreads_id)?;
60
61    let (title, subtitle) = extract_title_and_subtitle(&metadata, &amazon_id)?;
62    let description = extract_description(&metadata, &amazon_id);
63    let image_url = extract_image_url(&metadata, &amazon_id);
64    let contributors = extract_contributors(&metadata, &amazon_id);
65    let genres = extract_genres(&metadata, &amazon_id);
66    let publisher = extract_publisher(&metadata, &amazon_id);
67    let publication_date = extract_publication_date(&metadata, &amazon_id);
68    let isbn = extract_isbn(&metadata, &amazon_id);
69    let page_count = extract_page_count(&metadata, &amazon_id);
70    let language = extract_language(&metadata, &amazon_id);
71    let series = extract_series(&metadata, &amazon_id);
72
73    let metadata = BookMetadata::new(
74        title,
75        subtitle,
76        description,
77        publisher,
78        publication_date,
79        isbn,
80        contributors,
81        genres,
82        series,
83        page_count,
84        language,
85        image_url,
86    );
87
88    Ok(metadata)
89}
90
91async fn extract_book_metadata(goodreads_id: &str) -> Result<Value, ScraperError> {
92    let url = format!("https://www.goodreads.com/book/show/{}", goodreads_id);
93    let document = Html::parse_document(&get(&url).await?.text().await?);
94    let metadata_selector = Selector::parse(r#"script[id="__NEXT_DATA__"]"#)?;
95    let metadata = &document.select(&metadata_selector).next();
96
97    let metadata = match metadata {
98        None => {
99            error!("Failed to scrape book metadata");
100            return Err(ScraperError::ScrapeError(
101                "Failed to scrape book metadata".to_string(),
102            ));
103        }
104        Some(m) => serde_json::from_str(&m.text().collect::<String>())?,
105    };
106
107    Ok(metadata)
108}
109
110fn extract_amazon_id(metadata: &Value, goodreads_id: &str) -> Result<String, ScraperError> {
111    let amazon_id_key = format!("getBookByLegacyId({{\"legacyId\":\"{}\"}})", goodreads_id);
112    let amazon_id =
113        &metadata["props"]["pageProps"]["apolloState"]["ROOT_QUERY"][amazon_id_key]["__ref"];
114    let amazon_id = match to_string(amazon_id) {
115        None => {
116            error!("Failed to scrape Amazon ID");
117            return Err(ScraperError::ScrapeError(
118                "Failed to scrape Amazon ID".to_string(),
119            ));
120        }
121        Some(id) => id,
122    };
123
124    Ok(amazon_id)
125}
126
127fn extract_title_and_subtitle(
128    metadata: &Value,
129    amazon_id: &str,
130) -> Result<(String, Option<String>), ScraperError> {
131    let title = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["title"];
132    let title = match to_string(title) {
133        None => {
134            error!("Failed to scrape book title");
135            return Err(ScraperError::ScrapeError(
136                "Failed to scrape book title".to_string(),
137            ));
138        }
139        Some(t) => t,
140    };
141
142    match title.split_once(":") {
143        Some((title, subtitle)) => Ok((title.to_string(), Some(subtitle.trim().to_string()))),
144        None => Ok((title.to_string(), None)),
145    }
146}
147
148fn extract_description(metadata: &Value, amazon_id: &str) -> Option<String> {
149    let description = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["description"];
150    to_string(description)
151}
152
153fn extract_image_url(metadata: &Value, amazon_id: &str) -> Option<String> {
154    let url = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["imageUrl"];
155    to_string(url)
156}
157
158fn extract_contributors(metadata: &Value, amazon_id: &str) -> Vec<BookContributor> {
159    let mut contributors = Vec::new();
160
161    let primary = metadata["props"]["pageProps"]["apolloState"][amazon_id]
162        ["primaryContributorEdge"]
163        .as_object()
164        .map(|obj| (to_string(&obj["role"]), to_string(&obj["node"]["__ref"])));
165
166    match primary {
167        Some((Some(role), Some(reference))) => {
168            if let Some(contributor) = fetch_contributor(metadata, (role, reference)) {
169                contributors.push(contributor);
170            }
171        }
172        Some(_) => {
173            warn!("Failed to parse contributor");
174        }
175        None => (),
176    }
177
178    let Some(secondary) = metadata["props"]["pageProps"]["apolloState"][amazon_id]
179        ["secondaryContributorEdges"]
180        .as_array()
181    else {
182        return contributors
183            .into_iter()
184            .filter(|s| !s.name.to_lowercase().eq("unknown author"))
185            .collect();
186    };
187
188    for contributor in secondary {
189        let role = to_string(&contributor["role"]);
190        let key = to_string(&contributor["node"]["__ref"]);
191        if role.is_none() || key.is_none() {
192            warn!("Failed to parse contributor");
193            continue;
194        }
195
196        if let Some(contributor) = fetch_contributor(metadata, (role.unwrap(), key.unwrap())) {
197            contributors.push(contributor);
198        }
199    }
200
201    contributors
202        .into_iter()
203        .filter(|s| !s.name.to_lowercase().eq("unknown author"))
204        .collect()
205}
206
207fn fetch_contributor(metadata: &Value, (role, key): (String, String)) -> Option<BookContributor> {
208    let contributor = &metadata["props"]["pageProps"]["apolloState"][key]["name"];
209    let name = to_string(contributor);
210    if name.is_none() {
211        warn!("Failed to parse contributor")
212    }
213
214    name.map(|n| BookContributor::new(n, role))
215}
216
217fn extract_genres(metadata: &Value, amazon_id: &str) -> Vec<String> {
218    let genres = metadata["props"]["pageProps"]["apolloState"][amazon_id]["bookGenres"].as_array();
219
220    let Some(genres) = genres else {
221        return vec![];
222    };
223
224    genres
225        .iter()
226        .filter_map(|genre| {
227            to_string(&genre["genre"]["name"]).or_else(|| {
228                warn!("Failed to parse genre name");
229                None
230            })
231        })
232        .collect()
233}
234
235fn extract_publisher(metadata: &Value, amazon_id: &str) -> Option<String> {
236    let publisher =
237        &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["publisher"];
238    to_string(publisher)
239}
240
241fn extract_publication_date(metadata: &Value, amazon_id: &str) -> Option<DateTime<Utc>> {
242    match &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["publicationTime"] {
243        Value::Null => None,
244        Value::Number(number) => {
245            let timestamp = number.as_i64().map(DateTime::from_timestamp_millis);
246
247            if timestamp.is_none() {
248                warn!("Failed to parse publication date");
249            }
250
251            timestamp.flatten()
252        }
253        _ => panic!("Publication date must be a timestamp"),
254    }
255}
256
257fn extract_isbn(metadata: &Value, amazon_id: &str) -> Option<String> {
258    let isbn = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["isbn"];
259    match to_string(isbn) {
260        Some(i) => return Some(i),
261        None => (),
262    };
263
264    let isbn13 = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["isbn13"];
265    match to_string(isbn13) {
266        Some(i) => return Some(i),
267        None => (),
268    };
269
270    let asin = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["asin"];
271    to_string(asin)
272}
273
274fn extract_page_count(metadata: &Value, amazon_id: &str) -> Option<i64> {
275    let count =
276        metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["numPages"].as_i64();
277    match count {
278        Some(0) => return None,
279        c => return c,
280    }
281}
282
283fn extract_language(metadata: &Value, amazon_id: &str) -> Option<String> {
284    let language =
285        &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["language"]["name"];
286    to_string(language)
287}
288
289fn extract_series(metadata: &Value, amazon_id: &str) -> Option<BookSeries> {
290    let series_array =
291        match metadata["props"]["pageProps"]["apolloState"][amazon_id]["bookSeries"].as_array() {
292            None => return None,
293            Some(s) => s,
294        };
295
296    let series = match series_array.first() {
297        None => return None,
298        Some(s) => s,
299    };
300
301    let Some(position) = series["userPosition"]
302        .as_str()
303        .map(|s| s.split('-').next().unwrap_or(""))
304        .map(|s| s.parse::<f32>().ok())
305        .flatten()
306    else {
307        warn!("Failed to parse series number");
308        return None;
309    };
310
311    let key = match to_string(&series["series"]["__ref"]) {
312        None => {
313            warn!("Failed to parse series key");
314            return None;
315        }
316        Some(k) => k,
317    };
318
319    let title = &metadata["props"]["pageProps"]["apolloState"][key]["title"];
320    let title = match to_string(title) {
321        None => {
322            warn!("Failed to parse series title");
323            return None;
324        }
325        Some(t) => t,
326    };
327
328    Some(BookSeries::new(title, position))
329}
330
331fn to_string(value: &Value) -> Option<String> {
332    let re = Regex::new(r"\s{2,}").expect("Regex must be valid");
333    value
334        .as_str()
335        .map(|s| s.trim())
336        .map(|s| re.replace_all(s, " ").to_string())
337        .filter(|s| !s.is_empty())
338}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Calls `fetch_metadata` for Goodreads id `4556058` and compares every field of the result
    /// with the expected values below. The call performs a real request to goodreads.com.
    #[tokio::test]
    async fn fetch_metadata_test() {
        let description = "All year the half-bloods have been preparing for battle against the Titans, knowing the odds of victory are grim. \
            Kronos's army is stronger than ever, and with every god and half-blood he recruits, the evil Titan's power only grows.\
            <br /><br />While the Olympians struggle to contain the rampaging monster Typhon, Kronos begins his advance on New York City, \
            where Mount Olympus stands virtually unguarded. Now it's up to Percy Jackson and an army of young demigods to stop the Lord of Time. \
            <br /><br />In this momentous final book in the <i>New York Times</i> best-selling series, the long-awaited prophecy surrounding \
            Percy's sixteenth birthday unfolds. And as the battle for Western civilization rages on the streets of Manhattan, Percy faces a \
            terrifying suspicion that he may be fighting against his own fate.";
        let genre_names = [
            "Fantasy",
            "Young Adult",
            "Mythology",
            "Fiction",
            "Percy Jackson",
            "Middle Grade",
            "Adventure",
            "Greek Mythology",
            "Urban Fantasy",
            "Childrens",
        ];

        let expected_metadata = BookMetadata {
            title: String::from("The Last Olympian"),
            subtitle: None,
            description: Some(String::from(description)),
            publisher: Some(String::from("Disney-Hyperion Books")),
            publication_date: Some(
                DateTime::parse_from_rfc3339("2009-05-05T07:00:00Z")
                    .unwrap()
                    .to_utc(),
            ),
            isbn: Some(String::from("1423101472")),
            contributors: vec![BookContributor {
                name: String::from("Rick Riordan"),
                role: String::from("Author"),
            }],
            genres: genre_names.iter().map(|name| name.to_string()).collect(),
            series: Some(BookSeries {
                title: String::from("Percy Jackson and the Olympians"),
                number: 5.0,
            }),
            page_count: Some(381),
            language: Some(String::from("English")),
            image_url: Some(String::from("https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1723393514i/4556058.jpg")),
        };

        let metadata = fetch_metadata("4556058").await.unwrap();
        assert_eq!(metadata, expected_metadata);
    }
}
390}