use std::sync::OnceLock;

use chrono::{DateTime, Utc};
use derive_new::new;
use log::{error, warn};
use regex::Regex;
use reqwest::get;
use scraper::{Html, Selector};
use serde_json::Value;

use crate::errors::ScraperError;
9
/// Metadata for a single book, scraped from its Goodreads page.
///
/// Only `title` is guaranteed; every other field is best-effort and absent
/// (`None` / empty `Vec`) when the page does not provide it.
#[derive(Debug, new, PartialEq)]
pub struct BookMetadata {
    /// Main title (text before the first ':' of the raw Goodreads title).
    pub title: String,
    /// Subtitle (text after the first ':' of the raw title), if any.
    pub subtitle: Option<String>,
    /// Description as provided by Goodreads; may contain HTML markup.
    pub description: Option<String>,
    pub publisher: Option<String>,
    /// Publication timestamp, UTC.
    pub publication_date: Option<DateTime<Utc>>,
    /// ISBN-10 when present, otherwise ISBN-13, otherwise the Amazon ASIN.
    pub isbn: Option<String>,
    /// Authors, translators, illustrators, etc. Excludes "Unknown Author".
    pub contributors: Vec<BookContributor>,
    pub genres: Vec<String>,
    /// First series the book belongs to, if any.
    pub series: Option<BookSeries>,
    /// Page count; a Goodreads value of 0 is treated as unknown (`None`).
    pub page_count: Option<i64>,
    pub language: Option<String>,
    /// Cover image URL.
    pub image_url: Option<String>,
}
38
/// A person credited on a book, together with their role.
#[derive(Debug, new, PartialEq)]
pub struct BookContributor {
    pub name: String,
    /// Role string as reported by Goodreads (e.g. "Author").
    pub role: String,
}
47
/// A series membership: series title and the book's position within it.
#[derive(Debug, new, PartialEq)]
pub struct BookSeries {
    pub title: String,
    /// Position in the series; fractional values (e.g. 1.5) are possible.
    pub number: f32,
}
56
57pub async fn fetch_metadata(goodreads_id: &str) -> Result<BookMetadata, ScraperError> {
58 let metadata = extract_book_metadata(goodreads_id).await?;
59 let amazon_id = extract_amazon_id(&metadata, goodreads_id)?;
60
61 let (title, subtitle) = extract_title_and_subtitle(&metadata, &amazon_id)?;
62 let description = extract_description(&metadata, &amazon_id);
63 let image_url = extract_image_url(&metadata, &amazon_id);
64 let contributors = extract_contributors(&metadata, &amazon_id);
65 let genres = extract_genres(&metadata, &amazon_id);
66 let publisher = extract_publisher(&metadata, &amazon_id);
67 let publication_date = extract_publication_date(&metadata, &amazon_id);
68 let isbn = extract_isbn(&metadata, &amazon_id);
69 let page_count = extract_page_count(&metadata, &amazon_id);
70 let language = extract_language(&metadata, &amazon_id);
71 let series = extract_series(&metadata, &amazon_id);
72
73 let metadata = BookMetadata::new(
74 title,
75 subtitle,
76 description,
77 publisher,
78 publication_date,
79 isbn,
80 contributors,
81 genres,
82 series,
83 page_count,
84 language,
85 image_url,
86 );
87
88 Ok(metadata)
89}
90
91async fn extract_book_metadata(goodreads_id: &str) -> Result<Value, ScraperError> {
92 let url = format!("https://www.goodreads.com/book/show/{}", goodreads_id);
93 let document = Html::parse_document(&get(&url).await?.text().await?);
94 let metadata_selector = Selector::parse(r#"script[id="__NEXT_DATA__"]"#)?;
95 let metadata = &document.select(&metadata_selector).next();
96
97 let metadata = match metadata {
98 None => {
99 error!("Failed to scrape book metadata");
100 return Err(ScraperError::ScrapeError(
101 "Failed to scrape book metadata".to_string(),
102 ));
103 }
104 Some(m) => serde_json::from_str(&m.text().collect::<String>())?,
105 };
106
107 Ok(metadata)
108}
109
110fn extract_amazon_id(metadata: &Value, goodreads_id: &str) -> Result<String, ScraperError> {
111 let amazon_id_key = format!("getBookByLegacyId({{\"legacyId\":\"{}\"}})", goodreads_id);
112 let amazon_id =
113 &metadata["props"]["pageProps"]["apolloState"]["ROOT_QUERY"][amazon_id_key]["__ref"];
114 let amazon_id = match to_string(amazon_id) {
115 None => {
116 error!("Failed to scrape Amazon ID");
117 return Err(ScraperError::ScrapeError(
118 "Failed to scrape Amazon ID".to_string(),
119 ));
120 }
121 Some(id) => id,
122 };
123
124 Ok(amazon_id)
125}
126
127fn extract_title_and_subtitle(
128 metadata: &Value,
129 amazon_id: &str,
130) -> Result<(String, Option<String>), ScraperError> {
131 let title = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["title"];
132 let title = match to_string(title) {
133 None => {
134 error!("Failed to scrape book title");
135 return Err(ScraperError::ScrapeError(
136 "Failed to scrape book title".to_string(),
137 ));
138 }
139 Some(t) => t,
140 };
141
142 match title.split_once(":") {
143 Some((title, subtitle)) => Ok((title.to_string(), Some(subtitle.trim().to_string()))),
144 None => Ok((title.to_string(), None)),
145 }
146}
147
148fn extract_description(metadata: &Value, amazon_id: &str) -> Option<String> {
149 let description = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["description"];
150 to_string(description)
151}
152
153fn extract_image_url(metadata: &Value, amazon_id: &str) -> Option<String> {
154 let url = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["imageUrl"];
155 to_string(url)
156}
157
158fn extract_contributors(metadata: &Value, amazon_id: &str) -> Vec<BookContributor> {
159 let mut contributors = Vec::new();
160
161 let primary = metadata["props"]["pageProps"]["apolloState"][amazon_id]
162 ["primaryContributorEdge"]
163 .as_object()
164 .map(|obj| (to_string(&obj["role"]), to_string(&obj["node"]["__ref"])));
165
166 match primary {
167 Some((Some(role), Some(reference))) => {
168 if let Some(contributor) = fetch_contributor(metadata, (role, reference)) {
169 contributors.push(contributor);
170 }
171 }
172 Some(_) => {
173 warn!("Failed to parse contributor");
174 }
175 None => (),
176 }
177
178 let Some(secondary) = metadata["props"]["pageProps"]["apolloState"][amazon_id]
179 ["secondaryContributorEdges"]
180 .as_array()
181 else {
182 return contributors
183 .into_iter()
184 .filter(|s| !s.name.to_lowercase().eq("unknown author"))
185 .collect();
186 };
187
188 for contributor in secondary {
189 let role = to_string(&contributor["role"]);
190 let key = to_string(&contributor["node"]["__ref"]);
191 if role.is_none() || key.is_none() {
192 warn!("Failed to parse contributor");
193 continue;
194 }
195
196 if let Some(contributor) = fetch_contributor(metadata, (role.unwrap(), key.unwrap())) {
197 contributors.push(contributor);
198 }
199 }
200
201 contributors
202 .into_iter()
203 .filter(|s| !s.name.to_lowercase().eq("unknown author"))
204 .collect()
205}
206
207fn fetch_contributor(metadata: &Value, (role, key): (String, String)) -> Option<BookContributor> {
208 let contributor = &metadata["props"]["pageProps"]["apolloState"][key]["name"];
209 let name = to_string(contributor);
210 if name.is_none() {
211 warn!("Failed to parse contributor")
212 }
213
214 name.map(|n| BookContributor::new(n, role))
215}
216
217fn extract_genres(metadata: &Value, amazon_id: &str) -> Vec<String> {
218 let genres = metadata["props"]["pageProps"]["apolloState"][amazon_id]["bookGenres"].as_array();
219
220 let Some(genres) = genres else {
221 return vec![];
222 };
223
224 genres
225 .iter()
226 .filter_map(|genre| {
227 to_string(&genre["genre"]["name"]).or_else(|| {
228 warn!("Failed to parse genre name");
229 None
230 })
231 })
232 .collect()
233}
234
235fn extract_publisher(metadata: &Value, amazon_id: &str) -> Option<String> {
236 let publisher =
237 &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["publisher"];
238 to_string(publisher)
239}
240
241fn extract_publication_date(metadata: &Value, amazon_id: &str) -> Option<DateTime<Utc>> {
242 match &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["publicationTime"] {
243 Value::Null => None,
244 Value::Number(number) => {
245 let timestamp = number.as_i64().map(DateTime::from_timestamp_millis);
246
247 if timestamp.is_none() {
248 warn!("Failed to parse publication date");
249 }
250
251 timestamp.flatten()
252 }
253 _ => panic!("Publication date must be a timestamp"),
254 }
255}
256
257fn extract_isbn(metadata: &Value, amazon_id: &str) -> Option<String> {
258 let isbn = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["isbn"];
259 match to_string(isbn) {
260 Some(i) => return Some(i),
261 None => (),
262 };
263
264 let isbn13 = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["isbn13"];
265 match to_string(isbn13) {
266 Some(i) => return Some(i),
267 None => (),
268 };
269
270 let asin = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["asin"];
271 to_string(asin)
272}
273
274fn extract_page_count(metadata: &Value, amazon_id: &str) -> Option<i64> {
275 let count =
276 metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["numPages"].as_i64();
277 match count {
278 Some(0) => return None,
279 c => return c,
280 }
281}
282
283fn extract_language(metadata: &Value, amazon_id: &str) -> Option<String> {
284 let language =
285 &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["language"]["name"];
286 to_string(language)
287}
288
289fn extract_series(metadata: &Value, amazon_id: &str) -> Option<BookSeries> {
290 let series_array =
291 match metadata["props"]["pageProps"]["apolloState"][amazon_id]["bookSeries"].as_array() {
292 None => return None,
293 Some(s) => s,
294 };
295
296 let series = match series_array.first() {
297 None => return None,
298 Some(s) => s,
299 };
300
301 let Some(position) = series["userPosition"]
302 .as_str()
303 .map(|s| s.split('-').next().unwrap_or(""))
304 .map(|s| s.parse::<f32>().ok())
305 .flatten()
306 else {
307 warn!("Failed to parse series number");
308 return None;
309 };
310
311 let key = match to_string(&series["series"]["__ref"]) {
312 None => {
313 warn!("Failed to parse series key");
314 return None;
315 }
316 Some(k) => k,
317 };
318
319 let title = &metadata["props"]["pageProps"]["apolloState"][key]["title"];
320 let title = match to_string(title) {
321 None => {
322 warn!("Failed to parse series title");
323 return None;
324 }
325 Some(t) => t,
326 };
327
328 Some(BookSeries::new(title, position))
329}
330
331fn to_string(value: &Value) -> Option<String> {
332 let re = Regex::new(r"\s{2,}").expect("Regex must be valid");
333 value
334 .as_str()
335 .map(|s| s.trim())
336 .map(|s| re.replace_all(s, " ").to_string())
337 .filter(|s| !s.is_empty())
338}
339
#[cfg(test)]
mod tests {
    use super::*;

    // Live integration test: fetches the real Goodreads page for
    // "The Last Olympian" (book id 4556058) over the network and compares the
    // full scraped metadata against a hand-checked snapshot.
    //
    // NOTE(review): this test requires network access and will break whenever
    // Goodreads edits the book's page data (description text, genre list,
    // image URL, etc.) — failures here may indicate stale expectations rather
    // than a scraper bug.
    #[tokio::test]
    async fn fetch_metadata_test() {
        let expected_series = Some(BookSeries::new(
            "Percy Jackson and the Olympians".to_string(),
            5.0,
        ));
        let expected_contributors = vec![BookContributor::new(
            "Rick Riordan".to_string(),
            "Author".to_string(),
        )];
        let expected_genres = vec![
            "Fantasy".to_string(),
            "Young Adult".to_string(),
            "Mythology".to_string(),
            "Fiction".to_string(),
            "Percy Jackson".to_string(),
            "Middle Grade".to_string(),
            "Adventure".to_string(),
            "Greek Mythology".to_string(),
            "Urban Fantasy".to_string(),
            "Childrens".to_string(),
        ];
        let expected_metadata = BookMetadata::new(
            "The Last Olympian".to_string(),
            None,
            Some("All year the half-bloods have been preparing for battle against the Titans, knowing the odds of victory are grim. \
            Kronos's army is stronger than ever, and with every god and half-blood he recruits, the evil Titan's power only grows.\
            <br /><br />While the Olympians struggle to contain the rampaging monster Typhon, Kronos begins his advance on New York City, \
            where Mount Olympus stands virtually unguarded. Now it's up to Percy Jackson and an army of young demigods to stop the Lord of Time. \
            <br /><br />In this momentous final book in the <i>New York Times</i> best-selling series, the long-awaited prophecy surrounding \
            Percy's sixteenth birthday unfolds. And as the battle for Western civilization rages on the streets of Manhattan, Percy faces a \
            terrifying suspicion that he may be fighting against his own fate.".to_string()),
            Some("Disney-Hyperion Books".to_string()),
            Some(DateTime::parse_from_rfc3339("2009-05-05T07:00:00Z").unwrap().to_utc()),
            Some("1423101472".to_string()),
            expected_contributors,
            expected_genres,
            expected_series,
            Some(381),
            Some("English".to_string()),
            Some("https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1723393514i/4556058.jpg".to_string()),
        );

        let metadata = fetch_metadata("4556058").await.unwrap();
        assert_eq!(metadata, expected_metadata);
    }
}