1use crate::errors::ScraperError;
2use chrono::{DateTime, Utc};
3use derive_new::new;
4use regex::Regex;
5use reqwest::get;
6use scraper::{Html, Selector};
7use serde_json::Value;
8
9#[derive(Debug, new, PartialEq)]
11pub struct BookMetadata {
12 pub title: String,
14 pub subtitle: Option<String>,
16 pub description: Option<String>,
18 pub publisher: Option<String>,
20 pub publication_date: Option<DateTime<Utc>>,
22 pub isbn: Option<String>,
24 pub contributors: Vec<BookContributor>,
26 pub genres: Vec<String>,
28 pub series: Option<BookSeries>,
30 pub page_count: Option<i64>,
32 pub language: Option<String>,
34 pub image_url: Option<String>,
36}
37
38#[derive(Debug, new, PartialEq)]
40pub struct BookContributor {
41 pub name: String,
43 pub role: String,
45}
46
47#[derive(Debug, new, PartialEq)]
49pub struct BookSeries {
50 pub title: String,
52 pub number: f32,
54}
55
56pub async fn fetch_metadata(goodreads_id: &str) -> Result<BookMetadata, ScraperError> {
57 let metadata = extract_book_metadata(goodreads_id).await?;
58 let amazon_id = extract_amazon_id(&metadata, goodreads_id);
59
60 let (title, subtitle) = extract_title_and_subtitle(&metadata, &amazon_id);
61 let description = extract_description(&metadata, &amazon_id);
62 let image_url = extract_image_url(&metadata, &amazon_id);
63 let contributors = extract_contributors(&metadata, &amazon_id);
64 let genres = extract_genres(&metadata, &amazon_id);
65 let publisher = extract_publisher(&metadata, &amazon_id);
66 let publication_date = extract_publication_date(&metadata, &amazon_id);
67 let isbn = extract_isbn(&metadata, &amazon_id);
68 let page_count = extract_page_count(&metadata, &amazon_id);
69 let language = extract_language(&metadata, &amazon_id);
70 let series = extract_series(&metadata, &amazon_id);
71
72 let metadata = BookMetadata::new(
73 title,
74 subtitle,
75 description,
76 publisher,
77 publication_date,
78 isbn,
79 contributors,
80 genres,
81 series,
82 page_count,
83 language,
84 image_url,
85 );
86
87 Ok(metadata)
88}
89
90async fn extract_book_metadata(goodreads_id: &str) -> Result<Value, ScraperError> {
91 let url = format!("https://www.goodreads.com/book/show/{}", goodreads_id);
92 let document = Html::parse_document(&get(&url).await?.text().await?);
93 let metadata_selector = Selector::parse(r#"script[id="__NEXT_DATA__"]"#)?;
94 let metadata: Value = serde_json::from_str(
95 &document
96 .select(&metadata_selector)
97 .next()
98 .expect("Failed to find metadata script")
99 .text()
100 .collect::<String>(),
101 )?;
102 Ok(metadata)
103}
104
105fn extract_amazon_id(metadata: &Value, goodreads_id: &str) -> String {
106 let amazon_id_key = format!("getBookByLegacyId({{\"legacyId\":\"{}\"}})", goodreads_id);
107 let amazon_id =
108 &metadata["props"]["pageProps"]["apolloState"]["ROOT_QUERY"][amazon_id_key]["__ref"];
109 to_string(amazon_id).expect("Amazon ID must be present")
110}
111
112fn extract_title_and_subtitle(metadata: &Value, amazon_id: &str) -> (String, Option<String>) {
113 let title = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["title"];
114 let title = to_string(title).expect("Title must be present");
115
116 match title.split_once(":") {
117 Some((title, subtitle)) => (title.to_string(), Some(subtitle.trim().to_string())),
118 None => (title.to_string(), None),
119 }
120}
121
122fn extract_description(metadata: &Value, amazon_id: &str) -> Option<String> {
123 let description = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["description"];
124 to_string(description)
125}
126
127fn extract_image_url(metadata: &Value, amazon_id: &str) -> Option<String> {
128 let url = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["imageUrl"];
129 to_string(url)
130}
131
132fn extract_contributors(metadata: &Value, amazon_id: &str) -> Vec<BookContributor> {
133 let mut contributors = Vec::new();
134
135 let primary = metadata["props"]["pageProps"]["apolloState"][amazon_id]
136 ["primaryContributorEdge"]
137 .as_object()
138 .map(|obj| {
139 (
140 to_string(&obj["role"]).expect("Contributor role must be present"),
141 to_string(&obj["node"]["__ref"]).expect("Contributor key must be present"),
142 )
143 })
144 .expect("Primary contributor must be an object");
145
146 contributors.push(fetch_contributor(metadata, primary));
147
148 let secondary = metadata["props"]["pageProps"]["apolloState"][amazon_id]
149 ["secondaryContributorEdges"]
150 .as_array()
151 .expect("Secondary contributors must be an array");
152
153 for contributor in secondary {
154 let role = to_string(&contributor["role"]).expect("Contributor role must be present");
155 let key =
156 to_string(&contributor["node"]["__ref"]).expect("Contributor key must be present");
157 contributors.push(fetch_contributor(metadata, (role, key)));
158 }
159
160 contributors
161}
162
163fn fetch_contributor(metadata: &Value, (role, key): (String, String)) -> BookContributor {
164 let contributor = &metadata["props"]["pageProps"]["apolloState"][key]["name"];
165 let name = to_string(contributor).expect("Contributor name must be present");
166 BookContributor::new(name, role)
167}
168
169fn extract_genres(metadata: &Value, amazon_id: &str) -> Vec<String> {
170 metadata["props"]["pageProps"]["apolloState"][amazon_id]["bookGenres"]
171 .as_array()
172 .expect("Genres must be an array")
173 .iter()
174 .map(|genre| to_string(&genre["genre"]["name"]).expect("Genre name must be present"))
175 .collect()
176}
177
178fn extract_publisher(metadata: &Value, amazon_id: &str) -> Option<String> {
179 let publisher =
180 &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["publisher"];
181 to_string(publisher)
182}
183
184fn extract_publication_date(metadata: &Value, amazon_id: &str) -> Option<DateTime<Utc>> {
185 metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["publicationTime"]
186 .as_i64()
187 .map(DateTime::from_timestamp_millis)
188 .expect("Publication date must be a timestamp")
189}
190
191fn extract_isbn(metadata: &Value, amazon_id: &str) -> Option<String> {
192 let isbn = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["isbn"];
193 to_string(isbn)
194}
195
196fn extract_page_count(metadata: &Value, amazon_id: &str) -> Option<i64> {
197 metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["numPages"]
198 .as_i64()
199}
200
201fn extract_language(metadata: &Value, amazon_id: &str) -> Option<String> {
202 let language = &metadata["props"]["pageProps"]["apolloState"][amazon_id]["details"]["language"]["name"];
203 to_string(language)
204}
205
206fn extract_series(metadata: &Value, amazon_id: &str) -> Option<BookSeries> {
207 let series_array = metadata["props"]["pageProps"]["apolloState"][amazon_id]["bookSeries"]
208 .as_array()
209 .expect("Book series must be an array");
210
211 if let Some(series) = series_array.first() {
212 let position = series["userPosition"]
213 .as_str()
214 .expect("Series position must be present")
215 .parse::<f32>()
216 .expect("Series position must be a number");
217
218 let key = to_string(&series["series"]["__ref"]).expect("Series key must be present");
219
220 let title = &metadata["props"]["pageProps"]["apolloState"][key]["title"];
221 let title = to_string(title).expect("Series title must be present");
222
223 Some(BookSeries::new(title, position))
224 } else {
225 None
226 }
227}
228
229fn to_string(value: &Value) -> Option<String> {
230 let re = Regex::new(r"\s{2,}").expect("Regex must be valid");
231 value
232 .as_str()
233 .map(|s| s.trim())
234 .map(|s| re.replace_all(s, " ").to_string())
235}
236
#[cfg(test)]
mod tests {
    use super::*;

    /// Fetches a known book from the live site and compares every extracted
    /// field against a fixed snapshot.
    #[tokio::test]
    async fn fetch_metadata_test() {
        let genres: Vec<String> = [
            "Fantasy",
            "Young Adult",
            "Mythology",
            "Fiction",
            "Middle Grade",
            "Adventure",
            "Greek Mythology",
            "Urban Fantasy",
            "Childrens",
            "Audiobook",
        ]
        .iter()
        .map(|genre| String::from(*genre))
        .collect();

        let description = "All year the half-bloods have been preparing for battle against the Titans, knowing the odds of victory are grim. \
            Kronos's army is stronger than ever, and with every god and half-blood he recruits, the evil Titan's power only grows.\
            <br /><br />While the Olympians struggle to contain the rampaging monster Typhon, Kronos begins his advance on New York City, \
            where Mount Olympus stands virtually unguarded. Now it's up to Percy Jackson and an army of young demigods to stop the Lord of Time. \
            <br /><br />In this momentous final book in the <i>New York Times</i> best-selling series, the long-awaited prophecy surrounding \
            Percy's sixteenth birthday unfolds. And as the battle for Western civilization rages on the streets of Manhattan, Percy faces a \
            terrifying suspicion that he may be fighting against his own fate.";

        let published = DateTime::parse_from_rfc3339("2009-05-05T07:00:00Z")
            .unwrap()
            .with_timezone(&Utc);

        let expected = BookMetadata {
            title: String::from("The Last Olympian"),
            subtitle: None,
            description: Some(String::from(description)),
            publisher: Some(String::from("Disney-Hyperion Books")),
            publication_date: Some(published),
            isbn: Some(String::from("1423101472")),
            contributors: vec![BookContributor {
                name: String::from("Rick Riordan"),
                role: String::from("Author"),
            }],
            genres,
            series: Some(BookSeries {
                title: String::from("Percy Jackson and the Olympians"),
                number: 5.0,
            }),
            page_count: Some(381),
            language: Some(String::from("English")),
            image_url: Some(String::from("https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1723393514i/4556058.jpg")),
        };

        let actual = fetch_metadata("4556058").await.unwrap();
        assert_eq!(actual, expected);
    }
}