goodreads-metadata-scraper 0.2.5

use crate::errors::ScraperError;
use reqwest::get;
use scraper::{Html, Selector};
use serde_json::Value;
use urlencoding::encode;

/// Checks whether a Goodreads book page exists for the given ID.
///
/// Requests `https://www.goodreads.com/book/show/{id}` and returns `true`
/// when the response has a successful HTTP status.
///
/// Returns `false` both when the server answers with a non-success status
/// and when the request itself cannot be completed (for example on a
/// connection failure), so this function never panics on network errors.
pub async fn verify_id_exists(id: &str) -> bool {
    let url = format!("https://www.goodreads.com/book/show/{id}");

    // A transport failure is a recoverable condition for callers of a
    // library, so report "not verified" rather than aborting the program.
    match get(&url).await {
        Ok(response) => response.status().is_success(),
        Err(_) => false,
    }
}

/// Looks up the Goodreads ID of a book from its ISBN.
///
/// Queries the Goodreads search endpoint with the URL-encoded ISBN and reads
/// the JSON held in the `__NEXT_DATA__` script element of the returned page.
/// The ID is the leading run of digits of the
/// `props.pageProps.params.book_id` string in that JSON.
///
/// # Returns
///
/// * `Ok(Some(id))` when a numeric ID could be extracted.
/// * `Ok(None)` when the page has no `__NEXT_DATA__` script, when `book_id`
///   is missing or not a string, or when it does not start with a digit.
///
/// # Errors
///
/// Returns a [`ScraperError`] if the request fails, the selector cannot be
/// parsed, or the embedded script is not valid JSON.
pub async fn fetch_id_from_isbn(isbn: &str) -> Result<Option<String>, ScraperError> {
    let url = format!("https://www.goodreads.com/search?q={}", encode(isbn));
    let document = Html::parse_document(&get(&url).await?.text().await?);

    let metadata_selector = Selector::parse(r#"script[id="__NEXT_DATA__"]"#)?;

    let Some(script) = document.select(&metadata_selector).next() else {
        return Ok(None);
    };

    // Own the script text so the parsed JSON does not depend on a temporary.
    let raw_metadata = script.text().collect::<String>();
    let metadata: Value = serde_json::from_str(&raw_metadata)?;

    // Indexing a `Value` with a missing key yields `Null`, so a page without
    // the expected structure ends up here as `None` instead of panicking.
    let Some(book_id) = metadata["props"]["pageProps"]["params"]["book_id"].as_str() else {
        return Ok(None);
    };

    let goodreads_id = book_id
        .chars()
        .take_while(|c| c.is_numeric())
        .collect::<String>();

    // An empty ID is not a usable result; report it as "not found".
    if goodreads_id.is_empty() {
        Ok(None)
    } else {
        Ok(Some(goodreads_id))
    }
}

/// Looks up the Goodreads ID of a book from its title.
///
/// Runs a search for `title` and returns the ID of the first result whose
/// title matches it according to `matches`, or `Ok(None)` when no result
/// matches.
///
/// # Errors
///
/// Propagates any [`ScraperError`] produced by the underlying search.
pub async fn fetch_id_from_title(title: &str) -> Result<Option<String>, ScraperError> {
    let first_hit = search_books(title)
        .await?
        .into_iter()
        .find(|(candidate_title, _, _)| matches(candidate_title, title))
        .map(|(_, _, candidate_id)| candidate_id);

    Ok(first_hit)
}

/// Looks up the Goodreads ID of a book from its title and author.
///
/// Two searches are tried in order: first the title alone, then the title
/// followed by the author. The second search is only performed when the
/// first one yields no match. In both cases a result is accepted when its
/// title matches `title` and its author text matches `author` according to
/// `matches`; the ID of the first accepted result is returned.
///
/// Returns `Ok(None)` when neither search produces an accepted result.
///
/// # Errors
///
/// Propagates any [`ScraperError`] produced by the underlying searches.
pub async fn fetch_id_from_title_and_author(
    title: &str,
    author: &str,
) -> Result<Option<String>, ScraperError> {
    let combined_query = format!("{title} {author}");

    for query in [title, combined_query.as_str()] {
        let hit = search_books(query)
            .await?
            .into_iter()
            .find(|(candidate_title, candidate_author, _)| {
                matches(candidate_title, title) && matches(candidate_author, author)
            });

        if let Some((_, _, candidate_id)) = hit {
            return Ok(Some(candidate_id));
        }
    }

    Ok(None)
}

/// Runs a Goodreads search and collects the books listed on the result page.
///
/// Each returned tuple is `(title, authors, id)`:
/// * `title` is the trimmed text of the row's `bookTitle` link,
/// * `authors` is the trimmed text of every `authorName` link in the row,
///   joined with `", "`,
/// * `id` is the value extracted from the title link's `href` by
///   `extract_goodreads_id`.
///
/// Rows without a title link, without an `href`, or whose link yields an
/// empty ID are skipped, so every returned ID is non-empty.
///
/// # Errors
///
/// Returns a [`ScraperError`] if the request fails or a selector cannot be
/// parsed.
async fn search_books(query: &str) -> Result<Vec<(String, String, String)>, ScraperError> {
    let url = format!("https://www.goodreads.com/search?q={}", encode(query));

    let document = Html::parse_document(&get(&url).await?.text().await?);
    let book_selector = Selector::parse(r#"tr[itemtype="http://schema.org/Book"]"#)?;
    let title_selector = Selector::parse(r#"a[class="bookTitle"]"#)?;
    let author_selector = Selector::parse(r#"a[class="authorName"]"#)?;

    let mut results = Vec::new();

    for book in document.select(&book_selector) {
        let Some(title) = book.select(&title_selector).next() else {
            continue;
        };

        let Some(found_link) = title.value().attr("href") else {
            continue;
        };

        // A link that carries no numeric ID cannot identify a book; passing
        // it on would let callers return `Some("")` as a valid ID.
        let found_id = extract_goodreads_id(found_link);
        if found_id.is_empty() {
            continue;
        }

        let found_title = title.text().collect::<String>().trim().to_string();

        let found_authors = book
            .select(&author_selector)
            .map(|author| author.text().collect::<String>().trim().to_string())
            .collect::<Vec<_>>()
            .join(", ");

        results.push((found_title, found_authors, found_id));
    }

    Ok(results)
}

/// Reports whether `str2` occurs inside `str1`, ignoring case and every
/// character that is not alphanumeric.
///
/// Both inputs are reduced to their alphanumeric characters and lowercased
/// before the substring test. A `str2` that contains no alphanumeric
/// characters therefore matches any `str1`.
fn matches(str1: &str, str2: &str) -> bool {
    /// Keeps only alphanumeric characters and lowercases the result.
    fn normalize(text: &str) -> String {
        let kept: String = text.chars().filter(|c| c.is_alphanumeric()).collect();
        kept.to_lowercase()
    }

    let haystack = normalize(str1);
    let needle = normalize(str2);

    haystack.contains(&needle)
}

/// Extracts the numeric Goodreads ID from a book link.
///
/// The ID is the leading run of ASCII digits that follows `/book/show/` in
/// `url`, which covers both relative links
/// (`/book/show/123-some-title?from_search=true`) and absolute ones
/// (`https://www.goodreads.com/book/show/123`). When the link does not
/// contain `/book/show/`, the digits are taken from the start of the fourth
/// `/`-separated segment instead.
///
/// Returns an empty string when no ID can be found; this function never
/// panics.
fn extract_goodreads_id(url: &str) -> String {
    const BOOK_PATH_MARKER: &str = "/book/show/";

    let id_segment = match url.split_once(BOOK_PATH_MARKER) {
        Some((_, after_marker)) => after_marker,
        // Fallback for links without the marker: use the fourth segment, or
        // nothing at all when the link has fewer than three slashes.
        None => url.splitn(4, '/').nth(3).unwrap_or(""),
    };

    // Stopping at the first non-digit also drops any title slug and query
    // string that follow the ID.
    id_segment
        .chars()
        .take_while(|c| c.is_ascii_digit())
        .collect::<String>()
}

/// Integration-style tests for the lookup functions.
///
/// Every test here calls a function that sends a request to
/// `www.goodreads.com`, so they need network access and depend on the data
/// currently served by the site.
#[cfg(test)]
mod tests {
    use super::*;

    /// A known title resolves to its Goodreads ID.
    #[tokio::test]
    async fn fetch_id_from_title_test() {
        let found = fetch_id_from_title("The Last Magician").await.unwrap();
        assert_eq!(found.as_deref(), Some("30312855"));
    }

    /// A title with no matching search result yields `None`.
    #[tokio::test]
    async fn fetch_id_from_title_not_found_test() {
        let found = fetch_id_from_title("thistitledoesnotexist").await.unwrap();
        assert_eq!(found, None);
    }

    /// A known title/author pair resolves to its Goodreads ID.
    #[tokio::test]
    async fn fetch_id_from_title_and_author_test() {
        let found = fetch_id_from_title_and_author("Fire", "Kristin Cashore")
            .await
            .unwrap();
        assert_eq!(found.as_deref(), Some("6137154"));
    }

    /// A title/author pair with no matching search result yields `None`.
    #[tokio::test]
    async fn fetch_id_from_title_and_author_not_found_test() {
        let found = fetch_id_from_title_and_author("thistitledoesnotexist", "noauthor")
            .await
            .unwrap();
        assert_eq!(found, None);
    }

    /// A known ISBN resolves to its Goodreads ID.
    #[tokio::test]
    async fn fetch_id_from_isbn_test() {
        let found = fetch_id_from_isbn("9780063021426").await.unwrap();
        assert_eq!(found.as_deref(), Some("57945316"));
    }

    /// An ISBN that is not found yields `None`.
    #[tokio::test]
    async fn fetch_id_from_isbn_not_found_test() {
        let found = fetch_id_from_isbn("1234001592323").await.unwrap();
        assert_eq!(found, None);
    }

    /// An existing book ID is reported as present.
    #[tokio::test]
    async fn verify_id_exists_test() {
        let exists = verify_id_exists("57945316").await;
        assert!(exists);
    }

    /// A malformed book ID is reported as absent.
    #[tokio::test]
    async fn verify_id_not_found_test() {
        let exists = verify_id_exists("bad_id").await;
        assert!(!exists);
    }
}