use std::{borrow::Cow, str::FromStr};
use crate::utils::{
download,
selectors::{RELEASE_SCHEMA, SPAN, TD, TIME, TRACKLIST, VERSIONS_TABLE_LINK},
};
use id3::Timestamp;
use scraper::{html::Select, Html};
use serde::Deserialize;
use thiserror::Error;
/// One tracklist row scraped from a release page.
///
/// Built by `parse_tracks` from the cells of a tracklist row.
#[derive(Debug, Clone, Deserialize)]
pub struct DiscogsTrack {
/// Track number, parsed as an integer from the inner HTML of the row's first cell.
pub number: i32,
/// Inner HTML of the first `SPAN` match inside the row's third cell.
pub title: String,
/// Inner HTML of the first `SPAN` match inside the row's fourth cell
/// (e.g. `"2:44"` in this module's test).
pub duration: String,
}
/// Everything `scrape_discogs` extracts from a single release page.
#[derive(Debug, Clone)]
pub struct DiscogsAlbum {
/// Metadata deserialized from the page's release schema JSON.
pub album_data: DiscogsAlbumData,
/// One entry per tracklist row, in document order; `None` marks a row that
/// could not be turned into a `DiscogsTrack`.
pub tracks: Vec<Option<DiscogsTrack>>,
/// Timestamp parsed from the `datetime` attribute of the first `TIME` match,
/// or `None` when it is missing or does not parse.
pub released: Option<Timestamp>,
}
/// Release metadata deserialized from the release schema JSON embedded in a
/// release page.
///
/// Multi-word JSON keys are camelCase (`musicReleaseFormat`, `datePublished`,
/// ...); the container-level `rename_all` maps them onto the snake_case fields.
/// The `@`-prefixed keys cannot be derived that way and are renamed one by one.
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct DiscogsAlbumData {
    /// Value of the `@context` key.
    #[serde(rename = "@context")]
    pub context: String,
    /// Value of the `@type` key.
    #[serde(rename = "@type")]
    pub r#type: String,
    /// Value of the `@id` key.
    #[serde(rename = "@id")]
    pub id: String,
    /// Release title (`name`).
    pub name: String,
    /// Value of the `musicReleaseFormat` key.
    pub music_release_format: String,
    /// Genre names (`genre`).
    pub genre: Vec<String>,
    /// Optional free-text `description`; the key may be absent.
    pub description: Option<String>,
    /// Numeric `datePublished` value (e.g. `2023` in this module's test).
    pub date_published: i32,
    /// Value of the `catalogNumber` key.
    pub catalog_number: String,
    /// Entries of the `recordLabel` array.
    pub record_label: Vec<DiscogsNamedObject>,
    /// The `releaseOf` object.
    pub release_of: DiscogsReleaseOf,
    /// The `releasedEvent` object.
    pub released_event: DiscogsReleasedEvent,
    /// Value of the `image` key (a URL string in this module's test).
    pub image: String,
}
/// A typed, identified, named object from the release schema JSON.
///
/// Used for the entries of `recordLabel` and `byArtist`.
#[derive(Debug, Clone, Deserialize)]
pub struct DiscogsNamedObject {
/// Value of the `@type` key.
#[serde(rename = "@type")]
pub r#type: String,
/// Value of the `@id` key.
#[serde(rename = "@id")]
pub id: String,
/// Value of the `name` key.
pub name: String,
}
/// The `releaseOf` object of the release schema JSON.
///
/// camelCase keys are mapped by the container-level `rename_all`; the
/// `@`-prefixed keys are renamed explicitly.
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct DiscogsReleaseOf {
    /// Value of the `@type` key.
    #[serde(rename = "@type")]
    pub r#type: String,
    /// Value of the `@id` key; optional, so it may be absent from the JSON.
    #[serde(rename = "@id")]
    pub id: Option<String>,
    /// Value of the `name` key.
    pub name: String,
    /// Numeric `datePublished` value.
    pub date_published: i32,
    /// Entries of the `byArtist` array.
    pub by_artist: Vec<DiscogsNamedObject>,
}
/// The `releasedEvent` object of the release schema JSON.
///
/// `startDate` is mapped onto `start_date` by the container-level `rename_all`.
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct DiscogsReleasedEvent {
    /// Value of the `@type` key.
    #[serde(rename = "@type")]
    pub r#type: String,
    /// Numeric `startDate` value.
    pub start_date: i32,
    /// The `location` object.
    pub location: DiscogsLocation,
}
/// The `location` object nested inside `releasedEvent`.
#[derive(Debug, Clone, Deserialize)]
pub struct DiscogsLocation {
/// Value of the `@type` key.
#[serde(rename = "@type")]
pub r#type: String,
/// Value of the `name` key.
pub name: String,
}
/// Errors returned while scraping a Discogs page.
#[derive(Debug, Error)]
pub enum DiscogsScrapeError {
/// A `reqwest` error, converted automatically by `?`; displayed using the
/// inner error's own message.
#[error("{0}")]
ReqwestError(#[from] reqwest::Error),
/// The page contained no element matching the `RELEASE_SCHEMA` selector.
#[error("couldn't find release schema script")]
CouldntFindReleaseSchema,
/// The release schema text could not be deserialized as `DiscogsAlbumData`;
/// displayed using the inner `serde_json` message.
#[error("{0}")]
SerdeError(#[from] serde_json::Error),
/// A master page had no `VERSIONS_TABLE_LINK` match whose `href` starts with
/// `/release/`.
#[error("couldn't find release page from master page")]
CouldntFindReleasePage,
}
/// Resolves a Discogs URL to a release-page URL.
///
/// URLs that do not contain `discogs.com/master` are handed back untouched
/// (borrowed). For master URLs the page is downloaded and the first
/// `VERSIONS_TABLE_LINK` match whose `href` starts with `/release/` is turned
/// into an absolute `https://www.discogs.com/...` URL (owned).
///
/// # Errors
///
/// * `ReqwestError` when downloading the page or reading its body fails.
/// * `CouldntFindReleasePage` when the master page has no matching link.
fn release_from_master(url: &str) -> Result<Cow<str>, DiscogsScrapeError> {
    /// Returns the absolute URL of the first selected element whose `href`
    /// is a site-relative release link.
    fn first_release_in_select(selection: Select<'_, '_>) -> Option<String> {
        let mut selection = selection;
        selection.find_map(|element| {
            let href = element.value().attr("href")?;
            if href.starts_with("/release/") {
                Some(format!("https://www.discogs.com{href}"))
            } else {
                None
            }
        })
    }

    // Anything that is not a master page is assumed to already be usable as-is.
    if !url.contains("discogs.com/master") {
        return Ok(Cow::Borrowed(url));
    }

    let body = download(url)?.text()?;
    let document = Html::parse_document(body.as_str());
    let candidates = document.select(&VERSIONS_TABLE_LINK);
    match first_release_in_select(candidates) {
        Some(release_url) => Ok(Cow::Owned(release_url)),
        None => Err(DiscogsScrapeError::CouldntFindReleasePage),
    }
}
/// Downloads a Discogs page and extracts its album data.
///
/// `url` is first passed through `release_from_master`, so a master URL is
/// resolved to a release URL before the page is scraped. The resulting page
/// supplies the release schema metadata, the tracklist and the release
/// timestamp.
///
/// # Errors
///
/// Returns a [`DiscogsScrapeError`] when resolving the master page fails, when
/// the download or body read fails, or when the release schema is missing or
/// cannot be deserialized. Unparseable tracks and a missing release timestamp
/// are not errors; they show up as `None` in the result.
pub fn scrape_discogs(url: &str) -> Result<DiscogsAlbum, DiscogsScrapeError> {
    let release_url = release_from_master(url)?;
    let body = download(&release_url)?.text()?;
    let document = Html::parse_document(body.as_str());

    // Field initialisers run top to bottom, so the schema (the only fallible
    // step) is still parsed before the tracklist and the timestamp.
    Ok(DiscogsAlbum {
        album_data: parse_release_schema(&document)?,
        tracks: parse_tracks(&document),
        released: parse_released(&document),
    })
}
/// Deserializes the release schema JSON embedded in `document`.
///
/// Takes the inner HTML of the first `RELEASE_SCHEMA` match and parses it as
/// [`DiscogsAlbumData`].
///
/// # Errors
///
/// * `CouldntFindReleaseSchema` when nothing matches the selector.
/// * `SerdeError` when the text is not valid JSON for `DiscogsAlbumData`.
fn parse_release_schema(document: &Html) -> Result<DiscogsAlbumData, DiscogsScrapeError> {
    let schema_element = document
        .select(&RELEASE_SCHEMA)
        .next()
        .ok_or(DiscogsScrapeError::CouldntFindReleaseSchema)?;
    let schema_json = schema_element.inner_html();
    let album_data = serde_json::de::from_str(schema_json.as_str())?;
    Ok(album_data)
}
/// Extracts the tracklist from `document`.
///
/// Produces one entry per `TRACKLIST` match, in document order. An entry is
/// `None` when the row has fewer than four `TD` cells, when the first cell's
/// inner HTML is not an integer, or when the third or fourth cell contains no
/// `SPAN` match.
fn parse_tracks(document: &Html) -> Vec<Option<DiscogsTrack>> {
    document
        .select(&TRACKLIST)
        .map(|row| {
            let cells: Vec<_> = row.select(&TD).collect();
            match cells.as_slice() {
                // Cell layout: [number, <unused>, title, duration, ...].
                [number_cell, _, title_cell, duration_cell, ..] => {
                    let number = number_cell.inner_html().parse().ok()?;
                    // NOTE(review): `inner_html` returns serialized markup, so
                    // entities or nested tags may appear in these strings —
                    // confirm whether callers expect that.
                    let title = title_cell.select(&SPAN).next()?.inner_html();
                    let duration = duration_cell.select(&SPAN).next()?.inner_html();
                    Some(DiscogsTrack {
                        number,
                        title,
                        duration,
                    })
                }
                _ => None,
            }
        })
        .collect()
}
fn parse_released(document: &Html) -> Option<Timestamp> {
document
.select(&TIME)
.next()?
.value()
.attr("datetime")
.map(FromStr::from_str)
.and_then(Result::ok)
}
// NOTE(review): these tests pass real discogs.com URLs to `scrape_discogs` /
// `release_from_master`, which call `download`; presumably that performs a live
// HTTP request, so the tests need network access and depend on the current
// content of those pages — confirm before running them in CI.
#[cfg(test)]
mod tests {
use super::*;
// Scrapes a release page directly and checks schema metadata and the tracklist.
#[test]
fn release_basic() {
let album =
scrape_discogs("https://www.discogs.com/release/27651927-Odd-Eye-Circle-Version-Up")
.unwrap();
assert_eq!(album.album_data.name.as_str(), "Version Up");
assert_eq!(
&album.album_data.genre,
&["Electronic".to_string(), "Pop".to_string()]
);
// The description is compared in its HTML-entity-escaped form, exactly as
// it comes out of the deserialized schema.
assert_eq!(
&album.album_data.description.unwrap(),
"Album title stylized as &quot;ODD EYE CIRCLE &lt;Version Up&gt;.&quot;" );
assert_eq!(album.album_data.date_published, 2023);
assert_eq!(&album.album_data.record_label[0].name, "Modhaus");
assert_eq!(
&album.album_data.release_of.by_artist[0].name,
"ODD EYE CIRCLE"
);
assert!(album.album_data.image.starts_with("https://i.discogs.com/"));
assert_eq!(album.tracks.len(), 6);
let expected_titles = [
"Did You Wait?",
"Air Force One",
"Je Ne Sais Quoi",
"Lucid",
"Love Me Like",
"My Secret Playlist",
];
let expected_durations = ["1:10", "2:44", "2:54", "3:34", "2:59", "2:33"];
// Every row must have parsed (`Some`) and be numbered 1..=6 in order.
for (i, track) in album.tracks.iter().map(Option::as_ref).enumerate() {
assert_eq!(track.map(|t| t.number), Some(i32::try_from(i).unwrap() + 1));
assert_eq!(track.map(|t| t.title.as_str()), Some(expected_titles[i]));
assert_eq!(
track.map(|t| t.duration.as_str()),
Some(expected_durations[i])
);
}
}
// A master URL resolves to a release URL; a release URL is returned unchanged.
#[test]
fn master_basic() {
let master = r#"https://www.discogs.com/master/3166419-Odd-Eye-Circle-Version-Up"#;
let release = release_from_master(master).unwrap();
assert_eq!(
&release,
r#"https://www.discogs.com/release/27651927-Odd-Eye-Circle-Version-Up"#
);
// Despite the variable name, this second input is already a release URL.
let master = r#"https://www.discogs.com/release/27651927-Odd-Eye-Circle-Version-Up"#;
let release = release_from_master(master).unwrap();
assert_eq!(&release, master);
}
}