Skip to main content

wax/
parser.rs

1use std::collections::{HashMap, HashSet};
2
3use regex::Regex;
4use scraper::{Html, Selector};
5use url::Url;
6
7use crate::error::{AppError, Result};
8use crate::model::{Collector, ItemKind, OwnedAlbum, Platform, SeedAlbum};
9
10pub fn normalize_url(url: &str) -> Result<String> {
11    let mut parsed = Url::parse(url)?;
12    parsed.set_fragment(None);
13    parsed.set_query(None);
14
15    if parsed.path() != "/" {
16        let trimmed = parsed.path().trim_end_matches('/').to_string();
17        parsed.set_path(&trimmed);
18    }
19
20    Ok(parsed.to_string())
21}
22
23pub fn resolve_seed(url: &str, html: &str) -> Result<SeedAlbum> {
24    let document = Html::parse_document(html);
25    let canonical_url = meta_content(&document, r#"meta[property="og:url"]"#)
26        .or_else(|| Some(url.to_string()))
27        .map(|value| normalize_url(&value))
28        .transpose()?
29        .ok_or_else(|| AppError::Parse("unable to determine canonical URL".to_string()))?;
30
31    let og_title = meta_content(&document, r#"meta[property="og:title"]"#)
32        .or_else(|| title_text(&document))
33        .unwrap_or_else(|| "Unknown Album".to_string());
34
35    let (title, artist) = split_album_artist(&og_title);
36    let artist_name = artist.unwrap_or_else(|| {
37        infer_artist_from_html(html).unwrap_or_else(|| "Unknown Artist".to_string())
38    });
39    let tags = collect_tag_text(&document);
40
41    Ok(SeedAlbum {
42        platform: Platform::Bandcamp,
43        kind: ItemKind::Album,
44        title,
45        artist: artist_name,
46        url: canonical_url,
47        artist_url: None,
48        tags,
49        label: None,
50        release_id: infer_release_id(html),
51    })
52}
53
54pub fn parse_collectors(html: &str) -> Vec<Collector> {
55    let document = Html::parse_document(html);
56    let selector = Selector::parse("a[href]").expect("valid selector");
57    let mut seen = HashSet::new();
58    let mut collectors = Vec::new();
59
60    for anchor in document.select(&selector) {
61        let Some(href) = anchor.value().attr("href") else {
62            continue;
63        };
64        let Some(url) = normalize_collector_url(href) else {
65            continue;
66        };
67        if !seen.insert(url.clone()) {
68            continue;
69        }
70
71        let handle = url
72            .trim_end_matches('/')
73            .rsplit('/')
74            .next()
75            .unwrap_or("unknown")
76            .to_string();
77        let text = anchor
78            .text()
79            .collect::<Vec<_>>()
80            .join(" ")
81            .trim()
82            .to_string();
83
84        collectors.push(Collector {
85            handle,
86            url,
87            display_name: if text.is_empty() { None } else { Some(text) },
88            visible: true,
89        });
90    }
91
92    collectors
93}
94
95pub fn parse_owned_albums(html: &str) -> Vec<OwnedAlbum> {
96    let document = Html::parse_document(html);
97    let selector = Selector::parse("a[href]").expect("valid selector");
98    let mut albums = HashMap::<String, OwnedAlbum>::new();
99
100    for anchor in document.select(&selector) {
101        let Some(href) = anchor.value().attr("href") else {
102            continue;
103        };
104        let Some(url) = normalize_album_url(href) else {
105            continue;
106        };
107
108        let text = anchor.text().collect::<Vec<_>>().join(" ");
109        let trimmed = collapse_ws(&text);
110        let (title, artist) = split_album_artist(&trimmed);
111
112        albums.entry(url.clone()).or_insert_with(|| OwnedAlbum {
113            platform: Platform::Bandcamp,
114            kind: ItemKind::Album,
115            title: if title.is_empty() {
116                "Unknown Album".to_string()
117            } else {
118                title
119            },
120            artist: artist.unwrap_or_else(|| "Unknown Artist".to_string()),
121            url,
122            tags: Vec::new(),
123            label: None,
124        });
125    }
126
127    albums.into_values().collect()
128}
129
130fn meta_content(document: &Html, selector: &str) -> Option<String> {
131    let selector = Selector::parse(selector).ok()?;
132    document
133        .select(&selector)
134        .next()
135        .and_then(|node| node.value().attr("content"))
136        .map(|value| value.trim().to_string())
137}
138
139fn title_text(document: &Html) -> Option<String> {
140    let selector = Selector::parse("title").ok()?;
141    document
142        .select(&selector)
143        .next()
144        .map(|node| collapse_ws(&node.text().collect::<Vec<_>>().join(" ")))
145}
146
147fn split_album_artist(raw: &str) -> (String, Option<String>) {
148    let cleaned = collapse_ws(raw);
149    if let Some((title, artist)) = cleaned.split_once(", by ") {
150        return (title.trim().to_string(), Some(artist.trim().to_string()));
151    }
152    if let Some((title, artist)) = cleaned.split_once(" | ") {
153        return (title.trim().to_string(), Some(artist.trim().to_string()));
154    }
155    (cleaned, None)
156}
157
158fn collect_tag_text(document: &Html) -> Vec<String> {
159    let selector = Selector::parse(r#"a[href*="/tag/"]"#).expect("valid selector");
160    let mut seen = HashSet::new();
161    let mut tags = Vec::new();
162    for tag in document.select(&selector) {
163        let text = collapse_ws(&tag.text().collect::<Vec<_>>().join(" "));
164        if !text.is_empty() && seen.insert(text.clone()) {
165            tags.push(text);
166        }
167    }
168    tags
169}
170
171fn infer_artist_from_html(html: &str) -> Option<String> {
172    let patterns = [
173        Regex::new(r#""artist"\s*:\s*"([^"]+)""#).ok()?,
174        Regex::new(r#""byArtist"\s*:\s*"([^"]+)""#).ok()?,
175    ];
176
177    for pattern in patterns {
178        if let Some(capture) = pattern.captures(html) {
179            return Some(capture.get(1)?.as_str().to_string());
180        }
181    }
182
183    None
184}
185
186fn infer_release_id(html: &str) -> Option<String> {
187    let pattern = Regex::new(r#""id"\s*:\s*([0-9]+)"#).ok()?;
188    pattern
189        .captures(html)
190        .and_then(|capture| capture.get(1))
191        .map(|id| id.as_str().to_string())
192}
193
194fn normalize_collector_url(href: &str) -> Option<String> {
195    let url = if href.starts_with("http://") || href.starts_with("https://") {
196        Url::parse(href).ok()?
197    } else {
198        Url::parse(&format!(
199            "https://bandcamp.com{}",
200            ensure_leading_slash(href)
201        ))
202        .ok()?
203    };
204
205    let host = url.host_str()?;
206    let path = url.path().trim_end_matches('/');
207    let first_segment = path
208        .trim_start_matches('/')
209        .split('/')
210        .next()
211        .unwrap_or_default();
212
213    if host == "bandcamp.com"
214        && !first_segment.is_empty()
215        && !matches!(
216            first_segment,
217            "album" | "track" | "music" | "discover" | "tag" | "about" | "help" | "search"
218        )
219    {
220        return Some(format!("https://bandcamp.com/{first_segment}"));
221    }
222
223    None
224}
225
226fn normalize_album_url(href: &str) -> Option<String> {
227    let candidate = if href.starts_with("http://") || href.starts_with("https://") {
228        Url::parse(href).ok()?
229    } else {
230        return None;
231    };
232
233    if candidate.path().contains("/album/") {
234        let mut normalized = candidate;
235        normalized.set_query(None);
236        normalized.set_fragment(None);
237        return Some(normalized.to_string().trim_end_matches('/').to_string());
238    }
239
240    None
241}
242
/// Returns `path` unchanged when it already begins with `/`; otherwise
/// returns it with a single `/` prepended.
fn ensure_leading_slash(path: &str) -> String {
    match path.strip_prefix('/') {
        Some(_) => path.to_string(),
        None => format!("/{path}"),
    }
}
250
/// Replaces every run of whitespace in `value` with a single space and strips
/// leading and trailing whitespace.
fn collapse_ws(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first word.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// Query string and fragment are stripped from an album URL.
    #[test]
    fn normalizes_album_url() {
        let normalized = normalize_url("https://artist.bandcamp.com/album/test?x=1#frag").unwrap();
        assert_eq!(normalized, "https://artist.bandcamp.com/album/test");
    }

    /// Absolute and relative fan links are collected; reserved sections are not.
    #[test]
    fn parses_collectors_from_bandcamp_links() {
        let page = r#"
            <html><body>
                <a href="https://bandcamp.com/fan_a">Fan A</a>
                <a href="/fan_b">Fan B</a>
                <a href="https://bandcamp.com/discover">Discover</a>
            </body></html>
        "#;
        let found = parse_collectors(page);
        assert_eq!(found.len(), 2);
        assert_eq!(found[0].url, "https://bandcamp.com/fan_a");
    }

    /// Album links are parsed into title and artist; track links are ignored.
    #[test]
    fn parses_owned_album_links() {
        let page = r#"
            <html><body>
                <a href="https://artist.bandcamp.com/album/record-a">Record A, by Artist A</a>
                <a href="https://artist.bandcamp.com/track/song-a">Song A</a>
            </body></html>
        "#;
        let owned = parse_owned_albums(page);
        assert_eq!(owned.len(), 1);

        let only = &owned[0];
        assert_eq!(only.title, "Record A");
        assert_eq!(only.artist, "Artist A");
        assert_eq!(only.platform, Platform::Bandcamp);
        assert_eq!(only.kind, ItemKind::Album);
    }

    /// The `og:title` meta tag supplies both the seed's title and its artist.
    #[test]
    fn resolves_seed_from_og_title() {
        let page = r#"
            <html><head>
                <meta property="og:url" content="https://artist.bandcamp.com/album/seed">
                <meta property="og:title" content="Seed Record, by Seed Artist">
            </head></html>
        "#;
        let resolved = resolve_seed("https://artist.bandcamp.com/album/seed", page).unwrap();
        assert_eq!(resolved.title, "Seed Record");
        assert_eq!(resolved.artist, "Seed Artist");
        assert_eq!(resolved.platform, Platform::Bandcamp);
        assert_eq!(resolved.kind, ItemKind::Album);
    }
}