1use std::collections::{HashMap, HashSet};
2
3use regex::Regex;
4use scraper::{Html, Selector};
5use url::Url;
6
7use crate::error::{AppError, Result};
8use crate::model::{Collector, ItemKind, OwnedAlbum, Platform, SeedAlbum};
9
10pub fn normalize_url(url: &str) -> Result<String> {
11 let mut parsed = Url::parse(url)?;
12 parsed.set_fragment(None);
13 parsed.set_query(None);
14
15 if parsed.path() != "/" {
16 let trimmed = parsed.path().trim_end_matches('/').to_string();
17 parsed.set_path(&trimmed);
18 }
19
20 Ok(parsed.to_string())
21}
22
23pub fn resolve_seed(url: &str, html: &str) -> Result<SeedAlbum> {
24 let document = Html::parse_document(html);
25 let canonical_url = meta_content(&document, r#"meta[property="og:url"]"#)
26 .or_else(|| Some(url.to_string()))
27 .map(|value| normalize_url(&value))
28 .transpose()?
29 .ok_or_else(|| AppError::Parse("unable to determine canonical URL".to_string()))?;
30
31 let og_title = meta_content(&document, r#"meta[property="og:title"]"#)
32 .or_else(|| title_text(&document))
33 .unwrap_or_else(|| "Unknown Album".to_string());
34
35 let (title, artist) = split_album_artist(&og_title);
36 let artist_name = artist.unwrap_or_else(|| {
37 infer_artist_from_html(html).unwrap_or_else(|| "Unknown Artist".to_string())
38 });
39 let tags = collect_tag_text(&document);
40
41 Ok(SeedAlbum {
42 platform: Platform::Bandcamp,
43 kind: ItemKind::Album,
44 title,
45 artist: artist_name,
46 url: canonical_url,
47 artist_url: None,
48 tags,
49 label: None,
50 release_id: infer_release_id(html),
51 })
52}
53
54pub fn parse_collectors(html: &str) -> Vec<Collector> {
55 let document = Html::parse_document(html);
56 let selector = Selector::parse("a[href]").expect("valid selector");
57 let mut seen = HashSet::new();
58 let mut collectors = Vec::new();
59
60 for anchor in document.select(&selector) {
61 let Some(href) = anchor.value().attr("href") else {
62 continue;
63 };
64 let Some(url) = normalize_collector_url(href) else {
65 continue;
66 };
67 if !seen.insert(url.clone()) {
68 continue;
69 }
70
71 let handle = url
72 .trim_end_matches('/')
73 .rsplit('/')
74 .next()
75 .unwrap_or("unknown")
76 .to_string();
77 let text = anchor
78 .text()
79 .collect::<Vec<_>>()
80 .join(" ")
81 .trim()
82 .to_string();
83
84 collectors.push(Collector {
85 handle,
86 url,
87 display_name: if text.is_empty() { None } else { Some(text) },
88 visible: true,
89 });
90 }
91
92 collectors
93}
94
95pub fn parse_owned_albums(html: &str) -> Vec<OwnedAlbum> {
96 let document = Html::parse_document(html);
97 let selector = Selector::parse("a[href]").expect("valid selector");
98 let mut albums = HashMap::<String, OwnedAlbum>::new();
99
100 for anchor in document.select(&selector) {
101 let Some(href) = anchor.value().attr("href") else {
102 continue;
103 };
104 let Some(url) = normalize_album_url(href) else {
105 continue;
106 };
107
108 let text = anchor.text().collect::<Vec<_>>().join(" ");
109 let trimmed = collapse_ws(&text);
110 let (title, artist) = split_album_artist(&trimmed);
111
112 albums.entry(url.clone()).or_insert_with(|| OwnedAlbum {
113 platform: Platform::Bandcamp,
114 kind: ItemKind::Album,
115 title: if title.is_empty() {
116 "Unknown Album".to_string()
117 } else {
118 title
119 },
120 artist: artist.unwrap_or_else(|| "Unknown Artist".to_string()),
121 url,
122 tags: Vec::new(),
123 label: None,
124 });
125 }
126
127 albums.into_values().collect()
128}
129
130fn meta_content(document: &Html, selector: &str) -> Option<String> {
131 let selector = Selector::parse(selector).ok()?;
132 document
133 .select(&selector)
134 .next()
135 .and_then(|node| node.value().attr("content"))
136 .map(|value| value.trim().to_string())
137}
138
139fn title_text(document: &Html) -> Option<String> {
140 let selector = Selector::parse("title").ok()?;
141 document
142 .select(&selector)
143 .next()
144 .map(|node| collapse_ws(&node.text().collect::<Vec<_>>().join(" ")))
145}
146
147fn split_album_artist(raw: &str) -> (String, Option<String>) {
148 let cleaned = collapse_ws(raw);
149 if let Some((title, artist)) = cleaned.split_once(", by ") {
150 return (title.trim().to_string(), Some(artist.trim().to_string()));
151 }
152 if let Some((title, artist)) = cleaned.split_once(" | ") {
153 return (title.trim().to_string(), Some(artist.trim().to_string()));
154 }
155 (cleaned, None)
156}
157
158fn collect_tag_text(document: &Html) -> Vec<String> {
159 let selector = Selector::parse(r#"a[href*="/tag/"]"#).expect("valid selector");
160 let mut seen = HashSet::new();
161 let mut tags = Vec::new();
162 for tag in document.select(&selector) {
163 let text = collapse_ws(&tag.text().collect::<Vec<_>>().join(" "));
164 if !text.is_empty() && seen.insert(text.clone()) {
165 tags.push(text);
166 }
167 }
168 tags
169}
170
171fn infer_artist_from_html(html: &str) -> Option<String> {
172 let patterns = [
173 Regex::new(r#""artist"\s*:\s*"([^"]+)""#).ok()?,
174 Regex::new(r#""byArtist"\s*:\s*"([^"]+)""#).ok()?,
175 ];
176
177 for pattern in patterns {
178 if let Some(capture) = pattern.captures(html) {
179 return Some(capture.get(1)?.as_str().to_string());
180 }
181 }
182
183 None
184}
185
186fn infer_release_id(html: &str) -> Option<String> {
187 let pattern = Regex::new(r#""id"\s*:\s*([0-9]+)"#).ok()?;
188 pattern
189 .captures(html)
190 .and_then(|capture| capture.get(1))
191 .map(|id| id.as_str().to_string())
192}
193
194fn normalize_collector_url(href: &str) -> Option<String> {
195 let url = if href.starts_with("http://") || href.starts_with("https://") {
196 Url::parse(href).ok()?
197 } else {
198 Url::parse(&format!(
199 "https://bandcamp.com{}",
200 ensure_leading_slash(href)
201 ))
202 .ok()?
203 };
204
205 let host = url.host_str()?;
206 let path = url.path().trim_end_matches('/');
207 let first_segment = path
208 .trim_start_matches('/')
209 .split('/')
210 .next()
211 .unwrap_or_default();
212
213 if host == "bandcamp.com"
214 && !first_segment.is_empty()
215 && !matches!(
216 first_segment,
217 "album" | "track" | "music" | "discover" | "tag" | "about" | "help" | "search"
218 )
219 {
220 return Some(format!("https://bandcamp.com/{first_segment}"));
221 }
222
223 None
224}
225
226fn normalize_album_url(href: &str) -> Option<String> {
227 let candidate = if href.starts_with("http://") || href.starts_with("https://") {
228 Url::parse(href).ok()?
229 } else {
230 return None;
231 };
232
233 if candidate.path().contains("/album/") {
234 let mut normalized = candidate;
235 normalized.set_query(None);
236 normalized.set_fragment(None);
237 return Some(normalized.to_string().trim_end_matches('/').to_string());
238 }
239
240 None
241}
242
/// Returns `path` as an owned string that begins with `/`, prepending one only
/// when it is missing.
fn ensure_leading_slash(path: &str) -> String {
    let mut rooted = String::with_capacity(path.len() + 1);
    if !path.starts_with('/') {
        rooted.push('/');
    }
    rooted.push_str(path);
    rooted
}
250
/// Trims `value` and replaces every run of whitespace with a single space.
fn collapse_ws(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        // Words are never empty, so a non-empty buffer means a word precedes this one.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// The query string and fragment are dropped from an album URL.
    #[test]
    fn normalizes_album_url() {
        let input = "https://artist.bandcamp.com/album/test?x=1#frag";
        let normalized = normalize_url(input).unwrap();
        assert_eq!(normalized, "https://artist.bandcamp.com/album/test");
    }

    /// Absolute and relative profile links are collected; site sections are not.
    #[test]
    fn parses_collectors_from_bandcamp_links() {
        let page = r#"
            <html><body>
              <a href="https://bandcamp.com/fan_a">Fan A</a>
              <a href="/fan_b">Fan B</a>
              <a href="https://bandcamp.com/discover">Discover</a>
            </body></html>
        "#;

        let found = parse_collectors(page);

        assert_eq!(found.len(), 2);
        assert_eq!(found[0].url, "https://bandcamp.com/fan_a");
    }

    /// Album links are kept and split into title/artist; track links are ignored.
    #[test]
    fn parses_owned_album_links() {
        let page = r#"
            <html><body>
              <a href="https://artist.bandcamp.com/album/record-a">Record A, by Artist A</a>
              <a href="https://artist.bandcamp.com/track/song-a">Song A</a>
            </body></html>
        "#;

        let found = parse_owned_albums(page);

        assert_eq!(found.len(), 1);
        let album = &found[0];
        assert_eq!(album.title, "Record A");
        assert_eq!(album.artist, "Artist A");
        assert_eq!(album.platform, Platform::Bandcamp);
        assert_eq!(album.kind, ItemKind::Album);
    }

    /// Title and artist are taken from the `og:title` meta tag.
    #[test]
    fn resolves_seed_from_og_title() {
        let page = r#"
            <html><head>
              <meta property="og:url" content="https://artist.bandcamp.com/album/seed">
              <meta property="og:title" content="Seed Record, by Seed Artist">
            </head></html>
        "#;

        let seed = resolve_seed("https://artist.bandcamp.com/album/seed", page).unwrap();

        assert_eq!(seed.title, "Seed Record");
        assert_eq!(seed.artist, "Seed Artist");
        assert_eq!(seed.platform, Platform::Bandcamp);
        assert_eq!(seed.kind, ItemKind::Album);
    }
}