lastfm_edit/
parsing.rs

1//! HTML parsing utilities for Last.fm pages.
2//!
3//! This module contains all the HTML parsing logic for extracting track, album,
4//! and other data from Last.fm web pages. These functions are primarily pure
5//! functions that take HTML documents and return structured data.
6
7use crate::{Album, AlbumPage, LastFmError, Result, Track, TrackPage};
8use scraper::{Html, Selector};
9
/// Parser struct containing parsing methods for Last.fm HTML pages.
///
/// This struct holds the parsing logic that was previously embedded in the client.
/// It carries no data: every parsing routine receives the document (or row) it
/// should inspect as an argument. Because it is a zero-sized marker it also
/// derives `Clone` and `Copy`, so it can be duplicated freely or embedded in
/// other cloneable types.
#[derive(Debug, Clone, Copy)]
pub struct LastFmParser;
16
17impl LastFmParser {
18    /// Create a new parser instance.
19    pub fn new() -> Self {
20        Self
21    }
22
23    /// Parse recent scrobbles from the user's library page
24    /// This extracts real scrobble data with timestamps for editing
25    pub fn parse_recent_scrobbles(&self, document: &Html) -> Result<Vec<Track>> {
26        let mut tracks = Vec::new();
27
28        // Recent scrobbles are typically in chartlist tables - there can be multiple
29        let table_selector = Selector::parse("table.chartlist").unwrap();
30        let row_selector = Selector::parse("tbody tr").unwrap();
31
32        let tables: Vec<_> = document.select(&table_selector).collect();
33        log::debug!("Found {} chartlist tables", tables.len());
34
35        for table in tables {
36            for row in table.select(&row_selector) {
37                if let Ok(track) = self.parse_recent_scrobble_row(&row) {
38                    tracks.push(track);
39                }
40            }
41        }
42
43        if tracks.is_empty() {
44            log::debug!("No tracks found in recent scrobbles");
45        }
46
47        log::debug!("Parsed {} recent scrobbles", tracks.len());
48        Ok(tracks)
49    }
50
51    /// Parse a single row from the recent scrobbles table
52    fn parse_recent_scrobble_row(&self, row: &scraper::ElementRef) -> Result<Track> {
53        // Extract track name
54        let name_selector = Selector::parse(".chartlist-name a").unwrap();
55        let name = row
56            .select(&name_selector)
57            .next()
58            .ok_or(LastFmError::Parse("Missing track name".to_string()))?
59            .text()
60            .collect::<String>()
61            .trim()
62            .to_string();
63
64        // Extract artist name
65        let artist_selector = Selector::parse(".chartlist-artist a").unwrap();
66        let artist = row
67            .select(&artist_selector)
68            .next()
69            .ok_or(LastFmError::Parse("Missing artist name".to_string()))?
70            .text()
71            .collect::<String>()
72            .trim()
73            .to_string();
74
75        // Extract timestamp from data attributes or hidden inputs
76        let timestamp = self.extract_scrobble_timestamp(row);
77
78        // Extract album from hidden inputs in edit form
79        let album = self.extract_scrobble_album(row);
80
81        // For recent scrobbles, playcount is typically 1 since they're individual scrobbles
82        let playcount = 1;
83
84        Ok(Track {
85            name,
86            artist,
87            playcount,
88            timestamp,
89            album,
90        })
91    }
92
93    /// Extract timestamp from scrobble row elements
94    fn extract_scrobble_timestamp(&self, row: &scraper::ElementRef) -> Option<u64> {
95        // Look for timestamp in various places:
96
97        // 1. Check for data-timestamp attribute
98        if let Some(timestamp_str) = row.value().attr("data-timestamp") {
99            if let Ok(timestamp) = timestamp_str.parse::<u64>() {
100                return Some(timestamp);
101            }
102        }
103
104        // 2. Look for hidden timestamp input
105        let timestamp_input_selector = Selector::parse("input[name='timestamp']").unwrap();
106        if let Some(input) = row.select(&timestamp_input_selector).next() {
107            if let Some(value) = input.value().attr("value") {
108                if let Ok(timestamp) = value.parse::<u64>() {
109                    return Some(timestamp);
110                }
111            }
112        }
113
114        // 3. Look for edit form with timestamp
115        let edit_form_selector =
116            Selector::parse("form[data-edit-scrobble] input[name='timestamp']").unwrap();
117        if let Some(timestamp_input) = row.select(&edit_form_selector).next() {
118            if let Some(value) = timestamp_input.value().attr("value") {
119                if let Ok(timestamp) = value.parse::<u64>() {
120                    return Some(timestamp);
121                }
122            }
123        }
124
125        // 4. Look for time element with datetime attribute
126        let time_selector = Selector::parse("time").unwrap();
127        if let Some(time_elem) = row.select(&time_selector).next() {
128            if let Some(datetime) = time_elem.value().attr("datetime") {
129                // Parse ISO datetime to timestamp
130                if let Ok(parsed_time) = chrono::DateTime::parse_from_rfc3339(datetime) {
131                    return Some(parsed_time.timestamp() as u64);
132                }
133            }
134        }
135
136        None
137    }
138
139    /// Extract album name from scrobble row elements
140    fn extract_scrobble_album(&self, row: &scraper::ElementRef) -> Option<String> {
141        // Look for album_name in hidden inputs within edit forms
142        let album_input_selector =
143            Selector::parse("form[data-edit-scrobble] input[name='album_name']").unwrap();
144
145        if let Some(album_input) = row.select(&album_input_selector).next() {
146            if let Some(album_name) = album_input.value().attr("value") {
147                if !album_name.is_empty() {
148                    return Some(album_name.to_string());
149                }
150            }
151        }
152
153        None
154    }
155
156    /// Parse a tracks page into a TrackPage structure
157    pub fn parse_tracks_page(
158        &self,
159        document: &Html,
160        page_number: u32,
161        artist: &str,
162    ) -> Result<TrackPage> {
163        let tracks = self.extract_tracks_from_document(document, artist)?;
164
165        // Check for pagination
166        let (has_next_page, total_pages) = self.parse_pagination(document, page_number)?;
167
168        Ok(TrackPage {
169            tracks,
170            page_number,
171            has_next_page,
172            total_pages,
173        })
174    }
175
176    /// Extract tracks from HTML document
177    pub fn extract_tracks_from_document(
178        &self,
179        document: &Html,
180        artist: &str,
181    ) -> Result<Vec<Track>> {
182        let mut tracks = Vec::new();
183        let mut seen_tracks = std::collections::HashSet::new();
184
185        // Try JSON-embedded data first
186        if let Ok(json_tracks) = self.parse_json_tracks_page(document, 1, artist) {
187            return Ok(json_tracks.tracks);
188        }
189
190        // Strategy 1: Try parsing track data from data-track-name attributes (AJAX response)
191        let track_selector = Selector::parse("[data-track-name]").unwrap();
192        let track_elements: Vec<_> = document.select(&track_selector).collect();
193
194        if !track_elements.is_empty() {
195            for element in track_elements {
196                let track_name = element.value().attr("data-track-name").unwrap_or("");
197                if !track_name.is_empty() && !seen_tracks.contains(track_name) {
198                    seen_tracks.insert(track_name.to_string());
199
200                    if let Ok(playcount) = self.find_playcount_for_track(document, track_name) {
201                        let timestamp = self.find_timestamp_for_track(document, track_name);
202                        let track = Track {
203                            name: track_name.to_string(),
204                            artist: artist.to_string(),
205                            playcount,
206                            timestamp,
207                            album: None, // JSON parsing doesn't have album info
208                        };
209                        tracks.push(track);
210                    }
211                    if tracks.len() >= 50 {
212                        break;
213                    }
214                }
215            }
216        }
217
218        // Strategy 2: Parse tracks from hidden form inputs (for tracks like "Comes a Time - 2016")
219        if tracks.len() < 50 {
220            let form_input_selector = Selector::parse("input[name='track']").unwrap();
221            for input in document.select(&form_input_selector) {
222                if let Some(track_name) = input.value().attr("value") {
223                    if !track_name.is_empty() && !seen_tracks.contains(track_name) {
224                        seen_tracks.insert(track_name.to_string());
225
226                        let playcount = self
227                            .find_playcount_for_track(document, track_name)
228                            .unwrap_or(0);
229                        let timestamp = self.find_timestamp_for_track(document, track_name);
230                        let track = Track {
231                            name: track_name.to_string(),
232                            artist: artist.to_string(),
233                            playcount,
234                            timestamp,
235                            album: None, // Form parsing doesn't have album info
236                        };
237                        tracks.push(track);
238                        if tracks.len() >= 50 {
239                            break;
240                        }
241                    }
242                }
243            }
244        }
245
246        // Strategy 3: Fallback to table parsing method if we didn't find enough tracks
247        if tracks.len() < 10 {
248            let table_tracks = self.parse_tracks_from_rows(document, artist)?;
249            for track in table_tracks {
250                if !seen_tracks.contains(&track.name) && tracks.len() < 50 {
251                    seen_tracks.insert(track.name.clone());
252                    tracks.push(track);
253                }
254            }
255        }
256
257        log::debug!("Successfully extracted {} unique tracks", tracks.len());
258        Ok(tracks)
259    }
260
261    /// Parse tracks from chartlist table rows
262    fn parse_tracks_from_rows(&self, document: &Html, artist: &str) -> Result<Vec<Track>> {
263        let mut tracks = Vec::new();
264        let table_selector = Selector::parse("table.chartlist").unwrap();
265        let row_selector = Selector::parse("tbody tr").unwrap();
266
267        for table in document.select(&table_selector) {
268            for row in table.select(&row_selector) {
269                if let Ok(mut track) = self.parse_track_row(&row) {
270                    track.artist = artist.to_string(); // Fill in artist name
271                    tracks.push(track);
272                }
273            }
274        }
275        Ok(tracks)
276    }
277
278    /// Parse a single track row from chartlist table
279    pub fn parse_track_row(&self, row: &scraper::ElementRef) -> Result<Track> {
280        // Extract track name using shared method
281        let name = self.extract_name_from_row(row, "track")?;
282
283        // Parse play count using shared method
284        let playcount = self.extract_playcount_from_row(row);
285
286        let artist = "".to_string(); // Will be filled in by caller
287
288        Ok(Track {
289            name,
290            artist,
291            playcount,
292            timestamp: None, // Not available in table parsing mode
293            album: None,     // Not available in table parsing mode
294        })
295    }
296
297    /// Parse albums page into AlbumPage structure
298    pub fn parse_albums_page(
299        &self,
300        document: &Html,
301        page_number: u32,
302        artist: &str,
303    ) -> Result<AlbumPage> {
304        let mut albums = Vec::new();
305
306        // Try parsing album data from data attributes (AJAX response)
307        let album_selector = Selector::parse("[data-album-name]").unwrap();
308        let album_elements: Vec<_> = document.select(&album_selector).collect();
309
310        if !album_elements.is_empty() {
311            log::debug!(
312                "Found {} album elements with data-album-name",
313                album_elements.len()
314            );
315
316            // Use a set to track unique albums
317            let mut seen_albums = std::collections::HashSet::new();
318
319            for element in album_elements {
320                let album_name = element.value().attr("data-album-name").unwrap_or("");
321                if !album_name.is_empty() && !seen_albums.contains(album_name) {
322                    seen_albums.insert(album_name.to_string());
323
324                    if let Ok(playcount) = self.find_playcount_for_album(document, album_name) {
325                        let timestamp = self.find_timestamp_for_album(document, album_name);
326                        let album = Album {
327                            name: album_name.to_string(),
328                            artist: artist.to_string(),
329                            playcount,
330                            timestamp,
331                        };
332                        albums.push(album);
333                    }
334
335                    if albums.len() >= 50 {
336                        break;
337                    }
338                }
339            }
340        } else {
341            // Fall back to parsing album rows from chartlist tables
342            albums = self.parse_albums_from_rows(document, artist)?;
343        }
344
345        let (has_next_page, total_pages) = self.parse_pagination(document, page_number)?;
346
347        Ok(AlbumPage {
348            albums,
349            page_number,
350            has_next_page,
351            total_pages,
352        })
353    }
354
355    /// Parse albums from chartlist table rows
356    fn parse_albums_from_rows(&self, document: &Html, artist: &str) -> Result<Vec<Album>> {
357        let mut albums = Vec::new();
358        let table_selector = Selector::parse("table.chartlist").unwrap();
359        let row_selector = Selector::parse("tbody tr").unwrap();
360
361        for table in document.select(&table_selector) {
362            for row in table.select(&row_selector) {
363                if let Ok(mut album) = self.parse_album_row(&row) {
364                    album.artist = artist.to_string();
365                    albums.push(album);
366                }
367            }
368        }
369        Ok(albums)
370    }
371
372    /// Parse a single album row from chartlist table
373    pub fn parse_album_row(&self, row: &scraper::ElementRef) -> Result<Album> {
374        // Extract album name using shared method
375        let name = self.extract_name_from_row(row, "album")?;
376
377        // Parse play count using shared method
378        let playcount = self.extract_playcount_from_row(row);
379
380        let artist = "".to_string(); // Will be filled in by caller
381
382        Ok(Album {
383            name,
384            artist,
385            playcount,
386            timestamp: None, // Not available in table parsing
387        })
388    }
389
390    // === SHARED PARSING UTILITIES ===
391
392    /// Extract name from chartlist row (works for both tracks and albums)
393    fn extract_name_from_row(&self, row: &scraper::ElementRef, item_type: &str) -> Result<String> {
394        let name_selector = Selector::parse(".chartlist-name a").unwrap();
395        let name = row
396            .select(&name_selector)
397            .next()
398            .map(|el| el.text().collect::<String>().trim().to_string())
399            .ok_or_else(|| LastFmError::Parse(format!("Missing {item_type} name")))?;
400        Ok(name)
401    }
402
403    /// Extract playcount from chartlist row (works for both tracks and albums)
404    fn extract_playcount_from_row(&self, row: &scraper::ElementRef) -> u32 {
405        let playcount_selector = Selector::parse(".chartlist-count-bar-value").unwrap();
406        let mut playcount = 1; // default fallback
407
408        if let Some(element) = row.select(&playcount_selector).next() {
409            let text = element.text().collect::<String>().trim().to_string();
410            // Extract just the number part (before "scrobbles" if present)
411            if let Some(number_part) = text.split_whitespace().next() {
412                if let Ok(count) = number_part.parse::<u32>() {
413                    playcount = count;
414                }
415            }
416        }
417        playcount
418    }
419
420    /// Parse pagination information from document
421    pub fn parse_pagination(
422        &self,
423        document: &Html,
424        _current_page: u32,
425    ) -> Result<(bool, Option<u32>)> {
426        let pagination_selector = Selector::parse(".pagination-list").unwrap();
427
428        if let Some(pagination) = document.select(&pagination_selector).next() {
429            // Try multiple possible selectors for next page link
430            let next_selectors = [
431                "a[aria-label=\"Next\"]",
432                ".pagination-next a",
433                "a:contains(\"Next\")",
434                ".next a",
435            ];
436
437            let mut has_next = false;
438            for selector_str in &next_selectors {
439                if let Ok(selector) = Selector::parse(selector_str) {
440                    if pagination.select(&selector).next().is_some() {
441                        has_next = true;
442                        break;
443                    }
444                }
445            }
446
447            // Try to extract total pages from pagination text
448            let total_pages = self.extract_total_pages_from_pagination(&pagination);
449
450            Ok((has_next, total_pages))
451        } else {
452            // No pagination found - single page
453            Ok((false, Some(1)))
454        }
455    }
456
457    /// Helper functions for pagination parsing
458    fn extract_total_pages_from_pagination(&self, pagination: &scraper::ElementRef) -> Option<u32> {
459        // Look for patterns like "Page 1 of 42"
460        let text = pagination.text().collect::<String>();
461        if let Some(of_pos) = text.find(" of ") {
462            let after_of = &text[of_pos + 4..];
463            if let Some(number_end) = after_of.find(|c: char| !c.is_ascii_digit()) {
464                if let Ok(total) = after_of[..number_end].parse::<u32>() {
465                    return Some(total);
466                }
467            } else if let Ok(total) = after_of.trim().parse::<u32>() {
468                return Some(total);
469            }
470        }
471        None
472    }
473
474    // === JSON PARSING METHODS ===
475
476    fn parse_json_tracks_page(
477        &self,
478        _document: &Html,
479        _page: u32,
480        _artist: &str,
481    ) -> Result<TrackPage> {
482        // JSON parsing not implemented - return error to trigger fallback
483        Err(crate::LastFmError::Parse(
484            "JSON parsing not implemented".to_string(),
485        ))
486    }
487
488    // === FIND HELPER METHODS ===
489
490    pub fn find_timestamp_for_track(&self, _document: &Html, _track_name: &str) -> Option<u64> {
491        // Implementation would search for timestamp data
492        None
493    }
494
495    pub fn find_playcount_for_track(&self, document: &Html, track_name: &str) -> Result<u32> {
496        // Look for chartlist-count-bar-value elements near the track
497        let count_selector = Selector::parse(".chartlist-count-bar-value").unwrap();
498        let link_selector = Selector::parse("a[href*=\"/music/\"]").unwrap();
499
500        // Find all track links that match our track name
501        for link in document.select(&link_selector) {
502            let link_text = link.text().collect::<String>().trim().to_string();
503            if link_text == track_name {
504                if let Some(row) = self.find_ancestor_row(link) {
505                    if let Some(count_element) = row.select(&count_selector).next() {
506                        let text = count_element.text().collect::<String>().trim().to_string();
507                        if let Some(number_part) = text.split_whitespace().next() {
508                            if let Ok(count) = number_part.parse::<u32>() {
509                                return Ok(count);
510                            }
511                        }
512                    }
513                }
514            }
515        }
516        Err(LastFmError::Parse(format!(
517            "Could not find playcount for track: {track_name}"
518        )))
519    }
520
521    pub fn find_timestamp_for_album(&self, _document: &Html, _album_name: &str) -> Option<u64> {
522        // Implementation would search for timestamp data
523        None
524    }
525
526    pub fn find_playcount_for_album(&self, document: &Html, album_name: &str) -> Result<u32> {
527        // Look for chartlist-count-bar-value elements near the album
528        let count_selector = Selector::parse(".chartlist-count-bar-value").unwrap();
529        let link_selector = Selector::parse("a[href*=\"/music/\"]").unwrap();
530
531        // Find all album links that match our album name
532        for link in document.select(&link_selector) {
533            let link_text = link.text().collect::<String>().trim().to_string();
534            if link_text == album_name {
535                if let Some(row) = self.find_ancestor_row(link) {
536                    if let Some(count_element) = row.select(&count_selector).next() {
537                        let text = count_element.text().collect::<String>().trim().to_string();
538                        if let Some(number_part) = text.split_whitespace().next() {
539                            if let Ok(count) = number_part.parse::<u32>() {
540                                return Ok(count);
541                            }
542                        }
543                    }
544                }
545            }
546        }
547        Err(LastFmError::Parse(format!(
548            "Could not find playcount for album: {album_name}"
549        )))
550    }
551
552    pub fn find_ancestor_row<'a>(
553        &self,
554        element: scraper::ElementRef<'a>,
555    ) -> Option<scraper::ElementRef<'a>> {
556        let mut current = element;
557        while let Some(parent) = current.parent() {
558            if let Some(parent_elem) = scraper::ElementRef::wrap(parent) {
559                if parent_elem.value().name() == "tr" {
560                    return Some(parent_elem);
561                }
562                current = parent_elem;
563            } else {
564                break;
565            }
566        }
567        None
568    }
569}
570
571impl Default for LastFmParser {
572    fn default() -> Self {
573        Self::new()
574    }
575}