lastfm_edit/
parsing.rs

1//! HTML parsing utilities for Last.fm pages.
2//!
3//! This module contains all the HTML parsing logic for extracting track, album,
4//! and other data from Last.fm web pages. These functions are primarily pure
5//! functions that take HTML documents and return structured data.
6
7use crate::{Album, AlbumPage, LastFmError, Result, Track, TrackPage};
8use scraper::{Html, Selector};
9
/// Stateless parser for Last.fm HTML pages.
///
/// This struct groups the HTML parsing logic that was previously embedded in the
/// client. It is a unit struct with no fields, so creating or cloning it carries no
/// state; its methods work on the document or row handed to them.
#[derive(Debug, Clone)]
pub struct LastFmParser;
16
17impl LastFmParser {
18    /// Create a new parser instance.
19    pub fn new() -> Self {
20        Self
21    }
22
23    /// Parse recent scrobbles from the user's library page
24    /// This extracts real scrobble data with timestamps for editing
25    pub fn parse_recent_scrobbles(&self, document: &Html) -> Result<Vec<Track>> {
26        let mut tracks = Vec::new();
27
28        // Recent scrobbles are typically in chartlist tables - there can be multiple
29        let table_selector = Selector::parse("table.chartlist").unwrap();
30        let row_selector = Selector::parse("tbody tr").unwrap();
31
32        let tables: Vec<_> = document.select(&table_selector).collect();
33        log::debug!("Found {} chartlist tables", tables.len());
34
35        for table in tables {
36            for row in table.select(&row_selector) {
37                if let Ok(track) = self.parse_recent_scrobble_row(&row) {
38                    tracks.push(track);
39                }
40            }
41        }
42
43        if tracks.is_empty() {
44            log::debug!("No tracks found in recent scrobbles");
45        }
46
47        log::debug!("Parsed {} recent scrobbles", tracks.len());
48        Ok(tracks)
49    }
50
51    /// Parse a single row from the recent scrobbles table
52    fn parse_recent_scrobble_row(&self, row: &scraper::ElementRef) -> Result<Track> {
53        // Extract track name
54        let name_selector = Selector::parse(".chartlist-name a").unwrap();
55        let name = row
56            .select(&name_selector)
57            .next()
58            .ok_or(LastFmError::Parse("Missing track name".to_string()))?
59            .text()
60            .collect::<String>()
61            .trim()
62            .to_string();
63
64        // Extract artist name
65        let artist_selector = Selector::parse(".chartlist-artist a").unwrap();
66        let artist = row
67            .select(&artist_selector)
68            .next()
69            .ok_or(LastFmError::Parse("Missing artist name".to_string()))?
70            .text()
71            .collect::<String>()
72            .trim()
73            .to_string();
74
75        // Extract timestamp from data attributes or hidden inputs
76        let timestamp = self.extract_scrobble_timestamp(row);
77
78        // Extract album from hidden inputs in edit form
79        let album = self.extract_scrobble_album(row);
80
81        // Extract album artist from hidden inputs in edit form
82        let album_artist = self.extract_scrobble_album_artist(row);
83
84        // For recent scrobbles, playcount is typically 1 since they're individual scrobbles
85        let playcount = 1;
86
87        Ok(Track {
88            name,
89            artist,
90            playcount,
91            timestamp,
92            album,
93            album_artist,
94        })
95    }
96
97    /// Extract timestamp from scrobble row elements
98    fn extract_scrobble_timestamp(&self, row: &scraper::ElementRef) -> Option<u64> {
99        // Look for timestamp in various places:
100
101        // 1. Check for data-timestamp attribute
102        if let Some(timestamp_str) = row.value().attr("data-timestamp") {
103            if let Ok(timestamp) = timestamp_str.parse::<u64>() {
104                return Some(timestamp);
105            }
106        }
107
108        // 2. Look for hidden timestamp input
109        let timestamp_input_selector = Selector::parse("input[name='timestamp']").unwrap();
110        if let Some(input) = row.select(&timestamp_input_selector).next() {
111            if let Some(value) = input.value().attr("value") {
112                if let Ok(timestamp) = value.parse::<u64>() {
113                    return Some(timestamp);
114                }
115            }
116        }
117
118        // 3. Look for edit form with timestamp
119        let edit_form_selector =
120            Selector::parse("form[data-edit-scrobble] input[name='timestamp']").unwrap();
121        if let Some(timestamp_input) = row.select(&edit_form_selector).next() {
122            if let Some(value) = timestamp_input.value().attr("value") {
123                if let Ok(timestamp) = value.parse::<u64>() {
124                    return Some(timestamp);
125                }
126            }
127        }
128
129        // 4. Look for time element with datetime attribute
130        let time_selector = Selector::parse("time").unwrap();
131        if let Some(time_elem) = row.select(&time_selector).next() {
132            if let Some(datetime) = time_elem.value().attr("datetime") {
133                // Parse ISO datetime to timestamp
134                if let Ok(parsed_time) = chrono::DateTime::parse_from_rfc3339(datetime) {
135                    return Some(parsed_time.timestamp() as u64);
136                }
137            }
138        }
139
140        None
141    }
142
143    /// Extract album name from scrobble row elements
144    fn extract_scrobble_album(&self, row: &scraper::ElementRef) -> Option<String> {
145        // Look for album_name in hidden inputs within edit forms
146        let album_input_selector =
147            Selector::parse("form[data-edit-scrobble] input[name='album_name']").unwrap();
148
149        if let Some(album_input) = row.select(&album_input_selector).next() {
150            if let Some(album_name) = album_input.value().attr("value") {
151                if !album_name.is_empty() {
152                    return Some(album_name.to_string());
153                }
154            }
155        }
156
157        None
158    }
159
160    /// Extract album artist name from scrobble row elements
161    fn extract_scrobble_album_artist(&self, row: &scraper::ElementRef) -> Option<String> {
162        // Look for album_artist_name in hidden inputs within edit forms
163        let album_artist_input_selector =
164            Selector::parse("form[data-edit-scrobble] input[name='album_artist_name']").unwrap();
165
166        if let Some(album_artist_input) = row.select(&album_artist_input_selector).next() {
167            if let Some(album_artist_name) = album_artist_input.value().attr("value") {
168                if !album_artist_name.is_empty() {
169                    return Some(album_artist_name.to_string());
170                }
171            }
172        }
173
174        None
175    }
176
177    /// Parse a tracks page into a `TrackPage` structure
178    pub fn parse_tracks_page(
179        &self,
180        document: &Html,
181        page_number: u32,
182        artist: &str,
183        album: Option<&str>,
184    ) -> Result<TrackPage> {
185        let tracks = self.extract_tracks_from_document(document, artist, album)?;
186
187        // Check for pagination
188        let (has_next_page, total_pages) = self.parse_pagination(document, page_number)?;
189
190        Ok(TrackPage {
191            tracks,
192            page_number,
193            has_next_page,
194            total_pages,
195        })
196    }
197
198    /// Extract tracks from HTML document
199    pub fn extract_tracks_from_document(
200        &self,
201        document: &Html,
202        artist: &str,
203        album: Option<&str>,
204    ) -> Result<Vec<Track>> {
205        let mut tracks = Vec::new();
206        let mut seen_tracks = std::collections::HashSet::new();
207
208        // Try JSON-embedded data first
209        if let Ok(json_tracks) = self.parse_json_tracks_page(document, 1, artist, album) {
210            return Ok(json_tracks.tracks);
211        }
212
213        // Strategy 1: Try parsing track data from data-track-name attributes (AJAX response)
214        let track_selector = Selector::parse("[data-track-name]").unwrap();
215        let track_elements: Vec<_> = document.select(&track_selector).collect();
216
217        if !track_elements.is_empty() {
218            for element in track_elements {
219                let track_name = element.value().attr("data-track-name").unwrap_or("");
220                if !track_name.is_empty() && !seen_tracks.contains(track_name) {
221                    seen_tracks.insert(track_name.to_string());
222
223                    if let Ok(playcount) = self.find_playcount_for_track(document, track_name) {
224                        let timestamp = self.find_timestamp_for_track(document, track_name);
225                        let track = Track {
226                            name: track_name.to_string(),
227                            artist: artist.to_string(),
228                            playcount,
229                            timestamp,
230                            album: album.map(|a| a.to_string()),
231                            album_artist: None, // Not available in aggregate track listings
232                        };
233                        tracks.push(track);
234                    }
235                    if tracks.len() >= 50 {
236                        break;
237                    }
238                }
239            }
240        }
241
242        // Strategy 2: Parse tracks from hidden form inputs (for tracks like "Comes a Time - 2016")
243        if tracks.len() < 50 {
244            let form_input_selector = Selector::parse("input[name='track']").unwrap();
245            for input in document.select(&form_input_selector) {
246                if let Some(track_name) = input.value().attr("value") {
247                    if !track_name.is_empty() && !seen_tracks.contains(track_name) {
248                        seen_tracks.insert(track_name.to_string());
249
250                        let playcount = self
251                            .find_playcount_for_track(document, track_name)
252                            .unwrap_or(0);
253                        let timestamp = self.find_timestamp_for_track(document, track_name);
254                        let track = Track {
255                            name: track_name.to_string(),
256                            artist: artist.to_string(),
257                            playcount,
258                            timestamp,
259                            album: album.map(|a| a.to_string()),
260                            album_artist: None, // Not available in form input parsing
261                        };
262                        tracks.push(track);
263                        if tracks.len() >= 50 {
264                            break;
265                        }
266                    }
267                }
268            }
269        }
270
271        // Strategy 3: Fallback to table parsing method if we didn't find enough tracks
272        if tracks.len() < 10 {
273            let table_tracks = self.parse_tracks_from_rows(document, artist, album)?;
274            for track in table_tracks {
275                if !seen_tracks.contains(&track.name) && tracks.len() < 50 {
276                    seen_tracks.insert(track.name.clone());
277                    tracks.push(track);
278                }
279            }
280        }
281
282        log::debug!("Successfully extracted {} unique tracks", tracks.len());
283        Ok(tracks)
284    }
285
286    /// Parse tracks from chartlist table rows
287    fn parse_tracks_from_rows(
288        &self,
289        document: &Html,
290        artist: &str,
291        album: Option<&str>,
292    ) -> Result<Vec<Track>> {
293        let mut tracks = Vec::new();
294        let table_selector = Selector::parse("table.chartlist").unwrap();
295        let row_selector = Selector::parse("tbody tr").unwrap();
296
297        for table in document.select(&table_selector) {
298            for row in table.select(&row_selector) {
299                if let Ok(mut track) = self.parse_track_row(&row) {
300                    track.artist = artist.to_string(); // Fill in artist name
301                    track.album = album.map(|a| a.to_string()); // Fill in album name
302                    tracks.push(track);
303                }
304            }
305        }
306        Ok(tracks)
307    }
308
309    /// Parse a single track row from chartlist table
310    pub fn parse_track_row(&self, row: &scraper::ElementRef) -> Result<Track> {
311        // Extract track name using shared method
312        let name = self.extract_name_from_row(row, "track")?;
313
314        // Parse play count using shared method
315        let playcount = self.extract_playcount_from_row(row);
316
317        let artist = "".to_string(); // Will be filled in by caller
318
319        Ok(Track {
320            name,
321            artist,
322            playcount,
323            timestamp: None,    // Not available in table parsing mode
324            album: None,        // Not available in table parsing mode
325            album_artist: None, // Not available in table parsing mode
326        })
327    }
328
329    /// Parse albums page into `AlbumPage` structure
330    pub fn parse_albums_page(
331        &self,
332        document: &Html,
333        page_number: u32,
334        artist: &str,
335    ) -> Result<AlbumPage> {
336        let mut albums = Vec::new();
337
338        // Try parsing album data from data attributes (AJAX response)
339        let album_selector = Selector::parse("[data-album-name]").unwrap();
340        let album_elements: Vec<_> = document.select(&album_selector).collect();
341
342        if !album_elements.is_empty() {
343            log::debug!(
344                "Found {} album elements with data-album-name",
345                album_elements.len()
346            );
347
348            // Use a set to track unique albums
349            let mut seen_albums = std::collections::HashSet::new();
350
351            for element in album_elements {
352                let album_name = element.value().attr("data-album-name").unwrap_or("");
353                if !album_name.is_empty() && !seen_albums.contains(album_name) {
354                    seen_albums.insert(album_name.to_string());
355
356                    if let Ok(playcount) = self.find_playcount_for_album(document, album_name) {
357                        let timestamp = self.find_timestamp_for_album(document, album_name);
358                        let album = Album {
359                            name: album_name.to_string(),
360                            artist: artist.to_string(),
361                            playcount,
362                            timestamp,
363                        };
364                        albums.push(album);
365                    }
366
367                    if albums.len() >= 50 {
368                        break;
369                    }
370                }
371            }
372        } else {
373            // Fall back to parsing album rows from chartlist tables
374            albums = self.parse_albums_from_rows(document, artist)?;
375        }
376
377        let (has_next_page, total_pages) = self.parse_pagination(document, page_number)?;
378
379        Ok(AlbumPage {
380            albums,
381            page_number,
382            has_next_page,
383            total_pages,
384        })
385    }
386
387    /// Parse albums from chartlist table rows
388    fn parse_albums_from_rows(&self, document: &Html, artist: &str) -> Result<Vec<Album>> {
389        let mut albums = Vec::new();
390        let table_selector = Selector::parse("table.chartlist").unwrap();
391        let row_selector = Selector::parse("tbody tr").unwrap();
392
393        for table in document.select(&table_selector) {
394            for row in table.select(&row_selector) {
395                if let Ok(mut album) = self.parse_album_row(&row) {
396                    album.artist = artist.to_string();
397                    albums.push(album);
398                }
399            }
400        }
401        Ok(albums)
402    }
403
404    /// Parse a single album row from chartlist table
405    pub fn parse_album_row(&self, row: &scraper::ElementRef) -> Result<Album> {
406        // Extract album name using shared method
407        let name = self.extract_name_from_row(row, "album")?;
408
409        // Parse play count using shared method
410        let playcount = self.extract_playcount_from_row(row);
411
412        let artist = "".to_string(); // Will be filled in by caller
413
414        Ok(Album {
415            name,
416            artist,
417            playcount,
418            timestamp: None, // Not available in table parsing
419        })
420    }
421
422    // === SHARED PARSING UTILITIES ===
423
424    /// Extract name from chartlist row (works for both tracks and albums)
425    fn extract_name_from_row(&self, row: &scraper::ElementRef, item_type: &str) -> Result<String> {
426        let name_selector = Selector::parse(".chartlist-name a").unwrap();
427        let name = row
428            .select(&name_selector)
429            .next()
430            .map(|el| el.text().collect::<String>().trim().to_string())
431            .ok_or_else(|| LastFmError::Parse(format!("Missing {item_type} name")))?;
432        Ok(name)
433    }
434
435    /// Extract playcount from chartlist row (works for both tracks and albums)
436    fn extract_playcount_from_row(&self, row: &scraper::ElementRef) -> u32 {
437        let playcount_selector = Selector::parse(".chartlist-count-bar-value").unwrap();
438        let mut playcount = 1; // default fallback
439
440        if let Some(element) = row.select(&playcount_selector).next() {
441            let text = element.text().collect::<String>().trim().to_string();
442            // Extract just the number part (before "scrobbles" if present)
443            if let Some(number_part) = text.split_whitespace().next() {
444                if let Ok(count) = number_part.parse::<u32>() {
445                    playcount = count;
446                }
447            }
448        }
449        playcount
450    }
451
452    /// Parse pagination information from document
453    pub fn parse_pagination(
454        &self,
455        document: &Html,
456        _current_page: u32,
457    ) -> Result<(bool, Option<u32>)> {
458        let pagination_selector = Selector::parse(".pagination-list").unwrap();
459
460        if let Some(pagination) = document.select(&pagination_selector).next() {
461            // Try multiple possible selectors for next page link
462            let next_selectors = [
463                "a[aria-label=\"Next\"]",
464                ".pagination-next a",
465                "a:contains(\"Next\")",
466                ".next a",
467            ];
468
469            let mut has_next = false;
470            for selector_str in &next_selectors {
471                if let Ok(selector) = Selector::parse(selector_str) {
472                    if pagination.select(&selector).next().is_some() {
473                        has_next = true;
474                        break;
475                    }
476                }
477            }
478
479            // Try to extract total pages from pagination text
480            let total_pages = self.extract_total_pages_from_pagination(&pagination);
481
482            Ok((has_next, total_pages))
483        } else {
484            // No pagination found - single page
485            Ok((false, Some(1)))
486        }
487    }
488
489    /// Helper functions for pagination parsing
490    fn extract_total_pages_from_pagination(&self, pagination: &scraper::ElementRef) -> Option<u32> {
491        // Look for patterns like "Page 1 of 42"
492        let text = pagination.text().collect::<String>();
493        if let Some(of_pos) = text.find(" of ") {
494            let after_of = &text[of_pos + 4..];
495            if let Some(number_end) = after_of.find(|c: char| !c.is_ascii_digit()) {
496                if let Ok(total) = after_of[..number_end].parse::<u32>() {
497                    return Some(total);
498                }
499            } else if let Ok(total) = after_of.trim().parse::<u32>() {
500                return Some(total);
501            }
502        }
503        None
504    }
505
506    // === JSON PARSING METHODS ===
507
508    fn parse_json_tracks_page(
509        &self,
510        _document: &Html,
511        _page: u32,
512        _artist: &str,
513        _album: Option<&str>,
514    ) -> Result<TrackPage> {
515        // JSON parsing not implemented - return error to trigger fallback
516        Err(crate::LastFmError::Parse(
517            "JSON parsing not implemented".to_string(),
518        ))
519    }
520
521    // === FIND HELPER METHODS ===
522
    /// Look up a timestamp for the named track in the document.
    ///
    /// Currently a stub: both arguments are ignored and the result is always `None`.
    pub fn find_timestamp_for_track(&self, _document: &Html, _track_name: &str) -> Option<u64> {
        // Implementation would search for timestamp data
        None
    }
527
528    pub fn find_playcount_for_track(&self, document: &Html, track_name: &str) -> Result<u32> {
529        // Look for chartlist-count-bar-value elements near the track
530        let count_selector = Selector::parse(".chartlist-count-bar-value").unwrap();
531        let link_selector = Selector::parse("a[href*=\"/music/\"]").unwrap();
532
533        // Find all track links that match our track name
534        for link in document.select(&link_selector) {
535            let link_text = link.text().collect::<String>().trim().to_string();
536            if link_text == track_name {
537                if let Some(row) = self.find_ancestor_row(link) {
538                    if let Some(count_element) = row.select(&count_selector).next() {
539                        let text = count_element.text().collect::<String>().trim().to_string();
540                        if let Some(number_part) = text.split_whitespace().next() {
541                            if let Ok(count) = number_part.parse::<u32>() {
542                                return Ok(count);
543                            }
544                        }
545                    }
546                }
547            }
548        }
549        Err(LastFmError::Parse(format!(
550            "Could not find playcount for track: {track_name}"
551        )))
552    }
553
554    pub fn find_timestamp_for_album(&self, _document: &Html, _album_name: &str) -> Option<u64> {
555        // Implementation would search for timestamp data
556        None
557    }
558
559    pub fn find_playcount_for_album(&self, document: &Html, album_name: &str) -> Result<u32> {
560        // Look for chartlist-count-bar-value elements near the album
561        let count_selector = Selector::parse(".chartlist-count-bar-value").unwrap();
562        let link_selector = Selector::parse("a[href*=\"/music/\"]").unwrap();
563
564        // Find all album links that match our album name
565        for link in document.select(&link_selector) {
566            let link_text = link.text().collect::<String>().trim().to_string();
567            if link_text == album_name {
568                if let Some(row) = self.find_ancestor_row(link) {
569                    if let Some(count_element) = row.select(&count_selector).next() {
570                        let text = count_element.text().collect::<String>().trim().to_string();
571                        if let Some(number_part) = text.split_whitespace().next() {
572                            if let Ok(count) = number_part.parse::<u32>() {
573                                return Ok(count);
574                            }
575                        }
576                    }
577                }
578            }
579        }
580        Err(LastFmError::Parse(format!(
581            "Could not find playcount for album: {album_name}"
582        )))
583    }
584
585    pub fn find_ancestor_row<'a>(
586        &self,
587        element: scraper::ElementRef<'a>,
588    ) -> Option<scraper::ElementRef<'a>> {
589        let mut current = element;
590        while let Some(parent) = current.parent() {
591            if let Some(parent_elem) = scraper::ElementRef::wrap(parent) {
592                if parent_elem.value().name() == "tr" {
593                    return Some(parent_elem);
594                }
595                current = parent_elem;
596            } else {
597                break;
598            }
599        }
600        None
601    }
602}
603
604impl Default for LastFmParser {
605    fn default() -> Self {
606        Self::new()
607    }
608}