ug_scraper/
tab_scraper.rs

1// UG-Scraper - A basic rust API for getting data from Ultimate Guitar
2// Copyright (C) 2025  Linus Tibert
3//
4// This program was originally published under the MIT licence as seen
5// here: https://github.com/Lich-Corals/ug-tab-scraper-rs/blob/mistress/LICENCE
6
7use crate::types::*;
8use crate::network::*;
9use crate::error::UGError;
10use regex::Regex;
11use std::str::FromStr;
12
// The markers and patterns below contain double quotes, so they are written as
// `r#"…"#` raw strings; an unescaped `"` inside a plain `"…"` or `r"…"` literal
// would terminate it early and the file would not compile.
//
// NOTE(review): if `get_raw_html` returns the page with its embedded JSON still
// HTML-attribute-escaped, every `"` in these literals has to be `&quot;` instead.
// Confirm against a real response before relying on them.

/// Text that directly follows the tab content in the page data.
/// Used by `get_tab_lines` and `validate_html`.
const END_OF_CHORDS_DELIM: &str = r#"","revision_id":"#;
/// Text that directly precedes the tab content in the page data.
/// Used by `get_tab_lines` and `validate_html`.
const START_OF_CHORDS_DELIM: &str = r#"":{"wiki_tab":{"content":""#;
/// Substrings which make `validate_html` reject a page.
const HTML_BLACKLIST: [&str; 1] = [r#""type":"Video""#];
/// Pattern `validate_link` searches for in a URL.
const VALID_LINK_REGEX: &str = r"http[s]*:\/\/[www.]*[tabs.]*ultimate-guitar.com\/tab\/[\S]+";
/// Pattern used by `extract_metadata`.
/// Groups: 1 = capo, 2 = tonality, 3 = tuning name, 4 = tuning value.
const METADATA_REGEX: &str = r#""adsupp_binary_blocked":null,"meta":\{["capo":]*(\d*)[,]*"[tonality":"]*(\w*)[","]*tuning":\{"name":"([^:]*)","value":"([^:]*)","#;
/// Pattern used by `get_basic_metadata`.
/// Groups: 1 = tab id, 2 = song id, 3 = song name, 4 = artist name, 5 = type.
const BASIC_DATA_REGEX: &str = r#"tab":\{"id":(\d+),"song_id":(\d+),"song_name":"([^:]+)","artist_id":\d+,"artist_name":"([^:]+)","type":"([\w\s]+)","part":"#;
19
20/// Gets as much data about a tab as possible.
21/// 
22/// ## Arguments
23/// * `url`: The URL to the tab
24/// * `replace_german_names`: Wether to replace german chord names like `H` with `B`
25///     * View [`crate::types::Line::replace_german_names`] for more information.
26/// 
27/// ## Example: 
28/// ```
29/// use ug_scraper::tab_scraper::get_song_data;
30/// 
31/// // Returns a wrapped Song object with associated data and replaced german chords
32/// get_song_data("https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741", true);
33/// ```
34/// 
35/// ## Possible errors
36/// * `ureq::Error::*`
37/// * [`crate::error::UGError`]
38///     * `InvalidHTMLError`
39///     * `InvalidURLError`
40///     * `NoBasicDataMatchError`
41///     * `UnexpectedWebResultError`
42pub fn get_song_data(url: &str, replace_german_names: bool) -> Result<Song, Box<dyn std::error::Error>> {
43        let raw_html: String;
44        match get_raw_html(url) {
45                Ok(s) => raw_html = s,
46                Err(e) => return Err(e.into()),
47        }
48        let song_lines: Vec<Line> = get_tab_lines(&raw_html, replace_german_names)?;
49        let song_metadata: Option<SongMetaData>;
50        let basic_song_data: BasicSongData;
51        match get_basic_metadata(&raw_html, url) {
52                Ok(d) => {
53                        song_metadata = extract_metadata(&raw_html);
54                        basic_song_data = d;
55                }
56                Err(e) => return Err(e.into())
57        }
58        let song: Song = Song { lines: song_lines, metadata: song_metadata, basic_data: basic_song_data };
59        Ok(song)
60}
61
62/// Get the basic metadata about a tab from valid HTML
63/// 
64/// ## Arguments
65/// * `raw_html`: the raw HTML of a supported UG tab page
66/// * `tab_link`: the link to the page
67/// 
68/// ## Example:
69/// ```
70/// use ug_scraper::tab_scraper::get_basic_metadata;
71/// use ug_scraper::network::get_raw_html;
72/// 
73/// let url: &str = "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741";
74/// let raw_html: &str = &get_raw_html(url).unwrap();
75/// let basic_data = get_basic_metadata(raw_html, url).unwrap();
76/// // Returns:
77/// // BasicSongData { title: "Never Gonna Give You Up",
78/// //                 artist: "Rick Astley",
79/// //                 tab_link: "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741",
80/// //                 song_id: 196324,
81/// //                 tab_id: 521741,
82/// //                 data_type: Chords }
83/// ```
84/// 
85/// ## Possible errors
86/// * [`crate::error::UGError`]
87///     * `InvalidHTMLError`
88///     * `InvalidURLError`
89///     * `NoBasicDataMatchError`
90///     * `UnexpectedWebResultError`
91pub fn get_basic_metadata(raw_html: &str, tab_link: &str) -> Result<BasicSongData, UGError> {
92        validate_html(raw_html)?;
93        validate_link(tab_link)?;
94
95        let regex = Regex::new(BASIC_DATA_REGEX).unwrap();
96        let captures = regex.captures(raw_html);
97        if captures.is_some() {
98                let captures = captures.unwrap();
99                let song_type: DataSetType = get_data_type(&captures[5]).unwrap_or(DataSetType::default());
100                let tab_id: u32;
101                match u32::from_str(&captures[1]) {
102                        Ok(i) => tab_id = i,
103                        Err(_e) => return Err(UGError::UnexpectedWebResultError),
104                }
105                let song_id: u32;
106                match u32::from_str(&captures[2]) {
107                        Ok(i) => song_id = i,
108                        Err(_e) => return Err(UGError::UnexpectedWebResultError),
109                }
110                let title = unescape_string(&captures[3]).to_string();
111                let artist = unescape_string(&captures[4]).to_string();
112                let song_basic_meta: BasicSongData = BasicSongData { title: title,
113                        artist: artist,
114                        tab_link: tab_link.to_string(),
115                        song_id: song_id,
116                        tab_id: tab_id,
117                        data_type: song_type };
118                return Ok(song_basic_meta)
119        } else {
120                return Err(UGError::NoBasicDataMatchError)
121        }
122}
123
124/// Get a `Vec` with the lines of a tab
125/// 
126/// ## Arguments
127/// * `raw_html`: the raw HTML of a supported UG tab page
128/// * `replace_german_names`: Wether to replace german chord names like `H` with `B`
129///     * View [`crate::types::Line::replace_german_names`] for more information.
130/// 
131/// ## Example:
132/// ```
133/// use ug_scraper::tab_scraper::get_tab_lines;
134/// use ug_scraper::network::get_raw_html;
135/// use ug_scraper::types::Line;
136/// 
137/// let url: &str = "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741";
138/// let raw_html: &str = &get_raw_html(url).unwrap();
139/// 
140/// // Ruturns lines of the tab with german chord names replaced
141/// let lines_vec = get_tab_lines(raw_html, true).unwrap();
142/// ```
143/// 
144/// ## Possible errors
145/// * [`crate::error::UGError::InvalidHTMLError`]
146pub fn get_tab_lines(raw_html: &str, replace_german_names: bool) -> Result<Vec<Line>, UGError> {
147        validate_html(raw_html)?;
148        let string_parts: Vec<&str> = raw_html.split(END_OF_CHORDS_DELIM).collect();
149        let raw_data: &str = string_parts[0].split(START_OF_CHORDS_DELIM).collect::<Vec<&str>>()[1];
150        let formatted_string_lines = unescape_string(raw_data);
151        let lines: Vec<Line> = clean_and_evaluate(formatted_string_lines.lines(), replace_german_names);
152        Ok(lines)
153}
154
155/// Checks if a given URL is leading to a tab on Ultimate Guitar
156/// 
157/// Returns Ok(()) if valid and Err(UGError::InvalidURLError) if invalid.
158pub fn validate_link(url: &str) -> Result<(), UGError> {
159        let regex = Regex::new(VALID_LINK_REGEX).unwrap();
160        let captures = regex.captures(url);
161        match captures {
162                Some(_d) => Ok(()),
163                None => Err(UGError::InvalidURLError),
164        }
165}
166
167fn validate_html(raw_html: &str) -> Result<(), UGError> {
168        for item in HTML_BLACKLIST {
169                if raw_html.contains(item) {
170                        return Err(UGError::InvalidHTMLError)
171                }
172        }
173        if !raw_html.contains(START_OF_CHORDS_DELIM) || !raw_html.contains(END_OF_CHORDS_DELIM) {
174                return Err(UGError::InvalidHTMLError)
175        }
176        Ok(())
177}
178
179fn extract_metadata(raw_html: &str) -> Option<SongMetaData> {
180        let regex = Regex::new(METADATA_REGEX).unwrap();
181        let captures = regex.captures(raw_html);
182        let mut song_metadata: SongMetaData = SongMetaData::default();
183        if captures.is_some() {
184                let captures = captures.unwrap();
185                let mut capture_options: [Option<String>; 4] = [Some(captures[1].to_string()), 
186                        Some(captures[2].to_string()), 
187                        Some(captures[3].to_string()), 
188                        Some(captures[4].to_string())];
189                for i in 0..4 {
190                        if capture_options[i].clone().unwrap().is_empty() {
191                                capture_options[i] = None;
192                        }
193                        match i {
194                                0 => song_metadata.capo = capture_options[i].clone(),
195                                1 => song_metadata.tonality = capture_options[i].clone(),
196                                2 => song_metadata.tuning_name = capture_options[i].clone(),
197                                3 => song_metadata.tuning = capture_options[i].clone(),
198                                _ => (),
199                        }
200                }                
201        } else {
202                return None
203        }
204        return Some(song_metadata)
205}
206
207fn clean_and_evaluate(lines: std::str::Lines<'_>, replace_german_names: bool) -> Vec<Line> {
208        let mut clean_lines: Vec<Line> = Vec::new();
209        for line in lines {
210                let mut line_type: DataType = DataType::Lyric;
211                if line.contains("[ch]") {
212                        line_type = DataType::Chord;
213                }
214                let mut clean_line: String = String::from(line);
215                for key in ["[ch]", "[/ch]", "[tab]", "[/tab]"] {
216                        clean_line = clean_line.replace(key, "")
217                }
218                if clean_line.contains("[") && clean_line.contains("]") {
219                        line_type = DataType::SectionTitle;
220                }
221                let mut line = Line {line_type: line_type, text_data: clean_line};
222                if replace_german_names {
223                        line = line.replace_german_names();
224                }
225                clean_lines.push(line);
226        }
227        clean_lines
228}
229
// Every test below that calls `get_raw_html` does so with live Ultimate Guitar
// (or YouTube) URLs; the expected values mirror what those pages contained when
// the tests were written.
#[cfg(test)]
mod tests {
        use super::*;

        /// Tab lines can be extracted from pages that are expected to be valid.
        #[test]
        fn get_lines_of_tab() {
                let tabs_to_get = ["https://tabs.ultimate-guitar.com/tab/367279", 
                        "https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549"];
                for tab in tabs_to_get {
                        println!("Getting tab: {}", tab);
                        assert!(!matches!(get_tab_lines(&get_raw_html(tab).unwrap(), true), Err(UGError::InvalidHTMLError)));
                }
        }

        /// A full tab URL is accepted, a URL without scheme is rejected.
        #[test]
        fn tab_link_validation() {
                assert_eq!(validate_link("https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741"), Ok(()));
                assert_ne!(validate_link("tabs.ultimate-guitar.com/tab/refused/rather-be-dead-power-595658"), Ok(()));
        }

        /// The data set type reported for a page matches the kind of tab behind the URL.
        #[test]
        fn type_detection() {
                let type_detection_checks: Vec<(DataSetType, &str)> = vec![(DataSetType::Chords, "https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549"),
                        (DataSetType::Chords, "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741"),
                        (DataSetType::Bass, "https://tabs.ultimate-guitar.com/tab/bloc-party/this-modern-love-bass-180218"),
                        (DataSetType::Tab, "https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488"),
                        (DataSetType::Ukulele, "https://tabs.ultimate-guitar.com/tab/olli-schulz/wenn-es-gut-ist-ukulele-1381967"),
                        (DataSetType::Drums, "https://tabs.ultimate-guitar.com/tab/phil-collins/in-the-air-tonight-drums-880599"),
                        (DataSetType::Bass, "https://tabs.ultimate-guitar.com/tab/pink-floyd/empty-spaces-bass-147995")];
                for check in type_detection_checks {
                        println!("Testing valid url: {}", check.1);
                        assert_eq!(get_basic_metadata(&get_raw_html(check.1).unwrap(), check.1).unwrap().data_type, check.0);
                }
        }

        /// Supported pages pass `validate_html`, unsupported ones are rejected.
        #[test]
        fn validate_page_contents() {
                let valid_page_urls = vec!["https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549",
                        "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741",
                        "https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488",
                        "https://tabs.ultimate-guitar.com/tab/olli-schulz/wenn-es-gut-ist-ukulele-1381967",
                        "https://tabs.ultimate-guitar.com/tab/phil-collins/in-the-air-tonight-drums-880599",
                        "https://tabs.ultimate-guitar.com/tab/blink-182/feeling-this-bass-104175",
                        "https://tabs.ultimate-guitar.com/tab/pink-floyd/empty-spaces-bass-147995",
                        "https://tabs.ultimate-guitar.com/tab/367279"];
                for valid_page_url in valid_page_urls {
                        println!("Testing valid url: {}", valid_page_url);
                        assert!(!matches!(validate_html(&get_raw_html(valid_page_url).unwrap()), Err(UGError::InvalidHTMLError)));
                }

                let invalid_page_urls = vec!["https://tabs.ultimate-guitar.com/tab/refused/i-wanna-watch-the-world-burn-guitar-pro-5868920", 
                        "https://tabs.ultimate-guitar.com/tab/refused/rather-be-dead-power-595658", 
                        "https://tabs.ultimate-guitar.com/tab/the-beatles/let-it-be-video-781202",
                        "https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=RDdQw4w9WgXcQ"];
                for invalid_page_url in invalid_page_urls {
                        println!("Testing invalid url: {}", invalid_page_url);
                        assert!(matches!(validate_html(&get_raw_html(invalid_page_url).unwrap()), Err(UGError::InvalidHTMLError)));
                }
        }

        /// Title, artist, song id and tab id are read correctly from a set of pages.
        #[test]
        fn get_basic_data() {
                let test_sets: Vec<(&str, &str, &str, u32, u32)> = vec![("https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549",
                                "Dont Stop Me Now", "Queen", 15591, 519549),
                        ("https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741",
                                "Never Gonna Give You Up", "Rick Astley", 196324, 521741),
                        ("https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488",
                                "Stairway To Heaven", "Led Zeppelin", 31683, 9488),
                        ("https://tabs.ultimate-guitar.com/tab/olli-schulz/wenn-es-gut-ist-ukulele-1381967",
                                "Wenn Es Gut Ist", "Olli Schulz", 317511, 1381967),
                        ("https://tabs.ultimate-guitar.com/tab/phil-collins/in-the-air-tonight-drums-880599",
                                "In The Air Tonight", "Phil Collins", 138587, 880599),
                        ("https://tabs.ultimate-guitar.com/tab/blink-182/feeling-this-bass-104175",
                                "Feeling This", "Blink-182", 54209, 104175), // The title is actually wrong in the UG metadata. This is not a bug!
                        ("https://tabs.ultimate-guitar.com/tab/pink-floyd/empty-spaces-bass-147995",
                                "Empty Spaces", "Pink Floyd", 17357, 147995),
                        ("https://tabs.ultimate-guitar.com/tab/367279",
                                "Zu Spät", "Die Ärzte", 1577513, 367279)];

                for set in test_sets {
                        let result = get_basic_metadata(&get_raw_html(set.0).unwrap(), set.0).unwrap();
                        assert_eq!(result.title, set.1);
                        assert_eq!(result.artist, set.2);
                        assert_eq!(result.song_id, set.3);
                        assert_eq!(result.tab_id, set.4)
                }
        }

        /// The optional metadata matches the expectation, including pages without any.
        #[test]
        fn get_metadata() {
                let url_metadata_sets: Vec<(Option<SongMetaData>, &str)> = vec![(Some(SongMetaData { 
                                capo: Some(String::from("3")), 
                                tonality: None, 
                                tuning_name: Some(String::from("G C E A")), 
                                tuning: Some(String::from("G C E A")) }), "https://tabs.ultimate-guitar.com/tab/olli-schulz/wenn-es-gut-ist-ukulele-1381967"),
                        (None, "https://tabs.ultimate-guitar.com/tab/pink-floyd/empty-spaces-bass-147995"),
                        (None, "https://tabs.ultimate-guitar.com/tab/phil-collins/in-the-air-tonight-drums-880599"),
                        (Some(SongMetaData { capo: Some(String::from("1")), 
                                tonality: None, 
                                tuning_name: Some(String::from("Standard")), 
                                tuning: Some(String::from("E A D G B E")) }), "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741"),
                        (Some(SongMetaData { capo: None, 
                                tonality: Some(String::from("F")), 
                                tuning_name: Some(String::from("Standard")), 
                                tuning: Some(String::from("E A D G B E")) }), "https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549"),];
                for (expected, url) in url_metadata_sets {
                        println!("Testing url: {}", url);
                        // Comparing the whole `Option` covers all four combinations of
                        // found/not found and expected/not expected with a readable diff.
                        assert_eq!(extract_metadata(&get_raw_html(url).unwrap()), expected);
                }
        }
}