ug_scraper/
tab_scraper.rs

1// UG-Scraper - A basic rust API for getting data from Ultimate Guitar
2// Copyright (C) 2025  Linus Tibert
3//
4// This program was originally published under the MIT licence as seen
5// here: https://github.com/Lich-Corals/ug-tab-scraper-rs/blob/mistress/LICENCE
6
7use crate::types::*;
8use crate::network::*;
9use crate::error::UGError;
10use regex::Regex;
11use std::str::FromStr;
12
// NOTE(review): the quote characters inside the literals below appear unescaped as displayed
// here; presumably the real file escapes them (for example as HTML entities) — confirm against
// the repository before relying on the exact text of these literals.

// Text that follows the tab content in the page; `get_tab_lines` keeps only what precedes
// its first occurrence.
13const END_OF_CHORDS_DELIM: &str = "","revision_id":";
// Text that precedes the tab content in the page; `get_tab_lines` reads the segment after it.
14const START_OF_CHORDS_DELIM: &str = "":{"wiki_tab":{"content":"";
// Substrings whose presence makes `validate_html` reject a page.
15const HTML_BLACKLIST: [&str; 1] = [""type":"Video""];
// Pattern `validate_link` searches for in a tab URL.
16const VALID_LINK_REGEX: &str = r"http[s]*:\/\/[www.]*[tabs.]*ultimate-guitar.com\/tab\/[\S]+";
// Pattern used by `extract_metadata`; capture groups 1-4 are read as capo, tonality,
// tuning name and tuning value.
17const METADATA_REGEX: &str = r""adsupp_binary_blocked":null,"meta":\{["capo":]*(\d*)[,]*"[tonality":"]*(\w*)[","]*tuning":\{"name":"([^:]*)","value":"([^:]*)",";
// Pattern used by `get_basic_metadata`; capture groups 1-5 are read as tab id, song id,
// song name, artist name and data type.
18const BASIC_DATA_REGEX: &str = r"tab":\{"id":(\d+),"song_id":(\d+),"song_name":"([^:]+)","artist_id":\d+,"artist_name":"([^:]+)","type":"([\w\s]+)","part":";
19
20/// Gets as much data about a tab as possible.
21/// 
22/// ## Arguments
23/// * `url`: The URL to the tab
24/// * `replace_german_names`: Wether to replace german chord names like `H` with `B`
25///     * View [`crate::types::Line::replace_german_names`] for more information.
26/// 
27/// ## Example: 
28/// ```
29/// use ug_scraper::tab_scraper::get_song_data;
30/// 
31/// // Returns a wrapped Song object with associated data and replaced german chords
32/// get_song_data("https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741", true);
33/// ```
34/// 
35/// ## Possible errors
36/// * `ureq::Error::*`
37/// * [`crate::error::UGError`]
38///     * `InvalidHTMLError`
39///     * `InvalidURLError`
40///     * `NoBasicDataMatchError`
41///     * `UnexpectedWebResultError`
42pub fn get_song_data(url: &str, replace_german_names: bool) -> Result<Song, Box<dyn std::error::Error>> {
43        let raw_html: String;
44        match get_raw_html(url) {
45                Ok(s) => raw_html = s,
46                Err(e) => return Err(e.into()),
47        }
48        let song_lines: Vec<Line> = get_tab_lines(&raw_html, replace_german_names)?;
49        let song_metadata: Option<SongMetaData>;
50        let basic_song_data: BasicSongData;
51        match get_basic_metadata(&raw_html, url) {
52                Ok(d) => {
53                        song_metadata = extract_metadata(&raw_html);
54                        basic_song_data = d;
55                }
56                Err(e) => return Err(e.into())
57        }
58        let song: Song = Song { lines: song_lines, metadata: song_metadata, basic_data: basic_song_data };
59        Ok(song)
60}
61
62/// Get the basic metadata about a tab from valid HTML
63/// 
64/// ## Arguments
65/// * `raw_html`: the raw HTML of a supported UG tab page
66/// * `tab_link`: the link to the page
67/// 
68/// ## Example:
69/// ```
70/// use ug_scraper::tab_scraper::get_basic_metadata;
71/// use ug_scraper::network::get_raw_html;
72/// 
73/// let url: &str = "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741";
74/// let raw_html: &str = &get_raw_html(url).unwrap();
75/// let basic_data = get_basic_metadata(raw_html, url).unwrap();
76/// // Returns:
77/// // BasicSongData { title: "Never Gonna Give You Up",
78/// //                 artist: "Rick Astley",
79/// //                 tab_link: "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741",
80/// //                 song_id: 196324,
81/// //                 tab_id: 521741,
82/// //                 data_type: Chords }
83/// ```
84/// 
85/// ## Possible errors
86/// * [`crate::error::UGError`]
87///     * `InvalidHTMLError`
88///     * `InvalidURLError`
89///     * `NoBasicDataMatchError`
90///     * `UnexpectedWebResultError`
91pub fn get_basic_metadata(raw_html: &str, tab_link: &str) -> Result<BasicSongData, UGError> {
92        validate_html(raw_html)?;
93        validate_link(tab_link)?;
94
95        let regex = Regex::new(BASIC_DATA_REGEX).unwrap();
96        let captures = regex.captures(raw_html);
97        if captures.is_some() {
98                let captures = captures.unwrap();
99                let song_type: DataSetType = get_data_type(&captures[5]).unwrap_or(DataSetType::default());
100                let tab_id: u32;
101                match u32::from_str(&captures[1]) {
102                        Ok(i) => tab_id = i,
103                        Err(_e) => return Err(UGError::UnexpectedWebResultError),
104                }
105                let song_id: u32;
106                match u32::from_str(&captures[2]) {
107                        Ok(i) => song_id = i,
108                        Err(_e) => return Err(UGError::UnexpectedWebResultError),
109                }
110                let title = unescape_string(&captures[3]).to_string();
111                let artist = unescape_string(&captures[4]).to_string();
112                let song_basic_meta: BasicSongData = BasicSongData { title: title,
113                        artist: artist,
114                        tab_link: tab_link.to_string(),
115                        song_id: song_id,
116                        tab_id: tab_id,
117                        data_type: song_type };
118                return Ok(song_basic_meta)
119        } else {
120                return Err(UGError::NoBasicDataMatchError)
121        }
122}
123
124/// Get a `Vec` with the lines of a tab
125/// 
126/// ## Arguments
127/// * `raw_html`: the raw HTML of a supported UG tab page
128/// * `replace_german_names`: Wether to replace german chord names like `H` with `B`
129///     * View [`crate::types::Line::replace_german_names`] for more information.
130/// 
131/// ## Example:
132/// ```
133/// use ug_scraper::tab_scraper::get_tab_lines;
134/// use ug_scraper::network::get_raw_html;
135/// use ug_scraper::types::Line;
136/// 
137/// let url: &str = "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741";
138/// let raw_html: &str = &get_raw_html(url).unwrap();
139/// 
140/// // Ruturns lines of the tab with german chord names replaced
141/// let lines_vec = get_tab_lines(raw_html, true).unwrap();
142/// ```
143/// 
144/// ## Possible errors
145/// * [`crate::error::UGError::InvalidHTMLError`]
146pub fn get_tab_lines(raw_html: &str, replace_german_names: bool) -> Result<Vec<Line>, UGError> {
147        validate_html(raw_html)?;
148        let string_parts: Vec<&str> = raw_html.split(END_OF_CHORDS_DELIM).collect();
149        let raw_data: &str = string_parts[0].split(START_OF_CHORDS_DELIM).collect::<Vec<&str>>()[1];
150        let formatted_string_lines = unescape_string(raw_data);
151        let lines: Vec<Line> = clean_and_evaluate(formatted_string_lines.lines(), replace_german_names);
152        Ok(lines)
153}
154
155fn validate_html(raw_html: &str) -> Result<(), UGError> {
156        for item in HTML_BLACKLIST {
157                if raw_html.contains(item) {
158                        return Err(UGError::InvalidHTMLError)
159                }
160        }
161        if !raw_html.contains(START_OF_CHORDS_DELIM) || !raw_html.contains(END_OF_CHORDS_DELIM) {
162                return Err(UGError::InvalidHTMLError)
163        }
164        Ok(())
165}
166
167fn validate_link(url: &str) -> Result<(), UGError> {
168        let regex = Regex::new(VALID_LINK_REGEX).unwrap();
169        let captures = regex.captures(url);
170        match captures {
171                Some(_d) => Ok(()),
172                None => Err(UGError::InvalidURLError),
173        }
174        
175}
176
177fn extract_metadata(raw_html: &str) -> Option<SongMetaData> {
178        let regex = Regex::new(METADATA_REGEX).unwrap();
179        let captures = regex.captures(raw_html);
180        let mut song_metadata: SongMetaData = SongMetaData::default();
181        if captures.is_some() {
182                let captures = captures.unwrap();
183                let mut capture_options: [Option<String>; 4] = [Some(captures[1].to_string()), 
184                        Some(captures[2].to_string()), 
185                        Some(captures[3].to_string()), 
186                        Some(captures[4].to_string())];
187                for i in 0..4 {
188                        if capture_options[i].clone().unwrap().is_empty() {
189                                capture_options[i] = None;
190                        }
191                        match i {
192                                0 => song_metadata.capo = capture_options[i].clone(),
193                                1 => song_metadata.tonality = capture_options[i].clone(),
194                                2 => song_metadata.tuning_name = capture_options[i].clone(),
195                                3 => song_metadata.tuning = capture_options[i].clone(),
196                                _ => (),
197                        }
198                }                
199        } else {
200                return None
201        }
202        return Some(song_metadata)
203}
204
205fn clean_and_evaluate(lines: std::str::Lines<'_>, replace_german_names: bool) -> Vec<Line> {
206        let mut clean_lines: Vec<Line> = Vec::new();
207        for line in lines {
208                let mut line_type: DataType = DataType::Lyric;
209                if line.contains("[ch]") {
210                        line_type = DataType::Chord;
211                }
212                let mut clean_line: String = String::from(line);
213                for key in ["[ch]", "[/ch]", "[tab]", "[/tab]"] {
214                        clean_line = clean_line.replace(key, "")
215                }
216                if clean_line.contains("[") && clean_line.contains("]") {
217                        line_type = DataType::SectionTitle;
218                }
219                let mut line = Line {line_type: line_type, text_data: clean_line};
220                if replace_german_names {
221                        line = line.replace_german_names();
222                }
223                clean_lines.push(line);
224        }
225        clean_lines
226}
227
#[cfg(test)]
mod tests {
    // These tests call `get_raw_html` on live URLs, so they presumably need network access
    // and depend on the current content of the remote pages.
    use super::*;

    /// Downloads the page at `url`, panicking when the request fails.
    fn fetch(url: &str) -> String {
        get_raw_html(url).unwrap()
    }

    #[test]
    fn get_lines_of_tab() {
        let tabs_to_get = [
            "https://tabs.ultimate-guitar.com/tab/367279",
            "https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549",
        ];
        for tab in tabs_to_get {
            println!("Getting tab: {}", tab);
            assert!(!matches!(get_tab_lines(&fetch(tab), true), Err(UGError::InvalidHTMLError)));
        }
    }

    #[test]
    fn tab_link_validation() {
        assert_eq!(validate_link("https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741"), Ok(()));
        assert_ne!(validate_link("tabs.ultimate-guitar.com/tab/refused/rather-be-dead-power-595658"), Ok(()));
    }

    #[test]
    fn type_detection() {
        let type_detection_checks: Vec<(DataSetType, &str)> = vec![
            (DataSetType::Chords, "https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549"),
            (DataSetType::Chords, "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741"),
            (DataSetType::Bass, "https://tabs.ultimate-guitar.com/tab/bloc-party/this-modern-love-bass-180218"),
            (DataSetType::Tab, "https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488"),
            (DataSetType::Ukulele, "https://tabs.ultimate-guitar.com/tab/olli-schulz/wenn-es-gut-ist-ukulele-1381967"),
            (DataSetType::Drums, "https://tabs.ultimate-guitar.com/tab/phil-collins/in-the-air-tonight-drums-880599"),
            (DataSetType::Bass, "https://tabs.ultimate-guitar.com/tab/pink-floyd/empty-spaces-bass-147995"),
        ];
        for (expected_type, url) in type_detection_checks {
            println!("Testing valid url: {}", url);
            assert_eq!(get_basic_metadata(&fetch(url), url).unwrap().data_type, expected_type);
        }
    }

    #[test]
    fn validate_page_contents() {
        let valid_page_urls = [
            "https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549",
            "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741",
            "https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488",
            "https://tabs.ultimate-guitar.com/tab/olli-schulz/wenn-es-gut-ist-ukulele-1381967",
            "https://tabs.ultimate-guitar.com/tab/phil-collins/in-the-air-tonight-drums-880599",
            "https://tabs.ultimate-guitar.com/tab/blink-182/feeling-this-bass-104175",
            "https://tabs.ultimate-guitar.com/tab/pink-floyd/empty-spaces-bass-147995",
            "https://tabs.ultimate-guitar.com/tab/367279",
        ];
        for valid_page_url in valid_page_urls {
            println!("Testing valid url: {}", valid_page_url);
            assert!(!matches!(validate_html(&fetch(valid_page_url)), Err(UGError::InvalidHTMLError)));
        }

        let invalid_page_urls = [
            "https://tabs.ultimate-guitar.com/tab/refused/i-wanna-watch-the-world-burn-guitar-pro-5868920",
            "https://tabs.ultimate-guitar.com/tab/refused/rather-be-dead-power-595658",
            "https://tabs.ultimate-guitar.com/tab/the-beatles/let-it-be-video-781202",
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=RDdQw4w9WgXcQ",
        ];
        for invalid_page_url in invalid_page_urls {
            println!("Testing invalid url: {}", invalid_page_url);
            assert!(matches!(validate_html(&fetch(invalid_page_url)), Err(UGError::InvalidHTMLError)));
        }
    }

    #[test]
    fn get_basic_data() {
        // (url, expected title, expected artist, expected song id, expected tab id)
        let test_sets: Vec<(&str, &str, &str, u32, u32)> = vec![
            ("https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549",
                "Dont Stop Me Now", "Queen", 15591, 519549),
            ("https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741",
                "Never Gonna Give You Up", "Rick Astley", 196324, 521741),
            ("https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488",
                "Stairway To Heaven", "Led Zeppelin", 31683, 9488),
            ("https://tabs.ultimate-guitar.com/tab/olli-schulz/wenn-es-gut-ist-ukulele-1381967",
                "Wenn Es Gut Ist", "Olli Schulz", 317511, 1381967),
            ("https://tabs.ultimate-guitar.com/tab/phil-collins/in-the-air-tonight-drums-880599",
                "In The Air Tonight", "Phil Collins", 138587, 880599),
            ("https://tabs.ultimate-guitar.com/tab/blink-182/feeling-this-bass-104175",
                "Feeling This", "Blink-182", 54209, 104175), // The title is actually wrong in the UG metadata. This is not a bug!
            ("https://tabs.ultimate-guitar.com/tab/pink-floyd/empty-spaces-bass-147995",
                "Empty Spaces", "Pink Floyd", 17357, 147995),
            ("https://tabs.ultimate-guitar.com/tab/367279",
                "Zu Spät", "Die Ärzte", 1577513, 367279),
        ];

        for (url, title, artist, song_id, tab_id) in test_sets {
            let result = get_basic_metadata(&fetch(url), url).unwrap();
            assert_eq!(result.title, title);
            assert_eq!(result.artist, artist);
            assert_eq!(result.song_id, song_id);
            assert_eq!(result.tab_id, tab_id);
        }
    }

    #[test]
    fn get_metadata() {
        let url_metadata_sets: Vec<(Option<SongMetaData>, &str)> = vec![
            (Some(SongMetaData {
                capo: Some(String::from("3")),
                tonality: None,
                tuning_name: Some(String::from("G C E A")),
                tuning: Some(String::from("G C E A")) }), "https://tabs.ultimate-guitar.com/tab/olli-schulz/wenn-es-gut-ist-ukulele-1381967"),
            (None, "https://tabs.ultimate-guitar.com/tab/pink-floyd/empty-spaces-bass-147995"),
            (None, "https://tabs.ultimate-guitar.com/tab/phil-collins/in-the-air-tonight-drums-880599"),
            (Some(SongMetaData {
                capo: Some(String::from("1")),
                tonality: None,
                tuning_name: Some(String::from("Standard")),
                tuning: Some(String::from("E A D G B E")) }), "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741"),
            (Some(SongMetaData {
                capo: None,
                tonality: Some(String::from("F")),
                tuning_name: Some(String::from("Standard")),
                tuning: Some(String::from("E A D G B E")) }), "https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549"),
        ];
        for (expected, url) in url_metadata_sets {
            println!("Testing url: {}", url);
            // Comparing the whole `Option` covers all four cases (expected/found, present/absent)
            // and reports both values on failure.
            assert_eq!(extract_metadata(&fetch(url)), expected, "metadata mismatch for {}", url);
        }
    }
}