ug_scraper/
tab_scraper.rs

1// UG-Scraper - A basic rust API for getting data from Ultimate Guitar
2// Copyright (C) 2025  Linus Tibert
3//
4// This program was originally published under the MIT licence as seen
5// here: https://github.com/Lich-Corals/ug-tab-scraper-rs/blob/mistress/LICENCE
6
7use crate::types::*;
8use crate::network::*;
9use crate::error::UGError;
10use regex::Regex;
11use std::str::FromStr;
12
// NOTE(review): the quoting inside these literals looks mangled in this copy of the file
// (possibly HTML entities were decoded during extraction) — verify against the upstream source.

/// Marker that terminates the tab content: `get_tab_lines` only keeps the text that
/// precedes its first occurrence. `validate_html` rejects pages that do not contain it.
13const END_OF_CHORDS_DELIM: &str = "","revision_id":";
/// Marker that introduces the tab content: `get_tab_lines` takes the text that follows it.
/// `validate_html` rejects pages that do not contain it.
14const START_OF_CHORDS_DELIM: &str = "":{"wiki_tab":{"content":"";
/// Substrings that make `validate_html` reject a page with `UGError::InvalidHTMLError`.
15const HTML_BLACKLIST: [&str; 1] = [""type":"Video""];
/// Pattern used by `validate_link` to decide whether a URL is accepted as a tab link.
// NOTE(review): `[www.]*` and `[tabs.]*` are character classes (any run of those characters),
// the dots in the host name are unescaped and the pattern is not anchored, so this accepts
// more than the literal `www.`/`tabs.` prefixes — confirm whether that is intended.
16const VALID_LINK_REGEX: &str = r"http[s]*:\/\/[www.]*[tabs.]*ultimate-guitar.com\/tab\/[\S]+";
/// Pattern used by `extract_metadata`; capture groups 1 to 4 are read as capo, tonality,
/// tuning name and tuning value, in that order.
17const METADATA_REGEX: &str = r""adsupp_binary_blocked":null,"meta":\{["capo":]*(\d*)[,]*"[tonality":"]*(\w*)[","]*tuning":\{"name":"([^:]*)","value":"([^:]*)",";
/// Pattern used by `get_basic_metadata`; capture groups 1 to 5 are read as tab id, song id,
/// song name, artist name and tab type, in that order.
18const BASIC_DATA_REGEX: &str = r"tab":\{"id":(\d+),"song_id":(\d+),"song_name":"([^:]+)","artist_id":\d+,"artist_name":"([^:]+)","type":"([\w\s]+)","part":";
19
20/// Gets as much data about a tab as possible.
21/// 
22/// ## Arguments
23/// * `url`: The URL to the tab
24/// * `replace_german_names`: Wether to replace german chord names like `H` with `B`
25///     * View [`crate::types::Line::replace_german_names`] for more information.
26/// 
27/// ## Example: 
28/// ```
29/// use ug_scraper::tab_scraper::get_song_data;
30/// 
31/// // Returns a wrapped Song object with associated data and replaced german chords
32/// get_song_data("https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741", true);
33/// ```
34/// 
35/// ## Possible errors
36/// * `ureq::Error::*`
37/// * [`crate::error::UGError`]
38///     * `InvalidHTMLError`
39///     * `InvalidURLError`
40///     * `NoBasicDataMatchError`
41///     * `UnexpectedWebResultError`
42pub fn get_song_data(url: &str, replace_german_names: bool) -> Result<Song, Box<dyn std::error::Error>> {
43        let raw_html: String;
44        match get_raw_html(url) {
45                Ok(s) => raw_html = s,
46                Err(e) => return Err(e.into()),
47        }
48        let song_lines: Vec<Line> = get_tab_lines(&raw_html, replace_german_names)?;
49        let song_metadata: Option<SongMetaData>;
50        let basic_song_data: BasicSongData;
51        match get_basic_metadata(&raw_html, url) {
52                Ok(d) => {
53                        song_metadata = extract_metadata(&raw_html);
54                        basic_song_data = d;
55                }
56                Err(e) => return Err(e.into())
57        }
58        let song: Song = Song { lines: song_lines, metadata: song_metadata, basic_data: basic_song_data };
59        Ok(song)
60}
61
62/// Get the basic metadata about a tab from valid HTML
63/// 
64/// ## Arguments
65/// * `raw_html`: the raw HTML of a supported UG tab page
66/// * `tab_link`: the link to the page
67/// 
68/// ## Example:
69/// ```
70/// use ug_scraper::tab_scraper::get_basic_metadata;
71/// use ug_scraper::network::get_raw_html;
72/// 
73/// let url: &str = "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741";
74/// let raw_html: &str = &get_raw_html(url).unwrap();
75/// let basic_data = get_basic_metadata(raw_html, url).unwrap();
76/// // Returns:
77/// // BasicSongData { title: "Never Gonna Give You Up",
78/// //                 artist: "Rick Astley",
79/// //                 tab_link: "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741",
80/// //                 song_id: 196324,
81/// //                 tab_id: 521741,
82/// //                 data_type: Chords }
83/// ```
84/// 
85/// ## Possible errors
86/// * [`crate::error::UGError`]
87///     * `InvalidHTMLError`
88///     * `InvalidURLError`
89///     * `NoBasicDataMatchError`
90///     * `UnexpectedWebResultError`
91pub fn get_basic_metadata(raw_html: &str, tab_link: &str) -> Result<BasicSongData, UGError> {
92        validate_html(raw_html)?;
93        validate_link(tab_link)?;
94
95        let regex = Regex::new(BASIC_DATA_REGEX).unwrap();
96        let captures = regex.captures(raw_html);
97        if let Some(cap) = captures {
98                let song_type: DataSetType = get_data_type(&cap[5]).unwrap_or_default();
99                let tab_id = match u32::from_str(&cap[1]) {
100                        Ok(i) => i,
101                        Err(_e) => return Err(UGError::UnexpectedWebResultError),
102                };
103                let song_id = match u32::from_str(&cap[2]) {
104                        Ok(i) => i,
105                        Err(_e) => return Err(UGError::UnexpectedWebResultError),
106                };
107                let title = unescape_string(&cap[3]).to_string();
108                let artist = unescape_string(&cap[4]).to_string();
109                let song_basic_meta: BasicSongData = BasicSongData { title,
110                        artist,
111                        tab_link: tab_link.to_string(),
112                        song_id,
113                        tab_id,
114                        data_type: song_type };
115                Ok(song_basic_meta)
116        } else {
117                Err(UGError::NoBasicDataMatchError)
118        }
119}
120
121/// Get a `Vec` with the lines of a tab
122/// 
123/// ## Arguments
124/// * `raw_html`: the raw HTML of a supported UG tab page
125/// * `replace_german_names`: Wether to replace german chord names like `H` with `B`
126///     * View [`crate::types::Line::replace_german_names`] for more information.
127/// 
128/// ## Example:
129/// ```
130/// use ug_scraper::tab_scraper::get_tab_lines;
131/// use ug_scraper::network::get_raw_html;
132/// use ug_scraper::types::Line;
133/// 
134/// let url: &str = "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741";
135/// let raw_html: &str = &get_raw_html(url).unwrap();
136/// 
137/// // Ruturns lines of the tab with german chord names replaced
138/// let lines_vec = get_tab_lines(raw_html, true).unwrap();
139/// ```
140/// 
141/// ## Possible errors
142/// * [`crate::error::UGError::InvalidHTMLError`]
143pub fn get_tab_lines(raw_html: &str, replace_german_names: bool) -> Result<Vec<Line>, UGError> {
144        validate_html(raw_html)?;
145        let string_parts: Vec<&str> = raw_html.split(END_OF_CHORDS_DELIM).collect();
146        let raw_data: &str = string_parts[0].split(START_OF_CHORDS_DELIM).collect::<Vec<&str>>()[1];
147        let formatted_string_lines = unescape_string(raw_data);
148        let lines: Vec<Line> = clean_and_evaluate(formatted_string_lines.lines(), replace_german_names);
149        Ok(lines)
150}
151
152/// Checks if a given URL is leading to a tab on Ultimate Guitar
153/// 
154/// Returns Ok(()) if valid and Err(UGError::InvalidURLError) if invalid.
155pub fn validate_link(url: &str) -> Result<(), UGError> {
156        let regex = Regex::new(VALID_LINK_REGEX).unwrap();
157        let captures = regex.captures(url);
158        match captures {
159                Some(_d) => Ok(()),
160                None => Err(UGError::InvalidURLError),
161        }
162}
163
164fn validate_html(raw_html: &str) -> Result<(), UGError> {
165        for item in HTML_BLACKLIST {
166                if raw_html.contains(item) {
167                        return Err(UGError::InvalidHTMLError)
168                }
169        }
170        if !raw_html.contains(START_OF_CHORDS_DELIM) || !raw_html.contains(END_OF_CHORDS_DELIM) {
171                return Err(UGError::InvalidHTMLError)
172        }
173        Ok(())
174}
175
176fn extract_metadata(raw_html: &str) -> Option<SongMetaData> {
177        let regex = Regex::new(METADATA_REGEX).unwrap();
178        let captures = regex.captures(raw_html);
179        let mut song_metadata: SongMetaData = SongMetaData::default();
180        if let Some(cap) = captures {
181                let mut capture_options: [Option<String>; 4] = [Some(cap[1].to_string()), 
182                        Some(cap[2].to_string()), 
183                        Some(cap[3].to_string()), 
184                        Some(cap[4].to_string())];
185                for i in 0..4 {
186                        if capture_options[i].clone().unwrap().is_empty() {
187                                capture_options[i] = None;
188                        }
189                        match i {
190                                0 => song_metadata.capo = capture_options[i].clone(),
191                                1 => song_metadata.tonality = capture_options[i].clone(),
192                                2 => song_metadata.tuning_name = capture_options[i].clone(),
193                                3 => song_metadata.tuning = capture_options[i].clone(),
194                                _ => (),
195                        }
196                }                
197        } else {
198                return None
199        }
200        Some(song_metadata)
201}
202
203fn clean_and_evaluate(lines: std::str::Lines<'_>, replace_german_names: bool) -> Vec<Line> {
204        let mut clean_lines: Vec<Line> = Vec::new();
205        for line in lines {
206                let mut line_type: DataType = DataType::Lyric;
207                if line.contains("[ch]") {
208                        line_type = DataType::Chord;
209                }
210                let mut clean_line: String = String::from(line);
211                for key in ["[ch]", "[/ch]", "[tab]", "[/tab]"] {
212                        clean_line = clean_line.replace(key, "")
213                }
214                if clean_line.contains("[") && clean_line.contains("]") && line_type != DataType::Chord {
215                        line_type = DataType::SectionTitle;
216                }
217                let mut line = Line {line_type, text_data: clean_line};
218                if replace_german_names {
219                        line = line.replace_german_names();
220                }
221                clean_lines.push(line);
222        }
223        clean_lines
224}
225
// These tests fetch the pages through `get_raw_html`, so they presumably need network
// access and depend on the current contents of the referenced pages.
#[cfg(test)]
mod tests {
        use super::*;

        /// Extracting the lines of a supported tab must not report invalid HTML.
        #[test]
        fn get_lines_of_tab() {
                let tabs_to_get = ["https://tabs.ultimate-guitar.com/tab/367279", 
                        "https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549"];
                for tab in tabs_to_get {
                        println!("Getting tab: {}", tab);
                        assert!(!matches!(get_tab_lines(&get_raw_html(tab).unwrap(), true), Err(UGError::InvalidHTMLError)));
                }
        }

        /// A full tab link is accepted, a link without a scheme is rejected.
        #[test]
        fn tab_link_validation() {
                assert_eq!(validate_link("https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741"), Ok(()));
                assert_ne!(validate_link("tabs.ultimate-guitar.com/tab/refused/rather-be-dead-power-595658"), Ok(()));
        }

        /// The tab type reported in the basic data matches the type of the page.
        #[test]
        fn type_detection() {
                let type_detection_checks: Vec<(DataSetType, &str)> = vec![(DataSetType::Chords, "https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549"),
                        (DataSetType::Chords, "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741"),
                        (DataSetType::Bass, "https://tabs.ultimate-guitar.com/tab/bloc-party/this-modern-love-bass-180218"),
                        (DataSetType::Tab, "https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488"),
                        (DataSetType::Ukulele, "https://tabs.ultimate-guitar.com/tab/olli-schulz/wenn-es-gut-ist-ukulele-1381967"),
                        (DataSetType::Drums, "https://tabs.ultimate-guitar.com/tab/phil-collins/in-the-air-tonight-drums-880599"),
                        (DataSetType::Bass, "https://tabs.ultimate-guitar.com/tab/pink-floyd/empty-spaces-bass-147995")];
                for check in type_detection_checks {
                        println!("Testing valid url: {}", check.1);
                        assert_eq!(get_basic_metadata(&get_raw_html(check.1).unwrap(), check.1).unwrap().data_type, check.0);
                }
        }

        /// Supported pages pass the HTML validation, unsupported pages are rejected.
        #[test]
        fn validate_page_contents() {
                let valid_page_urls = vec!["https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549",
                        "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741",
                        "https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488",
                        "https://tabs.ultimate-guitar.com/tab/olli-schulz/wenn-es-gut-ist-ukulele-1381967",
                        "https://tabs.ultimate-guitar.com/tab/phil-collins/in-the-air-tonight-drums-880599",
                        "https://tabs.ultimate-guitar.com/tab/blink-182/feeling-this-bass-104175",
                        "https://tabs.ultimate-guitar.com/tab/pink-floyd/empty-spaces-bass-147995",
                        "https://tabs.ultimate-guitar.com/tab/367279"];
                for valid_page_url in valid_page_urls {
                        println!("Testing valid url: {}", valid_page_url);
                        assert!(!matches!(validate_html(&get_raw_html(valid_page_url).unwrap()), Err(UGError::InvalidHTMLError)));
                }

                let invalid_page_urls = vec!["https://tabs.ultimate-guitar.com/tab/refused/i-wanna-watch-the-world-burn-guitar-pro-5868920", 
                        "https://tabs.ultimate-guitar.com/tab/refused/rather-be-dead-power-595658", 
                        "https://tabs.ultimate-guitar.com/tab/the-beatles/let-it-be-video-781202",
                        "https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=RDdQw4w9WgXcQ"];
                for invalid_page_url in invalid_page_urls {
                        println!("Testing invalid url: {}", invalid_page_url);
                        assert!(matches!(validate_html(&get_raw_html(invalid_page_url).unwrap()), Err(UGError::InvalidHTMLError)));
                }
        }

        /// Title, artist and ids in the basic data match the known values of each page.
        #[test]
        fn get_basic_data() {
                let test_sets: Vec<(&str, &str, &str, u32, u32)> = vec![("https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549",
                                "Dont Stop Me Now", "Queen", 15591, 519549),
                        ("https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741",
                                "Never Gonna Give You Up", "Rick Astley", 196324, 521741),
                        ("https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488",
                                "Stairway To Heaven", "Led Zeppelin", 31683, 9488),
                        ("https://tabs.ultimate-guitar.com/tab/olli-schulz/wenn-es-gut-ist-ukulele-1381967",
                                "Wenn Es Gut Ist", "Olli Schulz", 317511, 1381967),
                        ("https://tabs.ultimate-guitar.com/tab/phil-collins/in-the-air-tonight-drums-880599",
                                "In The Air Tonight", "Phil Collins", 138587, 880599),
                        ("https://tabs.ultimate-guitar.com/tab/blink-182/feeling-this-bass-104175",
                                "Feeling This", "Blink-182", 54209, 104175), // The title is actually wrong in the UG metadata. This is not a bug!
                        ("https://tabs.ultimate-guitar.com/tab/pink-floyd/empty-spaces-bass-147995",
                                "Empty Spaces", "Pink Floyd", 17357, 147995),
                        ("https://tabs.ultimate-guitar.com/tab/367279",
                                "Zu Spät", "Die Ärzte", 1577513, 367279)];

                for set in test_sets {
                        let result = get_basic_metadata(&get_raw_html(set.0).unwrap(), set.0).unwrap();
                        assert_eq!(result.title, set.1);
                        assert_eq!(result.artist, set.2);
                        assert_eq!(result.song_id, set.3);
                        assert_eq!(result.tab_id, set.4)
                }
        }

        /// The optional metadata matches the known values of each page, including the
        /// pages for which no metadata is expected at all.
        #[test]
        fn get_metadata() {
                let url_metadata_sets: Vec<(Option<SongMetaData>, &str)> = vec![(Some(SongMetaData { 
                                capo: Some(String::from("3")), 
                                tonality: None, 
                                tuning_name: Some(String::from("G C E A")), 
                                tuning: Some(String::from("G C E A")) }), "https://tabs.ultimate-guitar.com/tab/olli-schulz/wenn-es-gut-ist-ukulele-1381967"),
                        (None, "https://tabs.ultimate-guitar.com/tab/pink-floyd/empty-spaces-bass-147995"),
                        (None, "https://tabs.ultimate-guitar.com/tab/phil-collins/in-the-air-tonight-drums-880599"),
                        (Some(SongMetaData { capo: Some(String::from("1")), 
                                tonality: None, 
                                tuning_name: Some(String::from("Standard")), 
                                tuning: Some(String::from("E A D G B E")) }), "https://tabs.ultimate-guitar.com/tab/rick-astley/never-gonna-give-you-up-chords-521741"),
                        (Some(SongMetaData { capo: None, 
                                tonality: Some(String::from("F")), 
                                tuning_name: Some(String::from("Standard")), 
                                tuning: Some(String::from("E A D G B E")) }), "https://tabs.ultimate-guitar.com/tab/queen/dont-stop-me-now-chords-519549"),];
                for (expected, url) in url_metadata_sets {
                        println!("Testing url: {}", url);
                        // Comparing the options directly covers both directions: metadata that
                        // is unexpectedly present and metadata that is unexpectedly missing.
                        assert_eq!(extract_metadata(&get_raw_html(url).unwrap()), expected);
                }
        }
}