ug_scraper/
search_scraper.rs

1// UG-Scraper - A basic rust API for getting data from Ultimate Guitar
2// Copyright (C) 2025  Linus Tibert
3//
4// This program was originally published under the MIT licence as seen
5// here: https://github.com/Lich-Corals/ug-tab-scraper-rs/blob/mistress/LICENCE
6
7use crate::types::*;
8use crate::network::{encode_string, get_raw_html, unescape_string};
9use regex::{CaptureMatches, Regex};
10use std::str::FromStr;
11use ureq::{Error as ReqError};
12
// Regex that extracts one search-result record from the JSON embedded in the
// search page HTML. Capture groups: 1 = tab id, 2 = song id, 3 = song name,
// 4 = artist name, 5 = data type, 6 = vote count, 7 = rating, 8 = tab URL.
// NOTE: the pattern contains literal `"` characters, so it must be a
// `r#"…"#` raw string — a plain `r"…"` literal would be terminated early
// by the first inner quote and fail to compile.
const DATA_REGEX: &str = r#"(?:"id":(\d{2,}).+?song_id":(\d+).+?song_name":"(.*?)".*?artist_name":"(.*?)".*?type":"(.+?)".+?votes":(\d*).*?rating":([\d\.]+)).*?"tab_url":"(https:\/\/tabs\.ultimate-guitar\.com\/tab\/.*?\d+)"#;
// Base URL for title searches; the encoded query and page number are appended.
const BASE_SEARCH_URL: &str = "https://www.ultimate-guitar.com/search.php?search_type=title&value=";
15
16/// Get search results for a query
17/// 
18/// ## Arguments:
19/// * `query`: The query to search for; can be a song title or an author.
20/// * `max_additional_pages`: The amount of pages to search after the first page of results
21///     * The function will automatically stop searching if a page is empty. If you want all results, you may use `u8::MAX` without having any performance problems.
22/// 
23/// ## Example:
24/// ```
25/// use ug_scraper::search_scraper::get_search_results;
26/// 
27/// // Only gets results of the first page
28/// get_search_results("Never gonna give you up", 0);
29/// 
30/// // Gets results of the first three pages
31/// get_search_results("Never gonna give you up", 2);
32/// ```
33/// 
34/// Returns an empty vector if there are no results.
35/// 
36/// ## Possible errors
37/// * `ureq::Error::*`
38/// * [`crate::error::UGError::UnexpectedWebResultError`]
39pub fn get_search_results(query: &str, max_additional_pages: u8) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
40        let mut results: Vec<SearchResult> = vec![];
41        for i in 1..max_additional_pages {
42                match search_page(query, i) {
43                        Ok(mut r) => {
44                                if r.len() == 0 {
45                                        return Ok(results)
46                                }
47                                results.append(&mut r);
48                        },
49                        Err(e) => return Err(e)
50                }
51        }
52        Ok(results)
53}
54
55/// Get search results of a single page
56/// 
57/// ## Arguments:
58/// * `query`: The query to search for; can be a song title or an author.
59/// * `page_to_search`: The page of results to search
60/// 
61/// ## Example:
62/// ```
63/// use ug_scraper::search_scraper::search_page;
64/// 
65/// // Returns results from results page 1
66/// search_page("We are number one", 1);
67/// 
68/// // Returns an empty Vec because results page 0 is searched
69/// search_page("empty spaces", 0);
70/// ```
71/// 
72/// ## Possible errors
73/// * `ureq::Error::*`
74/// * [`crate::error::UGError::UnexpectedWebResultError`]
75pub fn search_page(query: &str, page_to_search: u8) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
76        let search_url: String = BASE_SEARCH_URL.to_string() + &encode_string(query) + "&page=" + &page_to_search.to_string();
77        let raw_html: String;
78        match get_raw_html(&search_url) {
79                Ok(d) => raw_html = d,
80                Err(e) => match e {
81                        ReqError::StatusCode(c) => match c {
82                                404 => return Ok(vec![]),
83                                _ => return Err(e.into()),
84                        },
85                        _ => return Err(e.into()),
86                },
87        }
88        let regex = Regex::new(DATA_REGEX).unwrap();
89        let captures = regex.captures_iter(&raw_html);
90        match unwrap_results(captures) {
91                Ok(r) => Ok(r),
92                Err(e) => Err(e.into()),
93        }
94}
95
96fn unwrap_results(matches: CaptureMatches) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
97        let mut results: Vec<SearchResult> = Vec::new();
98        for regex_match in matches {
99                let basic_data: BasicSongData = BasicSongData { 
100                        song_id: u32::from_str(&regex_match[2])?,
101                        tab_id: u32::from_str(&regex_match[1])?,
102                        title: unescape_string(&regex_match[3]).to_string(),
103                        artist: unescape_string(&regex_match[4]).to_string(),
104                        data_type: get_data_type(&regex_match[5]).unwrap_or(DataSetType::default()), 
105                        tab_link: unescape_string(&regex_match[8]).to_string()};
106                let result: SearchResult = SearchResult { 
107                        basic_data: basic_data,
108                        rating_count: u32::from_str(&regex_match[6])?,
109                        rating_value: f32::from_str(&regex_match[7])?};
110                results.push(result);
111        }
112        Ok(results)
113}
114
#[cfg(test)]
mod tests {
    use crate::search_scraper::{get_search_results, search_page};

    // NOTE: these tests hit the live Ultimate Guitar site and therefore
    // require network access; they will fail offline.

    #[test]
    fn search_results() {
        // Well-known songs/artists must yield at least one result each.
        let valid_search_queries = vec!["zu spät die ärzte", "NEVER GONNA GIVE you up", "Don't stop me now", "Bloc party"];
        for query in valid_search_queries {
            assert!(!get_search_results(query, u8::MAX).unwrap().is_empty());
        }
        // Nonsense queries must yield an empty result vector, not an error.
        let no_result_queries = vec!["this should_not return any #results!"];
        for query in no_result_queries {
            assert!(get_search_results(query, 1).unwrap().is_empty());
        }
    }

    #[test]
    fn search_single_page() {
        let valid_search_queries = vec!["NEVER GONNA GIVE you up"];
        for query in valid_search_queries {
            // Check both the first and the second results page
            // (`1..2` would have covered page 1 only).
            for page in 1..=2 {
                assert!(!search_page(query, page).unwrap().is_empty());
            }
        }
    }
}