surt_rs/
lib.rs

1use lazy_static::lazy_static;
2use regex::Regex;
3use url::{ParseError, Url};
4
5fn normalize_surt(surt: &str) -> String {
6    let mut surt = surt.to_string();
7
8    // decode surt
9    surt = url_escape::decode(&surt).to_string();
10
11    // replace whitespace with %20
12    surt = surt.replace(' ', "%20");
13
14    let query_index = surt.find('?').unwrap_or(0);
15
16    // remove trailing slashes unless it's the root path
17    if query_index == 0 && surt.ends_with('/') && !surt.ends_with(")/") {
18        surt.pop();
19    }
20
21    // remove trailing slash for SURTs with query parameters
22    // unless it's the root path
23    let start = &mut surt[..query_index].to_string();
24    if start.ends_with('/') && !start.ends_with(")/") {
25        start.pop();
26    }
27    surt = format!("{}{}", start, &surt[query_index..]);
28
29    surt
30}
31
lazy_static! {
    // Case-insensitive (`(?i)`) pattern for a single session-style query parameter.
    // The parameter must start at the beginning of the string or right after `&`
    // (capture group 1) and must be followed by `&` or the end of the string
    // (capture group 2); the alternation in between is non-capturing.
    // Recognized forms: `jsessionid=` (10+ chars of [0-9a-z$]), `sessionid=`,
    // `phpsessid=` and `sid=` (16+ alphanumerics), `aspsessionid` + 8 letters + `=`
    // (16+ alphanumerics), and the pair `cfid=<digits>&cftoken=<[0-9a-z-]+>`.
    static ref SESSION_REGEXP: Regex = Regex::new(r"(?i)(&|^)(?:jsessionid=[0-9a-z$]{10,}|sessionid=[0-9a-z]{16,}|phpsessid=[0-9a-z]{16,}|sid=[0-9a-z]{16,}|aspsessionid[a-z]{8}=[0-9a-z]{16,}|cfid=[0-9]+&cftoken=[0-9a-z-]+)(&|$)").unwrap();
    // Matches a host that begins with `www`, optionally followed by word
    // characters, then a dot. Capture group 2 is the rest of the host up to and
    // including its last dot, so at least one more dot must follow the `www`
    // label (a host such as `www.com` does not match).
    static ref WWW_REGEXP: Regex = Regex::new(r"^www(\w?)+\.(.*\.+)").unwrap();
}
36
37fn normalize_url(mut parsed: Url) -> String {
38    // lowercase and sort query parameters
39    if parsed.query().is_some() {
40        let mut query = parsed.query().unwrap().split('&').collect::<Vec<&str>>();
41        query.sort();
42        let mut query = query.join("&").to_lowercase();
43        query = SESSION_REGEXP.replace_all(&query, "$1$3").to_string();
44        parsed.set_query(Some(&query));
45    }
46
47    if parsed.host_str().is_some() {
48        // remove www(ish) subdomain
49        let host_str = parsed.host_str().unwrap();
50        let host_str = WWW_REGEXP.replace(host_str, "${2}").to_string();
51
52        // lowercase host
53        let host_str = host_str.to_lowercase();
54
55        parsed.set_host(Some(&host_str)).unwrap();
56    }
57
58    let mut url = parsed.to_string();
59
60    // replace trailing slash unless it's the root path
61    if url.ends_with('/') && parsed.path() != "/" {
62        url.pop();
63    }
64
65    // replace trailing ?
66    if url.ends_with('?') {
67        url.pop();
68    }
69
70    url
71}
72
73pub fn generate_surt(url: &str) -> Result<String, ParseError> {
74    let mut parsed = Url::parse(url)?;
75    parsed = Url::parse(&normalize_url(parsed))?;
76
77    let scheme = parsed.scheme();
78    match scheme == "https" || scheme == "http" {
79        true => scheme,
80        _ => return Err(ParseError::RelativeUrlWithoutBase),
81    };
82
83    if parsed.host_str().is_none() {
84        return Err(ParseError::RelativeUrlWithoutBase);
85    }
86    let host_str = parsed.host_str().unwrap().to_lowercase();
87    let mut host_split = host_str.split('.').collect::<Vec<&str>>();
88    host_split.reverse();
89    let mut surt = host_split.join(",");
90
91    if parsed.port().is_some() {
92        let port = parsed.port().unwrap();
93        surt += &format!(":{}", port);
94    }
95
96    if parsed.path() != "" {
97        let path = parsed.path().to_lowercase();
98        surt += &format!("){}", path);
99    }
100
101    if parsed.query().is_some() {
102        let query = parsed.query().unwrap().to_lowercase();
103        surt += &format!("?{}", query);
104    }
105
106    if parsed.fragment().is_some() {
107        let fragment = parsed.fragment().unwrap().to_lowercase();
108        surt += &format!("#{}", fragment);
109    }
110
111    surt = normalize_surt(&surt);
112
113    Ok(surt)
114}
115
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;
    use std::fs::File;
    use std::io::BufReader;

    /// Fixture shape: section name -> (input -> expected output).
    type TestData = HashMap<String, HashMap<String, String>>;

    /// Loads `./test_data/surt.json`.
    ///
    /// Panics when the file cannot be opened or is not a two-level JSON object
    /// whose leaf values are all strings.
    fn load_test_data() -> TestData {
        let file = File::open("./test_data/surt.json").unwrap();
        // Deserialize straight into the nested map instead of walking a
        // `serde_json::Value` by hand.
        serde_json::from_reader(BufReader::new(file)).unwrap()
    }

    /// Asserts that `generate_surt(url)` succeeds and yields `expected`.
    fn assert_surt(url: &str, expected: &str) {
        assert_eq!(generate_surt(url).unwrap(), expected, "input: {}", url);
    }

    #[test]
    fn test_surt() {
        for (section, examples) in load_test_data() {
            // Only sections whose name mentions "surt" (case-insensitive) apply.
            if !section.to_lowercase().contains("surt") {
                continue;
            }
            println!("Testing section: {}", section);

            for (input, expected) in examples {
                println!("Testing example: {}", input);
                let surt = generate_surt(&input).unwrap();
                assert_eq!(surt, expected, "section: {}, input: {}", section, input);
            }
        }
    }

    #[test]
    fn test_url_normalization() {
        for (section, examples) in load_test_data() {
            // Only sections whose name mentions "url_normalization"
            // (case-insensitive) apply.
            if !section.to_lowercase().contains("url_normalization") {
                continue;
            }
            println!("Testing section: {}", section);

            for (input, expected) in examples {
                println!("Testing example: {}", input);
                // Parse once; the same value is logged and then normalized.
                let parsed = Url::parse(&input);
                println!("parsed: {:?}", parsed);
                let url = normalize_url(parsed.unwrap());
                assert_eq!(url, expected, "section: {}, input: {}", section, input);
            }
        }
    }

    #[test]
    fn test_generate_surt_with_valid_url() {
        assert_surt(
            "http://example.com/path?query=value#fragment",
            "com,example)/path?query=value#fragment",
        );
    }

    #[test]
    fn test_generate_surt_with_url_without_scheme() {
        assert!(generate_surt("example.com").is_err());
    }

    #[test]
    fn test_generate_surt_with_relative_url() {
        assert!(generate_surt("/path").is_err());
    }

    #[test]
    fn test_generate_surt_with_url_without_host() {
        assert!(generate_surt("http://").is_err());
    }

    #[test]
    fn test_generate_surt_with_url_with_port() {
        assert_surt("http://example.com:8080", "com,example:8080)/");
    }

    #[test]
    fn test_generate_surt_with_url_with_query() {
        assert_surt("http://example.com?query=value", "com,example)/?query=value");
    }

    #[test]
    fn test_generate_surt_with_url_with_query_and_trailing_slash_after_path() {
        assert_surt(
            "http://example.com/foo/bar/?query=value",
            "com,example)/foo/bar?query=value",
        );
    }

    #[test]
    fn test_generate_surt_with_url_with_fragment() {
        assert_surt("http://example.com#fragment", "com,example)/#fragment");
    }

    #[test]
    fn test_generate_surt_with_url_with_uppercase() {
        assert_surt(
            "http://EXAMPLE.COM/PATH?QUERY=VALUE#FRAGMENT",
            "com,example)/path?query=value#fragment",
        );
    }

    #[test]
    fn test_generate_surt_with_url_with_space() {
        assert_surt(
            "http://example.com/path with space",
            "com,example)/path%20with%20space",
        );
    }

    #[test]
    fn test_generate_surt_with_url_with_trailing_slash() {
        assert_surt("http://example.com/", "com,example)/");
    }

    #[test]
    fn test_generate_surt_with_url_with_trailing_slash_after_path() {
        assert_surt("http://example.com/foo/bar/", "com,example)/foo/bar");
    }

    #[test]
    fn test_generate_surt_with_url_with_www_subdomain() {
        assert_surt("http://www.example.com", "com,example)/");
    }

    #[test]
    fn test_generate_surt_with_ftp_url() {
        assert!(generate_surt("ftp://www.example.com").is_err());
    }

    #[test]
    fn test_normalize_url_with_www_subdomain_and_https() {
        let url = Url::parse("https://www.example.com").unwrap();
        assert_eq!(normalize_url(url), "https://example.com/");
    }

    #[test]
    fn test_normalize_surt_root_with_trailing_slash() {
        assert_eq!(normalize_surt("com,example)/"), "com,example)/");
    }

    #[test]
    fn test_normalize_surt_with_trailing_slash() {
        assert_eq!(normalize_surt("com,example)/foo/bar/"), "com,example)/foo/bar");
    }
}