// buup/transformers/url_parser.rs

1use crate::{Transform, TransformError, TransformerCategory};
2
/// URL Parser transformer.
///
/// A stateless unit type: all behaviour lives in its `Transform`
/// implementation, which splits a URL string into its components and
/// renders them as a plain-text report.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct UrlParser;
6
/// Default test input for URL Parser.
///
/// A sample URL that carries every component the parser reports: scheme,
/// userinfo, host, port, path, query and fragment. The unit tests in this
/// module parse it and check each component.
pub const DEFAULT_TEST_INPUT: &str =
    "https://user:pass@example.com:8080/path/to/resource?key=value&key2=value2#fragment";
10
// List of known non-hierarchical schemes (add more as needed).
// When one of these precedes a bare ':' (i.e. not "://"), the parser reports
// everything after the colon as the path and does not look for an authority.
// Entries are lowercase; candidate schemes are matched case-insensitively.
const NON_HIERARCHICAL_SCHEMES: &[&str] = &["mailto", "urn", "tel", "sms", "news", "isbn"];
13
14impl Transform for UrlParser {
15    fn name(&self) -> &'static str {
16        "URL Parser"
17    }
18
19    fn id(&self) -> &'static str {
20        "urlparser"
21    }
22
23    fn description(&self) -> &'static str {
24        "Parses a URL into its components (scheme, authority, path, query, fragment)"
25    }
26
27    fn category(&self) -> TransformerCategory {
28        TransformerCategory::Other
29    }
30
31    // Basic URL Parser (doesn't handle all edge cases, e.g., complex userinfo, IPv6 hosts)
32    fn transform(&self, input: &str) -> Result<String, TransformError> {
33        let input = input.trim();
34        if input.is_empty() {
35            return Err(TransformError::InvalidArgument("Input URL is empty".into()));
36        }
37
38        let mut remainder = input;
39
40        // 1. Scheme
41        // Determine scheme, whether it's hierarchical, and the remainder of the string
42        let (scheme, is_hierarchical, remainder_after_scheme) = if let Some(pos) =
43            remainder.find("://")
44        {
45            let scheme_part = &remainder[..pos];
46            // Validate scheme characters before ://
47            if scheme_part.is_empty()
48                || !scheme_part.starts_with(|c: char| c.is_ascii_alphabetic())
49                || !scheme_part
50                    .chars()
51                    .all(|c| c.is_ascii_alphanumeric() || c == '+' || c == '-' || c == '.')
52            {
53                return Err(TransformError::InvalidArgument(
54                    format!("Invalid characters in scheme before '://': {}", scheme_part).into(),
55                ));
56            }
57            (Some(scheme_part), true, &remainder[pos + 3..]) // Standard hierarchical scheme
58        } else if let Some(pos) = remainder.find(':') {
59            let potential_scheme = &remainder[..pos];
60            // Check if the part before ':' looks structurally like a scheme
61            if !potential_scheme.is_empty()
62                && potential_scheme.starts_with(|c: char| c.is_ascii_alphabetic())
63                && potential_scheme
64                    .chars()
65                    .all(|c| c.is_ascii_alphanumeric() || c == '+' || c == '-' || c == '.')
66            {
67                // It might be a scheme. Check if it's a known non-hierarchical one.
68                let lower_scheme = potential_scheme.to_ascii_lowercase();
69                if NON_HIERARCHICAL_SCHEMES.contains(&lower_scheme.as_str()) {
70                    // It's a known non-hierarchical scheme (e.g., mailto:)
71                    (Some(potential_scheme), false, &remainder[pos + 1..])
72                } else {
73                    // Looks like a scheme syntactically but not known non-hierarchical,
74                    // and no '://' was present. Assume it's not a scheme (e.g., host:port, drive letter).
75                    (None, true, remainder) // Treat as having no scheme
76                }
77            } else {
78                // The part before ':' doesn't look like a scheme (e.g., contains invalid chars)
79                (None, true, remainder) // Treat as having no scheme
80            }
81        } else {
82            // No ':' found at all
83            (None, true, remainder) // Treat as having no scheme
84        };
85
86        // Update the remainder based on whether a scheme was parsed
87        remainder = remainder_after_scheme;
88
89        // 2. Fragment
90        let fragment = if let Some(pos) = remainder.find('#') {
91            let frag = &remainder[pos + 1..];
92            remainder = &remainder[..pos];
93            Some(frag)
94        } else {
95            None
96        };
97
98        // 3. Query
99        let query = if let Some(pos) = remainder.find('?') {
100            let q = &remainder[pos + 1..];
101            remainder = &remainder[..pos];
102            Some(q)
103        } else {
104            None
105        };
106
107        // 4. Authority and Path
108        let (authority, path_str) = if !is_hierarchical {
109            // For non-hierarchical schemes, the rest is the path (SSP)
110            (None, remainder)
111        } else if remainder.starts_with("//") {
112            // Handle authority explicitly starting with //
113            remainder = &remainder[2..];
114            if let Some(pos) = remainder.find('/') {
115                (Some(&remainder[..pos]), &remainder[pos..])
116            } else {
117                (Some(remainder), "")
118            }
119        } else if remainder.starts_with('/') {
120            // Path starts immediately (e.g., /foo/bar?q=1 or file:///foo/bar)
121            (None, remainder)
122        } else if let Some(pos) = remainder.find('/') {
123            // Authority present before path (e.g., host:port/path)
124            (Some(&remainder[..pos]), &remainder[pos..])
125        } else {
126            // Only authority or path-rootless
127            if scheme.is_some() {
128                // If scheme present, assume remainder is authority if non-empty
129                (Some(remainder), "")
130            } else {
131                // No scheme - check for host:port format or path
132                let is_likely_host_port = remainder.contains(':')
133                    && remainder.chars().filter(|&c| c == ':').count() == 1
134                    && remainder
135                        .split(':')
136                        .nth(1)
137                        .unwrap_or("")
138                        .chars()
139                        .all(|c| c.is_ascii_digit())
140                    && !remainder.contains('/')
141                    && !remainder.contains('?')
142                    && !remainder.contains('#');
143
144                if is_likely_host_port {
145                    // Treat as authority (host:port) if it matches the pattern
146                    (Some(remainder), "")
147                } else {
148                    // Otherwise treat as path
149                    (None, remainder)
150                }
151            }
152        };
153
154        // Further parse authority into userinfo, host, port (basic)
155        let mut userinfo = None;
156        let mut host = None;
157        let mut port = None;
158
159        if let Some(auth_str) = authority {
160            let mut auth_rem = auth_str;
161            if let Some(pos) = auth_rem.rfind('@') {
162                userinfo = Some(&auth_rem[..pos]);
163                auth_rem = &auth_rem[pos + 1..];
164            }
165
166            // Very basic host/port split (doesn't handle IPv6 brackets)
167            if let Some(pos) = auth_rem.rfind(':') {
168                // Check if colon is part of IPv6 address (crude check)
169                if !auth_rem[..pos].contains(':') {
170                    // Likely not IPv6
171                    host = Some(&auth_rem[..pos]); // Assign host
172                    let port_str = &auth_rem[pos + 1..];
173                    if port_str.chars().all(|c| c.is_ascii_digit()) {
174                        port = Some(port_str); // Assign port if valid
175                    } // If port is invalid, host remains as parsed above, port remains None
176                } else {
177                    // Assume IPv6 or complex host, treat whole as host
178                    host = Some(auth_rem);
179                    // port remains None
180                }
181            } else {
182                // No colon found, the whole remaining string is the host
183                host = Some(auth_rem);
184                // port remains None
185            }
186        }
187
188        let mut result = String::new();
189        result.push_str(&format!("Scheme: {}\n", scheme.unwrap_or("-")));
190        result.push_str(&format!("UserInfo: {}\n", userinfo.unwrap_or("-")));
191        result.push_str(&format!("Host: {}\n", host.unwrap_or("-")));
192        result.push_str(&format!("Port: {}\n", port.unwrap_or("-")));
193        result.push_str(&format!(
194            "Path: {}\n",
195            if path_str.is_empty() { "-" } else { path_str }
196        ));
197        result.push_str(&format!("Query: {}\n", query.unwrap_or("-")));
198        result.push_str(&format!("Fragment: {}", fragment.unwrap_or("-")));
199
200        Ok(result)
201    }
202
203    fn default_test_input(&self) -> &'static str {
204        "https://user:pass@example.com:8080/p/a/t/h?query=string&key=val#fragment"
205    }
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// Report labels, in the order the expectations below are listed.
    const LABELS: [&str; 7] = [
        "Scheme", "UserInfo", "Host", "Port", "Path", "Query", "Fragment",
    ];

    /// Extracts the value printed for `label` from a parser report.
    ///
    /// Returns the trimmed text after the first ':' of the first line that
    /// starts with `label`, or an empty string if no such line exists.
    fn get_component(output: &str, label: &str) -> String {
        for line in output.lines() {
            if line.starts_with(label) {
                return match line.split_once(':') {
                    Some((_, value)) => value.trim().to_string(),
                    None => String::new(),
                };
            }
        }
        String::new()
    }

    /// Parses `url` and checks all seven components against `expected`,
    /// which lists the values in `LABELS` order.
    fn assert_components(url: &str, expected: [&str; 7]) {
        let report = UrlParser.transform(url).unwrap();
        for (label, want) in LABELS.iter().zip(expected.iter()) {
            assert_eq!(get_component(&report, label), *want);
        }
    }

    /// Asserts that parsing `url` is rejected as an invalid argument.
    fn assert_invalid(url: &str) {
        assert!(matches!(
            UrlParser.transform(url),
            Err(TransformError::InvalidArgument(_))
        ));
    }

    #[test]
    fn test_url_parser_full() {
        assert_components(
            DEFAULT_TEST_INPUT,
            [
                "https",
                "user:pass",
                "example.com",
                "8080",
                "/path/to/resource",
                "key=value&key2=value2",
                "fragment",
            ],
        );
    }

    #[test]
    fn test_url_parser_simple_http() {
        assert_components(
            "http://example.com/home",
            ["http", "-", "example.com", "-", "/home", "-", "-"],
        );
    }

    #[test]
    fn test_url_parser_ftp() {
        assert_components(
            "ftp://user@ftp.example.org/",
            ["ftp", "user", "ftp.example.org", "-", "/", "-", "-"],
        );
    }

    #[test]
    fn test_url_parser_mailto() {
        // Non-hierarchical: the address is reported as the path, not as
        // userinfo + host.
        assert_components(
            "mailto:user@example.com",
            ["mailto", "-", "-", "-", "user@example.com", "-", "-"],
        );
    }

    #[test]
    fn test_url_parser_urn() {
        // Non-hierarchical: everything after "urn:" is the path.
        assert_components(
            "urn:isbn:0451450523",
            ["urn", "-", "-", "-", "isbn:0451450523", "-", "-"],
        );
    }

    #[test]
    fn test_url_parser_path_only() {
        assert_components(
            "/path/only?query#frag",
            ["-", "-", "-", "-", "/path/only", "query", "frag"],
        );
    }

    #[test]
    fn test_url_parser_host_port_only() {
        // No scheme: "host:digits" is still recognised as an authority.
        assert_components(
            "example.com:8080",
            ["-", "-", "example.com", "8080", "-", "-", "-"],
        );
    }

    #[test]
    fn test_url_parser_empty() {
        assert_invalid("");
    }

    #[test]
    fn test_url_parser_invalid_scheme() {
        assert_invalid("1http://example.com");
        assert_invalid("://example.com");
    }
}