minurl/
lib.rs

1// This file is Copyright its original authors, visible in version control history.
2//
3// This file is licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5// http://opensource.org/licenses/MIT>, at your option. You may not use this file except in
6// accordance with one or both of these licenses.
7
8#![crate_name = "minurl"]
9#![deny(missing_docs)]
10#![deny(rustdoc::broken_intra_doc_links)]
11#![deny(rustdoc::private_intra_doc_links)]
12#![allow(bare_trait_objects)]
13#![allow(ellipsis_inclusive_range_patterns)]
14#![cfg_attr(docsrs, feature(doc_cfg))]
15
16//! A minimal library for parsing and validating URLs.
17
18use std::ops::Range;
19
/// Looks up the well-known port associated with `scheme`.
///
/// The comparison is exact (case-sensitive), so callers are expected to pass
/// an already-lowercased scheme. Yields `None` for any scheme that is not in
/// the table below.
fn default_port_for_scheme(scheme: &str) -> Option<u16> {
	// Scheme name paired with its conventional port.
	const KNOWN_PORTS: [(&str, u16); 5] =
		[("http", 80), ("ws", 80), ("https", 443), ("wss", 443), ("ftp", 21)];

	KNOWN_PORTS.iter().find(|(name, _)| *name == scheme).map(|&(_, port)| port)
}
29
/// The reasons a URL string can be rejected by the parser.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ParseError {
	/// Nothing was supplied: the input string has zero length.
	EmptyInput,
	/// The input contains a character that is not allowed (an ASCII control
	/// character or any non-ASCII character). The offending character is
	/// carried in the variant.
	InvalidCharacter(char),
	/// No scheme could be found in the input.
	MissingScheme,
	/// A scheme was found but it is not well-formed.
	InvalidScheme,
	/// The host component is empty.
	EmptyHost,
	/// The port component is not a valid port number.
	InvalidPort,
}

impl std::fmt::Display for ParseError {
	/// Writes a short, lowercase, human-readable description of the error.
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		// Only `InvalidCharacter` needs formatting; every other variant maps to
		// a fixed message.
		let message = match self {
			Self::InvalidCharacter(c) => return write!(f, "invalid character: {:?}", c),
			Self::EmptyInput => "empty input",
			Self::MissingScheme => "missing scheme",
			Self::InvalidScheme => "invalid scheme",
			Self::EmptyHost => "empty host",
			Self::InvalidPort => "invalid port",
		};
		f.write_str(message)
	}
}

// No variant wraps another error, so the default `Error` methods are sufficient.
impl std::error::Error for ParseError {}
61
/// A URL that has been split into its components.
///
/// The URL text is stored once; each component is remembered as a byte range
/// into that text, so the string accessors hand out borrowed slices rather
/// than allocating new strings.
///
/// **Note:** Only ASCII URLs are supported at present. Non-ASCII characters
/// (including internationalized domain names and punycode) are not supported.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Url {
	/// The complete URL text that all ranges below index into.
	serialization: String,
	/// Where the scheme sits inside `serialization`.
	scheme: Range<usize>,
	/// Where the username sits inside `serialization`; an empty range when
	/// the URL has no username.
	username: Range<usize>,
	/// Where the password sits inside `serialization`, when one is present.
	password: Option<Range<usize>>,
	/// Where the host sits inside `serialization`.
	host: Range<usize>,
	/// The port number, when the URL specifies one.
	port: Option<u16>,
	/// Where the path sits inside `serialization`.
	path: Range<usize>,
	/// Where the query string sits inside `serialization`, without the
	/// leading `?`.
	query: Option<Range<usize>>,
	/// Where the fragment sits inside `serialization`, without the
	/// leading `#`.
	fragment: Option<Range<usize>>,
}
90
91impl Url {
92	/// Parses a URL string and returns a `Url` instance.
93	///
94	/// Validates that the input contains only valid non-control ASCII characters.
95	pub fn parse(url_str: &str) -> Result<Self, ParseError> {
96		if url_str.is_empty() {
97			return Err(ParseError::EmptyInput);
98		}
99
100		// Validate: only non-control ASCII characters allowed
101		for c in url_str.chars() {
102			if !c.is_ascii() || c.is_ascii_control() {
103				return Err(ParseError::InvalidCharacter(c));
104			}
105		}
106
107		// Find the scheme (everything before "://")
108		let scheme_end = url_str.find("://").ok_or(ParseError::MissingScheme)?;
109
110		if scheme_end == 0 {
111			return Err(ParseError::InvalidScheme);
112		}
113
114		let scheme = &url_str[..scheme_end];
115
116		// Validate scheme: must start with a letter and contain only
117		// letters, digits, '+', '-', or '.'
118		let mut scheme_chars = scheme.chars();
119		let first_char = scheme_chars.next().ok_or(ParseError::InvalidScheme)?;
120		if !first_char.is_ascii_alphabetic() {
121			return Err(ParseError::InvalidScheme);
122		}
123
124		for c in scheme_chars {
125			if !c.is_ascii_alphanumeric() && c != '+' && c != '-' && c != '.' {
126				return Err(ParseError::InvalidScheme);
127			}
128		}
129
130		// Parse the rest after "://"
131		let after_scheme_pos = scheme_end + 3;
132		let after_scheme = &url_str[after_scheme_pos..];
133
134		// Extract the authority (host:port) - everything before '/', '?', or '#'
135		let authority_end =
136			after_scheme.find(|c| c == '/' || c == '?' || c == '#').unwrap_or(after_scheme.len());
137
138		let authority = &after_scheme[..authority_end];
139		let after_authority = &after_scheme[authority_end..];
140
141		// Extract userinfo (username:password@) from authority if present
142		let (userinfo, host_and_port) = if let Some(at_pos) = authority.rfind('@') {
143			(Some(&authority[..at_pos]), &authority[at_pos + 1..])
144		} else {
145			(None, authority)
146		};
147
148		// Calculate host start position
149		let host_start = if let Some(at_pos) = authority.rfind('@') {
150			after_scheme_pos + at_pos + 1
151		} else {
152			after_scheme_pos
153		};
154
155		// Calculate username and password ranges
156		let (username, password) = if let Some(info) = userinfo {
157			if let Some(colon_pos) = info.find(':') {
158				let username = after_scheme_pos..(after_scheme_pos + colon_pos);
159				let password = Some((after_scheme_pos + colon_pos + 1)..(host_start - 1));
160				(username, password)
161			} else {
162				let username = after_scheme_pos..(after_scheme_pos + info.len());
163				(username, None)
164			}
165		} else {
166			(after_scheme_pos..after_scheme_pos, None) // Empty range for no username
167		};
168
169		// Parse host and optional port from host_and_port
170		// Handle IPv6 addresses specially: [ipv6]:port
171		let (host_len, port) = if host_and_port.starts_with('[') {
172			// IPv6 address - find the closing bracket
173			if let Some(bracket_pos) = host_and_port.find(']') {
174				let after_bracket = &host_and_port[bracket_pos + 1..];
175				if after_bracket.starts_with(':') && after_bracket.len() > 1 {
176					// Has a port after the bracket
177					let potential_port = &after_bracket[1..];
178					if potential_port.chars().all(|c| c.is_ascii_digit()) {
179						let port_num: u16 =
180							potential_port.parse().map_err(|_| ParseError::InvalidPort)?;
181						(bracket_pos + 1, Some(port_num))
182					} else {
183						(host_and_port.len(), None)
184					}
185				} else if after_bracket.is_empty() {
186					// Just [ipv6] with no port
187					(host_and_port.len(), None)
188				} else {
189					// Invalid: something after ] that isn't :port
190					(host_and_port.len(), None)
191				}
192			} else {
193				// No closing bracket - malformed, but don't fail, just use as-is
194				(host_and_port.len(), None)
195			}
196		} else if let Some(colon_pos) = host_and_port.rfind(':') {
197			let potential_port = &host_and_port[colon_pos + 1..];
198			// Check if this is actually a port (all digits)
199			if !potential_port.is_empty() && potential_port.chars().all(|c| c.is_ascii_digit()) {
200				let port_num: u16 = potential_port.parse().map_err(|_| ParseError::InvalidPort)?;
201				(colon_pos, Some(port_num))
202			} else {
203				(host_and_port.len(), None)
204			}
205		} else {
206			(host_and_port.len(), None)
207		};
208
209		let host_end = host_start + host_len;
210
211		// Validate that host is not empty
212		if host_len == 0 {
213			return Err(ParseError::EmptyHost);
214		}
215
216		// Calculate path start position (after authority)
217		let path_start = after_scheme_pos + authority_end;
218
219		// Copy the URL string and normalize the scheme to lowercase
220		let mut serialization = url_str.to_string();
221		serialization[..scheme_end].make_ascii_lowercase();
222		let url_len = serialization.len();
223
224		// Calculate path, query, and fragment ranges
225		let (path, query, fragment) = {
226			let mut query = None;
227			let mut fragment = None;
228			let mut path_end = url_len;
229
230			if after_authority.starts_with('/') {
231				// Find where path ends (at '?' or '#')
232				if let Some(q_pos) = after_authority.find('?') {
233					let query_start = path_start + q_pos;
234					path_end = query_start;
235					// Fragment comes after query
236					if let Some(f_pos) = after_authority[q_pos..].find('#') {
237						let fragment_start = query_start + f_pos;
238						query = Some((query_start + 1)..fragment_start);
239						fragment = Some((fragment_start + 1)..url_len);
240					} else {
241						query = Some((query_start + 1)..url_len);
242					}
243				} else if let Some(f_pos) = after_authority.find('#') {
244					let fragment_start = path_start + f_pos;
245					path_end = fragment_start;
246					fragment = Some((fragment_start + 1)..url_len);
247				}
248			} else {
249				// No path, check for query/fragment directly
250				if after_authority.starts_with('?') {
251					let query_start = path_start;
252					path_end = query_start;
253					if let Some(f_pos) = after_authority.find('#') {
254						let fragment_start = path_start + f_pos;
255						query = Some((query_start + 1)..fragment_start);
256						fragment = Some((fragment_start + 1)..url_len);
257					} else {
258						query = Some((query_start + 1)..url_len);
259					}
260				} else if after_authority.starts_with('#') {
261					let fragment_start = path_start;
262					path_end = fragment_start;
263					fragment = Some((fragment_start + 1)..url_len);
264				}
265			}
266
267			(path_start..path_end, query, fragment)
268		};
269
270		Ok(Url {
271			serialization,
272			scheme: 0..scheme_end,
273			username,
274			password,
275			host: host_start..host_end,
276			port,
277			path,
278			query,
279			fragment,
280		})
281	}
282
283	/// Returns the scheme of the URL (e.g., "http", "https").
284	pub fn scheme(&self) -> &str {
285		&self.serialization[self.scheme.clone()]
286	}
287
288	/// Returns the username from the URL, if present.
289	///
290	/// Returns an empty string if no username was specified.
291	pub fn username(&self) -> &str {
292		&self.serialization[self.username.clone()]
293	}
294
295	/// Returns the password from the URL, if present.
296	pub fn password(&self) -> Option<&str> {
297		self.password.as_ref().map(|r| &self.serialization[r.clone()])
298	}
299
300	/// Returns the base URL (host portion).
301	pub fn base_url(&self) -> &str {
302		&self.serialization[self.host.clone()]
303	}
304
305	/// Returns the port number if specified, unless it is the default port for
306	/// the scheme.
307	///
308	/// Returns `None` if no port was specified, or if the specified port is the
309	/// default for the URL's scheme (e.g., 80 for `http`, 443 for `https`).
310	pub fn port(&self) -> Option<u16> {
311		match self.port {
312			Some(port) if Some(port) == default_port_for_scheme(self.scheme()) => None,
313			port => port,
314		}
315	}
316
317	/// Returns the port number if specified, or the default port for known
318	/// schemes.
319	///
320	/// Unlike [`port()`](Self::port), this method returns the port even if it
321	/// matches the default for the scheme.
322	pub fn port_or_known_default(&self) -> Option<u16> {
323		self.port.or_else(|| default_port_for_scheme(self.scheme()))
324	}
325
326	/// Returns the path of the URL.
327	///
328	/// The path includes the leading `/` if present. Returns an empty string
329	/// if no path was specified.
330	pub fn path(&self) -> &str {
331		&self.serialization[self.path.clone()]
332	}
333
334	/// Returns an iterator over the path segments.
335	///
336	/// Path segments are the portions between `/` characters. Empty segments
337	/// (from leading or consecutive slashes) are included.
338	pub fn path_segments(&self) -> impl Iterator<Item = &str> {
339		let path = self.path();
340		let path = if path.starts_with('/') { &path[1..] } else { path };
341		path.split('/')
342	}
343
344	/// Returns the query string of the URL, if present.
345	///
346	/// The returned string does not include the leading `?`.
347	pub fn query(&self) -> Option<&str> {
348		self.query.as_ref().map(|r| &self.serialization[r.clone()])
349	}
350
351	/// Returns an iterator over the query string's key-value pairs.
352	///
353	/// Pairs are separated by `&` and keys are separated from values by `=`.
354	/// If a pair has no `=`, the value will be an empty string.
355	pub fn query_pairs(&self) -> impl Iterator<Item = (&str, &str)> {
356		self.query().into_iter().flat_map(|q| {
357			q.split('&').map(|pair| {
358				if let Some(eq_pos) = pair.find('=') {
359					(&pair[..eq_pos], &pair[eq_pos + 1..])
360				} else {
361					(pair, "")
362				}
363			})
364		})
365	}
366
367	/// Returns the fragment identifier of the URL, if present.
368	///
369	/// The returned string does not include the leading `#`.
370	pub fn fragment(&self) -> Option<&str> {
371		self.fragment.as_ref().map(|r| &self.serialization[r.clone()])
372	}
373
374	/// Returns the serialized URL as a string slice.
375	pub fn as_str(&self) -> &str {
376		&self.serialization
377	}
378}
379
380impl std::fmt::Display for Url {
381	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
382		f.write_str(self.as_str())
383	}
384}
385
/// Unit tests for URL parsing and the `Url` accessors.
#[cfg(test)]
mod tests {
	use super::*;

	#[test]
	fn parse_simple_url() {
		let url = Url::parse("http://example.com").unwrap();
		assert_eq!(url.scheme(), "http");
		assert_eq!(url.base_url(), "example.com");
		assert_eq!(url.port(), None);
	}

	#[test]
	fn parse_url_with_port() {
		let url = Url::parse("https://example.com:8080").unwrap();
		assert_eq!(url.scheme(), "https");
		assert_eq!(url.base_url(), "example.com");
		assert_eq!(url.port(), Some(8080));
	}

	#[test]
	fn parse_url_with_path() {
		let url = Url::parse("http://example.com/path/to/resource").unwrap();
		assert_eq!(url.scheme(), "http");
		assert_eq!(url.base_url(), "example.com");
		assert_eq!(url.port(), None);
		assert_eq!(url.path(), "/path/to/resource");
	}

	#[test]
	fn empty_input_returns_error() {
		assert_eq!(Url::parse(""), Err(ParseError::EmptyInput));
	}

	#[test]
	fn missing_scheme_returns_error() {
		assert_eq!(Url::parse("example.com"), Err(ParseError::MissingScheme));
	}

	#[test]
	fn invalid_scheme_returns_error() {
		// Empty scheme
		assert_eq!(Url::parse("://example.com"), Err(ParseError::InvalidScheme));
		// Must start with a letter
		assert_eq!(Url::parse("1http://example.com"), Err(ParseError::InvalidScheme));
		assert_eq!(Url::parse("+http://example.com"), Err(ParseError::InvalidScheme));
		// Only letters, digits, '+', '-' and '.' may follow
		assert_eq!(Url::parse("ht_tp://example.com"), Err(ParseError::InvalidScheme));
		assert_eq!(Url::parse("ht tp://example.com"), Err(ParseError::InvalidScheme));
	}

	#[test]
	fn scheme_allows_plus_dash_dot_and_digits() {
		let url = Url::parse("svn+ssh://example.com").unwrap();
		assert_eq!(url.scheme(), "svn+ssh");

		let url = Url::parse("a-b.c1://example.com").unwrap();
		assert_eq!(url.scheme(), "a-b.c1");
	}

	#[test]
	fn invalid_character_returns_error() {
		// Control character
		assert!(matches!(
			Url::parse("http://example\x00.com"),
			Err(ParseError::InvalidCharacter('\x00'))
		));

		// Non-ASCII character
		assert!(matches!(Url::parse("http://exämple.com"), Err(ParseError::InvalidCharacter('ä'))));
	}

	#[test]
	fn scheme_is_lowercased() {
		let url = Url::parse("HTTP://EXAMPLE.COM").unwrap();
		assert_eq!(url.scheme(), "http");
	}

	#[test]
	fn default_port_is_elided_by_port() {
		let url = Url::parse("http://example.com:80/").unwrap();
		assert_eq!(url.port(), None);
		assert_eq!(url.port_or_known_default(), Some(80));

		let url = Url::parse("https://example.com:443").unwrap();
		assert_eq!(url.port(), None);
		assert_eq!(url.port_or_known_default(), Some(443));

		let url = Url::parse("ftp://example.com:21").unwrap();
		assert_eq!(url.port(), None);
		assert_eq!(url.port_or_known_default(), Some(21));
	}

	#[test]
	fn default_port_of_another_scheme_is_kept() {
		let url = Url::parse("http://example.com:443").unwrap();
		assert_eq!(url.port(), Some(443));
		assert_eq!(url.port_or_known_default(), Some(443));
	}

	#[test]
	fn default_port_elision_uses_lowercased_scheme() {
		let url = Url::parse("HTTPS://example.com:443").unwrap();
		assert_eq!(url.port(), None);
		assert_eq!(url.port_or_known_default(), Some(443));
	}

	#[test]
	fn port_or_known_default_falls_back_to_scheme() {
		assert_eq!(Url::parse("https://example.com").unwrap().port_or_known_default(), Some(443));
		assert_eq!(Url::parse("ws://example.com").unwrap().port_or_known_default(), Some(80));
		assert_eq!(Url::parse("wss://example.com").unwrap().port_or_known_default(), Some(443));
		assert_eq!(Url::parse("foo://example.com").unwrap().port_or_known_default(), None);
		assert_eq!(
			Url::parse("foo://example.com:8080").unwrap().port_or_known_default(),
			Some(8080)
		);
	}

	#[test]
	fn out_of_range_port_returns_error() {
		assert_eq!(Url::parse("http://example.com:65536"), Err(ParseError::InvalidPort));
		assert_eq!(Url::parse("http://[::1]:99999/"), Err(ParseError::InvalidPort));

		let url = Url::parse("http://example.com:65535").unwrap();
		assert_eq!(url.port(), Some(65535));
	}

	#[test]
	fn path_returns_full_path() {
		let url = Url::parse("http://example.com/path/to/resource").unwrap();
		assert_eq!(url.path(), "/path/to/resource");
	}

	#[test]
	fn path_is_empty_when_not_specified() {
		let url = Url::parse("http://example.com").unwrap();
		assert_eq!(url.path(), "");
	}

	#[test]
	fn path_segments_splits_correctly() {
		let url = Url::parse("http://example.com/path/to/resource").unwrap();
		let segments: Vec<&str> = url.path_segments().collect();
		assert_eq!(segments, vec!["path", "to", "resource"]);
	}

	#[test]
	fn path_segments_handles_empty_path() {
		let url = Url::parse("http://example.com").unwrap();
		let segments: Vec<&str> = url.path_segments().collect();
		assert_eq!(segments, vec![""]);
	}

	#[test]
	fn path_segments_keeps_empty_segments() {
		let url = Url::parse("http://example.com/a//b/").unwrap();
		let segments: Vec<&str> = url.path_segments().collect();
		assert_eq!(segments, vec!["a", "", "b", ""]);
	}

	#[test]
	fn path_stops_at_query_string() {
		let url = Url::parse("http://example.com/path?query=value").unwrap();
		assert_eq!(url.path(), "/path");
	}

	#[test]
	fn path_stops_at_fragment() {
		let url = Url::parse("http://example.com/path#section").unwrap();
		assert_eq!(url.path(), "/path");
	}

	#[test]
	fn query_returns_query_string() {
		let url = Url::parse("http://example.com/path?foo=bar&baz=qux").unwrap();
		assert_eq!(url.query(), Some("foo=bar&baz=qux"));
	}

	#[test]
	fn query_is_none_when_not_present() {
		let url = Url::parse("http://example.com/path").unwrap();
		assert_eq!(url.query(), None);
	}

	#[test]
	fn query_stops_at_fragment() {
		let url = Url::parse("http://example.com/path?query=value#section").unwrap();
		assert_eq!(url.query(), Some("query=value"));
	}

	#[test]
	fn query_and_fragment_without_path() {
		let url = Url::parse("http://example.com?foo=bar#frag").unwrap();
		assert_eq!(url.base_url(), "example.com");
		assert_eq!(url.path(), "");
		assert_eq!(url.query(), Some("foo=bar"));
		assert_eq!(url.fragment(), Some("frag"));
	}

	#[test]
	fn empty_query_and_fragment_are_some_empty() {
		let url = Url::parse("http://example.com/path?#").unwrap();
		assert_eq!(url.path(), "/path");
		assert_eq!(url.query(), Some(""));
		assert_eq!(url.fragment(), Some(""));
	}

	#[test]
	fn query_pairs_parses_key_value_pairs() {
		let url = Url::parse("http://example.com?foo=bar&baz=qux").unwrap();
		let pairs: Vec<(&str, &str)> = url.query_pairs().collect();
		assert_eq!(pairs, vec![("foo", "bar"), ("baz", "qux")]);
	}

	#[test]
	fn query_pairs_handles_missing_value() {
		let url = Url::parse("http://example.com?foo&bar=baz").unwrap();
		let pairs: Vec<(&str, &str)> = url.query_pairs().collect();
		assert_eq!(pairs, vec![("foo", ""), ("bar", "baz")]);
	}

	#[test]
	fn query_pairs_splits_on_first_equals_only() {
		let url = Url::parse("http://example.com?a=b=c").unwrap();
		let pairs: Vec<(&str, &str)> = url.query_pairs().collect();
		assert_eq!(pairs, vec![("a", "b=c")]);
	}

	#[test]
	fn query_pairs_is_empty_when_no_query() {
		let url = Url::parse("http://example.com").unwrap();
		let pairs: Vec<(&str, &str)> = url.query_pairs().collect();
		assert!(pairs.is_empty());
	}

	#[test]
	fn fragment_returns_fragment() {
		let url = Url::parse("http://example.com/path#section").unwrap();
		assert_eq!(url.fragment(), Some("section"));
	}

	#[test]
	fn fragment_is_none_when_not_present() {
		let url = Url::parse("http://example.com/path").unwrap();
		assert_eq!(url.fragment(), None);
	}

	#[test]
	fn fragment_with_query() {
		let url = Url::parse("http://example.com/path?query=value#section").unwrap();
		assert_eq!(url.query(), Some("query=value"));
		assert_eq!(url.fragment(), Some("section"));
	}

	#[test]
	fn fragment_without_path_or_query() {
		let url = Url::parse("http://example.com#section").unwrap();
		assert_eq!(url.path(), "");
		assert_eq!(url.query(), None);
		assert_eq!(url.fragment(), Some("section"));
	}

	#[test]
	fn as_str_returns_full_url() {
		let url = Url::parse("http://example.com/path?query=value#section").unwrap();
		assert_eq!(url.as_str(), "http://example.com/path?query=value#section");
	}

	#[test]
	fn as_str_with_port() {
		let url = Url::parse("https://example.com:8080/path").unwrap();
		assert_eq!(url.as_str(), "https://example.com:8080/path");
	}

	#[test]
	fn as_str_normalizes_scheme_to_lowercase() {
		let url = Url::parse("HTTP://EXAMPLE.COM/path").unwrap();
		assert_eq!(url.as_str(), "http://EXAMPLE.COM/path");
	}

	#[test]
	fn as_str_minimal_url() {
		let url = Url::parse("http://example.com").unwrap();
		assert_eq!(url.as_str(), "http://example.com");
	}

	#[test]
	fn display_matches_as_str() {
		let url = Url::parse("http://example.com/path?query=value#section").unwrap();
		assert_eq!(format!("{}", url), url.as_str());
	}

	#[test]
	fn display_can_be_used_in_format_string() {
		let url = Url::parse("http://example.com").unwrap();
		let formatted = format!("URL: {}", url);
		assert_eq!(formatted, "URL: http://example.com");
	}

	#[test]
	fn ipv6_without_port() {
		let url = Url::parse("http://[::1]/path").unwrap();
		assert_eq!(url.scheme(), "http");
		assert_eq!(url.base_url(), "[::1]");
		assert_eq!(url.port(), None);
		assert_eq!(url.path(), "/path");
	}

	#[test]
	fn ipv6_with_port() {
		let url = Url::parse("http://[::1]:8080/path").unwrap();
		assert_eq!(url.scheme(), "http");
		assert_eq!(url.base_url(), "[::1]");
		assert_eq!(url.port(), Some(8080));
		assert_eq!(url.path(), "/path");
	}

	#[test]
	fn ipv6_full_address_with_port() {
		let url = Url::parse("http://[2001:db8::1]:443/").unwrap();
		assert_eq!(url.base_url(), "[2001:db8::1]");
		assert_eq!(url.port(), Some(443));
	}

	#[test]
	fn ipv6_as_str_roundtrip() {
		let url = Url::parse("http://[::1]:8080/path").unwrap();
		assert_eq!(url.as_str(), "http://[::1]:8080/path");
	}

	#[test]
	fn userinfo_with_username_only() {
		let url = Url::parse("http://user@example.com/path").unwrap();
		assert_eq!(url.username(), "user");
		assert_eq!(url.password(), None);
		assert_eq!(url.base_url(), "example.com");
		assert_eq!(url.path(), "/path");
	}

	#[test]
	fn userinfo_with_username_and_password() {
		let url = Url::parse("http://user:pass@example.com/path").unwrap();
		assert_eq!(url.username(), "user");
		assert_eq!(url.password(), Some("pass"));
		assert_eq!(url.base_url(), "example.com");
		assert_eq!(url.path(), "/path");
	}

	#[test]
	fn userinfo_with_port() {
		let url = Url::parse("http://user:pass@example.com:8080/path").unwrap();
		assert_eq!(url.username(), "user");
		assert_eq!(url.password(), Some("pass"));
		assert_eq!(url.base_url(), "example.com");
		assert_eq!(url.port(), Some(8080));
	}

	#[test]
	fn userinfo_empty_when_not_present() {
		let url = Url::parse("http://example.com/path").unwrap();
		assert_eq!(url.username(), "");
		assert_eq!(url.password(), None);
	}

	#[test]
	fn userinfo_as_str_roundtrip() {
		let url = Url::parse("http://user:pass@example.com:8080/path").unwrap();
		assert_eq!(url.as_str(), "http://user:pass@example.com:8080/path");
	}

	#[test]
	fn userinfo_with_empty_password() {
		let url = Url::parse("http://user:@example.com").unwrap();
		assert_eq!(url.username(), "user");
		assert_eq!(url.password(), Some(""));
		assert_eq!(url.base_url(), "example.com");
	}

	#[test]
	fn parse_error_display() {
		assert_eq!(ParseError::EmptyInput.to_string(), "empty input");
		assert_eq!(ParseError::InvalidCharacter('\x00').to_string(), "invalid character: '\\0'");
		assert_eq!(ParseError::MissingScheme.to_string(), "missing scheme");
		assert_eq!(ParseError::InvalidScheme.to_string(), "invalid scheme");
		assert_eq!(ParseError::EmptyHost.to_string(), "empty host");
		assert_eq!(ParseError::InvalidPort.to_string(), "invalid port");
	}

	#[test]
	fn empty_host_returns_error() {
		assert_eq!(Url::parse("http:///path"), Err(ParseError::EmptyHost));
		assert_eq!(Url::parse("http://:8080/path"), Err(ParseError::EmptyHost));
		assert_eq!(Url::parse("http://user@/path"), Err(ParseError::EmptyHost));
	}

	#[test]
	fn parse_error_is_std_error() {
		fn assert_error<E: std::error::Error>(_: &E) {}
		assert_error(&ParseError::EmptyInput);
	}
}