linkify/url.rs
1use std::char;
2use std::ops::Range;
3
4use crate::domains::find_authority_end;
5use crate::scanner::Scanner;
6
/// Minimum length, in bytes, of an input that is scanned for scheme-less URLs.
///
/// The shortest plausible URL without a scheme is something like `g.cn`
/// (Google China), which is four characters long. Shorter inputs are rejected
/// up front, both to avoid false positives and as a small performance
/// optimization. The threshold might be adjusted in the future.
const MIN_URL_LENGTH: usize = "g.cn".len();
15
/// Quote characters that may directly precede a link; when one is found, the same
/// character also terminates the link.
const QUOTES: &[char] = &['\'', '"'];
17
/// Scan for URLs starting from the trigger character ":" (requires "://").
///
/// Based on RFC 3986.
#[derive(Debug, Clone)]
pub struct UrlScanner {
    /// Whether non-ASCII (IRI) characters are accepted while scanning; the flag is
    /// forwarded unchanged to the authority and URL-end helpers.
    pub iri_parsing_enabled: bool,
}
24
/// Scan for plain domains (without scheme) such as `test.com` or `test.com/hi-there`.
#[derive(Debug, Clone)]
pub struct DomainScanner {
    /// Whether non-ASCII (IRI) characters are accepted while scanning, e.g. in the
    /// domain label that precedes the first `.`.
    pub iri_parsing_enabled: bool,
}
29
30impl Scanner for UrlScanner {
31 /// Scan for an URL at the given separator index in the string.
32 ///
33 /// Returns `None` if none was found.
34 fn scan(&self, s: &str, separator: usize) -> Option<Range<usize>> {
35 // There must be something before separator for scheme
36 if separator == 0 {
37 return None;
38 }
39
40 if !s[separator..].starts_with("://") {
41 // We only support schemes with authority, not things like `myscheme:mything`.
42 return None;
43 }
44
45 let after_separator = separator + "://".len();
46
47 // Need at least one character after '//'
48 if after_separator >= s.len() {
49 return None;
50 }
51
52 if let (Some(start), quote) = find_scheme_start(&s[0..separator]) {
53 let scheme = &s[start..separator];
54 let s = &s[after_separator..];
55
56 let require_host = scheme_requires_host(scheme);
57
58 if let (Some(after_authority), _) =
59 find_authority_end(s, true, require_host, true, self.iri_parsing_enabled)
60 {
61 if let Some(end) =
62 find_url_end(&s[after_authority..], quote, self.iri_parsing_enabled)
63 {
64 if after_authority == 0 && end == 0 {
65 return None;
66 }
67
68 let range = Range {
69 start,
70 end: after_separator + after_authority + end,
71 };
72 return Some(range);
73 }
74 }
75 }
76
77 None
78 }
79}
80
81impl Scanner for DomainScanner {
82 fn scan(&self, s: &str, separator: usize) -> Option<Range<usize>> {
83 // There must be something before separator for domain, and a minimum number of characters
84 if separator == 0 || s.len() < MIN_URL_LENGTH {
85 return None;
86 }
87
88 if let (Some(start), quote) = find_domain_start(&s[0..separator], self.iri_parsing_enabled)
89 {
90 let s = &s[start..];
91
92 if let (Some(domain_end), Some(_)) =
93 find_authority_end(s, false, true, true, self.iri_parsing_enabled)
94 {
95 if let Some(end) = find_url_end(&s[domain_end..], quote, self.iri_parsing_enabled) {
96 let range = Range {
97 start,
98 end: start + domain_end + end,
99 };
100 return Some(range);
101 }
102 }
103 }
104
105 None
106 }
107}
108
109/// Find start of scheme, e.g. from `https://`, start at `s` and end at `h`.
110fn find_scheme_start(s: &str) -> (Option<usize>, Option<char>) {
111 let mut first = None;
112 let mut special = None;
113 let mut quote = None;
114 for (i, c) in s.char_indices().rev() {
115 match c {
116 'a'..='z' | 'A'..='Z' => first = Some(i),
117 '0'..='9' => special = Some(i),
118 '+' | '-' | '.' => {}
119 '@' => return (None, None),
120 c if QUOTES.contains(&c) => {
121 // Check if there's a quote before the scheme,
122 // and stop once we encounter one of those quotes.
123 // https://github.com/robinst/linkify/issues/20
124 quote = Some(c);
125 break;
126 }
127 _ => break,
128 }
129 }
130
131 // We don't want to extract "abc://foo" out of "1abc://foo".
132 // ".abc://foo" and others are ok though, as they feel more like separators.
133 if let Some(first) = first {
134 if let Some(special) = special {
135 // Comparing the byte indices with `- 1` is ok as scheme must be ASCII
136 if first > 0 && first - 1 == special {
137 return (None, quote);
138 }
139 }
140 }
141 (first, quote)
142}
143
/// Whether a scheme requires that authority looks like a host name (domain or IP address) or not
/// (can contain reg-name with arbitrary allowed characters).
///
/// The comparison ignores ASCII case, because schemes are case-insensitive (RFC 3986,
/// section 3.1): `HTTP` is treated exactly like `http`.
///
/// We could make this configurable, but let's keep it simple until someone asks (hi!).
fn scheme_requires_host(scheme: &str) -> bool {
    const HOST_SCHEMES: [&str; 4] = ["https", "http", "ftp", "ssh"];

    HOST_SCHEMES
        .iter()
        .any(|known| scheme.eq_ignore_ascii_case(known))
}
154
155/// Find the start of a plain domain URL (no scheme), e.g. from `blog.`, start at `g` and end at `b`.
156/// The rules are:
157/// - Domain is labels separated by `.`. Because we're starting at the first `.`, we only need to
158/// handle one label.
159/// - Label can not start or end with `-`
160/// - Label can contain letters, digits, `-` or Unicode if iri_allowed flag is true
161fn find_domain_start(s: &str, iri_parsing_enabled: bool) -> (Option<usize>, Option<char>) {
162 let mut first = None;
163 let mut quote = None;
164
165 for (i, c) in s.char_indices().rev() {
166 match c {
167 'a'..='z' | 'A'..='Z' | '0'..='9' => first = Some(i),
168 '\u{80}'..=char::MAX if iri_parsing_enabled => first = Some(i),
169 // If we had something valid like `https://www.` we'd have found it with the ":"
170 // scanner already. We don't want to allow `.../www.example.com` just by itself.
171 // We *could* allow `//www.example.com` (scheme-relative URLs) in the future.
172 '/' => return (None, None),
173 // Similar to above, if this was an email we'd have found it already.
174 '@' => return (None, None),
175 // If this was a valid domain, we'd have extracted it already from the previous "."
176 '.' => return (None, None),
177 '-' => {
178 if first == None {
179 // Domain label can't end with `-`
180 return (None, None);
181 } else {
182 first = Some(i);
183 }
184 }
185 c if QUOTES.contains(&c) => {
186 // Check if there's a quote before, and stop once we encounter one of those quotes,
187 // e.g. with `"www.example.com"`
188 quote = Some(c);
189 break;
190 }
191 _ => break,
192 }
193 }
194
195 if let Some(first) = first {
196 if s[first..].starts_with('-') {
197 // Domain label can't start with `-`
198 return (None, None);
199 }
200 }
201
202 (first, quote)
203}
204
/// Find the end of a URL. At this point a valid authority has already been scanned, so
/// e.g. for `https://example.com/foo` the input is `/foo` and the result is `Some(4)`.
///
/// - `s`: the text directly after the authority.
/// - `quote`: the quote character found in front of the URL, if any; the same character
///   ends the URL.
/// - `iri_parsing_enabled`: whether a non-ASCII character may be the last character of
///   the URL.
///
/// Returns the byte length of the part of `s` that belongs to the URL (`0` if `s` does not
/// start with `/` or `?`). This implementation always returns `Some`.
fn find_url_end(s: &str, quote: Option<char>, iri_parsing_enabled: bool) -> Option<usize> {
    // Only a path or a query can follow the authority.
    if !(s.starts_with('/') || s.starts_with('?')) {
        return Some(0);
    }

    // Nesting depth of `()`, `[]` and `{}`.
    let mut round = 0i32;
    let mut square = 0i32;
    let mut curly = 0i32;
    // Whether an odd number of `'` has been seen so far.
    let mut inside_single_quotes = false;

    let mut prev_may_end = true;
    let mut end = 0;

    for (offset, ch) in s.char_indices() {
        let may_end = match ch {
            // These can never be part of an URL, so stop now. See RFC 3986 and RFC 3987.
            // Some characters are deliberately missing from this list even though they are
            // neither "unreserved" nor "reserved": '\\', '^', '{', '}'. Other link detectors
            // allow them as well, and braces are required to be balanced (see below).
            '\u{00}'..='\u{1F}' | ' ' | '|' | '"' | '<' | '>' | '`' | '\u{7F}'..='\u{9F}' => break,
            // Allowed inside an URL but not at its end: the spec would permit them, but in
            // plain text they are usually delimiters that are not meant to be part of it.
            '?' | '!' | '.' | ',' | ':' | ';' | '*' => false,
            // May end an URL, unless the character before it could not.
            '/' => prev_may_end,
            '(' | '[' | '{' => {
                let depth = match ch {
                    '(' => &mut round,
                    '[' => &mut square,
                    _ => &mut curly,
                };
                *depth += 1;
                false
            }
            ')' | ']' | '}' => {
                let depth = match ch {
                    ')' => &mut round,
                    ']' => &mut square,
                    _ => &mut curly,
                };
                *depth -= 1;
                if *depth < 0 {
                    // More closing than opening brackets, stop now
                    break;
                }
                true
            }
            // Found the quote matching the one in front of the URL, stop now
            _ if quote == Some(ch) => break,
            '\'' => {
                inside_single_quotes = !inside_single_quotes;
                // A single quote can only end an URL if there is an even number of them
                !inside_single_quotes
            }
            // Non-ASCII characters can only end an URL when IRIs are being parsed.
            '\u{80}'..=char::MAX => iri_parsing_enabled,
            _ => true,
        };

        if may_end {
            end = offset + ch.len_utf8();
        }
        prev_may_end = may_end;
    }

    Some(end)
}