linkify/
url.rs

1use std::char;
2use std::ops::Range;
3
4use crate::domains::find_authority_end;
5use crate::scanner::Scanner;
6
/// Minimum valid URL length
///
/// The shortest valid URL (without a scheme) might be g.cn (Google China),
/// which consists of four characters.
/// We use this as a lower threshold when parsing URLs from plaintext,
/// both to avoid false positives and as a slight performance optimization.
/// `DomainScanner::scan` compares it against the byte length of the whole
/// input string and gives up early when the input is shorter.
/// This threshold might be adjusted in the future.
const MIN_URL_LENGTH: usize = 4;
15
/// Quote characters that may enclose a link.
///
/// When the backwards scan for the start of a link stops at one of these,
/// the character is remembered and the same character later terminates the
/// link in `find_url_end`.
const QUOTES: &[char] = &['\'', '\"'];
17
/// Scan for URLs starting from the trigger character ":" (requires "://").
///
/// Based on RFC 3986.
pub struct UrlScanner {
    /// Forwarded to the authority and URL-end scanning. When `false`,
    /// a non-ASCII character can not be the last character of the path part
    /// of a URL (see `find_url_end`).
    pub iri_parsing_enabled: bool,
}
24
/// Scan for plain domains (without scheme) such as `test.com` or `test.com/hi-there`.
pub struct DomainScanner {
    /// Forwarded to the domain-start, authority and URL-end scanning. When
    /// `true`, non-ASCII characters are accepted as part of the first domain
    /// label (see `find_domain_start`).
    pub iri_parsing_enabled: bool,
}
29
30impl Scanner for UrlScanner {
31    /// Scan for an URL at the given separator index in the string.
32    ///
33    /// Returns `None` if none was found.
34    fn scan(&self, s: &str, separator: usize) -> Option<Range<usize>> {
35        // There must be something before separator for scheme
36        if separator == 0 {
37            return None;
38        }
39
40        if !s[separator..].starts_with("://") {
41            // We only support schemes with authority, not things like `myscheme:mything`.
42            return None;
43        }
44
45        let after_separator = separator + "://".len();
46
47        // Need at least one character after '//'
48        if after_separator >= s.len() {
49            return None;
50        }
51
52        if let (Some(start), quote) = find_scheme_start(&s[0..separator]) {
53            let scheme = &s[start..separator];
54            let s = &s[after_separator..];
55
56            let require_host = scheme_requires_host(scheme);
57
58            if let (Some(after_authority), _) =
59                find_authority_end(s, true, require_host, true, self.iri_parsing_enabled)
60            {
61                if let Some(end) =
62                    find_url_end(&s[after_authority..], quote, self.iri_parsing_enabled)
63                {
64                    if after_authority == 0 && end == 0 {
65                        return None;
66                    }
67
68                    let range = Range {
69                        start,
70                        end: after_separator + after_authority + end,
71                    };
72                    return Some(range);
73                }
74            }
75        }
76
77        None
78    }
79}
80
81impl Scanner for DomainScanner {
82    fn scan(&self, s: &str, separator: usize) -> Option<Range<usize>> {
83        // There must be something before separator for domain, and a minimum number of characters
84        if separator == 0 || s.len() < MIN_URL_LENGTH {
85            return None;
86        }
87
88        if let (Some(start), quote) = find_domain_start(&s[0..separator], self.iri_parsing_enabled)
89        {
90            let s = &s[start..];
91
92            if let (Some(domain_end), Some(_)) =
93                find_authority_end(s, false, true, true, self.iri_parsing_enabled)
94            {
95                if let Some(end) = find_url_end(&s[domain_end..], quote, self.iri_parsing_enabled) {
96                    let range = Range {
97                        start,
98                        end: start + domain_end + end,
99                    };
100                    return Some(range);
101                }
102            }
103        }
104
105        None
106    }
107}
108
109/// Find start of scheme, e.g. from `https://`, start at `s` and end at `h`.
110fn find_scheme_start(s: &str) -> (Option<usize>, Option<char>) {
111    let mut first = None;
112    let mut special = None;
113    let mut quote = None;
114    for (i, c) in s.char_indices().rev() {
115        match c {
116            'a'..='z' | 'A'..='Z' => first = Some(i),
117            '0'..='9' => special = Some(i),
118            '+' | '-' | '.' => {}
119            '@' => return (None, None),
120            c if QUOTES.contains(&c) => {
121                // Check if there's a quote before the scheme,
122                // and stop once we encounter one of those quotes.
123                // https://github.com/robinst/linkify/issues/20
124                quote = Some(c);
125                break;
126            }
127            _ => break,
128        }
129    }
130
131    // We don't want to extract "abc://foo" out of "1abc://foo".
132    // ".abc://foo" and others are ok though, as they feel more like separators.
133    if let Some(first) = first {
134        if let Some(special) = special {
135            // Comparing the byte indices with `- 1` is ok as scheme must be ASCII
136            if first > 0 && first - 1 == special {
137                return (None, quote);
138            }
139        }
140    }
141    (first, quote)
142}
143
/// Whether a scheme requires that authority looks like a host name (domain or IP address) or not
/// (can contain reg-name with arbitrary allowed characters).
///
/// Schemes are case-insensitive (RFC 3986, section 3.1), so `HTTP` is treated
/// the same as `http`.
///
/// We could make this configurable, but let's keep it simple until someone asks (hi!).
fn scheme_requires_host(scheme: &str) -> bool {
    const HOST_SCHEMES: [&str; 4] = ["https", "http", "ftp", "ssh"];

    HOST_SCHEMES
        .iter()
        .any(|known| scheme.eq_ignore_ascii_case(known))
}
154
155/// Find the start of a plain domain URL (no scheme), e.g. from `blog.`, start at `g` and end at `b`.
156/// The rules are:
157/// - Domain is labels separated by `.`. Because we're starting at the first `.`, we only need to
158///   handle one label.
159/// - Label can not start or end with `-`
160/// - Label can contain letters, digits, `-` or Unicode if iri_allowed flag is true
161fn find_domain_start(s: &str, iri_parsing_enabled: bool) -> (Option<usize>, Option<char>) {
162    let mut first = None;
163    let mut quote = None;
164
165    for (i, c) in s.char_indices().rev() {
166        match c {
167            'a'..='z' | 'A'..='Z' | '0'..='9' => first = Some(i),
168            '\u{80}'..=char::MAX if iri_parsing_enabled => first = Some(i),
169            // If we had something valid like `https://www.` we'd have found it with the ":"
170            // scanner already. We don't want to allow `.../www.example.com` just by itself.
171            // We *could* allow `//www.example.com` (scheme-relative URLs) in the future.
172            '/' => return (None, None),
173            // Similar to above, if this was an email we'd have found it already.
174            '@' => return (None, None),
175            // If this was a valid domain, we'd have extracted it already from the previous "."
176            '.' => return (None, None),
177            '-' => {
178                if first == None {
179                    // Domain label can't end with `-`
180                    return (None, None);
181                } else {
182                    first = Some(i);
183                }
184            }
185            c if QUOTES.contains(&c) => {
186                // Check if there's a quote before, and stop once we encounter one of those quotes,
187                // e.g. with `"www.example.com"`
188                quote = Some(c);
189                break;
190            }
191            _ => break,
192        }
193    }
194
195    if let Some(first) = first {
196        if s[first..].starts_with('-') {
197            // Domain label can't start with `-`
198            return (None, None);
199        }
200    }
201
202    (first, quote)
203}
204
/// Find the end of a URL. At this point we already scanned past a valid authority. So e.g. in
/// `https://example.com/foo` we're starting at `/` and want to end at `o`.
///
/// `s` is the text directly after the authority. `quote` is the quote character
/// found in front of the link, if any; the same character ends the URL. When
/// `iri_parsing_enabled` is false, non-ASCII characters can be inside the URL
/// but not at its end.
///
/// Returns the byte offset into `s` just past the last character of the URL.
/// This is `Some(0)` when `s` does not continue the URL with a path (`/`),
/// query (`?`) or fragment (`#`). The current implementation never returns `None`.
fn find_url_end(s: &str, quote: Option<char>, iri_parsing_enabled: bool) -> Option<usize> {
    // After the authority a URL can only continue with a path, a query or a
    // fragment (RFC 3986, section 3). Anything else belongs to the surrounding text.
    if !s.starts_with(&['/', '?', '#'][..]) {
        return Some(0);
    }

    // Nesting depth of each bracket kind; a closing bracket without a matching
    // opening one ends the URL.
    let mut round = 0;
    let mut square = 0;
    let mut curly = 0;
    // True while an odd number of single quotes has been seen.
    let mut single_quote = false;

    let mut previous_can_be_last = true;
    let mut end = 0;

    for (i, c) in s.char_indices() {
        let can_be_last = match c {
            '\u{00}'..='\u{1F}' | ' ' | '|' | '\"' | '<' | '>' | '`' | '\u{7F}'..='\u{9F}' => {
                // These can never be part of an URL, so stop now. See RFC 3986 and RFC 3987.
                // Some characters are not in the above list, even they are not in "unreserved"
                // or "reserved":
                //   '\\', '^', '{', '}'
                // The reason for this is that other link detectors also allow them. Also see
                // below, we require the braces to be balanced.
                break;
            }
            '?' | '!' | '.' | ',' | ':' | ';' | '*' => {
                // These may be part of an URL but not at the end. It's not that the spec
                // doesn't allow them, but they are frequently used in plain text as delimiters
                // where they're not meant to be part of the URL.
                false
            }
            '/' => {
                // This may be part of an URL and at the end, but not if the previous character
                // can't be the end of an URL
                previous_can_be_last
            }
            '(' => {
                round += 1;
                false
            }
            ')' => {
                round -= 1;
                if round < 0 {
                    // More closing than opening brackets, stop now
                    break;
                }
                true
            }
            '[' => {
                square += 1;
                false
            }
            ']' => {
                square -= 1;
                if square < 0 {
                    // More closing than opening brackets, stop now
                    break;
                }
                true
            }
            '{' => {
                curly += 1;
                false
            }
            '}' => {
                curly -= 1;
                if curly < 0 {
                    // More closing than opening brackets, stop now
                    break;
                }
                true
            }
            _ if Some(c) == quote => {
                // Found matching quote from beginning of URL, stop now
                break;
            }
            '\'' => {
                single_quote = !single_quote;
                // A single quote can only be the end of an URL if there's an even number
                !single_quote
            }
            '\u{80}'..=char::MAX if !iri_parsing_enabled => false,

            _ => true,
        };
        if can_be_last {
            end = i + c.len_utf8();
        }
        previous_can_be_last = can_be_last;
    }

    Some(end)
}