Skip to main content

oxiui_text/
hyperlink.rs

1//! Hyperlink detection in plain text strings.
2//!
//! Uses a conservative hand-written matcher (no regex crate) that finds
//! substrings starting with `http://`, `https://`, or `www.` and running to
//! the next ASCII whitespace (or end of input); common punctuation that is
//! unlikely to be part of a URL is then stripped from the end of the match.
7
8// ── HyperlinkSpan ─────────────────────────────────────────────────────────────
9
/// One URL-like match located inside a larger piece of text.
///
/// `start` and `end` are byte offsets into the text that was searched, so
/// `&text[start..end]` yields the same characters as `url`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct HyperlinkSpan {
    /// Byte index at which the URL begins (inclusive).
    pub start: usize,
    /// Byte index one past the URL's final byte (exclusive).
    pub end: usize,
    /// Owned copy of the matched URL text.
    pub url: String,
}
20
21// ── Punctuation that terminates a URL ────────────────────────────────────────
22
/// Punctuation that is dropped when it forms the tail of a candidate URL.
/// The same characters are left alone when they occur inside the URL.
const URL_TERMINATORS: &[char] = &[
    '.', ',', '!', '?', ';', ':', ')', ']', '}', '\'', '"',
];
26
27// ── find_hyperlinks ───────────────────────────────────────────────────────────
28
29/// Find all URL-like substrings in `text`.
30///
31/// A URL-like substring starts with `http://`, `https://`, or `www.` and
32/// continues to the next ASCII whitespace (or end-of-string), with trailing
33/// punctuation stripped.
34pub fn find_hyperlinks(text: &str) -> Vec<HyperlinkSpan> {
35    let mut spans: Vec<HyperlinkSpan> = Vec::new();
36    let bytes = text.as_bytes();
37    let len = bytes.len();
38
39    let mut i = 0usize;
40    while i < len {
41        // Try to match a URL prefix at position `i`.
42        let prefix = try_match_prefix(text, i);
43        if let Some(prefix_end) = prefix {
44            // Extend to the end of the URL (whitespace-terminated).
45            let url_end = extend_url(text, i, prefix_end);
46            let url = &text[i..url_end];
47            // Strip trailing punctuation that is probably not part of the URL.
48            let url = strip_trailing_punct(url);
49            let url_end = i + url.len();
50            if url_end > i {
51                spans.push(HyperlinkSpan {
52                    start: i,
53                    end: url_end,
54                    url: url.to_owned(),
55                });
56                i = url_end;
57                continue;
58            }
59        }
60        // Advance by one character.
61        i += char_len_at(bytes, i);
62    }
63
64    spans
65}
66
67// ── Helpers ───────────────────────────────────────────────────────────────────
68
/// Try to match a URL prefix (`http://`, `https://`, `www.`) starting at
/// byte offset `pos` in `text`.
///
/// Returns the byte offset of the character immediately after the prefix
/// when one matches.  Returns `None` when no prefix starts at `pos`, and
/// also when `pos` is past the end of `text` or not on a character
/// boundary, so a bad offset yields "no match" instead of a slicing panic.
/// Matching is case-sensitive.
fn try_match_prefix(text: &str, pos: usize) -> Option<usize> {
    const PREFIXES: [&str; 3] = ["https://", "http://", "www."];

    // `get` performs the bounds and char-boundary checks that `&text[pos..]`
    // would otherwise enforce by panicking.
    let rest = text.get(pos..)?;
    PREFIXES
        .iter()
        .find(|prefix| rest.starts_with(**prefix))
        .map(|prefix| pos + prefix.len())
}
81
/// Locate where the URL candidate beginning at byte offset `start` stops.
///
/// The candidate runs up to, but not including, the first ASCII whitespace
/// character at or after `start`; if there is none it runs to the end of
/// `text`.  The returned value is a byte offset into `text`.
/// `_prefix_end` is accepted for the caller's convenience and is not used.
fn extend_url(text: &str, start: usize, _prefix_end: usize) -> usize {
    let tail = &text[start..];
    match tail.find(|c: char| c.is_ascii_whitespace()) {
        // `offset` is relative to `tail`, so shift it back into `text`.
        Some(offset) => start + offset,
        None => text.len(),
    }
}
93
94/// Strip trailing punctuation from the candidate URL slice.
95fn strip_trailing_punct(url: &str) -> &str {
96    let mut end = url.len();
97    while end > 0 {
98        let ch = url[..end].chars().next_back().unwrap_or('\0');
99        if URL_TERMINATORS.contains(&ch) {
100            end -= ch.len_utf8();
101        } else {
102            break;
103        }
104    }
105    &url[..end]
106}
107
/// Byte length of the UTF-8 sequence whose lead byte is `bytes[pos]`.
///
/// The length is derived from the lead byte alone.  A continuation byte
/// (`0x80..=0xBF`) is treated as length 1 so a caller stepping through the
/// buffer always makes progress.  Panics if `pos` is out of bounds.
fn char_len_at(bytes: &[u8], pos: usize) -> usize {
    match bytes[pos] {
        0x00..=0x7F => 1, // ASCII
        0x80..=0xBF => 1, // continuation byte: not a valid start, step one byte
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}
118
119// ── Tests ─────────────────────────────────────────────────────────────────────
120
#[cfg(test)]
mod tests {
    use super::*;

    /// Collect just the matched URL strings, in order of appearance.
    fn urls(text: &str) -> Vec<String> {
        find_hyperlinks(text)
            .into_iter()
            .map(|span| span.url)
            .collect()
    }

    #[test]
    fn hyperlink_finds_https() {
        assert_eq!(
            urls("visit https://example.com today"),
            ["https://example.com"]
        );
    }

    #[test]
    fn hyperlink_finds_http() {
        assert_eq!(
            urls("see http://example.com/path"),
            ["http://example.com/path"]
        );
    }

    #[test]
    fn hyperlink_finds_www() {
        assert_eq!(urls("see www.example.com"), ["www.example.com"]);
    }

    #[test]
    fn hyperlink_ignores_plain_text() {
        assert!(find_hyperlinks("hello world").is_empty());
    }

    #[test]
    fn hyperlink_multiple_urls() {
        assert_eq!(
            urls("a https://a.com b http://b.com c"),
            ["https://a.com", "http://b.com"]
        );
    }

    #[test]
    fn hyperlink_strips_trailing_punctuation() {
        assert_eq!(urls("visit https://example.com."), ["https://example.com"]);
        // Only the trailing run goes; the interior '.' and '?' must survive.
        assert_eq!(
            urls("(see https://example.com/a.b?c=d)!"),
            ["https://example.com/a.b?c=d"]
        );
    }

    #[test]
    fn hyperlink_correct_byte_offsets() {
        let text = "x https://example.com y";
        let spans = find_hyperlinks(text);
        assert_eq!(spans.len(), 1);
        let span = &spans[0];
        assert_eq!((span.start, span.end), (2, 21));
        assert_eq!(&text[span.start..span.end], span.url.as_str());
    }

    #[test]
    fn hyperlink_offsets_with_multibyte_text() {
        // 'é' and 'ä' are two bytes each, so byte offsets differ from char
        // counts both before and inside the URL.
        let text = "h\u{e9}llo https://example.com/\u{e4}, ok";
        let spans = find_hyperlinks(text);
        assert_eq!(spans.len(), 1);
        let span = &spans[0];
        assert_eq!((span.start, span.end), (7, 29));
        assert_eq!(span.url, "https://example.com/\u{e4}");
        assert_eq!(&text[span.start..span.end], span.url.as_str());
    }

    #[test]
    fn hyperlink_empty_string() {
        assert!(find_hyperlinks("").is_empty());
    }
}