markdown_it_autolink/
lib.rs

1//! Implement the Github Flavoured Markdown [autolink extension](https://github.github.com/gfm/#autolinks-extension-).
2//!
3//! ```rust
4//! let md = &mut markdown_it::MarkdownIt::new();
5//! markdown_it::plugins::cmark::add(md);
6//! markdown_it_autolink::add(md);
7//!
8//! assert_eq!(
9//!     md.parse("www.example.com").render(),
10//!     "<p><a href=\"http://www.example.com\">www.example.com</a></p>\n",
11//! );
12//! ```
13
14use gfm_autolinks::{
15    check_prev,
16    email::{match_any_email, Protocol},
17    url::match_http,
18    www::match_www,
19};
20use markdown_it::{
21    parser::inline::{InlineRule, InlineState, TextSpecial},
22    plugins::cmark::inline::autolink::Autolink,
23    MarkdownIt, Node,
24};
25
26/// Add the GFM autolink extension plugin to MarkdownIt.
27pub fn add(md: &mut MarkdownIt) {
28    // TODO should these rules be before/after something?
29    md.inline.add_rule::<WwwScanner>();
30    md.inline.add_rule::<ProtocolScanner>();
31    // TODO this does not currently work with `_` in the user part
32    // since the `_` is parsed as emphasis, before it can be back-scanned
33    md.inline.add_rule::<BareEmailScanner>();
34}
35
36/// Scanner for bare `www.` URLs.
37pub struct WwwScanner;
38impl InlineRule for WwwScanner {
39    const MARKER: char = 'w';
40    fn run(state: &mut InlineState) -> Option<(Node, usize)> {
41        if state.link_level > 0 {
42            return None;
43        }
44        if !state.src[state.pos..state.pos_max].starts_with("www.") {
45            return None;
46        }
47        check_preceding(state, 0)?;
48        let (url, length) = match_www(state.src[state.pos..state.pos_max].as_bytes())?;
49
50        let text = state.src[state.pos..state.pos + length].to_string();
51
52        create_autolink(state, 0, length, url, Some(text))
53    }
54}
55
56/// Scanner for URL protocols that are supported by the GFM autolink extension;
57/// `http`, `https`, `mailto`, and `xmpp`.
58/// The rule searches for `:`, back-scans to match the protocol name,
59/// then forward-spans to match the rest of the URL.
60/// Note, this is a balance between performance (only activating on `:` for all protocols)
61/// and correctness (risking that the preceding protocol has already been parsed by another rule).
62pub struct ProtocolScanner;
63impl InlineRule for ProtocolScanner {
64    const MARKER: char = ':';
65    fn run(state: &mut InlineState) -> Option<(Node, usize)> {
66        // auto-links cannot be inside other links
67        if state.link_level > 0 {
68            return None;
69        }
70
71        // remaining text must start with `:` and have at least 3 more chars
72        let remaining = &state.src[state.pos..state.pos_max];
73        let mut chars = remaining.chars();
74        if chars.next() != Some(':') {
75            return None;
76        }
77        for _ in 0..3 {
78            chars.next()?;
79        }
80
81        // get preceding "unparsed" text
82        let trailing = state.trailing_text_get();
83
84        for (pname, ptype) in vec![("mailto", Protocol::Mailto), ("xmpp", Protocol::Xmpp)] {
85            if !trailing.ends_with(pname) {
86                continue;
87            }
88            let bscan_len = pname.len();
89            check_preceding(state, bscan_len)?;
90            let (full_url, total_len) = match_any_email(
91                &[pname.as_bytes(), remaining.as_bytes()].concat(),
92                bscan_len + 1,
93                ptype,
94            )?;
95            return create_autolink(state, bscan_len, total_len, full_url, None);
96        }
97
98        for pname in &["http", "https"] {
99            if !trailing.ends_with(pname) {
100                continue;
101            }
102            let bscan_len = pname.len();
103            check_preceding(state, bscan_len)?;
104            let (full_url, total_len) =
105                match_http(&[pname.as_bytes(), remaining.as_bytes()].concat())?;
106            return create_autolink(state, bscan_len, total_len, full_url, None);
107        }
108
109        None
110    }
111}
112
113pub struct BareEmailScanner;
114impl InlineRule for BareEmailScanner {
115    const MARKER: char = '@';
116    fn run(state: &mut InlineState) -> Option<(Node, usize)> {
117        // auto-links cannot be inside other links
118        if state.link_level > 0 {
119            return None;
120        }
121
122        // remaining text must start with `@` and have at least 1 more chars
123        let mut chars = state.src[state.pos..state.pos_max].chars();
124        if chars.next() != Some('@') {
125            return None;
126        }
127        chars.next()?;
128
129        let trailing = state.trailing_text_get();
130
131        // back-scan to find the local part of the email
132        let mut local_rev = vec![];
133        for char in trailing.chars().rev() {
134            if char.is_ascii_alphanumeric()
135                || char == '.'
136                || char == '-'
137                || char == '_'
138                || char == '+'
139            {
140                local_rev.push(char);
141            } else {
142                break;
143            }
144        }
145        if local_rev.is_empty() {
146            return None;
147        }
148        let local_len = local_rev.len();
149        check_preceding(state, local_len)?;
150
151        // forward-scan to find the domain part of the email
152        let remaining = &state.src[state.pos + 1..state.pos_max];
153        let mut domain_len = 0;
154        let mut num_period = 0;
155        for (i, c) in remaining.char_indices() {
156            if c.is_ascii_alphanumeric() {
157            } else if c == '@' {
158                return None;
159            } else if c == '.'
160                && remaining
161                    .chars()
162                    .nth(i + 1)
163                    .is_some_and(|c| c.is_ascii_alphanumeric())
164            {
165                num_period += 1;
166            } else if c != '-' && c != '_' {
167                break;
168            }
169            domain_len += 1;
170        }
171        // There must be at least one period in the domain
172        if domain_len == 0 || num_period == 0 {
173            return None;
174        }
175        // The last character must not be one of `-` or `_`
176        let last_char = remaining.chars().nth(domain_len - 1)?;
177        if !last_char.is_ascii_alphanumeric() && last_char != '.' {
178            return None;
179        }
180        let text = format!(
181            "{}{}",
182            String::from_iter(local_rev.iter().rev()),
183            &state.src[state.pos..state.pos + 1 + domain_len]
184        );
185        create_autolink(
186            state,
187            local_len,
188            local_len + 1 + domain_len,
189            format!("mailto:{}", text),
190            Some(text),
191        )
192    }
193}
194
195/// Check if the preceding syntax allows an autolink.
196fn check_preceding(state: &InlineState, bscan_len: usize) -> Option<()> {
197    if state.pos - bscan_len > 0 && !check_prev(state.src.chars().nth(state.pos - bscan_len - 1)?) {
198        return None;
199    }
200    Some(())
201}
202
203/// Create the autolink node, removing any parsed trailing text.
204fn create_autolink(
205    state: &mut InlineState,
206    bscan_len: usize,
207    total_len: usize,
208    url: String,
209    text: Option<String>,
210) -> Option<(Node, usize)> {
211    state.trailing_text_pop(bscan_len);
212
213    let content = text.unwrap_or_else(|| url.clone());
214
215    let mut inner_node = Node::new(TextSpecial {
216        content: content.to_string(),
217        markup: content,
218        info: "autolink",
219    });
220    inner_node.srcmap = state.get_map(state.pos - bscan_len, state.pos - bscan_len + total_len);
221
222    let mut node = Node::new(Autolink { url });
223    node.children.push(inner_node);
224
225    state.pos -= bscan_len;
226
227    Some((node, total_len))
228}
229
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    /// Parse `input` and compare the `name:start:end` byte spans of every
    /// node that carries a source map against `expected`.
    #[rstest]
    #[case("text http://example.com ", &["Root:0:24", "Paragraph:0:24", "Text:0:5", "Autolink:5:23", "TextSpecial:5:23"])]
    #[case("text mailto:bob@test.com ", &["Root:0:25", "Paragraph:0:25", "Text:0:5", "Autolink:5:24", "TextSpecial:5:24"])]
    #[case("text www.example.com ", &["Root:0:21", "Paragraph:0:21", "Text:0:5", "Autolink:5:20", "TextSpecial:5:20"])]
    #[case("text bob@test.com ", &["Root:0:18", "Paragraph:0:18", "Text:0:5", "Autolink:5:17", "TextSpecial:5:17"])]
    fn test_sourcepos(#[case] input: &str, #[case] expected: &[&str]) {
        let parser = &mut markdown_it::MarkdownIt::new();
        markdown_it::plugins::cmark::add(parser);
        add(parser);
        let root = parser.parse(input);

        let mut spans = Vec::new();
        root.walk(|node, _| {
            if let Some(map) = node.srcmap {
                let (start, end) = map.get_byte_offsets();
                // Keep only the last path segment of the node's type name.
                let kind = node.name().split("::").last().unwrap_or_default();
                spans.push(format!("{}:{}:{}", kind, start, end));
            }
        });

        assert_eq!(spans, expected);
    }
}