parse_hyperlinks/
lib.rs

1//! Library and application for parsing hyperlinks and link reference
2//! definitions in Markdown, reStructuredText, Asciidoc and HTML format. The
3//! library implements the
4//! [CommonMark Specification 0.30](https://spec.commonmark.org/0.30/),
5//! [reStructuredText Markup Specification](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)
6//! (revision 8571, date 2020-10-28),
7//! [Asciidoctor User Manual, chapter 26](https://asciidoctor.org/docs/user-manual/#url) (date 2020-12-03),
8//! the
9//! [HTML 5.2: section 4.5.](https://www.w3.org/TR/html52/textlevel-semantics.html#the-a-element)
10//! specification
11//! and the [Wikitext v1.0.0](https://www.mediawiki.org/wiki/Specs/wikitext/1.0.0)
12//! specification.
13#![allow(dead_code)]
14
15pub mod iterator;
16pub mod parser;
17pub mod renderer;
18
19use nom::error::Error;
20use nom::error::ErrorKind;
21use nom::error::ParseError;
22use nom::Err;
23use nom::IResult;
24
25/// A parser similar to `nom::bytes::complete::take_until()`, except that this
26/// one does not stop at balanced opening and closing tags. It is designed to
27/// work inside the `nom::sequence::delimited()` parser.
28///
29/// # Basic usage
30/// ```
31/// use nom::bytes::complete::tag;
32/// use nom::sequence::delimited;
33/// use parse_hyperlinks::take_until_unbalanced;
34///
35/// let mut parser = delimited(tag("<"), take_until_unbalanced('<', '>'), tag(">"));
36/// assert_eq!(parser("<<inside>inside>abc"), Ok(("abc", "<inside>inside")));
37/// ```
38/// It skips nested brackets until it finds an extra unbalanced closing bracket. Escaped brackets
39/// like `\<` and `\>` are not considered as brackets and are not counted. This function is
40/// very similar to `nom::bytes::complete::take_until(">")`, except it also takes nested brackets.
41pub fn take_until_unbalanced(
42    opening_bracket: char,
43    closing_bracket: char,
44) -> impl Fn(&str) -> IResult<&str, &str> {
45    move |i: &str| {
46        let mut index = 0;
47        let mut bracket_counter = 0;
48        while let Some(n) = &i[index..].find(&[opening_bracket, closing_bracket, '\\'][..]) {
49            index += n;
50            let mut it = i[index..].chars();
51            match it.next() {
52                Some(c) if c == '\\' => {
53                    // Skip the escape char `\`.
54                    index += '\\'.len_utf8();
55                    // Skip also the following char.
56                    if let Some(c) = it.next() {
57                        index += c.len_utf8();
58                    }
59                }
60                Some(c) if c == opening_bracket => {
61                    bracket_counter += 1;
62                    index += opening_bracket.len_utf8();
63                }
64                Some(c) if c == closing_bracket => {
65                    // Closing bracket.
66                    bracket_counter -= 1;
67                    index += closing_bracket.len_utf8();
68                }
69                // Can not happen.
70                _ => unreachable!(),
71            };
72            // We found the unmatched closing bracket.
73            if bracket_counter == -1 {
74                // We do not consume it.
75                index -= closing_bracket.len_utf8();
76                return Ok((&i[index..], &i[0..index]));
77            };
78        }
79
80        if bracket_counter == 0 {
81            Ok(("", i))
82        } else {
83            Err(Err::Error(Error::from_error_kind(i, ErrorKind::TakeUntil)))
84        }
85    }
86}
87
#[cfg(test)]
mod tests {
    use super::*;
    use nom::error::ErrorKind;

    /// Checks `take_until_unbalanced` against plain, nested, escaped,
    /// unterminated and multi-byte bracket inputs.
    #[test]
    fn test_take_until_unmatched() {
        let parens = take_until_unbalanced('(', ')');

        // Each entry is `(input, (remaining, consumed))`.
        let accepted = [
            // No bracket at all: everything is consumed.
            ("abc", ("", "abc")),
            // A lone closing bracket stops the parser immediately.
            ("url)abc", (")abc", "url")),
            ("url)abc\\", (")abc\\", "url")),
            // Balanced pairs are part of the consumed text.
            ("u()rl)abc", (")abc", "u()rl")),
            ("u(())rl)abc", (")abc", "u(())rl")),
            // Escaped brackets are not counted.
            ("u\\(())rl)abc", (")rl)abc", "u\\(()")),
            ("u(()\\)rl)abc", ("", "u(()\\)rl)abc")),
            ("u(())r()l)abc", (")abc", "u(())r()l")),
            // Balanced until the end of input: everything is consumed.
            ("u(())r()labc", ("", "u(())r()labc")),
            (r#"u\((\))r()labc"#, ("", r#"u\((\))r()labc"#)),
        ];
        for &(input, expected) in accepted.iter() {
            assert_eq!(parens(input), Ok(expected));
        }

        // An opening bracket that is never closed is an error.
        let unterminated = "u(())r(labc";
        assert_eq!(
            parens(unterminated),
            Err(nom::Err::Error(nom::error::Error::new(
                unterminated,
                ErrorKind::TakeUntil
            )))
        );

        // Multi-byte bracket characters must be sliced on char boundaries.
        let euro_umlaut = take_until_unbalanced('€', 'ü');
        assert_eq!(euro_umlaut("€uü€€üürlüabc"), Ok(("üabc", "€uü€€üürl")));
    }
}