parse_hyperlinks/lib.rs
//! Library and application for parsing hyperlinks and link reference
//! definitions in Markdown, reStructuredText, Asciidoc, HTML and Wikitext
//! formats. The library implements the
//! [CommonMark Specification 0.30](https://spec.commonmark.org/0.30/), the
//! [reStructuredText Markup Specification](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)
//! (revision 8571, date 2020-10-28), the
//! [Asciidoctor User Manual, chapter 26](https://asciidoctor.org/docs/user-manual/#url) (date 2020-12-03),
//! the
//! [HTML 5.2: section 4.5.](https://www.w3.org/TR/html52/textlevel-semantics.html#the-a-element)
//! specification
//! and the [Wikitext v1.0.0](https://www.mediawiki.org/wiki/Specs/wikitext/1.0.0)
//! specification.
13#![allow(dead_code)]
14
15pub mod iterator;
16pub mod parser;
17pub mod renderer;
18
19use nom::Err;
20use nom::IResult;
21use nom::error::Error;
22use nom::error::ErrorKind;
23use nom::error::ParseError;
24
25/// A parser similar to `nom::bytes::complete::take_until()`, except that this
26/// one does not stop at balanced opening and closing tags. It is designed to
27/// work inside the `nom::sequence::delimited()` parser.
28///
29/// # Basic usage
30/// ```
31/// use nom::bytes::complete::tag;
32/// use nom::sequence::delimited;
33/// use nom::Parser;
34/// use parse_hyperlinks::take_until_unbalanced;
35///
36/// let mut parser = delimited(tag("<"), take_until_unbalanced('<', '>'), tag(">"));
37/// assert_eq!(parser.parse("<<inside>inside>abc"), Ok(("abc", "<inside>inside")));
38/// ```
39/// It skips nested brackets until it finds an extra unbalanced closing bracket. Escaped brackets
40/// like `\<` and `\>` are not considered as brackets and are not counted. This function is
41/// very similar to `nom::bytes::complete::take_until(">")`, except it also takes nested brackets.
42pub fn take_until_unbalanced(
43 opening_bracket: char,
44 closing_bracket: char,
45) -> impl Fn(&str) -> IResult<&str, &str> {
46 move |i: &str| {
47 let mut index = 0;
48 let mut bracket_counter = 0;
49 while let Some(n) = &i[index..].find(&[opening_bracket, closing_bracket, '\\'][..]) {
50 index += n;
51 let mut it = i[index..].chars();
52 match it.next() {
53 Some('\\') => {
54 // Skip the escape char `\`.
55 index += '\\'.len_utf8();
56 // Skip also the following char.
57 if let Some(c) = it.next() {
58 index += c.len_utf8();
59 }
60 }
61 Some(c) if c == opening_bracket => {
62 bracket_counter += 1;
63 index += opening_bracket.len_utf8();
64 }
65 Some(c) if c == closing_bracket => {
66 // Closing bracket.
67 bracket_counter -= 1;
68 index += closing_bracket.len_utf8();
69 }
70 // Can not happen.
71 _ => unreachable!(),
72 };
73 // We found the unmatched closing bracket.
74 if bracket_counter == -1 {
75 // We do not consume it.
76 index -= closing_bracket.len_utf8();
77 return Ok((&i[index..], &i[0..index]));
78 };
79 }
80
81 if bracket_counter == 0 {
82 Ok(("", i))
83 } else {
84 Err(Err::Error(Error::from_error_kind(i, ErrorKind::TakeUntil)))
85 }
86 }
87}
88
#[cfg(test)]
mod tests {
    use super::*;
    use nom::error::ErrorKind;

    /// Checks `take_until_unbalanced()` against plain, nested, escaped,
    /// unterminated and multi-byte bracket inputs.
    #[test]
    fn test_take_until_unmatched() {
        let parser = take_until_unbalanced('(', ')');

        // Each row: (input, expected remaining input, expected output).
        let expect_ok = [
            ("abc", "", "abc"),
            ("url)abc", ")abc", "url"),
            ("url)abc\\", ")abc\\", "url"),
            ("u()rl)abc", ")abc", "u()rl"),
            ("u(())rl)abc", ")abc", "u(())rl"),
            ("u\\(())rl)abc", ")rl)abc", "u\\(()"),
            ("u(()\\)rl)abc", "", "u(()\\)rl)abc"),
            ("u(())r()l)abc", ")abc", "u(())r()l"),
            ("u(())r()labc", "", "u(())r()labc"),
            (r#"u\((\))r()labc"#, "", r#"u\((\))r()labc"#),
        ];
        for &(input, remaining, output) in &expect_ok {
            assert_eq!(parser(input), Ok((remaining, output)));
        }

        // An opening bracket that is never closed is reported as an error.
        let unterminated = "u(())r(labc";
        assert_eq!(
            parser(unterminated),
            Err(nom::Err::Error(nom::error::Error::new(
                "u(())r(labc",
                ErrorKind::TakeUntil
            )))
        );

        // Brackets may be multi-byte characters of different widths.
        let multibyte_parser = take_until_unbalanced('€', 'ü');
        assert_eq!(
            multibyte_parser("€uü€€üürlüabc"),
            Ok(("üabc", "€uü€€üürl"))
        );
    }
}
145}