parse_hyperlinks/lib.rs
1//! Library and application for parsing hyperlinks and link reference
2//! definitions in Markdown, reStructuredText, Asciidoc and HTML format. The
3//! library implements the
4//! [CommonMark Specification 0.30](https://spec.commonmark.org/0.30/),
5//! [reStructuredText Markup Specification](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)
6//! (revision 8571, date 2020-10-28),
7//! [Asciidoctor User Manual, chapter 26](https://asciidoctor.org/docs/user-manual/#url) (date 2020-12-03),
8//! the
9//! [HTML 5.2: section 4.5.](https://www.w3.org/TR/html52/textlevel-semantics.html#the-a-element)
10//! specification
11//! and the [Wikitext v1.0.0](https://www.mediawiki.org/wiki/Specs/wikitext/1.0.0)
12//! specification.
13#![allow(dead_code)]
14
15pub mod iterator;
16pub mod parser;
17pub mod renderer;
18
19use nom::error::Error;
20use nom::error::ErrorKind;
21use nom::error::ParseError;
22use nom::Err;
23use nom::IResult;
24
25/// A parser similar to `nom::bytes::complete::take_until()`, except that this
26/// one does not stop at balanced opening and closing tags. It is designed to
27/// work inside the `nom::sequence::delimited()` parser.
28///
29/// # Basic usage
30/// ```
31/// use nom::bytes::complete::tag;
32/// use nom::sequence::delimited;
33/// use parse_hyperlinks::take_until_unbalanced;
34///
35/// let mut parser = delimited(tag("<"), take_until_unbalanced('<', '>'), tag(">"));
36/// assert_eq!(parser("<<inside>inside>abc"), Ok(("abc", "<inside>inside")));
37/// ```
38/// It skips nested brackets until it finds an extra unbalanced closing bracket. Escaped brackets
39/// like `\<` and `\>` are not considered as brackets and are not counted. This function is
40/// very similar to `nom::bytes::complete::take_until(">")`, except it also takes nested brackets.
41pub fn take_until_unbalanced(
42 opening_bracket: char,
43 closing_bracket: char,
44) -> impl Fn(&str) -> IResult<&str, &str> {
45 move |i: &str| {
46 let mut index = 0;
47 let mut bracket_counter = 0;
48 while let Some(n) = &i[index..].find(&[opening_bracket, closing_bracket, '\\'][..]) {
49 index += n;
50 let mut it = i[index..].chars();
51 match it.next() {
52 Some(c) if c == '\\' => {
53 // Skip the escape char `\`.
54 index += '\\'.len_utf8();
55 // Skip also the following char.
56 if let Some(c) = it.next() {
57 index += c.len_utf8();
58 }
59 }
60 Some(c) if c == opening_bracket => {
61 bracket_counter += 1;
62 index += opening_bracket.len_utf8();
63 }
64 Some(c) if c == closing_bracket => {
65 // Closing bracket.
66 bracket_counter -= 1;
67 index += closing_bracket.len_utf8();
68 }
69 // Can not happen.
70 _ => unreachable!(),
71 };
72 // We found the unmatched closing bracket.
73 if bracket_counter == -1 {
74 // We do not consume it.
75 index -= closing_bracket.len_utf8();
76 return Ok((&i[index..], &i[0..index]));
77 };
78 }
79
80 if bracket_counter == 0 {
81 Ok(("", i))
82 } else {
83 Err(Err::Error(Error::from_error_kind(i, ErrorKind::TakeUntil)))
84 }
85 }
86}
87
#[cfg(test)]
mod tests {
    use super::*;
    use nom::error::ErrorKind;

    /// Checks `take_until_unbalanced()` against plain, nested, escaped,
    /// unterminated and multi-byte bracket inputs.
    #[test]
    fn test_take_until_unmatched() {
        // Each entry is `(input, expected remaining input, expected consumed)`.
        let round_bracket_cases = [
            ("abc", "", "abc"),
            ("url)abc", ")abc", "url"),
            ("url)abc\\", ")abc\\", "url"),
            ("u()rl)abc", ")abc", "u()rl"),
            ("u(())rl)abc", ")abc", "u(())rl"),
            ("u\\(())rl)abc", ")rl)abc", "u\\(()"),
            ("u(()\\)rl)abc", "", "u(()\\)rl)abc"),
            ("u(())r()l)abc", ")abc", "u(())r()l"),
            ("u(())r()labc", "", "u(())r()labc"),
            (r#"u\((\))r()labc"#, "", r#"u\((\))r()labc"#),
        ];

        let round_brackets = take_until_unbalanced('(', ')');
        for &(input, rest, consumed) in &round_bracket_cases {
            assert_eq!(
                round_brackets(input),
                Ok((rest, consumed)),
                "input: {:?}",
                input
            );
        }

        // An opening bracket that is never closed is reported as an error.
        let unterminated = "u(())r(labc";
        assert_eq!(
            round_brackets(unterminated),
            Err(nom::Err::Error(nom::error::Error::new(
                unterminated,
                ErrorKind::TakeUntil
            )))
        );

        // Brackets may be multi-byte characters.
        let multibyte_brackets = take_until_unbalanced('€', 'ü');
        assert_eq!(
            multibyte_brackets("€uü€€üürlüabc"),
            Ok(("üabc", "€uü€€üürl"))
        );
    }
}