parse_hyperlinks/parser/mod.rs
1//! This module implements parsers to extract hyperlinks and link reference
2//! definitions from text input.
3
4pub mod asciidoc;
5pub mod html;
6pub mod html_img;
7pub mod markdown;
8pub mod markdown_img;
9pub mod parse;
10pub mod restructured_text;
11pub mod wikitext;
12use nom::error::ErrorKind;
13use percent_encoding::percent_decode_str;
14use std::borrow::Cow;
15
/// A [hyperlink] with the following variants:
/// * an [inline link] `Text2Dest`,
/// * a [reference link] `Text2Label`,
/// * a [link reference definition] `Label2Dest`,
/// * a [combined inline link / link reference definition] `TextLabel2Dest`,
/// * a [reference alias] `Label2Label`,
/// * an [inline image] `Image` or
/// * an [inline link with embedded inline image] `Image2Dest`.
///
/// This is the main return type of this API.
///
/// The _link title_ in Markdown is optional, when not given the string is set
/// to the empty string `""`. The back ticks \` in reStructuredText can be
/// omitted when only one word is enclosed without spaces.
///
/// [hyperlink]: https://spec.commonmark.org/0.30/#links
/// [inline link]: https://spec.commonmark.org/0.30/#inline-link
/// [reference link]: https://spec.commonmark.org/0.30/#reference-link
/// [link reference definition]: https://spec.commonmark.org/0.30/#link-reference-definition
/// [combined inline link / link reference definition]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#hyperlink-references
/// [reference alias]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#hyperlink-references
/// [inline image]: https://spec.commonmark.org/0.30/#images
/// [inline link with embedded inline image]: https://spec.commonmark.org/0.30/#example-519
#[derive(Debug, PartialEq, Clone)]
#[non_exhaustive]
pub enum Link<'a> {
    /// An _inline link_ with the following tuple values:
    /// ```text
    /// Text2Dest(link_text, link_destination, link_title)
    /// ```
    /// In (stand alone) **inline links** the destination and title are given
    /// immediately after the link text. When an _inline link_ is rendered, only
    /// the `link_text` is visible in the continuous text.
    /// * Markdown example:
    ///   ```md
    ///   [link_text](link_dest "link title")
    ///   ```
    /// * reStructuredText example:
    ///   ```rst
    ///   `link_text <link_dest>`__
    ///   ```
    /// * Asciidoc example:
    ///   ```adoc
    ///   http://link_dest[link_text]
    ///   ```
    /// * Wikitext example:
    ///   ```wm
    ///   [http://link_dest link_text]
    ///   ```
    Text2Dest(Cow<'a, str>, Cow<'a, str>, Cow<'a, str>),

    /// A _reference link_ with the following tuple values:
    /// ```text
    /// Text2Label(link_text, link_label)
    /// ```
    /// In **reference links** the destination and title are defined elsewhere
    /// in the document in some _link reference definition_. When a _reference
    /// link_ is rendered only `link_text` is visible.
    /// * Markdown examples:
    ///   ```md
    ///   [link_text][link_label]
    ///
    ///   [link_text]
    ///   ```
    ///   When only _link text_ is given, _link label_ is set to the same string.
    /// * reStructuredText examples:
    ///   ```rst
    ///   `link_text <link_label_>`_
    ///
    ///   `link_text`_
    ///   ```
    ///   When only _link text_ is given, _link label_ is set to the same string.
    /// * Asciidoc example:
    ///   ```adoc
    ///   {link_label}[link_text]
    ///   ```
    Text2Label(Cow<'a, str>, Cow<'a, str>),

    /// A _link reference definition_ with the following tuple values:
    /// ```text
    /// Label2Dest(link_label, link_destination, link_title)
    /// ```
    /// A **link reference definition** refers to a _reference link_ with the
    /// same _link label_. A _link reference definition_ is not visible
    /// when the document is rendered.
    /// _link title_ is optional.
    /// * Markdown example:
    ///   ```md
    ///   [link_label]: link_dest "link title"
    ///   ```
    /// * reStructuredText examples:
    ///   ```rst
    ///   .. _`link_label`: link_dest
    ///
    ///   .. __: link_dest
    ///
    ///   __ link_dest
    ///   ```
    ///   When `__` is given, the _link label_ is set to `"_"`, which is a marker
    ///   for an anonymous _link label_.
    /// * Asciidoc example:
    ///   ```adoc
    ///   :link_label: http://link_dest
    ///   ```
    Label2Dest(Cow<'a, str>, Cow<'a, str>, Cow<'a, str>),

    /// An _inline link / link reference definition_ with tuple values:
    /// ```text
    /// TextLabel2Dest(link_text_label, link_destination, link_title)
    /// ```
    /// This type represents a combined **inline link** and **link reference
    /// definition**. Semantically `TextLabel2Dest` is a shorthand for two links
    /// `Text2Dest` and `Label2Dest` in one object, where _link text_ and _link
    /// label_ are the same string. When rendered, _link text_ is visible.
    ///
    /// * Consider the following reStructuredText link:
    ///   ```rst
    ///   `link_text_label <link_dest>`_
    ///
    ///   `a <b>`_
    ///   ```
    ///   In this link `b` is the _link destination_ and `a` has a double role:
    ///   it defines the _link text_ of the first link `Text2Dest("a", "b", "")`
    ///   and the _link label_ of the second link `Label2Dest("a", "b", "")`.
    ///
    TextLabel2Dest(Cow<'a, str>, Cow<'a, str>, Cow<'a, str>),

    /// A _reference alias_ with the following tuple values:
    /// ```text
    /// Label2Label(alt_link_label, link_label)
    /// ```
    /// The **reference alias** defines an alternative link label
    /// `alt_link_label` for an existing `link_label` defined elsewhere in the
    /// document. At some point, the `link_label` must be resolved to a
    /// `link_destination` by a _link reference definition_. A _reference
    /// alias_ is not visible when the document is rendered.
    /// This link type is only available in reStructuredText, e.g.
    /// ```rst
    /// .. _`alt_link_label`: `link_label`_
    /// ```
    Label2Label(Cow<'a, str>, Cow<'a, str>),

    /// An _inline image_ with the following tuple values:
    /// ```text
    /// Image(img_alt, img_src)
    /// ```
    /// Note: this crate does not contain parsers for this variant.
    // NOTE(review): the sibling modules `html_img` and `markdown_img` suggest
    // that image parsers may exist by now; confirm whether the note above is
    // still accurate.
    Image(Cow<'a, str>, Cow<'a, str>),

    /// An _inline link_ with embedded _inline image_ and the following
    /// tuple values:
    /// ```text
    /// Image2Dest(text1, img_alt, img_src, text2, dest, title)
    /// ```
    Image2Dest(
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
    ),
}
177
178/// A parser that decodes percent encoded URLS.
179/// This parser consumes all input. It returns `Err` when the percent-decoded
180/// bytes are not well-formed in UTF-8.
181/// ```text
182/// use std::borrow::Cow;
183///
184/// let res = percent_decode("https://getreu.net/?q=%5Ba%20b%5D").unwrap();
185/// assert_eq!(res, ("", Cow::Owned("https://getreu.net/?q=[a b]".to_string())));
186///```
187fn percent_decode(i: &str) -> nom::IResult<&str, Cow<str>> {
188 let decoded = percent_decode_str(i)
189 .decode_utf8()
190 .map_err(|_| nom::Err::Error(nom::error::Error::new(i, ErrorKind::EscapedTransform)))?;
191 Ok(("", decoded))
192}
193
/// Checks that `percent_decode` allocates only when an escape sequence has to
/// be rewritten and borrows the input otherwise.
#[test]
fn test_percent_decode() {
    // Input containing `%20`: the decoded text differs from the input, so an
    // owned string is expected.
    let (_, with_escape) = percent_decode("percent%20encoded string").unwrap();
    assert!(matches!(with_escape, Cow::Owned(..)));
    assert_eq!(with_escape, Cow::from("percent encoded string"));

    // Input without escape sequences: the result is expected to borrow.
    let (_, without_escape) = percent_decode("nothing").unwrap();
    assert!(matches!(without_escape, Cow::Borrowed(..)));
    assert_eq!(without_escape, Cow::from("nothing"));
}