parse_hyperlinks/parser/mod.rs
1//! This module implements parsers to extract hyperlinks and link reference
2//! definitions from text input.
3
4pub mod asciidoc;
5pub mod html;
6pub mod html_img;
7pub mod markdown;
8pub mod markdown_img;
9pub mod parse;
10pub mod restructured_text;
11pub mod wikitext;
12use nom::error::ErrorKind;
13use percent_encoding::percent_decode_str;
14use std::borrow::Cow;
15
/// A [hyperlink] with the following variants:
/// * an [inline link] `Text2Dest`,
/// * a [reference link] `Text2Label`,
/// * a [link reference definition] `Label2Dest`,
/// * a [combined inline link / link reference definition] `TextLabel2Dest`,
/// * a [reference alias] `Label2Label`,
/// * an [inline image] `Image` or
/// * an [inline link with embedded inline image] `Image2Dest`
///
/// This is the main return type of this API.
///
/// All payloads are `Cow<'a, str>`, so a variant can either borrow from the
/// parsed input or own its strings.
///
/// The _link title_ in Markdown is optional, when not given the string is set
/// to the empty string `""`. The back ticks \` in reStructuredText can be
/// omitted when only one word is enclosed without spaces.
///
/// [hyperlink]: https://spec.commonmark.org/0.30/#links
/// [inline link]: https://spec.commonmark.org/0.30/#inline-link
/// [reference link]: https://spec.commonmark.org/0.30/#reference-link
/// [link reference definition]: https://spec.commonmark.org/0.30/#link-reference-definition
/// [combined inline link / link reference definition]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#hyperlink-references
/// [reference alias]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#hyperlink-references
/// [inline image]: https://spec.commonmark.org/0.30/#images
/// [inline link with embedded inline image]: https://spec.commonmark.org/0.30/#example-519
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
#[non_exhaustive]
pub enum Link<'a> {
    /// An _inline link_ with the following tuple values:
    /// ```text
    /// Text2Dest(link_text, link_destination, link_title)
    /// ```
    /// In (stand alone) **inline links** the destination and title are given
    /// immediately after the link text. When an _inline link_ is rendered, only
    /// the `link_text` is visible in the continuous text.
    /// * Markdown example:
    ///   ```md
    ///   [link_text](link_dest "link title")
    ///   ```
    /// * reStructuredText example:
    ///   ```rst
    ///   `link_text <link_dest>`__
    ///   ```
    /// * Asciidoc example:
    ///   ```adoc
    ///   http://link_dest[link_text]
    ///   ```
    /// * Wikitext example:
    ///   ```wm
    ///   [http://link_dest link_text]
    ///   ```
    Text2Dest(Cow<'a, str>, Cow<'a, str>, Cow<'a, str>),

    /// A _reference link_ with the following tuple values:
    /// ```text
    /// Text2Label(link_text, link_label)
    /// ```
    /// In **reference links** the destination and title are defined elsewhere
    /// in the document in some _link reference definition_. When a _reference
    /// link_ is rendered only `link_text` is visible.
    /// * Markdown examples:
    ///   ```md
    ///   [link_text][link_label]
    ///
    ///   [link_text]
    ///   ```
    ///   When only _link text_ is given, _link label_ is set to the same string.
    /// * reStructuredText examples:
    ///   ```rst
    ///   `link_text <link_label_>`_
    ///
    ///   `link_text`_
    ///   ```
    ///   When only _link text_ is given, _link label_ is set to the same string.
    /// * Asciidoc example:
    ///   ```adoc
    ///   {link_label}[link_text]
    ///   ```
    Text2Label(Cow<'a, str>, Cow<'a, str>),

    /// A _link reference definition_ with the following tuple values:
    /// ```text
    /// Label2Dest(link_label, link_destination, link_title)
    /// ```
    /// A **link reference definition** refers to a _reference link_ with the
    /// same _link label_. A _link reference definition_ is not visible
    /// when the document is rendered.
    /// _link title_ is optional.
    /// * Markdown example:
    ///   ```md
    ///   [link_label]: link_dest "link title"
    ///   ```
    /// * reStructuredText examples:
    ///   ```rst
    ///   .. _`link_label`: link_dest
    ///
    ///   .. __: link_dest
    ///
    ///   __ link_dest
    ///   ```
    ///   When `__` is given, the _link label_ is set to `"_"`, which is a marker
    ///   for an anonymous _link label_.
    /// * Asciidoc example:
    ///   ```adoc
    ///   :link_label: http://link_dest
    ///   ```
    Label2Dest(Cow<'a, str>, Cow<'a, str>, Cow<'a, str>),

    /// An _inline link / link reference definition_ with tuple values:
    /// ```text
    /// TextLabel2Dest(link_text_label, link_destination, link_title)
    /// ```
    /// This type represents a combined **inline link** and **link reference
    /// definition**. Semantically `TextLabel2Dest` is a shorthand for two links
    /// `Text2Dest` and `Label2Dest` in one object, where _link text_ and _link
    /// label_ are the same string. When rendered, _link text_ is visible.
    ///
    /// * Consider the following reStructuredText link:
    ///   ```rst
    ///   `link_text_label <link_dest>`_
    ///
    ///   `a <b>`_
    ///   ```
    ///   In this link, `b` is the _link destination_ and `a` has a double role:
    ///   it defines _link text_ of the first link `Text2Dest("a", "b", "")` and
    ///   _link label_ of the second link `Label2Dest("a", "b", "")`.
    ///
    TextLabel2Dest(Cow<'a, str>, Cow<'a, str>, Cow<'a, str>),

    /// A _reference alias_ with the following tuple values:
    /// ```text
    /// Label2Label(alt_link_label, link_label)
    /// ```
    /// The **reference alias** defines an alternative link label
    /// `alt_link_label` for an existing `link_label` defined elsewhere in the
    /// document. At some point, the `link_label` must be resolved to a
    /// `link_destination` by a _link reference definition_. A _reference
    /// alias_ is not visible when the document is rendered.
    /// This link type is only available in reStructuredText, e.g.
    /// ```rst
    /// .. _`alt_link_label`: `link_label`_
    /// ```
    Label2Label(Cow<'a, str>, Cow<'a, str>),

    /// An _inline image_ with the following tuple values:
    /// ```text
    /// Image(img_alt, img_src)
    /// ```
    /// Note: this crate does not contain parsers for this variant.
    // NOTE(review): the sibling modules `html_img` and `markdown_img` suggest
    // that image parsers may exist by now — confirm whether the note above is
    // still accurate.
    Image(Cow<'a, str>, Cow<'a, str>),

    /// An _inline link_ with embedded _inline image_ and the following
    /// tuple values.
    /// ```text
    /// Image2Dest(text1, img_alt, img_src, text2, dest, title)
    /// ```
    Image2Dest(
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
    ),
}
178
179/// A parser that decodes percent encoded URLS.
180/// This parser consumes all input. It returns `Err` when the percent-decoded
181/// bytes are not well-formed in UTF-8.
182/// ```text
183/// use std::borrow::Cow;
184///
185/// let res = percent_decode("https://getreu.net/?q=%5Ba%20b%5D").unwrap();
186/// assert_eq!(res, ("", Cow::Owned("https://getreu.net/?q=[a b]".to_string())));
187///```
188fn percent_decode(i: &str) -> nom::IResult<&str, Cow<str>> {
189 let decoded = percent_decode_str(i)
190 .decode_utf8()
191 .map_err(|_| nom::Err::Error(nom::error::Error::new(i, ErrorKind::EscapedTransform)))?;
192 Ok(("", decoded))
193}
194
// Verifies what `percent_decode` returns for input with and without
// percent-escapes: the decoded text and whether it is owned or borrowed.
#[test]
fn test_percent_decode() {
    // Input containing an escape sequence: expect an owned, decoded string.
    let (_, decoded) = percent_decode("percent%20encoded string").unwrap();
    assert!(matches!(decoded, Cow::Owned(..)));
    assert_eq!(decoded, Cow::from("percent encoded string"));

    // Input without any escape sequence: expect the input handed back borrowed.
    let (_, untouched) = percent_decode("nothing").unwrap();
    assert!(matches!(untouched, Cow::Borrowed(..)));
    assert_eq!(untouched, Cow::from("nothing"));
}