parse_hyperlinks/parser/
mod.rs

1//! This module implements parsers to extract hyperlinks and link reference
2//! definitions from text input.
3
4pub mod asciidoc;
5pub mod html;
6pub mod html_img;
7pub mod markdown;
8pub mod markdown_img;
9pub mod parse;
10pub mod restructured_text;
11pub mod wikitext;
12use nom::error::ErrorKind;
13use percent_encoding::percent_decode_str;
14use std::borrow::Cow;
15
/// A [hyperlink] with the following variants:
/// * an [inline link] `Text2Dest`,
/// * a [reference link] `Text2Label`,
/// * a [link reference definition] `Label2Dest`,
/// * a [combined inline link / link reference definition] `TextLabel2Dest`,
/// * a [reference alias] `Label2Label`,
/// * an [inline image] `Image` or
/// * an [inline link with embedded inline image] `Image2Dest`
///
/// This is the main return type of this API.
///
/// The _link title_ in Markdown is optional, when not given the string is set
/// to the empty string `""`.  The back ticks \` in reStructuredText can be
/// omitted when only one word is enclosed without spaces.
///
/// [hyperlink]: https://spec.commonmark.org/0.30/#links
/// [inline link]: https://spec.commonmark.org/0.30/#inline-link
/// [reference link]: https://spec.commonmark.org/0.30/#reference-link
/// [link reference definition]: https://spec.commonmark.org/0.30/#link-reference-definition
/// [combined inline link / link reference definition]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#hyperlink-references
/// [reference alias]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#hyperlink-references
/// [inline image]: https://spec.commonmark.org/0.30/#images
/// [inline link with embedded inline image]: https://spec.commonmark.org/0.30/#example-519
#[derive(Debug, PartialEq, Clone)]
#[non_exhaustive]
pub enum Link<'a> {
    /// An _inline link_ with the following tuple values:
    /// ```text
    /// Text2Dest(link_text, link_destination, link_title)
    /// ```
    /// In (stand alone) **inline links** the destination and title are given
    /// immediately after the link text. When an _inline link_ is rendered, only
    /// the `link_text` is visible in the continuous text.
    /// * Markdown example:
    ///   ```md
    ///   [link_text](link_dest "link title")
    ///   ```
    /// * reStructuredText example:
    ///   ```rst
    ///   `link_text <link_dest>`__
    ///   ```
    /// * Asciidoc example:
    ///   ```adoc
    ///   http://link_dest[link_text]
    ///   ```
    /// * Wikitext example:
    ///   ```wm
    ///   [http://link_dest link_text]
    ///   ```
    Text2Dest(Cow<'a, str>, Cow<'a, str>, Cow<'a, str>),

    /// A _reference link_ with the following tuple values:
    /// ```text
    /// Text2Label(link_text, link_label)
    /// ```
    /// In **reference links** the destination and title are defined elsewhere
    /// in the document in some _link reference definition_. When a _reference
    /// link_ is rendered only `link_text` is visible.
    /// * Markdown examples:
    ///   ```md
    ///   [link_text][link_label]
    ///
    ///   [link_text]
    ///   ```
    ///   When only _link text_ is given, _link label_ is set to the same string.
    /// * reStructuredText examples:
    ///   ```rst
    ///   `link_text <link_label_>`_
    ///
    ///   `link_text`_
    ///   ```
    ///   When only _link text_ is given, _link label_ is set to the same string.
    /// * Asciidoc example:
    ///   ```adoc
    ///   {link_label}[link_text]
    ///   ```
    Text2Label(Cow<'a, str>, Cow<'a, str>),

    /// A _link reference definition_ with the following tuple values:
    /// ```text
    /// Label2Dest(link_label, link_destination, link_title)
    /// ```
    /// A **link reference definition** refers to a _reference link_ with the
    /// same _link label_. A _link reference definition_ is not visible
    /// when the document is rendered.
    /// _link title_ is optional.
    /// * Markdown example:
    ///   ```md
    ///   [link_label]: link_dest "link title"
    ///   ```
    /// * reStructuredText examples:
    ///   ```rst
    ///   .. _`link_label`: link_dest
    ///
    ///   .. __: link_dest
    ///
    ///   __ link_dest
    ///   ```
    ///   When `__` is given, the _link label_ is set to `"_"`, which is a marker
    ///   for an anonymous _link label_.
    /// * Asciidoc example:
    ///   ```adoc
    ///   :link_label: http://link_dest
    ///   ```
    Label2Dest(Cow<'a, str>, Cow<'a, str>, Cow<'a, str>),

    /// A combined _inline link / link reference definition_ with the
    /// following tuple values:
    /// ```text
    /// TextLabel2Dest(link_text_label, link_destination, link_title)
    /// ```
    /// This type represents a combined **inline link** and **link reference
    /// definition**. Semantically `TextLabel2Dest` is a shorthand for two links
    /// `Text2Dest` and `Label2Dest` in one object, where _link text_ and _link
    /// label_ are the same string. When rendered, _link text_ is visible.
    ///
    /// * Consider the following reStructuredText link:
    ///   ```rst
    ///   `link_text_label <link_dest>`_
    ///
    ///   `a <b>`_
    ///   ```
    ///   In this link, `b` is the _link destination_ and `a` has a double role:
    ///   it defines _link text_ of the first link `Text2Dest("a", "b", "")` and
    ///   _link label_ of the second link `Label2Dest("a", "b", "")`.
    ///
    TextLabel2Dest(Cow<'a, str>, Cow<'a, str>, Cow<'a, str>),

    /// A _reference alias_ with the following tuple values:
    /// ```text
    /// Label2Label(alt_link_label, link_label)
    /// ```
    /// The **reference alias** defines an alternative link label
    /// `alt_link_label` for an existing `link_label` defined elsewhere in the
    /// document. At some point, the `link_label` must be resolved to a
    /// `link_destination` by a _link reference definition_. A _reference
    /// alias_ is not visible when the document is rendered.
    /// This link type is only available in reStructuredText, e.g.
    /// ```rst
    /// .. _`alt_link_label`: `link_label`_
    /// ```
    Label2Label(Cow<'a, str>, Cow<'a, str>),

    /// An _inline image_ with the following tuple values:
    /// ```text
    /// Image(img_alt, img_src)
    /// ```
    /// Note: this crate does not contain parsers for this variant.
    Image(Cow<'a, str>, Cow<'a, str>),

    /// An _inline link_ with embedded _inline image_ and the following
    /// tuple values:
    /// ```text
    /// Image2Dest(text1, img_alt, img_src, text2, dest, title)
    /// ```
    Image2Dest(
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
    ),
}
177
178/// A parser that decodes percent encoded URLS.
179/// This parser consumes all input. It returns `Err` when the percent-decoded
180/// bytes are not well-formed in UTF-8.
181/// ```text
182/// use std::borrow::Cow;
183///
184/// let res = percent_decode("https://getreu.net/?q=%5Ba%20b%5D").unwrap();
185/// assert_eq!(res, ("", Cow::Owned("https://getreu.net/?q=[a b]".to_string())));
186///```
187fn percent_decode(i: &str) -> nom::IResult<&str, Cow<str>> {
188    let decoded = percent_decode_str(i)
189        .decode_utf8()
190        .map_err(|_| nom::Err::Error(nom::error::Error::new(i, ErrorKind::EscapedTransform)))?;
191    Ok(("", decoded))
192}
193
#[test]
fn test_percent_decode() {
    // Input containing an escape sequence must yield a newly allocated string.
    let (_, decoded) = percent_decode("percent%20encoded string").unwrap();
    assert!(matches!(decoded, Cow::Owned(..)));
    assert_eq!(decoded, Cow::from("percent encoded string"));

    // Input without any escape sequence is handed back as a borrow.
    let (_, untouched) = percent_decode("nothing").unwrap();
    assert!(matches!(untouched, Cow::Borrowed(..)));
    assert_eq!(untouched, Cow::from("nothing"));
}