parse_hyperlinks/parser/
mod.rs

1//! This module implements parsers to extract hyperlinks and link reference
2//! definitions from text input.
3
4pub mod asciidoc;
5pub mod html;
6pub mod html_img;
7pub mod markdown;
8pub mod markdown_img;
9pub mod parse;
10pub mod restructured_text;
11pub mod wikitext;
12use nom::error::ErrorKind;
13use percent_encoding::percent_decode_str;
14use std::borrow::Cow;
15
/// A [hyperlink] with the following variants:
/// * an [inline link] `Text2Dest`,
/// * a [reference link] `Text2Label`,
/// * a [link reference definition] `Label2Dest`,
/// * a [combined inline link / link reference definition] `TextLabel2Dest`,
/// * a [reference alias] `Label2Label`,
/// * an [inline image] `Image` or
/// * an [inline link with embedded inline image] `Image2Dest`
///
/// This is the main return type of this API.
///
/// The _link title_ in Markdown is optional. When it is not given, the string
/// is set to the empty string `""`. The back ticks \` in reStructuredText can
/// be omitted when only one word is enclosed without spaces.
///
/// [hyperlink]: https://spec.commonmark.org/0.30/#links
/// [inline link]: https://spec.commonmark.org/0.30/#inline-link
/// [reference link]: https://spec.commonmark.org/0.30/#reference-link
/// [link reference definition]: https://spec.commonmark.org/0.30/#link-reference-definition
/// [combined inline link / link reference definition]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#hyperlink-references
/// [reference alias]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#hyperlink-references
/// [inline image]: https://spec.commonmark.org/0.30/#images
/// [inline link with embedded inline image]: https://spec.commonmark.org/0.30/#example-519
// All fields are `Cow<'a, str>`, so equality is total and `Eq` can be derived
// alongside `PartialEq`.
#[derive(Debug, Eq, PartialEq, Clone)]
#[non_exhaustive]
pub enum Link<'a> {
    /// An _inline link_ with the following tuple values:
    /// ```text
    /// Text2Dest(link_text, link_destination, link_title)
    /// ```
    /// In (stand alone) **inline links** the destination and title are given
    /// immediately after the link text. When an _inline link_ is rendered, only
    /// the `link_text` is visible in the continuous text.
    /// * Markdown example:
    ///   ```md
    ///       [link_text](link_dest "link title")
    ///   ```
    /// * reStructuredText example:
    ///   ```rst
    ///       `link_text <link_dest>`__
    ///   ```
    /// * Asciidoc example:
    ///   ```adoc
    ///   http://link_dest[link_text]
    ///   ```
    /// * Wikitext example:
    ///   ```wm
    ///   [http://link_dest link_text]
    ///   ```
    Text2Dest(Cow<'a, str>, Cow<'a, str>, Cow<'a, str>),

    /// A _reference link_ with the following tuple values:
    /// ```text
    /// Text2Label(link_text, link_label)
    /// ```
    /// In **reference links** the destination and title are defined elsewhere
    /// in the document in some _link reference definition_. When a _reference
    /// link_ is rendered only `link_text` is visible.
    /// * Markdown examples:
    ///   ```md
    ///   [link_text][link_label]
    ///
    ///   [link_text]
    ///   ```
    ///   When only _link text_ is given, _link label_ is set to the same string.
    /// * reStructuredText examples:
    ///   ```rst
    ///   `link_text <link_label_>`_
    ///
    ///   `link_text`_
    ///   ```
    ///   When only _link text_ is given, _link label_ is set to the same string.
    /// * Asciidoc example:
    ///   ```adoc
    ///   {link_label}[link_text]
    ///   ```
    Text2Label(Cow<'a, str>, Cow<'a, str>),

    /// A _link reference definition_ with the following tuple values:
    /// ```text
    /// Label2Dest(link_label, link_destination, link_title)
    /// ```
    /// A **link reference definition** refers to a _reference link_ with the
    /// same _link label_. A _link reference definition_ is not visible
    /// when the document is rendered.
    /// _link title_ is optional.
    /// * Markdown example:
    ///   ```md
    ///   [link_label]: link_dest "link title"
    ///   ```
    /// * reStructuredText examples:
    ///   ```rst
    ///   .. _`link_label`: link_dest
    ///
    ///   .. __: link_dest
    ///
    ///   __ link_dest
    ///   ```
    ///   When `__` is given, the _link label_ is set to `"_"`, which is a marker
    ///   for an anonymous _link label_.
    /// * Asciidoc example:
    ///   ```adoc
    ///   :link_label: http://link_dest
    ///   ```
    Label2Dest(Cow<'a, str>, Cow<'a, str>, Cow<'a, str>),

    /// A combined _inline link / link reference definition_ with the
    /// following tuple values:
    /// ```text
    /// TextLabel2Dest(link_text_label, link_destination, link_title)
    /// ```
    /// This type represents a combined **inline link** and **link reference
    /// definition**. Semantically `TextLabel2Dest` is a shorthand for two links
    /// `Text2Dest` and `Label2Dest` in one object, where _link text_ and _link
    /// label_ are the same string. When rendered, _link text_ is visible.
    ///
    /// * Consider the following reStructuredText link:
    ///   ```rst
    ///   `link_text_label <link_dest>`_
    ///
    ///   `a <b>`_
    ///   ```
    ///   In this link `b` is the _link destination_ and `a` has a double role:
    ///   it defines _link text_ of the first link `Text2Dest("a", "b", "")` and
    ///   _link label_ of the second link `Label2Dest("a", "b", "")`.
    ///
    TextLabel2Dest(Cow<'a, str>, Cow<'a, str>, Cow<'a, str>),

    /// A _reference alias_ with the following tuple values:
    /// ```text
    /// Label2Label(alt_link_label, link_label)
    /// ```
    /// The **reference alias** defines an alternative link label
    /// `alt_link_label` for an existing `link_label` defined elsewhere in the
    /// document. At some point, the `link_label` must be resolved to a
    /// `link_destination` by a _link reference definition_. A _reference
    /// alias_ is not visible when the document is rendered.
    /// This link type is only available in reStructuredText, e.g.
    /// ```rst
    /// .. _`alt_link_label`: `link_label`_
    /// ```
    Label2Label(Cow<'a, str>, Cow<'a, str>),

    /// An _inline image_ with the following tuple values:
    /// ```text
    /// Image(img_alt, img_src)
    /// ```
    /// Note: this crate does not contain parsers for this variant.
    Image(Cow<'a, str>, Cow<'a, str>),

    /// An _inline link_ with embedded _inline image_ and the following
    /// tuple values:
    /// ```text
    /// Image2Dest(text1, img_alt, img_src, text2, dest, title)
    /// ```
    Image2Dest(
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
        Cow<'a, str>,
    ),
}
178
179/// A parser that decodes percent encoded URLS.
180/// This parser consumes all input. It returns `Err` when the percent-decoded
181/// bytes are not well-formed in UTF-8.
182/// ```text
183/// use std::borrow::Cow;
184///
185/// let res = percent_decode("https://getreu.net/?q=%5Ba%20b%5D").unwrap();
186/// assert_eq!(res, ("", Cow::Owned("https://getreu.net/?q=[a b]".to_string())));
187///```
188fn percent_decode(i: &'_ str) -> nom::IResult<&'_ str, Cow<'_, str>> {
189    let decoded = percent_decode_str(i)
190        .decode_utf8()
191        .map_err(|_| nom::Err::Error(nom::error::Error::new(i, ErrorKind::EscapedTransform)))?;
192    Ok(("", decoded))
193}
194
#[test]
fn test_percent_decode() {
    // Input with an escape sequence is decoded into a newly allocated string.
    let (_, decoded) = percent_decode("percent%20encoded string").unwrap();
    assert!(matches!(decoded, Cow::Owned(..)));
    assert_eq!(decoded, Cow::from("percent encoded string"));

    // Input without escape sequences is handed back as a borrow.
    let (_, untouched) = percent_decode("nothing").unwrap();
    assert!(matches!(untouched, Cow::Borrowed(..)));
    assert_eq!(untouched, Cow::from("nothing"));
}