discord_markdown/
parser.rs

1//! Parse Discord MarkDown into an AST
2
3use nom::{IResult, Slice, branch::alt, bytes::complete::{is_not, tag, take_until}, combinator::{cond, map_opt, map_parser, recognize}, regex::Regex, sequence::{delimited, pair, preceded, terminated}};
4use lazy_static::lazy_static;
5
/// Enum to represent the AST
///
/// Every borrowed `&'a str` is either a slice of the parsed input or a string literal.
/// Variants holding a `Vec<Expression>` contain the recursively parsed content of the span.
#[derive(Debug, PartialEq)]
pub enum Expression<'a> {
    /// Plain text with no special meaning (also used for escaped characters).
    Text(&'a str),
    /// Custom emoji: the emoji name and a `"<id>.gif"` / `"<id>.png"` file name.
    CustomEmoji(&'a str, String),
    /// User mention; holds the numeric id as written in the input.
    User(&'a str),
    /// Role mention; holds the numeric id as written in the input.
    Role(&'a str),
    /// Channel mention; holds the numeric id as written in the input.
    Channel(&'a str),
    /// Hyperlink as `(text, url)`; for bare links both fields are the url.
    Hyperlink(&'a str, &'a str),
    /// Content between triple backticks.
    MultilineCode(&'a str),
    /// Content between single or double backticks.
    InlineCode(&'a str),
    /// A line introduced by `> `.
    Blockquote(Vec<Expression<'a>>),
    /// Content between `||` markers.
    Spoiler(Vec<Expression<'a>>),
    /// Content between `__` markers.
    Underline(Vec<Expression<'a>>),
    /// Content between `~~` markers.
    Strikethrough(Vec<Expression<'a>>),
    /// Content between `**` markers.
    Bold(Vec<Expression<'a>>),
    /// Content between single `_` or `*` markers.
    Italics(Vec<Expression<'a>>),
    /// A line break (`\n`) in the input.
    Newline,
}
25
// Pre-compiled patterns used through `re_capture`. All are anchored with `^` so they only
// match at the start of the remaining input. `re_capture` measures how much input was
// consumed from the end of the last capture group, which is why every pattern finishes
// with a capture group (e.g. the trailing `(>)`).
lazy_static! {
    // `<:name:id>` or `<a:name:id>`; groups: animated flag, name, id, closing `>`
    static ref CUSTOM_EMOJI_RE: Regex = Regex::new(r"^<(a?):(\w+):(\d+)(>)").unwrap();
    // `<@id>` or `<@!id>`; groups: id, closing `>`
    static ref USER_RE: Regex = Regex::new(r"^<@!?(\d+)(>)").unwrap();
    // `<@&id>`; groups: id, closing `>`
    static ref ROLE_RE: Regex = Regex::new(r"^<@&(\d+)(>)").unwrap();
    // `<#id>`; groups: id, closing `>`
    static ref CHANNEL_RE: Regex = Regex::new(r"^<#(\d+)(>)").unwrap();
    // http(s), ftp and file URLs; groups: scheme, everything from `://` onwards
    static ref LINK_RE: Regex = Regex::new(r"^(https?|ftp|file)(://[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[A-Za-z0-9+&@#/%=~_|])").unwrap();
}
33
34// Re-implement re_capture from nom, but make it take &'a Regex instead of Regex
35// This provides a noticeable speed improvement since we don't have to RE.clone() each time
36fn re_capture<'a, E>(re: &'a Regex) -> impl Fn(&'a str) -> IResult<&'a str, Vec<&'a str>, E>
37    where
38        E: nom::error::ParseError<&'a str>,
39{
40    move |i| {
41        if let Some(c) = re.captures(i) {
42            let v: Vec<_> = c
43                .iter()
44                .filter(|el| el.is_some())
45                .map(|el| el.unwrap())
46                .map(|m| i.slice(m.start()..m.end()))
47                .collect();
48            let offset = {
49                let end = v.last().unwrap();
50                end.as_ptr() as usize + end.len() - i.as_ptr() as usize
51            };
52            Ok((i.slice(offset..), v))
53        } else {
54            Err(nom::Err::Error(E::from_error_kind(i, nom::error::ErrorKind::RegexpCapture)))
55        }
56    }
57}
58
59// Parses custom emoji
60fn custom_emoji<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
61    let (input, custom_emoji) = re_capture(&CUSTOM_EMOJI_RE)(input)?;
62    let extension = if custom_emoji[1] == "a" { "gif" } else { "png" };
63    Ok((input, Expression::CustomEmoji(custom_emoji[2], format!("{}.{}", custom_emoji[3], extension))))
64}
65
66// Parses user mentions
67fn user<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
68    let (input, user) = re_capture(&USER_RE)(input)?;
69    Ok((input, Expression::User(user[1])))
70}
71
72// Parses role mentions
73fn role<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
74    let (input, role) = re_capture(&ROLE_RE)(input)?;
75    Ok((input, Expression::Role(role[1])))
76}
77
78// Parses channel links
79fn channel<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
80    let (input, channel) = re_capture(&CHANNEL_RE)(input)?;
81    Ok((input, Expression::Channel(channel[1])))
82}
83
84fn hyperlink_internals(input: &str) -> IResult<&str, (&str, &str)> {
85    let (input, hyperlink) = alt((
86        re_capture(&LINK_RE),
87        delimited(tag("<"), re_capture(&LINK_RE), tag(">")),
88    ))(input)?;
89    Ok((input, (hyperlink[0], hyperlink[0])))
90}
91
92// Parses hyperlinks
93fn hyperlink<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
94    let (input, hyperlink) = hyperlink_internals(input)?;
95    Ok((input, Expression::Hyperlink(hyperlink.0, hyperlink.1)))
96}
97
98// Parses hyperlinks with support for alt text
99fn md_hyperlink<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
100    let (input, hyperlink) = alt((
101        hyperlink_internals,
102        pair(
103            delimited(tag("["), take_until("]"), tag("]")),
104            delimited(tag("("), |input| {
105                let x = hyperlink_internals(input)?;
106                Ok((x.0, x.1.0))
107            }, tag(")"))
108        ),
109    ))(input)?;
110    Ok((input, Expression::Hyperlink(hyperlink.0, hyperlink.1)))
111}
112
113fn multiline_code<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
114    let (input, multiline_code) = delimited(tag("```"), take_until("```"), tag("```"))(input)?;
115    Ok((input, Expression::MultilineCode(multiline_code)))
116}
117
118fn inline_code<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
119    let (input, inline_code) = alt((
120        // If the inline code block is delimited by ``
121        delimited(tag("``"), take_until("``"), tag("``")),
122        // If the inline code block is delimited by `
123        delimited(tag("`"), is_not("`"), tag("`")),
124    ))(input)?;
125    Ok((input, Expression::InlineCode(inline_code)))
126}
127
128fn blockquote<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
129    let (input, blockquote) = map_parser(alt((
130        // Blockquote until end of line
131        delimited(tag("> "), is_not("\n"), tag("\n")),
132        // Special case for `> \n`
133        preceded(tag("> "), tag("\n")),
134        // Blockquote until end of file
135        preceded(tag("> "), is_not("\n")),
136    )), parse_section)(input)?;
137    Ok((input, Expression::Blockquote(blockquote)))
138}
139
140fn spoiler<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
141    let (input, spoiler) = map_parser(
142        delimited(tag("||"), take_until("||"), tag("||")),
143        parse_section,
144    )(input)?;
145    Ok((input, Expression::Spoiler(spoiler)))
146}
147
148fn underline<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
149    let (input, underline) = map_parser(
150        alt((
151            // Special case with four surrounding underlines
152            delimited(tag("____"), take_until("____"), tag("____")),
153            // Special case with three surrounding underlines
154            delimited(
155                tag("__"),
156                recognize(delimited(tag("_"), take_until("___"), tag("_"))),
157                tag("__"),
158            ),
159            // Special case with three underscores at the end alone
160            delimited(
161                tag("__"),
162                recognize(terminated(take_until("___"), tag("_"))),
163                tag("__"),
164            ),
165            delimited(tag("__"), take_until("__"), tag("__")),
166        )),
167        parse_section,
168    )(input)?;
169    Ok((input, Expression::Underline(underline)))
170}
171
172fn strikethrough<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
173    let (input, strikethrough) = map_parser(
174        delimited(tag("~~"), take_until("~~"), tag("~~")),
175        parse_section,
176    )(input)?;
177    Ok((input, Expression::Strikethrough(strikethrough)))
178}
179
180fn bold<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
181    let (input, bold) = map_parser(
182        alt((
183            // Special case with four surrounding asterisks
184            delimited(tag("****"), take_until("****"), tag("****")),
185            // Special case with three surrounding asterisks
186            delimited(
187                tag("**"),
188                recognize(delimited(tag("*"), take_until("***"), tag("*"))),
189                tag("**"),
190            ),
191            // Special case with three asterisks at the end alone
192            delimited(
193                tag("**"),
194                recognize(terminated(take_until("***"), tag("*"))),
195                tag("**"),
196            ),
197            delimited(tag("**"), take_until("**"), tag("**")),
198        )),
199        parse_section,
200    )(input)?;
201    Ok((input, Expression::Bold(bold)))
202}
203
204fn italics<'a>(input: &'a str) -> IResult<&str, Expression<'a>> {
205    let (input, italics) = map_parser(
206        alt((
207            delimited(tag("_"), is_not("_"), tag("_")),
208            delimited(tag("*"), is_not("*"), tag("*")),
209        )),
210        parse_section,
211    )(input)?;
212    Ok((input, Expression::Italics(italics)))
213}
214
215fn apply_parsers(
216    allow_blockquote: bool,
217    md_hyperlinks: bool,
218    input: &str,
219) -> IResult<&str, Expression> {
220    alt((
221        map_opt(cond(allow_blockquote, blockquote), |o| o),
222        custom_emoji,
223        user,
224        role,
225        channel,
226        if md_hyperlinks {md_hyperlink} else {hyperlink},
227        multiline_code,
228        inline_code,
229        spoiler,
230        underline,
231        strikethrough,
232        bold,
233        italics,
234    ))(input)
235}
236
237fn parse_internals<'a>(
238    mut input: &'a str,
239    mut allow_blockquote: bool,
240    md_hyperlinks: bool,
241) -> IResult<&str, Vec<Expression<'a>>> {
242    // Attempt to parse everything until we encounter a newline/end of input
243    let mut result = Vec::new();
244
245    'outer: while input.len() != 0 {
246        for (i, c) in input.char_indices() {
247            if c == '\n' {
248                // If it's a newline, we can parse blockquotes starting from the next character
249                if i > 0 {
250                    result.push(Expression::Text(&input[..i]))
251                }
252                result.push(Expression::Newline);
253                allow_blockquote = true;
254                // Remove the parsed part from `input` and restart the for loop
255                // We can safely do i + 1 because the input can't end with \n (it's stripped)
256                input = &input[i + 1..];
257                continue 'outer;
258            } else if c == '¯' && input[i..].starts_with(r"¯\_(ツ)_/¯") {
259                // Parse shrug emote
260                if i > 0 {
261                    result.push(Expression::Text(&input[..i]))
262                }
263                // Push the shrug emote as Expression::Text
264                result.push(Expression::Text(r"¯\_(ツ)_/¯"));
265                // Remove the parsed part from `input` and restart the for loop
266                input = &input[i + r"¯\_(ツ)_/¯".len()..];
267                continue 'outer;
268            } else if c == '\\' && input[i..].len() > 1 {
269                // If it's a backslash, we should escape the following character
270                if i > 0 {
271                    result.push(Expression::Text(&input[..i]))
272                }
273                // Push the escaped character as Expression::Text
274                let (char_pos, c) = input.char_indices().nth(i + 1).unwrap();
275                result.push(Expression::Text(&input[char_pos..char_pos + c.len_utf8()]));
276                // Remove the parsed part from `input` and restart the for loop
277                input = &input[char_pos + c.len_utf8()..];
278                continue 'outer;
279            }
280            if let Ok((remaining, expr)) = apply_parsers(allow_blockquote, md_hyperlinks, &input[i..]) {
281                // Don't reset blockquote if we just matched on a blockquote because it consumes a
282                // succeeding newline if it exists, and if it doesn't, `allow_blockquote` doesn't
283                // matter anyway
284                if !matches!(expr, Expression::Blockquote(_)) {
285                    // Reset allow_blockquote because we're not immediately after a newline
286                    allow_blockquote = false;
287                }
288                // Add the text up to the parsed expression as Expression::Text
289                if i > 0 {
290                    result.push(Expression::Text(&input[..i]))
291                }
292                // Add the parsed expression
293                result.push(expr);
294                // Remove the parsed part from `input` and restart the for loop
295                input = remaining;
296                continue 'outer;
297            } else {
298                allow_blockquote = false;
299            }
300        }
301        if input.len() != 0 {
302            result.push(Expression::Text(input));
303            input = "";
304        }
305    }
306
307    Ok((input, result))
308}
309
310fn parse_section<'a>(mut input: &'a str) -> IResult<&str, Vec<Expression<'a>>> {
311    parse_internals(&mut input, false, false)
312}
313
314/// Parses the given input string as Discord MarkDown and returns a vector of `Expression`s
315///
316/// ```
317/// use discord_markdown::parser::{parse, Expression::*};
318///
319/// let ast = parse(
320///     "> Can someone link the rust website?\n<@123456789123456789> https://www.rust-lang.org"
321/// );
322///
323/// assert_eq!(ast, vec![
324///     Blockquote(vec![Text("Can someone link the rust website?")]),
325///     User("123456789123456789"),
326///     Text(" "),
327///     Hyperlink("https://www.rust-lang.org", "https://www.rust-lang.org"),
328/// ]);
329/// ```
330pub fn parse(mut input: &str) -> Vec<Expression> {
331    parse_internals(&mut input, true, false).unwrap().1
332}
333
334/// Parses the given input string as Discord MarkDown with support for hyperlinks with alt text
335/// (used in discord embeds) and returns a vector of `Expression`s
336///
337/// ```
338/// use discord_markdown::parser::{parse_with_md_hyperlinks, Expression::*};
339///
340/// let ast = parse_with_md_hyperlinks("_link_: [example](https://example.com)");
341/// assert_eq!(ast, vec![
342///     Italics(vec![Text("link")]),
343///     Text(": "),
344///     Hyperlink("example", "https://example.com"),
345/// ]);
346/// ```
347pub fn parse_with_md_hyperlinks(mut input: &str) -> Vec<Expression> {
348    parse_internals(&mut input, true, true).unwrap().1
349}