rustla/parser/state_machine/
inline.rs

1/*!
2A submodule related to parsing blocks of text for inline elements.
3
4### Inline markup recognition rules
5
6Inline markup start-strings and end-strings are only recognized if the following conditions are met:
7
81. Inline markup start-strings must be immediately followed by non-whitespace.
92. Inline markup end-strings must be immediately preceded by non-whitespace.
103. The inline markup end-string must be separated by at least one character from the start-string.
114. Both, inline markup start-string and end-string must not be preceded by an unescaped backslash
12   (except for the end-string of inline literals). See Escaping Mechanism above for details.
135. If an inline markup start-string is immediately preceded by one of the ASCII characters ' " < ( [ { or a similar non-ASCII character,
14   it must not be followed by the corresponding closing character from ' " ) ] } > or a similar non-ASCII character.
15   (For quotes, matching characters can be any of the quotation marks in international usage.)
16
17If the configuration setting simple-inline-markup is False (default),
18additional conditions apply to the characters "around" the inline markup:
19
206. Inline markup start-strings must start a text block or be immediately preceded by
21    * whitespace,
22    * one of the ASCII characters - : / ' " < ( [ {
23    * or a similar non-ASCII punctuation character.
24
257. Inline markup end-strings must end a text block or be immediately followed by
26    * whitespace,
27    * one of the ASCII characters - . , : ; ! ? \ / ' " ) ] } >
28    * or a similar non-ASCII punctuation character.
29
30Copyright © 2020 Santtu Söderholm
31*/
32
33use super::*;
34use crate::common::normalize_refname;
35use crate::common::Reference;
36use utf8_to_latex::unicode_text_to_latex;
37
38/// Parses inline text elements that have identical opening
39/// and closing delimiters such as `**strong emphasis**` or ``` ``literal_text`` ```.
40pub fn paired_delimiter(
41    opt_doctree_ref: &mut Option<&mut DocTree>,
42    pattern_name: Pattern,
43    captures: &regex::Captures,
44) -> (Vec<TreeNodeType>, usize) {
45    // Destructuring the regex parts...
46
47    let lookbehind_str = if let Some(lookbehind) = captures.name("lookbehind") {
48        lookbehind.as_str()
49    } else {
50        ""
51    };
52    let markup_start = captures.name("markup_start").unwrap().as_str();
53    let content = captures.name("content").unwrap().as_str();
54    let markup_end = captures.name("markup_end").unwrap().as_str();
55    let lookahead_str = if let Some(lookahead) = captures.name("lookahead") {
56        lookahead.as_str()
57    } else {
58        ""
59    };
60
61    let content_string = unicode_text_to_latex(content);
62
63    let mut node_vec = Vec::<TreeNodeType>::new();
64    let mut char_count: usize = 0;
65
66    if quotation_matches(lookbehind_str, content) {
67        let quoted_start_char_count = lookbehind_str.chars().count()
68            + markup_start.chars().count()
69            + content.chars().count()
70            + markup_end.chars().count();
71
72        let quoted_start_string: String = captures
73            .get(0)
74            .unwrap()
75            .as_str()
76            .chars()
77            .take(quoted_start_char_count)
78            .collect::<String>();
79
80        return (
81            vec![
82                TreeNodeType::Text {
83                    text: unicode_text_to_latex(quoted_start_string.as_str()),
84                }
85            ],
86            quoted_start_char_count,
87        );
88    }
89
90    if !lookbehind_str.is_empty() {
91        char_count += lookbehind_str.chars().count();
92        node_vec.push(
93            TreeNodeType::Text {
94                text: unicode_text_to_latex(lookbehind_str),
95            }
96        );
97    }
98
99    char_count += markup_start.chars().count()
100        + content.chars().count()
101        + markup_end.chars().count();
102    let markup_data = match pattern_name {
103        Pattern::StrongEmphasis => TreeNodeType::StrongEmphasis {
104            text: content_string,
105        },
106        Pattern::Emphasis => TreeNodeType::Emphasis {
107            text: content_string,
108        },
109        Pattern::Literal => TreeNodeType::Literal {
110            text: content_string,
111        },
112        _ => panic!("No such simple paired delimiter type!"),
113    };
114
115    node_vec.push(markup_data);
116
117    (node_vec, char_count)
118}
119
120/// Parses inline reference targets. These do not actually create new nodes,
121/// but push new labels into the doctree's inline target stack.
122pub fn inline_target(
123    opt_doctree_ref: &mut Option<&mut DocTree>,
124    pattern_name: Pattern,
125    captures: &regex::Captures,
126) -> (Vec<TreeNodeType>, usize) {
127    let lookbehind_str = if let Some(lookbehind) = captures.name("lookbehind") {
128        lookbehind.as_str()
129    } else {
130        ""
131    };
132    let markup_start = captures.name("markup_start").unwrap().as_str();
133    let content = captures.name("content").unwrap().as_str();
134    let markup_end = captures.name("markup_end").unwrap().as_str();
135    let lookahead_str = if let Some(lookahead) = captures.name("lookahead") {
136        lookahead.as_str()
137    } else {
138        ""
139    };
140
141    let lookbehind_len = lookbehind_str.chars().count();
142    let markup_start_len = markup_start.chars().count();
143    let content_len = content.chars().count();
144    let markup_end_len = markup_end.chars().count();
145    let lookahead_len = lookbehind_str.chars().count();
146
147    let mut node_vec = Vec::<TreeNodeType>::new();
148    let mut char_count = 0usize;
149
150    if !lookbehind_str.is_empty() {
151        let lookbehind_node = TreeNodeType::Text {
152            text: unicode_text_to_latex(lookbehind_str),
153        };
154        node_vec.push(lookbehind_node);
155        char_count += lookbehind_len;
156    }
157
158    if let Some(doctree) = opt_doctree_ref {
159        let normalized_label = normalize_refname(content);
160        doctree.push_to_internal_target_stack(normalized_label);
161    } else {
162        eprintln!(
163            "No doctree given so cannot process a new internal target \"{}{}{}\"...",
164            markup_start, content, markup_end
165        );
166    }
167
168    char_count += markup_start_len + content_len + markup_end_len;
169
170    (node_vec, char_count)
171}
172
173/// Parses inline whitespace.
174pub fn whitespace(
175    opt_doctree_ref: &mut Option<&mut DocTree>,
176    pattern_name: Pattern,
177    captures: &regex::Captures,
178) -> (Vec<TreeNodeType>, usize) {
179    let content = captures.get(0).unwrap();
180    let node_data = TreeNodeType::WhiteSpace {
181        text: String::from(content.as_str()),
182    };
183    let match_len = content.as_str().chars().count();
184
185    (vec![node_data], match_len)
186}
187
188pub fn interpreted_text(
189    opt_doctree_ref: &mut Option<&mut DocTree>,
190    pattern_name: Pattern,
191    captures: &regex::Captures,
192) -> (Vec<TreeNodeType>, usize) {
193    let whole_match = captures.get(0).unwrap().as_str();
194    let lookbehind_str = if let Some(lookbehind) = captures.name("lookbehind") {
195        lookbehind.as_str()
196    } else {
197        ""
198    };
199    let front_role_marker = if let Some(marker) = captures.name("front_role_marker") {
200        marker.as_str()
201    } else {
202        ""
203    };
204    let front_role = if let Some(role) = captures.name("front_role") {
205        role.as_str()
206    } else {
207        ""
208    };
209    let markup_start_str = captures.name("markup_start").unwrap().as_str();
210    let content = captures.name("content").unwrap().as_str();
211    let markup_end_str = captures.name("markup_end").unwrap().as_str();
212    let back_role_marker = if let Some(marker) = captures.name("back_role_marker") {
213        marker.as_str()
214    } else {
215        ""
216    };
217    let back_role = if let Some(role) = captures.name("back_role") {
218        role.as_str()
219    } else {
220        ""
221    };
222    let lookahead_str = if let Some(lookahead) = captures.name("lookahead") {
223        lookahead.as_str()
224    } else {
225        ""
226    };
227
228    let whole_match_len = whole_match.chars().count();
229    let lookbehind_len = lookbehind_str.chars().count();
230    let front_role_marker_len = front_role_marker.chars().count();
231    let front_role_len = front_role.chars().count();
232    let markup_start_len = markup_start_str.chars().count();
233    let content_len = content.chars().count();
234    let markup_end_len = markup_end_str.chars().count();
235    let back_role_marker_len = back_role_marker.chars().count();
236    let back_role_len = back_role.chars().count();
237    let lookahead_len = lookahead_str.chars().count();
238
239    if !front_role_marker.is_empty() && !back_role_marker.is_empty() {
240        eprintln!("Warning: found both pre- and suffix roles for interpreted text. Returning whole match as inline literal...");
241        let match_len = (lookbehind_str.to_string()
242            + front_role_marker
243            + markup_start_str
244            + content
245            + markup_end_str
246            + back_role_marker)
247            .chars()
248            .count();
249        let match_string: String = whole_match.chars().take(match_len).collect();
250        return (
251            vec![TreeNodeType::Literal { text: match_string }],
252            match_len,
253        );
254    }
255
256    if !front_role_marker.is_empty() && quotation_matches(lookbehind_str, front_role) {
257        let quoted_start_char_count = 2 * lookbehind_len + ":".chars().count();
258
259        let quoted_start_string: String = captures
260            .get(0)
261            .unwrap()
262            .as_str()
263            .chars()
264            .take(quoted_start_char_count)
265            .collect();
266
267        return (
268            vec![TreeNodeType::Text {
269                text: quoted_start_string,
270            }],
271            quoted_start_char_count,
272        );
273    } else if front_role_marker.is_empty() && quotation_matches(lookbehind_str, content) {
274        let quoted_start_char_count = lookbehind_len + markup_start_len;
275
276        let quoted_start_string: String = captures
277            .get(0)
278            .unwrap()
279            .as_str()
280            .chars()
281            .take(quoted_start_char_count)
282            .collect();
283
284        return (
285            vec![
286                TreeNodeType::Text { text: quoted_start_string }
287            ],
288            quoted_start_char_count,
289        );
290    } else if !lookbehind_str.is_empty() {
291        return (
292            vec![
293                TreeNodeType::Text { text: unicode_text_to_latex(lookbehind_str) }
294            ],
295            lookbehind_str.chars().count(),
296        );
297    }
298
299    let match_len = (lookbehind_str.to_string()
300        + front_role_marker
301        + markup_start_str
302        + content
303        + markup_end_str
304        + back_role_marker)
305        .chars()
306        .count();
307    let role = if !front_role.is_empty() {
308        front_role
309    } else if !back_role.is_empty() {
310        back_role
311    } else {
312
313        /// This is used as the interpreted text role, if no role was specified.
314        /// This is in accordance with the
315        /// [reStructuredText Markup Specification](https://docutils.sourceforge.io/docs/ref/rst/roles.html).
316        const DEFAULT_DEFAULT_ROLE: &str = "title-reference";
317
318        eprintln!(
319            "Warning: no role found for interpreted text. Using {}...",
320            DEFAULT_DEFAULT_ROLE
321        );
322        return (
323            vec![TreeNodeType::TitleReference {
324                displayed_text: content.to_string(),
325                target_label: normalize_refname(content),
326            }],
327            match_len,
328        );
329    };
330
331    match role {
332        "emphasis" => (
333            vec![TreeNodeType::Emphasis {
334                text: content.to_string(),
335            }],
336            match_len,
337        ),
338        "literal" => (
339            vec![TreeNodeType::Literal {
340                text: content.to_string(),
341            }],
342            match_len,
343        ),
344        "code" => (
345            vec![TreeNodeType::Literal {
346                text: content.to_string(),
347            }],
348            match_len,
349        ),
350        "math" => {
351            use utf8_to_latex::unicode_math_to_latex;
352            // TODO: add conversions from utf8-characters such as greek letters
353            //  to LaTeX macros to this, maybe via a "utf8_to_latex" function.
354            let content_string = unicode_math_to_latex(content);
355            (
356                vec![TreeNodeType::Math {
357                    text: content_string,
358                    class: None,
359                    name: None,
360                }],
361                match_len,
362            )
363        }
364        "pep-reference" | "PEP" => {
365            // PEP reference strings are 4 digits long
366            let zeroes = "0".repeat(4 - content_len);
367            let pep_ref = format!(
368                "https://www.python.org/peps/pep-{pep_num}.html",
369                pep_num = zeroes + content
370            );
371            let displayed_text = "PEP ".to_string() + content;
372            (
373                vec![TreeNodeType::Reference {
374                    displayed_text: Some(displayed_text),
375                    reference: crate::common::Reference::URI(pep_ref),
376                }],
377                match_len,
378            )
379        }
380        "rfc-reference" | "RFC" => {
381            let rfc_ref = format!(
382                "https://www.faqs.org/rfcs/rfc{rfc_num}.html",
383                rfc_num = content
384            );
385            let displayed_text = "RFC ".to_string() + content;
386            (
387                vec![TreeNodeType::Reference {
388                    displayed_text: Some(displayed_text),
389                    reference: crate::common::Reference::URI(rfc_ref),
390                }],
391                match_len,
392            )
393        }
394        "strong" => (
395            vec![TreeNodeType::StrongEmphasis {
396                text: content.to_string(),
397            }],
398            match_len,
399        ),
400        "subscript" => (
401            vec![TreeNodeType::Subscript {
402                text: content.to_string(),
403            }],
404            match_len,
405        ),
406        "superscript" => (
407            vec![TreeNodeType::Superscript {
408                text: content.to_string(),
409            }],
410            match_len,
411        ),
412        "title-reference" => (
413            vec![TreeNodeType::TitleReference {
414                displayed_text: content.to_string(),
415                target_label: normalize_refname(content),
416            }],
417            match_len,
418        ),
419
420        // Sphinx-specific roles
421        // "ref" => {
422        //   // TODO: Parse the content string with Parser::inline parse and handle the output accordingly.
423        //   (
424        //     vec![
425        //       TreeNodeType::Reference {
426        //         displayed_text: Some(content.to_string()),
427        //         reference: crate::common::Reference::Internal(normalize_refname(content))
428        //       }
429        //     ], match_len
430        //   )
431        // }
432        _ => {
433            // Unknown role into literal
434            let match_len = (lookbehind_str.to_string()
435                + front_role_marker
436                + markup_start_str
437                + content
438                + markup_end_str
439                + back_role_marker)
440                .chars()
441                .count();
442            let match_string: String = whole_match.chars().take(match_len).collect();
443            return (
444                vec![
445                    TreeNodeType::InterpretedText {
446                        role: role.to_string(),
447                        content: content.to_string()
448                }],
449                match_len,
450            );
451        }
452    }
453}
454
455/// Parses simple hyperlink references.
456pub fn simple_ref(
457    opt_doctree_ref: &mut Option<&mut DocTree>,
458    pattern_name: Pattern,
459    captures: &regex::Captures,
460) -> (Vec<TreeNodeType>, usize) {
461    let lookbehind_str = if let Some(lookbehind) = captures.name("lookbehind") {
462        lookbehind.as_str()
463    } else {
464        ""
465    };
466    let content = captures.name("content").unwrap().as_str();
467    let ref_type = captures.name("ref_type").unwrap().as_str();
468    let lookahead_str = if let Some(lookahead) = captures.name("lookahead") {
469        lookahead.as_str()
470    } else {
471        ""
472    };
473
474    if !lookbehind_str.is_empty() {
475        return (
476            vec![TreeNodeType::Text {
477                text: unicode_text_to_latex(lookbehind_str),
478            }],
479            lookbehind_str.chars().count(),
480        );
481    } else {
482        let target_label: String = match ref_type {
483            "__" => {
484                // Automatic reference label => ask doctree for label, if present. Else use the manual label
485
486                if let Some(doctree) = opt_doctree_ref {
487                    doctree.next_anon_reference_label()
488                } else {
489                    eprintln!("Warning: detected an automatic reference name but no doctree available to generate one...");
490                    normalize_refname(content)
491                }
492            }
493            "_" => {
494                // Manual reference label
495                normalize_refname(content)
496            }
497            _ => unreachable!(
498                "Only automatic or manual reference types are recognized. Computer says no..."
499            ),
500        };
501
502        let ref_node = TreeNodeType::Reference {
503            displayed_text: None,
504            reference: crate::common::Reference::Internal(target_label),
505        };
506
507        let match_len = (lookbehind_str.to_string() + content + ref_type)
508            .chars()
509            .count();
510
511        (vec![ref_node], match_len)
512    }
513}
514
515/// Parses phrase references.
516pub fn phrase_ref(
517    opt_doctree_ref: &mut Option<&mut DocTree>,
518    pattern_name: Pattern,
519    captures: &regex::Captures,
520) -> (Vec<TreeNodeType>, usize) {
521    let whole_match = captures.get(0).unwrap().as_str();
522    let lookbehind_str = if let Some(lookbehind) = captures.name("lookbehind") {
523        lookbehind.as_str()
524    } else {
525        ""
526    };
527    let markup_start_str = captures.name("markup_start").unwrap().as_str();
528    let content = captures.name("content").unwrap().as_str();
529    let embedded_uri_container = if let Some(uri) = captures.name("embedded_uri_container") {
530        uri.as_str()
531    } else {
532        ""
533    };
534    let embedded_uri = if let Some(uri) = captures.name("embedded_uri") {
535        uri.as_str()
536    } else {
537        ""
538    };
539    let ref_type = captures.name("ref_type").unwrap().as_str();
540    let markup_end_str = captures.name("markup_end").unwrap().as_str();
541    let lookahead_str = if let Some(lookahead) = captures.name("lookahead") {
542        lookahead.as_str()
543    } else {
544        ""
545    };
546
547    if quotation_matches(lookbehind_str, content) {
548        let quoted_start_char_count = lookbehind_str.chars().count()
549            + markup_start_str.chars().count()
550            + content.chars().count();
551
552        let quoted_start_string: String = captures
553            .get(0)
554            .unwrap()
555            .as_str()
556            .chars()
557            .take(quoted_start_char_count)
558            .collect();
559
560        return (
561            vec![TreeNodeType::Text {
562                text: quoted_start_string,
563            }],
564            quoted_start_char_count,
565        );
566    } else if !lookbehind_str.is_empty() {
567        return (
568            vec![TreeNodeType::Text {
569                text: unicode_text_to_latex(lookbehind_str),
570            }],
571            lookbehind_str.chars().count(),
572        );
573    }
574
575    let reference = match ref_type {
576        "__" => {
577            // Automatic reference label => ask doctree for label, if present. Else use the manual label
578
579            if let Some(doctree) = opt_doctree_ref {
580                Reference::Internal(doctree.next_anon_reference_label())
581            } else {
582                eprintln!("Warning: detected an automatic reference name but no doctree available to generate one...");
583                Reference::Internal(normalize_refname(content))
584            }
585        }
586        "_" => {
587            // Manual reference label
588
589            if !embedded_uri.is_empty() {
590                Reference::URI(normalize_refname(embedded_uri))
591            } else {
592                Reference::Internal(normalize_refname(content))
593            }
594        }
595        _ => unreachable!(
596            "Only automatic or manual reference types are recognized. Computer says no..."
597        ),
598    };
599
600    let ref_node = TreeNodeType::Reference {
601        displayed_text: Some(content.to_string()),
602        reference: reference,
603    };
604
605    let match_len = if embedded_uri.is_empty() {
606        (lookbehind_str.to_string() + markup_start_str + content + markup_end_str + ref_type)
607            .chars()
608            .count()
609    } else {
610        whole_match.chars().count()
611    };
612
613    (vec![ref_node], match_len)
614}
615
616/// Parses footnote references.
617pub fn footnote_ref(
618    opt_doctree_ref: &mut Option<&mut DocTree>,
619    pattern_name: Pattern,
620    captures: &regex::Captures,
621) -> (Vec<TreeNodeType>, usize) {
622
623    // Gather parts of regex into strings
624    let lookbehind_str = if let Some(lookbehind) = captures.name("lookbehind") {
625        lookbehind.as_str()
626    } else {
627        ""
628    };
629    let content = captures.name("content").unwrap().as_str();
630    let markup_start_str = captures.name("markup_start").unwrap().as_str();
631    let number = if let Some(label) = captures.name("number") {
632        label.as_str()
633    } else {
634        ""
635    };
636    let auto_number = if let Some(label) = captures.name("auto_number") {
637        label.as_str()
638    } else {
639        ""
640    };
641    let auto_number_label = if let Some(label) = captures.name("auto_number_label") {
642        label.as_str()
643    } else {
644        ""
645    };
646    let symbol = if let Some(label) = captures.name("symbol") {
647        label.as_str()
648    } else {
649        ""
650    };
651    let ref_type = captures.name("ref_type").unwrap().as_str();
652    let markup_end_str = captures.name("markup_end").unwrap().as_str();
653    let lookahead_str = if let Some(lookahead) = captures.name("lookahead") {
654        lookahead.as_str()
655    } else {
656        ""
657    };
658
659    let (mut nodevec, mut offset) = (Vec::new(), 0);
660
661    if !lookbehind_str.is_empty() {
662        nodevec.push(
663            TreeNodeType::Text {
664                text: unicode_text_to_latex(lookbehind_str),
665            }
666        );
667        offset += lookbehind_str.chars().count();
668    }
669
670    offset += markup_start_str.chars().count();
671
672    if ! number.is_empty() {
673        nodevec.push(
674            TreeNodeType::FootnoteReference {
675                displayed_text: String::from(number),
676                target_label: String::from(number),
677                kind: FootnoteKind::Manual
678            }
679        );
680    } else if ! auto_number.is_empty() {
681        nodevec.push(
682            TreeNodeType::FootnoteReference {
683                displayed_text: String::from(auto_number),
684                target_label: String::from(auto_number),
685                kind: FootnoteKind::AutoNumbered
686            }
687        );
688    } else if ! auto_number_label.is_empty() {
689        nodevec.push(
690            TreeNodeType::FootnoteReference {
691                displayed_text: String::from(auto_number_label),
692                target_label: String::from(auto_number_label),
693                kind: FootnoteKind::SimpleRefName
694            }
695        );
696    } else if ! symbol.is_empty() {
697        nodevec.push(
698            TreeNodeType::FootnoteReference {
699                displayed_text: String::from(symbol),
700                target_label: String::from(symbol),
701                kind:FootnoteKind::AutoSymbol
702            }
703        )
704    } else {
705        unreachable!(
706            "Unknown footnote reference type for {}. Computer says no...",
707            captures.get(0).unwrap().as_str()
708        )
709    };
710
711    offset += content.chars().count() + markup_end_str.chars().count() + ref_type.chars().count();
712
713    (nodevec, offset)
714}
715
716/// Parses citation references.
717pub fn citation_ref(
718    opt_doctree_ref: &mut Option<&mut DocTree>,
719    pattern_name: Pattern,
720    captures: &regex::Captures,
721) -> (Vec<TreeNodeType>, usize) {
722
723    let mut node_data = Vec::<TreeNodeType>::new();
724    let mut offset = 0 as usize;
725
726    let lookbehind_str = if let Some(lookbehind) = captures.name("lookbehind") {
727        lookbehind.as_str()
728    } else {
729        ""
730    };
731    let markup_start_str = captures.name("markup_start").unwrap().as_str();
732    let content = captures.name("content").unwrap().as_str();
733    let markup_end_str = captures.name("markup_end").unwrap().as_str();
734    let lookahead_str = if let Some(lookahead) = captures.name("lookahead") {
735        lookahead.as_str()
736    } else {
737        ""
738    };
739
740    // Check for quoted start
741    if ! lookbehind_str.is_empty() {
742        if quotation_matches(lookbehind_str, content) {
743            offset +=
744                lookbehind_str.chars().count() + markup_start_str.chars().count() + 1;
745
746            let quoted_start_string: String = captures
747                .get(0)
748                .unwrap()
749                .as_str()
750                .chars()
751                .take(offset)
752                .collect();
753            return ( vec![ TreeNodeType::Text { text: quoted_start_string }], offset );
754        } else {
755            node_data.push(TreeNodeType::Text { text: String::from(lookbehind_str) });
756            offset += lookbehind_str.chars().count();
757        }
758    }
759
760    let citation_ref = TreeNodeType::CitationReference {
761        displayed_text: content.trim().to_string(),
762        target_label: content.trim().to_string()
763    };
764    node_data.push(citation_ref);
765    offset += markup_start_str.chars().count()
766        + content.chars().count()
767        + markup_end_str.chars().count();
768
769    (node_data, offset)
770}
771
772/// Parses inline subsitution references. Also adds hyperlink information to the reference,
773/// if the matched string ended with a `__?`.
774pub fn substitution_ref(
775    opt_doctree_ref: &mut Option<&mut DocTree>,
776    pattern_name: Pattern,
777    captures: &regex::Captures,
778) -> (Vec<TreeNodeType>, usize) {
779    let lookbehind_str = if let Some(lookbehind) = captures.name("lookbehind") {
780        lookbehind.as_str()
781    } else {
782        ""
783    };
784    let markup_start_str = captures.name("markup_start").unwrap().as_str();
785    let content = captures.name("content").unwrap().as_str();
786    let ref_type = if let Some(ref_type_str) = captures.name("ref_type") {
787        ref_type_str.as_str()
788    } else {
789        ""
790    };
791    let markup_end_str = captures.name("markup_end").unwrap().as_str();
792    let lookahead_str = if let Some(lookahead) = captures.name("lookahead") {
793        lookahead.as_str()
794    } else {
795        ""
796    };
797
798    if quotation_matches(lookbehind_str, content) {
799        let quoted_start_char_count =
800            lookbehind_str.chars().count() + markup_start_str.chars().count() + 1;
801
802        let quoted_start_string: String = captures
803            .get(0)
804            .unwrap()
805            .as_str()
806            .chars()
807            .take(quoted_start_char_count)
808            .collect();
809
810        return (
811            vec![TreeNodeType::Text {
812                text: quoted_start_string,
813            }],
814            quoted_start_char_count,
815        );
816    } else if !lookbehind_str.is_empty() {
817        return (
818            vec![TreeNodeType::Text {
819                text: unicode_text_to_latex(lookbehind_str),
820            }],
821            lookbehind_str.chars().count(),
822        );
823    }
824
825    let target_label = if !ref_type.is_empty() {
826        let target_label: String = match ref_type {
827            "__" => {
828                // Automatic reference label => ask doctree for label, if present. Else use the manual label
829
830                if let Some(doctree) = opt_doctree_ref {
831                    doctree.next_anon_reference_label()
832                } else {
833                    eprintln!("Warning: detected an automatic reference name but no doctree available to generate one...");
834                    normalize_refname(content)
835                }
836            }
837            "_" => {
838                // Manual reference label
839                normalize_refname(content)
840            }
841            _ => unreachable!(
842                "Only automatic or manual reference types are recognized. Computer says no..."
843            ),
844        };
845
846        Some(target_label)
847    } else {
848        None
849    };
850
851    let substitution_ref_node = TreeNodeType::SubstitutionReference {
852        substitution_label: normalize_refname(content),
853        target_label: target_label,
854    };
855
856    let match_len =
857        (lookbehind_str.to_string() + markup_start_str + content + markup_end_str + ref_type)
858            .chars()
859            .count();
860
861    (vec![substitution_ref_node], match_len)
862}
863
864/// Parses inline URIs. These are split into general URIs and standalone email addresses.
865/// These two are differentiated by whether the URI starts with a protocol scheme,
866/// such as `https://`.
867pub fn uri(
868    opt_doctree_ref: &mut Option<&mut DocTree>,
869    pattern_name: Pattern,
870    captures: &regex::Captures,
871) -> (Vec<TreeNodeType>, usize) {
872    let whole_match = captures.get(0).unwrap().as_str();
873    let lookbehind_str = if let Some(lookbehind) = captures.name("lookbehind") {
874        lookbehind.as_str()
875    } else {
876        ""
877    };
878    let content = captures.name("content").unwrap().as_str();
879    let scheme_str = if let Some(scheme) = captures.name("scheme") {
880        scheme.as_str()
881    } else {
882        ""
883    };
884    let authority = if let Some(authority) = captures.name("authority") {
885        authority.as_str()
886    } else {
887        ""
888    };
889    let userinfo = if let Some(userinfo) = captures.name("userinfo") {
890        userinfo.as_str()
891    } else {
892        ""
893    };
894    let host = if let Some(host) = captures.name("host") {
895        host.as_str()
896    } else {
897        ""
898    };
899    let port = if let Some(port) = captures.name("port") {
900        port.as_str()
901    } else {
902        ""
903    };
904    let path = if let Some(path) = captures.name("path") {
905        path.as_str()
906    } else {
907        ""
908    };
909    let query = if let Some(query) = captures.name("query") {
910        query.as_str()
911    } else {
912        ""
913    };
914    let fragment = if let Some(fragment) = captures.name("fragment") {
915        fragment.as_str()
916    } else {
917        ""
918    };
919    let email_str = if let Some(email) = captures.name("email") {
920        email.as_str()
921    } else {
922        ""
923    };
924    let lookahead_str = if let Some(lookahead) = captures.name("lookahead") {
925        lookahead.as_str()
926    } else {
927        ""
928    };
929
930    if !lookbehind_str.is_empty() {
931        return (
932            vec![TreeNodeType::Text {
933                text: unicode_text_to_latex(lookbehind_str),
934            }],
935            lookbehind_str.chars().count(),
936        );
937    }
938
939    let mut is_valid = true;
940
941    let data = if scheme_str.is_empty() {
942        // If no email when missing a scheme, simply return match as string
943        if email_str.is_empty() {
944            let data = TreeNodeType::Text {
945                text: String::from(email_str),
946            };
947
948            return (vec![data], email_str.chars().count());
949        }
950        TreeNodeType::Reference {
951            displayed_text: None,
952            reference: Reference::EMail(String::from(content)),
953        }
954    } else {
955        // Validity checks
956
957        if !authority.is_empty() {
958            let has_slash = if let Some(c) = path.chars().next() {
959                if c == '/' {
960                    true
961                } else {
962                    false
963                }
964            } else {
965                false
966            };
967
968            if !path.is_empty() && !has_slash {
969                eprintln!("URI {} has an autority field and a non-empty path that doesn't start with a '/'. URI invalid.", whole_match);
970                is_valid = false;
971            }
972        }
973
974        // If URI is valid, return it as URI, else as text
975        if is_valid {
976            TreeNodeType::Reference {
977                displayed_text: None,
978                reference: Reference::URI(content.to_string()),
979            }
980        } else {
981            TreeNodeType::Text {
982                text: String::from(content),
983            }
984        }
985    };
986
987    let match_len = content.chars().count();
988    (vec![data], match_len)
989}
990
991/// This function is invoked when no other inline pattern matched.
992/// Eats up any consequent non-whitespace characters as a single
993/// word of "text".
994pub fn text(
995    opt_doctree_ref: &mut Option<&mut DocTree>,
996    pattern_name: Pattern,
997    captures: &regex::Captures,
998) -> (Vec<TreeNodeType>, usize) {
999    let content = captures.get(0).unwrap().as_str();
1000    let match_len = content.chars().count();
1001
1002    let unicode_text_escape = true; // TODO: Transform this to a compiler flag
1003    let content_string = if unicode_text_escape {
1004        unicode_text_to_latex(content)
1005    } else {
1006        content.to_string()
1007    };
1008
1009    let node_data = TreeNodeType::Text {
1010        text: content_string,
1011    };
1012    (vec![node_data], match_len)
1013}
1014
1015// =======================
1016//  Constants and helpers
1017// =======================
1018
1019/// Checks the two given string slices for matching reStructuredText quotation characters.
1020fn quotation_matches(start: &str, end: &str) -> bool {
1021    for (i, c) in OPENERS.iter().enumerate() {
1022        if start.ends_with(*c) && end.starts_with(*CLOSERS.get(i).unwrap()) {
1023            return true;
1024        }
1025    }
1026
1027    for c in DELIMITERS.iter() {
1028        if start.ends_with(*c) && end.starts_with(*c) {
1029            return true;
1030        }
1031    }
1032
1033    false
1034}
1035
/// The characters recognized as "quotation openers".
///
/// The table is positional: the entry at index `i` is matched against
/// the entry of `CLOSERS` at the same index, so the two tables must be kept
/// equally long and in the same order. Some characters appear more than once,
/// because a single opener can be paired with several different closers.
///
/// source: https://sourceforge.net/p/docutils/code/HEAD/tree/trunk/docutils/docutils/utils/punctuation_chars.py#l46
const OPENERS: &[char] = &[
    // ASCII openers
    '"', '\'', '(', '<', '\\', '[', '{',
    // Non-ASCII openers
    '\u{0f3a}', '\u{0f3c}', '\u{169b}', '\u{2045}',
    '\u{207d}', '\u{208d}', '\u{2329}', '\u{2768}', '\u{276a}', '\u{276c}', '\u{276e}', '\u{2770}',
    '\u{2772}', '\u{2774}', '\u{27c5}', '\u{27e6}', '\u{27e8}', '\u{27ea}', '\u{27ec}', '\u{27ee}',
    '\u{2983}', '\u{2985}', '\u{2987}', '\u{2989}', '\u{298b}', '\u{298d}', '\u{298f}', '\u{2991}',
    '\u{2993}', '\u{2995}', '\u{2997}', '\u{29d8}', '\u{29da}', '\u{29fc}', '\u{2e22}', '\u{2e24}',
    '\u{2e26}', '\u{2e28}', '\u{3008}', '\u{300a}', '\u{300c}', '\u{300e}', '\u{3010}', '\u{3014}',
    '\u{3016}', '\u{3018}', '\u{301a}', '\u{301d}', '\u{301d}', '\u{fd3e}', '\u{fe17}', '\u{fe35}',
    '\u{fe37}', '\u{fe39}', '\u{fe3b}', '\u{fe3d}', '\u{fe3f}', '\u{fe41}', '\u{fe43}', '\u{fe47}',
    '\u{fe59}', '\u{fe5b}', '\u{fe5d}', '\u{ff08}', '\u{ff3b}', '\u{ff5b}', '\u{ff5f}', '\u{ff62}',
    '\u{00ab}', '\u{2018}', '\u{201c}', '\u{2039}', '\u{2e02}', '\u{2e04}', '\u{2e09}', '\u{2e0c}',
    '\u{2e1c}', '\u{2e20}', '\u{201a}', '\u{201e}', '\u{00bb}', '\u{2019}', '\u{201d}', '\u{203a}',
    '\u{2e03}', '\u{2e05}', '\u{2e0a}', '\u{2e0d}', '\u{2e1d}', '\u{2e21}', '\u{201b}', '\u{201f}',
    // Additional quote pairings, such as the Swedish-style quotes
    '\u{00bb}', '\u{2018}', '\u{2019}', '\u{201a}', '\u{201a}', '\u{201c}', '\u{201e}', '\u{201e}',
    '\u{201d}', '\u{203a}',
];
1056
/// The characters recognized as "quotation closers".
///
/// The table is positional: the entry at index `i` is the closer that belongs to
/// the entry of `OPENERS` at the same index, so the two tables must be kept
/// equally long and in the same order.
///
/// source: https://sourceforge.net/p/docutils/code/HEAD/tree/trunk/docutils/docutils/utils/punctuation_chars.py#l56
const CLOSERS: &[char] = &[
    // ASCII closers
    '"', '\'', ')', '>', '\\', ']', '}',
    // Non-ASCII closers
    '\u{0f3b}', '\u{0f3d}', '\u{169c}', '\u{2046}',
    '\u{207e}', '\u{208e}', '\u{232a}', '\u{2769}', '\u{276b}', '\u{276d}', '\u{276f}', '\u{2771}',
    '\u{2773}', '\u{2775}', '\u{27c6}', '\u{27e7}', '\u{27e9}', '\u{27eb}', '\u{27ed}', '\u{27ef}',
    '\u{2984}', '\u{2986}', '\u{2988}', '\u{298a}', '\u{298c}', '\u{298e}', '\u{2990}', '\u{2992}',
    '\u{2994}', '\u{2996}', '\u{2998}', '\u{29d9}', '\u{29db}', '\u{29fd}', '\u{2e23}', '\u{2e25}',
    '\u{2e27}', '\u{2e29}', '\u{3009}', '\u{300b}', '\u{300d}', '\u{300f}', '\u{3011}', '\u{3015}',
    '\u{3017}', '\u{3019}', '\u{301b}', '\u{301e}', '\u{301f}', '\u{fd3f}', '\u{fe18}', '\u{fe36}',
    '\u{fe38}', '\u{fe3a}', '\u{fe3c}', '\u{fe3e}', '\u{fe40}', '\u{fe42}', '\u{fe44}', '\u{fe48}',
    '\u{fe5a}', '\u{fe5c}', '\u{fe5e}', '\u{ff09}', '\u{ff3d}', '\u{ff5d}', '\u{ff60}', '\u{ff63}',
    '\u{00bb}', '\u{2019}', '\u{201d}', '\u{203a}', '\u{2e03}', '\u{2e05}', '\u{2e0a}', '\u{2e0d}',
    '\u{2e1d}', '\u{2e21}', '\u{201b}', '\u{201f}', '\u{00ab}', '\u{2018}', '\u{201c}', '\u{2039}',
    '\u{2e02}', '\u{2e04}', '\u{2e09}', '\u{2e0c}', '\u{2e1c}', '\u{2e20}', '\u{201a}', '\u{201e}',
    // Closers of the additional quote pairings (Swedish, Albanian, etc.)
    '\u{00bb}', '\u{201a}', '\u{2019}', '\u{2018}', '\u{2019}', '\u{201e}', '\u{201c}', '\u{201d}',
    '\u{201d}', '\u{203a}',
];
1077
/// A long list of general delimiter characters.
///
/// Only code points that fit into four hexadecimal digits are listed. Wider code points
/// have been excluded because of Rust limitations on unicode digits, for now.
/// The docutils parser supports even those, so a solution might have to be invented.
///
/// NOTE(review): the many repeated `'-'` entries look like the range separators of the
/// upstream regular expression character class (`start`, `'-'`, `end`). If so, the code points
/// strictly between `start` and `end` are not part of this table. Confirm against the source
/// below before relying on this table for range membership.
///
/// source: https://sourceforge.net/p/docutils/code/HEAD/tree/trunk/docutils/docutils/utils/punctuation_chars.py#l66
const DELIMITERS: &[char] = &[
    // ASCII entries
    '"', '\\', '-', '/', ':', '}',
    // Non-ASCII entries; each `start, '-', end` triple is kept on a single line
    '\u{058a}', '\u{00a1}', '\u{00b7}', '\u{00bf}', '\u{037e}', '\u{0387}',
    '\u{055a}', '-', '\u{055f}', '\u{0589}', '\u{05be}', '\u{05c0}', '\u{05c3}', '\u{05c6}',
    '\u{05f3}', '\u{05f4}', '\u{0609}', '\u{060a}', '\u{060c}', '\u{060d}', '\u{061b}', '\u{061e}',
    '\u{061f}', '\u{066a}', '-', '\u{066d}', '\u{06d4}', '\u{0700}', '-', '\u{070d}',
    '\u{07f7}', '-', '\u{07f9}', '\u{0830}', '-', '\u{083e}', '\u{0964}', '\u{0965}', '\u{0970}',
    '\u{0df4}', '\u{0e4f}', '\u{0e5a}', '\u{0e5b}', '\u{0f04}', '-', '\u{0f12}', '\u{0f85}',
    '\u{0fd0}', '-', '\u{0fd4}', '\u{104a}', '-', '\u{104f}', '\u{10fb}',
    '\u{1361}', '-', '\u{1368}', '\u{1400}', '\u{166d}', '\u{166e}', '\u{16eb}', '-', '\u{16ed}',
    '\u{1735}', '\u{1736}', '\u{17d4}', '-', '\u{17d6}', '\u{17d8}', '-', '\u{17da}',
    '\u{1800}', '-', '\u{180a}', '\u{1944}', '\u{1945}', '\u{19de}', '\u{19df}', '\u{1a1e}',
    '\u{1a1f}', '\u{1aa0}', '-', '\u{1aa6}', '\u{1aa8}', '-', '\u{1aad}',
    '\u{1b5a}', '-', '\u{1b60}', '\u{1c3b}', '-', '\u{1c3f}', '\u{1c7e}', '\u{1c7f}', '\u{1cd3}',
    '\u{2010}', '-', '\u{2017}', '\u{2020}', '-', '\u{2027}', '\u{2030}', '-', '\u{2038}',
    '\u{203b}', '-', '\u{203e}', '\u{2041}', '-', '\u{2043}', '\u{2047}', '-', '\u{2051}',
    '\u{2053}', '\u{2055}', '-', '\u{205e}', '\u{2cf9}', '-', '\u{2cfc}', '\u{2cfe}', '\u{2cff}',
    '\u{2e00}', '\u{2e01}', '\u{2e06}', '-', '\u{2e08}', '\u{2e0b}', '\u{2e0e}', '-', '\u{2e1b}',
    '\u{2e1e}', '\u{2e1f}', '\u{2e2a}', '-', '\u{2e2e}', '\u{2e30}', '\u{2e31}',
    '\u{3001}', '-', '\u{3003}', '\u{301c}', '\u{3030}', '\u{303d}', '\u{30a0}', '\u{30fb}',
    '\u{a4fe}', '\u{a4ff}', '\u{a60d}', '-', '\u{a60f}', '\u{a673}', '\u{a67e}',
    '\u{a6f2}', '-', '\u{a6f7}', '\u{a874}', '-', '\u{a877}', '\u{a8ce}', '\u{a8cf}',
    '\u{a8f8}', '-', '\u{a8fa}', '\u{a92e}', '\u{a92f}', '\u{a95f}', '\u{a9c1}', '-', '\u{a9cd}',
    '\u{a9de}', '\u{a9df}', '\u{aa5c}', '-', '\u{aa5f}', '\u{aade}', '\u{aadf}', '\u{abeb}',
    '\u{fe10}', '-', '\u{fe16}', '\u{fe19}', '\u{fe30}', '-', '\u{fe32}', '\u{fe45}', '\u{fe46}',
    '\u{fe49}', '-', '\u{fe4c}', '\u{fe50}', '-', '\u{fe52}', '\u{fe54}', '-', '\u{fe58}',
    '\u{fe5f}', '-', '\u{fe61}', '\u{fe63}', '\u{fe68}', '\u{fe6a}', '\u{fe6b}',
    '\u{ff01}', '-', '\u{ff03}', '\u{ff05}', '-', '\u{ff07}', '\u{ff0a}',
    '\u{ff0c}', '-', '\u{ff0f}', '\u{ff1a}', '\u{ff1b}', '\u{ff1f}', '\u{ff20}', '\u{ff3c}',
    '\u{ff61}', '\u{ff64}', '\u{ff65}',
];