rustla/parser/state_machine/
mod.rs

1/*!
2This module contains the `State` type and the different transition functions corresponding to each state
3in its submodules.
4
5Copyright © 2020 Santtu Söderholm
6*/
7
8// ===============================================
9// Submodules for namespacing transition functions
10// ===============================================
11pub mod aplus;
12pub mod aplus_questionnaire;
13pub mod block_quote;
14pub mod body;
15pub mod bullet_list;
16pub mod common;
17pub mod definition_list;
18pub mod enumerated_list;
19pub mod field_list;
20pub mod inline;
21pub mod literal_block;
22pub mod transitions;
23pub mod unknown_transitions;
24
25use std::collections::HashMap;
26use lazy_static::lazy_static;
27use regex;
28
29use super::*;
30
/// An enum of parser states.
///
/// The variants are used as keys to the static `TRANSITION_MAP`, which stores vectors of
/// transitions as values. The type is a plain fieldless enum, so it is cheap to copy
/// and can be hashed and compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum State {

    /// A state for parsing body nodes inside admonitions.
    Admonition,

    /// A state for detecting reStructuredText & Sphinx body elements,
    /// in addition to column breaks in the form of `::newcol` for A+ nodes that support them.
    /// These include the Point of Interest directive.
    AplusMultiCol,

    /// A state for recognizing the sub-directives:
    /// 1. `pick-one`,
    /// 2. `pick-any` and
    /// 3. `freetext`
    AplusQuestionnaire,

    /// A state for detecting choices and assignments inside an A+ questionnaire
    /// subdirective `pick-one`.
    AplusPickOne,

    /// A state for detecting choices and assignments inside an A+ questionnaire
    /// subdirective `pick-any`.
    AplusPickAny,

    /// A state for recognizing body elements such as lists or footnotes when focused on document root.
    Body,

    /// A state for detecting body elements inside a section.
    Section,

    /// A state for recognizing body elements inside a block quote.
    /// In addition to normal body elements, attributions are also
    /// recognized as such in this state.
    BlockQuote,

    /// In this state, the parser only recognizes empty lines and bullet list items.
    BulletList,

    /// Citation nodes may contain arbitrary body elements.
    /// This state is therefore reserved for recognizing them when focused on a citation node.
    Citation,

    /// Definition lists may only contain empty lines and definition list items.
    DefinitionList,

    /// When in this state, the parser only recognizes empty lines and enumerated list items.
    EnumeratedList,

    /// A state associated with hyperlink targets.
    ///
    /// NOTE(review): no transition table is registered for this state in `TRANSITION_MAP`
    /// in this file; confirm where (and whether) it is used.
    HyperlinkTarget,

    /// List items of any type, such as enumerated or field list items can contain arbitrary body elements.
    /// This state is reserved for recognizing them when focused on one of the list item type nodes.
    ListItem,

    /// When focused on a field list node, the parser only recognizes empty lines and field list items.
    FieldList,

    /// A state for parsing the contents of a figure.
    /// `State::get_transitions` serves this state with the `Body` transition table.
    Figure,

    /// Footnotes can contain arbitrary body elements.
    /// This state is reserved for recognizing them when focused on a footnote node.
    Footnote,

    /// There are 3 different types of hyperlink targets:
    ///
    /// 1. *internal*, which link to body elements that directly follow them,
    /// 2. *external*, that reference external URIs and
    /// 3. *indirect*, which reference other hyperlink targets inside the same document.
    ///
    /// ??? Normally, an external or indirect hyperlink target would simply be a node on its own, that simply contains a reference label
    /// of some kind. However, chained *internal* hyperlinks all reference the same target node,
    /// so a state of its own (this one) is reserved for parsing them until a node of a different kind (including other types
    /// of hyperlink targets) is encountered. Once this happens, all of the internal hyperlinks are set to point
    /// to this same target node. ???
    InternalHyperlinkTarget,

    /// When focused on an option list, only empty lines and option list items are recognized.
    /// This state is reserved for that purpose.
    OptionList,

    /// Empty and line block lines (lines beginning with '`|`') are recognized in this state.
    LineBlock,

    /// A state for recognizing bullet list items inside a ListTable.
    ListTable,

    /// A state for parsing field lists inside directives. Field lists located inside directive nodes
    /// work as directive parameters or settings.
    ExtensionOptions,

    /// A state for parsing section titles and document transitions (a.k.a. `\hrulefill` commands in LaTeX terms).
    Line,

    /// A state for parsing empty lines and literal blocks of text.
    /// Literal blocks are (non-contiguous) indented or "quoted" blocks of text that
    /// are preceded by a paragraph ending in a `::`.
    LiteralBlock,

    /// An explicit failure state. Allows explicit signalling of transition failures.
    Failure,

    /// An End of File state. Could have also been named EOI, as in end of input,
    /// as this state is transitioned to when a parser reaches the end of its source input:
    /// This does not necessarily correspond to the end of the given file during nested parsing sessions,
    /// as nested parsers are usually limited to parsing a single block of text behind a node identifier.
    EOF,
}
142
143// ====================
144// Statemachine methods
145// ====================
146impl State {
147
148    /// Transitions a `StateMachine` into a `Failure` state using the From trait,
149    /// the implementation of which automatically implements the Into trait.
150    pub fn to_failure(self) -> Self {
151        match self {
152            _ => State::Failure,
153        }
154    }
155
156    /// Retrieves the list of transitions based on a given `StateMachine` variant
157    /// using a `match` statement. First checks for end states that don't contain transitions,
158    /// such as `EOF` or `Failure` and if these are not matched,
159    /// retrieves a list of transitions from the `TRANSITION_MAP`.
160    pub fn get_transitions(
161        &self,
162        line_cursor: &LineCursor,
163    ) -> Result<&Vec<Transition>, &'static str> {
164        match self {
165            State::EOF => Err("Already moved past EOF. No transitions to perform.\n"),
166            State::Failure => Err("Failure state has no transitions\n"),
167            State::Section
168            | State::ListItem
169            | State::Footnote
170            | State::Citation
171            | Self::Admonition
172            | Self::Figure => Ok(TRANSITION_MAP.get(&State::Body).unwrap()),
173            _ => {
174                if let Some(transition_table) = TRANSITION_MAP.get(self) {
175                    Ok(transition_table)
176                } else {
177                    panic!(
178                        "Found no transition table for state {:#?} on line {}",
179                        self,
180                        line_cursor.sum_total()
181                    )
182                }
183            }
184        }
185    }
186}
187
188/// =================================
189/// StateMachine associated functions
190/// =================================
191impl State {
192
193    /// Takes in a reference/slice to an associated array of uncompiled transitions
194    /// and compiles the regex patterns found. Returns a `Vec<Transition>` with compiled state machines
195    /// in palce of the regex patterns.
196    ///
197    /// Error handling needs to be added.
198    fn compile_state_transitions(transitions: &[UncompiledTransition]) -> Vec<Transition> {
199        let mut compiled_transitions = Vec::with_capacity(transitions.len());
200
201        for (pat_name, expr, fun) in transitions.iter() {
202            let r = regex::Regex::new(expr).unwrap();
203            compiled_transitions.push((*pat_name, r, *fun));
204        }
205
206        compiled_transitions
207    }
208}
209
/// =================================
/// StateMachine associated constants
/// =================================
///
/// This `impl` block is intentionally empty in this file.
/// NOTE(review): the `*_TRANSITIONS` associated constants referenced as `State::…` when
/// building `TRANSITION_MAP` are not defined here — presumably they live in the submodules
/// declared at the top of this file; confirm.
impl State {}
214
215lazy_static! {
216
217  /// A static map of transititions for each `State` of the `Parser`.
218  ///
219  /// With this regexes are only compiled into automata once.
220  pub static ref TRANSITION_MAP: HashMap<State, Vec<(Pattern, regex::Regex, TransitionMethod)>> = {
221
222    let mut action_map = collections::HashMap::with_capacity(10);
223
224    let body_actions = State::compile_state_transitions(&State::BODY_TRANSITIONS);
225    action_map.insert(State::Body, body_actions);
226
227    let block_quote_actions = State::compile_state_transitions(&State::BLOCK_QUOTE_TRANSITIONS);
228    action_map.insert(State::BlockQuote, block_quote_actions);
229
230    let bullet_actions = State::compile_state_transitions(&State::BULLET_LIST_TRANSITIONS);
231    action_map.insert(State::BulletList, bullet_actions);
232
233    let definition_actions = State::compile_state_transitions(&State::DEFINITION_LIST_TRANSITIONS);
234    action_map.insert(State::DefinitionList, definition_actions);
235
236    let enumerated_actions = State::compile_state_transitions(&State::ENUMERATED_LIST_TRANSITIONS);
237    action_map.insert(State::EnumeratedList, enumerated_actions);
238
239    let field_actions = State::compile_state_transitions(&State::FIELD_LIST_TRANSITIONS);
240    action_map.insert(State::FieldList, field_actions);
241
242    let option_actions = State::compile_state_transitions(&State::OPTION_LIST_TRANSITIONS);
243    action_map.insert(State::OptionList, option_actions);
244
245    let line_block_actions = State::compile_state_transitions(&State::LINE_BLOCK_TRANSITIONS);
246    action_map.insert(State::LineBlock, line_block_actions);
247
248    let literal_block_actions = State::compile_state_transitions(&State::LITERAL_BLOCK_TRANSITIONS);
249    action_map.insert(State::LiteralBlock, literal_block_actions);
250
251    let extension_option_actions = State::compile_state_transitions(&State::EXTENSION_OPTION_TRANSITIONS);
252    action_map.insert(State::ExtensionOptions, extension_option_actions);
253
254    let line_actions = State::compile_state_transitions(&State::LINE_TRANSITIONS);
255    action_map.insert(State::Line, line_actions);
256
257    let list_table_actions = State::compile_state_transitions(&State::LIST_TABLE_TRANSITIONS);
258    action_map.insert(State::ListTable, list_table_actions);
259
260    // A+
261    let aplus_multicol_actions = State::compile_state_transitions(&State::APLUS_MULTICOL_TRANSITIONS);
262    action_map.insert(State::AplusMultiCol, aplus_multicol_actions);
263
264    let aplus_questionnaire_actions = State::compile_state_transitions(&State::APLUS_QUESTIONNAIRE_TRANSITIONS);
265    action_map.insert(State::AplusQuestionnaire, aplus_questionnaire_actions);
266
267    action_map
268
269  };
270
271  /// Inline text has different parsing requirements than (nested)
272  /// `Body` elements as they do not form blocks of text,
273  /// making detecting by source line impractical.
274  ///
275  /// Instead, a block of source text is given to `Parser::parse_inline_nodes`
276  /// which is then scanned with regular expressions.
277  pub static ref COMPILED_INLINE_TRANSITIONS: Vec<(Pattern, regex::Regex, InlineParsingMethod)> = {
278
279    let mut inline_transitions = Vec::with_capacity(State::INLINE_TRANSITIONS.len());
280
281    for (pat_name, expr, fun) in State::INLINE_TRANSITIONS.iter() {
282      let r = regex::Regex::new(expr).unwrap();
283      inline_transitions.push((*pat_name, r, *fun));
284    }
285
286    inline_transitions
287  };
288}
289
290impl Parser {
291    /// Checks whether the line following the current one allows for the construction of an enumerate list item.
292    /// Either the following line has to be blank, indented or the next enumerator in
293    /// the current list has to be constructable from it.
294    fn is_enumerated_list_item(
295        src_lines: &Vec<String>,
296        line_cursor: &mut LineCursor,
297        captures: &regex::Captures,
298        section_level: &mut usize,
299        base_indent: usize,
300        detected_enumerator_indent: usize,
301        detected_number: usize,
302        detected_kind: EnumKind,
303        detected_delims: EnumDelims,
304        pattern_name: &Pattern,
305        list_kind: &EnumKind,
306        in_list_item: bool,
307        list_item_number: usize,
308        list_start_index: usize,
309    ) -> bool {
310
311        use crate::parser::automata::ENUMERATOR_AUTOMATON;
312
313        if let Some(next_line) = src_lines.get(line_cursor.relative_offset() + 1) {
314            let next_line_indent = next_line
315                .chars()
316                .take_while(|c| c.is_whitespace())
317                .count() + base_indent;
318
319            if next_line.trim().is_empty() || next_line_indent > detected_enumerator_indent {
320                return true
321            } else if next_line_indent == detected_enumerator_indent {
322                if let Some(next_captures) = ENUMERATOR_AUTOMATON.captures(next_line) {
323                    let (next_number, next_kind, next_delims) = match converters::enum_captures_to_int_kind_and_delims(
324                        &next_captures,
325                        Some(list_kind),
326                        in_list_item,
327                        true,
328                        list_item_number,
329                        list_start_index
330                    ) {
331                        Some((number, kind, delims)) => (number, kind, delims),
332                        None => return false
333                    };
334                    if ! (
335                        next_number == detected_number + 1
336                        && next_kind == detected_kind
337                        && next_delims == detected_delims
338                    ) {
339                        eprintln!("Non-matching enumerator on next line...");
340                        return false
341                    } else {
342                        true
343                    }
344                } else {
345                    return false
346                }
347            } else {
348                true
349            }
350        } else {
351            true
352        }
353    }
354}