1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
/*!
This module contains the `State` type and the different transition functions corresponding to each state
in its submodules.
Copyright © 2020 Santtu Söderholm
*/
// ===============================================
// Submodules for namespacing transition functions
// ===============================================
pub mod aplus;
pub mod aplus_questionnaire;
pub mod block_quote;
pub mod body;
pub mod bullet_list;
pub mod common;
pub mod definition_list;
pub mod enumerated_list;
pub mod field_list;
pub mod inline;
pub mod literal_block;
pub mod transitions;
pub mod unknown_transitions;
use std::collections::HashMap;
use lazy_static::lazy_static;
use regex;
use super::*;
/// An enum of states.
/// The variants are used as keys to the static `TRANSITION_MAP`, which stores vectors of
/// transitions as values.
#[derive(Debug, PartialEq, Eq, Hash)]
pub enum State {
/// A state for parsing body nodes inside admonitions.
Admonition,
/// A state for detecting reStructuredText & Sphinx body elements,
/// in addition to column breaks in the form of `::newcol` for A+ nodes that support them.
/// These include the Point of Interest directive.
AplusMultiCol,
/// A state for recognizing the sub-directives:
/// 1. `pick-one`,
/// 2. `pick-any` and
/// 3. `freetext`
AplusQuestionnaire,
/// A state for detecting choices and assignments inside a A+ questionnaire
/// subdirective `pick-one`.
AplusPickOne,
/// A state for detecting choices and assignments inside a A+ questionnaire
/// subdirective `pick-any`.
AplusPickAny,
/// A state for recognizing body elements such as lists or footnotes when focused on document root.
Body,
/// A state for detecting body elements inside a section.
Section,
/// A state for recognizing body elements inside a block quote.
/// In addition to normal body elements, attributions are also
/// recognized as such in this state.
BlockQuote,
/// In this state, the parser only recognizes empty lines and bullet list items.
BulletList,
/// Citation nodes may contain arbitrary body elements.
/// This state is therefore reserved for recognizing them when focused on a citation node.
Citation,
/// Definition lists may only contain empty lines and definition list items.
DefinitionList,
/// When in this state, the parser only recognizes empty lines and enumerated list items.
EnumeratedList,
HyperlinkTarget,
/// List items of any type, such as enumerated or field list items can contain arbitrary body elements.
/// This state is reserved for recognizing them when focused on one of the list item type nodes.
ListItem,
/// When focused on a field list node, the parser only recognizes empty lines and field list items.
FieldList,
Figure,
/// Footnotes can contain arbitrary body elements.
/// This state is reserved for recognizing them when focused on a footnote node.
Footnote,
/// There are 3 different types of hyperlink targets:
///
/// 1. *internal*, which link to body elements that directly follow them,
/// 2. *external*, that reference external URIs and
/// 3. *indirect*, which reference other hyperlink targets inside the same document.
///
/// ??? Normally, an external or indirect hyperlink target would simply be a node on its own, that simply contains a reference label
/// of some kind. However, chained *internal* hyperlinks all reference the same target node,
/// so a state of its own (this one) is reserved for parsing them until a node of a different kind (including other types
/// of hyperlink targets) is encountered. Once this happens, all of the internal hyperlinks are set to point
/// to this same target node. ???
InternalHyperlinkTarget,
/// When focused on an option list, only empty lines and option list items are recognized.
/// This state is reserved for that purpose.
OptionList,
/// Empty and line block lines (lines beginning with '`|`') are recognized in this state.
LineBlock,
/// A state for recognizing bullet list items inside a ListTable
ListTable,
/// A state for parsing field lists inside diretives. Field lists located inside directive nodes
/// work as directive parameters or settings.
ExtensionOptions,
/// A state for parsing section titles and document transitions (a.k.a. `\hrulefill` commands in LaTeX terms).
Line,
/// A state for parsing empty lines and literal blocks of text.
/// Literal blocks are (non-contiguous) indented or "quoted" blocks of text that
/// are preceded by a paragraph ending in a `::`.
LiteralBlock,
/// An explicit failure state. Allows explicit signalling of transition failures.
Failure,
/// An End of File state. Could have also been named EOI, as in end of input,
/// as this state is transitioned to when a parser reaches the end of its source input:
/// This does not neecssarily correspond to the end of the given file during nested parsing sessions,
/// as nested parsers are usually limited to a parsijng single block of text behind a node indentifier.
EOF,
}
// ====================
// Statemachine methods
// ====================
impl State {
/// Transitions a `StateMachine` into a `Failure` state using the From trait,
/// the implementation of which automatically implements the Into trait.
pub fn to_failure(self) -> Self {
match self {
_ => State::Failure,
}
}
/// Retrieves the list of transitions based on a given `StateMachine` variant
/// using a `match` statement. First checks for end states that don't contain transitions,
/// such as `EOF` or `Failure` and if these are not matched,
/// retrieves a list of transitions from the `TRANSITION_MAP`.
pub fn get_transitions(
&self,
line_cursor: &LineCursor,
) -> Result<&Vec<Transition>, &'static str> {
match self {
State::EOF => Err("Already moved past EOF. No transitions to perform.\n"),
State::Failure => Err("Failure state has no transitions\n"),
State::Section
| State::ListItem
| State::Footnote
| State::Citation
| Self::Admonition
| Self::Figure => Ok(TRANSITION_MAP.get(&State::Body).unwrap()),
_ => {
if let Some(transition_table) = TRANSITION_MAP.get(self) {
Ok(transition_table)
} else {
panic!(
"Found no transition table for state {:#?} on line {}",
self,
line_cursor.sum_total()
)
}
}
}
}
}
/// =================================
/// StateMachine associated functions
/// =================================
impl State {
/// Takes in a reference/slice to an associated array of uncompiled transitions
/// and compiles the regex patterns found. Returns a `Vec<Transition>` with compiled state machines
/// in palce of the regex patterns.
///
/// Error handling needs to be added.
fn compile_state_transitions(transitions: &[UncompiledTransition]) -> Vec<Transition> {
let mut compiled_transitions = Vec::with_capacity(transitions.len());
for (pat_name, expr, fun) in transitions.iter() {
let r = regex::Regex::new(expr).unwrap();
compiled_transitions.push((*pat_name, r, *fun));
}
compiled_transitions
}
}
/// =================================
/// StateMachine associated constants
/// =================================
impl State {}
lazy_static! {
/// A static map of transititions for each `State` of the `Parser`.
///
/// With this regexes are only compiled into automata once.
pub static ref TRANSITION_MAP: HashMap<State, Vec<(Pattern, regex::Regex, TransitionMethod)>> = {
let mut action_map = collections::HashMap::with_capacity(10);
let body_actions = State::compile_state_transitions(&State::BODY_TRANSITIONS);
action_map.insert(State::Body, body_actions);
let block_quote_actions = State::compile_state_transitions(&State::BLOCK_QUOTE_TRANSITIONS);
action_map.insert(State::BlockQuote, block_quote_actions);
let bullet_actions = State::compile_state_transitions(&State::BULLET_LIST_TRANSITIONS);
action_map.insert(State::BulletList, bullet_actions);
let definition_actions = State::compile_state_transitions(&State::DEFINITION_LIST_TRANSITIONS);
action_map.insert(State::DefinitionList, definition_actions);
let enumerated_actions = State::compile_state_transitions(&State::ENUMERATED_LIST_TRANSITIONS);
action_map.insert(State::EnumeratedList, enumerated_actions);
let field_actions = State::compile_state_transitions(&State::FIELD_LIST_TRANSITIONS);
action_map.insert(State::FieldList, field_actions);
let option_actions = State::compile_state_transitions(&State::OPTION_LIST_TRANSITIONS);
action_map.insert(State::OptionList, option_actions);
let line_block_actions = State::compile_state_transitions(&State::LINE_BLOCK_TRANSITIONS);
action_map.insert(State::LineBlock, line_block_actions);
let literal_block_actions = State::compile_state_transitions(&State::LITERAL_BLOCK_TRANSITIONS);
action_map.insert(State::LiteralBlock, literal_block_actions);
let extension_option_actions = State::compile_state_transitions(&State::EXTENSION_OPTION_TRANSITIONS);
action_map.insert(State::ExtensionOptions, extension_option_actions);
let line_actions = State::compile_state_transitions(&State::LINE_TRANSITIONS);
action_map.insert(State::Line, line_actions);
let list_table_actions = State::compile_state_transitions(&State::LIST_TABLE_TRANSITIONS);
action_map.insert(State::ListTable, list_table_actions);
// A+
let aplus_multicol_actions = State::compile_state_transitions(&State::APLUS_MULTICOL_TRANSITIONS);
action_map.insert(State::AplusMultiCol, aplus_multicol_actions);
let aplus_questionnaire_actions = State::compile_state_transitions(&State::APLUS_QUESTIONNAIRE_TRANSITIONS);
action_map.insert(State::AplusQuestionnaire, aplus_questionnaire_actions);
action_map
};
/// Inline text has different parsing requirements than (nested)
/// `Body` elements as they do not form blocks of text,
/// making detecting by source line impractical.
///
/// Instead, a block of source text is given to `Parser::parse_inline_nodes`
/// which is then scanned with regular expressions.
pub static ref COMPILED_INLINE_TRANSITIONS: Vec<(Pattern, regex::Regex, InlineParsingMethod)> = {
let mut inline_transitions = Vec::with_capacity(State::INLINE_TRANSITIONS.len());
for (pat_name, expr, fun) in State::INLINE_TRANSITIONS.iter() {
let r = regex::Regex::new(expr).unwrap();
inline_transitions.push((*pat_name, r, *fun));
}
inline_transitions
};
}
impl Parser {
/// Checks whether the line following the current one allows for the construction of an enumerate list item.
/// Either the following line has to be blank, indented or the next enumerator in
/// the current list has to be constructable from it.
fn is_enumerated_list_item(
src_lines: &Vec<String>,
line_cursor: &mut LineCursor,
captures: ®ex::Captures,
section_level: &mut usize,
base_indent: usize,
detected_enumerator_indent: usize,
detected_number: usize,
detected_kind: EnumKind,
detected_delims: EnumDelims,
pattern_name: &Pattern,
list_kind: &EnumKind,
in_list_item: bool,
list_item_number: usize,
list_start_index: usize,
) -> bool {
use crate::parser::automata::ENUMERATOR_AUTOMATON;
if let Some(next_line) = src_lines.get(line_cursor.relative_offset() + 1) {
let next_line_indent = next_line
.chars()
.take_while(|c| c.is_whitespace())
.count() + base_indent;
if next_line.trim().is_empty() || next_line_indent > detected_enumerator_indent {
return true
} else if next_line_indent == detected_enumerator_indent {
if let Some(next_captures) = ENUMERATOR_AUTOMATON.captures(next_line) {
let (next_number, next_kind, next_delims) = match converters::enum_captures_to_int_kind_and_delims(
&next_captures,
Some(list_kind),
in_list_item,
true,
list_item_number,
list_start_index
) {
Some((number, kind, delims)) => (number, kind, delims),
None => return false
};
if ! (
next_number == detected_number + 1
&& next_kind == detected_kind
&& next_delims == detected_delims
) {
eprintln!("Non-matching enumerator on next line...");
return false
} else {
true
}
} else {
return false
}
} else {
true
}
} else {
true
}
}
}