1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
//! This contains the initial lexer, which identifies block-level structure (all tokens that
//! contribute to a block, newlines, etc.) All inline content is represented by `Token::Str`, but
//! is not parsed yet.
mod token;
use crate::string::{
read::{read_indentation, read_line, read_until, read_while},
roman::is_roman,
unicode::{is_alphabetic, is_blank, is_digit, is_line_ending},
};
pub(crate) use token::{OrderedListType, Token, TokenKind, UnorderedListType};
/// The lexer struct stores the current position and the input as bytes.
#[derive(Clone, Copy)]
pub(crate) struct Lexer<'a> {
    // Byte offset into `bytes` where the next token starts
    i: usize,
    // The whole input document as raw bytes
    bytes: &'a [u8],
}
impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `input`.
    pub(crate) fn new(input: &'a str) -> Self {
        Self {
            i: 0,
            bytes: input.as_bytes(),
        }
    }

    /// Handy function for getting a slice of the remaining bytes
    #[inline]
    fn remaining_bytes(&self) -> &[u8] {
        &self.bytes[self.i..]
    }

    /// Handy function for getting a slice of the remaining bytes up to (but not including) the
    /// first newline
    #[inline]
    fn current_line(&self) -> Option<&[u8]> {
        let linesize = read_line(self.remaining_bytes());
        self.bytes.get(self.i..self.i + linesize)
    }

    /// Identifies a soft break as a line ending character not followed by another line ending
    /// character. If the previous line ended with `\`, then return a hard break.
    fn line_break(&self) -> Option<(TokenKind, usize)> {
        match (self.bytes.get(self.i)?, self.bytes.get(self.i + 1)?) {
            (a, b) if is_line_ending(*a) && !is_line_ending(*b) => Some((TokenKind::SoftBreak, 1)),
            // The hard break consumes both the backslash and the line ending
            (b'\\', a) if is_line_ending(*a) => Some((TokenKind::HardBreak, 2)),
            _ => None,
        }
    }

    /// A blank line is a line ending, followed by optional "blank characters" and then another
    /// line ending
    fn blank_line(&self) -> Option<(TokenKind, usize)> {
        // First count one line ending
        if is_line_ending(*self.bytes.get(self.i)?) {
            // `i` is a temporary position marker
            let mut i = self.i;
            // `len` counts the number of bytes
            let mut len = 1usize;
            // Loop over bytes, increasing the temp position and token byte size until a condition
            // is met
            loop {
                i += 1;
                len += 1;
                match self.bytes.get(i) {
                    // Encountering another line ending means the end of this token
                    Some(c) if is_line_ending(*c) => break Some((TokenKind::Blankline, len)),
                    // Blank characters do nothing, but the token size is increased
                    Some(c) if is_blank(*c) => {}
                    // No remaining bytes = end of document. Return a blank line token, but don't
                    // count EOF towards the token size
                    None => break Some((TokenKind::Blankline, len - 1)),
                    // If another character is encountered, this is not a blank line
                    _ => break None,
                }
            }
        } else {
            None
        }
    }

    /// This identifies the indentation before a block. The token can appear after another token on
    /// the same line (e.g. indentation after a blockquote token.)
    ///
    /// The returned token also stores the level, which is the indentation size in spaces. Tabs are
    /// considered as 4 spaces per the CommonMark spec: https://spec.commonmark.org/0.30/#example-1
    fn indentation(&self) -> Option<(TokenKind, usize)> {
        // Read the byte size of any indentation
        let byte_size = read_indentation(self.remaining_bytes());
        if byte_size > 0 {
            // If indentation is found, determine the level by summing up the size in spaces
            let level = self.bytes[self.i..self.i + byte_size]
                .iter()
                .map_while(|c| match c {
                    b' ' => Some(1),
                    b'\t' => Some(4),
                    _ => None,
                })
                .sum();
            return Some((TokenKind::Indent(level), byte_size));
        }
        None
    }

    /// A thematic break is a line that contains at least three `*` or three `-`, along with
    /// optional blank characters.
    fn thematic_break(&self) -> Option<(TokenKind, usize)> {
        let line = self.current_line()?;
        let mut dashes = 0;
        let mut asteriskes = 0;
        for byte in line.iter() {
            match byte {
                b'-' => dashes += 1,
                b'*' => asteriskes += 1,
                c if is_blank(*c) => {}
                // Any other character disqualifies the line
                _ => return None,
            }
        }
        // We can't have both dashes and asteriskes, so check if the count of one is 0 and the
        // other is above 2
        match (dashes, asteriskes) {
            (n, 0) | (0, n) if n > 2 => Some((TokenKind::ThematicBreak, line.len())),
            _ => None,
        }
    }

    /// The way the headings work now is that they accept any number of `#` signs, as long as they
    /// are contiguous and followed by a space. Heading levels can only go up to 6, but this is
    /// taken care of later
    fn heading(&self) -> Option<(TokenKind, usize)> {
        let level = read_while(self.remaining_bytes(), |c| c == b'#');
        match self.bytes.get(self.i + level)? {
            // The token includes the blank character following the `#` run
            c if level > 0 && is_blank(*c) => Some((TokenKind::Heading(level), level + 1)),
            _ => None,
        }
    }

    /// If a line starts with 3 or more ``` characters, we have a code fence. Store the number of
    /// graves, as that is used to determine when we get a matching code fence.
    ///
    /// Note that a newline is not required after the fence, which means that any text after the
    /// graves can be interpreted as e.g. a language
    fn code_fence(&self) -> Option<(TokenKind, usize)> {
        // NOTE: previously took `&mut self` for no reason; all scanners are read-only
        let fence_count = read_while(self.remaining_bytes(), |c| c == b'`');
        if fence_count > 2 {
            Some((TokenKind::CodeFence(fence_count), fence_count))
        } else {
            None
        }
    }

    /// If a line starts with 3 or more `:` characters, we have a div fence. Just like with code
    /// fences, we store the number of colons used, to match the fences against each other.
    fn div_fence(&self) -> Option<(TokenKind, usize)> {
        let fence_count = read_while(self.remaining_bytes(), |c| c == b':');
        if fence_count > 2 {
            Some((TokenKind::DivFence(fence_count), fence_count))
        } else {
            None
        }
    }

    /// A blockquote is identified by a `>` character followed by a blank character. The token size
    /// will always be 2 bytes
    fn blockquote(&self) -> Option<(TokenKind, usize)> {
        match (self.bytes.get(self.i)?, self.bytes.get(self.i + 1)?) {
            (b'>', c) if is_blank(*c) => Some((TokenKind::Blockquote, 2)),
            _ => None,
        }
    }

    /// An attribute specifier is a line that starts with a `{` and ends with `}`.
    ///
    /// Note that this can also be a paragraph with inline formatting, but this is handled by the
    /// parser.
    fn attributes(&self) -> Option<(TokenKind, usize)> {
        let line = self.current_line()?;
        match (line.first()?, line.last()?) {
            (b'{', b'}') => Some((TokenKind::BlockAttributes, line.len())),
            _ => None,
        }
    }

    /// This finds both link definitions (`[label]: `) and footnote definitions (`[^label]`) and
    /// returns the corresponding token.
    ///
    /// NOTE: This method matches `[]: ` as a link definition. To me, this seems wrong, but the
    /// current (2022-11-29) official edition of Djot does the same, so I'm keeping it for now
    fn reference_definition(&self) -> Option<(TokenKind, usize)> {
        let bytes = self.remaining_bytes();
        // We need to match against at least three bytes
        match (bytes.first()?, bytes.get(1)?, bytes.get(2)?) {
            // Check for the sequence `[^`, but not `[^]`. The latter is interpreted as a regular
            // link definition.
            (b'[', b'^', c) if *c != b']' => {
                // Read until reaching a `]` character. `read_until` returned `end_id` from
                // `bytes[2..]`, so the label is exactly `end_id` bytes long.
                let end_id = read_until(b']', &bytes[2..]);
                // Link labels can have at most 999 characters
                if end_id < 1000 {
                    // If the `]` is followed by `: `, then we have found a definition
                    match (bytes.get(3 + end_id)?, bytes.get(4 + end_id)?) {
                        (b':', c) if is_blank(*c) => {
                            return Some((TokenKind::FootnoteDefinition, 5 + end_id))
                        }
                        _ => {}
                    }
                }
                None
            }
            (b'[', _, _) => {
                // Read until reaching a `]` character
                let end_id = read_until(b']', &bytes[1..]);
                // Link labels can have at most 999 characters.
                // BUGFIX: this previously used `< 999` (max 998 chars), inconsistent with both
                // the comment and the footnote branch above.
                if end_id < 1000 {
                    // If the `]` is followed by `: `, then we have found a definition
                    match (bytes.get(2 + end_id)?, bytes.get(3 + end_id)?) {
                        (b':', c) if is_blank(*c) => {
                            return Some((TokenKind::LinkDefinition, 4 + end_id))
                        }
                        _ => {}
                    }
                }
                None
            }
            _ => None,
        }
    }

    /// This is at the moment a very messy list marker finder. It almost works (the exception being
    /// that it does not differ wrt. case), but it needs a bit of simplification before I document
    /// it properly. Macros could be useful here
    fn list_marker(&self) -> Option<(TokenKind, usize)> {
        use OrderedListType::*;
        use TokenKind::*;
        use UnorderedListType::*;
        let line = self.current_line()?;
        match line.first()? {
            // Unordered
            b':' if *line.get(1)? == b' ' => Some((DefinitionList, 2)),
            c @ b'-' | c @ b'+' | c @ b'*' if matches!(line.get(1)?, b' ') => {
                // A `[ ] `/`[x] `/`[X] ` suffix after the marker makes this a task list item
                match (line.get(2)?, line.get(3)?, line.get(4)?, line.get(5)?) {
                    (b'[', b' ' | b'x' | b'X', b']', c) if is_blank(*c) => Some((TaskList, 6)),
                    _ => match c {
                        b'-' => Some((UnorderedList(Dash), 2)),
                        b'+' => Some((UnorderedList(Plus), 2)),
                        b'*' => Some((UnorderedList(Asterisk), 2)),
                        _ => None,
                    },
                }
            }
            // Ordered (TODO: Need to check case as well)
            b'(' => match line.get(1) {
                Some(c) if is_alphabetic(*c) && *line.get(2)? == b')' && *line.get(3)? == b' ' => {
                    Some((OrderedList(LowerEnclosed), 4))
                }
                Some(c) if is_roman(*c) => {
                    // The first roman numeral is at index 1; read any further numerals starting
                    // at index 2. Afterwards `i` indexes the byte right after the last numeral,
                    // which must be the closing paren.
                    let mut i = 2;
                    i += read_while(&line[i..], is_roman);
                    // BUGFIX: previously checked `line.get(i + 1)`/`line.get(i + 2)`, which is
                    // off by one and made enclosed roman markers like `(ii) ` never match.
                    if *line.get(i)? == b')' && *line.get(i + 1)? == b' ' {
                        Some((OrderedList(LowerRomanEnclosed), i + 2))
                    } else {
                        None
                    }
                }
                Some(c) if is_digit(*c) => {
                    // Same layout as the roman branch: `i` ends up on the closing paren
                    let mut i = 2;
                    i += read_while(&line[i..], is_digit);
                    // BUGFIX: same off-by-one as the roman branch; `(1) ` never matched before
                    if *line.get(i)? == b')' && *line.get(i + 1)? == b' ' {
                        Some((OrderedList(DecimalEnclosed), i + 2))
                    } else {
                        None
                    }
                }
                _ => None,
            },
            // Decimal
            c if is_digit(*c) => {
                // First digit is at index 0, so `i` ends up on the delimiter
                let mut i = 1;
                i += read_while(&line[i..], is_digit);
                match (line.get(i)?, line.get(i + 1)?) {
                    (b'.', c) if is_blank(*c) => Some((OrderedList(DecimalPeriod), i + 2)),
                    (b')', c) if is_blank(*c) => Some((OrderedList(DecimalParen), i + 2)),
                    _ => None,
                }
            }
            // Roman
            c if is_roman(*c) => {
                let mut i = 1;
                i += read_while(&line[i..], is_roman);
                match (line.get(i)?, line.get(i + 1)?) {
                    (b'.', c) if is_blank(*c) => Some((OrderedList(LowerRomanPeriod), i + 2)),
                    (b')', c) if is_blank(*c) => Some((OrderedList(LowerRomanParen), i + 2)),
                    _ => None,
                }
            }
            // Alphabetic
            c if is_alphabetic(*c) => match (line.get(1)?, line.get(2)?) {
                (b'.', c) if is_blank(*c) => Some((OrderedList(LowerPeriod), 3)),
                (b')', c) if is_blank(*c) => Some((OrderedList(LowerParen), 3)),
                _ => None,
            },
            _ => None,
        }
    }
}
impl<'a> Iterator for Lexer<'a> {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
if self.i >= self.bytes.len() {
return None;
}
// Try different token scannings until something is returned. This short-circuits, which
// means the order below determines the precedence
let token = self
.blank_line()
.or_else(|| self.line_break())
.or_else(|| self.indentation())
.or_else(|| self.attributes())
.or_else(|| self.blockquote())
.or_else(|| self.heading())
.or_else(|| self.thematic_break())
.or_else(|| self.code_fence())
.or_else(|| self.div_fence())
.or_else(|| self.list_marker())
.or_else(|| self.reference_definition());
match token {
// If a token is found, advance the position by it's length and yield a Token enum
Some((kind, len)) => {
let token = Token::new(kind, self.i..self.i + len);
self.i = token.range.end;
Some(token)
}
// If a token is not found, read until the end of the line and return that as a Str
// token.
//
// NOTE: Block level syntax always takes precedence, which means that inline tokens can
// only be found within Token::Str. We could start inline lexing here...
_ => {
let mut line_len = read_line(self.remaining_bytes());
// If the last byte is a backslash, rewind behind it, so it can be handled as a
// hard line on the next call.
if let Some(b'\\') = self.bytes.get(self.i + line_len) {
line_len -= 1;
}
let token = Token::new(TokenKind::Str, self.i..self.i + line_len);
self.i = token.range.end;
Some(token)
}
}
}
}