pulldown_latex/parser/
lex.rs

1use crate::event::{DelimiterType, Dimension, DimensionUnit, Glue, GroupingKind, Line};
2
3use super::{
4    tables::{primitive_color, token_to_delim},
5    Argument, CharToken, ErrorKind, InnerResult, Token,
6};
7
8/// Parse the right-hand side of a definition (TeXBook p. 271).
9///
10/// In this case, a definition is any of `def`, `edef`, `gdef`, or `xdef`.
11///
12/// Returns the control sequence, the parameter text, and the replacement text.
13pub fn definition<'a>(input: &mut &'a str) -> InnerResult<(&'a str, &'a str, &'a str)> {
14    let control_sequence = control_sequence(input)?;
15    let (parameter_text, rest) = input.split_once('{').ok_or(ErrorKind::MissingExpansion)?;
16
17    if let Some(idx) = parameter_text.find(|c: char| c == '%' || c == '}') {
18        return Err(if parameter_text.as_bytes()[idx] == b'%' {
19            ErrorKind::CommentInParamText
20        } else {
21            ErrorKind::BracesInParamText
22        });
23    }
24
25    *input = rest;
26    let replacement_text = group_content(input, GroupingKind::Normal)?;
27
28    Ok((control_sequence, parameter_text, replacement_text))
29}
30
31/// Parse an argument to a control sequence, and return it.
32pub fn argument<'a>(input: &mut &'a str) -> InnerResult<Argument<'a>> {
33    if let Some(rest) = input.trim_start().strip_prefix('{') {
34        *input = rest;
35        let content = group_content(input, GroupingKind::Normal)?;
36        Ok(Argument::Group(content))
37    } else {
38        Ok(Argument::Token(token(input)?))
39    }
40}
41
42pub fn optional_argument<'a>(input: &mut &'a str) -> Option<&'a str> {
43    if let Some(rest) = input.trim_start().strip_prefix('[') {
44        *input = rest;
45        let content = group_content(input, GroupingKind::OptionalArgument).ok()?;
46        Some(content)
47    } else {
48        None
49    }
50}
51
52pub fn brace_argument<'a>(input: &mut &'a str) -> InnerResult<&'a str> {
53    if let Some(rest) = input.trim_start().strip_prefix('{') {
54        *input = rest;
55        group_content(input, GroupingKind::Normal)
56    } else {
57        Err(ErrorKind::GroupArgument)
58    }
59}
60
61/// Parses the inside of a group, when the first opening tag is already parsed.
62///
63/// The output is the content within the group without the surrounding `start` and `end`.
64/// This content is guaranteed to be balanced.
65pub fn group_content<'a>(input: &mut &'a str, grouping_kind: GroupingKind) -> InnerResult<&'a str> {
66    let start = grouping_kind.opening_str();
67    let end = grouping_kind.closing_str();
68    let mut escaped = false;
69    let mut index = 0;
70    let mut depth = 0u32;
71    let bytes = input.as_bytes();
72    while escaped || depth > 0 || !bytes[index..].starts_with(end.as_bytes()) {
73        if index + end.len() > input.len() {
74            *input = &input[input.len()..];
75            return Err(ErrorKind::UnbalancedGroup(Some(grouping_kind)));
76        }
77        if !escaped && bytes[index..].starts_with(start.as_bytes()) {
78            depth += 1;
79            index += start.len();
80            continue;
81        }
82        if !escaped && bytes[index..].starts_with(end.as_bytes()) {
83            if depth.checked_sub(1).is_none() {
84                break;
85            }
86            depth -= 1;
87            index += end.len();
88            continue;
89        }
90        match bytes[index] {
91            b'\\' => escaped = !escaped,
92            b'%' if !escaped => {
93                let Some(rest_pos) = bytes[index..].iter().position(|&c| c == b'\n') else {
94                    return Err(ErrorKind::UnbalancedGroup(Some(grouping_kind)));
95                };
96                index += rest_pos;
97            }
98            _ => escaped = false,
99        }
100        index += 1;
101    }
102    let (argument, rest) = input.split_at(index);
103    *input = &rest[end.len()..];
104    Ok(argument)
105}
106
107/// Finds the shortest prefix of `input` that contains balanced groups and ends with the given `suffix`.
108pub fn content_with_suffix<'a>(input: &mut &'a str, suffix: &str) -> InnerResult<&'a str> {
109    let mut escaped = false;
110    let mut index = 0;
111    let bytes = input.as_bytes();
112    while escaped || !bytes[index..].starts_with(suffix.as_bytes()) {
113        if index + suffix.len() > input.len() {
114            *input = &input[input.len()..];
115            return Err(ErrorKind::MacroSuffixNotFound);
116        }
117        match bytes[index] {
118            b'\\' => escaped = !escaped,
119            b'%' if !escaped => {
120                let rest_pos = bytes[index..]
121                    .iter()
122                    .position(|&c| c == b'\n')
123                    .unwrap_or(bytes.len());
124                index += rest_pos;
125            }
126            b'{' if !escaped => {
127                let content = group_content(&mut &input[index + 1..], GroupingKind::Normal)?;
128                index += content.len() + 1;
129            }
130            _ => escaped = false,
131        }
132        index += 1;
133    }
134    let (argument, rest) = input.split_at(index);
135    *input = &rest[suffix.len()..];
136    Ok(argument)
137}
138
139/// Converts a control sequence or character into its corresponding delimiter unicode
140/// character, and whether or not the delimiter is an opening.
141///
142/// Current delimiters supported are listed in TeXBook p. 146, and on https://temml.org/docs/en/supported ("delimiter" section).
143pub fn delimiter(input: &mut &str) -> InnerResult<(char, DelimiterType)> {
144    let maybe_delim = token(input)?;
145    token_to_delim(maybe_delim).ok_or(ErrorKind::Delimiter)
146}
147
148/// Parse the right-hand side of a `futurelet` assignment (TeXBook p. 273).
149///
150/// Returns the control sequence, the token it should be assigned to, and the rest of the input
151/// with both tokens not consumed.
152pub fn futurelet_assignment<'a>(input: &mut &'a str) -> InnerResult<(&'a str, Token<'a>, &'a str)> {
153    let control_sequence = control_sequence(input)?;
154
155    let input_with_tokens = *input;
156
157    let _ = token(input)?;
158    let token = token(input)?;
159    Ok((control_sequence, token, input_with_tokens))
160}
161
162/// Parse the right-hand side of a `let` assignment (TeXBook p. 273).
163///
164/// Returns the control sequence and the value it is assigned to.
165pub fn let_assignment<'a>(input: &mut &'a str) -> InnerResult<(&'a str, Token<'a>)> {
166    let control_sequence = control_sequence(input)?;
167    if let Some(s) = input.trim_start().strip_prefix('=') {
168        *input = s;
169    }
170    let token = token(input)?;
171    Ok((control_sequence, token))
172}
173
174/// Parse a control_sequence, including the leading `\`.
175pub fn control_sequence<'a>(input: &mut &'a str) -> InnerResult<&'a str> {
176    if let Some(rest) = input.strip_prefix('\\') {
177        *input = rest;
178        rhs_control_sequence(input)
179    } else {
180        input
181            .chars()
182            .next()
183            .map_or(Err(ErrorKind::EmptyControlSequence), |_| {
184                Err(ErrorKind::ControlSequence)
185            })
186    }
187}
188
/// Consume any run of `\limits` and `\nolimits` at the start of `input`.
///
/// Whitespace in front of each modifier is skipped. Returns `Some(true)` when the last modifier
/// of the run is `\limits`, `Some(false)` when it is `\nolimits`, and `None` (leaving `input`
/// untouched) when there is no modifier at all.
///
/// A modifier is only recognized when it is a complete control word: `\limitsfoo` is a different
/// control sequence and is therefore not consumed.
pub fn limit_modifiers(input: &mut &str) -> Option<bool> {
    /// Strips `word` from the start of `text`, unless the control word goes on with more letters.
    fn strip_word<'s>(text: &'s str, word: &str) -> Option<&'s str> {
        text.strip_prefix(word)
            .filter(|rest| !rest.starts_with(|c: char| c.is_ascii_alphabetic()))
    }

    let mut output = None;
    loop {
        let trimmed = input.trim_start();
        let (rest, limits) = if let Some(rest) = strip_word(trimmed, r"\limits") {
            (rest, true)
        } else if let Some(rest) = strip_word(trimmed, r"\nolimits") {
            (rest, false)
        } else {
            break;
        };
        *input = rest;
        output = Some(limits);
    }
    output
}
207
208/// Parse the right side of a control sequence (`\` already being parsed).
209///
210/// A control sequence can be of the form `\controlsequence`, or `\#` (control symbol).
211pub fn rhs_control_sequence<'a>(input: &mut &'a str) -> InnerResult<&'a str> {
212    if input.is_empty() {
213        return Err(ErrorKind::EmptyControlSequence);
214    }
215
216    let len = input
217        .chars()
218        .take_while(|c| c.is_ascii_alphabetic())
219        .count()
220        .max(1);
221
222    let (control_sequence, rest) = input.split_at(len);
223    *input = rest.trim_start();
224    Ok(control_sequence)
225}
226
227/// Parse a glue (TeXBook p. 267).
228pub fn glue(input: &mut &str) -> InnerResult<Glue> {
229    let mut dimen = (dimension(input)?, None, None);
230    if let Some(s) = input.trim_start().strip_prefix("plus") {
231        *input = s;
232        dimen.1 = Some(dimension(input)?);
233    }
234    if let Some(s) = input.trim_start().strip_prefix("minus") {
235        *input = s;
236        dimen.2 = Some(dimension(input)?);
237    }
238    Ok(dimen)
239}
240
241/// Parse a dimension (TeXBook p. 266).
242pub fn dimension(input: &mut &str) -> InnerResult<Dimension> {
243    let number = floating_point(input)?;
244    let unit = dimension_unit(input)?;
245    Ok(Dimension::new(number, unit))
246}
247
248/// Parse a dimension unit (TeXBook p. 266).
249pub fn dimension_unit(input: &mut &str) -> InnerResult<DimensionUnit> {
250    *input = input.trim_start();
251    let unit = input.get(0..2).ok_or(ErrorKind::DimensionUnit)?;
252    let unit = match unit {
253        "em" => DimensionUnit::Em,
254        "ex" => DimensionUnit::Ex,
255        "pt" => DimensionUnit::Pt,
256        "pc" => DimensionUnit::Pc,
257        "in" => DimensionUnit::In,
258        "bp" => DimensionUnit::Bp,
259        "cm" => DimensionUnit::Cm,
260        "mm" => DimensionUnit::Mm,
261        "dd" => DimensionUnit::Dd,
262        "cc" => DimensionUnit::Cc,
263        "sp" => DimensionUnit::Sp,
264        "mu" => DimensionUnit::Mu,
265        _ => return Err(ErrorKind::DimensionUnit),
266    };
267
268    *input = &input[2..];
269    one_optional_space(input);
270
271    Ok(unit)
272}
273
274/// Parse an integer that may be positive or negative and may be represented as octal, decimal,
275/// hexadecimal, or a character code (TeXBook p. 265).
276// For future use maybe.
277#[allow(dead_code)]
278pub fn integer(input: &mut &str) -> InnerResult<isize> {
279    let signum = signs(input)?;
280
281    let unsigned_int = unsigned_integer(input)?;
282
283    Ok(unsigned_int as isize * signum)
284}
285
286pub fn unsigned_integer(input: &mut &str) -> InnerResult<usize> {
287    // The following character must be ascii.
288    let next_char = input.chars().next().ok_or(ErrorKind::Number)?;
289    if next_char.is_ascii_digit() {
290        return Ok(decimal(input));
291    }
292    *input = &input[1..];
293    match next_char {
294        '`' => {
295            let mut next_byte = *input.as_bytes().first().ok_or(ErrorKind::Number)?;
296            if next_byte == b'\\' {
297                *input = &input[1..];
298                next_byte = *input.as_bytes().first().ok_or(ErrorKind::Number)?;
299            }
300            if next_byte.is_ascii() {
301                *input = &input[1..];
302                Ok(next_byte as usize)
303            } else {
304                Err(ErrorKind::CharacterNumber)
305            }
306        }
307        '\'' => Ok(octal(input)),
308        '"' => Ok(hexadecimal(input)),
309        _ => Err(ErrorKind::Number),
310    }
311}
312
313/// Parse the signs in front of a number, returning the signum.
314pub fn signs(input: &mut &str) -> InnerResult<isize> {
315    let mut minus_count = 0;
316    *input = input
317        .trim_start_matches(|c: char| {
318            if c == '-' {
319                minus_count += 1;
320                true
321            } else {
322                c == '+' || c.is_whitespace()
323            }
324        })
325        .trim_start();
326    Ok(if minus_count % 2 == 0 { 1 } else { -1 })
327}
328
329/// Parse a base 16 unsigned number.
330pub fn hexadecimal(input: &mut &str) -> usize {
331    let mut number = 0;
332    *input = input.trim_start_matches(|c: char| {
333        if c.is_ascii_alphanumeric() && c < 'G' {
334            number =
335                number * 16 + c.to_digit(16).expect("the character is a valid hex digit") as usize;
336            true
337        } else {
338            false
339        }
340    });
341    one_optional_space(input);
342
343    number
344}
345
346/// Parse a floating point number (named `factor` in TeXBook p. 266).
347pub fn floating_point(input: &mut &str) -> InnerResult<f32> {
348    let signum = signs(input)?;
349
350    let mut number = 0.;
351    *input = input.trim_start_matches(|c: char| {
352        if c.is_ascii_digit() {
353            number = number * 10. + (c as u8 - b'0') as f32;
354            true
355        } else {
356            false
357        }
358    });
359
360    if let Some(stripped_decimal_point) = input.strip_prefix(|c| c == '.' || c == ',') {
361        let mut decimal = 0.;
362        let mut decimal_divisor = 1.;
363        *input = stripped_decimal_point.trim_start_matches(|c: char| {
364            if c.is_ascii_digit() {
365                decimal = decimal * 10. + (c as u8 - b'0') as f32;
366                decimal_divisor *= 10.;
367                true
368            } else {
369                false
370            }
371        });
372        number += decimal / decimal_divisor;
373    };
374
375    Ok(signum as f32 * number)
376}
377
378/// Parse a base 10 unsigned number.
379pub fn decimal(input: &mut &str) -> usize {
380    let mut number = 0;
381    *input = input.trim_start_matches(|c: char| {
382        if c.is_ascii_digit() {
383            number = number * 10 + (c as u8 - b'0') as usize;
384            true
385        } else {
386            false
387        }
388    });
389    one_optional_space(input);
390
391    number
392}
393
394/// Parse a base 8 unsigned number.
395pub fn octal(input: &mut &str) -> usize {
396    let mut number = 0;
397    *input = input.trim_start_matches(|c: char| {
398        if c.is_ascii_digit() {
399            number = number * 8 + (c as u8 - b'0') as usize;
400            true
401        } else {
402            false
403        }
404    });
405    one_optional_space(input);
406
407    number
408}
409
/// Parse an optional space.
///
/// If `input` starts with a whitespace character (as defined by [`char::is_whitespace`]), that
/// single character is consumed and `true` is returned. Otherwise `input` is left untouched and
/// `false` is returned.
pub fn one_optional_space(input: &mut &str) -> bool {
    // `strip_prefix` removes the whole character, however many bytes it is made of; whitespace
    // such as U+2003 spans several bytes, so skipping a single byte would panic.
    match input.strip_prefix(char::is_whitespace) {
        Some(rest) => {
            *input = rest;
            true
        }
        None => false,
    }
}
420
421/// Return the next token in the input.
422///
423/// A token will never be whitespace, and will never be inside of a comment.
424pub fn token<'a>(input: &mut &'a str) -> InnerResult<Token<'a>> {
425    *input = input.trim_start();
426    match input.chars().next() {
427        Some('\\') => {
428            *input = &input[1..];
429            Ok(Token::ControlSequence(rhs_control_sequence(input)?))
430        }
431        Some('%') => {
432            let (_, rest) = input
433                .split_once('\n')
434                .unwrap_or(("", &input[input.len()..]));
435            *input = rest;
436            token(input)
437        }
438        Some(c) => {
439            let context = *input;
440            *input = input.split_at(c.len_utf8()).1;
441            Ok(Token::Character(CharToken::from_str(context)))
442        }
443        None => Err(ErrorKind::Token),
444    }
445}
446
447pub fn color(color: &str) -> Option<(u8, u8, u8)> {
448    match color.strip_prefix('#') {
449        Some(color) if color.len() == 6 => {
450            let r = u8::from_str_radix(&color[..2], 16).ok()?;
451            let g = u8::from_str_radix(&color[2..4], 16).ok()?;
452            let b = u8::from_str_radix(&color[4..], 16).ok()?;
453            Some((r, g, b))
454        }
455        None => primitive_color(color),
456        _ => None,
457    }
458}
459
460pub fn horizontal_lines(content: &mut &str) -> Box<[Line]> {
461    let mut horizontal_lines = Vec::new();
462    while let Some((rest, line)) = content
463        .trim_start()
464        .strip_prefix("\\hline")
465        .map(|rest| (rest, Line::Solid))
466        .or_else(|| {
467            content
468                .trim_start()
469                .strip_prefix("\\hdashline")
470                .map(|rest| (rest, Line::Dashed))
471        })
472    {
473        horizontal_lines.push(line);
474        *content = rest;
475    }
476
477    horizontal_lines.into()
478}
479
#[cfg(test)]
mod tests {
    use crate::{
        event::{Dimension, DimensionUnit, GroupingKind},
        parser::{lex, Token},
    };

    /// A single `-` among the signs makes the signum negative.
    #[test]
    fn signs() {
        let mut rest = "  +    +-   \\test";
        let signum = lex::signs(&mut rest).unwrap();

        assert_eq!(signum, -1);
        assert_eq!(rest, "\\test");
    }

    /// Without any sign, the signum is positive and nothing is consumed.
    #[test]
    fn no_signs() {
        let mut rest = "\\mycommand";
        let signum = lex::signs(&mut rest).unwrap();

        assert_eq!(signum, 1);
        assert_eq!(rest, "\\mycommand");
    }

    // A complex example from problem 20.7 in TeXBook (p. 205):
    // \def\cs AB#1#2C$#3\$ {#3{ab#1}#1 c##\x #2}
    #[test]
    fn definition_texbook() {
        let mut rest = "\\cs AB#1#2C$#3\\$ {#3{ab#1}#1 c##\\x #2}";
        let parsed = lex::definition(&mut rest).unwrap();

        assert_eq!(
            parsed,
            ("cs", "AB#1#2C$#3\\$ ", "#3{ab#1}#1 c##\\x #2")
        );
        assert_eq!(rest, "");
    }

    /// Escaped braces in the replacement text do not take part in the balancing.
    #[test]
    fn complex_definition() {
        let mut rest = r"\foo #1\test#2#{##\####2#2 \{{\}} \{\{\{} 5 + 5 = 10";
        let parsed = lex::definition(&mut rest).unwrap();

        assert_eq!(
            parsed,
            ("foo", r"#1\test#2#", r"##\####2#2 \{{\}} \{\{\{")
        );
        assert_eq!(rest, " 5 + 5 = 10");
    }

    #[test]
    fn let_assignment() {
        let mut rest = r"\foo = \bar";
        let (name, value) = lex::let_assignment(&mut rest).unwrap();

        assert_eq!(name, "foo");
        assert_eq!(value, Token::ControlSequence("bar"));
        assert_eq!(rest, "");
    }

    #[test]
    fn futurelet_assignment() {
        let mut rest = r"\foo\bar\baz blah";
        let (name, value, with_tokens) = lex::futurelet_assignment(&mut rest).unwrap();

        assert_eq!(name, "foo");
        assert_eq!(value, Token::ControlSequence("baz"));
        assert_eq!(with_tokens, r"\bar\baz blah");
    }

    #[test]
    fn dimension() {
        let mut rest = "1.2pt";
        let parsed = lex::dimension(&mut rest).unwrap();

        assert_eq!(parsed, Dimension::new(1.2, DimensionUnit::Pt));
        assert_eq!(rest, "");
    }

    #[test]
    fn complex_glue() {
        let mut rest = "1.2 pt plus 3.4pt minus 5.6pt nope";
        let parsed = lex::glue(&mut rest).unwrap();

        let expected = (
            Dimension::new(1.2, DimensionUnit::Pt),
            Some(Dimension::new(3.4, DimensionUnit::Pt)),
            Some(Dimension::new(5.6, DimensionUnit::Pt)),
        );
        assert_eq!(parsed, expected);
        assert_eq!(rest, "nope");
    }

    /// Decimal, hexadecimal, octal, and character code integers, followed by a float.
    #[test]
    fn numbers() {
        let mut rest = "123 -\"AEF24 --'3475 `\\a -.47";

        for expected in [123, -716580, 1853, 97] {
            assert_eq!(lex::integer(&mut rest).unwrap(), expected);
        }
        assert_eq!(lex::floating_point(&mut rest).unwrap(), -0.47);
        assert_eq!(rest, "");
    }

    #[test]
    fn group_content() {
        let mut rest =
            "this { { is a test } to see if { the content parsing { of this } } } works }";
        let inner = lex::group_content(&mut rest, GroupingKind::Normal).unwrap();

        assert_eq!(
            inner,
            "this { { is a test } to see if { the content parsing { of this } } } works "
        );
    }
}