texlang_stdlib/
expansion.rs

1//! Commands that alter the expansion process
2
3use texlang::traits::*;
4use texlang::*;
5
6/// Get the `\noexpand` command.
7pub fn get_noexpand<S>() -> command::BuiltIn<S> {
8    command::BuiltIn::new_expansion(noexpand_fn).with_tag(NO_EXPAND_TAG.get())
9}
10
11static NO_EXPAND_TAG: command::StaticTag = command::StaticTag::new();
12
13fn noexpand_fn<S>(
14    _: token::Token,
15    _: &mut vm::ExpansionInput<S>,
16) -> command::Result<Vec<token::Token>> {
17    panic!(
18        "The \\noexpand expansion function is never invoked directly. \
19         Instead, the primitive operates through the \\noexpand hook, \
20         which is a method on the `TexlangState` trait. \
21         Ensure your Texcraft VM is configured to use this hook."
22    )
23}
24
25#[inline]
26pub fn noexpand_hook<S: TexlangState>(
27    token: token::Token,
28    input: &mut vm::ExpansionInput<S>,
29    tag: Option<command::Tag>,
30) -> command::Result<Option<token::Token>> {
31    // Fast path: this is not the \noexpand command.
32    // We want this check to be inlined into the VM functions that perform
33    if tag != Some(NO_EXPAND_TAG.get()) {
34        return Ok(None);
35    }
36    // Slow path: this is not the \noexpand command.
37    // We don't want this check to be inlined because it will take up space in the instruction cache.
38    noexpand_hook_finish(token, input)
39}
40
41fn noexpand_hook_finish<S: TexlangState>(
42    token: token::Token,
43    input: &mut vm::ExpansionInput<S>,
44) -> command::Result<Option<token::Token>> {
45    match input.unexpanded().next()? {
46        None => Err(error::SimpleTokenError::new(
47            input.vm(),
48            token,
49            "unexpected end of input while expanding a `\\noexpand` command",
50        )
51        .into()),
52        // TODO .add_note("the `\\noexpand` command must be followed by 1 token")
53        Some(token) => Ok(Some(token)),
54    }
55}
56
57/// Get the simple `\expandafter` command.
58/// Get the simple `\expandafter` command.
59///
60/// This is the simplest implementation of the command, and the
61///     same implementation as in the original TeX.
62pub fn get_expandafter_simple<S: TexlangState>() -> command::BuiltIn<S> {
63    command::BuiltIn::new_expansion(expandafter_simple_fn)
64}
65
66/// Get the optimized `\expandafter` command.
67///
68/// This is a more complex implementation of `\expandafter` that is optimized for handling
69/// repeated `\expandafter` tokens.
70/// It contains two optimizations, as described below.
71/// These optimizations where developed "theoretically" and with no benchmarking, so it
72/// remains to be seen if they actually make expansion faster.
73/// For this reason both the optimized and simple `\expandafter` implementations are maintained.
74///
75/// We now describe the two optimizations.
76///  
77/// **First**, `\expandafter` control sequences are often linked together in the following format:
78///
79/// ```tex
80/// \expandafter<token 1>\expandafter<token 2>...\expandafter<token n><token n+1>
81/// ```
82///
83/// Here, to expand the first `\expandafter` we just need to expand `<token n+1>`.
84/// In the original implementation of TeX this works via recursion: the ith `\expandafter`
85/// requests expansion of the second-to-next token, which is the (i+1)th `\expandafter`.
86/// After n recursions, the last token is finally expanded.
87/// In the optimized implementation here, the token stream is scanned ahead for as long as the pattern
88/// `\expandafter<token i>` repeats.
89/// The token `<token n+1>` is expanded and the intermediate `\expandafter` tokens are dropped from the input.
90/// This is still an O(n) operation, but results in only 1 Rust function stack being used, rather than n.
91///
92/// **Second**, `\expandafter` commands are often grouped together like this:
93///
94/// ```tex
95/// \expandafter\expandafter\expandafter\A\expandafter\B\C
96/// ```
97///
98/// This TeX code causes `\C` to be expanded first, then `\B\` and finally `\A`.
99/// When the leading `\expandafter` is expanded, the first optimization kicks in and `\C` will be expanded, leaving:
100///
101/// ```tex
102/// \expandafter\A\B\Cexpanded
103/// ```
104///
105/// The second optimization is that the leading `\expandafter` that is left over will also be expanded
106///     without yielding control to the main expansion loop.
107/// If, after this pass, the leading token is again an `\expandafter` token, it will be expanded too.
108/// This process continues repeatedly until no `\expandafter` tokens are left at the start of the token stream.
109pub fn get_expandafter_optimized<S: TexlangState>() -> command::BuiltIn<S> {
110    command::BuiltIn::new_expansion(expandafter_optimized_fn)
111}
112
113fn expandafter_simple_fn<S: TexlangState>(
114    expandafter_token: token::Token,
115    input: &mut vm::ExpansionInput<S>,
116) -> command::Result<Vec<token::Token>> {
117    let next = match input.unexpanded().next()? {
118        None => {
119            return Err(expandafter_missing_first_token_error(
120                input.vm(),
121                expandafter_token,
122            ));
123        }
124        Some(next) => next,
125    };
126    if input.unexpanded().peek()?.is_none() {
127        return Err(expandafter_missing_second_token_error(
128            input.vm(),
129            expandafter_token,
130            next,
131        ));
132    }
133    input.expanded().expand_once()?;
134    input.expansions_mut().push(next);
135    Ok(vec![])
136}
137
138fn expandafter_optimized_fn<S: TexlangState>(
139    expandafter_token: token::Token,
140    input: &mut vm::ExpansionInput<S>,
141) -> command::Result<Vec<token::Token>> {
142    let mut buffer: Vec<token::Token> = input.checkout_token_buffer();
143    let unexpanded_input = input.unexpanded();
144    loop {
145        match unexpanded_input.next()? {
146            None => {
147                return Err(expandafter_missing_first_token_error(
148                    input.vm(),
149                    expandafter_token,
150                ))
151            }
152            Some(next) => buffer.push(next),
153        };
154        let token = match unexpanded_input.peek()? {
155            None => {
156                return Err(expandafter_missing_second_token_error(
157                    input.vm(),
158                    expandafter_token,
159                    *buffer.last().unwrap(),
160                ))
161            }
162            Some(token) => *token,
163        };
164        if token.value() != expandafter_token.value() {
165            break;
166        }
167        // Remove the \expandafter token from the stream
168        _ = unexpanded_input.next()?;
169    }
170    input.expanded().expand_once()?;
171
172    while let Some(&root) = buffer.first() {
173        if root.value() != expandafter_token.value() {
174            input.expansions_mut().extend(buffer.iter().rev());
175            break;
176        }
177        let mut last_expandafter_index = 0;
178        while let Some(next) = buffer.get(last_expandafter_index + 2) {
179            if next.value() != root.value() {
180                break;
181            }
182            last_expandafter_index += 2;
183        }
184        // We need to ensure that the buffer ends exactly one token after the last \expandafter token.
185        // There are three cases depending on whether the buffer is under-full, overfull, or exactly right.
186        match buffer
187            .len()
188            .checked_sub(last_expandafter_index + 1)
189            .unwrap()
190        {
191            // Under-full
192            0 => {
193                let next = match input.unexpanded().next()? {
194                    None => return Err(expandafter_missing_first_token_error(input.vm(), root)),
195                    Some(next) => next,
196                };
197                buffer.push(next);
198            }
199            // Exactly right
200            1 => {}
201            // Overfull
202            _ => {
203                input
204                    .expansions_mut()
205                    .extend(buffer[last_expandafter_index + 2..].iter().rev());
206                buffer.truncate(last_expandafter_index + 2);
207            }
208        }
209        // Check there is another token in the input. This is only relevant in the under-full and
210        // exactly right cases, but it's easier to put it here.
211        if input.unexpanded().peek()?.is_none() {
212            return Err(expandafter_missing_second_token_error(
213                input.vm(),
214                root,
215                *buffer.last().unwrap(),
216            ));
217        }
218        input.expanded().expand_once()?;
219        remove_even_indices(&mut buffer);
220    }
221    input.return_token_buffer(buffer);
222    Ok(vec![])
223}
224
225fn remove_even_indices(v: &mut Vec<token::Token>) {
226    let mut src = 1;
227    let mut dest = 0;
228    while let Some(token) = v.get(src) {
229        v[dest] = *token;
230        dest += 1;
231        src += 2;
232    }
233    v.truncate(dest);
234}
235
236fn expandafter_missing_first_token_error<S>(
237    vm: &vm::VM<S>,
238    expandafter_token: token::Token,
239) -> Box<error::Error> {
240    error::SimpleTokenError::new(
241        vm,
242        expandafter_token,
243        "unexpected end of input while expanding an `\\expandafter` command",
244    )
245    .into()
246    // TODO
247    // .add_note("the `\\expandafter` command must be followed by 2 tokens")
248    // .add_note("no more tokens were found")
249}
250
251fn expandafter_missing_second_token_error<S>(
252    vm: &vm::VM<S>,
253    expandafter_token: token::Token,
254    first_token: token::Token,
255) -> Box<error::Error> {
256    _ = first_token;
257    error::SimpleTokenError::new(
258        vm,
259        expandafter_token,
260        "unexpected end of input while expanding an `\\expandafter` command",
261    )
262    .into()
263    // TODO .add_note("the `\\expandafter` command must be followed by 2 tokens")
264    //.add_note("only 1 more tokens was found")
265}
266
267/// Get the `\relax` command.
268pub fn get_relax<S>() -> command::BuiltIn<S> {
269    command::BuiltIn::new_execution(|_, _| Ok(()))
270}
271
272#[cfg(test)]
273mod test {
274    use super::*;
275    use crate::prefix;
276    use crate::script;
277    use crate::testing::*;
278    use std::collections::HashMap;
279
280    #[derive(Default)]
281    pub struct State {
282        prefix: prefix::Component,
283        script: script::Component,
284        integer: i32,
285    }
286
287    impl State {
288        pub fn get_integer() -> command::BuiltIn<State> {
289            variable::Command::new_singleton(
290                |state: &State, _: variable::Index| -> &i32 { &state.integer },
291                |state: &mut State, _: variable::Index| -> &mut i32 { &mut state.integer },
292            )
293            .into()
294        }
295    }
296
297    impl TexlangState for State {
298        fn expansion_override_hook(
299            token: token::Token,
300            input: &mut vm::ExpansionInput<Self>,
301            tag: Option<command::Tag>,
302        ) -> command::Result<Option<token::Token>> {
303            noexpand_hook(token, input, tag)
304        }
305    }
306
307    implement_has_component![
308        State,
309        (script::Component, script),
310        (prefix::Component, prefix),
311    ];
312
313    fn initial_commands(optimized: bool) -> HashMap<&'static str, command::BuiltIn<State>> {
314        HashMap::from([
315            ("def", crate::def::get_def()),
316            ("noexpand", get_noexpand()),
317            ("integer", State::get_integer()),
318            (
319                "xa",
320                match optimized {
321                    true => get_expandafter_optimized(),
322                    false => get_expandafter_simple(),
323                },
324            ),
325        ])
326    }
327
328    test_suite![
329        options(TestOption::InitialCommandsDyn(Box::new(|| {
330            initial_commands(true)
331        })),),
332        expansion_equality_tests(
333            (simple_case, r"\def\a{Hello}\noexpand\a", r"\a"),
334            (
335                expandafter_and_noexpand_1,
336                r"\def\a#1\b{Hello '#1'}\def\b{World} \a\b",
337                " Hello ''"
338            ),
339            (
340                expandafter_and_noexpand_2,
341                r"\def\a#1\b{Hello '#1'}\def\b{World} \a\b\b",
342                " Hello ''World"
343            ),
344            (
345                expandafter_and_noexpand_3,
346                r"\def\a#1\b{Hello '#1'}\def\b{World} \xa\a\b\b",
347                " Hello 'World'"
348            ),
349            (
350                expandafter_and_noexpand_4,
351                r"\def\a#1\b{Hello '#1'}\def\b{World} \xa\a\noexpand\b\b",
352                " Hello ''World"
353            ),
354            (
355                only_expands_once,
356                r"\def\A{\B}\def\B{Hello}\xa\noexpand\A",
357                r"\B",
358            ),
359            (
360                peek_consumes_noexpand_1,
361                r"\def\A{\B}\def\B{Hello}\integer = 1 \noexpand\A",
362                r"\A",
363            ),
364            (
365                peek_consumes_noexpand_2,
366                r"\def\A{\B}\def\B{Hello}\integer = 1\noexpand\A",
367                r"Hello",
368            ),
369            // peek
370        ),
371        failure_tests((end_of_input, r"\noexpand"),),
372    ];
373
374    static PREFIX: &str = r"\def\mk#1#2{\def#1##1\notes##2\end{##1\notes##2#2\end}}\mk\a a\mk\b b\mk\c c\mk\d d\def\notes#1\end{#1}";
375    static POSTFIX: &str = r"\notes\end";
376
377    #[macro_export]
378    macro_rules! expandafter_test {
379        ( $( ( $name: ident, $lhs: expr, $rhs: expr ) ),* $(,)? ) => {
380            mod expandafter_simple {
381                use super::*;
382                test_suite![
383                    options(TestOption::InitialCommandsDyn(Box::new(|| { initial_commands(false) }))),
384                    expansion_equality_tests(
385                        $(
386                            ( $name, format!("{}{}{}", PREFIX, $lhs, POSTFIX), $rhs ),
387                        )*
388                    ),
389                ];
390            }
391            mod expandafter_optimized {
392                use super::*;
393                test_suite![
394                    options(TestOption::InitialCommandsDyn(Box::new(|| { initial_commands(true) }))),
395                    expansion_equality_tests(
396                        $(
397                            ( $name, format!("{}{}{}", PREFIX, $lhs, POSTFIX), $rhs ),
398                        )*
399                    ),
400                ];
401            }
402        };
403    }
404
405    expandafter_test![
406        (texbook_p374_3, r"\xa\a\b", r"ba"),
407        (texbook_p374_4, r"\xa\xa\xa\a\xa\b\c", "cba"),
408        (
409            texbook_p374_5,
410            r"\xa\xa\xa\xa\xa\xa\xa\a\xa\xa\xa\b\xa\c\d",
411            "dcba"
412        ),
413        /*
414        All of the following permutation cases were generated by hand, but there's actually
415        an algorithmic way to generate them. For each macro, count the number of \expandafter=\xa
416        tokens that come before. Then calculate the shift:
417        - if #\xa is 0, the shift is 0.
418        - if #\xa is 1, the shift is 1.
419        - if #\xa is 3, the shift is 2.
420        - if #\xa is 7, the shift is 3.
421        (In general if #\xa is 2^n-1, the shift is n-1.)
422        Now work backwards through the tokens `\a\b\c\d`. For each token, move it right by the associated
423        shift. You then get the result of the expansion.
424
425        For example: `\xa\xa\xa\a\xa\b\c\d`.
426        The #\xa numbers are: 3 1 0 0
427        The shifts are: 2 1 0 0
428        Then
429        - start:         `\a\b\c\d`
430        - shift \d by 0: `\a\b\c\d`
431        - shift \c by 0: `\a\b\c\d`
432        - shift \b by 1: `\a\c\b\d`
433        - shift \a by 2: `\c\b\a\d` <- this is the expansion
434        */
435        (permutation_abcd, r"\a\b\c\d", "abcd"),
436        (permutation_abdc, r"\a\b\xa\c\d", "abdc"),
437        (permutation_acbd, r"\a\xa\b\c\d", "acbd"),
438        (permutation_acdb, r"\a\xa\xa\xa\b\c\d", "acdb"),
439        (permutation_adbc, r"\a\xa\b\xa\c\d", "adbc"),
440        (permutation_adcb, r"\a\xa\xa\xa\b\xa\c\d", "adcb"),
441        (permutation_bacd, r"\xa\a\b\c\d", "bacd"),
442        (permutation_badc, r"\xa\a\b\xa\c\d", "badc"),
443        (permutation_bcad, r"\xa\xa\xa\a\b\c\d", "bcad"),
444        (permutation_bcda, r"\xa\xa\xa\xa\xa\xa\xa\a\b\c\d", "bcda"),
445        (permutation_bdac, r"\xa\xa\xa\a\b\xa\c\d", "bdac"),
446        (
447            permutation_bdca,
448            r"\xa\xa\xa\xa\xa\xa\xa\a\b\xa\c\d",
449            "bdca"
450        ),
451        (permutation_cabd, r"\xa\a\xa\b\c\d", "cabd"),
452        (permutation_cadb, r"\xa\a\xa\xa\xa\b\c\d", "cadb"),
453        (permutation_cbad, r"\xa\xa\xa\a\xa\b\c\d", "cbad"),
454        (
455            permutation_cbda,
456            r"\xa\xa\xa\xa\xa\xa\xa\a\xa\xa\xa\b\c\d",
457            "cdba"
458        ),
459        (permutation_cdab, r"\xa\xa\xa\a\xa\xa\xa\b\c\d", "cdab"),
460        (
461            permutation_cdba,
462            r"\xa\xa\xa\xa\xa\xa\xa\a\xa\xa\xa\b\c\d",
463            "cdba"
464        ),
465        (permutation_dabc, r"\xa\a\xa\b\xa\c\d", "dabc"),
466        (permutation_dacb, r"\xa\a\xa\xa\xa\b\xa\c\d", "dacb"),
467        (permutation_dbac, r"\xa\xa\xa\a\xa\b\xa\c\d", "dbac"),
468        (
469            permutation_dbca,
470            r"\xa\xa\xa\xa\xa\xa\xa\a\xa\b\xa\c\d",
471            "dbca"
472        ),
473        (permutation_dcab, r"\xa\xa\xa\a\xa\xa\xa\b\xa\c\d", "dcab"),
474        (
475            permutation_dcba,
476            r"\xa\xa\xa\xa\xa\xa\xa\a\xa\xa\xa\b\xa\c\d",
477            "dcba"
478        ),
479        (
480            expandafter_last_after_first_pass,
481            r"\xa\xa\xa\a\xa\xa\b\c\d",
482            "bdac"
483        ),
484    ];
485
486    fn run_expandafter_failure_test(input: &str, optimized: bool) {
487        let options = vec![crate::testing::TestOption::InitialCommandsDyn(Box::new(
488            || initial_commands(optimized),
489        ))];
490        crate::testing::run_failure_test(&input, &options);
491    }
492
493    #[macro_export]
494    macro_rules! expandafter_failure_test {
495        ($( ( $name: ident, $input: expr), )+) => {
496            $(
497            mod $name {
498                #[test]
499                fn simple() {
500                    super::run_expandafter_failure_test($input, false);
501                }
502                #[test]
503                fn optimized() {
504                    super::run_expandafter_failure_test($input, true);
505                }
506            }
507            )+
508        };
509    }
510
511    expandafter_failure_test![
512        (expandafter_missing_1st_token, r"\xa"),
513        (expandafter_missing_2nd_token, r"\xa\a"),
514        (expandafter_missing_1st_token_nested, r"\xa\xa\xa\a\xa\xa\b"),
515        (
516            expandafter_missing_2nd_token_nested,
517            r"\def\A{}\xa\xa\xa\A\A"
518        ),
519    ];
520}
texlang_stdlib/expansion.rs

texlang_stdlib/
expansion.rs