lexi_matic_derive/lib.rs

extern crate proc_macro;
use proc_macro::TokenStream;
use quote::{format_ident, quote};
use regex_automata::{
    dfa::{dense::DFA, StartKind},
    MatchKind,
};
use syn::{parse_macro_input, Data, DeriveInput, Ident, LitStr};

/// Derive the Lexer implementation.
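///
/// A minimal usage sketch (assuming the `lexi_matic` runtime crate; the
/// token names and patterns below are illustrative, not taken from this
/// crate):
///
/// ```ignore
/// use lexi_matic::Lexer;
///
/// #[derive(Lexer, Debug, PartialEq)]
/// #[lexer(skip = r"[ \t\r\n]+")]
/// enum Token<'a> {
///     #[token("let")]
///     Let,
///     #[regex("[A-Za-z_][A-Za-z0-9_]*")]
///     Ident(&'a str),
/// }
///
/// // Each item is `Result<(start, token, end), lexi_matic::Error>`.
/// for item in Token::lex("let x") {
///     let (start, tok, end) = item.unwrap();
///     println!("{start}..{end} {tok:?}");
/// }
/// ```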
#[proc_macro_derive(Lexer, attributes(regex, token, lexer))]
pub fn derive_lexer(input: TokenStream) -> TokenStream {
    let input = parse_macro_input!(input as DeriveInput);
    derive_lexer_impl(input)
        .unwrap_or_else(syn::Error::into_compile_error)
        .into()
}

fn derive_lexer_impl(item: DeriveInput) -> syn::Result<proc_macro2::TokenStream> {
    let e = match item.data {
        Data::Enum(e) => e,
        _ => return Err(syn::Error::new_spanned(item, "expected an enum")),
    };
    let vis = item.vis;
    let name = item.ident;

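    // Collect enum-level `#[lexer(skip = "...")]` patterns. They are matched
    // like any other pattern but never produce a token (see the skip arm in
    // the generated iterator below).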
    let mut skip_regexes = Vec::new();
    for a in item.attrs {
        if a.path().is_ident("lexer") {
            a.parse_nested_meta(|m| {
                if m.path.is_ident("skip") {
                    let r: LitStr = m.value()?.parse()?;
                    skip_regexes.push(r.value());
                    Ok(())
                } else {
                    Err(m.error("unsupported attribute"))
                }
            })?;
        }
    }

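    // Build one regex per variant, in declaration order, so the DFA's pattern
    // ID directly indexes the variant.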
    let mut regexes = Vec::with_capacity(e.variants.len());
    let mut matches = Vec::new();
    for (i, v) in e.variants.iter().enumerate() {
        let vn = &v.ident;
        let i = i as u32;
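        // An optional `#[lexer(more = f)]` callback can extend the match past
        // the regex: it receives the matched text and the remaining input and
        // returns the extra length to consume, or `None` for a lexing error.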
        let mut more: Option<Ident> = None;
        for a in &v.attrs {
            if a.path().is_ident("lexer") {
                a.parse_nested_meta(|m| {
                    if m.path.is_ident("more") {
                        more = Some(m.value()?.parse()?);
                        Ok(())
                    } else {
                        Err(m.error("unsupported attribute"))
                    }
                })?;
            }
        }
        let more = match more {
            Some(more) => quote! {
                len += match #more(&remaining[..len], &remaining[len..]) {
                    Some(len) => len,
                    None => return Some(Err(lexi_matic::Error(start))),
                };
            },
            None => quote!(),
        };
        let construct = if v.fields.is_empty() {
            quote!(#name::#vn)
        } else {
            quote!(#name::#vn((&remaining[..len]).into()))
        };
        matches.push(quote! {
            #i => {
                #more
                #construct
            }
        });

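        // `#[token("...")]` takes a literal and is escaped so that regex
        // metacharacters match themselves; `#[regex("...")]` is used verbatim.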
        let mut regex = None;
        for a in &v.attrs {
            let r = if a.path().is_ident("regex") {
                let x: LitStr = a.parse_args()?;
                x.value()
            } else if a.path().is_ident("token") {
                let x: LitStr = a.parse_args()?;
                regex_syntax::escape(&x.value())
            } else {
                continue;
            };
            if regex.is_none() {
                regex = Some(r);
            } else {
                return Err(syn::Error::new_spanned(
                    a,
                    "duplicate regex or token attribute",
                ));
            }
        }
        match regex {
            None => {
                return Err(syn::Error::new_spanned(
                    v,
                    "missing a regex or token attribute",
                ))
            }
            Some(r) => regexes.push(r),
        }
    }
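    // Append the skip patterns after all variant patterns: their pattern IDs
    // are out of range for the variant arms and fall through to the skip arm
    // of the generated `match`.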
    regexes.extend(skip_regexes);

    let dfa = DFA::builder()
        .configure(
            DFA::config()
                // Use MatchKind::All to get longest match.
                .match_kind(MatchKind::All)
                .start_kind(StartKind::Anchored)
                .accelerate(false)
                .minimize(true),
        )
        .build_many(&regexes)
        .map_err(|err| syn::Error::new(proc_macro2::Span::call_site(), err))?;
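    // Serialize the DFA for both endiannesses. `to_bytes_*` prepends padding
    // bytes for alignment; trim them here and re-align with `Align4` below.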
    let (little_bytes, little_p) = dfa.to_bytes_little_endian();
    let (big_bytes, big_p) = dfa.to_bytes_big_endian();
    let little_bytes = &little_bytes[little_p..];
    let big_bytes = &big_bytes[big_p..];
    let ll = little_bytes.len();
    let bl = big_bytes.len();
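    // Generated code: embed the byte table for the target's endianness with
    // 4-byte alignment, and lazily deserialize it into a `DFA<&[u32]>` once.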
    let dfa = quote! {
        #[repr(C, align(4))]
        struct Align4<T>(T);
        #[cfg(target_endian = "little")]
        static __DFA_BYTES: &Align4<[u8; #ll]> = &Align4([ #(#little_bytes),* ]);
        #[cfg(target_endian = "big")]
        static __DFA_BYTES: &Align4<[u8; #bl]> = &Align4([ #(#big_bytes),* ]);
        static DFA: std::sync::OnceLock<lexi_matic::DFA<&[u32]>> = std::sync::OnceLock::new();
        let dfa = DFA.get_or_init(||
            lexi_matic::DFA::from_bytes(&__DFA_BYTES.0).unwrap().0
        );
    };

    // If the enum has generics, assume they are exactly one lifetime `'a`.
    // (`ty_generics` avoids `gen`, which is reserved in Rust 2024.)
    let ty_generics = if item.generics.lt_token.is_some() {
        quote!(<'a>)
    } else {
        quote!()
    };
    let iter_name = format_ident!("{name}Iterator");
    let lexer_impl = quote! {
        impl<'a> lexi_matic::Lexer<'a> for #name #ty_generics {
            type Iterator = #iter_name<'a>;
            fn lex(input: &'a str) -> #iter_name<'a> {
                #iter_name {
                    input,
                    consumed: 0,
                }
            }
        }

        #vis struct #iter_name<'a> {
            pub input: &'a str,
            pub consumed: usize,
        }

        impl<'a> Iterator for #iter_name<'a> {
            type Item = Result<(usize, #name #ty_generics, usize), lexi_matic::Error>;
            fn next(&mut self) -> Option<Self::Item> {
                #dfa

                loop {
                    let start = self.consumed;
                    let remaining = &self.input[start..];
                    if remaining.is_empty() {
                        return None;
                    }

                    let (pat, mut len) = match lexi_matic::dfa_search_next(dfa, remaining) {
                        Some(t) => t,
                        None => return Some(Err(lexi_matic::Error(start))),
                    };
                    let t = match pat.as_u32() {
                        #(#matches)*
                        _ => {
                            // A skip pattern matched: advance past it and retry.
                            self.consumed += len;
                            continue;
                        }
                    };
                    self.consumed += len;
                    return Some(Ok((start, t, start + len)));
                }
            }
        }
    };

    Ok(lexer_impl)
}