extern crate proc_macro;
use proc_macro::TokenStream;
use quote::{format_ident, quote};
use regex_automata::{
    dfa::{dense::DFA, StartKind},
    MatchKind,
};
use syn::{parse_macro_input, Data, DeriveInput, Ident, LitStr};

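/// Derive `lexi_matic::Lexer` for a token enum.
///
/// Variants carry `#[regex("...")]` or `#[token("...")]` patterns, and the enum
/// may declare container-level `#[lexer(skip = "...")]` patterns to discard.
/// A minimal sketch (the variant names and patterns are illustrative only):
///
/// ```ignore
/// #[derive(Lexer)]
/// #[lexer(skip = r"[ \t\r\n]+")]
/// enum Token<'a> {
///     #[token("let")]
///     Let,
///     #[regex("[a-zA-Z_][a-zA-Z0-9_]*")]
///     Ident(&'a str),
/// }
/// ```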
#[proc_macro_derive(Lexer, attributes(regex, token, lexer))]
pub fn derive_lexer(input: TokenStream) -> TokenStream {
    let input = parse_macro_input!(input as DeriveInput);
    derive_lexer_impl(input)
        .unwrap_or_else(syn::Error::into_compile_error)
        .into()
}

fn derive_lexer_impl(item: DeriveInput) -> syn::Result<proc_macro2::TokenStream> {
    let e = match item.data {
        Data::Enum(e) => e,
        _ => return Err(syn::Error::new_spanned(item, "expected an enum")),
    };
    let vis = item.vis;
    let name = item.ident;

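    // Container-level #[lexer(skip = "...")] patterns (e.g. whitespace).
    // They are appended after the variant patterns so their pattern IDs fall
    // into the catch-all arm of the generated matcher, which discards them.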
    let mut skip_regexes = Vec::new();
    for a in item.attrs {
        if a.path().is_ident("lexer") {
            a.parse_nested_meta(|m| {
                if m.path.is_ident("skip") {
                    let r: LitStr = m.value()?.parse()?;
                    skip_regexes.push(r.value());
                    Ok(())
                } else {
                    Err(m.error("unsupported attribute"))
                }
            })?;
        }
    }

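    // One pattern per variant, in declaration order: `build_many` numbers
    // patterns in input order, so pattern ID i corresponds to variant i below.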
    let mut regexes = Vec::with_capacity(e.variants.len());
    let mut matches = Vec::new();
    for (i, v) in e.variants.iter().enumerate() {
        let vn = &v.ident;
        let i = i as u32;
        let mut more: Option<Ident> = None;
        for a in &v.attrs {
            if a.path().is_ident("lexer") {
                a.parse_nested_meta(|m| {
                    if m.path.is_ident("more") {
                        more = Some(m.value()?.parse()?);
                        Ok(())
                    } else {
                        Err(m.error("unsupported attribute"))
                    }
                })?;
            }
        }
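        // Optional #[lexer(more = callback)] hook: the generated arm passes the
        // DFA match and the rest of the input to the callback, which returns
        // how many extra bytes to consume, or None to signal a lex error.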
        let more = match more {
            Some(more) => quote! {
                len += match #more(&remaining[..len], &remaining[len..]) {
                    Some(extra) => extra,
                    None => return Some(Err(lexi_matic::Error(start))),
                };
            },
            None => quote!(),
        };
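        // Unit variants are constructed bare; variants with a field receive
        // the matched text via `.into()`.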
        let construct = if v.fields.is_empty() {
            quote!(#name::#vn)
        } else {
            quote!(#name::#vn((&remaining[..len]).into()))
        };
        matches.push(quote! {
            #i => {
                #more
                #construct
            }
        });

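        // Every variant needs exactly one #[regex("...")] or #[token("...")];
        // #[token] literals are escaped so they match verbatim.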
        let mut regex = None;
        for a in &v.attrs {
            let r = if a.path().is_ident("regex") {
                let x: LitStr = a.parse_args()?;
                x.value()
            } else if a.path().is_ident("token") {
                let x: LitStr = a.parse_args()?;
                regex_syntax::escape(&x.value())
            } else {
                continue;
            };
            if regex.is_none() {
                regex = Some(r);
            } else {
                return Err(syn::Error::new_spanned(
                    a,
                    "duplicate regex or token attribute",
                ));
            }
        }
        match regex {
            None => {
                return Err(syn::Error::new_spanned(
                    v,
                    "missing a regex or token attribute",
                ))
            }
            Some(r) => regexes.push(r),
        }
    }
    regexes.extend(skip_regexes);

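    // Build one dense DFA over all patterns at macro-expansion time. Anchored
    // starts make every search begin exactly at the current offset, and
    // MatchKind::All keeps every pattern's match states so the runtime search
    // can continue to the longest match.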
    let dfa = DFA::builder()
        .configure(
            DFA::config()
                .match_kind(MatchKind::All)
                .start_kind(StartKind::Anchored)
                .accelerate(false)
                .minimize(true),
        )
        .build_many(&regexes)
        .unwrap();
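    // Serialize the compiled DFA in both byte orders and strip the leading
    // padding each serializer reports; the generated cfg(target_endian) picks
    // whichever matches the target, so cross-compilation keeps working.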
    let (little_bytes, little_p) = dfa.to_bytes_little_endian();
    let (big_bytes, big_p) = dfa.to_bytes_big_endian();
    let little_bytes = &little_bytes[little_p..];
    let big_bytes = &big_bytes[big_p..];
    let ll = little_bytes.len();
    let bl = big_bytes.len();
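    // `DFA::from_bytes` requires the serialized table to be 4-byte aligned,
    // hence the Align4 wrapper in the generated static; deserialization
    // happens once per process via OnceLock.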
    let dfa = quote! {
        #[repr(C, align(4))]
        struct Align4<T>(T);
        #[cfg(target_endian = "little")]
        static __DFA_BYTES: &Align4<[u8; #ll]> = &Align4([ #(#little_bytes),* ]);
        #[cfg(target_endian = "big")]
        static __DFA_BYTES: &Align4<[u8; #bl]> = &Align4([ #(#big_bytes),* ]);
        static DFA: std::sync::OnceLock<lexi_matic::DFA<&[u32]>> = std::sync::OnceLock::new();
        let dfa = DFA.get_or_init(||
            lexi_matic::DFA::from_bytes(&__DFA_BYTES.0).unwrap().0
        );
    };

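    // If the enum declares generics, assume a single input lifetime and
    // instantiate it as `<'a>` in the generated impls.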
    let generics = if item.generics.lt_token.is_some() {
        quote!(<'a>)
    } else {
        quote!()
    };
    let iter_name = format_ident!("{name}Iterator");
    let lexer_impl = quote! {
        impl<'a> lexi_matic::Lexer<'a> for #name #generics {
            type Iterator = #iter_name<'a>;
            fn lex(input: &'a str) -> #iter_name<'a> {
                #iter_name {
                    input,
                    consumed: 0,
                }
            }
        }

        #vis struct #iter_name<'a> {
            pub input: &'a str,
            pub consumed: usize,
        }

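        // The iterator re-runs the DFA from `consumed`, yielding
        // (start, token, end) spans until the input is exhausted.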
        impl<'a> Iterator for #iter_name<'a> {
            type Item = Result<(usize, #name #generics, usize), lexi_matic::Error>;
            fn next(&mut self) -> Option<Self::Item> {
                #dfa

                loop {
                    let start = self.consumed;
                    let remaining = &self.input[start..];
                    if remaining.is_empty() {
                        return None;
                    }

                    let (pat, mut len) = match lexi_matic::dfa_search_next(dfa, remaining) {
                        Some(t) => t,
                        None => return Some(Err(lexi_matic::Error(start))),
                    };
                    let t = match pat.as_u32() {
                        #(#matches)*
                        _ => {
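                            // Pattern IDs past the last variant belong to the
                            // skip regexes: consume the match and keep going.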
                            self.consumed += len;
                            continue;
                        }
                    };
                    self.consumed += len;
                    return Some(Ok((start, t, start + len)));
                }
            }
        }
    };

    Ok(lexer_impl)
}