unfmt_macros/
unfmt_macros.rs

1#![no_std]
2
3extern crate alloc;
4extern crate proc_macro;
5
6use alloc::{borrow::ToOwned, format, vec::Vec};
7
8use bstr::ByteSlice;
9use proc_macro::TokenStream;
10use proc_macro2::Span;
11use quote::{quote, ToTokens};
12use syn::{
13    parse::{Parse, ParseStream, Result},
14    parse_macro_input, parse_str, Expr, Ident, Lit, LitBool, LitByteStr, Token, TypePath,
15};
16
17struct Unformat {
18    pattern: Vec<u8>,
19    text: Expr,
20    is_pattern_str: bool,
21    full_match: bool,
22}
23
24impl Parse for Unformat {
25    fn parse(input: ParseStream) -> Result<Self> {
26        #[allow(clippy::wildcard_enum_match_arm)]
27        let (pattern, is_pattern_str) = match input.parse::<Lit>()? {
28            Lit::Str(str) => (str.value().into_bytes(), true),
29            Lit::ByteStr(byte_str) => (byte_str.value(), false),
30            _ => return Err(input.error("expected a string literal")),
31        };
32
33        input.parse::<Token![,]>()?;
34
35        let text = input.parse::<Expr>()?;
36
37        let full_match = if input.parse::<Token![,]>().is_ok() {
38            input.parse::<LitBool>().map_or(false, |bool| bool.value)
39        } else {
40            false
41        };
42        Ok(Self {
43            pattern,
44            text,
45            is_pattern_str,
46            full_match,
47        })
48    }
49}
50
51enum Assignee {
52    Index(u32),
53    Variable(Ident),
54}
55
56impl Assignee {
57    fn new(variable: &str, index: &mut u32) -> Self {
58        variable.parse::<u32>().map_or_else(
59            |_| {
60                if variable.is_empty() {
61                    let tuple_index = *index;
62                    *index = index.saturating_add(1);
63                    Self::Index(tuple_index)
64                } else {
65                    Self::Variable(parse_str(variable).expect("invalid variable name"))
66                }
67            },
68            Self::Index,
69        )
70    }
71}
72
73enum CaptureTypePath {
74    Str,
75    Bytes,
76    Typed(TypePath),
77}
78
79impl CaptureTypePath {
80    fn new(type_path: &str, is_pattern_str: bool) -> Self {
81        if type_path.is_empty() {
82            if is_pattern_str {
83                Self::Str
84            } else {
85                Self::Bytes
86            }
87        } else if type_path == "&str" {
88            Self::Str
89        } else if type_path == "&[u8]" {
90            Self::Bytes
91        } else {
92            Self::Typed(parse_str(type_path).expect("invalid type path"))
93        }
94    }
95}
96
97impl ToTokens for CaptureTypePath {
98    fn to_tokens(&self, tokens: &mut proc_macro2::TokenStream) {
99        tokens.extend(match *self {
100            Self::Str => {
101                quote! { &str }
102            }
103            Self::Bytes => {
104                quote! { &[u8] }
105            }
106            Self::Typed(ref type_path) => {
107                quote! { #type_path }
108            }
109        });
110    }
111}
112
113struct Capture {
114    text: Vec<u8>,
115    assignee: Assignee,
116    r#type: CaptureTypePath,
117}
118
119impl Capture {
120    fn new(text: &[u8], capture: &str, is_pattern_str: bool, index: &mut u32) -> Self {
121        let (variable, type_path) = capture.split_once(':').unwrap_or((capture, ""));
122        Self {
123            text: text.to_vec(),
124            assignee: Assignee::new(variable, index),
125            r#type: CaptureTypePath::new(type_path, is_pattern_str),
126        }
127    }
128}
129
130impl ToTokens for Capture {
131    fn to_tokens(&self, tokens: &mut proc_macro2::TokenStream) {
132        let rhs = match self.r#type {
133            CaptureTypePath::Str => {
134                quote! {
135                    if let Ok(__unfmt_left) = __unfmt_left.to_str() {
136                        __unfmt_left
137                    } else {
138                        break 'unformat None;
139                    }
140                }
141            }
142            CaptureTypePath::Bytes => {
143                quote! { __unfmt_left }
144            }
145            CaptureTypePath::Typed(ref type_path) => {
146                quote! {
147                    if let Ok(Ok(__unfmt_left)) = __unfmt_left.to_str().map(|value| value.parse::<#type_path>()) {
148                        __unfmt_left
149                    } else {
150                        break 'unformat None;
151                    }
152                }
153            }
154        };
155        let assignment = match self.assignee {
156            Assignee::Index(ref index) => {
157                let ident = Ident::new(&format!("__unfmt_capture_{index}"), Span::call_site());
158                quote! { let #ident = #rhs }
159            }
160            Assignee::Variable(ref ident) => {
161                quote! { #ident = Some(#rhs) }
162            }
163        };
164        let text = LitByteStr::new(&self.text, Span::call_site());
165
166        // If text is empty, `find` will return `Some(0)` and the capture will
167        // be at the end of the pattern, so this capture (`__unfmt_left`) would
168        // be empty. Since captures are inherently .*? in regex, this capture
169        // should consume the remainder of the text, so we swap `__unfmt_left`
170        // and `__unfmt_right` to achieve this.
171        tokens.extend(if self.text.is_empty() {
172            quote! { let (__unfmt_left, __unfmt_right) = (__unfmt_byte_text, b""); }
173        } else {
174            quote! {
175                let Some((__unfmt_left, __unfmt_right)) = __unfmt_byte_text.split_once_str(#text) else {
176                    break 'unformat None;
177                };
178            }
179        });
180
181        tokens.extend(quote! {
182            #assignment;
183            __unfmt_byte_text = BStr::new(__unfmt_right);
184        });
185    }
186}
187
188/// Basic implementation of reversing the `format!` process. Matches a given
189/// text against a given pattern, returning any captures.
190///
191/// Rules:
192///
193///  - Patterns are substring matched.
194///  - Captures are written as `{<index-or-var>?(:<type>)?}` in the pattern.
195///  - Captures are similar to `(.*?)` in regex, but without backtracking.
196///  - Sequential captures (e.g. `{}{}`) are not supported and will return
197///    `None`.
198///
199/// # Panics
200///
201/// This function panics if the pattern is invalid. This includes:
202///
203///  - Consecutive captures.
204///  - Unmatched `}` in the pattern.
205///  - Invalid UTF-8 in capture names.
206///
207#[proc_macro]
208pub fn unformat(input: TokenStream) -> TokenStream {
209    let Unformat {
210        pattern,
211        text,
212        is_pattern_str,
213        full_match,
214    } = parse_macro_input!(input as Unformat);
215
216    let (initial_part, captures) = compile(&pattern, is_pattern_str);
217    let initial_part = Lit::ByteStr(LitByteStr::new(&initial_part, Span::call_site()));
218
219    let capture_idents = {
220        let mut capture_indices = captures
221            .iter()
222            .filter_map(|capture| match capture.assignee {
223                Assignee::Index(capture_index) => Some(capture_index),
224                Assignee::Variable(..) => None,
225            })
226            .collect::<Vec<_>>();
227
228        capture_indices.sort_by(|&index_a, &index_b| index_a.cmp(&index_b));
229
230        capture_indices
231            .into_iter()
232            .map(|index| Ident::new(&format!("__unfmt_capture_{index}"), Span::call_site()))
233            .collect::<Vec<_>>()
234    };
235
236    let capture_block = if full_match {
237        quote! {
238            if !__unfmt_left.is_empty() {
239                break 'unformat None;
240            }
241            #(#captures)*
242            if !__unfmt_byte_text.is_empty() {
243                break 'unformat None;
244            }
245        }
246    } else {
247        quote! { #(#captures)* }
248    };
249
250    TokenStream::from(quote! {
251        'unformat: {
252            use ::core::str::FromStr;
253            use ::unfmt::bstr::{ByteSlice, BStr};
254            let Some((__unfmt_left, mut __unfmt_byte_text)) = BStr::new(#text).split_once_str(#initial_part) else {
255                break 'unformat None;
256            };
257            #capture_block
258            Some((#(#capture_idents),*))
259        }
260    })
261}
262
263fn compile(pattern: &[u8], is_pattern_str: bool) -> (Vec<u8>, Vec<Capture>) {
264    let mut pattern = pattern.replace(b"{{", "\u{f8fd}");
265    pattern.reverse();
266    let mut pattern = pattern.replace(b"}}", "\u{f8fe}");
267    pattern.reverse();
268
269    let mut pattern_parts = pattern.split_str("{");
270
271    // SAFETY: The first part is always present.
272    let initial_part = unsafe {
273        pattern_parts
274            .next()
275            .unwrap_unchecked()
276            .replace("\u{f8fd}", "{")
277    };
278
279    let mut current_index: u32 = 0;
280    let mut compiled_pattern = Vec::new();
281    for pattern_part in pattern_parts {
282        let (capture, text) = pattern_part
283            .split_once_str("}")
284            .expect("unmatched } in pattern");
285        let capture = capture
286            .to_str()
287            .expect("invalid UTF-8 in capture names")
288            .to_owned();
289        let mut text = text.replace("\u{f8fd}", b"{");
290        text.reverse();
291        let mut text = text.replace("\u{f8fe}", b"}");
292        text.reverse();
293        compiled_pattern.push(Capture::new(
294            &text,
295            &capture,
296            is_pattern_str,
297            &mut current_index,
298        ));
299    }
300
301    assert!(
302        compiled_pattern.windows(2).all(|parts| parts
303            .iter()
304            .any(|&Capture { ref text, .. }| !text.is_empty())),
305        "consecutive captures are not allowed"
306    );
307
308    (initial_part, compiled_pattern)
309}