Skip to main content

bibtex_parser/parser/
lexer.rs

1//! Lexical analysis for BibTeX
2
3use super::{delimiter, PResult};
4use memchr;
5use winnow::prelude::*;
6use winnow::{
7    ascii::digit1,
8    combinator::{alt, opt},
9};
10
11/// Parse an identifier (letters, numbers, underscores, hyphens, colons)
12#[inline]
13pub fn identifier<'a>(input: &mut &'a str) -> PResult<'a, &'a str> {
14    let bytes = input.as_bytes();
15    let len = super::simd::scan_identifier(bytes);
16
17    if len == 0 {
18        return super::backtrack();
19    }
20
21    let result = &input[..len];
22    *input = &input[len..];
23    Ok(result)
24}
25
26/// Parse a field name (same as identifier but typically lowercase)
27#[inline]
28pub fn field_name<'a>(input: &mut &'a str) -> PResult<'a, &'a str> {
29    identifier(input)
30}
31
32/// Parse balanced braces { ... } with SIMD acceleration
33#[inline]
34pub fn balanced_braces<'a>(input: &mut &'a str) -> PResult<'a, &'a str> {
35    let original_input = *input;
36    let bytes = input.as_bytes();
37    let mut depth = 0;
38    let mut pos = 0;
39
40    // Use SIMD to find delimiters
41    while pos < bytes.len() {
42        // Find next delimiter using SIMD
43        if let Some(offset) = memchr::memchr3(b'{', b'}', b'\\', &bytes[pos..]) {
44            let idx = pos + offset;
45
46            // Include content up to delimiter
47            match bytes[idx] {
48                b'{' => {
49                    depth += 1;
50                    pos = idx + 1;
51                }
52                b'}' => {
53                    if depth == 0 {
54                        let result = &original_input[..idx];
55                        *input = &input[idx..];
56                        return Ok(result);
57                    }
58                    depth -= 1;
59                    pos = idx + 1;
60                }
61                b'\\' => {
62                    // Skip escaped character
63                    pos = idx + 2;
64                }
65                _ => unreachable!(),
66            }
67        } else {
68            // No more delimiters found
69            break;
70        }
71    }
72
73    super::backtrack()
74}
75
76/// Parse a quoted string "..." with SIMD acceleration
77#[inline]
78pub fn quoted_string<'a>(input: &mut &'a str) -> PResult<'a, &'a str> {
79    let bytes = input.as_bytes();
80
81    // Use SIMD-accelerated quote scanning
82    super::simd::find_balanced_quotes(bytes).map_or_else(super::backtrack, |end_pos| {
83        // Extract the content (without the quotes)
84        let result = &input[1..end_pos - 1];
85        *input = &input[end_pos..];
86        Ok(result)
87    })
88}
89
90/// Parse a number (integer)
91#[inline]
92pub fn number<'a>(input: &mut &'a str) -> PResult<'a, i64> {
93    let sign = opt(alt(('+', '-'))).parse_next(input)?;
94    let digits = digit1.parse_next(input)?;
95
96    let mut num = digits.parse::<i64>().map_err(|_| super::backtrack_err())?;
97
98    if sign == Some('-') {
99        num = -num;
100    }
101
102    Ok(num)
103}
104
105/// Parse balanced parentheses ( ... ) with SIMD acceleration
106#[inline]
107pub fn balanced_parentheses<'a>(input: &mut &'a str) -> PResult<'a, &'a str> {
108    let original_input = *input;
109    let bytes = input.as_bytes();
110    let mut depth = 0;
111    let mut pos = 0;
112
113    // Use SIMD to find delimiters
114    while pos < bytes.len() {
115        // Find next delimiter using SIMD
116        if let Some(offset) = memchr::memchr2(b'(', b')', &bytes[pos..]) {
117            let idx = pos + offset;
118
119            match bytes[idx] {
120                b'(' => {
121                    depth += 1;
122                    pos = idx + 1;
123                }
124                b')' => {
125                    if depth == 0 {
126                        let result = &original_input[..idx];
127                        *input = &input[idx..];
128                        return Ok(result);
129                    }
130                    depth -= 1;
131                    pos = idx + 1;
132                }
133                _ => unreachable!(),
134            }
135        } else {
136            // No more delimiters found
137            break;
138        }
139    }
140
141    super::backtrack()
142}
143
/// Advance `input` past any leading spaces, tabs, line feeds and carriage
/// returns.
///
/// Only these four ASCII bytes count as whitespace; other Unicode whitespace
/// is left in place. The scan is a plain byte-by-byte walk.
#[inline]
pub fn skip_whitespace(input: &mut &str) {
    let blanks = input
        .bytes()
        .take_while(|&byte| matches!(byte, b' ' | b'\t' | b'\n' | b'\r'))
        .count();

    *input = &input[blanks..];
}
159
/// Skip leading whitespace and report the first byte that follows it.
///
/// Whitespace means space, tab, line feed or carriage return. When a
/// non-whitespace byte is found, `input` is advanced so that it starts at
/// that byte and the byte is returned (it is not consumed). When the input
/// is empty or consists solely of whitespace, `input` becomes the empty string
/// and `None` is returned.
#[inline]
pub(crate) fn skip_whitespace_peek(input: &mut &str) -> Option<u8> {
    let first_non_blank = input
        .bytes()
        .enumerate()
        .find(|&(_, byte)| !matches!(byte, b' ' | b'\t' | b'\n' | b'\r'));

    if let Some((offset, byte)) = first_non_blank {
        *input = &input[offset..];
        Some(byte)
    } else {
        *input = "";
        None
    }
}
178
179/// Fast scan to next BibTeX delimiter - re-export from delimiter module
180#[must_use]
181pub fn scan_to_bibtex_delimiter(haystack: &[u8], start: usize) -> Option<(usize, u8)> {
182    delimiter::find_delimiter(haystack, start)
183}
184
/// Unit tests for the lexer primitives.
///
/// Each test feeds a parser a short input followed by trailing text and
/// checks both the returned value and what is left in the input.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_identifier() {
        let mut input = "hello-world_123:test.com xxx";
        let result = identifier(&mut input).unwrap();
        assert_eq!(result, "hello-world_123:test.com");
        assert_eq!(input, " xxx");
    }

    #[test]
    fn test_balanced_braces() {
        let mut input = "hello {nested {braces}} world} xxx";
        let result = balanced_braces(&mut input).unwrap();
        assert_eq!(result, "hello {nested {braces}} world");
        // The closing brace itself is left for the caller to consume.
        assert_eq!(input, "} xxx");
    }

    #[test]
    fn test_balanced_braces_with_spaces() {
        let mut input = "Second preamble} xxx";
        let result = balanced_braces(&mut input).unwrap();
        assert_eq!(result, "Second preamble");
        assert_eq!(input, "} xxx");
    }

    #[test]
    fn test_balanced_braces_skips_escaped_brace() {
        // The `\}` must not be taken as the closing brace.
        let mut input = r"a \} b} xxx";
        let result = balanced_braces(&mut input).unwrap();
        assert_eq!(result, r"a \} b");
        assert_eq!(input, "} xxx");
    }

    #[test]
    fn test_balanced_parentheses() {
        let mut input = "hello (nested (parens)) world) xxx";
        let result = balanced_parentheses(&mut input).unwrap();
        assert_eq!(result, "hello (nested (parens)) world");
        assert_eq!(input, ") xxx");
    }

    #[test]
    fn test_quoted_string() {
        let mut input = r#""hello \"world\"" xxx"#;
        let result = quoted_string(&mut input).unwrap();
        assert_eq!(result, r#"hello \"world\""#);
        assert_eq!(input, " xxx");

        // Test with nested braces
        let mut input = r#""hello {world}" xxx"#;
        let result = quoted_string(&mut input).unwrap();
        assert_eq!(result, "hello {world}");
        assert_eq!(input, " xxx");
    }

    #[test]
    fn test_number() {
        let mut input = "42 xxx";
        assert_eq!(number(&mut input).unwrap(), 42);
        assert_eq!(input, " xxx");

        let mut input = "-42 xxx";
        assert_eq!(number(&mut input).unwrap(), -42);
        assert_eq!(input, " xxx");

        let mut input = "+42 xxx";
        assert_eq!(number(&mut input).unwrap(), 42);
        assert_eq!(input, " xxx");
    }

    #[test]
    fn test_skip_whitespace() {
        let mut input = " \t\r\n  abc def ";
        skip_whitespace(&mut input);
        assert_eq!(input, "abc def ");

        // Nothing to skip: the input is left as is.
        let mut input = "abc";
        skip_whitespace(&mut input);
        assert_eq!(input, "abc");

        let mut input = "";
        skip_whitespace(&mut input);
        assert_eq!(input, "");
    }

    #[test]
    fn test_skip_whitespace_peek() {
        let mut input = "  \n\t{abc";
        assert_eq!(skip_whitespace_peek(&mut input), Some(b'{'));
        // The peeked byte is not consumed.
        assert_eq!(input, "{abc");

        let mut input = " \r\n ";
        assert_eq!(skip_whitespace_peek(&mut input), None);
        assert_eq!(input, "");

        let mut input = "";
        assert_eq!(skip_whitespace_peek(&mut input), None);
        assert_eq!(input, "");
    }

    #[test]
    fn test_scan_to_bibtex_delimiter() {
        let input = b"hello @ world { test } = value, end";

        assert_eq!(scan_to_bibtex_delimiter(input, 0), Some((6, b'@')));
        assert_eq!(scan_to_bibtex_delimiter(input, 7), Some((14, b'{')));
        assert_eq!(scan_to_bibtex_delimiter(input, 15), Some((21, b'}')));
        assert_eq!(scan_to_bibtex_delimiter(input, 22), Some((23, b'=')));
        assert_eq!(scan_to_bibtex_delimiter(input, 24), Some((30, b',')));
        assert_eq!(scan_to_bibtex_delimiter(input, 31), None);
    }
}