Skip to main content

bibtex_parser/parser/
simd.rs

1//! SIMD-accelerated parsing utilities for BibTeX
2//!
3//! This module provides SIMD-optimized functions for common parsing operations
4//! like brace balancing and quote scanning, achieving 30-50% performance gains.
5
6use memchr::{memchr, memchr2};
7
8/// Find balanced braces using SIMD acceleration
9///
10/// This function scans for matching braces, handling:
11/// - Nested braces
12/// - Escaped characters (\{, \})
13/// - Efficient SIMD scanning for delimiters
14#[inline]
15#[must_use]
16pub fn find_balanced_braces(input: &[u8]) -> Option<usize> {
17    if input.is_empty() || input[0] != b'{' {
18        return None;
19    }
20
21    let mut depth = 1;
22    let mut pos = 1;
23
24    // Find braces directly and only inspect backslashes when a brace may close
25    // or change nesting.
26    while pos < input.len() {
27        if let Some(offset) = memchr2(b'{', b'}', &input[pos..]) {
28            let idx = pos + offset;
29            if is_escaped_delimiter(input, idx) {
30                pos = idx + 1;
31                continue;
32            }
33
34            match input[idx] {
35                b'{' => {
36                    depth += 1;
37                    pos = idx + 1;
38                }
39                b'}' => {
40                    depth -= 1;
41                    if depth == 0 {
42                        return Some(idx + 1); // Return position after closing brace
43                    }
44                    pos = idx + 1;
45                }
46                _ => unreachable!(),
47            }
48        } else {
49            // No more delimiters found, unbalanced
50            return None;
51        }
52    }
53
54    None // Unbalanced
55}
56
57/// Find balanced quotes using SIMD acceleration
58///
59/// This function scans for the closing quote, handling:
60/// - Escaped quotes (\")
61/// - Efficient SIMD scanning
62#[inline]
63#[must_use]
64pub fn find_balanced_quotes(input: &[u8]) -> Option<usize> {
65    if input.is_empty() || input[0] != b'"' {
66        return None;
67    }
68
69    let mut pos = 1;
70
71    // Find quotes directly and only check the backslash run immediately before
72    // each quote. This avoids visiting every LaTeX command backslash.
73    while pos < input.len() {
74        let offset = memchr(b'"', &input[pos..])?;
75        let idx = pos + offset;
76        if is_escaped_delimiter(input, idx) {
77            pos = idx + 1;
78            continue;
79        }
80        return Some(idx + 1); // Return position after closing quote
81    }
82
83    None // Unclosed quote
84}
85
86/// Find balanced parentheses using SIMD acceleration
87#[inline]
88#[must_use]
89pub fn find_balanced_parentheses(input: &[u8]) -> Option<usize> {
90    if input.is_empty() || input[0] != b'(' {
91        return None;
92    }
93
94    let mut depth = 1;
95    let mut pos = 1;
96
97    // Use SIMD to find next delimiter: ( or )
98    while pos < input.len() {
99        // Find next interesting character using SIMD
100        if let Some(offset) = memchr::memchr2(b'(', b')', &input[pos..]) {
101            let idx = pos + offset;
102
103            match input[idx] {
104                b'(' => {
105                    depth += 1;
106                    pos = idx + 1;
107                }
108                b')' => {
109                    depth -= 1;
110                    if depth == 0 {
111                        return Some(idx + 1); // Return position after closing paren
112                    }
113                    pos = idx + 1;
114                }
115                _ => unreachable!(),
116            }
117        } else {
118            // No more delimiters found, unbalanced
119            return None;
120        }
121    }
122
123    None // Unbalanced
124}
125
126#[inline]
127fn is_escaped_delimiter(input: &[u8], delimiter: usize) -> bool {
128    if delimiter == 0 || input[delimiter - 1] != b'\\' {
129        return false;
130    }
131
132    let mut slash_count = 0usize;
133    let mut pos = delimiter;
134
135    while pos > 0 && input[pos - 1] == b'\\' {
136        slash_count += 1;
137        pos -= 1;
138    }
139
140    slash_count % 2 == 1
141}
142
143/// Fast scan to find entry start (@)
144#[inline]
145#[must_use]
146pub fn find_entry_start(input: &[u8]) -> Option<usize> {
147    memchr::memchr(b'@', input)
148}
149
150/// Fast scan for field separator (=)
151#[inline]
152#[must_use]
153pub fn find_field_separator(input: &[u8]) -> Option<usize> {
154    memchr::memchr(b'=', input)
155}
156
157/// Fast scan for next comma or closing delimiter
158#[inline]
159#[must_use]
160pub fn find_field_end(input: &[u8]) -> Option<usize> {
161    memchr::memchr3(b',', b'}', b')', input)
162}
163
164/// SIMD-accelerated identifier scanning using lookup table
165/// Returns the length of the identifier (alphanumeric, _, -, :, .)
166#[inline]
167#[must_use]
168pub fn scan_identifier(input: &[u8]) -> usize {
169    let mut pos = 0;
170    let len = input.len();
171
172    // Unroll by 4 for better performance
173    while pos + 4 <= len {
174        // Check 4 bytes at once
175        if !is_identifier_byte(input[pos]) {
176            return pos;
177        }
178        if !is_identifier_byte(input[pos + 1]) {
179            return pos + 1;
180        }
181        if !is_identifier_byte(input[pos + 2]) {
182            return pos + 2;
183        }
184        if !is_identifier_byte(input[pos + 3]) {
185            return pos + 3;
186        }
187        pos += 4;
188    }
189
190    // Handle remaining bytes
191    while pos < len && is_identifier_byte(input[pos]) {
192        pos += 1;
193    }
194
195    pos
196}
197
198#[inline]
199const fn is_identifier_byte(byte: u8) -> bool {
200    IDENT_TABLE[byte as usize] == 1
201}
202
203const IDENT_TABLE: [u8; 256] = {
204    let mut table = [0u8; 256];
205    let mut byte = b'0';
206    while byte <= b'9' {
207        table[byte as usize] = 1;
208        byte += 1;
209    }
210    byte = b'A';
211    while byte <= b'Z' {
212        table[byte as usize] = 1;
213        byte += 1;
214    }
215    byte = b'a';
216    while byte <= b'z' {
217        table[byte as usize] = 1;
218        byte += 1;
219    }
220    table[b'_' as usize] = 1;
221    table[b'-' as usize] = 1;
222    table[b':' as usize] = 1;
223    table[b'.' as usize] = 1;
224    table
225};
226
227/// Lookup table for whitespace characters
228const WS_TABLE: [u8; 256] = {
229    let mut table = [0u8; 256];
230    table[b' ' as usize] = 1;
231    table[b'\t' as usize] = 1;
232    table[b'\n' as usize] = 1;
233    table[b'\r' as usize] = 1;
234    table
235};
236
237/// SIMD-accelerated whitespace scanning
238/// Returns the length of the whitespace sequence
239#[inline]
240#[must_use]
241pub const fn scan_whitespace(input: &[u8]) -> usize {
242    let mut pos = 0;
243    let len = input.len();
244
245    // Unroll by 4 for better performance
246    while pos + 4 <= len {
247        // Check 4 bytes at once
248        if WS_TABLE[input[pos] as usize] == 0 {
249            return pos;
250        }
251        if WS_TABLE[input[pos + 1] as usize] == 0 {
252            return pos + 1;
253        }
254        if WS_TABLE[input[pos + 2] as usize] == 0 {
255            return pos + 2;
256        }
257        if WS_TABLE[input[pos + 3] as usize] == 0 {
258            return pos + 3;
259        }
260        pos += 4;
261    }
262
263    // Handle remaining bytes
264    while pos < len && WS_TABLE[input[pos] as usize] == 1 {
265        pos += 1;
266    }
267
268    pos
269}
270
271#[cfg(test)]
272mod tests {
273    use super::*;
274
275    #[test]
276    fn test_balanced_braces() {
277        assert_eq!(find_balanced_braces(b"{}"), Some(2));
278        assert_eq!(find_balanced_braces(b"{hello}"), Some(7));
279        assert_eq!(find_balanced_braces(b"{nested {braces}}"), Some(17));
280        assert_eq!(find_balanced_braces(b"{escaped \\} brace}"), Some(18));
281        assert_eq!(find_balanced_braces(b"{unclosed"), None);
282        assert_eq!(find_balanced_braces(b""), None);
283        assert_eq!(find_balanced_braces(b"not starting with brace"), None);
284    }
285
286    #[test]
287    fn test_balanced_quotes() {
288        assert_eq!(find_balanced_quotes(b"\"\""), Some(2));
289        assert_eq!(find_balanced_quotes(b"\"hello\""), Some(7));
290        assert_eq!(find_balanced_quotes(b"\"escaped \\\" quote\""), Some(18));
291        assert_eq!(find_balanced_quotes(b"\"unclosed"), None);
292        assert_eq!(find_balanced_quotes(b""), None);
293        assert_eq!(find_balanced_quotes(b"not starting with quote"), None);
294    }
295
296    #[test]
297    fn test_balanced_parentheses() {
298        assert_eq!(find_balanced_parentheses(b"()"), Some(2));
299        assert_eq!(find_balanced_parentheses(b"(hello)"), Some(7));
300        assert_eq!(find_balanced_parentheses(b"(nested (parens))"), Some(17));
301        assert_eq!(find_balanced_parentheses(b"(unclosed"), None);
302    }
303}