// bibtex_parser/parser/delimiter.rs

//! Optimized delimiter finding using memchr

3/// Find the next BibTeX delimiter (@, {, }, =, ,) using memchr
4/// Uses two passes but returns the earliest delimiter found
5#[inline]
6#[must_use]
7pub fn find_delimiter(haystack: &[u8], start: usize) -> Option<(usize, u8)> {
8    if start >= haystack.len() {
9        return None;
10    }
11
12    let search_bytes = &haystack[start..];
13
14    // First pass: most common delimiters {, }, , (based on profiling)
15    let result1 =
16        memchr::memchr3(b'{', b'}', b',', search_bytes).map(|pos| (start + pos, search_bytes[pos]));
17
18    // Third pass: parenthesis delimiters (, )
19    let result3 =
20        memchr::memchr2(b'(', b')', search_bytes).map(|pos| (start + pos, search_bytes[pos]));
21
22    // Second pass: less common delimiters @, =
23    let result2 =
24        memchr::memchr2(b'@', b'=', search_bytes).map(|pos| (start + pos, search_bytes[pos]));
25
26    // Return whichever delimiter appears first
27    let mut earliest = None;
28
29    for (pos, byte) in [result1, result2, result3].into_iter().flatten() {
30        match earliest {
31            Some((earliest_pos, _)) if pos < earliest_pos => earliest = Some((pos, byte)),
32            None => earliest = Some((pos, byte)),
33            _ => {}
34        }
35    }
36
37    earliest
38}
39
40/// Find brace or backslash for balanced brace parsing
41#[inline]
42#[must_use]
43pub fn find_brace_delimiter(haystack: &[u8], start: usize) -> Option<(usize, u8)> {
44    if start >= haystack.len() {
45        return None;
46    }
47
48    memchr::memchr3(b'{', b'}', b'\\', &haystack[start..])
49        .map(|pos| (start + pos, haystack[start + pos]))
50}
51
52/// Find delimiters in quoted strings (\, ", {, })
53#[inline]
54#[must_use]
55pub fn find_quote_delimiter(haystack: &[u8], start: usize) -> Option<(usize, u8)> {
56    if start >= haystack.len() {
57        return None;
58    }
59
60    let search_bytes = &haystack[start..];
61
62    // Search for \, ", { (most common in quoted strings)
63    let result1 = memchr::memchr3(b'\\', b'"', b'{', search_bytes)
64        .map(|pos| (start + pos, search_bytes[pos]));
65
66    // Also need to check for } when inside braces
67    let result2 = memchr::memchr(b'}', search_bytes).map(|pos| (start + pos, b'}'));
68
69    // Return whichever delimiter appears first
70    match (result1, result2) {
71        (Some((pos1, byte1)), Some((pos2, _))) => {
72            if pos1 <= pos2 {
73                Some((pos1, byte1))
74            } else {
75                Some((pos2, b'}'))
76            }
77        }
78        (Some(r), None) | (None, Some(r)) => Some(r),
79        (None, None) => None,
80    }
81}
82
83/// Find a single specific delimiter
84#[must_use]
85pub fn find_byte(haystack: &[u8], needle: u8, start: usize) -> Option<usize> {
86    if start >= haystack.len() {
87        return None;
88    }
89
90    memchr::memchr(needle, &haystack[start..]).map(|pos| start + pos)
91}
92
93/// Find any of 2 delimiters
94#[must_use]
95pub fn find_bytes2(haystack: &[u8], needle1: u8, needle2: u8, start: usize) -> Option<(usize, u8)> {
96    if start >= haystack.len() {
97        return None;
98    }
99
100    memchr::memchr2(needle1, needle2, &haystack[start..])
101        .map(|pos| (start + pos, haystack[start + pos]))
102}
103
104/// Find any of 3 delimiters
105#[must_use]
106pub fn find_bytes3(
107    haystack: &[u8],
108    needle1: u8,
109    needle2: u8,
110    needle3: u8,
111    start: usize,
112) -> Option<(usize, u8)> {
113    if start >= haystack.len() {
114        return None;
115    }
116
117    memchr::memchr3(needle1, needle2, needle3, &haystack[start..])
118        .map(|pos| (start + pos, haystack[start + pos]))
119}
120
#[cfg(test)]
mod tests {
    use super::*;

    /// Walks one input left to right, hitting `@`, `{`, `}`, `=` and `,`.
    #[test]
    fn test_find_delimiter() {
        let input = b"hello @ world { test } = value, end";

        assert_eq!(find_delimiter(input, 0), Some((6, b'@')));
        assert_eq!(find_delimiter(input, 7), Some((14, b'{')));
        assert_eq!(find_delimiter(input, 15), Some((21, b'}')));
        assert_eq!(find_delimiter(input, 22), Some((23, b'=')));
        assert_eq!(find_delimiter(input, 24), Some((30, b',')));
        assert_eq!(find_delimiter(input, 31), None);
    }

    /// Parentheses are reported as delimiters too.
    #[test]
    fn test_find_delimiter_with_parentheses() {
        let input = b"hello ( world ) test @ end";

        assert_eq!(find_delimiter(input, 0), Some((6, b'(')));
        assert_eq!(find_delimiter(input, 7), Some((14, b')')));
        assert_eq!(find_delimiter(input, 15), Some((21, b'@')));
        assert_eq!(find_delimiter(input, 22), None);
    }

    /// Brace search and single-byte search on the same input.
    #[test]
    fn test_specialized_searches() {
        let input = b"test {nested} string";

        assert_eq!(find_brace_delimiter(input, 0), Some((5, b'{')));
        assert_eq!(find_brace_delimiter(input, 6), Some((12, b'}')));
        assert_eq!(find_byte(input, b'}', 0), Some(12));
        assert_eq!(find_byte(input, b'}', 13), None);
    }

    /// All four quoted-string delimiters, including a `}` that precedes the
    /// next `"`.
    #[test]
    fn test_find_quote_delimiter() {
        let input = b"say \"{hi}\" \\ end";

        assert_eq!(find_quote_delimiter(input, 0), Some((4, b'"')));
        assert_eq!(find_quote_delimiter(input, 5), Some((5, b'{')));
        assert_eq!(find_quote_delimiter(input, 6), Some((8, b'}')));
        assert_eq!(find_quote_delimiter(input, 9), Some((9, b'"')));
        assert_eq!(find_quote_delimiter(input, 10), Some((11, b'\\')));
        assert_eq!(find_quote_delimiter(input, 12), None);
    }

    /// The generic two- and three-needle helpers return position and byte.
    #[test]
    fn test_find_bytes2_and_bytes3() {
        let input = b"key = value, next";

        assert_eq!(find_bytes2(input, b'=', b',', 0), Some((4, b'=')));
        assert_eq!(find_bytes2(input, b'=', b',', 5), Some((11, b',')));
        assert_eq!(find_bytes2(input, b'=', b',', 12), None);

        assert_eq!(find_bytes3(input, b'=', b',', b'k', 0), Some((0, b'k')));
        assert_eq!(find_bytes3(input, b'=', b',', b'k', 1), Some((4, b'=')));
    }

    /// A `start` at or past the end of the input never matches.
    #[test]
    fn test_start_out_of_range() {
        let input = b"{}";

        for start in [2, 3, 100] {
            assert_eq!(find_delimiter(input, start), None);
            assert_eq!(find_brace_delimiter(input, start), None);
            assert_eq!(find_quote_delimiter(input, start), None);
            assert_eq!(find_byte(input, b'{', start), None);
            assert_eq!(find_bytes2(input, b'{', b'}', start), None);
            assert_eq!(find_bytes3(input, b'{', b'}', b',', start), None);
        }
    }
}