Skip to main content

bibtex_parser/parser/
value.rs

1//! Value parsing for BibTeX fields
2
3use super::{lexer, PResult};
4use crate::model::Value;
5use std::borrow::Cow;
6
7/// Parse a BibTeX value (string, number, variable, or concatenation)
8#[inline]
9pub fn parse_value<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
10    parse_concatenated_value(input)
11}
12
13/// Parse a field value and consume trailing ASCII whitespace.
14///
15/// This variant is used by entry parsing so the field loop can read the
16/// delimiter directly without re-scanning whitespace.
17#[inline]
18pub(crate) fn parse_value_field<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
19    parse_concatenated_value_field(input)
20}
21
22/// Parse a concatenated value (value # value # ...)
23#[inline]
24fn parse_concatenated_value<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
25    let first = parse_single_value(input)?;
26
27    // Fast path: most fields are a single value with no concatenation.
28    if !consume_concat_separator(input) {
29        return Ok(first);
30    }
31
32    // Slow path: parse one or more `# value` segments.
33    let mut parts = Vec::with_capacity(3);
34    parts.push(first);
35
36    loop {
37        let part = parse_single_value(input)?;
38        parts.push(part);
39
40        if !consume_concat_separator(input) {
41            break;
42        }
43    }
44
45    Ok(Value::Concat(parts.into_boxed_slice()))
46}
47
48/// Parse a concatenated value and consume trailing ASCII whitespace.
49#[inline]
50fn parse_concatenated_value_field<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
51    let first = parse_single_value(input)?;
52
53    if !consume_concat_separator_field(input) {
54        return Ok(first);
55    }
56
57    // Slow path: parse one or more `# value` segments.
58    let mut parts = Vec::with_capacity(3);
59    parts.push(first);
60
61    loop {
62        let part = parse_single_value(input)?;
63        parts.push(part);
64
65        if !consume_concat_separator_field(input) {
66            break;
67        }
68    }
69
70    Ok(Value::Concat(parts.into_boxed_slice()))
71}
72
73/// Consume optional whitespace + `#` + optional whitespace.
74///
75/// Returns `true` if a concatenation separator was consumed. If no separator
76/// is present, input is left untouched.
77#[inline]
78fn consume_concat_separator(input: &mut &str) -> bool {
79    let mut probe = *input;
80    lexer::skip_whitespace(&mut probe);
81    if probe.as_bytes().first() != Some(&b'#') {
82        return false;
83    }
84
85    probe = &probe[1..];
86    lexer::skip_whitespace(&mut probe);
87    *input = probe;
88    true
89}
90
91/// Consume optional trailing whitespace and a field-value concatenation marker.
92///
93/// Unlike `consume_concat_separator`, this variant keeps the field parser's
94/// contract: trailing whitespace is consumed even when no `#` follows.
95#[inline]
96fn consume_concat_separator_field(input: &mut &str) -> bool {
97    match input.as_bytes().first() {
98        Some(b'#') => {
99            *input = &input[1..];
100            lexer::skip_whitespace(input);
101            true
102        }
103        Some(b' ' | b'\t' | b'\n' | b'\r') => {
104            lexer::skip_whitespace(input);
105            if input.as_bytes().first() == Some(&b'#') {
106                *input = &input[1..];
107                lexer::skip_whitespace(input);
108                true
109            } else {
110                false
111            }
112        }
113        Some(_) | None => false,
114    }
115}
116
117/// Parse a single value component
118#[inline]
119fn parse_single_value<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
120    // Fast dispatch based on first character
121    let bytes = input.as_bytes();
122    if let Some(&first) = bytes.first() {
123        match first {
124            b'"' => {
125                super::simd::find_balanced_quotes(bytes).map_or_else(super::backtrack, |end_pos| {
126                    let content = &input[1..end_pos - 1];
127                    *input = &input[end_pos..];
128                    Ok(Value::Literal(Cow::Borrowed(content)))
129                })
130            }
131            b'{' => {
132                super::simd::find_balanced_braces(bytes).map_or_else(super::backtrack, |end_pos| {
133                    let content = &input[1..end_pos - 1];
134                    *input = &input[end_pos..];
135                    Ok(Value::Literal(Cow::Borrowed(content)))
136                })
137            }
138            b'0'..=b'9' | b'+' | b'-' => parse_number_or_digit_string(input),
139            _ => parse_variable_value(input),
140        }
141    } else {
142        super::backtrack()
143    }
144}
145
146/// Parse either a number or a string that starts with digits
147/// This handles cases like "2024a", "12b", "1.2.3", etc.
148#[inline]
149fn parse_number_or_digit_string<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
150    let bytes = input.as_bytes();
151    let Some(&first) = bytes.first() else {
152        return super::backtrack();
153    };
154
155    let len = super::simd::scan_identifier(bytes);
156    if len == 0 {
157        return super::backtrack();
158    }
159
160    let token = &input[..len];
161    let token_bytes = token.as_bytes();
162
163    // Signed values must be strict integers (e.g., +42, -1).
164    // Non-digit suffixes after a sign are rejected.
165    if first == b'+' || first == b'-' {
166        if token_bytes.len() <= 1 || !token_bytes[1..].iter().all(u8::is_ascii_digit) {
167            return super::backtrack();
168        }
169        let num = parse_i64_ascii(token)?;
170        *input = &input[len..];
171        return Ok(Value::Number(num));
172    }
173
174    // Digit-starting tokens parse as numbers when fully numeric,
175    // otherwise as literals (e.g. 2024a).
176    if !first.is_ascii_digit() {
177        return super::backtrack();
178    }
179
180    *input = &input[len..];
181    if token_bytes.iter().all(u8::is_ascii_digit) {
182        let num = parse_i64_ascii(token)?;
183        Ok(Value::Number(num))
184    } else {
185        Ok(Value::Literal(Cow::Borrowed(token)))
186    }
187}
188
189#[inline]
190fn parse_i64_ascii(token: &str) -> PResult<'_, i64> {
191    let bytes = token.as_bytes();
192    let (negative, start) = match bytes.first() {
193        Some(b'-') => (true, 1),
194        Some(b'+') => (false, 1),
195        _ => (false, 0),
196    };
197
198    if start >= bytes.len() {
199        return super::backtrack();
200    }
201
202    let mut value: i64 = 0;
203    for &byte in &bytes[start..] {
204        if !byte.is_ascii_digit() {
205            return super::backtrack();
206        }
207
208        let digit = i64::from(byte - b'0');
209        value = if negative {
210            value
211                .checked_mul(10)
212                .and_then(|v| v.checked_sub(digit))
213                .ok_or_else(super::backtrack_err)?
214        } else {
215            value
216                .checked_mul(10)
217                .and_then(|v| v.checked_add(digit))
218                .ok_or_else(super::backtrack_err)?
219        };
220    }
221
222    Ok(value)
223}
224
225/// Parse a variable reference
226#[inline]
227fn parse_variable_value<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
228    // Parse as identifier - digit-starting values are handled by parse_number_or_digit_string
229    let ident = lexer::identifier(input)?;
230    Ok(Value::Variable(Cow::Borrowed(ident)))
231}
232
/// Collapse every run of whitespace in `s` into a single space and trim both ends.
///
/// Only whitespace is normalized; no LaTeX-specific processing is performed.
/// An empty or all-whitespace input yields an empty string.
#[must_use]
pub fn normalize_value(s: &str) -> String {
    let mut words = s.split_whitespace();
    // The result can never be longer than the input.
    let mut normalized = String::with_capacity(s.len());

    if let Some(first) = words.next() {
        normalized.push_str(first);
    }
    for word in words {
        normalized.push(' ');
        normalized.push_str(word);
    }

    normalized
}
239
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `parse_value` on `source` and return the parsed value together
    /// with whatever input was left unconsumed.
    fn parse_ok(source: &str) -> (Value<'_>, &str) {
        let mut rest = source;
        let value = parse_value(&mut rest).unwrap();
        (value, rest)
    }

    #[test]
    fn test_parse_quoted_value() {
        let (value, rest) = parse_ok(r#""hello world" xxx"#);
        assert_eq!(value, Value::Literal(Cow::Borrowed("hello world")));
        assert_eq!(rest, " xxx");
    }

    #[test]
    fn test_parse_braced_value() {
        let (value, rest) = parse_ok("{hello world} xxx");
        assert_eq!(value, Value::Literal(Cow::Borrowed("hello world")));
        assert_eq!(rest, " xxx");
    }

    #[test]
    fn test_parse_number_value() {
        let (value, rest) = parse_ok("2023 xxx");
        assert_eq!(value, Value::Number(2023));
        assert_eq!(rest, " xxx");
    }

    #[test]
    fn test_parse_variable_value() {
        let (value, rest) = parse_ok("myvar xxx");
        assert_eq!(value, Value::Variable(Cow::Borrowed("myvar")));
        assert_eq!(rest, " xxx");
    }

    #[test]
    fn test_parse_concatenated_value() {
        let (value, rest) = parse_ok(r#""hello" # myvar # {world} xxx"#);

        let Value::Concat(parts) = value else {
            panic!("Expected concatenated value");
        };
        assert_eq!(parts.len(), 3);
        assert_eq!(parts[0], Value::Literal(Cow::Borrowed("hello")));
        assert_eq!(parts[1], Value::Variable(Cow::Borrowed("myvar")));
        assert_eq!(parts[2], Value::Literal(Cow::Borrowed("world")));

        // Trailing whitespace is not consumed when no further `#` follows.
        assert_eq!(rest, " xxx");
    }
}