Skip to main content

bibtex_parser/parser/
value.rs

1//! Value parsing for BibTeX fields
2
3use super::{lexer, PResult};
4use crate::model::Value;
5use std::borrow::Cow;
6
7/// Parse a BibTeX value (string, number, variable, or concatenation)
8#[inline]
9pub fn parse_value<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
10    parse_concatenated_value(input)
11}
12
13/// Parse a field value and consume trailing ASCII whitespace.
14///
15/// This variant is used by entry parsing so the field loop can read the
16/// delimiter directly without re-scanning whitespace.
17#[inline]
18pub(crate) fn parse_value_field<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
19    parse_concatenated_value_field(input)
20}
21
22/// Parse a concatenated value (value # value # ...)
23#[inline]
24fn parse_concatenated_value<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
25    let first = parse_single_value(input)?;
26
27    // Fast path: most fields are a single value with no concatenation.
28    if !consume_concat_separator(input) {
29        return Ok(first);
30    }
31
32    // Slow path: parse one or more `# value` segments.
33    let mut parts = Vec::with_capacity(3);
34    parts.push(first);
35
36    loop {
37        let part = parse_single_value(input)?;
38        parts.push(part);
39
40        if !consume_concat_separator(input) {
41            break;
42        }
43    }
44
45    Ok(Value::Concat(parts.into_boxed_slice()))
46}
47
48/// Parse a concatenated value and consume trailing ASCII whitespace.
49#[inline]
50fn parse_concatenated_value_field<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
51    let first = parse_single_value(input)?;
52
53    if lexer::skip_whitespace_peek(input) != Some(b'#') {
54        return Ok(first);
55    }
56
57    // Slow path: parse one or more `# value` segments.
58    let mut parts = Vec::with_capacity(3);
59    parts.push(first);
60
61    loop {
62        // Consume '#'
63        *input = &input[1..];
64        lexer::skip_whitespace(input);
65
66        let part = parse_single_value(input)?;
67        parts.push(part);
68
69        if lexer::skip_whitespace_peek(input) != Some(b'#') {
70            break;
71        }
72    }
73
74    Ok(Value::Concat(parts.into_boxed_slice()))
75}
76
77/// Consume optional whitespace + `#` + optional whitespace.
78///
79/// Returns `true` if a concatenation separator was consumed. If no separator
80/// is present, input is left untouched.
81#[inline]
82fn consume_concat_separator(input: &mut &str) -> bool {
83    let mut probe = *input;
84    lexer::skip_whitespace(&mut probe);
85    if probe.as_bytes().first() != Some(&b'#') {
86        return false;
87    }
88
89    probe = &probe[1..];
90    lexer::skip_whitespace(&mut probe);
91    *input = probe;
92    true
93}
94
95/// Parse a single value component
96#[inline]
97fn parse_single_value<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
98    // Fast dispatch based on first character
99    let bytes = input.as_bytes();
100    if let Some(&first) = bytes.first() {
101        match first {
102            b'"' => {
103                super::simd::find_balanced_quotes(bytes).map_or_else(super::backtrack, |end_pos| {
104                    let content = &input[1..end_pos - 1];
105                    *input = &input[end_pos..];
106                    Ok(Value::Literal(Cow::Borrowed(content)))
107                })
108            }
109            b'{' => {
110                super::simd::find_balanced_braces(bytes).map_or_else(super::backtrack, |end_pos| {
111                    let content = &input[1..end_pos - 1];
112                    *input = &input[end_pos..];
113                    Ok(Value::Literal(Cow::Borrowed(content)))
114                })
115            }
116            b'0'..=b'9' | b'+' | b'-' => parse_number_or_digit_string(input),
117            _ => parse_variable_value(input),
118        }
119    } else {
120        super::backtrack()
121    }
122}
123
124/// Parse either a number or a string that starts with digits
125/// This handles cases like "2024a", "12b", "1.2.3", etc.
126#[inline]
127fn parse_number_or_digit_string<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
128    let bytes = input.as_bytes();
129    let Some(&first) = bytes.first() else {
130        return super::backtrack();
131    };
132
133    let len = super::simd::scan_identifier(bytes);
134    if len == 0 {
135        return super::backtrack();
136    }
137
138    let token = &input[..len];
139    let token_bytes = token.as_bytes();
140
141    // Signed values must be strict integers (e.g., +42, -1).
142    // Non-digit suffixes after a sign are rejected.
143    if first == b'+' || first == b'-' {
144        if token_bytes.len() <= 1 || !token_bytes[1..].iter().all(u8::is_ascii_digit) {
145            return super::backtrack();
146        }
147        let num = parse_i64_ascii(token)?;
148        *input = &input[len..];
149        return Ok(Value::Number(num));
150    }
151
152    // Digit-starting tokens parse as numbers when fully numeric,
153    // otherwise as literals (e.g. 2024a).
154    if !first.is_ascii_digit() {
155        return super::backtrack();
156    }
157
158    *input = &input[len..];
159    if token_bytes.iter().all(u8::is_ascii_digit) {
160        let num = parse_i64_ascii(token)?;
161        Ok(Value::Number(num))
162    } else {
163        Ok(Value::Literal(Cow::Borrowed(token)))
164    }
165}
166
167#[inline]
168fn parse_i64_ascii(token: &str) -> PResult<'_, i64> {
169    let bytes = token.as_bytes();
170    let (negative, start) = match bytes.first() {
171        Some(b'-') => (true, 1),
172        Some(b'+') => (false, 1),
173        _ => (false, 0),
174    };
175
176    if start >= bytes.len() {
177        return super::backtrack();
178    }
179
180    let mut value: i64 = 0;
181    for &byte in &bytes[start..] {
182        if !byte.is_ascii_digit() {
183            return super::backtrack();
184        }
185
186        let digit = i64::from(byte - b'0');
187        value = if negative {
188            value
189                .checked_mul(10)
190                .and_then(|v| v.checked_sub(digit))
191                .ok_or_else(super::backtrack_err)?
192        } else {
193            value
194                .checked_mul(10)
195                .and_then(|v| v.checked_add(digit))
196                .ok_or_else(super::backtrack_err)?
197        };
198    }
199
200    Ok(value)
201}
202
203/// Parse a variable reference
204#[inline]
205fn parse_variable_value<'a>(input: &mut &'a str) -> PResult<'a, Value<'a>> {
206    // Parse as identifier - digit-starting values are handled by parse_number_or_digit_string
207    let ident = lexer::identifier(input)?;
208    Ok(Value::Variable(Cow::Borrowed(ident)))
209}
210
/// Collapse whitespace in a string value.
///
/// Leading and trailing whitespace is dropped and every internal run of
/// whitespace becomes a single space. This is only basic normalization; no
/// LaTeX-specific processing is performed here.
#[must_use]
pub fn normalize_value(s: &str) -> String {
    let mut normalized = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // `split_whitespace` never yields an empty word, so a non-empty
        // buffer means at least one word has already been written.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
217
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the borrowed literal a test expects to get back.
    fn lit(text: &str) -> Value<'_> {
        Value::Literal(Cow::Borrowed(text))
    }

    /// A double-quoted string yields its inner text and stops after the quote.
    #[test]
    fn test_parse_quoted_value() {
        let mut rest = r#""hello world" xxx"#;
        let parsed = parse_value(&mut rest).unwrap();
        assert_eq!(parsed, lit("hello world"));
        assert_eq!(rest, " xxx");
    }

    /// A braced string yields its inner text and stops after the brace.
    #[test]
    fn test_parse_braced_value() {
        let mut rest = "{hello world} xxx";
        let parsed = parse_value(&mut rest).unwrap();
        assert_eq!(parsed, lit("hello world"));
        assert_eq!(rest, " xxx");
    }

    /// An all-digit token is parsed as a number.
    #[test]
    fn test_parse_number_value() {
        let mut rest = "2023 xxx";
        let parsed = parse_value(&mut rest).unwrap();
        assert_eq!(parsed, Value::Number(2023));
        assert_eq!(rest, " xxx");
    }

    /// A bare identifier is parsed as a variable reference.
    #[test]
    fn test_parse_variable_value() {
        let mut rest = "myvar xxx";
        let parsed = parse_value(&mut rest).unwrap();
        assert_eq!(parsed, Value::Variable(Cow::Borrowed("myvar")));
        assert_eq!(rest, " xxx");
    }

    /// `#`-joined components come back as a `Concat` in source order.
    #[test]
    fn test_parse_concatenated_value() {
        let mut rest = r#""hello" # myvar # {world} xxx"#;
        let Value::Concat(parts) = parse_value(&mut rest).unwrap() else {
            panic!("Expected concatenated value");
        };
        assert_eq!(parts.len(), 3);
        assert_eq!(parts[0], lit("hello"));
        assert_eq!(parts[1], Value::Variable(Cow::Borrowed("myvar")));
        assert_eq!(parts[2], lit("world"));
        assert_eq!(rest, " xxx");
    }
}
269}