// automapper_validation/expr/token.rs
1//! Tokenizer for condition expression strings.
2
3use crate::error::ParseError;
4
/// Token types produced by the condition expression tokenizer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A condition reference number, e.g., `931` from `[931]`.
    /// The tokenizer keeps whatever appears between `[` and `]`, so this
    /// also carries package references (`10P1..5`) and time conditions (`UB1`).
    ConditionId(String),
    /// AND operator (`∧` or `AND`).
    And,
    /// OR operator (`∨` or `OR`).
    Or,
    /// XOR operator (`⊻` or `XOR`).
    Xor,
    /// NOT operator (`NOT`).
    Not,
    /// Opening parenthesis `(`.
    LeftParen,
    /// Closing parenthesis `)`.
    RightParen,
}
23
/// A token with its position in the source string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SpannedToken {
    /// The lexed token.
    pub token: Token,
    /// Char index (not byte offset) of the token's first character in the
    /// tokenized input string.
    pub position: usize,
}
30
/// AHB status prefixes that are stripped before tokenizing.
const STATUS_PREFIXES: &[&str] = &["Muss", "Soll", "Kann", "X"];

/// Strip the AHB status prefix (Muss, Soll, Kann, X) from the input.
///
/// Returns the remainder of the string after the prefix, or the original
/// (trimmed) string if no prefix is found.
///
/// The prefix must end at a word boundary: `"XOR [1]"` is NOT split into
/// status `X` + `"OR [1]"`, and `"Mussfeld"` is left intact. A prefix with
/// nothing after it (e.g. bare `"Muss"`) is also returned unchanged, so the
/// caller can distinguish a status-only cell from an empty expression.
pub fn strip_status_prefix(input: &str) -> &str {
    let trimmed = input.trim();
    for prefix in STATUS_PREFIXES {
        if let Some(rest) = trimmed.strip_prefix(prefix) {
            // Word-boundary check: without it, "XOR [1]" would match the
            // "X" status and be mangled into "OR [1]".
            if rest.chars().next().map_or(false, |c| c.is_alphanumeric()) {
                continue;
            }
            let rest = rest.trim_start();
            if !rest.is_empty() {
                return rest;
            }
        }
    }
    trimmed
}
50
51/// Tokenize an AHB condition expression string.
52///
53/// The input should already have the status prefix stripped.
54pub fn tokenize(input: &str) -> Result<Vec<SpannedToken>, ParseError> {
55    let mut tokens = Vec::new();
56    let chars: Vec<char> = input.chars().collect();
57    let mut i = 0;
58
59    while i < chars.len() {
60        let c = chars[i];
61
62        // Skip whitespace
63        if c.is_whitespace() {
64            i += 1;
65            continue;
66        }
67
68        let position = i;
69
70        // Parentheses
71        if c == '(' {
72            tokens.push(SpannedToken {
73                token: Token::LeftParen,
74                position,
75            });
76            i += 1;
77            continue;
78        }
79        if c == ')' {
80            tokens.push(SpannedToken {
81                token: Token::RightParen,
82                position,
83            });
84            i += 1;
85            continue;
86        }
87
88        // Unicode operators
89        if c == '\u{2227}' {
90            // ∧ AND
91            tokens.push(SpannedToken {
92                token: Token::And,
93                position,
94            });
95            i += 1;
96            continue;
97        }
98        if c == '\u{2228}' {
99            // ∨ OR
100            tokens.push(SpannedToken {
101                token: Token::Or,
102                position,
103            });
104            i += 1;
105            continue;
106        }
107        if c == '\u{22BB}' {
108            // ⊻ XOR
109            tokens.push(SpannedToken {
110                token: Token::Xor,
111                position,
112            });
113            i += 1;
114            continue;
115        }
116
117        // Condition reference [...]
118        if c == '[' {
119            let start = i;
120            i += 1;
121            while i < chars.len() && chars[i] != ']' {
122                i += 1;
123            }
124            if i < chars.len() {
125                let content: String = chars[start + 1..i].iter().collect();
126                tokens.push(SpannedToken {
127                    token: Token::ConditionId(content),
128                    position: start,
129                });
130                i += 1; // skip closing ]
131            } else {
132                let content: String = chars[start + 1..].iter().collect();
133                return Err(ParseError::InvalidConditionRef { content });
134            }
135            continue;
136        }
137
138        // Text keywords: AND, OR, XOR, NOT (case-insensitive)
139        if c.is_ascii_alphabetic() {
140            let start = i;
141            while i < chars.len() && chars[i].is_ascii_alphabetic() {
142                i += 1;
143            }
144            let word: String = chars[start..i].iter().collect();
145            match word.to_uppercase().as_str() {
146                "AND" => tokens.push(SpannedToken {
147                    token: Token::And,
148                    position: start,
149                }),
150                "OR" => tokens.push(SpannedToken {
151                    token: Token::Or,
152                    position: start,
153                }),
154                "XOR" => tokens.push(SpannedToken {
155                    token: Token::Xor,
156                    position: start,
157                }),
158                "NOT" => tokens.push(SpannedToken {
159                    token: Token::Not,
160                    position: start,
161                }),
162                _ => {
163                    // Skip unknown words (could be status prefix remnants)
164                }
165            }
166            continue;
167        }
168
169        // Skip unknown characters
170        i += 1;
171    }
172
173    Ok(tokens)
174}
175
// Unit tests for the status-prefix stripper and the tokenizer. The tokenize
// tests pin exact token sequences and char-index positions for real-world
// AHB expression shapes.
#[cfg(test)]
mod tests {
    use super::*;

    // --- strip_status_prefix tests ---

    #[test]
    fn test_strip_muss_prefix() {
        assert_eq!(strip_status_prefix("Muss [494]"), "[494]");
    }

    #[test]
    fn test_strip_soll_prefix() {
        assert_eq!(strip_status_prefix("Soll [494]"), "[494]");
    }

    #[test]
    fn test_strip_kann_prefix() {
        assert_eq!(strip_status_prefix("Kann [182] ∧ [6]"), "[182] ∧ [6]");
    }

    #[test]
    fn test_strip_x_prefix() {
        assert_eq!(
            strip_status_prefix("X (([939][14]) ∨ ([940][15]))"),
            "(([939][14]) ∨ ([940][15]))"
        );
    }

    #[test]
    fn test_strip_no_prefix() {
        assert_eq!(strip_status_prefix("[1] ∧ [2]"), "[1] ∧ [2]");
    }

    #[test]
    fn test_strip_muss_only_returns_trimmed() {
        // "Muss" alone with nothing after has no conditions
        assert_eq!(strip_status_prefix("Muss"), "Muss");
    }

    #[test]
    fn test_strip_whitespace_only() {
        assert_eq!(strip_status_prefix("   "), "");
    }

    #[test]
    fn test_strip_preserves_leading_whitespace_in_content() {
        assert_eq!(strip_status_prefix("Muss   [1]"), "[1]");
    }

    // --- tokenize tests ---

    #[test]
    fn test_tokenize_single_condition() {
        let tokens = tokenize("[931]").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token, Token::ConditionId("931".to_string()));
    }

    #[test]
    fn test_tokenize_and_unicode() {
        let tokens = tokenize("[1] ∧ [2]").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].token, Token::ConditionId("1".to_string()));
        assert_eq!(tokens[1].token, Token::And);
        assert_eq!(tokens[2].token, Token::ConditionId("2".to_string()));
    }

    #[test]
    fn test_tokenize_or_unicode() {
        let tokens = tokenize("[1] ∨ [2]").unwrap();
        assert_eq!(tokens[1].token, Token::Or);
    }

    #[test]
    fn test_tokenize_xor_unicode() {
        let tokens = tokenize("[1] ⊻ [2]").unwrap();
        assert_eq!(tokens[1].token, Token::Xor);
    }

    #[test]
    fn test_tokenize_text_keywords() {
        let tokens = tokenize("[1] AND [2] OR [3] XOR [4]").unwrap();
        assert_eq!(tokens.len(), 7);
        assert_eq!(tokens[1].token, Token::And);
        assert_eq!(tokens[3].token, Token::Or);
        assert_eq!(tokens[5].token, Token::Xor);
    }

    #[test]
    fn test_tokenize_not_keyword() {
        let tokens = tokenize("NOT [1]").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::Not);
        assert_eq!(tokens[1].token, Token::ConditionId("1".to_string()));
    }

    #[test]
    fn test_tokenize_parentheses() {
        let tokens = tokenize("([1] ∨ [2]) ∧ [3]").unwrap();
        assert_eq!(tokens.len(), 7);
        assert_eq!(tokens[0].token, Token::LeftParen);
        assert_eq!(tokens[4].token, Token::RightParen);
    }

    // Conditions are often written back-to-back with no separator in AHB
    // expressions; each bracket group must still become its own token.
    #[test]
    fn test_tokenize_adjacent_conditions_no_space() {
        let tokens = tokenize("[939][14]").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::ConditionId("939".to_string()));
        assert_eq!(tokens[1].token, Token::ConditionId("14".to_string()));
    }

    // Bracket content is taken verbatim, so package refs survive intact.
    #[test]
    fn test_tokenize_package_condition() {
        let tokens = tokenize("[10P1..5]").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token, Token::ConditionId("10P1..5".to_string()));
    }

    #[test]
    fn test_tokenize_time_condition() {
        let tokens = tokenize("[UB1]").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token, Token::ConditionId("UB1".to_string()));
    }

    #[test]
    fn test_tokenize_tabs_and_multiple_spaces() {
        let tokens = tokenize("[1]\t∧\t[2]").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[1].token, Token::And);
    }

    #[test]
    fn test_tokenize_multiple_spaces() {
        let tokens = tokenize("[1]    ∧    [2]").unwrap();
        assert_eq!(tokens.len(), 3);
    }

    #[test]
    fn test_tokenize_empty_string() {
        let tokens = tokenize("").unwrap();
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_tokenize_complex_real_world() {
        // "X (([939] [147]) ∨ ([940] [148])) ∧ [567]"
        // After prefix strip: "(([939] [147]) ∨ ([940] [148])) ∧ [567]"
        let tokens = tokenize("(([939] [147]) ∨ ([940] [148])) ∧ [567]").unwrap();
        assert_eq!(tokens.len(), 13);
        assert_eq!(tokens[0].token, Token::LeftParen);
        assert_eq!(tokens[1].token, Token::LeftParen);
        assert_eq!(tokens[2].token, Token::ConditionId("939".to_string()));
        assert_eq!(tokens[3].token, Token::ConditionId("147".to_string()));
        assert_eq!(tokens[4].token, Token::RightParen);
        assert_eq!(tokens[5].token, Token::Or);
        assert_eq!(tokens[6].token, Token::LeftParen);
        assert_eq!(tokens[7].token, Token::ConditionId("940".to_string()));
        assert_eq!(tokens[8].token, Token::ConditionId("148".to_string()));
        assert_eq!(tokens[9].token, Token::RightParen);
        assert_eq!(tokens[10].token, Token::RightParen);
        assert_eq!(tokens[11].token, Token::And);
        assert_eq!(tokens[12].token, Token::ConditionId("567".to_string()));
    }

    // Positions are char indices, not byte offsets — multi-byte operators
    // like ∧ count as one position.
    #[test]
    fn test_tokenize_positions_are_correct() {
        let tokens = tokenize("[1] ∧ [2]").unwrap();
        assert_eq!(tokens[0].position, 0); // [
        assert_eq!(tokens[2].position, 6); // [ of [2] (∧ is a single char in char index)
    }

    #[test]
    fn test_tokenize_case_insensitive_keywords() {
        let tokens = tokenize("[1] and [2] or [3]").unwrap();
        assert_eq!(tokens[1].token, Token::And);
        assert_eq!(tokens[3].token, Token::Or);
    }

    #[test]
    fn test_tokenize_unclosed_bracket_returns_error() {
        let result = tokenize("[931");
        assert!(result.is_err());
    }
}