// edifact_parser/tokenizer.rs

use edifact_primitives::EdifactDelimiters;

/// Tokenizes raw EDIFACT byte input into segment strings.
///
/// Handles release-character escaping and trims the `\r`/`\n` that EDIFACT
/// allows around segments for readability. Delimiters (including any taken
/// from a UNA header) are supplied by the caller as [`EdifactDelimiters`].
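///
/// # Example
///
/// A sketch of the three-level pipeline; the `use` paths are assumed from
/// this crate's layout (hence the `ignore` fence), and it mirrors
/// `test_full_tokenization_pipeline` at the bottom of this file:
///
/// ```ignore
/// use edifact_parser::tokenizer::EdifactTokenizer;
/// use edifact_primitives::EdifactDelimiters;
///
/// let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
/// let segments: Vec<&str> = tokenizer
///     .tokenize_segments(b"NAD+Z04+9900123000002::293'")
///     .collect();
/// let elements: Vec<&str> = tokenizer.tokenize_elements(segments[0]).collect();
/// let components: Vec<&str> = tokenizer.tokenize_components(elements[2]).collect();
/// assert_eq!(components, vec!["9900123000002", "", "293"]);
/// ```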
pub struct EdifactTokenizer {
    delimiters: EdifactDelimiters,
}

impl EdifactTokenizer {
    /// Creates a new tokenizer with the given delimiters.
    pub fn new(delimiters: EdifactDelimiters) -> Self {
        Self { delimiters }
    }

    /// Returns the delimiters used by this tokenizer.
    pub fn delimiters(&self) -> &EdifactDelimiters {
        &self.delimiters
    }

    /// Tokenizes EDIFACT input into segment strings.
    ///
    /// Splits on the segment terminator, respecting release-character
    /// escaping. Leading and trailing `\r`/`\n` around each segment are
    /// trimmed; EDIFACT uses newlines only for readability between segments.
    ///
    /// Each yielded string is a segment WITHOUT its terminator character.
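    ///
    /// # Example
    ///
    /// A sketch, assuming `EdifactDelimiters::default()` yields the standard
    /// UNA set (`'` terminator, `?` release) exercised by the tests below:
    ///
    /// ```ignore
    /// let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
    /// let segments: Vec<&str> = tokenizer
    ///     .tokenize_segments(b"UNH+00001'\nUNT+2+00001'")
    ///     .collect();
    /// assert_eq!(segments, vec!["UNH+00001", "UNT+2+00001"]);
    /// ```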
    pub fn tokenize_segments<'a>(&self, input: &'a [u8]) -> SegmentIter<'a> {
        SegmentIter {
            input,
            pos: 0,
            segment_terminator: self.delimiters.segment,
            release_char: self.delimiters.release,
        }
    }

    /// Tokenizes a segment string into data elements.
    ///
    /// Splits on the element separator, respecting release-character
    /// escaping. Escape sequences are preserved verbatim in the output;
    /// unescaping is left to the caller.
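    ///
    /// # Example
    ///
    /// A sketch with the default `+` separator; the escaped `?+` does not
    /// split (mirroring `test_tokenize_elements_escaped_plus` below):
    ///
    /// ```ignore
    /// let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
    /// let elements: Vec<&str> = tokenizer
    ///     .tokenize_elements("FTX+ACB+++value with ?+plus")
    ///     .collect();
    /// assert_eq!(elements, vec!["FTX", "ACB", "", "", "value with ?+plus"]);
    /// ```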
    pub fn tokenize_elements<'a>(&self, segment: &'a str) -> ElementIter<'a> {
        ElementIter {
            input: segment,
            pos: 0,
            separator: self.delimiters.element as char,
            release: self.delimiters.release as char,
        }
    }

    /// Tokenizes a data element into components.
    ///
    /// Splits on the component separator, respecting release-character
    /// escaping. As at the element level, escape sequences such as `?:`
    /// are kept verbatim in the yielded components.
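    ///
    /// # Example
    ///
    /// A sketch with the default `:` separator (mirroring
    /// `test_tokenize_components` below):
    ///
    /// ```ignore
    /// let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
    /// let components: Vec<&str> = tokenizer
    ///     .tokenize_components("UTILMD:D:11A:UN:S2.1")
    ///     .collect();
    /// assert_eq!(components, vec!["UTILMD", "D", "11A", "UN", "S2.1"]);
    /// ```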
    pub fn tokenize_components<'a>(&self, element: &'a str) -> ComponentIter<'a> {
        ComponentIter {
            input: element,
            pos: 0,
            separator: self.delimiters.component as char,
            release: self.delimiters.release as char,
        }
    }
}

/// Iterator over segments in raw EDIFACT input bytes.
pub struct SegmentIter<'a> {
    input: &'a [u8],
    pos: usize,
    segment_terminator: u8,
    release_char: u8,
}

impl<'a> Iterator for SegmentIter<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        // A loop rather than recursion, so a long run of empty segments
        // cannot overflow the stack.
        'outer: loop {
            // Skip whitespace between segments
            while self.pos < self.input.len() {
                let b = self.input[self.pos];
                if b == b'\r' || b == b'\n' || b == b' ' || b == b'\t' {
                    self.pos += 1;
                } else {
                    break;
                }
            }

            if self.pos >= self.input.len() {
                return None;
            }

            let start = self.pos;
            let mut i = self.pos;

            while i < self.input.len() {
                let b = self.input[i];

                // Never treat \r or \n as a delimiter (EDIFACT ignores them)
                if b == b'\r' || b == b'\n' {
                    i += 1;
                    continue;
                }

                // Release character: the next byte is escaped
                if b == self.release_char && i + 1 < self.input.len() {
                    i += 2; // skip the release char and the escaped byte
                    continue;
                }

                if b == self.segment_terminator {
                    // Found an unescaped terminator
                    let segment_bytes = &self.input[start..i];
                    self.pos = i + 1;

                    // Trim surrounding \r/\n (non-UTF-8 bytes yield "")
                    let segment_str = strip_crlf(segment_bytes);
                    if segment_str.is_empty() {
                        continue 'outer; // skip empty segments
                    }
                    return Some(segment_str);
                }

                i += 1;
            }

            // Remaining content after the last terminator (may be only
            // trailing whitespace)
            let segment_bytes = &self.input[start..];
            self.pos = self.input.len();
            let segment_str = strip_crlf(segment_bytes);
            if segment_str.is_empty() {
                return None;
            }
            return Some(segment_str);
        }
    }
}

/// Converts a byte slice to a string, trimming leading and trailing
/// `\r`/`\n` characters.
///
/// In practice, EDIFACT segments never contain embedded newlines as data
/// (newlines appear only between segments, for readability), so trimming
/// the ends is sufficient. Input that is not valid UTF-8 yields an empty
/// string, which the segment iterator then skips.
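///
/// For example:
///
/// ```text
/// strip_crlf(b"\r\nUNH+00001\n")  ==  "UNH+00001"     // ends trimmed
/// strip_crlf(b"UNH+00\n001")      ==  "UNH+00\n001"   // embedded \n kept
/// ```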
fn strip_crlf(bytes: &[u8]) -> &str {
    // Interpret as UTF-8 (invalid input becomes "") and trim \r/\n from
    // both ends.
    let s = std::str::from_utf8(bytes).unwrap_or("");
    s.trim_matches(|c: char| c == '\r' || c == '\n')
}

/// Iterator over elements within a segment string.
pub struct ElementIter<'a> {
    input: &'a str,
    pos: usize,
    separator: char,
    release: char,
}

impl<'a> Iterator for ElementIter<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        // pos == len + 1 is the exhausted sentinel set below
        if self.pos > self.input.len() {
            return None;
        }

        let start = self.pos;
        // Assumes delimiters are ASCII (the EDIFACT default set), so
        // byte-wise scanning is safe and every split point lands on a
        // char boundary.
        let bytes = self.input.as_bytes();
        let mut i = self.pos;

        while i < bytes.len() {
            let ch = bytes[i] as char;

            // Release character escapes the next character
            if ch == self.release && i + 1 < bytes.len() {
                i += 2;
                continue;
            }

            if ch == self.separator {
                let element = &self.input[start..i];
                self.pos = i + 1;
                return Some(element);
            }

            i += 1;
        }

        // Return the remaining content (possibly an empty final element)
        if start <= self.input.len() {
            let element = &self.input[start..];
            self.pos = self.input.len() + 1; // mark as exhausted
            return Some(element);
        }

        None
    }
}

/// Iterator over components within a data element.
pub struct ComponentIter<'a> {
    input: &'a str,
    pos: usize,
    separator: char,
    release: char,
}

impl<'a> Iterator for ComponentIter<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        if self.pos > self.input.len() {
            return None;
        }

        let start = self.pos;
        let bytes = self.input.as_bytes();
        let mut i = self.pos;

        while i < bytes.len() {
            let ch = bytes[i] as char;

            // Release character escapes the next character
            if ch == self.release && i + 1 < bytes.len() {
                i += 2;
                continue;
            }

            if ch == self.separator {
                let component = &self.input[start..i];
                self.pos = i + 1;
                return Some(component);
            }

            i += 1;
        }

        // Return the remaining content
        if start <= self.input.len() {
            let component = &self.input[start..];
            self.pos = self.input.len() + 1; // mark as exhausted
            return Some(component);
        }

        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_segments_simple() {
        let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
        let input = b"UNB+UNOC:3'UNH+00001'UNT+2+00001'UNZ+1'";
        let segments: Vec<&str> = tokenizer.tokenize_segments(input).collect();
        assert_eq!(
            segments,
            vec!["UNB+UNOC:3", "UNH+00001", "UNT+2+00001", "UNZ+1"]
        );
    }

    #[test]
    fn test_tokenize_segments_with_newlines() {
        let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
        let input = b"UNB+UNOC:3'\nUNH+00001'\r\nUNT+2+00001'\nUNZ+1'";
        let segments: Vec<&str> = tokenizer.tokenize_segments(input).collect();
        assert_eq!(
            segments,
            vec!["UNB+UNOC:3", "UNH+00001", "UNT+2+00001", "UNZ+1"]
        );
    }

    #[test]
    fn test_tokenize_segments_with_release_char() {
        let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
        // ?' is an escaped apostrophe, NOT a segment terminator
        let input = b"FTX+ACB+++text with ?'quotes?''";
        let segments: Vec<&str> = tokenizer.tokenize_segments(input).collect();
        assert_eq!(segments.len(), 1);
        assert_eq!(segments[0], "FTX+ACB+++text with ?'quotes?'");
    }

    #[test]
    fn test_tokenize_segments_empty_input() {
        let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
        let input = b"";
        let segments: Vec<&str> = tokenizer.tokenize_segments(input).collect();
        assert!(segments.is_empty());
    }

    #[test]
    fn test_tokenize_segments_trailing_whitespace() {
        let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
        let input = b"UNH+00001'  \n  ";
        let segments: Vec<&str> = tokenizer.tokenize_segments(input).collect();
        assert_eq!(segments, vec!["UNH+00001"]);
    }

    #[test]
    fn test_tokenize_segments_custom_delimiter() {
        let delimiters = EdifactDelimiters {
            segment: b'!',
            ..EdifactDelimiters::default()
        };
        let tokenizer = EdifactTokenizer::new(delimiters);
        let input = b"UNB+UNOC:3!UNH+00001!";
        let segments: Vec<&str> = tokenizer.tokenize_segments(input).collect();
        assert_eq!(segments, vec!["UNB+UNOC:3", "UNH+00001"]);
    }
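
    // A sketch pinning down the trim-only newline handling: a (nonstandard)
    // newline embedded inside a segment survives tokenization, because only
    // leading and trailing \r/\n are trimmed.
    #[test]
    fn test_embedded_newline_is_preserved() {
        let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
        let input = b"UNH+00\n001'";
        let segments: Vec<&str> = tokenizer.tokenize_segments(input).collect();
        assert_eq!(segments, vec!["UNH+00\n001"]);
    }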

    // --- Task 2: Element and Component Splitting ---

    #[test]
    fn test_tokenize_elements() {
        let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
        let elements: Vec<&str> = tokenizer
            .tokenize_elements("NAD+Z04+9900123000002:500")
            .collect();
        assert_eq!(elements, vec!["NAD", "Z04", "9900123000002:500"]);
    }

    #[test]
    fn test_tokenize_elements_escaped_plus() {
        let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
        let elements: Vec<&str> = tokenizer
            .tokenize_elements("FTX+ACB+++value with ?+plus")
            .collect();
        // ?+ is escaped, so it should NOT split; +++ produces two empty elements
        assert_eq!(elements, vec!["FTX", "ACB", "", "", "value with ?+plus"]);
    }

    #[test]
    fn test_tokenize_components() {
        let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
        let components: Vec<&str> = tokenizer
            .tokenize_components("UTILMD:D:11A:UN:S2.1")
            .collect();
        assert_eq!(components, vec!["UTILMD", "D", "11A", "UN", "S2.1"]);
    }

    #[test]
    fn test_tokenize_components_escaped_colon() {
        let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
        let components: Vec<&str> = tokenizer.tokenize_components("value?:with:colon").collect();
        // ?: is escaped, so "value?:with" is one component
        assert_eq!(components, vec!["value?:with", "colon"]);
    }

    #[test]
    fn test_tokenize_components_empty() {
        let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
        let components: Vec<&str> = tokenizer.tokenize_components("Z04::500").collect();
        assert_eq!(components, vec!["Z04", "", "500"]);
    }

    #[test]
    fn test_full_tokenization_pipeline() {
        let tokenizer = EdifactTokenizer::new(EdifactDelimiters::default());
        let input = b"NAD+Z04+9900123000002::293'DTM+137:202501010000?+01:303'";

        let segments: Vec<&str> = tokenizer.tokenize_segments(input).collect();
        assert_eq!(segments.len(), 2);

        // Parse first segment: NAD+Z04+9900123000002::293
        let elements: Vec<&str> = tokenizer.tokenize_elements(segments[0]).collect();
        assert_eq!(elements, vec!["NAD", "Z04", "9900123000002::293"]);

        // Parse composite element: 9900123000002::293
        let components: Vec<&str> = tokenizer.tokenize_components(elements[2]).collect();
        assert_eq!(components, vec!["9900123000002", "", "293"]);

        // Parse second segment: DTM+137:202501010000?+01:303
        let dtm_elements: Vec<&str> = tokenizer.tokenize_elements(segments[1]).collect();
        assert_eq!(dtm_elements, vec!["DTM", "137:202501010000?+01:303"]);

        // Parse DTM composite (note: ?+ is escaped at element level, kept as-is)
        let dtm_components: Vec<&str> = tokenizer.tokenize_components(dtm_elements[1]).collect();
        assert_eq!(dtm_components, vec!["137", "202501010000?+01", "303"]);
    }
}