Skip to main content

edifact_parser/
parser.rs

1#[cfg(test)]
2use edifact_primitives::Control;
3use edifact_primitives::{EdifactDelimiters, RawSegment, SegmentPosition};
4
5use crate::error::ParseError;
6use crate::handler::EdifactHandler;
7use crate::segment_builder::SegmentBuilder as SegBuilder;
8use crate::tokenizer::EdifactTokenizer;
9
10/// Streaming EDIFACT parser.
11///
12/// Parses a byte slice by tokenizing it into segments and routing them
13/// to an `EdifactHandler`. Service segments (UNB, UNH, UNT, UNZ) are
14/// dispatched to specific handler methods in addition to `on_segment()`.
15pub struct EdifactStreamParser;
16
17impl EdifactStreamParser {
18    /// Parse an EDIFACT interchange from a byte slice.
19    ///
20    /// This is the main synchronous entry point. It:
21    /// 1. Detects UNA and determines delimiters
22    /// 2. Tokenizes input into segments
23    /// 3. Routes each segment to the handler
24    /// 4. Stops if the handler returns `Control::Stop`
25    ///
26    /// Supports both UTF-8 and ISO-8859-1 encoded input. If the input
27    /// contains non-UTF-8 bytes (raw ISO-8859-1), it is transparently
28    /// transcoded to UTF-8 before parsing. EDIFACT delimiters are always
29    /// ASCII, so transcoding does not affect delimiter detection.
30    pub fn parse(input: &[u8], handler: &mut dyn EdifactHandler) -> Result<(), ParseError> {
31        if std::str::from_utf8(input).is_ok() {
32            Self::parse_inner(input, handler)
33        } else {
34            let transcoded = transcode_iso_8859_1_to_utf8(input);
35            Self::parse_inner(&transcoded, handler)
36        }
37    }
38
39    fn parse_inner(input: &[u8], handler: &mut dyn EdifactHandler) -> Result<(), ParseError> {
40        // Step 1: Detect delimiters
41        let (has_una, delimiters) = EdifactDelimiters::detect(input);
42        handler.on_delimiters(&delimiters, has_una);
43
44        // Step 2: Determine where actual content starts (after UNA if present)
45        let content_start = if has_una { 9 } else { 0 };
46        let content = &input[content_start..];
47
48        // Step 3: Tokenize and process segments
49        let tokenizer = EdifactTokenizer::new(delimiters);
50        let seg_builder = SegBuilder::new(delimiters);
51
52        let mut segment_number: u32 = 0;
53        let mut message_number: u32 = 0;
54        let mut byte_offset = content_start;
55
56        for segment_str in tokenizer.tokenize_segments(content) {
57            segment_number += 1;
58
59            let position = SegmentPosition::new(segment_number, byte_offset, message_number);
60
61            let Some(raw_segment) = seg_builder.build(segment_str, position) else {
62                byte_offset += segment_str.len() + 1; // +1 for terminator
63                continue;
64            };
65
66            // Skip UNA segments in content
67            if raw_segment.is("UNA") {
68                byte_offset += segment_str.len() + 1;
69                segment_number -= 1; // don't count UNA
70                continue;
71            }
72
73            let id_upper = raw_segment.id.to_ascii_uppercase();
74
75            // Track message numbering
76            if id_upper == "UNH" {
77                message_number += 1;
78            }
79
80            // Rebuild position with correct message number
81            let effective_message_number = if id_upper == "UNB" || id_upper == "UNZ" {
82                0
83            } else {
84                message_number
85            };
86            let position =
87                SegmentPosition::new(segment_number, byte_offset, effective_message_number);
88            let raw_segment = RawSegment::new(raw_segment.id, raw_segment.elements, position);
89
90            // Route service segments
91            match id_upper.as_str() {
92                "UNB" => {
93                    if handler.on_interchange_start(&raw_segment).should_stop() {
94                        return Ok(());
95                    }
96                }
97                "UNH" => {
98                    if handler.on_message_start(&raw_segment).should_stop() {
99                        return Ok(());
100                    }
101                }
102                "UNT" => {
103                    handler.on_message_end(&raw_segment);
104                }
105                "UNZ" => {
106                    handler.on_interchange_end(&raw_segment);
107                }
108                _ => {}
109            }
110
111            // Always call on_segment
112            if handler.on_segment(&raw_segment).should_stop() {
113                return Ok(());
114            }
115
116            byte_offset += segment_str.len() + 1; // +1 for terminator
117        }
118
119        Ok(())
120    }
121}
122
/// Convert ISO-8859-1 (Latin-1) bytes into their UTF-8 encoding.
///
/// Every ISO-8859-1 byte value equals the Unicode code point of the same
/// number (U+0000–U+00FF), so the conversion is a fixed per-byte mapping:
/// - 0x00–0x7F stay a single, unchanged byte
/// - 0x80–0xBF become `0xC2` followed by the original byte
/// - 0xC0–0xFF become `0xC3` followed by the original byte minus 0x40
fn transcode_iso_8859_1_to_utf8(input: &[u8]) -> Vec<u8> {
    // Reserve a little headroom for bytes that expand to two bytes.
    let mut utf8 = Vec::with_capacity(input.len() + input.len() / 4);
    for &byte in input {
        match byte {
            0x00..=0x7F => utf8.push(byte),
            _ => {
                // Two-byte UTF-8 form: 110000xx 10xxxxxx, with the top two
                // bits of the byte in the lead and the low six in the trail.
                let lead = 0xC0 | (byte >> 6);
                let trail = 0x80 | (byte & 0x3F);
                utf8.extend_from_slice(&[lead, trail]);
            }
        }
    }
    utf8
}
142
#[cfg(test)]
mod tests {
    use super::*;

    /// Handler that records a label for every callback, in call order.
    struct EventCollector {
        log: Vec<String>,
    }

    impl EventCollector {
        fn new() -> Self {
            Self { log: Vec::new() }
        }

        /// Snapshot of everything recorded so far.
        fn events(&self) -> Vec<String> {
            self.log.clone()
        }
    }

    impl EdifactHandler for EventCollector {
        fn on_delimiters(&mut self, _d: &EdifactDelimiters, explicit_una: bool) {
            let label = format!("DELIMITERS(una={})", explicit_una);
            self.log.push(label);
        }

        fn on_interchange_start(&mut self, unb: &RawSegment) -> Control {
            let label = format!("INTERCHANGE_START({})", unb.id);
            self.log.push(label);
            Control::Continue
        }

        fn on_message_start(&mut self, unh: &RawSegment) -> Control {
            let label = format!("MESSAGE_START(ref={})", unh.get_element(0));
            self.log.push(label);
            Control::Continue
        }

        fn on_segment(&mut self, seg: &RawSegment) -> Control {
            let label = format!("SEGMENT({})", seg.id);
            self.log.push(label);
            Control::Continue
        }

        fn on_message_end(&mut self, _unt: &RawSegment) {
            self.log.push("MESSAGE_END".to_string());
        }

        fn on_interchange_end(&mut self, _unz: &RawSegment) {
            self.log.push("INTERCHANGE_END".to_string());
        }
    }

    /// A full UNA/UNB/UNH/BGM/UNT/UNZ interchange produces the service
    /// callbacks interleaved with the generic segment callbacks.
    #[test]
    fn test_parse_minimal_interchange() {
        let input = b"UNA:+.? 'UNB+UNOC:3+SENDER+RECEIVER+210101:1200+REF001'UNH+MSG001+UTILMD:D:11A:UN:S2.1'BGM+E03+DOC001'UNT+3+MSG001'UNZ+1+REF001'";

        let mut collector = EventCollector::new();
        EdifactStreamParser::parse(input, &mut collector).unwrap();

        let expected = [
            "DELIMITERS(una=true)",
            "INTERCHANGE_START(UNB)",
            "SEGMENT(UNB)",
            "MESSAGE_START(ref=MSG001)",
            "SEGMENT(UNH)",
            "SEGMENT(BGM)",
            "MESSAGE_END",
            "SEGMENT(UNT)",
            "INTERCHANGE_END",
            "SEGMENT(UNZ)",
        ];
        let events = collector.events();
        for (index, want) in expected.iter().enumerate() {
            assert_eq!(events[index], *want);
        }
    }

    /// Without a UNA header the delimiters are reported as not explicit.
    #[test]
    fn test_parse_without_una() {
        let input = b"UNB+UNOC:3+SENDER+RECEIVER'UNZ+0+REF'";

        let mut collector = EventCollector::new();
        EdifactStreamParser::parse(input, &mut collector).unwrap();

        let expected = ["DELIMITERS(una=false)", "INTERCHANGE_START(UNB)"];
        let events = collector.events();
        for (index, want) in expected.iter().enumerate() {
            assert_eq!(events[index], *want);
        }
    }

    /// A `Control::Stop` from `on_segment` ends parsing immediately.
    #[test]
    fn test_parse_handler_stops_early() {
        struct StopOnBgm {
            seen: Vec<String>,
        }
        impl EdifactHandler for StopOnBgm {
            fn on_segment(&mut self, seg: &RawSegment) -> Control {
                self.seen.push(seg.id.to_string());
                if !seg.is("BGM") {
                    return Control::Continue;
                }
                Control::Stop
            }
        }

        let input = b"UNA:+.? 'UNB+UNOC:3'UNH+001'BGM+E03'DTM+137:20250101'UNT+3+001'UNZ+1'";
        let mut stopper = StopOnBgm { seen: Vec::new() };
        EdifactStreamParser::parse(input, &mut stopper).unwrap();

        // UNB, UNH and BGM were delivered; DTM, UNT and UNZ were not.
        assert_eq!(stopper.seen, ["UNB", "UNH", "BGM"]);
    }

    /// Message numbers start at 1 with the first UNH; UNB and UNZ report 0.
    #[test]
    fn test_parse_message_numbering() {
        struct PositionTracker {
            positions: Vec<(String, u32)>,
        }
        impl EdifactHandler for PositionTracker {
            fn on_segment(&mut self, seg: &RawSegment) -> Control {
                let entry = (seg.id.to_string(), seg.position.message_number);
                self.positions.push(entry);
                Control::Continue
            }
        }

        let input =
            b"UNA:+.? 'UNB+UNOC:3'UNH+001'BGM+E03'UNT+2+001'UNH+002'BGM+E03'UNT+2+002'UNZ+2'";
        let mut tracker = PositionTracker {
            positions: Vec::new(),
        };
        EdifactStreamParser::parse(input, &mut tracker).unwrap();

        let expected: [(&str, u32); 8] = [
            // Interchange header sits outside any message.
            ("UNB", 0),
            // First message.
            ("UNH", 1),
            ("BGM", 1),
            ("UNT", 1),
            // Second message.
            ("UNH", 2),
            ("BGM", 2),
            ("UNT", 2),
            // Interchange trailer sits outside any message.
            ("UNZ", 0),
        ];
        for (index, (id, message_number)) in expected.iter().enumerate() {
            assert_eq!(tracker.positions[index], (id.to_string(), *message_number));
        }
    }

    /// Empty input is accepted.
    #[test]
    fn test_parse_empty_input() {
        struct NoOp;
        impl EdifactHandler for NoOp {}

        let mut noop = NoOp;
        assert!(EdifactStreamParser::parse(b"", &mut noop).is_ok());
    }

    /// Raw ISO-8859-1 bytes above 0x7F reach the handler as proper UTF-8.
    #[test]
    fn test_parse_iso_8859_1_preserves_characters() {
        // ISO-8859-1 encodes ü as 0xFC and ß as 0xDF. Both are single bytes
        // above 0x7F and therefore make the input invalid as UTF-8.
        let mut input: Vec<u8> = Vec::new();
        input.extend_from_slice(b"UNA:+.? '");
        input.extend_from_slice(b"UNB+UNOC:3+SENDER+RECEIVER+210101:1200+REF'");
        input.extend_from_slice(b"UNH+001+UTILMD:D:11A:UN'");
        // NAD segment carrying "Müller".
        input.extend_from_slice(b"NAD+Z09+++M");
        input.push(0xFC);
        input.extend_from_slice(b"ller:Max::::Herr'");
        // LOC segment carrying "Hauptstraße".
        input.extend_from_slice(b"LOC+Z16+++Hauptstra");
        input.push(0xDF);
        input.extend_from_slice(b"e::5'");
        input.extend_from_slice(b"UNT+4+001'");
        input.extend_from_slice(b"UNZ+1+REF'");

        struct SegCollector {
            segments: Vec<(String, Vec<Vec<String>>)>,
        }
        impl EdifactHandler for SegCollector {
            fn on_segment(&mut self, seg: &RawSegment) -> Control {
                let elements: Vec<Vec<String>> = seg
                    .elements
                    .iter()
                    .map(|element| {
                        element
                            .iter()
                            .map(|component| component.to_string())
                            .collect()
                    })
                    .collect();
                self.segments.push((seg.id.to_string(), elements));
                Control::Continue
            }
        }

        let mut collector = SegCollector {
            segments: Vec::new(),
        };
        EdifactStreamParser::parse(&input, &mut collector).unwrap();

        // NAD+Z09+++Müller:Max::::Herr → elements: [Z09], [], [], [Müller,Max,...,Herr]
        let (_, nad_elements) = collector
            .segments
            .iter()
            .find(|(id, _)| id == "NAD")
            .expect("NAD segment should be present");
        let name = &nad_elements[3][0]; // element 3, component 0
        assert!(
            name.contains("ller"),
            "NAD name component should contain 'ller', got: {:?}",
            name
        );
        assert!(
            name.contains('ü'),
            "NAD name should contain ü (transcoded from ISO-8859-1 0xFC), got: {:?}",
            name
        );

        // LOC+Z16+++Hauptstraße::5 → elements: [Z16], [], [], [Hauptstraße,,5]
        let (_, loc_elements) = collector
            .segments
            .iter()
            .find(|(id, _)| id == "LOC")
            .expect("LOC segment should be present");
        let street = &loc_elements[3][0]; // element 3, component 0
        assert!(
            street.contains("stra"),
            "LOC street should contain 'stra', got: {:?}",
            street
        );
        assert!(
            street.contains('ß'),
            "LOC street should contain ß (transcoded from ISO-8859-1 0xDF), got: {:?}",
            street
        );
    }

    /// An escaped `+` inside a DTM value must not split the element.
    #[test]
    fn test_parse_real_world_dtm_with_timezone() {
        struct DtmCollector {
            dtm_values: Vec<String>,
        }
        impl EdifactHandler for DtmCollector {
            fn on_segment(&mut self, seg: &RawSegment) -> Control {
                if seg.is("DTM") {
                    let rendered =
                        format!("{}={}", seg.get_component(0, 0), seg.get_component(0, 1));
                    self.dtm_values.push(rendered);
                }
                Control::Continue
            }
        }

        let input = b"UNA:+.? 'UNB+UNOC:3'UNH+001'DTM+137:202506190130?+00:303'UNT+2+001'UNZ+1'";
        let mut collector = DtmCollector {
            dtm_values: Vec::new(),
        };
        EdifactStreamParser::parse(input, &mut collector).unwrap();

        assert_eq!(collector.dtm_values.len(), 1);
        assert_eq!(collector.dtm_values[0], "137=202506190130?+00");
    }

    mod fuzz {
        use super::*;
        use proptest::prelude::*;

        /// Handler that implements every callback and only counts segments.
        struct FuzzHandler {
            segment_count: usize,
        }

        impl EdifactHandler for FuzzHandler {
            fn on_delimiters(&mut self, _d: &EdifactDelimiters, _una: bool) {}

            fn on_interchange_start(&mut self, _unb: &RawSegment) -> Control {
                Control::Continue
            }

            fn on_message_start(&mut self, _unh: &RawSegment) -> Control {
                Control::Continue
            }

            fn on_segment(&mut self, _seg: &RawSegment) -> Control {
                self.segment_count += 1;
                // Safety valve so that huge inputs cannot run unbounded.
                if self.segment_count <= 10_000 {
                    Control::Continue
                } else {
                    Control::Stop
                }
            }

            fn on_message_end(&mut self, _unt: &RawSegment) {}

            fn on_interchange_end(&mut self, _unz: &RawSegment) {}

            fn on_error(&mut self, _error: ParseError) -> Control {
                // Keep going after errors so more of the input is exercised.
                Control::Continue
            }
        }

        proptest! {
            #[test]
            fn parser_never_panics_on_arbitrary_input(bytes in proptest::collection::vec(any::<u8>(), 0..1024)) {
                let mut fuzz_handler = FuzzHandler { segment_count: 0 };
                // An error result is acceptable here; a panic is not.
                let _ = EdifactStreamParser::parse(&bytes, &mut fuzz_handler);
            }

            #[test]
            fn parser_never_panics_on_ascii_input(text in "[A-Z0-9:+.?' \n\r]{0,512}") {
                let mut fuzz_handler = FuzzHandler { segment_count: 0 };
                let _ = EdifactStreamParser::parse(text.as_bytes(), &mut fuzz_handler);
            }

            #[test]
            fn parser_handles_valid_looking_messages(
                sender in "[A-Z0-9]{10,13}",
                receiver in "[A-Z0-9]{10,13}",
                ref_num in "[A-Z0-9]{5,10}",
            ) {
                let interchange = format!(
                    "UNA:+.? 'UNB+UNOC:3+{}+{}+210101:1200+{}'UNZ+0+{}'",
                    sender, receiver, ref_num, ref_num,
                );
                let mut fuzz_handler = FuzzHandler { segment_count: 0 };
                let outcome = EdifactStreamParser::parse(interchange.as_bytes(), &mut fuzz_handler);
                prop_assert!(outcome.is_ok());
                // UNB and UNZ must both have been delivered.
                prop_assert!(fuzz_handler.segment_count >= 2);
            }
        }
    }
}