1#[cfg(test)]
2use edifact_primitives::Control;
3use edifact_primitives::{EdifactDelimiters, RawSegment, SegmentPosition};
4
5use crate::error::ParseError;
6use crate::handler::EdifactHandler;
7use crate::segment_builder::SegmentBuilder as SegBuilder;
8use crate::tokenizer::EdifactTokenizer;
9
10pub struct EdifactStreamParser;
16
17impl EdifactStreamParser {
18 pub fn parse(input: &[u8], handler: &mut dyn EdifactHandler) -> Result<(), ParseError> {
31 if std::str::from_utf8(input).is_ok() {
32 Self::parse_inner(input, handler)
33 } else {
34 let transcoded = transcode_iso_8859_1_to_utf8(input);
35 Self::parse_inner(&transcoded, handler)
36 }
37 }
38
39 fn parse_inner(input: &[u8], handler: &mut dyn EdifactHandler) -> Result<(), ParseError> {
40 let (has_una, delimiters) = EdifactDelimiters::detect(input);
42 handler.on_delimiters(&delimiters, has_una);
43
44 let content_start = if has_una { 9 } else { 0 };
46 let content = &input[content_start..];
47
48 let tokenizer = EdifactTokenizer::new(delimiters);
50 let seg_builder = SegBuilder::new(delimiters);
51
52 let mut segment_number: u32 = 0;
53 let mut message_number: u32 = 0;
54 let mut byte_offset = content_start;
55
56 for segment_str in tokenizer.tokenize_segments(content) {
57 segment_number += 1;
58
59 let position = SegmentPosition::new(segment_number, byte_offset, message_number);
60
61 let Some(raw_segment) = seg_builder.build(segment_str, position) else {
62 byte_offset += segment_str.len() + 1; continue;
64 };
65
66 if raw_segment.is("UNA") {
68 byte_offset += segment_str.len() + 1;
69 segment_number -= 1; continue;
71 }
72
73 let id_upper = raw_segment.id.to_ascii_uppercase();
74
75 if id_upper == "UNH" {
77 message_number += 1;
78 }
79
80 let effective_message_number = if id_upper == "UNB" || id_upper == "UNZ" {
82 0
83 } else {
84 message_number
85 };
86 let position =
87 SegmentPosition::new(segment_number, byte_offset, effective_message_number);
88 let raw_segment = RawSegment::new(raw_segment.id, raw_segment.elements, position);
89
90 match id_upper.as_str() {
92 "UNB" => {
93 if handler.on_interchange_start(&raw_segment).should_stop() {
94 return Ok(());
95 }
96 }
97 "UNH" => {
98 if handler.on_message_start(&raw_segment).should_stop() {
99 return Ok(());
100 }
101 }
102 "UNT" => {
103 handler.on_message_end(&raw_segment);
104 }
105 "UNZ" => {
106 handler.on_interchange_end(&raw_segment);
107 }
108 _ => {}
109 }
110
111 if handler.on_segment(&raw_segment).should_stop() {
113 return Ok(());
114 }
115
116 byte_offset += segment_str.len() + 1; }
118
119 Ok(())
120 }
121}
122
/// Re-encodes ISO-8859-1 bytes as UTF-8.
///
/// Every input byte is interpreted as the Unicode code point with the same
/// value (U+0000..=U+00FF). Bytes below `0x80` are therefore copied unchanged,
/// while bytes from `0x80` upwards become two-byte UTF-8 sequences.
fn transcode_iso_8859_1_to_utf8(input: &[u8]) -> Vec<u8> {
    // Reserve some headroom for the bytes that expand to two bytes.
    let mut utf8 = Vec::with_capacity(input.len() + input.len() / 4);
    // Code points up to U+00FF never need more than two UTF-8 bytes.
    let mut scratch = [0u8; 2];
    for &byte in input {
        let encoded = char::from(byte).encode_utf8(&mut scratch);
        utf8.extend_from_slice(encoded.as_bytes());
    }
    utf8
}
142
#[cfg(test)]
mod tests {
    use super::*;

    /// Test handler that records every callback as a readable string, in call order.
    struct EventCollector {
        log: Vec<String>,
    }

    impl EventCollector {
        fn new() -> Self {
            Self { log: Vec::new() }
        }

        /// Appends one entry to the callback log.
        fn record(&mut self, entry: String) {
            self.log.push(entry);
        }

        /// Returns a copy of everything recorded so far.
        fn events(&self) -> Vec<String> {
            self.log.clone()
        }
    }

    impl EdifactHandler for EventCollector {
        fn on_delimiters(&mut self, _d: &EdifactDelimiters, explicit_una: bool) {
            self.record(format!("DELIMITERS(una={})", explicit_una));
        }

        fn on_interchange_start(&mut self, unb: &RawSegment) -> Control {
            self.record(format!("INTERCHANGE_START({})", unb.id));
            Control::Continue
        }

        fn on_message_start(&mut self, unh: &RawSegment) -> Control {
            self.record(format!("MESSAGE_START(ref={})", unh.get_element(0)));
            Control::Continue
        }

        fn on_segment(&mut self, seg: &RawSegment) -> Control {
            self.record(format!("SEGMENT({})", seg.id));
            Control::Continue
        }

        fn on_message_end(&mut self, _unt: &RawSegment) {
            self.record("MESSAGE_END".to_string());
        }

        fn on_interchange_end(&mut self, _unz: &RawSegment) {
            self.record("INTERCHANGE_END".to_string());
        }
    }

    /// A complete one-message interchange produces the callbacks in the expected order.
    #[test]
    fn test_parse_minimal_interchange() {
        let input = b"UNA:+.? 'UNB+UNOC:3+SENDER+RECEIVER+210101:1200+REF001'UNH+MSG001+UTILMD:D:11A:UN:S2.1'BGM+E03+DOC001'UNT+3+MSG001'UNZ+1+REF001'";

        let mut handler = EventCollector::new();
        EdifactStreamParser::parse(input, &mut handler).unwrap();

        let expected = [
            "DELIMITERS(una=true)",
            "INTERCHANGE_START(UNB)",
            "SEGMENT(UNB)",
            "MESSAGE_START(ref=MSG001)",
            "SEGMENT(UNH)",
            "SEGMENT(BGM)",
            "MESSAGE_END",
            "SEGMENT(UNT)",
            "INTERCHANGE_END",
            "SEGMENT(UNZ)",
        ];
        let events = handler.events();
        for (index, want) in expected.iter().enumerate() {
            assert_eq!(events[index], *want);
        }
    }

    /// Without a UNA header the delimiters are reported as not explicit.
    #[test]
    fn test_parse_without_una() {
        let input = b"UNB+UNOC:3+SENDER+RECEIVER'UNZ+0+REF'";

        let mut handler = EventCollector::new();
        EdifactStreamParser::parse(input, &mut handler).unwrap();

        let expected = ["DELIMITERS(una=false)", "INTERCHANGE_START(UNB)"];
        let events = handler.events();
        for (index, want) in expected.iter().enumerate() {
            assert_eq!(events[index], *want);
        }
    }

    /// A `Control::Stop` from `on_segment` prevents any later segment from being reported.
    #[test]
    fn test_parse_handler_stops_early() {
        struct StopOnBgm {
            segments_seen: Vec<String>,
        }
        impl EdifactHandler for StopOnBgm {
            fn on_segment(&mut self, seg: &RawSegment) -> Control {
                self.segments_seen.push(seg.id.to_string());
                let is_bgm = seg.is("BGM");
                if is_bgm {
                    Control::Stop
                } else {
                    Control::Continue
                }
            }
        }

        let input = b"UNA:+.? 'UNB+UNOC:3'UNH+001'BGM+E03'DTM+137:20250101'UNT+3+001'UNZ+1'";
        let mut handler = StopOnBgm {
            segments_seen: Vec::new(),
        };
        EdifactStreamParser::parse(input, &mut handler).unwrap();

        assert_eq!(handler.segments_seen, vec!["UNB", "UNH", "BGM"]);
    }

    /// Message numbers start at 1 with the first UNH; UNB and UNZ are reported with 0.
    #[test]
    fn test_parse_message_numbering() {
        struct PositionTracker {
            positions: Vec<(String, u32)>,
        }
        impl EdifactHandler for PositionTracker {
            fn on_segment(&mut self, seg: &RawSegment) -> Control {
                let entry = (seg.id.to_string(), seg.position.message_number);
                self.positions.push(entry);
                Control::Continue
            }
        }

        let input =
            b"UNA:+.? 'UNB+UNOC:3'UNH+001'BGM+E03'UNT+2+001'UNH+002'BGM+E03'UNT+2+002'UNZ+2'";
        let mut handler = PositionTracker {
            positions: Vec::new(),
        };
        EdifactStreamParser::parse(input, &mut handler).unwrap();

        let expected: [(&str, u32); 8] = [
            ("UNB", 0),
            ("UNH", 1),
            ("BGM", 1),
            ("UNT", 1),
            ("UNH", 2),
            ("BGM", 2),
            ("UNT", 2),
            ("UNZ", 0),
        ];
        for (index, &(id, number)) in expected.iter().enumerate() {
            assert_eq!(handler.positions[index], (id.to_string(), number));
        }
    }

    /// Empty input is accepted and yields `Ok`.
    #[test]
    fn test_parse_empty_input() {
        struct NoOp;
        impl EdifactHandler for NoOp {}

        let mut handler = NoOp;
        let outcome = EdifactStreamParser::parse(b"", &mut handler);
        assert!(outcome.is_ok());
    }

    /// Non-UTF-8 input is treated as ISO-8859-1, so umlauts survive as proper characters.
    #[test]
    fn test_parse_iso_8859_1_preserves_characters() {
        // Same byte sequence as a single message with two Latin-1 bytes embedded:
        // 0xFC (ü) inside the NAD name and 0xDF (ß) inside the LOC street.
        let mut input: Vec<u8> = Vec::new();
        input.extend_from_slice(b"UNA:+.? '");
        input.extend_from_slice(b"UNB+UNOC:3+SENDER+RECEIVER+210101:1200+REF'");
        input.extend_from_slice(b"UNH+001+UTILMD:D:11A:UN'");
        input.extend_from_slice(b"NAD+Z09+++M");
        input.push(0xFC);
        input.extend_from_slice(b"ller:Max::::Herr'");
        input.extend_from_slice(b"LOC+Z16+++Hauptstra");
        input.push(0xDF);
        input.extend_from_slice(b"e::5'");
        input.extend_from_slice(b"UNT+4+001'");
        input.extend_from_slice(b"UNZ+1+REF'");

        struct SegCollector {
            segments: Vec<(String, Vec<Vec<String>>)>,
        }
        impl EdifactHandler for SegCollector {
            fn on_segment(&mut self, seg: &RawSegment) -> Control {
                let elements: Vec<Vec<String>> = seg
                    .elements
                    .iter()
                    .map(|element| {
                        element
                            .iter()
                            .map(|component| component.to_string())
                            .collect()
                    })
                    .collect();
                self.segments.push((seg.id.to_string(), elements));
                Control::Continue
            }
        }

        let mut handler = SegCollector {
            segments: Vec::new(),
        };
        EdifactStreamParser::parse(&input, &mut handler).unwrap();

        let (_, nad_elements) = handler
            .segments
            .iter()
            .find(|(id, _)| id == "NAD")
            .expect("NAD segment should be present");
        let name = &nad_elements[3][0];
        assert!(
            name.contains("ller"),
            "NAD name component should contain 'ller', got: {:?}",
            name
        );
        assert!(
            name.contains('ü'),
            "NAD name should contain ü (transcoded from ISO-8859-1 0xFC), got: {:?}",
            name
        );

        let (_, loc_elements) = handler
            .segments
            .iter()
            .find(|(id, _)| id == "LOC")
            .expect("LOC segment should be present");
        let street = &loc_elements[3][0];
        assert!(
            street.contains("stra"),
            "LOC street should contain 'stra', got: {:?}",
            street
        );
        assert!(
            street.contains('ß'),
            "LOC street should contain ß (transcoded from ISO-8859-1 0xDF), got: {:?}",
            street
        );
    }

    /// An escaped `+` inside a DTM value must not split the component.
    #[test]
    fn test_parse_real_world_dtm_with_timezone() {
        struct DtmCollector {
            dtm_values: Vec<String>,
        }
        impl EdifactHandler for DtmCollector {
            fn on_segment(&mut self, seg: &RawSegment) -> Control {
                if !seg.is("DTM") {
                    return Control::Continue;
                }
                let qualifier = seg.get_component(0, 0);
                let value = seg.get_component(0, 1);
                self.dtm_values.push(format!("{}={}", qualifier, value));
                Control::Continue
            }
        }

        let input = b"UNA:+.? 'UNB+UNOC:3'UNH+001'DTM+137:202506190130?+00:303'UNT+2+001'UNZ+1'";
        let mut handler = DtmCollector {
            dtm_values: Vec::new(),
        };
        EdifactStreamParser::parse(input, &mut handler).unwrap();

        assert_eq!(handler.dtm_values.len(), 1);
        assert_eq!(handler.dtm_values[0], "137=202506190130?+00");
    }

    /// Property-based tests: the parser must cope with arbitrary input without panicking.
    mod fuzz {
        use super::*;
        use proptest::prelude::*;

        /// Handler that only counts segments and bails out after 10 000 of them.
        struct FuzzHandler {
            segment_count: usize,
        }

        impl FuzzHandler {
            fn new() -> Self {
                Self { segment_count: 0 }
            }
        }

        impl EdifactHandler for FuzzHandler {
            fn on_delimiters(&mut self, _d: &EdifactDelimiters, _una: bool) {}

            fn on_interchange_start(&mut self, _unb: &RawSegment) -> Control {
                Control::Continue
            }

            fn on_message_start(&mut self, _unh: &RawSegment) -> Control {
                Control::Continue
            }

            fn on_segment(&mut self, _seg: &RawSegment) -> Control {
                self.segment_count += 1;
                let over_budget = self.segment_count > 10_000;
                if over_budget {
                    Control::Stop
                } else {
                    Control::Continue
                }
            }

            fn on_message_end(&mut self, _unt: &RawSegment) {}

            fn on_interchange_end(&mut self, _unz: &RawSegment) {}

            fn on_error(&mut self, _error: ParseError) -> Control {
                Control::Continue
            }
        }

        proptest! {
            #[test]
            fn parser_never_panics_on_arbitrary_input(input in proptest::collection::vec(any::<u8>(), 0..1024)) {
                let mut handler = FuzzHandler::new();
                // Only the absence of a panic matters; the result is irrelevant.
                let _ = EdifactStreamParser::parse(&input, &mut handler);
            }

            #[test]
            fn parser_never_panics_on_ascii_input(input in "[A-Z0-9:+.?' \n\r]{0,512}") {
                let mut handler = FuzzHandler::new();
                let _ = EdifactStreamParser::parse(input.as_bytes(), &mut handler);
            }

            #[test]
            fn parser_handles_valid_looking_messages(
                sender in "[A-Z0-9]{10,13}",
                receiver in "[A-Z0-9]{10,13}",
                ref_num in "[A-Z0-9]{5,10}",
            ) {
                let msg = format!(
                    "UNA:+.? 'UNB+UNOC:3+{}+{}+210101:1200+{}'UNZ+0+{}'",
                    sender, receiver, ref_num, ref_num,
                );
                let mut handler = FuzzHandler::new();
                let outcome = EdifactStreamParser::parse(msg.as_bytes(), &mut handler);
                prop_assert!(outcome.is_ok());
                // At least UNB and UNZ must have been reported.
                prop_assert!(handler.segment_count >= 2);
            }
        }
    }
}