Skip to main content

edifact_parser/
format_detection.rs

1//! Auto-detect BDEW format version from EDIFACT input.
2//!
3//! Looks at the UNH S009 composite (`MessageType:Directory:Release:Agency:Version`)
4//! and matches `(message_type, version)` against a known table.
5
6use edifact_primitives::{Control, RawSegment};
7use thiserror::Error;
8
9use crate::{EdifactHandler, EdifactStreamParser};
10
/// Result of a successful format-version detection.
///
/// `format_version` is a BDEW format version string like `"FV2504"`. `note` is
/// `Some` when the UNH version string matched multiple format versions and the
/// newest one was picked.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectResult {
    /// BDEW format version, e.g. `"FV2504"`.
    pub format_version: &'static str,
    /// EDIFACT message type from UNH S009.0065 (first S009 component), e.g.
    /// `"UTILMD"`.
    pub message_type: String,
    /// Raw UNH version string, e.g. `"S2.1c"`. This is the fifth S009
    /// component (association assigned code, data element 0057), not the
    /// release number 0054, which carries values such as `"11A"`.
    pub unh_version: String,
    /// Set when multiple format versions matched and the newest was selected.
    /// As filled in by [`detect_format_version`], the text has the form
    /// `"also matches <FV>, <FV>"` and lists the format versions that were
    /// not chosen.
    pub note: Option<String>,
}
27
/// Errors returned by [`detect_format_version`].
#[derive(Debug, Clone, Error, PartialEq, Eq)]
pub enum DetectError {
    /// No UNH segment was found in the input. Also returned for input that is
    /// empty once leading whitespace and a byte-order mark are removed.
    #[error("input contains no UNH segment")]
    NoUnh,
    /// The UNH version string is not in the lookup table for this message type.
    /// `known` lists the UNH version strings the table currently knows for
    /// `message_type`.
    #[error("unknown version '{unh_version}' for {message_type} (known: {known:?})")]
    UnknownVersion {
        /// Message type read from the UNH segment.
        message_type: String,
        /// Version string read from the UNH segment that had no table entry.
        unh_version: String,
        /// Sorted, de-duplicated version strings the table holds for
        /// `message_type`.
        known: Vec<String>,
    },
    /// The UNH message type is not covered by the auto-detection table.
    #[error("message type '{message_type}' not supported by auto-detection")]
    UnsupportedMessageType {
        /// Message type read from the UNH segment.
        message_type: String,
    },
    /// The input could not be parsed as EDIFACT (parser error or malformed UNH).
    /// The payload is a human-readable description of the failure.
    #[error("failed to parse EDIFACT input: {0}")]
    ParseFailure(String),
}
50
51/// Detect the BDEW format version of an EDIFACT message.
52///
53/// Performs a single-pass scan of the input, captures the first UNH segment's
54/// S009 version, and looks `(message_type, unh_version)` up in a static table.
55/// Scanning stops after the first UNH, so multi-message interchanges are
56/// detected from the first message only.
57pub fn detect_format_version(edifact: &str) -> Result<DetectResult, DetectError> {
58    let trimmed = edifact.trim_start_matches([' ', '\t', '\r', '\n', '\u{feff}']);
59    if trimmed.is_empty() {
60        return Err(DetectError::NoUnh);
61    }
62
63    let mut handler = UnhCapture::default();
64    EdifactStreamParser::parse(trimmed.as_bytes(), &mut handler)
65        .map_err(|e| DetectError::ParseFailure(e.to_string()))?;
66
67    let message_type = handler.message_type.ok_or(DetectError::NoUnh)?;
68    let unh_version = handler
69        .unh_version
70        .ok_or_else(|| DetectError::ParseFailure("UNH S009 missing version component".into()))?;
71
72    if !message_type_supported(&message_type) {
73        return Err(DetectError::UnsupportedMessageType { message_type });
74    }
75
76    let mut candidates: Vec<&'static str> = VERSION_TABLE
77        .iter()
78        .filter(|(mt, ver, _)| *mt == message_type && *ver == unh_version)
79        .map(|(_, _, fv)| *fv)
80        .collect();
81    // Sort newest first.
82    candidates.sort_by(|a, b| b.cmp(a));
83
84    match candidates.as_slice() {
85        [] => {
86            let known = known_versions_for(&message_type);
87            Err(DetectError::UnknownVersion {
88                message_type,
89                unh_version,
90                known,
91            })
92        }
93        [single] => Ok(DetectResult {
94            format_version: single,
95            message_type,
96            unh_version,
97            note: None,
98        }),
99        [newest, rest @ ..] => {
100            let other = rest.to_vec().join(", ");
101            Ok(DetectResult {
102                format_version: newest,
103                message_type,
104                unh_version,
105                note: Some(format!("also matches {}", other)),
106            })
107        }
108    }
109}
110
/// `(message_type, unh_version, format_version)`.
///
/// Maintained by hand. Add new rows when a new FV ships. The
/// `version_table_covers_mig_xml` test in `tests/format_detection_coverage.rs`
/// asserts every `(message_type, version)` from the MIG XML submodule is here.
///
/// The same `(message_type, unh_version)` pair may appear under several format
/// versions; such rows carry an "also matches" comment naming the earlier
/// format versions that share the pair.
const VERSION_TABLE: &[(&str, &str, &str)] = &[
    // FV2504
    ("APERAK", "2.1i", "FV2504"),
    ("COMDIS", "1.0e", "FV2504"),
    ("IFTSTA", "2.0f", "FV2504"),
    ("INVOIC", "2.8d", "FV2504"),
    ("MSCONS", "2.4c", "FV2504"),
    ("ORDERS", "1.4a", "FV2504"),
    ("ORDRSP", "1.4", "FV2504"),
    ("PARTIN", "1.0e", "FV2504"),
    ("PRICAT", "2.0d", "FV2504"),
    ("QUOTES", "1.3a", "FV2504"),
    ("REMADV", "2.9c", "FV2504"),
    ("REQOTE", "1.3b", "FV2504"),
    ("UTILMD", "G1.0a", "FV2504"),
    ("UTILMD", "S2.1", "FV2504"),
    ("UTILTS", "1.1e", "FV2504"),
    // FV2510
    ("APERAK", "2.1i", "FV2510"), // also matches FV2504
    ("COMDIS", "1.0f", "FV2510"),
    ("IFTSTA", "2.0g", "FV2510"),
    ("INVOIC", "2.8e", "FV2510"),
    ("MSCONS", "2.4c", "FV2510"), // also matches FV2504
    ("ORDERS", "1.4b", "FV2510"),
    ("ORDRSP", "1.4a", "FV2510"),
    ("PARTIN", "1.0e", "FV2510"), // also matches FV2504
    ("PRICAT", "2.0e", "FV2510"),
    ("QUOTES", "1.3b", "FV2510"),
    ("REMADV", "2.9d", "FV2510"),
    ("REQOTE", "1.3c", "FV2510"),
    ("UTILMD", "G1.0a", "FV2510"), // also matches FV2504
    ("UTILMD", "S2.1", "FV2510"), // also matches FV2504
    ("UTILTS", "1.1e", "FV2510"), // also matches FV2504
    // FV2604
    ("APERAK", "2.1i", "FV2604"), // also matches FV2504/FV2510
    ("COMDIS", "1.0g", "FV2604"),
    ("IFTSTA", "2.0g", "FV2604"), // also matches FV2510
    ("INVOIC", "2.8e", "FV2604"), // also matches FV2510
    ("MSCONS", "2.4c", "FV2604"), // also matches FV2504/FV2510
    ("ORDERS", "1.4b", "FV2604"), // also matches FV2510
    ("ORDRSP", "1.4b", "FV2604"),
    ("PARTIN", "1.0f", "FV2604"),
    ("PRICAT", "2.0e", "FV2604"), // also matches FV2510
    ("QUOTES", "1.3b", "FV2604"), // also matches FV2510
    ("REMADV", "2.9e", "FV2604"),
    ("REQOTE", "1.3c", "FV2604"), // also matches FV2510
    ("UTILMD", "G1.1", "FV2604"),
    ("UTILMD", "S2.1", "FV2604"), // also matches FV2504/FV2510
    ("UTILTS", "1.1e", "FV2604"), // also matches FV2504/FV2510
    // FV2610
    ("APERAK", "2.2", "FV2610"),
    ("COMDIS", "1.0g", "FV2610"), // also matches FV2604
    ("IFTSTA", "2.1", "FV2610"),
    ("INVOIC", "2.8e", "FV2610"), // also matches FV2510/FV2604
    ("MSCONS", "2.5", "FV2610"),
    ("ORDERS", "1.4c", "FV2610"),
    ("ORDRSP", "1.4c", "FV2610"),
    ("PARTIN", "1.1", "FV2610"),
    ("PRICAT", "2.1", "FV2610"),
    ("QUOTES", "1.3c", "FV2610"),
    ("REMADV", "2.9e", "FV2610"), // also matches FV2604
    ("REQOTE", "1.3c", "FV2610"), // also matches FV2510/FV2604
    ("UTILMD", "G1.2", "FV2610"),
    ("UTILMD", "S2.2", "FV2610"),
    ("UTILTS", "1.1e", "FV2610"), // also matches FV2504/FV2510/FV2604
];
182
183fn known_versions_for(message_type: &str) -> Vec<String> {
184    let mut versions: Vec<String> = VERSION_TABLE
185        .iter()
186        .filter(|(mt, _, _)| *mt == message_type)
187        .map(|(_, ver, _)| (*ver).to_string())
188        .collect();
189    versions.sort();
190    versions.dedup();
191    versions
192}
193
194fn message_type_supported(message_type: &str) -> bool {
195    VERSION_TABLE.iter().any(|(mt, _, _)| *mt == message_type)
196}
197
/// Parser handler state that records what the first UNH segment declares.
///
/// Both fields start as `None` (via `Default`) and are only filled in by
/// `on_message_start`.
#[derive(Default)]
struct UnhCapture {
    /// First component of the UNH S009 composite (the message type), if present.
    message_type: Option<String>,
    /// Fifth component of the UNH S009 composite (the version string), if present.
    unh_version: Option<String>,
}
203
204impl EdifactHandler for UnhCapture {
205    fn on_message_start(&mut self, unh: &RawSegment) -> Control {
206        // S009 is element index 1 of UNH (element 0 is the message reference number).
207        // Components of S009: 0=type, 1=directory, 2=release, 3=agency, 4=version.
208        // RawSegment.elements is `Vec<Vec<&str>>` where outer = elements, inner = components.
209        if let Some(s009) = unh.elements.get(1) {
210            self.message_type = s009.first().map(|s| s.to_string());
211            self.unh_version = s009.get(4).map(|s| s.to_string());
212        }
213        Control::Stop
214    }
215}
216
/// Unit tests for [`detect_format_version`], driven by small hand-written
/// interchanges. The `\` line continuations join the segments without any
/// whitespace between them.
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input is rejected before the parser is invoked.
    #[test]
    fn empty_input_returns_no_unh() {
        let err = detect_format_version("").unwrap_err();
        assert_eq!(err, DetectError::NoUnh);
    }

    // The message type and version are read from the UNH S009 composite.
    #[test]
    fn extracts_unh_s009_for_utilmd_s2_1() {
        let input = "UNB+UNOC:3+sender+recv+250505:0826+REF'\
                     UNH+REF+UTILMD:D:11A:UN:S2.1'\
                     UNT+1+REF'\
                     UNZ+1+REF'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.message_type, "UTILMD");
        assert_eq!(result.unh_version, "S2.1");
    }

    // UTILMD G1.1 has exactly one table row (FV2604), so no note is produced.
    #[test]
    fn maps_utilmd_g1_1_to_fv2604() {
        let input = "UNB+UNOC:3+s+r+260211:1006+R'\
                     UNH+R+UTILMD:D:11A:UN:G1.1'\
                     UNT+1+R'UNZ+1+R'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.format_version, "FV2604");
        assert_eq!(result.note, None);
    }

    // A supported message type with a version that has no table row reports
    // the versions the table does know for that type.
    #[test]
    fn unknown_version_returns_known_list() {
        let input = "UNB+UNOC:3+s+r+250505:0826+R'\
                     UNH+R+UTILMD:D:11A:UN:S2.0a'\
                     UNT+1+R'UNZ+1+R'";
        let err = detect_format_version(input).unwrap_err();
        match err {
            DetectError::UnknownVersion {
                message_type,
                unh_version,
                known,
            } => {
                assert_eq!(message_type, "UTILMD");
                assert_eq!(unh_version, "S2.0a");
                assert!(known.contains(&"S2.1".to_string()));
            }
            other => panic!("expected UnknownVersion, got {other:?}"),
        }
    }

    // A message type with no table rows at all is reported as unsupported.
    #[test]
    fn unsupported_message_type() {
        let input = "UNB+UNOC:3+s+r+250505:0826+R'\
                     UNH+R+FOOBAR:D:01A:UN:1.0'\
                     UNT+1+R'UNZ+1+R'";
        let err = detect_format_version(input).unwrap_err();
        assert_eq!(
            err,
            DetectError::UnsupportedMessageType {
                message_type: "FOOBAR".into()
            }
        );
    }

    // UTILMD S2.1 is listed under FV2504, FV2510 and FV2604: the newest wins
    // and the note names the other two.
    #[test]
    fn utilmd_s2_1_picks_newest_with_note() {
        let input = "UNB+UNOC:3+s+r+251201:0826+R'\
                     UNH+R+UTILMD:D:11A:UN:S2.1'\
                     UNT+1+R'UNZ+1+R'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.format_version, "FV2604");
        let note = result.note.as_deref().unwrap_or("");
        assert!(note.contains("FV2504"), "note was: {note}");
        assert!(note.contains("FV2510"), "note was: {note}");
    }

    // MSCONS 2.4c is likewise shared by FV2504, FV2510 and FV2604.
    #[test]
    fn mscons_2_4c_picks_newest_with_note() {
        let input = "UNB+UNOC:3+s+r+260301:0826+R'\
                     UNH+R+MSCONS:D:04B:UN:2.4c'\
                     UNT+1+R'UNZ+1+R'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.format_version, "FV2604");
        assert!(result.note.as_deref().unwrap_or("").contains("FV2504"));
    }

    // A byte-order mark, CRLF and spaces ahead of UNB are stripped before parsing.
    #[test]
    fn handles_leading_whitespace_and_bom() {
        let input = "\u{feff}\r\n  UNB+UNOC:3+s+r+250505:0826+R'\
                     UNH+R+UTILMD:D:11A:UN:S2.1'\
                     UNT+1+R'UNZ+1+R'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.message_type, "UTILMD");
    }

    // Input that starts with a UNA service string advice is still detected.
    #[test]
    fn handles_una_with_default_delimiters() {
        let input = "UNA:+.? '\
                     UNB+UNOC:3+s+r+250505:0826+R'\
                     UNH+R+UTILMD:D:11A:UN:S2.1'\
                     UNT+1+R'UNZ+1+R'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.format_version, "FV2604"); // ambiguity → newest
    }

    // An S009 composite whose version component is empty must produce an error.
    // NOTE(review): despite the test name, `ParseFailure` is accepted as well as
    // `UnknownVersion` — consider renaming the test or tightening the assertion.
    #[test]
    fn empty_s009_version_is_unknown_version_not_parse_failure() {
        let input = "UNB+UNOC:3+s+r+250505:0826+R'\
                     UNH+R+UTILMD:D:11A:UN:'\
                     UNT+1+R'UNZ+1+R'";
        let err = detect_format_version(input).unwrap_err();
        match err {
            DetectError::UnknownVersion { unh_version, .. } => assert_eq!(unh_version, ""),
            DetectError::ParseFailure(_) => {
                // acceptable — depends on whether the parser keeps the trailing empty component
            }
            other => panic!("unexpected: {other:?}"),
        }
    }

    // With two messages in one interchange, only the first UNH is considered.
    #[test]
    fn multi_message_interchange_uses_first_unh() {
        let input = "UNB+UNOC:3+s+r+250505:0826+R'\
                     UNH+R1+UTILMD:D:11A:UN:S2.1'UNT+1+R1'\
                     UNH+R2+APERAK:D:07B:UN:2.1i'UNT+1+R2'\
                     UNZ+2+R'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.message_type, "UTILMD");
    }

    // An interchange envelope without any message yields `NoUnh`.
    #[test]
    fn no_unh_just_unb() {
        let input = "UNB+UNOC:3+s+r+250505:0826+R'UNZ+0+R'";
        let err = detect_format_version(input).unwrap_err();
        assert_eq!(err, DetectError::NoUnh);
    }
}