// edifact-parser 0.1.60
//
// Streaming EDIFACT tokenizer and SAX-style parser — standalone, no BO4E dependency.
// Documentation
//! Auto-detect BDEW format version from EDIFACT input.
//!
//! Looks at the UNH S009 composite (`MessageType:Directory:Release:Agency:Version`)
//! and matches `(message_type, version)` against a known table.

use edifact_primitives::{Control, RawSegment};
use thiserror::Error;

use crate::{EdifactHandler, EdifactStreamParser};

/// Result of a successful format-version detection.
///
/// `format_version` is a BDEW format version string like `"FV2504"`. `note` is
/// `Some` when the UNH version string matched multiple format versions and the
/// newest one was picked.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectResult {
    /// BDEW format version, e.g. `"FV2504"`.
    pub format_version: &'static str,
    /// EDIFACT message type from UNH S009.0065 (first S009 component), e.g. `"UTILMD"`.
    pub message_type: String,
    /// Raw UNH version string, e.g. `"S2.1"`. This is the fifth S009 component
    /// (index 4), i.e. the association assigned code S009.0057 — not the release
    /// number S009.0054, which sits at index 2.
    pub unh_version: String,
    /// Set when multiple format versions matched and the newest was selected.
    /// The text has the form `"also matches <FV>, <FV>"` and names the versions
    /// that were not selected.
    pub note: Option<String>,
}

/// Errors returned by [`detect_format_version`].
#[derive(Debug, Clone, Error, PartialEq, Eq)]
pub enum DetectError {
    /// No UNH segment was found in the input.
    ///
    /// Also returned for input that is empty once leading whitespace and a
    /// byte-order mark have been stripped.
    #[error("input contains no UNH segment")]
    NoUnh,
    /// The UNH version string is not in the lookup table for this message type.
    /// `known` lists the UNH version strings the table currently knows for
    /// `message_type`.
    #[error("unknown version '{unh_version}' for {message_type} (known: {known:?})")]
    UnknownVersion {
        /// Message type taken from the UNH segment.
        message_type: String,
        /// Version string taken from the UNH segment that had no table entry.
        unh_version: String,
        /// Sorted, de-duplicated version strings the table holds for `message_type`.
        known: Vec<String>,
    },
    /// The UNH message type is not covered by the auto-detection table.
    #[error("message type '{message_type}' not supported by auto-detection")]
    UnsupportedMessageType {
        /// Message type taken from the UNH segment.
        message_type: String,
    },
    /// The input could not be parsed as EDIFACT (parser error or malformed UNH).
    ///
    /// The payload is a human-readable description: either the parser's own
    /// error text or a message about the missing S009 version component.
    #[error("failed to parse EDIFACT input: {0}")]
    ParseFailure(String),
}

/// Detect the BDEW format version of an EDIFACT message.
///
/// Performs a single-pass scan of the input, captures the first UNH segment's
/// S009 message type and version, and looks `(message_type, unh_version)` up in
/// the static `VERSION_TABLE`. Scanning stops after the first UNH, so
/// multi-message interchanges are detected from the first message only.
///
/// Leading spaces, tabs, line breaks and a byte-order mark (`U+FEFF`) are
/// skipped before the input is handed to the parser.
///
/// When the pair maps to several format versions, the newest one is returned
/// and [`DetectResult::note`] names the remaining ones.
///
/// # Errors
///
/// * [`DetectError::NoUnh`] — the input is empty after trimming, or no message
///   type was captured from a UNH segment.
/// * [`DetectError::ParseFailure`] — the parser reported an error, or the UNH
///   S009 composite has no version component.
/// * [`DetectError::UnsupportedMessageType`] — the message type has no rows in
///   the table.
/// * [`DetectError::UnknownVersion`] — the message type is known but the
///   version string is not.
pub fn detect_format_version(edifact: &str) -> Result<DetectResult, DetectError> {
    let trimmed = edifact.trim_start_matches([' ', '\t', '\r', '\n', '\u{feff}']);
    if trimmed.is_empty() {
        return Err(DetectError::NoUnh);
    }

    let mut handler = UnhCapture::default();
    EdifactStreamParser::parse(trimmed.as_bytes(), &mut handler)
        .map_err(|e| DetectError::ParseFailure(e.to_string()))?;

    let message_type = handler.message_type.ok_or(DetectError::NoUnh)?;
    let unh_version = handler
        .unh_version
        .ok_or_else(|| DetectError::ParseFailure("UNH S009 missing version component".into()))?;

    if !message_type_supported(&message_type) {
        return Err(DetectError::UnsupportedMessageType { message_type });
    }

    let mut candidates: Vec<&'static str> = VERSION_TABLE
        .iter()
        .filter(|(mt, ver, _)| *mt == message_type && *ver == unh_version)
        .map(|(_, _, fv)| *fv)
        .collect();
    // Newest first. "FVyymm" strings order chronologically under plain string
    // comparison, so a reversed lexicographic sort is sufficient.
    candidates.sort_unstable_by(|a, b| b.cmp(a));
    // The table is maintained by hand; an accidentally duplicated row must not
    // turn a unique match into a note that names the selected version itself.
    candidates.dedup();

    match candidates.split_first() {
        None => {
            let known = known_versions_for(&message_type);
            Err(DetectError::UnknownVersion {
                message_type,
                unh_version,
                known,
            })
        }
        Some((&newest, older)) => {
            // `older` is empty for an unambiguous match, which yields no note.
            let note = if older.is_empty() {
                None
            } else {
                Some(format!("also matches {}", older.join(", ")))
            };
            Ok(DetectResult {
                format_version: newest,
                message_type,
                unh_version,
                note,
            })
        }
    }
}

/// `(message_type, unh_version, format_version)`.
///
/// Maintained by hand. Add new rows when a new FV ships. The
/// `version_table_covers_mig_xml` test in `tests/format_detection_coverage.rs`
/// asserts every `(message_type, version)` from the MIG XML submodule is here.
///
/// The same `(message_type, unh_version)` pair may appear under several format
/// versions; trailing comments on such rows name the earlier format versions
/// that carry the same pair.
const VERSION_TABLE: &[(&str, &str, &str)] = &[
    // FV2504
    ("APERAK", "2.1i", "FV2504"),
    ("COMDIS", "1.0e", "FV2504"),
    ("IFTSTA", "2.0f", "FV2504"),
    ("INVOIC", "2.8d", "FV2504"),
    ("MSCONS", "2.4c", "FV2504"),
    ("ORDERS", "1.4a", "FV2504"),
    ("ORDRSP", "1.4", "FV2504"),
    ("PARTIN", "1.0e", "FV2504"),
    ("PRICAT", "2.0d", "FV2504"),
    ("QUOTES", "1.3a", "FV2504"),
    ("REMADV", "2.9c", "FV2504"),
    ("REQOTE", "1.3b", "FV2504"),
    ("UTILMD", "G1.0a", "FV2504"),
    ("UTILMD", "S2.1", "FV2504"),
    ("UTILTS", "1.1e", "FV2504"),
    // FV2510
    ("APERAK", "2.1i", "FV2510"), // also matches FV2504
    ("COMDIS", "1.0f", "FV2510"),
    ("IFTSTA", "2.0g", "FV2510"),
    ("INVOIC", "2.8e", "FV2510"),
    ("MSCONS", "2.4c", "FV2510"), // also matches FV2504
    ("ORDERS", "1.4b", "FV2510"),
    ("ORDRSP", "1.4a", "FV2510"),
    ("PARTIN", "1.0e", "FV2510"), // also matches FV2504
    ("PRICAT", "2.0e", "FV2510"),
    ("QUOTES", "1.3b", "FV2510"),
    ("REMADV", "2.9d", "FV2510"),
    ("REQOTE", "1.3c", "FV2510"),
    ("UTILMD", "G1.0a", "FV2510"), // also matches FV2504
    ("UTILMD", "S2.1", "FV2510"), // also matches FV2504
    ("UTILTS", "1.1e", "FV2510"), // also matches FV2504
    // FV2604
    ("APERAK", "2.1i", "FV2604"), // also matches FV2504/FV2510
    ("COMDIS", "1.0g", "FV2604"),
    ("IFTSTA", "2.0g", "FV2604"), // also matches FV2510
    ("INVOIC", "2.8e", "FV2604"), // also matches FV2510
    ("MSCONS", "2.4c", "FV2604"), // also matches FV2504/FV2510
    ("ORDERS", "1.4b", "FV2604"), // also matches FV2510
    ("ORDRSP", "1.4b", "FV2604"),
    ("PARTIN", "1.0f", "FV2604"),
    ("PRICAT", "2.0e", "FV2604"), // also matches FV2510
    ("QUOTES", "1.3b", "FV2604"), // also matches FV2510
    ("REMADV", "2.9e", "FV2604"),
    ("REQOTE", "1.3c", "FV2604"), // also matches FV2510
    ("UTILMD", "G1.1", "FV2604"),
    ("UTILMD", "S2.1", "FV2604"), // also matches FV2504/FV2510
    ("UTILTS", "1.1e", "FV2604"), // also matches FV2504/FV2510
    // FV2610
    ("APERAK", "2.2", "FV2610"),
    ("COMDIS", "1.0g", "FV2610"), // also matches FV2604
    ("IFTSTA", "2.1", "FV2610"),
    ("INVOIC", "2.8e", "FV2610"), // also matches FV2510/FV2604
    ("MSCONS", "2.5", "FV2610"),
    ("ORDERS", "1.4c", "FV2610"),
    ("ORDRSP", "1.4c", "FV2610"),
    ("PARTIN", "1.1", "FV2610"),
    ("PRICAT", "2.1", "FV2610"),
    ("QUOTES", "1.3c", "FV2610"),
    ("REMADV", "2.9e", "FV2610"), // also matches FV2604
    ("REQOTE", "1.3c", "FV2610"), // also matches FV2510/FV2604
    ("UTILMD", "G1.2", "FV2610"),
    ("UTILMD", "S2.2", "FV2610"),
    ("UTILTS", "1.1e", "FV2610"), // also matches FV2504/FV2510/FV2604
];

/// Collects every UNH version string the table lists for `message_type`.
///
/// The result is sorted ascending and free of duplicates; it is empty when the
/// message type has no rows at all.
fn known_versions_for(message_type: &str) -> Vec<String> {
    // Work on the borrowed table strings first and allocate only for the
    // entries that survive de-duplication.
    let mut listed: Vec<&'static str> = VERSION_TABLE
        .iter()
        .filter(|row| row.0 == message_type)
        .map(|row| row.1)
        .collect();
    listed.sort_unstable();
    listed.dedup();
    listed.into_iter().map(str::to_owned).collect()
}

/// Reports whether the table holds at least one row for `message_type`.
///
/// The comparison is an exact, case-sensitive string match.
fn message_type_supported(message_type: &str) -> bool {
    for &(listed_type, _, _) in VERSION_TABLE {
        if listed_type == message_type {
            return true;
        }
    }
    false
}

/// Parser handler that records the S009 message type and version of the first
/// UNH segment it is shown. Both fields start out as `None` via `Default`.
#[derive(Default)]
struct UnhCapture {
    /// First component of the UNH S009 composite, if a UNH with S009 was seen.
    message_type: Option<String>,
    /// Fifth component (index 4) of the UNH S009 composite, if present.
    unh_version: Option<String>,
}

impl EdifactHandler for UnhCapture {
    /// Stores the message type and version found in `unh` and asks the parser
    /// to stop, so only the first message of an interchange is inspected.
    fn on_message_start(&mut self, unh: &RawSegment) -> Control {
        // UNH element 0 carries the message reference number; the S009
        // composite follows at element index 1.
        // S009 component order: 0=type, 1=directory, 2=release, 3=agency, 4=version.
        // `RawSegment.elements` is `Vec<Vec<&str>>`: outer = elements, inner = components.
        let s009 = unh.elements.get(1);
        if let Some(components) = s009 {
            let component_at = |idx: usize| components.get(idx).map(|part| part.to_string());
            self.message_type = component_at(0);
            self.unh_version = component_at(4);
        }
        Control::Stop
    }
}

/// Unit tests for [`detect_format_version`], driven by small inline EDIFACT
/// interchanges.
#[cfg(test)]
mod tests {
    use super::*;

    // An empty string is rejected before the parser is invoked.
    #[test]
    fn empty_input_returns_no_unh() {
        let err = detect_format_version("").unwrap_err();
        assert_eq!(err, DetectError::NoUnh);
    }

    // Message type and version are read from the first and fifth S009 components.
    #[test]
    fn extracts_unh_s009_for_utilmd_s2_1() {
        let input = "UNB+UNOC:3+sender+recv+250505:0826+REF'\
                     UNH+REF+UTILMD:D:11A:UN:S2.1'\
                     UNT+1+REF'\
                     UNZ+1+REF'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.message_type, "UTILMD");
        assert_eq!(result.unh_version, "S2.1");
    }

    // UTILMD G1.1 is listed under FV2604 only, so the match is unambiguous.
    #[test]
    fn maps_utilmd_g1_1_to_fv2604() {
        let input = "UNB+UNOC:3+s+r+260211:1006+R'\
                     UNH+R+UTILMD:D:11A:UN:G1.1'\
                     UNT+1+R'UNZ+1+R'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.format_version, "FV2604");
        assert_eq!(result.note, None);
    }

    // A supported message type with an unlisted version reports the versions
    // the table does know for that type.
    #[test]
    fn unknown_version_returns_known_list() {
        let input = "UNB+UNOC:3+s+r+250505:0826+R'\
                     UNH+R+UTILMD:D:11A:UN:S2.0a'\
                     UNT+1+R'UNZ+1+R'";
        let err = detect_format_version(input).unwrap_err();
        match err {
            DetectError::UnknownVersion {
                message_type,
                unh_version,
                known,
            } => {
                assert_eq!(message_type, "UTILMD");
                assert_eq!(unh_version, "S2.0a");
                assert!(known.contains(&"S2.1".to_string()));
            }
            other => panic!("expected UnknownVersion, got {other:?}"),
        }
    }

    // A message type with no table rows is rejected regardless of its version.
    #[test]
    fn unsupported_message_type() {
        let input = "UNB+UNOC:3+s+r+250505:0826+R'\
                     UNH+R+FOOBAR:D:01A:UN:1.0'\
                     UNT+1+R'UNZ+1+R'";
        let err = detect_format_version(input).unwrap_err();
        assert_eq!(
            err,
            DetectError::UnsupportedMessageType {
                message_type: "FOOBAR".into()
            }
        );
    }

    // UTILMD S2.1 is listed under FV2504, FV2510 and FV2604: the newest wins
    // and the note names the other two.
    #[test]
    fn utilmd_s2_1_picks_newest_with_note() {
        let input = "UNB+UNOC:3+s+r+251201:0826+R'\
                     UNH+R+UTILMD:D:11A:UN:S2.1'\
                     UNT+1+R'UNZ+1+R'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.format_version, "FV2604");
        let note = result.note.as_deref().unwrap_or("");
        assert!(note.contains("FV2504"), "note was: {note}");
        assert!(note.contains("FV2510"), "note was: {note}");
    }

    // MSCONS 2.4c is likewise shared by FV2504, FV2510 and FV2604.
    #[test]
    fn mscons_2_4c_picks_newest_with_note() {
        let input = "UNB+UNOC:3+s+r+260301:0826+R'\
                     UNH+R+MSCONS:D:04B:UN:2.4c'\
                     UNT+1+R'UNZ+1+R'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.format_version, "FV2604");
        assert!(result.note.as_deref().unwrap_or("").contains("FV2504"));
    }

    // A byte-order mark and whitespace ahead of UNB are stripped before parsing.
    #[test]
    fn handles_leading_whitespace_and_bom() {
        let input = "\u{feff}\r\n  UNB+UNOC:3+s+r+250505:0826+R'\
                     UNH+R+UTILMD:D:11A:UN:S2.1'\
                     UNT+1+R'UNZ+1+R'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.message_type, "UTILMD");
    }

    // An explicit UNA service string advice segment precedes UNB here.
    #[test]
    fn handles_una_with_default_delimiters() {
        let input = "UNA:+.? '\
                     UNB+UNOC:3+s+r+250505:0826+R'\
                     UNH+R+UTILMD:D:11A:UN:S2.1'\
                     UNT+1+R'UNZ+1+R'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.format_version, "FV2604"); // ambiguity → newest
    }

    // NOTE(review): despite the test name, both `UnknownVersion` (with an empty
    // version string) and `ParseFailure` are accepted below; only other
    // variants fail the test.
    #[test]
    fn empty_s009_version_is_unknown_version_not_parse_failure() {
        let input = "UNB+UNOC:3+s+r+250505:0826+R'\
                     UNH+R+UTILMD:D:11A:UN:'\
                     UNT+1+R'UNZ+1+R'";
        let err = detect_format_version(input).unwrap_err();
        match err {
            DetectError::UnknownVersion { unh_version, .. } => assert_eq!(unh_version, ""),
            DetectError::ParseFailure(_) => {
                // acceptable — depends on whether the parser keeps the trailing empty component
            }
            other => panic!("unexpected: {other:?}"),
        }
    }

    // Only the first UNH is consulted; the later APERAK message is ignored.
    #[test]
    fn multi_message_interchange_uses_first_unh() {
        let input = "UNB+UNOC:3+s+r+250505:0826+R'\
                     UNH+R1+UTILMD:D:11A:UN:S2.1'UNT+1+R1'\
                     UNH+R2+APERAK:D:07B:UN:2.1i'UNT+1+R2'\
                     UNZ+2+R'";
        let result = detect_format_version(input).unwrap();
        assert_eq!(result.message_type, "UTILMD");
    }

    // A well-formed interchange envelope without any message yields `NoUnh`.
    #[test]
    fn no_unh_just_unb() {
        let input = "UNB+UNOC:3+s+r+250505:0826+R'UNZ+0+R'";
        let err = detect_format_version(input).unwrap_err();
        assert_eq!(err, DetectError::NoUnh);
    }
}