use crate::edi_parse_error::EdiParseError;
/// Every segment of a tokenized document, in input order; each segment is
/// already split into its element tokens.
pub type DocumentTokens<'a> = Vec<SegmentTokens<'a>>;
/// One segment's element tokens, borrowed as slices of the original input.
pub type SegmentTokens<'a> = Vec<&'a str>;
/// Output of [`tokenize`]: the three delimiters discovered in the ISA
/// header plus the tokenized segments, which borrow from the input string.
pub(crate) struct TokenizeResult<'a> {
    /// Separates elements within a segment (read from byte 103 of the ISA header).
    pub(crate) element_delimiter: char,
    /// Separates sub-elements within an element (byte 104 of the ISA header).
    pub(crate) sub_element_delimiter: char,
    /// Terminates each segment (byte 105 of the ISA header).
    pub(crate) segment_delimiter: char,
    /// One `Vec<&str>` of element tokens per non-empty, trimmed segment.
    pub(crate) tokens: DocumentTokens<'a>,
}
/// Splits a raw EDI interchange into per-segment, per-element tokens.
///
/// X12 interchanges open with a fixed-width ISA segment: the element
/// separator repeats at byte 103, the sub-element (component) separator
/// sits at byte 104, and the segment terminator at byte 105. The
/// delimiters found there drive tokenization of the whole document.
/// Segments are trimmed and empty segments are dropped; sub-elements are
/// left unsplit inside their element token.
///
/// # Errors
/// Returns an [`EdiParseError`] when the input is shorter than the
/// 106-byte ISA header, when the delimiter window does not hold exactly
/// three single-byte characters, or when any two delimiters coincide.
pub(crate) fn tokenize<'a>(input: &'a str) -> Result<TokenizeResult<'a>, EdiParseError> {
    edi_assert!(
        input.len() >= 106,
        "input not long enough to contain ISA header delimiters"
    );
    // Use checked `get` rather than `input[103..106]`: byte-indexing a &str
    // panics when the range does not fall on UTF-8 char boundaries, which
    // malformed multi-byte input can trigger even after the length check.
    let delimiters_str: Vec<char> = input
        .get(103..106)
        .map(|window| window.chars().collect())
        .unwrap_or_default();
    // A 3-byte window can still decode to fewer than 3 chars (e.g. one
    // 3-byte char); reject that instead of panicking on the indexing below.
    edi_assert!(
        delimiters_str.len() == 3,
        "ISA header delimiters must be three single-byte characters"
    );
    let (element_delimiter, sub_element_delimiter, segment_delimiter) =
        (delimiters_str[0], delimiters_str[1], delimiters_str[2]);
    edi_assert!(
        element_delimiter != sub_element_delimiter,
        "element and subelement delimiters cannot be the same",
        element_delimiter,
        sub_element_delimiter
    );
    edi_assert!(
        sub_element_delimiter != segment_delimiter,
        "subelement and segment delimiters cannot be the same",
        sub_element_delimiter,
        segment_delimiter
    );
    edi_assert!(
        element_delimiter != segment_delimiter,
        "element and segment delimiters cannot be the same",
        element_delimiter,
        segment_delimiter
    );
    // Split into segments, dropping surrounding whitespace (incl. newlines
    // between segments) and any empty trailing segment after the last
    // terminator.
    let segments: SegmentTokens = input
        .split(segment_delimiter)
        .map(|x| x.trim())
        .filter(|x| !x.is_empty())
        .collect();
    // Split each segment into its element tokens.
    let tokens: DocumentTokens = segments
        .iter()
        .map(|x| x.split(element_delimiter).collect::<Vec<&str>>())
        .collect();
    Ok(TokenizeResult {
        tokens,
        element_delimiter,
        sub_element_delimiter,
        segment_delimiter,
    })
}
#[test]
fn basic_segment_tokenize() {
    // A minimal, well-formed interchange using `*`, `>`, and `~` as the
    // element, sub-element, and segment delimiters.
    let document = "ISA*00* *00* *ZZ*SENDERISA *14*0073268795005 *020226*1534*U*00401*000000001*0*T*>~
GS*PO*SENDERGS*007326879*20020226*1534*1*X*004010~
ST*850*000000001~
BEG*00*SA*A99999-01**19970214~
REF*VR*54321~
ITD*01*3*1**15**16~
DTM*002*19971219~
DTM*002*19971219~
SE*35*000000001~
GE*1*1~
IEA*1*000000001~";
    let TokenizeResult {
        element_delimiter,
        sub_element_delimiter,
        segment_delimiter,
        tokens,
    } = tokenize(document).expect("well-formed document should tokenize");
    assert_eq!(tokens.len(), 11);
    assert_eq!(tokens[0].len(), 17);
    assert_eq!(element_delimiter, '*');
    assert_eq!(sub_element_delimiter, '>');
    assert_eq!(segment_delimiter, '~');
}
#[test]
fn fail_to_tokenize_no_header() {
    // Too short to contain the fixed-width ISA header, so tokenization
    // must be rejected rather than panic.
    let truncated =
        "00* *ZZ*SENDERISA *14*0073268795005 *020226*1534*U*00401*000000001*0*T";
    let outcome = tokenize(truncated);
    assert!(outcome.is_err());
}
#[test]
fn fail_same_delimiters() {
    // The three delimiter positions of the ISA header all hold `~`, which
    // the duplicate-delimiter checks must reject.
    let document = "ISA*00* *00* *ZZ*SENDERISA *14*0073268795005 *020226*1534*U*00401*000000001*0*T~~~
GS*PO*SENDERGS*007326879*20020226*1534*1*X*004010~
ST*850*000000001~";
    let outcome = tokenize(document);
    assert!(outcome.is_err());
}