1use crate::{error::EdifactError, model::Span};
7use memchr::{memchr, memchr3};
8
/// The set of delimiter bytes used to split an EDIFACT interchange.
///
/// Values come either from the defaults (`+`, `:`, `?`, `.`, `'`) or from the
/// six service characters of a leading `UNA` header, as read by
/// [`ServiceStringAdvice::from_bytes`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ServiceStringAdvice {
    /// Byte that introduces a data element (default `+`; `input[4]` of a UNA header).
    pub element_sep: u8,
    /// Byte that introduces a component inside an element (default `:`; `input[3]`).
    pub component_sep: u8,
    /// Escape byte: the byte following it is taken literally (default `?`; `input[6]`).
    pub release_char: u8,
    /// Decimal mark (default `.`; `input[5]`). It is carried along but not
    /// consulted by `is_valid` or by the tokenizing code in this file.
    pub decimal_mark: u8,
    /// Byte that ends a segment (default `'`; `input[8]`).
    pub segment_term: u8,
}
26
27impl Default for ServiceStringAdvice {
28 fn default() -> Self {
29 Self {
30 element_sep: b'+',
31 component_sep: b':',
32 release_char: b'?',
33 decimal_mark: b'.',
34 segment_term: b'\'',
35 }
36 }
37}
38
39impl ServiceStringAdvice {
40 pub fn from_bytes(input: &[u8]) -> Self {
46 if input.len() >= 9 && &input[..3] == b"UNA" {
48 Self {
49 component_sep: input[3],
50 element_sep: input[4],
51 decimal_mark: input[5],
52 release_char: input[6],
53 segment_term: input[8],
55 }
56 } else {
57 Self::default()
58 }
59 }
60
61 pub fn from_bytes_strict(input: &[u8]) -> Result<Self, crate::error::EdifactError> {
68 let ssa = Self::from_bytes(input);
69 if !ssa.is_valid() {
70 return Err(crate::error::EdifactError::InvalidUna);
71 }
72 Ok(ssa)
73 }
74
75 pub fn is_valid(&self) -> bool {
78 let [e, c, r, t] = [
79 self.element_sep,
80 self.component_sep,
81 self.release_char,
82 self.segment_term,
83 ];
84 let no_ws = |b: u8| !matches!(b, b' ' | b'\t' | b'\r' | b'\n');
85 no_ws(e) && no_ws(c) && no_ws(r) && no_ws(t)
87 && e != c && e != r && e != t
88 && c != r && c != t
89 && r != t
90 }
91}
92
/// One lexical unit produced by [`Tokenizer`].
///
/// Text values borrow from the tokenizer's input; every `span` holds the
/// start and end byte offsets (as passed to `Span::new`) of the token within
/// that input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token<'a> {
    /// The tag that opens a segment.
    SegmentTag {
        /// Tag text; the tokenizer only emits tags of exactly three ASCII
        /// uppercase letters.
        value: &'a str,
        /// Byte range of the tag.
        span: Span,
    },
    /// A value introduced by the element separator.
    DataElement {
        /// Raw text of the value. Release characters are left in place
        /// (e.g. `test?+value`), not unescaped.
        value: &'a str,
        /// Byte range of the value, excluding the leading separator.
        span: Span,
    },
    /// A value introduced by the component separator.
    ComponentElement {
        /// Raw text of the value, with release characters left in place.
        value: &'a str,
        /// Byte range of the value, excluding the leading separator.
        span: Span,
    },
    /// The byte that ends a segment.
    SegmentTerminator {
        /// Byte range covering the single terminator byte.
        span: Span,
    },
}
123
124
/// An owned chunk of segment bytes together with an offset.
///
/// NOTE(review): nothing in the code visible here constructs or reads this
/// type; presumably it is used by the buffered/streaming reader elsewhere in
/// the crate — confirm.
#[derive(Debug)]
pub(crate) struct RawSegment {
    /// Owned bytes of the segment.
    pub(crate) bytes: Vec<u8>,
    /// Presumably the byte offset of `bytes[0]` in the overall input stream —
    /// TODO confirm against the code that fills it in.
    pub(crate) start_offset: usize,
}
130
/// Iterator that splits an EDIFACT byte buffer into [`Token`]s.
///
/// Token values are `&str` slices of `input`; nothing is copied.
pub struct Tokenizer<'a> {
    /// The complete buffer being tokenized.
    input: &'a [u8],
    /// Byte offset of the next unread byte in `input`.
    pos: usize,
    /// Delimiters used to split the input.
    ssa: ServiceStringAdvice,
    /// Whether the next token must be a segment tag or a piece of segment content.
    state: TokState,
}
140
/// Position of the tokenizer relative to segment boundaries.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TokState {
    /// Between segments: the next token to read is a segment tag.
    ExpectTag,
    /// After a tag and before the terminator: the next byte is expected to be
    /// an element separator, component separator or segment terminator.
    InSegment,
}
148
149impl<'a> Tokenizer<'a> {
150 pub fn new(input: &'a [u8], ssa: ServiceStringAdvice) -> Self {
152 let pos = if input.len() >= 9 && &input[..3] == b"UNA" {
154 9
155 } else {
156 0
157 };
158 Self {
159 input,
160 pos,
161 ssa,
162 state: TokState::ExpectTag,
163 }
164 }
165
166 #[inline]
168 pub fn position(&self) -> usize {
169 self.pos
170 }
171
172 #[inline]
174 pub fn service_string_advice(&self) -> ServiceStringAdvice {
175 self.ssa
176 }
177
178 fn skip_inter_segment_whitespace(&mut self) {
180 while self.pos < self.input.len() {
181 match self.input[self.pos] {
182 b' ' | b'\t' | b'\r' | b'\n' => self.pos += 1,
183 _ => break,
184 }
185 }
186 }
187
188 fn read_value(&mut self) -> Result<(&'a str, Span), EdifactError> {
196 let start = self.pos;
197 let (elem, comp, release, term) = (
198 self.ssa.element_sep,
199 self.ssa.component_sep,
200 self.ssa.release_char,
201 self.ssa.segment_term,
202 );
203 loop {
204 let remaining = &self.input[self.pos..];
205 if remaining.is_empty() {
206 break;
207 }
208 let hit_ect = memchr3(elem, comp, release, remaining);
212 let hit_term = memchr(term, remaining);
213 let hit = match (hit_ect, hit_term) {
214 (None, None) => {
215 self.pos += remaining.len();
216 break;
217 }
218 (Some(a), None) => a,
219 (None, Some(b)) => b,
220 (Some(a), Some(b)) => a.min(b),
221 };
222 let b = remaining[hit];
223 if b == release {
224 if remaining.len() - hit == 1 {
227 return Err(EdifactError::InvalidReleaseSequence {
228 offset: self.pos + hit,
229 });
230 }
231 self.pos += hit + 2;
233 continue;
234 }
235 self.pos += hit;
237 break;
238 }
239 let span = Span::new(start, self.pos);
240 let value = std::str::from_utf8(&self.input[start..self.pos])
241 .map_err(|_| EdifactError::InvalidText { offset: start })?;
242 Ok((value, span))
243 }
244
245 fn read_tag(&mut self) -> Result<Option<Token<'a>>, EdifactError> {
247 self.skip_inter_segment_whitespace();
248 if self.pos >= self.input.len() {
249 return Ok(None);
250 }
251 let start = self.pos;
252 let remaining = &self.input[self.pos..];
255 let end = memchr(self.ssa.element_sep, remaining)
256 .or_else(|| memchr(self.ssa.segment_term, remaining))
257 .unwrap_or(remaining.len());
258
259 if end == 0 {
260 let byte = self.input[self.pos];
262 self.pos += 1;
263 return Err(EdifactError::InvalidDelimiter { byte, offset: start });
264 }
265
266 let tag_bytes = &self.input[start..start + end];
267 self.pos = start + end;
269 let tag = std::str::from_utf8(tag_bytes)
270 .map_err(|_| EdifactError::InvalidSegmentTag(format!("{tag_bytes:?}")))?;
271 if tag.len() != 3 || !tag.bytes().all(|b| b.is_ascii_uppercase()) {
272 return Err(EdifactError::InvalidSegmentTag(tag.to_owned()));
273 }
274 self.state = TokState::InSegment;
275 Ok(Some(Token::SegmentTag {
276 value: tag,
277 span: Span::new(start, start + end),
278 }))
279 }
280}
281
282impl<'a> Iterator for Tokenizer<'a> {
283 type Item = Result<Token<'a>, EdifactError>;
284
285 fn next(&mut self) -> Option<Self::Item> {
286 loop {
287 if self.pos >= self.input.len() {
288 return None;
289 }
290
291 match self.state {
292 TokState::ExpectTag => {
293 return match self.read_tag() {
294 Ok(Some(tok)) => Some(Ok(tok)),
295 Ok(None) => None,
296 Err(e) => Some(Err(e)),
297 };
298 }
299 TokState::InSegment => {
300 let b = self.input[self.pos];
301 let (elem, comp, term) = (
302 self.ssa.element_sep,
303 self.ssa.component_sep,
304 self.ssa.segment_term,
305 );
306
307 if b == term {
308 let start = self.pos;
309 self.pos += 1;
310 self.state = TokState::ExpectTag;
311 return Some(Ok(Token::SegmentTerminator {
312 span: Span::new(start, self.pos),
313 }));
314 } else if b == elem {
315 self.pos += 1;
316 let (value, span) = match self.read_value() {
317 Ok(value) => value,
318 Err(error) => return Some(Err(error)),
319 };
320 return Some(Ok(Token::DataElement { value, span }));
324 } else if b == comp {
325 self.pos += 1;
326 let (value, span) = match self.read_value() {
327 Ok(value) => value,
328 Err(error) => return Some(Err(error)),
329 };
330 return Some(Ok(Token::ComponentElement { value, span }));
331 } else if b == b'\r' || b == b'\n' {
332 self.pos += 1;
333 continue;
335 } else {
336 let offset = self.pos;
338 self.pos += 1; self.state = TokState::ExpectTag;
340 return Some(Err(EdifactError::InvalidDelimiter { byte: b, offset }));
341 }
342 }
343 }
344 }
345 }
346}
347
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `input` with the delimiters from its UNA header (or the
    /// defaults), panicking on the first tokenizer error.
    fn tokens(input: &[u8]) -> Vec<Token<'_>> {
        let ssa = ServiceStringAdvice::from_bytes(input);
        let collected: Result<Vec<_>, _> = Tokenizer::new(input, ssa).collect();
        collected.expect("tokenize failed")
    }

    /// Extracts the text of every `DataElement` token, in order.
    fn data_element_values<'t>(toks: &[Token<'t>]) -> Vec<&'t str> {
        toks.iter()
            .filter_map(|tok| match tok {
                Token::DataElement { value, .. } => Some(*value),
                _ => None,
            })
            .collect()
    }

    #[test]
    fn minimal_unb_unz() {
        let toks = tokens(b"UNB+UNOA:1+SENDER+RECEIVER+200101:0900+1'UNZ+0+1'");
        assert!(matches!(toks[0], Token::SegmentTag { value: "UNB", .. }));
        assert!(matches!(toks.last(), Some(Token::SegmentTerminator { .. })));
    }

    #[test]
    fn release_character_not_a_delimiter() {
        // `?+` is an escaped separator, so it must stay inside the value.
        let toks = tokens(b"BGM+220+test?+value'");
        assert_eq!(data_element_values(&toks), vec!["220", "test?+value"]);
    }

    #[test]
    fn custom_una_delimiters() {
        // The UNA header declares `;` as the element separator.
        let toks = tokens(b"UNA:;.? 'BGM;220;hello'");
        assert!(matches!(toks[0], Token::SegmentTag { value: "BGM", .. }));
        let vals = data_element_values(&toks);
        assert!(vals.contains(&"220"));
    }

    #[test]
    fn tokens_expose_spans() {
        let toks = tokens(b"BGM+220+ABC'");
        assert!(matches!(
            toks[0],
            Token::SegmentTag {
                value: "BGM",
                span: Span { start: 0, end: 3 }
            }
        ));
        assert!(matches!(
            toks[1],
            Token::DataElement {
                value: "220",
                span: Span { start: 4, end: 7 }
            }
        ));
    }

    #[test]
    fn truncated_input_does_not_panic() {
        // Only exhausting the iterator matters; the tokens are ignored.
        let tokenizer = Tokenizer::new(b"UNB+UNOA:1", ServiceStringAdvice::default());
        let _ = tokenizer.count();
    }

    #[test]
    fn invalid_segment_tags_are_rejected() {
        let bad_inputs: [&[u8]; 5] = [
            b"bgm+220+'",
            b"ABCDE+220+'",
            b"BGM1+220+'",
            b"BGM +220+'",
            b" BG+220+'",
        ];
        for input in bad_inputs {
            let rejected = Tokenizer::new(input, ServiceStringAdvice::default())
                .any(|item| item.is_err());
            assert!(rejected, "expected tag rejection for {input:?}");
        }
    }

    #[test]
    fn chunked_reader_parses_via_parser() {
        let input = b"UNA:+.? 'BGM+220+test?+value'UNT+2+1'";
        let reader = std::io::BufReader::new(std::io::Cursor::new(input));
        let segments = crate::parser::from_bufread(reader).expect("parser should succeed");
        assert!(segments.iter().any(|s| s.tag == "BGM"));
        let bgm = segments.iter().find(|s| s.tag == "BGM").unwrap();
        // The parser is expected to strip the release character.
        let raw_val = bgm
            .elements
            .get(1)
            .and_then(|e| e.components.first())
            .map(|s| s.as_str());
        assert_eq!(raw_val, Some("test+value"));
    }
}