edifact_rs/tokenizer.rs
1//! EDIFACT tokenizer — splits raw bytes into typed tokens.
2//!
3//! Respects UNA service string advice for non-default delimiters.
4//! Uses `memchr` for fast delimiter scanning (no byte-by-byte inner loops).
5
6use crate::{error::EdifactError, model::Span};
7use memchr::{memchr, memchr3};
8
/// Delimiter set announced by an EDIFACT `UNA` service string advice segment.
///
/// Defaults: `:` (component), `+` (element), `.` (decimal mark), `?` (release),
/// `'` (segment terminator).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ServiceStringAdvice {
    /// Byte that separates data elements (default `+`).
    pub element_sep: u8,
    /// Byte that separates the components of a composite data element (default `:`).
    pub component_sep: u8,
    /// Byte that escapes the delimiter following it (default `?`).
    pub release_char: u8,
    /// Decimal notation mark (default `.`; UNA byte 5, ISO 9735-1 §7.1).
    /// The tokenizer never splits on it; it is carried along for downstream consumers.
    pub decimal_mark: u8,
    /// Byte that ends a segment (default `'`).
    pub segment_term: u8,
}
26
27impl Default for ServiceStringAdvice {
28 fn default() -> Self {
29 Self {
30 element_sep: b'+',
31 component_sep: b':',
32 release_char: b'?',
33 decimal_mark: b'.',
34 segment_term: b'\'',
35 }
36 }
37}
38
39impl ServiceStringAdvice {
40 /// Parse a UNA header from the beginning of an EDIFACT interchange.
41 ///
42 /// If no UNA is present, returns [`ServiceStringAdvice::default`].
43 /// Does not validate that the 6 service characters are mutually distinct;
44 /// use [`ServiceStringAdvice::from_bytes_strict`] when that matters.
45 pub fn from_bytes(input: &[u8]) -> Self {
46 // UNA is 9 bytes: "UNA" + 6 service chars
47 if input.len() >= 9 && &input[..3] == b"UNA" {
48 Self {
49 component_sep: input[3],
50 element_sep: input[4],
51 decimal_mark: input[5],
52 release_char: input[6],
53 // input[7] = repetition separator (ISO 9735-4 §3.1; not modelled here)
54 segment_term: input[8],
55 }
56 } else {
57 Self::default()
58 }
59 }
60
61 /// Parse a UNA header and validate that the five active service characters
62 /// (`element_sep`, `component_sep`, `decimal_mark`, `release_char`, `segment_term`) are all
63 /// mutually distinct and in the printable ASCII range `0x21–0x7E`.
64 ///
65 /// Returns [`EdifactError::InvalidUna`] if the invariant is violated.
66 /// Falls back to [`ServiceStringAdvice::default`] when no UNA is present.
67 pub fn from_bytes_strict(input: &[u8]) -> Result<Self, crate::error::EdifactError> {
68 let ssa = Self::from_bytes(input);
69 if !ssa.is_valid() {
70 return Err(crate::error::EdifactError::InvalidUna);
71 }
72 Ok(ssa)
73 }
74
75 /// Return `true` if all five active service characters are mutually distinct
76 /// and all fall in the printable ASCII range `0x21–0x7E` (excl. space `0x20`,
77 /// control characters `0x00–0x1F`, and `DEL 0x7F`).
78 ///
79 /// The five characters are `element_sep`, `component_sep`, `decimal_mark`,
80 /// `release_char`, and `segment_term`. All 10 pairwise combinations are
81 /// checked.
82 ///
83 /// Bytes outside `0x21–0x7E` are rejected: high-bytes (`>= 0x80`) would cause
84 /// incorrect single-byte tokenization of multi-byte UTF-8 sequences, and DEL
85 /// (`0x7F`) is a non-printable control character.
86 pub fn is_valid(&self) -> bool {
87 let [e, c, d, r, t] = [
88 self.element_sep,
89 self.component_sep,
90 self.decimal_mark,
91 self.release_char,
92 self.segment_term,
93 ];
94 // All five must be printable ASCII 0x21–0x7E (excludes high-bytes, control chars,
95 // whitespace, and DEL 0x7F) and mutually distinct (10 pairwise checks).
96 let printable_ascii = |b: u8| b >= 0x21 && b <= 0x7E;
97 printable_ascii(e)
98 && printable_ascii(c)
99 && printable_ascii(d)
100 && printable_ascii(r)
101 && printable_ascii(t)
102 && e != c
103 && e != d
104 && e != r
105 && e != t
106 && c != d
107 && c != r
108 && c != t
109 && d != r
110 && d != t
111 && r != t
112 }
113}
114
115/// Token produced by [`Tokenizer`].
116#[derive(Debug, Clone, PartialEq, Eq)]
117pub enum Token<'a> {
118 /// 3-character segment tag (e.g. `"BGM"`)
119 SegmentTag {
120 /// Raw tag value.
121 value: &'a str,
122 /// Source span of the tag.
123 span: Span,
124 },
125 /// Data element value (between element separators)
126 DataElement {
127 /// Raw element value.
128 value: &'a str,
129 /// Source span of the element value.
130 span: Span,
131 },
132 /// Component within a composite data element (between component separators)
133 ComponentElement {
134 /// Raw component value.
135 value: &'a str,
136 /// Source span of the component value.
137 span: Span,
138 },
139 /// Segment terminator — signals the end of a segment
140 SegmentTerminator {
141 /// Source span of the segment terminator byte.
142 span: Span,
143 },
144}
145
/// An owned copy of a segment's bytes together with an offset.
///
/// NOTE(review): nothing in this file constructs or reads this type; presumably it
/// is used by a reader elsewhere in the crate — confirm before changing it.
#[derive(Debug)]
pub(crate) struct RawSegment {
    /// The segment's bytes.
    pub(crate) bytes: Vec<u8>,
    /// Presumably the offset of the segment's first byte in its source — TODO confirm.
    pub(crate) start_offset: usize,
}
151
152/// Zero-copy tokenizer over a byte slice.
153///
154/// Yields `Token` values, each borrowing from the original input.
155///
156/// # Segment size guard
157///
158/// The default constructor [`Tokenizer::new`] enforces a **64 KiB** per-segment
159/// limit, which is sufficient for all well-formed EDIFACT interchanges and guards
160/// against adversarially crafted inputs that omit segment terminators.
161/// Use [`Tokenizer::with_limit`] to raise or lower this threshold, or
162/// [`Tokenizer::unlimited`] to remove it entirely (trusted / pre-validated input only).
163pub struct Tokenizer<'a> {
164 input: &'a [u8],
165 pos: usize,
166 ssa: ServiceStringAdvice,
167 state: TokState,
168 /// Maximum allowed segment byte length (tag + elements, **excluding** the
169 /// segment terminator byte itself). Checked in `read_value` and `read_tag`.
170 /// `usize::MAX` = unlimited.
171 max_segment_bytes: usize,
172 /// Byte position where the current segment started (set in `read_tag`).
173 segment_start: usize,
174}
175
/// Which kind of input the tokenizer expects at its current position.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TokState {
    /// Between segments: the next token must be a segment tag.
    ExpectTag,
    /// Within a segment: the next byte should be an element separator, a component
    /// separator, or the segment terminator.
    InSegment,
}
183
184impl<'a> Tokenizer<'a> {
185 /// Return the byte offset of the first non-UNA byte in `input`.
186 ///
187 /// If the input starts with the `UNA` service string advice (first 3
188 /// bytes are `b"UNA"`), the UNA header is exactly 9 bytes long and the
189 /// first segment tag starts at offset 9. Otherwise parsing starts at 0.
190 #[inline]
191 fn una_start_pos(input: &[u8]) -> usize {
192 if input.len() >= 9 && &input[..3] == b"UNA" {
193 9
194 } else {
195 0
196 }
197 }
198
199 /// Construct a tokenizer with the default 64 KiB segment-size limit.
200 ///
201 /// If a single segment's byte length exceeds 65 536 bytes, the iterator
202 /// returns [`EdifactError::SegmentTooLong`]. This guards against
203 /// pathological or adversarially crafted inputs that omit segment
204 /// terminators and would otherwise cause unbounded scanning.
205 ///
206 /// Call [`Tokenizer::unlimited`] if you deliberately need to process
207 /// segments larger than 64 KiB, or [`Tokenizer::with_limit`] to supply a
208 /// custom bound.
209 pub fn new(input: &'a [u8], ssa: ServiceStringAdvice) -> Self {
210 Self::with_limit(input, ssa, 65_536)
211 }
212
213 /// Construct a tokenizer with **no** segment-size limit.
214 ///
215 /// # Security
216 ///
217 /// This constructor imposes **no upper bound** on how many bytes a single
218 /// segment may consume. For untrusted or adversarially crafted input a
219 /// missing segment terminator can cause the tokenizer to scan the entire
220 /// input before returning an error. Prefer [`Tokenizer::new`] (64 KiB
221 /// limit) or [`Tokenizer::with_limit`] for untrusted sources.
222 pub fn unlimited(input: &'a [u8], ssa: ServiceStringAdvice) -> Self {
223 Self {
224 input,
225 pos: Self::una_start_pos(input),
226 ssa,
227 state: TokState::ExpectTag,
228 max_segment_bytes: usize::MAX,
229 segment_start: 0,
230 }
231 }
232
233 /// Construct a tokenizer with a segment-size limit.
234 ///
235 /// If a single segment's byte length (from the start of the tag to the end
236 /// of the last value, not including the terminator itself) exceeds `limit`,
237 /// the iterator returns [`EdifactError::SegmentTooLong`].
238 ///
239 /// # Examples
240 ///
241 /// ```
242 /// use edifact_rs::{ServiceStringAdvice, Tokenizer};
243 ///
244 /// let input = b"BGM+220+PO-4711+9'";
245 /// let ssa = ServiceStringAdvice::default();
246 /// let tokens: Vec<_> = Tokenizer::with_limit(input, ssa, 64)
247 /// .collect::<Result<_, _>>()
248 /// .unwrap();
249 /// assert!(!tokens.is_empty());
250 /// ```
251 pub fn with_limit(input: &'a [u8], ssa: ServiceStringAdvice, max_segment_bytes: usize) -> Self {
252 Self {
253 input,
254 pos: Self::una_start_pos(input),
255 ssa,
256 state: TokState::ExpectTag,
257 max_segment_bytes,
258 segment_start: 0,
259 }
260 }
261
262 /// Current byte position in the input.
263 #[inline]
264 pub fn position(&self) -> usize {
265 self.pos
266 }
267
268 /// Return the service string advice active for this tokenizer.
269 #[inline]
270 pub fn service_string_advice(&self) -> ServiceStringAdvice {
271 self.ssa
272 }
273
274 /// Consume leading whitespace / CR / LF between segments (not inside data values).
275 fn skip_inter_segment_whitespace(&mut self) {
276 while self.pos < self.input.len() {
277 match self.input[self.pos] {
278 b' ' | b'\t' | b'\r' | b'\n' => self.pos += 1,
279 _ => break,
280 }
281 }
282 }
283
284 /// Read a field value starting at `self.pos`, advancing past the value.
285 ///
286 /// Recognises the release character (`?` by default) and returns the raw
287 /// slice including release sequences. The parser layer resolves them.
288 ///
289 /// Uses `memchr3` to bulk-scan over non-special bytes between hits, only
290 /// falling back to a per-byte step when a release character is encountered.
291 fn read_value(&mut self) -> Result<(&'a str, Span), EdifactError> {
292 let start = self.pos;
293 let (elem, comp, release, term) = (
294 self.ssa.element_sep,
295 self.ssa.component_sep,
296 self.ssa.release_char,
297 self.ssa.segment_term,
298 );
299 loop {
300 let remaining = &self.input[self.pos..];
301 if remaining.is_empty() {
302 break;
303 }
304 // Scan for release OR a value-terminating delimiter.
305 // memchr3 can hold three bytes; we combine elem/comp/release.
306 // A separate memchr finds term so we take the nearest hit.
307 let hit_ect = memchr3(elem, comp, release, remaining);
308 let hit_term = memchr(term, remaining);
309 let hit = match (hit_ect, hit_term) {
310 (None, None) => {
311 self.pos += remaining.len();
312 break;
313 }
314 (Some(a), None) => a,
315 (None, Some(b)) => b,
316 (Some(a), Some(b)) => a.min(b),
317 };
318 let b = remaining[hit];
319 if b == release {
320 // A release char must be followed by exactly one escaped byte.
321 // If it is the last byte in the buffer the sequence is malformed.
322 if remaining.len() - hit == 1 {
323 return Err(EdifactError::InvalidReleaseSequence {
324 offset: self.pos + hit,
325 });
326 }
327 // Skip release char + the escaped byte.
328 self.pos += hit + 2;
329 continue;
330 }
331 // b is elem, comp, or term — end of value.
332 self.pos += hit;
333 break;
334 }
335 let span = Span::new(start, self.pos);
336 let value = std::str::from_utf8(&self.input[start..self.pos])
337 .map_err(|_| EdifactError::InvalidText { offset: start })?;
338 // Enforce the per-segment byte-length guard.
339 if self.pos - self.segment_start > self.max_segment_bytes {
340 return Err(EdifactError::SegmentTooLong {
341 offset: self.segment_start,
342 limit: self.max_segment_bytes,
343 });
344 }
345 Ok((value, span))
346 }
347
348 /// Fast scan for the segment tag (exactly 3 ASCII uppercase letters).
349 fn read_tag(&mut self) -> Result<Option<Token<'a>>, EdifactError> {
350 self.skip_inter_segment_whitespace();
351 if self.pos >= self.input.len() {
352 return Ok(None);
353 }
354 let start = self.pos;
355 // A segment tag is terminated by the element separator or segment terminator.
356 // Bound the scan to max_segment_bytes + 1 so adversarial input with no delimiters
357 // cannot force memchr to scan arbitrarily large buffers before we return an error.
358 let input_remaining = &self.input[self.pos..];
359 let scan_limit = self
360 .max_segment_bytes
361 .saturating_add(1)
362 .min(input_remaining.len());
363 let remaining = &input_remaining[..scan_limit];
364 let end = memchr(self.ssa.element_sep, remaining)
365 .or_else(|| memchr(self.ssa.segment_term, remaining))
366 .unwrap_or(remaining.len());
367
368 if end == 0 {
369 // First byte is already a delimiter — tag is zero-length, which is invalid.
370 let byte = self.input[self.pos];
371 self.pos += 1;
372 return Err(EdifactError::InvalidDelimiter {
373 byte,
374 offset: start,
375 });
376 }
377
378 // Enforce the per-segment byte-length guard in read_tag as well.
379 // Without this check, adversarial input with no delimiters could cause
380 // memchr to scan the entire remaining buffer (potentially hundreds of MB).
381 if end > self.max_segment_bytes {
382 // Advance past the offending bytes so the iterator can continue.
383 self.pos = start + end;
384 return Err(EdifactError::SegmentTooLong {
385 offset: start,
386 limit: self.max_segment_bytes,
387 });
388 }
389 let tag_bytes = &self.input[start..start + end];
390 // Always advance pos so errors cannot cause an infinite retry loop.
391 self.pos = start + end;
392 // Record segment start for the size-limit check in read_value.
393 self.segment_start = start;
394 let tag = std::str::from_utf8(tag_bytes)
395 .map_err(|_| EdifactError::InvalidSegmentTag(format!("{tag_bytes:?}")))?;
396 if tag.len() != 3 || !tag.bytes().all(|b| b.is_ascii_uppercase()) {
397 return Err(EdifactError::InvalidSegmentTag(tag.to_owned()));
398 }
399 self.state = TokState::InSegment;
400 Ok(Some(Token::SegmentTag {
401 value: tag,
402 span: Span::new(start, start + end),
403 }))
404 }
405}
406
407impl<'a> Iterator for Tokenizer<'a> {
408 type Item = Result<Token<'a>, EdifactError>;
409
410 fn next(&mut self) -> Option<Self::Item> {
411 loop {
412 if self.pos >= self.input.len() {
413 return None;
414 }
415
416 match self.state {
417 TokState::ExpectTag => {
418 return match self.read_tag() {
419 Ok(Some(tok)) => Some(Ok(tok)),
420 Ok(None) => None,
421 Err(e) => Some(Err(e)),
422 };
423 }
424 TokState::InSegment => {
425 let b = self.input[self.pos];
426 let (elem, comp, term) = (
427 self.ssa.element_sep,
428 self.ssa.component_sep,
429 self.ssa.segment_term,
430 );
431
432 if b == term {
433 let start = self.pos;
434 self.pos += 1;
435 self.state = TokState::ExpectTag;
436 return Some(Ok(Token::SegmentTerminator {
437 span: Span::new(start, self.pos),
438 }));
439 } else if b == elem {
440 self.pos += 1;
441 let (value, span) = match self.read_value() {
442 Ok(value) => value,
443 Err(error) => return Some(Err(error)),
444 };
445 // Peek: is the *next* byte a component sep?
446 // We emit DataElement for the leading sub-element regardless;
447 // subsequent components within the same element are ComponentElement.
448 return Some(Ok(Token::DataElement { value, span }));
449 } else if b == comp {
450 self.pos += 1;
451 let (value, span) = match self.read_value() {
452 Ok(value) => value,
453 Err(error) => return Some(Err(error)),
454 };
455 return Some(Ok(Token::ComponentElement { value, span }));
456 } else if b == b'\r' || b == b'\n' {
457 self.pos += 1;
458 // inter-element whitespace inside a segment — skip
459 continue;
460 } else {
461 // Unexpected byte inside a segment — skip it and report.
462 let offset = self.pos;
463 self.pos += 1; // always advance to prevent infinite retry loop
464 self.state = TokState::ExpectTag;
465 return Some(Err(EdifactError::InvalidDelimiter { byte: b, offset }));
466 }
467 }
468 }
469 }
470 }
471}
472
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `input` with the delimiters from its UNA header (or the defaults),
    /// panicking on the first tokenizer error.
    fn tokens(input: &[u8]) -> Vec<Token<'_>> {
        let ssa = ServiceStringAdvice::from_bytes(input);
        let collected: Result<Vec<_>, _> = Tokenizer::new(input, ssa).collect();
        collected.expect("tokenize failed")
    }

    /// Raw values of all `DataElement` tokens, in input order.
    fn data_element_values<'a>(toks: &[Token<'a>]) -> Vec<&'a str> {
        toks.iter()
            .filter_map(|tok| match tok {
                Token::DataElement { value, .. } => Some(*value),
                Token::SegmentTag { .. }
                | Token::ComponentElement { .. }
                | Token::SegmentTerminator { .. } => None,
            })
            .collect()
    }

    #[test]
    fn minimal_unb_unz() {
        let toks = tokens(b"UNB+UNOA:1+SENDER+RECEIVER+200101:0900+1'UNZ+0+1'");
        assert!(matches!(toks[0], Token::SegmentTag { value: "UNB", .. }));
        // The final token must be the terminator of the UNZ segment.
        assert!(matches!(toks.last(), Some(Token::SegmentTerminator { .. })));
    }

    #[test]
    fn release_character_not_a_delimiter() {
        // A released `+` (`?+`) stays inside the value instead of splitting it.
        let toks = tokens(b"BGM+220+test?+value'");
        // Elements after the BGM tag: "220", "test?+value"
        assert_eq!(data_element_values(&toks), vec!["220", "test?+value"]);
    }

    #[test]
    fn custom_una_delimiters() {
        // The UNA header declares `;` as the element separator.
        let toks = tokens(b"UNA:;.? 'BGM;220;hello'");
        assert!(matches!(toks[0], Token::SegmentTag { value: "BGM", .. }));
        assert!(data_element_values(&toks).contains(&"220"));
    }

    #[test]
    fn tokens_expose_spans() {
        let toks = tokens(b"BGM+220+ABC'");
        assert!(matches!(
            toks[0],
            Token::SegmentTag {
                value: "BGM",
                span: Span { start: 0, end: 3 }
            }
        ));
        assert!(matches!(
            toks[1],
            Token::DataElement {
                value: "220",
                span: Span { start: 4, end: 7 }
            }
        ));
    }

    #[test]
    fn truncated_input_does_not_panic() {
        // Input stops before any segment terminator.
        let input = b"UNB+UNOA:1";
        // Only the absence of a panic matters; the tokens/errors are ignored.
        let _: Vec<_> = Tokenizer::new(input, ServiceStringAdvice::default()).collect();
    }

    #[test]
    fn invalid_segment_tags_are_rejected() {
        let cases: [&[u8]; 5] = [
            b"bgm+220+'",
            b"ABCDE+220+'",
            b"BGM1+220+'",
            b"BGM +220+'",
            b" BG+220+'",
        ];
        for input in cases {
            let result: Result<Vec<_>, _> =
                Tokenizer::new(input, ServiceStringAdvice::default()).collect();
            assert!(result.is_err(), "expected tag rejection for {input:?}");
        }
    }

    #[test]
    fn chunked_reader_parses_via_parser() {
        // The reader tokenizer path was removed; verify the equivalent via the parser.
        let input = b"UNA:+.? 'BGM+220+test?+value'UNT+2+1'";
        let reader = std::io::BufReader::new(std::io::Cursor::new(input));
        let segments = crate::parser::from_bufread(reader).expect("parser should succeed");
        assert!(segments.iter().any(|s| s.tag == "BGM"));
        // The parser resolves the release sequence `?+`, so the element reads "test+value".
        let bgm = segments.iter().find(|s| s.tag == "BGM").unwrap();
        let raw_val = bgm
            .elements
            .get(1)
            .and_then(|e| e.components.first())
            .map(|(s, _)| s.as_str());
        assert_eq!(raw_val, Some("test+value"));
    }
}
590}