Skip to main content

edifact_rs/
tokenizer.rs

1//! EDIFACT tokenizer — splits raw bytes into typed tokens.
2//!
3//! Respects UNA service string advice for non-default delimiters.
4//! Uses `memchr` for fast delimiter scanning (no byte-by-byte inner loops).
5
6use crate::{error::EdifactError, model::Span};
7use memchr::{memchr, memchr3};
8
/// Service characters announced by an EDIFACT `UNA` segment.
///
/// When an interchange carries no `UNA`, the [`Default`] values apply:
/// `+` (element), `:` (component), `?` (release), `.` (decimal mark) and
/// `'` (segment terminator).
///
/// Only five of the six `UNA` characters are stored; the remaining position
/// (repetition separator / reserved) is not modelled by this type.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ServiceStringAdvice {
    /// Separator between data elements (default `+`).
    pub element_sep: u8,
    /// Separator between the components of a composite element (default `:`).
    pub component_sep: u8,
    /// Release (escape) character (default `?`).
    pub release_char: u8,
    /// Decimal notation mark (default `.`; UNA byte 5, ISO 9735-1 §7.1).
    /// The tokenizer never splits on it; it is kept for downstream consumers.
    pub decimal_mark: u8,
    /// Byte that terminates a segment (default `'`).
    pub segment_term: u8,
}
26
27impl Default for ServiceStringAdvice {
28    fn default() -> Self {
29        Self {
30            element_sep: b'+',
31            component_sep: b':',
32            release_char: b'?',
33            decimal_mark: b'.',
34            segment_term: b'\'',
35        }
36    }
37}
38
39impl ServiceStringAdvice {
40    /// Parse a UNA header from the beginning of an EDIFACT interchange.
41    ///
42    /// If no UNA is present, returns [`ServiceStringAdvice::default`].
43    /// Does not validate that the 6 service characters are mutually distinct;
44    /// use [`ServiceStringAdvice::from_bytes_strict`] when that matters.
45    pub fn from_bytes(input: &[u8]) -> Self {
46        // UNA is 9 bytes: "UNA" + 6 service chars
47        if input.len() >= 9 && &input[..3] == b"UNA" {
48            Self {
49                component_sep: input[3],
50                element_sep: input[4],
51                decimal_mark: input[5],
52                release_char: input[6],
53                // input[7] = repetition separator (ISO 9735-4 §3.1; not modelled here)
54                segment_term: input[8],
55            }
56        } else {
57            Self::default()
58        }
59    }
60
61    /// Parse a UNA header and validate that the five active service characters
62    /// (`element_sep`, `component_sep`, `decimal_mark`, `release_char`, `segment_term`) are all
63    /// mutually distinct and in the printable ASCII range `0x21–0x7E`.
64    ///
65    /// Returns [`EdifactError::InvalidUna`] if the invariant is violated.
66    /// Falls back to [`ServiceStringAdvice::default`] when no UNA is present.
67    pub fn from_bytes_strict(input: &[u8]) -> Result<Self, crate::error::EdifactError> {
68        let ssa = Self::from_bytes(input);
69        if !ssa.is_valid() {
70            return Err(crate::error::EdifactError::InvalidUna);
71        }
72        Ok(ssa)
73    }
74
75    /// Return `true` if all five active service characters are mutually distinct
76    /// and all fall in the printable ASCII range `0x21–0x7E` (excl. space `0x20`,
77    /// control characters `0x00–0x1F`, and `DEL 0x7F`).
78    ///
79    /// The five characters are `element_sep`, `component_sep`, `decimal_mark`,
80    /// `release_char`, and `segment_term`.  All 10 pairwise combinations are
81    /// checked.
82    ///
83    /// Bytes outside `0x21–0x7E` are rejected: high-bytes (`>= 0x80`) would cause
84    /// incorrect single-byte tokenization of multi-byte UTF-8 sequences, and DEL
85    /// (`0x7F`) is a non-printable control character.
86    pub fn is_valid(&self) -> bool {
87        let [e, c, d, r, t] = [
88            self.element_sep,
89            self.component_sep,
90            self.decimal_mark,
91            self.release_char,
92            self.segment_term,
93        ];
94        // All five must be printable ASCII 0x21–0x7E (excludes high-bytes, control chars,
95        // whitespace, and DEL 0x7F) and mutually distinct (10 pairwise checks).
96        let printable_ascii = |b: u8| b >= 0x21 && b <= 0x7E;
97        printable_ascii(e)
98            && printable_ascii(c)
99            && printable_ascii(d)
100            && printable_ascii(r)
101            && printable_ascii(t)
102            && e != c
103            && e != d
104            && e != r
105            && e != t
106            && c != d
107            && c != r
108            && c != t
109            && d != r
110            && d != t
111            && r != t
112    }
113}
114
115/// Token produced by [`Tokenizer`].
116#[derive(Debug, Clone, PartialEq, Eq)]
117pub enum Token<'a> {
118    /// 3-character segment tag (e.g. `"BGM"`)
119    SegmentTag {
120        /// Raw tag value.
121        value: &'a str,
122        /// Source span of the tag.
123        span: Span,
124    },
125    /// Data element value (between element separators)
126    DataElement {
127        /// Raw element value.
128        value: &'a str,
129        /// Source span of the element value.
130        span: Span,
131    },
132    /// Component within a composite data element (between component separators)
133    ComponentElement {
134        /// Raw component value.
135        value: &'a str,
136        /// Source span of the component value.
137        span: Span,
138    },
139    /// Segment terminator — signals the end of a segment
140    SegmentTerminator {
141        /// Source span of the segment terminator byte.
142        span: Span,
143    },
144}
145
/// Owned bytes of one segment plus an offset describing where it began.
///
/// NOTE(review): nothing in this part of the file constructs or reads this
/// type; presumably it is used by a reader/streaming path — confirm.
#[derive(Debug)]
pub(crate) struct RawSegment {
    /// The segment's bytes, owned by this value.
    pub(crate) bytes: Vec<u8>,
    /// Offset of the segment start (presumably within the source stream — confirm).
    pub(crate) start_offset: usize,
}
151
152/// Zero-copy tokenizer over a byte slice.
153///
154/// Yields `Token` values, each borrowing from the original input.
155///
156/// # Segment size guard
157///
158/// Pass a limit to [`Tokenizer::with_limit`] to reject segments that exceed a
159/// byte-length threshold.  This bounds both the memory and CPU cost of parsing
160/// a single segment on the zero-copy slice path, and causes an
161/// [`EdifactError::SegmentTooLong`] error when the limit is exceeded.
162/// The default constructor [`Tokenizer::new`] sets no limit (`usize::MAX`).
163pub struct Tokenizer<'a> {
164    input: &'a [u8],
165    pos: usize,
166    ssa: ServiceStringAdvice,
167    state: TokState,
168    /// Maximum allowed segment byte length (tag + elements, **excluding** the
169    /// segment terminator byte itself).  Checked in `read_value` and `read_tag`.
170    /// `usize::MAX` = unlimited.
171    max_segment_bytes: usize,
172    /// Byte position where the current segment started (set in `read_tag`).
173    segment_start: usize,
174}
175
/// What the tokenizer expects to read next.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum TokState {
    /// Between segments: the next token must be a segment tag.
    ExpectTag,
    /// Within a segment: the next byte may be an element separator, a component
    /// separator, or the segment terminator.
    InSegment,
}
183
184impl<'a> Tokenizer<'a> {
185    /// Return the byte offset of the first non-UNA byte in `input`.
186    ///
187    /// If the input starts with the `UNA` service string advice (first 3
188    /// bytes are `b"UNA"`), the UNA header is exactly 9 bytes long and the
189    /// first segment tag starts at offset 9.  Otherwise parsing starts at 0.
190    #[inline]
191    fn una_start_pos(input: &[u8]) -> usize {
192        if input.len() >= 9 && &input[..3] == b"UNA" {
193            9
194        } else {
195            0
196        }
197    }
198
199    /// Construct a zero-copy tokenizer over `input` with explicit service-string advice.
200    ///
201    /// No segment-size limit is applied.  Use [`Tokenizer::with_limit`] when
202    /// processing untrusted input to bound CPU and memory usage.
203    ///
204    /// # Security
205    ///
206    /// This constructor imposes **no upper bound** on how many bytes a single
207    /// segment may consume.  For untrusted or adversarially crafted input a
208    /// missing segment terminator can cause the tokenizer to scan the entire
209    /// input before returning an error.  Call [`Tokenizer::with_limit`]
210    /// instead, or use the higher-level [`crate::from_bytes`] /
211    /// [`crate::from_reader_with_config`] which default to a 64 KiB limit.
212    pub fn new(input: &'a [u8], ssa: ServiceStringAdvice) -> Self {
213        Self {
214            input,
215            pos: Self::una_start_pos(input),
216            ssa,
217            state: TokState::ExpectTag,
218            max_segment_bytes: usize::MAX,
219            segment_start: 0,
220        }
221    }
222
223    /// Construct a tokenizer with a segment-size limit.
224    ///
225    /// If a single segment's byte length (from the start of the tag to the end
226    /// of the last value, not including the terminator itself) exceeds `limit`,
227    /// the iterator returns [`EdifactError::SegmentTooLong`].
228    ///
229    /// # Examples
230    ///
231    /// ```
232    /// use edifact_rs::{ServiceStringAdvice, Tokenizer};
233    ///
234    /// let input = b"BGM+220+PO-4711+9'";
235    /// let ssa = ServiceStringAdvice::default();
236    /// let tokens: Vec<_> = Tokenizer::with_limit(input, ssa, 64)
237    ///     .collect::<Result<_, _>>()
238    ///     .unwrap();
239    /// assert!(!tokens.is_empty());
240    /// ```
241    pub fn with_limit(input: &'a [u8], ssa: ServiceStringAdvice, max_segment_bytes: usize) -> Self {
242        Self {
243            input,
244            pos: Self::una_start_pos(input),
245            ssa,
246            state: TokState::ExpectTag,
247            max_segment_bytes,
248            segment_start: 0,
249        }
250    }
251
252    /// Current byte position in the input.
253    #[inline]
254    pub fn position(&self) -> usize {
255        self.pos
256    }
257
258    /// Return the service string advice active for this tokenizer.
259    #[inline]
260    pub fn service_string_advice(&self) -> ServiceStringAdvice {
261        self.ssa
262    }
263
264    /// Consume leading whitespace / CR / LF between segments (not inside data values).
265    fn skip_inter_segment_whitespace(&mut self) {
266        while self.pos < self.input.len() {
267            match self.input[self.pos] {
268                b' ' | b'\t' | b'\r' | b'\n' => self.pos += 1,
269                _ => break,
270            }
271        }
272    }
273
274    /// Read a field value starting at `self.pos`, advancing past the value.
275    ///
276    /// Recognises the release character (`?` by default) and returns the raw
277    /// slice including release sequences. The parser layer resolves them.
278    ///
279    /// Uses `memchr3` to bulk-scan over non-special bytes between hits, only
280    /// falling back to a per-byte step when a release character is encountered.
281    fn read_value(&mut self) -> Result<(&'a str, Span), EdifactError> {
282        let start = self.pos;
283        let (elem, comp, release, term) = (
284            self.ssa.element_sep,
285            self.ssa.component_sep,
286            self.ssa.release_char,
287            self.ssa.segment_term,
288        );
289        loop {
290            let remaining = &self.input[self.pos..];
291            if remaining.is_empty() {
292                break;
293            }
294            // Scan for release OR a value-terminating delimiter.
295            // memchr3 can hold three bytes; we combine elem/comp/release.
296            // A separate memchr finds term so we take the nearest hit.
297            let hit_ect = memchr3(elem, comp, release, remaining);
298            let hit_term = memchr(term, remaining);
299            let hit = match (hit_ect, hit_term) {
300                (None, None) => {
301                    self.pos += remaining.len();
302                    break;
303                }
304                (Some(a), None) => a,
305                (None, Some(b)) => b,
306                (Some(a), Some(b)) => a.min(b),
307            };
308            let b = remaining[hit];
309            if b == release {
310                // A release char must be followed by exactly one escaped byte.
311                // If it is the last byte in the buffer the sequence is malformed.
312                if remaining.len() - hit == 1 {
313                    return Err(EdifactError::InvalidReleaseSequence {
314                        offset: self.pos + hit,
315                    });
316                }
317                // Skip release char + the escaped byte.
318                self.pos += hit + 2;
319                continue;
320            }
321            // b is elem, comp, or term — end of value.
322            self.pos += hit;
323            break;
324        }
325        let span = Span::new(start, self.pos);
326        let value = std::str::from_utf8(&self.input[start..self.pos])
327            .map_err(|_| EdifactError::InvalidText { offset: start })?;
328        // Enforce the per-segment byte-length guard.
329        if self.pos - self.segment_start > self.max_segment_bytes {
330            return Err(EdifactError::SegmentTooLong {
331                offset: self.segment_start,
332                limit: self.max_segment_bytes,
333            });
334        }
335        Ok((value, span))
336    }
337
338    /// Fast scan for the segment tag (exactly 3 ASCII uppercase letters).
339    fn read_tag(&mut self) -> Result<Option<Token<'a>>, EdifactError> {
340        self.skip_inter_segment_whitespace();
341        if self.pos >= self.input.len() {
342            return Ok(None);
343        }
344        let start = self.pos;
345        // A segment tag is terminated by the element separator or segment terminator.
346        // Bound the scan to max_segment_bytes + 1 so adversarial input with no delimiters
347        // cannot force memchr to scan arbitrarily large buffers before we return an error.
348        let input_remaining = &self.input[self.pos..];
349        let scan_limit = self
350            .max_segment_bytes
351            .saturating_add(1)
352            .min(input_remaining.len());
353        let remaining = &input_remaining[..scan_limit];
354        let end = memchr(self.ssa.element_sep, remaining)
355            .or_else(|| memchr(self.ssa.segment_term, remaining))
356            .unwrap_or(remaining.len());
357
358        if end == 0 {
359            // First byte is already a delimiter — tag is zero-length, which is invalid.
360            let byte = self.input[self.pos];
361            self.pos += 1;
362            return Err(EdifactError::InvalidDelimiter {
363                byte,
364                offset: start,
365            });
366        }
367
368        // Enforce the per-segment byte-length guard in read_tag as well.
369        // Without this check, adversarial input with no delimiters could cause
370        // memchr to scan the entire remaining buffer (potentially hundreds of MB).
371        if end > self.max_segment_bytes {
372            // Advance past the offending bytes so the iterator can continue.
373            self.pos = start + end;
374            return Err(EdifactError::SegmentTooLong {
375                offset: start,
376                limit: self.max_segment_bytes,
377            });
378        }
379        let tag_bytes = &self.input[start..start + end];
380        // Always advance pos so errors cannot cause an infinite retry loop.
381        self.pos = start + end;
382        // Record segment start for the size-limit check in read_value.
383        self.segment_start = start;
384        let tag = std::str::from_utf8(tag_bytes)
385            .map_err(|_| EdifactError::InvalidSegmentTag(format!("{tag_bytes:?}")))?;
386        if tag.len() != 3 || !tag.bytes().all(|b| b.is_ascii_uppercase()) {
387            return Err(EdifactError::InvalidSegmentTag(tag.to_owned()));
388        }
389        self.state = TokState::InSegment;
390        Ok(Some(Token::SegmentTag {
391            value: tag,
392            span: Span::new(start, start + end),
393        }))
394    }
395}
396
397impl<'a> Iterator for Tokenizer<'a> {
398    type Item = Result<Token<'a>, EdifactError>;
399
400    fn next(&mut self) -> Option<Self::Item> {
401        loop {
402            if self.pos >= self.input.len() {
403                return None;
404            }
405
406            match self.state {
407                TokState::ExpectTag => {
408                    return match self.read_tag() {
409                        Ok(Some(tok)) => Some(Ok(tok)),
410                        Ok(None) => None,
411                        Err(e) => Some(Err(e)),
412                    };
413                }
414                TokState::InSegment => {
415                    let b = self.input[self.pos];
416                    let (elem, comp, term) = (
417                        self.ssa.element_sep,
418                        self.ssa.component_sep,
419                        self.ssa.segment_term,
420                    );
421
422                    if b == term {
423                        let start = self.pos;
424                        self.pos += 1;
425                        self.state = TokState::ExpectTag;
426                        return Some(Ok(Token::SegmentTerminator {
427                            span: Span::new(start, self.pos),
428                        }));
429                    } else if b == elem {
430                        self.pos += 1;
431                        let (value, span) = match self.read_value() {
432                            Ok(value) => value,
433                            Err(error) => return Some(Err(error)),
434                        };
435                        // Peek: is the *next* byte a component sep?
436                        // We emit DataElement for the leading sub-element regardless;
437                        // subsequent components within the same element are ComponentElement.
438                        return Some(Ok(Token::DataElement { value, span }));
439                    } else if b == comp {
440                        self.pos += 1;
441                        let (value, span) = match self.read_value() {
442                            Ok(value) => value,
443                            Err(error) => return Some(Err(error)),
444                        };
445                        return Some(Ok(Token::ComponentElement { value, span }));
446                    } else if b == b'\r' || b == b'\n' {
447                        self.pos += 1;
448                        // inter-element whitespace inside a segment — skip
449                        continue;
450                    } else {
451                        // Unexpected byte inside a segment — skip it and report.
452                        let offset = self.pos;
453                        self.pos += 1; // always advance to prevent infinite retry loop
454                        self.state = TokState::ExpectTag;
455                        return Some(Err(EdifactError::InvalidDelimiter { byte: b, offset }));
456                    }
457                }
458            }
459        }
460    }
461}
462
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `input` using the service characters from its own UNA header
    /// (or the defaults), panicking on the first tokenizer error.
    fn tokens(input: &[u8]) -> Vec<Token<'_>> {
        let advice = ServiceStringAdvice::from_bytes(input);
        let collected: Result<Vec<_>, _> = Tokenizer::new(input, advice).collect();
        collected.expect("tokenize failed")
    }

    /// Pull the text of every `DataElement` token out of `toks`, in order.
    fn data_element_values<'t>(toks: &[Token<'t>]) -> Vec<&'t str> {
        toks.iter()
            .filter_map(|tok| match tok {
                Token::DataElement { value, .. } => Some(*value),
                _ => None,
            })
            .collect()
    }

    #[test]
    fn minimal_unb_unz() {
        let toks = tokens(b"UNB+UNOA:1+SENDER+RECEIVER+200101:0900+1'UNZ+0+1'");
        assert!(matches!(toks[0], Token::SegmentTag { value: "UNB", .. }));
        // The final token is the terminator that closes UNZ.
        assert!(matches!(toks.last(), Some(Token::SegmentTerminator { .. })));
    }

    #[test]
    fn release_character_not_a_delimiter() {
        // An escaped `?+` inside a value must not split the element.
        let toks = tokens(b"BGM+220+test?+value'");
        // The tokenizer keeps the release sequence in the raw value.
        assert_eq!(data_element_values(&toks), vec!["220", "test?+value"]);
    }

    #[test]
    fn custom_una_delimiters() {
        // The UNA header declares `;` as the element separator.
        let toks = tokens(b"UNA:;.? 'BGM;220;hello'");
        assert!(matches!(toks[0], Token::SegmentTag { value: "BGM", .. }));
        assert!(data_element_values(&toks).contains(&"220"));
    }

    #[test]
    fn tokens_expose_spans() {
        let toks = tokens(b"BGM+220+ABC'");
        let tag_ok = matches!(
            toks[0],
            Token::SegmentTag {
                value: "BGM",
                span: Span { start: 0, end: 3 }
            }
        );
        assert!(tag_ok);
        let element_ok = matches!(
            toks[1],
            Token::DataElement {
                value: "220",
                span: Span { start: 4, end: 7 }
            }
        );
        assert!(element_ok);
    }

    #[test]
    fn truncated_input_does_not_panic() {
        // No segment terminator; any outcome is fine as long as nothing panics.
        let truncated = b"UNB+UNOA:1";
        let _: Vec<_> = Tokenizer::new(truncated, ServiceStringAdvice::default()).collect();
    }

    #[test]
    fn invalid_segment_tags_are_rejected() {
        let cases: [&[u8]; 5] = [
            b"bgm+220+'",
            b"ABCDE+220+'",
            b"BGM1+220+'",
            b"BGM +220+'",
            b" BG+220+'",
        ];
        for input in cases {
            let result: Result<Vec<_>, _> =
                Tokenizer::new(input, ServiceStringAdvice::default()).collect();
            assert!(result.is_err(), "expected tag rejection for {input:?}");
        }
    }

    #[test]
    fn chunked_reader_parses_via_parser() {
        // The reader tokenizer path was removed; verify the equivalent via the parser.
        let input = b"UNA:+.? 'BGM+220+test?+value'UNT+2+1'";
        let reader = std::io::BufReader::new(std::io::Cursor::new(input));
        let segments = crate::parser::from_bufread(reader).expect("parser should succeed");
        assert!(segments.iter().any(|s| s.tag == "BGM"));
        // The parser resolves the release sequence: `test?+value` becomes `test+value`.
        let bgm = segments.iter().find(|s| s.tag == "BGM").unwrap();
        let first_component = bgm
            .elements
            .get(1)
            .and_then(|element| element.components.first())
            .map(|text| text.as_str());
        assert_eq!(first_component, Some("test+value"));
    }
}