edifact_rs/tokenizer.rs
1//! EDIFACT tokenizer — splits raw bytes into typed tokens.
2//!
3//! Respects UNA service string advice for non-default delimiters.
4//! Uses `memchr` for fast delimiter scanning (no byte-by-byte inner loops).
5
6use crate::{error::EdifactError, model::Span};
7use memchr::{memchr, memchr3};
8
/// Delimiter set announced by an EDIFACT `UNA` service string advice segment.
///
/// Defaults: `+` (element), `:` (component), `?` (release), `.` (decimal mark),
/// `'` (segment terminator).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ServiceStringAdvice {
    /// Byte that separates data elements (default `+`).
    pub element_sep: u8,
    /// Byte that separates components inside a composite element (default `:`).
    pub component_sep: u8,
    /// Byte that escapes the following byte (default `?`).
    pub release_char: u8,
    /// Decimal notation mark (default `.`; UNA byte 5, ISO 9735-1 §7.1).
    /// The tokenizer never splits on it; it is carried along for downstream use.
    pub decimal_mark: u8,
    /// Byte that ends a segment (default `'`).
    pub segment_term: u8,
}
26
27impl Default for ServiceStringAdvice {
28 fn default() -> Self {
29 Self {
30 element_sep: b'+',
31 component_sep: b':',
32 release_char: b'?',
33 decimal_mark: b'.',
34 segment_term: b'\'',
35 }
36 }
37}
38
39impl ServiceStringAdvice {
40 /// Parse a UNA header from the beginning of an EDIFACT interchange.
41 ///
42 /// If no UNA is present, returns [`ServiceStringAdvice::default`].
43 /// Does not validate that the 6 service characters are mutually distinct;
44 /// use [`ServiceStringAdvice::from_bytes_strict`] when that matters.
45 pub fn from_bytes(input: &[u8]) -> Self {
46 // UNA is 9 bytes: "UNA" + 6 service chars
47 if input.len() >= 9 && &input[..3] == b"UNA" {
48 Self {
49 component_sep: input[3],
50 element_sep: input[4],
51 decimal_mark: input[5],
52 release_char: input[6],
53 // input[7] = repetition separator (ISO 9735-4 §3.1; not modelled here)
54 segment_term: input[8],
55 }
56 } else {
57 Self::default()
58 }
59 }
60
61 /// Parse a UNA header and validate that the four active service characters
62 /// (`element_sep`, `component_sep`, `release_char`, `segment_term`) are all
63 /// mutually distinct and are not ASCII whitespace (`CR`, `LF`, space, tab).
64 ///
65 /// Returns [`EdifactError::InvalidUna`] if the invariant is violated.
66 /// Falls back to [`ServiceStringAdvice::default`] when no UNA is present.
67 pub fn from_bytes_strict(input: &[u8]) -> Result<Self, crate::error::EdifactError> {
68 let ssa = Self::from_bytes(input);
69 if !ssa.is_valid() {
70 return Err(crate::error::EdifactError::InvalidUna);
71 }
72 Ok(ssa)
73 }
74
75 /// Return `true` if the four active service characters are mutually distinct
76 /// and none is ASCII whitespace (`CR`, `LF`, space, tab).
77 pub fn is_valid(&self) -> bool {
78 let [e, c, r, t] = [
79 self.element_sep,
80 self.component_sep,
81 self.release_char,
82 self.segment_term,
83 ];
84 let no_ws = |b: u8| !matches!(b, b' ' | b'\t' | b'\r' | b'\n');
85 // All must be non-whitespace and mutually distinct (6 pairwise checks).
86 no_ws(e)
87 && no_ws(c)
88 && no_ws(r)
89 && no_ws(t)
90 && e != c
91 && e != r
92 && e != t
93 && c != r
94 && c != t
95 && r != t
96 }
97}
98
99/// Token produced by [`Tokenizer`].
100#[derive(Debug, Clone, PartialEq, Eq)]
101pub enum Token<'a> {
102 /// 3-character segment tag (e.g. `"BGM"`)
103 SegmentTag {
104 /// Raw tag value.
105 value: &'a str,
106 /// Source span of the tag.
107 span: Span,
108 },
109 /// Data element value (between element separators)
110 DataElement {
111 /// Raw element value.
112 value: &'a str,
113 /// Source span of the element value.
114 span: Span,
115 },
116 /// Component within a composite data element (between component separators)
117 ComponentElement {
118 /// Raw component value.
119 value: &'a str,
120 /// Source span of the component value.
121 span: Span,
122 },
123 /// Segment terminator — signals the end of a segment
124 SegmentTerminator {
125 /// Source span of the segment terminator byte.
126 span: Span,
127 },
128}
129
/// Owned bytes of a single segment paired with a byte offset.
///
/// NOTE(review): nothing in this part of the file constructs or reads this
/// type; field meanings below are inferred from their names — confirm against
/// the code that uses it.
#[derive(Debug)]
pub(crate) struct RawSegment {
    /// Owned segment bytes.
    pub(crate) bytes: Vec<u8>,
    /// Presumably the offset of the segment's first byte in the source stream.
    pub(crate) start_offset: usize,
}
135
136/// Zero-copy tokenizer over a byte slice.
137///
138/// Yields `Token` values, each borrowing from the original input.
139///
140/// # Segment size guard
141///
142/// Pass a limit to [`Tokenizer::with_limit`] to reject segments that exceed a
143/// byte-length threshold. This bounds both the memory and CPU cost of parsing
144/// a single segment on the zero-copy slice path, and causes an
145/// [`EdifactError::SegmentTooLong`] error when the limit is exceeded.
146/// The default constructor [`Tokenizer::new`] sets no limit (`usize::MAX`).
147pub struct Tokenizer<'a> {
148 input: &'a [u8],
149 pos: usize,
150 ssa: ServiceStringAdvice,
151 state: TokState,
152 /// Maximum allowed segment byte length (tag + elements, **excluding** the
153 /// segment terminator byte itself). Checked in `read_value` and `read_tag`.
154 /// `usize::MAX` = unlimited.
155 max_segment_bytes: usize,
156 /// Byte position where the current segment started (set in `read_tag`).
157 segment_start: usize,
158}
159
/// Position of the tokenizer relative to segment boundaries.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum TokState {
    /// Between segments: the next token must be a segment tag.
    ExpectTag,
    /// After a tag: the next byte is expected to be an element separator,
    /// a component separator, or the segment terminator.
    InSegment,
}
167
168impl<'a> Tokenizer<'a> {
169 /// Return the byte offset of the first non-UNA byte in `input`.
170 ///
171 /// If the input starts with the `UNA` service string advice (first 3
172 /// bytes are `b"UNA"`), the UNA header is exactly 9 bytes long and the
173 /// first segment tag starts at offset 9. Otherwise parsing starts at 0.
174 #[inline]
175 fn una_start_pos(input: &[u8]) -> usize {
176 if input.len() >= 9 && &input[..3] == b"UNA" {
177 9
178 } else {
179 0
180 }
181 }
182
183 /// Construct a zero-copy tokenizer over `input` with explicit service-string advice.
184 ///
185 /// No segment-size limit is applied. Use [`Tokenizer::with_limit`] when
186 /// processing untrusted input to bound CPU and memory usage.
187 ///
188 /// # Security
189 ///
190 /// This constructor imposes **no upper bound** on how many bytes a single
191 /// segment may consume. For untrusted or adversarially crafted input a
192 /// missing segment terminator can cause the tokenizer to scan the entire
193 /// input before returning an error. Call [`Tokenizer::with_limit`]
194 /// instead, or use the higher-level [`crate::from_bytes`] /
195 /// [`crate::from_reader_with_config`] which default to a 64 KiB limit.
196 pub fn new(input: &'a [u8], ssa: ServiceStringAdvice) -> Self {
197 Self {
198 input,
199 pos: Self::una_start_pos(input),
200 ssa,
201 state: TokState::ExpectTag,
202 max_segment_bytes: usize::MAX,
203 segment_start: 0,
204 }
205 }
206
207 /// Construct a tokenizer with a segment-size limit.
208 ///
209 /// If a single segment's byte length (from the start of the tag to the end
210 /// of the last value, not including the terminator itself) exceeds `limit`,
211 /// the iterator returns [`EdifactError::SegmentTooLong`].
212 ///
213 /// # Examples
214 ///
215 /// ```
216 /// use edifact_rs::{ServiceStringAdvice, Tokenizer};
217 ///
218 /// let input = b"BGM+220+PO-4711+9'";
219 /// let ssa = ServiceStringAdvice::default();
220 /// let tokens: Vec<_> = Tokenizer::with_limit(input, ssa, 64)
221 /// .collect::<Result<_, _>>()
222 /// .unwrap();
223 /// assert!(!tokens.is_empty());
224 /// ```
225 pub fn with_limit(input: &'a [u8], ssa: ServiceStringAdvice, max_segment_bytes: usize) -> Self {
226 Self {
227 input,
228 pos: Self::una_start_pos(input),
229 ssa,
230 state: TokState::ExpectTag,
231 max_segment_bytes,
232 segment_start: 0,
233 }
234 }
235
236 /// Current byte position in the input.
237 #[inline]
238 pub fn position(&self) -> usize {
239 self.pos
240 }
241
242 /// Return the service string advice active for this tokenizer.
243 #[inline]
244 pub fn service_string_advice(&self) -> ServiceStringAdvice {
245 self.ssa
246 }
247
248 /// Consume leading whitespace / CR / LF between segments (not inside data values).
249 fn skip_inter_segment_whitespace(&mut self) {
250 while self.pos < self.input.len() {
251 match self.input[self.pos] {
252 b' ' | b'\t' | b'\r' | b'\n' => self.pos += 1,
253 _ => break,
254 }
255 }
256 }
257
258 /// Read a field value starting at `self.pos`, advancing past the value.
259 ///
260 /// Recognises the release character (`?` by default) and returns the raw
261 /// slice including release sequences. The parser layer resolves them.
262 ///
263 /// Uses `memchr3` to bulk-scan over non-special bytes between hits, only
264 /// falling back to a per-byte step when a release character is encountered.
265 fn read_value(&mut self) -> Result<(&'a str, Span), EdifactError> {
266 let start = self.pos;
267 let (elem, comp, release, term) = (
268 self.ssa.element_sep,
269 self.ssa.component_sep,
270 self.ssa.release_char,
271 self.ssa.segment_term,
272 );
273 loop {
274 let remaining = &self.input[self.pos..];
275 if remaining.is_empty() {
276 break;
277 }
278 // Scan for release OR a value-terminating delimiter.
279 // memchr3 can hold three bytes; we combine elem/comp/release.
280 // A separate memchr finds term so we take the nearest hit.
281 let hit_ect = memchr3(elem, comp, release, remaining);
282 let hit_term = memchr(term, remaining);
283 let hit = match (hit_ect, hit_term) {
284 (None, None) => {
285 self.pos += remaining.len();
286 break;
287 }
288 (Some(a), None) => a,
289 (None, Some(b)) => b,
290 (Some(a), Some(b)) => a.min(b),
291 };
292 let b = remaining[hit];
293 if b == release {
294 // A release char must be followed by exactly one escaped byte.
295 // If it is the last byte in the buffer the sequence is malformed.
296 if remaining.len() - hit == 1 {
297 return Err(EdifactError::InvalidReleaseSequence {
298 offset: self.pos + hit,
299 });
300 }
301 // Skip release char + the escaped byte.
302 self.pos += hit + 2;
303 continue;
304 }
305 // b is elem, comp, or term — end of value.
306 self.pos += hit;
307 break;
308 }
309 let span = Span::new(start, self.pos);
310 let value = std::str::from_utf8(&self.input[start..self.pos])
311 .map_err(|_| EdifactError::InvalidText { offset: start })?;
312 // Enforce the per-segment byte-length guard.
313 if self.pos - self.segment_start > self.max_segment_bytes {
314 return Err(EdifactError::SegmentTooLong {
315 offset: self.segment_start,
316 limit: self.max_segment_bytes,
317 });
318 }
319 Ok((value, span))
320 }
321
322 /// Fast scan for the segment tag (exactly 3 ASCII uppercase letters).
323 fn read_tag(&mut self) -> Result<Option<Token<'a>>, EdifactError> {
324 self.skip_inter_segment_whitespace();
325 if self.pos >= self.input.len() {
326 return Ok(None);
327 }
328 let start = self.pos;
329 // A segment tag is terminated by the element separator or segment terminator.
330 // Bound the scan to max_segment_bytes + 1 so adversarial input with no delimiters
331 // cannot force memchr to scan arbitrarily large buffers before we return an error.
332 let input_remaining = &self.input[self.pos..];
333 let scan_limit = self
334 .max_segment_bytes
335 .saturating_add(1)
336 .min(input_remaining.len());
337 let remaining = &input_remaining[..scan_limit];
338 let end = memchr(self.ssa.element_sep, remaining)
339 .or_else(|| memchr(self.ssa.segment_term, remaining))
340 .unwrap_or(remaining.len());
341
342 if end == 0 {
343 // First byte is already a delimiter — tag is zero-length, which is invalid.
344 let byte = self.input[self.pos];
345 self.pos += 1;
346 return Err(EdifactError::InvalidDelimiter {
347 byte,
348 offset: start,
349 });
350 }
351
352 // Enforce the per-segment byte-length guard in read_tag as well.
353 // Without this check, adversarial input with no delimiters could cause
354 // memchr to scan the entire remaining buffer (potentially hundreds of MB).
355 if end > self.max_segment_bytes {
356 // Advance past the offending bytes so the iterator can continue.
357 self.pos = start + end;
358 return Err(EdifactError::SegmentTooLong {
359 offset: start,
360 limit: self.max_segment_bytes,
361 });
362 }
363 let tag_bytes = &self.input[start..start + end];
364 // Always advance pos so errors cannot cause an infinite retry loop.
365 self.pos = start + end;
366 // Record segment start for the size-limit check in read_value.
367 self.segment_start = start;
368 let tag = std::str::from_utf8(tag_bytes)
369 .map_err(|_| EdifactError::InvalidSegmentTag(format!("{tag_bytes:?}")))?;
370 if tag.len() != 3 || !tag.bytes().all(|b| b.is_ascii_uppercase()) {
371 return Err(EdifactError::InvalidSegmentTag(tag.to_owned()));
372 }
373 self.state = TokState::InSegment;
374 Ok(Some(Token::SegmentTag {
375 value: tag,
376 span: Span::new(start, start + end),
377 }))
378 }
379}
380
381impl<'a> Iterator for Tokenizer<'a> {
382 type Item = Result<Token<'a>, EdifactError>;
383
384 fn next(&mut self) -> Option<Self::Item> {
385 loop {
386 if self.pos >= self.input.len() {
387 return None;
388 }
389
390 match self.state {
391 TokState::ExpectTag => {
392 return match self.read_tag() {
393 Ok(Some(tok)) => Some(Ok(tok)),
394 Ok(None) => None,
395 Err(e) => Some(Err(e)),
396 };
397 }
398 TokState::InSegment => {
399 let b = self.input[self.pos];
400 let (elem, comp, term) = (
401 self.ssa.element_sep,
402 self.ssa.component_sep,
403 self.ssa.segment_term,
404 );
405
406 if b == term {
407 let start = self.pos;
408 self.pos += 1;
409 self.state = TokState::ExpectTag;
410 return Some(Ok(Token::SegmentTerminator {
411 span: Span::new(start, self.pos),
412 }));
413 } else if b == elem {
414 self.pos += 1;
415 let (value, span) = match self.read_value() {
416 Ok(value) => value,
417 Err(error) => return Some(Err(error)),
418 };
419 // Peek: is the *next* byte a component sep?
420 // We emit DataElement for the leading sub-element regardless;
421 // subsequent components within the same element are ComponentElement.
422 return Some(Ok(Token::DataElement { value, span }));
423 } else if b == comp {
424 self.pos += 1;
425 let (value, span) = match self.read_value() {
426 Ok(value) => value,
427 Err(error) => return Some(Err(error)),
428 };
429 return Some(Ok(Token::ComponentElement { value, span }));
430 } else if b == b'\r' || b == b'\n' {
431 self.pos += 1;
432 // inter-element whitespace inside a segment — skip
433 continue;
434 } else {
435 // Unexpected byte inside a segment — skip it and report.
436 let offset = self.pos;
437 self.pos += 1; // always advance to prevent infinite retry loop
438 self.state = TokState::ExpectTag;
439 return Some(Err(EdifactError::InvalidDelimiter { byte: b, offset }));
440 }
441 }
442 }
443 }
444 }
445}
446
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `input` using the advice read from its own header, panicking
    /// on the first tokenizer error.
    fn tokens(input: &[u8]) -> Vec<Token<'_>> {
        let advice = ServiceStringAdvice::from_bytes(input);
        let collected: Result<Vec<_>, _> = Tokenizer::new(input, advice).collect();
        collected.expect("tokenize failed")
    }

    /// Values of all `DataElement` tokens, in order of appearance.
    fn data_element_values<'t>(toks: &[Token<'t>]) -> Vec<&'t str> {
        toks.iter()
            .filter_map(|tok| match tok {
                Token::DataElement { value, .. } => Some(*value),
                Token::SegmentTag { .. }
                | Token::ComponentElement { .. }
                | Token::SegmentTerminator { .. } => None,
            })
            .collect()
    }

    #[test]
    fn minimal_unb_unz() {
        let toks = tokens(b"UNB+UNOA:1+SENDER+RECEIVER+200101:0900+1'UNZ+0+1'");
        assert!(matches!(toks[0], Token::SegmentTag { value: "UNB", .. }));
        // The stream must close with the UNZ terminator.
        assert!(matches!(toks.last(), Some(Token::SegmentTerminator { .. })));
    }

    #[test]
    fn release_character_not_a_delimiter() {
        // An escaped separator (`?+`) must stay inside its value.
        let toks = tokens(b"BGM+220+test?+value'");
        assert_eq!(data_element_values(&toks), vec!["220", "test?+value"]);
    }

    #[test]
    fn custom_una_delimiters() {
        // The UNA header switches the element separator to `;`.
        let toks = tokens(b"UNA:;.? 'BGM;220;hello'");
        assert!(matches!(toks[0], Token::SegmentTag { value: "BGM", .. }));
        assert!(data_element_values(&toks).contains(&"220"));
    }

    #[test]
    fn tokens_expose_spans() {
        let toks = tokens(b"BGM+220+ABC'");
        assert!(matches!(
            toks[0],
            Token::SegmentTag {
                value: "BGM",
                span: Span { start: 0, end: 3 }
            }
        ));
        assert!(matches!(
            toks[1],
            Token::DataElement {
                value: "220",
                span: Span { start: 4, end: 7 }
            }
        ));
    }

    #[test]
    fn truncated_input_does_not_panic() {
        // No terminator: whatever the outcome, draining must not panic.
        let input = b"UNB+UNOA:1";
        let _ = Tokenizer::new(input, ServiceStringAdvice::default()).count();
    }

    #[test]
    fn invalid_segment_tags_are_rejected() {
        let bad_inputs: [&[u8]; 5] = [
            b"bgm+220+'",
            b"ABCDE+220+'",
            b"BGM1+220+'",
            b"BGM +220+'",
            b" BG+220+'",
        ];
        for input in bad_inputs {
            let outcome: Result<Vec<_>, _> =
                Tokenizer::new(input, ServiceStringAdvice::default()).collect();
            assert!(outcome.is_err(), "expected tag rejection for {input:?}");
        }
    }

    #[test]
    fn chunked_reader_parses_via_parser() {
        // The reader tokenizer path was removed; exercise the same input
        // through the parser instead.
        let input = b"UNA:+.? 'BGM+220+test?+value'UNT+2+1'";
        let reader = std::io::BufReader::new(std::io::Cursor::new(input));
        let segments = crate::parser::from_bufread(reader).expect("parser should succeed");
        assert!(segments.iter().any(|s| s.tag == "BGM"));
        // The parser resolves the release sequence, so the escaped `+` shows
        // up as a literal `+` in the element text.
        let bgm = segments.iter().find(|s| s.tag == "BGM").unwrap();
        let second_element = bgm
            .elements
            .get(1)
            .and_then(|e| e.components.first())
            .map(|s| s.as_str());
        assert_eq!(second_element, Some("test+value"));
    }
}