// edifact_parser/tokenizer.rs
use edifact_primitives::EdifactDelimiters;
/// Splits raw EDIFACT input into segments, elements, and components
/// according to a configurable delimiter set.
pub struct EdifactTokenizer {
    // Delimiter set (segment terminator, element/component separators,
    // release character) consulted by every tokenize_* method.
    delimiters: EdifactDelimiters,
}
10
11impl EdifactTokenizer {
12 pub fn new(delimiters: EdifactDelimiters) -> Self {
14 Self { delimiters }
15 }
16
17 pub fn delimiters(&self) -> &EdifactDelimiters {
19 &self.delimiters
20 }
21
22 pub fn tokenize_segments<'a>(&self, input: &'a [u8]) -> SegmentIter<'a> {
30 SegmentIter {
31 input,
32 pos: 0,
33 segment_terminator: self.delimiters.segment,
34 release_char: self.delimiters.release,
35 }
36 }
37
38 pub fn tokenize_elements<'a>(&self, segment: &'a str) -> ElementIter<'a> {
43 ElementIter {
44 input: segment,
45 pos: 0,
46 separator: self.delimiters.element as char,
47 release: self.delimiters.release as char,
48 }
49 }
50
51 pub fn tokenize_components<'a>(&self, element: &'a str) -> ComponentIter<'a> {
55 ComponentIter {
56 input: element,
57 pos: 0,
58 separator: self.delimiters.component as char,
59 release: self.delimiters.release as char,
60 }
61 }
62}
63
/// Iterator over the segments of a raw EDIFACT byte stream.
///
/// Produced by [`EdifactTokenizer::tokenize_segments`]; yields one `&str`
/// per segment with the terminator and edge CR/LF removed.
pub struct SegmentIter<'a> {
    // Full raw input being scanned.
    input: &'a [u8],
    // Byte offset where the next scan resumes.
    pos: usize,
    // Byte that ends a segment (`'` with the default delimiters — see tests).
    segment_terminator: u8,
    // Escape byte (`?` with the default delimiters — see tests).
    release_char: u8,
}
71
72impl<'a> Iterator for SegmentIter<'a> {
73 type Item = &'a str;
74
75 fn next(&mut self) -> Option<Self::Item> {
76 while self.pos < self.input.len() {
78 let b = self.input[self.pos];
79 if b == b'\r' || b == b'\n' || b == b' ' || b == b'\t' {
80 self.pos += 1;
81 } else {
82 break;
83 }
84 }
85
86 if self.pos >= self.input.len() {
87 return None;
88 }
89
90 let start = self.pos;
91 let mut i = self.pos;
92
93 while i < self.input.len() {
94 let b = self.input[i];
95
96 if b == b'\r' || b == b'\n' {
98 i += 1;
99 continue;
100 }
101
102 if b == self.release_char && i + 1 < self.input.len() {
104 i += 2; continue;
106 }
107
108 if b == self.segment_terminator {
109 let segment_bytes = &self.input[start..i];
111 self.pos = i + 1;
112
113 let segment_str = strip_crlf(segment_bytes);
115 if segment_str.is_empty() {
116 return self.next(); }
118 return Some(segment_str);
119 }
120
121 i += 1;
122 }
123
124 if start < self.input.len() {
126 let segment_bytes = &self.input[start..];
127 self.pos = self.input.len();
128 let segment_str = strip_crlf(segment_bytes);
129 if segment_str.is_empty() {
130 return None;
131 }
132 return Some(segment_str);
133 }
134
135 None
136 }
137}
138
/// Decodes a raw segment byte slice as UTF-8 and trims any leading or
/// trailing CR/LF characters.
///
/// Invalid UTF-8 yields the empty string, which the segment iterator treats
/// as an empty segment and skips.
fn strip_crlf(bytes: &[u8]) -> &str {
    let text = std::str::from_utf8(bytes).unwrap_or_default();
    text.trim_matches(|c: char| matches!(c, '\r' | '\n'))
}
149
/// Iterator over the data elements of a single segment string.
///
/// Produced by [`EdifactTokenizer::tokenize_elements`]; splits on the
/// element separator while honoring the release (escape) character.
pub struct ElementIter<'a> {
    // Segment text being split.
    input: &'a str,
    // Byte offset of the next element; `len + 1` marks exhaustion.
    pos: usize,
    // Element separator (`+` with the default delimiters — see tests).
    separator: char,
    // Release/escape character (`?` with the default delimiters — see tests).
    release: char,
}
157
158impl<'a> Iterator for ElementIter<'a> {
159 type Item = &'a str;
160
161 fn next(&mut self) -> Option<Self::Item> {
162 if self.pos > self.input.len() {
163 return None;
164 }
165
166 let start = self.pos;
167 let bytes = self.input.as_bytes();
168 let mut i = self.pos;
169
170 while i < bytes.len() {
171 let ch = bytes[i] as char;
172
173 if ch == self.release && i + 1 < bytes.len() {
175 i += 2;
176 continue;
177 }
178
179 if ch == self.separator {
180 let element = &self.input[start..i];
181 self.pos = i + 1;
182 return Some(element);
183 }
184
185 i += 1;
186 }
187
188 if start <= self.input.len() {
190 let element = &self.input[start..];
191 self.pos = self.input.len() + 1; return Some(element);
193 }
194
195 None
196 }
197}
198
/// Iterator over the components of a single data element string.
///
/// Produced by [`EdifactTokenizer::tokenize_components`]; splits on the
/// component separator while honoring the release (escape) character.
pub struct ComponentIter<'a> {
    // Element text being split.
    input: &'a str,
    // Byte offset of the next component; `len + 1` marks exhaustion.
    pos: usize,
    // Component separator (`:` with the default delimiters — see tests).
    separator: char,
    // Release/escape character (`?` with the default delimiters — see tests).
    release: char,
}
206
207impl<'a> Iterator for ComponentIter<'a> {
208 type Item = &'a str;
209
210 fn next(&mut self) -> Option<Self::Item> {
211 if self.pos > self.input.len() {
212 return None;
213 }
214
215 let start = self.pos;
216 let bytes = self.input.as_bytes();
217 let mut i = self.pos;
218
219 while i < bytes.len() {
220 let ch = bytes[i] as char;
221
222 if ch == self.release && i + 1 < bytes.len() {
224 i += 2;
225 continue;
226 }
227
228 if ch == self.separator {
229 let component = &self.input[start..i];
230 self.pos = i + 1;
231 return Some(component);
232 }
233
234 i += 1;
235 }
236
237 if start <= self.input.len() {
239 let component = &self.input[start..];
240 self.pos = self.input.len() + 1;
241 return Some(component);
242 }
243
244 None
245 }
246}
247
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a tokenizer configured with the standard UN/EDIFACT delimiter
    /// set; shared by most of the cases below.
    fn default_tokenizer() -> EdifactTokenizer {
        EdifactTokenizer::new(EdifactDelimiters::default())
    }

    #[test]
    fn test_tokenize_segments_simple() {
        let raw: &[u8] = b"UNB+UNOC:3'UNH+00001'UNT+2+00001'UNZ+1'";
        let segs: Vec<&str> = default_tokenizer().tokenize_segments(raw).collect();
        assert_eq!(segs, ["UNB+UNOC:3", "UNH+00001", "UNT+2+00001", "UNZ+1"]);
    }

    #[test]
    fn test_tokenize_segments_with_newlines() {
        // Line breaks between segments must be skipped entirely.
        let raw: &[u8] = b"UNB+UNOC:3'\nUNH+00001'\r\nUNT+2+00001'\nUNZ+1'";
        let segs: Vec<&str> = default_tokenizer().tokenize_segments(raw).collect();
        assert_eq!(segs, ["UNB+UNOC:3", "UNH+00001", "UNT+2+00001", "UNZ+1"]);
    }

    #[test]
    fn test_tokenize_segments_with_release_char() {
        // `?'` is an escaped terminator and must not split the segment.
        let raw: &[u8] = b"FTX+ACB+++text with ?'quotes?''";
        let segs: Vec<&str> = default_tokenizer().tokenize_segments(raw).collect();
        assert_eq!(segs, ["FTX+ACB+++text with ?'quotes?'"]);
    }

    #[test]
    fn test_tokenize_segments_empty_input() {
        assert!(default_tokenizer().tokenize_segments(b"").next().is_none());
    }

    #[test]
    fn test_tokenize_segments_trailing_whitespace() {
        let segs: Vec<&str> = default_tokenizer()
            .tokenize_segments(b"UNH+00001' \n ")
            .collect();
        assert_eq!(segs, ["UNH+00001"]);
    }

    #[test]
    fn test_tokenize_segments_custom_delimiter() {
        // Segment terminator swapped from the default to `!`.
        let delims = EdifactDelimiters {
            segment: b'!',
            ..EdifactDelimiters::default()
        };
        let segs: Vec<&str> = EdifactTokenizer::new(delims)
            .tokenize_segments(b"UNB+UNOC:3!UNH+00001!")
            .collect();
        assert_eq!(segs, ["UNB+UNOC:3", "UNH+00001"]);
    }

    #[test]
    fn test_tokenize_elements() {
        let elems: Vec<&str> = default_tokenizer()
            .tokenize_elements("NAD+Z04+9900123000002:500")
            .collect();
        assert_eq!(elems, ["NAD", "Z04", "9900123000002:500"]);
    }

    #[test]
    fn test_tokenize_elements_escaped_plus() {
        // `?+` is an escaped separator; consecutive `+` yield empty elements.
        let elems: Vec<&str> = default_tokenizer()
            .tokenize_elements("FTX+ACB+++value with ?+plus")
            .collect();
        assert_eq!(elems, ["FTX", "ACB", "", "", "value with ?+plus"]);
    }

    #[test]
    fn test_tokenize_components() {
        let comps: Vec<&str> = default_tokenizer()
            .tokenize_components("UTILMD:D:11A:UN:S2.1")
            .collect();
        assert_eq!(comps, ["UTILMD", "D", "11A", "UN", "S2.1"]);
    }

    #[test]
    fn test_tokenize_components_escaped_colon() {
        // `?:` is an escaped component separator.
        let comps: Vec<&str> = default_tokenizer()
            .tokenize_components("value?:with:colon")
            .collect();
        assert_eq!(comps, ["value?:with", "colon"]);
    }

    #[test]
    fn test_tokenize_components_empty() {
        // `::` produces an empty component in the middle.
        let comps: Vec<&str> = default_tokenizer()
            .tokenize_components("Z04::500")
            .collect();
        assert_eq!(comps, ["Z04", "", "500"]);
    }

    #[test]
    fn test_full_tokenization_pipeline() {
        // Segment -> element -> component drill-down on a two-segment input.
        let tok = default_tokenizer();
        let raw: &[u8] = b"NAD+Z04+9900123000002::293'DTM+137:202501010000?+01:303'";

        let segs: Vec<&str> = tok.tokenize_segments(raw).collect();
        assert_eq!(segs.len(), 2);

        let nad: Vec<&str> = tok.tokenize_elements(segs[0]).collect();
        assert_eq!(nad, ["NAD", "Z04", "9900123000002::293"]);

        let nad_comps: Vec<&str> = tok.tokenize_components(nad[2]).collect();
        assert_eq!(nad_comps, ["9900123000002", "", "293"]);

        let dtm: Vec<&str> = tok.tokenize_elements(segs[1]).collect();
        assert_eq!(dtm, ["DTM", "137:202501010000?+01:303"]);

        let dtm_comps: Vec<&str> = tok.tokenize_components(dtm[1]).collect();
        assert_eq!(dtm_comps, ["137", "202501010000?+01", "303"]);
    }
}