edifact_rs/writer.rs
1//! EDIFACT writer — serializes [`Segment`]s to wire format.
2
3use crate::{error::EdifactError, model::Segment, tokenizer::ServiceStringAdvice};
4use std::borrow::Cow;
5use std::io::Write;
6
7/// Streaming EDIFACT writer.
8///
9/// Wraps any [`Write`] implementation and serializes segments one at a time.
10/// Call [`Writer::finish`] to flush and get the underlying writer back.
11pub struct Writer<W: Write> {
12 inner: W,
13 ssa: ServiceStringAdvice,
14 /// Running count of segments written. `u64` to prevent silent overflow on
15 /// pathological inputs (a `u32` would wrap after ~4 billion segments).
16 segment_count: u64,
17}
18
19impl<W: Write> Writer<W> {
20 /// Create a new writer with default EDIFACT delimiters.
21 pub fn new(inner: W) -> Self {
22 Self {
23 inner,
24 ssa: ServiceStringAdvice::default(),
25 segment_count: 0,
26 }
27 }
28
29 /// Create a writer with custom delimiters and write a UNA segment first.
30 pub fn with_una(mut inner: W, ssa: ServiceStringAdvice) -> Result<Self, EdifactError> {
31 // All five active service characters must be mutually distinct, non-whitespace,
32 // and within the ASCII range so they never bisect multi-byte UTF-8 sequences.
33 if !ssa.is_valid() {
34 return Err(EdifactError::InvalidUna);
35 }
36 // UNA: component_sep, element_sep, decimal_mark, release_char, space, segment_term
37 let una = [
38 b'U',
39 b'N',
40 b'A',
41 ssa.component_sep,
42 ssa.element_sep,
43 ssa.decimal_mark,
44 ssa.release_char,
45 b' ',
46 ssa.segment_term,
47 ];
48 inner.write_all(&una)?;
49 Ok(Self {
50 inner,
51 ssa,
52 segment_count: 0,
53 })
54 }
55
56 /// Write a single segment.
57 pub fn write_segment(&mut self, seg: &Segment<'_>) -> Result<(), EdifactError> {
58 // Tag
59 self.inner.write_all(seg.tag.as_bytes())?;
60
61 for element in &seg.elements {
62 // Element separator
63 self.inner.write_all(&[self.ssa.element_sep])?;
64 let mut first_component = true;
65 for (component, _) in &element.components {
66 if !first_component {
67 self.inner.write_all(&[self.ssa.component_sep])?;
68 }
69 first_component = false;
70 self.write_escaped(component)?;
71 }
72 }
73
74 // Segment terminator
75 self.inner.write_all(&[self.ssa.segment_term])?;
76 self.segment_count += 1;
77 Ok(())
78 }
79
80 /// Write a raw segment from tag + element string slices.
81 ///
82 /// Each element string is split on the **active component-separator byte** from the
83 /// configured [`ServiceStringAdvice`][crate::ServiceStringAdvice] to identify component
84 /// boundaries. The default component separator is `:` (0x3A), but this can differ when a
85 /// non-default `UNA` string was used to construct the writer.
86 ///
87 /// # Delimiter dependency
88 ///
89 /// Callers that embed the literal `:` character in element strings rely on `:` being
90 /// the component separator. When the writer uses a non-default delimiter set, `:` will
91 /// **not** be treated as a component boundary and the segment will be written incorrectly.
92 ///
93 /// **UTF-8 safety**: EDIFACT syntax requires all delimiter bytes to be single-byte ASCII
94 /// characters (values 0x00–0x7F). Non-ASCII delimiter bytes would bisect multi-byte UTF-8
95 /// sequences in data values and produce malformed output. All fields of
96 /// [`ServiceStringAdvice`][crate::ServiceStringAdvice] must therefore hold ASCII byte values.
97 ///
98 /// To produce correct output regardless of the active delimiter, prefer
99 /// [`Self::write_segment_parts`] which accepts pre-split component slices.
100 pub fn write_raw(&mut self, tag: &str, elements: &[&str]) -> Result<(), EdifactError> {
101 self.inner.write_all(tag.as_bytes())?;
102 let comp_sep = self.ssa.component_sep;
103 for el in elements {
104 self.inner.write_all(&[self.ssa.element_sep])?;
105 // Byte-level split: EDIFACT delimiters are always single bytes.
106 let mut parts = el.as_bytes().split(|&b| b == comp_sep);
107 if let Some(first) = parts.next() {
108 // INVARIANT: input is valid UTF-8 and we split on a single-byte ASCII
109 // delimiter, so each part remains a valid UTF-8 slice.
110 self.write_escaped(
111 std::str::from_utf8(first).map_err(|_| EdifactError::InvalidUtf8)?,
112 )?;
113 }
114 for part in parts {
115 self.inner.write_all(&[comp_sep])?;
116 self.write_escaped(
117 std::str::from_utf8(part).map_err(|_| EdifactError::InvalidUtf8)?,
118 )?;
119 }
120 }
121 self.inner.write_all(&[self.ssa.segment_term])?;
122 self.segment_count += 1;
123 Ok(())
124 }
125
126 /// Write a segment from a tag and pre-split element/component data.
127 ///
128 /// `elements` is a slice of elements; each element is a sequence of component strings.
129 /// This avoids the lifetime constraints of [`Self::write_segment`] when building
130 /// segments from runtime-owned data (e.g. inside [`crate::WriterEmitter`]).
131 pub fn write_segment_parts<E>(&mut self, tag: &str, elements: &[E]) -> Result<(), EdifactError>
132 where
133 E: AsRef<[String]>,
134 {
135 self.inner.write_all(tag.as_bytes())?;
136 for element in elements {
137 self.inner.write_all(&[self.ssa.element_sep])?;
138 let mut first = true;
139 for comp in element.as_ref() {
140 if !first {
141 self.inner.write_all(&[self.ssa.component_sep])?;
142 }
143 first = false;
144 self.write_escaped(comp.as_str())?;
145 }
146 }
147 self.inner.write_all(&[self.ssa.segment_term])?;
148 self.segment_count += 1;
149 Ok(())
150 }
151
152 /// Flush and return the underlying writer.
153 pub fn finish(mut self) -> Result<W, EdifactError> {
154 self.inner.flush()?;
155 Ok(self.inner)
156 }
157
158 /// Write the `UNT` segment and return the inner writer.
159 ///
160 /// The segment count written into `UNT` element 1 (DE 0074) is the number of
161 /// segments already written **plus one** for the `UNT` segment itself, which
162 /// EDIFACT requires to be included in the count alongside `UNH`.
163 ///
164 /// # Errors
165 ///
166 /// Returns an error if writing fails. Do **not** call [`write_raw`][Self::write_raw] or
167 /// [`write_segment`][Self::write_segment] after `finish_unt` — the writer is consumed.
168 pub fn finish_unt(mut self, message_ref: &str) -> Result<W, EdifactError> {
169 // DE 0074: count includes UNH and UNT themselves.
170 let count = self.segment_count + 1;
171 let count_str = count.to_string();
172 self.write_raw("UNT", &[count_str.as_str(), message_ref])?;
173 self.finish()
174 }
175
176 /// Returns the total number of segments written so far.
177 pub fn segment_count(&self) -> u64 {
178 self.segment_count
179 }
180
181 /// Returns the active [`ServiceStringAdvice`] (delimiter configuration).
182 pub fn service_string_advice(&self) -> ServiceStringAdvice {
183 self.ssa
184 }
185
186 /// Escape a value string for inclusion in an EDIFACT segment.
187 ///
188 /// Any character in `value` that matches the active element separator,
189 /// component separator, release character, or segment terminator is escaped
190 /// by prefixing it with the release character (default `?`).
191 ///
192 /// Returns a borrowed `Cow::Borrowed(value)` when no escaping is needed,
193 /// avoiding an allocation on the fast path.
194 ///
195 /// # Example
196 ///
197 /// ```rust,ignore
198 /// let writer = Writer::new(std::io::sink());
199 /// // '+' must be escaped since it is the default element separator.
200 /// assert_eq!(writer.escape_value("price+tax"), "price?+tax");
201 /// ```
202 pub fn escape_value<'v>(&self, value: &'v str) -> Cow<'v, str> {
203 let (elem, comp, release, term) = (
204 self.ssa.element_sep,
205 self.ssa.component_sep,
206 self.ssa.release_char,
207 self.ssa.segment_term,
208 );
209 let bytes = value.as_bytes();
210 let needs_escape = bytes
211 .iter()
212 .any(|&b| b == elem || b == comp || b == release || b == term);
213 if !needs_escape {
214 return Cow::Borrowed(value);
215 }
216 let mut out = Vec::with_capacity(value.len() + 4);
217 let mut last = 0;
218 let mut pos = 0;
219 while pos < bytes.len() {
220 let remaining = &bytes[pos..];
221 let hit_ecr = memchr::memchr3(elem, comp, release, remaining);
222 let hit_t = memchr::memchr(term, remaining);
223 let hit = match (hit_ecr, hit_t) {
224 (None, None) => break,
225 (Some(a), None) => a,
226 (None, Some(b)) => b,
227 (Some(a), Some(b)) => a.min(b),
228 };
229 let abs = pos + hit;
230 out.extend_from_slice(&bytes[last..abs]);
231 out.push(release);
232 out.push(bytes[abs]);
233 last = abs + 1;
234 pos = abs + 1;
235 }
236 out.extend_from_slice(&bytes[last..]);
237 // SAFETY:
238 // 1. `value` is a valid `&str`, so `bytes` is valid UTF-8 to start.
239 // 2. `self.ssa.release_char` is a single-byte ASCII value (0x21–0x7E),
240 // enforced at construction time by `ServiceStringAdvice::is_valid()`
241 // (called in `Writer::with_una`; the default SSA hardcodes `?` = 0x3F).
242 // Inserting a single ASCII byte cannot split or corrupt a multi-byte
243 // UTF-8 sequence, because ASCII bytes always have the high bit clear
244 // while continuation bytes of multi-byte sequences always have the high
245 // bit set (0x80–0xBF).
246 // 3. All other bytes are copied verbatim from the valid UTF-8 source.
247 Cow::Owned(unsafe { String::from_utf8_unchecked(out) })
248 }
249 /// Write only the segment tag bytes — no element separator or terminator.
250 ///
251 /// Used by [`crate::WriterEmitter`] for eager, zero-allocation event writing.
252 #[inline]
253 pub(crate) fn write_tag_only(&mut self, tag: &str) -> Result<(), EdifactError> {
254 self.inner.write_all(tag.as_bytes())?;
255 Ok(())
256 }
257
258 /// Write one element separator byte.
259 #[inline]
260 pub(crate) fn write_element_sep(&mut self) -> Result<(), EdifactError> {
261 self.inner.write_all(&[self.ssa.element_sep])?;
262 Ok(())
263 }
264
265 /// Write one component separator byte.
266 #[inline]
267 pub(crate) fn write_component_sep(&mut self) -> Result<(), EdifactError> {
268 self.inner.write_all(&[self.ssa.component_sep])?;
269 Ok(())
270 }
271
272 /// Write the segment terminator and increment the internal segment counter.
273 #[inline]
274 pub(crate) fn write_segment_term_and_count(&mut self) -> Result<(), EdifactError> {
275 self.inner.write_all(&[self.ssa.segment_term])?;
276 self.segment_count += 1;
277 Ok(())
278 }
279
280 /// Write a value, escaping any delimiter characters.
281 pub(crate) fn write_escaped(&mut self, value: &str) -> Result<(), EdifactError> {
282 let (elem, comp, release, term) = (
283 self.ssa.element_sep,
284 self.ssa.component_sep,
285 self.ssa.release_char,
286 self.ssa.segment_term,
287 );
288 let bytes = value.as_bytes();
289 let mut last = 0;
290 let mut pos = 0;
291 while pos < bytes.len() {
292 // Use memchr3 for three delimiters + memchr for the fourth to avoid
293 // a manual byte-by-byte scan.
294 let remaining = &bytes[pos..];
295 let hit_ecr = memchr::memchr3(elem, comp, release, remaining);
296 let hit_t = memchr::memchr(term, remaining);
297 let hit = match (hit_ecr, hit_t) {
298 (None, None) => break,
299 (Some(a), None) => a,
300 (None, Some(b)) => b,
301 (Some(a), Some(b)) => a.min(b),
302 };
303 let abs = pos + hit;
304 if abs > last {
305 self.inner.write_all(&bytes[last..abs])?;
306 }
307 self.inner.write_all(&[release, bytes[abs]])?;
308 last = abs + 1;
309 pos = abs + 1;
310 }
311 self.inner.write_all(&bytes[last..])?;
312 Ok(())
313 }
314}
315
316#[cfg(test)]
317mod tests {
318 use super::*;
319 use crate::model::Element;
320
321 #[test]
322 fn write_and_parse_simple_segment() {
323 let segs: Vec<Segment<'static>> = vec![Segment::new(
324 "BGM",
325 vec![Element::of(&["220"]), Element::of(&["ORDER123"])],
326 )];
327 let bytes = crate::segments_to_bytes(&segs).unwrap();
328 let s = std::str::from_utf8(&bytes).unwrap();
329 assert!(s.starts_with("BGM+220+ORDER123'"));
330 }
331
332 #[test]
333 fn release_char_escaped() {
334 let segs: Vec<Segment<'static>> = vec![Segment::new(
335 "FTX",
336 vec![Element::of(&["value+with+delimiters"])],
337 )];
338 let bytes = crate::segments_to_bytes(&segs).unwrap();
339 let s = std::str::from_utf8(&bytes).unwrap();
340 // The `+` in the value must be escaped as `?+`
341 assert!(s.contains("?+"), "escape missing: {s}");
342 }
343
344 #[test]
345 fn round_trip_preserves_values() {
346 let segs: Vec<Segment<'static>> = vec![
347 Segment::new(
348 "UNB",
349 vec![
350 Element::of(&["UNOA", "1"]),
351 Element::of(&["SENDER"]),
352 Element::of(&["RECEIVER"]),
353 ],
354 ),
355 Segment::new("UNZ", vec![Element::of(&["0"]), Element::of(&["1"])]),
356 ];
357 let bytes = crate::segments_to_bytes(&segs).unwrap();
358 let rt: Vec<crate::OwnedSegment> = crate::parser::from_reader(std::io::Cursor::new(&bytes))
359 .expect("round-trip parse failed");
360 assert_eq!(rt[0].tag, "UNB");
361 assert_eq!(rt[0].as_borrowed().element_str(0), Some("UNOA"));
362 assert_eq!(rt[1].tag, "UNZ");
363 }
364
365 /// Verify that `Writer::with_una` uses the configured delimiters throughout,
366 /// and that `write_segment_parts` (the delimiter-agnostic API) produces correct
367 /// component separators even with a non-default UNA.
368 #[test]
369 fn with_una_non_default_delimiters() {
370 use crate::tokenizer::ServiceStringAdvice;
371
372 // Custom UNA: comp_sep=| elem_sep=! esc=? dec_mark=, seg_term=~
373 let ssa = ServiceStringAdvice {
374 component_sep: b'|',
375 element_sep: b'!',
376 release_char: b'?',
377 decimal_mark: b',',
378 segment_term: b'~',
379 };
380
381 let buf = Vec::new();
382 let mut writer = Writer::with_una(buf, ssa).expect("writer creation failed");
383
384 // write_segment_parts: pre-split; no hard-coded `:` in element strings
385 writer
386 .write_segment_parts(
387 "BGM",
388 &[
389 vec!["220".to_owned(), "SUB1".to_owned()],
390 vec!["PO1".to_owned()],
391 ],
392 )
393 .expect("write failed");
394
395 let out = writer.finish().expect("finish failed");
396 let s = std::str::from_utf8(&out).unwrap();
397
398 // Output must use `!` as element separator, `|` as component separator, `~` as terminator.
399 // The writer also emits a UNA header when with_una is used.
400 assert!(s.contains("BGM"), "BGM segment missing: {s}");
401 // Slice after UNA so assertions target segment output, not UNA header bytes.
402 let after_una = s.find("BGM").map(|i| &s[i..]).unwrap_or(s);
403 assert!(
404 after_una.contains('!'),
405 "missing element sep in segment: {after_una}"
406 );
407 assert!(
408 after_una.contains('|'),
409 "missing component sep in segment: {after_una}"
410 );
411 assert!(
412 after_una.ends_with('~'),
413 "missing segment term in segment: {after_una}"
414 );
415 // Decimal mark appears in the UNA header (no decimal-bearing values in this segment).
416 assert!(s.contains(','), "missing decimal mark in UNA: {s}");
417 assert!(!s.contains('+'), "default element sep leaked: {s}");
418 assert!(!s.contains(':'), "default component sep leaked: {s}");
419 // segment_term '~' is not the default; ensure no default ' leaks (UNA itself aside)
420 assert!(
421 !after_una.contains('\''),
422 "default segment term leaked after UNA: {after_una}"
423 );
424 }
425}