Skip to main content

edifact_rs/
writer.rs

1//! EDIFACT writer — serializes [`Segment`]s to wire format.
2
3use crate::{error::EdifactError, model::Segment, tokenizer::ServiceStringAdvice};
4use std::borrow::Cow;
5use std::io::Write;
6
7/// Streaming EDIFACT writer.
8///
9/// Wraps any [`Write`] implementation and serializes segments one at a time.
10/// Call [`Writer::finish`] to flush and get the underlying writer back.
11pub struct Writer<W: Write> {
12    inner: W,
13    ssa: ServiceStringAdvice,
14    /// Running count of segments written.  `u64` to prevent silent overflow on
15    /// pathological inputs (a `u32` would wrap after ~4 billion segments).
16    segment_count: u64,
17}
18
19impl<W: Write> Writer<W> {
20    /// Create a new writer with default EDIFACT delimiters.
21    pub fn new(inner: W) -> Self {
22        Self {
23            inner,
24            ssa: ServiceStringAdvice::default(),
25            segment_count: 0,
26        }
27    }
28
29    /// Create a writer with custom delimiters and write a UNA segment first.
30    pub fn with_una(mut inner: W, ssa: ServiceStringAdvice) -> Result<Self, EdifactError> {
31        // All five active service characters must be mutually distinct, non-whitespace,
32        // and within the ASCII range so they never bisect multi-byte UTF-8 sequences.
33        if !ssa.is_valid() {
34            return Err(EdifactError::InvalidUna);
35        }
36        // UNA: component_sep, element_sep, decimal_mark, release_char, space, segment_term
37        let una = [
38            b'U',
39            b'N',
40            b'A',
41            ssa.component_sep,
42            ssa.element_sep,
43            ssa.decimal_mark,
44            ssa.release_char,
45            b' ',
46            ssa.segment_term,
47        ];
48        inner.write_all(&una)?;
49        Ok(Self {
50            inner,
51            ssa,
52            segment_count: 0,
53        })
54    }
55
56    /// Write a single segment.
57    pub fn write_segment(&mut self, seg: &Segment<'_>) -> Result<(), EdifactError> {
58        // Tag
59        self.inner.write_all(seg.tag.as_bytes())?;
60
61        for element in &seg.elements {
62            // Element separator
63            self.inner.write_all(&[self.ssa.element_sep])?;
64            let mut first_component = true;
65            for (component, _) in &element.components {
66                if !first_component {
67                    self.inner.write_all(&[self.ssa.component_sep])?;
68                }
69                first_component = false;
70                self.write_escaped(component)?;
71            }
72        }
73
74        // Segment terminator
75        self.inner.write_all(&[self.ssa.segment_term])?;
76        self.segment_count += 1;
77        Ok(())
78    }
79
80    /// Write a raw segment from tag + element string slices.
81    ///
82    /// Each element string is split on the **active component-separator byte** from the
83    /// configured [`ServiceStringAdvice`][crate::ServiceStringAdvice] to identify component
84    /// boundaries.  The default component separator is `:` (0x3A), but this can differ when a
85    /// non-default `UNA` string was used to construct the writer.
86    ///
87    /// # Delimiter dependency
88    ///
89    /// Callers that embed the literal `:` character in element strings rely on `:` being
90    /// the component separator.  When the writer uses a non-default delimiter set, `:` will
91    /// **not** be treated as a component boundary and the segment will be written incorrectly.
92    ///
93    /// **UTF-8 safety**: EDIFACT syntax requires all delimiter bytes to be single-byte ASCII
94    /// characters (values 0x00–0x7F).  Non-ASCII delimiter bytes would bisect multi-byte UTF-8
95    /// sequences in data values and produce malformed output.  All fields of
96    /// [`ServiceStringAdvice`][crate::ServiceStringAdvice] must therefore hold ASCII byte values.
97    ///
98    /// To produce correct output regardless of the active delimiter, prefer
99    /// [`Self::write_segment_parts`] which accepts pre-split component slices.
100    pub fn write_raw(&mut self, tag: &str, elements: &[&str]) -> Result<(), EdifactError> {
101        self.inner.write_all(tag.as_bytes())?;
102        let comp_sep = self.ssa.component_sep;
103        for el in elements {
104            self.inner.write_all(&[self.ssa.element_sep])?;
105            // Byte-level split: EDIFACT delimiters are always single bytes.
106            let mut parts = el.as_bytes().split(|&b| b == comp_sep);
107            if let Some(first) = parts.next() {
108                // INVARIANT: input is valid UTF-8 and we split on a single-byte ASCII
109                // delimiter, so each part remains a valid UTF-8 slice.
110                self.write_escaped(
111                    std::str::from_utf8(first).map_err(|_| EdifactError::InvalidUtf8)?,
112                )?;
113            }
114            for part in parts {
115                self.inner.write_all(&[comp_sep])?;
116                self.write_escaped(
117                    std::str::from_utf8(part).map_err(|_| EdifactError::InvalidUtf8)?,
118                )?;
119            }
120        }
121        self.inner.write_all(&[self.ssa.segment_term])?;
122        self.segment_count += 1;
123        Ok(())
124    }
125
126    /// Write a segment from a tag and pre-split element/component data.
127    ///
128    /// `elements` is a slice of elements; each element is a sequence of component strings.
129    /// This avoids the lifetime constraints of [`Self::write_segment`] when building
130    /// segments from runtime-owned data (e.g. inside [`crate::WriterEmitter`]).
131    pub fn write_segment_parts<E>(&mut self, tag: &str, elements: &[E]) -> Result<(), EdifactError>
132    where
133        E: AsRef<[String]>,
134    {
135        self.inner.write_all(tag.as_bytes())?;
136        for element in elements {
137            self.inner.write_all(&[self.ssa.element_sep])?;
138            let mut first = true;
139            for comp in element.as_ref() {
140                if !first {
141                    self.inner.write_all(&[self.ssa.component_sep])?;
142                }
143                first = false;
144                self.write_escaped(comp.as_str())?;
145            }
146        }
147        self.inner.write_all(&[self.ssa.segment_term])?;
148        self.segment_count += 1;
149        Ok(())
150    }
151
152    /// Flush and return the underlying writer.
153    pub fn finish(mut self) -> Result<W, EdifactError> {
154        self.inner.flush()?;
155        Ok(self.inner)
156    }
157
158    /// Write the `UNT` segment and return the inner writer.
159    ///
160    /// The segment count written into `UNT` element 1 (DE 0074) is the number of
161    /// segments already written **plus one** for the `UNT` segment itself, which
162    /// EDIFACT requires to be included in the count alongside `UNH`.
163    ///
164    /// # Errors
165    ///
166    /// Returns an error if writing fails.  Do **not** call [`write_raw`][Self::write_raw] or
167    /// [`write_segment`][Self::write_segment] after `finish_unt` — the writer is consumed.
168    pub fn finish_unt(mut self, message_ref: &str) -> Result<W, EdifactError> {
169        // DE 0074: count includes UNH and UNT themselves.
170        let count = self.segment_count + 1;
171        let count_str = count.to_string();
172        self.write_raw("UNT", &[count_str.as_str(), message_ref])?;
173        self.finish()
174    }
175
176    /// Returns the total number of segments written so far.
177    pub fn segment_count(&self) -> u64 {
178        self.segment_count
179    }
180
181    /// Returns the active [`ServiceStringAdvice`] (delimiter configuration).
182    pub fn service_string_advice(&self) -> ServiceStringAdvice {
183        self.ssa
184    }
185
186    /// Escape a value string for inclusion in an EDIFACT segment.
187    ///
188    /// Any character in `value` that matches the active element separator,
189    /// component separator, release character, or segment terminator is escaped
190    /// by prefixing it with the release character (default `?`).
191    ///
192    /// Returns a borrowed `Cow::Borrowed(value)` when no escaping is needed,
193    /// avoiding an allocation on the fast path.
194    ///
195    /// # Example
196    ///
197    /// ```rust,ignore
198    /// let writer = Writer::new(std::io::sink());
199    /// // '+' must be escaped since it is the default element separator.
200    /// assert_eq!(writer.escape_value("price+tax"), "price?+tax");
201    /// ```
202    pub fn escape_value<'v>(&self, value: &'v str) -> Cow<'v, str> {
203        let (elem, comp, release, term) = (
204            self.ssa.element_sep,
205            self.ssa.component_sep,
206            self.ssa.release_char,
207            self.ssa.segment_term,
208        );
209        let bytes = value.as_bytes();
210        let needs_escape = bytes
211            .iter()
212            .any(|&b| b == elem || b == comp || b == release || b == term);
213        if !needs_escape {
214            return Cow::Borrowed(value);
215        }
216        let mut out = Vec::with_capacity(value.len() + 4);
217        let mut last = 0;
218        let mut pos = 0;
219        while pos < bytes.len() {
220            let remaining = &bytes[pos..];
221            let hit_ecr = memchr::memchr3(elem, comp, release, remaining);
222            let hit_t = memchr::memchr(term, remaining);
223            let hit = match (hit_ecr, hit_t) {
224                (None, None) => break,
225                (Some(a), None) => a,
226                (None, Some(b)) => b,
227                (Some(a), Some(b)) => a.min(b),
228            };
229            let abs = pos + hit;
230            out.extend_from_slice(&bytes[last..abs]);
231            out.push(release);
232            out.push(bytes[abs]);
233            last = abs + 1;
234            pos = abs + 1;
235        }
236        out.extend_from_slice(&bytes[last..]);
237        // SAFETY:
238        //   1. `value` is a valid `&str`, so `bytes` is valid UTF-8 to start.
239        //   2. `self.ssa.release_char` is a single-byte ASCII value (0x21–0x7E),
240        //      enforced at construction time by `ServiceStringAdvice::is_valid()`
241        //      (called in `Writer::with_una`; the default SSA hardcodes `?` = 0x3F).
242        //      Inserting a single ASCII byte cannot split or corrupt a multi-byte
243        //      UTF-8 sequence, because ASCII bytes always have the high bit clear
244        //      while continuation bytes of multi-byte sequences always have the high
245        //      bit set (0x80–0xBF).
246        //   3. All other bytes are copied verbatim from the valid UTF-8 source.
247        Cow::Owned(unsafe { String::from_utf8_unchecked(out) })
248    }
249    /// Write only the segment tag bytes — no element separator or terminator.
250    ///
251    /// Used by [`crate::WriterEmitter`] for eager, zero-allocation event writing.
252    #[inline]
253    pub(crate) fn write_tag_only(&mut self, tag: &str) -> Result<(), EdifactError> {
254        self.inner.write_all(tag.as_bytes())?;
255        Ok(())
256    }
257
258    /// Write one element separator byte.
259    #[inline]
260    pub(crate) fn write_element_sep(&mut self) -> Result<(), EdifactError> {
261        self.inner.write_all(&[self.ssa.element_sep])?;
262        Ok(())
263    }
264
265    /// Write one component separator byte.
266    #[inline]
267    pub(crate) fn write_component_sep(&mut self) -> Result<(), EdifactError> {
268        self.inner.write_all(&[self.ssa.component_sep])?;
269        Ok(())
270    }
271
272    /// Write the segment terminator and increment the internal segment counter.
273    #[inline]
274    pub(crate) fn write_segment_term_and_count(&mut self) -> Result<(), EdifactError> {
275        self.inner.write_all(&[self.ssa.segment_term])?;
276        self.segment_count += 1;
277        Ok(())
278    }
279
280    /// Write a value, escaping any delimiter characters.
281    pub(crate) fn write_escaped(&mut self, value: &str) -> Result<(), EdifactError> {
282        let (elem, comp, release, term) = (
283            self.ssa.element_sep,
284            self.ssa.component_sep,
285            self.ssa.release_char,
286            self.ssa.segment_term,
287        );
288        let bytes = value.as_bytes();
289        let mut last = 0;
290        let mut pos = 0;
291        while pos < bytes.len() {
292            // Use memchr3 for three delimiters + memchr for the fourth to avoid
293            // a manual byte-by-byte scan.
294            let remaining = &bytes[pos..];
295            let hit_ecr = memchr::memchr3(elem, comp, release, remaining);
296            let hit_t = memchr::memchr(term, remaining);
297            let hit = match (hit_ecr, hit_t) {
298                (None, None) => break,
299                (Some(a), None) => a,
300                (None, Some(b)) => b,
301                (Some(a), Some(b)) => a.min(b),
302            };
303            let abs = pos + hit;
304            if abs > last {
305                self.inner.write_all(&bytes[last..abs])?;
306            }
307            self.inner.write_all(&[release, bytes[abs]])?;
308            last = abs + 1;
309            pos = abs + 1;
310        }
311        self.inner.write_all(&bytes[last..])?;
312        Ok(())
313    }
314}
315
316#[cfg(test)]
317mod tests {
318    use super::*;
319    use crate::model::Element;
320
321    #[test]
322    fn write_and_parse_simple_segment() {
323        let segs: Vec<Segment<'static>> = vec![Segment::new(
324            "BGM",
325            vec![Element::of(&["220"]), Element::of(&["ORDER123"])],
326        )];
327        let bytes = crate::segments_to_bytes(&segs).unwrap();
328        let s = std::str::from_utf8(&bytes).unwrap();
329        assert!(s.starts_with("BGM+220+ORDER123'"));
330    }
331
332    #[test]
333    fn release_char_escaped() {
334        let segs: Vec<Segment<'static>> = vec![Segment::new(
335            "FTX",
336            vec![Element::of(&["value+with+delimiters"])],
337        )];
338        let bytes = crate::segments_to_bytes(&segs).unwrap();
339        let s = std::str::from_utf8(&bytes).unwrap();
340        // The `+` in the value must be escaped as `?+`
341        assert!(s.contains("?+"), "escape missing: {s}");
342    }
343
344    #[test]
345    fn round_trip_preserves_values() {
346        let segs: Vec<Segment<'static>> = vec![
347            Segment::new(
348                "UNB",
349                vec![
350                    Element::of(&["UNOA", "1"]),
351                    Element::of(&["SENDER"]),
352                    Element::of(&["RECEIVER"]),
353                ],
354            ),
355            Segment::new("UNZ", vec![Element::of(&["0"]), Element::of(&["1"])]),
356        ];
357        let bytes = crate::segments_to_bytes(&segs).unwrap();
358        let rt: Vec<crate::OwnedSegment> = crate::parser::from_reader(std::io::Cursor::new(&bytes))
359            .expect("round-trip parse failed");
360        assert_eq!(rt[0].tag, "UNB");
361        assert_eq!(rt[0].as_borrowed().element_str(0), Some("UNOA"));
362        assert_eq!(rt[1].tag, "UNZ");
363    }
364
365    /// Verify that `Writer::with_una` uses the configured delimiters throughout,
366    /// and that `write_segment_parts` (the delimiter-agnostic API) produces correct
367    /// component separators even with a non-default UNA.
368    #[test]
369    fn with_una_non_default_delimiters() {
370        use crate::tokenizer::ServiceStringAdvice;
371
372        // Custom UNA: comp_sep=|  elem_sep=!  esc=?  dec_mark=,  seg_term=~
373        let ssa = ServiceStringAdvice {
374            component_sep: b'|',
375            element_sep: b'!',
376            release_char: b'?',
377            decimal_mark: b',',
378            segment_term: b'~',
379        };
380
381        let buf = Vec::new();
382        let mut writer = Writer::with_una(buf, ssa).expect("writer creation failed");
383
384        // write_segment_parts: pre-split; no hard-coded `:` in element strings
385        writer
386            .write_segment_parts(
387                "BGM",
388                &[
389                    vec!["220".to_owned(), "SUB1".to_owned()],
390                    vec!["PO1".to_owned()],
391                ],
392            )
393            .expect("write failed");
394
395        let out = writer.finish().expect("finish failed");
396        let s = std::str::from_utf8(&out).unwrap();
397
398        // Output must use `!` as element separator, `|` as component separator, `~` as terminator.
399        // The writer also emits a UNA header when with_una is used.
400        assert!(s.contains("BGM"), "BGM segment missing: {s}");
401        // Slice after UNA so assertions target segment output, not UNA header bytes.
402        let after_una = s.find("BGM").map(|i| &s[i..]).unwrap_or(s);
403        assert!(
404            after_una.contains('!'),
405            "missing element sep in segment: {after_una}"
406        );
407        assert!(
408            after_una.contains('|'),
409            "missing component sep in segment: {after_una}"
410        );
411        assert!(
412            after_una.ends_with('~'),
413            "missing segment term in segment: {after_una}"
414        );
415        // Decimal mark appears in the UNA header (no decimal-bearing values in this segment).
416        assert!(s.contains(','), "missing decimal mark in UNA: {s}");
417        assert!(!s.contains('+'), "default element sep leaked: {s}");
418        assert!(!s.contains(':'), "default component sep leaked: {s}");
419        // segment_term '~' is not the default; ensure no default ' leaks (UNA itself aside)
420        assert!(
421            !after_una.contains('\''),
422            "default segment term leaked after UNA: {after_una}"
423        );
424    }
425}