Skip to main content

edifact_rs/
writer.rs

1//! EDIFACT writer — serializes [`Segment`]s to wire format.
2
3use crate::{error::EdifactError, model::Segment, tokenizer::ServiceStringAdvice};
4use std::io::Write;
5
/// Streaming EDIFACT writer.
///
/// Wraps any [`Write`] implementation and serializes segments one at a time,
/// using the delimiters held in its [`ServiceStringAdvice`].
/// Call [`Writer::finish`] to flush and get the underlying writer back.
pub struct Writer<W: Write> {
    /// Destination sink that receives the serialized bytes.
    inner: W,
    /// Active delimiter set: used for separators, the segment terminator, and
    /// to decide which bytes in data values need a release character.
    ssa: ServiceStringAdvice,
    /// Running count of segments written.  `u64` to prevent silent overflow on
    /// pathological inputs (a `u32` would wrap after ~4 billion segments).
    segment_count: u64,
}
17
18impl<W: Write> Writer<W> {
19    /// Create a new writer with default EDIFACT delimiters.
20    pub fn new(inner: W) -> Self {
21        Self {
22            inner,
23            ssa: ServiceStringAdvice::default(),
24            segment_count: 0,
25        }
26    }
27
28    /// Create a writer with custom delimiters and write a UNA segment first.
29    pub fn with_una(mut inner: W, ssa: ServiceStringAdvice) -> Result<Self, EdifactError> {
30        // All five active service characters must be mutually distinct, non-whitespace,
31        // and within the ASCII range so they never bisect multi-byte UTF-8 sequences.
32        if !ssa.is_valid() {
33            return Err(EdifactError::InvalidUna);
34        }
35        // UNA: component_sep, element_sep, decimal_mark, release_char, space, segment_term
36        let una = [
37            b'U',
38            b'N',
39            b'A',
40            ssa.component_sep,
41            ssa.element_sep,
42            ssa.decimal_mark,
43            ssa.release_char,
44            b' ',
45            ssa.segment_term,
46        ];
47        inner.write_all(&una)?;
48        Ok(Self {
49            inner,
50            ssa,
51            segment_count: 0,
52        })
53    }
54
55    /// Write a single segment.
56    pub fn write_segment(&mut self, seg: &Segment<'_>) -> Result<(), EdifactError> {
57        // Tag
58        self.inner.write_all(seg.tag.as_bytes())?;
59
60        for element in &seg.elements {
61            // Element separator
62            self.inner.write_all(&[self.ssa.element_sep])?;
63            let mut first_component = true;
64            for component in &element.components {
65                if !first_component {
66                    self.inner.write_all(&[self.ssa.component_sep])?;
67                }
68                first_component = false;
69                self.write_escaped(component)?;
70            }
71        }
72
73        // Segment terminator
74        self.inner.write_all(&[self.ssa.segment_term])?;
75        self.segment_count += 1;
76        Ok(())
77    }
78
79    /// Write a raw segment from tag + element string slices.
80    ///
81    /// Each element string is split on the **active component-separator byte** from the
82    /// configured [`ServiceStringAdvice`][crate::ServiceStringAdvice] to identify component
83    /// boundaries.  The default component separator is `:` (0x3A), but this can differ when a
84    /// non-default `UNA` string was used to construct the writer.
85    ///
86    /// # Delimiter dependency
87    ///
88    /// Callers that embed the literal `:` character in element strings rely on `:` being
89    /// the component separator.  When the writer uses a non-default delimiter set, `:` will
90    /// **not** be treated as a component boundary and the segment will be written incorrectly.
91    ///
92    /// **UTF-8 safety**: EDIFACT syntax requires all delimiter bytes to be single-byte ASCII
93    /// characters (values 0x00–0x7F).  Non-ASCII delimiter bytes would bisect multi-byte UTF-8
94    /// sequences in data values and produce malformed output.  All fields of
95    /// [`ServiceStringAdvice`][crate::ServiceStringAdvice] must therefore hold ASCII byte values.
96    ///
97    /// To produce correct output regardless of the active delimiter, prefer
98    /// [`Self::write_segment_parts`] which accepts pre-split component slices.
99    pub fn write_raw(&mut self, tag: &str, elements: &[&str]) -> Result<(), EdifactError> {
100        self.inner.write_all(tag.as_bytes())?;
101        let comp_sep = self.ssa.component_sep;
102        for el in elements {
103            self.inner.write_all(&[self.ssa.element_sep])?;
104            // Byte-level split: EDIFACT delimiters are always single bytes.
105            let mut parts = el.as_bytes().split(|&b| b == comp_sep);
106            if let Some(first) = parts.next() {
107                // INVARIANT: input is valid UTF-8 and we split on a single-byte ASCII
108                // delimiter, so each part remains a valid UTF-8 slice.
109                self.write_escaped(
110                    std::str::from_utf8(first).map_err(|_| EdifactError::InvalidUtf8)?,
111                )?;
112            }
113            for part in parts {
114                self.inner.write_all(&[comp_sep])?;
115                self.write_escaped(
116                    std::str::from_utf8(part).map_err(|_| EdifactError::InvalidUtf8)?,
117                )?;
118            }
119        }
120        self.inner.write_all(&[self.ssa.segment_term])?;
121        self.segment_count += 1;
122        Ok(())
123    }
124
125    /// Write a segment from a tag and pre-split element/component data.
126    ///
127    /// `elements` is a slice of elements; each element is a sequence of component strings.
128    /// This avoids the lifetime constraints of [`Self::write_segment`] when building
129    /// segments from runtime-owned data (e.g. inside [`crate::WriterEmitter`]).
130    pub fn write_segment_parts<E>(&mut self, tag: &str, elements: &[E]) -> Result<(), EdifactError>
131    where
132        E: AsRef<[String]>,
133    {
134        self.inner.write_all(tag.as_bytes())?;
135        for element in elements {
136            self.inner.write_all(&[self.ssa.element_sep])?;
137            let mut first = true;
138            for comp in element.as_ref() {
139                if !first {
140                    self.inner.write_all(&[self.ssa.component_sep])?;
141                }
142                first = false;
143                self.write_escaped(comp.as_str())?;
144            }
145        }
146        self.inner.write_all(&[self.ssa.segment_term])?;
147        self.segment_count += 1;
148        Ok(())
149    }
150
151    /// Flush and return the underlying writer.
152    pub fn finish(mut self) -> Result<W, EdifactError> {
153        self.inner.flush()?;
154        Ok(self.inner)
155    }
156
157    /// Write the `UNT` segment and return the inner writer.
158    ///
159    /// The segment count written into `UNT` element 1 (DE 0074) is the number of
160    /// segments already written **plus one** for the `UNT` segment itself, which
161    /// EDIFACT requires to be included in the count alongside `UNH`.
162    ///
163    /// # Errors
164    ///
165    /// Returns an error if writing fails.  Do **not** call [`write_raw`][Self::write_raw] or
166    /// [`write_segment`][Self::write_segment] after `finish_unt` — the writer is consumed.
167    pub fn finish_unt(mut self, message_ref: &str) -> Result<W, EdifactError> {
168        // DE 0074: count includes UNH and UNT themselves.
169        let count = self.segment_count + 1;
170        let count_str = count.to_string();
171        self.write_raw("UNT", &[count_str.as_str(), message_ref])?;
172        self.finish()
173    }
174
175    /// Returns the total number of segments written so far.
176    pub fn segment_count(&self) -> u64 {
177        self.segment_count
178    }
179
180    /// Returns the active [`ServiceStringAdvice`] (delimiter configuration).
181    pub fn service_string_advice(&self) -> ServiceStringAdvice {
182        self.ssa
183    }
184    /// Write only the segment tag bytes — no element separator or terminator.
185    ///
186    /// Used by [`crate::WriterEmitter`] for eager, zero-allocation event writing.
187    #[inline]
188    pub(crate) fn write_tag_only(&mut self, tag: &str) -> Result<(), EdifactError> {
189        self.inner.write_all(tag.as_bytes())?;
190        Ok(())
191    }
192
193    /// Write one element separator byte.
194    #[inline]
195    pub(crate) fn write_element_sep(&mut self) -> Result<(), EdifactError> {
196        self.inner.write_all(&[self.ssa.element_sep])?;
197        Ok(())
198    }
199
200    /// Write one component separator byte.
201    #[inline]
202    pub(crate) fn write_component_sep(&mut self) -> Result<(), EdifactError> {
203        self.inner.write_all(&[self.ssa.component_sep])?;
204        Ok(())
205    }
206
207    /// Write the segment terminator and increment the internal segment counter.
208    #[inline]
209    pub(crate) fn write_segment_term_and_count(&mut self) -> Result<(), EdifactError> {
210        self.inner.write_all(&[self.ssa.segment_term])?;
211        self.segment_count += 1;
212        Ok(())
213    }
214
215    /// Write a value, escaping any delimiter characters.
216    pub(crate) fn write_escaped(&mut self, value: &str) -> Result<(), EdifactError> {
217        let (elem, comp, release, term) = (
218            self.ssa.element_sep,
219            self.ssa.component_sep,
220            self.ssa.release_char,
221            self.ssa.segment_term,
222        );
223        let bytes = value.as_bytes();
224        let mut last = 0;
225        let mut pos = 0;
226        while pos < bytes.len() {
227            // Use memchr3 for three delimiters + memchr for the fourth to avoid
228            // a manual byte-by-byte scan.
229            let remaining = &bytes[pos..];
230            let hit_ecr = memchr::memchr3(elem, comp, release, remaining);
231            let hit_t = memchr::memchr(term, remaining);
232            let hit = match (hit_ecr, hit_t) {
233                (None, None) => break,
234                (Some(a), None) => a,
235                (None, Some(b)) => b,
236                (Some(a), Some(b)) => a.min(b),
237            };
238            let abs = pos + hit;
239            if abs > last {
240                self.inner.write_all(&bytes[last..abs])?;
241            }
242            self.inner.write_all(&[release, bytes[abs]])?;
243            last = abs + 1;
244            pos = abs + 1;
245        }
246        self.inner.write_all(&bytes[last..])?;
247        Ok(())
248    }
249}
250
#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::Element;

    /// A plain segment is serialized as tag, `+`-separated elements, and `'`.
    #[test]
    fn write_and_parse_simple_segment() {
        let segs: Vec<Segment<'static>> = vec![Segment::new(
            "BGM",
            vec![Element::of(&["220"]), Element::of(&["ORDER123"])],
        )];
        let bytes = crate::segments_to_bytes(&segs).unwrap();
        let s = std::str::from_utf8(&bytes).unwrap();
        assert!(s.starts_with("BGM+220+ORDER123'"));
    }

    /// Delimiter bytes inside a data value are prefixed with the release character.
    #[test]
    fn release_char_escaped() {
        let segs: Vec<Segment<'static>> = vec![Segment::new(
            "FTX",
            vec![Element::of(&["value+with+delimiters"])],
        )];
        let bytes = crate::segments_to_bytes(&segs).unwrap();
        let s = std::str::from_utf8(&bytes).unwrap();
        // The `+` in the value must be escaped as `?+`
        assert!(s.contains("?+"), "escape missing: {s}");
    }

    /// Written segments can be parsed back with their tags and first element intact.
    #[test]
    fn round_trip_preserves_values() {
        let segs: Vec<Segment<'static>> = vec![
            Segment::new(
                "UNB",
                vec![
                    Element::of(&["UNOA", "1"]),
                    Element::of(&["SENDER"]),
                    Element::of(&["RECEIVER"]),
                ],
            ),
            Segment::new("UNZ", vec![Element::of(&["0"]), Element::of(&["1"])]),
        ];
        let bytes = crate::segments_to_bytes(&segs).unwrap();
        let rt: Vec<crate::OwnedSegment> = crate::parser::from_reader(std::io::Cursor::new(&bytes))
            .expect("round-trip parse failed");
        assert_eq!(rt[0].tag, "UNB");
        assert_eq!(rt[0].as_borrowed().element_str(0), Some("UNOA"));
        assert_eq!(rt[1].tag, "UNZ");
    }

    /// Verify that `Writer::with_una` uses the configured delimiters throughout,
    /// and that `write_segment_parts` (the delimiter-agnostic API) produces correct
    /// component separators even with a non-default UNA.
    #[test]
    fn with_una_non_default_delimiters() {
        use crate::tokenizer::ServiceStringAdvice;

        // Custom UNA: comp_sep=|  elem_sep=!  esc=?  dec_mark=,  seg_term=~
        let ssa = ServiceStringAdvice {
            component_sep: b'|',
            element_sep: b'!',
            release_char: b'?',
            decimal_mark: b',',
            segment_term: b'~',
        };

        let buf = Vec::new();
        let mut writer = Writer::with_una(buf, ssa).expect("writer creation failed");

        // write_segment_parts: pre-split; no hard-coded `:` in element strings
        writer
            .write_segment_parts(
                "BGM",
                &[
                    vec!["220".to_owned(), "SUB1".to_owned()],
                    vec!["PO1".to_owned()],
                ],
            )
            .expect("write failed");

        // The UNA header is emitted by `with_una` but is not counted as a segment.
        assert_eq!(writer.segment_count(), 1);

        let out = writer.finish().expect("finish failed");
        let s = std::str::from_utf8(&out).unwrap();

        // Exact comparison: substring checks alone would still pass if the element
        // and component separators were swapped or a default delimiter leaked.
        // Layout: UNA + comp_sep, elem_sep, dec_mark, release, space, seg_term,
        // followed by the BGM segment using `!`, `|` and `~`.
        assert_eq!(s, "UNA|!,? ~BGM!220|SUB1!PO1~");
    }

    /// `finish_unt` counts the `UNT` segment itself and excludes the `UNA` header.
    #[test]
    fn finish_unt_counts_itself() {
        let ssa = ServiceStringAdvice {
            component_sep: b':',
            element_sep: b'+',
            release_char: b'?',
            decimal_mark: b'.',
            segment_term: b'\'',
        };

        let mut writer = Writer::with_una(Vec::new(), ssa).expect("writer creation failed");
        writer
            .write_raw("UNH", &["1", "ORDERS:D:96A:UN"])
            .expect("write failed");
        assert_eq!(writer.segment_count(), 1);

        let out = writer.finish_unt("1").expect("finish_unt failed");
        let s = std::str::from_utf8(&out).unwrap();
        // UNH + UNT = 2 segments.
        assert_eq!(s, "UNA:+.? 'UNH+1+ORDERS:D:96A:UN'UNT+2+1'");
    }
}