Skip to main content

edifact_rs/
writer.rs

1//! EDIFACT writer — serializes [`Segment`]s to wire format.
2
3use crate::{error::EdifactError, model::Segment, tokenizer::ServiceStringAdvice};
4use std::io::Write;
5
6/// Streaming EDIFACT writer.
7///
8/// Wraps any [`Write`] implementation and serializes segments one at a time.
9/// Call [`Writer::finish`] to flush and get the underlying writer back.
10pub struct Writer<W: Write> {
11    inner: W,
12    ssa: ServiceStringAdvice,
13    /// Running count of segments written.  `u64` to prevent silent overflow on
14    /// pathological inputs (a `u32` would wrap after ~4 billion segments).
15    segment_count: u64,
16}
17
18impl<W: Write> Writer<W> {
19    /// Create a new writer with default EDIFACT delimiters.
20    pub fn new(inner: W) -> Self {
21        Self {
22            inner,
23            ssa: ServiceStringAdvice::default(),
24            segment_count: 0,
25        }
26    }
27
28    /// Create a writer with custom delimiters and write a UNA segment first.
29    pub fn with_una(mut inner: W, ssa: ServiceStringAdvice) -> Result<Self, EdifactError> {
30        // EDIFACT syntax requires all delimiter bytes to be ASCII (0x00–0x7F).
31        // Non-ASCII bytes would bisect multi-byte UTF-8 sequences in data values.
32        if [
33            ssa.component_sep,
34            ssa.element_sep,
35            ssa.decimal_mark,
36            ssa.release_char,
37            ssa.segment_term,
38        ]
39        .iter()
40        .any(|&b| b > 0x7F)
41        {
42            return Err(EdifactError::InvalidUna);
43        }
44        // UNA: component_sep, element_sep, decimal_mark, release_char, space, segment_term
45        let una = [
46            b'U',
47            b'N',
48            b'A',
49            ssa.component_sep,
50            ssa.element_sep,
51            ssa.decimal_mark,
52            ssa.release_char,
53            b' ',
54            ssa.segment_term,
55        ];
56        inner.write_all(&una)?;
57        Ok(Self {
58            inner,
59            ssa,
60            segment_count: 0,
61        })
62    }
63
64    /// Write a single segment.
65    pub fn write_segment(&mut self, seg: &Segment<'_>) -> Result<(), EdifactError> {
66        // Tag
67        self.inner.write_all(seg.tag.as_bytes())?;
68
69        for element in &seg.elements {
70            // Element separator
71            self.inner.write_all(&[self.ssa.element_sep])?;
72            let mut first_component = true;
73            for component in &element.components {
74                if !first_component {
75                    self.inner.write_all(&[self.ssa.component_sep])?;
76                }
77                first_component = false;
78                self.write_escaped(component)?;
79            }
80        }
81
82        // Segment terminator
83        self.inner.write_all(&[self.ssa.segment_term])?;
84        self.segment_count += 1;
85        Ok(())
86    }
87
88    /// Write a raw segment from tag + element string slices.
89    ///
90    /// Each element string is split on the **active component-separator byte** from the
91    /// configured [`ServiceStringAdvice`][crate::ServiceStringAdvice] to identify component
92    /// boundaries.  The default component separator is `:` (0x3A), but this can differ when a
93    /// non-default `UNA` string was used to construct the writer.
94    ///
95    /// # Delimiter dependency
96    ///
97    /// Callers that embed the literal `:` character in element strings rely on `:` being
98    /// the component separator.  When the writer uses a non-default delimiter set, `:` will
99    /// **not** be treated as a component boundary and the segment will be written incorrectly.
100    ///
101    /// **UTF-8 safety**: EDIFACT syntax requires all delimiter bytes to be single-byte ASCII
102    /// characters (values 0x00–0x7F).  Non-ASCII delimiter bytes would bisect multi-byte UTF-8
103    /// sequences in data values and produce malformed output.  All fields of
104    /// [`ServiceStringAdvice`][crate::ServiceStringAdvice] must therefore hold ASCII byte values.
105    ///
106    /// To produce correct output regardless of the active delimiter, prefer
107    /// [`Self::write_segment_parts`] which accepts pre-split component slices.
108    pub fn write_raw(&mut self, tag: &str, elements: &[&str]) -> Result<(), EdifactError> {
109        self.inner.write_all(tag.as_bytes())?;
110        let comp_sep = self.ssa.component_sep;
111        for el in elements {
112            self.inner.write_all(&[self.ssa.element_sep])?;
113            // Byte-level split: EDIFACT delimiters are always single bytes.
114            let mut parts = el.as_bytes().split(|&b| b == comp_sep);
115            if let Some(first) = parts.next() {
116                // SAFETY: input is valid UTF-8 and we split on a single-byte delimiter,
117                // so each part remains a valid UTF-8 slice.
118                self.write_escaped(
119                    std::str::from_utf8(first).map_err(|_| EdifactError::InvalidUtf8)?,
120                )?;
121            }
122            for part in parts {
123                self.inner.write_all(&[comp_sep])?;
124                self.write_escaped(
125                    std::str::from_utf8(part).map_err(|_| EdifactError::InvalidUtf8)?,
126                )?;
127            }
128        }
129        self.inner.write_all(&[self.ssa.segment_term])?;
130        self.segment_count += 1;
131        Ok(())
132    }
133
134    /// Write a segment from a tag and pre-split element/component data.
135    ///
136    /// `elements` is a slice of elements; each element is a sequence of component strings.
137    /// This avoids the lifetime constraints of [`Self::write_segment`] when building
138    /// segments from runtime-owned data (e.g. inside [`crate::WriterEmitter`]).
139    pub fn write_segment_parts<E>(&mut self, tag: &str, elements: &[E]) -> Result<(), EdifactError>
140    where
141        E: AsRef<[String]>,
142    {
143        self.inner.write_all(tag.as_bytes())?;
144        for element in elements {
145            self.inner.write_all(&[self.ssa.element_sep])?;
146            let mut first = true;
147            for comp in element.as_ref() {
148                if !first {
149                    self.inner.write_all(&[self.ssa.component_sep])?;
150                }
151                first = false;
152                self.write_escaped(comp.as_str())?;
153            }
154        }
155        self.inner.write_all(&[self.ssa.segment_term])?;
156        self.segment_count += 1;
157        Ok(())
158    }
159
160    /// Flush and return the underlying writer.
161    pub fn finish(mut self) -> Result<W, EdifactError> {
162        self.inner.flush()?;
163        Ok(self.inner)
164    }
165
166    /// Returns the total number of segments written so far.
167    pub fn segment_count(&self) -> u64 {
168        self.segment_count
169    }
170
171    /// Write only the segment tag bytes — no element separator or terminator.
172    ///
173    /// Used by [`crate::WriterEmitter`] for eager, zero-allocation event writing.
174    #[inline]
175    pub(crate) fn write_tag_only(&mut self, tag: &str) -> Result<(), EdifactError> {
176        self.inner.write_all(tag.as_bytes())?;
177        Ok(())
178    }
179
180    /// Write one element separator byte.
181    #[inline]
182    pub(crate) fn write_element_sep(&mut self) -> Result<(), EdifactError> {
183        self.inner.write_all(&[self.ssa.element_sep])?;
184        Ok(())
185    }
186
187    /// Write one component separator byte.
188    #[inline]
189    pub(crate) fn write_component_sep(&mut self) -> Result<(), EdifactError> {
190        self.inner.write_all(&[self.ssa.component_sep])?;
191        Ok(())
192    }
193
194    /// Write the segment terminator and increment the internal segment counter.
195    #[inline]
196    pub(crate) fn write_segment_term_and_count(&mut self) -> Result<(), EdifactError> {
197        self.inner.write_all(&[self.ssa.segment_term])?;
198        self.segment_count += 1;
199        Ok(())
200    }
201
202    /// Write a value, escaping any delimiter characters.
203    pub(crate) fn write_escaped(&mut self, value: &str) -> Result<(), EdifactError> {
204        let (elem, comp, release, term) = (
205            self.ssa.element_sep,
206            self.ssa.component_sep,
207            self.ssa.release_char,
208            self.ssa.segment_term,
209        );
210        let bytes = value.as_bytes();
211        let mut pos = 0;
212        while pos < bytes.len() {
213            // Find next byte that needs escaping
214            let end = bytes[pos..]
215                .iter()
216                .position(|&b| b == elem || b == comp || b == release || b == term)
217                .map(|r| pos + r)
218                .unwrap_or(bytes.len());
219            if end > pos {
220                self.inner.write_all(&bytes[pos..end])?;
221            }
222            if end < bytes.len() {
223                self.inner.write_all(&[release, bytes[end]])?;
224                pos = end + 1;
225            } else {
226                break;
227            }
228        }
229        Ok(())
230    }
231}
232
#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::Element;

    /// Non-default delimiter set shared by the custom-`UNA` tests:
    /// comp_sep=`|`  elem_sep=`!`  release=`?`  dec_mark=`,`  seg_term=`~`
    fn custom_ssa() -> ServiceStringAdvice {
        ServiceStringAdvice {
            component_sep: b'|',
            element_sep: b'!',
            release_char: b'?',
            decimal_mark: b',',
            segment_term: b'~',
        }
    }

    #[test]
    fn write_and_parse_simple_segment() {
        let segs: Vec<Segment<'static>> = vec![Segment::new(
            "BGM",
            vec![Element::of(&["220"]), Element::of(&["ORDER123"])],
        )];
        let bytes = crate::segments_to_bytes(&segs).unwrap();
        let s = std::str::from_utf8(&bytes).unwrap();
        assert!(s.starts_with("BGM+220+ORDER123'"));
    }

    #[test]
    fn release_char_escaped() {
        let segs: Vec<Segment<'static>> = vec![Segment::new(
            "FTX",
            vec![Element::of(&["value+with+delimiters"])],
        )];
        let bytes = crate::segments_to_bytes(&segs).unwrap();
        let s = std::str::from_utf8(&bytes).unwrap();
        // The `+` in the value must be escaped as `?+`
        assert!(s.contains("?+"), "escape missing: {s}");
    }

    #[test]
    fn round_trip_preserves_values() {
        let segs: Vec<Segment<'static>> = vec![
            Segment::new(
                "UNB",
                vec![
                    Element::of(&["UNOA", "1"]),
                    Element::of(&["SENDER"]),
                    Element::of(&["RECEIVER"]),
                ],
            ),
            Segment::new("UNZ", vec![Element::of(&["0"]), Element::of(&["1"])]),
        ];
        let bytes = crate::segments_to_bytes(&segs).unwrap();
        let rt: Vec<crate::OwnedSegment> = crate::parser::from_reader(std::io::Cursor::new(&bytes))
            .expect("round-trip parse failed");
        assert_eq!(rt[0].tag, "UNB");
        assert_eq!(rt[0].as_borrowed().element_str(0), Some("UNOA"));
        assert_eq!(rt[1].tag, "UNZ");
    }

    /// Verify that `Writer::with_una` uses the configured delimiters throughout,
    /// and that `write_segment_parts` (the delimiter-agnostic API) produces correct
    /// component separators even with a non-default UNA.
    #[test]
    fn with_una_non_default_delimiters() {
        let buf = Vec::new();
        let mut writer = Writer::with_una(buf, custom_ssa()).expect("writer creation failed");

        // write_segment_parts: pre-split; no hard-coded `:` in element strings
        writer
            .write_segment_parts(
                "BGM",
                &[
                    vec!["220".to_owned(), "SUB1".to_owned()],
                    vec!["PO1".to_owned()],
                ],
            )
            .expect("write failed");
        // The UNA header is not counted as a segment.
        assert_eq!(writer.segment_count(), 1);

        let out = writer.finish().expect("finish failed");
        let s = std::str::from_utf8(&out).unwrap();

        // The output is fully deterministic, so pin it byte for byte: the UNA header
        // (comp, elem, decimal mark, release, space, terminator) followed by the segment
        // using `!` / `|` / `~`.  This also proves none of the default delimiters
        // (`+`, `:`, `'`) leak into the stream.
        assert_eq!(s, "UNA|!,? ~BGM!220|SUB1!PO1~");
    }

    /// `write_raw` must split on the *active* component separator (`|` here) and treat
    /// the default `:` as ordinary data, while still escaping the active element separator.
    #[test]
    fn write_raw_splits_on_active_component_separator() {
        let mut writer =
            Writer::with_una(Vec::new(), custom_ssa()).expect("writer creation failed");
        writer
            .write_raw("BGM", &["220|SUB1", "a:b!c"])
            .expect("write failed");
        assert_eq!(writer.segment_count(), 1);

        let out = writer.finish().expect("finish failed");
        let s = std::str::from_utf8(&out).unwrap();
        assert_eq!(s, "UNA|!,? ~BGM!220|SUB1!a:b?!c~");
    }

    /// Every structural delimiter inside a value — including the release character
    /// itself — must be preceded by the release character; the decimal mark must not.
    #[test]
    fn all_service_characters_are_released() {
        let mut writer =
            Writer::with_una(Vec::new(), custom_ssa()).expect("writer creation failed");
        writer
            .write_segment_parts("FTX", &[vec!["a|b!c?d~e,f".to_owned()]])
            .expect("write failed");

        let out = writer.finish().expect("finish failed");
        let s = std::str::from_utf8(&out).unwrap();
        assert_eq!(s, "UNA|!,? ~FTX!a?|b?!c??d?~e,f~");
    }

    /// A non-ASCII delimiter is rejected up front and nothing reaches the sink.
    #[test]
    fn with_una_rejects_non_ascii_delimiter() {
        let ssa = ServiceStringAdvice {
            segment_term: 0xFF,
            ..custom_ssa()
        };
        let mut sink: Vec<u8> = Vec::new();
        assert!(matches!(
            Writer::with_una(&mut sink, ssa),
            Err(EdifactError::InvalidUna)
        ));
        assert!(sink.is_empty(), "bytes written despite invalid UNA");
    }
}