edifact-rs 0.7.0

Zero-copy EDIFACT parser, writer, serde traits, and extensible validation support
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
#![cfg_attr(docsrs, feature(doc_cfg))]

//! `edifact-rs` — zero-copy EDIFACT tokenizer, parser, writer, serde traits,
//! validation engine, and extensible directory support.
//!
//! `edifact-rs` is the main entry point of this workspace. The core parsing,
//! writing, and validation infrastructure is always available. Custom directory
//! validators can be implemented by downstream crates or generated through
//! external build tooling.
//!
//! # Quick start
//! ```
//! use edifact_rs::from_bytes;
//! let input = b"UNB+UNOA:1+SENDER+RECEIVER+200101:0900+1'UNZ+0+1'";
//! let segments: Vec<_> = from_bytes(input).collect::<Result<_, _>>().unwrap();
//! assert_eq!(segments[0].tag, "UNB");
//! ```
//!
//! # Crate features
//!
//! - `derive` (enabled by default): re-exports the derive macros from
//!   `edifact-rs-derive`.
//! - `diagnostics` (disabled by default): enables rich diagnostic output via `miette`.
//!   When enabled, errors implement `miette::Diagnostic` for enhanced error reporting.
//!   This feature adds an optional dependency and has no impact on parsing performance.
//!
//! The crate is expected to compile both with defaults and with
//! `--no-default-features` for consumers who only want the core parsing and
//! writing functionality.
//!
//! ## Feature matrix workflows
//!
//! - default features:
//!   `cargo test -p edifact-rs`
//! - no default features:
//!   `cargo test -p edifact-rs --no-default-features`
//! - all features:
//!   `cargo test -p edifact-rs --all-features`
//!
//! # Diagnostic Feature
//!
//! When the `diagnostics` feature is enabled, [`EdifactError`] gains additional
//! traits and methods that enable rich, human-readable error output:
//!
//! ```text
//! Error: invalid delimiter byte 0xAB at offset 42
//!
//!  ╭─ input.edi:2:3
//!
//!  2 │ UNB+UNOA:1+....[invalid]...
//!  │         ^^^ invalid byte here
//!
//! Error Code: E002
//! Help: The byte 0xAB is not a valid delimiter. Check UNA configuration
//! ```
//!
//! This feature is useful for CLI tools and error reporting, but is not required
//! for applications that handle errors programmatically.
//!
//! # Parse And Text Contracts
//!
//! Parsing in `edifact-rs` is strict and deterministic:
//!
//! - Segment and element text must decode as UTF-8 (`E003` on failure).
//! - Release characters must escape exactly one following byte.
//!   A trailing `?` at end-of-input is rejected (`E019`).
//! - Malformed delimiters and truncated segments are reported with stable
//!   error codes rather than panicking.
//!
//! These contracts apply to both slice-based parsing (`from_bytes`) and
//! reader-based parsing (`from_reader`).
//!
//! ```
//! use edifact_rs::from_reader;
//! use std::io::Cursor;
//!
//! let input = b"UNA:;.? 'BGM;220;test?;value'";
//! let segments = from_reader(Cursor::new(&input[..])).unwrap();
//! assert_eq!(segments.len(), 1);
//! assert_eq!(segments[0].tag, "BGM");
//! assert_eq!(segments[0].elements[0].components[0], "220");
//! assert_eq!(segments[0].elements[1].components[0], "test;value");
//! ```
//!
//! # Validation Quick Start
//!
//! The `Validator` trait and `ValidationContext` provide a flexible framework
//! for building custom validators. Users can generate validators from official
//! UNECE sources or implement their own.
//!
//! See the [`Validator`] trait documentation and the `cookbook_fixture_validation.rs`
//! example for details on creating custom validators.
//!
//! # Custom Profile Packs
//!
//! `ProfileRulePack` is the extension point for downstream MIG/profile crates.
//! Packs can be authored with public APIs only and plugged into a
//! [`ValidationContext`]:
//!
//! ```
//! use edifact_rs::{
//!     from_bytes, ProfileRulePack, ValidationContext, ValidationIssue, ValidationSeverity,
//! };
//!
//! let segments: Vec<_> = from_bytes(b"UNH+1+ORDERS:D:96A:UN'BGM+220+PO123+9'UNT+3+1'")
//!     .collect::<Result<_, _>>()?;
//!
//! let pack = ProfileRulePack::new("ORDERS-DEMO")
//!     .for_message_type("ORDERS")
//!     .with_stateless_rule_fn(|segments| {
//!         let bgm = segments.iter().find(|segment| segment.tag == "BGM")?;
//!         let document_code = bgm.get_element(0)?.get_component(0)?;
//!         (document_code == "220").then(|| {
//!             ValidationIssue::new(
//!                 ValidationSeverity::Warning,
//!                 "demo pack rejects BGM 220 for illustration",
//!             )
//!             .with_rule_id("DEMO-P001")
//!             .with_segment("BGM")
//!             .with_element_index(0)
//!         })
//!     });
//!
//! let report = ValidationContext::builder()
//!     .with_profile_pack(pack)
//!     .build()
//!     .validate_lenient(&segments);
//!
//! assert!(report.has_warnings());
//! let partner_report = report.filter_by_rule_prefix("DEMO-");
//! assert!(partner_report.total_issues() >= 1);
//! # Ok::<(), edifact_rs::EdifactError>(())
//! ```
//!
//! # Async Usage
//!
//! `edifact-rs` does not provide a native `async` API.  All parsing is
//! synchronous and driven by the standard `std::io::Read` / `std::io::BufRead`
//! traits.  The recommended integration pattern with async runtimes is:
//!
//! 1. Use your async runtime's read utilities to read the entire message into a
//!    `Vec<u8>` (e.g. `tokio::io::AsyncReadExt::read_to_end`).
//! 2. Parse the in-memory slice with [`from_bytes`].
//!
//! ```rust,no_run
//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
//! // With tokio:
//! // let mut buf = Vec::new();
//! // reader.read_to_end(&mut buf).await?;
//! // let segments: Vec<_> = edifact_rs::from_bytes(&buf).collect::<Result<_, _>>()?;
//! # Ok(())
//! # }
//! ```
//!
//! A native zero-copy streaming async API is tracked as a future roadmap item.
// ── core modules ──────────────────────────────────────────────────────────────
pub mod directory_validator;
pub(crate) mod envelope;
/// Error types and validation reporting primitives.
pub(crate) mod error;
pub mod group;
/// Core zero-copy and owned EDIFACT data model types.
pub(crate) mod model;
pub(crate) mod parser;
pub(crate) mod tokenizer;
pub(crate) mod validator;
pub(crate) mod writer;

// ── typed serialization layer ─────────────────────────────────────────────────
pub mod de;
pub(crate) mod event;
pub mod ser;

// ── flat re-exports: core ─────────────────────────────────────────────────────
pub use envelope::{
    InterchangeEnvelope, MessageEnvelope, MessageIdentifier, parse_unh, validate_envelope,
};
pub use error::{EdifactError, IoError, ValidationIssue, ValidationReport, ValidationSeverity};
pub use group::{GroupDef, SegmentGroup, group_segments};
pub use model::{
    BorrowedElement, BorrowedSegment, Element, OwnedElement, OwnedSegment, Segment, Span,
};
pub use parser::{
    Parser, ReaderConfig, from_bufread, from_bufread_stream, from_bufread_stream_with_config,
    from_reader_with_config,
};
pub use tokenizer::{ServiceStringAdvice, Tokenizer};
pub use validator::{
    EnvelopeValidator, ProfileRule, ProfileRulePack, ValidationContext, ValidationContextBuilder,
    ValidationLayer, ValidationRuleContext, Validator, validate_each,
};
pub use writer::Writer;

// ── flat re-exports: serde ────────────────────────────────────────────────────

/// User-facing deserialization API.
pub use de::{
    CompositeElement, DispatchedMessage, EdifactCompositeDeserialize, EdifactDeserialize,
    EdifactSegmentTag, MessageDispatch, MessageWindow, MessageWindowsIter, MessageWindowsSliceIter,
    OwnedMessageWindow, SegmentAccessor, deserialize, deserialize_all_from_reader,
    deserialize_all_streaming, deserialize_first_from_reader, deserialize_first_streaming,
    deserialize_messages_bytes, deserialize_messages_from_reader, deserialize_str,
    groups_are_contiguous_by_qualifier, message_windows_bytes, message_windows_from_reader,
};

/// Alias for [`message_windows_bytes`] — a more discoverable entry-point for
/// window-based message parsing.
///
/// Splits a byte slice into [`MessageWindow`] views, one per UNH/UNT envelope,
/// enabling parallel or lazy per-message processing without copying data.
///
/// # Example
/// ```rust,ignore
/// use edifact_rs::from_bytes_windows;
/// let windows: Vec<_> = from_bytes_windows(input).collect();
/// ```
pub use de::message_windows_bytes as from_bytes_windows;

// ── Proc-macro support ─────────────────────────────────────────────────────────
/// Private implementation helpers used by code generated from `#[derive(EdifactDeserialize)]`.
///
/// **This module is not part of the public API.**  Names, signatures, and
/// existence of items inside `__private` may change in any release without a
/// semver bump.  Do not depend on this module directly.
#[doc(hidden)]
pub mod __private {
    pub use super::de::{
        composite_element, contiguous_groups_by_qualifier, element_str, find_qualified_segment,
        find_qualified_segment_owned, find_segment, find_segment_owned, find_segment_typed,
        find_segments_iter, find_segments_typed, get_components_iter, optional_component,
        optional_element, qualifier_matches_pattern, required_component, required_element,
    };
}
pub use directory_validator::{
    DirectoryValidator, DirectoryValidatorBuilder, ElementRef, OwnedElementRef, OwnedSegmentDef,
    SegmentDefinition, Status,
};
#[cfg(feature = "derive")]
#[cfg_attr(docsrs, doc(cfg(feature = "derive")))]
pub use edifact_rs_derive::{EdifactDeserialize, EdifactSerialize};
pub use event::{EdifactEvent, EventEmitter, OwnedEdifactEvent, VecEmitter, WriterEmitter};
pub use ser::{
    DecimalFloat, DecimalFloatDisplay, EdifactCompositeSerialize, EdifactSerialize, to_bytes,
    to_edifact_string,
};

// ── core free functions ───────────────────────────────────────────────────────

use std::io::{Read, Write};

/// Iterator returned by [`from_bytes`] and [`from_bytes_with_config`].
///
/// Yields borrowed [`Segment`]s tied to the lifetime of the input slice and
/// enforces the optional segment-count and byte budgets taken from the
/// [`ReaderConfig`] it was built with.
pub struct FromBytesIter<'a> {
    /// Active parser. `None` when construction failed, or after one of the
    /// configured limits has been reached.
    parser: Option<parser::Parser<'a>>,
    /// Error captured while building the iterator; it is handed out by the
    /// first call to `next()` and then cleared.
    pending_error: Option<EdifactError>,
    /// Remaining segment allowance (`None` = unlimited).
    /// Decremented once for every successfully yielded segment.
    segments_remaining: Option<usize>,
    /// Maximum byte budget (`None` = unlimited).
    /// Despite the name, this value is a fixed ceiling that is only ever
    /// compared against `bytes_consumed`; it is never decremented.
    bytes_remaining: Option<u64>,
    /// Byte offset of the start of the current parse position (approximated
    /// as the sum of previously yielded segment spans — the borrowed tokenizer
    /// does not expose a byte counter, so we track it from `Segment::span`).
    bytes_consumed: u64,
}

/// Iterator returned by [`from_reader_iter`].
///
/// Wraps the parser's owned-segment stream, which reads from the caller's
/// reader through a [`std::io::BufReader`], and yields [`OwnedSegment`]s.
pub struct FromReaderIter<R: Read> {
    /// Underlying segment stream; every `next()` call is forwarded to it.
    inner: parser::OwnedSegmentStream<std::io::BufReader<R>>,
}

/// Streaming iteration over owned segments read from `R`.
impl<R: Read> Iterator for FromReaderIter<R> {
    type Item = Result<OwnedSegment, EdifactError>;

    /// Forward to the wrapped segment stream without any additional
    /// bookkeeping; the stream's result is returned unchanged.
    fn next(&mut self) -> Option<Self::Item> {
        self.inner.next()
    }
}

impl<'a> Iterator for FromBytesIter<'a> {
    type Item = Result<Segment<'a>, EdifactError>;

    fn next(&mut self) -> Option<Self::Item> {
        if let Some(err) = self.pending_error.take() {
            return Some(Err(err));
        }
        // max_segments guard
        if let Some(ref mut remaining) = self.segments_remaining {
            if *remaining == 0 {
                self.parser = None;
                return None;
            }
        }
        // max_input_bytes guard
        if let Some(max) = self.bytes_remaining {
            if self.bytes_consumed >= max {
                self.parser = None;
                return None;
            }
        }
        let item = self.parser.as_mut()?.next();
        if let Some(Ok(ref seg)) = item {
            // Decrement segment allowance
            if let Some(ref mut remaining) = self.segments_remaining {
                *remaining = remaining.saturating_sub(1);
            }
            // Update byte counter from segment span and eagerly stop if exhausted
            self.bytes_consumed = self.bytes_consumed.saturating_add(seg.span.len() as u64);
            if let Some(max) = self.bytes_remaining {
                if self.bytes_consumed >= max {
                    self.parser = None;
                }
            }
        }
        item
    }
}

/// Parse `input` bytes into a lazy iterator of borrowed [`Segment`]s.
///
/// The yielded segments borrow directly from `input`, so no segment data is
/// copied or allocated.
///
/// # Segment-size limit
///
/// Parsing runs with `ReaderConfig::default()`, i.e. the default 64 KiB
/// per-segment limit that the reader-based path also uses.
/// Call [`from_bytes_with_config`] to choose different limits.
pub fn from_bytes(input: &[u8]) -> FromBytesIter<'_> {
    let default_limits = parser::ReaderConfig::default();
    from_bytes_with_config(input, default_limits)
}

/// Parse `input` bytes into an iterator of [`Segment`]s using caller-supplied limits.
///
/// Every limit carried by [`ReaderConfig`] is honoured:
/// - `max_segment_bytes`: a single segment larger than this yields
///   [`EdifactError::SegmentTooLong`].
/// - `max_segments`: iteration stops after this many segments have been yielded.
/// - `max_input_bytes`: iteration stops once this many bytes have been consumed.
///   The count is approximated from segment spans, and the segment that pushes
///   consumption over the threshold is still returned.
///
/// `ReaderConfig::default()` gives the default 64 KiB per-segment limit with
/// no segment-count or byte-budget cap.
///
/// If the service string advice cannot be determined from `input`, no parser
/// is created; the error is stored and returned by the first call to `next()`.
///
/// # Example
///
/// ```
/// use edifact_rs::{ReaderConfig, from_bytes_with_config};
///
/// let cfg = ReaderConfig::default().max_segment_bytes(128);
/// let result: Result<Vec<_>, _> = from_bytes_with_config(b"BGM+220+1+9'", cfg).collect();
/// assert!(result.is_ok());
/// ```
pub fn from_bytes_with_config<'a>(
    input: &'a [u8],
    config: parser::ReaderConfig,
) -> FromBytesIter<'a> {
    let segment_cap = config.max_segments;
    let byte_cap = config.max_input_bytes;

    // Exactly one of the two slots is populated: a ready parser on success,
    // or the deferred error when the delimiters could not be established.
    let (active_parser, deferred_error) =
        match tokenizer::ServiceStringAdvice::from_bytes_strict(input) {
            Ok(advice) => {
                let tokens =
                    tokenizer::Tokenizer::with_limit(input, advice, config.max_segment_bytes);
                (Some(parser::Parser::new(tokens)), None)
            }
            Err(error) => (None, Some(error)),
        };

    FromBytesIter {
        parser: active_parser,
        pending_error: deferred_error,
        segments_remaining: segment_cap,
        bytes_remaining: byte_cap,
        bytes_consumed: 0,
    }
}

/// Parse a reader into owned segments.
///
/// The complete result is collected into a `Vec` before this function
/// returns; use [`from_reader_iter`] to receive segments incrementally
/// instead. This is a thin wrapper around the parser module's `from_reader`.
///
/// # Errors
///
/// Returns an error if the input contains malformed EDIFACT syntax,
/// invalid UTF-8 segment text, dangling release sequences, or underlying I/O failures.
pub fn from_reader<R: Read>(reader: R) -> Result<Vec<OwnedSegment>, EdifactError> {
    parser::from_reader(reader)
}

/// Parse `input` bytes into an iterator of fully owned [`OwnedSegment`]s.
///
/// This is [`from_bytes`] with each successfully parsed borrowed [`Segment`]
/// converted into an [`OwnedSegment`] as it is yielded; errors pass through
/// unchanged. The conversion happens per item while the iterator is driven,
/// and the returned iterator itself still borrows `input`.
///
/// Use this when segments must be stored or returned without keeping a
/// reference to the original byte slice.
///
/// # Example
///
/// ```
/// let segs: Vec<edifact_rs::OwnedSegment> = edifact_rs::from_bytes_owned(b"BGM+220+1+9'")
///     .collect::<Result<_, _>>()
///     .unwrap();
/// assert_eq!(segs[0].tag, "BGM");
/// ```
pub fn from_bytes_owned(
    input: &[u8],
) -> impl Iterator<Item = Result<OwnedSegment, EdifactError>> + '_ {
    let borrowed_segments = from_bytes(input);
    borrowed_segments.map(|parsed| parsed.map(OwnedSegment::from))
}

/// Parse a reader into owned segments, delivered through a streaming iterator.
///
/// Segments are yielded one at a time rather than being collected into a
/// full interchange first, which keeps memory use bounded.
pub fn from_reader_iter<R: Read>(reader: R) -> FromReaderIter<R> {
    let inner = parser::from_reader_stream(reader);
    FromReaderIter { inner }
}

/// Serialize `segments`, in iteration order, into any [`std::io::Write`] sink.
///
/// A [`Writer`] is created around `w`, each segment is written to it, and
/// the writer is then finished. Whatever value `finish` produces on success
/// is discarded.
///
/// # Errors
///
/// Returns the first error reported while writing a segment, or the error
/// reported when finishing the writer.
pub fn to_writer<'a, 'b, W, I>(w: W, segments: I) -> Result<(), EdifactError>
where
    'b: 'a,
    W: Write,
    I: IntoIterator<Item = &'a Segment<'b>>,
{
    let mut sink = writer::Writer::new(w);
    for segment in segments {
        sink.write_segment(segment)?;
    }
    // Only success or failure matters to the caller; drop the finish value.
    sink.finish().map(drop)
}

/// Serialize `segments` into a freshly allocated `Vec<u8>`.
///
/// Convenience wrapper around [`to_writer`] that uses an in-memory buffer as
/// the sink and returns it once all segments have been written.
///
/// # Errors
///
/// Returns an error if serialization fails; the partially filled buffer is
/// discarded in that case.
pub fn segments_to_bytes<'a, 'b, I>(segments: I) -> Result<Vec<u8>, EdifactError>
where
    'b: 'a,
    I: IntoIterator<Item = &'a Segment<'b>>,
{
    let mut encoded = Vec::new();
    to_writer(&mut encoded, segments).map(|()| encoded)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The slice-based entry point must surface a bad `UNA` header as
    /// `EdifactError::InvalidUna` instead of yielding segments.
    /// NOTE(review): the header is presumably rejected because its first two
    /// service characters are both `:` — confirm against the tokenizer.
    #[test]
    fn from_bytes_rejects_invalid_una() {
        let parsed: Result<Vec<_>, _> = from_bytes(b"UNA::.? 'BGM:220'").collect();
        let failure = parsed.expect_err("invalid UNA should fail slice parsing");
        assert!(matches!(failure, EdifactError::InvalidUna));
    }
}