Skip to main content

uuencoding_multi/
lib.rs

1//! Multi-part UUencoded Usenet/email post reassembly.
2//!
3//! # Background
4//!
5//! Before MIME attachments became universal, large binary files were shared on
6//! Usenet and via email by UUencoding them and splitting the result across
7//! multiple posts or messages. Each post contained a sequential segment of the
8//! encoded data, identified by a subject-line marker such as `[2/7]` or
9//! `(2 of 7)`. Readers would collect all parts and concatenate the UU bodies
10//! before decoding.
11//!
12//! Multi-part series often began with a part 0 (the TOC post) that listed
13//! the files being distributed along with their sizes and which parts each file
14//! spanned. This crate handles both the TOC and the data parts.
15//!
16//! # What this crate provides
17//!
18//! - [`parse_subject`] — extract part index, part total, and base subject from
19//!   a Usenet/email subject line. Recognises five common marker formats:
20//!   `(N/M)`, `[N/M]`, `Part N/M`, `Part N of M`, and `- N/M`.
21//! - [`PartCollection`] — accumulate [`PartEntry`] values keyed by part number
22//!   until all parts are present, with gap detection and duplicate rejection.
23//! - [`reassemble()`] — validate completeness, concatenate raw UU bodies in
24//!   ascending part order, and decode via the `uuencoding` crate.
25//! - [`parse_toc`] — best-effort parse of a TOC body (part 0), returning a
26//!   [`ParsedToc`] with [`TocEntry`] records for each file listed.
27//!
28//! # What this crate does NOT do
29//!
30//! - **MIME parsing**: this crate operates on raw message body bytes that the
31//!   caller has already extracted from the MIME structure. Use the `mime-tree`
32//!   crate (or equivalent) to parse the enclosing MIME message and locate the
33//!   plain-text body part before passing bytes here.
34//! - **Message fetching or storage**: retrieving articles from an NNTP server,
35//!   reading mailbox files, or persisting collected parts is entirely the
36//!   caller's responsibility.
37//! - **yEnc decoding**: subject lines that contain a `yEnc` marker are
38//!   explicitly rejected by [`parse_subject`] (returns `None`). yEnc is a
39//!   distinct binary encoding with its own tools.
40//!
41//! # Integration with `mime-tree`
42//!
43//! The expected integration pattern is:
44//! 1. Parse the raw RFC 5322 message bytes with `mime-tree` to obtain the
45//!    `Subject` header value and the plain-text body.
46//! 2. Pass the `Subject` string to [`parse_subject`] to identify the part
47//!    number and group key.
48//! 3. Wrap the body bytes in a [`PartEntry`] and insert it into a
49//!    [`PartCollection`] keyed by the base subject.
50//! 4. Once the collection is complete, call [`reassemble()`].
51//!
52//! # Security
53//!
54//! The `data` field of [`ReassembledFile`] is raw decoded bytes that may
55//! represent a compressed archive (`.tar.gz`, `.zip`, `.rar`, etc.). **This
56//! crate never decompresses the output.** Callers that subsequently decompress
57//! the data must apply independent size and resource limits to defend against
58//! decompression-bomb attacks before beginning decompression.
59//!
60//! # End-to-end usage example
61//!
62//! ```no_run
63//! use uuencoding_multi::{
64//!     parse_subject, PartCollection, PartEntry, reassemble,
65//! };
66//!
67//! // Imagine these come from an NNTP server or mailbox.
68//! let raw_messages: Vec<(String, Vec<u8>)> = todo!("fetch messages");
69//!
70//! let mut collections: std::collections::HashMap<String, PartCollection> =
71//!     std::collections::HashMap::new();
72//!
73//! for (subject, body_bytes) in raw_messages {
74//!     // Step 1: parse the subject to identify part number and grouping key.
75//!     let Some(sp) = parse_subject(&subject) else {
76//!         continue; // empty or yEnc subject — skip
77//!     };
78//!     let Some(part_index) = sp.part_index else {
79//!         continue; // no part marker — treat as a plain message
80//!     };
81//!
82//!     // Step 2: accumulate parts by base subject.
83//!     let coll = collections.entry(sp.base_subject).or_default();
84//!     if let Some(total) = sp.part_total {
85//!         if coll.total().is_none() {
86//!             *coll = PartCollection::with_total(total);
87//!         }
88//!     }
89//!     let entry = PartEntry { part_number: part_index, body_bytes, subject: Some(subject) };
90//!     let _ = coll.add(entry); // ignore duplicates
91//! }
92//!
93//! // Step 3: reassemble complete collections.
94//! for (key, coll) in &collections {
95//!     if !coll.is_complete() {
96//!         eprintln!("{key}: still waiting for {:?}", coll.missing_parts());
97//!         continue;
98//!     }
99//!     let file = reassemble(coll).expect("complete collection should decode");
100//!     // IMPORTANT: apply size/resource limits before decompressing `file.data`.
101//!     println!("decoded {} ({} bytes, mode {:o})", file.filename, file.data.len(), file.mode);
102//! }
103//! ```
104
105pub(crate) mod collection;
106pub(crate) mod error;
107pub(crate) mod reassemble;
108pub(crate) mod subject;
109pub(crate) mod toc;
110
111pub use collection::{PartCollection, PartEntry};
112pub use error::MultiUuError;
113pub use reassemble::{reassemble, ReassembledFile};
114pub use subject::parse_subject;
115pub use toc::{parse_toc, ParsedToc, TocEntry};
116
/// Fields extracted from a parsed Usenet/email subject line.
///
/// Returned by [`parse_subject`]. The `base_subject` field can be used as a
/// stable grouping key across parts of the same series.
///
/// # Field invariants
///
/// - `base_subject` is never empty when a `SubjectParts` is returned.
/// - A subject without a recognised part marker is reported with
///   `part_index = None` and `part_total = None`.
/// - `part_total` is always `Some` when `part_index` is `Some`, because every
///   supported marker format includes the total count.
///
/// # Trait implementations
///
/// In addition to `Debug`, the type is `Clone`, `PartialEq`, and `Eq`, so a
/// parsed subject can be duplicated (for example to keep the grouping key
/// after moving `base_subject` elsewhere) and compared directly in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SubjectParts {
    /// Subject line with the part-number marker removed and surrounding
    /// whitespace trimmed. Safe to use as a collection grouping key because
    /// all parts of the same series share the same base subject.
    pub base_subject: String,
    /// Part number extracted from the marker. Data parts are numbered from 1;
    /// `Some(0)` indicates a TOC post (e.g. `(00/17)`). `None` when no
    /// recognised marker was found.
    pub part_index: Option<u32>,
    /// Total number of parts as declared in the subject marker.
    /// Always `Some` when `part_index` is `Some`; `None` otherwise.
    pub part_total: Option<u32>,
}