yencoding 0.2.0

yEnc encoding and decoding for Usenet binary posts
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
//! yEnc encode implementation.
//!
//! The core algorithm: add 42 to each byte (mod 256). Four specific encoded
//! values must be escaped by emitting `=` followed by the encoded value plus
//! 64 (mod 256):
//! - `\0` (NUL, 0x00) — would terminate C strings and confuse some software
//! - `\n` (LF, 0x0A) — line terminator; would be misread as end of data line
//! - `\r` (CR, 0x0D) — carriage return; same issue
//! - `=`  (0x3D) — the escape character itself
//!
//! Additionally, the yEnc spec documents two optional-but-common escapes:
//! - `.` at the start of a line is escaped to avoid NNTP dot-stuffing ambiguity
//! - TAB (0x09) at the start of a line is escaped on some encoders
//!
//! This implementation escapes `.` and TAB at line start, matching what real
//! Usenet encoders produce and what the Python fixture generator in
//! `tests/fixtures/gen_fixtures.py` does.
//!
//! Lines are wrapped at `line_length` *encoded* characters (default 128); a
//! line ends one character early when a 2-byte escape pair would not fit.
//! Line endings are `\r\n`.

/// Default line length for yEnc encoding, matching the yEnc spec recommendation.
///
/// This is the maximum number of *encoded* bytes per data line. Pass it as
/// the `line_length` argument of [`encode()`] or [`encode_part()`] to get the
/// standard width.
pub const DEFAULT_LINE_LENGTH: u8 = 128;

/// Build a complete single-part yEnc article body from `data`.
///
/// The result consists of an `=ybegin` header line, the encoded data lines
/// (each at most `line_length` encoded bytes), and an `=yend` trailer line
/// carrying the CRC32 of `data`. NNTP message headers are **not** included.
///
/// # Parameters
///
/// - `data` — the raw bytes to encode; an empty slice is allowed.
/// - `filename` — copied unchanged into the `name=` field of `=ybegin`.
///   It is not validated, so callers should avoid control characters.
/// - `line_length` — upper bound on encoded bytes per line. Values below 2
///   are raised to 2 so that a 2-byte escape pair always fits on one line.
///   Use `DEFAULT_LINE_LENGTH` (128) for the standard width.
///
/// # Returns
///
/// A `Vec<u8>` holding the header, the data lines, and the trailer, each
/// terminated by `\r\n`.
#[must_use]
pub fn encode(data: &[u8], filename: &str, line_length: u8) -> Vec<u8> {
    // An escape pair occupies two columns, so a narrower line is unusable.
    let line_length = usize::from(line_length.max(2));

    // Rough size guess: ~10% escape overhead plus room for header/trailer.
    let mut out = Vec::with_capacity(data.len().saturating_mul(11) / 10 + 128);

    // Single-part header: no `part=` or `total=` fields.
    let header = format!(
        "=ybegin line={line_length} size={} name={filename}\r\n",
        data.len()
    );
    out.extend_from_slice(header.as_bytes());

    // Data lines; the helper hands back the CRC32 of the raw input.
    let crc = encode_body(data, line_length, &mut out);

    // Trailer repeats the size and carries the CRC as 8 lowercase hex digits.
    let trailer = format!("=yend size={} crc32={crc:08x}\r\n", data.len());
    out.extend_from_slice(trailer.as_bytes());

    out
}

/// Produce one article of a multi-part yEnc post.
///
/// The output contains, in order:
///
/// 1. an `=ybegin` line with `part=`, `total=`, `line=`, `size=` (the
///    `total_size` argument) and `name=` (the `filename`, written verbatim);
/// 2. an `=ypart` line with the `begin` and `end` values as given;
/// 3. the encoded data lines for `data`;
/// 4. an `=yend` line with the length of `data`, the part number, the
///    CRC32 of `data` (`pcrc32=`) and the caller-supplied
///    `whole_file_crc32` (`crc32=`).
///
/// `line_length` values below 2 are raised to 2 so that a 2-byte escape
/// pair always fits on one line.
///
/// This is the implementation behind the public `encode_part` wrapper in
/// `lib.rs`, which bundles these parameters into `EncodePartOptions`.
// The wrapper exists to keep the public API under the argument-count lint;
// this internal function takes the unpacked values directly.
#[allow(clippy::too_many_arguments)]
#[must_use]
pub fn encode_part(
    data: &[u8],
    filename: &str,
    total_size: u64,
    total_parts: u32,
    part: u32,
    begin: u64,
    end: u64,
    whole_file_crc32: u32,
    line_length: u8,
) -> Vec<u8> {
    // An escape pair occupies two columns, so a narrower line is unusable.
    let line_length = usize::from(line_length.max(2));

    // Rough size guess: ~10% escape overhead plus room for the three
    // control lines.
    let mut out = Vec::with_capacity(data.len().saturating_mul(11) / 10 + 256);

    // Multi-part header: `part=` and `total=` precede the usual fields.
    let ybegin = format!(
        "=ybegin part={part} total={total_parts} line={line_length} \
             size={total_size} name={filename}\r\n"
    );
    out.extend_from_slice(ybegin.as_bytes());

    // Byte range of this part within the whole file, as supplied by the caller.
    let ypart = format!("=ypart begin={begin} end={end}\r\n");
    out.extend_from_slice(ypart.as_bytes());

    // Data lines; the helper hands back the CRC32 of this part's raw bytes.
    let pcrc = encode_body(data, line_length, &mut out);

    // Trailer carries both the per-part CRC and the whole-file CRC.
    let yend = format!(
        "=yend size={} part={part} pcrc32={pcrc:08x} crc32={whole_file_crc32:08x}\r\n",
        data.len()
    );
    out.extend_from_slice(yend.as_bytes());

    out
}

// ---------------------------------------------------------------------------
// Core encode body (used by both encode() and encode_part())
// ---------------------------------------------------------------------------

/// Append the yEnc-encoded form of `data` to `out` as `\r\n`-terminated lines.
///
/// Each raw byte is shifted by 42 (mod 256). A shifted value of NUL, LF, CR
/// or `=` is always written as `=` followed by the shifted value plus 64
/// (mod 256); `.` and TAB get the same treatment only when they would be
/// the first character of a line. A line is closed once it holds
/// `line_length` encoded bytes, or one byte earlier when an escape pair
/// would otherwise be split across lines.
///
/// Returns the CRC32 of the *raw* (pre-encode) `data`.
fn encode_body(data: &[u8], line_length: usize, out: &mut Vec<u8>) -> u32 {
    const ESCAPE: u8 = b'=';
    const LINE_END: &[u8] = b"\r\n";

    // Number of encoded bytes already written on the current line.
    let mut column = 0usize;

    for raw in data.iter().copied() {
        let shifted = raw.wrapping_add(42);

        // NUL, LF, CR and '=' can never appear unescaped.
        let always_critical = matches!(shifted, 0x00 | 0x0A | 0x0D | 0x3D);
        // '.' and TAB are only a problem as the first character of a line.
        let critical_at_line_start = column == 0 && matches!(shifted, 0x2E | 0x09);

        if always_critical || critical_at_line_start {
            // Keep the two bytes of an escape pair on the same line: if
            // only one column remains, close the line first.
            if column + 2 > line_length {
                out.extend_from_slice(LINE_END);
                column = 0;
            }
            out.extend_from_slice(&[ESCAPE, shifted.wrapping_add(64)]);
            column += 2;
        } else {
            out.push(shifted);
            column += 1;
        }

        // Line is full: terminate it and start a new one.
        if column >= line_length {
            out.extend_from_slice(LINE_END);
            column = 0;
        }
    }

    // Terminate a partially filled last line.
    if column != 0 {
        out.extend_from_slice(LINE_END);
    }

    crc32fast::hash(data)
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::decode::decode;

    /// Return the data lines of an encoded article with their `\r\n`
    /// terminators removed, skipping the `=ybegin` / `=yend` control lines
    /// and empty fragments.
    ///
    /// A data line can never start with `=y`: that would be the escape of
    /// encoded value 0x39, which the encoder never escapes.
    fn data_lines(encoded: &[u8]) -> Vec<&[u8]> {
        encoded
            .split(|&b| b == b'\n')
            .map(|line| line.strip_suffix(b"\r").unwrap_or(line))
            .filter(|line| {
                !(line.is_empty() || line.starts_with(b"=ybegin") || line.starts_with(b"=yend"))
            })
            .collect()
    }

    // -----------------------------------------------------------------------
    // encode_body unit tests
    // Oracle: Python gen_fixtures.py — same encoding algorithm
    // -----------------------------------------------------------------------

    #[test]
    fn encode_simple_bytes() {
        // Oracle: byte 0 + 42 = '*' (0x2A), byte 1 = '+' (0x2B), etc.
        let mut out = Vec::new();
        let crc = encode_body(&[0, 1, 2, 3], 128, &mut out);
        assert_eq!(&out[..4], b"*+,-");
        // CRC32 of [0,1,2,3] — from Python: binascii.crc32(bytes([0,1,2,3])) = 0x8bb98613
        assert_eq!(crc, 0x8bb9_8613);
    }

    #[test]
    fn encode_escapes_nul() {
        // byte 214 + 42 = 256 mod 256 = 0 (NUL) → escape as '=' + (0+64) = '@'
        let mut out = Vec::new();
        encode_body(&[214], 128, &mut out);
        assert_eq!(&out[..2], b"=@");
    }

    #[test]
    fn encode_escapes_lf() {
        // Oracle: byte 224 + 42 = 266 mod 256 = 10 (LF) → escape as '=' + (10+64=74='J')
        let mut out = Vec::new();
        encode_body(&[224], 128, &mut out);
        assert_eq!(&out[..2], b"=J");
    }

    #[test]
    fn encode_escapes_cr() {
        // Oracle: byte 227 + 42 = 269 mod 256 = 13 (CR) → escape as '=' + (13+64='M')
        // python3: chr((227+42)%256) == '\r', chr(13+64) == 'M'
        let mut out = Vec::new();
        encode_body(&[227], 128, &mut out);
        assert_eq!(&out[..2], b"=M");
    }

    #[test]
    fn encode_escapes_eq() {
        // byte 19 + 42 = 61 = '=' → escape as '=' + (61+64) = '}'
        let mut out = Vec::new();
        encode_body(&[19], 128, &mut out);
        assert_eq!(&out[..2], b"=}");
    }

    #[test]
    fn encode_escapes_dot_at_line_start() {
        // byte 4 + 42 = 46 = '.' — at start of line, must be escaped
        let mut out = Vec::new();
        encode_body(&[4], 128, &mut out);
        // '.' + 64 = 110 = 'n'
        assert_eq!(&out[..2], b"=n");
    }

    #[test]
    fn encode_dot_not_escaped_mid_line() {
        // '.' at position > 0 is NOT escaped
        // byte 1 = '+' (not dot), byte 4 = '.' mid-line
        let mut out = Vec::new();
        encode_body(&[1, 4], 128, &mut out);
        assert_eq!(out[0], b'+');
        assert_eq!(out[1], b'.'); // not escaped, mid-line
    }

    #[test]
    fn encode_line_wrapping() {
        // With line_length=4, every 4 encoded chars should be followed by \r\n.
        let data = vec![0u8; 8]; // 8 zeros → 8 '*' chars → 2 lines of 4
        let mut out = Vec::new();
        encode_body(&data, 4, &mut out);
        // Compare the whole output so a stray extra line ending would also
        // be caught.
        assert_eq!(out, b"****\r\n****\r\n");
    }

    // -----------------------------------------------------------------------
    // Independent-oracle tests for dot and TAB escapes at line start.
    // Oracle: manual calculation from the yEnc spec — no decode() call.
    //
    // Escape rule: if `encoded` (= raw + 42 mod 256) equals 0x2E ('.') or
    // 0x09 (TAB) at column 0, emit '=' (0x3D) followed by (encoded + 64) % 256.
    //
    // Dot case:  raw = 0x04 → encoded = 0x2E ('.') → escape char = 0x6E ('n')
    //            emitted bytes: b'=' b'n' = [0x3D, 0x6E]
    //
    // TAB case:  raw = 0xDF (223) → encoded = 0x09 (TAB) → escape char = 0x49 ('I')
    //            emitted bytes: b'=' b'I' = [0x3D, 0x49]
    //
    // Each test checks both the first line of the body and a line that
    // starts after a wrap, so the column reset is covered as well.
    // -----------------------------------------------------------------------

    #[test]
    fn encode_dot_at_line_start_uses_escape() {
        // Oracle: raw byte 0x04, encoded = (0x04 + 42) % 256 = 0x2E = '.'
        // At column 0 this must be escaped: '=' + (0x2E + 64) % 256 = '=' + 'n'
        // Expected encoded body line: b"=n\r\n"
        let mut out = Vec::new();
        encode_body(&[0x04u8], 128, &mut out);
        // First two bytes of output (before CRLF) must be the escape pair.
        assert_eq!(
            &out[..2],
            b"=n",
            "dot (raw 0x04) at line start must encode as '=n'"
        );

        // Same byte as the first character of the *second* line: four '*'
        // fill a 4-column line, then the dot opens the next one.
        let mut wrapped = Vec::new();
        encode_body(&[0, 0, 0, 0, 0x04], 4, &mut wrapped);
        assert_eq!(
            wrapped, b"****\r\n=n\r\n",
            "dot (raw 0x04) at the start of a wrapped line must encode as '=n'"
        );
    }

    #[test]
    fn encode_tab_at_line_start_uses_escape() {
        // Oracle: raw byte 0xDF (223), encoded = (223 + 42) % 256 = 0x09 = TAB
        // At column 0 this must be escaped: '=' + (0x09 + 64) % 256 = '=' + 'I'
        // Expected encoded body line: b"=I\r\n"
        let mut out = Vec::new();
        encode_body(&[0xDFu8], 128, &mut out);
        // First two bytes of output (before CRLF) must be the escape pair.
        assert_eq!(
            &out[..2],
            b"=I",
            "TAB (raw 0xDF) at line start must encode as '=I'"
        );

        // Same byte as the first character of the *second* line.
        let mut wrapped = Vec::new();
        encode_body(&[0, 0, 0, 0, 0xDF], 4, &mut wrapped);
        assert_eq!(
            wrapped, b"****\r\n=I\r\n",
            "TAB (raw 0xDF) at the start of a wrapped line must encode as '=I'"
        );
    }

    #[test]
    fn encode_all_bytes_round_trip() {
        // Oracle: Python algorithm (b+42)%256, escape if encoded byte is in
        // {0,10,13,61} (NUL/LF/CR/=).  '.' and TAB are only escaped at col==0;
        // their mid-line treatment is verified by independent tests above.
        //
        // We use line_length=128 (DEFAULT_LINE_LENGTH).  With 256 raw bytes and
        // 4 mandatory-escape bytes (each adding 1 byte), the total encoded body
        // is 260 bytes spread over multiple lines.  We extract all data lines
        // (strip =ybegin and =yend), concatenate them (stripping per-line \r\n),
        // and compare the concatenated bytes against the oracle.
        //
        // '.' (raw 0x04) and TAB (raw 0xDF) are NOT at col 0 of any line for
        // this 256-byte input with line_length=128 (they fall mid-line), so the
        // oracle correctly omits their line-start escapes.
        let raw: Vec<u8> = (0u8..=255).collect();
        let line_length: u8 = 128;

        // Build expected encoded body bytes using the Python oracle.
        let mut expected_encoded = Vec::new();
        for &b in &raw {
            let v = b.wrapping_add(42);
            if matches!(v, 0 | 10 | 13 | 61) {
                expected_encoded.push(b'=');
                expected_encoded.push(v.wrapping_add(64));
            } else {
                // '.' and TAB land mid-line here — not escaped.
                expected_encoded.push(v);
            }
        }

        let encoded = encode(&raw, "all.bin", line_length);

        // Extract body: find the end of the =ybegin line, then the start of
        // the =yend line.  Everything in between is data lines, each ending
        // with \r\n.  Concatenate them (stripping the per-line \r\n) to get
        // the flat byte stream.
        let ybegin_end = encoded
            .windows(2)
            .position(|w| w == b"\r\n")
            .expect("no \\r\\n after =ybegin")
            + 2;
        let yend_start = {
            // Find the last \r\n before =yend.
            let needle = b"\r\n=yend";
            encoded
                .windows(needle.len())
                .rposition(|w| w == needle)
                .expect("no \\r\\n=yend in output")
                + 2 // advance past the \r\n to point at '='
        };
        let body_section = &encoded[ybegin_end..yend_start];

        // Concatenate all data lines, stripping their \r\n endings.
        let mut actual_encoded: Vec<u8> = Vec::new();
        for line in body_section.split(|&b| b == b'\n') {
            // Each line ends with \r before the \n (we split on \n alone).
            let line = line.strip_suffix(b"\r").unwrap_or(line);
            if !line.is_empty() {
                actual_encoded.extend_from_slice(line);
            }
        }

        assert_eq!(
            actual_encoded, expected_encoded,
            "encoded body bytes do not match oracle"
        );

        // Round-trip must also be correct.
        let decoded = decode(&encoded).expect("round-trip decode failed");
        assert_eq!(decoded.data, raw, "all-bytes round-trip failed");
    }

    // -----------------------------------------------------------------------
    // Full encode() tests
    // -----------------------------------------------------------------------

    #[test]
    fn encode_single_part_header_footer() {
        let data = b"Cat";
        let out = encode(data, "cat.bin", 128);
        let s = String::from_utf8_lossy(&out);
        assert!(s.starts_with("=ybegin line=128 size=3 name=cat.bin\r\n"));
        assert!(s.contains("=yend size=3 crc32="));
        assert!(s.ends_with("\r\n"));
    }

    #[test]
    fn encode_empty_data() {
        let out = encode(b"", "empty.bin", 128);
        let s = String::from_utf8_lossy(&out);
        assert!(s.starts_with("=ybegin line=128 size=0 name=empty.bin\r\n"));
        assert!(s.contains("=yend size=0 crc32="));
        // No data lines between header and footer.
        let parts: Vec<&str> = s.lines().collect();
        assert_eq!(parts[0], "=ybegin line=128 size=0 name=empty.bin");
        assert_eq!(parts[1], "=yend size=0 crc32=00000000");
    }

    #[test]
    fn encode_single_part_crc_correct() {
        // Oracle: binascii.crc32(bytes(range(64))) = 0x100ece8c
        let data: Vec<u8> = (0..64).collect();
        let out = encode(&data, "test.bin", 128);
        assert!(
            String::from_utf8_lossy(&out).contains("crc32=100ece8c"),
            "CRC32 mismatch in encoded output"
        );
        // Also verify decode round-trip
        let decoded = decode(&out).unwrap();
        assert_eq!(decoded.data, data);
        assert!(decoded.crc32_verified);
    }

    #[test]
    fn encode_part_header_fields() {
        let data: Vec<u8> = (0..64).collect();
        let out = encode_part(&data, "test.bin", 128, 2, 1, 1, 64, 0xdeadbeef, 128);
        let s = String::from_utf8_lossy(&out);
        assert!(s.starts_with("=ybegin part=1 total=2 line=128 size=128 name=test.bin\r\n"));
        assert!(s.contains("=ypart begin=1 end=64\r\n"));
        assert!(s.contains("pcrc32="));
        assert!(s.contains("crc32=deadbeef"));
    }

    #[test]
    fn encode_part_pcrc_is_part_crc() {
        // Oracle: pcrc32 of bytes(range(64)) = 0x100ece8c
        let data: Vec<u8> = (0..64).collect();
        let out = encode_part(&data, "test.bin", 128, 2, 1, 1, 64, 0x24650d57, 128);
        let s = String::from_utf8_lossy(&out);
        assert!(s.contains("pcrc32=100ece8c"), "per-part CRC wrong: {s}");
        assert!(s.contains("crc32=24650d57"), "whole-file CRC wrong: {s}");
    }

    #[test]
    fn encode_line_length_1_clamped_to_2() {
        // line_length=1 would make escape pairs (2 bytes) overflow the column
        // limit.  The encoder must clamp it to at least 2.
        // Oracle: byte 214 encodes to NUL (needs escape), so we get "=@" which
        // is 2 bytes — must fit on one line even though the caller asked for 1.
        let out = encode(&[214], "t.bin", 1);
        // The header must advertise the clamped width, not the requested one.
        assert!(
            out.starts_with(b"=ybegin line=2 "),
            "header should report the clamped line length: {:?}",
            String::from_utf8_lossy(&out)
        );
        for line in data_lines(&out) {
            assert!(
                line.len() <= 2,
                "line too long with clamped line_length=2: {:?}",
                line
            );
        }
        // Round-trip must succeed.
        let decoded = decode(&out).expect("round-trip decode of line_length=1 input failed");
        assert_eq!(decoded.data, &[214]);
    }

    #[test]
    fn encode_line_length_2_does_not_panic() {
        // line_length=2 is the minimum valid value; must not panic and must
        // produce a decodable output.
        // Oracle: bytes [0, 1] encode to '*' '+' — no escaping needed.
        let data = &[0u8, 1u8];
        let out = encode(data, "t.bin", 2);
        let decoded = decode(&out).expect("round-trip decode of line_length=2 input failed");
        assert_eq!(decoded.data, data);
    }

    #[test]
    fn no_line_exceeds_line_length() {
        use crate::encode;
        // Use a small line_length (10) and payloads that place escape pairs
        // near the end of a line.
        // Bytes that must be escaped: those whose (b+42)%256 equals 0, 10, 13, or 61.
        // (b+42)%256 = 0 -> b = 214
        // (b+42)%256 = 10 -> b = 224
        // (b+42)%256 = 13 -> b = 227
        // (b+42)%256 = 61 -> b = 19
        let line_length = 10u8;
        let limit = usize::from(line_length);

        // Case 1: the escape byte arrives at column 8, so the pair occupies
        // columns 8-9 and fills the line exactly (8 x '*' + "=}").
        let pair_fits: Vec<u8> = (0u8..50)
            .map(|i| if i % 9 == 8 { 19u8 } else { 0u8 })
            .collect();

        // Case 2: the first escape byte arrives at column 9 — only one
        // column is left, so the encoder has to close the line early rather
        // than let the pair spill over to an 11th column.
        let pair_straddles: Vec<u8> = (0u8..50)
            .map(|i| if i % 10 == 9 { 19u8 } else { 0u8 })
            .collect();

        for payload in &[pair_fits.clone(), pair_straddles.clone()] {
            let encoded = encode(payload, "test.bin", line_length);
            for line in data_lines(&encoded) {
                assert!(
                    line.len() <= limit,
                    "data line too long: {} chars (limit {}): {:?}",
                    line.len(),
                    line_length,
                    std::str::from_utf8(line).unwrap_or("<binary>")
                );
            }
        }

        // Prove that case 2 really exercised the early-flush path: nine '*'
        // on the first line, then the escape pair opens the second line.
        let encoded = encode(&pair_straddles, "test.bin", line_length);
        let lines = data_lines(&encoded);
        assert_eq!(lines[0], &b"*********"[..], "first line should end one column early");
        assert_eq!(lines[1], &b"=}********"[..], "escape pair should open the second line");
    }
}