uucore 0.4.0

uutils ~ 'core' uutils code library (cross-platform)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

// spell-checker:ignore memmem algo PCLMULQDQ refin xorout Hdlc

//! Implementations of digest functions, like md5 and sha1.
//!
//! The [`Digest`] trait represents the interface for providing inputs
//! to these digest functions and accessing the resulting hash. The
//! [`DigestWriter`] struct provides a wrapper around [`Digest`] that
//! implements the [`Write`] trait, for use in situations where calling
//! [`write`] would be useful.
use std::io::Write;

use hex::encode;
#[cfg(windows)]
use memchr::memmem;

/// Common interface shared by every checksum / hash algorithm in this module.
///
/// Data is supplied incrementally with [`Digest::hash_update`]; the result is
/// then read either as raw bytes ([`Digest::hash_finalize`]) or as a string
/// ([`Digest::result_str`]).
pub trait Digest {
    /// Creates a digest in its initial state.
    fn new() -> Self
    where
        Self: Sized;

    /// Feeds `input` into the running computation.
    fn hash_update(&mut self, input: &[u8]);

    /// Writes the digest of all data fed so far into `out`.
    ///
    /// The provided [`Digest::result_str`] passes a buffer of exactly
    /// [`Digest::output_bytes`] bytes.
    fn hash_finalize(&mut self, out: &mut [u8]);

    /// Puts the digest back into its initial state so it can be reused.
    fn reset(&mut self);

    /// Size of the digest in bits, as reported by the implementation.
    fn output_bits(&self) -> usize;

    /// Size of the digest in bytes: [`Digest::output_bits`] divided by eight,
    /// rounded up.
    fn output_bytes(&self) -> usize {
        let bits = self.output_bits();
        bits.div_ceil(8)
    }

    /// Finalizes the digest and returns it hex-encoded (via `hex::encode`).
    ///
    /// Implementations may override this to use a different textual form.
    fn result_str(&mut self) -> String {
        let mut digest_bytes = vec![0u8; self.output_bytes()];
        self.hash_finalize(digest_bytes.as_mut_slice());
        encode(digest_bytes)
    }
}

/// BLAKE2b digest with a configurable output length.
///
/// Field `0` is the underlying `blake2b_simd` hashing state; field `1` is the
/// digest length in bits.
pub struct Blake2b(blake2b_simd::State, usize);

impl Blake2b {
    /// Builds a BLAKE2b digest that produces `output_bytes` bytes of output.
    ///
    /// The length is handed straight to `blake2b_simd::Params::hash_length`,
    /// so any restriction on its range is the one imposed by that crate.
    pub fn with_output_bytes(output_bytes: usize) -> Self {
        let state = blake2b_simd::Params::new()
            .hash_length(output_bytes)
            .to_state();

        // The trait reports sizes in bits, so remember the length that way.
        Self(state, output_bytes * 8)
    }
}

impl Digest for Blake2b {
    /// Creates a BLAKE2b digest with the default 512-bit (64-byte) output.
    fn new() -> Self {
        const DEFAULT_OUTPUT_BYTES: usize = 64;
        Self::with_output_bytes(DEFAULT_OUTPUT_BYTES)
    }

    /// Feeds `input` into the BLAKE2b state.
    fn hash_update(&mut self, input: &[u8]) {
        let Self(state, _) = self;
        state.update(input);
    }

    /// Copies the digest into `out`.
    ///
    /// `out` must be exactly as long as the digest; `copy_from_slice` panics
    /// on a length mismatch.
    fn hash_finalize(&mut self, out: &mut [u8]) {
        let hash = self.0.finalize();
        out.copy_from_slice(hash.as_bytes());
    }

    /// Starts over with a fresh state that keeps the configured output length.
    fn reset(&mut self) {
        let output_bytes = self.output_bytes();
        *self = Self::with_output_bytes(output_bytes);
    }

    /// Configured digest length in bits.
    fn output_bits(&self) -> usize {
        let Self(_, bits) = self;
        *bits
    }
}

/// BLAKE3 digest, reported as a 256-bit output.
pub struct Blake3(blake3::Hasher);
impl Digest for Blake3 {
    /// Creates a digest around a fresh `blake3::Hasher`.
    fn new() -> Self {
        let hasher = blake3::Hasher::new();
        Self(hasher)
    }

    /// Feeds `input` into the hasher.
    fn hash_update(&mut self, input: &[u8]) {
        let Self(hasher) = self;
        hasher.update(input);
    }

    /// Copies the digest into `out`.
    ///
    /// `out` must be exactly as long as the hash bytes; `copy_from_slice`
    /// panics on a length mismatch.
    fn hash_finalize(&mut self, out: &mut [u8]) {
        let hash = self.0.finalize();
        out.copy_from_slice(hash.as_bytes());
    }

    /// Replaces the hasher with a brand-new one.
    fn reset(&mut self) {
        *self = Self::new();
    }

    /// Digest length in bits.
    fn output_bits(&self) -> usize {
        256
    }
}

/// SM3 digest, reported as a 256-bit output.
///
/// The calls below go through `sm3::Digest` by path rather than importing it,
/// which keeps that trait from clashing with this module's own [`Digest`].
pub struct Sm3(sm3::Sm3);
impl Digest for Sm3 {
    /// Creates a digest around a fresh `sm3::Sm3` hasher.
    fn new() -> Self {
        Self(sm3::Digest::new())
    }

    /// Feeds `input` into the hasher.
    fn hash_update(&mut self, input: &[u8]) {
        sm3::Digest::update(&mut self.0, input);
    }

    /// Copies the digest into `out`.
    ///
    /// The hasher is cloned before being finalized, so the state held in
    /// `self` is left untouched. `out` must be exactly as long as the digest.
    fn hash_finalize(&mut self, out: &mut [u8]) {
        let digest_bytes = sm3::Digest::finalize(self.0.clone());
        out.copy_from_slice(&digest_bytes);
    }

    /// Replaces the hasher with a brand-new one.
    fn reset(&mut self) {
        *self = Self::new();
    }

    /// Digest length in bits.
    fn output_bits(&self) -> usize {
        256
    }
}

/// CRC-32 checksum in the variant printed by POSIX `cksum`.
///
/// Besides the CRC state, the total number of input bytes is tracked, because
/// the algorithm appends the input length to the data before the final CRC is
/// taken.
pub struct Crc {
    /// Running CRC state over the data fed so far.
    digest: crc_fast::Digest,
    /// Total number of input bytes fed so far.
    ///
    /// Stored as `u64` rather than `usize` so that inputs of 4 GiB and more
    /// are still counted correctly on 32-bit targets.
    size: u64,
}

impl Crc {
    /// POSIX cksum SIMD configuration for crc-fast
    /// This uses SIMD instructions (PCLMULQDQ) for fast CRC computation
    fn get_posix_cksum_params() -> crc_fast::CrcParams {
        crc_fast::CrcParams::new(
            "CRC-32/CKSUM", // Name
            32,             // Width
            0x04c11db7,     // Polynomial
            0x00000000,     // Initial CRC value: 0 (not 0xffffffff)
            false,          // No input reflection (refin)
            0xffffffff,     // XOR output with 0xffffffff (xorout)
            0,              // Check value (not used)
        )
    }
}

impl Digest for Crc {
    /// Creates a digest configured with the POSIX `cksum` parameters and a
    /// byte count of zero.
    fn new() -> Self {
        Self {
            digest: crc_fast::Digest::new_with_params(Self::get_posix_cksum_params()),
            size: 0,
        }
    }

    /// Feeds `input` into the CRC and adds its length to the byte count.
    fn hash_update(&mut self, input: &[u8]) {
        self.digest.update(input);
        // Widening only: `usize` is never wider than 64 bits on supported
        // targets, so no information is lost by this cast.
        self.size += input.len() as u64;
    }

    /// Appends the input length to the CRC and writes the result into `out`.
    ///
    /// `out` must be exactly 8 bytes long: the 64-bit value returned by
    /// `crc_fast` is copied in native byte order.
    ///
    /// NOTE(review): the length bytes are fed into `self.digest` itself, so
    /// finalizing the same instance twice without a `reset` in between
    /// presumably appends the length twice; confirm before relying on
    /// repeated calls.
    fn hash_finalize(&mut self, out: &mut [u8]) {
        // cksum appends the input length least-significant byte first, using
        // only as many bytes as the value needs (none at all for empty input).
        let length_bytes = self.size.to_le_bytes();
        let significant = (u64::BITS - self.size.leading_zeros()).div_ceil(8) as usize;
        self.digest.update(&length_bytes[..significant]);

        out.copy_from_slice(&self.digest.finalize().to_ne_bytes());
    }

    /// Finalizes the digest and returns the checksum as a decimal string.
    fn result_str(&mut self) -> String {
        let mut out: [u8; 8] = [0; 8];
        self.hash_finalize(&mut out);
        u64::from_ne_bytes(out).to_string()
    }

    /// Resets both the CRC state and the byte count.
    fn reset(&mut self) {
        self.digest.reset();
        self.size = 0;
    }

    /// Nominal size reported to callers.
    ///
    /// NOTE(review): this does not match the 8-byte buffer that
    /// `hash_finalize` and `result_str` actually use; the value is kept
    /// unchanged because callers may depend on it.
    fn output_bits(&self) -> usize {
        256
    }
}

/// CRC-32 checksum using the `Crc32IsoHdlc` algorithm of `crc_fast`.
pub struct CRC32B {
    digest: crc_fast::Digest,
}

impl Digest for CRC32B {
    /// Creates a digest for the `Crc32IsoHdlc` algorithm.
    fn new() -> Self {
        let digest = crc_fast::Digest::new(crc_fast::CrcAlgorithm::Crc32IsoHdlc);
        Self { digest }
    }

    /// Feeds `input` into the CRC.
    fn hash_update(&mut self, input: &[u8]) {
        self.digest.update(input);
    }

    /// Writes the 32-bit CRC into `out` as four big-endian bytes.
    ///
    /// `out` must be exactly 4 bytes long; `copy_from_slice` panics otherwise.
    fn hash_finalize(&mut self, out: &mut [u8]) {
        // crc_fast hands back a 64-bit value; only its low 32 bits carry the
        // CRC32B result, so mask before narrowing.
        let low_word = (self.digest.finalize() & 0xffff_ffff) as u32;
        out.copy_from_slice(&low_word.to_be_bytes());
    }

    /// Resets the CRC state.
    fn reset(&mut self) {
        self.digest.reset();
    }

    /// Digest length in bits.
    fn output_bits(&self) -> usize {
        32
    }

    /// Finalizes the digest and returns the checksum as a decimal string.
    fn result_str(&mut self) -> String {
        let mut be_bytes = [0u8; 4];
        self.hash_finalize(&mut be_bytes);
        u32::from_be_bytes(be_bytes).to_string()
    }
}

/// 16-bit rotate-and-add checksum (the algorithm used by BSD `sum`).
pub struct Bsd {
    state: u16,
}
impl Digest for Bsd {
    /// Creates a checksum whose state is zero.
    fn new() -> Self {
        Self { state: 0 }
    }

    /// For every input byte, rotates the 16-bit state right by one bit and
    /// then adds the byte with wrap-around.
    fn hash_update(&mut self, input: &[u8]) {
        self.state = input.iter().fold(self.state, |sum, &byte| {
            sum.rotate_right(1).wrapping_add(u16::from(byte))
        });
    }

    /// Writes the state into `out` in native byte order.
    ///
    /// `out` must be exactly 2 bytes long; `copy_from_slice` panics otherwise.
    fn hash_finalize(&mut self, out: &mut [u8]) {
        let bytes = self.state.to_ne_bytes();
        out.copy_from_slice(&bytes);
    }

    /// Returns the checksum as a decimal string.
    fn result_str(&mut self) -> String {
        // Finalizing never alters the state, so it can be formatted directly.
        self.state.to_string()
    }

    /// Sets the state back to zero.
    fn reset(&mut self) {
        self.state = 0;
    }

    /// Nominal size reported to callers.
    ///
    /// NOTE(review): larger than the 16-bit state actually kept; left as is
    /// because callers may depend on the value.
    fn output_bits(&self) -> usize {
        128
    }
}

/// Byte-sum checksum folded down to 16 bits (the algorithm used by System V
/// `sum`).
pub struct SysV {
    state: u32,
}
impl Digest for SysV {
    /// Creates a checksum whose running sum is zero.
    fn new() -> Self {
        Self { state: 0 }
    }

    /// Adds every input byte to the 32-bit running sum, wrapping on overflow.
    fn hash_update(&mut self, input: &[u8]) {
        self.state = input
            .iter()
            .fold(self.state, |sum, &byte| sum.wrapping_add(u32::from(byte)));
    }

    /// Folds the running sum to 16 bits and writes it into `out` in native
    /// byte order.
    ///
    /// The folded value is stored back into the state. `out` must be exactly
    /// 2 bytes long; `copy_from_slice` panics otherwise.
    fn hash_finalize(&mut self, out: &mut [u8]) {
        // Add the high half onto the low half twice: the first pass can carry
        // into bit 16, the second pass absorbs that carry.
        for _ in 0..2 {
            self.state = (self.state & 0xffff) + (self.state >> 16);
        }
        let folded = self.state as u16;
        out.copy_from_slice(&folded.to_ne_bytes());
    }

    /// Finalizes the checksum and returns it as a decimal string.
    fn result_str(&mut self) -> String {
        // Only the folding side effect of `hash_finalize` is needed here; the
        // bytes it writes are discarded.
        let mut scratch = [0u8; 2];
        self.hash_finalize(&mut scratch);
        self.state.to_string()
    }

    /// Replaces the checksum with a fresh one.
    fn reset(&mut self) {
        *self = Self::new();
    }

    /// Nominal size reported to callers.
    ///
    /// NOTE(review): larger than the 16-bit result actually produced; left as
    /// is because callers may depend on the value.
    fn output_bits(&self) -> usize {
        512
    }
}

// Implements the Digest trait for fixed-output algorithms (used below for
// md5, sha1, sha2 and sha3).
//
// `$algo_type` is a tuple struct wrapping the underlying hasher in field 0;
// `$size` is the digest length in bits reported by `output_bits`.
macro_rules! impl_digest_common {
    ($algo_type: ty, $size: expr) => {
        impl Digest for $algo_type {
            fn new() -> Self {
                Self(Default::default())
            }

            fn hash_update(&mut self, input: &[u8]) {
                digest::Digest::update(&mut self.0, input);
            }

            fn hash_finalize(&mut self, out: &mut [u8]) {
                // `out.into()` converts the slice into the fixed-size output
                // buffer expected by the hasher.
                // NOTE(review): presumably panics when `out` is not exactly
                // the digest length; confirm against the `digest` crate.
                digest::Digest::finalize_into_reset(&mut self.0, out.into());
            }

            fn reset(&mut self) {
                *self = Self::new();
            }

            fn output_bits(&self) -> usize {
                $size
            }
        }
    };
}

// Implements the Digest trait for algorithms with variable-length output
// (used below for the SHAKE functions of the sha3 crate).
//
// `$algo_type` is a tuple struct wrapping the underlying hasher in field 0.
macro_rules! impl_digest_shake {
    ($algo_type: ty) => {
        impl Digest for $algo_type {
            fn new() -> Self {
                Self(Default::default())
            }

            fn hash_update(&mut self, input: &[u8]) {
                digest::Update::update(&mut self.0, input);
            }

            fn hash_finalize(&mut self, out: &mut [u8]) {
                // The whole of `out` is passed to the hasher, so the caller
                // picks the output length through the size of `out`.
                digest::ExtendableOutputReset::finalize_xof_reset_into(&mut self.0, out);
            }

            fn reset(&mut self) {
                *self = Self::new();
            }

            fn output_bits(&self) -> usize {
                // No fixed length is reported here.
                // NOTE(review): with 0 the trait's default `result_str`
                // allocates an empty buffer; presumably callers size the
                // buffer themselves and call `hash_finalize` directly.
                0
            }
        }
    };
}

/// MD5 digest (128-bit output), wrapping `md5::Md5`.
pub struct Md5(md5::Md5);
/// SHA-1 digest (160-bit output), wrapping `sha1::Sha1`.
pub struct Sha1(sha1::Sha1);
/// SHA-224 digest (224-bit output), wrapping `sha2::Sha224`.
pub struct Sha224(sha2::Sha224);
/// SHA-256 digest (256-bit output), wrapping `sha2::Sha256`.
pub struct Sha256(sha2::Sha256);
/// SHA-384 digest (384-bit output), wrapping `sha2::Sha384`.
pub struct Sha384(sha2::Sha384);
/// SHA-512 digest (512-bit output), wrapping `sha2::Sha512`.
pub struct Sha512(sha2::Sha512);
// The second argument is the digest length in bits.
impl_digest_common!(Md5, 128);
impl_digest_common!(Sha1, 160);
impl_digest_common!(Sha224, 224);
impl_digest_common!(Sha256, 256);
impl_digest_common!(Sha384, 384);
impl_digest_common!(Sha512, 512);

/// SHA3-224 digest (224-bit output), wrapping `sha3::Sha3_224`.
pub struct Sha3_224(sha3::Sha3_224);
/// SHA3-256 digest (256-bit output), wrapping `sha3::Sha3_256`.
pub struct Sha3_256(sha3::Sha3_256);
/// SHA3-384 digest (384-bit output), wrapping `sha3::Sha3_384`.
pub struct Sha3_384(sha3::Sha3_384);
/// SHA3-512 digest (512-bit output), wrapping `sha3::Sha3_512`.
pub struct Sha3_512(sha3::Sha3_512);
impl_digest_common!(Sha3_224, 224);
impl_digest_common!(Sha3_256, 256);
impl_digest_common!(Sha3_384, 384);
impl_digest_common!(Sha3_512, 512);

/// SHAKE128 extendable-output function, wrapping `sha3::Shake128`.
pub struct Shake128(sha3::Shake128);
/// SHAKE256 extendable-output function, wrapping `sha3::Shake256`.
pub struct Shake256(sha3::Shake256);
impl_digest_shake!(Shake128);
impl_digest_shake!(Shake256);

/// A struct that writes to a digest.
///
/// This struct wraps a [`Digest`] and provides a [`Write`]
/// implementation that passes input bytes directly to the
/// [`Digest::hash_update`].
///
/// On Windows, if `binary` is `false`, then the [`Write::write`]
/// implementation replaces instances of "\r\n" with "\n" before passing
/// the input bytes to the wrapped digest.
pub struct DigestWriter<'a> {
    /// The digest that receives the bytes written to this writer.
    digest: &'a mut Box<dyn Digest>,

    /// Whether to write to the digest in binary mode or text mode on Windows.
    ///
    /// If this is `false`, then instances of "\r\n" are replaced with
    /// "\n" before passing input bytes to the wrapped digest.
    #[allow(dead_code)]
    binary: bool,

    /// Whether the previous call to `write` ended with a "\r" that has been
    /// held back instead of being passed to the digest.
    ///
    /// The held-back "\r" is dropped if the next write starts with "\n",
    /// and passed on otherwise. Only the Windows text-mode `write` ever sets
    /// this to `true`.
    #[allow(dead_code)]
    was_last_character_carriage_return: bool,
    // TODO These are dead code only on non-Windows operating systems.
    // It might be better to use a `#[cfg(windows)]` guard here.
}

impl<'a> DigestWriter<'a> {
    /// Creates a writer that forwards everything written to it to `digest`.
    ///
    /// `binary` selects binary (`true`) or text (`false`) mode; the mode only
    /// makes a difference in the Windows implementation of `write`.
    pub fn new(digest: &'a mut Box<dyn Digest>, binary: bool) -> Self {
        DigestWriter {
            digest,
            binary,
            // Nothing has been written yet, so no "\r" is being held back.
            was_last_character_carriage_return: false,
        }
    }

    /// Passes a held-back trailing "\r" on to the digest, if there is one.
    ///
    /// Call this once all input has been written. Returns `true` when a "\r"
    /// was pending and has now been hashed, `false` otherwise.
    ///
    /// The pending marker is cleared as part of the call, so calling
    /// `finalize` again does not hash the same "\r" a second time.
    pub fn finalize(&mut self) -> bool {
        let had_pending_cr = std::mem::take(&mut self.was_last_character_carriage_return);
        if had_pending_cr {
            self.digest.hash_update(b"\r");
        }
        had_pending_cr
    }
}

impl Write for DigestWriter<'_> {
    /// Passes `buf` to the digest unchanged and reports all of it as written.
    #[cfg(not(windows))]
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        self.digest.hash_update(buf);
        Ok(buf.len())
    }

    /// Passes `buf` to the digest, replacing "\r\n" with "\n" in text mode.
    ///
    /// Always reports the full length of `buf` as written, even when
    /// carriage returns were dropped or held back.
    #[cfg(windows)]
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        if self.binary {
            self.digest.hash_update(buf);
            return Ok(buf.len());
        }

        // The remaining code handles Windows text mode, where we must
        // replace each occurrence of "\r\n" with "\n".
        //
        // An empty write says nothing about what follows a "\r" held back
        // by the previous call, so it must leave that state untouched.
        // Falling through would clear the marker without hashing the "\r",
        // silently dropping a byte.
        let n = buf.len();
        if n == 0 {
            return Ok(0);
        }

        // First, if the last character written was "\r" and the first
        // character in the current buffer to write is not "\n", then we
        // need to write the "\r" that we buffered from the previous
        // call to `write()`.
        if self.was_last_character_carriage_return && buf[0] != b'\n' {
            self.digest.hash_update(b"\r");
        }

        // Next, find all occurrences of "\r\n", inputting the slice
        // just before the "\n" in the previous instance of "\r\n" and
        // the beginning of this "\r\n".
        let mut i_prev = 0;
        for i in memmem::find_iter(buf, b"\r\n") {
            self.digest.hash_update(&buf[i_prev..i]);
            i_prev = i + 1;
        }

        // Finally, check whether the last character is "\r". If so,
        // buffer it until we know that the next character is not "\n",
        // which can only be known on the next call to `write()`.
        //
        // This all assumes that `write()` will be called on adjacent
        // blocks of the input.
        let ends_with_cr = buf[n - 1] == b'\r';
        let end = if ends_with_cr { n - 1 } else { n };
        self.was_last_character_carriage_return = ends_with_cr;
        self.digest.hash_update(&buf[i_prev..end]);

        // Even though we dropped a "\r" for each "\r\n" we found, we
        // still report the number of bytes written as `n`. This is
        // because the meaning of the returned number is supposed to be
        // the number of bytes consumed by the writer, so that if the
        // calling code were calling `write()` in a loop, it would know
        // where the next contiguous slice of the buffer starts.
        Ok(n)
    }

    /// Does nothing: input is handed to the digest as soon as it is written.
    fn flush(&mut self) -> std::io::Result<()> {
        Ok(())
    }
}

#[cfg(test)]
mod tests {

    /// Test for replacing a "\r\n" sequence with "\n" when the "\r" is
    /// at the end of one block and the "\n" is at the beginning of the
    /// next block, when reading in blocks.
    #[cfg(windows)]
    #[test]
    fn test_crlf_across_blocks() {
        use std::io::Write;

        use super::Digest;
        use super::DigestWriter;
        use super::Md5;

        // Writing "\r" in one call to `write()`, and then "\n" in another.
        let mut digest = Box::new(Md5::new()) as Box<dyn Digest>;
        let mut writer_crlf = DigestWriter::new(&mut digest, false);
        writer_crlf.write_all(b"\r").unwrap();
        writer_crlf.write_all(b"\n").unwrap();
        writer_crlf.finalize();
        let result_crlf = digest.result_str();

        // We expect "\r\n" to be replaced with "\n" in text mode on Windows.
        let mut digest = Box::new(Md5::new()) as Box<dyn Digest>;
        let mut writer_lf = DigestWriter::new(&mut digest, false);
        writer_lf.write_all(b"\n").unwrap();
        writer_lf.finalize();
        let result_lf = digest.result_str();

        assert_eq!(result_crlf, result_lf);
    }

    // Imports shared by the platform-independent CRC tests below.
    use super::{Crc, Digest};

    // Two independent digests fed the same input must produce the same bytes.
    #[test]
    fn test_crc_basic_functionality() {
        // Test that our CRC implementation works with basic functionality
        let mut crc1 = Crc::new();
        let mut crc2 = Crc::new();

        // Same input should give same output
        crc1.hash_update(b"test");
        crc2.hash_update(b"test");

        // `Crc::hash_finalize` writes a native-endian u64, hence 8-byte buffers.
        let mut out1 = [0u8; 8];
        let mut out2 = [0u8; 8];
        crc1.hash_finalize(&mut out1);
        crc2.hash_finalize(&mut out2);

        assert_eq!(out1, out2);
    }

    // Empty input and "test" must differ, and "test" must match a known value.
    #[test]
    fn test_crc_digest_basic() {
        let mut crc = Crc::new();

        // Test empty input
        let mut output = [0u8; 8];
        crc.hash_finalize(&mut output);
        let empty_result = u64::from_ne_bytes(output);

        // Start over with a fresh digest and hash the string "test"
        let mut crc = Crc::new();
        crc.hash_update(b"test");
        crc.hash_finalize(&mut output);
        let test_result = u64::from_ne_bytes(output);

        // The result should be different for different inputs
        assert_ne!(empty_result, test_result);

        // Test known value: "test" should give 3076352578
        assert_eq!(test_result, 3076352578);
    }

    // Feeding the data in several chunks must not change the result.
    #[test]
    fn test_crc_digest_incremental() {
        let mut crc1 = Crc::new();
        let mut crc2 = Crc::new();

        // Test that processing in chunks gives same result as all at once
        let data = b"Hello, World! This is a test string for CRC computation.";

        // Process all at once
        crc1.hash_update(data);
        let mut output1 = [0u8; 8];
        crc1.hash_finalize(&mut output1);

        // Process in chunks
        crc2.hash_update(&data[0..10]);
        crc2.hash_update(&data[10..30]);
        crc2.hash_update(&data[30..]);
        let mut output2 = [0u8; 8];
        crc2.hash_finalize(&mut output2);

        assert_eq!(output1, output2);
    }

    // A single bulk update must agree with feeding the data one byte at a time.
    #[test]
    fn test_crc_slice8_vs_single_byte() {
        // Bulk path: the whole buffer goes through one `hash_update` call.
        let test_data = b"This is a longer test string to verify slice-by-8 optimization works correctly with various data sizes including remainders.";

        let mut crc_optimized = Crc::new();
        crc_optimized.hash_update(test_data);
        let mut output_opt = [0u8; 8];
        crc_optimized.hash_finalize(&mut output_opt);

        // Reference path: the same data, one byte per `hash_update` call.
        let mut crc_reference = Crc::new();
        for &byte in test_data {
            crc_reference.hash_update(&[byte]);
        }
        let mut output_ref = [0u8; 8];
        crc_reference.hash_finalize(&mut output_ref);

        assert_eq!(output_opt, output_ref);
    }

    // Pins the checksum of a few short inputs, including the empty one.
    #[test]
    fn test_crc_known_values() {
        // Test against our CRC implementation values
        // Note: These are the correct values for our POSIX cksum implementation
        let test_cases = [
            ("", 4294967295_u64),
            ("a", 1220704766_u64),
            ("abc", 1219131554_u64),
        ];

        for (input, expected) in test_cases {
            let mut crc = Crc::new();
            crc.hash_update(input.as_bytes());
            let mut output = [0u8; 8];
            crc.hash_finalize(&mut output);
            let result = u64::from_ne_bytes(output);

            assert_eq!(result, expected, "CRC mismatch for input: '{input}'");
        }
    }

    // Input lengths just below and just above 8 bytes must both work.
    #[test]
    fn test_crc_hash_update_edge_cases() {
        let mut crc = Crc::new();

        // Test with data that's not a multiple of 8 bytes
        let data7 = b"1234567"; // 7 bytes
        crc.hash_update(data7);

        let data9 = b"123456789"; // 9 bytes
        let mut crc2 = Crc::new();
        crc2.hash_update(data9);

        // Should not panic and should produce valid results
        let mut out1 = [0u8; 8];
        let mut out2 = [0u8; 8];
        crc.hash_finalize(&mut out1);
        crc2.hash_finalize(&mut out2);

        // Results should be different for different inputs
        assert_ne!(out1, out2);
    }
}