dec_sixbit/
encode.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
//! Functions for encoding strings into DEC SIXBIT format.
//!
//! This module provides both checked and unchecked encoding functions. The checked function validates
//! that all characters are within the valid SIXBIT range, while the unchecked function assumes the input
//! is already valid and skips that validation for increased performance.

use crate::{Error, MASK_FOUR_BITS, MASK_TWO_BITS, ASCII_OFFSET, SHIFT_TWO_BITS, SHIFT_FOUR_BITS, SHIFT_SIX_BITS};

/// This function converts the input string into a compact SIXBIT-encoded byte vector and returns the
/// encoded bytes along with the original string length.
///
/// # Constraints
/// - Only ASCII characters in the range 32-95 (space through underscore) are allowed.
///
/// # Errors
/// Returns an [`Error::InvalidCharacter`] if the input contains characters outside the valid range.
///
/// # Examples
///
/// ```rust
/// use dec_sixbit::encode;
///
/// let input = "HELLO";
/// let (encoded_bytes, length) = encode(input).unwrap();
/// ```
pub fn encode(str: &str) -> Result<(Vec<u8>, usize), Error> {
    // Check if input string contains only ASCII characters
    if !str.is_ascii() {
        return Err(Error::InvalidCharacter);
    }
    let len = str.len();
    // Every 4 characters need 3 bytes, round up
    let bytes_needed = (len * 3 + 3) / 4;
    let mut bytes = vec![0u8; bytes_needed];

    let full_chunks = len / 4;
    let remaining = len % 4;

    for chunk_idx in 0..full_chunks {
        let start = chunk_idx * 4;
        let chunk = &str.as_bytes()[start..start + 4];

        // Validate characters
        for &code in chunk {
            if !(ASCII_OFFSET..=95).contains(&code) {
                return Err(Error::InvalidCharacter);
            }
        }

        // Convert to SIXBIT values by subtracting ASCII_OFFSET
        let a = chunk[0] - ASCII_OFFSET;
        let b = chunk[1] - ASCII_OFFSET;
        let c = chunk[2] - ASCII_OFFSET;
        let d = chunk[3] - ASCII_OFFSET;

        let byte_idx = chunk_idx * 3;

        // Pack 4 SIXBIT values into 3 bytes
        bytes[byte_idx] = (a << SHIFT_TWO_BITS) | (b >> SHIFT_FOUR_BITS);
        bytes[byte_idx + 1] = ((b & MASK_FOUR_BITS) << SHIFT_FOUR_BITS) | (c >> SHIFT_TWO_BITS);
        bytes[byte_idx + 2] = ((c & MASK_TWO_BITS) << SHIFT_SIX_BITS) | d;
    }

    // Handle the remaining 1-3 characters, if any
    if remaining > 0 {
        let start = full_chunks * 4;
        let chunk = &str.as_bytes()[start..];
        let byte_idx = full_chunks * 3;

        match chunk.len() {
            3 => {
                // Validate characters
                for &code in chunk {
                    if !(ASCII_OFFSET..=95).contains(&code) {
                        return Err(Error::InvalidCharacter);
                    }
                }

                // Convert to SIXBIT values by subtracting ASCII_OFFSET
                let a = chunk[0] - ASCII_OFFSET;
                let b = chunk[1] - ASCII_OFFSET;
                let c = chunk[2] - ASCII_OFFSET;

                // Pack 3 SIXBIT values into 2.25 bytes (rounded up to 3 bytes)
                bytes[byte_idx] = (a << SHIFT_TWO_BITS) | (b >> SHIFT_FOUR_BITS);
                bytes[byte_idx + 1] = ((b & MASK_FOUR_BITS) << SHIFT_FOUR_BITS) | (c >> SHIFT_TWO_BITS);
                bytes[byte_idx + 2] = (c & MASK_TWO_BITS) << SHIFT_SIX_BITS;
            },
            2 => {
                // Validate characters
                for &code in chunk {
                    if !(ASCII_OFFSET..=95).contains(&code) {
                        return Err(Error::InvalidCharacter);
                    }
                }

                // Convert to SIXBIT values by subtracting ASCII_OFFSET
                let a = chunk[0] - ASCII_OFFSET;
                let b = chunk[1] - ASCII_OFFSET;

                // Pack 2 SIXBIT values into 1.5 bytes (rounded up to 2 bytes)
                bytes[byte_idx] = (a << SHIFT_TWO_BITS) | (b >> SHIFT_FOUR_BITS);
                bytes[byte_idx + 1] = (b & MASK_FOUR_BITS) << SHIFT_FOUR_BITS;
            },
            1 => {
                // Validate character
                let code = chunk[0];
                if !(ASCII_OFFSET..=95).contains(&code) {
                    return Err(Error::InvalidCharacter);
                }

                // Convert to SIXBIT value by subtracting ASCII_OFFSET
                let a = code - ASCII_OFFSET;

                // Pack 1 SIXBIT value into 0.75 bytes (rounded up to 1 byte)
                bytes[byte_idx] = a << SHIFT_TWO_BITS;
            },
            _ => unreachable!(),
        }
    }

    Ok((bytes, len))
}

/// Encodes a string into packed DEC SIXBIT bytes without validating its characters.
///
/// For input made only of valid SIXBIT characters (ASCII 32-95) the result is identical to that
/// of [`encode`]; the range check is simply skipped. The returned tuple holds the packed bytes
/// and the input length (`str.len()`).
///
/// # Preconditions
/// The caller should ensure that all characters in `str` are within the valid SIXBIT range
/// (32-95). This is a safe function: breaking the precondition cannot cause undefined behavior
/// or a panic, but the returned bytes are then meaningless, because the offset subtraction wraps
/// around and bits outside the 6-bit range are dropped or bleed into the neighbouring character.
///
/// # Examples
///
/// ```rust
/// use dec_sixbit::encode_unchecked;
///
/// let input = "HELLO";
/// let (encoded_bytes, length) = encode_unchecked(input);
/// ```
pub fn encode_unchecked(str: &str) -> (Vec<u8>, usize) {
    // Packs four 6-bit values into three bytes, first value in the highest bits.
    fn pack(sixbits: [u8; 4]) -> [u8; 3] {
        let [a, b, c, d] = sixbits;
        [
            (a << SHIFT_TWO_BITS) | (b >> SHIFT_FOUR_BITS),
            ((b & MASK_FOUR_BITS) << SHIFT_FOUR_BITS) | (c >> SHIFT_TWO_BITS),
            ((c & MASK_TWO_BITS) << SHIFT_SIX_BITS) | d,
        ]
    }

    let input = str.as_bytes();
    let len = input.len();
    // Three bytes per full group of four characters plus one byte per leftover character.
    // Same value as `(len * 3 + 3) / 4`, but it cannot overflow for very large inputs.
    let mut bytes = Vec::with_capacity(len / 4 * 3 + len % 4);

    // `wrapping_sub` keeps behavior identical in every build profile: a plain `-` would panic
    // in debug builds and wrap silently in release builds when a byte is below the offset.
    let mut groups = input.chunks_exact(4);
    for group in groups.by_ref() {
        bytes.extend_from_slice(&pack([
            group[0].wrapping_sub(ASCII_OFFSET),
            group[1].wrapping_sub(ASCII_OFFSET),
            group[2].wrapping_sub(ASCII_OFFSET),
            group[3].wrapping_sub(ASCII_OFFSET),
        ]));
    }

    // Trailing 1-3 characters: pad the group with zero values, pack it like a full group and
    // keep only as many bytes as there are characters, so the padding shows up as zero bits.
    let tail = groups.remainder();
    if !tail.is_empty() {
        let mut padded = [0u8; 4];
        for (slot, &code) in padded.iter_mut().zip(tail) {
            *slot = code.wrapping_sub(ASCII_OFFSET);
        }
        bytes.extend_from_slice(&pack(padded)[..tail.len()]);
    }

    (bytes, len)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the bytes expected for one group of one to four characters.
    ///
    /// Characters missing from a short group count as zero SIXBIT values, and the result is cut
    /// to the size such a group occupies: 1, 2, 3 and 3 bytes for 1, 2, 3 and 4 characters.
    /// All shifts are parenthesized explicitly, so no `clippy::precedence` allowance is needed.
    fn expected_group(chars: &[u8]) -> Vec<u8> {
        assert!((1..=4).contains(&chars.len()), "a group holds one to four characters");

        let mut sixbits = [0u8; 4];
        for (slot, &code) in sixbits.iter_mut().zip(chars) {
            *slot = code - ASCII_OFFSET;
        }
        let [a, b, c, d] = sixbits;

        let packed = [
            (a << SHIFT_TWO_BITS) | (b >> SHIFT_FOUR_BITS),
            ((b & MASK_FOUR_BITS) << SHIFT_FOUR_BITS) | (c >> SHIFT_TWO_BITS),
            ((c & MASK_TWO_BITS) << SHIFT_SIX_BITS) | d,
        ];
        packed[..chars.len().min(3)].to_vec()
    }

    /// Expected encoding of a whole string, assembled from consecutive groups of four characters.
    fn expected_bytes(input: &str) -> Vec<u8> {
        input.as_bytes().chunks(4).flat_map(expected_group).collect()
    }

    #[test]
    fn test_encode_empty_string() {
        let (encoded, len) = encode("").expect("Encoding should succeed for empty string");
        assert!(encoded.is_empty(), "Encoded bytes should be empty");
        assert_eq!(len, 0, "Length should be 0");
    }

    #[test]
    fn test_encode_single_character() {
        let (encoded, len) = encode("A").expect("Encoding should succeed for single character");
        assert_eq!(encoded, expected_group(b"A"), "Encoded bytes do not match expected value");
        assert_eq!(len, 1, "Length should be 1");
    }

    #[test]
    fn test_encode_two_characters() {
        let (encoded, len) = encode("AB").expect("Encoding should succeed for two characters");
        assert_eq!(encoded, expected_group(b"AB"), "Encoded bytes do not match expected value for two characters");
        assert_eq!(len, 2, "Length should be 2");
    }

    #[test]
    fn test_encode_three_characters() {
        let (encoded, len) = encode("ABC").expect("Encoding should succeed for three characters");
        assert_eq!(encoded, expected_group(b"ABC"), "Encoded bytes do not match expected value for three characters");
        assert_eq!(len, 3, "Length should be 3");
    }

    #[test]
    fn test_encode_four_characters() {
        let (encoded, len) = encode("ABCD").expect("Encoding should succeed for four characters");
        assert_eq!(encoded, expected_group(b"ABCD"), "Encoded bytes do not match expected value for four characters");
        assert_eq!(len, 4, "Length should be 4");
    }

    #[test]
    fn test_encode_multiple_chunks() {
        let input = "HELLOWORLD_ "; // 12 characters, i.e. exactly three full groups
        let (encoded, len) = encode(input).expect("Encoding should succeed for multiple chunks");
        assert_eq!(len, input.len(), "Length should match input length");

        // The groups are spelled out by hand so the test also pins where the input is split.
        let expected: Vec<u8> = ["HELL", "OWOR", "LD_ "]
            .iter()
            .flat_map(|chunk| expected_group(chunk.as_bytes()))
            .collect();

        assert_eq!(encoded, expected, "Encoded bytes do not match expected value for multiple chunks");
    }

    #[test]
    fn test_encode_with_invalid_character_non_ascii() {
        let result = encode("Hello€"); // '€' is not ASCII
        assert!(matches!(result, Err(Error::InvalidCharacter)), "Should return InvalidCharacter error for non-ASCII characters");
    }

    #[test]
    fn test_encode_with_invalid_character_below_range() {
        let result = encode("HELLO\x1F"); // ASCII 31, below valid range
        assert!(matches!(result, Err(Error::InvalidCharacter)), "Should return InvalidCharacter error for characters below range");
    }

    #[test]
    fn test_encode_with_invalid_character_above_range() {
        let result = encode("HELLO~"); // '~' is ASCII 126, above valid range
        assert!(matches!(result, Err(Error::InvalidCharacter)), "Should return InvalidCharacter error for characters above range");
    }

    #[test]
    fn test_encode_unchecked_valid_input() {
        let input = "ABCD";
        let (checked, _) = encode(input).expect("Safe encode should succeed for valid input");
        let (unchecked, _) = encode_unchecked(input);
        assert_eq!(checked, unchecked, "Unchecked encoding should match safe encoding for valid input");
    }

    #[test]
    fn test_encode_unchecked_empty_string() {
        let (encoded, len) = encode_unchecked("");
        assert!(encoded.is_empty(), "Encoded bytes should be empty for empty string");
        assert_eq!(len, 0, "Length should be 0 for empty string");
    }

    #[test]
    fn test_encode_unchecked_large_input() {
        let input = "THEQUICKBROWNFOXJUMPSOVERTHELAZYDOG_12345";
        let (checked, len_checked) = encode(input).expect("Safe encode should succeed for large input");
        let (unchecked, len_unchecked) = encode_unchecked(input);
        assert_eq!(checked, unchecked, "Unchecked encoding should match safe encoding for large input");
        assert_eq!(len_checked, len_unchecked, "Lengths should match for large input");
    }

    #[test]
    fn test_encode_partial_chunks() {
        // Inputs whose last group holds only 1-3 characters, paired with the expected length.
        let cases = [("A", 1), ("AB", 2), ("ABC", 3), ("ABCDE", 5)];

        for (input, len) in cases {
            let (encoded, encoded_len) = encode(input).expect("Encoding should succeed");
            assert_eq!(encoded, expected_bytes(input), "Encoded bytes do not match for input '{}'", input);
            assert_eq!(encoded_len, len, "Length does not match for input '{}'", input);
        }
    }

    #[test]
    fn test_encode_unchecked_two_characters() {
        let input = "AB";
        // The unchecked encoder must agree with the validating one on valid input.
        let (checked, _) = encode(input).expect("Safe encode should succeed for two characters");
        let (unchecked, _) = encode_unchecked(input);
        assert_eq!(checked, unchecked, "Unchecked encoding should match safe encoding for two characters");
    }

    #[test]
    fn test_encode_unchecked_three_characters() {
        let input = "ABC";
        // The unchecked encoder must agree with the validating one on valid input.
        let (checked, _) = encode(input).expect("Safe encode should succeed for three characters");
        let (unchecked, _) = encode_unchecked(input);
        assert_eq!(checked, unchecked, "Unchecked encoding should match safe encoding for three characters");
    }

    #[test]
    fn test_encode_unchecked_partial_chunks() {
        // Inputs whose last group holds only 1-3 characters, paired with the expected length.
        let cases = [("A", 1), ("AB", 2), ("ABC", 3), ("ABCDE", 5)];

        for (input, len) in cases {
            let expected = expected_bytes(input);
            let (checked, len_checked) = encode(input).expect("Safe encode should succeed");
            let (unchecked, len_unchecked) = encode_unchecked(input);
            assert_eq!(checked, expected, "Safe encoding does not match expected for input '{}'", input);
            assert_eq!(unchecked, expected, "Unchecked encoding does not match expected for input '{}'", input);
            assert_eq!(len_checked, len, "Length does not match expected value for input '{}'", input);
            assert_eq!(len_unchecked, len, "Length should be correct for input '{}'", input);
        }
    }
}