base_d/encoders/algorithms/
word.rs

//! Word-based encoding using radix conversion.
//!
//! Same mathematical approach as character-based radix encoding,
//! but outputs words joined by a delimiter instead of concatenated characters.
5
6use crate::core::word_dictionary::WordDictionary;
7use num_integer::Integer;
8use num_traits::Zero;
9
10pub use super::errors::DecodeError;
11
12/// Encodes binary data as a sequence of words.
13///
14/// Uses radix (base) conversion where each "digit" is a word from the dictionary.
15/// Words are joined by the dictionary's delimiter.
16///
17/// # Example
18///
19/// ```
20/// use base_d::{WordDictionary, word};
21///
22/// let dict = WordDictionary::builder()
23///     .words(vec!["abandon", "ability", "able", "about"])
24///     .delimiter(" ")
25///     .build()
26///     .unwrap();
27///
28/// let encoded = word::encode(b"\x00\x01\x02", &dict);
29/// // Result is words joined by spaces
30/// ```
31pub fn encode(data: &[u8], dictionary: &WordDictionary) -> String {
32    if data.is_empty() {
33        return String::new();
34    }
35
36    // Count leading zeros for efficient handling
37    let leading_zeros = data.iter().take_while(|&&b| b == 0).count();
38
39    // If all zeros, return early
40    if leading_zeros == data.len() {
41        let zero_word = dictionary.encode_word(0).unwrap();
42        return std::iter::repeat_n(zero_word, data.len())
43            .collect::<Vec<_>>()
44            .join(dictionary.delimiter());
45    }
46
47    let base = dictionary.base();
48    let mut num = num_bigint::BigUint::from_bytes_be(&data[leading_zeros..]);
49
50    // Pre-allocate result vector with estimated capacity
51    let max_words =
52        ((data.len() - leading_zeros) * 8 * 1000) / (base as f64).log2() as usize / 1000 + 1;
53    let mut result: Vec<&str> = Vec::with_capacity(max_words + leading_zeros);
54
55    let base_big = num_bigint::BigUint::from(base);
56
57    while !num.is_zero() {
58        let (quotient, remainder) = num.div_rem(&base_big);
59        let digit = remainder.to_u64_digits();
60        let digit_val = if digit.is_empty() {
61            0
62        } else {
63            digit[0] as usize
64        };
65        result.push(dictionary.encode_word(digit_val).unwrap());
66        num = quotient;
67    }
68
69    // Add leading zeros
70    let zero_word = dictionary.encode_word(0).unwrap();
71    for _ in 0..leading_zeros {
72        result.push(zero_word);
73    }
74
75    result.reverse();
76    result.join(dictionary.delimiter())
77}
78
79/// Decodes a word sequence back to binary data.
80///
81/// Splits the input on the dictionary's delimiter, then performs
82/// reverse radix conversion.
83///
84/// # Errors
85///
86/// Returns `DecodeError` if:
87/// - Input is empty
88/// - A word is not found in the dictionary
89pub fn decode(encoded: &str, dictionary: &WordDictionary) -> Result<Vec<u8>, DecodeError> {
90    if encoded.is_empty() {
91        return Err(DecodeError::EmptyInput);
92    }
93
94    let base = dictionary.base();
95    let mut num = num_bigint::BigUint::from(0u8);
96    let base_big = num_bigint::BigUint::from(base);
97
98    // Split on delimiter
99    let words: Vec<&str> = encoded.split(dictionary.delimiter()).collect();
100    let mut leading_zeros = 0;
101
102    // Track position for error reporting
103    let mut char_position = 0;
104    for word in &words {
105        let digit = dictionary
106            .decode_word(word)
107            .ok_or_else(|| DecodeError::invalid_word(word, char_position, encoded))?;
108
109        if num.is_zero() && digit == 0 {
110            leading_zeros += 1;
111        } else {
112            num *= &base_big;
113            num += num_bigint::BigUint::from(digit);
114        }
115
116        // Track position (word + delimiter)
117        char_position += word.len() + dictionary.delimiter().len();
118    }
119
120    // Handle all-zero case
121    if num.is_zero() && leading_zeros > 0 {
122        return Ok(vec![0u8; leading_zeros]);
123    }
124
125    let bytes = num.to_bytes_be();
126
127    // Construct result with pre-allocated capacity
128    let mut result = Vec::with_capacity(leading_zeros + bytes.len());
129    result.resize(leading_zeros, 0u8);
130    result.extend_from_slice(&bytes);
131
132    Ok(result)
133}
134
#[cfg(test)]
mod tests {
    use super::*;

    /// Four-word dictionary (base 4) whose words are joined by a single space.
    fn base4_dictionary() -> WordDictionary {
        WordDictionary::builder()
            .words(vec!["zero", "one", "two", "three"])
            .delimiter(" ")
            .build()
            .unwrap()
    }

    /// Sixteen-word dictionary (base 16) using BIP-39-style words.
    fn base16_dictionary() -> WordDictionary {
        let words = vec![
            "abandon", "ability", "able", "about", "above", "absent", "absorb", "abstract",
            "absurd", "abuse", "access", "accident", "account", "accuse", "achieve", "acid",
        ];
        WordDictionary::builder()
            .words(words)
            .delimiter(" ")
            .build()
            .unwrap()
    }

    /// Encodes `data`, decodes the result, and checks the original bytes come back.
    fn assert_roundtrip(data: &[u8], dict: &WordDictionary) {
        let words = encode(data, dict);
        let bytes = decode(&words, dict).unwrap();
        assert_eq!(bytes, data);
    }

    #[test]
    fn test_encode_empty() {
        assert_eq!(encode(&[], &base4_dictionary()), "");
    }

    #[test]
    fn test_encode_single_byte() {
        // 5 = 1*4 + 1, i.e. digits (1, 1) in base 4.
        assert_eq!(encode(&[0x05], &base4_dictionary()), "one one");
    }

    #[test]
    fn test_encode_decode_roundtrip() {
        assert_roundtrip(b"hello", &base4_dictionary());
    }

    #[test]
    fn test_encode_decode_roundtrip_larger() {
        assert_roundtrip(b"The quick brown fox", &base16_dictionary());
    }

    #[test]
    fn test_leading_zeros_preserved() {
        assert_roundtrip(&[0x00, 0x00, 0x05], &base4_dictionary());
    }

    #[test]
    fn test_all_zeros() {
        let dict = base4_dictionary();
        let zeros = [0x00u8; 3];

        let words = encode(&zeros, &dict);
        assert_eq!(words, "zero zero zero");

        let bytes = decode(&words, &dict).unwrap();
        assert_eq!(bytes, zeros);
    }

    #[test]
    fn test_decode_empty_error() {
        let outcome = decode("", &base4_dictionary());
        assert!(matches!(outcome, Err(DecodeError::EmptyInput)));
    }

    #[test]
    fn test_decode_unknown_word() {
        let outcome = decode("zero unknown one", &base4_dictionary());
        assert!(outcome.is_err());
    }

    #[test]
    fn test_case_insensitive_decode() {
        let dict = WordDictionary::builder()
            .words(vec!["Alpha", "Bravo", "Charlie", "Delta"])
            .case_sensitive(false)
            .build()
            .unwrap();

        let payload = [0x01u8];
        let words = encode(&payload, &dict);

        // Lookup must succeed whatever the casing of the encoded text.
        for variant in [words.to_lowercase(), words.to_uppercase()] {
            let bytes = decode(&variant, &dict).unwrap();
            assert_eq!(bytes, payload);
        }
    }

    #[test]
    fn test_custom_delimiter() {
        let dict = WordDictionary::builder()
            .words(vec!["a", "b", "c", "d"])
            .delimiter("-")
            .build()
            .unwrap();

        let payload = [0x05u8];
        let words = encode(&payload, &dict);
        assert!(words.contains("-"));

        let bytes = decode(&words, &dict).unwrap();
        assert_eq!(bytes, payload);
    }
}