1use std::collections::HashMap;
2use crate::config::EncodingMode;
3
/// Exclusive upper bound on the code points that the direct-index decode
/// table can hold. An alphabet gets a lookup table only when every one of its
/// characters has a code point below this value; decoding then indexes the
/// table by code point instead of hashing.
const MAX_LOOKUP_TABLE_SIZE: usize = 256;
5
/// A validated set of digit characters together with the lookup structures
/// needed to map digits to characters and back.
///
/// Instances are created through the `Alphabet::new*` constructors, which
/// perform all validation; the fields are private so an invalid alphabet
/// cannot be assembled by hand.
#[derive(Debug, Clone)]
pub struct Alphabet {
    /// Alphabet characters in digit order: `chars[d]` is the character for
    /// digit `d`. Left empty in `ByteRange` mode.
    chars: Vec<char>,
    /// Reverse mapping from each character to its index in `chars`.
    /// Left empty in `ByteRange` mode.
    char_to_index: HashMap<char, usize>,
    /// Decode table indexed by code point. Present only when every character
    /// in `chars` has a code point below 256; `None` otherwise and in
    /// `ByteRange` mode.
    lookup_table: Option<Box<[Option<usize>; 256]>>,
    /// Encoding mode this alphabet was constructed for.
    mode: EncodingMode,
    /// Optional padding character, as supplied to the constructor.
    padding: Option<char>,
    /// First code point of the 256-code-point range; `Some` only in
    /// `ByteRange` mode.
    start_codepoint: Option<u32>,
}
21
22impl Alphabet {
23 pub fn new(chars: Vec<char>) -> Result<Self, String> {
33 Self::new_with_mode(chars, EncodingMode::BaseConversion, None)
34 }
35
36 pub fn new_with_mode(chars: Vec<char>, mode: EncodingMode, padding: Option<char>) -> Result<Self, String> {
50 Self::new_with_mode_and_range(chars, mode, padding, None)
51 }
52
53 pub fn new_with_mode_and_range(chars: Vec<char>, mode: EncodingMode, padding: Option<char>, start_codepoint: Option<u32>) -> Result<Self, String> {
66 if mode == EncodingMode::ByteRange {
68 if let Some(start) = start_codepoint {
69 if let Some(end_codepoint) = start.checked_add(255) {
71 if std::char::from_u32(end_codepoint).is_none() {
72 return Err(format!("Invalid Unicode range: {}-{}", start, end_codepoint));
73 }
74 for offset in 0..=255 {
76 if std::char::from_u32(start + offset).is_none() {
77 return Err(format!("Invalid Unicode codepoint in range: {}", start + offset));
78 }
79 }
80 } else {
81 return Err("Start codepoint too high for 256-byte range".to_string());
82 }
83
84 return Ok(Alphabet {
85 chars: Vec::new(),
86 char_to_index: HashMap::new(),
87 lookup_table: None,
88 mode,
89 padding,
90 start_codepoint: Some(start),
91 });
92 } else {
93 return Err("ByteRange mode requires start_codepoint".to_string());
94 }
95 }
96
97 if chars.is_empty() {
98 return Err("Alphabet cannot be empty".to_string());
99 }
100
101 if mode == EncodingMode::Chunked {
103 let base = chars.len();
104 if !base.is_power_of_two() {
105 return Err(format!("Chunked mode requires power-of-two alphabet size, got {}", base));
106 }
107 if base != 2 && base != 4 && base != 8 && base != 16 && base != 32 && base != 64 && base != 128 && base != 256 {
109 return Err(format!("Chunked mode requires alphabet size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}", base));
110 }
111 }
112
113 let mut char_to_index = HashMap::new();
115 for (i, &c) in chars.iter().enumerate() {
116 if char_to_index.insert(c, i).is_some() {
118 return Err(format!("Duplicate character in alphabet: '{}' (U+{:04X})", c, c as u32));
119 }
120
121 if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
123 return Err(format!("Control character not allowed in alphabet: U+{:04X}", c as u32));
124 }
125
126 if c.is_whitespace() {
128 return Err(format!("Whitespace character not allowed in alphabet: '{}' (U+{:04X})", c, c as u32));
129 }
130 }
131
132 if let Some(pad) = padding {
134 if char_to_index.contains_key(&pad) {
135 return Err(format!("Padding character '{}' conflicts with alphabet characters", pad));
136 }
137 if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
138 return Err(format!("Control character not allowed as padding: U+{:04X}", pad as u32));
139 }
140 }
141
142 let lookup_table = if chars.iter().all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32) {
144 let mut table = Box::new([None; 256]);
145 for (i, &c) in chars.iter().enumerate() {
146 table[c as usize] = Some(i);
147 }
148 Some(table)
149 } else {
150 None
151 };
152
153 Ok(Alphabet {
154 chars,
155 char_to_index,
156 lookup_table,
157 mode,
158 padding,
159 start_codepoint: None,
160 })
161 }
162
163 pub fn from_str(s: &str) -> Result<Self, String> {
169 let chars: Vec<char> = s.chars().collect();
170 Self::new(chars)
171 }
172
173 pub fn base(&self) -> usize {
177 match self.mode {
178 EncodingMode::ByteRange => 256,
179 _ => self.chars.len(),
180 }
181 }
182
183 pub fn mode(&self) -> &EncodingMode {
185 &self.mode
186 }
187
188 pub fn padding(&self) -> Option<char> {
190 self.padding
191 }
192
193 pub fn start_codepoint(&self) -> Option<u32> {
195 self.start_codepoint
196 }
197
198 pub fn encode_digit(&self, digit: usize) -> Option<char> {
202 match self.mode {
203 EncodingMode::ByteRange => {
204 if let Some(start) = self.start_codepoint {
205 if digit < 256 {
206 return std::char::from_u32(start + digit as u32);
207 }
208 }
209 None
210 }
211 _ => self.chars.get(digit).copied(),
212 }
213 }
214
215 pub fn decode_char(&self, c: char) -> Option<usize> {
219 match self.mode {
220 EncodingMode::ByteRange => {
221 if let Some(start) = self.start_codepoint {
222 let codepoint = c as u32;
223 if codepoint >= start && codepoint < start + 256 {
224 return Some((codepoint - start) as usize);
225 }
226 }
227 None
228 }
229 _ => {
230 if let Some(ref table) = self.lookup_table {
232 let char_val = c as u32;
233 if char_val < MAX_LOOKUP_TABLE_SIZE as u32 {
234 return table[char_val as usize];
235 }
236 }
237 self.char_to_index.get(&c).copied()
239 }
240 }
241 }
242}
243
#[cfg(test)]
mod tests {
    use super::*;

    /// A repeated character is rejected with a "Duplicate character" error.
    #[test]
    fn test_duplicate_character_detection() {
        let chars = vec!['a', 'b', 'c', 'a'];
        let result = Alphabet::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    /// An empty character list is rejected.
    #[test]
    fn test_empty_alphabet() {
        let chars = vec![];
        let result = Alphabet::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("cannot be empty"));
    }

    /// Chunked mode rejects an alphabet whose size is not a power of two.
    #[test]
    fn test_chunked_mode_power_of_two() {
        let chars = vec!['a', 'b', 'c'];
        let result = Alphabet::new_with_mode(chars, EncodingMode::Chunked, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    /// Chunked mode accepts every size the constructor lists as valid,
    /// including the two largest (128 and 256).
    #[test]
    fn test_chunked_mode_valid_sizes() {
        for &size in &[2, 4, 8, 16, 32, 64, 128, 256] {
            // 26 consecutive letters per block, with blocks spaced 100 code
            // points apart, so all generated characters are distinct and
            // none is a control or whitespace character.
            let chars: Vec<char> = (0..size)
                .map(|i| char::from_u32('A' as u32 + (i % 26) + ((i / 26) * 100)).unwrap())
                .collect();
            let result = Alphabet::new_with_mode(chars, EncodingMode::Chunked, None);
            assert!(result.is_ok(), "Size {} should be valid", size);
        }
    }

    /// A control character inside the alphabet is rejected.
    #[test]
    fn test_control_character_rejection() {
        let chars = vec!['a', 'b', '\x00', 'c'];
        let result = Alphabet::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Control character"));
    }

    /// A whitespace character inside the alphabet is rejected.
    #[test]
    fn test_whitespace_rejection() {
        let chars = vec!['a', 'b', ' ', 'c'];
        let result = Alphabet::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Whitespace"));
    }

    /// A padding character that is also an alphabet character is rejected.
    #[test]
    fn test_padding_conflict_with_alphabet() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Alphabet::new_with_mode(chars, EncodingMode::BaseConversion, Some('b'));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.contains("Padding character"));
        assert!(err.contains("conflicts"));
    }

    /// A padding character outside the alphabet is accepted.
    #[test]
    fn test_valid_padding() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Alphabet::new_with_mode(chars, EncodingMode::BaseConversion, Some('='));
        assert!(result.is_ok());
    }

    /// A byte range that would run past U+10FFFF is rejected.
    #[test]
    fn test_byte_range_exceeds_unicode() {
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x10FF80),
        );
        assert!(result.is_err());
    }

    /// A byte range that lies entirely inside valid Unicode is accepted.
    #[test]
    fn test_byte_range_valid_start() {
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x1F300),
        );
        assert!(result.is_ok());
    }

    /// ByteRange mode without a start code point is rejected.
    #[test]
    fn test_byte_range_no_start_codepoint() {
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            None,
        );
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    /// The duplicate-character error identifies the offending character.
    #[test]
    fn test_detailed_error_messages() {
        let chars = vec!['a', 'b', 'a'];
        let err = Alphabet::new(chars).unwrap_err();
        assert!(err.contains("'a'") || err.contains("U+"));
    }
}