1use std::collections::HashMap;
2use crate::config::EncodingMode;
3
/// A validated symbol set paired with the [`EncodingMode`] it is used with.
///
/// Values are created through the `Alphabet::new*` constructors, which
/// populate the fields differently depending on the mode: explicit-symbol
/// modes fill `chars`/`char_to_index`, while `EncodingMode::ByteRange`
/// leaves both empty and records `start_codepoint` instead.
#[derive(Debug, Clone)]
pub struct Alphabet {
    /// Symbols in digit order (index = digit value). Empty in `ByteRange` mode.
    chars: Vec<char>,
    /// Reverse lookup from a symbol to its index in `chars`. Empty in `ByteRange` mode.
    char_to_index: HashMap<char, usize>,
    /// Encoding mode this alphabet was constructed for.
    mode: EncodingMode,
    /// Optional padding character supplied by the caller.
    padding: Option<char>,
    /// First code point of the 256-code-point block used by `ByteRange` mode;
    /// `None` for every other mode.
    start_codepoint: Option<u32>,
}
17
18impl Alphabet {
19 pub fn new(chars: Vec<char>) -> Result<Self, String> {
29 Self::new_with_mode(chars, EncodingMode::BaseConversion, None)
30 }
31
32 pub fn new_with_mode(chars: Vec<char>, mode: EncodingMode, padding: Option<char>) -> Result<Self, String> {
46 Self::new_with_mode_and_range(chars, mode, padding, None)
47 }
48
49 pub fn new_with_mode_and_range(chars: Vec<char>, mode: EncodingMode, padding: Option<char>, start_codepoint: Option<u32>) -> Result<Self, String> {
62 if mode == EncodingMode::ByteRange {
64 if let Some(start) = start_codepoint {
65 if let Some(end_codepoint) = start.checked_add(255) {
67 if std::char::from_u32(end_codepoint).is_none() {
68 return Err(format!("Invalid Unicode range: {}-{}", start, end_codepoint));
69 }
70 for offset in 0..=255 {
72 if std::char::from_u32(start + offset).is_none() {
73 return Err(format!("Invalid Unicode codepoint in range: {}", start + offset));
74 }
75 }
76 } else {
77 return Err("Start codepoint too high for 256-byte range".to_string());
78 }
79
80 return Ok(Alphabet {
81 chars: Vec::new(),
82 char_to_index: HashMap::new(),
83 mode,
84 padding,
85 start_codepoint: Some(start),
86 });
87 } else {
88 return Err("ByteRange mode requires start_codepoint".to_string());
89 }
90 }
91
92 if chars.is_empty() {
93 return Err("Alphabet cannot be empty".to_string());
94 }
95
96 if mode == EncodingMode::Chunked {
98 let base = chars.len();
99 if !base.is_power_of_two() {
100 return Err(format!("Chunked mode requires power-of-two alphabet size, got {}", base));
101 }
102 if base != 2 && base != 4 && base != 8 && base != 16 && base != 32 && base != 64 && base != 128 && base != 256 {
104 return Err(format!("Chunked mode requires alphabet size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}", base));
105 }
106 }
107
108 let mut char_to_index = HashMap::new();
110 for (i, &c) in chars.iter().enumerate() {
111 if char_to_index.insert(c, i).is_some() {
113 return Err(format!("Duplicate character in alphabet: '{}' (U+{:04X})", c, c as u32));
114 }
115
116 if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
118 return Err(format!("Control character not allowed in alphabet: U+{:04X}", c as u32));
119 }
120
121 if c.is_whitespace() {
123 return Err(format!("Whitespace character not allowed in alphabet: '{}' (U+{:04X})", c, c as u32));
124 }
125 }
126
127 if let Some(pad) = padding {
129 if char_to_index.contains_key(&pad) {
130 return Err(format!("Padding character '{}' conflicts with alphabet characters", pad));
131 }
132 if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
133 return Err(format!("Control character not allowed as padding: U+{:04X}", pad as u32));
134 }
135 }
136
137 Ok(Alphabet {
138 chars,
139 char_to_index,
140 mode,
141 padding,
142 start_codepoint: None,
143 })
144 }
145
146 pub fn from_str(s: &str) -> Result<Self, String> {
152 let chars: Vec<char> = s.chars().collect();
153 Self::new(chars)
154 }
155
156 pub fn base(&self) -> usize {
160 match self.mode {
161 EncodingMode::ByteRange => 256,
162 _ => self.chars.len(),
163 }
164 }
165
166 pub fn mode(&self) -> &EncodingMode {
168 &self.mode
169 }
170
171 pub fn padding(&self) -> Option<char> {
173 self.padding
174 }
175
176 pub fn start_codepoint(&self) -> Option<u32> {
178 self.start_codepoint
179 }
180
181 pub fn encode_digit(&self, digit: usize) -> Option<char> {
185 match self.mode {
186 EncodingMode::ByteRange => {
187 if let Some(start) = self.start_codepoint {
188 if digit < 256 {
189 return std::char::from_u32(start + digit as u32);
190 }
191 }
192 None
193 }
194 _ => self.chars.get(digit).copied(),
195 }
196 }
197
198 pub fn decode_char(&self, c: char) -> Option<usize> {
202 match self.mode {
203 EncodingMode::ByteRange => {
204 if let Some(start) = self.start_codepoint {
205 let codepoint = c as u32;
206 if codepoint >= start && codepoint < start + 256 {
207 return Some((codepoint - start) as usize);
208 }
209 }
210 None
211 }
212 _ => self.char_to_index.get(&c).copied(),
213 }
214 }
215}
216
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `count` distinct characters for use as a test alphabet.
    ///
    /// Characters come in runs of 26 spaced 100 code points apart, starting
    /// at 'A'. For the counts used in this module none of them is a control
    /// or whitespace character.
    fn distinct_chars(count: u32) -> Vec<char> {
        (0..count)
            .map(|i| char::from_u32('A' as u32 + (i % 26) + ((i / 26) * 100)).unwrap())
            .collect()
    }

    #[test]
    fn test_duplicate_character_detection() {
        let chars = vec!['a', 'b', 'c', 'a'];
        let result = Alphabet::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_empty_alphabet() {
        let chars = vec![];
        let result = Alphabet::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        let chars = vec!['a', 'b', 'c'];
        let result = Alphabet::new_with_mode(chars, EncodingMode::Chunked, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        // Every size the constructor documents as valid, including 128 and 256.
        for &size in &[2u32, 4, 8, 16, 32, 64, 128, 256] {
            let result =
                Alphabet::new_with_mode(distinct_chars(size), EncodingMode::Chunked, None);
            assert!(result.is_ok(), "Size {} should be valid", size);
            assert_eq!(result.unwrap().base(), size as usize);
        }
    }

    #[test]
    fn test_chunked_mode_rejects_out_of_range_powers_of_two() {
        // 1 and 512 are powers of two but outside the supported 2..=256 window.
        for &size in &[1u32, 512] {
            let result =
                Alphabet::new_with_mode(distinct_chars(size), EncodingMode::Chunked, None);
            assert!(result.is_err(), "Size {} should be rejected", size);
            assert!(result.unwrap_err().contains("alphabet size of"));
        }
    }

    #[test]
    fn test_control_character_rejection() {
        let chars = vec!['a', 'b', '\x00', 'c'];
        let result = Alphabet::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        let chars = vec!['a', 'b', ' ', 'c'];
        let result = Alphabet::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Whitespace"));
    }

    #[test]
    fn test_padding_conflict_with_alphabet() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Alphabet::new_with_mode(chars, EncodingMode::BaseConversion, Some('b'));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.contains("Padding character"));
        assert!(err.contains("conflicts"));
    }

    #[test]
    fn test_control_character_padding_rejection() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Alphabet::new_with_mode(chars, EncodingMode::BaseConversion, Some('\x00'));
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .contains("Control character not allowed as padding"));
    }

    #[test]
    fn test_valid_padding() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Alphabet::new_with_mode(chars, EncodingMode::BaseConversion, Some('='));
        assert!(result.is_ok());
        assert_eq!(result.unwrap().padding(), Some('='));
    }

    #[test]
    fn test_byte_range_exceeds_unicode() {
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x10FF80),
        );
        assert!(result.is_err());
    }

    #[test]
    fn test_byte_range_start_overflow() {
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(u32::MAX),
        );
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("too high"));
    }

    #[test]
    fn test_byte_range_overlapping_surrogates() {
        // The end of this range (U+E07F) is valid, but the start is a surrogate.
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0xDF80),
        );
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Invalid Unicode codepoint"));
    }

    #[test]
    fn test_byte_range_valid_start() {
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x1F300),
        );
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            None,
        );
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_byte_range_encode_decode() {
        let alphabet = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x1F300),
        )
        .unwrap();
        assert_eq!(alphabet.base(), 256);
        assert_eq!(alphabet.start_codepoint(), Some(0x1F300));
        assert_eq!(alphabet.encode_digit(0), char::from_u32(0x1F300));
        assert_eq!(alphabet.encode_digit(255), char::from_u32(0x1F3FF));
        assert_eq!(alphabet.encode_digit(256), None);
        assert_eq!(alphabet.decode_char('\u{1F300}'), Some(0));
        assert_eq!(alphabet.decode_char('\u{1F3FF}'), Some(255));
        assert_eq!(alphabet.decode_char('\u{1F400}'), None);
        assert_eq!(alphabet.decode_char('a'), None);
    }

    #[test]
    fn test_explicit_alphabet_encode_decode() {
        let alphabet = Alphabet::from_str("abc").unwrap();
        assert_eq!(alphabet.base(), 3);
        assert_eq!(alphabet.encode_digit(0), Some('a'));
        assert_eq!(alphabet.encode_digit(2), Some('c'));
        assert_eq!(alphabet.encode_digit(3), None);
        assert_eq!(alphabet.decode_char('b'), Some(1));
        assert_eq!(alphabet.decode_char('z'), None);
        assert_eq!(alphabet.start_codepoint(), None);
        assert_eq!(alphabet.padding(), None);
    }

    #[test]
    fn test_detailed_error_messages() {
        let chars = vec!['a', 'b', 'a'];
        let err = Alphabet::new(chars).unwrap_err();
        // The message must carry both the literal character and its code point.
        assert!(err.contains("'a'"));
        assert!(err.contains("U+0061"));
    }
}