base_d/alphabet.rs
1use std::collections::HashMap;
2use crate::config::EncodingMode;
3
4/// Represents an encoding alphabet with its characters and configuration.
5///
6/// An alphabet defines the character set and encoding mode used for converting
7/// binary data to text. Supports three modes: mathematical base conversion,
8/// chunked (RFC 4648), and byte-range mapping.
9#[derive(Debug, Clone)]
10pub struct Alphabet {
11 chars: Vec<char>,
12 char_to_index: HashMap<char, usize>,
13 mode: EncodingMode,
14 padding: Option<char>,
15 start_codepoint: Option<u32>,
16}
17
18impl Alphabet {
19 /// Creates a new alphabet with default settings (BaseConversion mode, no padding).
20 ///
21 /// # Arguments
22 ///
23 /// * `chars` - Vector of characters to use in the alphabet
24 ///
25 /// # Errors
26 ///
27 /// Returns an error if the alphabet is empty or contains duplicate characters.
28 pub fn new(chars: Vec<char>) -> Result<Self, String> {
29 Self::new_with_mode(chars, EncodingMode::BaseConversion, None)
30 }
31
32 /// Creates a new alphabet with specified encoding mode and optional padding.
33 ///
34 /// # Arguments
35 ///
36 /// * `chars` - Vector of characters to use in the alphabet
37 /// * `mode` - Encoding mode (BaseConversion, Chunked, or ByteRange)
38 /// * `padding` - Optional padding character (typically '=' for RFC modes)
39 ///
40 /// # Errors
41 ///
42 /// Returns an error if:
43 /// - The alphabet is empty or contains duplicates
44 /// - Chunked mode is used with a non-power-of-two alphabet size
45 pub fn new_with_mode(chars: Vec<char>, mode: EncodingMode, padding: Option<char>) -> Result<Self, String> {
46 Self::new_with_mode_and_range(chars, mode, padding, None)
47 }
48
49 /// Creates a new alphabet with full configuration including byte-range support.
50 ///
51 /// # Arguments
52 ///
53 /// * `chars` - Vector of characters (empty for ByteRange mode)
54 /// * `mode` - Encoding mode
55 /// * `padding` - Optional padding character
56 /// * `start_codepoint` - Starting Unicode codepoint for ByteRange mode
57 ///
58 /// # Errors
59 ///
60 /// Returns an error if configuration is invalid for the specified mode.
61 pub fn new_with_mode_and_range(chars: Vec<char>, mode: EncodingMode, padding: Option<char>, start_codepoint: Option<u32>) -> Result<Self, String> {
62 // ByteRange mode doesn't need chars, just validates start_codepoint
63 if mode == EncodingMode::ByteRange {
64 if let Some(start) = start_codepoint {
65 // Validate that we can represent all 256 bytes
66 if let Some(end_codepoint) = start.checked_add(255) {
67 if std::char::from_u32(end_codepoint).is_none() {
68 return Err(format!("Invalid Unicode range: {}-{}", start, end_codepoint));
69 }
70 } else {
71 return Err("Start codepoint too high for 256-byte range".to_string());
72 }
73
74 return Ok(Alphabet {
75 chars: Vec::new(),
76 char_to_index: HashMap::new(),
77 mode,
78 padding,
79 start_codepoint: Some(start),
80 });
81 } else {
82 return Err("ByteRange mode requires start_codepoint".to_string());
83 }
84 }
85
86 if chars.is_empty() {
87 return Err("Alphabet cannot be empty".to_string());
88 }
89
90 // Validate alphabet size for chunked mode
91 if mode == EncodingMode::Chunked {
92 let base = chars.len();
93 if !base.is_power_of_two() {
94 return Err(format!("Chunked mode requires power-of-two alphabet size, got {}", base));
95 }
96 }
97
98 let mut char_to_index = HashMap::new();
99 for (i, &c) in chars.iter().enumerate() {
100 if char_to_index.insert(c, i).is_some() {
101 return Err(format!("Duplicate character in alphabet: {}", c));
102 }
103 }
104
105 Ok(Alphabet {
106 chars,
107 char_to_index,
108 mode,
109 padding,
110 start_codepoint: None,
111 })
112 }
113
114 /// Creates an alphabet from a string of characters.
115 ///
116 /// # Arguments
117 ///
118 /// * `s` - String containing the alphabet characters
119 pub fn from_str(s: &str) -> Result<Self, String> {
120 let chars: Vec<char> = s.chars().collect();
121 Self::new(chars)
122 }
123
124 /// Returns the base (radix) of the alphabet.
125 ///
126 /// For ByteRange mode, always returns 256. Otherwise returns the number of characters.
127 pub fn base(&self) -> usize {
128 match self.mode {
129 EncodingMode::ByteRange => 256,
130 _ => self.chars.len(),
131 }
132 }
133
134 /// Returns the encoding mode of this alphabet.
135 pub fn mode(&self) -> &EncodingMode {
136 &self.mode
137 }
138
139 /// Returns the padding character, if any.
140 pub fn padding(&self) -> Option<char> {
141 self.padding
142 }
143
144 /// Returns the starting Unicode codepoint for ByteRange mode.
145 pub fn start_codepoint(&self) -> Option<u32> {
146 self.start_codepoint
147 }
148
149 /// Encodes a digit (0 to base-1) as a character.
150 ///
151 /// Returns `None` if the digit is out of range.
152 pub fn encode_digit(&self, digit: usize) -> Option<char> {
153 match self.mode {
154 EncodingMode::ByteRange => {
155 if let Some(start) = self.start_codepoint {
156 if digit < 256 {
157 return std::char::from_u32(start + digit as u32);
158 }
159 }
160 None
161 }
162 _ => self.chars.get(digit).copied(),
163 }
164 }
165
166 /// Decodes a character back to its digit value.
167 ///
168 /// Returns `None` if the character is not in the alphabet.
169 pub fn decode_char(&self, c: char) -> Option<usize> {
170 match self.mode {
171 EncodingMode::ByteRange => {
172 if let Some(start) = self.start_codepoint {
173 let codepoint = c as u32;
174 if codepoint >= start && codepoint < start + 256 {
175 return Some((codepoint - start) as usize);
176 }
177 }
178 None
179 }
180 _ => self.char_to_index.get(&c).copied(),
181 }
182 }
183}
184
185