rxing/aztec/encoder/high_level_encoder.rs
1/*
2 * Copyright 2013 ZXing authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use crate::common::{BitArray, CharacterSet, Result};
18
19use super::{State, Token};
20
21/**
22 * This produces nearly optimal encodings of text into the first-level of
23 * encoding used by Aztec code.
24 *
25 * It uses a dynamic algorithm. For each prefix of the string, it determines
26 * a set of encodings that could lead to this prefix. We repeatedly add a
27 * character and generate a new set of optimal encodings until we have read
28 * through the entire input.
29 *
30 * @author Frank Yellin
31 * @author Rustam Abdullaev
32 */
33pub struct HighLevelEncoder {
34 text: Vec<u8>,
35 charset: CharacterSet,
36}
37
38impl HighLevelEncoder {
39 pub const MODE_NAMES: [&'static str; 5] = ["UPPER", "LOWER", "DIGIT", "MIXED", "PUNCT"];
40
41 pub const MODE_UPPER: usize = 0; // 5 bits
42 pub const MODE_LOWER: usize = 1; // 5 bits
43 pub const MODE_DIGIT: usize = 2; // 4 bits
44 pub const MODE_MIXED: usize = 3; // 5 bits
45 pub const MODE_PUNCT: usize = 4; // 5 bits
46
47 // The Latch Table shows, for each pair of Modes, the optimal method for
48 // getting from one mode to another. In the worst possible case, this can
49 // be up to 14 bits. In the best possible case, we are already there!
50 // The high half-word of each entry gives the number of bits.
51 // The low half-word of each entry are the actual bits necessary to change
52 pub const LATCH_TABLE: [[u32; 5]; 5] = [
53 [
54 0,
55 (5 << 16) + 28, // UPPER -> LOWER
56 (5 << 16) + 30, // UPPER -> DIGIT
57 (5 << 16) + 29, // UPPER -> MIXED
58 (10 << 16) + (29 << 5) + 30, // UPPER -> MIXED -> PUNCT
59 ],
60 [
61 (9 << 16) + (30 << 4) + 14, // LOWER -> DIGIT -> UPPER
62 0,
63 (5 << 16) + 30, // LOWER -> DIGIT
64 (5 << 16) + 29, // LOWER -> MIXED
65 (10 << 16) + (29 << 5) + 30, // LOWER -> MIXED -> PUNCT
66 ],
67 [
68 (4 << 16) + 14, // DIGIT -> UPPER
69 (9 << 16) + (14 << 5) + 28, // DIGIT -> UPPER -> LOWER
70 0,
71 (9 << 16) + (14 << 5) + 29, // DIGIT -> UPPER -> MIXED
72 (14 << 16) + (14 << 10) + (29 << 5) + 30,
73 ], // DIGIT -> UPPER -> MIXED -> PUNCT
74 [
75 (5 << 16) + 29, // MIXED -> UPPER
76 (5 << 16) + 28, // MIXED -> LOWER
77 (10 << 16) + (29 << 5) + 30, // MIXED -> UPPER -> DIGIT
78 0,
79 (5 << 16) + 30, // MIXED -> PUNCT
80 ],
81 [
82 (5 << 16) + 31, // PUNCT -> UPPER
83 (10 << 16) + (31 << 5) + 28, // PUNCT -> UPPER -> LOWER
84 (10 << 16) + (31 << 5) + 30, // PUNCT -> UPPER -> DIGIT
85 (10 << 16) + (31 << 5) + 29, // PUNCT -> UPPER -> MIXED
86 0,
87 ],
88 ];
89
90 // A reverse mapping from [mode][char] to the encoding for that character
91 // in that mode. An entry of 0 indicates no mapping exists.
92 pub const CHAR_MAP: [[u8; 256]; 5] = {
93 let mut char_map = [[0u8; 256]; 5];
94 char_map[Self::MODE_UPPER][b' ' as usize] = 1;
95 let mut c = b'A';
96 while c <= b'Z' {
97 char_map[Self::MODE_UPPER][c as usize] = c - b'A' + 2;
98 c += 1;
99 }
100 // for (int c = 'A'; c <= 'Z'; c++) {
101 // char_map[Self::MODE_UPPER][c] = c - 'A' + 2;
102 // }
103 char_map[Self::MODE_LOWER][b' ' as usize] = 1;
104 let mut c = b'a';
105 while c <= b'z' {
106 char_map[Self::MODE_LOWER][c as usize] = c - b'a' + 2;
107 c += 1;
108 }
109 // for (int c = 'a'; c <= 'z'; c++) {
110 // char_map[Self::MODE_LOWER][c] = c - 'a' + 2;
111 // }
112 char_map[Self::MODE_DIGIT][b' ' as usize] = 1;
113 let mut c = b'0';
114 while c <= b'9' {
115 char_map[Self::MODE_DIGIT][c as usize] = c - b'0' + 2;
116 c += 1;
117 }
118 // for (int c = '0'; c <= '9'; c++) {
119 // char_map[Self::MODE_DIGIT][c] = c - '0' + 2;
120 // }
121 char_map[Self::MODE_DIGIT][b',' as usize] = 12;
122 char_map[Self::MODE_DIGIT][b'.' as usize] = 13;
123 let mixed_table = [
124 '\0', ' ', '\u{1}', '\u{2}', '\u{3}', '\u{4}', '\u{5}', '\u{6}', '\u{7}', '\u{8}',
125 '\t', '\n', '\u{000b}', '\u{000c}', '\r', '\u{001b}', '\u{001c}', '\u{001d}',
126 '\u{001e}', '\u{001f}', '@', '\\', '^', '_', '`', '|', '~', '\u{007f}',
127 ];
128 let mut i = 0;
129 while i < mixed_table.len() {
130 char_map[Self::MODE_MIXED][mixed_table[i] as u8 as usize] = i as u8;
131 i += 1;
132 }
133 // for (int i = 0; i < mixedTable.length; i++) {
134 // CHAR_MAP[MODE_MIXED][mixedTable[i]] = i;
135 // }
136 let punct_table = [
137 b'\0', b'\r', b'\0', b'\0', b'\0', b'\0', b'!', b'\'', b'#', b'$', b'%', b'&', b'\'',
138 b'(', b')', b'*', b'+', b',', b'-', b'.', b'/', b':', b';', b'<', b'=', b'>', b'?',
139 b'[', b']', b'{', b'}',
140 ];
141 let mut i = 0;
142 while i < punct_table.len() {
143 if punct_table[i] > 0u8 {
144 char_map[Self::MODE_PUNCT][punct_table[i] as usize] = i as u8;
145 }
146 i += 1;
147 }
148 // for (int i = 0; i < punctTable.length; i++) {
149 // if (punctTable[i] > 0) {
150 // CHAR_MAP[MODE_PUNCT][punctTable[i]] = i;
151 // }
152 // }
153
154 char_map
155 };
156 // private static final int[][] CHAR_MAP = new int[5][256];
157 // static {
158 // CHAR_MAP[MODE_UPPER][' '] = 1;
159 // for (int c = 'A'; c <= 'Z'; c++) {
160 // CHAR_MAP[MODE_UPPER][c] = c - 'A' + 2;
161 // }
162 // CHAR_MAP[MODE_LOWER][' '] = 1;
163 // for (int c = 'a'; c <= 'z'; c++) {
164 // CHAR_MAP[MODE_LOWER][c] = c - 'a' + 2;
165 // }
166 // CHAR_MAP[MODE_DIGIT][' '] = 1;
167 // for (int c = '0'; c <= '9'; c++) {
168 // CHAR_MAP[MODE_DIGIT][c] = c - '0' + 2;
169 // }
170 // CHAR_MAP[MODE_DIGIT][','] = 12;
171 // CHAR_MAP[MODE_DIGIT]['.'] = 13;
172 // int[] mixedTable = {
173 // '\0', ' ', '\1', '\2', '\3', '\4', '\5', '\6', '\7', '\b', '\t', '\n',
174 // '\13', '\f', '\r', '\33', '\34', '\35', '\36', '\37', '@', '\\', '^',
175 // '_', '`', '|', '~', '\177'
176 // };
177 // for (int i = 0; i < mixedTable.length; i++) {
178 // CHAR_MAP[MODE_MIXED][mixedTable[i]] = i;
179 // }
180 // int[] punctTable = {
181 // '\0', '\r', '\0', '\0', '\0', '\0', '!', '\'', '#', '$', '%', '&', '\'',
182 // '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?',
183 // '[', ']', '{', '}'
184 // };
185 // for (int i = 0; i < punctTable.length; i++) {
186 // if (punctTable[i] > 0) {
187 // CHAR_MAP[MODE_PUNCT][punctTable[i]] = i;
188 // }
189 // }
190 // }
191
192 // A map showing the available shift codes. (The shifts to BINARY are not
193 // shown
194 pub const SHIFT_TABLE: [[i32; 6]; 6] = {
195 // mode shift codes, per table
196 let mut shift_table = [[-1; 6]; 6];
197
198 shift_table[Self::MODE_UPPER][Self::MODE_PUNCT] = 0;
199
200 shift_table[Self::MODE_LOWER][Self::MODE_PUNCT] = 0;
201 shift_table[Self::MODE_LOWER][Self::MODE_UPPER] = 28;
202
203 shift_table[Self::MODE_MIXED][Self::MODE_PUNCT] = 0;
204
205 shift_table[Self::MODE_DIGIT][Self::MODE_PUNCT] = 0;
206 shift_table[Self::MODE_DIGIT][Self::MODE_UPPER] = 15;
207
208 shift_table
209 };
210 // const SHIFT_TABLE : [[u32]]= new int[6][6]; // mode shift codes, per table
211 // static {
212 // for (int[] table : SHIFT_TABLE) {
213 // Arrays.fill(table, -1);
214 // }
215 // SHIFT_TABLE[MODE_UPPER][MODE_PUNCT] = 0;
216
217 // SHIFT_TABLE[MODE_LOWER][MODE_PUNCT] = 0;
218 // SHIFT_TABLE[MODE_LOWER][MODE_UPPER] = 28;
219
220 // SHIFT_TABLE[MODE_MIXED][MODE_PUNCT] = 0;
221
222 // SHIFT_TABLE[MODE_DIGIT][MODE_PUNCT] = 0;
223 // SHIFT_TABLE[MODE_DIGIT][MODE_UPPER] = 15;
224 // }
225
226 pub fn new(text: Vec<u8>) -> Self {
227 Self {
228 text,
229 charset: CharacterSet::ISO8859_1,
230 }
231 }
232
233 pub fn with_charset(text: Vec<u8>, charset: CharacterSet) -> Self {
234 Self { text, charset }
235 }
236
237 /**
238 * @return text represented by this encoder encoded as a {@link BitArray}
239 */
240 pub fn encode(&self) -> Result<BitArray> {
241 let mut initial_state = State::new(Token::new(), Self::MODE_UPPER as u32, 0, 0);
242 //if let Some(eci) = CharacterSetECI::getCharacterSetECI(self.charset) {
243 if self.charset != CharacterSet::ISO8859_1 {
244 //} && eci != CharacterSetECI::Cp1252 {
245 initial_state = initial_state.appendFLGn(self.charset.into())?;
246 }
247 // } else {
248 // return Err(Exceptions::illegal_argument_with(
249 // "No ECI code for character set",
250 // ));
251 // }
252 // if self.charset != null {
253 // CharacterSetECI eci = CharacterSetECI.getCharacterSetECI(charset);
254 // if (null == eci) {
255 // throw new IllegalArgumentException("No ECI code for character set " + charset);
256 // }
257 // initialState = initialState.appendFLGn(eci.getValue());
258 // }
259 let mut states = vec![initial_state];
260 let mut index = 0;
261 while index < self.text.len() {
262 // for index in 0..self.text.len() {
263 // for (int index = 0; index < text.length; index++) {
264
265 let next_char = if index + 1 < self.text.len() {
266 self.text[index + 1]
267 } else {
268 0
269 };
270 let pair_code = match self.text[index] {
271 b'\r' if next_char == b'\n' => 2,
272 b'.' if next_char == b' ' => 3,
273 b',' if next_char == b' ' => 4,
274 b':' if next_char == b' ' => 5,
275 _ => 0,
276 };
277
278 if pair_code > 0 {
279 // We have one of the four special PUNCT pairs. Treat them specially.
280 // Get a new set of states for the two new characters.
281 states = Self::update_state_list_for_pair(states, index as u32, pair_code);
282 index += 1;
283 } else {
284 // Get a new set of states for the new character.
285 states = self.update_state_list_for_char(states, index as u32);
286 }
287 index += 1;
288 }
289
290 // for state in &states {
291 // dbg!(state.clone().toBitArray(&self.text).to_string());
292 // }
293
294 // We are left with a set of states. Find the shortest one.
295 let min_state = states
296 .into_iter()
297 .min_by(|a, b| {
298 let diff: i64 = a.getBitCount() as i64 - b.getBitCount() as i64;
299 diff.cmp(&0)
300 // match diff {
301 // ..0 => Ordering::Less,
302 // 0 => Ordering::Equal,
303 // 0.. => Ordering::Greater,
304 // }
305 // if diff < 0 {
306 // Ordering::Less
307 // } else if diff == 0 {
308 // Ordering::Equal
309 // } else {
310 // Ordering::Greater
311 // }
312 // a.getBitCount() - b.getBitCount()
313 })
314 .unwrap();
315 // let minState = Collections.min(states, new Comparator<State>() {
316 // @Override
317 // public int compare(State a, State b) {
318 // return a.getBitCount() - b.getBitCount();
319 // }
320 // });
321 // Convert it to a bit array, and return.
322 // dbg!(min_state.clone().toBitArray(&self.text).to_string());
323
324 min_state.toBitArray(&self.text)
325 }
326
327 // We update a set of states for a new character by updating each state
328 // for the new character, merging the results, and then removing the
329 // non-optimal states.
330 fn update_state_list_for_char(&self, states: Vec<State>, index: u32) -> Vec<State> {
331 let mut result = Vec::new();
332
333 states
334 .into_iter()
335 .for_each(|state| self.update_state_for_char(state, index, &mut result));
336
337 Self::simplify_states(result)
338 }
339
340 // Return a set of states that represent the possible ways of updating this
341 // state for the next character. The resulting set of states are added to
342 // the "result" list.
343 fn update_state_for_char(&self, state: State, index: u32, result: &mut Vec<State>) {
344 let ch = self.text[index as usize];
345 let char_in_current_table = Self::CHAR_MAP[state.getMode() as usize][ch as usize] > 0;
346 let mut state_no_binary = None;
347 for mode in 0..=Self::MODE_PUNCT {
348 // for (int mode = 0; mode <= MODE_PUNCT; mode++) {
349 let char_in_mode = Self::CHAR_MAP[mode][ch as usize];
350 if char_in_mode > 0 {
351 if state_no_binary.is_none() {
352 // Only create stateNoBinary the first time it's required.
353 state_no_binary = Some(state.clone().endBinaryShift(index));
354 }
355 // Try generating the character by latching to its mode
356 if !char_in_current_table
357 || mode as u32 == state.getMode()
358 || mode == Self::MODE_DIGIT
359 {
360 // If the character is in the current table, we don't want to latch to
361 // any other mode except possibly digit (which uses only 4 bits). Any
362 // other latch would be equally successful *after* this character, and
363 // so wouldn't save any bits.
364 let latch_state = state_no_binary
365 .clone()
366 .unwrap()
367 .latchAndAppend(mode as u32, char_in_mode as u32);
368 result.push(latch_state);
369 }
370 // Try generating the character by switching to its mode.
371 if !char_in_current_table && Self::SHIFT_TABLE[state.getMode() as usize][mode] >= 0
372 {
373 // It never makes sense to temporarily shift to another mode if the
374 // character exists in the current mode. That can never save bits.
375 let shift_state = state_no_binary
376 .clone()
377 .unwrap()
378 .shiftAndAppend(mode as u32, char_in_mode as u32);
379 result.push(shift_state);
380 }
381 }
382 }
383 if state.getBinaryShiftByteCount() > 0
384 || Self::CHAR_MAP[state.getMode() as usize][ch as usize] == 0
385 {
386 // It's never worthwhile to go into binary shift mode if you're not already
387 // in binary shift mode, and the character exists in your current mode.
388 // That can never save bits over just outputting the char in the current mode.
389 let binary_state = state.addBinaryShiftChar(index);
390 result.push(binary_state);
391 }
392 }
393
394 fn update_state_list_for_pair(states: Vec<State>, index: u32, pairCode: u32) -> Vec<State> {
395 let mut result = Vec::new();
396 for state in states {
397 // for (State state : states) {
398 Self::update_state_for_pair(state, index, pairCode, &mut result);
399 }
400
401 Self::simplify_states(result)
402 }
403
404 fn update_state_for_pair(state: State, index: u32, pair_code: u32, result: &mut Vec<State>) {
405 let state_no_binary = state.clone().endBinaryShift(index);
406 // Possibility 1. Latch to MODE_PUNCT, and then append this code
407 result.push(
408 state_no_binary
409 .clone()
410 .latchAndAppend(Self::MODE_PUNCT as u32, pair_code),
411 );
412 if state.getMode() != Self::MODE_PUNCT as u32 {
413 // Possibility 2. Shift to MODE_PUNCT, and then append this code.
414 // Every state except MODE_PUNCT (handled above) can shift
415 result.push(
416 state_no_binary
417 .clone()
418 .shiftAndAppend(Self::MODE_PUNCT as u32, pair_code),
419 );
420 }
421 if pair_code == 3 || pair_code == 4 {
422 // both characters are in DIGITS. Sometimes better to just add two digits
423 let digit_state = state_no_binary
424 .latchAndAppend(Self::MODE_DIGIT as u32, 16 - pair_code) // period or comma in DIGIT
425 .latchAndAppend(Self::MODE_DIGIT as u32, 1); // space in DIGIT
426 result.push(digit_state);
427 }
428 if state.getBinaryShiftByteCount() > 0 {
429 // It only makes sense to do the characters as binary if we're already
430 // in binary mode.
431 let binary_state = state
432 .addBinaryShiftChar(index)
433 .addBinaryShiftChar(index + 1);
434 result.push(binary_state);
435 }
436 }
437
438 fn simplify_states(states: Vec<State>) -> Vec<State> {
439 let mut result = Vec::with_capacity(states.len());
440 // Reserve up front if you have a rough idea of how many you'll keep.
441 // result.reserve(states.into_iter().size_hint().0);
442
443 for new_state in states {
444 // 1) If any existing state already dominates-or-equals `new_state`, skip it.
445 if result
446 .iter()
447 .any(|s: &State| s.isBetterThanOrEqualTo(&new_state))
448 {
449 continue;
450 }
451
452 // 2) Otherwise, remove any states that the new one dominates-or-equals,
453 // then push `new_state` onto the frontier.
454 result.retain(|s| !new_state.isBetterThanOrEqualTo(s));
455 result.push(new_state);
456 }
457
458 result
459 }
460}