rxing/aztec/encoder/
high_level_encoder.rs

1/*
2 * Copyright 2013 ZXing authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use crate::common::{BitArray, CharacterSet, Result};
18
19use super::{State, Token};
20
21/**
22 * This produces nearly optimal encodings of text into the first-level of
23 * encoding used by Aztec code.
24 *
25 * It uses a dynamic algorithm.  For each prefix of the string, it determines
26 * a set of encodings that could lead to this prefix.  We repeatedly add a
27 * character and generate a new set of optimal encodings until we have read
28 * through the entire input.
29 *
30 * @author Frank Yellin
31 * @author Rustam Abdullaev
32 */
33pub struct HighLevelEncoder {
34    text: Vec<u8>,
35    charset: CharacterSet,
36}
37
38impl HighLevelEncoder {
39    pub const MODE_NAMES: [&'static str; 5] = ["UPPER", "LOWER", "DIGIT", "MIXED", "PUNCT"];
40
41    pub const MODE_UPPER: usize = 0; // 5 bits
42    pub const MODE_LOWER: usize = 1; // 5 bits
43    pub const MODE_DIGIT: usize = 2; // 4 bits
44    pub const MODE_MIXED: usize = 3; // 5 bits
45    pub const MODE_PUNCT: usize = 4; // 5 bits
46
47    // The Latch Table shows, for each pair of Modes, the optimal method for
48    // getting from one mode to another.  In the worst possible case, this can
49    // be up to 14 bits.  In the best possible case, we are already there!
50    // The high half-word of each entry gives the number of bits.
51    // The low half-word of each entry are the actual bits necessary to change
52    pub const LATCH_TABLE: [[u32; 5]; 5] = [
53        [
54            0,
55            (5 << 16) + 28,              // UPPER -> LOWER
56            (5 << 16) + 30,              // UPPER -> DIGIT
57            (5 << 16) + 29,              // UPPER -> MIXED
58            (10 << 16) + (29 << 5) + 30, // UPPER -> MIXED -> PUNCT
59        ],
60        [
61            (9 << 16) + (30 << 4) + 14, // LOWER -> DIGIT -> UPPER
62            0,
63            (5 << 16) + 30,              // LOWER -> DIGIT
64            (5 << 16) + 29,              // LOWER -> MIXED
65            (10 << 16) + (29 << 5) + 30, // LOWER -> MIXED -> PUNCT
66        ],
67        [
68            (4 << 16) + 14,             // DIGIT -> UPPER
69            (9 << 16) + (14 << 5) + 28, // DIGIT -> UPPER -> LOWER
70            0,
71            (9 << 16) + (14 << 5) + 29, // DIGIT -> UPPER -> MIXED
72            (14 << 16) + (14 << 10) + (29 << 5) + 30,
73        ], // DIGIT -> UPPER -> MIXED -> PUNCT
74        [
75            (5 << 16) + 29,              // MIXED -> UPPER
76            (5 << 16) + 28,              // MIXED -> LOWER
77            (10 << 16) + (29 << 5) + 30, // MIXED -> UPPER -> DIGIT
78            0,
79            (5 << 16) + 30, // MIXED -> PUNCT
80        ],
81        [
82            (5 << 16) + 31,              // PUNCT -> UPPER
83            (10 << 16) + (31 << 5) + 28, // PUNCT -> UPPER -> LOWER
84            (10 << 16) + (31 << 5) + 30, // PUNCT -> UPPER -> DIGIT
85            (10 << 16) + (31 << 5) + 29, // PUNCT -> UPPER -> MIXED
86            0,
87        ],
88    ];
89
90    // A reverse mapping from [mode][char] to the encoding for that character
91    // in that mode.  An entry of 0 indicates no mapping exists.
92    pub const CHAR_MAP: [[u8; 256]; 5] = {
93        let mut char_map = [[0u8; 256]; 5];
94        char_map[Self::MODE_UPPER][b' ' as usize] = 1;
95        let mut c = b'A';
96        while c <= b'Z' {
97            char_map[Self::MODE_UPPER][c as usize] = c - b'A' + 2;
98            c += 1;
99        }
100        // for (int c = 'A'; c <= 'Z'; c++) {
101        //   char_map[Self::MODE_UPPER][c] = c - 'A' + 2;
102        // }
103        char_map[Self::MODE_LOWER][b' ' as usize] = 1;
104        let mut c = b'a';
105        while c <= b'z' {
106            char_map[Self::MODE_LOWER][c as usize] = c - b'a' + 2;
107            c += 1;
108        }
109        // for (int c = 'a'; c <= 'z'; c++) {
110        //   char_map[Self::MODE_LOWER][c] = c - 'a' + 2;
111        // }
112        char_map[Self::MODE_DIGIT][b' ' as usize] = 1;
113        let mut c = b'0';
114        while c <= b'9' {
115            char_map[Self::MODE_DIGIT][c as usize] = c - b'0' + 2;
116            c += 1;
117        }
118        // for (int c = '0'; c <= '9'; c++) {
119        //   char_map[Self::MODE_DIGIT][c] = c - '0' + 2;
120        // }
121        char_map[Self::MODE_DIGIT][b',' as usize] = 12;
122        char_map[Self::MODE_DIGIT][b'.' as usize] = 13;
123        let mixed_table = [
124            '\0', ' ', '\u{1}', '\u{2}', '\u{3}', '\u{4}', '\u{5}', '\u{6}', '\u{7}', '\u{8}',
125            '\t', '\n', '\u{000b}', '\u{000c}', '\r', '\u{001b}', '\u{001c}', '\u{001d}',
126            '\u{001e}', '\u{001f}', '@', '\\', '^', '_', '`', '|', '~', '\u{007f}',
127        ];
128        let mut i = 0;
129        while i < mixed_table.len() {
130            char_map[Self::MODE_MIXED][mixed_table[i] as u8 as usize] = i as u8;
131            i += 1;
132        }
133        // for (int i = 0; i < mixedTable.length; i++) {
134        //   CHAR_MAP[MODE_MIXED][mixedTable[i]] = i;
135        // }
136        let punct_table = [
137            b'\0', b'\r', b'\0', b'\0', b'\0', b'\0', b'!', b'\'', b'#', b'$', b'%', b'&', b'\'',
138            b'(', b')', b'*', b'+', b',', b'-', b'.', b'/', b':', b';', b'<', b'=', b'>', b'?',
139            b'[', b']', b'{', b'}',
140        ];
141        let mut i = 0;
142        while i < punct_table.len() {
143            if punct_table[i] > 0u8 {
144                char_map[Self::MODE_PUNCT][punct_table[i] as usize] = i as u8;
145            }
146            i += 1;
147        }
148        // for (int i = 0; i < punctTable.length; i++) {
149        //   if (punctTable[i] > 0) {
150        //     CHAR_MAP[MODE_PUNCT][punctTable[i]] = i;
151        //   }
152        // }
153
154        char_map
155    };
156    // private static final int[][] CHAR_MAP = new int[5][256];
157    // static {
158    //   CHAR_MAP[MODE_UPPER][' '] = 1;
159    //   for (int c = 'A'; c <= 'Z'; c++) {
160    //     CHAR_MAP[MODE_UPPER][c] = c - 'A' + 2;
161    //   }
162    //   CHAR_MAP[MODE_LOWER][' '] = 1;
163    //   for (int c = 'a'; c <= 'z'; c++) {
164    //     CHAR_MAP[MODE_LOWER][c] = c - 'a' + 2;
165    //   }
166    //   CHAR_MAP[MODE_DIGIT][' '] = 1;
167    //   for (int c = '0'; c <= '9'; c++) {
168    //     CHAR_MAP[MODE_DIGIT][c] = c - '0' + 2;
169    //   }
170    //   CHAR_MAP[MODE_DIGIT][','] = 12;
171    //   CHAR_MAP[MODE_DIGIT]['.'] = 13;
172    //   int[] mixedTable = {
173    //       '\0', ' ', '\1', '\2', '\3', '\4', '\5', '\6', '\7', '\b', '\t', '\n',
174    //       '\13', '\f', '\r', '\33', '\34', '\35', '\36', '\37', '@', '\\', '^',
175    //       '_', '`', '|', '~', '\177'
176    //   };
177    //   for (int i = 0; i < mixedTable.length; i++) {
178    //     CHAR_MAP[MODE_MIXED][mixedTable[i]] = i;
179    //   }
180    //   int[] punctTable = {
181    //       '\0', '\r', '\0', '\0', '\0', '\0', '!', '\'', '#', '$', '%', '&', '\'',
182    //       '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?',
183    //       '[', ']', '{', '}'
184    //   };
185    //   for (int i = 0; i < punctTable.length; i++) {
186    //     if (punctTable[i] > 0) {
187    //       CHAR_MAP[MODE_PUNCT][punctTable[i]] = i;
188    //     }
189    //   }
190    // }
191
192    // A map showing the available shift codes.  (The shifts to BINARY are not
193    // shown
194    pub const SHIFT_TABLE: [[i32; 6]; 6] = {
195        // mode shift codes, per table
196        let mut shift_table = [[-1; 6]; 6];
197
198        shift_table[Self::MODE_UPPER][Self::MODE_PUNCT] = 0;
199
200        shift_table[Self::MODE_LOWER][Self::MODE_PUNCT] = 0;
201        shift_table[Self::MODE_LOWER][Self::MODE_UPPER] = 28;
202
203        shift_table[Self::MODE_MIXED][Self::MODE_PUNCT] = 0;
204
205        shift_table[Self::MODE_DIGIT][Self::MODE_PUNCT] = 0;
206        shift_table[Self::MODE_DIGIT][Self::MODE_UPPER] = 15;
207
208        shift_table
209    };
210    // const SHIFT_TABLE : [[u32]]= new int[6][6]; // mode shift codes, per table
211    // static {
212    //   for (int[] table : SHIFT_TABLE) {
213    //     Arrays.fill(table, -1);
214    //   }
215    //   SHIFT_TABLE[MODE_UPPER][MODE_PUNCT] = 0;
216
217    //   SHIFT_TABLE[MODE_LOWER][MODE_PUNCT] = 0;
218    //   SHIFT_TABLE[MODE_LOWER][MODE_UPPER] = 28;
219
220    //   SHIFT_TABLE[MODE_MIXED][MODE_PUNCT] = 0;
221
222    //   SHIFT_TABLE[MODE_DIGIT][MODE_PUNCT] = 0;
223    //   SHIFT_TABLE[MODE_DIGIT][MODE_UPPER] = 15;
224    // }
225
226    pub fn new(text: Vec<u8>) -> Self {
227        Self {
228            text,
229            charset: CharacterSet::ISO8859_1,
230        }
231    }
232
233    pub fn with_charset(text: Vec<u8>, charset: CharacterSet) -> Self {
234        Self { text, charset }
235    }
236
237    /**
238     * @return text represented by this encoder encoded as a {@link BitArray}
239     */
240    pub fn encode(&self) -> Result<BitArray> {
241        let mut initial_state = State::new(Token::new(), Self::MODE_UPPER as u32, 0, 0);
242        //if let Some(eci) = CharacterSetECI::getCharacterSetECI(self.charset) {
243        if self.charset != CharacterSet::ISO8859_1 {
244            //} && eci != CharacterSetECI::Cp1252 {
245            initial_state = initial_state.appendFLGn(self.charset.into())?;
246        }
247        // } else {
248        //     return Err(Exceptions::illegal_argument_with(
249        //         "No ECI code for character set",
250        //     ));
251        // }
252        // if self.charset != null {
253        //   CharacterSetECI eci = CharacterSetECI.getCharacterSetECI(charset);
254        //   if (null == eci) {
255        //     throw new IllegalArgumentException("No ECI code for character set " + charset);
256        //   }
257        //   initialState = initialState.appendFLGn(eci.getValue());
258        // }
259        let mut states = vec![initial_state];
260        let mut index = 0;
261        while index < self.text.len() {
262            // for index in 0..self.text.len() {
263            // for (int index = 0; index < text.length; index++) {
264
265            let next_char = if index + 1 < self.text.len() {
266                self.text[index + 1]
267            } else {
268                0
269            };
270            let pair_code = match self.text[index] {
271                b'\r' if next_char == b'\n' => 2,
272                b'.' if next_char == b' ' => 3,
273                b',' if next_char == b' ' => 4,
274                b':' if next_char == b' ' => 5,
275                _ => 0,
276            };
277
278            if pair_code > 0 {
279                // We have one of the four special PUNCT pairs.  Treat them specially.
280                // Get a new set of states for the two new characters.
281                states = Self::update_state_list_for_pair(states, index as u32, pair_code);
282                index += 1;
283            } else {
284                // Get a new set of states for the new character.
285                states = self.update_state_list_for_char(states, index as u32);
286            }
287            index += 1;
288        }
289
290        // for state in &states {
291        //     dbg!(state.clone().toBitArray(&self.text).to_string());
292        // }
293
294        // We are left with a set of states.  Find the shortest one.
295        let min_state = states
296            .into_iter()
297            .min_by(|a, b| {
298                let diff: i64 = a.getBitCount() as i64 - b.getBitCount() as i64;
299                diff.cmp(&0)
300                // match diff {
301                //     ..0 => Ordering::Less,
302                //     0 => Ordering::Equal,
303                //     0.. => Ordering::Greater,
304                // }
305                // if diff < 0 {
306                //     Ordering::Less
307                // } else if diff == 0 {
308                //     Ordering::Equal
309                // } else {
310                //     Ordering::Greater
311                // }
312                //  a.getBitCount() - b.getBitCount()
313            })
314            .unwrap();
315        // let minState = Collections.min(states, new Comparator<State>() {
316        //   @Override
317        //   public int compare(State a, State b) {
318        //     return a.getBitCount() - b.getBitCount();
319        //   }
320        // });
321        // Convert it to a bit array, and return.
322        // dbg!(min_state.clone().toBitArray(&self.text).to_string());
323
324        min_state.toBitArray(&self.text)
325    }
326
327    // We update a set of states for a new character by updating each state
328    // for the new character, merging the results, and then removing the
329    // non-optimal states.
330    fn update_state_list_for_char(&self, states: Vec<State>, index: u32) -> Vec<State> {
331        let mut result = Vec::new();
332
333        states
334            .into_iter()
335            .for_each(|state| self.update_state_for_char(state, index, &mut result));
336
337        Self::simplify_states(result)
338    }
339
340    // Return a set of states that represent the possible ways of updating this
341    // state for the next character.  The resulting set of states are added to
342    // the "result" list.
343    fn update_state_for_char(&self, state: State, index: u32, result: &mut Vec<State>) {
344        let ch = self.text[index as usize];
345        let char_in_current_table = Self::CHAR_MAP[state.getMode() as usize][ch as usize] > 0;
346        let mut state_no_binary = None;
347        for mode in 0..=Self::MODE_PUNCT {
348            // for (int mode = 0; mode <= MODE_PUNCT; mode++) {
349            let char_in_mode = Self::CHAR_MAP[mode][ch as usize];
350            if char_in_mode > 0 {
351                if state_no_binary.is_none() {
352                    // Only create stateNoBinary the first time it's required.
353                    state_no_binary = Some(state.clone().endBinaryShift(index));
354                }
355                // Try generating the character by latching to its mode
356                if !char_in_current_table
357                    || mode as u32 == state.getMode()
358                    || mode == Self::MODE_DIGIT
359                {
360                    // If the character is in the current table, we don't want to latch to
361                    // any other mode except possibly digit (which uses only 4 bits).  Any
362                    // other latch would be equally successful *after* this character, and
363                    // so wouldn't save any bits.
364                    let latch_state = state_no_binary
365                        .clone()
366                        .unwrap()
367                        .latchAndAppend(mode as u32, char_in_mode as u32);
368                    result.push(latch_state);
369                }
370                // Try generating the character by switching to its mode.
371                if !char_in_current_table && Self::SHIFT_TABLE[state.getMode() as usize][mode] >= 0
372                {
373                    // It never makes sense to temporarily shift to another mode if the
374                    // character exists in the current mode.  That can never save bits.
375                    let shift_state = state_no_binary
376                        .clone()
377                        .unwrap()
378                        .shiftAndAppend(mode as u32, char_in_mode as u32);
379                    result.push(shift_state);
380                }
381            }
382        }
383        if state.getBinaryShiftByteCount() > 0
384            || Self::CHAR_MAP[state.getMode() as usize][ch as usize] == 0
385        {
386            // It's never worthwhile to go into binary shift mode if you're not already
387            // in binary shift mode, and the character exists in your current mode.
388            // That can never save bits over just outputting the char in the current mode.
389            let binary_state = state.addBinaryShiftChar(index);
390            result.push(binary_state);
391        }
392    }
393
394    fn update_state_list_for_pair(states: Vec<State>, index: u32, pairCode: u32) -> Vec<State> {
395        let mut result = Vec::new();
396        for state in states {
397            // for (State state : states) {
398            Self::update_state_for_pair(state, index, pairCode, &mut result);
399        }
400
401        Self::simplify_states(result)
402    }
403
404    fn update_state_for_pair(state: State, index: u32, pair_code: u32, result: &mut Vec<State>) {
405        let state_no_binary = state.clone().endBinaryShift(index);
406        // Possibility 1.  Latch to MODE_PUNCT, and then append this code
407        result.push(
408            state_no_binary
409                .clone()
410                .latchAndAppend(Self::MODE_PUNCT as u32, pair_code),
411        );
412        if state.getMode() != Self::MODE_PUNCT as u32 {
413            // Possibility 2.  Shift to MODE_PUNCT, and then append this code.
414            // Every state except MODE_PUNCT (handled above) can shift
415            result.push(
416                state_no_binary
417                    .clone()
418                    .shiftAndAppend(Self::MODE_PUNCT as u32, pair_code),
419            );
420        }
421        if pair_code == 3 || pair_code == 4 {
422            // both characters are in DIGITS.  Sometimes better to just add two digits
423            let digit_state = state_no_binary
424                .latchAndAppend(Self::MODE_DIGIT as u32, 16 - pair_code) // period or comma in DIGIT
425                .latchAndAppend(Self::MODE_DIGIT as u32, 1); // space in DIGIT
426            result.push(digit_state);
427        }
428        if state.getBinaryShiftByteCount() > 0 {
429            // It only makes sense to do the characters as binary if we're already
430            // in binary mode.
431            let binary_state = state
432                .addBinaryShiftChar(index)
433                .addBinaryShiftChar(index + 1);
434            result.push(binary_state);
435        }
436    }
437
438    fn simplify_states(states: Vec<State>) -> Vec<State> {
439        let mut result = Vec::with_capacity(states.len());
440        // Reserve up front if you have a rough idea of how many you'll keep.
441        // result.reserve(states.into_iter().size_hint().0);
442
443        for new_state in states {
444            // 1) If any existing state already dominates-or-equals `new_state`, skip it.
445            if result
446                .iter()
447                .any(|s: &State| s.isBetterThanOrEqualTo(&new_state))
448            {
449                continue;
450            }
451
452            // 2) Otherwise, remove any states that the new one dominates-or-equals,
453            //    then push `new_state` onto the frontier.
454            result.retain(|s| !new_state.isBetterThanOrEqualTo(s));
455            result.push(new_state);
456        }
457
458        result
459    }
460}