string_inspector/
decoding.rs

1//! Things for decoding bytes into strings.
2use colored::*;
3use std::borrow::Cow;
4use encoding::types::EncodingRef;
5
6extern crate encoding;
7
8use encoding::{Encoding, DecoderTrap, EncoderTrap};
9
/// Number of output columns used to display a single byte:
/// two hex digits followed by one separating space (see the `{:02x} ` format
/// used when rendering bytes).
const BYTE_DISPLAY_SIZE: u16 = 3;
11
/// A logical character that has been decoded from a sequence of bytes.
///
/// Values compare equal when both the character and its byte representation match.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DecodedCharacter {
    /// The decoded character itself.
    pub character: char,
    /// The bytes that represent `character` in the encoding it was decoded with.
    pub bytes: Vec<u8>,
}
18
19impl DecodedCharacter {
20    /// The number of columns required to format this character in the output.
21    fn width(&self) -> usize {
22        self.bytes.len() * BYTE_DISPLAY_SIZE as usize
23    }
24
25    /// Convert a raw character into a DecodedCharacter using a particular Encoding.
26    ///
27    /// # Limitations
28    /// It's assumed that `encoding` is the same one used to decode the character.
29    /// We use this to reencode the character, in order to work out which code points
30    /// within the string actually belong to this character. This allows us to display bytes
31    /// and characters/unicode code points side by side. However, if the input is a unicode replacement
32    /// character, that means that there were code points in the input which could not be decoded,
33    /// and this method won't be able to recover those.
34    ///
35    /// # Panics
36    /// Panics if character is unrepresentable in the provided encoding,
37    /// and that encoding cannot encode a unicode replacement character (U+FFFD).
38    fn new(character: char, encoding: &dyn Encoding) -> DecodedCharacter {
39        let bytes_for_character = encoding.encode(&character.to_string(), EncoderTrap::Replace).unwrap();
40        DecodedCharacter { character, bytes: bytes_for_character }
41    }
42
43    /// Format the character in an easy to understand way.
44    /// ASCII characters are rendered normally.
45    /// Tabs, carriage returns and newlines are represented as escape sequences.
46    /// All other characters are rendered as their unicode codepoints.
47    ///
48    /// # Limitations
49    /// This is not guaranteed to work properly if the codepoint in hex is longer than the number of
50    /// bytes used to represent it in the encoding; for example, latin characters in UTF-16.
51    fn format_character(&self) -> String {
52        let char_size = self.width();
53        let character = self.character;
54
55        match character {
56            '\t' | '\r' | '\n' => {
57                let escaped = character.escape_default();
58                format!("{:width$} ", escaped, width = char_size)
59            }
60            '\u{20}'...'\u{7e}' => {
61                format!("{:width$}", character, width = char_size)
62            }
63            _ => {
64                let codepoint = format!("{:02x} ", character as u32);
65                format!("{:width$}", codepoint, width = char_size)
66            }
67        }
68    }
69
70    /// Format the byte representation of the character using hex.
71    fn format_bytes(&self) -> String {
72        let mut buffer = String::new();
73        for byte in self.bytes.iter() {
74            let byte_hex = format!("{:02x} ", byte);
75            buffer.push_str(&byte_hex)
76        }
77        buffer
78    }
79}
80
81/// A string that has been decoded using a particular character encoding.
82pub struct DecodedString {
83    pub encoding: &'static dyn Encoding,
84    pub characters: Vec<DecodedCharacter>
85}
86
87impl DecodedString {
88    /// Decode a sequence of bytes using a particular encoding.
89    ///
90    /// Any characters that cannot be encoded will be represented using unicode replacement characters (U+FFFD).
91    ///
92    /// # Errors
93    /// Returns an error if anything goes wrong with the underlying decoder. This shouldn't actually happen(?)
94    pub fn decode(string: &[u8], encoding: EncodingRef) -> Result<DecodedString, Cow<'static, str>> {
95        match encoding.decode(string, DecoderTrap::Replace) {
96            Ok(result) => {
97                let characters = result.chars().map(|c| DecodedCharacter::new(c, encoding)).collect();
98                Ok(DecodedString {
99                    encoding: encoding,
100                    characters: characters
101                })
102            },
103            Err(msg) => Err(msg)
104        }
105    }
106
107    /// Format the byte representation of the string using hex.
108    pub fn format_bytes(&self) -> String {
109        self.toggle_color(self.characters.iter().map(DecodedCharacter::format_bytes))
110    }
111
112    /// Format the string in an easy to understand way.
113    /// ASCII characters are rendered normally.
114    /// Tabs, carriage returns and newlines are represented as escape sequences.
115    /// All other characters are rendered as their unicode codepoints.
116    ///
117    /// # Limitations
118    /// This is not guaranteed to work properly if codepoints in hex are longer than the number of
119    /// bytes used to represent it in the encoding; for example, latin characters in UTF-16.
120    pub fn format_characters(&self) -> String {
121        self.toggle_color(self.characters.iter().map(DecodedCharacter::format_character))
122    }
123
124    fn toggle_color<I>(&self, iterator: I) -> String
125    where I: Iterator<Item = String>
126    {
127        let mut color_toggle = true;
128        let mut buffer = String::new();
129
130        for string in iterator {
131            if color_toggle {
132                buffer.push_str(&string.green().to_string());
133            } else {
134                buffer.push_str(&string.blue().to_string());
135            }
136            color_toggle = !color_toggle;
137        }
138        buffer
139    }
140
141    /// Convert to a regular string.
142    pub fn to_string(&self) -> String {
143        self.characters.iter().map(|c| c.character).collect()
144    }
145
146    /// Split into chunks so that the output of [format_bytes](#method.format_bytes) and [format_characters](#method.format_characters)
147    /// fit within `max_line_width` characters for each chunk.
148    pub fn wrap_lines(&self, max_line_width: usize) -> Vec<DecodedString> {
149        let mut lines = Vec::new();
150        let mut characters_in_line = Vec::new();
151        let mut line_size = 0;
152
153        for character in self.characters.iter() {
154            let char_output_width = character.width();
155            if line_size + char_output_width > max_line_width as usize {
156                lines.push(DecodedString {characters: characters_in_line, encoding: self.encoding});
157                characters_in_line = Vec::new();
158                line_size = 0;
159            }
160
161            characters_in_line.push(character.clone());
162            line_size += character.width();
163        }
164
165        if characters_in_line.len() > 0 {
166            lines.push(DecodedString {characters: characters_in_line, encoding: self.encoding});
167        }
168
169        lines
170    }
171}
172
#[cfg(test)]
mod tests {
    use super::*;
    use encoding::all::UTF_8;

    /// Turn colouring off and decode `text` as UTF-8, so that the formatted output
    /// can be compared against plain strings.
    fn decode_plain(text: &str) -> DecodedString {
        colored::control::set_override(false);
        DecodedString::decode(text.as_bytes(), UTF_8).unwrap()
    }

    /// Check the byte row and the character row of `decoded` in one go.
    fn assert_rows(decoded: &DecodedString, expected_bytes: &str, expected_characters: &str) {
        assert_eq!(decoded.format_bytes(), expected_bytes);
        assert_eq!(decoded.format_characters(), expected_characters);
    }

    #[test]
    fn ascii_printables() {
        let decoding = decode_plain("!aA1");
        assert_rows(&decoding, "21 61 41 31 ", "!  a  A  1  ");
    }

    #[test]
    fn ascii_escapables() {
        let decoding = decode_plain("\n\r\t");
        assert_rows(&decoding, "0a 0d 09 ", "\\n \\r \\t ");
    }

    #[test]
    fn ascii_non_printables() {
        let decoding = decode_plain("\u{00}\u{7f}");
        assert_rows(&decoding, "00 7f ", "00 7f ");
    }

    #[test]
    fn extra_latin_letters() {
        let decoding = decode_plain("éß");
        assert_rows(&decoding, "c3 a9 c3 9f ", "e9    df    ");
    }

    #[test]
    fn display_width_single_byte() {
        let single = DecodedCharacter { character: 'a', bytes: "a".as_bytes().to_owned() };
        assert_eq!(single.width(), 3);
    }

    #[test]
    fn display_width_two_bytes() {
        let double = DecodedCharacter { character: 'ß', bytes: "ß".as_bytes().to_owned() };
        assert_eq!(double.width(), 6);
    }

    #[test]
    fn line_wrapping_if_it_fits() {
        let decoding = decode_plain("aaaaa");
        assert_rows(&decoding, "61 61 61 61 61 ", "a  a  a  a  a  ");

        let lines = decoding.wrap_lines(15);
        assert_eq!(lines.len(), 1);
        assert_rows(&lines[0], "61 61 61 61 61 ", "a  a  a  a  a  ");
    }

    #[test]
    fn line_wrapping_wraps_to_exact_number_of_lines() {
        let lines = decode_plain("aaaaabbbbb").wrap_lines(15);

        assert_eq!(lines.len(), 2);
        assert_rows(&lines[0], "61 61 61 61 61 ", "a  a  a  a  a  ");
        assert_rows(&lines[1], "62 62 62 62 62 ", "b  b  b  b  b  ");
    }

    #[test]
    fn line_wrapping_wraps_to_inexact_number_of_lines() {
        let lines = decode_plain("aaaaabbbbbcc").wrap_lines(15);

        assert_eq!(lines.len(), 3);
        assert_rows(&lines[0], "61 61 61 61 61 ", "a  a  a  a  a  ");
        assert_rows(&lines[1], "62 62 62 62 62 ", "b  b  b  b  b  ");
        assert_rows(&lines[2], "63 63 ", "c  c  ");
    }
}
273}