display_bytes/
lib.rs

1// Copyright 2017 Austin Bonander
2//
3// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5// http://opensource.org/licenses/MIT>, at your option. This file may not be
6// copied, modified, or distributed except according to those terms.
7//! Human-readable display of byte sequences.
8//!
9//! Supports printing of both UTF-8 and ASCII-only sequences.
10//!
//! For easy usage, see the free functions `display_bytes()` and `display_bytes_string()`
//! in this crate. For more control over formatting, see the constants in this crate or build
//! an instance of `DisplayBytesConfig` yourself.
14//!
15//! ```rust
16//! extern crate display_bytes;
17//! 
18//! use display_bytes::{display_bytes, display_bytes_string};
19//! 
20//! fn main() {
21//!     let bytes = b"Hello, world!\x89\x90\xAB\xCD";
22//!     println!("{:?}", bytes);
23//!     println!("{}", display_bytes(bytes));
24//!     assert_eq!(display_bytes_string(bytes),
25//!                "Hello, world! {{ 89 90 AB CD }} ");
26//! }
27//! ```
28#![warn(missing_docs)]
29use std::borrow::Cow;
30use std::cell::Cell;
31use std::{fmt, ops, str};
32
33mod base64;
34mod hex;
35
36pub use base64::FormatBase64;
37pub use hex::{DEFAULT_HEX, FormatHex};
38use std::fmt::Write;
39
/// Holds a `T` either by shared reference or by value.
#[derive(Clone, Debug)]
enum MaybeOwned<'a, T>
where
    T: 'a,
{
    /// A `T` borrowed for the lifetime `'a`.
    Borrowed(&'a T),
    /// A `T` stored inline.
    Owned(T),
}
45
46impl<'a, T: 'a> ops::Deref for MaybeOwned<'a, T> {
47    type Target = T;
48
49    fn deref(&self) -> &T {
50        match *self {
51            MaybeOwned::Borrowed(b) => b,
52            MaybeOwned::Owned(ref o) => o,
53        }
54    }
55}
56
57impl<'a, T: 'a> From<&'a T> for MaybeOwned<'a, T> {
58    fn from(refr: &'a T) -> Self { MaybeOwned::Borrowed(refr) }
59}
60
61impl<'a, T: 'a> From<T> for MaybeOwned<'a, T> {
62    fn from(owned: T) -> Self { MaybeOwned::Owned(owned) }
63}
64
65/// Prints byte sections with hexadecimal bytes wrapped in ` {{ }} `. Prints only ASCII sequences.
66pub const HEX_ASCII: DisplayBytesConfig<'static, FormatHex<'static>> = DisplayBytesConfig {
67    delim: [" {{ ", " }} "],
68    ascii_only: true,
69    min_str_len: 6,
70    print_terms: true,
71    invert_delims: false,
72    escape_ctl: false,
73    byte_format: DEFAULT_HEX,
74};
75
76/// Prints byte sections with hexadecimal bytes wrapped in ` {{ }} `. Prints all valid UTF-8 strings.
77pub const HEX_UTF8: DisplayBytesConfig<'static, FormatHex<'static>> = DisplayBytesConfig {
78    delim: [" {{ ", " }} "],
79    ascii_only: false,
80    min_str_len: 6,
81    print_terms: true,
82    invert_delims: false,
83    escape_ctl: false,
84    byte_format: DEFAULT_HEX
85};
86
87/// Prints byte sections as Base-64 wrapped in ` {{ }} `. Prints only ASCII sequences.
88///
89/// Provided as a static so it may be used by-reference.
90pub const BASE64_ASCII: DisplayBytesConfig<'static, FormatBase64> = DisplayBytesConfig {
91    delim: [" {{ ", " }} "],
92    ascii_only: true,
93    min_str_len: 6,
94    print_terms: true,
95    invert_delims: false,
96    escape_ctl: false,
97    byte_format: FormatBase64,
98};
99
100/// Prints byte sections as Base-64 wrapped in ` {{ }} `. Prints all valid UTF-8 strings.
101pub const BASE64_UTF8: DisplayBytesConfig<'static, FormatBase64> = DisplayBytesConfig {
102    delim: [" {{ ", " }} "],
103    ascii_only: false,
104    min_str_len: 6,
105    print_terms: true,
106    invert_delims: false,
107    escape_ctl: false,
108    byte_format: FormatBase64,
109};
110
/// Settings that control how `DisplayBytes` renders a byte slice.
///
/// Start from one of the consts in this module and adjust it with the builder methods.
#[derive(Clone, Debug)]
pub struct DisplayBytesConfig<'d, F> {
    /// Opening and closing delimiter, in that order.
    delim: [&'d str; 2],
    /// Decode text as ASCII only (`true`) or as UTF-8 (`false`).
    ascii_only: bool,
    /// Minimum length, in bytes, for a decoded run to be printed as text.
    min_str_len: usize,
    /// Print decoded runs at the start and end of the input regardless of `min_str_len`.
    print_terms: bool,
    /// Put the delimiters around decoded text instead of around byte sections.
    invert_delims: bool,
    /// Print ASCII control characters in escaped form.
    escape_ctl: bool,
    /// Formatter used for the non-decodable byte sections.
    byte_format: F,
}
124
125impl Default for DisplayBytesConfig<'static, FormatHex<'static>> {
126    fn default() -> Self {
127        HEX_UTF8.clone()
128    }
129}
130
131impl Default for DisplayBytesConfig<'static, FormatBase64> {
132    fn default() -> Self { BASE64_UTF8.clone() }
133}
134
135impl<'d, F> DisplayBytesConfig<'d, F> {
136    /// Set the type used to format byte sequences.
137    pub fn byte_format<F_: ByteFormat>(self, format: F_) -> DisplayBytesConfig<'d, F_> {
138        DisplayBytesConfig {
139            delim: self.delim,
140            ascii_only: self.ascii_only,
141            min_str_len: self.min_str_len,
142            print_terms: self.print_terms,
143            invert_delims: self.invert_delims,
144            escape_ctl: self.escape_ctl,
145            byte_format: format,
146        }
147    }
148
149    /// Get a mutable reference to the current `ByteFormat`.
150    pub fn byte_format_mut(&mut self) -> &mut F {
151        &mut self.byte_format
152    }
153
154    /// Set the pair of delimiters used to wrap byte sequences in the formatted stream.
155    ///
156    /// Note that this can change the lifetime bound.
157    pub fn delimiters<'d_>(self, delimiters: [&'d_ str; 2]) -> DisplayBytesConfig<'d_, F> {
158        DisplayBytesConfig {
159            delim: delimiters,
160            ascii_only: self.ascii_only,
161            min_str_len: self.min_str_len,
162            print_terms: self.print_terms,
163            invert_delims: self.invert_delims,
164            escape_ctl: self.escape_ctl,
165            byte_format: self.byte_format
166        }
167    }
168
169    /// Get a mutable reference to the current pair of delimiters.
170    pub fn delimiters_mut(&mut self) -> &mut [&'d str; 2] {
171        &mut self.delim
172    }
173
174    /// If set to `true`, only displays ASCII byte sequences (bytes in `[0x00, 0x7F]`).
175    ///
176    /// Otherwise, displays all valid UTF-8 sequences at least `min_str_len` bytes long.
177    pub fn ascii_only(self, ascii_only: bool) -> Self {
178        DisplayBytesConfig { ascii_only, ..self }
179    }
180
181    /// The minimum number of *bytes* in length that a valid string sequence
182    /// must be to be displayed.
183    ///
184    /// Strings shorter than this length will be included in the nearest byte sequence. Use this
185    /// to avoid extra noise from random decodable characters splitting byte sequences.
186    ///
187    /// ## Note
188    /// This does not affect byte sequences that can be completely decoded. If `print_terminators`
189    /// is set, this also will not affect strings at the beginning or at the end of the byte
190    /// slice (e.g. valid strings at the start and end will be printed regardless of length).
191    pub fn min_str_len(self, min_str_len: usize) -> Self {
192        DisplayBytesConfig { min_str_len, ..self }
193    }
194
195    /// If set to `true`, valid strings at the start and end of a byte slice will
196    /// be printed regardless of their length relative to `min_str_len`.
197    pub fn print_terminators(self, print_terminators: bool) -> Self {
198        DisplayBytesConfig{ print_terms: print_terminators, .. self }
199    }
200
201    /// If set to `true`, control characters will be printed in their escaped form (`\n`)
202    /// instead of printed directly.
203    pub fn escape_control(self, escape_ctl: bool) -> Self {
204        DisplayBytesConfig{ escape_ctl, .. self }
205    }
206
207    /// If set to `true`, wraps decoded strings in the given delimiters
208    /// rather than byte sequences.
209    pub fn invert_delimiters(self, invert_delimiters: bool) -> Self {
210        DisplayBytesConfig { invert_delims: invert_delimiters, .. self }
211    }
212}
213
214impl<'d, F: ByteFormat> DisplayBytesConfig<'d, F> {
215    fn valid_subseq<'b>(&self, bytes: &'b [u8]) -> Option<(&'b str, &'b [u8])> {
216        match self.try_convert(bytes) {
217            Ok(all_good) => Some((all_good, &[])),
218            Err(valid_end) if valid_end > 0 =>
219                Some((assume_utf8(&bytes[..valid_end]), &bytes[valid_end..])),
220            _ => None,
221        }
222    }
223
224    fn try_convert<'b>(&self, bytes: &'b [u8]) -> Result<&'b str, usize> {
225        if self.ascii_only {
226            if bytes.is_ascii() {
227                Ok(assume_utf8(bytes))
228            } else {
229                Err(bytes.iter().position(|b| !b.is_ascii()).unwrap_or(0))
230            }
231        } else {
232            str::from_utf8(bytes).map_err(|e| e.valid_up_to())
233        }
234    }
235
236    fn next_valid_idx(&self, bytes: &[u8]) -> Option<usize> {
237        if self.ascii_only {
238            bytes.iter().position(u8::is_ascii)
239        } else {
240            next_valid_idx(bytes)
241        }
242    }
243
244    fn next_valid_subseq<'b>(&self, bytes: &'b [u8]) -> Option<(&'b [u8], &'b str, &'b [u8])> {
245        let mut start = 0;
246
247        while let Some(next_valid) = self.next_valid_idx(&bytes[start..]) {
248            start += next_valid;
249
250            if let Some((valid, after)) = self.valid_subseq(&bytes[start..]) {
251                // We handle this here so we don't need to do special handling of delimiters
252                // in `DisplayBytes::fmt()`
253                if valid.len() >= self.min_str_len || (after.is_empty() && self.print_terms) {
254                    return Some((&bytes[..start], valid, after));
255                }
256            }
257
258            start += 1;
259        }
260
261        None
262    }
263
264    /// Attempt to convert `bytes` to a string (an ASCII-only string if `ascii_only` is set,
265    /// UTF-8 otherwise), or otherwise format `bytes` to a string using the properties
266    /// in this configuration.
267    pub fn display_bytes_string<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> where 'd: 'b, F: 'b {
268        match self.try_convert(bytes) {
269            Ok(s) => s.into(),
270            Err(valid_end) => DisplayBytes {
271                bytes, config: self.into(), valid_end: Some(valid_end).into(),
272            }.to_string().into(),
273        }
274    }
275
276    /// Get a type that implements `Display` which will format `bytes` to an output stream
277    /// using the properties in this configuration.
278    pub fn ref_display_bytes<'b>(&'b self, bytes: &'b [u8]) -> DisplayBytes<'b, F> {
279        DisplayBytes {
280            bytes,
281            valid_end: Cell::new(None),
282            config: self.into(),
283        }
284    }
285}
286
287
288impl<'d, F: ByteFormat> DisplayBytesConfig<'d, F> {
289    /// Get a type that implements `Display` which will format `bytes` to an output stream
290    /// using the properties in this configuration.
291    pub fn display_bytes<'b>(self, bytes: &'b [u8]) -> DisplayBytes<'b, F> where 'd: 'b {
292        DisplayBytes {
293            bytes,
294            valid_end: Cell::new(None),
295            config: self.into(),
296        }
297    }
298}
299
/// A strategy for rendering raw byte sequences in a human-readable form.
pub trait ByteFormat {
    /// Write a human-readable encoding of `bytes` to `f`.
    fn fmt_bytes(&self, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Result;

    /// Encode `bytes` via `fmt_bytes()` and collect the output into a `String`.
    ///
    /// Not used directly except for testing. However, you may find it useful.
    fn bytes_to_string(&self, bytes: &[u8]) -> String {
        // `fmt_bytes()` needs a `Formatter`, which can only be obtained through a
        // `Display` implementation, hence this throwaway wrapper.
        struct Adapter<'a, B: ?Sized + 'a> {
            bytes: &'a [u8],
            format: &'a B,
        }

        impl<'a, B: ByteFormat + ?Sized + 'a> fmt::Display for Adapter<'a, B> {
            fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
                self.format.fmt_bytes(self.bytes, f)
            }
        }

        let adapter = Adapter { bytes, format: self };
        adapter.to_string()
    }
}
320
321fn next_valid_idx(bytes: &[u8]) -> Option<usize> {
322    if bytes.len() < 4 {
323        // Check each byte to see if it's a valid starting idx
324        (0 .. bytes.len()).position(|start| starts_valid(&bytes[start ..]))
325    } else {
326        // only need to check 4-byte sliding window, anything else is overkill
327        bytes.windows(4).position(starts_valid)
328            // Check the last 3 bytes
329            .or_else(|| next_valid_idx(&bytes[bytes.len() - 3 ..]))
330    }
331}
332
/// Report whether `bytes` is valid UTF-8 as a whole, or at least begins with one or
/// more bytes of valid UTF-8.
fn starts_valid(bytes: &[u8]) -> bool {
    str::from_utf8(bytes)
        .map(|_| true)
        .unwrap_or_else(|err| err.valid_up_to() > 0)
}
339
340
341/// Attempt to convert the byte slice to a string, or else format it to a string using the default
342/// `DisplayBytesConfig`.
343///
344/// All string-decodable sequences of useful length will be displayed as they are, and all
345/// non-decodable byte sequences will be printed in a human-readable format.
346///
347/// The format is unspecified. If you want to specify a format, use `DisplayBytesConfig` directly
348/// or one of the statics in the crate root.
349pub fn display_bytes_string(bytes: &[u8]) -> Cow<str> {
350    HEX_UTF8.display_bytes_string(bytes)
351}
352
353/// Wrap a byte slice in an adapter which implements `Display`.
354///
355/// This adapter will print any string-decodable sequences of useful length in the byte stream,
356/// and all non-decodable byte sequences in a human-readable format.
357///
358/// The format is deliberately unspecified in the type. If you want to specify a format, use
359/// `DisplayBytesConfig` directly or one of the statics in the crate root.
360pub fn display_bytes<'b>(bytes: &'b [u8]) -> impl fmt::Display + 'b {
361    HEX_ASCII.display_bytes(bytes)
362}
363
364/// A wrapper around a byte sequence which implements `Display`.
365///
366/// Non-decodable byte sequences will be printed in human-readable representation based
367/// on the byte format `F`. Use `DisplayBytesConfig` to get an instance of this type.
368#[derive(Debug)]
369pub struct DisplayBytes<'b, F: 'b> {
370    bytes: &'b [u8],
371    valid_end: Cell<Option<usize>>,
372    config: MaybeOwned<'b, DisplayBytesConfig<'b, F>>,
373}
374
375impl<'b, F> DisplayBytes<'b, F> {
376    fn maybe_escape(&self, str: &str, f: &mut fmt::Formatter) -> fmt::Result {
377        if self.config.escape_ctl {
378            for c in str.chars() {
379                if c.is_ascii_control() {
380                    for c in c.escape_default() {
381                        f.write_char(c)?;
382                    }
383                } else {
384                    f.write_char(c)?;
385                }
386            }
387
388            Ok(())
389        } else {
390            f.write_str(str)
391        }
392    }
393}
394
395impl<'b, F: ByteFormat + 'b> fmt::Display for DisplayBytes<'b, F> {
396    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
397        let maybe_valid = self.valid_end.get()
398            .map(|valid_end| {
399                let (valid, rest) = self.bytes.split_at(valid_end);
400                (assume_utf8(valid), rest)
401            })
402            .or_else(|| self.config.valid_subseq(self.bytes));
403
404        // We handle this here because this is the only time we *know* we're at the
405        // start of the byte sequence
406        let accept_start = |s: &str| self.config.print_terms ||
407                                        s.len() >= self.config.min_str_len;
408
409        let mut rem = match maybe_valid {
410            Some((valid, rem)) if accept_start(valid) => {
411                // Memoize the first valid sequence of the bytes
412                self.valid_end.set(Some(valid.len()));
413
414                if self.config.invert_delims {
415                    f.write_str(self.config.delim[0])?;
416                }
417
418                self.maybe_escape(valid, f)?;
419
420                if self.config.invert_delims {
421                    f.write_str(self.config.delim[1])?;
422                }
423
424                rem
425            },
426            _ => {
427                // Memoize that the byte sequence doesn't start valid
428                self.valid_end.set(Some(0));
429                self.bytes
430            }
431        };
432
433        while let Some((before, valid, after)) = self.config.next_valid_subseq(rem) {
434            if !self.config.invert_delims {
435                f.write_str(self.config.delim[0])?;
436                self.config.byte_format.fmt_bytes(before, f)?;
437                f.write_str(self.config.delim[1])?;
438                self.maybe_escape(valid, f)?;
439            } else {
440                self.config.byte_format.fmt_bytes(before, f)?;
441                f.write_str(self.config.delim[0])?;
442                self.maybe_escape(valid, f)?;
443                f.write_str(self.config.delim[1])?;
444            }
445
446
447            rem = after;
448        }
449
450        if !rem.is_empty() {
451            if !self.config.invert_delims {
452                f.write_str(self.config.delim[0])?;
453            }
454
455            self.config.byte_format.fmt_bytes(rem, f)?;
456
457            if !self.config.invert_delims {
458                f.write_str(self.config.delim[1])?;
459            }
460        }
461
462        Ok(())
463    }
464}
465
/// Reinterpret `bytes` as a `&str` that the caller has already established to be valid UTF-8.
///
/// In builds with debug assertions enabled, the bytes are re-validated and an invalid
/// sequence panics with the decoding error and a lossy rendering of the input. In builds
/// without debug assertions, the conversion is unchecked.
fn assume_utf8(bytes: &[u8]) -> &str {
    // `debug_assertions` is the cfg the compiler sets for debug builds; a bare `debug`
    // cfg is never set, which would leave the checked branch permanently dead.
    if cfg!(debug_assertions) {
        str::from_utf8(bytes).unwrap_or_else(|e|
            panic!("{}; lossy conversion: {}", e, String::from_utf8_lossy(bytes))
        )
    } else {
        // SAFETY: callers only pass slices they have already validated as UTF-8
        // (or as pure ASCII, which is a subset of UTF-8).
        unsafe { str::from_utf8_unchecked(bytes) }
    }
}
476
#[test]
fn basic_test() {
    let config = &HEX_UTF8;

    // (input, expected rendering)
    let cases: [(&[u8], &str); 4] = [
        (&b"Hello, world!"[..], "Hello, world!"),
        (&b"Hello,\xAB\xCD\xEF"[..], "Hello, {{ AB CD EF }} "),
        (&b"\xF0o\xBAr"[..], " {{ F0 6F BA }} r"),
        // test of `min_str_len`, note that the "r" at the end of `\xF0o\xBAr` is printed
        // because it is part of a string of valid length ("r foobar "), but the "o" between
        // 0xF0 and 0xBA is considered part of the byte sequence.
        (&b"\xF0o\xBAr foobar\xAB\xCD\xEF"[..], " {{ F0 6F BA }} r foobar {{ AB CD EF }} "),
    ];

    for &(input, expected) in cases.iter() {
        assert_eq!(config.display_bytes_string(input), expected);
    }
}
489
#[test]
fn test_memoization() {
    let adapter = HEX_UTF8.display_bytes(b"Hello,\xAB\xCD\xEF");

    // Formatting the same adapter twice must give the same output both times.
    for _ in 0..2 {
        assert_eq!(adapter.to_string(), "Hello, {{ AB CD EF }} ");
    }
}
496
#[test]
fn test_print_terminators() {
    let input = b"ab\xCD \xEFgh";

    // Terminators enabled (the default): the short runs at both ends stay text.
    let with_terms = HEX_UTF8.display_bytes(input);
    assert_eq!(with_terms.to_string(), "ab {{ CD 20 EF }} gh");

    // Terminators disabled: the short runs are folded into the byte section.
    let no_terms_config = HEX_UTF8.clone().print_terminators(false);
    let without_terms = no_terms_config.display_bytes(input);
    assert_eq!(without_terms.to_string(), " {{ 61 62 CD 20 EF 67 68 }} ");
}
507
#[test]
fn test_invert_delims() {
    let input = b"\x80\x90Hello, world!\xAB\xCD";

    // Inverted: the delimiters wrap the decoded text.
    let inverted_config = HEX_UTF8.clone().invert_delimiters(true);
    let inverted = inverted_config.display_bytes(input);
    assert_eq!(inverted.to_string(), "80 90 {{ Hello, world! }} AB CD");

    // Default: the delimiters wrap the byte sections.
    let regular = HEX_UTF8.display_bytes(input);
    assert_eq!(regular.to_string(), " {{ 80 90 }} Hello, world! {{ AB CD }} ");
}
518}