unicode_bom/
lib.rs

1// Copyright © 2018 Phil Booth
2//
3// Licensed under the Apache License, Version 2.0 (the "License"); you may
4// not use this file except in compliance with the License. You may obtain
5// a copy of the License at:
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12// implied. See the License for the specific language governing
13// permissions and limitations under the License.
14
15//! Detects and classifies
16//! [Unicode byte-order marks](https://en.wikipedia.org/wiki/Byte_order_mark).
17//!
18//! ## Usage
19//!
20//! ```
21//! use unicode_bom::Bom;
22//!
23//! // Detect the UTF-32 (little-endian) BOM in a file on disk
24//! let bom: Bom = "fixtures/utf32-le.txt".parse().unwrap();
25//! assert_eq!(bom, Bom::Utf32Le);
26//! assert_eq!(bom.len(), 4);
27//!
28//! // Detect the UTF-16 (little-endian) BOM in a file on disk
29//! let bom: Bom = "fixtures/utf16-le.txt".parse().unwrap();
30//! assert_eq!(bom, Bom::Utf16Le);
31//! assert_eq!(bom.len(), 2);
32//!
33//! // Detect no BOM in a file on disk
34//! let bom: Bom = "fixtures/ascii.txt".parse().unwrap();
35//! assert_eq!(bom, Bom::Null);
36//! assert_eq!(bom.len(), 0);
37//!
38//! // Detect the BOM in a byte array
39//! let bytes = [0u8, 0u8, 0xfeu8, 0xffu8];
40//! assert_eq!(Bom::from(&bytes[0..]), Bom::Utf32Be);
41//! ```
42
43use std::fmt::{self, Display, Formatter};
44use std::fs::File;
45use std::io::{Error, ErrorKind, Read};
46use std::str::FromStr;
47
48#[cfg(test)]
49mod test;
50
/// Unicode byte-order mark (BOM) abstraction.
///
/// Every variant other than `Bom::Null` names the encoding
/// whose byte-order mark was recognised.
#[derive(Clone, Copy, Debug, Hash, PartialEq)]
pub enum Bom {
    /// Indicates no BOM was detected.
    Null,

    /// Indicates [BOCU-1](https://www.unicode.org/notes/tn6/) BOM was detected.
    Bocu1,

    /// Indicates [GB 18030](https://en.wikipedia.org/wiki/GB_18030) BOM was detected.
    Gb18030,

    /// Indicates [SCSU](https://www.unicode.org/reports/tr6/) BOM was detected.
    Scsu,

    /// Indicates [UTF-EBCDIC](https://www.unicode.org/reports/tr16/) BOM was detected.
    UtfEbcdic,

    /// Indicates [UTF-1](https://en.wikipedia.org/wiki/UTF-1) BOM was detected.
    Utf1,

    /// Indicates [UTF-7](https://tools.ietf.org/html/rfc2152) BOM was detected.
    Utf7,

    /// Indicates [UTF-8](https://tools.ietf.org/html/rfc3629) BOM was detected.
    Utf8,

    /// Indicates [UTF-16](https://tools.ietf.org/html/rfc2781) (big-endian) BOM was detected.
    Utf16Be,

    /// Indicates [UTF-16](https://tools.ietf.org/html/rfc2781) (little-endian) BOM was detected.
    Utf16Le,

    /// Indicates [UTF-32](https://www.unicode.org/reports/tr19/) (big-endian) BOM was detected.
    Utf32Be,

    /// Indicates [UTF-32](https://www.unicode.org/reports/tr19/) (little-endian) BOM was detected.
    Utf32Le,
}
90
91impl Bom {
92    /// Returns the size in bytes of the BOM.
93    pub fn len(&self) -> usize {
94        match *self {
95            Bom::Null => 0,
96            Bom::Bocu1 => 3,
97            Bom::Gb18030 => 4,
98            Bom::Scsu => 3,
99            Bom::UtfEbcdic => 4,
100            Bom::Utf1 => 3,
101            Bom::Utf7 => 4,
102            Bom::Utf8 => 3,
103            Bom::Utf16Be => 2,
104            Bom::Utf16Le => 2,
105            Bom::Utf32Be => 4,
106            Bom::Utf32Le => 4,
107        }
108    }
109}
110
111impl AsRef<str> for Bom {
112    /// Returns a `&str` representation of the BOM type.
113    fn as_ref(&self) -> &str {
114        match *self {
115            Bom::Null => "[not set]",
116            Bom::Bocu1 => "BOCU-1",
117            Bom::Gb18030 => "GB 18030",
118            Bom::Scsu => "SCSU",
119            Bom::UtfEbcdic => "UTF-EBCDIC",
120            Bom::Utf1 => "UTF-1",
121            Bom::Utf7 => "UTF-7",
122            Bom::Utf8 => "UTF-8",
123            Bom::Utf16Be => "UTF-16 (big-endian)",
124            Bom::Utf16Le => "UTF-16 (little-endian)",
125            Bom::Utf32Be => "UTF-32 (big-endian)",
126            Bom::Utf32Le => "UTF-32 (little-endian)",
127        }
128    }
129}
130
131impl AsRef<[u8]> for Bom {
132    /// Returns the BOM byte-array literal.
133    ///
134    /// Note that for UTF-7,
135    /// only the first three bytes of the BOM are returned.
136    /// That's because the last two bits of the fourth byte
137    /// belong to the following character,
138    /// so it's impossible to return the fourth byte
139    /// without further context.
140    /// Possible values for the missing fourth byte
141    /// are `0x38`, `0x39`, `0x2a` and `0x2b`.
142    fn as_ref(&self) -> &[u8] {
143        match *self {
144            Bom::Null => &[],
145            Bom::Bocu1 => &[0xfb, 0xee, 0x28],
146            Bom::Gb18030 => &[0x84, 0x31, 0x95, 0x33],
147            Bom::Scsu => &[0x0e, 0xfe, 0xff],
148            Bom::UtfEbcdic => &[0xdd, 0x73, 0x66, 0x73],
149            Bom::Utf1 => &[0xf7, 0x64, 0x4c],
150            Bom::Utf7 => &[0x2b, 0x2f, 0x76],
151            Bom::Utf8 => &[0xef, 0xbb, 0xbf],
152            Bom::Utf16Be => &[0xfe, 0xff],
153            Bom::Utf16Le => &[0xff, 0xfe],
154            Bom::Utf32Be => &[0, 0, 0xfe, 0xff],
155            Bom::Utf32Le => &[0xff, 0xfe, 0, 0],
156        }
157    }
158}
159
160impl Default for Bom {
161    /// Returns the default/empty BOM type, `Bom::Null`.
162    fn default() -> Self {
163        Bom::Null
164    }
165}
166
167impl Display for Bom {
168    /// Formats the BOM type as a `String`.
169    fn fmt(&self, formatter: &mut Formatter) -> fmt::Result {
170        write!(formatter, "{}", AsRef::<str>::as_ref(self))
171    }
172}
173
// `Bom` consists solely of unit variants and derives `PartialEq`,
// so this marker impl needs no body.
impl Eq for Bom {}
175
// Tests whether a run of bytes inside `$haystack` equals `$needle`.
//
// Forms:
//   compare_tail!(haystack, needle)
//       compares `needle` against the bytes starting at index 1.
//   compare_tail!(haystack, needle, offset)
//       compares `needle` against the bytes starting at `offset`.
//   compare_tail!(haystack, min_len, needle, offset)
//       same, but with an explicit minimum length for `haystack`.
//
// The length test comes first and short-circuits,
// so the indexing is only evaluated once the length test has passed.
macro_rules! compare_tail {
    ($haystack:ident, $needle:expr) => {
        compare_tail!($haystack, $needle.len() + 1, $needle, 1)
    };

    ($haystack:ident, $needle:expr, $offset:expr) => {
        compare_tail!($haystack, $needle.len() + $offset, $needle, $offset)
    };

    ($haystack:ident, $min_len:expr, $needle:expr, $offset:expr) => {
        $haystack.len() >= $min_len && $haystack[$offset..$offset + $needle.len()] == $needle
    };
}
189
190impl From<&[u8]> for Bom {
191    /// Detect the BOM type from a byte array.
192    fn from(slice: &[u8]) -> Self {
193        if slice.len() >= 2 {
194            match slice[0] {
195                0 => {
196                    if compare_tail!(slice, [0, 0xfe, 0xff]) {
197                        return Bom::Utf32Be;
198                    }
199                }
200                0x0e => {
201                    if compare_tail!(slice, [0xfe, 0xff]) {
202                        return Bom::Scsu;
203                    }
204                }
205                0x2b => {
206                    if compare_tail!(slice, 4, [0x2f, 0x76], 1)
207                        && (slice[3] == 0x38
208                            || slice[3] == 0x39
209                            || slice[3] == 0x2b
210                            || slice[3] == 0x2f)
211                    {
212                        return Bom::Utf7;
213                    }
214                }
215                0x84 => {
216                    if compare_tail!(slice, [0x31, 0x95, 0x33]) {
217                        return Bom::Gb18030;
218                    }
219                }
220                0xdd => {
221                    if compare_tail!(slice, [0x73, 0x66, 0x73]) {
222                        return Bom::UtfEbcdic;
223                    }
224                }
225                0xef => {
226                    if compare_tail!(slice, [0xbb, 0xbf]) {
227                        return Bom::Utf8;
228                    }
229                }
230                0xf7 => {
231                    if compare_tail!(slice, [0x64, 0x4c]) {
232                        return Bom::Utf1;
233                    }
234                }
235                0xfb => {
236                    if compare_tail!(slice, [0xee, 0x28]) {
237                        return Bom::Bocu1;
238                    }
239                }
240                0xfe => {
241                    if slice[1] == 0xff {
242                        return Bom::Utf16Be;
243                    }
244                }
245                0xff => {
246                    if slice[1] == 0xfe {
247                        if compare_tail!(slice, [0, 0], 2) {
248                            return Bom::Utf32Le;
249                        }
250
251                        return Bom::Utf16Le;
252                    }
253                }
254                _ => {}
255            }
256        }
257
258        Bom::Null
259    }
260}
261
262impl From<&mut File> for Bom {
263    /// Detect the BOM type from a `File` instance.
264    ///
265    /// Note that I/O errors are swallowed by this method.
266    /// Instead the default type, `Bom::Null`,
267    /// will be returned.
268    fn from(file: &mut File) -> Self {
269        let mut data = [0u8; 4];
270        let mut result = file.read_exact(&mut data);
271
272        if let Err(ref error) = result {
273            if error.kind() == ErrorKind::UnexpectedEof {
274                let short_data = [0u8; 3];
275                result = file.read_exact(&mut data);
276
277                if let Err(ref error) = result {
278                    if error.kind() == ErrorKind::UnexpectedEof {
279                        let short_data = [0u8; 2];
280                        result = file.read_exact(&mut data);
281                        data[0] = short_data[0];
282                        data[1] = short_data[1];
283                    }
284                } else {
285                    data[0] = short_data[0];
286                    data[1] = short_data[1];
287                    data[2] = short_data[2];
288                }
289            }
290        }
291
292        if result.is_ok() {
293            Bom::from(&data[0..])
294        } else {
295            Bom::Null
296        }
297    }
298}
299
300impl FromStr for Bom {
301    /// A `std::io::Error` instance returned by `std::fs::File::open`.
302    type Err = Error;
303
304    /// Parse the BOM type from the file located at `path`.
305    fn from_str(path: &str) -> Result<Self, Self::Err> {
306        let mut file = File::open(path)?;
307        Ok(Bom::from(&mut file))
308    }
309}