// unicode-bom 0.1.0
//
// Unicode byte-order mark detection for files and byte arrays.
// Copyright © 2018 Phil Booth
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at:
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

//! Detects and classifies
//! [Unicode byte-order marks](https://en.wikipedia.org/wiki/Byte_order_mark).
//!
//! ## Usage
//!
//! ```
//! use unicode_bom::Bom;
//!
//! // Detect the BOM in a file on disk
//! assert_eq!(Bom::from("fixtures/ascii.txt"), Bom::Null);
//! assert_eq!(Bom::from("fixtures/utf16-le.txt"), Bom::Utf16Le);
//!
//! // Detect the BOM in a byte array
//! assert_eq!(Bom::from(&[0u8, 0u8, 0xfeu8, 0xffu8][0..]), Bom::Utf32Be);
//! ```

use std::fmt::{self, Display, Formatter};
use std::fs::File;
use std::io::{ErrorKind, Read};

#[cfg(test)]
mod test;

/// The kind of Unicode byte-order mark (BOM) found at the start of some input.
///
/// Each variant names the encoding whose signature was recognised. The byte
/// sequences quoted below are the ones matched by the byte-slice conversion
/// in this module.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Bom {
    /// No recognised BOM is present.
    Null,

    /// A [BOCU-1](https://www.unicode.org/notes/tn6/) BOM (`FB EE 28`).
    Bocu1,

    /// A [GB 18030](https://en.wikipedia.org/wiki/GB_18030) BOM (`84 31 95 33`).
    Gb18030,

    /// An [SCSU](https://www.unicode.org/reports/tr6/) BOM (`0E FE FF`).
    Scsu,

    /// A [UTF-EBCDIC](https://www.unicode.org/reports/tr16/) BOM (`DD 73 66 73`).
    UtfEbcdic,

    /// A [UTF-1](https://en.wikipedia.org/wiki/UTF-1) BOM (`F7 64 4C`).
    Utf1,

    /// A [UTF-7](https://tools.ietf.org/html/rfc2152) BOM
    /// (`2B 2F 76` followed by one of `38`, `39`, `2B` or `2F`).
    Utf7,

    /// A [UTF-8](https://tools.ietf.org/html/rfc3629) BOM (`EF BB BF`).
    Utf8,

    /// A big-endian [UTF-16](https://tools.ietf.org/html/rfc2781) BOM (`FE FF`).
    Utf16Be,

    /// A little-endian [UTF-16](https://tools.ietf.org/html/rfc2781) BOM (`FF FE`).
    Utf16Le,

    /// A big-endian [UTF-32](https://www.unicode.org/reports/tr19/) BOM (`00 00 FE FF`).
    Utf32Be,

    /// A little-endian [UTF-32](https://www.unicode.org/reports/tr19/) BOM (`FF FE 00 00`).
    Utf32Le,
}

impl AsRef<str> for Bom {
    /// Gives the human-readable name of the encoding this BOM identifies.
    ///
    /// `Bom::Null` maps to the placeholder text `"[not set]"`.
    fn as_ref(&self) -> &str {
        match self {
            // Placeholder for "nothing detected".
            Bom::Null => "[not set]",

            // UTF family.
            Bom::Utf1 => "UTF-1",
            Bom::Utf7 => "UTF-7",
            Bom::Utf8 => "UTF-8",
            Bom::Utf16Be => "UTF-16 (big-endian)",
            Bom::Utf16Le => "UTF-16 (little-endian)",
            Bom::Utf32Be => "UTF-32 (big-endian)",
            Bom::Utf32Le => "UTF-32 (little-endian)",
            Bom::UtfEbcdic => "UTF-EBCDIC",

            // Other encodings that define a signature.
            Bom::Bocu1 => "BOCU-1",
            Bom::Gb18030 => "GB 18030",
            Bom::Scsu => "SCSU",
        }
    }
}

impl Default for Bom {
    /// The default value is `Bom::Null`, the variant meaning "no BOM detected".
    fn default() -> Bom {
        Bom::Null
    }
}

impl Display for Bom {
    /// Writes the same text that `as_ref` returns for this BOM type.
    fn fmt(&self, formatter: &mut Formatter) -> fmt::Result {
        formatter.write_str(self.as_ref())
    }
}

// Marker impl on top of the derived `PartialEq`: `Bom` has only unit
// variants, so comparing a value with itself is always `true`.
impl Eq for Bom {}

/// Expands to a boolean expression that is `true` when the named slice holds
/// at least `$min_len` elements and every listed `index => value` pair matches.
///
/// The length test comes first and short-circuits, so the indexing that
/// follows is only evaluated once the slice is known to be long enough
/// (provided every index is below `$min_len`).
macro_rules! compare_slice {
    ($bytes:ident, $min_len:expr, [$($at:expr => $expected:expr),+]) => {
        $bytes.len() >= $min_len
            $(&& $bytes[$at] == $expected)+
    }
}

impl From<&[u8]> for Bom {
    /// Classifies the BOM, if any, at the start of `slice`.
    ///
    /// At most the first four bytes are examined. Input shorter than two
    /// bytes, or input that does not begin with a known signature, yields
    /// `Bom::Null`.
    fn from(slice: &[u8]) -> Self {
        // Every recognised signature is at least two bytes long.
        if slice.len() < 2 {
            return Bom::Null;
        }

        // Dispatch on the lead byte; each guard verifies the remaining bytes.
        match slice[0] {
            0 if compare_slice!(slice, 4, [1 => 0, 2 => 0xfe, 3 => 0xff]) => Bom::Utf32Be,
            0x0e if compare_slice!(slice, 3, [1 => 0xfe, 2 => 0xff]) => Bom::Scsu,
            0x2b if compare_slice!(slice, 4, [1 => 0x2f, 2 => 0x76])
                && [0x38, 0x39, 0x2b, 0x2f].contains(&slice[3]) =>
            {
                Bom::Utf7
            }
            0x84 if compare_slice!(slice, 4, [1 => 0x31, 2 => 0x95, 3 => 0x33]) => Bom::Gb18030,
            0xdd if compare_slice!(slice, 4, [1 => 0x73, 2 => 0x66, 3 => 0x73]) => Bom::UtfEbcdic,
            0xef if compare_slice!(slice, 3, [1 => 0xbb, 2 => 0xbf]) => Bom::Utf8,
            0xf7 if compare_slice!(slice, 3, [1 => 0x64, 2 => 0x4c]) => Bom::Utf1,
            0xfb if compare_slice!(slice, 3, [1 => 0xee, 2 => 0x28]) => Bom::Bocu1,
            0xfe if slice[1] == 0xff => Bom::Utf16Be,
            0xff if slice[1] == 0xfe => {
                // `FF FE` is also the prefix of the UTF-32 LE signature, so
                // the longer match is tried before settling on UTF-16 LE.
                if compare_slice!(slice, 4, [2 => 0, 3 => 0]) {
                    Bom::Utf32Le
                } else {
                    Bom::Utf16Le
                }
            }
            _ => Bom::Null,
        }
    }
}

impl From<&mut File> for Bom {
    /// Detect the BOM type in a `File` instance.
    ///
    /// Up to four bytes are read from the file's current position, which is
    /// advanced by the number of bytes read. Files holding fewer than four
    /// bytes are classified from whatever bytes they do contain, so a file
    /// consisting solely of a two- or three-byte BOM is still recognised.
    ///
    /// Note that I/O errors are swallowed by this method.
    /// If reading from the file fails,
    /// the default type, `Bom::Null`,
    /// will be returned.
    fn from(file: &mut File) -> Self {
        // The longest signature recognised by the byte-slice conversion is
        // four bytes, so nothing beyond that needs to be read.
        let mut data = [0u8; 4];
        let mut filled = 0;

        // `read` may return fewer bytes than requested, so keep going until
        // the buffer is full or end-of-file is reached. `read_exact` is not
        // used because on a short file it fails and leaves the buffer
        // contents unspecified.
        while filled < data.len() {
            match file.read(&mut data[filled..]) {
                Ok(0) => break,
                Ok(count) => filled += count,
                Err(ref error) if error.kind() == ErrorKind::Interrupted => continue,
                Err(_) => return Bom::Null,
            }
        }

        // Classify only the bytes that were actually read.
        Bom::from(&data[..filled])
    }
}

impl From<&str> for Bom {
    /// Opens the file at `path` and classifies the BOM at its start.
    ///
    /// Failures are not reported to the caller: when the file cannot be
    /// opened (it does not exist, or permissions are insufficient, for
    /// instance) the result is the default type, `Bom::Null`.
    fn from(path: &str) -> Self {
        File::open(path)
            .map(|mut file| Bom::from(&mut file))
            .unwrap_or(Bom::Null)
    }
}

// SAFETY: `Bom` is an enum made up solely of unit variants; it holds no
// pointers, references or interior mutability, so moving it to, or sharing
// it with, another thread cannot cause a data race.
// NOTE(review): a type like this should already receive `Send` and `Sync`
// automatically, so these explicit impls look redundant — confirm before
// removing.
unsafe impl Send for Bom {}
unsafe impl Sync for Bom {}