Skip to main content

bom_strip/
lib.rs

1//! # bom-strip
2//!
3//! Strip UTF-8/16/32 BOMs and stray U+FEFF code points from text.
4//!
5//! A leading byte order mark breaks `serde_json::from_str`, hash-based
6//! deduplication, and config parsers that don't allow leading
7//! whitespace. This crate gives you four small functions:
8//!
9//! - [`strip_str`] — strip a leading U+FEFF from a `&str`.
10//! - [`strip_all`] — strip every U+FEFF in the input, not just leading.
11//! - [`strip_bytes`] — strip a leading UTF-8 / UTF-16 LE/BE / UTF-32
12//!   LE/BE BOM from a `&[u8]`.
13//! - [`detect_bom`] — identify which BOM (if any) leads `&[u8]`.
14//!
15//! ## Example
16//!
17//! ```
18//! use bom_strip::{strip_str, strip_bytes, detect_bom, Bom};
19//!
20//! assert_eq!(strip_str("\u{FEFF}hello"), "hello");
21//! assert_eq!(strip_bytes(&[0xEF, 0xBB, 0xBF, b'h', b'i']), &[b'h', b'i']);
22//! assert_eq!(detect_bom(&[0xFF, 0xFE, b'a', 0]), Some(Bom::Utf16Le));
23//! ```
24
25#![deny(missing_docs)]
26
/// The kind of byte order mark (BOM) recognised at the start of a byte stream.
///
/// Each variant's documentation gives the exact byte signature it stands for.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Bom {
    /// `EF BB BF` (UTF-8)
    Utf8,
    /// `FE FF` (UTF-16 big-endian)
    Utf16Be,
    /// `FF FE` (UTF-16 little-endian)
    Utf16Le,
    /// `00 00 FE FF` (UTF-32 big-endian)
    Utf32Be,
    /// `FF FE 00 00` (UTF-32 little-endian; shares its first two bytes with
    /// the UTF-16 LE signature, so it must be tested for first)
    Utf32Le,
}

impl Bom {
    /// Length of this BOM's byte signature, in bytes.
    ///
    /// Returns 3 for UTF-8, 2 for either UTF-16 variant and 4 for either
    /// UTF-32 variant.
    pub fn len(self) -> usize {
        match self {
            Bom::Utf8 => 3,
            Bom::Utf16Be | Bom::Utf16Le => 2,
            Bom::Utf32Be | Bom::Utf32Le => 4,
        }
    }

    /// Always returns `false`: every BOM signature is at least two bytes long.
    ///
    /// Provided so the type follows the conventional `len` / `is_empty`
    /// pairing expected of anything that exposes a public `len`.
    pub fn is_empty(self) -> bool {
        false
    }
}
52
53/// Detect which BOM (if any) leads `b`.
54pub fn detect_bom(b: &[u8]) -> Option<Bom> {
55    if b.starts_with(&[0xEF, 0xBB, 0xBF]) {
56        return Some(Bom::Utf8);
57    }
58    // 4-byte BOMs must be checked before 2-byte to avoid misidentifying
59    // a UTF-32 LE BOM (`FF FE 00 00`) as a UTF-16 LE BOM (`FF FE`).
60    if b.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) {
61        return Some(Bom::Utf32Le);
62    }
63    if b.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) {
64        return Some(Bom::Utf32Be);
65    }
66    if b.starts_with(&[0xFE, 0xFF]) {
67        return Some(Bom::Utf16Be);
68    }
69    if b.starts_with(&[0xFF, 0xFE]) {
70        return Some(Bom::Utf16Le);
71    }
72    None
73}
74
75/// Strip a leading BOM from `b`. Returns the input unchanged if none.
76pub fn strip_bytes(b: &[u8]) -> &[u8] {
77    match detect_bom(b) {
78        Some(bom) => &b[bom.len()..],
79        None => b,
80    }
81}
82
/// Return `s` without its leading U+FEFF, if it has one.
///
/// Only a single leading U+FEFF is removed; any further occurrences, leading
/// or not, are left in place. The result borrows from `s`.
pub fn strip_str(s: &str) -> &str {
    match s.strip_prefix('\u{FEFF}') {
        Some(rest) => rest,
        None => s,
    }
}
87
/// Return a copy of `s` with every U+FEFF removed, wherever it occurs.
///
/// U+FEFF is both the byte order mark and the zero-width no-break space, so
/// this drops leading BOMs as well as occurrences in the middle or at the end
/// of the text. All other characters are kept in their original order.
pub fn strip_all(s: &str) -> String {
    s.replace('\u{FEFF}', "")
}