bom_strip/lib.rs
1//! # bom-strip
2//!
3//! Strip UTF-8/16/32 BOMs and stray U+FEFF code points from text.
4//!
5//! A leading byte order mark breaks `serde_json::from_str`, hash-based
6//! deduplication, and config parsers that don't allow leading
7//! whitespace. This crate gives you four small functions:
8//!
9//! - [`strip_str`] — strip a leading U+FEFF from a `&str`.
10//! - [`strip_all`] — strip every U+FEFF in the input, not just leading.
11//! - [`strip_bytes`] — strip a leading UTF-8 / UTF-16 LE/BE / UTF-32
12//! LE/BE BOM from a `&[u8]`.
13//! - [`detect_bom`] — identify which BOM (if any) leads `&[u8]`.
14//!
15//! ## Example
16//!
17//! ```
18//! use bom_strip::{strip_str, strip_bytes, detect_bom, Bom};
19//!
20//! assert_eq!(strip_str("\u{FEFF}hello"), "hello");
21//! assert_eq!(strip_bytes(&[0xEF, 0xBB, 0xBF, b'h', b'i']), &[b'h', b'i']);
22//! assert_eq!(detect_bom(&[0xFF, 0xFE, b'a', 0]), Some(Bom::Utf16Le));
23//! ```
24
25#![deny(missing_docs)]
26
/// The kind of byte order mark found at the start of a byte stream.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Bom {
    /// UTF-8 signature: bytes `EF BB BF`.
    Utf8,
    /// UTF-16 big-endian signature: bytes `FE FF`.
    Utf16Be,
    /// UTF-16 little-endian signature: bytes `FF FE`.
    Utf16Le,
    /// UTF-32 big-endian signature: bytes `00 00 FE FF`.
    Utf32Be,
    /// UTF-32 little-endian signature: bytes `FF FE 00 00`. Because it
    /// begins with the UTF-16 LE signature, it must be tested for first.
    Utf32Le,
}
41
42impl Bom {
43 /// Length of this BOM in bytes.
44 pub fn len(self) -> usize {
45 match self {
46 Bom::Utf8 => 3,
47 Bom::Utf16Be | Bom::Utf16Le => 2,
48 Bom::Utf32Be | Bom::Utf32Le => 4,
49 }
50 }
51}
52
53/// Detect which BOM (if any) leads `b`.
54pub fn detect_bom(b: &[u8]) -> Option<Bom> {
55 if b.starts_with(&[0xEF, 0xBB, 0xBF]) {
56 return Some(Bom::Utf8);
57 }
58 // 4-byte BOMs must be checked before 2-byte to avoid misidentifying
59 // a UTF-32 LE BOM (`FF FE 00 00`) as a UTF-16 LE BOM (`FF FE`).
60 if b.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) {
61 return Some(Bom::Utf32Le);
62 }
63 if b.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) {
64 return Some(Bom::Utf32Be);
65 }
66 if b.starts_with(&[0xFE, 0xFF]) {
67 return Some(Bom::Utf16Be);
68 }
69 if b.starts_with(&[0xFF, 0xFE]) {
70 return Some(Bom::Utf16Le);
71 }
72 None
73}
74
75/// Strip a leading BOM from `b`. Returns the input unchanged if none.
76pub fn strip_bytes(b: &[u8]) -> &[u8] {
77 match detect_bom(b) {
78 Some(bom) => &b[bom.len()..],
79 None => b,
80 }
81}
82
/// Remove a single leading U+FEFF from `s`, if one is present.
///
/// Only the first code point is examined: any further U+FEFF, whether
/// directly after the first or later in the text, is left in place.
pub fn strip_str(s: &str) -> &str {
    match s.strip_prefix('\u{FEFF}') {
        Some(rest) => rest,
        None => s,
    }
}
87
/// Remove every U+FEFF from `s`, wherever it occurs.
///
/// This covers both a leading BOM and any interior U+FEFF (the code point
/// also known as zero-width no-break space). All other characters are kept
/// in their original order in the returned `String`.
pub fn strip_all(s: &str) -> String {
    // Splitting on U+FEFF yields exactly the text between occurrences;
    // concatenating those pieces drops every occurrence.
    s.split('\u{FEFF}').collect()
}