use std::fmt;
use serde::Serialize;
/// Encoding classification assigned to a byte buffer.
///
/// Serialized through serde with `rename_all = "lowercase"`, so each variant
/// name is lowercased without separators (for example `Utf16Le` becomes
/// `"utf16le"`). This differs from the hyphenated labels returned by
/// `ContentType::as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum ContentType {
/// UTF-8 text.
Utf8,
/// UTF-16 text, little-endian byte order.
Utf16Le,
/// UTF-16 text, big-endian byte order.
Utf16Be,
/// Content that is not treated as text; the only variant for which
/// `is_text` returns `false`.
Binary,
}
impl ContentType {
    /// Reports whether this classification denotes a text encoding.
    ///
    /// Every variant except [`ContentType::Binary`] counts as text.
    #[inline]
    pub const fn is_text(self) -> bool {
        match self {
            Self::Binary => false,
            Self::Utf8 | Self::Utf16Le | Self::Utf16Be => true,
        }
    }

    /// Returns the fixed, hyphenated label for this classification
    /// (`"utf-8"`, `"utf-16le"`, `"utf-16be"` or `"binary"`).
    #[inline]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Binary => "binary",
            Self::Utf16Be => "utf-16be",
            Self::Utf16Le => "utf-16le",
            Self::Utf8 => "utf-8",
        }
    }
}
impl fmt::Display for ContentType {
    /// Writes the label returned by `as_str`, honouring the width, fill,
    /// alignment and precision requested by the caller's format spec.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `pad` applies the format spec (e.g. `{:<10}`), whereas `write_str`
        // emits the label verbatim and silently ignores it. With a plain `{}`
        // both produce exactly the same output.
        f.pad(self.as_str())
    }
}
use crate::constants::BINARY_DETECT_SIZE;
/// Reports whether `data` is classified as binary content.
///
/// Thin wrapper over [`detect_content_type`]: returns `true` only when that
/// function yields [`ContentType::Binary`].
pub fn is_binary(data: &[u8]) -> bool {
    matches!(detect_content_type(data), ContentType::Binary)
}
/// Classifies the leading bytes of `data` as one of the [`ContentType`] variants.
///
/// At most `BINARY_DETECT_SIZE` bytes from the start of `data` are handed to
/// `content_inspector::inspect`; bytes beyond that prefix never influence the
/// result. UTF-8 verdicts (with or without a BOM) map to
/// [`ContentType::Utf8`], the two UTF-16 verdicts map to their matching
/// variants, and every other verdict is reported as [`ContentType::Binary`].
pub fn detect_content_type(data: &[u8]) -> ContentType {
    // `get` returns `None` when `data` is shorter than the detection window,
    // in which case the whole slice is inspected.
    let sample = data.get(..BINARY_DETECT_SIZE).unwrap_or(data);
    let verdict = content_inspector::inspect(sample);

    match verdict {
        content_inspector::ContentType::UTF_16LE => ContentType::Utf16Le,
        content_inspector::ContentType::UTF_16BE => ContentType::Utf16Be,
        content_inspector::ContentType::UTF_8_BOM | content_inspector::ContentType::UTF_8 => {
            ContentType::Utf8
        }
        // Covers the inspector's explicit binary verdict as well as any
        // verdict that has no counterpart in this crate's enum.
        _ => ContentType::Binary,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // A NUL byte inside the inspected prefix marks the buffer as binary.
    #[test]
    fn null_byte_detected_as_binary() {
        assert!(is_binary(&[0, 1, 2, 3]));
    }

    #[test]
    fn valid_utf8_not_binary() {
        assert!(!is_binary(b"hello world"));
    }

    #[test]
    fn empty_not_binary() {
        assert!(!is_binary(b""));
    }

    // Index `BINARY_DETECT_SIZE` is the first byte outside the prefix that
    // `detect_content_type` passes to the inspector, so a NUL there is ignored.
    #[test]
    fn null_after_threshold_not_detected() {
        let mut data = vec![b'a'; BINARY_DETECT_SIZE + 1];
        data[BINARY_DETECT_SIZE] = 0;
        assert!(!is_binary(&data));
    }

    // NOTE(review): this input starts with `FF FE`, the same two bytes used as
    // the UTF-16LE BOM in `utf16le_with_bom_is_text_not_binary`, so it likely
    // passes via BOM detection, not via any handling of high bytes as
    // such — confirm the intent.
    #[test]
    fn high_bytes_not_binary() {
        assert!(!is_binary(&[0xFF, 0xFE, 0xFD]));
    }

    #[test]
    fn utf16le_with_bom_is_text_not_binary() {
        let mut utf16le_with_bom: Vec<u8> = vec![0xFF, 0xFE];
        for c in "hello world".encode_utf16() {
            utf16le_with_bom.extend_from_slice(&c.to_le_bytes());
        }
        assert_eq!(detect_content_type(&utf16le_with_bom), ContentType::Utf16Le);
        assert!(!is_binary(&utf16le_with_bom));
    }

    #[test]
    fn utf16be_with_bom_is_text() {
        let mut utf16be_with_bom: Vec<u8> = vec![0xFE, 0xFF];
        for c in "hello world".encode_utf16() {
            utf16be_with_bom.extend_from_slice(&c.to_be_bytes());
        }
        assert_eq!(detect_content_type(&utf16be_with_bom), ContentType::Utf16Be);
    }

    // Smoke test: BOM-less UTF-16LE must be classified without panicking. The
    // exact verdict is deliberately left loose.
    #[test]
    fn utf16le_without_bom_does_not_panic() {
        let utf16le_ascii: Vec<u8> = "hello world"
            .as_bytes()
            .iter()
            .flat_map(|b| [*b, 0x00])
            .collect();
        let ct = detect_content_type(&utf16le_ascii);
        assert!(
            matches!(
                ct,
                ContentType::Utf16Le | ContentType::Utf8 | ContentType::Binary
            ),
            "must return known ContentType, got: {ct:?}"
        );
    }

    #[test]
    fn utf8_bom_detected() {
        let mut data = vec![0xEF, 0xBB, 0xBF];
        data.extend_from_slice(b"hello");
        assert_eq!(detect_content_type(&data), ContentType::Utf8);
    }

    #[test]
    fn high_random_bytes_classified_as_binary() {
        // Every byte value in ascending order, repeated. The buffer therefore
        // begins with the same `0, 1, 2, 3` prefix that
        // `null_byte_detected_as_binary` already expects to be binary.
        let data: Vec<u8> = (0u8..=255).cycle().take(1024).collect();
        // Pin the verdict the test name promises; matching against every
        // `ContentType` variant could never fail and verified nothing.
        assert_eq!(detect_content_type(&data), ContentType::Binary);
    }

    #[test]
    fn random_bytes_classified_as_binary() {
        // `as u8` intentionally wraps to spread values over the byte range.
        let data: Vec<u8> = (0..512).map(|i| (i * 7 + 13) as u8).collect();
        assert_eq!(detect_content_type(&data), ContentType::Binary);
    }

    // serde lowercases the variant name itself, so the JSON form has no hyphen
    // (unlike the label produced by `as_str`).
    #[test]
    fn content_type_serialization_is_lowercase() {
        let json = serde_json::to_string(&ContentType::Utf16Le).unwrap();
        assert_eq!(json, "\"utf16le\"", "rename_all = lowercase emits utf16le");
        let json = serde_json::to_string(&ContentType::Binary).unwrap();
        assert_eq!(json, "\"binary\"");
    }
}