use std::io::{self, Read};
/// The first four bytes of an input stream, in the order they were read.
///
/// `detect` builds this from its initial four-byte read and
/// `detect_byte_order_mark` pattern-matches it against the known byte-order
/// marks and start-of-document signatures.
#[derive(Debug)]
struct Bom(u8, u8, u8, u8);
/// Broad family an encoding belongs to; the first field of a `Descriptor`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Flavour {
    /// Used by the `UCS_4_*` descriptors.
    UCS,
    /// Used by the `UTF_8` and `UTF_16_*` descriptors.
    UTF,
    /// Used by the `EBCDIC` descriptor.
    EBCDIC,
    /// Used by the `ASCII_8BIT` descriptor.
    ASCII,
    /// Used when only the code-unit width and byte order could be inferred.
    Unknown,
}
/// Position of the significant byte inside a multi-byte code unit; the third
/// field of a `Descriptor`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ByteOrder {
    /// Most significant byte first; `search` reads the last byte of each unit.
    BigEndian,
    /// Least significant byte first; `search` reads the first byte of each unit.
    LittleEndian,
    /// Unusual 32-bit ordering; `search` reads the byte at offset 2 of each unit.
    Unusual2143,
    /// Unusual 32-bit ordering; `search` reads the byte at offset 1 of each unit.
    Unusual3412,
    /// Used by the descriptors with eight-bit code units.
    NotApplicable,
}
/// Size of one code unit; the second field of a `Descriptor`.
///
/// Each discriminant is the width in bits, so `width as usize / 8` yields the
/// number of bytes per code unit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Width {
    /// One byte per code unit.
    EightBit = 8,
    /// Two bytes per code unit.
    SixteenBit = 16,
    /// Four bytes per code unit.
    ThirtyTwoBit = 32,
}
/// What the leading bytes of a stream reveal about its encoding, as the
/// triple `(flavour, code-unit width, byte order)`.
///
/// `search` uses the width and byte order to pick the significant byte of
/// every code unit; `endianify` uses the byte order to qualify `utf-16`.
#[derive(Clone, Debug, PartialEq, Eq)]
struct Descriptor(Flavour, Width, ByteOrder);
// Descriptors returned by `detect_byte_order_mark`. The byte prefixes quoted
// below are the patterns that function matches against the first four bytes.

/// UCS flavour, 32-bit units, big-endian; selected by the prefix `00 00 FE FF`.
const UCS_4_BE: Descriptor = Descriptor(Flavour::UCS, Width::ThirtyTwoBit, ByteOrder::BigEndian);
/// UCS flavour, 32-bit units, little-endian; selected by the prefix `FF FE 00 00`.
const UCS_4_LE: Descriptor = Descriptor(Flavour::UCS, Width::ThirtyTwoBit, ByteOrder::LittleEndian);
/// UCS flavour, 32-bit units, unusual 2143 order; selected by the prefix `00 00 FF FE`.
const UCS_4_2143: Descriptor =
Descriptor(Flavour::UCS, Width::ThirtyTwoBit, ByteOrder::Unusual2143);
/// UCS flavour, 32-bit units, unusual 3412 order; selected by the prefix `FE FF 00 00`.
const UCS_4_3412: Descriptor =
Descriptor(Flavour::UCS, Width::ThirtyTwoBit, ByteOrder::Unusual3412);
/// UTF flavour, 16-bit units, big-endian; selected by `FE FF` followed by a
/// pair of bytes that are not both zero.
const UTF_16_BE: Descriptor = Descriptor(Flavour::UTF, Width::SixteenBit, ByteOrder::BigEndian);
/// UTF flavour, 16-bit units, little-endian; selected by `FF FE` followed by a
/// pair of bytes that are not both zero.
const UTF_16_LE: Descriptor = Descriptor(Flavour::UTF, Width::SixteenBit, ByteOrder::LittleEndian);
/// UTF flavour, 8-bit units; selected by the prefix `EF BB BF`.
const UTF_8: Descriptor = Descriptor(Flavour::UTF, Width::EightBit, ByteOrder::NotApplicable);
/// EBCDIC flavour, 8-bit units; selected by the prefix `4C 6F A7 94`.
const EBCDIC: Descriptor = Descriptor(Flavour::EBCDIC, Width::EightBit, ByteOrder::NotApplicable);
/// Unknown flavour, 32-bit big-endian units; selected by the prefix
/// `00 00 00 3C` (0x3C is ASCII `<`).
const ASCII_32BIT_BE: Descriptor =
Descriptor(Flavour::Unknown, Width::ThirtyTwoBit, ByteOrder::BigEndian);
/// Unknown flavour, 32-bit little-endian units; selected by the prefix
/// `3C 00 00 00`.
const ASCII_32BIT_LE: Descriptor = Descriptor(
Flavour::Unknown,
Width::ThirtyTwoBit,
ByteOrder::LittleEndian,
);
/// Unknown flavour, 16-bit big-endian units; selected by the prefix
/// `00 3C 00 3F` (ASCII `<?` in 16-bit units).
const ASCII_16BIT_BE: Descriptor =
Descriptor(Flavour::Unknown, Width::SixteenBit, ByteOrder::BigEndian);
/// Unknown flavour, 16-bit little-endian units; selected by the prefix
/// `3C 00 3F 00`.
const ASCII_16BIT_LE: Descriptor =
Descriptor(Flavour::Unknown, Width::SixteenBit, ByteOrder::LittleEndian);
/// ASCII flavour, 8-bit units; selected by the prefix `3C 3F 78 6D` (ASCII
/// `<?xm`). Also the fallback descriptor used by `search` and `endianify`
/// when no descriptor is supplied.
const ASCII_8BIT: Descriptor =
Descriptor(Flavour::ASCII, Width::EightBit, ByteOrder::NotApplicable);
pub fn detect<R: Read>(reader: &mut R, hint: Option<String>) -> Result<Vec<String>, io::Error> {
let mut first_four_bytes = [0u8; 4];
reader.read_exact(&mut first_four_bytes)?;
let bom = Bom(
first_four_bytes[0],
first_four_bytes[1],
first_four_bytes[2],
first_four_bytes[3],
);
let possible_encoding = detect_byte_order_mark(&bom);
let mut buf = [0u8; 512];
loop {
match reader.read(&mut buf) {
Ok(0) => return Ok(Vec::new()), Ok(_n) => break,
Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {} Err(err) => return Err(err),
};
}
let mut candidates = Vec::with_capacity(3);
search("encoding=", &buf, possible_encoding.as_ref())
.or_else(|| search("charset=", &buf, possible_encoding.as_ref()))
.map(normalise)
.map(|encoding| {
push_if_not_contains(
&mut candidates,
endianify(&encoding, possible_encoding.as_ref()),
)
});
hint.map(normalise).map(|encoding| {
push_if_not_contains(
&mut candidates,
endianify(&encoding, possible_encoding.as_ref()),
)
});
match possible_encoding {
Some(UCS_4_LE) => Some("ucs-4le"),
Some(UCS_4_BE) => Some("ucs-4be"),
Some(UTF_16_LE) => Some("utf-16le"),
Some(UTF_16_BE) => Some("utf-16be"),
Some(Descriptor(Flavour::UTF, Width::EightBit, _)) => Some("utf-8"),
Some(EBCDIC) => Some("ebcdic"),
_ => None,
}
.map(|encoding| push_if_not_contains(&mut candidates, encoding.to_string()));
if candidates.is_empty() && std::str::from_utf8(&buf).is_ok() {
candidates.push("utf-8".to_string());
}
Ok(candidates)
}
fn detect_byte_order_mark(bom: &Bom) -> Option<Descriptor> {
match *bom {
Bom(0x00, 0x00, 0xFE, 0xFF) => Some(UCS_4_BE),
Bom(0xFF, 0xFE, 0x00, 0x00) => Some(UCS_4_LE),
Bom(0x00, 0x00, 0xFF, 0xFE) => Some(UCS_4_2143),
Bom(0xFE, 0xFF, 0x00, 0x00) => Some(UCS_4_3412),
Bom(0xFE, 0xFF, c, d) if c > 0 || d > 0 => Some(UTF_16_BE),
Bom(0xFF, 0xFE, c, d) if c > 0 || d > 0 => Some(UTF_16_LE),
Bom(0xEF, 0xBB, 0xBF, _) => Some(UTF_8),
Bom(0x00, 0x00, 0x00, 0x3C) => Some(ASCII_32BIT_BE),
Bom(0x3C, 0x00, 0x00, 0x00) => Some(ASCII_32BIT_LE),
Bom(0x00, 0x00, 0x3C, 0x00) => Some(Descriptor(
Flavour::Unknown,
Width::ThirtyTwoBit,
ByteOrder::Unusual2143,
)),
Bom(0x00, 0x3C, 0x00, 0x00) => Some(Descriptor(
Flavour::Unknown,
Width::ThirtyTwoBit,
ByteOrder::Unusual3412,
)),
Bom(0x00, 0x3C, 0x00, 0x3F) => Some(ASCII_16BIT_BE),
Bom(0x3C, 0x00, 0x3F, 0x00) => Some(ASCII_16BIT_LE),
Bom(0x3C, 0x3F, 0x78, 0x6D) => Some(ASCII_8BIT),
Bom(0x4C, 0x6F, 0xA7, 0x94) => Some(EBCDIC),
_ => None,
}
}
/// Lower-cases an encoding name and rewrites a few common aliases to the
/// spelling used by the rest of this module.
///
/// The aliases are applied in order as plain substring replacements:
/// `us-ascii` to `ascii`, `utf8` to `utf-8`, and `shift-jis` to `shift_jis`.
fn normalise<S: AsRef<str>>(encoding: S) -> String {
    const ALIASES: [(&str, &str); 3] = [
        ("us-ascii", "ascii"),
        ("utf8", "utf-8"),
        ("shift-jis", "shift_jis"),
    ];

    let mut name = encoding.as_ref().to_lowercase();
    for &(alias, canonical) in ALIASES.iter() {
        name = name.replace(alias, canonical);
    }
    name
}
/// Appends `item` to `vec` unless an equal element is already present, so
/// insertion order is kept and duplicates are dropped.
fn push_if_not_contains<T: PartialEq>(vec: &mut Vec<T>, item: T) {
    let already_present = vec.iter().any(|existing| *existing == item);
    if already_present {
        return;
    }
    vec.push(item);
}
/// Qualifies a bare `utf-16` with the byte order recorded in `descriptor`.
///
/// Returns `utf-16le` or `utf-16be` when `encoding` is exactly `"utf-16"` and
/// the byte order is little- or big-endian. Every other combination returns
/// `encoding` unchanged. A missing descriptor is treated as `ASCII_8BIT`.
fn endianify(encoding: &str, descriptor: Option<&Descriptor>) -> String {
    let fallback = ASCII_8BIT;
    let order = descriptor.unwrap_or(&fallback).2;

    let resolved = match (encoding, order) {
        ("utf-16", ByteOrder::LittleEndian) => "utf-16le",
        ("utf-16", ByteOrder::BigEndian) => "utf-16be",
        _ => encoding,
    };
    resolved.to_string()
}
/// Looks for `needle` in `haystack` and returns the value that follows it.
///
/// `haystack` is first narrowed to one byte per code unit, chosen according
/// to the width and byte order in `descriptor` (`ASCII_8BIT` when absent), so
/// that ASCII text stored in 16- or 32-bit units can be searched as ordinary
/// text. The narrowed bytes are decoded lossily as UTF-8.
///
/// The returned value starts after `needle` and any leading single or double
/// quotes, and runs up to the next quote or the end of the text. Returns
/// `None` when `needle` does not occur.
fn search(needle: &str, haystack: &[u8], descriptor: Option<&Descriptor>) -> Option<String> {
    fn is_quote(c: &char) -> bool {
        *c == '"' || *c == '\''
    }

    let fallback = ASCII_8BIT;
    let &Descriptor(_, width, order) = descriptor.unwrap_or(&fallback);

    // Bytes per code unit, and the offset of the significant byte within it.
    let stride = (width as usize) / 8;
    let offset = match order {
        ByteOrder::BigEndian => stride - 1,
        ByteOrder::Unusual2143 => 2,
        ByteOrder::Unusual3412 => 1,
        ByteOrder::LittleEndian | ByteOrder::NotApplicable => 0,
    };

    let narrowed: Vec<u8> = haystack
        .iter()
        .skip(offset)
        .step_by(stride)
        .cloned()
        .collect();
    let text = String::from_utf8_lossy(&narrowed);

    let value_start = text.find(needle)? + needle.len();
    let value = text[value_start..]
        .chars()
        .skip_while(is_quote)
        .take_while(|c| !is_quote(c))
        .collect();
    Some(value)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// An empty stream cannot supply the four leading bytes, so `detect`
    /// reports an error.
    #[test]
    fn test_detect_empty() {
        let mut input = Cursor::new("");
        assert!(detect(&mut input, None).is_err());
    }

    /// A stream that ends right after the four leading bytes yields no
    /// candidates.
    #[test]
    fn test_detect_4_bytes() {
        let mut input = Cursor::new("1234");
        let candidates = detect(&mut input, None).unwrap();
        assert!(candidates.is_empty());
    }
}