/// Confidence score that [`detect_encoding_impl`] reports for every guess.
///
/// The detection code in this file obtains only an encoding from the detector, not a numeric
/// score, so this fixed value is returned alongside each guess instead.
const CONFIDENCE_HIGH: f64 = 0.95;
/// Guesses the character encoding of `bytes`.
///
/// The whole input is fed to a `chardetng` detector in one call, with ISO-2022-JP and UTF-8
/// both allowed as possible answers and no top-level-domain hint supplied.
///
/// # Returns
///
/// A pair of the guessed encoding's name and a confidence score. The score is always
/// [`CONFIDENCE_HIGH`]; it does not vary with the input.
pub(crate) fn detect_encoding_impl(bytes: &[u8]) -> (String, f64) {
    use chardetng::{EncodingDetector, Iso2022JpDetection, Utf8Detection};

    let mut sniffer = EncodingDetector::new(Iso2022JpDetection::Allow);
    // NOTE(review): the `true` flag presumably marks this buffer as the final chunk of input —
    // confirm against the chardetng documentation.
    sniffer.feed(bytes, true);

    let guessed = sniffer.guess(None, Utf8Detection::Allow);
    let label = String::from(guessed.name());
    (label, CONFIDENCE_HIGH)
}
/// Well-known encoding labels offered as "did you mean" candidates.
///
/// `decode_to_utf8_impl` passes this list to `crate::utils::closest_match` when a
/// caller-supplied encoding label is not recognized. The list only feeds that suggestion;
/// label resolution itself goes through `encoding_rs::Encoding::for_label`.
const COMMON_ENCODING_LABELS: &[&str] = &[
"utf-8",
"utf-16le",
"utf-16be",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1254",
"iso-8859-1",
"iso-8859-2",
"iso-8859-15",
"koi8-r",
"koi8-u",
"shift_jis",
"euc-jp",
"iso-2022-jp",
"euc-kr",
"big5",
"gbk",
"gb18030",
"macintosh",
];
/// Decodes `bytes` into a UTF-8 `String`, using an explicit or auto-detected encoding.
///
/// # Parameters
///
/// * `bytes` - raw input to decode.
/// * `encoding` - encoding label to use; `None` triggers detection via
///   [`detect_encoding_impl`].
/// * `min_confidence` - lowest acceptable detection confidence, in `0.0..=1.0`. It is validated
///   on every call but only compared against a score when the encoding is auto-detected.
/// * `strict` - when `true`, a decode that reported errors is turned into an `Err`.
///
/// # Returns
///
/// `(text, had_errors)`, where `had_errors` is the error flag reported by
/// `encoding_rs::Encoding::decode`.
///
/// # Errors
///
/// * `MinConfidenceOutOfRange` - `min_confidence` is outside `0.0..=1.0` (NaN included).
/// * `UnknownEncoding` - the explicit label is not recognized; carries a "did you mean" hint
///   when a close match exists in [`COMMON_ENCODING_LABELS`].
/// * `EncodingConfidenceTooLow` - the detected confidence is below `min_confidence`.
/// * `UnsupportedAutoEncoding` - the detected encoding name cannot be resolved to an encoding.
/// * `LossyDecode` - `strict` is set and the decoder reported errors.
pub(crate) fn decode_to_utf8_impl(
    bytes: &[u8],
    encoding: Option<&str>,
    min_confidence: f64,
    strict: bool,
) -> Result<(String, bool), crate::ErrorRepr> {
    // The negated range test also rejects NaN, which is never contained in a range.
    if !(0.0..=1.0).contains(&min_confidence) {
        return Err(crate::ErrorRepr::MinConfidenceOutOfRange { min_confidence });
    }

    let resolved = match encoding {
        // Caller named an encoding: resolve the label, or fail with a spelling hint.
        Some(label) => match encoding_rs::Encoding::for_label(label.as_bytes()) {
            Some(found) => found,
            None => {
                let suggestion =
                    crate::utils::closest_match(label, COMMON_ENCODING_LABELS.iter().copied())
                        .map(|s| format!(" (did you mean '{s}'?)"))
                        .unwrap_or_default();
                return Err(crate::ErrorRepr::UnknownEncoding {
                    got: label.to_owned(),
                    suggestion,
                });
            }
        },
        // No encoding given: detect one and apply the confidence threshold.
        None => {
            let (guess, confidence) = detect_encoding_impl(bytes);
            if confidence < min_confidence {
                return Err(crate::ErrorRepr::EncodingConfidenceTooLow {
                    confidence,
                    min_confidence,
                    guess,
                });
            }
            match encoding_rs::Encoding::for_label(guess.as_bytes()) {
                Some(found) => found,
                None => return Err(crate::ErrorRepr::UnsupportedAutoEncoding { got: guess }),
            }
        }
    };

    // `decode` also reports the encoding it actually used; that one is named in the error,
    // since it may differ from the requested encoding (encoding_rs performs BOM sniffing).
    let (text, used, lossy) = resolved.decode(bytes);
    if lossy && strict {
        Err(crate::ErrorRepr::LossyDecode {
            encoding: used.name().to_owned(),
        })
    } else {
        Ok((text.into_owned(), lossy))
    }
}
// Unit tests for encoding detection and decoding to UTF-8.
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII input: either of the two listed guesses is accepted.
    #[test]
    fn test_detect_utf8() {
        let (name, score) = detect_encoding_impl(b"hello world");
        assert!(matches!(name.as_str(), "windows-1252" | "UTF-8"));
        assert!(score > 0.0);
    }

    /// Input starting with the bytes EF BB BF must be detected as UTF-8.
    #[test]
    fn test_detect_utf8_with_bom() {
        let (name, _score) = detect_encoding_impl(b"\xef\xbb\xbfhello");
        assert_eq!(name, "UTF-8");
    }

    /// Pins the detector's answer for a few fixed byte samples.
    #[test]
    fn test_detect_regression_pins() {
        // "日本語" encoded as UTF-8.
        let utf8_sample: &[u8] = &[0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E];
        // Payload framed by the ESC $ B ... ESC ( B escape sequences.
        let iso2022jp_sample: &[u8] = &[0x1B, 0x24, 0x42, 0x46, 0x7C, 0x4B, 0x5C, 0x1B, 0x28, 0x42];
        let shift_jis_sample: &[u8] = &[
            0x82, 0xB1, 0x82, 0xF1, 0x82, 0xC9, 0x82, 0xBF, 0x82, 0xCD, 0x90, 0xA2, 0x8A, 0x45,
        ];

        let pinned = [
            (utf8_sample, "UTF-8"),
            (iso2022jp_sample, "ISO-2022-JP"),
            (shift_jis_sample, "Shift_JIS"),
        ];
        for (bytes, expected) in pinned {
            let (encoding, confidence) = detect_encoding_impl(bytes);
            assert_eq!(encoding, expected, "encoding mismatch for {bytes:02x?}");
            assert!(
                (confidence - CONFIDENCE_HIGH).abs() < 1e-9,
                "confidence should always be CONFIDENCE_HIGH, got {confidence}"
            );
        }
    }

    /// Explicit UTF-8 decoding round-trips a non-ASCII string without errors.
    #[test]
    fn test_decode_utf8() {
        let outcome = decode_to_utf8_impl("café".as_bytes(), Some("UTF-8"), 0.0, false);
        let (text, lossy) = outcome.unwrap();
        assert_eq!(text, "café");
        assert!(!lossy);
    }

    /// The single byte 0xE9 decodes to 'é' when ISO-8859-1 is requested.
    #[test]
    fn test_decode_latin1() {
        let outcome = decode_to_utf8_impl(b"caf\xE9", Some("ISO-8859-1"), 0.0, false);
        let (text, lossy) = outcome.unwrap();
        assert_eq!(text, "café");
        assert!(!lossy);
    }

    /// An unrecognized encoding label is an error.
    #[test]
    fn test_decode_unknown_encoding_errors() {
        assert!(decode_to_utf8_impl(b"hello", Some("FAKE-999"), 0.0, false).is_err());
    }

    /// Detection on empty input still yields a non-empty name and a positive score.
    #[test]
    fn test_detect_empty_input() {
        let (name, score) = detect_encoding_impl(b"");
        assert!(!name.is_empty());
        assert!(score > 0.0);
    }

    /// With no encoding given, ASCII input decodes cleanly through auto-detection.
    #[test]
    fn test_decode_auto_detect() {
        let outcome = decode_to_utf8_impl(b"hello world", None, 0.0, false);
        let (text, lossy) = outcome.unwrap();
        assert_eq!(text, "hello world");
        assert!(!lossy);
    }

    /// A threshold of 1.0 exceeds the fixed detection score, so auto-detection is rejected.
    #[test]
    fn test_decode_min_confidence_rejected() {
        let outcome = decode_to_utf8_impl(b"hi", None, 1.0, false);
        assert!(outcome.is_err());
        let msg = outcome.unwrap_err().to_string();
        assert!(
            msg.contains("below the required minimum"),
            "unexpected: {msg}"
        );
    }

    /// The threshold is not compared against anything when the encoding is explicit.
    #[test]
    fn test_decode_min_confidence_accepted() {
        assert!(decode_to_utf8_impl(b"hi", Some("UTF-8"), 1.0, false).is_ok());
    }

    /// Out-of-range thresholds are rejected on both the auto and explicit paths, while the
    /// inclusive bounds 0.0 and 1.0 are accepted.
    #[test]
    fn test_decode_min_confidence_out_of_range_rejected() {
        for bad in [-0.5_f64, 1.5, f64::NAN, -0.000_001, 1.000_001] {
            let attempts = [
                decode_to_utf8_impl(b"hi", None, bad, false),
                decode_to_utf8_impl(b"hi", Some("UTF-8"), bad, false),
            ];
            for attempt in attempts {
                assert!(
                    matches!(
                        attempt,
                        Err(crate::ErrorRepr::MinConfidenceOutOfRange { .. })
                    ),
                    "min_confidence {bad} should be rejected by the core"
                );
            }
        }

        for boundary in [0.0, 1.0] {
            assert!(decode_to_utf8_impl(b"hi", Some("UTF-8"), boundary, false).is_ok());
        }
    }
}