/// Heuristically reports whether `sample` contains bytes that point to
/// Windows-1250 (Central European) text.
///
/// Two tiers of evidence are used:
///
/// * **Definitive** bytes `0x8D`, `0x8F`, `0x9D` (Ť, Ź, ť in Windows-1250):
///   a single occurrence is enough.
/// * **Strong** bytes `0x8C`, `0x9C`, `0x9F` (Ś, ś, ź in Windows-1250):
///   at least two occurrences (of any mix of them) are required.
///
/// Returns `false` for anything else, including empty and pure-ASCII input.
pub fn has_windows1250_pattern(sample: &[u8]) -> bool {
    // One hit on any of these settles the question.
    let is_definitive = |b: u8| matches!(b, 0x8D | 0x8F | 0x9D);
    // These only count once they show up at least twice in total.
    let is_strong = |b: u8| matches!(b, 0x8C | 0x9C | 0x9F);

    if sample.iter().any(|&b| is_definitive(b)) {
        return true;
    }

    // `take(2)` stops scanning as soon as the threshold is reached.
    sample.iter().filter(|&&b| is_strong(b)).take(2).count() == 2
}
/// Heuristically reports whether `sample` contains a byte mix that points to
/// Windows-1251 (Cyrillic) text.
///
/// Bytes are sorted into three classes:
///
/// * `0xC0..=0xDE` — treated as uppercase Cyrillic (note that `0xDF` is
///   deliberately outside this range),
/// * `0xE0..=0xFF` — treated as lowercase Cyrillic,
/// * `0xA8` / `0xB8` — treated as the letter yo (Ё / ё).
///
/// The function returns `true` only when a lowercase byte is present together
/// with either an uppercase byte or a yo byte. Lowercase-only input therefore
/// yields `false`, as do empty and pure-ASCII input.
pub fn has_windows1251_pattern(sample: &[u8]) -> bool {
    let (mut saw_upper, mut saw_lower, mut saw_yo) = (false, false, false);

    // The three classes are disjoint, so a single `match` per byte suffices.
    for byte in sample.iter().copied() {
        match byte {
            0xC0..=0xDE => saw_upper = true,
            0xE0..=0xFF => saw_lower = true,
            0xA8 | 0xB8 => saw_yo = true,
            _ => {}
        }
    }

    // Lowercase is mandatory; it must be corroborated by uppercase or yo.
    saw_lower && (saw_upper || saw_yo)
}
/// Unit tests for the Windows-1250 / Windows-1251 byte-pattern heuristics.
///
/// Samples are written as raw byte arrays because the inputs are legacy
/// single-byte encodings, not valid UTF-8.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- Windows-1250: definitive bytes (a single occurrence suffices) ----

    #[test]
    fn test_definitive_bytes_t_caron() {
        // ASCII letters around a single 0x9D byte.
        let with_t_caron = [0x6D, 0x9D, 0x73, 0x74, 0x6F];
        assert!(
            has_windows1250_pattern(&with_t_caron),
            "Byte 0x9D (ť) should trigger Windows-1250 detection"
        );
    }

    #[test]
    fn test_definitive_bytes_z_acute_upper() {
        // Leading 0x8F followed by further bytes that are not indicators.
        let with_z_acute_upper = [0x8F, 0x72, 0xF3, 0x64, 0xB3, 0x6F];
        assert!(
            has_windows1250_pattern(&with_z_acute_upper),
            "Byte 0x8F (Ź) should trigger Windows-1250 detection"
        );
    }

    #[test]
    fn test_definitive_bytes_t_caron_upper() {
        // Leading 0x8D followed by ASCII.
        let with_t_caron_upper = [0x8D, 0x65, 0x73, 0x74];
        assert!(
            has_windows1250_pattern(&with_t_caron_upper),
            "Byte 0x8D (Ť) should trigger Windows-1250 detection"
        );
    }

    // ---- Windows-1250: strong bytes (two occurrences required) ----

    #[test]
    fn test_strong_indicators_0x80_range() {
        // Contains one 0x9C and one 0x8C, i.e. two strong indicators.
        let polish_text = [
            0x9C, 0x77, 0x69, 0x65, 0x74, 0x79, 0x20, 0x8C, 0x77, 0x69, 0x61, 0x74,
        ];
        assert!(
            has_windows1250_pattern(&polish_text),
            "Multiple Polish characters (ś, Ś) should trigger Windows-1250"
        );
    }

    // ---- Windows-1250: inputs that must NOT be detected ----

    #[test]
    fn test_ambiguous_bytes_not_strong_indicators() {
        // High bytes in the 0xA0..=0xFF area only; none is an indicator.
        let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
        assert!(
            !has_windows1250_pattern(&zolc),
            "Ambiguous bytes (0xBF, 0xB3) should NOT trigger Windows-1250"
        );

        let polish_text = [
            0x6D, 0xB9, 0x6B, 0x61, 0x20, 0x6D, 0xB3, 0x6F, 0x64, 0x79,
        ];
        assert!(
            !has_windows1250_pattern(&polish_text),
            "Ambiguous bytes (0xB9, 0xB3) should NOT trigger Windows-1250"
        );
    }

    #[test]
    fn test_pound_and_yen_not_indicators() {
        // ASCII digits with 0xA3 and 0xA5 mixed in.
        let currency = [0x31, 0x30, 0xA3, 0x20, 0x31, 0x30, 0xA5];
        assert!(
            !has_windows1250_pattern(&currency),
            "Currency symbols (£, ¥) should not trigger Windows-1250"
        );
    }

    #[test]
    fn test_pure_ascii() {
        let ascii = b"Hello, World!";
        assert!(
            !has_windows1250_pattern(ascii),
            "Pure ASCII should not trigger Windows-1250"
        );
    }

    #[test]
    fn test_windows1252_french() {
        // ASCII plus repeated 0xE9; no Windows-1250 indicator bytes.
        let french = [
            0x43, 0x61, 0x66, 0xE9, 0x20, 0x72, 0xE9, 0x73, 0x75, 0x6D, 0xE9,
        ];
        assert!(
            !has_windows1250_pattern(&french),
            "French text should not trigger Windows-1250"
        );
    }

    // ---- Windows-1251: inputs that must be detected ----

    #[test]
    fn test_windows1251_upper_and_lower() {
        // One byte in 0xC0..=0xDE followed by bytes in 0xE0..=0xFF.
        let privet: &[u8] = &[0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2];
        assert!(
            has_windows1251_pattern(privet),
            "Cyrillic upper + lower should trigger Windows-1251"
        );
    }

    #[test]
    fn test_windows1251_with_yo() {
        // 0xB8 (yo) followed by bytes in 0xE0..=0xFF; no uppercase byte.
        let yozhik: &[u8] = &[0xB8, 0xE6, 0xE8, 0xEA];
        assert!(
            has_windows1251_pattern(yozhik),
            "ё + lowercase Cyrillic letters should trigger Windows-1251"
        );
    }

    #[test]
    fn test_windows1251_sentence_with_spaces() {
        // Two words separated by an ASCII space (0x20).
        let hello_world: &[u8] = &[
            0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2, 0x20, 0xEC, 0xE8, 0xF0,
        ];
        assert!(
            has_windows1251_pattern(hello_world),
            "Russian sentence should trigger Windows-1251"
        );
    }

    // ---- Windows-1251: inputs that must NOT be detected ----

    #[test]
    fn test_not_windows1251_french() {
        // Only "lowercase-range" high bytes (0xE9); nothing to corroborate them.
        let french = [
            0x43, 0x61, 0x66, 0xE9, 0x20, 0x72, 0xE9, 0x73, 0x75, 0x6D, 0xE9,
        ];
        assert!(
            !has_windows1251_pattern(&french),
            "French text should not trigger Windows-1251"
        );
    }

    #[test]
    fn test_not_windows1251_ascii() {
        let ascii = b"Hello, World!";
        assert!(
            !has_windows1251_pattern(ascii),
            "Pure ASCII should not trigger Windows-1251"
        );
    }

    #[test]
    fn test_not_windows1251_lowercase_only_latin1() {
        let only_lower = [0xE9, 0xE9, 0xE9, 0xE9, 0xE9];
        assert!(
            !has_windows1251_pattern(&only_lower),
            "Runs of lowercase Latin-1 letters should NOT trigger Windows-1251"
        );
    }

    #[test]
    fn test_not_windows1251_ambiguous_polish() {
        // 0xBF and 0xB3 fall in no class; 0xF3 and 0xE6 are lowercase-range only.
        let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
        assert!(
            !has_windows1251_pattern(&zolc),
            "Lowercase-only Polish bytes should NOT trigger Windows-1251"
        );
    }

    #[test]
    fn test_not_windows1251_sharp_s_excluded() {
        // 0xDF sits just past the uppercase range (0xC0..=0xDE) and below the
        // lowercase range (0xE0..=0xFF), so it must not count as either.
        let german = [0x53, 0x74, 0x72, 0x61, 0xDF, 0x65];
        assert!(
            !has_windows1251_pattern(&german),
            "German ß should not trigger Windows-1251"
        );
    }

    // ---- Windows-1250: longer realistic sample ----

    #[test]
    fn test_czech_pangram() {
        // Multi-word sample; the 0x9D byte inside it is a definitive indicator.
        let czech_pangram: &[u8] = &[
            0x50, 0xF8, 0xED, 0x6C, 0x69, 0x9A, 0x20, 0x9E, 0x6C, 0x75, 0x9D, 0x6F, 0x75,
            0xE8, 0x6B, 0xFD, 0x20, 0x6B, 0xF9, 0xF2, 0x20, 0xFA, 0x70, 0xEC, 0x6C, 0x20,
            0xEF, 0xE1, 0x62, 0x65, 0x6C, 0x73, 0x6B, 0xE9, 0x20, 0xF3, 0x64, 0x79,
        ];
        assert!(
            has_windows1250_pattern(czech_pangram),
            "Czech pangram should trigger Windows-1250 (contains ť = 0x9D)"
        );
    }
}