#[cfg(feature = "encoding")]
use encoding_rs::Encoding;
use crate::prelude::*;
/// Bit that `is_utf8_collation` tests to decide whether a collation value denotes UTF-8.
///
/// NOTE(review): confirm this bit position against the MS-TDS `COLLATION` flag layout
/// (`fUTF8`); it cannot be verified from this file alone.
pub const COLLATION_FLAG_UTF8: u32 = 0x0800_0000;
/// Mask selecting the low 20 bits of a collation value.
///
/// Presumably isolates the complete LCID; it is not referenced by the visible code in
/// this file — verify against callers.
pub const LCID_MASK: u32 = 0x000F_FFFF;
/// Mask selecting the low 16 bits of a collation value. `encoding_for_lcid` and
/// `code_page_for_lcid` apply it before matching on the language identifier.
pub const PRIMARY_LANGUAGE_MASK: u32 = 0x0000_FFFF;
/// Reports whether the `COLLATION_FLAG_UTF8` bit is set in `lcid`.
#[inline]
pub fn is_utf8_collation(lcid: u32) -> bool {
    let utf8_bits = lcid & COLLATION_FLAG_UTF8;
    utf8_bits != 0
}
/// Looks up the `encoding_rs` encoding for the language identifier carried in `lcid`.
///
/// Returns `None` when `is_utf8_collation(lcid)` is true, and also when the masked
/// language identifier is not one of the values listed in the table below.
#[cfg(feature = "encoding")]
pub fn encoding_for_lcid(lcid: u32) -> Option<&'static Encoding> {
    if is_utf8_collation(lcid) {
        return None;
    }

    // Only the bits kept by `PRIMARY_LANGUAGE_MASK` take part in the lookup.
    let encoding = match lcid & PRIMARY_LANGUAGE_MASK {
        // Multi-byte encodings.
        0x0411 => encoding_rs::SHIFT_JIS,
        0x0804 | 0x1004 => encoding_rs::GB18030,
        0x0404 | 0x0C04 | 0x1404 => encoding_rs::BIG5,
        0x0412 => encoding_rs::EUC_KR,

        // Single-byte encodings.
        0x041E => encoding_rs::WINDOWS_874,
        0x042A => encoding_rs::WINDOWS_1258,

        0x0405 | 0x0415 | 0x040E | 0x041A | 0x081A | 0x141A | 0x101A | 0x041B | 0x0424
        | 0x0418 | 0x041C => encoding_rs::WINDOWS_1250,

        0x0419 | 0x0422 | 0x0423 | 0x0402 | 0x042F | 0x0C1A | 0x201A | 0x0440 | 0x0843
        | 0x0444 | 0x0450 | 0x0485 => encoding_rs::WINDOWS_1251,

        0x0408 => encoding_rs::WINDOWS_1253,
        0x041F | 0x042C => encoding_rs::WINDOWS_1254,
        0x040D => encoding_rs::WINDOWS_1255,

        0x0401 | 0x0801 | 0x0C01 | 0x1001 | 0x1401 | 0x1801 | 0x1C01 | 0x2001 | 0x2401
        | 0x2801 | 0x2C01 | 0x3001 | 0x3401 | 0x3801 | 0x3C01 | 0x4001
        | 0x0429 | 0x0420 | 0x048C | 0x0463 => encoding_rs::WINDOWS_1256,

        0x0425..=0x0427 => encoding_rs::WINDOWS_1257,

        0x0409 | 0x0809 | 0x0C09 | 0x1009 | 0x1409 | 0x1809
        | 0x040C | 0x080C | 0x0C0C | 0x100C | 0x140C
        | 0x0407 | 0x0807 | 0x0C07 | 0x1007 | 0x1407
        | 0x040A | 0x080A | 0x0C0A | 0x100A | 0x140A | 0x180A | 0x1C0A | 0x200A | 0x240A
        | 0x280A | 0x2C0A | 0x300A | 0x340A | 0x380A | 0x3C0A | 0x400A | 0x440A | 0x480A
        | 0x4C0A | 0x500A
        | 0x0410 | 0x0810 | 0x0816 | 0x0416 | 0x0413 | 0x0813 | 0x0406 | 0x0414 | 0x0814
        | 0x041D | 0x081D | 0x040B | 0x040F | 0x0403 | 0x0456 | 0x042D | 0x0436 | 0x0421
        | 0x043E | 0x0441 => encoding_rs::WINDOWS_1252,

        // Unlisted identifiers have no mapping; callers choose their own default.
        _ => return None,
    };

    Some(encoding)
}
/// Returns the numeric code page for the language identifier carried in `lcid`.
///
/// Yields `65001` when `is_utf8_collation(lcid)` is true. Otherwise the masked language
/// identifier is looked up in the table below, and any identifier not listed there
/// resolves to `1252`; every path returns `Some`.
#[cfg(feature = "encoding")]
pub fn code_page_for_lcid(lcid: u32) -> Option<u16> {
    if is_utf8_collation(lcid) {
        return Some(65001);
    }

    // Only the bits kept by `PRIMARY_LANGUAGE_MASK` take part in the lookup.
    let code_page: u16 = match lcid & PRIMARY_LANGUAGE_MASK {
        0x0411 => 932,
        0x0804 | 0x1004 => 936,
        0x0404 | 0x0C04 | 0x1404 => 950,
        0x0412 => 949,
        0x041E => 874,
        0x042A => 1258,

        0x0405 | 0x0415 | 0x040E | 0x041A | 0x081A | 0x141A | 0x101A | 0x041B | 0x0424
        | 0x0418 | 0x041C => 1250,

        0x0419 | 0x0422 | 0x0423 | 0x0402 | 0x042F | 0x0C1A | 0x201A | 0x0440 | 0x0843
        | 0x0444 | 0x0450 | 0x0485 => 1251,

        0x0408 => 1253,
        0x041F | 0x042C => 1254,
        0x040D => 1255,

        0x0401 | 0x0801 | 0x0C01 | 0x1001 | 0x1401 | 0x1801 | 0x1C01 | 0x2001 | 0x2401
        | 0x2801 | 0x2C01 | 0x3001 | 0x3401 | 0x3801 | 0x3C01 | 0x4001
        | 0x0429 | 0x0420 | 0x048C | 0x0463 => 1256,

        0x0425..=0x0427 => 1257,

        // Everything else, including unrecognised identifiers, defaults to 1252.
        _ => 1252,
    };

    Some(code_page)
}
/// Returns a display name for the encoding selected by `lcid`.
///
/// The result is `"UTF-8"` when `is_utf8_collation(lcid)` is true, the `encoding_rs`
/// name of the encoding found by `encoding_for_lcid` when there is one, and
/// `"windows-1252"` when no encoding is mapped.
#[cfg(feature = "encoding")]
pub fn encoding_name_for_lcid(lcid: u32) -> &'static str {
    if is_utf8_collation(lcid) {
        "UTF-8"
    } else {
        encoding_for_lcid(lcid).map_or("windows-1252", |encoding| encoding.name())
    }
}
/// Encodes `value` into the byte representation selected by `collation`.
///
/// With the `encoding` feature enabled:
/// * a collation for which `is_utf8()` is true yields the UTF-8 bytes of `value`;
/// * a collation whose `encoding()` is `Some` is encoded with that encoding;
/// * otherwise (no collation, or no mapped encoding) Windows-1252 is used.
///
/// With the `encoding` feature disabled the collation is ignored and `value` is encoded
/// as Windows-1252, matching the feature-enabled default. Characters that do not exist
/// in Windows-1252 are replaced by `?`.
///
/// NOTE(review): `encoding_rs`'s `Encoding::encode` is documented to substitute
/// unmappable characters with HTML numeric character references rather than `?`, so the
/// two configurations differ for unmappable input — confirm that is acceptable.
pub fn encode_str_for_collation(
    value: &str,
    collation: Option<&crate::token::Collation>,
) -> Vec<u8> {
    #[cfg(feature = "encoding")]
    {
        if let Some(c) = collation {
            // UTF-8 collations take the string's bytes unchanged.
            if c.is_utf8() {
                return value.as_bytes().to_vec();
            }
            if let Some(encoding) = c.encoding() {
                let (encoded, _, _) = encoding.encode(value);
                return encoded.into_owned();
            }
        }
        // No collation, or a collation without a mapped encoding.
        let (encoded, _, _) = encoding_rs::WINDOWS_1252.encode(value);
        encoded.into_owned()
    }
    #[cfg(not(feature = "encoding"))]
    {
        let _ = collation;
        value
            .chars()
            .map(|ch| windows_1252_byte(ch).unwrap_or(b'?'))
            .collect()
    }
}

/// Maps `ch` to its Windows-1252 byte, or `None` when the code page has no such character.
#[cfg(not(feature = "encoding"))]
fn windows_1252_byte(ch: char) -> Option<u8> {
    match ch {
        // ASCII and the upper Latin-1 half share their values with Windows-1252. The
        // five C1 controls listed here are the only ones the code page keeps in place.
        '\u{0000}'..='\u{007F}'
        | '\u{00A0}'..='\u{00FF}'
        | '\u{0081}'
        | '\u{008D}'
        | '\u{008F}'
        | '\u{0090}'
        | '\u{009D}' => Some(ch as u8),
        // Bytes 0x80..=0x9F hold printable characters that live elsewhere in Unicode.
        '\u{20AC}' => Some(0x80),
        '\u{201A}' => Some(0x82),
        '\u{0192}' => Some(0x83),
        '\u{201E}' => Some(0x84),
        '\u{2026}' => Some(0x85),
        '\u{2020}' => Some(0x86),
        '\u{2021}' => Some(0x87),
        '\u{02C6}' => Some(0x88),
        '\u{2030}' => Some(0x89),
        '\u{0160}' => Some(0x8A),
        '\u{2039}' => Some(0x8B),
        '\u{0152}' => Some(0x8C),
        '\u{017D}' => Some(0x8E),
        '\u{2018}' => Some(0x91),
        '\u{2019}' => Some(0x92),
        '\u{201C}' => Some(0x93),
        '\u{201D}' => Some(0x94),
        '\u{2022}' => Some(0x95),
        '\u{2013}' => Some(0x96),
        '\u{2014}' => Some(0x97),
        '\u{02DC}' => Some(0x98),
        '\u{2122}' => Some(0x99),
        '\u{0161}' => Some(0x9A),
        '\u{203A}' => Some(0x9B),
        '\u{0153}' => Some(0x9C),
        '\u{017E}' => Some(0x9E),
        '\u{0178}' => Some(0x9F),
        // Remaining C1 controls and everything above U+00FF are not representable.
        _ => None,
    }
}
#[cfg(all(test, feature = "encoding"))]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Name of the encoding resolved for `lcid`, or `None` when nothing is mapped.
    fn name_of(lcid: u32) -> Option<&'static str> {
        encoding_for_lcid(lcid).map(|encoding| encoding.name())
    }

    /// Decodes `bytes` with the encoding resolved for `lcid`, returning the text and the
    /// decoder's error flag. Panics when `lcid` has no mapped encoding.
    fn decode_with(lcid: u32, bytes: &[u8]) -> (String, bool) {
        let encoding = encoding_for_lcid(lcid).unwrap();
        let (text, _, had_errors) = encoding.decode(bytes);
        (text.into_owned(), had_errors)
    }

    #[test]
    fn test_utf8_detection() {
        assert!(is_utf8_collation(0x0800_0409));
        assert!(!is_utf8_collation(0x0409));
    }

    #[test]
    fn test_japanese_encoding() {
        assert_eq!(name_of(0x0411), Some("Shift_JIS"));
        assert_eq!(code_page_for_lcid(0x0411), Some(932));
    }

    #[test]
    fn test_chinese_simplified_encoding() {
        assert_eq!(name_of(0x0804), Some("gb18030"));
        assert_eq!(code_page_for_lcid(0x0804), Some(936));
    }

    #[test]
    fn test_chinese_traditional_encoding() {
        assert_eq!(name_of(0x0404), Some("Big5"));
        assert_eq!(code_page_for_lcid(0x0404), Some(950));
    }

    #[test]
    fn test_korean_encoding() {
        assert_eq!(name_of(0x0412), Some("EUC-KR"));
        assert_eq!(code_page_for_lcid(0x0412), Some(949));
    }

    #[test]
    fn test_cyrillic_encoding() {
        assert_eq!(name_of(0x0419), Some("windows-1251"));
        assert_eq!(code_page_for_lcid(0x0419), Some(1251));
        assert_eq!(name_of(0x0422), Some("windows-1251"));
    }

    #[test]
    fn test_western_european_encoding() {
        assert_eq!(name_of(0x0409), Some("windows-1252"));
        assert_eq!(code_page_for_lcid(0x0409), Some(1252));
        assert_eq!(name_of(0x040C), Some("windows-1252"));
        assert_eq!(name_of(0x0407), Some("windows-1252"));
    }

    #[test]
    fn test_greek_encoding() {
        assert_eq!(name_of(0x0408), Some("windows-1253"));
        assert_eq!(code_page_for_lcid(0x0408), Some(1253));
    }

    #[test]
    fn test_turkish_encoding() {
        assert_eq!(name_of(0x041F), Some("windows-1254"));
        assert_eq!(code_page_for_lcid(0x041F), Some(1254));
    }

    #[test]
    fn test_hebrew_encoding() {
        assert_eq!(name_of(0x040D), Some("windows-1255"));
        assert_eq!(code_page_for_lcid(0x040D), Some(1255));
    }

    #[test]
    fn test_arabic_encoding() {
        assert_eq!(name_of(0x0401), Some("windows-1256"));
        assert_eq!(code_page_for_lcid(0x0401), Some(1256));
        assert_eq!(name_of(0x0429), Some("windows-1256"));
    }

    #[test]
    fn test_baltic_encoding() {
        assert_eq!(name_of(0x0425), Some("windows-1257"));
        assert_eq!(code_page_for_lcid(0x0425), Some(1257));
        assert_eq!(name_of(0x0427), Some("windows-1257"));
    }

    #[test]
    fn test_thai_encoding() {
        assert_eq!(name_of(0x041E), Some("windows-874"));
        assert_eq!(code_page_for_lcid(0x041E), Some(874));
    }

    #[test]
    fn test_vietnamese_encoding() {
        assert_eq!(name_of(0x042A), Some("windows-1258"));
        assert_eq!(code_page_for_lcid(0x042A), Some(1258));
    }

    #[test]
    fn test_unknown_lcid_fallback() {
        // An unmapped identifier has no encoding, yet still reports code page 1252.
        assert_eq!(name_of(0x9999), None);
        assert_eq!(code_page_for_lcid(0x9999), Some(1252));
    }

    #[test]
    fn test_encoding_name() {
        assert_eq!(encoding_name_for_lcid(0x0411), "Shift_JIS");
        assert_eq!(encoding_name_for_lcid(0x0419), "windows-1251");
        assert_eq!(encoding_name_for_lcid(0x0800_0409), "UTF-8");
        assert_eq!(encoding_name_for_lcid(0x9999), "windows-1252");
    }

    #[test]
    fn test_decode_chinese_text() {
        let (text, had_errors) = decode_with(0x0804, &[0xD6, 0xD0, 0xCE, 0xC4]);
        assert!(!had_errors);
        assert_eq!(text, "中文");
    }

    #[test]
    fn test_decode_cyrillic_text() {
        let (text, had_errors) = decode_with(0x0419, &[0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2]);
        assert!(!had_errors);
        assert_eq!(text, "Привет");
    }

    #[test]
    fn test_decode_japanese_text() {
        let (text, had_errors) = decode_with(0x0411, &[0x93, 0xFA, 0x96, 0x7B, 0x8C, 0xEA]);
        assert!(!had_errors);
        assert_eq!(text, "日本語");
    }
}