tds_protocol/collation.rs
1//! Collation encoding support for SQL Server VARCHAR decoding.
2//!
3//! This module provides mappings from SQL Server collation LCIDs (Locale IDs)
4//! to their corresponding character encodings, enabling proper decoding of
5//! non-UTF-8 VARCHAR data.
6//!
7//! # Supported Encodings
8//!
9//! The following encoding families are supported based on the collation's LCID:
10//!
11//! | Code Page | Encoding | Languages |
12//! |-----------|----------|-----------|
13//! | 874 | Windows-874 (TIS-620) | Thai |
14//! | 932 | Shift_JIS | Japanese |
15//! | 936 | GBK/GB18030 | Simplified Chinese |
16//! | 949 | EUC-KR | Korean |
17//! | 950 | Big5 | Traditional Chinese |
18//! | 1250 | Windows-1250 | Central/Eastern European |
19//! | 1251 | Windows-1251 | Cyrillic |
20//! | 1252 | Windows-1252 | Western European (default) |
21//! | 1253 | Windows-1253 | Greek |
22//! | 1254 | Windows-1254 | Turkish |
23//! | 1255 | Windows-1255 | Hebrew |
24//! | 1256 | Windows-1256 | Arabic |
25//! | 1257 | Windows-1257 | Baltic |
26//! | 1258 | Windows-1258 | Vietnamese |
27//!
28//! # UTF-8 Collations
29//!
30//! SQL Server 2019+ supports UTF-8 collations (suffix `_UTF8`). These are
31//! detected by checking the collation flags. When a UTF-8 collation is used,
32//! no encoding conversion is needed as the data is already UTF-8.
33//!
34//! # References
35//!
36//! - [MS-LCID: Windows Language Code Identifier Reference](https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/)
37//! - [Code Page Identifiers](https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers)
38
39#[cfg(feature = "encoding")]
40use encoding_rs::Encoding;
41
/// Bit that marks a UTF-8 collation (`_UTF8` suffix, SQL Server 2019+) in
/// the collation info value.
///
/// The flag is the single bit 27, i.e. `0x0800_0000`.
///
/// NOTE(review): MS-TDS lists `fUTF8` as the seventh flag following the
/// 20-bit LCID, which would place it at bit 26 (`0x0400_0000`) when the
/// field is read as a little-endian `u32` — confirm this value against the
/// spec and a live UTF-8 collation before relying on it.
pub const COLLATION_FLAG_UTF8: u32 = 1 << 27;
45
/// Mask that keeps the low 20 bits of the collation info value, which is
/// where the LCID is stored.
pub const LCID_MASK: u32 = (1 << 20) - 1;
49
/// Mask that keeps the low 16 bits of an LCID.
///
/// Despite the constant's name, those 16 bits form the complete Windows
/// language identifier — primary language plus sublanguage — which is what
/// lets regional variants such as `0x0804` and `0x0404` be told apart.
pub const PRIMARY_LANGUAGE_MASK: u32 = (1 << 16) - 1;
52
53/// Returns whether the collation uses UTF-8 encoding.
54///
55/// SQL Server 2019+ supports UTF-8 collations with the `_UTF8` suffix.
56/// These collations set bit 27 in the collation info field.
57#[inline]
58pub fn is_utf8_collation(lcid: u32) -> bool {
59 lcid & COLLATION_FLAG_UTF8 != 0
60}
61
/// Resolves the legacy character encoding used by a collation, if known.
///
/// Only the low 16 bits of `lcid` (selected with `PRIMARY_LANGUAGE_MASK`)
/// take part in the lookup. Each known language identifier is tied to the
/// `encoding_rs` encoding for that locale's Windows code page.
///
/// # Arguments
///
/// * `lcid` - The locale ID from the SQL Server collation; the UTF-8 flag
///   bit is also read from this value
///
/// # Returns
///
/// * `Some(&Encoding)` - The encoding for a recognized language identifier
/// * `None` - The collation is UTF-8, or the language identifier is not in
///   the table
///
/// # UTF-8 Handling
///
/// UTF-8 collations (SQL Server 2019+) yield `None` because their data needs
/// no transcoding.
///
/// # Unknown LCIDs
///
/// An unrecognized identifier also yields `None`; the caller is expected to
/// fall back to Windows-1252 in that case.
#[cfg(feature = "encoding")]
pub fn encoding_for_lcid(lcid: u32) -> Option<&'static Encoding> {
    // Nothing to transcode for UTF-8 collations, so there is no decoder to hand back.
    if is_utf8_collation(lcid) {
        return None;
    }

    let language_id = lcid & PRIMARY_LANGUAGE_MASK;

    // Arms are grouped by Windows code page, in ascending code page order.
    // Reference: https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/
    let encoding = match language_id {
        // Code page 874 (Windows-874 / TIS-620): Thai
        0x041E => encoding_rs::WINDOWS_874,

        // Code page 932 (Shift_JIS): Japanese
        0x0411 => encoding_rs::SHIFT_JIS,

        // Code page 936 (GBK/GB18030): Chinese Simplified — PRC, Singapore
        0x0804 | 0x1004 => encoding_rs::GB18030,

        // Code page 949 (EUC-KR): Korean
        0x0412 => encoding_rs::EUC_KR,

        // Code page 950 (Big5): Chinese Traditional — Taiwan, Hong Kong SAR, Macao SAR
        0x0404 | 0x0C04 | 0x1404 => encoding_rs::BIG5,

        // Code page 1250: Central/Eastern European
        // Czech, Polish, Hungarian, Croatian, Serbian (Latin), Bosnian (Latin),
        // Croatian (Bosnia and Herzegovina), Slovak, Slovenian, Romanian, Albanian
        0x0405 | 0x0415 | 0x040E | 0x041A | 0x081A | 0x141A | 0x101A | 0x041B | 0x0424
        | 0x0418 | 0x041C => encoding_rs::WINDOWS_1250,

        // Code page 1251: Cyrillic
        // Russian, Ukrainian, Belarusian, Bulgarian, Macedonian, Serbian (Cyrillic),
        // Bosnian (Cyrillic), Kyrgyz, Uzbek (Cyrillic), Tatar, Mongolian (Cyrillic), Sakha
        0x0419 | 0x0422 | 0x0423 | 0x0402 | 0x042F | 0x0C1A | 0x201A | 0x0440 | 0x0843
        | 0x0444 | 0x0450 | 0x0485 => encoding_rs::WINDOWS_1251,

        // Code page 1252: Western European
        // English: United States, United Kingdom, Australia, Canada, New Zealand, Ireland
        0x0409 | 0x0809 | 0x0C09 | 0x1009 | 0x1409 | 0x1809
        // French: France, Belgium, Canada, Switzerland, Luxembourg
        | 0x040C | 0x080C | 0x0C0C | 0x100C | 0x140C
        // German: Germany, Switzerland, Austria, Luxembourg, Liechtenstein
        | 0x0407 | 0x0807 | 0x0C07 | 0x1007 | 0x1407
        // Spanish: Traditional Sort, Mexico, Modern Sort, Guatemala, Costa Rica
        | 0x040A | 0x080A | 0x0C0A | 0x100A | 0x140A
        // Spanish: Panama, Dominican Republic, Venezuela, Colombia, Peru
        | 0x180A | 0x1C0A | 0x200A | 0x240A | 0x280A
        // Spanish: Argentina, Ecuador, Chile, Uruguay, Paraguay
        | 0x2C0A | 0x300A | 0x340A | 0x380A | 0x3C0A
        // Spanish: Bolivia, El Salvador, Honduras, Nicaragua, Puerto Rico
        | 0x400A | 0x440A | 0x480A | 0x4C0A | 0x500A
        // Italian: Italy, Switzerland
        | 0x0410 | 0x0810
        // Portuguese: Portugal, Brazil
        | 0x0816 | 0x0416
        // Dutch: Netherlands, Belgium
        | 0x0413 | 0x0813
        // Danish, Norwegian (Bokmål), Norwegian (Nynorsk)
        | 0x0406 | 0x0414 | 0x0814
        // Swedish, Swedish (Finland), Finnish, Icelandic
        | 0x041D | 0x081D | 0x040B | 0x040F
        // Catalan, Galician, Basque
        | 0x0403 | 0x0456 | 0x042D
        // Afrikaans, Indonesian, Malay (Malaysia), Swahili
        | 0x0436 | 0x0421 | 0x043E | 0x0441 => encoding_rs::WINDOWS_1252,

        // Code page 1253: Greek
        0x0408 => encoding_rs::WINDOWS_1253,

        // Code page 1254: Turkish, Azerbaijani (Latin)
        0x041F | 0x042C => encoding_rs::WINDOWS_1254,

        // Code page 1255: Hebrew
        0x040D => encoding_rs::WINDOWS_1255,

        // Code page 1256: Arabic
        // Arabic: Saudi Arabia, Iraq, Egypt, Libya, Algeria, Morocco, Tunisia, Oman
        0x0401 | 0x0801 | 0x0C01 | 0x1001 | 0x1401 | 0x1801 | 0x1C01 | 0x2001
        // Arabic: Yemen, Syria, Jordan, Lebanon, Kuwait, UAE, Bahrain, Qatar
        | 0x2401 | 0x2801 | 0x2C01 | 0x3001 | 0x3401 | 0x3801 | 0x3C01 | 0x4001
        // Farsi/Persian, Urdu, Dari, Pashto
        | 0x0429 | 0x0420 | 0x048C | 0x0463 => encoding_rs::WINDOWS_1256,

        // Code page 1257: Baltic — Estonian, Latvian, Lithuanian
        0x0425 | 0x0426 | 0x0427 => encoding_rs::WINDOWS_1257,

        // Code page 1258: Vietnamese
        0x042A => encoding_rs::WINDOWS_1258,

        // Not in the table: report "unknown" and leave the Windows-1252
        // fallback to the caller.
        _ => return None,
    };

    Some(encoding)
}
252
/// Reports the Windows code page number associated with an LCID.
///
/// Intended for error messages and debugging output.
///
/// # Returns
///
/// The result is always `Some`:
///
/// * `65001` when the UTF-8 collation flag is set
/// * the code page tied to the low 16 bits of `lcid` when they match a
///   listed language identifier
/// * `1252` (Western European) for every other value
#[cfg(feature = "encoding")]
pub fn code_page_for_lcid(lcid: u32) -> Option<u16> {
    // UTF-8 collations are reported with the UTF-8 code page identifier.
    if is_utf8_collation(lcid) {
        return Some(65001);
    }

    let code_page = match lcid & PRIMARY_LANGUAGE_MASK {
        // Thai
        0x041E => 874,

        // Japanese - Shift_JIS
        0x0411 => 932,

        // Chinese Simplified - GBK
        0x0804 | 0x1004 => 936,

        // Korean - EUC-KR
        0x0412 => 949,

        // Chinese Traditional - Big5
        0x0404 | 0x0C04 | 0x1404 => 950,

        // Central European
        0x0405 | 0x0415 | 0x040E | 0x041A | 0x081A | 0x141A | 0x101A | 0x041B | 0x0424
        | 0x0418 | 0x041C => 1250,

        // Cyrillic
        0x0419 | 0x0422 | 0x0423 | 0x0402 | 0x042F | 0x0C1A | 0x201A | 0x0440 | 0x0843
        | 0x0444 | 0x0450 | 0x0485 => 1251,

        // Greek
        0x0408 => 1253,

        // Turkish, Azerbaijani
        0x041F | 0x042C => 1254,

        // Hebrew
        0x040D => 1255,

        // Arabic
        0x0401 | 0x0801 | 0x0C01 | 0x1001 | 0x1401 | 0x1801 | 0x1C01 | 0x2001 | 0x2401
        | 0x2801 | 0x2C01 | 0x3001 | 0x3401 | 0x3801 | 0x3C01 | 0x4001 | 0x0429 | 0x0420
        | 0x048C | 0x0463 => 1256,

        // Baltic
        0x0425 | 0x0426 | 0x0427 => 1257,

        // Vietnamese
        0x042A => 1258,

        // Everything else is reported as Western European.
        _ => 1252,
    };

    Some(code_page)
}
296
/// Produces a human-readable encoding name for display and logging.
///
/// Yields `"UTF-8"` for UTF-8 collations, the `encoding_rs` name of the
/// encoding found by [`encoding_for_lcid`] for recognized LCIDs, and
/// `"windows-1252"` as the fallback for anything else.
#[cfg(feature = "encoding")]
pub fn encoding_name_for_lcid(lcid: u32) -> &'static str {
    if is_utf8_collation(lcid) {
        "UTF-8"
    } else {
        // An unrecognized LCID is labelled with the default fallback encoding.
        encoding_for_lcid(lcid).map_or("windows-1252", |encoding| encoding.name())
    }
}
309
#[cfg(all(test, feature = "encoding"))]
// Tests unwrap freely: a missing encoding should fail the test immediately.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// English (United States) with the UTF-8 collation flag set.
    const ENGLISH_UTF8: u32 = COLLATION_FLAG_UTF8 | 0x0409;

    /// Checks that `lcid` resolves to the encoding named `expected_name`.
    fn assert_encoding(lcid: u32, expected_name: &str) {
        let resolved = encoding_for_lcid(lcid).map(|encoding| encoding.name());
        assert_eq!(resolved, Some(expected_name), "LCID {:#06X}", lcid);
    }

    /// Checks both the encoding name and the code page reported for `lcid`.
    fn assert_encoding_and_code_page(lcid: u32, expected_name: &str, expected_code_page: u16) {
        assert_encoding(lcid, expected_name);
        assert_eq!(
            code_page_for_lcid(lcid),
            Some(expected_code_page),
            "LCID {:#06X}",
            lcid
        );
    }

    /// Decodes `bytes` with the encoding for `lcid`, failing on malformed input.
    fn decode_with(lcid: u32, bytes: &[u8]) -> String {
        let encoding = encoding_for_lcid(lcid).unwrap();
        let (text, _, had_errors) = encoding.decode(bytes);
        assert!(!had_errors);
        text.into_owned()
    }

    #[test]
    fn test_utf8_detection() {
        assert!(is_utf8_collation(ENGLISH_UTF8)); // flag set
        assert!(!is_utf8_collation(0x0409)); // same locale, flag clear
    }

    #[test]
    fn test_japanese_encoding() {
        assert_encoding_and_code_page(0x0411, "Shift_JIS", 932);
    }

    #[test]
    fn test_chinese_simplified_encoding() {
        assert_encoding_and_code_page(0x0804, "gb18030", 936);
    }

    #[test]
    fn test_chinese_traditional_encoding() {
        assert_encoding_and_code_page(0x0404, "Big5", 950);
    }

    #[test]
    fn test_korean_encoding() {
        assert_encoding_and_code_page(0x0412, "EUC-KR", 949);
    }

    #[test]
    fn test_cyrillic_encoding() {
        assert_encoding_and_code_page(0x0419, "windows-1251", 1251); // Russian
        assert_encoding(0x0422, "windows-1251"); // Ukrainian
    }

    #[test]
    fn test_western_european_encoding() {
        assert_encoding_and_code_page(0x0409, "windows-1252", 1252); // English (US)
        assert_encoding(0x040C, "windows-1252"); // French
        assert_encoding(0x0407, "windows-1252"); // German
    }

    #[test]
    fn test_greek_encoding() {
        assert_encoding_and_code_page(0x0408, "windows-1253", 1253);
    }

    #[test]
    fn test_turkish_encoding() {
        assert_encoding_and_code_page(0x041F, "windows-1254", 1254);
    }

    #[test]
    fn test_hebrew_encoding() {
        assert_encoding_and_code_page(0x040D, "windows-1255", 1255);
    }

    #[test]
    fn test_arabic_encoding() {
        assert_encoding_and_code_page(0x0401, "windows-1256", 1256); // Arabic (Saudi Arabia)
        assert_encoding(0x0429, "windows-1256"); // Farsi/Persian
    }

    #[test]
    fn test_baltic_encoding() {
        assert_encoding_and_code_page(0x0425, "windows-1257", 1257); // Estonian
        assert_encoding(0x0427, "windows-1257"); // Lithuanian
    }

    #[test]
    fn test_thai_encoding() {
        assert_encoding_and_code_page(0x041E, "windows-874", 874);
    }

    #[test]
    fn test_vietnamese_encoding() {
        assert_encoding_and_code_page(0x042A, "windows-1258", 1258);
    }

    #[test]
    fn test_unknown_lcid_fallback() {
        // No encoding is reported for an unknown LCID (the caller applies Windows-1252)...
        assert!(encoding_for_lcid(0x9999).is_none());
        // ...while the code page lookup itself defaults to 1252.
        assert_eq!(code_page_for_lcid(0x9999), Some(1252));
    }

    #[test]
    fn test_encoding_name() {
        let expectations = [
            (0x0411, "Shift_JIS"),
            (0x0419, "windows-1251"),
            (ENGLISH_UTF8, "UTF-8"),
            (0x9999, "windows-1252"), // fallback
        ];
        for (lcid, expected_name) in expectations.iter().copied() {
            assert_eq!(encoding_name_for_lcid(lcid), expected_name);
        }
    }

    #[test]
    fn test_decode_chinese_text() {
        // "中文" in GB18030 encoding
        let decoded = decode_with(0x0804, &[0xD6, 0xD0, 0xCE, 0xC4]);
        assert_eq!(decoded, "中文");
    }

    #[test]
    fn test_decode_cyrillic_text() {
        // "Привет" in Windows-1251
        let decoded = decode_with(0x0419, &[0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2]);
        assert_eq!(decoded, "Привет");
    }

    #[test]
    fn test_decode_japanese_text() {
        // "日本語" in Shift_JIS
        let decoded = decode_with(0x0411, &[0x93, 0xFA, 0x96, 0x7B, 0x8C, 0xEA]);
        assert_eq!(decoded, "日本語");
    }
}