tds_protocol/
collation.rs

1//! Collation encoding support for SQL Server VARCHAR decoding.
2//!
3//! This module provides mappings from SQL Server collation LCIDs (Locale IDs)
4//! to their corresponding character encodings, enabling proper decoding of
5//! non-UTF-8 VARCHAR data.
6//!
7//! # Supported Encodings
8//!
9//! The following encoding families are supported based on the collation's LCID:
10//!
11//! | Code Page | Encoding | Languages |
12//! |-----------|----------|-----------|
13//! | 874 | Windows-874 (TIS-620) | Thai |
14//! | 932 | Shift_JIS | Japanese |
15//! | 936 | GBK/GB18030 | Simplified Chinese |
16//! | 949 | EUC-KR | Korean |
17//! | 950 | Big5 | Traditional Chinese |
18//! | 1250 | Windows-1250 | Central/Eastern European |
19//! | 1251 | Windows-1251 | Cyrillic |
20//! | 1252 | Windows-1252 | Western European (default) |
21//! | 1253 | Windows-1253 | Greek |
22//! | 1254 | Windows-1254 | Turkish |
23//! | 1255 | Windows-1255 | Hebrew |
24//! | 1256 | Windows-1256 | Arabic |
25//! | 1257 | Windows-1257 | Baltic |
26//! | 1258 | Windows-1258 | Vietnamese |
27//!
28//! # UTF-8 Collations
29//!
30//! SQL Server 2019+ supports UTF-8 collations (suffix `_UTF8`). These are
31//! detected by checking the collation flags. When a UTF-8 collation is used,
32//! no encoding conversion is needed as the data is already UTF-8.
33//!
34//! # References
35//!
36//! - [MS-LCID: Windows Language Code Identifier Reference](https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/)
37//! - [Code Page Identifiers](https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers)
38
39#[cfg(feature = "encoding")]
40use encoding_rs::Encoding;
41
/// Flag bit that marks a UTF-8 collation (SQL Server 2019+).
///
/// Defined here as bit 27 of the collation info value (`0x0800_0000`).
///
/// NOTE(review): the MS-TDS `COLLATION` rule lists `fUTF8` as the seventh
/// flag after the 20-bit LCID, which would be bit 26 (`0x0400_0000`), with
/// bit 27 being the reserved flag. Confirm against the wire layout used by
/// the callers before relying on this value.
pub const COLLATION_FLAG_UTF8: u32 = 1 << 27;

/// Mask selecting the LCID portion of the collation info.
///
/// The LCID occupies the lower 20 bits, so this equals `0x000F_FFFF`.
pub const LCID_MASK: u32 = (1 << 20) - 1;

/// Mask selecting the lower 16 bits of the LCID (`0x0000_FFFF`).
///
/// The masked value still distinguishes regional variants of a language
/// (for example `0x0804` and `0x1004` are separate values); only the bits
/// above the low 16 are discarded.
pub const PRIMARY_LANGUAGE_MASK: u32 = (1 << 16) - 1;
52
53/// Returns whether the collation uses UTF-8 encoding.
54///
55/// SQL Server 2019+ supports UTF-8 collations with the `_UTF8` suffix.
56/// These collations set bit 27 in the collation info field.
57#[inline]
58pub fn is_utf8_collation(lcid: u32) -> bool {
59    lcid & COLLATION_FLAG_UTF8 != 0
60}
61
/// Returns the encoding for a given LCID, if known.
///
/// This function maps SQL Server collation LCIDs to their corresponding
/// character encodings from the `encoding_rs` crate. It is the single source
/// of truth for the LCID table; [`code_page_for_lcid`] derives its answer
/// from the encoding returned here.
///
/// # Arguments
///
/// * `lcid` - The collation info / locale ID from the SQL Server collation.
///   Only the low 16 bits (see [`PRIMARY_LANGUAGE_MASK`]) take part in the
///   lookup, so sort-order variants of the same locale share one entry.
///
/// # Returns
///
/// * `Some(&Encoding)` - The corresponding encoding if the LCID is recognized
/// * `None` - If the LCID is not recognized or uses UTF-8
///
/// # UTF-8 Handling
///
/// UTF-8 collations (SQL Server 2019+) return `None` because no transcoding
/// is needed - the data is already valid UTF-8. Callers that need to tell
/// "UTF-8" apart from "unknown" should check [`is_utf8_collation`] first.
#[cfg(feature = "encoding")]
pub fn encoding_for_lcid(lcid: u32) -> Option<&'static Encoding> {
    // UTF-8 collations don't need transcoding
    if is_utf8_collation(lcid) {
        return None;
    }

    // Drop the sort ID and flag bits; what remains identifies the locale.
    let primary_lang = lcid & PRIMARY_LANGUAGE_MASK;

    // Map LCID to encoding based on Windows code page assignments
    // Reference: https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/
    match primary_lang {
        // Japanese (Code Page 932 - Shift_JIS)
        0x0411 => Some(encoding_rs::SHIFT_JIS),

        // Chinese - Simplified (Code Page 936 - GBK/GB18030)
        // Includes: zh-CN, zh-SG
        0x0804 | // Chinese (Simplified, PRC)
        0x1004   // Chinese (Simplified, Singapore)
        => Some(encoding_rs::GB18030),

        // Chinese - Traditional (Code Page 950 - Big5)
        // Includes: zh-TW, zh-HK, zh-MO
        0x0404 | // Chinese (Traditional, Taiwan)
        0x0C04 | // Chinese (Traditional, Hong Kong SAR)
        0x1404   // Chinese (Traditional, Macao SAR)
        => Some(encoding_rs::BIG5),

        // Korean (Code Page 949 - EUC-KR)
        0x0412 => Some(encoding_rs::EUC_KR),

        // Thai (Code Page 874 - Windows-874/TIS-620)
        0x041E => Some(encoding_rs::WINDOWS_874),

        // Vietnamese (Code Page 1258)
        0x042A => Some(encoding_rs::WINDOWS_1258),

        // Central/Eastern European (Code Page 1250)
        // Includes: Czech, Polish, Hungarian, Croatian, Slovak, Slovenian, Romanian, Albanian
        0x0405 | // Czech
        0x0415 | // Polish
        0x040E | // Hungarian
        0x041A | // Croatian
        0x081A | // Serbian (Latin)
        0x141A | // Bosnian (Latin)
        0x101A | // Croatian (Bosnia and Herzegovina)
        0x041B | // Slovak
        0x0424 | // Slovenian
        0x0418 | // Romanian
        0x041C | // Albanian
        0x0442   // Turkmen
        => Some(encoding_rs::WINDOWS_1250),

        // Cyrillic (Code Page 1251)
        // Includes: Russian, Ukrainian, Belarusian, Bulgarian, Macedonian, Serbian Cyrillic
        0x0419 | // Russian
        0x0422 | // Ukrainian
        0x0423 | // Belarusian
        0x0402 | // Bulgarian
        0x042F | // Macedonian
        0x0C1A | // Serbian (Cyrillic)
        0x201A | // Bosnian (Cyrillic)
        0x0440 | // Kyrgyz
        0x0843 | // Uzbek (Cyrillic)
        0x0444 | // Tatar
        0x0450 | // Mongolian (Cyrillic)
        0x0485 | // Sakha
        0x043F | // Kazakh
        0x082C | // Azerbaijani (Cyrillic)
        0x046D   // Bashkir
        => Some(encoding_rs::WINDOWS_1251),

        // Greek (Code Page 1253)
        0x0408 => Some(encoding_rs::WINDOWS_1253),

        // Turkish (Code Page 1254)
        0x041F | // Turkish
        0x042C | // Azerbaijani (Latin)
        0x0443   // Uzbek (Latin)
        => Some(encoding_rs::WINDOWS_1254),

        // Hebrew (Code Page 1255)
        0x040D => Some(encoding_rs::WINDOWS_1255),

        // Arabic (Code Page 1256)
        // Includes all Arabic variants and Farsi/Persian, Urdu, etc.
        0x0401 | // Arabic (Saudi Arabia)
        0x0801 | // Arabic (Iraq)
        0x0C01 | // Arabic (Egypt)
        0x1001 | // Arabic (Libya)
        0x1401 | // Arabic (Algeria)
        0x1801 | // Arabic (Morocco)
        0x1C01 | // Arabic (Tunisia)
        0x2001 | // Arabic (Oman)
        0x2401 | // Arabic (Yemen)
        0x2801 | // Arabic (Syria)
        0x2C01 | // Arabic (Jordan)
        0x3001 | // Arabic (Lebanon)
        0x3401 | // Arabic (Kuwait)
        0x3801 | // Arabic (UAE)
        0x3C01 | // Arabic (Bahrain)
        0x4001 | // Arabic (Qatar)
        0x0429 | // Farsi/Persian
        0x0420 | // Urdu
        0x048C | // Dari
        0x0463 | // Pashto
        0x0480   // Uyghur
        => Some(encoding_rs::WINDOWS_1256),

        // Baltic (Code Page 1257)
        0x0425..=0x0427 | // Estonian, Latvian, Lithuanian
        0x0827            // Lithuanian (Classic)
        => Some(encoding_rs::WINDOWS_1257),

        // Western European (Code Page 1252) - Default for most European languages
        // Includes: English, French, German, Spanish, Italian, Portuguese, Dutch, etc.
        0x0409 | // English (United States)
        0x0809 | // English (United Kingdom)
        0x0C09 | // English (Australia)
        0x1009 | // English (Canada)
        0x1409 | // English (New Zealand)
        0x1809 | // English (Ireland)
        0x040C | // French (France)
        0x080C | // French (Belgium)
        0x0C0C | // French (Canada)
        0x100C | // French (Switzerland)
        0x140C | // French (Luxembourg)
        0x0407 | // German (Germany)
        0x0807 | // German (Switzerland)
        0x0C07 | // German (Austria)
        0x1007 | // German (Luxembourg)
        0x1407 | // German (Liechtenstein)
        0x040A | // Spanish (Traditional Sort)
        0x080A | // Spanish (Mexico)
        0x0C0A | // Spanish (Modern Sort)
        0x100A | // Spanish (Guatemala)
        0x140A | // Spanish (Costa Rica)
        0x180A | // Spanish (Panama)
        0x1C0A | // Spanish (Dominican Republic)
        0x200A | // Spanish (Venezuela)
        0x240A | // Spanish (Colombia)
        0x280A | // Spanish (Peru)
        0x2C0A | // Spanish (Argentina)
        0x300A | // Spanish (Ecuador)
        0x340A | // Spanish (Chile)
        0x380A | // Spanish (Uruguay)
        0x3C0A | // Spanish (Paraguay)
        0x400A | // Spanish (Bolivia)
        0x440A | // Spanish (El Salvador)
        0x480A | // Spanish (Honduras)
        0x4C0A | // Spanish (Nicaragua)
        0x500A | // Spanish (Puerto Rico)
        0x0410 | // Italian (Italy)
        0x0810 | // Italian (Switzerland)
        0x0816 | // Portuguese (Portugal)
        0x0416 | // Portuguese (Brazil)
        0x0413 | // Dutch (Netherlands)
        0x0813 | // Dutch (Belgium)
        0x0406 | // Danish
        0x0414 | // Norwegian (Bokmål)
        0x0814 | // Norwegian (Nynorsk)
        0x041D | // Swedish
        0x081D | // Swedish (Finland)
        0x040B | // Finnish
        0x040F | // Icelandic
        0x0403 | // Catalan
        0x0456 | // Galician
        0x042D | // Basque
        0x0436 | // Afrikaans
        0x0421 | // Indonesian
        0x043E | // Malay (Malaysia)
        0x0441   // Swahili
        => Some(encoding_rs::WINDOWS_1252),

        // Unknown LCID - return None, caller should use Windows-1252 as fallback
        _ => None,
    }
}

/// Returns the Windows code page number for a given LCID.
///
/// This is useful for error messages and debugging.
///
/// The result is derived from [`encoding_for_lcid`] rather than from a second
/// hand-maintained LCID table, so the reported code page always matches the
/// encoding that is actually used for decoding.
///
/// # Returns
///
/// * `Some(65001)` for UTF-8 collations.
/// * `Some(code_page)` for every recognized LCID.
/// * `Some(1252)` for unrecognized LCIDs, mirroring the Windows-1252
///   fallback callers apply when [`encoding_for_lcid`] returns `None`.
///
/// The function currently never returns `None`; the `Option` is kept for
/// interface compatibility.
#[cfg(feature = "encoding")]
pub fn code_page_for_lcid(lcid: u32) -> Option<u16> {
    if is_utf8_collation(lcid) {
        return Some(65001); // UTF-8
    }

    // `Encoding` values are compared by identity, so each guard checks for
    // one specific `encoding_rs` static.
    let code_page = match encoding_for_lcid(lcid) {
        Some(enc) if enc == encoding_rs::SHIFT_JIS => 932,
        // GB18030 is the decoder chosen for code page 936 collations.
        Some(enc) if enc == encoding_rs::GB18030 => 936,
        Some(enc) if enc == encoding_rs::EUC_KR => 949,
        Some(enc) if enc == encoding_rs::BIG5 => 950,
        Some(enc) if enc == encoding_rs::WINDOWS_874 => 874,
        Some(enc) if enc == encoding_rs::WINDOWS_1250 => 1250,
        Some(enc) if enc == encoding_rs::WINDOWS_1251 => 1251,
        Some(enc) if enc == encoding_rs::WINDOWS_1253 => 1253,
        Some(enc) if enc == encoding_rs::WINDOWS_1254 => 1254,
        Some(enc) if enc == encoding_rs::WINDOWS_1255 => 1255,
        Some(enc) if enc == encoding_rs::WINDOWS_1256 => 1256,
        Some(enc) if enc == encoding_rs::WINDOWS_1257 => 1257,
        Some(enc) if enc == encoding_rs::WINDOWS_1258 => 1258,
        // Windows-1252 itself, plus the default for unknown LCIDs.
        _ => 1252,
    };

    Some(code_page)
}
296
/// Returns a human-readable encoding name for display/logging purposes.
///
/// * UTF-8 collations yield `"UTF-8"`.
/// * Recognized LCIDs yield the name reported by the matching `encoding_rs`
///   encoding.
/// * Anything else yields `"windows-1252"`, the default fallback.
#[cfg(feature = "encoding")]
pub fn encoding_name_for_lcid(lcid: u32) -> &'static str {
    if is_utf8_collation(lcid) {
        "UTF-8"
    } else {
        // Default fallback when the LCID has no known encoding.
        encoding_for_lcid(lcid).map_or("windows-1252", |known| known.name())
    }
}
309
#[cfg(all(test, feature = "encoding"))]
// `unwrap` is acceptable here: a failed unwrap is a test failure.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Asserts that `lcid` resolves to the encoding named `expected_name`.
    #[track_caller]
    fn assert_encoding(lcid: u32, expected_name: &str) {
        let resolved = encoding_for_lcid(lcid);
        assert!(resolved.is_some());
        assert_eq!(resolved.unwrap().name(), expected_name);
    }

    /// Asserts both the encoding name and the code page reported for `lcid`.
    #[track_caller]
    fn assert_mapping(lcid: u32, expected_name: &str, expected_code_page: u16) {
        assert_encoding(lcid, expected_name);
        assert_eq!(code_page_for_lcid(lcid), Some(expected_code_page));
    }

    /// Decodes `bytes` with the encoding for `lcid`, failing on decode errors.
    #[track_caller]
    fn decode_without_errors(lcid: u32, bytes: &[u8]) -> String {
        let encoding = encoding_for_lcid(lcid).unwrap();
        let (text, _, had_errors) = encoding.decode(bytes);
        assert!(!had_errors);
        text.into_owned()
    }

    #[test]
    fn test_utf8_detection() {
        // English with the UTF-8 collation flag set
        assert!(is_utf8_collation(0x0800_0409));
        // English without the UTF-8 collation flag
        assert!(!is_utf8_collation(0x0409));
    }

    #[test]
    fn test_japanese_encoding() {
        assert_mapping(0x0411, "Shift_JIS", 932);
    }

    #[test]
    fn test_chinese_simplified_encoding() {
        assert_mapping(0x0804, "gb18030", 936);
    }

    #[test]
    fn test_chinese_traditional_encoding() {
        assert_mapping(0x0404, "Big5", 950);
    }

    #[test]
    fn test_korean_encoding() {
        assert_mapping(0x0412, "EUC-KR", 949);
    }

    #[test]
    fn test_cyrillic_encoding() {
        // Russian
        assert_mapping(0x0419, "windows-1251", 1251);
        // Ukrainian
        assert_encoding(0x0422, "windows-1251");
    }

    #[test]
    fn test_western_european_encoding() {
        // English (US)
        assert_mapping(0x0409, "windows-1252", 1252);
        // French
        assert_encoding(0x040C, "windows-1252");
        // German
        assert_encoding(0x0407, "windows-1252");
    }

    #[test]
    fn test_greek_encoding() {
        assert_mapping(0x0408, "windows-1253", 1253);
    }

    #[test]
    fn test_turkish_encoding() {
        assert_mapping(0x041F, "windows-1254", 1254);
    }

    #[test]
    fn test_hebrew_encoding() {
        assert_mapping(0x040D, "windows-1255", 1255);
    }

    #[test]
    fn test_arabic_encoding() {
        // Arabic (Saudi Arabia)
        assert_mapping(0x0401, "windows-1256", 1256);
        // Farsi/Persian
        assert_encoding(0x0429, "windows-1256");
    }

    #[test]
    fn test_baltic_encoding() {
        // Estonian
        assert_mapping(0x0425, "windows-1257", 1257);
        // Lithuanian
        assert_encoding(0x0427, "windows-1257");
    }

    #[test]
    fn test_thai_encoding() {
        assert_mapping(0x041E, "windows-874", 874);
    }

    #[test]
    fn test_vietnamese_encoding() {
        assert_mapping(0x042A, "windows-1258", 1258);
    }

    #[test]
    fn test_unknown_lcid_fallback() {
        // An unknown LCID has no encoding (the caller falls back to Windows-1252)...
        assert!(encoding_for_lcid(0x9999).is_none());
        // ...but the reported code page still defaults to 1252.
        assert_eq!(code_page_for_lcid(0x9999), Some(1252));
    }

    #[test]
    fn test_encoding_name() {
        let expectations = [
            (0x0411, "Shift_JIS"),
            (0x0419, "windows-1251"),
            (0x0800_0409, "UTF-8"),
            (0x9999, "windows-1252"), // fallback
        ];
        for (lcid, expected_name) in expectations {
            assert_eq!(encoding_name_for_lcid(lcid), expected_name);
        }
    }

    #[test]
    fn test_decode_chinese_text() {
        // "中文" in GB18030 encoding
        let decoded = decode_without_errors(0x0804, &[0xD6, 0xD0, 0xCE, 0xC4]);
        assert_eq!(decoded, "中文");
    }

    #[test]
    fn test_decode_cyrillic_text() {
        // "Привет" in Windows-1251
        let decoded = decode_without_errors(0x0419, &[0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2]);
        assert_eq!(decoded, "Привет");
    }

    #[test]
    fn test_decode_japanese_text() {
        // "日本語" in Shift_JIS
        let decoded = decode_without_errors(0x0411, &[0x93, 0xFA, 0x96, 0x7B, 0x8C, 0xEA]);
        assert_eq!(decoded, "日本語");
    }
}