Skip to main content

tds_protocol/
collation.rs

1//! Collation encoding support for SQL Server VARCHAR decoding.
2//!
3//! This module provides mappings from SQL Server collation LCIDs (Locale IDs)
4//! to their corresponding character encodings, enabling proper decoding of
5//! non-UTF-8 VARCHAR data.
6//!
7//! # Supported Encodings
8//!
9//! The following encoding families are supported based on the collation's LCID:
10//!
11//! | Code Page | Encoding | Languages |
12//! |-----------|----------|-----------|
13//! | 874 | Windows-874 (TIS-620) | Thai |
14//! | 932 | Shift_JIS | Japanese |
15//! | 936 | GBK/GB18030 | Simplified Chinese |
16//! | 949 | EUC-KR | Korean |
17//! | 950 | Big5 | Traditional Chinese |
18//! | 1250 | Windows-1250 | Central/Eastern European |
19//! | 1251 | Windows-1251 | Cyrillic |
20//! | 1252 | Windows-1252 | Western European (default) |
21//! | 1253 | Windows-1253 | Greek |
22//! | 1254 | Windows-1254 | Turkish |
23//! | 1255 | Windows-1255 | Hebrew |
24//! | 1256 | Windows-1256 | Arabic |
25//! | 1257 | Windows-1257 | Baltic |
26//! | 1258 | Windows-1258 | Vietnamese |
27//!
28//! # UTF-8 Collations
29//!
30//! SQL Server 2019+ supports UTF-8 collations (suffix `_UTF8`). These are
31//! detected by checking the collation flags. When a UTF-8 collation is used,
32//! no encoding conversion is needed as the data is already UTF-8.
33//!
34//! # References
35//!
36//! - [MS-LCID: Windows Language Code Identifier Reference](https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/)
37//! - [Code Page Identifiers](https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers)
38
39#[cfg(feature = "encoding")]
40use encoding_rs::Encoding;
41
// Bring `Vec` into scope from the internal prelude for no_std + alloc builds.
43use crate::prelude::*;
44
/// Bit in the collation info field that marks a UTF-8 collation (SQL Server 2019+).
///
/// This is `fUTF8`, bit 26 (`0x0400_0000`) per the MS-TDS Collation Rule
/// Definition. Bit 27 is `FRESERVEDBIT` and must not be mistaken for it. The
/// value equals mssql-jdbc's `UTF8_IN_TDSCOLLATION = 0x4000000`.
pub const COLLATION_FLAG_UTF8: u32 = 1 << 26;

/// Mask selecting the LCID, which occupies the low 20 bits (`0x000F_FFFF`)
/// of the collation info.
pub const LCID_MASK: u32 = (1 << 20) - 1;

/// Mask selecting the low 16 bits (`0x0000_FFFF`) of the LCID, i.e. the
/// language identifier.
pub const PRIMARY_LANGUAGE_MASK: u32 = (1 << 16) - 1;
57
58/// Returns whether the collation uses UTF-8 encoding.
59///
60/// SQL Server 2019+ supports UTF-8 collations with the `_UTF8` suffix.
61/// These collations set fUTF8 (bit 26) in the collation info field.
62#[inline]
63pub fn is_utf8_collation(lcid: u32) -> bool {
64    lcid & COLLATION_FLAG_UTF8 != 0
65}
66
/// Looks up the `encoding_rs` codec that matches a collation's LCID.
///
/// The low 16 bits of `lcid` (the language identifier) are matched against
/// the Windows code page assignments from the MS-LCID reference.
///
/// # Arguments
///
/// * `lcid` - The locale ID taken from the SQL Server collation
///
/// # Returns
///
/// * `Some(&Encoding)` - The codec for a recognised language identifier
/// * `None` - For UTF-8 collations and for language identifiers that are not
///   in the table
///
/// # UTF-8 Handling
///
/// UTF-8 collations (SQL Server 2019+) yield `None` because their data is
/// already UTF-8 and needs no transcoding. Callers that receive `None` for a
/// non-UTF-8 collation are expected to fall back to Windows-1252.
#[cfg(feature = "encoding")]
pub fn encoding_for_lcid(lcid: u32) -> Option<&'static Encoding> {
    if is_utf8_collation(lcid) {
        // Nothing to transcode.
        return None;
    }

    // Reference: https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/
    let encoding = match lcid & PRIMARY_LANGUAGE_MASK {
        // ---- Code Page 932 (Shift_JIS): Japanese ----
        0x0411 => encoding_rs::SHIFT_JIS,

        // ---- Code Page 936 (GBK/GB18030): Simplified Chinese ----
        // PRC, Singapore
        0x0804 | 0x1004 => encoding_rs::GB18030,

        // ---- Code Page 950 (Big5): Traditional Chinese ----
        // Taiwan, Hong Kong SAR, Macao SAR
        0x0404 | 0x0C04 | 0x1404 => encoding_rs::BIG5,

        // ---- Code Page 949 (EUC-KR): Korean ----
        0x0412 => encoding_rs::EUC_KR,

        // ---- Code Page 874 (Windows-874/TIS-620): Thai ----
        0x041E => encoding_rs::WINDOWS_874,

        // ---- Code Page 1258: Vietnamese ----
        0x042A => encoding_rs::WINDOWS_1258,

        // ---- Code Page 1250: Central/Eastern European ----
        // Czech, Polish, Hungarian
        0x0405 | 0x0415 | 0x040E
        // Croatian, Serbian (Latin), Bosnian (Latin), Croatian (Bosnia and Herzegovina)
        | 0x041A | 0x081A | 0x141A | 0x101A
        // Slovak, Slovenian, Romanian, Albanian
        | 0x041B | 0x0424 | 0x0418 | 0x041C => encoding_rs::WINDOWS_1250,

        // ---- Code Page 1251: Cyrillic ----
        // Russian, Ukrainian, Belarusian, Bulgarian, Macedonian
        0x0419 | 0x0422 | 0x0423 | 0x0402 | 0x042F
        // Serbian (Cyrillic), Bosnian (Cyrillic)
        | 0x0C1A | 0x201A
        // Kyrgyz, Uzbek (Cyrillic), Tatar, Mongolian (Cyrillic), Sakha
        | 0x0440 | 0x0843 | 0x0444 | 0x0450 | 0x0485 => encoding_rs::WINDOWS_1251,

        // ---- Code Page 1253: Greek ----
        0x0408 => encoding_rs::WINDOWS_1253,

        // ---- Code Page 1254: Turkish, Azerbaijani (Latin) ----
        0x041F | 0x042C => encoding_rs::WINDOWS_1254,

        // ---- Code Page 1255: Hebrew ----
        0x040D => encoding_rs::WINDOWS_1255,

        // ---- Code Page 1256: Arabic script ----
        // Arabic: Saudi Arabia, Iraq, Egypt, Libya
        0x0401 | 0x0801 | 0x0C01 | 0x1001
        // Arabic: Algeria, Morocco, Tunisia, Oman
        | 0x1401 | 0x1801 | 0x1C01 | 0x2001
        // Arabic: Yemen, Syria, Jordan, Lebanon
        | 0x2401 | 0x2801 | 0x2C01 | 0x3001
        // Arabic: Kuwait, UAE, Bahrain, Qatar
        | 0x3401 | 0x3801 | 0x3C01 | 0x4001
        // Farsi/Persian, Urdu, Dari, Pashto
        | 0x0429 | 0x0420 | 0x048C | 0x0463 => encoding_rs::WINDOWS_1256,

        // ---- Code Page 1257: Baltic ----
        // The range spans Estonian (0x0425), Latvian (0x0426) and
        // Lithuanian (0x0427).
        0x0425..=0x0427 => encoding_rs::WINDOWS_1257,

        // ---- Code Page 1252: Western European ----
        // English: United States, United Kingdom, Australia, Canada,
        // New Zealand, Ireland
        0x0409 | 0x0809 | 0x0C09 | 0x1009 | 0x1409 | 0x1809
        // French: France, Belgium, Canada, Switzerland, Luxembourg
        | 0x040C | 0x080C | 0x0C0C | 0x100C | 0x140C
        // German: Germany, Switzerland, Austria, Luxembourg, Liechtenstein
        | 0x0407 | 0x0807 | 0x0C07 | 0x1007 | 0x1407
        // Spanish: Traditional Sort, Mexico, Modern Sort, Guatemala, Costa Rica
        | 0x040A | 0x080A | 0x0C0A | 0x100A | 0x140A
        // Spanish: Panama, Dominican Republic, Venezuela, Colombia, Peru
        | 0x180A | 0x1C0A | 0x200A | 0x240A | 0x280A
        // Spanish: Argentina, Ecuador, Chile, Uruguay, Paraguay
        | 0x2C0A | 0x300A | 0x340A | 0x380A | 0x3C0A
        // Spanish: Bolivia, El Salvador, Honduras, Nicaragua, Puerto Rico
        | 0x400A | 0x440A | 0x480A | 0x4C0A | 0x500A
        // Italian: Italy, Switzerland
        | 0x0410 | 0x0810
        // Portuguese: Portugal, Brazil
        | 0x0816 | 0x0416
        // Dutch: Netherlands, Belgium
        | 0x0413 | 0x0813
        // Danish, Norwegian (Bokmål), Norwegian (Nynorsk)
        | 0x0406 | 0x0414 | 0x0814
        // Swedish, Swedish (Finland), Finnish, Icelandic
        | 0x041D | 0x081D | 0x040B | 0x040F
        // Catalan, Galician, Basque
        | 0x0403 | 0x0456 | 0x042D
        // Afrikaans, Indonesian, Malay (Malaysia), Swahili
        | 0x0436 | 0x0421 | 0x043E | 0x0441 => encoding_rs::WINDOWS_1252,

        // Not in the table: the caller should use Windows-1252 as fallback.
        _ => return None,
    };

    Some(encoding)
}
257
/// Looks up the `encoding_rs` codec for a SQL collation's SortId.
///
/// A non-zero SortId marks a "SQL collation" (one of a predefined set of
/// sort orders). For those, the code page follows from the SortId rather than
/// the LCID (MS-TDS Collation rule): `SQL_Latin1_General_CP1250_CS_AS` has
/// SortId 80 → windows-1250 even though its LCID alone would resolve to
/// windows-1252.
///
/// `None` is returned for unknown SortIds and for SortIds whose code page is
/// one of the OEM pages CP437 / CP850, which are outside the WHATWG encoding
/// set and therefore not available in `encoding_rs`. The table is derived
/// from the Microsoft/mssql-jdbc SortId mapping.
#[cfg(feature = "encoding")]
pub fn encoding_for_sort_id(sort_id: u8) -> Option<&'static Encoding> {
    let encoding = match sort_id {
        // CP437 (30..=35) and CP850 (40..=49, 55..=62) are deliberately
        // absent: encoding_rs has no codec for them, so they take the `_` arm.

        // CP1252
        50..=54 => encoding_rs::WINDOWS_1252,
        71..=75 => encoding_rs::WINDOWS_1252,
        183..=186 => encoding_rs::WINDOWS_1252,
        210..=217 => encoding_rs::WINDOWS_1252,

        // CP1250, CP1251
        80..=98 => encoding_rs::WINDOWS_1250,
        104..=108 => encoding_rs::WINDOWS_1251,

        // CP1253
        112..=114 => encoding_rs::WINDOWS_1253,
        120..=124 => encoding_rs::WINDOWS_1253,

        // CP1254 .. CP1257
        128..=130 => encoding_rs::WINDOWS_1254,
        136..=138 => encoding_rs::WINDOWS_1255,
        144..=146 => encoding_rs::WINDOWS_1256,
        152..=160 => encoding_rs::WINDOWS_1257,

        // East Asian code pages
        192 | 193 | 200 => encoding_rs::SHIFT_JIS, // CP932
        194 | 195 | 201 => encoding_rs::EUC_KR,    // CP949
        196 | 197 | 202 => encoding_rs::BIG5,      // CP950
        198 | 199 | 203 => encoding_rs::GB18030,   // CP936

        // CP874
        204..=206 => encoding_rs::WINDOWS_874,

        _ => return None,
    };

    Some(encoding)
}
292
293/// Returns the Windows code page number for a SQL collation's SortId.
294///
295/// Unlike [`encoding_for_sort_id`], this reports the true code page even for
296/// the OEM pages `encoding_rs` cannot decode (437, 850), so callers can
297/// produce an accurate "unsupported code page" error. Returns `None` for
298/// unknown SortIds.
299#[cfg(feature = "encoding")]
pub fn code_page_for_sort_id(sort_id: u8) -> Option<u16> {
    let code_page: u16 = match sort_id {
        // OEM code pages: reported here even though they cannot be decoded.
        30..=35 => 437,
        40..=49 => 850,
        55..=62 => 850,

        // Windows-1252
        50..=54 => 1252,
        71..=75 => 1252,
        183..=186 => 1252,
        210..=217 => 1252,

        // Remaining single-byte Windows code pages
        80..=98 => 1250,
        104..=108 => 1251,
        112..=114 => 1253,
        120..=124 => 1253,
        128..=130 => 1254,
        136..=138 => 1255,
        144..=146 => 1256,
        152..=160 => 1257,

        // East Asian code pages
        192 | 193 | 200 => 932,
        194 | 195 | 201 => 949,
        196 | 197 | 202 => 950,
        198 | 199 | 203 => 936,

        // Thai
        204..=206 => 874,

        // SortId not in the table.
        _ => return None,
    };

    Some(code_page)
}
320
/// Reports the Windows code page number associated with an LCID.
///
/// Intended for error messages and debugging. UTF-8 collations report 65001.
/// Language identifiers that are not listed report 1252, so the result is
/// always `Some`.
#[cfg(feature = "encoding")]
pub fn code_page_for_lcid(lcid: u32) -> Option<u16> {
    const UTF8_CODE_PAGE: u16 = 65001;

    if is_utf8_collation(lcid) {
        return Some(UTF8_CODE_PAGE);
    }

    let code_page: u16 = match lcid & PRIMARY_LANGUAGE_MASK {
        // East and South-East Asian
        0x0411 => 932,                   // Japanese - Shift_JIS
        0x0804 | 0x1004 => 936,          // Chinese Simplified - GBK
        0x0412 => 949,                   // Korean - EUC-KR
        0x0404 | 0x0C04 | 0x1404 => 950, // Chinese Traditional - Big5
        0x041E => 874,                   // Thai
        0x042A => 1258,                  // Vietnamese

        // Central European
        0x0405 | 0x0415 | 0x040E | 0x041A | 0x081A | 0x141A | 0x101A | 0x041B | 0x0424
        | 0x0418 | 0x041C => 1250,

        // Cyrillic
        0x0419 | 0x0422 | 0x0423 | 0x0402 | 0x042F | 0x0C1A | 0x201A | 0x0440 | 0x0843
        | 0x0444 | 0x0450 | 0x0485 => 1251,

        0x0408 => 1253,          // Greek
        0x041F | 0x042C => 1254, // Turkish, Azerbaijani
        0x040D => 1255,          // Hebrew

        // Arabic
        0x0401 | 0x0801 | 0x0C01 | 0x1001 | 0x1401 | 0x1801 | 0x1C01 | 0x2001 | 0x2401
        | 0x2801 | 0x2C01 | 0x3001 | 0x3401 | 0x3801 | 0x3C01 | 0x4001 | 0x0429 | 0x0420
        | 0x048C | 0x0463 => 1256,

        // Baltic
        0x0425..=0x0427 => 1257,

        // Everything else is reported as the Western European code page.
        _ => 1252,
    };

    Some(code_page)
}
364
/// Gives a human-readable encoding name for an LCID, for display and logging.
///
/// UTF-8 collations are named `"UTF-8"`. An LCID for which
/// [`encoding_for_lcid`] has no entry is named `"windows-1252"`, the
/// default fallback.
#[cfg(feature = "encoding")]
pub fn encoding_name_for_lcid(lcid: u32) -> &'static str {
    if is_utf8_collation(lcid) {
        // Checked first because `encoding_for_lcid` reports `None` for UTF-8,
        // which would otherwise be mistaken for the fallback case.
        "UTF-8"
    } else {
        encoding_for_lcid(lcid).map_or("windows-1252", |encoding| encoding.name())
    }
}
377
/// Transcodes UTF-8 text into `encoding`, writing `?` for every character
/// the target code page cannot represent.
///
/// The convenience method `Encoding::encode()` from `encoding_rs` is not
/// used because it applies HTML-form substitution: an unmappable character
/// turns into a decimal numeric character reference such as `&#20320;`, so a
/// single CJK character would silently become eight ASCII characters of
/// markup. SQL Server and the other first-party drivers (ADO.NET, JDBC,
/// ODBC) substitute `?` instead, so the lower-level encoder is driven by
/// hand here to follow that convention.
#[cfg(feature = "encoding")]
fn encode_lossy_question_mark(value: &str, encoding: &'static encoding_rs::Encoding) -> Vec<u8> {
    use encoding_rs::EncoderResult;

    let mut encoder = encoding.new_encoder();
    let mut encoded = Vec::with_capacity(value.len());
    let mut scratch = [0u8; 1024];
    // Byte offset into `value` of the first not-yet-consumed character.
    let mut consumed = 0;

    loop {
        let (status, read, written) =
            encoder.encode_from_utf8_without_replacement(&value[consumed..], &mut scratch, true);
        encoded.extend_from_slice(&scratch[..written]);
        consumed += read;

        match status {
            EncoderResult::InputEmpty => return encoded,
            // Scratch buffer was flushed above; go around for the rest.
            EncoderResult::OutputFull => {}
            // The unmappable character is already counted in `read`.
            EncoderResult::Unmappable(_) => encoded.push(b'?'),
        }
    }
}
407
408/// Low-level collation-aware string encoder shared across the workspace crates.
409///
410/// Internal plumbing reached cross-crate only via [`crate::__private`]; not
411/// public API and exempt from semver guarantees (see #242).
412pub(crate) mod sealed {
413    use super::*;
414
415    /// Transcode a Rust `&str` into single-byte VARCHAR bytes for the given collation.
416    ///
417    /// - UTF-8 collations (SQL Server 2019+) pass through as raw UTF-8 bytes.
418    /// - Known non-UTF-8 LCIDs transcode via the matching `encoding_rs` codec.
419    /// - Unknown or `None` collations fall back to Windows-1252 (Latin1_General_CI_AS).
420    ///
421    /// Characters not representable in the target codepage are replaced with `?`,
422    /// matching SQL Server's own conversion behavior and the other first-party
423    /// drivers. (Regardless of whether the `encoding` feature is enabled.)
424    pub fn encode_str_for_collation(
425        value: &str,
426        collation: Option<&crate::token::Collation>,
427    ) -> Vec<u8> {
428        #[cfg(feature = "encoding")]
429        {
430            if let Some(c) = collation {
431                if c.is_utf8() {
432                    return value.as_bytes().to_vec();
433                }
434                if let Some(encoding) = c.encoding() {
435                    return encode_lossy_question_mark(value, encoding);
436                }
437            }
438            encode_lossy_question_mark(value, encoding_rs::WINDOWS_1252)
439        }
440        #[cfg(not(feature = "encoding"))]
441        {
442            let _ = collation;
443            value
444                .chars()
445                .map(|ch| if (ch as u32) <= 0xFF { ch as u8 } else { b'?' })
446                .collect()
447        }
448    }
449}
450
451// Keep the crate-private path `crate::collation::encode_str_for_collation`
452// working for intra-crate callers (rpc, tvp); off the public surface.
453pub(crate) use sealed::encode_str_for_collation;
454
#[cfg(all(test, feature = "encoding"))]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Name of the codec `encoding_for_lcid` selects, or `None` when it
    /// selects nothing.
    fn lcid_encoding_name(lcid: u32) -> Option<&'static str> {
        encoding_for_lcid(lcid).map(|encoding| encoding.name())
    }

    /// Name of the codec `encoding_for_sort_id` selects, or `None` when it
    /// selects nothing.
    fn sort_id_encoding_name(sort_id: u8) -> Option<&'static str> {
        encoding_for_sort_id(sort_id).map(|encoding| encoding.name())
    }

    #[test]
    fn test_utf8_detection() {
        // fUTF8 is bit 26 (0x0400_0000), the same value as mssql-jdbc's
        // UTF8_IN_TDSCOLLATION; a real Latin1_General_100_*_UTF8 collation
        // info field carries it.
        let english_utf8 = 0x0400_0409;
        let english_plain = 0x0409;
        // Regression (issue #153): bit 27 is FRESERVEDBIT, not fUTF8. Reading
        // it as the UTF-8 flag made is_utf8() always false for real _UTF8
        // collations.
        let english_reserved_bit = 0x0800_0409;

        assert!(is_utf8_collation(english_utf8));
        assert!(!is_utf8_collation(english_plain));
        assert!(!is_utf8_collation(english_reserved_bit));
    }

    /// Characters the code page cannot hold must be written as `?` (the
    /// SQL Server / ADO.NET / JDBC convention) and never as the decimal
    /// numeric character references (`&#20320;`) that
    /// `encoding_rs::Encoding::encode()` emits for HTML form submission. A
    /// regression would silently turn one CJK character into eight ASCII
    /// characters of markup in the database.
    #[test]
    fn test_unmappable_chars_become_question_marks() {
        // No collation -> Windows-1252: Ω and 你 cannot be mapped, € is 0x80.
        let mixed = encode_str_for_collation("aΩ€你b", None);
        assert_eq!(mixed, b"a?\x80?b");
        // Exactly one `?` per character, no NCR entities.
        let all_unmappable = encode_str_for_collation("你好世界", None);
        assert_eq!(all_unmappable, b"????");

        // With a Chinese collation the same characters are mappable and must
        // come out exactly as the GB18030 codec encodes them.
        let chinese = crate::token::Collation {
            lcid: 0x0804,
            sort_id: 0,
        };
        let via_collation = encode_str_for_collation("你好", Some(&chinese));
        let (via_codec, _, _) = encoding_rs::GB18030.encode("你好");
        assert_eq!(via_collation, via_codec.into_owned());
    }

    #[test]
    fn test_japanese_encoding() {
        assert_eq!(lcid_encoding_name(0x0411), Some("Shift_JIS"));
        assert_eq!(code_page_for_lcid(0x0411), Some(932));
    }

    #[test]
    fn test_chinese_simplified_encoding() {
        assert_eq!(lcid_encoding_name(0x0804), Some("gb18030"));
        assert_eq!(code_page_for_lcid(0x0804), Some(936));
    }

    #[test]
    fn test_chinese_traditional_encoding() {
        assert_eq!(lcid_encoding_name(0x0404), Some("Big5"));
        assert_eq!(code_page_for_lcid(0x0404), Some(950));
    }

    #[test]
    fn test_korean_encoding() {
        assert_eq!(lcid_encoding_name(0x0412), Some("EUC-KR"));
        assert_eq!(code_page_for_lcid(0x0412), Some(949));
    }

    #[test]
    fn test_cyrillic_encoding() {
        // Russian
        assert_eq!(lcid_encoding_name(0x0419), Some("windows-1251"));
        assert_eq!(code_page_for_lcid(0x0419), Some(1251));

        // Ukrainian
        assert_eq!(lcid_encoding_name(0x0422), Some("windows-1251"));
    }

    #[test]
    fn test_western_european_encoding() {
        // English (US)
        assert_eq!(lcid_encoding_name(0x0409), Some("windows-1252"));
        assert_eq!(code_page_for_lcid(0x0409), Some(1252));

        // French
        assert_eq!(lcid_encoding_name(0x040C), Some("windows-1252"));

        // German
        assert_eq!(lcid_encoding_name(0x0407), Some("windows-1252"));
    }

    #[test]
    fn test_greek_encoding() {
        assert_eq!(lcid_encoding_name(0x0408), Some("windows-1253"));
        assert_eq!(code_page_for_lcid(0x0408), Some(1253));
    }

    #[test]
    fn test_turkish_encoding() {
        assert_eq!(lcid_encoding_name(0x041F), Some("windows-1254"));
        assert_eq!(code_page_for_lcid(0x041F), Some(1254));
    }

    #[test]
    fn test_hebrew_encoding() {
        assert_eq!(lcid_encoding_name(0x040D), Some("windows-1255"));
        assert_eq!(code_page_for_lcid(0x040D), Some(1255));
    }

    #[test]
    fn test_arabic_encoding() {
        // Arabic (Saudi Arabia)
        assert_eq!(lcid_encoding_name(0x0401), Some("windows-1256"));
        assert_eq!(code_page_for_lcid(0x0401), Some(1256));

        // Farsi/Persian
        assert_eq!(lcid_encoding_name(0x0429), Some("windows-1256"));
    }

    #[test]
    fn test_baltic_encoding() {
        // Estonian
        assert_eq!(lcid_encoding_name(0x0425), Some("windows-1257"));
        assert_eq!(code_page_for_lcid(0x0425), Some(1257));

        // Lithuanian
        assert_eq!(lcid_encoding_name(0x0427), Some("windows-1257"));
    }

    /// Issue #158: for a SQL collation the code page is determined by the
    /// SortId rather than the LCID. Ignoring the SortId made these decode
    /// silently as windows-1252.
    #[test]
    fn test_sort_id_drives_encoding() {
        // SQL_Latin1_General_CP1250_CS_AS: SortId 80 paired with LCID 0x0409
        // (English, which on its own would resolve to windows-1252).
        assert_eq!(sort_id_encoding_name(80), Some("windows-1250"));
        assert_eq!(code_page_for_sort_id(80), Some(1250));

        // Cyrillic, Greek, Turkish, Hebrew, Arabic and Baltic SQL collations.
        let single_byte_cases: &[(u8, &str)] = &[
            (105, "windows-1251"),
            (112, "windows-1253"),
            (128, "windows-1254"),
            (136, "windows-1255"),
            (144, "windows-1256"),
            (152, "windows-1257"),
        ];
        for &(sort_id, expected) in single_byte_cases {
            assert_eq!(
                sort_id_encoding_name(sort_id),
                Some(expected),
                "SortId {sort_id}"
            );
        }

        // SortId 52 is the common default SQL_Latin1_General_CP1_CI_AS: 1252.
        assert_eq!(sort_id_encoding_name(52), Some("windows-1252"));

        // CJK SQL collations.
        assert_eq!(sort_id_encoding_name(192), Some("Shift_JIS"));
        assert_eq!(sort_id_encoding_name(198), Some("gb18030"));

        // Issue #187: SortId 201 is NLS_CP949_CS (Korean Wansung
        // case-sensitive), so it is CP949 and not CP950/Big5. The next SortId
        // remains CP950.
        assert_eq!(sort_id_encoding_name(201), Some("EUC-KR"));
        assert_eq!(code_page_for_sort_id(201), Some(949));
        assert_eq!(sort_id_encoding_name(202), Some("Big5"));
        assert_eq!(code_page_for_sort_id(202), Some(950));

        // CP437/CP850 have no encoding_rs codec, yet the real code page must
        // still be reported so error messages are accurate.
        assert_eq!(encoding_for_sort_id(40), None);
        assert_eq!(code_page_for_sort_id(40), Some(850));
        assert_eq!(encoding_for_sort_id(30), None);
        assert_eq!(code_page_for_sort_id(30), Some(437));

        // Unknown SortId.
        assert_eq!(encoding_for_sort_id(250), None);
        assert_eq!(code_page_for_sort_id(250), None);
    }

    /// Issue #158: the accessors on `Collation` have to branch on SortId.
    #[test]
    fn test_collation_methods_consult_sort_id() {
        use crate::token::Collation;

        // English LCID with SortId 80 (CP1250): the answer must NOT be 1252.
        let sql_collation = Collation {
            lcid: 0x0409,
            sort_id: 80,
        };
        let sql_encoding_name = sql_collation.encoding().map(|e| e.name());
        assert_eq!(sql_encoding_name, Some("windows-1250"));
        assert_eq!(sql_collation.code_page(), Some(1250));
        assert_eq!(sql_collation.encoding_name(), "windows-1250");

        // A Windows collation (SortId 0) keeps resolving through the LCID.
        let win_collation = Collation {
            lcid: 0x0419, // Russian
            sort_id: 0,
        };
        let win_encoding_name = win_collation.encoding().map(|e| e.name());
        assert_eq!(win_encoding_name, Some("windows-1251"));
    }

    #[test]
    fn test_thai_encoding() {
        assert_eq!(lcid_encoding_name(0x041E), Some("windows-874"));
        assert_eq!(code_page_for_lcid(0x041E), Some(874));
    }

    #[test]
    fn test_vietnamese_encoding() {
        assert_eq!(lcid_encoding_name(0x042A), Some("windows-1258"));
        assert_eq!(code_page_for_lcid(0x042A), Some(1258));
    }

    #[test]
    fn test_unknown_lcid_fallback() {
        // No encoding for an unknown LCID (the caller uses Windows-1252)...
        assert_eq!(lcid_encoding_name(0x9999), None);
        // ...while the reported code page defaults to 1252.
        assert_eq!(code_page_for_lcid(0x9999), Some(1252));
    }

    #[test]
    fn test_encoding_name() {
        let cases: &[(u32, &str)] = &[
            (0x0411, "Shift_JIS"),
            (0x0419, "windows-1251"),
            (0x0400_0409, "UTF-8"),
            (0x9999, "windows-1252"), // fallback
        ];
        for &(lcid, expected) in cases {
            assert_eq!(encoding_name_for_lcid(lcid), expected, "LCID {lcid:#x}");
        }
    }

    #[test]
    fn test_decode_chinese_text() {
        // "中文" in GB18030 encoding
        let gb_bytes = [0xD6, 0xD0, 0xCE, 0xC4];
        let codec = encoding_for_lcid(0x0804).unwrap();
        let (text, _, had_errors) = codec.decode(&gb_bytes);
        assert!(!had_errors);
        assert_eq!(text, "中文");
    }

    #[test]
    fn test_decode_cyrillic_text() {
        // "Привет" in Windows-1251
        let cp1251_bytes = [0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2];
        let codec = encoding_for_lcid(0x0419).unwrap();
        let (text, _, had_errors) = codec.decode(&cp1251_bytes);
        assert!(!had_errors);
        assert_eq!(text, "Привет");
    }

    #[test]
    fn test_decode_japanese_text() {
        // "日本語" in Shift_JIS
        let sjis_bytes = [0x93, 0xFA, 0x96, 0x7B, 0x8C, 0xEA];
        let codec = encoding_for_lcid(0x0411).unwrap();
        let (text, _, had_errors) = codec.decode(&sjis_bytes);
        assert!(!had_errors);
        assert_eq!(text, "日本語");
    }
}