tds_protocol/collation.rs
1//! Collation encoding support for SQL Server VARCHAR decoding.
2//!
3//! This module provides mappings from SQL Server collation LCIDs (Locale IDs)
4//! to their corresponding character encodings, enabling proper decoding of
5//! non-UTF-8 VARCHAR data.
6//!
7//! # Supported Encodings
8//!
9//! The following encoding families are supported based on the collation's LCID:
10//!
11//! | Code Page | Encoding | Languages |
12//! |-----------|----------|-----------|
13//! | 874 | Windows-874 (TIS-620) | Thai |
14//! | 932 | Shift_JIS | Japanese |
15//! | 936 | GBK/GB18030 | Simplified Chinese |
16//! | 949 | EUC-KR | Korean |
17//! | 950 | Big5 | Traditional Chinese |
18//! | 1250 | Windows-1250 | Central/Eastern European |
19//! | 1251 | Windows-1251 | Cyrillic |
20//! | 1252 | Windows-1252 | Western European (default) |
21//! | 1253 | Windows-1253 | Greek |
22//! | 1254 | Windows-1254 | Turkish |
23//! | 1255 | Windows-1255 | Hebrew |
24//! | 1256 | Windows-1256 | Arabic |
25//! | 1257 | Windows-1257 | Baltic |
26//! | 1258 | Windows-1258 | Vietnamese |
27//!
28//! # UTF-8 Collations
29//!
30//! SQL Server 2019+ supports UTF-8 collations (suffix `_UTF8`). These are
31//! detected by checking the collation flags. When a UTF-8 collation is used,
32//! no encoding conversion is needed as the data is already UTF-8.
33//!
34//! # References
35//!
36//! - [MS-LCID: Windows Language Code Identifier Reference](https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/)
37//! - [Code Page Identifiers](https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers)
38
39#[cfg(feature = "encoding")]
40use encoding_rs::Encoding;
41
42// Re-export Vec from the internal prelude for no_std + alloc builds.
43use crate::prelude::*;
44
/// Bit flag marking a UTF-8 collation (SQL Server 2019+).
///
/// This is fUTF8, bit 26 of the collation info field in the MS-TDS Collation
/// Rule Definition. Bit 27 is FRESERVEDBIT and must not be mistaken for it.
/// The value equals mssql-jdbc's `UTF8_IN_TDSCOLLATION = 0x4000000`.
pub const COLLATION_FLAG_UTF8: u32 = 1 << 26;

/// Mask selecting the LCID, which occupies the low 20 bits of the collation
/// info field.
pub const LCID_MASK: u32 = (1 << 20) - 1;

/// Mask selecting the low 16 bits of the LCID (the language identifier).
/// The LCID-based lookups in this module match on this masked value.
pub const PRIMARY_LANGUAGE_MASK: u32 = (1 << 16) - 1;
57
58/// Returns whether the collation uses UTF-8 encoding.
59///
60/// SQL Server 2019+ supports UTF-8 collations with the `_UTF8` suffix.
61/// These collations set fUTF8 (bit 26) in the collation info field.
62#[inline]
63pub fn is_utf8_collation(lcid: u32) -> bool {
64 lcid & COLLATION_FLAG_UTF8 != 0
65}
66
/// Resolves the `encoding_rs` codec for a collation's LCID.
///
/// Only the low 16 bits of `lcid` (see [`PRIMARY_LANGUAGE_MASK`]) take part in
/// the lookup; the table follows the Windows code page assigned to each
/// language identifier.
///
/// # Arguments
///
/// * `lcid` - The locale ID (with collation flag bits) from the SQL Server collation
///
/// # Returns
///
/// * `Some(&Encoding)` - The codec for a recognized language identifier
/// * `None` - The collation is UTF-8, or the language identifier is not in the table
///
/// # UTF-8 Handling
///
/// UTF-8 collations (SQL Server 2019+) yield `None` because the data is
/// already UTF-8 and needs no transcoding.
#[cfg(feature = "encoding")]
pub fn encoding_for_lcid(lcid: u32) -> Option<&'static Encoding> {
    // fUTF8 set: nothing to transcode.
    if is_utf8_collation(lcid) {
        return None;
    }

    // Language identifier -> Windows code page -> codec.
    // Reference: https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/
    let encoding = match lcid & PRIMARY_LANGUAGE_MASK {
        // Code Page 932 (Shift_JIS): Japanese
        0x0411 => encoding_rs::SHIFT_JIS,

        // Code Page 936 (GBK/GB18030): Chinese Simplified — PRC, Singapore
        0x0804 | 0x1004 => encoding_rs::GB18030,

        // Code Page 950 (Big5): Chinese Traditional — Taiwan, Hong Kong SAR, Macao SAR
        0x0404 | 0x0C04 | 0x1404 => encoding_rs::BIG5,

        // Code Page 949 (EUC-KR): Korean
        0x0412 => encoding_rs::EUC_KR,

        // Code Page 874 (Windows-874/TIS-620): Thai
        0x041E => encoding_rs::WINDOWS_874,

        // Code Page 1258: Vietnamese
        0x042A => encoding_rs::WINDOWS_1258,

        // Code Page 1250: Central/Eastern European
        0x0405 // Czech
        | 0x0415 // Polish
        | 0x040E // Hungarian
        | 0x041A | 0x101A // Croatian; Croatian (Bosnia and Herzegovina)
        | 0x081A // Serbian (Latin)
        | 0x141A // Bosnian (Latin)
        | 0x041B // Slovak
        | 0x0424 // Slovenian
        | 0x0418 // Romanian
        | 0x041C // Albanian
        => encoding_rs::WINDOWS_1250,

        // Code Page 1251: Cyrillic
        0x0419 // Russian
        | 0x0422 // Ukrainian
        | 0x0423 // Belarusian
        | 0x0402 // Bulgarian
        | 0x042F // Macedonian
        | 0x0C1A // Serbian (Cyrillic)
        | 0x201A // Bosnian (Cyrillic)
        | 0x0440 // Kyrgyz
        | 0x0843 // Uzbek (Cyrillic)
        | 0x0444 // Tatar
        | 0x0450 // Mongolian (Cyrillic)
        | 0x0485 // Sakha
        => encoding_rs::WINDOWS_1251,

        // Code Page 1253: Greek
        0x0408 => encoding_rs::WINDOWS_1253,

        // Code Page 1254: Turkish, Azerbaijani (Latin)
        0x041F | 0x042C => encoding_rs::WINDOWS_1254,

        // Code Page 1255: Hebrew
        0x040D => encoding_rs::WINDOWS_1255,

        // Code Page 1256: Arabic script
        // Arabic: Saudi Arabia, Iraq, Egypt, Libya, Algeria, Morocco, Tunisia, Oman
        0x0401 | 0x0801 | 0x0C01 | 0x1001 | 0x1401 | 0x1801 | 0x1C01 | 0x2001
        // Arabic: Yemen, Syria, Jordan, Lebanon, Kuwait, UAE, Bahrain, Qatar
        | 0x2401 | 0x2801 | 0x2C01 | 0x3001 | 0x3401 | 0x3801 | 0x3C01 | 0x4001
        // Farsi/Persian, Urdu, Dari, Pashto
        | 0x0429 | 0x0420 | 0x048C | 0x0463
        => encoding_rs::WINDOWS_1256,

        // Code Page 1257: Baltic — Estonian, Latvian, Lithuanian
        0x0425..=0x0427 => encoding_rs::WINDOWS_1257,

        // Code Page 1252: Western European
        // English: United States, United Kingdom, Australia, Canada, New Zealand, Ireland
        0x0409 | 0x0809 | 0x0C09 | 0x1009 | 0x1409 | 0x1809
        // French: France, Belgium, Canada, Switzerland, Luxembourg
        | 0x040C | 0x080C | 0x0C0C | 0x100C | 0x140C
        // German: Germany, Switzerland, Austria, Luxembourg, Liechtenstein
        | 0x0407 | 0x0807 | 0x0C07 | 0x1007 | 0x1407
        // Spanish: Traditional Sort, Mexico, Modern Sort, Guatemala, Costa Rica
        | 0x040A | 0x080A | 0x0C0A | 0x100A | 0x140A
        // Spanish: Panama, Dominican Republic, Venezuela, Colombia, Peru
        | 0x180A | 0x1C0A | 0x200A | 0x240A | 0x280A
        // Spanish: Argentina, Ecuador, Chile, Uruguay, Paraguay
        | 0x2C0A | 0x300A | 0x340A | 0x380A | 0x3C0A
        // Spanish: Bolivia, El Salvador, Honduras, Nicaragua, Puerto Rico
        | 0x400A | 0x440A | 0x480A | 0x4C0A | 0x500A
        // Italian: Italy, Switzerland
        | 0x0410 | 0x0810
        // Portuguese: Portugal, Brazil
        | 0x0816 | 0x0416
        // Dutch: Netherlands, Belgium
        | 0x0413 | 0x0813
        // Danish; Norwegian (Bokmål), Norwegian (Nynorsk)
        | 0x0406 | 0x0414 | 0x0814
        // Swedish, Swedish (Finland), Finnish, Icelandic
        | 0x041D | 0x081D | 0x040B | 0x040F
        // Catalan, Galician, Basque
        | 0x0403 | 0x0456 | 0x042D
        // Afrikaans, Indonesian, Malay (Malaysia), Swahili
        | 0x0436 | 0x0421 | 0x043E | 0x0441
        => encoding_rs::WINDOWS_1252,

        // Not in the table: report "unknown" and let the caller apply its
        // Windows-1252 fallback.
        _ => return None,
    };
    Some(encoding)
}
257
/// Resolves the `encoding_rs` codec for a SQL collation from its SortId.
///
/// A non-zero SortId marks a "SQL collation" (one of a predefined set of sort
/// orders). Per the MS-TDS Collation rule its code page comes from the SortId
/// rather than the LCID: `SQL_Latin1_General_CP1250_CS_AS`, for instance, has
/// SortId 80 → windows-1250 even though its LCID alone would resolve to
/// windows-1252.
///
/// Returns `None` for unknown SortIds and for SortIds on the OEM code pages
/// CP437 and CP850, which are outside the WHATWG encoding set and therefore
/// not representable in `encoding_rs`. The table is derived from the
/// Microsoft/mssql-jdbc SortId mapping.
#[cfg(feature = "encoding")]
pub fn encoding_for_sort_id(sort_id: u8) -> Option<&'static Encoding> {
    let encoding = match sort_id {
        // Deliberately absent (no encoding_rs codec):
        //   30..=35           => CP437
        //   40..=49, 55..=62  => CP850
        204..=206 => encoding_rs::WINDOWS_874,
        192 | 193 | 200 => encoding_rs::SHIFT_JIS, // CP932
        198 | 199 | 203 => encoding_rs::GB18030,   // CP936
        194 | 195 | 201 => encoding_rs::EUC_KR,    // CP949
        196 | 197 | 202 => encoding_rs::BIG5,      // CP950
        80..=98 => encoding_rs::WINDOWS_1250,
        104..=108 => encoding_rs::WINDOWS_1251,
        50..=54 | 71..=75 | 183..=186 | 210..=217 => encoding_rs::WINDOWS_1252,
        112..=114 | 120..=124 => encoding_rs::WINDOWS_1253,
        128..=130 => encoding_rs::WINDOWS_1254,
        136..=138 => encoding_rs::WINDOWS_1255,
        144..=146 => encoding_rs::WINDOWS_1256,
        152..=160 => encoding_rs::WINDOWS_1257,
        _ => return None,
    };
    Some(encoding)
}
292
/// Maps a SQL collation's SortId to its Windows code page number.
///
/// In contrast to [`encoding_for_sort_id`], the OEM pages that `encoding_rs`
/// cannot decode (437 and 850) are still reported, which lets callers build
/// an accurate "unsupported code page" error. Unknown SortIds yield `None`.
#[cfg(feature = "encoding")]
pub fn code_page_for_sort_id(sort_id: u8) -> Option<u16> {
    // The SortId ranges are disjoint, so the arms are simply listed in
    // ascending code page order.
    let code_page = match sort_id {
        30..=35 => 437,
        40..=49 | 55..=62 => 850,
        204..=206 => 874,
        192 | 193 | 200 => 932,
        198 | 199 | 203 => 936,
        194 | 195 | 201 => 949,
        196 | 197 | 202 => 950,
        80..=98 => 1250,
        104..=108 => 1251,
        50..=54 | 71..=75 | 183..=186 | 210..=217 => 1252,
        112..=114 | 120..=124 => 1253,
        128..=130 => 1254,
        136..=138 => 1255,
        144..=146 => 1256,
        152..=160 => 1257,
        _ => return None,
    };
    Some(code_page)
}
320
/// Maps an LCID to its Windows code page number.
///
/// Intended for error messages and debugging. The result is always `Some`:
/// UTF-8 collations report 65001, and any language identifier missing from
/// the table is reported as 1252.
#[cfg(feature = "encoding")]
pub fn code_page_for_lcid(lcid: u32) -> Option<u16> {
    if is_utf8_collation(lcid) {
        // 65001 is the Windows code page identifier for UTF-8.
        return Some(65001);
    }

    let code_page = match lcid & PRIMARY_LANGUAGE_MASK {
        // East and South-East Asian
        0x0411 => 932,                   // Japanese - Shift_JIS
        0x0804 | 0x1004 => 936,          // Chinese Simplified - GBK
        0x0404 | 0x0C04 | 0x1404 => 950, // Chinese Traditional - Big5
        0x0412 => 949,                   // Korean - EUC-KR
        0x041E => 874,                   // Thai
        0x042A => 1258,                  // Vietnamese

        // Central European
        0x0405 | 0x0415 | 0x040E | 0x041A | 0x081A | 0x141A | 0x101A | 0x041B | 0x0424
        | 0x0418 | 0x041C => 1250,

        // Cyrillic
        0x0419 | 0x0422 | 0x0423 | 0x0402 | 0x042F | 0x0C1A | 0x201A | 0x0440 | 0x0843
        | 0x0444 | 0x0450 | 0x0485 => 1251,

        0x0408 => 1253,          // Greek
        0x041F | 0x042C => 1254, // Turkish, Azerbaijani
        0x040D => 1255,          // Hebrew

        // Arabic
        0x0401 | 0x0801 | 0x0C01 | 0x1001 | 0x1401 | 0x1801 | 0x1C01 | 0x2001 | 0x2401
        | 0x2801 | 0x2C01 | 0x3001 | 0x3401 | 0x3801 | 0x3C01 | 0x4001 | 0x0429 | 0x0420
        | 0x048C | 0x0463 => 1256,

        // Baltic
        0x0425..=0x0427 => 1257,

        // Everything else is treated as Western European.
        _ => 1252,
    };
    Some(code_page)
}
364
/// Gives a human-readable encoding name for an LCID, for display and logging.
///
/// UTF-8 collations are named `"UTF-8"`; an LCID with no known encoding is
/// named after the Windows-1252 fallback.
#[cfg(feature = "encoding")]
pub fn encoding_name_for_lcid(lcid: u32) -> &'static str {
    if is_utf8_collation(lcid) {
        "UTF-8"
    } else {
        // `None` here means "unknown LCID", which callers decode as Windows-1252.
        encoding_for_lcid(lcid).map_or("windows-1252", |encoding| encoding.name())
    }
}
377
/// Encodes UTF-8 text into `encoding`, substituting `?` for every character
/// the target code page cannot represent.
///
/// The convenience method `Encoding::encode()` in `encoding_rs` applies
/// HTML-form substitution: an unmappable character turns into a decimal
/// numeric character reference such as `&#20320;`. A database client never
/// wants that, since a single CJK character would silently become eight ASCII
/// characters of markup. SQL Server and the other first-party drivers
/// (ADO.NET, JDBC, ODBC) substitute `?`, so the lower-level encoder is driven
/// by hand here to follow the same convention.
#[cfg(feature = "encoding")]
fn encode_lossy_question_mark(value: &str, encoding: &'static encoding_rs::Encoding) -> Vec<u8> {
    use encoding_rs::EncoderResult;

    let mut encoder = encoding.new_encoder();
    let mut encoded = Vec::with_capacity(value.len());
    // Fixed-size staging buffer; it is drained into `encoded` after every call.
    let mut scratch = [0u8; 1024];
    let mut remaining = value;

    loop {
        let (status, consumed, produced) =
            encoder.encode_from_utf8_without_replacement(remaining, &mut scratch, true);
        encoded.extend_from_slice(&scratch[..produced]);
        remaining = &remaining[consumed..];

        match status {
            // All input consumed: done.
            EncoderResult::InputEmpty => return encoded,
            // Staging buffer filled up: go around again with the rest.
            EncoderResult::OutputFull => {}
            // The encoder stopped on an unmappable character; emit the
            // placeholder and continue with the input that follows it.
            EncoderResult::Unmappable(_) => encoded.push(b'?'),
        }
    }
}
407
408/// Low-level collation-aware string encoder shared across the workspace crates.
409///
410/// Internal plumbing reached cross-crate only via [`crate::__private`]; not
411/// public API and exempt from semver guarantees (see #242).
412pub(crate) mod sealed {
413 use super::*;
414
415 /// Transcode a Rust `&str` into single-byte VARCHAR bytes for the given collation.
416 ///
417 /// - UTF-8 collations (SQL Server 2019+) pass through as raw UTF-8 bytes.
418 /// - Known non-UTF-8 LCIDs transcode via the matching `encoding_rs` codec.
419 /// - Unknown or `None` collations fall back to Windows-1252 (Latin1_General_CI_AS).
420 ///
421 /// Characters not representable in the target codepage are replaced with `?`,
422 /// matching SQL Server's own conversion behavior and the other first-party
423 /// drivers. (Regardless of whether the `encoding` feature is enabled.)
424 pub fn encode_str_for_collation(
425 value: &str,
426 collation: Option<&crate::token::Collation>,
427 ) -> Vec<u8> {
428 #[cfg(feature = "encoding")]
429 {
430 if let Some(c) = collation {
431 if c.is_utf8() {
432 return value.as_bytes().to_vec();
433 }
434 if let Some(encoding) = c.encoding() {
435 return encode_lossy_question_mark(value, encoding);
436 }
437 }
438 encode_lossy_question_mark(value, encoding_rs::WINDOWS_1252)
439 }
440 #[cfg(not(feature = "encoding"))]
441 {
442 let _ = collation;
443 value
444 .chars()
445 .map(|ch| if (ch as u32) <= 0xFF { ch as u8 } else { b'?' })
446 .collect()
447 }
448 }
449}
450
451// Keep the crate-private path `crate::collation::encode_str_for_collation`
452// working for intra-crate callers (rpc, tvp); off the public surface.
453pub(crate) use sealed::encode_str_for_collation;
454
#[cfg(all(test, feature = "encoding"))]
// `unwrap` is fine in tests: a panic is the desired failure mode.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Name of the encoding resolved for `lcid`, if any.
    fn lcid_encoding_name(lcid: u32) -> Option<&'static str> {
        encoding_for_lcid(lcid).map(|encoding| encoding.name())
    }

    /// Name of the encoding resolved for `sort_id`, if any.
    fn sort_id_encoding_name(sort_id: u8) -> Option<&'static str> {
        encoding_for_sort_id(sort_id).map(|encoding| encoding.name())
    }

    #[test]
    fn test_utf8_detection() {
        // fUTF8 is bit 26 (0x0400_0000), matching mssql-jdbc's
        // UTF8_IN_TDSCOLLATION; a real Latin1_General_100_*_UTF8 collation
        // info field has it set.
        assert!(is_utf8_collation(0x0400_0409)); // English with UTF-8
        assert!(!is_utf8_collation(0x0409)); // English without UTF-8
        // Regression (issue #153): bit 27 is FRESERVEDBIT, not fUTF8. Treating
        // it as the UTF-8 flag made is_utf8() always false for real _UTF8
        // collations.
        assert!(!is_utf8_collation(0x0800_0409));
    }

    /// Unmappable characters must turn into `?` (the SQL Server / ADO.NET /
    /// JDBC convention) and NOT into the decimal numeric character references
    /// (`&#20320;`) that `encoding_rs::Encoding::encode()` emits for HTML form
    /// submission. A regression would silently expand one CJK character into
    /// eight ASCII characters of markup in the database.
    #[test]
    fn test_unmappable_chars_become_question_marks() {
        // Windows-1252 fallback: Ω and 你 have no byte, € is 0x80.
        assert_eq!(encode_str_for_collation("aΩ€你b", None), b"a?\x80?b");
        // Exactly one `?` per character, no NCR entities.
        assert_eq!(encode_str_for_collation("你好世界", None), b"????");

        // Collation-specific path: mappable characters must go through the
        // collation's codec (GB18030 for zh-CN).
        let chinese = crate::token::Collation {
            lcid: 0x0804,
            sort_id: 0,
        };
        let (expected, _, _) = encoding_rs::GB18030.encode("你好");
        assert_eq!(
            encode_str_for_collation("你好", Some(&chinese)),
            expected.into_owned()
        );
    }

    #[test]
    fn test_japanese_encoding() {
        assert_eq!(lcid_encoding_name(0x0411), Some("Shift_JIS"));
        assert_eq!(code_page_for_lcid(0x0411), Some(932));
    }

    #[test]
    fn test_chinese_simplified_encoding() {
        assert_eq!(lcid_encoding_name(0x0804), Some("gb18030"));
        assert_eq!(code_page_for_lcid(0x0804), Some(936));
    }

    #[test]
    fn test_chinese_traditional_encoding() {
        assert_eq!(lcid_encoding_name(0x0404), Some("Big5"));
        assert_eq!(code_page_for_lcid(0x0404), Some(950));
    }

    #[test]
    fn test_korean_encoding() {
        assert_eq!(lcid_encoding_name(0x0412), Some("EUC-KR"));
        assert_eq!(code_page_for_lcid(0x0412), Some(949));
    }

    #[test]
    fn test_cyrillic_encoding() {
        // Russian
        assert_eq!(lcid_encoding_name(0x0419), Some("windows-1251"));
        assert_eq!(code_page_for_lcid(0x0419), Some(1251));

        // Ukrainian
        assert_eq!(lcid_encoding_name(0x0422), Some("windows-1251"));
    }

    #[test]
    fn test_western_european_encoding() {
        // English (US)
        assert_eq!(lcid_encoding_name(0x0409), Some("windows-1252"));
        assert_eq!(code_page_for_lcid(0x0409), Some(1252));

        // French
        assert_eq!(lcid_encoding_name(0x040C), Some("windows-1252"));

        // German
        assert_eq!(lcid_encoding_name(0x0407), Some("windows-1252"));
    }

    #[test]
    fn test_greek_encoding() {
        assert_eq!(lcid_encoding_name(0x0408), Some("windows-1253"));
        assert_eq!(code_page_for_lcid(0x0408), Some(1253));
    }

    #[test]
    fn test_turkish_encoding() {
        assert_eq!(lcid_encoding_name(0x041F), Some("windows-1254"));
        assert_eq!(code_page_for_lcid(0x041F), Some(1254));
    }

    #[test]
    fn test_hebrew_encoding() {
        assert_eq!(lcid_encoding_name(0x040D), Some("windows-1255"));
        assert_eq!(code_page_for_lcid(0x040D), Some(1255));
    }

    #[test]
    fn test_arabic_encoding() {
        // Arabic (Saudi Arabia)
        assert_eq!(lcid_encoding_name(0x0401), Some("windows-1256"));
        assert_eq!(code_page_for_lcid(0x0401), Some(1256));

        // Farsi/Persian
        assert_eq!(lcid_encoding_name(0x0429), Some("windows-1256"));
    }

    #[test]
    fn test_baltic_encoding() {
        // Estonian
        assert_eq!(lcid_encoding_name(0x0425), Some("windows-1257"));
        assert_eq!(code_page_for_lcid(0x0425), Some(1257));

        // Lithuanian
        assert_eq!(lcid_encoding_name(0x0427), Some("windows-1257"));
    }

    /// Issue #158: the code page of a SQL collation is determined by its
    /// SortId, not its LCID. Before SortId was consulted these collations
    /// were silently decoded as windows-1252.
    #[test]
    fn test_sort_id_drives_encoding() {
        // SQL_Latin1_General_CP1250_CS_AS: SortId 80 with LCID 0x0409
        // (English, which on its own would resolve to windows-1252).
        assert_eq!(sort_id_encoding_name(80), Some("windows-1250"));
        assert_eq!(code_page_for_sort_id(80), Some(1250));

        // Cyrillic, Greek, Turkish, Hebrew, Arabic, Baltic SQL collations.
        assert_eq!(sort_id_encoding_name(105), Some("windows-1251"));
        assert_eq!(sort_id_encoding_name(112), Some("windows-1253"));
        assert_eq!(sort_id_encoding_name(128), Some("windows-1254"));
        assert_eq!(sort_id_encoding_name(136), Some("windows-1255"));
        assert_eq!(sort_id_encoding_name(144), Some("windows-1256"));
        assert_eq!(sort_id_encoding_name(152), Some("windows-1257"));

        // The common default SQL_Latin1_General_CP1_CI_AS (SortId 52) is 1252.
        assert_eq!(sort_id_encoding_name(52), Some("windows-1252"));

        // CJK SQL collations.
        assert_eq!(sort_id_encoding_name(192), Some("Shift_JIS"));
        assert_eq!(sort_id_encoding_name(198), Some("gb18030"));

        // Issue #187: SortId 201 is NLS_CP949_CS (Korean Wansung
        // case-sensitive) → CP949, not CP950/Big5. SortId 202 stays CP950.
        assert_eq!(sort_id_encoding_name(201), Some("EUC-KR"));
        assert_eq!(code_page_for_sort_id(201), Some(949));
        assert_eq!(sort_id_encoding_name(202), Some("Big5"));
        assert_eq!(code_page_for_sort_id(202), Some(950));

        // CP437/CP850 have no encoding_rs codec, yet the true code page is
        // still reported so error messages can name it.
        assert_eq!(encoding_for_sort_id(40), None);
        assert_eq!(code_page_for_sort_id(40), Some(850));
        assert_eq!(encoding_for_sort_id(30), None);
        assert_eq!(code_page_for_sort_id(30), Some(437));

        // Unknown SortId.
        assert_eq!(encoding_for_sort_id(250), None);
        assert_eq!(code_page_for_sort_id(250), None);
    }

    /// Issue #158: the `Collation` accessors must branch on SortId.
    #[test]
    fn test_collation_methods_consult_sort_id() {
        use crate::token::Collation;

        // SortId 80 (CP1250) paired with an English LCID must NOT be 1252.
        let sql_collation = Collation {
            lcid: 0x0409,
            sort_id: 80,
        };
        assert_eq!(
            sql_collation.encoding().map(|encoding| encoding.name()),
            Some("windows-1250")
        );
        assert_eq!(sql_collation.code_page(), Some(1250));
        assert_eq!(sql_collation.encoding_name(), "windows-1250");

        // SortId 0 (Windows collation) keeps using the LCID path.
        let win_collation = Collation {
            lcid: 0x0419, // Russian
            sort_id: 0,
        };
        assert_eq!(
            win_collation.encoding().map(|encoding| encoding.name()),
            Some("windows-1251")
        );
    }

    #[test]
    fn test_thai_encoding() {
        assert_eq!(lcid_encoding_name(0x041E), Some("windows-874"));
        assert_eq!(code_page_for_lcid(0x041E), Some(874));
    }

    #[test]
    fn test_vietnamese_encoding() {
        assert_eq!(lcid_encoding_name(0x042A), Some("windows-1258"));
        assert_eq!(code_page_for_lcid(0x042A), Some(1258));
    }

    #[test]
    fn test_unknown_lcid_fallback() {
        // An unknown LCID has no encoding (the caller uses Windows-1252)...
        assert!(encoding_for_lcid(0x9999).is_none());
        // ...but its code page is still reported as the 1252 default.
        assert_eq!(code_page_for_lcid(0x9999), Some(1252));
    }

    #[test]
    fn test_encoding_name() {
        assert_eq!(encoding_name_for_lcid(0x0411), "Shift_JIS");
        assert_eq!(encoding_name_for_lcid(0x0419), "windows-1251");
        assert_eq!(encoding_name_for_lcid(0x0400_0409), "UTF-8");
        assert_eq!(encoding_name_for_lcid(0x9999), "windows-1252"); // fallback
    }

    #[test]
    fn test_decode_chinese_text() {
        // "中文" in GB18030 encoding
        let (decoded, _, had_errors) = encoding_for_lcid(0x0804)
            .unwrap()
            .decode(&[0xD6, 0xD0, 0xCE, 0xC4]);
        assert!(!had_errors);
        assert_eq!(decoded, "中文");
    }

    #[test]
    fn test_decode_cyrillic_text() {
        // "Привет" in Windows-1251
        let (decoded, _, had_errors) = encoding_for_lcid(0x0419)
            .unwrap()
            .decode(&[0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2]);
        assert!(!had_errors);
        assert_eq!(decoded, "Привет");
    }

    #[test]
    fn test_decode_japanese_text() {
        // "日本語" in Shift_JIS
        let (decoded, _, had_errors) = encoding_for_lcid(0x0411)
            .unwrap()
            .decode(&[0x93, 0xFA, 0x96, 0x7B, 0x8C, 0xEA]);
        assert!(!had_errors);
        assert_eq!(decoded, "日本語");
    }
}