tds-protocol 0.13.1

//! Collation encoding support for SQL Server VARCHAR decoding.
//!
//! This module provides mappings from SQL Server collation LCIDs (Locale IDs)
//! to their corresponding character encodings, enabling proper decoding of
//! non-UTF-8 VARCHAR data.
//!
//! # Supported Encodings
//!
//! The following encoding families are supported based on the collation's LCID:
//!
//! | Code Page | Encoding | Languages |
//! |-----------|----------|-----------|
//! | 874 | Windows-874 (TIS-620) | Thai |
//! | 932 | Shift_JIS | Japanese |
//! | 936 | GBK/GB18030 | Simplified Chinese |
//! | 949 | EUC-KR | Korean |
//! | 950 | Big5 | Traditional Chinese |
//! | 1250 | Windows-1250 | Central/Eastern European |
//! | 1251 | Windows-1251 | Cyrillic |
//! | 1252 | Windows-1252 | Western European (default) |
//! | 1253 | Windows-1253 | Greek |
//! | 1254 | Windows-1254 | Turkish |
//! | 1255 | Windows-1255 | Hebrew |
//! | 1256 | Windows-1256 | Arabic |
//! | 1257 | Windows-1257 | Baltic |
//! | 1258 | Windows-1258 | Vietnamese |
//!
//! # UTF-8 Collations
//!
//! SQL Server 2019+ supports UTF-8 collations (suffix `_UTF8`). These are
//! detected by checking the collation flags. When a UTF-8 collation is used,
//! no encoding conversion is needed as the data is already UTF-8.
//!
//! # References
//!
//! - [MS-LCID: Windows Language Code Identifier Reference](https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/)
//! - [Code Page Identifiers](https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers)

#[cfg(feature = "encoding")]
use encoding_rs::Encoding;

// Bring `Vec` into scope from the internal prelude for no_std + alloc builds.
use crate::prelude::*;

/// Flag bit marking a UTF-8 collation (SQL Server 2019+).
///
/// This is `fUTF8`, bit 26 (`0x0400_0000`) of the collation info field per the
/// MS-TDS Collation Rule Definition; bit 27 is `FRESERVEDBIT`. The value
/// matches mssql-jdbc's `UTF8_IN_TDSCOLLATION = 0x4000000`.
pub const COLLATION_FLAG_UTF8: u32 = 1 << 26;

/// Mask selecting the LCID, which occupies the low 20 bits of the collation
/// info field (`0x000F_FFFF`).
pub const LCID_MASK: u32 = (1 << 20) - 1;

/// Mask selecting the primary language ID, i.e. the low 16 bits of the LCID
/// (`0x0000_FFFF`).
pub const PRIMARY_LANGUAGE_MASK: u32 = (1 << 16) - 1;

/// Returns whether the collation uses UTF-8 encoding.
///
/// SQL Server 2019+ supports UTF-8 collations with the `_UTF8` suffix.
/// These collations set fUTF8 (bit 26) in the collation info field.
#[inline]
pub fn is_utf8_collation(lcid: u32) -> bool {
    lcid & COLLATION_FLAG_UTF8 != 0
}

/// Looks up the `encoding_rs` codec for a SQL Server collation LCID.
///
/// Only the low 16 bits of `lcid` (the primary language ID, see
/// [`PRIMARY_LANGUAGE_MASK`]) take part in the lookup; higher bits are
/// ignored apart from the UTF-8 flag.
///
/// # Arguments
///
/// * `lcid` - The locale ID from the SQL Server collation
///
/// # Returns
///
/// * `Some(&Encoding)` - The codec for the LCID's Windows code page
/// * `None` - The UTF-8 flag is set, or the LCID is not in the table
///
/// # UTF-8 Handling
///
/// UTF-8 collations (SQL Server 2019+) yield `None` because the data is
/// already valid UTF-8 and needs no transcoding.
#[cfg(feature = "encoding")]
pub fn encoding_for_lcid(lcid: u32) -> Option<&'static Encoding> {
    use encoding_rs::{
        BIG5, EUC_KR, GB18030, SHIFT_JIS, WINDOWS_1250, WINDOWS_1251, WINDOWS_1252, WINDOWS_1253,
        WINDOWS_1254, WINDOWS_1255, WINDOWS_1256, WINDOWS_1257, WINDOWS_1258, WINDOWS_874,
    };

    // Nothing to transcode when the collation is UTF-8.
    if is_utf8_collation(lcid) {
        return None;
    }

    // Windows code page assignments, keyed by primary language ID.
    // Reference: https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/
    let codec = match lcid & PRIMARY_LANGUAGE_MASK {
        // Code page 932 (Shift_JIS): Japanese.
        0x0411 => SHIFT_JIS,

        // Code page 936 (GBK/GB18030): Chinese Simplified - PRC, Singapore.
        0x0804 | 0x1004 => GB18030,

        // Code page 950 (Big5): Chinese Traditional - Taiwan, Hong Kong SAR,
        // Macao SAR.
        0x0404 | 0x0C04 | 0x1404 => BIG5,

        // Code page 949 (EUC-KR): Korean.
        0x0412 => EUC_KR,

        // Code page 874 (Windows-874/TIS-620): Thai.
        0x041E => WINDOWS_874,

        // Code page 1258: Vietnamese.
        0x042A => WINDOWS_1258,

        // Code page 1250 (Central/Eastern European): Czech, Polish, Hungarian,
        // Croatian, Serbian (Latin), Bosnian (Latin), Croatian (Bosnia and
        // Herzegovina), Slovak, Slovenian, Romanian, Albanian.
        0x0405 | 0x0415 | 0x040E | 0x041A | 0x081A | 0x141A | 0x101A | 0x041B | 0x0424
        | 0x0418 | 0x041C => WINDOWS_1250,

        // Code page 1251 (Cyrillic): Russian, Ukrainian, Belarusian, Bulgarian,
        // Macedonian, Serbian (Cyrillic), Bosnian (Cyrillic), Kyrgyz,
        // Uzbek (Cyrillic), Tatar, Mongolian (Cyrillic), Sakha.
        0x0419 | 0x0422 | 0x0423 | 0x0402 | 0x042F | 0x0C1A | 0x201A | 0x0440 | 0x0843
        | 0x0444 | 0x0450 | 0x0485 => WINDOWS_1251,

        // Code page 1253: Greek.
        0x0408 => WINDOWS_1253,

        // Code page 1254: Turkish, Azerbaijani (Latin).
        0x041F | 0x042C => WINDOWS_1254,

        // Code page 1255: Hebrew.
        0x040D => WINDOWS_1255,

        // Code page 1256 (Arabic): the regional Arabic variants from Saudi
        // Arabia (0x0401) through Qatar (0x4001) ...
        0x0401 | 0x0801 | 0x0C01 | 0x1001 | 0x1401 | 0x1801 | 0x1C01 | 0x2001 | 0x2401
        | 0x2801 | 0x2C01 | 0x3001 | 0x3401 | 0x3801 | 0x3C01 | 0x4001
        // ... plus Farsi/Persian, Urdu, Dari, Pashto.
        | 0x0429 | 0x0420 | 0x048C | 0x0463 => WINDOWS_1256,

        // Code page 1257 (Baltic): Estonian (0x0425), Latvian (0x0426),
        // Lithuanian (0x0427).
        0x0425..=0x0427 => WINDOWS_1257,

        // Code page 1252 (Western European).
        // English: US, UK, Australia, Canada, New Zealand, Ireland.
        0x0409 | 0x0809 | 0x0C09 | 0x1009 | 0x1409 | 0x1809
        // French: France, Belgium, Canada, Switzerland, Luxembourg.
        | 0x040C | 0x080C | 0x0C0C | 0x100C | 0x140C
        // German: Germany, Switzerland, Austria, Luxembourg, Liechtenstein.
        | 0x0407 | 0x0807 | 0x0C07 | 0x1007 | 0x1407
        // Spanish: traditional sort, Mexico, modern sort, Guatemala,
        // Costa Rica, Panama, Dominican Republic, Venezuela, Colombia, Peru.
        | 0x040A | 0x080A | 0x0C0A | 0x100A | 0x140A | 0x180A | 0x1C0A | 0x200A | 0x240A
        | 0x280A
        // Spanish: Argentina, Ecuador, Chile, Uruguay, Paraguay, Bolivia,
        // El Salvador, Honduras, Nicaragua, Puerto Rico.
        | 0x2C0A | 0x300A | 0x340A | 0x380A | 0x3C0A | 0x400A | 0x440A | 0x480A | 0x4C0A
        | 0x500A
        // Italian (Italy, Switzerland), Portuguese (Portugal, Brazil),
        // Dutch (Netherlands, Belgium).
        | 0x0410 | 0x0810 | 0x0816 | 0x0416 | 0x0413 | 0x0813
        // Danish, Norwegian (Bokmål, Nynorsk), Swedish (Sweden, Finland),
        // Finnish, Icelandic.
        | 0x0406 | 0x0414 | 0x0814 | 0x041D | 0x081D | 0x040B | 0x040F
        // Catalan, Galician, Basque, Afrikaans, Indonesian, Malay (Malaysia),
        // Swahili.
        | 0x0403 | 0x0456 | 0x042D | 0x0436 | 0x0421 | 0x043E | 0x0441 => WINDOWS_1252,

        // Unknown LCID: report `None`; the caller should fall back to
        // Windows-1252.
        _ => return None,
    };

    Some(codec)
}

/// Looks up the `encoding_rs` codec for a SQL collation's SortId.
///
/// A non-zero SortId marks a "SQL collation" (one of a predefined set of sort
/// orders). Per the MS-TDS Collation rule its code page comes from the SortId
/// rather than the LCID: `SQL_Latin1_General_CP1250_CS_AS`, for instance, has
/// SortId 80 → windows-1250 even though its LCID alone would resolve to
/// windows-1252.
///
/// `None` is returned for unknown SortIds and for SortIds whose code page
/// `encoding_rs` cannot represent (the OEM code pages CP437 and CP850 are not
/// part of the WHATWG encoding set). The table is derived from the
/// Microsoft/mssql-jdbc SortId mapping.
#[cfg(feature = "encoding")]
pub fn encoding_for_sort_id(sort_id: u8) -> Option<&'static Encoding> {
    use encoding_rs::{
        BIG5, EUC_KR, GB18030, SHIFT_JIS, WINDOWS_1250, WINDOWS_1251, WINDOWS_1252, WINDOWS_1253,
        WINDOWS_1254, WINDOWS_1255, WINDOWS_1256, WINDOWS_1257, WINDOWS_874,
    };

    let codec = match sort_id {
        // CP437 (30..=35) and CP850 (40..=49, 55..=62) are deliberately
        // absent: encoding_rs has no codec for them, so they fall through to
        // the `None` arm below.
        50..=54 | 71..=75 | 183..=186 | 210..=217 => WINDOWS_1252,
        80..=98 => WINDOWS_1250,
        104..=108 => WINDOWS_1251,
        112..=114 | 120..=124 => WINDOWS_1253,
        128..=130 => WINDOWS_1254,
        136..=138 => WINDOWS_1255,
        144..=146 => WINDOWS_1256,
        152..=160 => WINDOWS_1257,
        // The CJK SortIds interleave their code pages.
        192 | 193 | 200 => SHIFT_JIS, // CP932
        194 | 195 | 201 => EUC_KR,    // CP949
        196 | 197 | 202 => BIG5,      // CP950
        198 | 199 | 203 => GB18030,   // CP936
        204..=206 => WINDOWS_874,
        _ => return None,
    };

    Some(codec)
}

/// Returns the Windows code page number for a SQL collation's SortId.
///
/// Unlike [`encoding_for_sort_id`], this reports the true code page even for
/// the OEM pages `encoding_rs` cannot decode (437, 850), so callers can
/// produce an accurate "unsupported code page" error. Returns `None` for
/// unknown SortIds.
#[cfg(feature = "encoding")]
pub fn code_page_for_sort_id(sort_id: u8) -> Option<u16> {
    // Inclusive SortId ranges and the Windows code page each one belongs to.
    // The ranges are disjoint, so at most one row can match.
    const SORT_ID_CODE_PAGES: &[(u8, u8, u16)] = &[
        (30, 35, 437),
        (40, 49, 850),
        (50, 54, 1252),
        (55, 62, 850),
        (71, 75, 1252),
        (80, 98, 1250),
        (104, 108, 1251),
        (112, 114, 1253),
        (120, 124, 1253),
        (128, 130, 1254),
        (136, 138, 1255),
        (144, 146, 1256),
        (152, 160, 1257),
        (183, 186, 1252),
        (192, 193, 932),
        (194, 195, 949),
        (196, 197, 950),
        (198, 199, 936),
        (200, 200, 932),
        (201, 201, 949),
        (202, 202, 950),
        (203, 203, 936),
        (204, 206, 874),
        (210, 217, 1252),
    ];

    SORT_ID_CODE_PAGES
        .iter()
        .find(|&&(first, last, _)| (first..=last).contains(&sort_id))
        .map(|&(_, _, code_page)| code_page)
}

/// Reports the Windows code page number associated with an LCID.
///
/// Intended for error messages and debugging. UTF-8 collations report 65001,
/// and any LCID that is not explicitly listed is reported as 1252, so the
/// result is always `Some`.
#[cfg(feature = "encoding")]
pub fn code_page_for_lcid(lcid: u32) -> Option<u16> {
    const UTF8_CODE_PAGE: u16 = 65001;

    if is_utf8_collation(lcid) {
        return Some(UTF8_CODE_PAGE);
    }

    let code_page = match lcid & PRIMARY_LANGUAGE_MASK {
        // East and South-East Asian code pages.
        0x0411 => 932,                   // Japanese - Shift_JIS
        0x0804 | 0x1004 => 936,          // Chinese Simplified - GBK
        0x0404 | 0x0C04 | 0x1404 => 950, // Chinese Traditional - Big5
        0x0412 => 949,                   // Korean - EUC-KR
        0x041E => 874,                   // Thai
        0x042A => 1258,                  // Vietnamese

        // Central European.
        0x0405 | 0x0415 | 0x040E | 0x041A | 0x081A | 0x141A | 0x101A | 0x041B | 0x0424
        | 0x0418 | 0x041C => 1250,

        // Cyrillic.
        0x0419 | 0x0422 | 0x0423 | 0x0402 | 0x042F | 0x0C1A | 0x201A | 0x0440 | 0x0843
        | 0x0444 | 0x0450 | 0x0485 => 1251,

        0x0408 => 1253,          // Greek
        0x041F | 0x042C => 1254, // Turkish, Azerbaijani
        0x040D => 1255,          // Hebrew

        // Arabic.
        0x0401 | 0x0801 | 0x0C01 | 0x1001 | 0x1401 | 0x1801 | 0x1C01 | 0x2001 | 0x2401
        | 0x2801 | 0x2C01 | 0x3001 | 0x3401 | 0x3801 | 0x3C01 | 0x4001 | 0x0429 | 0x0420
        | 0x048C | 0x0463 => 1256,

        // Baltic.
        0x0425..=0x0427 => 1257,

        // Everything else is treated as Western European.
        _ => 1252,
    };

    Some(code_page)
}

/// Produces a human-readable encoding name for an LCID, for display/logging.
///
/// UTF-8 collations yield `"UTF-8"`. Otherwise the name of the codec found by
/// [`encoding_for_lcid`] is used, and `"windows-1252"` stands in when the
/// LCID has no known codec.
#[cfg(feature = "encoding")]
pub fn encoding_name_for_lcid(lcid: u32) -> &'static str {
    if is_utf8_collation(lcid) {
        "UTF-8"
    } else {
        // Unknown LCIDs are reported under the default fallback encoding.
        encoding_for_lcid(lcid).map_or("windows-1252", |codec| codec.name())
    }
}

/// Encodes UTF-8 text into `encoding`, writing `?` for every character the
/// target code page cannot represent.
///
/// The convenience method `Encoding::encode()` from `encoding_rs` applies
/// HTML-form substitution: an unmappable character turns into a decimal
/// numeric character reference such as `&#20320;`. A database client never
/// wants that, since a single CJK character would silently grow into eight
/// ASCII characters of markup. SQL Server itself and the other first-party
/// drivers (ADO.NET, JDBC, ODBC) substitute `?`, so the lower-level encoder
/// is driven by hand here to follow the same convention.
#[cfg(feature = "encoding")]
fn encode_lossy_question_mark(value: &str, encoding: &'static encoding_rs::Encoding) -> Vec<u8> {
    use encoding_rs::EncoderResult;

    let mut codec = encoding.new_encoder();
    let mut encoded = Vec::with_capacity(value.len());
    // Fixed-size staging area; the encoder is called again whenever it fills.
    let mut scratch = [0u8; 1024];
    let mut remaining = value;

    loop {
        let (status, consumed, produced) =
            codec.encode_from_utf8_without_replacement(remaining, &mut scratch, true);
        encoded.extend_from_slice(&scratch[..produced]);
        // Per the encoding_rs docs, on `Unmappable` the consumed count already
        // includes the offending character, so this steps past it.
        remaining = &remaining[consumed..];

        match status {
            EncoderResult::InputEmpty => return encoded,
            EncoderResult::Unmappable(_) => encoded.push(b'?'),
            // Staging area was full: go round again with the rest.
            EncoderResult::OutputFull => {}
        }
    }
}

/// Transcode a Rust `&str` into VARCHAR bytes for the given collation.
///
/// With the `encoding` feature enabled:
///
/// - UTF-8 collations (SQL Server 2019+) pass through as raw UTF-8 bytes.
/// - Collations with a known codec transcode via the matching `encoding_rs`
///   codec (which is multi-byte for the CJK code pages).
/// - Unknown or `None` collations fall back to Windows-1252
///   (Latin1_General_CI_AS).
///
/// Without the `encoding` feature the collation is ignored and the text is
/// always encoded as Windows-1252 using a small built-in table.
///
/// Characters not representable in the target codepage are replaced with `?`,
/// matching SQL Server's own conversion behavior and the other first-party
/// drivers, regardless of whether the `encoding` feature is enabled.
pub fn encode_str_for_collation(
    value: &str,
    collation: Option<&crate::token::Collation>,
) -> Vec<u8> {
    #[cfg(feature = "encoding")]
    {
        if let Some(c) = collation {
            if c.is_utf8() {
                return value.as_bytes().to_vec();
            }
            if let Some(encoding) = c.encoding() {
                return encode_lossy_question_mark(value, encoding);
            }
        }
        encode_lossy_question_mark(value, encoding_rs::WINDOWS_1252)
    }
    #[cfg(not(feature = "encoding"))]
    {
        /// Maps one character to its Windows-1252 byte, or `?` when the code
        /// page has no such character.
        ///
        /// Windows-1252 only coincides with Latin-1 (where the byte equals
        /// the code point) outside 0x80..=0x9F. Inside that range it holds
        /// punctuation and letters from elsewhere in Unicode, so a plain
        /// `ch as u8` cast would both lose those characters and emit C1
        /// control code points as bytes that the server reads as `€`, `…`,
        /// and so on.
        fn windows_1252_byte(ch: char) -> u8 {
            match ch {
                // ASCII and the Latin-1 upper half are identity-mapped.
                '\u{0000}'..='\u{007F}' | '\u{00A0}'..='\u{00FF}' => ch as u8,
                // The five bytes Windows-1252 leaves unassigned round-trip as
                // the matching C1 controls (WHATWG index, as in encoding_rs).
                '\u{0081}' | '\u{008D}' | '\u{008F}' | '\u{0090}' | '\u{009D}' => ch as u8,
                '\u{20AC}' => 0x80, // euro sign
                '\u{201A}' => 0x82, // single low-9 quotation mark
                '\u{0192}' => 0x83, // latin small f with hook
                '\u{201E}' => 0x84, // double low-9 quotation mark
                '\u{2026}' => 0x85, // horizontal ellipsis
                '\u{2020}' => 0x86, // dagger
                '\u{2021}' => 0x87, // double dagger
                '\u{02C6}' => 0x88, // modifier letter circumflex accent
                '\u{2030}' => 0x89, // per mille sign
                '\u{0160}' => 0x8A, // latin capital S with caron
                '\u{2039}' => 0x8B, // single left-pointing angle quotation mark
                '\u{0152}' => 0x8C, // latin capital ligature OE
                '\u{017D}' => 0x8E, // latin capital Z with caron
                '\u{2018}' => 0x91, // left single quotation mark
                '\u{2019}' => 0x92, // right single quotation mark
                '\u{201C}' => 0x93, // left double quotation mark
                '\u{201D}' => 0x94, // right double quotation mark
                '\u{2022}' => 0x95, // bullet
                '\u{2013}' => 0x96, // en dash
                '\u{2014}' => 0x97, // em dash
                '\u{02DC}' => 0x98, // small tilde
                '\u{2122}' => 0x99, // trade mark sign
                '\u{0161}' => 0x9A, // latin small s with caron
                '\u{203A}' => 0x9B, // single right-pointing angle quotation mark
                '\u{0153}' => 0x9C, // latin small ligature oe
                '\u{017E}' => 0x9E, // latin small z with caron
                '\u{0178}' => 0x9F, // latin capital Y with diaeresis
                // Everything else, including the remaining C1 controls, has
                // no Windows-1252 representation.
                _ => b'?',
            }
        }

        let _ = collation;
        value.chars().map(windows_1252_byte).collect()
    }
}

#[cfg(all(test, feature = "encoding"))]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Name of the codec `encoding_for_lcid` picks for `lcid`, if any.
    fn lcid_codec_name(lcid: u32) -> Option<&'static str> {
        encoding_for_lcid(lcid).map(|codec| codec.name())
    }

    /// Name of the codec `encoding_for_sort_id` picks for `sort_id`, if any.
    fn sort_id_codec_name(sort_id: u8) -> Option<&'static str> {
        encoding_for_sort_id(sort_id).map(|codec| codec.name())
    }

    /// Checks both the codec name and the code page reported for `lcid`.
    fn assert_lcid_codec_and_page(lcid: u32, name: &str, code_page: u16) {
        assert_eq!(lcid_codec_name(lcid), Some(name));
        assert_eq!(code_page_for_lcid(lcid), Some(code_page));
    }

    /// Decodes `bytes` with the codec for `lcid`, failing on any decode error.
    fn decode_cleanly(lcid: u32, bytes: &[u8]) -> String {
        let codec = encoding_for_lcid(lcid).unwrap();
        let (text, _, had_errors) = codec.decode(bytes);
        assert!(!had_errors);
        text.into_owned()
    }

    #[test]
    fn test_utf8_detection() {
        // fUTF8 is bit 26 (0x0400_0000), matching mssql-jdbc's
        // UTF8_IN_TDSCOLLATION; a real Latin1_General_100_*_UTF8 collation
        // info field has this bit set.
        let cases = [
            (0x0400_0409, true), // English with UTF-8
            (0x0409, false),     // English without UTF-8
            // Regression: bit 27 is FRESERVEDBIT, not fUTF8. Treating it as
            // UTF-8 made is_utf8() always false for real _UTF8 collations
            // (issue #153).
            (0x0800_0409, false),
        ];
        for (info, expected) in cases {
            assert_eq!(is_utf8_collation(info), expected);
        }
    }

    /// Unmappable characters must turn into `?` (the SQL Server / ADO.NET /
    /// JDBC convention) and NOT into the decimal numeric character references
    /// (`&#20320;`) that `encoding_rs::Encoding::encode()` emits for HTML
    /// form submission. A regression here silently expands one CJK char into
    /// eight ASCII chars of markup in the database.
    #[test]
    fn test_unmappable_chars_become_question_marks() {
        // Windows-1252 fallback: Ω and 你 are unmappable, € maps to 0x80.
        assert_eq!(encode_str_for_collation("aΩ€你b", None), b"a?\x80?b");
        // One `?` per character, no NCR entities anywhere.
        assert_eq!(encode_str_for_collation("你好世界", None), b"????");

        // Collation-specific path: GB18030 can represent these CJK
        // characters, so they must go through the codec instead of
        // becoming `?`.
        let chinese = crate::token::Collation {
            lcid: 0x0804,
            sort_id: 0,
        };
        let (expected, _, _) = encoding_rs::GB18030.encode("你好");
        assert_eq!(
            encode_str_for_collation("你好", Some(&chinese)),
            expected.into_owned()
        );
    }

    #[test]
    fn test_japanese_encoding() {
        assert_lcid_codec_and_page(0x0411, "Shift_JIS", 932);
    }

    #[test]
    fn test_chinese_simplified_encoding() {
        assert_lcid_codec_and_page(0x0804, "gb18030", 936);
    }

    #[test]
    fn test_chinese_traditional_encoding() {
        assert_lcid_codec_and_page(0x0404, "Big5", 950);
    }

    #[test]
    fn test_korean_encoding() {
        assert_lcid_codec_and_page(0x0412, "EUC-KR", 949);
    }

    #[test]
    fn test_cyrillic_encoding() {
        // Russian
        assert_lcid_codec_and_page(0x0419, "windows-1251", 1251);
        // Ukrainian
        assert_eq!(lcid_codec_name(0x0422), Some("windows-1251"));
    }

    #[test]
    fn test_western_european_encoding() {
        // English (US)
        assert_lcid_codec_and_page(0x0409, "windows-1252", 1252);
        // French
        assert_eq!(lcid_codec_name(0x040C), Some("windows-1252"));
        // German
        assert_eq!(lcid_codec_name(0x0407), Some("windows-1252"));
    }

    #[test]
    fn test_greek_encoding() {
        assert_lcid_codec_and_page(0x0408, "windows-1253", 1253);
    }

    #[test]
    fn test_turkish_encoding() {
        assert_lcid_codec_and_page(0x041F, "windows-1254", 1254);
    }

    #[test]
    fn test_hebrew_encoding() {
        assert_lcid_codec_and_page(0x040D, "windows-1255", 1255);
    }

    #[test]
    fn test_arabic_encoding() {
        // Arabic (Saudi Arabia)
        assert_lcid_codec_and_page(0x0401, "windows-1256", 1256);
        // Farsi/Persian
        assert_eq!(lcid_codec_name(0x0429), Some("windows-1256"));
    }

    #[test]
    fn test_baltic_encoding() {
        // Estonian
        assert_lcid_codec_and_page(0x0425, "windows-1257", 1257);
        // Lithuanian
        assert_eq!(lcid_codec_name(0x0427), Some("windows-1257"));
    }

    /// Issue #158: a SQL collation's code page comes from its SortId, not the
    /// LCID. Without consulting SortId these silently decoded as
    /// windows-1252.
    #[test]
    fn test_sort_id_drives_encoding() {
        // SQL_Latin1_General_CP1250_CS_AS — SortId 80, LCID 0x0409 (English,
        // which would otherwise resolve to windows-1252).
        assert_eq!(sort_id_codec_name(80), Some("windows-1250"));
        assert_eq!(code_page_for_sort_id(80), Some(1250));

        let by_name = [
            // Cyrillic, Greek, Turkish, Hebrew, Arabic, Baltic SQL collations.
            (105, "windows-1251"),
            (112, "windows-1253"),
            (128, "windows-1254"),
            (136, "windows-1255"),
            (144, "windows-1256"),
            (152, "windows-1257"),
            // The common default SQL_Latin1_General_CP1_CI_AS (SortId 52) is 1252.
            (52, "windows-1252"),
            // CJK SQL collations.
            (192, "Shift_JIS"),
            (198, "gb18030"),
        ];
        for (sort_id, name) in by_name {
            assert_eq!(sort_id_codec_name(sort_id), Some(name));
        }

        // Issue #187: SortId 201 is NLS_CP949_CS (Korean Wansung
        // case-sensitive) → CP949, not CP950/Big5. Its neighbors stay CP950.
        assert_eq!(sort_id_codec_name(201), Some("EUC-KR"));
        assert_eq!(code_page_for_sort_id(201), Some(949));
        assert_eq!(sort_id_codec_name(202), Some("Big5"));
        assert_eq!(code_page_for_sort_id(202), Some(950));

        // CP437/CP850 are not representable in encoding_rs: no encoding, but
        // the true code page is still reported for error messages.
        assert_eq!(encoding_for_sort_id(40), None);
        assert_eq!(code_page_for_sort_id(40), Some(850));
        assert_eq!(encoding_for_sort_id(30), None);
        assert_eq!(code_page_for_sort_id(30), Some(437));

        // Unknown SortId.
        assert_eq!(encoding_for_sort_id(250), None);
        assert_eq!(code_page_for_sort_id(250), None);
    }

    /// Issue #158: the `Collation` accessors must branch on SortId.
    #[test]
    fn test_collation_methods_consult_sort_id() {
        use crate::token::Collation;

        // SortId 80 (CP1250) with an English LCID — must NOT be 1252.
        let sql_collation = Collation {
            lcid: 0x0409,
            sort_id: 80,
        };
        let sql_codec_name = sql_collation.encoding().map(|e| e.name());
        assert_eq!(sql_codec_name, Some("windows-1250"));
        assert_eq!(sql_collation.code_page(), Some(1250));
        assert_eq!(sql_collation.encoding_name(), "windows-1250");

        // SortId 0 (Windows collation) still uses the LCID path.
        let win_collation = Collation {
            lcid: 0x0419, // Russian
            sort_id: 0,
        };
        let win_codec_name = win_collation.encoding().map(|e| e.name());
        assert_eq!(win_codec_name, Some("windows-1251"));
    }

    #[test]
    fn test_thai_encoding() {
        assert_lcid_codec_and_page(0x041E, "windows-874", 874);
    }

    #[test]
    fn test_vietnamese_encoding() {
        assert_lcid_codec_and_page(0x042A, "windows-1258", 1258);
    }

    #[test]
    fn test_unknown_lcid_fallback() {
        // An unknown LCID has no codec (the caller uses Windows-1252) ...
        assert!(encoding_for_lcid(0x9999).is_none());
        // ... but its code page is still reported as the 1252 default.
        assert_eq!(code_page_for_lcid(0x9999), Some(1252));
    }

    #[test]
    fn test_encoding_name() {
        let cases = [
            (0x0411, "Shift_JIS"),
            (0x0419, "windows-1251"),
            (0x0400_0409, "UTF-8"),
            (0x9999, "windows-1252"), // fallback
        ];
        for (lcid, name) in cases {
            assert_eq!(encoding_name_for_lcid(lcid), name);
        }
    }

    #[test]
    fn test_decode_chinese_text() {
        // "中文" in GB18030 encoding
        let decoded = decode_cleanly(0x0804, &[0xD6, 0xD0, 0xCE, 0xC4]);
        assert_eq!(decoded, "中文");
    }

    #[test]
    fn test_decode_cyrillic_text() {
        // "Привет" in Windows-1251
        let decoded = decode_cleanly(0x0419, &[0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2]);
        assert_eq!(decoded, "Привет");
    }

    #[test]
    fn test_decode_japanese_text() {
        // "日本語" in Shift_JIS
        let decoded = decode_cleanly(0x0411, &[0x93, 0xFA, 0x96, 0x7B, 0x8C, 0xEA]);
        assert_eq!(decoded, "日本語");
    }
}