1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
//! This crate provides language information and lookups from Microsoft's numeric
//! language code identifiers (LCIDs) or language names (IETF language tags,
//! [`BCP 47`]), similar to - but more limited than - the
//! [`System.Globalization.CultureInfo`] class in .NET. The lookup follows the
//! [`MS-LCID`] "Windows Language Code Identifier Reference" specification.
//! Sort IDs are not supported yet.
//!
//! # Examples
//!
//! ```
//! use lcid::LanguageId;
//! use std::convert::TryInto;
//!
//! let lang: &LanguageId = 1033.try_into().unwrap();
//! assert_eq!(lang.name, "en-US");
//!
//! let lang: &LanguageId = "en-US".try_into().unwrap();
//! assert_eq!(lang.lcid, 1033);
//! ```
//!
//! [`BCP 47`]: https://tools.ietf.org/rfc/bcp/bcp47.txt
//! [`System.Globalization.CultureInfo`]: https://docs.microsoft.com/en-us/dotnet/api/system.globalization.cultureinfo
//! [`MS-LCID`]: https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/70feba9f-294e-491e-b6eb-56532684c37f
#[macro_use]
pub mod constants;

use std::convert::TryFrom;
use thiserror::Error;

/// Errors when looking up a [`LanguageId`] from a numeric ([`u32`]) LCID via
/// [`TryFrom`] or [`TryInto`](std::convert::TryInto).
///
/// Every variant carries the LCID that was looked up as its first field; the
/// error messages print it in both decimal and hexadecimal form.
#[derive(Error, Debug)]
pub enum LcidLookupError {
    /// The LCID has reserved bits set (original value, reserved bits).
    #[error("LCID {0}/{0:#06x} has reserved bits set: {1}/{1:#x}")]
    ReservedBits(u32, u32),
    /// The LCID has sort ID bits set (original value, sort ID bits).
    #[error("LCID {0}/{0:#06x} has sort ID bits set: {1}/{1:#x}")]
    SortIdBits(u32, u32),
    /// The LCID refers to a named reserved language (original value, a static
    /// string that is printed in quotes - presumably the reserved name).
    #[error("LCID {0}/{0:#06x} is reserved ('{1}')")]
    Reserved(u32, &'static str),
    /// The LCID refers to an unknown reserved language (original value).
    #[error("LCID {0}/{0:#06x} is reserved (<unknown>)")]
    ReservedUnknown(u32),
    /// The LCID is neither defined nor reserved (original value).
    #[error("LCID {0}/{0:#06x} is neither defined nor reserved")]
    NeitherDefinedNorReserved(u32),
    /// The LCID is undefined (original value).
    #[error("LCID {0}/{0:#06x} is undefined")]
    Undefined(u32),
}

/// Errors when looking up a [`LanguageId`] from a named identifier via
/// [`TryFrom`] or [`TryInto`](std::convert::TryInto).
#[derive(Error, Debug)]
pub enum NameLookupError {
    /// The name refers to a reserved language (a static name, and a number
    /// that is printed in decimal and hexadecimal form - presumably the
    /// reserved LCID).
    #[error("Name '{0}' is reserved ({1}/{1:#06x})")]
    Reserved(&'static str, u32),
    /// The name does not refer to a defined language (an owned string that is
    /// printed in quotes - presumably the name that was looked up).
    #[error("Name '{0}' is undefined")]
    Undefined(String),
}

/// A known ANSI code page. Some languages can be encoded using one of these
/// code pages. This enum has a [`u32`] representation, and so can be converted
/// to the numeric code page value if needed.
///
/// Code pages are plain, field-less values: they are cheap to copy, can be
/// compared for equality, and can be used as keys in hashed collections.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[repr(u32)]
pub enum AnsiCodePage {
    /// windows-874, Thai (Windows)
    Windows874 = 874,
    /// shift_jis, ANSI/OEM Japanese; Japanese (Shift-JIS)
    ShiftJIS = 932,
    /// gb2312, ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
    GB2312 = 936,
    /// ks_c_5601-1987, ANSI/OEM Korean (Unified Hangul Code)
    KsC5601 = 949,
    /// big5, ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
    Big5 = 950,
    /// windows-1250, ANSI Central European; Central European (Windows)
    Windows1250 = 1250,
    /// windows-1251, ANSI Cyrillic; Cyrillic (Windows)
    Windows1251 = 1251,
    /// windows-1252, ANSI Latin 1; Western European (Windows)
    Windows1252 = 1252,
    /// windows-1253, ANSI Greek; Greek (Windows)
    Windows1253 = 1253,
    /// windows-1254, ANSI Turkish; Turkish (Windows)
    Windows1254 = 1254,
    /// windows-1255, ANSI Hebrew; Hebrew (Windows)
    Windows1255 = 1255,
    /// windows-1256, ANSI Arabic; Arabic (Windows)
    Windows1256 = 1256,
    /// windows-1257, ANSI Baltic; Baltic (Windows)
    Windows1257 = 1257,
    /// windows-1258, ANSI/OEM Vietnamese; Vietnamese (Windows)
    Windows1258 = 1258,
}

/// Converts a code page into its numeric identifier (the enum's `u32`
/// discriminant).
impl From<AnsiCodePage> for u32 {
    fn from(code_page: AnsiCodePage) -> Self {
        code_page as Self
    }
}

/// A language's identifiers and information. Lookups from numeric or named
/// identifiers return a reference to a statically defined `LanguageId`.
///
/// All string fields are `'static`, so cloning a `LanguageId` copies only
/// references and plain values.
#[derive(Clone, Debug)]
pub struct LanguageId {
    /// A unique name that identifies the language (IETF language tag).
    pub name: &'static str,
    /// The language code ID for the language. This does not identify a
    /// language uniquely. Bits 0-9 hold the primary language ID and bits
    /// 10-15 the sub language ID; see [`LanguageId::primary_language_id`] and
    /// [`LanguageId::sub_language_id`].
    pub lcid: u32,
    /// A non-localized, English, readable name for the language.
    pub english_name: &'static str,
    /// A two-letter ISO-639 code for the language.
    pub iso639_two_letter: &'static str,
    /// A three-letter ISO-639 code for the language.
    pub iso639_three_letter: &'static str,
    /// A three-letter code for the language used in the Windows API.
    pub windows_three_letter: &'static str,
    /// The corresponding ANSI code page that may be used to encode this
    /// language, or [`None`] if the language can't be encoded outside Unicode.
    pub ansi_code_page: Option<AnsiCodePage>,
}

// Fields of a 32-bit LCID, listed from the least significant bit upwards.
// A field is extracted as `(lcid >> *_SHIFT) & *_MASK`, so every mask is
// expressed relative to bit 0 rather than to the field's position.

// Primary language ID: 10 bits (0-9).
const PRIMARY_LANG_ID_SHIFT: u32 = 0;
const PRIMARY_LANG_ID_MASK: u32 = (1 << 10) - 1;

// Sub language ID: 6 bits (10-15).
const SUB_LANG_ID_SHIFT: u32 = 10;
const SUB_LANG_ID_MASK: u32 = (1 << 6) - 1;

// Sort ID: 4 bits (16-19).
const SORT_ID_BITS_SHIFT: u32 = 16;
const SORT_ID_BITS_MASK: u32 = (1 << 4) - 1;

// Reserved: 12 bits (20-31).
const RESERVED_BITS_SHIFT: u32 = 20;
const RESERVED_BITS_MASK: u32 = (1 << 12) - 1;

/// Accessors for the two parts of the 16-bit language ID stored in the low
/// bits of [`LanguageId::lcid`]:
///
/// ```text
/// +----------------+---------------------+
/// | SubLanguage ID | Primary Language ID |
/// +----------------+---------------------+
/// 15           10  9                     0 bit
/// ```
impl LanguageId {
    /// The primary language ID part of the language's LCID (10 bits, 0-9).
    #[inline]
    pub fn primary_language_id(&self) -> u32 {
        let aligned = self.lcid >> PRIMARY_LANG_ID_SHIFT;
        aligned & PRIMARY_LANG_ID_MASK
    }

    /// The sub language ID part of the language's LCID (6 bits, 10-15).
    #[inline]
    pub fn sub_language_id(&self) -> u32 {
        let aligned = self.lcid >> SUB_LANG_ID_SHIFT;
        aligned & SUB_LANG_ID_MASK
    }
}

impl TryFrom<u32> for &'static LanguageId {
    type Error = LcidLookupError;

    /// Looks up the statically defined [`LanguageId`] for a numeric LCID.
    ///
    /// The 32-bit value is interpreted as follows; the language ID is the
    /// primary and sub language ID combined:
    ///
    /// ```text
    /// +----------+---------+-------------+
    /// | Reserved | Sort ID | Language ID |
    /// +----------+---------+-------------+
    /// 31      20 19     16 15            0 bit
    /// ```
    ///
    /// # Errors
    ///
    /// * [`LcidLookupError::ReservedBits`] if any reserved bit (20-31) is set;
    ///   this is reported in preference to a sort ID
    /// * [`LcidLookupError::SortIdBits`] if the LCID has a sort ID (bits 16-19)
    /// * The LCID maps to an unknown, reserved, or neither defined nor
    ///   reserved language
    fn try_from(value: u32) -> std::result::Result<Self, Self::Error> {
        let reserved_bits = (value >> RESERVED_BITS_SHIFT) & RESERVED_BITS_MASK;
        let sort_id_bits = (value >> SORT_ID_BITS_SHIFT) & SORT_ID_BITS_MASK;

        // Only a value with both upper fields clear is handed to the lookup.
        match (reserved_bits != 0, sort_id_bits != 0) {
            (true, _) => return Err(Self::Error::ReservedBits(value, reserved_bits)),
            (false, true) => return Err(Self::Error::SortIdBits(value, sort_id_bits)),
            (false, false) => {}
        }

        // The lookup itself is generated from JSON.
        use constants::*;
        parse_lcid!(value)
    }
}

impl TryFrom<&str> for &'static LanguageId {
    type Error = NameLookupError;

    /// Try to identify a [`LanguageId`] from a name. This returns an
    /// error if the name is unknown or reserved.
    ///
    /// # Errors
    ///
    /// Returns a [`NameLookupError`] when no [`LanguageId`] can be returned
    /// for `value`.
    fn try_from(value: &str) -> std::result::Result<Self, Self::Error> {
        // Generated from JSON
        // NOTE(review): `parse_name!` is not defined in this file; presumably
        // it is exported by the `#[macro_use]` `constants` module, and the
        // glob import puts the items its expansion refers to in scope.
        use constants::*;
        parse_name!(value)
    }
}

#[cfg(test)]
mod tests {
    use assert_matches::assert_matches;
    use std::convert::TryInto;

    use super::*;

    /// Every mask must have exactly as many bits set as its field is wide.
    #[test]
    fn bit_mask_size() {
        let widths: [(&str, u32, u32); 4] = [
            ("primary language ID", PRIMARY_LANG_ID_MASK, 10),
            ("sub language ID", SUB_LANG_ID_MASK, 6),
            ("reserved", RESERVED_BITS_MASK, 12),
            ("sort ID", SORT_ID_BITS_MASK, 4),
        ];
        for &(field, mask, width) in &widths {
            assert_eq!(mask.count_ones(), width, "width of the {} mask", field);
        }
    }

    /// Shifting each mask into place must yield the field's position in the
    /// LCID, and the four fields together must cover all 32 bits exactly once.
    #[test]
    fn bit_mask_pos() {
        // (mask, shift, expected mask at its position within the LCID)
        let fields: [(u32, u32, u32); 4] = [
            (PRIMARY_LANG_ID_MASK, PRIMARY_LANG_ID_SHIFT, 0x0000_03ff),
            (SUB_LANG_ID_MASK, SUB_LANG_ID_SHIFT, 0x0000_fc00),
            (SORT_ID_BITS_MASK, SORT_ID_BITS_SHIFT, 0x000f_0000),
            (RESERVED_BITS_MASK, RESERVED_BITS_SHIFT, 0xfff0_0000),
        ];

        for &(mask, shift, positioned) in &fields {
            assert_eq!(mask.count_ones(), positioned.count_ones());
            assert_eq!(mask << shift, positioned);
        }

        // Full coverage: the union of all fields is every bit ...
        let all_bits = fields
            .iter()
            .fold(0u32, |acc, &(_, _, positioned)| acc | positioned);
        assert_eq!(all_bits, u32::MAX);

        // ... and no overlap: the bit counts add up to the width of a u32.
        let all_count: u32 = fields
            .iter()
            .map(|&(_, _, positioned)| positioned.count_ones())
            .sum();
        assert_eq!(all_count, u32::MAX.count_ones());
    }

    /// Any non-zero reserved field is rejected and reported back verbatim.
    #[test]
    fn lcid_conv_reserved() {
        for reserved_bits in 1..=RESERVED_BITS_MASK {
            let lcid = reserved_bits << RESERVED_BITS_SHIFT;
            let lookup: std::result::Result<&LanguageId, LcidLookupError> = lcid.try_into();
            assert_matches!(
                lookup.unwrap_err(),
                LcidLookupError::ReservedBits(v, r) if v == lcid && r == reserved_bits
            );
        }
    }

    /// Any non-zero sort ID field is rejected and reported back verbatim.
    #[test]
    fn lcid_conv_sort_id() {
        for sort_id_bits in 1..=SORT_ID_BITS_MASK {
            let lcid = sort_id_bits << SORT_ID_BITS_SHIFT;
            let lookup: std::result::Result<&LanguageId, LcidLookupError> = lcid.try_into();
            assert_matches!(
                lookup.unwrap_err(),
                LcidLookupError::SortIdBits(v, r) if v == lcid && r == sort_id_bits
            );
        }
    }

    /// A code page converts to its number both by cast and by `From`/`Into`.
    #[test]
    fn ansi_code_page_to_u32() {
        let via_cast = AnsiCodePage::Windows1252 as u32;
        assert_eq!(via_cast, 1252u32);

        let via_from = u32::from(AnsiCodePage::Windows1252);
        assert_eq!(via_from, 1252u32);
    }
}