1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
// Copyright 2018 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#![doc(html_root_url = "https://docs.rs/codepage/0.1.1")]

//! Mapping between Windows [code page identifiers][1] and
//! [encoding_rs][2] `Encoding`s.
//!
//! [1]: https://docs.microsoft.com/en-us/windows/desktop/intl/code-page-identifiers
//! [2]: https://crates.io/crates/encoding_rs/

extern crate encoding_rs;

#[cfg(test)]
mod tests;

use encoding_rs::*;

/// Looks up the encoding_rs `Encoding` that corresponds to a Windows
/// code page identifier.
///
/// Returns `None` if the identifier is not present in the code page
/// table, i.e. there is no appropriate mapping.
///
/// In some cases, multiple code page identifiers map to a single
/// `Encoding`. For example, `28591` for ISO-8859-1 maps to the
/// windows-1252 encoding, since the two are unified in the Web
/// Platform. The EUC family of CJK encodings has multiple code
/// page identifiers. For example, EUC-KR has `949`, `20949` and
/// `51949`. (At present, x-mac-korean, `10003`, maps to `None`,
/// though.)
///
/// Code page identifiers whose corresponding labels would map to
/// the replacement encoding (for example `50225`) also map to the
/// replacement encoding here. Use `to_encoding_no_replacement` to get
/// `None` for those instead.
pub fn to_encoding(code_page: u16) -> Option<&'static Encoding> {
    // `CODE_PAGES` and `ENCODINGS` are parallel tables of equal length, so
    // walking them in lockstep pairs every identifier with its encoding.
    CODE_PAGES
        .iter()
        .zip(ENCODINGS.iter())
        .find(|&(&candidate, _)| candidate == code_page)
        .map(|(_, &encoding)| encoding)
}

/// Same lookup as `to_encoding`, but code pages that resolve to the
/// replacement encoding are reported as unmapped.
///
/// Returns `None` both when the code page is unknown and when
/// `to_encoding` would return `Some(REPLACEMENT)`; every other result
/// is passed through unchanged.
pub fn to_encoding_no_replacement(code_page: u16) -> Option<&'static Encoding> {
    match to_encoding(code_page) {
        // Hide the replacement encoding from callers that only want
        // encodings they can actually decode with.
        Some(encoding) if encoding == REPLACEMENT => None,
        other => other,
    }
}

/// Returns the preferred code page identifier for an `Encoding`.
///
/// When several code pages share one `Encoding`, the preferred
/// identifier is the one paired with the encoding's first occurrence
/// in the encoding table (for example `949` for EUC-KR).
///
/// Returns `None` for replacement and x-user-defined.
pub fn from_encoding(encoding: &'static Encoding) -> Option<u16> {
    // The replacement encoding is listed in the table (so that
    // `to_encoding` can return it) but has no meaningful code page of
    // its own, so it is rejected up front.
    if encoding == REPLACEMENT {
        return None;
    }
    // Encodings absent from the table, such as x-user-defined, fall out
    // of the search as `None`.
    ENCODINGS
        .iter()
        .zip(CODE_PAGES.iter())
        .find(|&(&candidate, _)| candidate == encoding)
        .map(|(_, &code_page)| code_page)
}

// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

/// Supported code page numbers in estimated order of usage frequency
///
/// This table is parallel to `ENCODINGS`: the code page at index `i`
/// here corresponds to the encoding at index `i` there, so the two
/// arrays must keep the same length and ordering. `to_encoding` scans
/// this table from the front, and `from_encoding` reads the entry at
/// the index of the first matching encoding.
static CODE_PAGES: [u16; 51] = [
    65001, 1200, 1252, 1251, 936, 932, 949, 1250, 1256, 1254, 950, 874, 1255, 1253, 1257, 1258,
    20932, 28592, 28605, 28597, 20866, 54936, 28595, 38598, 28594, 28596, 50221, 21866, 28603,
    28593, 1201, 866, 28600, 28598, 10000, 10017, 28604, 28606, 951, 20936, 20949, 28591, 28599,
    28601, 50220, 50222, 50225, 50227, 51936, 51949, 52936,
];

/// Encodings corresponding to the code page numbers in the same order
///
/// Entry `i` is the encoding for `CODE_PAGES[i]`, so the two arrays
/// must keep the same length and ordering. An encoding appears more
/// than once when several code pages map to it; `from_encoding`
/// returns the code page paired with the first occurrence. The
/// `REPLACEMENT_INIT` entries mark code pages that map to the
/// replacement encoding.
static ENCODINGS: [&'static Encoding; 51] = [
    &UTF_8_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1251_INIT,
    &GBK_INIT,
    &SHIFT_JIS_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1256_INIT,
    &WINDOWS_1254_INIT,
    &BIG5_INIT,
    &WINDOWS_874_INIT,
    &WINDOWS_1255_INIT,
    &WINDOWS_1253_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &EUC_JP_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_7_INIT,
    &KOI8_R_INIT,
    &GB18030_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_8_I_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_6_INIT,
    &ISO_2022_JP_INIT,
    &KOI8_U_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_3_INIT,
    &UTF_16BE_INIT,
    &IBM866_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_8_INIT,
    &MACINTOSH_INIT,
    &X_MAC_CYRILLIC_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_16_INIT,
    &BIG5_INIT,
    &GBK_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_874_INIT,
    &ISO_2022_JP_INIT,
    &ISO_2022_JP_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &GBK_INIT,
    &EUC_KR_INIT,
    &REPLACEMENT_INIT,
];

// END GENERATED CODE