locale_codes/codeset.rs
1/*!
2Character sets registered with IANA.
3
4These are the official names for character sets that may be used in
5the Internet and may be referred to in Internet documentation. These
6names are expressed in ANSI_X3.4-1968 which is commonly called
US-ASCII or simply ASCII. The character set most commonly used on the
Internet, and used especially in protocol standards, is US-ASCII; its
use is strongly encouraged. The use of the name US-ASCII is also
encouraged.
11
12## Source - IANA
13
The data used here is taken from the tables on the HTML page
15[IANA](https://www.iana.org/assignments/character-sets/character-sets.xhtml).
16
17See also: [RFC-2978](https://tools.ietf.org/html/rfc2978) IANA Charset
18Registration Procedures.
19*/
20
21use std::collections::HashMap;
22
23use serde::{Deserialize, Serialize};
24
25// ------------------------------------------------------------------------------------------------
26// Public Types
27// ------------------------------------------------------------------------------------------------
28
29/// A representation of registrered character set data that maintained by IANA.
30#[derive(Serialize, Deserialize, Debug)]
31pub struct CodesetInfo {
32 /// The name, not a code, for this code set.
33 pub name: String,
34 /// Any well known aliases for this code set.
35 pub also_known_as: Vec<String>,
36 /// The IANA registered MIB code.
37 pub mib_code: u32,
38 /// Sources identified in the IANA registration.
39 pub source: Option<String>,
40 /// References identified in the IANA registration.
41 pub references: Option<String>,
42}
43
44// ------------------------------------------------------------------------------------------------
45// Public Functions
46// ------------------------------------------------------------------------------------------------
47
48lazy_static! {
49 static ref CODESETS: HashMap<String, CodesetInfo> = load_code_sets_from_json();
50}
51
52/// Lookup a `CodesetInfo` based on it's name, returning `None` if the name
53/// does not exist in the current IANA data set.
54pub fn lookup(name: &str) -> Option<&'static CodesetInfo> {
55 assert!(name.len() > 0, "codeset name may not be empty");
56 CODESETS.get(name)
57}
58
59/// Return all the registered script names.
60pub fn all_names() -> Vec<String> {
61 CODESETS.keys().cloned().collect()
62}
63
64// ------------------------------------------------------------------------------------------------
65// Generated Data
66// ------------------------------------------------------------------------------------------------
67
68fn load_code_sets_from_json() -> HashMap<String, CodesetInfo> {
69 info!("load_code_sets_from_json - loading JSON");
70 let raw_data = include_bytes!("data/codesets.json");
71 let code_set_map: HashMap<String, CodesetInfo> = serde_json::from_slice(raw_data).unwrap();
72 info!(
73 "load_code_sets_from_json - loaded {} codes ets",
74 code_set_map.len()
75 );
76 code_set_map
77}
78
79// ------------------------------------------------------------------------------------------------
80// Unit Tests
81// ------------------------------------------------------------------------------------------------
82
#[cfg(test)]
mod tests {
    use super::*;

    // --------------------------------------------------------------------------------------------
    /// A name expected to be present in the data resolves to an entry that
    /// carries the same name.
    #[test]
    fn test_good_codeset_code() {
        let codeset = lookup("UTF-8").expect("was expecting a codeset");
        assert_eq!(codeset.name, "UTF-8");
    }

    /// A name that is absent from the data yields `None`.
    #[test]
    fn test_bad_codeset_code() {
        assert!(
            lookup("UTF-99").is_none(),
            "was expecting a None in response"
        );
    }
}
103}