icu_data/ucm/mod.rs
1//! This module contains a UniCode Mapping (`.ucm`) file format parser and all of the data files
2//! available in the Unicode Consortium's `icu-data` repository. For a list, see [`KNOWN_CHARSETS`].
3//!
4//! Most uses of this library should look like this:
5//!
6//! ```
7//! use icu_data::ucm::{request_mapping_file, parser::parse as parse_ucm};
8//!
9//! let f = request_mapping_file("java-EUC_JP-1.3_P").unwrap(); // holds the .ucm file as a String
10//! let enc = parse_ucm(&f).unwrap(); // holds an `Encoding`
11//! /* ... */
12//! ```
13//!
//! If you only want a single encoding, they're all in the module named [`mappings`]. They are all
//! [`lazy_static`] values, so each one is only evaluated the first time it is used. That
//! evaluation is just the code above, so it can panic; however, every bundled mapping has been
//! tested, and a panic will only ever occur if Brotli decompression or [`tar`] metadata parsing fails.
18//!
19//! Example:
20//! ```
21//! use icu_data::ucm::mappings;
22//! assert_eq!(mappings::JAVA_EUC_JP_1_3_P.codepoints.len(), 13139);
23//! ```
24
25mod charsets;
26use charsets::{BROTLI_UCM_ARRAYS, CHARSET_LOOKUP};
27pub use charsets::KNOWN_CHARSETS;
28mod errors;
29pub use errors::IcuDataError;
30pub mod mappings;
31pub mod parser;
32pub use parser::Parser as PestParser;
33mod types;
34pub use types::{Codepoint, Encoding, EquivalenceType};
35mod util;
36
37use brotli;
38use tar;
39
40use std::path::PathBuf;
41
42/// Given the name of an encoding known to ICU, return its raw UCM data as a String.
43///
44/// You should not request encoding filenames (with `.ucm`), but it'll still be understood.
45///
46/// Internally, this function considers 105 byte arrays stored as static strings. It takes the name
47/// of the mapping and looks it up in `CHARSET_LOOKUP`, which tells it what byte array contains the
48/// mapping. For example, `CHARSET_LOOKUP["ibm-737_P100-1997.ucm"]` is `58`. So, we know that
49/// `BYTES_58` contains the file. `BYTES_58` is defined as:
50///
51/// ```no_run
52/// include_bytes!("../../resources/brotli/ibm-737_P100-1997ibm-775_P100-1996ibm-803_P100-1999ibm-806_P100-1998ibm-808_P100-1999ibm-813_P100-1995ibm-819_P100-1999ibm-833_P100-1995ibm-834_P100-1995ibm-834_X100-1995.ucm.tar.b");
53/// ```
54///
55/// So, we un-Brotli compress the data in `BYTES_58` and then send it through a `.tar` file parser.
56/// We iterate through the metadata entries until we find one equal to `ibm-737_P100-1997.ucm`. We
57/// then clone that data to a `String` type because only the compressed versions are owned in
58/// memory and return.
59///
60/// This function returns a `Result<_, _>` type because users may provide unknown mappings. The
61/// only error you should ever receive is [`IcuDataError::UnknownMappingRequested`], all the valid
62/// mappings have been tested and decompress on my machine.
63pub fn request_mapping_file(mapping: &str) -> Result<String, IcuDataError> {
64 // We do this in case we're given a mapping file with a .ucm extension
65 let mapping = util::remove_suffix(mapping, ".ucm");
66 let request = format!("{}.ucm", mapping);
67 let index = CHARSET_LOOKUP.get(request.as_str()).ok_or(IcuDataError::UnknownMappingRequested)?;
68 let mut bytes = BROTLI_UCM_ARRAYS[*index].clone();
69
70 let mut brotli_decompressed = vec![];
71 brotli::BrotliDecompress(&mut bytes, &mut brotli_decompressed).or(Err(IcuDataError::BrotliDecompressionFailure))?;
72 let mut tar_archive = tar::Archive::new(&*brotli_decompressed);
73 // e.g. aix-IBM_858-4.3.6; windows-862-2000
74 let mut found = None;
75 for entry in tar_archive.entries().or(Err(IcuDataError::TarArchiveEntriesReadError))?.into_iter() {
76 let e = entry.or(Err(IcuDataError::TarArchiveEntryParseError))?;
77 if *e.path().or(Err(IcuDataError::TarArchivePathParseError))? == PathBuf::from(request.as_str()) {
78 found = Some(e);
79 break
80 }
81 }
82 let found = found.ok_or_else(||IcuDataError::UnknownMappingRequested)?;
83 let begin = found.raw_file_position() as usize;
84 let end = begin + found.size() as usize;
85 Ok( String::from_utf8(brotli_decompressed[begin..end].to_vec()).or(Err(IcuDataError::MappingFileNotUtf8))? )
86}
87
#[cfg(test)]
mod tests {
    use super::*;

    /// A known mapping can be looked up, decompressed and extracted without error.
    #[test]
    fn fetching_works() {
        // Windows Code Page 862 (Hebrew)
        let _raw_ucm = request_mapping_file("windows-862-2000").unwrap();
    }

    /// A fetched mapping parses into an `Encoding` with the expected contents.
    #[test]
    fn parsing_works() {
        let ucm_text = request_mapping_file("java-EUC_JP-1.3_P").unwrap();
        let encoding = parser::parse(&ucm_text).unwrap();

        // Codepoint table.
        let codepoints = &encoding.codepoints;
        assert_eq!(codepoints.len(), 13139);
        assert_eq!(codepoints[0].uni, '\u{0}');

        // State machine rows.
        assert_eq!(encoding.states.len(), 5);

        // Header metadata.
        let metadata = &encoding.metadata;
        assert_eq!(metadata.len(), 5);
        assert_eq!(metadata["code_set_name"], "java-EUC_JP-1.3_P");
        assert_eq!(metadata["uconv_class"], "MBCS");
    }

    /// The public charset list contains a known name and has the expected size.
    #[test]
    fn known_charsets_works() {
        let expected_name = "windows-862-2000";
        assert!(KNOWN_CHARSETS.contains(&expected_name));
        assert_eq!(KNOWN_CHARSETS.len(), 1049);
    }
}
116}