icu_data/ucm/
mod.rs

1//! This module contains a UniCode Mapping (`.ucm`) file format parser and all of the data files
2//! available in the Unicode Consortium's `icu-data` repository. For a list, see [`KNOWN_CHARSETS`].
3//!
4//! Most uses of this library should look like this:
5//!
6//! ```
7//! use icu_data::ucm::{request_mapping_file, parser::parse as parse_ucm};
8//!
9//! let f = request_mapping_file("java-EUC_JP-1.3_P").unwrap(); // holds the .ucm file as a String
10//! let enc = parse_ucm(&f).unwrap(); // holds an `Encoding`
11//! /* ... */
12//! ```
13//!
//! If you only want a single encoding, each one is available in the module named [`mappings`].
//! They are all [`lazy_static`] types, so each is only evaluated when it is used. That evaluation
//! can panic, because it runs the same code as above, but they all work on my machine, and will
//! only ever panic if Brotli decompression or [`tar`] metadata parsing fails.
18//!
19//! Example:
20//! ```
21//! use icu_data::ucm::mappings;
22//! assert_eq!(mappings::JAVA_EUC_JP_1_3_P.codepoints.len(), 13139);
23//! ```
24
25mod charsets;
26use charsets::{BROTLI_UCM_ARRAYS, CHARSET_LOOKUP};
27pub use charsets::KNOWN_CHARSETS;
28mod errors;
29pub use errors::IcuDataError;
30pub mod mappings;
31pub mod parser;
32pub use parser::Parser as PestParser;
33mod types;
34pub use types::{Codepoint, Encoding, EquivalenceType};
35mod util;
36
37use brotli;
38use tar;
39
40use std::path::PathBuf;
41
42/// Given the name of an encoding known to ICU, return its raw UCM data as a String.
43///
44/// You should not request encoding filenames (with `.ucm`), but it'll still be understood.
45///
46/// Internally, this function considers 105 byte arrays stored as static strings. It takes the name
47/// of the mapping and looks it up in `CHARSET_LOOKUP`, which tells it what byte array contains the
48/// mapping. For example, `CHARSET_LOOKUP["ibm-737_P100-1997.ucm"]` is `58`. So, we know that
49/// `BYTES_58` contains the file. `BYTES_58` is defined as:
50///
51/// ```no_run
52/// include_bytes!("../../resources/brotli/ibm-737_P100-1997ibm-775_P100-1996ibm-803_P100-1999ibm-806_P100-1998ibm-808_P100-1999ibm-813_P100-1995ibm-819_P100-1999ibm-833_P100-1995ibm-834_P100-1995ibm-834_X100-1995.ucm.tar.b");
53/// ```
54///
55/// So, we un-Brotli compress the data in `BYTES_58` and then send it through a `.tar` file parser.
56/// We iterate through the metadata entries until we find one equal to `ibm-737_P100-1997.ucm`. We
57/// then clone that data to a `String` type because only the compressed versions are owned in
58/// memory and return.
59///
60/// This function returns a `Result<_, _>` type because users may provide unknown mappings. The
61/// only error you should ever receive is [`IcuDataError::UnknownMappingRequested`], all the valid
62/// mappings have been tested and decompress on my machine.
63pub fn request_mapping_file(mapping: &str) -> Result<String, IcuDataError> {
64    // We do this in case we're given a mapping file with a .ucm extension
65	let mapping = util::remove_suffix(mapping, ".ucm");
66	let request = format!("{}.ucm", mapping);
67	let index = CHARSET_LOOKUP.get(request.as_str()).ok_or(IcuDataError::UnknownMappingRequested)?;
68	let mut bytes = BROTLI_UCM_ARRAYS[*index].clone();
69
70	let mut brotli_decompressed = vec![];
71	brotli::BrotliDecompress(&mut bytes, &mut brotli_decompressed).or(Err(IcuDataError::BrotliDecompressionFailure))?;
72	let mut tar_archive = tar::Archive::new(&*brotli_decompressed);
73	// e.g. aix-IBM_858-4.3.6; windows-862-2000
74	let mut found = None;
75	for entry in tar_archive.entries().or(Err(IcuDataError::TarArchiveEntriesReadError))?.into_iter() {
76		let e = entry.or(Err(IcuDataError::TarArchiveEntryParseError))?;
77		if *e.path().or(Err(IcuDataError::TarArchivePathParseError))? == PathBuf::from(request.as_str()) {
78			found = Some(e);
79			break
80		}
81	}
82	let found = found.ok_or_else(||IcuDataError::UnknownMappingRequested)?;
83	let begin = found.raw_file_position() as usize;
84	let end = begin + found.size() as usize;
85	Ok( String::from_utf8(brotli_decompressed[begin..end].to_vec()).or(Err(IcuDataError::MappingFileNotUtf8))? )
86}
87
#[cfg(test)]
mod tests {
    use super::*;

    /// A known mapping name can be fetched without error.
    #[test]
    fn fetching_works() {
        // Windows Code Page 862 (Hebrew)
        let _ucm_text = request_mapping_file("windows-862-2000").unwrap();
    }

    /// A fetched mapping parses into an `Encoding` with the expected contents.
    #[test]
    fn parsing_works() {
        let ucm_text = request_mapping_file("java-EUC_JP-1.3_P").unwrap();
        let encoding = parser::parse(&ucm_text).unwrap();

        assert_eq!(encoding.codepoints.len(), 13139);
        assert_eq!(encoding.codepoints[0].uni, '\u{0}');
        assert_eq!(encoding.states.len(), 5);
        assert_eq!(encoding.metadata.len(), 5);

        // Spot-check two of the header fields.
        let expected_metadata = [
            ("code_set_name", "java-EUC_JP-1.3_P"),
            ("uconv_class", "MBCS"),
        ];
        for &(key, value) in expected_metadata.iter() {
            assert_eq!(encoding.metadata[key], value);
        }
    }

    /// The public charset list contains a known name and has the expected length.
    #[test]
    fn known_charsets_works() {
        let expected_name = "windows-862-2000";
        assert!(KNOWN_CHARSETS.contains(&expected_name));
        assert_eq!(KNOWN_CHARSETS.len(), 1049);
    }
}
116}