gettext_ng/
parser.rs

1use std::default::Default;
2use std::io;
3
4use byteorder::{BigEndian, ByteOrder, LittleEndian};
5use encoding::label::encoding_from_whatwg_label;
6use encoding::types::DecoderTrap::Strict;
7use encoding::types::EncodingRef;
8
9use crate::metadata::parse_metadata;
10use crate::plurals::{Ast, Resolver};
11use crate::Error::{self, *};
12use crate::{Catalog, Message};
13
// The static keeps a lower-case name, so the naming lint is silenced explicitly.
// `parse_catalog` starts from this encoding when `ParseOptions` does not force one.
#[allow(non_upper_case_globals)]
static utf8_encoding: EncodingRef = &encoding::codec::utf_8::UTF8Encoding;
16
/// Options controlling how an MO catalog is parsed.
///
/// Build a value with [`ParseOptions::new`], optionally chain
/// `force_encoding` / `force_plural`, and finish with `parse`.
///
/// # Examples
/// ```ignore
/// use std::fs::File;
/// use encoding::all::ISO_8859_1;
///
/// let file = File::open("french.mo").unwrap();
/// let catalog = ParseOptions::new().force_encoding(ISO_8859_1).parse(file).unwrap();
/// ```
// NOTE(review): `Debug` is not derived; presumably a field type does not implement it — confirm.
#[allow(missing_debug_implementations)]
#[derive(Default)]
pub struct ParseOptions {
    // Encoding to use for every string; `None` lets the parser choose.
    force_encoding: Option<EncodingRef>,
    // Plural selector mapping a quantity to a plural-form index; `None` lets the parser choose.
    force_plural: Option<fn(u64) -> usize>,
}
33
34impl ParseOptions {
35    /// Returns a new instance of ParseOptions with default options.
36    pub fn new() -> Self {
37        Default::default()
38    }
39
40    /// Tries to parse the catalog from the given reader using the specified options.
41    pub fn parse<R: io::Read>(self, reader: R) -> Result<Catalog, Error> {
42        parse_catalog(reader, self)
43    }
44
45    /// Forces a use of a specific encoding
46    /// when parsing strings from a catalog.
47    /// If this option is not enabled,
48    /// the parser tries to use the encoding specified in the metadata
49    /// or UTF-8 if metadata is non-existent.
50    pub fn force_encoding(mut self, encoding: EncodingRef) -> Self {
51        self.force_encoding = Some(encoding);
52        self
53    }
54
55    /// Forces a use of the given plural formula
56    /// for deciding the proper plural form for a message.
57    /// If this option is not enabled,
58    /// the parser tries to use the plural formula specified in the metadata
59    /// or `n != 1` if metadata is non-existent.
60    pub fn force_plural(mut self, plural: fn(u64) -> usize) -> Self {
61        self.force_plural = Some(plural);
62        self
63    }
64}
65
66/// According to the given magic number of a MO file,
67/// returns the function which reads a `u32` in the relevant endianness.
68fn get_read_u32_fn(magic: &[u8]) -> Option<fn(&[u8]) -> u32> {
69    if magic == [0xde, 0x12, 0x04, 0x95] {
70        Some(LittleEndian::read_u32)
71    } else if magic == [0x95, 0x04, 0x12, 0xde] {
72        Some(BigEndian::read_u32)
73    } else {
74        None
75    }
76}
77
78pub fn parse_catalog<R: io::Read>(mut file: R, opts: ParseOptions) -> Result<Catalog, Error> {
79    let mut contents = vec![];
80    let n = file.read_to_end(&mut contents)?;
81    if n < 28 {
82        return Err(Eof);
83    }
84
85    let read_u32 = get_read_u32_fn(&contents[0..4]).ok_or(BadMagic)?;
86
87    // ignore hashing tables (bytes at 20..28)
88    let num_strings = read_u32(&contents[8..12]) as usize;
89    let mut off_otable = read_u32(&contents[12..16]) as usize;
90    let mut off_ttable = read_u32(&contents[16..20]) as usize;
91    if n < off_otable || n < off_ttable {
92        return Err(Eof);
93    }
94
95    let mut catalog = Catalog::new();
96    if let Some(f) = opts.force_plural {
97        catalog.resolver = Resolver::Function(f);
98    }
99    let mut encoding = opts.force_encoding.unwrap_or(utf8_encoding);
100
101    for i in 0..num_strings {
102        // Parse the original string
103        if n < off_otable + 8 {
104            return Err(Eof);
105        }
106        let len = read_u32(&contents[off_otable..off_otable + 4]) as usize;
107        let off = read_u32(&contents[off_otable + 4..off_otable + 8]) as usize;
108        // +1 compensates for the ending NUL byte which is not included in length
109        if n < off + len + 1 {
110            return Err(Eof);
111        }
112        let mut original = &contents[off..=off + len];
113        // check for context
114        let context = match original.iter().position(|x| *x == 4) {
115            Some(idx) => {
116                let ctx = &original[..idx];
117                original = &original[idx + 1..];
118                Some(encoding.decode(ctx, Strict)?)
119            }
120            None => None,
121        };
122        // extract msg_id singular, ignoring the plural
123        let id = match original
124            .iter()
125            .position(|x| *x == 0)
126            .map(|i| &original[..i])
127        {
128            Some(b) => encoding.decode(b, Strict)?,
129            None => return Err(Eof),
130        };
131        if id == "" && i != 0 {
132            return Err(MisplacedMetadata);
133        }
134
135        // Parse the translation strings
136        if n < off_ttable + 8 {
137            return Err(Eof);
138        }
139        let len = read_u32(&contents[off_ttable..off_ttable + 4]) as usize;
140        let off = read_u32(&contents[off_ttable + 4..off_ttable + 8]) as usize;
141        // +1 compensates for the ending NUL byte which is not included in length
142        if n < off + len + 1 {
143            return Err(Eof);
144        }
145        let translated = contents[off..off + len]
146            .split(|x| *x == 0)
147            .map(|b| encoding.decode(b, Strict))
148            .collect::<Result<Vec<_>, _>>()?;
149        if id == "" {
150            let map = parse_metadata(&*translated[0])?;
151            if let (Some(c), None) = (map.charset(), opts.force_encoding) {
152                encoding = encoding_from_whatwg_label(c).ok_or(UnknownEncoding)?;
153            }
154            if opts.force_plural.is_none() {
155                if let Some(p) = map.plural_forms().1 {
156                    catalog.resolver = Ast::parse(p).map(Resolver::Expr)?;
157                }
158            }
159        }
160
161        catalog.insert(Message::new(id, context, translated));
162
163        off_otable += 8;
164        off_ttable += 8;
165    }
166
167    Ok(catalog)
168}
169
/// The default plural resolver.
///
/// It is used if no `Plural-Forms` header is found in the .mo file and
/// `ParseOptions::force_plural` was not called.
///
/// It is valid for English and similar languages: form `0` (singular) is chosen
/// for a quantity of exactly 1, and form `1` (plural) for any other quantity.
pub fn default_resolver(n: u64) -> usize {
    usize::from(n != 1)
}
184
// Unit tests for magic-number detection and MO catalog parsing.
#[cfg(test)]
mod test {
    use super::*;

    // Slices that are not exactly a four-byte magic are rejected; each valid
    // magic must return the reader function of the matching byte order.
    #[test]
    fn test_get_read_u32_fn() {
        assert!(get_read_u32_fn(&[]).is_none());
        assert!(get_read_u32_fn(&[0xde, 0x12, 0x04, 0x95, 0x00]).is_none());

        {
            // Compare function addresses to check that the little-endian reader was chosen.
            let le_ptr = LittleEndian::read_u32 as *const ();
            let ret_ptr = get_read_u32_fn(&[0xde, 0x12, 0x04, 0x95]).unwrap() as _;
            assert_eq!(le_ptr, ret_ptr);
        }

        {
            // Same check for the big-endian reader.
            let be_ptr = BigEndian::read_u32 as *const ();
            let ret_ptr = get_read_u32_fn(&[0x95, 0x04, 0x12, 0xde]).unwrap() as _;
            assert_eq!(be_ptr, ret_ptr);
        }
    }

    #[test]
    fn test_parse_catalog() {
        // Asserts that `$value` matches the pattern `$variant`; otherwise panics
        // printing both the expected variant and the actual value.
        macro_rules! assert_variant {
            ($value:expr, $variant:path) => {
                match $value {
                    $variant => (),
                    _ => panic!("Expected {:?}, got {:?}", $variant, $value),
                }
            };
        }

        let fluff = [0; 24]; // zeros to pad our magic test cases to satisfy the length requirements

        {
            // 3 + 24 = 27 bytes, one short of the 28-byte minimum.
            let mut reader = vec![1u8, 2, 3];
            reader.extend(fluff.iter().cloned());
            let err = parse_catalog(&reader[..], ParseOptions::new()).unwrap_err();
            assert_variant!(err, Eof);
        }

        {
            // Long enough, but the first four bytes are not a known magic number.
            let mut reader = vec![1u8, 2, 3, 4];
            reader.extend(fluff.iter().cloned());
            let err = parse_catalog(&reader[..], ParseOptions::new()).unwrap_err();
            assert_variant!(err, BadMagic);
        }

        {
            // Big-endian magic followed by an all-zero header (zero strings) parses successfully.
            let mut reader = vec![0x95, 0x04, 0x12, 0xde];
            reader.extend(fluff.iter().cloned());
            assert!(parse_catalog(&reader[..], ParseOptions::new()).is_ok());
        }

        {
            // Little-endian magic followed by an all-zero header (zero strings) parses successfully.
            let mut reader = vec![0xde, 0x12, 0x04, 0x95];
            reader.extend(fluff.iter().cloned());
            assert!(parse_catalog(&reader[..], ParseOptions::new()).is_ok());
        }

        {
            // Fixture expected to hold a single message that carries a context.
            let reader: &[u8] = include_bytes!("../test_cases/1.mo");
            let catalog = parse_catalog(reader, ParseOptions::new()).unwrap();
            assert_eq!(catalog.strings.len(), 1);
            assert_eq!(
                catalog.strings["this is context\x04Text"],
                Message::new("Text", Some("this is context"), vec!["Tekstas", "Tekstai"])
            );
        }

        {
            // Fixture expected to hold two entries, one of them a context-free "Image" message.
            let reader: &[u8] = include_bytes!("../test_cases/2.mo");
            let catalog = parse_catalog(reader, ParseOptions::new()).unwrap();
            assert_eq!(catalog.strings.len(), 2);
            assert_eq!(
                catalog.strings["Image"],
                Message::new("Image", None, vec!["Nuotrauka", "Nuotraukos"])
            );
        }

        {
            // Fixture whose strings fail strict decoding with the default encoding.
            let reader: &[u8] = include_bytes!("../test_cases/invalid_utf8.mo");
            let err = parse_catalog(reader, ParseOptions::new()).unwrap_err();
            assert_variant!(err, DecodingError);
        }
    }
}