1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
extern crate byteorder;
extern crate encoding;

use std::borrow::Cow;
use std::default::Default;
use std::error;
use std::fmt;
use std::io;

use self::byteorder::{BigEndian, ByteOrder, LittleEndian};
use self::encoding::label::encoding_from_whatwg_label;
use self::encoding::types::DecoderTrap::Strict;
use self::encoding::types::EncodingRef;

use super::plurals::{Ast, Resolver};
use super::{Catalog, Message};
use metadata::parse_metadata;

/// Encoding used by `parse_catalog` to decode catalog strings unless
/// `ParseOptions::force_encoding` supplies another one or the catalog
/// metadata names a different charset.
#[allow(non_upper_case_globals)]
static utf8_encoding: EncodingRef = &encoding::codec::utf_8::UTF8Encoding;

/// Represents an error encountered while parsing an MO file.
#[derive(Debug)]
pub enum Error {
    /// An incorrect magic number has been encountered: the first four bytes
    /// match neither the little-endian nor the big-endian magic sequence.
    BadMagic,
    /// An invalid byte sequence for the given encoding has been encountered
    DecodingError,
    /// An unexpected EOF occurred: the input is shorter than its header or
    /// string tables claim.
    Eof,
    /// An I/O error occurred while reading the input
    Io(io::Error),
    /// Incorrect syntax encountered while parsing the meta information
    MalformedMetadata,
    /// Meta information string (the entry with an empty message id)
    /// was not the first string in the catalog
    MisplacedMetadata,
    /// Invalid Plural-Forms metadata
    PluralParsing,
    /// An unknown encoding was specified in the metadata
    UnknownEncoding,
}
use Error::*;

impl error::Error for Error {
    fn description(&self) -> &str {
        match *self {
            BadMagic => "bad magic number",
            DecodingError => "invalid byte sequence in a string",
            Eof => "unxpected end of file",
            Io(ref err) => err.description(),
            MalformedMetadata => "metadata syntax error",
            MisplacedMetadata => "misplaced metadata",
            UnknownEncoding => "unknown encoding specified",
            PluralParsing => "invalid plural expression",
        }
    }
}

impl fmt::Display for Error {
    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
        let self_err: &error::Error = self;
        write!(fmt, "{}", self_err.description())
    }
}

impl From<io::Error> for Error {
    fn from(inner: io::Error) -> Error {
        Io(inner)
    }
}

impl From<Cow<'static, str>> for Error {
    fn from(_: Cow<'static, str>) -> Error {
        DecodingError
    }
}

/// ParseOptions allows setting options for parsing MO catalogs.
///
/// Options are set with the chainable `force_encoding` / `force_plural`
/// methods and applied by calling `parse`.
///
/// # Examples
/// ```ignore
/// use std::fs::File;
///
/// extern crate encoding;
/// use encoding::all::ISO_8859_1;
///
/// let file = File::open("french.mo").unwrap();
/// let catalog = ParseOptions::new().force_encoding(ISO_8859_1).parse(file).unwrap();
/// ```
// NOTE(review): `Debug` is presumably not derived because the field types do
// not all implement it — confirm before removing the `allow`.
#[allow(missing_debug_implementations)]
#[derive(Default)]
pub struct ParseOptions {
    /// Encoding to decode every string with; `None` leaves the choice to the parser.
    force_encoding: Option<EncodingRef>,
    /// Plural formula to install in the catalog; `None` leaves the choice to the parser.
    force_plural: Option<fn(u64) -> usize>,
}

impl ParseOptions {
    /// Returns a new instance of ParseOptions with default options.
    pub fn new() -> Self {
        Default::default()
    }

    /// Tries to parse the catalog from the given reader using the specified options.
    pub fn parse<R: io::Read>(self, reader: R) -> Result<Catalog, Error> {
        parse_catalog(reader, self)
    }

    /// Forces a use of a specific encoding
    /// when parsing strings from a catalog.
    /// If this option is not enabled,
    /// the parser tries to use the encoding specified in the metadata
    /// or UTF-8 if metadata is non-existent.
    pub fn force_encoding(mut self, encoding: EncodingRef) -> Self {
        self.force_encoding = Some(encoding);
        self
    }

    /// Forces a use of the given plural formula
    /// for deciding the proper plural form for a message.
    /// If this option is not enabled,
    /// the parser uses the default formula
    /// (`n != 1`).
    pub fn force_plural(mut self, plural: fn(u64) -> usize) -> Self {
        self.force_plural = Some(plural);
        self
    }
}

/// Picks the `u32` reader matching the byte order announced by `magic`.
///
/// `magic` must be exactly the four magic bytes of an MO file: `de 12 04 95`
/// selects the little-endian reader and `95 04 12 de` the big-endian one.
/// Any other slice (including one of a different length) yields `None`.
fn get_read_u32_fn(magic: &[u8]) -> Option<fn(&[u8]) -> u32> {
    const MAGIC_LE: [u8; 4] = [0xde, 0x12, 0x04, 0x95];
    const MAGIC_BE: [u8; 4] = [0x95, 0x04, 0x12, 0xde];

    match magic {
        m if m == MAGIC_LE => Some(LittleEndian::read_u32),
        m if m == MAGIC_BE => Some(BigEndian::read_u32),
        _ => None,
    }
}

pub fn parse_catalog<'a, R: io::Read>(mut file: R, opts: ParseOptions) -> Result<Catalog, Error> {
    let mut contents = vec![];
    let n = try!(file.read_to_end(&mut contents));
    if n < 28 {
        return Err(Eof);
    }

    let read_u32 = match get_read_u32_fn(&contents[0..4]) {
        Some(f) => f,
        None => return Err(BadMagic),
    };

    // ignore hashing tables (bytes at 20..28)
    let num_strings = read_u32(&contents[8..12]) as usize;
    let mut off_otable = read_u32(&contents[12..16]) as usize;
    let mut off_ttable = read_u32(&contents[16..20]) as usize;
    if n < off_otable || n < off_ttable {
        return Err(Eof);
    }

    let mut catalog = Catalog::new();
    if let Some(f) = opts.force_plural {
        catalog.resolver = Resolver::Function(f);
    }
    let mut encoding = opts.force_encoding.unwrap_or(utf8_encoding);

    for i in 0..num_strings {
        // Parse the original string
        if n < off_otable + 8 {
            return Err(Eof);
        }
        let len = read_u32(&contents[off_otable..off_otable + 4]) as usize;
        let off = read_u32(&contents[off_otable + 4..off_otable + 8]) as usize;
        // +1 compensates for the ending NUL byte which is not included in length
        if n < off + len + 1 {
            return Err(Eof);
        }
        let mut original = &contents[off..off + len + 1];
        // check for context
        let context = match original.iter().position(|x| *x == 4) {
            Some(idx) => {
                let ctx = &original[..idx];
                original = &original[idx + 1..];
                Some(try!(encoding.decode(ctx, Strict)))
            }
            None => None,
        };
        // extract msg_id singular, ignoring the plural
        let id = match original
            .iter()
            .position(|x| *x == 0)
            .map(|i| &original[..i])
        {
            Some(b) => try!(encoding.decode(b, Strict)),
            None => return Err(Eof),
        };
        if id == "" && i != 0 {
            return Err(MisplacedMetadata);
        }

        // Parse the translation strings
        if n < off_ttable + 8 {
            return Err(Eof);
        }
        let len = read_u32(&contents[off_ttable..off_ttable + 4]) as usize;
        let off = read_u32(&contents[off_ttable + 4..off_ttable + 8]) as usize;
        // +1 compensates for the ending NUL byte which is not included in length
        if n < off + len + 1 {
            return Err(Eof);
        }
        let translated = try!(
            (&contents[off..off + len])
                .split(|x| *x == 0)
                .map(|b| encoding.decode(b, Strict))
                .collect::<Result<Vec<_>, _>>()
        );
        if id == "" {
            let map = parse_metadata(&*translated[0])?;
            if let (Some(c), None) = (map.charset(), opts.force_encoding) {
                encoding = match encoding_from_whatwg_label(c) {
                    Some(enc_ref) => enc_ref,
                    None => return Err(UnknownEncoding),
                }
            }
            if opts.force_plural.is_none() {
                if let Some(p) = map.plural_forms().1 {
                    catalog.resolver = Ast::parse(p).map(Resolver::Expr)?;
                }
            }
        }

        catalog.insert(Message::new(id, context, translated));

        off_otable += 8;
        off_ttable += 8;
    }

    Ok(catalog)
}

/// The default plural resolver.
///
/// It is used when no `Plural-Forms` header is found in the .mo file and
/// `ParseOptions::force_plural` was not called.
///
/// It is valid for English and similar languages: the singular form (index 0)
/// is chosen for a quantity of exactly 1, and the plural form (index 1) for
/// any other quantity, including 0.
pub fn default_resolver(n: u64) -> usize {
    match n {
        1 => 0,
        _ => 1,
    }
}

/// Checks that `get_read_u32_fn` rejects malformed magic values and returns
/// a reader with the right byte order for each of the two valid ones.
#[test]
fn test_get_read_u32_fn() {
    // Anything that is not exactly four magic bytes is rejected.
    assert!(get_read_u32_fn(&[]).is_none());
    assert!(get_read_u32_fn(&[0xde, 0x12, 0x04, 0x95, 0x00]).is_none());

    // The readers are identified by what they return for an asymmetric
    // sample rather than by comparing function addresses: that needs no
    // `unsafe`, and function addresses are not guaranteed to be unique.
    let sample = [0x01, 0x02, 0x03, 0x04];
    assert!(LittleEndian::read_u32(&sample) != BigEndian::read_u32(&sample));

    {
        let read_le = get_read_u32_fn(&[0xde, 0x12, 0x04, 0x95]).unwrap();
        assert_eq!(read_le(&sample), LittleEndian::read_u32(&sample));
        assert_eq!(read_le(&sample), 0x0403_0201);
    }

    {
        let read_be = get_read_u32_fn(&[0x95, 0x04, 0x12, 0xde]).unwrap();
        assert_eq!(read_be(&sample), BigEndian::read_u32(&sample));
        assert_eq!(read_be(&sample), 0x0102_0304);
    }
}

/// Exercises `parse_catalog` on hand-built headers and on the `.mo` fixtures
/// under `../test_cases/`.
#[test]
fn test_parse_catalog() {
    // Asserts that `$value` matches the unit-like variant `$variant`,
    // printing both on mismatch.
    macro_rules! assert_variant {
        ($value:expr, $variant:path) => {
            match $value {
                $variant => (),
                _ => panic!("Expected {:?}, got {:?}", $variant, $value),
            }
        };
    }

    let fluff = [0; 24]; // zeros to pad our magic test cases to satisfy the length requirements

    // 3 + 24 = 27 bytes: one byte short of the 28-byte minimum.
    {
        let mut reader = vec![1u8, 2, 3];
        reader.extend(fluff.iter().cloned());
        let err = parse_catalog(&reader[..], ParseOptions::new()).unwrap_err();
        assert_variant!(err, Eof);
    }

    // Long enough, but the first four bytes are not a valid magic number.
    {
        let mut reader = vec![1u8, 2, 3, 4];
        reader.extend(fluff.iter().cloned());
        let err = parse_catalog(&reader[..], ParseOptions::new()).unwrap_err();
        assert_variant!(err, BadMagic);
    }

    // Big-endian magic followed by an all-zero header: zero strings, parses fine.
    {
        let mut reader = vec![0x95, 0x04, 0x12, 0xde];
        reader.extend(fluff.iter().cloned());
        assert!(parse_catalog(&reader[..], ParseOptions::new()).is_ok());
    }

    // Little-endian magic followed by an all-zero header: zero strings, parses fine.
    {
        let mut reader = vec![0xde, 0x12, 0x04, 0x95];
        reader.extend(fluff.iter().cloned());
        assert!(parse_catalog(&reader[..], ParseOptions::new()).is_ok());
    }

    // Fixture with a single message that carries a context; the catalog key
    // is the context and the id joined by an EOT (0x04) byte.
    {
        let reader: &[u8] = include_bytes!("../test_cases/1.mo");
        let catalog = parse_catalog(reader, ParseOptions::new()).unwrap();
        assert_eq!(catalog.strings.len(), 1);
        assert_eq!(
            catalog.strings["this is context\x04Text"],
            Message::new("Text", Some("this is context"), vec!["Tekstas", "Tekstai"])
        );
    }

    // Fixture with two entries; a message without context is keyed by its id alone.
    {
        let reader: &[u8] = include_bytes!("../test_cases/2.mo");
        let catalog = parse_catalog(reader, ParseOptions::new()).unwrap();
        assert_eq!(catalog.strings.len(), 2);
        assert_eq!(
            catalog.strings["Image"],
            Message::new("Image", None, vec!["Nuotrauka", "Nuotraukos"])
        );
    }

    // Fixture whose strings are not valid UTF-8 must fail with a decoding error.
    {
        let reader: &[u8] = include_bytes!("../test_cases/invalid_utf8.mo");
        let err = parse_catalog(reader, ParseOptions::new()).unwrap_err();
        assert_variant!(err, DecodingError);
    }
}