translate_storage/
po.rs

1//! Handling of [Uniforum Portable Objects][PO]
2//!
3//! This format is used by the well known [gettext] suite and also supported by the
4//! [translate-toolkit][tt] suite. It is a simple text format storing translation units with
5//! optional context and plural variants.
6//!
//! For modern translation work its disadvantage is that the plural system only supports integers.
8//!
9//! [PO]: https://www.gnu.org/software/gettext/manual/html_node/PO-Files.html
10//! [gettext]: https://www.gnu.org/software/gettext/
11//! [tt]: http://toolkit.translatehouse.org/
12
13use locale_config::LanguageRange;
14use regex::{Regex,Captures};
15use std::collections::{BTreeMap,HashMap};
16use std::io::{BufRead,Lines};
17use std::iter::Peekable;
18use super::{CatalogueReader,Count,Error,Message,Origin,State,Unit};
19
/// One classified line of a PO file.
///
/// Every variant except `Blank` stores the line number it was created with as its first field.
#[derive(Clone, Debug)]
enum PoLine {
    /// `(number, kind, content)` — `kind` is the character following `#`;
    /// translator comments use a space as their kind.
    Comment(usize, char, String),

    /// `(number, obsolete/previous flag, tag, string)`
    Message(usize, String, String, String),

    /// `(number, obsolete/previous flag, string)`
    Continuation(usize, String, String),

    /// A line without content; carries no data.
    Blank,
}
31
/// Wraps a buffered reader's line iterator together with a running line counter.
struct LineIter<R>
where
    R: BufRead,
{
    /// Line counter; starts at whatever the constructor sets and is used for error positions.
    _n: usize,
    /// The underlying source of raw text lines.
    _inner: Lines<R>,
}
36
37lazy_static!{
38    static ref MESSAGE_RE: Regex = Regex::new(
39        r#"^\s*(#~?\|?)?\s*(msgctxt|msgid|msgif_plural|msgstr(?:\[[012345]\])?)?\s*"(.*)"\s*$"#)
40        .unwrap();
41    static ref COMMENT_RE: Regex = Regex::new(
42        r#"^\s*#([:.,]?)\s*(.*)"#).unwrap();
43
44    static ref UNESCAPE_RE: Regex = Regex::new(r#"\\[rtn"\\]"#).unwrap();
45    static ref UNESCAPE_MAP: HashMap<&'static str, &'static str> = [
46        (r"\r", "\r"),
47        (r"\t", "\t"),
48        (r"\n", "\n"),
49        ("\\\"", "\""),
50        (r"\\", r"\"),
51    ].iter().cloned().collect();
52}
53
54fn parse_po_line(line: &str, n: usize) -> Result<PoLine, ()> {
55    if !line.contains(|c: char| !c.is_whitespace()) {
56        return Ok(PoLine::Blank);
57    }
58    if let Some(c) = MESSAGE_RE.captures(line) {
59        if c.get(2).is_some() {
60            return Ok(PoLine::Message(
61                    n,
62                    c.get(1).map(|x| x.as_str()).unwrap_or("").to_owned(),
63                    if c.get(1).map(|x| x.as_str()).unwrap_or("").ends_with('|') {
64                        String::from("|") + c.get(2).unwrap().as_str()
65                    } else {
66                        c.get(2).unwrap().as_str().to_owned()
67                    },
68                    UNESCAPE_RE.replace(
69                        c.get(3).unwrap().as_str(),
70                        |d: &Captures| -> String {
71                            UNESCAPE_MAP.get(d.get(0).unwrap().as_str()).unwrap().to_string()
72                        }).into_owned()));
73        } else {
74            return Ok(PoLine::Continuation(
75                    n,
76                    c.get(1).map(|x| x.as_str()).unwrap_or("").to_owned(),
77                    UNESCAPE_RE.replace(
78                        c.get(3).unwrap().as_str(),
79                        |d: &Captures| -> String {
80                            UNESCAPE_MAP.get(d.get(0).unwrap().as_str()).unwrap().to_string()
81                        }).into_owned()));
82        }
83    }
84    if let Some(c) = COMMENT_RE.captures(line) {
85        return Ok(PoLine::Comment(
86                n,
87                c.get(1).unwrap().as_str().chars().next().unwrap_or(' '),
88                c.get(2).unwrap().as_str().to_owned()));
89    }
90    return Err(());
91}
92
93impl<R: BufRead> Iterator for LineIter<R> {
94    type Item = Result<PoLine, Error>;
95    fn next(&mut self) -> Option<Result<PoLine, Error>> {
96        loop {
97            let line = match self._inner.next() {
98                Some(Ok(s)) => s,
99                Some(Err(e)) => return Some(Err(Error::Io(self._n + 1, e))),
100                None => return None,
101            };
102            self._n += 1;
103            match parse_po_line(&line, self._n) {
104                Ok(PoLine::Blank) => (),
105                Ok(p) => return Some(Ok(p)),
106                Err(_) => return Some(Err(Error::Parse(self._n, Some(line), Vec::new()))),
107            }
108        }
109    }
110}
111
112impl<R: BufRead> LineIter<R> {
113    fn new(r: R) -> LineIter<R> {
114        LineIter {
115            _n: 0,
116            _inner: r.lines(),
117        }
118    }
119}
120
121trait MsgParser {
122    fn parse_comments(&mut self, unit: &mut Unit);
123    fn parse_msg(&mut self, tag: &str, unit: &mut Unit) -> Result<Option<String>, Error>;
124    fn expected(&mut self, exp: Vec<&'static str>) -> Result<Option<Unit>, Error>;
125}
126
127impl<R: BufRead> MsgParser for Peekable<LineIter<R>> {
128    fn parse_comments(&mut self, unit: &mut Unit) {
129        while let Some(&Ok(PoLine::Comment(..))) = self.peek() {
130            match self.next() {
131                Some(Ok(PoLine::Comment(_, ',', s))) => {
132                    for flag in s.split(',').map(str::trim) {
133                        match flag {
134                            "fuzzy" => unit._state = State::NeedsWork,
135                            _ => (), // TODO: Implement other flags (do we need any?)
136                        }
137                    }
138                }
139                Some(Ok(PoLine::Comment(_, ':', s))) => {
140                    unit._locations.extend(s.split(char::is_whitespace).filter(|x| !x.is_empty()).map(From::from));
141                }
142                Some(Ok(PoLine::Comment(_, '.', s))) => {
143                    unit._notes.push((Origin::Developer, s));
144                }
145                Some(Ok(PoLine::Comment(_, ' ', s))) => {
146                    unit._notes.push((Origin::Translator, s));
147                }
148                _ => unreachable!(), // we *know* it is a Some(Ok(Comment))
149            }
150        }
151    }
152
153    fn parse_msg(&mut self, tag: &str, unit: &mut Unit) -> Result<Option<String>, Error> {
154        if is!(self.peek() => Some(&Err(_))) {
155            // Get error out of the way
156            return Err(self.next().unwrap().unwrap_err())
157        }
158        
159        let prefix;
160        let mut string;
161
162        if is!(self.peek() =>
163               Some(&Ok(PoLine::Message(_, ref p, ref t, _)))
164               if t == tag && p.starts_with("#~") == unit._obsolete) {
165            if let PoLine::Message(_, p, _, s) = self.next().unwrap().unwrap() {
166                prefix = p;
167                string = s;
168            } else {
169                unreachable!()
170            }
171        } else {
172            return Ok(None); // Not the expected message
173        }
174
175        loop {
176            if is!(self.peek() => Some(&Err(_))) {
177                // Get error out of the way
178                return Err(self.next().unwrap().unwrap_err())
179            }
180
181            if is!(self.peek() =>
182                   Some(&Ok(PoLine::Continuation(_, ref p, _)))
183                   if *p == prefix) {
184                if let PoLine::Continuation(_, _, s) = self.next().unwrap().unwrap() {
185                    string.push_str(&s);
186                } else {
187                    unreachable!();
188                }
189            } else {
190                break;
191            }
192        }
193        Ok(Some(string))
194    }
195
196    fn expected(&mut self, exp: Vec<&'static str>) -> Result<Option<Unit>, Error> {
197        match self.peek() {
198            Some(&Ok(PoLine::Message(n, ref p, ..))) =>
199                Err(Error::Parse(n, Some(p.clone()), exp)),
200            Some(&Ok(PoLine::Continuation(n, ..))) =>
201                Err(Error::Parse(n, Some("\"".to_owned()), exp)),
202            Some(&Ok(PoLine::Comment(n, c, ..))) =>
203                Err(Error::Parse(n, Some(format!("#{}", c)), exp)),
204            None =>
205                Ok(None),
206            _ => panic!("Should not happen!"),
207        }
208    }
209}
210
211fn is_header(oru: &Option<Result<Unit, Error>>) -> bool {
212    match oru {
213        &Some(Ok(ref u)) => u.source().is_singular() && u.source().is_blank(),
214        _ => false,
215    }
216}
217
218pub struct PoReader<R: BufRead> {
219    _lines: Peekable<LineIter<R>>,
220    _next_unit: Option<Result<Unit, Error>>,
221    _failed: Option<Error>,
222    _header: HashMap<String, String>,
223    _target_language: LanguageRange<'static>,
224    _plurals: Vec<Count>,
225}
226
227impl<R: BufRead> PoReader<R> {
228    pub fn new(reader: R) -> Self {
229        let mut res = PoReader {
230            _lines: LineIter::new(reader).peekable(),
231            _next_unit: None,
232            _failed: None,
233            _header: HashMap::new(),
234            _target_language: LanguageRange::invariant(),
235            _plurals: Vec::new(),
236        };
237        res._next_unit = res.next_unit();
238        if is_header(&res._next_unit) {
239            res.parse_po_header();
240            res._next_unit = res.next_unit();
241        }
242        return res;
243    }
244
245    fn make_source(msgid: Option<String>, msgid_plural: Option<String>) -> Message {
246        if msgid.is_none() {
247            Message::Empty
248        } else if msgid_plural.is_none() {
249            Message::Singular(msgid.unwrap())
250        } else {
251            let mut map = BTreeMap::new();
252            map.insert(Count::One, msgid.unwrap());
253            map.insert(Count::Other, msgid_plural.unwrap());
254            Message::Plural(map)
255        }
256    }
257
258    fn parse_unit(&mut self) -> Result<Option<Unit>, Error> {
259        let mut unit = Unit::default();
260
261        self._lines.parse_comments(&mut unit);
262        match self._lines.peek() {
263            None => return Ok(None), // end if no unit (possibly after comments)
264            Some(&Ok(PoLine::Message(_, ref p, ..))) // detect obsolete
265                if p.starts_with("#~") => unit._obsolete = true,
266            _ => (),
267        }
268
269        unit._prev_context = self._lines.parse_msg("|msgctxt", &mut unit)?;
270
271        let prev_msgid = self._lines.parse_msg("|msgid", &mut unit)?;
272        let prev_msgid_pl = if prev_msgid.is_some() {
273            self._lines.parse_msg("|msgid_plural", &mut unit)?
274        } else { None };
275        unit._prev_source = Self::make_source(prev_msgid, prev_msgid_pl);
276
277        unit._context = self._lines.parse_msg("msgctxt", &mut unit)?;
278
279        let msgid = self._lines.parse_msg("msgid", &mut unit)?;
280        if msgid.is_none() {
281            return self._lines.expected(vec!["msgid"]);
282        }
283        let msgid_pl = self._lines.parse_msg("msgid_plural", &mut unit)?;
284        unit._source = Self::make_source(msgid, msgid_pl);
285
286        if unit._source.is_singular() {
287            // sinngular source, so expecting singular target:
288            match self._lines.parse_msg("msgstr", &mut unit)? {
289                None => return self._lines.expected(vec!["msgstr"]),
290                Some(s) => unit._target = Message::Singular(s),
291            }
292        } else {
293            assert!(unit._source.is_plural());
294            const TAGS: &'static [&'static str] =
295                &["msgstr[0]", "msgstr[1]", "msgstr[2]", "msgstr[3]", "msgstr[4]", "msgstr[5]", "msgstr[6]"];
296            let mut map = BTreeMap::new();
297            for (c, t) in self._plurals.iter().zip(TAGS) {
298                match self._lines.parse_msg(t, &mut unit)? {
299                    None => return self._lines.expected(vec![t]),
300                    Some(s) => { map.insert(*c, s); }
301                }
302            }
303            unit._target = Message::Plural(map);
304        }
305
306        if unit._state == State::Empty && !unit._target.is_blank() {
307            // translation is non-empty and state was not set yet, then it is final
308            unit._state = State::Final;
309        }
310
311        assert!(!unit._source.is_empty());
312        return Ok(Some(unit));
313    }
314
315    fn next_unit(&mut self) -> Option<Result<Unit, Error>> {
316        match self.parse_unit() {
317            Ok(None) => None,
318            Ok(Some(u)) => Some(Ok(u)),
319            Err(e) => Some(Err(e)),
320        }
321    }
322
323    fn parse_po_header(&mut self) {
324        if let Some(Ok(ref u)) = self._next_unit {
325            for line in u._target.singular().unwrap_or("").split('\n') {
326                if let Some(n) = line.find(':') {
327                    let key = line[..n].trim();
328                    let val = line[(n+1)..].trim();
329                    self._header.insert(key.to_owned(), val.to_owned());
330                }
331            }
332            if let Some(lang) = self._header.get("Language") {
333                self._target_language
334                    = LanguageRange::new(lang)
335                    .map(LanguageRange::into_static)
336                    .or_else(|_| LanguageRange::from_unix(lang))
337                    .unwrap_or_else(|_| LanguageRange::invariant());
338            }
339            // FIXME FIXME: Extract plurals
340        }
341    }
342}
343
344impl<R: BufRead> Iterator for PoReader<R> {
345    type Item = Result<Unit, Error>;
346    fn next(&mut self) -> Option<Result<Unit, Error>> {
347        if self._next_unit.is_none() {
348            return None;
349        }
350
351        let mut res = self.next_unit();
352        ::std::mem::swap(&mut res, &mut self._next_unit);
353        return res;
354    }
355}
356
357impl<R: BufRead> CatalogueReader for PoReader<R> {
358    fn target_language(&self) -> &LanguageRange<'static> {
359        &self._target_language
360    }
361}
362
#[cfg(test)]
mod tests {
    use ::CatalogueReader;
    use ::locale_config::LanguageRange;
    use ::Message::*;
    use ::Origin::*;
    use super::PoReader;

    // Fixture: a header entry followed by four units — a plain translated message, a fuzzy
    // message with comments/context/previous values and a continued msgstr, an untranslated
    // message, and an obsolete (`#~`) message.
    static SAMPLE_PO: &'static str = r###"
msgid ""
msgstr ""
"Project-Id-Version: translate-storage test\n"
"PO-Revision-Date: 2017-04-24 21:39+02:00\n"
"Last-Translator: Jan Hudec <bulb@ucw.cz>\n"
"Language-Team: Czech\n"
"Language: cs\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=ISO-8859-2\n"
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=3; plural=(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2;\n"

msgid "Simple message"
msgstr "Jednoduchá zpráva"

#. Extracted comment
# Translator comment
#: Location:42  Another:69
#, fuzzy
#| msgctxt "ConTeXt"
#| msgid "Previous message"
msgctxt "ConTeXt"
msgid "Changed message"
msgstr "Změněná\n"
"zpráva"

msgid "Untranslated message"
msgstr ""

# Another comment
#~ msgid "Obsolete message"
#~ msgstr "Zastaralá zpráva"

"###;

    /// Reads `SAMPLE_PO` end to end and checks every field of each unit the reader yields.
    #[test]
    fn integration_test() {
        let mut reader = PoReader::new(SAMPLE_PO.as_ref());

        // The header entry is consumed by the constructor; only its language is visible here.
        assert_eq!(LanguageRange::new("cs").unwrap(), *reader.target_language());
        
        // Unit 1: plain translated message without comments.
        let u1 = reader.next().unwrap().unwrap();
        assert_eq!(None, *u1.context());
        assert_eq!(Singular("Simple message".to_owned()), *u1.source());
        assert_eq!(Singular("Jednoduchá zpráva".to_owned()), *u1.target());
        assert_eq!(None, *u1.prev_context());
        assert_eq!(Empty, *u1.prev_source());
        assert!(u1.notes().is_empty());
        assert!(u1.locations().is_empty());
        assert_eq!(::State::Final, u1.state());
        assert!(u1.is_translated());
        assert!(!u1.is_obsolete());

        // Unit 2: fuzzy message with notes, locations, context, previous values and a
        // msgstr continued on a second line.
        let u2 = reader.next().unwrap().unwrap();
        assert_eq!(Some("ConTeXt".to_owned()), *u2.context());
        assert_eq!(Singular("Changed message".to_owned()), *u2.source());
        assert_eq!(Singular("Změněná\nzpráva".to_owned()), *u2.target());
        assert_eq!(Some("ConTeXt".to_owned()), *u2.prev_context());
        assert_eq!(Singular("Previous message".to_owned()), *u2.prev_source());
        assert_eq!(&[
                (Developer, "Extracted comment".to_owned()),
                (Translator, "Translator comment".to_owned()),
            ], u2.notes().as_slice());
        assert_eq!(&[
                "Location:42".to_owned(),
                "Another:69".to_owned(),
            ], u2.locations().as_slice());
        assert_eq!(::State::NeedsWork, u2.state());
        assert!(!u2.is_translated());
        assert!(!u2.is_obsolete());

        // Unit 3: message with an empty msgstr.
        let u3 = reader.next().unwrap().unwrap();
        assert_eq!(None, *u3.context());
        assert_eq!(Singular("Untranslated message".to_owned()), *u3.source());
        assert_eq!(Singular("".to_owned()), *u3.target());
        assert_eq!(None, *u3.prev_context());
        assert_eq!(Empty, *u3.prev_source());
        assert!(u3.notes().is_empty());
        assert!(u3.locations().is_empty());
        assert_eq!(::State::Empty, u3.state());
        assert!(!u3.is_translated());
        assert!(!u3.is_obsolete());
        
        // Unit 4: obsolete (`#~`) message preceded by a translator comment.
        let u4 = reader.next().unwrap().unwrap();
        assert_eq!(None, *u4.context());
        assert_eq!(Singular("Obsolete message".to_owned()), *u4.source());
        assert_eq!(Singular("Zastaralá zpráva".to_owned()), *u4.target());
        assert_eq!(None, *u4.prev_context());
        assert_eq!(Empty, *u4.prev_source());
        assert_eq!(&[
                (Translator, "Another comment".to_owned()),
            ], u4.notes().as_slice());
        assert!(u4.locations().is_empty());
        assert_eq!(::State::Final, u4.state());
        assert!(u4.is_translated());
        assert!(u4.is_obsolete());

        // Nothing may follow the last unit.
        assert!(reader.next().is_none());
    }
}