Skip to main content

include_po/
lib.rs

1//! This crate contains functions to parse PO files (gettext message catalogs) and use them in Rust.
2//!
3//! # Main usage
4//!
5//! The recommended use is to use it to build a Rust module with the translations.
6//!
7//! First build your PO files into the `locales` directory, in the root of your project.
8//!
//! Then add this crate as a `[build-dependencies]` entry and `tr` as a normal dependency in your `Cargo.toml`:
//! ```toml
//! [build-dependencies]
12//! include-po = "0.1"
13//!
14//! [dependencies]
15//! tr = { version = "0.1.10", default-features = false }
16//! ```
17//! Write a `build.rs` script:
18//! ```
19//! fn main() {
20//!     let output_dir = std::env::var("OUT_DIR").unwrap();
21//!     let out = std::path::PathBuf::from(&output_dir).join("locale/translators.rs");
22//!     include_po::generate_locales_from_dir("locales", out).unwrap();
23//! }
24//! ```
25//! And finally in your `main.rs` or `lib.rs`:
26//! ```
27//! include!(concat!(env!("OUT_DIR"), "/locale/translators.rs"));
28//! ```
29//! That's it! Now you can call `translators::set_locale("es");` to switch to Spanish!
30//!
31//! If you are writing a lib crate, as a convention you should have a public function in the root namespace:
32//! ```
//! pub fn set_locale(locale: &str) {
34//!     translators::set_locale(locale);
35//! }
36//! ```
37//! This way you can chain the locale of multiple translatable libraries.
38//!
39//! # Other functions
40//!
//! If you prefer to do the translations yourself, you can also use this crate to parse the PO file and obtain
//! the messages. But note that currently the messages are not unescaped: a line break will still be the two
//! characters `\n`, a quote will still be `\"`, and so on.
43//!
44//! # Why PO instead of MO?
45//!
46//! Most solutions based on gettext read the message catalog from the MO file, instead of the PO. A MO file is a
47//! compiled message catalog. The reason original gettext uses MO files is to optimize start-up time: when the
//! `gettext` library wants to use a MO, it just locates the file, opens it and memory-maps it. It does very little
//! parsing, because everything is designed to be used from that memory map. It even contains a precomputed hash table!
50//!
51//! In Rust, I see little reason to distribute MO files separated from the executable. Some people try to use `include_bytes!`
52//! and then parse the binary data into a `BTreeMap`... but that defeats the purpose of the MO existence in the first place.
53//!
54//! If you are going to embed the message catalog into the executable you may as well go all the way and include it as code:
//! once again the catalog is memory mapped (as most executable code is) and with zero parsing at runtime. But if you are
56//! going to parse the catalog and convert it to Rust at build time, why read the MO and not the PO that is simpler and saves
57//! a compiler step?
58//!
59//! But what about the hash table? you are probably asking... Well, currently this crate is building a giant `match string`
//! for each source PO file. This seems to be good enough, but if needed we can transparently upgrade it to a cleverer algorithm.
61//! My hope is that the code generated by the compiler will get better faster than the needs of this crate.
62
63#![allow(clippy::needless_doctest_main)]
64
65use std::io::{BufRead, Write};
66use std::path::{Path, PathBuf};
67use thiserror::Error;
68
69mod plurals;
70
71/// An error while parsing or processing a PO file.
72#[derive(Error, Debug)]
73pub enum PoIncludeError {
74    #[error("Invalid path '{0}'")]
75    InvalidPath(PathBuf),
76    #[error("Non-UTF-8 PO file '{0}'")]
77    NonUtf8PoFile(PathBuf),
78    #[error("Invalid plural expression")]
79    PluralError,
80    #[error(transparent)]
81    Io {
82        #[from]
83        source: std::io::Error,
84    },
85}
86
87/// The default return type.
88pub type Result<T> = std::result::Result<T, PoIncludeError>;
89
90/// Build a module with all the translations from a directory.
91///
92/// This function creates a module `translators` in the given directory. Then,
93/// for each PO file in the source directory it will create a submodule.
94/// Then, in the root module it creates a function `set_locale(locale: &str)` that
95/// chooses the given locale by calling the `tr::set_translator()`.
96pub fn generate_locales_from_dir(
97    po_dir: impl AsRef<Path>,
98    out_path: impl AsRef<Path>,
99) -> Result<()> {
100    let po_dir = po_dir.as_ref();
101    let out_path = out_path.as_ref();
102    let out_dir = out_path
103        .parent()
104        .ok_or_else(|| PoIncludeError::InvalidPath(out_path.to_owned()))?;
105    if !out_dir.is_dir() {
106        std::fs::create_dir_all(out_dir)?;
107    }
108
109    let out = std::fs::File::create(out_path)?;
110    let mut out = std::io::BufWriter::new(out);
111    let mod_path = std::path::absolute(out_dir)?;
112    writeln!(out, r#"#[path = {:?}]"#, mod_path)?;
113    writeln!(
114        out,
115        r#"#[allow(unused_variables)]
116pub mod translators {{
117"#
118    )?;
119
120    let mut objs = Vec::new();
121    for entry in po_dir.read_dir()? {
122        let entry = entry?;
123        let path = entry.path();
124        if path.extension().and_then(|s| s.to_str()) != Some("po") {
125            continue;
126        }
127        let Some(lang) = path.file_stem() else {
128            continue;
129        };
130        let lang = lang.to_ascii_lowercase();
131        let Some(lang) = lang.to_str() else { continue };
132        let lang = lang.to_owned();
133        generate_rs_from_po(path, out_dir.join(format!("{lang}.rs")))?;
134        println!("cargo:rerun-if-changed={}", entry.path().display());
135
136        writeln!(out, "pub mod {lang};")?;
137        objs.push(lang);
138    }
139
140    write!(
141        out,
142        r#"
143use std::borrow::Cow;
144
145pub fn set_locale(name: &str) -> bool {{
146    let name = name.to_ascii_lowercase();
147    if set_locale_inner(&name) {{
148        return true;
149    }}
150    if let Some(p) = name.find('_').or_else(|| name.find('-')) {{
151        let (base, _) = name.split_at(p);
152        if set_locale_inner(base) {{
153            return true;
154        }}
155    }}
156    ::tr::set_translator!(NullTranslator);
157    false
158}}
159
160fn set_locale_inner(name: &str) -> bool {{
161    match name {{
162"#
163    )?;
164    for lang in &objs {
165        writeln!(
166            out,
167            r#"        "{lang}" => ::tr::set_translator!({lang}::Translator),"#
168        )?;
169    }
170    write!(
171        out,
172        r#"
173        _ => return false,
174    }}
175    true
176}}
177
178pub struct NullTranslator;
179
180impl ::tr::Translator for NullTranslator {{
181    fn translate<'a>(&'a self, string: &'a str, _context: Option<&'a str>) -> Cow<'a, str> {{
182        Cow::Borrowed(string)
183    }}
184    fn ntranslate<'a>(&'a self, n: u64, singular: &'a str, plural: &'a str, _context: Option<&'a str>) -> Cow<'a, str> {{
185        if n == 1 {{ Cow::Borrowed(singular) }} else {{ Cow::Borrowed(plural) }}
186    }}
187}}
188"#
189    )?;
190    writeln!(out, "}}")?;
191    Ok(())
192}
193
/// A simple message from the PO file.
///
/// The strings are stored as they are written in the PO file, that is, with
/// their escape sequences untouched.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct Message {
    /// The `msgctxt` of the entry, if it has one.
    pub context: Option<String>,
    /// The `msgid` of the entry: the original text.
    pub id: String,
    /// The `msgstr` of the entry: the translated text.
    pub text: String,
}
201
/// A message with plural form.
///
/// The strings are stored as they are written in the PO file, that is, with
/// their escape sequences untouched.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct PMessage {
    /// The `msgctxt` of the entry, if it has one.
    pub context: Option<String>,
    /// The `msgid` of the entry: the original text in singular.
    pub singular: String,
    /// The `msgid_plural` of the entry: the original text in plural.
    pub plural: String,
    /// The `msgstr[N]` translations, in the order they appear in the file.
    pub texts: Vec<String>,
}
210
211/// Parses a PO file.
212///
213/// Returns the normal and the pluralized messages in two separated vectors.
214pub fn parse_po(po_path: impl AsRef<Path>) -> Result<(Vec<Message>, Vec<PMessage>)> {
215    let f = std::fs::File::open(po_path)?;
216    let f = std::io::BufReader::new(f);
217    let mut text = String::new();
218    let mut last_key: Option<String> = None;
219    let mut id: Option<String> = None;
220    let mut id_plural: Option<String> = None;
221    let mut msgs: Vec<String> = Vec::new();
222    let mut ctxt: Option<String> = None;
223
224    let mut messages = Vec::new();
225    let mut pmessages = Vec::new();
226
227    // Ensure an empty string to flush the last message
228    for line in f.lines().chain([Ok(String::new())]) {
229        let line = line?;
230        let line = line.trim_ascii();
231        let head = line.chars().next();
232
233        match head {
234            Some('#') => {
235                continue;
236            }
237            Some('"') => {
238                text.push_str(unquote(line));
239                continue;
240            }
241            _ => match last_key.take().as_deref() {
242                None => (),
243                Some("msgid") => id = Some(std::mem::take(&mut text)),
244                Some("msgid_plural") => id_plural = Some(std::mem::take(&mut text)),
245                Some("msgstr") => msgs = vec![std::mem::take(&mut text)],
246                Some("msgctxt") => ctxt = Some(std::mem::take(&mut text)),
247                Some(unk) if unk.starts_with("msgstr[") => msgs.push(std::mem::take(&mut text)),
248                Some(_) => {}
249            },
250        }
251
252        let (next_key, sub_text) = match line.find(' ') {
253            Some(p) => {
254                let (a, b) = line.split_at(p);
255                let (_, b) = b.split_at(1);
256                (a, unquote(b))
257            }
258            None => (line, ""),
259        };
260
261        // start of next entry or separator or end of file
262        if next_key.is_empty() || next_key == "msgid" {
263            let mut msgs = std::mem::take(&mut msgs);
264            if !msgs.is_empty() {
265                match (id.take(), id_plural.take()) {
266                    (Some(id), None) => {
267                        messages.push(Message {
268                            context: ctxt.take(),
269                            id,
270                            text: std::mem::take(&mut msgs[0]),
271                        });
272                    }
273                    (Some(singular), Some(plural)) => {
274                        pmessages.push(PMessage {
275                            context: ctxt.take(),
276                            singular,
277                            plural,
278                            texts: msgs,
279                        });
280                    }
281                    _ => {}
282                }
283            }
284        }
285
286        if !next_key.is_empty() {
287            last_key = Some(String::from(next_key));
288            text = String::from(sub_text);
289        }
290    }
291    Ok((messages, pmessages))
292}
293
/// Splits `s` at the first occurrence of `c` and trims the whitespace of both halves.
///
/// The separator itself is not part of any of the halves. Returns `None` if `c`
/// does not appear in `s`.
fn split_at_char(s: &str, c: char) -> Option<(&str, &str)> {
    s.split_once(c)
        .map(|(head, tail)| (head.trim(), tail.trim()))
}
300
/// Removes the starting and ending quotes of a PO string.
///
/// The first and the last characters of `line` are dropped, whatever they are.
/// Input that is too short to have both (an empty string or a single character)
/// results in an empty string instead of a panic, and multi-byte characters at
/// either end are removed whole.
///
/// The escape sequences are left untouched: PO quoting is similar enough to
/// Rust's so nothing else is done.
fn unquote(line: &str) -> &str {
    let mut chars = line.chars();
    // Advancing the iterator from both ends always stays on char boundaries,
    // and does nothing when there are no characters left.
    chars.next();
    chars.next_back();
    chars.as_str()
}
308
309/// Converts a PO file to the corresponding Rust source module.
310pub fn generate_rs_from_po(po_path: impl AsRef<Path>, out_path: impl AsRef<Path>) -> Result<()> {
311    use std::collections::BTreeMap;
312
313    let po_path = po_path.as_ref();
314    let (messages, pmessages) = parse_po(po_path)?;
315
316    let mut plural_count: usize = 2;
317    let mut plural_expr = plurals::Expr::default();
318    // The empty string is translated as the "description", that looks HTML headers
319    if let Some(descr) = messages
320        .iter()
321        .find(|m| m.id.is_empty())
322        .as_ref()
323        .map(|m| m.text.as_str())
324    {
325        // TODO: maybe we should unescape the text, but it doesn't seem to be too necessary
326        for header in descr.split("\\n") {
327            let Some((name, value)) = split_at_char(header, ':') else {
328                continue;
329            };
330            match name.to_lowercase().as_str() {
331                "content-type" => {
332                    for field in value.split(';') {
333                        let Some((n, v)) = split_at_char(field, '=') else {
334                            continue;
335                        };
336                        if n == "charset" && v != "UTF-8" && v != "ASCII" {
337                            return Err(PoIncludeError::NonUtf8PoFile(po_path.to_owned()));
338                        }
339                    }
340                }
341                "plural-forms" => {
342                    for field in value.split(';') {
343                        let Some((n, v)) = split_at_char(field, '=') else {
344                            continue;
345                        };
346                        match n {
347                            "nplurals" => {
348                                plural_count =
349                                    v.parse().map_err(|_| PoIncludeError::PluralError)?;
350                            }
351                            "plural" => {
352                                plural_expr = plurals::Expr::parse(v)
353                                    .map_err(|_| PoIncludeError::PluralError)?;
354                            }
355                            _ => {}
356                        }
357                    }
358                }
359                _ => {}
360            }
361        }
362    }
363
364    let mut messages_by_ctx = BTreeMap::<Option<&str>, Vec<&Message>>::new();
365    for msg in &messages {
366        if msg.id.is_empty() || msg.text.is_empty() {
367            continue;
368        }
369        let entry = messages_by_ctx.entry(msg.context.as_deref());
370        entry.or_default().push(msg);
371    }
372    let mut pmessages_by_ctx = BTreeMap::<Option<&str>, Vec<&PMessage>>::new();
373    for pmsg in &pmessages {
374        if pmsg.singular.is_empty() || pmsg.texts.is_empty() || pmsg.texts[0].is_empty() {
375            continue;
376        }
377        let entry = pmessages_by_ctx.entry(pmsg.context.as_deref());
378        entry.or_default().push(pmsg);
379    }
380
381    let out = std::fs::File::create(out_path)?;
382    let mut out = std::io::BufWriter::new(out);
383
384    write!(
385        out,
386        r#"
387#![allow(dead_code)]
388
389use std::borrow::Cow;
390pub struct Translator;
391
392pub const PLURALS: usize = {plural_count};
393
394#[allow(unused_parens)]
395pub fn number_index(n: u64) -> u32 {{
396    {plural_expr}
397}}
398
399#[allow(clippy::match_single_binding)]
400impl ::tr::Translator for Translator {{
401    fn translate<'a>(&'a self, string: &'a str, context: Option<&'a str>) -> Cow<'a, str> {{
402        let s = match context {{
403"#
404    )?;
405
406    for (ctxt, messages) in &messages_by_ctx {
407        let s;
408        writeln!(
409            out,
410            r#"            {} => match string {{"#,
411            match &ctxt {
412                None => "None",
413                Some(x) => {
414                    s = format!(r#"Some("{x}")"#);
415                    &s
416                }
417            }
418        )?;
419
420        for msg in messages {
421            writeln!(out, r#"                "{}" => "{}","#, msg.id, msg.text,)?;
422        }
423        writeln!(
424            out,
425            r#"                _ => string,
426            }},"#
427        )?;
428    }
429    write!(
430        out,
431        r#"
432            _ => string,
433        }};
434        Cow::Borrowed(s)
435    }}
436    fn ntranslate<'a>(&'a self, n: u64, singular: &'a str, plural: &'a str, context: Option<&'a str>) -> Cow<'a, str> {{
437        let ni = number_index(n);
438        let s = match context {{
439"#
440    )?;
441    for (ctxt, pmessages) in &pmessages_by_ctx {
442        let s;
443        writeln!(
444            out,
445            r#"            {} => match singular {{"#,
446            match &ctxt {
447                None => "None",
448                Some(x) => {
449                    s = format!(r#"Some("{x}")"#);
450                    &s
451                }
452            }
453        )?;
454        for pmsg in pmessages {
455            write!(
456                out,
457                r#"                "{}" => {{ match ni {{ "#,
458                pmsg.singular,
459            )?;
460            // skip the 0 because it is the default, avoid the duplicate
461            for (i, m) in pmsg.texts.iter().enumerate().take(plural_count).skip(1) {
462                write!(out, r#"{i} => "{m}", "#)?;
463            }
464            writeln!(out, r#"_ => "{}" }} }}"#, pmsg.texts[0])?;
465        }
466        writeln!(
467            out,
468            r#"                _ => if n == 1 {{ singular }} else {{ plural }},
469            }},"#
470        )?;
471    }
472    write!(
473        out,
474        r#"
475            _ => if n == 1 {{ singular }} else {{ plural }},
476        }};
477        Cow::Borrowed(s)
478    }}
479}}
480"#
481    )?;
482
483    Ok(())
484}