include_po/
lib.rs

1//! This crate contains functions to parse PO files (gettext message catalogs) and use them in Rust.
2//!
3//! # Main usage
4//!
5//! The recommended use is to use it to build a Rust module with the translations.
6//!
7//! First build your PO files into the `locales` directory, in the root of your project.
8//!
//! Then add this crate as a `[build-dependencies]` and `tr` as a normal one in your `Cargo.toml`:
//! ```toml
//! [build-dependencies]
12//! include-po = "0.1"
13//!
14//! [dependencies]
15//! tr = { version = "0.1.10", default-features = false }
16//! ```
17//! Write a `build.rs` script:
18//! ```
19//! fn main() {
20//!     let output_dir = std::env::var("OUT_DIR").unwrap();
21//!     let out = std::path::PathBuf::from(&output_dir).join("locale/translators.rs");
22//!     include_po::generate_locales_from_dir("locales", out).unwrap();
23//! }
24//! ```
25//! And finally in your `main.rs` or `lib.rs`:
26//! ```
27//! include!(concat!(env!("OUT_DIR"), "/locale/translators.rs"));
28//! ```
29//! That's it! Now you can call `translators::set_locale("es");` to switch to Spanish!
30//!
31//! If you are writing a lib crate, as a convention you should have a public function in the root namespace:
32//! ```
33//! fn set_locale(locale: &str) {
34//!     translators::set_locale(locale);
35//! }
36//! ```
37//! This way you can chain the locale of multiple translatable libraries.
38//!
39//! # Other functions
40//!
41//! If you prefer to do the translations yourself, you can also use this crate to parse the PO file and obtain
//! the messages. But note that currently the messages are not unescaped: they are returned exactly as written
//! in the PO file, so a line break is still the two characters `\n`, a quote is still `\"`, and so on.
43//!
44//! # Why PO instead of MO?
45//!
46//! Most solutions based on gettext read the message catalog from the MO file, instead of the PO. A MO file is a
47//! compiled message catalog. The reason original gettext uses MO files is to optimize start-up time: when the
48//! `gettext` library wants to use a MO, it just locates the file, opens it and memory-maps it. It does very little
//! parsing, because everything is designed to be used from that memory map. It even contains a precomputed hash table!
50//!
51//! In Rust, I see little reason to distribute MO files separated from the executable. Some people try to use `include_bytes!`
52//! and then parse the binary data into a `BTreeMap`... but that defeats the purpose of the MO existence in the first place.
53//!
54//! If you are going to embed the message catalog into the executable you may as well go all the way and include it as code:
55//! once again the catalog is memory mapped (as it is most executable code) and with zero parsing at runtime. But if you are
56//! going to parse the catalog and convert it to Rust at build time, why read the MO and not the PO that is simpler and saves
57//! a compiler step?
58//!
59//! But what about the hash table? you are probably asking... Well, currently this crate is building a giant `match string`
//! for each source PO file. This seems to be good enough, but if needed we can transparently upgrade it to a cleverer algorithm.
61//! My hope is that the code generated by the compiler will get better faster than the needs of this crate.
62
63#![allow(clippy::needless_doctest_main)]
64
65use std::io::{BufRead, Write};
66use std::path::{Path, PathBuf};
67use thiserror::Error;
68
69mod plurals;
70
71/// An error while parsing or processing a PO file.
72#[derive(Error, Debug)]
73pub enum PoIncludeError {
74    #[error("Invalid path '{0}'")]
75    InvalidPath(PathBuf),
76    #[error("Non-UTF-8 PO file '{0}'")]
77    NonUtf8PoFile(PathBuf),
78    #[error("Invalid plural expression")]
79    PluralError,
80    #[error(transparent)]
81    Io { #[from] source: std::io::Error },
82}
83
84/// The default return type.
85pub type Result<T> = std::result::Result<T, PoIncludeError>;
86
87/// Build a module with all the translations from a directory.
88///
89/// This function creates a module `translators` in the given directory. Then,
90/// for each PO file in the source directory it will create a submodule.
91/// Then, in the root module it creates a function `set_locale(locale: &str)` that
92/// chooses the given locale by calling the `tr::set_translator()`.
93pub fn generate_locales_from_dir(po_dir: impl AsRef<Path>, out_path: impl AsRef<Path>) -> Result<()> {
94    let po_dir = po_dir.as_ref();
95    let out_path = out_path.as_ref();
96    let out_dir = out_path.parent().ok_or_else(|| PoIncludeError::InvalidPath(out_path.to_owned()))?;
97    if !out_dir.is_dir() {
98        std::fs::create_dir_all(out_dir)?;
99    }
100
101    let out = std::fs::File::create(out_path)?;
102    let mut out = std::io::BufWriter::new(out);
103    let mod_path = std::path::absolute(out_dir)?;
104    writeln!(out, r#"#[path = "{}"]"#, mod_path.display())?;
105    writeln!(out, r#"#[allow(unused_variables)]
106pub mod translators {{
107"#)?;
108
109    let mut objs = Vec::new();
110    for entry in po_dir.read_dir()? {
111        let entry = entry?;
112        let path = entry.path();
113        if path.extension().and_then(|s| s.to_str()) != Some("po") {
114            continue;
115        }
116        let Some(lang) = path.file_stem() else { continue };
117        let lang = lang.to_ascii_lowercase();
118        let Some(lang) = lang.to_str() else { continue };
119        let lang = lang.to_owned();
120        generate_rs_from_po(path, out_dir.join(format!("{lang}.rs")))?;
121        println!("cargo:rerun-if-changed={}", entry.path().display());
122
123        writeln!(out, "pub mod {lang};")?;
124        objs.push(lang);
125    }
126
127    write!(out, r#"
128use std::borrow::Cow;
129
130pub fn set_locale(name: &str) -> bool {{
131    let name = name.to_ascii_lowercase();
132    if set_locale_inner(&name) {{
133        return true;
134    }}
135    if let Some(p) = name.find('_').or_else(|| name.find('-')) {{
136        let (base, _) = name.split_at(p);
137        if set_locale_inner(base) {{
138            return true;
139        }}
140    }}
141    ::tr::set_translator!(NullTranslator);
142    false
143}}
144
145fn set_locale_inner(name: &str) -> bool {{
146    match name {{
147"#)?;
148    for lang in &objs {
149        writeln!(out, r#"        "{lang}" => ::tr::set_translator!({lang}::Translator),"#)?;
150    }
151    write!(out, r#"
152        _ => return false,
153    }}
154    true
155}}
156
157pub struct NullTranslator;
158
159impl ::tr::Translator for NullTranslator {{
160    fn translate<'a>(&'a self, string: &'a str, _context: Option<&'a str>) -> Cow<'a, str> {{
161        Cow::Borrowed(string)
162    }}
163    fn ntranslate<'a>(&'a self, n: u64, singular: &'a str, plural: &'a str, _context: Option<&'a str>) -> Cow<'a, str> {{
164        if n == 1 {{ Cow::Borrowed(singular) }} else {{ Cow::Borrowed(plural) }}
165    }}
166}}
167"#)?;
168    writeln!(out, "}}")?;
169    Ok(())
170}
171
/// A simple message from the PO file.
///
/// The strings are stored as written in the PO file, without the surrounding quotes
/// and with escape sequences left untouched.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Message {
    /// The `msgctxt` of the entry, if it has one.
    pub context: Option<String>,
    /// The `msgid`, that is the untranslated text.
    pub id: String,
    /// The `msgstr`, that is the translated text.
    pub text: String,
}
179
/// A message with plural form.
///
/// The strings are stored as written in the PO file, without the surrounding quotes
/// and with escape sequences left untouched.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PMessage {
    /// The `msgctxt` of the entry, if it has one.
    pub context: Option<String>,
    /// The `msgid`, that is the untranslated singular text.
    pub singular: String,
    /// The `msgid_plural`, that is the untranslated plural text.
    pub plural: String,
    /// The translations, in the order the `msgstr[N]` lines appear in the entry.
    pub texts: Vec<String>,
}
188
189/// Parses a PO file.
190///
191/// Returns the normal and the pluralized messages in two separated vectors.
192pub fn parse_po(po_path: impl AsRef<Path>) -> Result<(Vec<Message>, Vec<PMessage>)> {
193    let f = std::fs::File::open(po_path)?;
194    let f = std::io::BufReader::new(f);
195    let mut text = String::new();
196    let mut last_key: Option<String> = None;
197    let mut id: Option<String> = None;
198    let mut id_plural: Option<String> = None;
199    let mut msgs: Vec<String> = Vec::new();
200    let mut ctxt: Option<String> = None;
201
202    let mut messages = Vec::new();
203    let mut pmessages = Vec::new();
204
205    // Ensure an empty string to flush the last message
206    for line in f.lines().chain([Ok(String::new())]) {
207        let line = line?;
208        let line = line.trim_ascii();
209        let head = line.chars().next();
210
211        match head {
212            Some('#') => {
213                continue;
214            }
215            Some('"') => {
216                text.push_str(unquote(line));
217                continue;
218            }
219            _ => {
220                match last_key.take().as_deref() {
221                    None => (),
222                    Some("msgid") => id = Some(std::mem::take(&mut text)),
223                    Some("msgid_plural") => id_plural = Some(std::mem::take(&mut text)),
224                    Some("msgstr") => msgs = vec![std::mem::take(&mut text)],
225                    Some("msgctxt") => ctxt = Some(std::mem::take(&mut text)),
226                    Some(unk) if unk.starts_with("msgstr[") => msgs.push(std::mem::take(&mut text)),
227                    Some(_) => { }
228                }
229            }
230        }
231
232        let (next_key, sub_text) = match line.find(' ') {
233            Some(p) => {
234                let (a, b) = line.split_at(p);
235                let (_, b) = b.split_at(1);
236                (a, unquote(b))
237            }
238            None => (line, ""),
239        };
240
241        // start of next entry or separator or end of file
242        if next_key.is_empty() || next_key == "msgid" {
243            let mut msgs = std::mem::take(&mut msgs);
244            if !msgs.is_empty() {
245                match (id.take(), id_plural.take(), ) {
246                    (Some(id), None) => {
247                        messages.push(Message {
248                            context: ctxt.take(),
249                            id,
250                            text: std::mem::take(&mut msgs[0]),
251                        });
252                    }
253                    (Some(singular), Some(plural)) => {
254                        pmessages.push(PMessage {
255                            context: ctxt.take(),
256                            singular,
257                            plural,
258                            texts: msgs,
259                        });
260                    }
261                    _ => {}
262                }
263            }
264        }
265
266        if !next_key.is_empty() {
267            last_key = Some(String::from(next_key));
268            text = String::from(sub_text);
269        }
270    }
271    Ok((messages, pmessages))
272}
273
/// Splits `s` around the first occurrence of `c`.
///
/// Returns the text before and after the delimiter, both trimmed of surrounding
/// whitespace, or `None` when `c` does not occur in `s`.
fn split_at_char(s: &str, c: char) -> Option<(&str, &str)> {
    s.split_once(c).map(|(head, tail)| (head.trim(), tail.trim()))
}
280
/// Removes the double quotes that surround a PO string.
///
/// Surrounding whitespace is ignored. A missing opening or closing quote is tolerated:
/// whatever quote is present at either end is removed, so malformed input such as a lone
/// `"` or an empty string yields a (possibly empty) slice instead of a panic.
///
/// Escape sequences are left untouched: PO quoting is similar enough to Rust's that the
/// text can be pasted into a Rust string literal.
fn unquote(line: &str) -> &str {
    let line = line.trim();
    let line = line.strip_prefix('"').unwrap_or(line);
    line.strip_suffix('"').unwrap_or(line)
}
288
289/// Converts a PO file to the corresponding Rust source module.
290pub fn generate_rs_from_po(po_path: impl AsRef<Path>, out_path: impl AsRef<Path>) -> Result<()> {
291    use std::collections::BTreeMap;
292
293    let po_path = po_path.as_ref();
294    let (messages, pmessages) = parse_po(po_path)?;
295
296
297    let mut plural_count: usize = 2;
298    let mut plural_expr = plurals::Expr::default();
299    // The empty string is translated as the "description", that looks HTML headers
300    if let Some(descr) = messages.iter().find(|m| m.id.is_empty()).as_ref().map(|m| m.text.as_str()) {
301        // TODO: maybe we should unescape the text, but it doesn't seem to be too necessary
302        for header in descr.split("\\n") {
303            let Some((name, value)) = split_at_char(header, ':') else { continue };
304            match name.to_lowercase().as_str() {
305                "content-type" => {
306                    for field in value.split(';') {
307                        let Some((n, v)) = split_at_char(field, '=') else { continue };
308                        if n == "charset" && v != "UTF-8" && v != "ASCII" {
309                            return Err(PoIncludeError::NonUtf8PoFile(po_path.to_owned()));
310                        }
311                    }
312                }
313                "plural-forms" => {
314                    for field in value.split(';') {
315                        let Some((n, v)) = split_at_char(field, '=') else { continue };
316                        match n {
317                            "nplurals" => {
318                                plural_count = v.parse().map_err(|_| PoIncludeError::PluralError)?;
319                            }
320                            "plural" => {
321                                plural_expr = plurals::Expr::parse(v).map_err(|_| PoIncludeError::PluralError)?;
322                            }
323                            _ => {}
324                        }
325                    }
326                }
327                _ => {}
328            }
329        }
330    }
331
332    let mut messages_by_ctx = BTreeMap::<Option<&str>, Vec<&Message>>::new();
333    for msg in &messages {
334        if msg.id.is_empty() || msg.text.is_empty() {
335            continue;
336        }
337        let entry = messages_by_ctx.entry(msg.context.as_deref());
338        entry.or_default().push(msg);
339    }
340    let mut pmessages_by_ctx = BTreeMap::<Option<&str>, Vec<&PMessage>>::new();
341    for pmsg in &pmessages {
342        if pmsg.singular.is_empty() || pmsg.texts.is_empty() || pmsg.texts[0].is_empty() {
343            continue;
344        }
345        let entry = pmessages_by_ctx.entry(pmsg.context.as_deref());
346        entry.or_default().push(pmsg);
347    }
348
349    let out = std::fs::File::create(out_path)?;
350    let mut out = std::io::BufWriter::new(out);
351
352    write!(out,
353r#"
354#![allow(dead_code)]
355
356use std::borrow::Cow;
357pub struct Translator;
358
359pub const PLURALS: usize = {plural_count};
360
361#[allow(unused_parens)]
362pub fn number_index(n: u64) -> u32 {{
363    {plural_expr}
364}}
365
366#[allow(clippy::match_single_binding)]
367impl ::tr::Translator for Translator {{
368    fn translate<'a>(&'a self, string: &'a str, context: Option<&'a str>) -> Cow<'a, str> {{
369        let s = match context {{
370"#)?;
371
372    for (ctxt, messages) in &messages_by_ctx {
373        let s;
374        writeln!(out, r#"            {} => match string {{"#,
375            match &ctxt {
376                None => "None",
377                Some(x) => { s = format!(r#"Some("{x}")"#); &s }
378            }
379        )?;
380
381        for msg in messages {
382            writeln!(out, r#"                "{}" => "{}","#,
383                msg.id,
384                msg.text,
385            )?;
386        }
387        writeln!(out, r#"                _ => string,
388            }},"#)?;
389    }
390    write!(out,
391r#"
392            _ => string,
393        }};
394        Cow::Borrowed(s)
395    }}
396    fn ntranslate<'a>(&'a self, n: u64, singular: &'a str, plural: &'a str, context: Option<&'a str>) -> Cow<'a, str> {{
397        let ni = number_index(n);
398        let s = match context {{
399"#)?;
400    for (ctxt, pmessages) in &pmessages_by_ctx {
401        let s;
402        writeln!(out, r#"            {} => match singular {{"#,
403            match &ctxt {
404                None => "None",
405                Some(x) => { s = format!(r#"Some("{x}")"#); &s }
406            }
407        )?;
408        for pmsg in pmessages {
409            write!(out, r#"                "{}" => {{ match ni {{ "#,
410                pmsg.singular,
411            )?;
412            // skip the 0 because it is the default, avoid the duplicate
413            for (i, m) in pmsg.texts.iter().enumerate().take(plural_count).skip(1) {
414                write!(out, r#"{i} => "{m}", "#)?;
415            }
416            writeln!(out, r#"_ => "{}" }} }}"#, pmsg.texts[0])?;
417        }
418        writeln!(out, r#"                _ => if n == 1 {{ singular }} else {{ plural }},
419            }},"#)?;
420    }
421    write!(out,
422r#"
423            _ => if n == 1 {{ singular }} else {{ plural }},
424        }};
425        Cow::Borrowed(s)
426    }}
427}}
428"#)?;
429
430    Ok(())
431}
432