use std::borrow::Cow;
use crate::{po::entry::Entry, po::format::language::Language, po::message::Message};
use encoding_rs::Encoding;
/// Message field that quoted continuation lines are currently appended to.
#[derive(Default)]
enum Field {
/// No message keyword has been seen yet in the current entry; continuation lines are ignored.
#[default]
Comment,
/// The last keyword seen was `msgctxt`.
Ctxt,
/// The last keyword seen was `msgid`.
Id,
/// The last keyword seen was `msgid_plural`.
IdPlural,
/// The last keyword seen was `msgstr` (index 0) or `msgstr[N]` (index `N`).
Str(u32),
}
/// Line-based parser for the bytes of a PO file, yielding one `Entry` per iteration.
///
/// Values found in the header entry (language, charset, number of plurals) are stored on the
/// parser as soon as the header entry has been produced.
#[derive(Default)]
pub struct Parser<'a> {
/// Raw bytes of the file being parsed.
data: &'a [u8],
/// Length of `data`, cached by the constructor.
data_len: usize,
/// Trimmed value of the `Language` header.
language: String,
/// Part of `language` before the first `_`, or the whole value when there is no `_`.
language_code: String,
/// Part of `language` after the first `_`.
country: String,
/// Encoding selected by the `charset=` header when it is not UTF-8; `None` means that
/// strings are decoded as UTF-8.
encoding: Option<&'static Encoding>,
/// Value of `nplurals=` in the `Plural-Forms` header.
nplurals: u32,
/// Byte offset in `data` of the next line to read.
offset: usize,
/// Line number recorded in the messages created from the line being processed.
line_number: usize,
/// Line number of the next line to read (incremented each time a line is read).
next_line_number: usize,
/// Field that receives the quoted continuation lines.
field: Field,
/// Set when a string of the current entry could not be decoded cleanly; reset for each entry.
encoding_error: bool,
}
impl<'d> Parser<'d> {
pub fn new(data: &'d [u8]) -> Self {
Self {
data,
data_len: data.len(),
line_number: 1,
next_line_number: 1,
..Default::default()
}
}
/// Name of the encoding used to decode strings: the one selected by the header charset,
/// or UTF-8 when none has been selected.
pub fn encoding_name(&self) -> &'static str {
    self.encoding.unwrap_or(encoding_rs::UTF_8).name()
}
/// Trimmed value of the `Language` header (empty until the header has been parsed).
pub fn language(&self) -> &str {
    self.language.as_str()
}
/// Language code: the part of the `Language` header before the first `_`, or the whole value
/// when it contains no `_`.
pub fn language_code(&self) -> &str {
    self.language_code.as_str()
}
/// Country: the part of the `Language` header after the first `_` (empty when not set).
pub fn country(&self) -> &str {
    self.country.as_str()
}
/// Number of plural forms read from the `nplurals=` value of the `Plural-Forms` header
/// (0 until such a header has been parsed).
pub const fn nplurals(&self) -> u32 {
self.nplurals
}
/// Return the next line of the input without its line terminator, or `None` at end of input.
///
/// Lines end with `\n`; a `\r` immediately before it (CRLF line ending) is removed as well, so
/// that an empty line of a CRLF file is returned as an empty slice and is recognized as an
/// entry separator. The last line does not need a terminator.
///
/// Each call advances `offset` past the line and increments `next_line_number`.
fn next_line(&mut self) -> Option<&'d [u8]> {
    if self.offset >= self.data_len {
        return None;
    }
    let start = self.offset;
    let end =
        memchr::memchr(b'\n', &self.data[start..]).map_or(self.data_len, |pos| start + pos);
    // Skip the `\n`; on the last unterminated line this goes one past the end, which the
    // check at the top of the function handles.
    self.offset = end + 1;
    self.next_line_number += 1;
    let line = &self.data[start..end];
    // Drop the `\r` of a CRLF terminator.
    Some(line.strip_suffix(b"\r").unwrap_or(line))
}
/// Extract the file-level metadata from `entry` when it is the PO header.
///
/// The header is the entry with an empty `msgid` and a non-empty `msgstr[0]`; any other entry
/// is ignored. Each line of the header is a `Keyword: value` pair. The following keywords are
/// used (compared case-insensitively), the others are skipped:
///
/// - `Language`: stored trimmed in `language`, and split on the first `_` into
///   `language_code` and `country` (`country` is emptied when there is no `_`).
/// - `Content-Type`: the label following `charset=` selects `encoding`, unless the label is
///   unknown or designates UTF-8.
/// - `Plural-Forms`: the number following `nplurals=` is stored in `nplurals`.
fn parse_header(&mut self, entry: &Entry) {
    let Some(id) = entry.msgid.as_ref() else {
        return;
    };
    if !id.value.is_empty() {
        return;
    }
    let Some(msg) = entry.msgstr.get(&0) else {
        return;
    };
    if msg.value.is_empty() {
        return;
    }
    for line in msg.value.split('\n') {
        let Some((keyword, value)) = line.split_once(':') else {
            continue;
        };
        let keyword = keyword.trim();
        if keyword.eq_ignore_ascii_case("language") {
            let language = value.trim();
            // Without `_` the whole value is the language code and there is no country:
            // the country is always reassigned so that no previous value is kept.
            let (code, country) = language.split_once('_').unwrap_or((language, ""));
            self.language = language.to_string();
            self.language_code = code.trim().to_string();
            self.country = country.trim().to_string();
        } else if keyword.eq_ignore_ascii_case("content-type")
            && let Some((_, after)) = value.split_once("charset=")
        {
            // The charset label ends at the first whitespace or `;`.
            let end = after
                .find(|c: char| c.is_whitespace() || c == ';')
                .unwrap_or(after.len());
            let encoding = Encoding::for_label(after[..end].as_bytes());
            // UTF-8 is the default decoding: only another known encoding is recorded.
            if encoding.is_some_and(|e| e != encoding_rs::UTF_8) {
                self.encoding = encoding;
            }
        } else if keyword.eq_ignore_ascii_case("plural-forms")
            && let Some((_, after)) = value.split_once("nplurals=")
        {
            let end = after
                .find(|c: char| !c.is_ascii_digit())
                .unwrap_or(after.len());
            if let Ok(nplurals) = after[..end].parse::<u32>() {
                self.nplurals = nplurals;
            }
        }
    }
}
/// Parse a comma-separated list of keywords (the content of a `#,` or `#=` line).
///
/// Every keyword is trimmed and appended to `entry.keywords`. In addition:
///
/// - `fuzzy`, `noqa` and `no-wrap` set the corresponding flag of the entry;
/// - `noqa:a;b` replaces `entry.noqa_rules` with the trimmed, `;`-separated rules;
/// - `xxx-format` sets `entry.format_language` from `xxx` (when `xxx` is valid UTF-8).
fn parse_keywords(line: &[u8], entry: &mut Entry) {
    for raw in line.split(|&byte| byte == b',') {
        let keyword = raw.trim_ascii();
        if keyword == b"fuzzy" {
            entry.fuzzy = true;
        } else if keyword == b"noqa" {
            entry.noqa = true;
        } else if keyword == b"no-wrap" {
            entry.nowrap = true;
        } else if let Some(rules) = keyword.strip_prefix(b"noqa:") {
            entry.noqa_rules = rules
                .split(|&byte| byte == b';')
                .map(|rule| String::from_utf8_lossy(rule.trim_ascii()).into_owned())
                .collect();
        } else if let Some(language) = keyword.strip_suffix(b"-format")
            && let Ok(language) = str::from_utf8(language)
        {
            entry.format_language = Language::from(language);
        }
        // The keyword is recorded whether or not it has been recognized.
        let keyword = String::from_utf8_lossy(keyword).into_owned();
        entry.keywords.push(keyword);
    }
}
/// Decode the text found between the first and the last double quote of `line`.
///
/// Returns an empty string when `line` does not contain two distinct quotes. The bytes are
/// decoded with the encoding selected by the header when there is one, and as UTF-8 otherwise.
/// When the bytes cannot be decoded cleanly, `encoding_error` is set and the result of the
/// lossy decoding is returned.
fn extract_string(&mut self, line: &'d [u8]) -> Cow<'d, str> {
    let Some(open) = memchr::memchr(b'"', line) else {
        return Cow::Borrowed("");
    };
    // `line` holds at least the opening quote here, so this cannot underflow.
    let last_idx = line.len() - 1;
    // Common case first: the closing quote is the last byte of the line.
    let close = if last_idx > open && line[last_idx] == b'"' {
        last_idx
    } else {
        match memchr::memrchr(b'"', line) {
            Some(pos) if pos != open => pos,
            _ => return Cow::Borrowed(""),
        }
    };
    let raw = &line[open + 1..close];
    match self.encoding {
        Some(encoding) => {
            let (text, _, had_errors) = encoding.decode(raw);
            self.encoding_error |= had_errors;
            text
        }
        None => match str::from_utf8(raw) {
            Ok(text) => Cow::Borrowed(text),
            Err(_) => {
                self.encoding_error = true;
                String::from_utf8_lossy(raw)
            }
        },
    }
}
/// Parse a message line (`msgctxt`, `msgid`, `msgid_plural`, `msgstr`, `msgstr[N]` or a quoted
/// continuation line) and store its string in `entry`.
///
/// A keyword line creates the message with the current line number and makes its field the
/// target of the following continuation lines. A continuation line is appended to that field,
/// or ignored when no keyword has been seen yet. A `msgstr[N]` line whose index is not a valid
/// number is ignored, as is any other line.
fn parse_message(&mut self, line: &'d [u8], entry: &mut Entry) {
    if line.starts_with(b"\"") {
        match self.field {
            Field::Comment => {}
            Field::Ctxt => entry.append_msgctxt(self.extract_string(line)),
            Field::Id => entry.append_msgid(self.extract_string(line)),
            Field::IdPlural => entry.append_msgid_plural(self.extract_string(line)),
            Field::Str(idx) => entry.append_msgstr(idx, self.extract_string(line)),
        }
    } else if line.starts_with(b"msgctxt") {
        self.field = Field::Ctxt;
        let message = Message::new(self.line_number, self.extract_string(line));
        entry.msgctxt = Some(message);
    } else if line.starts_with(b"msgid_plural") {
        // Tested before `msgid`, which is a prefix of `msgid_plural`.
        self.field = Field::IdPlural;
        let message = Message::new(self.line_number, self.extract_string(line));
        entry.msgid_plural = Some(message);
    } else if line.starts_with(b"msgid") {
        self.field = Field::Id;
        let message = Message::new(self.line_number, self.extract_string(line));
        entry.msgid = Some(message);
    } else if let Some(after_bracket) = line.strip_prefix(b"msgstr[") {
        // Tested before the plain `msgstr`, which is a prefix of `msgstr[`.
        if let Some(close) = memchr::memchr(b']', after_bracket)
            && let Ok(digits) = str::from_utf8(&after_bracket[..close])
            && let Ok(idx) = digits.parse::<u32>()
        {
            self.field = Field::Str(idx);
            let message = Message::new(self.line_number, self.extract_string(line));
            entry.msgstr.insert(idx, message);
        }
    } else if line.starts_with(b"msgstr") {
        self.field = Field::Str(0);
        let message = Message::new(self.line_number, self.extract_string(line));
        entry.msgstr.insert(0, message);
    }
}
}
impl Iterator for Parser<'_> {
    type Item = Entry;

    /// Read the lines of the next entry and return it, or `None` when the input is exhausted.
    ///
    /// An entry ends at the first empty line that follows at least one non-empty line, or at
    /// the end of the input; the empty lines found before the entry are skipped. Once complete,
    /// the entry receives the encoding error flag, its strings are unescaped and, if it is the
    /// header, its values are stored on the parser.
    fn next(&mut self) -> Option<Self::Item> {
        let mut entry = Entry::new(self.next_line_number);
        self.line_number = self.next_line_number;
        self.field = Field::Comment;
        self.encoding_error = false;
        let mut started = false;
        while let Some(line) = self.next_line() {
            if line.is_empty() {
                if started {
                    // End of the entry.
                    break;
                }
                // Empty line before the entry: the entry starts on a later line.
                entry.line_number = self.next_line_number;
                self.line_number = self.next_line_number;
                continue;
            }
            started = true;
            if let Some(keywords) = line
                .strip_prefix(b"#,")
                .or_else(|| line.strip_prefix(b"#="))
            {
                Parser::parse_keywords(keywords, &mut entry);
            } else if let Some(msg) = line.strip_prefix(b"#~ ") {
                entry.obsolete = true;
                self.parse_message(msg, &mut entry);
            } else if let Some(rules) = line.strip_prefix(b"# noqa:") {
                entry.noqa_rules = rules
                    .split(|&byte| byte == b';')
                    .map(|rule| String::from_utf8_lossy(rule.trim_ascii()).into_owned())
                    .collect();
            } else if line.starts_with(b"# noqa") {
                entry.noqa = true;
            } else if line.starts_with(b"m") || line.starts_with(b"\"") {
                self.parse_message(line, &mut entry);
            }
            self.line_number = self.next_line_number;
        }
        if !started {
            return None;
        }
        entry.encoding_error = self.encoding_error;
        entry.unescape_strings();
        self.parse_header(&entry);
        Some(entry)
    }
}
#[cfg(test)]
mod tests {
use super::*;
/// An empty input yields no entry.
#[test]
fn parse_empty_string() {
assert!(Parser::new(b"").next().is_none());
}
/// The header entry is returned like any other entry and its `Language` and `Plural-Forms`
/// values are stored on the parser.
#[test]
fn parse_header() {
// Header with a language without country and two plural forms.
let content = r#"# Main comment
msgid ""
msgstr "test\n"
"Project-Id-Version: my_project\n"
"Report-Msgid-Bugs-To: someone@example.com\n"
"Language: fr\n"
"Plural-Forms: nplurals=2; plural=(n > 1);\n"
"#;
let mut parser = Parser::new(content.as_bytes());
let entries = parser.by_ref().collect::<Vec<Entry>>();
assert_eq!(entries[0].line_number, 1);
assert!(entries[0].keywords.is_empty());
assert!(!entries[0].fuzzy);
assert!(!entries[0].noqa);
assert!(!entries[0].nowrap);
assert_eq!(entries[0].format_language, Language::Null);
assert!(!entries[0].encoding_error);
assert_eq!(parser.nplurals, 2);
assert!(entries[0].msgctxt.is_none());
assert_eq!(entries[0].msgid, Some(Message::new(2, "")));
assert!(entries[0].msgid_plural.is_none());
// The continuation lines are concatenated and the `\n` escapes are unescaped.
assert_eq!(
entries[0].msgstr.get(&0),
Some(Message::new(
3,
"test\n\
Project-Id-Version: my_project\n\
Report-Msgid-Bugs-To: someone@example.com\n\
Language: fr\n\
Plural-Forms: nplurals=2; plural=(n > 1);\n"
))
.as_ref()
);
assert_eq!(parser.language, "fr");
assert_eq!(parser.language_code, "fr");
assert_eq!(parser.country, "");
assert!(parser.encoding.is_none());
// Language with a country: the value is split on `_`.
let content = r#"# Main comment
msgid ""
msgstr "Language: pt_BR\n"
"#;
let mut parser = Parser::new(content.as_bytes());
let _ = parser.by_ref().collect::<Vec<Entry>>();
assert_eq!(parser.language, "pt_BR");
assert_eq!(parser.language_code, "pt");
assert_eq!(parser.country, "BR");
}
/// A single `msgid`/`msgstr` pair is parsed with the line number of each message.
#[test]
fn parse_simple_entry() {
// The fixture starts with an empty line, so the entry starts on line 2.
let content = r#"
msgid "hello"
msgstr "bonjour"
"#;
let mut parser = Parser::new(content.as_bytes());
let entries = parser.by_ref().collect::<Vec<Entry>>();
assert_eq!(entries[0].line_number, 2);
assert!(entries[0].keywords.is_empty());
assert!(!entries[0].fuzzy);
assert!(!entries[0].noqa);
assert!(!entries[0].nowrap);
assert_eq!(entries[0].format_language, Language::Null);
assert!(!entries[0].encoding_error);
assert!(entries[0].msgctxt.is_none());
assert_eq!(entries[0].msgid, Some(Message::new(2, "hello")));
assert!(entries[0].msgid_plural.is_none());
assert_eq!(
entries[0].msgstr.get(&0),
Some(Message::new(3, "bonjour")).as_ref()
);
}
/// Strings of a file encoded in ISO-8859-15 are decoded with the charset declared in the header.
#[test]
fn parse_simple_entry_iso8859() {
    // The empty line after the header ends the header entry: the declared charset is then
    // known before the second entry (lines 5 and 6) is decoded.
    let content = r#"
msgid ""
msgstr "Content-Type: text/plain; charset=ISO-8859-15\n"

msgid "tested"
msgstr "testé"
"#;
    let content_iso = encoding_rs::ISO_8859_15.encode(content).0;
    let mut parser = Parser::new(content_iso.as_ref());
    let entries = parser.by_ref().collect::<Vec<Entry>>();
    assert_eq!(parser.encoding, Some(encoding_rs::ISO_8859_15));
    assert_eq!(entries[0].line_number, 2);
    assert!(entries[0].keywords.is_empty());
    assert!(!entries[0].fuzzy);
    assert!(!entries[0].noqa);
    assert!(!entries[0].nowrap);
    assert_eq!(entries[0].format_language, Language::Null);
    assert!(!entries[0].encoding_error);
    assert!(entries[0].msgctxt.is_none());
    assert_eq!(entries[0].msgid, Some(Message::new(2, "")));
    assert!(entries[0].msgid_plural.is_none());
    assert_eq!(
        entries[0].msgstr.get(&0),
        Some(Message::new(
            3,
            "Content-Type: text/plain; charset=ISO-8859-15\n"
        ))
        .as_ref()
    );
    assert!(entries[1].keywords.is_empty());
    assert!(!entries[1].fuzzy);
    assert!(!entries[1].noqa);
    assert!(!entries[1].nowrap);
    assert_eq!(entries[1].format_language, Language::Null);
    assert!(!entries[1].encoding_error);
    assert!(entries[1].msgctxt.is_none());
    assert_eq!(entries[1].msgid, Some(Message::new(5, "tested")));
    assert!(entries[1].msgid_plural.is_none());
    assert_eq!(
        entries[1].msgstr.get(&0),
        Some(Message::new(6, "testé")).as_ref()
    );
}
/// Bytes that are not valid for the declared charset (ISO-8859-15 content declared as UTF-8)
/// are replaced and the entry is flagged with an encoding error.
#[test]
fn parse_simple_entry_encoding_error() {
    // The empty line after the header separates the two entries (the second one is on
    // lines 5 and 6).
    let content = r#"
msgid ""
msgstr "Content-Type: text/plain; charset=UTF-8\n"

msgid "tested"
msgstr "testé"
"#;
    let content_iso = encoding_rs::ISO_8859_15.encode(content).0;
    let mut parser = Parser::new(content_iso.as_ref());
    let entries = parser.by_ref().collect::<Vec<Entry>>();
    // A UTF-8 charset is not recorded: UTF-8 is the default decoding.
    assert!(parser.encoding.is_none());
    assert_eq!(entries[0].line_number, 2);
    assert!(entries[0].keywords.is_empty());
    assert!(!entries[0].fuzzy);
    assert!(!entries[0].noqa);
    assert!(!entries[0].nowrap);
    assert_eq!(entries[0].format_language, Language::Null);
    assert!(!entries[0].encoding_error);
    assert!(entries[0].msgctxt.is_none());
    assert_eq!(entries[0].msgid, Some(Message::new(2, "")));
    assert!(entries[0].msgid_plural.is_none());
    assert_eq!(
        entries[0].msgstr.get(&0),
        Some(Message::new(3, "Content-Type: text/plain; charset=UTF-8\n",)).as_ref()
    );
    assert!(entries[1].keywords.is_empty());
    assert!(!entries[1].fuzzy);
    assert!(!entries[1].noqa);
    assert!(!entries[1].nowrap);
    assert_eq!(entries[1].format_language, Language::Null);
    // Only the entry containing the invalid byte is flagged.
    assert!(entries[1].encoding_error);
    assert!(entries[1].msgctxt.is_none());
    assert_eq!(entries[1].msgid, Some(Message::new(5, "tested")));
    assert!(entries[1].msgid_plural.is_none());
    assert_eq!(
        entries[1].msgstr.get(&0),
        Some(Message::new(6, "test�")).as_ref()
    );
}
/// The `msgctxt` line is stored as the context of the entry.
#[test]
fn parse_entry_with_context() {
let content = r#"
msgctxt "month of the year"
msgid "may"
msgstr "mai"
"#;
let mut parser = Parser::new(content.as_bytes());
let entries = parser.by_ref().collect::<Vec<Entry>>();
assert_eq!(entries[0].line_number, 2);
assert!(entries[0].keywords.is_empty());
assert!(!entries[0].fuzzy);
assert!(!entries[0].noqa);
assert!(!entries[0].nowrap);
assert_eq!(entries[0].format_language, Language::Null);
assert!(!entries[0].encoding_error);
assert_eq!(
entries[0].msgctxt,
Some(Message::new(2, "month of the year"))
);
assert_eq!(entries[0].msgid, Some(Message::new(3, "may")));
assert!(entries[0].msgid_plural.is_none());
assert_eq!(
entries[0].msgstr.get(&0),
Some(Message::new(4, "mai")).as_ref()
);
}
/// Two entries separated by an empty line are returned as two distinct entries, each with its
/// own line numbers.
#[test]
fn parse_two_entries() {
    // The empty line on line 4 ends the first entry; the second one starts on line 5.
    let content = r#"
msgid "hello"
msgstr "bonjour"

msgid "hello 2"
msgstr ""
"#;
    let mut parser = Parser::new(content.as_bytes());
    let entries = parser.by_ref().collect::<Vec<Entry>>();
    assert_eq!(entries[0].line_number, 2);
    assert!(entries[0].keywords.is_empty());
    assert!(!entries[0].fuzzy);
    assert!(!entries[0].noqa);
    assert!(!entries[0].nowrap);
    assert_eq!(entries[0].format_language, Language::Null);
    assert!(!entries[0].encoding_error);
    assert!(entries[0].msgctxt.is_none());
    assert_eq!(entries[0].msgid, Some(Message::new(2, "hello")));
    assert!(entries[0].msgid_plural.is_none());
    assert_eq!(
        entries[0].msgstr.get(&0),
        Some(Message::new(3, "bonjour")).as_ref()
    );
    assert_eq!(entries[1].line_number, 5);
    assert!(entries[1].keywords.is_empty());
    assert!(!entries[1].fuzzy);
    assert!(!entries[1].noqa);
    assert!(!entries[1].nowrap);
    assert_eq!(entries[1].format_language, Language::Null);
    assert!(!entries[1].encoding_error);
    assert!(entries[1].msgctxt.is_none());
    assert_eq!(entries[1].msgid, Some(Message::new(5, "hello 2")));
    assert!(entries[1].msgid_plural.is_none());
    assert_eq!(
        entries[1].msgstr.get(&0),
        Some(Message::new(6, "")).as_ref()
    );
}
/// `msgid_plural` and the indexed `msgstr[N]` lines are stored with their index.
#[test]
fn parse_plural_entry() {
let content = r#"
msgid "file"
msgid_plural "files"
msgstr[0] "fichier"
msgstr[1] "fichiers"
"#;
let mut parser = Parser::new(content.as_bytes());
let entries = parser.by_ref().collect::<Vec<Entry>>();
assert_eq!(entries[0].line_number, 2);
assert!(entries[0].keywords.is_empty());
assert!(!entries[0].fuzzy);
assert!(!entries[0].noqa);
assert!(!entries[0].nowrap);
assert_eq!(entries[0].format_language, Language::Null);
assert!(!entries[0].encoding_error);
assert!(entries[0].msgctxt.is_none());
assert_eq!(entries[0].msgid, Some(Message::new(2, "file")));
assert_eq!(entries[0].msgid_plural, Some(Message::new(3, "files")));
assert_eq!(
entries[0].msgstr.get(&0),
Some(Message::new(4, "fichier")).as_ref()
);
assert_eq!(
entries[0].msgstr.get(&1),
Some(Message::new(5, "fichiers")).as_ref()
);
}
/// Keyword lines (`#,` and `#=`) and `# noqa` comments set the keywords, flags, noqa rules and
/// format language of the entry; other comment lines are ignored.
#[test]
fn parse_comments() {
// Keywords given on `#,` and `#=` lines, including `noqa` with and without rules.
let content = r#"
# Translator comment
#, fuzzy, c-format, noqa, noqa:blank; pipes, no-wrap
#= keyword
#: src/main.rs:42
msgid "hello, %s"
msgstr "bonjour, %s"
"#;
let mut parser = Parser::new(content.as_bytes());
let entries = parser.by_ref().collect::<Vec<Entry>>();
assert_eq!(entries[0].line_number, 2);
assert_eq!(
entries[0].keywords,
vec![
"fuzzy".to_string(),
"c-format".to_string(),
"noqa".to_string(),
"noqa:blank; pipes".to_string(),
"no-wrap".to_string(),
"keyword".to_string(),
]
);
assert!(entries[0].fuzzy);
assert!(entries[0].noqa);
assert!(entries[0].nowrap);
assert_eq!(entries[0].noqa_rules, vec!["blank", "pipes"]);
assert_eq!(entries[0].format_language, Language::C);
assert!(!entries[0].encoding_error);
assert!(entries[0].msgctxt.is_none());
assert_eq!(entries[0].msgid, Some(Message::new(6, "hello, %s")));
assert!(entries[0].msgid_plural.is_none());
assert_eq!(
entries[0].msgstr.get(&0),
Some(Message::new(7, "bonjour, %s")).as_ref()
);
// `# noqa` as a plain comment: sets the flag but is not recorded as a keyword.
let content = r#"
# noqa
#, c-format
msgid "hello, %s"
msgstr "bonjour, %s"
"#;
let mut parser = Parser::new(content.as_bytes());
let entries = parser.by_ref().collect::<Vec<Entry>>();
assert_eq!(entries[0].line_number, 2);
assert_eq!(entries[0].keywords, vec!["c-format"]);
assert!(!entries[0].fuzzy);
assert!(entries[0].noqa);
assert!(!entries[0].nowrap);
assert!(entries[0].noqa_rules.is_empty());
assert_eq!(entries[0].format_language, Language::C);
assert!(!entries[0].encoding_error);
assert!(entries[0].msgctxt.is_none());
assert_eq!(entries[0].msgid, Some(Message::new(4, "hello, %s")));
assert!(entries[0].msgid_plural.is_none());
assert_eq!(
entries[0].msgstr.get(&0),
Some(Message::new(5, "bonjour, %s")).as_ref()
);
// `# noqa:` with rules as a plain comment: sets the rules but not the `noqa` flag.
let content = r#"
# noqa:blank; pipes
#, c-format
msgid "hello, %s"
msgstr "bonjour, %s"
"#;
let mut parser = Parser::new(content.as_bytes());
let entries = parser.by_ref().collect::<Vec<Entry>>();
assert_eq!(entries[0].line_number, 2);
assert_eq!(entries[0].keywords, vec!["c-format"]);
assert!(!entries[0].fuzzy);
assert!(!entries[0].noqa);
assert!(!entries[0].nowrap);
assert_eq!(entries[0].noqa_rules, vec!["blank", "pipes"]);
assert_eq!(entries[0].format_language, Language::C);
assert!(!entries[0].encoding_error);
assert!(entries[0].msgctxt.is_none());
assert_eq!(entries[0].msgid, Some(Message::new(4, "hello, %s")));
assert!(entries[0].msgid_plural.is_none());
assert_eq!(
entries[0].msgstr.get(&0),
Some(Message::new(5, "bonjour, %s")).as_ref()
);
}
/// Quoted continuation lines are appended to the message started by the previous keyword; the
/// message keeps the line number of its keyword line.
#[test]
fn parse_multiline_strings() {
let content = r#"
msgid ""
"hello "
"world"
msgstr ""
"bonjour "
"le monde"
"#;
let mut parser = Parser::new(content.as_bytes());
let entries = parser.by_ref().collect::<Vec<Entry>>();
assert_eq!(entries[0].line_number, 2);
assert!(entries[0].keywords.is_empty());
assert!(!entries[0].fuzzy);
assert!(!entries[0].noqa);
assert!(!entries[0].nowrap);
assert_eq!(entries[0].format_language, Language::Null);
assert!(!entries[0].encoding_error);
assert!(entries[0].msgctxt.is_none());
assert_eq!(entries[0].msgid, Some(Message::new(2, "hello world")));
assert!(entries[0].msgid_plural.is_none());
assert_eq!(
entries[0].msgstr.get(&0),
Some(Message::new(5, "bonjour le monde")).as_ref()
);
}
}