utf8-locale 1.0.0

Detect a UTF-8-capable locale for running child processes in
Documentation
/*
 * Copyright (c) 2022  Peter Pentchev <roam@ringlet.net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
//! Detect a UTF-8-capable locale.

#![allow(clippy::module_name_repetitions)]

use std::collections::{HashMap, HashSet};
use std::env;
use std::hash::BuildHasher;
use std::io::Error as IoError;
use std::process::{Command, Stdio};

use anyhow::{anyhow, Error as AnyError};
use encoding::all::ISO_8859_1;
use encoding::{DecoderTrap, Encoding};
use once_cell::sync::Lazy;
use regex::{Error as RegexError, Regex};
use thiserror::Error;

/// An error that occurred while examining the environment or locales.
///
/// Every fallible function in this module reports failures through this type.
/// The enum is marked `#[non_exhaustive]`, so code outside this crate must
/// include a wildcard arm when matching on it.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum UErr {
    /// Could not decode the list of locales output by `locale -a`.
    ///
    /// The wrapped error describes the decoding failure.
    #[error("Could not decode the obtained list of locales")]
    DecodeLocaleList(#[source] AnyError),

    /// Invalid value for an environment variable.
    ///
    /// The field is the name of the offending variable.
    #[error("The {0} environment variable's value is not a valid string")]
    InvalidEnvValue(String),

    /// Something went really, really wrong...
    ///
    /// The field is a free-form description of the broken invariant.
    #[error("Internal utf8-locale error: {0}")]
    Internal(String),

    /// Could not compile a regular expression.
    ///
    /// The fields are a human-readable label for the expression and
    /// the compilation error reported by the regex engine.
    #[error("Internal error: could not compile the {0} regular expression")]
    Regex(String, #[source] RegexError),

    /// Could not extract a captured group out of a regular expression.
    ///
    /// The fields are the name of the group and a debug representation of
    /// the captures that it was expected to be part of.
    #[error("Internal error: could not extract the '{0}' regex group out of {1}")]
    RegexCaptures(String, String),

    /// Could not run a program, e.g. `locale -a`.
    ///
    /// The fields are a description of the command and the I/O error that
    /// occurred while trying to run it.
    #[error("Could not run the `{0}` program")]
    RunProgram(String, #[source] IoError),
}

/// The variables examined by the [`LanguagesDetect`] class by default.
///
/// The order matters: [`get_preferred_languages()`] walks the names in
/// sequence, so a language found via an earlier variable ends up earlier in
/// the resulting list.
pub const LOCALE_VARIABLES: [&str; 14] = [
    "LC_ALL",
    "LANG",
    "LC_MESSAGES",
    "LC_COLLATE",
    "LC_NAME",
    "LC_IDENTIFICATION",
    "LC_CTYPE",
    "LC_NUMERIC",
    "LC_TIME",
    "LC_MONETARY",
    "LC_PAPER",
    "LC_ADDRESS",
    "LC_TELEPHONE",
    "LC_MEASUREMENT",
];

/// The encodings recognized as UTF-8 for the various locale distributions.
///
/// The codeset part of a locale name is compared against these strings
/// exactly as written (the comparison is case-sensitive).
pub const UTF8_ENCODINGS: [&str; 2] = ["UTF-8", "utf8"];

/// The list of preferred languages used by the [`Utf8Detect`] class by default.
///
/// Languages are listed in order of preference: an earlier entry wins over
/// a later one when locales for both are available.
pub const UTF8_LANGUAGES: [&str; 5] = ["C", "en", "de", "es", "it"];

/// Break a locale name down into components.
///
/// The pattern is anchored at both ends and written in verbose (`(?x)`) mode.
/// It matches names of the form `lang[_territory][.codeset][@modifier]` and
/// exposes the parts as the named groups `lang`, `territory`, `codeset`, and
/// `modifier`. Only `lang` is mandatory; the other three groups are absent
/// from the captures when the corresponding part is not present in the name.
pub const RE_LOCALE_NAME: &str = r"(?x) ^
    (?P<lang> [a-zA-Z0-9]+ )
    (?:
        _
        (?P<territory> [a-zA-Z0-9]+ )
    )?
    (?:
        \.
        (?P<codeset> [a-zA-Z0-9-]+ )
    )?
    (?:
        @
        (?P<modifier> [a-zA-Z0-9]+ )
    )?
    $ ";

/// Assign each distinct language its rank in the order of first appearance.
///
/// Returns the language-to-rank map together with the number of distinct
/// languages; the latter serves as the "worse than anything listed" weight.
/// A language that is listed more than once keeps the rank of its first
/// occurrence.
fn build_weights(langs: &[&str]) -> (HashMap<String, usize>, usize) {
    let mut weights: HashMap<String, usize> = HashMap::new();
    for &lang in langs {
        if !weights.contains_key(lang) {
            let rank = weights.len();
            weights.insert(lang.to_owned(), rank);
        }
    }
    let unweight = weights.len();
    (weights, unweight)
}

/// Obtain the lazily compiled regular expression for parsing locale names.
///
/// The expression is built from [`RE_LOCALE_NAME`] on first use and
/// the outcome, success or failure, is cached for the rest of the process.
///
/// # Errors
///
/// [`UErr::Regex`] on failure to compile a built-in regular expression.
fn get_re_name() -> Result<&'static Regex, UErr> {
    /// The cached result of compiling the locale name expression.
    static RE_NAME: Lazy<Result<Regex, RegexError>> = Lazy::new(|| Regex::new(RE_LOCALE_NAME));
    match &*RE_NAME {
        Ok(re) => Ok(re),
        Err(err) => Err(UErr::Regex("locale name".to_owned(), err.clone())),
    }
}

/// Pick the name of a locale that may hopefully be used for UTF-8 output.
///
/// The [`detect_utf8_locale()`] function runs the external `locale -a`
/// command to obtain the names of the available locales, keeps only those
/// whose codeset is listed in [`UTF8_ENCODINGS`], and returns the one whose
/// language comes first in `languages`. Locales for languages that are not in
/// `languages` at all are ignored. If nothing suitable is found, the plain
/// `C` locale name is returned.
///
/// The [`UTF8_LANGUAGES`] variable contains the default list of languages in
/// order of preference that the [`Utf8Detect`] class passes to this function.
/// Note that the [`Utf8Detect`] class is the preferred way of doing this.
///
/// # Errors
///
/// [`UErr::RunProgram`] if `locale -a` could not be executed.
/// [`UErr::DecodeLocaleList`] if the output of `locale -a` could not be decoded as
/// ISO-8859-1 text.
/// [`UErr::Regex`] on failure to compile a built-in regular expression.
/// [`UErr::RegexCaptures`] on failure to extract a captured group out of
/// a successful regular expression match.
#[inline]
pub fn detect_utf8_locale(languages: &[&str]) -> Result<String, UErr> {
    let re_name = get_re_name()?;
    let (weights, unweight) = build_weights(languages);

    let output = Command::new("locale")
        .arg("-a")
        .stderr(Stdio::inherit())
        .output()
        .map_err(|err| UErr::RunProgram("locale -a".to_owned(), err))?;
    let text = ISO_8859_1
        .decode(&output.stdout, DecoderTrap::Strict)
        .map_err(|err| UErr::DecodeLocaleList(anyhow!("Could not decode a string: {}", err)))?;

    /* Start with the fallback and a weight that any listed language beats. */
    let mut best_name = "C".to_owned();
    let mut best_weight = unweight;
    for line in text.lines() {
        let caps = match re_name.captures(line) {
            Some(caps) => caps,
            None => continue,
        };
        let is_utf8 = caps
            .name("codeset")
            .map_or(false, |codeset| UTF8_ENCODINGS.contains(&codeset.as_str()));
        if !is_utf8 {
            continue;
        }
        let lang = caps
            .name("lang")
            .ok_or_else(|| UErr::RegexCaptures("lang".to_owned(), format!("{:?}", caps)))?
            .as_str();
        /* Only a strictly better weight replaces the current choice, so */
        /* the first locale listed for a given language is the one kept. */
        match weights.get(lang) {
            Some(&weight) if weight < best_weight => {
                best_name = line.to_owned();
                best_weight = weight;
            }
            _ => {}
        }
    }
    Ok(best_name)
}

/// Build the set of environment variables that need to be overridden.
///
/// The [`get_utf8_vars()`] function asks [`detect_utf8_locale()`] for
/// a locale name and returns a two-entry hashmap: `LC_ALL` set to that name,
/// and `LANGUAGE` set to an empty string so that recent versions of
/// the gettext library do not choose a different language to output
/// messages in.
///
/// # Errors
///
/// Propagates errors returned by [`detect_utf8_locale()`].
#[inline]
pub fn get_utf8_vars(languages: &[&str]) -> Result<HashMap<String, String>, UErr> {
    let locale = detect_utf8_locale(languages)?;
    let mut vars = HashMap::new();
    vars.insert("LC_ALL".to_owned(), locale);
    vars.insert("LANGUAGE".to_owned(), String::new());
    Ok(vars)
}

/// Prepare the environment to run subprocesses in.
///
/// The [`get_utf8_env()`] function invokes [`detect_utf8_locale()`] and then
/// returns a hashmap based on [`std::env::vars()`], but with `LC_ALL` set to
/// the obtained locale name and `LANGUAGE` set to an empty string so that
/// recent versions of the gettext library do not choose a different language
/// to output messages in. Note that the [`Utf8Detect`] class is the preferred
/// way of doing this.
///
/// # Errors
///
/// Propagates errors returned by [`get_utf8_vars()`].
#[inline]
pub fn get_utf8_env(languages: &[&str]) -> Result<HashMap<String, String>, UErr> {
    Ok(env::vars().chain(get_utf8_vars(languages)?).collect())
}

/// Determine preferred languages as per the current locale settings.
///
/// The [`get_preferred_languages()`] function examines the specified
/// hashmap of environment variables and returns a list of
/// the languages specified in the locale variables (`LC_ALL`, `LANG`,
/// `LC_MESSAGES`, etc) in order of preference as defined by
/// the `names` parameter. Only values that name a UTF-8 codeset (one of
/// [`UTF8_ENCODINGS`]) are taken into account; variables that are unset, that
/// do not look like a locale name, or that specify no codeset or
/// a different one (e.g. `C`, `en_US`, `de_DE.ISO-8859-1`) are skipped.
/// Each language is listed only once. Note that the [`LanguagesDetect`] class
/// is the preferred way of doing this.
///
/// Note that "C" is always appended to the end of the list if it is not
/// already present.
///
/// # Errors
///
/// [`UErr::Regex`] on failure to compile a built-in regular expression.
/// [`UErr::RegexCaptures`] on failure to extract the language out of
/// a successful regular expression match.
#[inline]
pub fn get_preferred_languages<S: BuildHasher>(
    env: &HashMap<String, String, S>,
    names: &[&str],
) -> Result<Vec<String>, UErr> {
    let re_name = get_re_name()?;

    let mut res: Vec<String> = Vec::new();
    for name in names {
        let caps = match env.get(*name).and_then(|value| re_name.captures(value)) {
            Some(caps) => caps,
            None => continue,
        };

        /* The codeset is an optional part of the locale name: a value */
        /* without one is simply not a UTF-8 locale, not an error. */
        let is_utf8 = caps
            .name("codeset")
            .map_or(false, |codeset| UTF8_ENCODINGS.contains(&codeset.as_str()));
        if !is_utf8 {
            continue;
        }

        let lang = caps
            .name("lang")
            .ok_or_else(|| UErr::RegexCaptures("lang".to_owned(), format!("{:?}", caps)))?
            .as_str();
        if !res.iter().any(|seen| seen == lang) {
            res.push(lang.to_owned());
        }
    }

    /* Make sure "C" is always in the list. */
    if !res.iter().any(|seen| seen == "C") {
        res.push("C".to_owned());
    }
    Ok(res)
}

/// Determine preferred languages as per the current locale settings.
///
/// This class is used to invoke the [`get_preferred_languages()`] function
/// with reasonable default values: the current process environment and
/// the default [`LOCALE_VARIABLES`] list of variable names, with the option
/// of overriding either.
///
/// Build one with [`LanguagesDetect::new()`], optionally adjust it using
/// the `with_*` methods, and then call [`LanguagesDetect::detect()`].
#[derive(Debug, Default)]
#[non_exhaustive]
pub struct LanguagesDetect<'names> {
    /// The environment variables to examine instead of [`mod@std::env`].
    /// `None` means that the current process environment is examined.
    pub env: Option<HashMap<String, String>>,
    /// The names of locale variables to use instead of the defaults.
    /// `None` means that [`LOCALE_VARIABLES`] is used.
    pub names: Option<&'names [&'names str]>,
}

impl<'names> LanguagesDetect<'names> {
    /// Prepare to detect languages in the default manner.
    ///
    /// Neither an environment nor a list of variable names is set, so
    /// [`LanguagesDetect::detect()`] will use the current process environment
    /// and [`LOCALE_VARIABLES`].
    #[inline]
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Detect the preferred languages according to the specified settings.
    ///
    /// If no environment was supplied using [`LanguagesDetect::with_env()`],
    /// the variables of interest (the names supplied using
    /// [`LanguagesDetect::with_names()`], or [`LOCALE_VARIABLES`] by default)
    /// are read from the current process environment; all other variables
    /// are ignored. The result is then passed on to
    /// [`get_preferred_languages()`].
    ///
    /// # Errors
    ///
    /// [`UErr::InvalidEnvValue`] if one of the examined variables in
    /// the current process environment has a value that is not valid Unicode.
    /// Propagates errors returned by [`get_preferred_languages()`].
    #[inline]
    pub fn detect(self) -> Result<Vec<String>, UErr> {
        let qnames = self.names.unwrap_or(&LOCALE_VARIABLES);
        let qenv = match self.env {
            Some(given) => given,
            None => {
                /* Look for the names that will actually be examined, not */
                /* for the defaults: they differ if `with_names()` was used. */
                let wanted: HashSet<&str> = qnames.iter().copied().collect();
                env::vars_os()
                    .filter_map(|(os_name, os_value)| {
                        /* None of the wanted names can match a non-Unicode one. */
                        let name = os_name.to_str()?;
                        wanted.contains(name).then(|| match os_value.to_str() {
                            Some(value) => Ok((name.to_owned(), value.to_owned())),
                            None => Err(UErr::InvalidEnvValue(name.to_owned())),
                        })
                    })
                    .collect::<Result<HashMap<String, String>, UErr>>()?
            }
        };
        get_preferred_languages(&qenv, qnames)
    }

    /// Specify the environment variables to examine instead of [`mod@std::env`].
    #[allow(clippy::missing_const_for_fn)]
    #[inline]
    #[must_use]
    pub fn with_env(self, env: HashMap<String, String>) -> Self {
        Self {
            env: Some(env),
            ..self
        }
    }

    /// Specify the names of the environment variables to look at instead of
    /// [`LOCALE_VARIABLES`].
    #[allow(clippy::missing_const_for_fn)]
    #[inline]
    #[must_use]
    pub fn with_names(self, names: &'names [&'names str]) -> Self {
        Self {
            names: Some(names),
            ..self
        }
    }
}

/// Information about an available UTF-8 environment.
///
/// This is the result of a successful [`Utf8Detect::detect()`] call.
#[derive(Debug)]
#[non_exhaustive]
pub struct Utf8Environment {
    /// The environment to run a child process in: the base environment with
    /// the variables from `env_vars` applied on top of it.
    pub env: HashMap<String, String>,
    /// The environment variables that need to be updated, as returned by
    /// [`get_utf8_vars()`].
    pub env_vars: HashMap<String, String>,
    /// The name of the UTF-8 locale, i.e. the value of `LC_ALL` in `env_vars`.
    pub locale: String,
}

/// Determine a UTF-8 locale to use and prepare the environment variables.
///
/// This class holds an optional list of preferred languages (if none is
/// specified, the [`Utf8Detect::detect()`] method uses the ones in
/// the [`UTF8_LANGUAGES`] variable by default), and an optional map of
/// environment variables to augment (if none is specified, the current process
/// environment is used by default).
///
/// Build one with [`Utf8Detect::new()`], optionally adjust it using
/// the `with_*` methods, and then call [`Utf8Detect::detect()`].
#[derive(Debug, Default)]
#[non_exhaustive]
pub struct Utf8Detect {
    /// The environment variables to use instead of the current process's ones.
    /// `None` means that the current process environment is used.
    pub env: Option<HashMap<String, String>>,
    /// The languages to look for in order of preference.
    /// `None` means that [`UTF8_LANGUAGES`] is used.
    pub languages: Option<Vec<String>>,
}

impl Utf8Detect {
    /// Prepare to detect a locale in the default manner.
    ///
    /// Neither an environment nor a list of languages is set, so
    /// [`Utf8Detect::detect()`] will use the current process environment and
    /// [`UTF8_LANGUAGES`].
    #[inline]
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Detect a UTF-8 locale, prepare the environment.
    ///
    /// This method invokes the (mostly internal) [`get_utf8_vars()`] function
    /// which, in turn, invokes the (again mostly internal) [`detect_utf8_locale()`]
    /// one, which uses the external `locale` utility to obtain the list of
    /// available locales and then picks a UTF-8-capable one according to
    /// the list of preferred languages. The variables obtained in this way
    /// are then applied on top of the environment supplied using
    /// [`Utf8Detect::with_env()`] or, by default, the current process
    /// environment.
    ///
    /// # Errors
    ///
    /// Propagates errors returned by [`get_utf8_vars()`].
    /// [`UErr::InvalidEnvValue`] if no environment was supplied and a variable
    /// in the current process environment has a name or a value that is not
    /// valid Unicode; the error carries the (possibly lossily converted) name
    /// of that variable.
    /// [`UErr::Internal`] if the detected variables unexpectedly lack `LC_ALL`.
    #[inline]
    pub fn detect(self) -> Result<Utf8Environment, UErr> {
        let env_vars = match self.languages {
            None => get_utf8_vars(&UTF8_LANGUAGES)?,
            Some(langs) => {
                let lvec: Vec<&str> = langs.iter().map(String::as_str).collect();
                get_utf8_vars(&lvec)?
            }
        };

        let mut renv = match self.env {
            Some(given) => given,
            /* `env::vars()` panics on non-Unicode data, so go through */
            /* `env::vars_os()` and report such variables as an error. */
            None => env::vars_os()
                .map(
                    |(os_name, os_value)| match (os_name.into_string(), os_value.into_string()) {
                        (Ok(name), Ok(value)) => Ok((name, value)),
                        (Ok(name), Err(_)) => Err(UErr::InvalidEnvValue(name)),
                        (Err(raw_name), _) => Err(UErr::InvalidEnvValue(
                            raw_name.to_string_lossy().into_owned(),
                        )),
                    },
                )
                .collect::<Result<HashMap<String, String>, UErr>>()?,
        };

        let locale = env_vars
            .get("LC_ALL")
            .ok_or_else(|| {
                UErr::Internal(format!(
                    "Internal error: no 'LC_ALL' after successful detection: {:?}",
                    env_vars
                ))
            })?
            .clone();

        /* The detected settings take precedence over the base environment. */
        renv.extend(
            env_vars
                .iter()
                .map(|(name, value)| (name.clone(), value.clone())),
        );

        Ok(Utf8Environment {
            env: renv,
            env_vars,
            locale,
        })
    }

    /// Specify the environment variables to use as a base instead of
    /// the current process's ones.
    #[allow(clippy::missing_const_for_fn)]
    #[inline]
    #[must_use]
    pub fn with_env(self, env: HashMap<String, String>) -> Self {
        Self {
            env: Some(env),
            ..self
        }
    }

    /// Specify the preferred languages to look for among the locales.
    #[inline]
    #[must_use]
    pub fn with_languages(self, langs: Vec<String>) -> Self {
        Self {
            languages: Some(langs),
            ..self
        }
    }
}