utf8_supported/
lib.rs

1#![doc = include_str!("../README.md")]
2
3#[cfg(unix)]
4use std::ffi::OsStr;
5
/// Locale environment variables that are inspected on Unix, listed in
/// decreasing order of precedence: `LC_ALL` overrides `LC_CTYPE`, which in
/// turn overrides `LANG`.
///
/// Only the Unix detection path reads this table, so it is compiled for Unix
/// targets only; this avoids an unused-constant warning elsewhere.
#[cfg(unix)]
const LOCALE_ENVIRONMENT_VARIABLES: &[&str] = &["LC_ALL", "LC_CTYPE", "LANG"];
7
/// What a single locale environment variable tells us about the encoding.
///
/// The variant names are referenced by the Unix detection code in this file,
/// so they are kept as-is even though they spell acronyms in upper case.
#[cfg(unix)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LocaleSignal {
    /// The variable is empty, so it carries no information.
    Unknown,
    /// The value mentions UTF-8.
    UTF8,
    /// The value names the `C`/`POSIX` locale.
    ASCII,
    /// The value is set but names neither UTF-8 nor the `C`/`POSIX` locale.
    NonUTF8,
}
15
/// Returns `true` if `needle` occurs anywhere inside `haystack`, comparing
/// bytes without regard to ASCII case.
///
/// The haystack is inspected as raw bytes, so it does not have to be valid
/// UTF-8. The needle may be given in any case. An empty needle matches every
/// haystack, including the empty one.
#[cfg(unix)]
fn strstr_ignore_case(haystack: &OsStr, needle: &[u8]) -> bool {
    use std::os::unix::ffi::OsStrExt;

    // `windows(0)` panics, and the empty string is a substring of everything,
    // so answer that case up front.
    if needle.is_empty() {
        return true;
    }

    // `windows` yields nothing when the haystack is shorter than the needle,
    // which makes that case fall out as `false` without a length check.
    haystack
        .as_bytes()
        .windows(needle.len())
        .any(|window| window.eq_ignore_ascii_case(needle))
}
44
45#[cfg(unix)]
46fn get_locale_signal(value: &OsStr) -> LocaleSignal {
47    if value.is_empty() {
48        LocaleSignal::Unknown
49    } else if value == "C" || value == "c" || value == "POSIX" || value == "posix" {
50        LocaleSignal::ASCII
51    } else if strstr_ignore_case(value, b"utf-8") || strstr_ignore_case(value, b"utf8") {
52        LocaleSignal::UTF8
53    } else {
54        LocaleSignal::NonUTF8
55    }
56}
57
/// The text encoding that was detected for the current environment.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
pub enum Utf8Support {
    /// Nothing could be determined, for example because no locale is set.
    #[default]
    Unknown,
    /// The environment is limited to ASCII.
    ASCII,
    /// The environment uses Latin-1 (only reported on Windows).
    Latin1,
    /// The environment uses UTF-8.
    UTF8,
    /// A locale is set, but it is none of the encodings listed above.
    Other,
}
72
73/// Determine the UTF-8 support of the current locale.
74pub fn utf8_supported() -> Utf8Support {
75    #[cfg(unix)]
76    return utf8_supported_unix();
77    #[cfg(windows)]
78    return utf8_supported_windows();
79    #[cfg(not(any(unix, windows)))]
80    return Utf8Support::Unset;
81}
82
83/// Determine the UTF-8 support of the current locale by locating the first
84/// signal.
85#[cfg(unix)]
86fn utf8_supported_unix() -> Utf8Support {
87    let mut signal = Utf8Support::Unknown;
88    for &var in LOCALE_ENVIRONMENT_VARIABLES {
89        let locale = std::env::var_os(var).unwrap_or_default();
90        match get_locale_signal(locale.as_os_str()) {
91            LocaleSignal::UTF8 => return Utf8Support::UTF8,
92            LocaleSignal::ASCII => return Utf8Support::ASCII,
93            LocaleSignal::NonUTF8 => signal = Utf8Support::Other,
94            LocaleSignal::Unknown => {}
95        }
96    }
97    signal
98}
99
/// Classify the console's output code page as a [`Utf8Support`] value.
#[cfg(windows)]
fn utf8_supported_windows() -> Utf8Support {
    use windows_sys::Win32::System::Console::*;

    // SAFETY: the call passes no pointers or handles, so there are no
    // caller-side invariants to uphold.
    let code_page = unsafe { GetConsoleOutputCP() };

    match code_page {
        // 0 is not a real code page; NOTE(review): the Win32 documentation
        // describes a zero return as failure — confirm.
        0 => Utf8Support::Unknown,
        // UTF-8.
        65001 => Utf8Support::UTF8,
        // US-ASCII.
        20127 => Utf8Support::ASCII,
        // Windows-1252, reported as Latin-1.
        1252 => Utf8Support::Latin1,
        // Everything else, including code page 437, is reported as `Other`.
        // Open question from the original author: should 437 be exposed as
        // its own variant?
        _ => Utf8Support::Other,
    }
}
113
/// Extension trait for forcing a subprocess to run in the C locale.
pub trait CommandUtf8Ext {
    /// Configure the child process so that it runs with the C locale
    /// (Unix only).
    #[cfg(unix)]
    fn set_c_locale(&mut self);
}

impl CommandUtf8Ext for std::process::Command {
    /// Sets `LANG`, `LC_ALL` and `LC_CTYPE` to `C` in the child's
    /// environment; the parent's environment is not modified.
    #[cfg(unix)]
    fn set_c_locale(&mut self) {
        self.env("LANG", "C")
            .env("LC_ALL", "C")
            .env("LC_CTYPE", "C");
    }
}
129
/// Guard that remembers console code pages and puts them back when dropped.
///
/// The first field is the output code page and the second the input code
/// page. A value of `0` in either field means "nothing to restore", so the
/// `Default` handle is a no-op on drop.
#[derive(Debug, Default)]
#[cfg(windows)]
pub struct CodePageHandle(u32, u32);

#[cfg(windows)]
impl Drop for CodePageHandle {
    /// Restores the remembered output code page, then the input code page,
    /// skipping any that is `0`.
    fn drop(&mut self) {
        use windows_sys::Win32::System::Console::*;

        let (output_cp, input_cp) = (self.0, self.1);

        if output_cp != 0 {
            // SAFETY: the call takes a plain integer and no pointers.
            unsafe {
                SetConsoleOutputCP(output_cp);
            }
        }
        if input_cp != 0 {
            // SAFETY: the call takes a plain integer and no pointers.
            unsafe {
                SetConsoleCP(input_cp);
            }
        }
    }
}
148
/// Set the console code page to UTF-8 (Windows only).
///
/// Both the console output and input code pages are switched to 65001.
/// This function returns a handle that will reset the console code pages to
/// their original values when dropped.
///
/// # Errors
///
/// Returns an error if the UTF-8 code page (65001) is not available on this
/// system, or if the console refuses one of the code page changes. If the
/// output code page was already switched when the input change fails, the
/// output code page is restored before the error is returned.
///
/// # Example
///
/// ```rust
/// use utf8_supported::set_console_utf8;
///
/// let handle = set_console_utf8().unwrap_or_default();
/// // Use UTF-8 here...
/// drop(handle);
/// // ...and restore the original code page when done.
/// ```
#[must_use = "The returned CodePageHandle resets the console code page to the original value when dropped"]
#[cfg(windows)]
pub fn set_console_utf8() -> Result<CodePageHandle, std::io::Error> {
    use windows_sys::Win32::Globalization::*;
    use windows_sys::Win32::System::Console::*;

    const UTF8_CODE_PAGE: u32 = 65001;

    // SAFETY: every call below takes plain integers (or nothing) and no
    // pointers, so there are no caller-side invariants to uphold.
    unsafe {
        // Returns a nonzero value if the code page is valid, or 0 if the code page is invalid
        if IsValidCodePage(UTF8_CODE_PAGE) == 0 {
            return Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                "UTF-8 codepage (65001) is not available",
            ));
        }

        let original_output_cp = GetConsoleOutputCP();
        let original_input_cp = GetConsoleCP();

        // Both setters return 0 on failure; report that instead of handing
        // back a handle that claims success.
        if SetConsoleOutputCP(UTF8_CODE_PAGE) == 0 {
            return Err(std::io::Error::last_os_error());
        }
        if SetConsoleCP(UTF8_CODE_PAGE) == 0 {
            // Capture the error before the rollback call can overwrite it.
            let error = std::io::Error::last_os_error();
            if original_output_cp != 0 {
                SetConsoleOutputCP(original_output_cp);
            }
            return Err(error);
        }

        Ok(CodePageHandle(original_output_cp, original_input_cp))
    }
}
186
#[cfg(test)]
mod tests {
    use super::*;

    /// `strstr_ignore_case` finds `utf-8` at any position and in any case,
    /// and rejects look-alikes such as `UTF-16` or `utf8`.
    #[cfg(unix)]
    #[test]
    fn test_strstr() {
        const NEEDLE: &[u8] = b"utf-8";

        let matching = ["UTF-8", "XUTF-8", "UTF-8X", "utf-8x", "utf-8", "xutf-8"];
        let non_matching = ["UTF-16", "utf16", "utf8"];

        for &haystack in &matching {
            assert!(
                strstr_ignore_case(OsStr::new(haystack), NEEDLE),
                "expected a match in {:?}",
                haystack
            );
        }
        for &haystack in &non_matching {
            assert!(
                !strstr_ignore_case(OsStr::new(haystack), NEEDLE),
                "expected no match in {:?}",
                haystack
            );
        }
    }
}