Skip to main content

xtask_todo_lib/devshell/
host_text.rs

//! Read text from the **host filesystem** in a Windows-friendly way.
//!
//! Windows editors often save scripts as **UTF-8 with BOM** or **UTF-16 LE with BOM** (“Unicode” in
//! Notepad). `std::fs::read_to_string` assumes UTF-8 only, which can yield mojibake or load failures.
//! VFS export/sync for `cargo`/`rustup` already uses binary [`std::fs::read`] /
//! [`std::fs::write`]; this module targets **text** loads: `-f` scripts, `source` / `.` from disk,
//! nested `source` in `.dsh`, and `.todo.json`.
8
9use std::io;
10use std::path::Path;
11
/// Return `bytes` without its leading UTF-8 byte-order mark (`EF BB BF`), if it has one.
///
/// Input that does not begin with the mark is returned unchanged, and at most one mark is
/// removed. The result always borrows from `bytes`; nothing is copied.
#[must_use]
pub fn strip_utf8_bom(bytes: &[u8]) -> &[u8] {
    match bytes {
        // Slice pattern: exactly the three BOM bytes, followed by whatever remains.
        [0xEF, 0xBB, 0xBF, after_bom @ ..] => after_bom,
        _ => bytes,
    }
}
21
22/// Decode bytes from a host text file: UTF-16 LE/BE (with BOM), else UTF-8 (optional BOM).
23///
24/// # Errors
25/// Returns [`io::Error`] with kind [`io::ErrorKind::InvalidData`] if decoding fails.
26pub fn decode_host_text_bytes(bytes: &[u8]) -> Result<String, io::Error> {
27    use io::ErrorKind::InvalidData;
28
29    if bytes.starts_with(&[0xFF, 0xFE]) {
30        let rest = &bytes[2..];
31        if !rest.len().is_multiple_of(2) {
32            return Err(io::Error::new(
33                InvalidData,
34                "invalid UTF-16LE: odd byte length after BOM",
35            ));
36        }
37        let u16s: Vec<u16> = rest
38            .chunks_exact(2)
39            .map(|c| u16::from_le_bytes([c[0], c[1]]))
40            .collect();
41        return String::from_utf16(&u16s).map_err(|e| io::Error::new(InvalidData, e));
42    }
43
44    if bytes.starts_with(&[0xFE, 0xFF]) {
45        let rest = &bytes[2..];
46        if !rest.len().is_multiple_of(2) {
47            return Err(io::Error::new(
48                InvalidData,
49                "invalid UTF-16BE: odd byte length after BOM",
50            ));
51        }
52        let u16s: Vec<u16> = rest
53            .chunks_exact(2)
54            .map(|c| u16::from_be_bytes([c[0], c[1]]))
55            .collect();
56        return String::from_utf16(&u16s).map_err(|e| io::Error::new(InvalidData, e));
57    }
58
59    let b = strip_utf8_bom(bytes);
60    String::from_utf8(b.to_vec()).map_err(|e| io::Error::new(InvalidData, e))
61}
62
63/// Read a host file and decode as for [`decode_host_text_bytes`].
64///
65/// # Errors
66/// I/O errors from [`std::fs::read`], or [`io::ErrorKind::InvalidData`] if text is not valid.
67pub fn read_host_text(path: &Path) -> io::Result<String> {
68    let bytes = std::fs::read(path)?;
69    decode_host_text_bytes(&bytes)
70}
71
72/// Decode VFS file bytes as script/JSON text (UTF-8 with optional BOM, or UTF-16 with BOM).
73#[must_use]
74pub fn script_text_from_vfs_bytes(bytes: &[u8]) -> Option<String> {
75    decode_host_text_bytes(bytes).ok()
76}
77
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that decoding `bytes` fails with `InvalidData`, the only error kind the decoder
    /// documents.
    fn assert_invalid_data(bytes: &[u8]) {
        let err = decode_host_text_bytes(bytes).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
    }

    #[test]
    fn utf8_plain() {
        assert_eq!(decode_host_text_bytes(b"hello").unwrap(), "hello");
    }

    #[test]
    fn utf8_bom_stripped() {
        let mut v = vec![0xEF, 0xBB, 0xBF];
        v.extend_from_slice(b"echo ok");
        assert_eq!(decode_host_text_bytes(&v).unwrap(), "echo ok");
    }

    #[test]
    fn utf16le_bom_hello() {
        // "Hi" in UTF-16 LE: H=0x48, i=0x69
        let bytes: Vec<u8> = vec![0xFF, 0xFE, 0x48, 0x00, 0x69, 0x00];
        assert_eq!(decode_host_text_bytes(&bytes).unwrap(), "Hi");
    }

    #[test]
    fn utf16be_bom_hi() {
        let bytes: Vec<u8> = vec![0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69];
        assert_eq!(decode_host_text_bytes(&bytes).unwrap(), "Hi");
    }

    #[test]
    fn strip_utf8_bom_only() {
        assert_eq!(strip_utf8_bom(b"a"), b"a");
        assert_eq!(strip_utf8_bom(&[0xEF, 0xBB, 0xBF]), b"");
    }

    #[test]
    fn strip_utf8_bom_leaves_partial_or_inner_bom() {
        // Two of the three BOM bytes are not a BOM.
        assert_eq!(strip_utf8_bom(b"\xEF\xBB"), b"\xEF\xBB");
        // A BOM that is not at the very start is ordinary content.
        assert_eq!(strip_utf8_bom(b"x\xEF\xBB\xBF"), b"x\xEF\xBB\xBF");
    }

    #[test]
    fn empty_and_bom_only_inputs_decode_to_empty() {
        assert_eq!(decode_host_text_bytes(b"").unwrap(), "");
        assert_eq!(decode_host_text_bytes(&[0xEF, 0xBB, 0xBF]).unwrap(), "");
        assert_eq!(decode_host_text_bytes(&[0xFF, 0xFE]).unwrap(), "");
        assert_eq!(decode_host_text_bytes(&[0xFE, 0xFF]).unwrap(), "");
    }

    #[test]
    fn utf16_surrogate_pair_round_trips() {
        // U+1F600 is the surrogate pair D83D DE00.
        let le = [0xFF, 0xFE, 0x3D, 0xD8, 0x00, 0xDE];
        let be = [0xFE, 0xFF, 0xD8, 0x3D, 0xDE, 0x00];
        assert_eq!(decode_host_text_bytes(&le).unwrap(), "\u{1F600}");
        assert_eq!(decode_host_text_bytes(&be).unwrap(), "\u{1F600}");
    }

    #[test]
    fn utf16_odd_length_is_invalid_data() {
        assert_invalid_data(&[0xFF, 0xFE, 0x48]);
        assert_invalid_data(&[0xFE, 0xFF, 0x00, 0x48, 0x00]);
    }

    #[test]
    fn utf16_unpaired_surrogate_is_invalid_data() {
        // A lone high surrogate (0xD800) in each byte order.
        assert_invalid_data(&[0xFF, 0xFE, 0x00, 0xD8]);
        assert_invalid_data(&[0xFE, 0xFF, 0xD8, 0x00]);
    }

    #[test]
    fn malformed_utf8_is_invalid_data() {
        assert_invalid_data(&[0xC3, 0x28]);
        // A lone 0xFF is neither a UTF-16 BOM nor valid UTF-8.
        assert_invalid_data(&[0xFF]);
    }

    #[test]
    fn vfs_bytes_map_errors_to_none() {
        assert_eq!(script_text_from_vfs_bytes(b"ok"), Some(String::from("ok")));
        assert_eq!(script_text_from_vfs_bytes(&[0xC3, 0x28]), None);
        assert_eq!(script_text_from_vfs_bytes(&[0xFF, 0xFE, 0x48]), None);
    }
}