Skip to main content

yaml_lib/file/
file.rs

1//! File Format Utilities
2//!
3//! Provides utilities for handling Unicode text file formats and byte order marks (BOM).
4//! Supports UTF-8, UTF-16, and UTF-32 in both little and big endian variants.
5//!
6//! Copyright (c) 2026 YAML Library Developers
7
8use std::fs::File;
9use std::io::{Read, Result, Write};
10
/// Unicode text file formats, identified by their byte order mark (BOM).
///
/// `Utf8` is the only variant without a BOM; every other variant is associated
/// with the marker bytes returned by `get_bom`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Format {
    /// UTF-8 without a byte order mark.
    Utf8,
    /// UTF-8 prefixed with the `EF BB BF` byte order mark.
    Utf8bom,
    /// UTF-16, little endian (`FF FE`).
    Utf16le,
    /// UTF-16, big endian (`FE FF`).
    Utf16be,
    /// UTF-32, little endian (`FF FE 00 00`).
    Utf32le,
    /// UTF-32, big endian (`00 00 FE FF`).
    Utf32be,
}

impl Format {
    /// Returns the byte order mark (BOM) bytes for this format.
    ///
    /// The slice is empty for [`Format::Utf8`], which carries no BOM.
    fn get_bom(&self) -> &'static [u8] {
        match self {
            Format::Utf8 => &[],
            Format::Utf8bom => &[0xEF, 0xBB, 0xBF],
            Format::Utf16le => &[0xFF, 0xFE],
            Format::Utf16be => &[0xFE, 0xFF],
            Format::Utf32le => &[0xFF, 0xFE, 0x00, 0x00],
            Format::Utf32be => &[0x00, 0x00, 0xFE, 0xFF],
        }
    }
}
34
35/// Detects the Unicode format of a text file by examining its byte order mark (BOM).
36///
37/// Reads the first few bytes of a file to identify the Unicode encoding format
38/// based on the presence and type of byte order mark. Defaults to UTF-8 if no BOM is found.
39///
40/// # Arguments
41///
42/// * `filename` - Path to the file to analyze
43///
44/// # Returns
45///
46/// Result containing the detected Format or an IO error
47pub fn detect_format(filename: &str) -> Result<Format> {
48    let mut file = File::open(filename)?;
49    let mut bom_buffer = [0u8; 4];
50    let bytes_read = file.read(&mut bom_buffer)?;
51
52    let format = match &bom_buffer[..bytes_read] {
53        [0xEF, 0xBB, 0xBF, ..] => Format::Utf8bom,
54        [0xFE, 0xFF, ..] => Format::Utf16be,
55        [0xFF, 0xFE, 0x00, 0x00] => Format::Utf32le,
56        [0x00, 0x00, 0xFE, 0xFF] => Format::Utf32be,
57        [0xFF, 0xFE, ..] => Format::Utf16le,
58        _ => Format::Utf8,
59    };
60
61    Ok(format)
62}
63
64/// Writes a string to a file in the specified Unicode format
65/// Writes string content to a file with the specified Unicode format and BOM.
66///
67/// Creates or overwrites a file with the given content, adding the appropriate
68/// byte order mark based on the specified format. Handles Unicode encoding
69/// conversions as needed.
70///
71/// # Arguments
72///
73/// * `filename` - Path where the file will be created/written
74/// * `content` - String content to write to the file
75/// * `format` - Unicode format specifying encoding and BOM requirements
76///
77/// # Returns
78///
79/// Result indicating success or an IO error
80pub fn write_file_from_string(filename: &str, content: &str, format: Format) -> Result<()> {
81    let mut file = File::create(filename)?;
82    file.write_all(format.get_bom())?;
83
84    match format {
85        Format::Utf8 | Format::Utf8bom => {
86            file.write_all(content.as_bytes())?;
87        }
88        Format::Utf16le => {
89            for c in content.encode_utf16() {
90                file.write_all(&c.to_le_bytes())?;
91            }
92        }
93        Format::Utf16be => {
94            for c in content.encode_utf16() {
95                file.write_all(&c.to_be_bytes())?;
96            }
97        }
98        Format::Utf32le => {
99            for c in content.chars() {
100                file.write_all(&(c as u32).to_le_bytes())?;
101            }
102        }
103        Format::Utf32be => {
104            for c in content.chars() {
105                file.write_all(&(c as u32).to_be_bytes())?;
106            }
107        }
108    }
109    Ok(())
110}
111
112/// Reads a text file and returns its content as a String, handling different Unicode formats
113/// Reads a text file and returns its content as a UTF-8 string.
114///
115/// Automatically handles BOM detection and removal, converting the file
116/// content to a standard UTF-8 string regardless of the original encoding format.
117///
118/// # Arguments
119///
120/// * `filename` - Path to the file to read
121///
122/// # Returns
123///
124/// Result containing the file content as a String or an IO error
125pub fn read_file_to_string(filename: &str) -> Result<String> {
126    let mut content = String::new();
127    let format = detect_format(filename)?;
128    let mut file = File::open(filename)?;
129
130    /// Helper function to read and skip over the BOM bytes
131    fn read_and_skip_bom(file: &mut File, size: usize) -> Result<()> {
132        let mut buf = vec![0u8; size];
133        file.read_exact(&mut buf)
134    }
135
136    /// Helper function to process UTF-16 encoded files
137    fn process_utf16(file: &mut File, is_be: bool) -> Result<String> {
138        read_and_skip_bom(file, 2)?;
139        let mut bytes = Vec::new();
140        file.read_to_end(&mut bytes)?;
141
142        let content = String::from_utf16(
143            &bytes
144                .chunks(2)
145                .map(|chunk| {
146                    if is_be {
147                        u16::from_be_bytes([chunk[0], chunk[1]])
148                    } else {
149                        u16::from_le_bytes([chunk[0], chunk[1]])
150                    }
151                })
152                .collect::<Vec<u16>>(),
153        )
154        .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
155
156        Ok(content.replace("\r\n", "\n"))
157    }
158
159    /// Helper function to process UTF-32 encoded files
160    fn process_utf32(file: &mut File, is_be: bool) -> Result<String> {
161        read_and_skip_bom(file, 4)?;
162        let mut bytes = Vec::new();
163        file.read_to_end(&mut bytes)?;
164
165        let content = bytes
166            .chunks(4)
167            .map(|chunk| {
168                if is_be {
169                    u32::from_be_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])
170                } else {
171                    u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])
172                }
173            })
174            .map(|cp| char::from_u32(cp).unwrap_or('\u{FFFD}'))
175            .collect::<String>();
176
177        Ok(content.replace("\r\n", "\n"))
178    }
179
180    match format {
181        Format::Utf8bom => {
182            read_and_skip_bom(&mut file, 3)?;
183            file.read_to_string(&mut content)?;
184        }
185        Format::Utf16be => return process_utf16(&mut file, true),
186        Format::Utf16le => return process_utf16(&mut file, false),
187        Format::Utf32be => return process_utf32(&mut file, true),
188        Format::Utf32le => return process_utf32(&mut file, false),
189        Format::Utf8 => {
190            file.read_to_string(&mut content)?;
191        }
192    }
193
194    Ok(content.replace("\r\n", "\n"))
195}
196
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::{self, remove_file, File};
    use std::path::{Path, PathBuf};
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Returns a fresh path in the system temp directory for a test file.
    ///
    /// Tests run in parallel and several of them share the same `name`, so a
    /// timestamp alone is not enough on clocks with coarse resolution; the
    /// process id and a process-wide counter make every path distinct.
    fn temp_file(name: &str) -> PathBuf {
        static COUNTER: AtomicUsize = AtomicUsize::new(0);
        let seq = COUNTER.fetch_add(1, Ordering::Relaxed);

        let ts = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos();

        std::env::temp_dir().join(format!(
            "yaml_lib_test_{}_{}_{}_{}.tmp",
            name,
            std::process::id(),
            ts,
            seq
        ))
    }

    /// Creates (or overwrites) `path` with exactly `bytes`.
    fn write_bytes(path: &Path, bytes: &[u8]) {
        fs::write(path, bytes).unwrap();
    }

    /// Returns the complete raw content of `path`.
    fn read_all_bytes(path: &Path) -> Vec<u8> {
        fs::read(path).unwrap()
    }

    #[test]
    fn detect_utf8_empty_file() {
        let path = temp_file("empty_utf8");

        File::create(&path).unwrap();
        let fmt = detect_format(path.to_str().unwrap()).unwrap();
        assert!(matches!(fmt, Format::Utf8));
        remove_file(path).ok();
    }

    #[test]
    fn detect_all_boms() {
        let cases: Vec<(&str, Vec<u8>, Format)> = vec![
            ("utf8bom", vec![0xEF, 0xBB, 0xBF, b'a'], Format::Utf8bom),
            ("utf16be", vec![0xFE, 0xFF, 0x00, 0x61], Format::Utf16be),
            (
                "utf32le",
                vec![0xFF, 0xFE, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00],
                Format::Utf32le,
            ),
            (
                "utf32be",
                vec![0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x61],
                Format::Utf32be,
            ),
            ("utf16le", vec![0xFF, 0xFE, 0x61, 0x00], Format::Utf16le),
        ];
        for (name, bytes, expected) in cases {
            let path = temp_file(name);
            write_bytes(&path, &bytes);
            let fmt = detect_format(path.to_str().unwrap()).unwrap();
            // Compare discriminants so the check does not require `Format`
            // to implement `PartialEq` or `Debug`.
            assert_eq!(
                std::mem::discriminant(&fmt),
                std::mem::discriminant(&expected),
                "wrong format detected for case `{}`",
                name
            );
            remove_file(path).ok();
        }
    }

    /// Writes `content` in `format`, reads it back and expects the same text
    /// with CRLF normalized to LF.
    fn roundtrip(content: &str, format: Format) {
        let path = temp_file("roundtrip");
        write_file_from_string(path.to_str().unwrap(), content, format).unwrap();
        let read_back = read_file_to_string(path.to_str().unwrap()).unwrap();
        assert_eq!(read_back, content.replace("\r\n", "\n"));
        remove_file(path).ok();
    }

    #[test]
    fn roundtrip_all_formats_simple_ascii() {
        let content = "Hello\nWorld\n";
        roundtrip(content, Format::Utf8);
        roundtrip(content, Format::Utf8bom);
        roundtrip(content, Format::Utf16le);
        roundtrip(content, Format::Utf16be);
        roundtrip(content, Format::Utf32le);
        roundtrip(content, Format::Utf32be);
    }

    #[test]
    fn roundtrip_all_formats_unicode() {
        let content = "Héllö – 世界\nLine2";
        roundtrip(content, Format::Utf8);
        roundtrip(content, Format::Utf8bom);
        roundtrip(content, Format::Utf16le);
        roundtrip(content, Format::Utf16be);
        roundtrip(content, Format::Utf32le);
        roundtrip(content, Format::Utf32be);
    }

    #[test]
    fn read_crlf_normalization_utf8() {
        let path = temp_file("crlf_utf8");
        let data = b"line1\r\nline2\r\n";
        write_bytes(&path, data);
        let s = read_file_to_string(path.to_str().unwrap()).unwrap();
        assert_eq!(s, "line1\nline2\n");
        remove_file(path).ok();
    }

    #[test]
    fn write_bom_presence() {
        // Each entry is the complete expected file for the content "A":
        // the BOM (if any) followed by the encoded letter. Checking the whole
        // file also covers `Format::Utf8`, where a prefix test against an
        // empty BOM would always succeed.
        let cases: Vec<(Format, Vec<u8>)> = vec![
            (Format::Utf8, vec![0x41]),
            (Format::Utf8bom, vec![0xEF, 0xBB, 0xBF, 0x41]),
            (Format::Utf16le, vec![0xFF, 0xFE, 0x41, 0x00]),
            (Format::Utf16be, vec![0xFE, 0xFF, 0x00, 0x41]),
            (
                Format::Utf32le,
                vec![0xFF, 0xFE, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00],
            ),
            (
                Format::Utf32be,
                vec![0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x41],
            ),
        ];
        for (fmt, expected) in cases {
            let path = temp_file("bom");
            write_file_from_string(path.to_str().unwrap(), "A", fmt).unwrap();
            let bytes = read_all_bytes(&path);
            assert_eq!(bytes, expected);
            remove_file(path).ok();
        }
    }
}
324}