1use std::fs::File;
9use std::io::{Read, Result, Write};
10
/// Text encodings understood by this module's file helpers.
///
/// Every variant except [`Format::Utf8`] is associated with a byte-order
/// mark (BOM) that is written at the start of a file and used to recognise
/// the encoding when the file is read back.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Format {
    /// UTF-8 without a byte-order mark.
    Utf8,
    /// UTF-8 prefixed with the BOM `EF BB BF`.
    Utf8bom,
    /// UTF-16 little-endian, BOM `FF FE`.
    Utf16le,
    /// UTF-16 big-endian, BOM `FE FF`.
    Utf16be,
    /// UTF-32 little-endian, BOM `FF FE 00 00`.
    Utf32le,
    /// UTF-32 big-endian, BOM `00 00 FE FF`.
    Utf32be,
}
20
21impl Format {
22 fn get_bom(&self) -> &'static [u8] {
24 match self {
25 Format::Utf8 => &[],
26 Format::Utf8bom => &[0xEF, 0xBB, 0xBF],
27 Format::Utf16le => &[0xFF, 0xFE],
28 Format::Utf16be => &[0xFE, 0xFF],
29 Format::Utf32le => &[0xFF, 0xFE, 0x00, 0x00],
30 Format::Utf32be => &[0x00, 0x00, 0xFE, 0xFF],
31 }
32 }
33}
34
35pub fn detect_format(filename: &str) -> Result<Format> {
48 let mut file = File::open(filename)?;
49 let mut bom_buffer = [0u8; 4];
50 let bytes_read = file.read(&mut bom_buffer)?;
51
52 let format = match &bom_buffer[..bytes_read] {
53 [0xEF, 0xBB, 0xBF, ..] => Format::Utf8bom,
54 [0xFE, 0xFF, ..] => Format::Utf16be,
55 [0xFF, 0xFE, 0x00, 0x00] => Format::Utf32le,
56 [0x00, 0x00, 0xFE, 0xFF] => Format::Utf32be,
57 [0xFF, 0xFE, ..] => Format::Utf16le,
58 _ => Format::Utf8,
59 };
60
61 Ok(format)
62}
63
64pub fn write_file_from_string(filename: &str, content: &str, format: Format) -> Result<()> {
81 let mut file = File::create(filename)?;
82 file.write_all(format.get_bom())?;
83
84 match format {
85 Format::Utf8 | Format::Utf8bom => {
86 file.write_all(content.as_bytes())?;
87 }
88 Format::Utf16le => {
89 for c in content.encode_utf16() {
90 file.write_all(&c.to_le_bytes())?;
91 }
92 }
93 Format::Utf16be => {
94 for c in content.encode_utf16() {
95 file.write_all(&c.to_be_bytes())?;
96 }
97 }
98 Format::Utf32le => {
99 for c in content.chars() {
100 file.write_all(&(c as u32).to_le_bytes())?;
101 }
102 }
103 Format::Utf32be => {
104 for c in content.chars() {
105 file.write_all(&(c as u32).to_be_bytes())?;
106 }
107 }
108 }
109 Ok(())
110}
111
112pub fn read_file_to_string(filename: &str) -> Result<String> {
126 let mut content = String::new();
127 let format = detect_format(filename)?;
128 let mut file = File::open(filename)?;
129
130 fn read_and_skip_bom(file: &mut File, size: usize) -> Result<()> {
132 let mut buf = vec![0u8; size];
133 file.read_exact(&mut buf)
134 }
135
136 fn process_utf16(file: &mut File, is_be: bool) -> Result<String> {
138 read_and_skip_bom(file, 2)?;
139 let mut bytes = Vec::new();
140 file.read_to_end(&mut bytes)?;
141
142 let content = String::from_utf16(
143 &bytes
144 .chunks(2)
145 .map(|chunk| {
146 if is_be {
147 u16::from_be_bytes([chunk[0], chunk[1]])
148 } else {
149 u16::from_le_bytes([chunk[0], chunk[1]])
150 }
151 })
152 .collect::<Vec<u16>>(),
153 )
154 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
155
156 Ok(content.replace("\r\n", "\n"))
157 }
158
159 fn process_utf32(file: &mut File, is_be: bool) -> Result<String> {
161 read_and_skip_bom(file, 4)?;
162 let mut bytes = Vec::new();
163 file.read_to_end(&mut bytes)?;
164
165 let content = bytes
166 .chunks(4)
167 .map(|chunk| {
168 if is_be {
169 u32::from_be_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])
170 } else {
171 u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])
172 }
173 })
174 .map(|cp| char::from_u32(cp).unwrap_or('\u{FFFD}'))
175 .collect::<String>();
176
177 Ok(content.replace("\r\n", "\n"))
178 }
179
180 match format {
181 Format::Utf8bom => {
182 read_and_skip_bom(&mut file, 3)?;
183 file.read_to_string(&mut content)?;
184 }
185 Format::Utf16be => return process_utf16(&mut file, true),
186 Format::Utf16le => return process_utf16(&mut file, false),
187 Format::Utf32be => return process_utf32(&mut file, true),
188 Format::Utf32le => return process_utf32(&mut file, false),
189 Format::Utf8 => {
190 file.read_to_string(&mut content)?;
191 }
192 }
193
194 Ok(content.replace("\r\n", "\n"))
195}
196
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::{self, remove_file, File};
    use std::mem::discriminant;
    use std::path::{Path, PathBuf};
    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    /// A unique path inside the system temp directory that is deleted when
    /// the guard goes out of scope, including when a test assertion panics.
    struct TempFile(PathBuf);

    impl TempFile {
        /// Builds a fresh path labelled with `name`.
        ///
        /// Tests run in parallel and several of them reuse the same label, so
        /// the timestamp alone is not enough on clocks with coarse
        /// resolution; the process id and a process-wide counter make the
        /// name unique.
        fn new(name: &str) -> Self {
            static NEXT_ID: AtomicUsize = AtomicUsize::new(0);

            let id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
            let nanos = SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .expect("system clock should not be before the Unix epoch")
                .as_nanos();
            let file_name = format!(
                "yaml_lib_test_{}_{}_{}_{}.tmp",
                name,
                std::process::id(),
                nanos,
                id
            );
            TempFile(std::env::temp_dir().join(file_name))
        }

        fn path(&self) -> &Path {
            &self.0
        }

        /// The path as `&str`, which is what the functions under test accept.
        fn as_str(&self) -> &str {
            self.0
                .to_str()
                .expect("temporary path should be valid UTF-8")
        }
    }

    impl Drop for TempFile {
        fn drop(&mut self) {
            // Best-effort cleanup: the file may never have been created.
            let _ = remove_file(&self.0);
        }
    }

    /// Creates `path` containing exactly `bytes`.
    fn write_bytes(path: &Path, bytes: &[u8]) {
        fs::write(path, bytes).unwrap();
    }

    /// Returns the raw contents of `path`.
    fn read_all_bytes(path: &Path) -> Vec<u8> {
        fs::read(path).unwrap()
    }

    #[test]
    fn detect_utf8_empty_file() {
        let tmp = TempFile::new("empty_utf8");

        File::create(tmp.path()).unwrap();
        let fmt = detect_format(tmp.as_str()).unwrap();
        assert!(matches!(fmt, Format::Utf8));
    }

    #[test]
    fn detect_all_boms() {
        let cases: Vec<(&str, Vec<u8>, Format)> = vec![
            ("utf8bom", vec![0xEF, 0xBB, 0xBF, b'a'], Format::Utf8bom),
            ("utf16be", vec![0xFE, 0xFF, 0x00, 0x61], Format::Utf16be),
            (
                "utf32le",
                vec![0xFF, 0xFE, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00],
                Format::Utf32le,
            ),
            (
                "utf32be",
                vec![0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x61],
                Format::Utf32be,
            ),
            ("utf16le", vec![0xFF, 0xFE, 0x61, 0x00], Format::Utf16le),
        ];

        for (name, bytes, expected) in cases {
            let tmp = TempFile::new(name);
            write_bytes(tmp.path(), &bytes);
            let fmt = detect_format(tmp.as_str()).unwrap();
            // Compare discriminants so the check does not require `Format`
            // to implement `PartialEq` or `Debug`.
            assert_eq!(
                discriminant(&fmt),
                discriminant(&expected),
                "wrong format detected for case `{}`",
                name
            );
        }
    }

    /// Writes `content` in `format`, reads it back and checks that the text
    /// survives (modulo CRLF normalisation performed by the reader).
    fn roundtrip(content: &str, format: Format) {
        let tmp = TempFile::new("roundtrip");
        write_file_from_string(tmp.as_str(), content, format).unwrap();
        let read_back = read_file_to_string(tmp.as_str()).unwrap();
        assert_eq!(read_back, content.replace("\r\n", "\n"));
    }

    #[test]
    fn roundtrip_all_formats_simple_ascii() {
        let content = "Hello\nWorld\n";
        roundtrip(content, Format::Utf8);
        roundtrip(content, Format::Utf8bom);
        roundtrip(content, Format::Utf16le);
        roundtrip(content, Format::Utf16be);
        roundtrip(content, Format::Utf32le);
        roundtrip(content, Format::Utf32be);
    }

    #[test]
    fn roundtrip_all_formats_unicode() {
        let content = "Héllö – 世界\nLine2";
        roundtrip(content, Format::Utf8);
        roundtrip(content, Format::Utf8bom);
        roundtrip(content, Format::Utf16le);
        roundtrip(content, Format::Utf16be);
        roundtrip(content, Format::Utf32le);
        roundtrip(content, Format::Utf32be);
    }

    #[test]
    fn read_crlf_normalization_utf8() {
        let tmp = TempFile::new("crlf_utf8");
        write_bytes(tmp.path(), b"line1\r\nline2\r\n");
        let text = read_file_to_string(tmp.as_str()).unwrap();
        assert_eq!(text, "line1\nline2\n");
    }

    #[test]
    fn write_bom_presence() {
        let cases = vec![
            (Format::Utf8, vec![] as Vec<u8>),
            (Format::Utf8bom, vec![0xEF, 0xBB, 0xBF]),
            (Format::Utf16le, vec![0xFF, 0xFE]),
            (Format::Utf16be, vec![0xFE, 0xFF]),
            (Format::Utf32le, vec![0xFF, 0xFE, 0x00, 0x00]),
            (Format::Utf32be, vec![0x00, 0x00, 0xFE, 0xFF]),
        ];

        for (fmt, bom) in cases {
            let tmp = TempFile::new("bom");
            write_file_from_string(tmp.as_str(), "A", fmt).unwrap();
            let bytes = read_all_bytes(tmp.path());
            assert!(bytes.starts_with(&bom));
        }
    }
}