file_content/
text_data.rs1use std::fs;
2use std::io::Read;
3use std::path::Path;
4
5use crate::constants::{
6 BINARY_DETECTION_THRESHOLD, UTF16BE_BOM, UTF16LE_BOM, UTF16_BOM_LENGTH, UTF8_BOM,
7 UTF8_BOM_LENGTH, ZERO_BYTE,
8};
9use crate::encoding::Encoding;
10use crate::utf16::{to_u16_be, to_u16_le, UnevenByteSequenceError};
11use crate::FileError;
12
13#[derive(Debug, PartialEq)]
15pub struct TextData {
16 pub data: String,
17 pub encoding: Encoding,
18}
19
20#[derive(Debug, thiserror::Error)]
22pub enum TextDataError {
23 #[error(transparent)]
24 FromUtf8(#[from] std::string::FromUtf8Error),
25
26 #[error(transparent)]
27 FromUtf16(#[from] std::string::FromUtf16Error),
28
29 #[error(transparent)]
30 UnevenByteSequence(#[from] UnevenByteSequenceError),
31
32 #[error("File content is binary")]
33 Binary,
34}
35
36impl TryFrom<&Path> for TextData {
37 type Error = FileError;
38
39 fn try_from(path: &Path) -> Result<Self, Self::Error> {
40 let mut file = fs::File::open(path)?;
41 let mut bytes: Vec<u8> = vec![];
42 file.read_to_end(&mut bytes)?;
43 Ok(TextData::try_from(bytes.as_slice())?)
44 }
45}
46
47impl TryFrom<&[u8]> for TextData {
48 type Error = TextDataError;
49
50 fn try_from(bytes: &[u8]) -> Result<Self, Self::Error> {
51 if bytes.starts_with(UTF8_BOM) {
52 Ok(TextData {
53 data: String::from_utf8(bytes[UTF8_BOM_LENGTH..].to_vec())?,
54 encoding: Encoding::Utf8Bom,
55 })
56 } else if bytes.starts_with(UTF16BE_BOM) {
57 Ok(TextData {
58 data: String::from_utf16(&to_u16_be(&bytes[UTF16_BOM_LENGTH..])?)?,
59 encoding: Encoding::Utf16Be,
60 })
61 } else if bytes.starts_with(UTF16LE_BOM) {
62 Ok(TextData {
63 data: String::from_utf16(&to_u16_le(&bytes[UTF16_BOM_LENGTH..])?)?,
64 encoding: Encoding::Utf16Le,
65 })
66 } else if is_binary(bytes) {
67 Err(TextDataError::Binary)
68 } else {
69 Ok(TextData {
70 data: String::from_utf8(bytes.to_vec())?,
71 encoding: Encoding::Utf8,
72 })
73 }
74 }
75}
76
77fn is_binary(bytes: &[u8]) -> bool {
79 bytes
80 .iter()
81 .take(BINARY_DETECTION_THRESHOLD)
82 .any(|b| *b == ZERO_BYTE)
83}
84
#[cfg(test)]
mod tests {
    use test_case::test_case;

    use crate::encoding::Encoding;
    use crate::text_data::{TextData, TextDataError};

    // Embeds a fixture at compile time; the argument is a path relative to
    // `<crate root>/tests/data/`.
    macro_rules! fixture {
        ($relative:literal) => {
            include_bytes!(concat!(
                env!("CARGO_MANIFEST_DIR"),
                "/tests/data/",
                $relative
            ))
        };
    }

    const UTF8BOM_EMPTY_CONTENT: &[u8] = fixture!("UTF8BOM/empty");
    const UTF8BOM_ASCII_CONTENT: &[u8] = fixture!("UTF8BOM/ascii");
    const UTF8BOM_UNICODE_CONTENT: &[u8] = fixture!("UTF8BOM/unicode");

    const UTF16BE_EMPTY_CONTENT: &[u8] = fixture!("UTF16BE/empty");
    const UTF16BE_ASCII_CONTENT: &[u8] = fixture!("UTF16BE/ascii");
    const UTF16BE_UNICODE_CONTENT: &[u8] = fixture!("UTF16BE/unicode");

    const UTF16LE_EMPTY_CONTENT: &[u8] = fixture!("UTF16LE/empty");
    const UTF16LE_ASCII_CONTENT: &[u8] = fixture!("UTF16LE/ascii");
    const UTF16LE_UNICODE_CONTENT: &[u8] = fixture!("UTF16LE/unicode");

    // Shared success-path check: `bytes` must decode to `content` tagged with `encoding`.
    #[track_caller]
    fn assert_decodes_to(bytes: &[u8], content: &str, encoding: Encoding) {
        let decoded = TextData::try_from(bytes).expect("Should pass");

        assert_eq!(
            decoded,
            TextData {
                data: content.to_owned(),
                encoding,
            }
        );
    }

    #[test_case(""; "No content")]
    #[test_case("Hello!"; "ASCII chars")]
    #[test_case("Hello! 你好! 🌍"; "Unicode chars")]
    fn from_valid_utf8(input: &str) {
        assert_decodes_to(input.as_bytes(), input, Encoding::Utf8);
    }

    #[test_case(b"\xC1\x80"; "Overlong encoding")]
    #[test_case(b"\x80\xA2"; "Invalid start byte")]
    #[test_case(b"\xE0\xA4"; "Incomplete sequence")]
    #[test_case(b"\xF4\x90\x80\x80"; "Code points above maximum")]
    fn from_invalid_utf8(bytes: &[u8]) {
        assert!(matches!(
            TextData::try_from(bytes),
            Err(TextDataError::FromUtf8(_))
        ));
    }

    #[test_case(UTF8BOM_EMPTY_CONTENT, ""; "No content")]
    #[test_case(UTF8BOM_ASCII_CONTENT, "Hello!"; "ASCII chars")]
    #[test_case(UTF8BOM_UNICODE_CONTENT, "Hello! 你好! 🌍"; "Unicode chars")]
    fn from_valid_utf8_with_bom(bytes: &[u8], content: &str) {
        assert_decodes_to(bytes, content, Encoding::Utf8Bom);
    }

    #[test_case(b"\xEF\xBB\xBF\xC1\x80"; "Overlong encoding")]
    #[test_case(b"\xEF\xBB\xBF\x80\xA2"; "Invalid start byte")]
    #[test_case(b"\xEF\xBB\xBF\xE0\xA4"; "Incomplete sequence")]
    #[test_case(b"\xEF\xBB\xBF\xF4\x90\x80\x80"; "Code points above maximum")]
    fn from_invalid_utf8_with_bom(bytes: &[u8]) {
        assert!(matches!(
            TextData::try_from(bytes),
            Err(TextDataError::FromUtf8(_))
        ));
    }

    #[test_case(UTF16BE_EMPTY_CONTENT, ""; "No content")]
    #[test_case(UTF16BE_ASCII_CONTENT, "Hello!"; "ASCII chars")]
    #[test_case(UTF16BE_UNICODE_CONTENT, "Hello! 你好! 🌍"; "Unicode chars")]
    fn from_valid_utf16be(bytes: &[u8], content: &str) {
        assert_decodes_to(bytes, content, Encoding::Utf16Be);
    }

    #[test_case(b"\xFE\xFF\xD8\xA5"; "Invalid high surrogate")]
    #[test_case(b"\xFE\xFF\xDC\xA5"; "Invalid low surrogate")]
    #[test_case(b"\xFE\xFF\xD8\x3D"; "Incomplete sequence")]
    #[test_case(b"\xFE\xFF\xDB\xFF\xFF\xFF"; "Code points above maximum")]
    fn from_invalid_utf16be(bytes: &[u8]) {
        assert!(matches!(
            TextData::try_from(bytes),
            Err(TextDataError::FromUtf16(_))
        ));
    }

    #[test_case(UTF16LE_EMPTY_CONTENT, ""; "No content")]
    #[test_case(UTF16LE_ASCII_CONTENT, "Hello!"; "ASCII chars")]
    #[test_case(UTF16LE_UNICODE_CONTENT, "Hello! 你好! 🌍"; "Unicode chars")]
    fn from_valid_utf16le(bytes: &[u8], content: &str) {
        assert_decodes_to(bytes, content, Encoding::Utf16Le);
    }

    #[test_case(b"\xFF\xFE\xA5\xD8"; "Invalid high surrogate")]
    #[test_case(b"\xFF\xFE\xA5\xDC"; "Invalid low surrogate")]
    #[test_case(b"\xFF\xFE\x3D\xD8"; "Incomplete sequence")]
    #[test_case(b"\xFF\xFE\xFF\xFF\xFF\xDB"; "Code points above maximum")]
    fn from_invalid_utf16le(bytes: &[u8]) {
        assert!(matches!(
            TextData::try_from(bytes),
            Err(TextDataError::FromUtf16(_))
        ));
    }

    #[test_case(b"\0"; "Single zero-byte")]
    #[test_case(b"\x12\x34\0"; "Trailing zero-byte")]
    #[test_case(b"\0\x12\x34"; "Zero-byte at start")]
    fn from_binary(bytes: &[u8]) {
        assert!(matches!(
            TextData::try_from(bytes),
            Err(TextDataError::Binary)
        ));
    }
}