file_content/
text_data.rs

1use std::fs;
2use std::io::Read;
3use std::path::Path;
4
5use crate::constants::{
6    BINARY_DETECTION_THRESHOLD, UTF16BE_BOM, UTF16LE_BOM, UTF16_BOM_LENGTH, UTF8_BOM,
7    UTF8_BOM_LENGTH, ZERO_BYTE,
8};
9use crate::encoding::Encoding;
10use crate::utf16::{to_u16_be, to_u16_le, UnevenByteSequenceError};
11use crate::FileError;
12
13/// A struct to hold the data of a text file and the encoding used to read it.
14#[derive(Debug, PartialEq)]
15pub struct TextData {
16    pub data: String,
17    pub encoding: Encoding,
18}
19
20/// The possible errors that can occur when working with [TextData] structs.
21#[derive(Debug, thiserror::Error)]
22pub enum TextDataError {
23    #[error(transparent)]
24    FromUtf8(#[from] std::string::FromUtf8Error),
25
26    #[error(transparent)]
27    FromUtf16(#[from] std::string::FromUtf16Error),
28
29    #[error(transparent)]
30    UnevenByteSequence(#[from] UnevenByteSequenceError),
31
32    #[error("File content is binary")]
33    Binary,
34}
35
36impl TryFrom<&Path> for TextData {
37    type Error = FileError;
38
39    fn try_from(path: &Path) -> Result<Self, Self::Error> {
40        let mut file = fs::File::open(path)?;
41        let mut bytes: Vec<u8> = vec![];
42        file.read_to_end(&mut bytes)?;
43        Ok(TextData::try_from(bytes.as_slice())?)
44    }
45}
46
47impl TryFrom<&[u8]> for TextData {
48    type Error = TextDataError;
49
50    fn try_from(bytes: &[u8]) -> Result<Self, Self::Error> {
51        if bytes.starts_with(UTF8_BOM) {
52            Ok(TextData {
53                data: String::from_utf8(bytes[UTF8_BOM_LENGTH..].to_vec())?,
54                encoding: Encoding::Utf8Bom,
55            })
56        } else if bytes.starts_with(UTF16BE_BOM) {
57            Ok(TextData {
58                data: String::from_utf16(&to_u16_be(&bytes[UTF16_BOM_LENGTH..])?)?,
59                encoding: Encoding::Utf16Be,
60            })
61        } else if bytes.starts_with(UTF16LE_BOM) {
62            Ok(TextData {
63                data: String::from_utf16(&to_u16_le(&bytes[UTF16_BOM_LENGTH..])?)?,
64                encoding: Encoding::Utf16Le,
65            })
66        } else if is_binary(bytes) {
67            Err(TextDataError::Binary)
68        } else {
69            Ok(TextData {
70                data: String::from_utf8(bytes.to_vec())?,
71                encoding: Encoding::Utf8,
72            })
73        }
74    }
75}
76
77/// Returns true if it finds a zero-byte within the first 8 thousand bytes (same as Git)
78fn is_binary(bytes: &[u8]) -> bool {
79    bytes
80        .iter()
81        .take(BINARY_DETECTION_THRESHOLD)
82        .any(|b| *b == ZERO_BYTE)
83}
84
#[cfg(test)]
mod tests {
    use test_case::test_case;

    use crate::encoding::Encoding;
    use crate::text_data::{TextData, TextDataError};

    /// Embeds a fixture file at compile time; the path is given relative to
    /// the crate root (with a leading slash).
    macro_rules! fixture {
        ($path_from_manifest:literal) => {
            include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), $path_from_manifest))
        };
    }

    // Fixtures starting with a UTF-8 byte-order mark.
    const UTF8BOM_EMPTY_CONTENT: &[u8] = fixture!("/tests/data/UTF8BOM/empty");
    const UTF8BOM_ASCII_CONTENT: &[u8] = fixture!("/tests/data/UTF8BOM/ascii");
    const UTF8BOM_UNICODE_CONTENT: &[u8] = fixture!("/tests/data/UTF8BOM/unicode");

    // Fixtures encoded as big-endian UTF-16.
    const UTF16BE_EMPTY_CONTENT: &[u8] = fixture!("/tests/data/UTF16BE/empty");
    const UTF16BE_ASCII_CONTENT: &[u8] = fixture!("/tests/data/UTF16BE/ascii");
    const UTF16BE_UNICODE_CONTENT: &[u8] = fixture!("/tests/data/UTF16BE/unicode");

    // Fixtures encoded as little-endian UTF-16.
    const UTF16LE_EMPTY_CONTENT: &[u8] = fixture!("/tests/data/UTF16LE/empty");
    const UTF16LE_ASCII_CONTENT: &[u8] = fixture!("/tests/data/UTF16LE/ascii");
    const UTF16LE_UNICODE_CONTENT: &[u8] = fixture!("/tests/data/UTF16LE/unicode");

    /// Decodes `bytes` and checks both the resulting text and the detected
    /// encoding.
    fn assert_decodes(bytes: &[u8], content: &str, encoding: Encoding) {
        let subject = TextData::try_from(bytes).expect("Should pass");
        let expected = TextData {
            data: String::from(content),
            encoding,
        };

        assert_eq!(subject, expected);
    }

    #[test_case(""; "No content")]
    #[test_case("Hello!"; "ASCII chars")]
    #[test_case("Hello! 你好! 🌍"; "Unicode chars")]
    fn from_valid_utf8(input: &str) {
        assert_decodes(input.as_bytes(), input, Encoding::Utf8);
    }

    // Overlong encoding refers to using more bytes than necessary for a character [https://codedocs.org/what-is/utf-8#Overlong_encodings]
    #[test_case(b"\xC1\x80"; "Overlong encoding")]
    #[test_case(b"\x80\xA2"; "Invalid start byte")]
    #[test_case(b"\xE0\xA4"; "Incomplete sequence")]
    #[test_case(b"\xF4\x90\x80\x80"; "Code points above maximum")]
    fn from_invalid_utf8(bytes: &[u8]) {
        let outcome = TextData::try_from(bytes);

        assert!(matches!(outcome, Err(TextDataError::FromUtf8(_))));
    }

    #[test_case(UTF8BOM_EMPTY_CONTENT, ""; "No content")]
    #[test_case(UTF8BOM_ASCII_CONTENT, "Hello!"; "ASCII chars")]
    #[test_case(UTF8BOM_UNICODE_CONTENT, "Hello! 你好! 🌍"; "Unicode chars")]
    fn from_valid_utf8_with_bom(bytes: &[u8], content: &str) {
        assert_decodes(bytes, content, Encoding::Utf8Bom);
    }

    #[test_case(b"\xEF\xBB\xBF\xC1\x80"; "Overlong encoding")]
    #[test_case(b"\xEF\xBB\xBF\x80\xA2"; "Invalid start byte")]
    #[test_case(b"\xEF\xBB\xBF\xE0\xA4"; "Incomplete sequence")]
    #[test_case(b"\xEF\xBB\xBF\xF4\x90\x80\x80"; "Code points above maximum")]
    fn from_invalid_utf8_with_bom(bytes: &[u8]) {
        let outcome = TextData::try_from(bytes);

        assert!(matches!(outcome, Err(TextDataError::FromUtf8(_))));
    }

    #[test_case(UTF16BE_EMPTY_CONTENT, ""; "No content")]
    #[test_case(UTF16BE_ASCII_CONTENT, "Hello!"; "ASCII chars")]
    #[test_case(UTF16BE_UNICODE_CONTENT, "Hello! 你好! 🌍"; "Unicode chars")]
    fn from_valid_utf16be(bytes: &[u8], content: &str) {
        assert_decodes(bytes, content, Encoding::Utf16Be);
    }

    #[test_case(b"\xFE\xFF\xD8\xA5"; "Invalid high surrogate")]
    #[test_case(b"\xFE\xFF\xDC\xA5"; "Invalid low surrogate")]
    #[test_case(b"\xFE\xFF\xD8\x3D"; "Incomplete sequence")]
    #[test_case(b"\xFE\xFF\xDB\xFF\xFF\xFF"; "Code points above maximum")]
    fn from_invalid_utf16be(bytes: &[u8]) {
        let outcome = TextData::try_from(bytes);

        assert!(matches!(outcome, Err(TextDataError::FromUtf16(_))));
    }

    #[test_case(UTF16LE_EMPTY_CONTENT, ""; "No content")]
    #[test_case(UTF16LE_ASCII_CONTENT, "Hello!"; "ASCII chars")]
    #[test_case(UTF16LE_UNICODE_CONTENT, "Hello! 你好! 🌍"; "Unicode chars")]
    fn from_valid_utf16le(bytes: &[u8], content: &str) {
        assert_decodes(bytes, content, Encoding::Utf16Le);
    }

    #[test_case(b"\xFF\xFE\xA5\xD8"; "Invalid high surrogate")]
    #[test_case(b"\xFF\xFE\xA5\xDC"; "Invalid low surrogate")]
    #[test_case(b"\xFF\xFE\x3D\xD8"; "Incomplete sequence")]
    #[test_case(b"\xFF\xFE\xFF\xFF\xFF\xDB"; "Code points above maximum")]
    fn from_invalid_utf16le(bytes: &[u8]) {
        let outcome = TextData::try_from(bytes);

        assert!(matches!(outcome, Err(TextDataError::FromUtf16(_))));
    }

    // Content without a byte-order mark that contains a zero-byte must be
    // rejected as binary.
    #[test_case(b"\0"; "Single zero-byte")]
    #[test_case(b"\x12\x34\0"; "Trailing zero-byte")]
    #[test_case(b"\0\x12\x34"; "Zero-byte at start")]
    fn from_binary(bytes: &[u8]) {
        let outcome = TextData::try_from(bytes);

        assert!(matches!(outcome, Err(TextDataError::Binary)));
    }
}