1use crate::error::{ErrorKind, LoftyError, Result};
2use crate::macros::err;
3
4use std::io::Read;
5
6use byteorder::ReadBytesExt;
7
/// The text encodings understood by the decoding and encoding helpers in this module.
///
/// Each variant's discriminant is the byte value that [`TextEncoding::from_u8`] maps to it.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum TextEncoding {
    /// One byte per character; only code points up to U+00FF are representable.
    Latin1 = 0,
    /// UTF-16 whose byte order is announced by a byte order mark.
    UTF16 = 1,
    /// Big-endian UTF-16, written without a byte order mark.
    UTF16BE = 2,
    /// UTF-8.
    UTF8 = 3,
}
21
22impl TextEncoding {
23 pub fn from_u8(byte: u8) -> Option<Self> {
25 match byte {
26 0 => Some(Self::Latin1),
27 1 => Some(Self::UTF16),
28 2 => Some(Self::UTF16BE),
29 3 => Some(Self::UTF8),
30 _ => None,
31 }
32 }
33
34 pub(crate) fn verify_latin1(text: &str) -> bool {
35 text.chars().all(|c| c as u32 <= 255)
36 }
37
38 pub(crate) fn to_id3v23(self) -> Self {
42 match self {
43 Self::UTF8 | Self::UTF16BE => {
44 log::warn!(
45 "Text encoding {:?} is not supported in ID3v2.3, substituting with UTF-16",
46 self
47 );
48 Self::UTF16
49 },
50 _ => self,
51 }
52 }
53}
54
/// The outcome of a call to `decode_text`.
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct DecodeTextResult {
    /// The decoded text, with trailing nulls removed.
    pub(crate) content: String,
    /// The number of bytes the decoder reports having taken from the reader.
    pub(crate) bytes_read: usize,
    /// The byte order mark used for UTF-16 text; `[0, 0]` when none applies.
    pub(crate) bom: [u8; 2],
}
61
62impl DecodeTextResult {
63 pub(crate) fn text_or_none(self) -> Option<String> {
64 if self.content.is_empty() {
65 return None;
66 }
67
68 Some(self.content)
69 }
70}
71
72const EMPTY_DECODED_TEXT: DecodeTextResult = DecodeTextResult {
73 content: String::new(),
74 bytes_read: 0,
75 bom: [0, 0],
76};
77
78#[derive(Copy, Clone, Debug)]
86pub(crate) struct TextDecodeOptions {
87 pub encoding: TextEncoding,
88 pub terminated: bool,
89 pub bom: [u8; 2],
90}
91
92impl TextDecodeOptions {
93 pub(crate) fn new() -> Self {
94 Self::default()
95 }
96
97 pub(crate) fn encoding(mut self, encoding: TextEncoding) -> Self {
98 self.encoding = encoding;
99 self
100 }
101
102 pub(crate) fn terminated(mut self, terminated: bool) -> Self {
103 self.terminated = terminated;
104 self
105 }
106
107 pub(crate) fn bom(mut self, bom: [u8; 2]) -> Self {
108 self.bom = bom;
109 self
110 }
111}
112
113impl Default for TextDecodeOptions {
114 fn default() -> Self {
115 Self {
116 encoding: TextEncoding::UTF8,
117 terminated: false,
118 bom: [0, 0],
119 }
120 }
121}
122
123pub(crate) fn decode_text<R>(reader: &mut R, options: TextDecodeOptions) -> Result<DecodeTextResult>
124where
125 R: Read,
126{
127 let raw_bytes;
128 let bytes_read;
129
130 if options.terminated {
131 let (bytes, terminator_len) = read_to_terminator(reader, options.encoding);
132
133 if bytes.is_empty() {
134 return Ok(EMPTY_DECODED_TEXT);
135 }
136
137 bytes_read = bytes.len() + terminator_len;
138 raw_bytes = bytes;
139 } else {
140 let mut bytes = Vec::new();
141 reader.read_to_end(&mut bytes)?;
142
143 if bytes.is_empty() {
144 return Ok(EMPTY_DECODED_TEXT);
145 }
146
147 bytes_read = bytes.len();
148 raw_bytes = bytes;
149 }
150
151 let mut bom = [0, 0];
152 let read_string = match options.encoding {
153 TextEncoding::Latin1 => latin1_decode(&raw_bytes),
154 TextEncoding::UTF16 => {
155 if raw_bytes.len() < 2 {
156 err!(TextDecode("UTF-16 string has an invalid length (< 2)"));
157 }
158
159 if raw_bytes.len() % 2 != 0 {
160 err!(TextDecode("UTF-16 string has an odd length"));
161 }
162
163 let bom_to_check;
164 if options.bom == [0, 0] {
165 bom_to_check = [raw_bytes[0], raw_bytes[1]];
166 } else {
167 bom_to_check = options.bom;
168 }
169
170 match bom_to_check {
171 [0xFE, 0xFF] => {
172 bom = [0xFE, 0xFF];
173 utf16_decode_bytes(&raw_bytes[2..], u16::from_be_bytes)?
174 },
175 [0xFF, 0xFE] => {
176 bom = [0xFF, 0xFE];
177 utf16_decode_bytes(&raw_bytes[2..], u16::from_le_bytes)?
178 },
179 _ => err!(TextDecode("UTF-16 string has an invalid byte order mark")),
180 }
181 },
182 TextEncoding::UTF16BE => utf16_decode_bytes(raw_bytes.as_slice(), u16::from_be_bytes)?,
183 TextEncoding::UTF8 => utf8_decode(raw_bytes)
184 .map_err(|_| LoftyError::new(ErrorKind::TextDecode("Expected a UTF-8 string")))?,
185 };
186
187 if read_string.is_empty() {
188 return Ok(EMPTY_DECODED_TEXT);
189 }
190
191 Ok(DecodeTextResult {
192 content: read_string,
193 bytes_read,
194 bom,
195 })
196}
197
198pub(crate) fn read_to_terminator<R>(reader: &mut R, encoding: TextEncoding) -> (Vec<u8>, usize)
199where
200 R: Read,
201{
202 let mut text_bytes = Vec::new();
203 let mut terminator_len = 0;
204
205 match encoding {
206 TextEncoding::Latin1 | TextEncoding::UTF8 => {
207 while let Ok(byte) = reader.read_u8() {
208 if byte == 0 {
209 terminator_len = 1;
210 break;
211 }
212
213 text_bytes.push(byte)
214 }
215 },
216 TextEncoding::UTF16 | TextEncoding::UTF16BE => {
217 while let (Ok(b1), Ok(b2)) = (reader.read_u8(), reader.read_u8()) {
218 if b1 == 0 && b2 == 0 {
219 terminator_len = 2;
220 break;
221 }
222
223 text_bytes.push(b1);
224 text_bytes.push(b2)
225 }
226 },
227 }
228
229 (text_bytes, terminator_len)
230}
231
232pub(crate) fn latin1_decode(bytes: &[u8]) -> String {
233 let mut text = bytes.iter().map(|c| *c as char).collect::<String>();
234 trim_end_nulls(&mut text);
235 text
236}
237
238pub(crate) fn utf8_decode(bytes: Vec<u8>) -> Result<String> {
239 String::from_utf8(bytes)
240 .map(|mut text| {
241 trim_end_nulls(&mut text);
242 text
243 })
244 .map_err(Into::into)
245}
246
247pub(crate) fn utf8_decode_str(bytes: &[u8]) -> Result<&str> {
248 std::str::from_utf8(bytes)
249 .map(trim_end_nulls_str)
250 .map_err(Into::into)
251}
252
253pub(crate) fn utf16_decode(words: &[u16]) -> Result<String> {
254 String::from_utf16(words)
255 .map(|mut text| {
256 trim_end_nulls(&mut text);
257 text
258 })
259 .map_err(|_| LoftyError::new(ErrorKind::TextDecode("Given an invalid UTF-16 string")))
260}
261
262pub(crate) fn utf16_decode_bytes(bytes: &[u8], endianness: fn([u8; 2]) -> u16) -> Result<String> {
263 if bytes.is_empty() {
264 return Ok(String::new());
265 }
266
267 let unverified: Vec<u16> = bytes
268 .chunks_exact(2)
269 .filter_map(|c| match c {
273 [0xFF, 0xFE] | [0xFE, 0xFF] => None,
274 _ => Some(endianness(c.try_into().unwrap())), })
276 .collect();
277
278 utf16_decode(&unverified)
279}
280
281pub(crate) fn encode_text(text: &str, text_encoding: TextEncoding, terminated: bool) -> Vec<u8> {
282 match text_encoding {
283 TextEncoding::Latin1 => {
284 let mut out = text.chars().map(|c| c as u8).collect::<Vec<u8>>();
285
286 if terminated {
287 out.push(0)
288 }
289
290 out
291 },
292 TextEncoding::UTF16 => utf16_encode(text, u16::to_ne_bytes, true, terminated),
293 TextEncoding::UTF16BE => utf16_encode(text, u16::to_be_bytes, false, terminated),
294 TextEncoding::UTF8 => {
295 let mut out = text.as_bytes().to_vec();
296
297 if terminated {
298 out.push(0);
299 }
300
301 out
302 },
303 }
304}
305
/// Removes every trailing `'\0'` from `text` in place.
pub(crate) fn trim_end_nulls(text: &mut String) {
    // Truncating to the current length is a no-op, so no up-front check is needed.
    let kept = text.trim_end_matches('\0').len();
    text.truncate(kept);
}
312
/// Returns the part of `text` that precedes its trailing `'\0'` characters.
pub(crate) fn trim_end_nulls_str(text: &str) -> &str {
    let mut trimmed = text;
    while let Some(shorter) = trimmed.strip_suffix('\0') {
        trimmed = shorter;
    }

    trimmed
}
316
/// Encodes `text` as UTF-16 bytes, using `endianness` to serialize each code unit.
///
/// With `bom` set, the output begins with U+FEFF serialized through `endianness`. With
/// `terminated` set, it ends with two zero bytes.
fn utf16_encode(
    text: &str,
    endianness: fn(u16) -> [u8; 2],
    bom: bool,
    terminated: bool,
) -> Vec<u8> {
    let mark = if bom { Some(0xFEFF_u16) } else { None };

    let mut bytes: Vec<u8> = mark
        .into_iter()
        .chain(text.encode_utf16())
        .flat_map(endianness)
        .collect();

    if terminated {
        bytes.extend_from_slice(&[0, 0]);
    }

    bytes
}
339
#[cfg(test)]
mod tests {
    use crate::util::text::{TextDecodeOptions, TextEncoding};
    use std::io::Cursor;

    /// Sample text mixing ASCII with non-ASCII characters below U+0100.
    const TEST_STRING: &str = "l\u{00f8}ft\u{00a5}";

    /// `TEST_STRING` as big-endian UTF-16, no byte order mark.
    const BE_NO_BOM: &[u8] = &[0x00, 0x6C, 0x00, 0xF8, 0x00, 0x66, 0x00, 0x74, 0x00, 0xA5];

    /// `TEST_STRING` as big-endian UTF-16, preceded by a byte order mark.
    const BE_WITH_BOM: &[u8] = &[
        0xFE, 0xFF, 0x00, 0x6C, 0x00, 0xF8, 0x00, 0x66, 0x00, 0x74, 0x00, 0xA5,
    ];

    /// `TEST_STRING` as little-endian UTF-16, preceded by a byte order mark.
    const LE_WITH_BOM: &[u8] = &[
        0xFF, 0xFE, 0x6C, 0x00, 0xF8, 0x00, 0x66, 0x00, 0x74, 0x00, 0xA5, 0x00,
    ];

    #[test_log::test]
    fn text_decode() {
        // Raw big-endian code units followed by a null, decoded without going through a reader
        let be_units_with_null: [u8; 12] = [
            0x00, 0x6C, 0x00, 0xF8, 0x00, 0x66, 0x00, 0x74, 0x00, 0xA5, 0x00, 0x00,
        ];
        let from_raw_units =
            super::utf16_decode_bytes(&be_units_with_null, u16::from_be_bytes).unwrap();

        assert_eq!(from_raw_units, TEST_STRING.to_string());

        // The same text in both byte orders, each with a byte order mark and a trailing null
        let be_input: [u8; 14] = [
            0xFE, 0xFF, 0x00, 0x6C, 0x00, 0xF8, 0x00, 0x66, 0x00, 0x74, 0x00, 0xA5, 0x00, 0x00,
        ];
        let le_input: [u8; 14] = [
            0xFF, 0xFE, 0x6C, 0x00, 0xF8, 0x00, 0x66, 0x00, 0x74, 0x00, 0xA5, 0x00, 0x00, 0x00,
        ];

        let utf16_options = TextDecodeOptions::new().encoding(TextEncoding::UTF16);
        let from_be = super::decode_text(&mut Cursor::new(&be_input), utf16_options).unwrap();
        let from_le = super::decode_text(&mut Cursor::new(&le_input), utf16_options).unwrap();

        assert_eq!(from_be.content, from_le.content);
        assert_eq!(from_be.bytes_read, from_le.bytes_read);
        assert_eq!(from_be.content, TEST_STRING.to_string());

        let from_utf8 = super::decode_text(
            &mut TEST_STRING.as_bytes(),
            TextDecodeOptions::new().encoding(TextEncoding::UTF8),
        )
        .unwrap();

        assert_eq!(from_utf8.content, TEST_STRING.to_string());
    }

    #[test_log::test]
    fn text_encode() {
        let be_direct = super::utf16_encode(TEST_STRING, u16::to_be_bytes, true, false);

        assert_eq!(be_direct.as_slice(), BE_WITH_BOM);

        let be_without_bom = super::encode_text(TEST_STRING, TextEncoding::UTF16BE, false);
        let le_with_bom = super::utf16_encode(TEST_STRING, u16::to_le_bytes, true, false);
        let be_with_bom = super::utf16_encode(TEST_STRING, u16::to_be_bytes, true, false);

        assert_ne!(be_without_bom.as_slice(), le_with_bom.as_slice());
        assert_eq!(be_without_bom.as_slice(), BE_NO_BOM);
        assert_eq!(le_with_bom.as_slice(), LE_WITH_BOM);
        assert_eq!(be_with_bom.as_slice(), BE_WITH_BOM);

        let as_utf8 = super::encode_text(TEST_STRING, TextEncoding::UTF8, false);

        assert_eq!(as_utf8.as_slice(), TEST_STRING.as_bytes());
    }
}