lofty/util/
text.rs

1use crate::error::{ErrorKind, LoftyError, Result};
2use crate::macros::err;
3
4use std::io::Read;
5
6use byteorder::ReadBytesExt;
7
/// The text encodings that can be used in ID3v2 frames
///
/// Each variant's discriminant is the byte value accepted by [`TextEncoding::from_u8`].
#[repr(u8)]
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum TextEncoding {
	/// ISO-8859-1 (Latin-1)
	Latin1 = 0,
	/// UTF-16, prefixed with a byte order mark
	UTF16 = 1,
	/// Big endian UTF-16
	UTF16BE = 2,
	/// UTF-8
	UTF8 = 3,
}
21
22impl TextEncoding {
23	/// Get a `TextEncoding` from a u8, must be 0-3 inclusive
24	pub fn from_u8(byte: u8) -> Option<Self> {
25		match byte {
26			0 => Some(Self::Latin1),
27			1 => Some(Self::UTF16),
28			2 => Some(Self::UTF16BE),
29			3 => Some(Self::UTF8),
30			_ => None,
31		}
32	}
33
34	pub(crate) fn verify_latin1(text: &str) -> bool {
35		text.chars().all(|c| c as u32 <= 255)
36	}
37
38	/// ID3v2.4 introduced two new text encodings.
39	///
40	/// When writing ID3v2.3, we just substitute with UTF-16.
41	pub(crate) fn to_id3v23(self) -> Self {
42		match self {
43			Self::UTF8 | Self::UTF16BE => {
44				log::warn!(
45					"Text encoding {:?} is not supported in ID3v2.3, substituting with UTF-16",
46					self
47				);
48				Self::UTF16
49			},
50			_ => self,
51		}
52	}
53}
54
/// The outcome of decoding a piece of text
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct DecodeTextResult {
	/// The decoded text
	pub(crate) content: String,
	/// The number of bytes reported as read while decoding
	pub(crate) bytes_read: usize,
	/// The byte order mark used for UTF-16 text, `[0, 0]` when there is none
	pub(crate) bom: [u8; 2],
}
61
62impl DecodeTextResult {
63	pub(crate) fn text_or_none(self) -> Option<String> {
64		if self.content.is_empty() {
65			return None;
66		}
67
68		Some(self.content)
69	}
70}
71
72const EMPTY_DECODED_TEXT: DecodeTextResult = DecodeTextResult {
73	content: String::new(),
74	bytes_read: 0,
75	bom: [0, 0],
76};
77
78/// Specify how to decode the provided text
79///
80/// By default, this will:
81///
82/// * Use [`TextEncoding::UTF8`] as the encoding
83/// * Not expect the text to be null terminated
84/// * Have no byte order mark
85#[derive(Copy, Clone, Debug)]
86pub(crate) struct TextDecodeOptions {
87	pub encoding: TextEncoding,
88	pub terminated: bool,
89	pub bom: [u8; 2],
90}
91
92impl TextDecodeOptions {
93	pub(crate) fn new() -> Self {
94		Self::default()
95	}
96
97	pub(crate) fn encoding(mut self, encoding: TextEncoding) -> Self {
98		self.encoding = encoding;
99		self
100	}
101
102	pub(crate) fn terminated(mut self, terminated: bool) -> Self {
103		self.terminated = terminated;
104		self
105	}
106
107	pub(crate) fn bom(mut self, bom: [u8; 2]) -> Self {
108		self.bom = bom;
109		self
110	}
111}
112
113impl Default for TextDecodeOptions {
114	fn default() -> Self {
115		Self {
116			encoding: TextEncoding::UTF8,
117			terminated: false,
118			bom: [0, 0],
119		}
120	}
121}
122
123pub(crate) fn decode_text<R>(reader: &mut R, options: TextDecodeOptions) -> Result<DecodeTextResult>
124where
125	R: Read,
126{
127	let raw_bytes;
128	let bytes_read;
129
130	if options.terminated {
131		let (bytes, terminator_len) = read_to_terminator(reader, options.encoding);
132
133		if bytes.is_empty() {
134			return Ok(EMPTY_DECODED_TEXT);
135		}
136
137		bytes_read = bytes.len() + terminator_len;
138		raw_bytes = bytes;
139	} else {
140		let mut bytes = Vec::new();
141		reader.read_to_end(&mut bytes)?;
142
143		if bytes.is_empty() {
144			return Ok(EMPTY_DECODED_TEXT);
145		}
146
147		bytes_read = bytes.len();
148		raw_bytes = bytes;
149	}
150
151	let mut bom = [0, 0];
152	let read_string = match options.encoding {
153		TextEncoding::Latin1 => latin1_decode(&raw_bytes),
154		TextEncoding::UTF16 => {
155			if raw_bytes.len() < 2 {
156				err!(TextDecode("UTF-16 string has an invalid length (< 2)"));
157			}
158
159			if raw_bytes.len() % 2 != 0 {
160				err!(TextDecode("UTF-16 string has an odd length"));
161			}
162
163			let bom_to_check;
164			if options.bom == [0, 0] {
165				bom_to_check = [raw_bytes[0], raw_bytes[1]];
166			} else {
167				bom_to_check = options.bom;
168			}
169
170			match bom_to_check {
171				[0xFE, 0xFF] => {
172					bom = [0xFE, 0xFF];
173					utf16_decode_bytes(&raw_bytes[2..], u16::from_be_bytes)?
174				},
175				[0xFF, 0xFE] => {
176					bom = [0xFF, 0xFE];
177					utf16_decode_bytes(&raw_bytes[2..], u16::from_le_bytes)?
178				},
179				_ => err!(TextDecode("UTF-16 string has an invalid byte order mark")),
180			}
181		},
182		TextEncoding::UTF16BE => utf16_decode_bytes(raw_bytes.as_slice(), u16::from_be_bytes)?,
183		TextEncoding::UTF8 => utf8_decode(raw_bytes)
184			.map_err(|_| LoftyError::new(ErrorKind::TextDecode("Expected a UTF-8 string")))?,
185	};
186
187	if read_string.is_empty() {
188		return Ok(EMPTY_DECODED_TEXT);
189	}
190
191	Ok(DecodeTextResult {
192		content: read_string,
193		bytes_read,
194		bom,
195	})
196}
197
198pub(crate) fn read_to_terminator<R>(reader: &mut R, encoding: TextEncoding) -> (Vec<u8>, usize)
199where
200	R: Read,
201{
202	let mut text_bytes = Vec::new();
203	let mut terminator_len = 0;
204
205	match encoding {
206		TextEncoding::Latin1 | TextEncoding::UTF8 => {
207			while let Ok(byte) = reader.read_u8() {
208				if byte == 0 {
209					terminator_len = 1;
210					break;
211				}
212
213				text_bytes.push(byte)
214			}
215		},
216		TextEncoding::UTF16 | TextEncoding::UTF16BE => {
217			while let (Ok(b1), Ok(b2)) = (reader.read_u8(), reader.read_u8()) {
218				if b1 == 0 && b2 == 0 {
219					terminator_len = 2;
220					break;
221				}
222
223				text_bytes.push(b1);
224				text_bytes.push(b2)
225			}
226		},
227	}
228
229	(text_bytes, terminator_len)
230}
231
/// Decode ISO-8859-1 (Latin-1) bytes, discarding any trailing nulls
///
/// Each byte becomes the character with the same code point, so this cannot fail.
pub(crate) fn latin1_decode(bytes: &[u8]) -> String {
	// A trailing null byte always decodes to a trailing '\0', so trimming the bytes
	// up front gives the same result as trimming the decoded string.
	let text_len = bytes
		.iter()
		.rposition(|&byte| byte != 0)
		.map_or(0, |last| last + 1);

	bytes[..text_len].iter().copied().map(char::from).collect()
}
237
238pub(crate) fn utf8_decode(bytes: Vec<u8>) -> Result<String> {
239	String::from_utf8(bytes)
240		.map(|mut text| {
241			trim_end_nulls(&mut text);
242			text
243		})
244		.map_err(Into::into)
245}
246
247pub(crate) fn utf8_decode_str(bytes: &[u8]) -> Result<&str> {
248	std::str::from_utf8(bytes)
249		.map(trim_end_nulls_str)
250		.map_err(Into::into)
251}
252
253pub(crate) fn utf16_decode(words: &[u16]) -> Result<String> {
254	String::from_utf16(words)
255		.map(|mut text| {
256			trim_end_nulls(&mut text);
257			text
258		})
259		.map_err(|_| LoftyError::new(ErrorKind::TextDecode("Given an invalid UTF-16 string")))
260}
261
262pub(crate) fn utf16_decode_bytes(bytes: &[u8], endianness: fn([u8; 2]) -> u16) -> Result<String> {
263	if bytes.is_empty() {
264		return Ok(String::new());
265	}
266
267	let unverified: Vec<u16> = bytes
268		.chunks_exact(2)
269		// In ID3v2, it is possible to have multiple UTF-16 strings separated by null.
270		// This also makes it possible for us to encounter multiple BOMs in a single string.
271		// We must filter them out.
272		.filter_map(|c| match c {
273			[0xFF, 0xFE] | [0xFE, 0xFF] => None,
274			_ => Some(endianness(c.try_into().unwrap())), // Infallible
275		})
276		.collect();
277
278	utf16_decode(&unverified)
279}
280
281pub(crate) fn encode_text(text: &str, text_encoding: TextEncoding, terminated: bool) -> Vec<u8> {
282	match text_encoding {
283		TextEncoding::Latin1 => {
284			let mut out = text.chars().map(|c| c as u8).collect::<Vec<u8>>();
285
286			if terminated {
287				out.push(0)
288			}
289
290			out
291		},
292		TextEncoding::UTF16 => utf16_encode(text, u16::to_ne_bytes, true, terminated),
293		TextEncoding::UTF16BE => utf16_encode(text, u16::to_be_bytes, false, terminated),
294		TextEncoding::UTF8 => {
295			let mut out = text.as_bytes().to_vec();
296
297			if terminated {
298				out.push(0);
299			}
300
301			out
302		},
303	}
304}
305
/// Remove every trailing null character from `text`, in place
pub(crate) fn trim_end_nulls(text: &mut String) {
	while text.ends_with('\0') {
		text.pop();
	}
}
312
/// Get `text` without any of its trailing null characters
pub(crate) fn trim_end_nulls_str(text: &str) -> &str {
	let mut trimmed = text;
	while let Some(shorter) = trimmed.strip_suffix('\0') {
		trimmed = shorter;
	}

	trimmed
}
316
/// Encode `text` as UTF-16, using `endianness` to turn each code unit into a pair of bytes
///
/// * `bom`: Write a byte order mark (U+FEFF, converted with `endianness`) before the text
/// * `terminated`: Write two null bytes after the text
fn utf16_encode(
	text: &str,
	endianness: fn(u16) -> [u8; 2],
	bom: bool,
	terminated: bool,
) -> Vec<u8> {
	let mark = if bom { Some(0xFEFF_u16) } else { None };

	let mut encoded: Vec<u8> = mark
		.into_iter()
		.chain(text.encode_utf16())
		.flat_map(endianness)
		.collect();

	if terminated {
		encoded.extend_from_slice(&[0, 0]);
	}

	encoded
}
339
#[cfg(test)]
mod tests {
	use crate::util::text::{TextDecodeOptions, TextEncoding};
	use std::io::Cursor;

	/// Only contains characters below U+0100, so each one is a single UTF-16 code unit
	const TEST_STRING: &str = "l\u{00f8}ft\u{00a5}";

	#[test_log::test]
	fn text_decode() {
		// Big endian bytes with no BOM, the endianness is given explicitly
		let bomless_be: &[u8] = &[
			0x00, 0x6C, 0x00, 0xF8, 0x00, 0x66, 0x00, 0x74, 0x00, 0xA5, 0x00, 0x00,
		];
		let utf16_decode = super::utf16_decode_bytes(bomless_be, u16::from_be_bytes).unwrap();

		assert_eq!(utf16_decode, TEST_STRING.to_string());

		// The same text in both byte orders, each announced by its BOM
		let be_with_bom: &[u8] = &[
			0xFE, 0xFF, 0x00, 0x6C, 0x00, 0xF8, 0x00, 0x66, 0x00, 0x74, 0x00, 0xA5, 0x00, 0x00,
		];
		let le_with_bom: &[u8] = &[
			0xFF, 0xFE, 0x6C, 0x00, 0xF8, 0x00, 0x66, 0x00, 0x74, 0x00, 0xA5, 0x00, 0x00, 0x00,
		];
		let utf16_options = TextDecodeOptions::new().encoding(TextEncoding::UTF16);

		let be_utf16_decode =
			super::decode_text(&mut Cursor::new(be_with_bom), utf16_options).unwrap();
		let le_utf16_decode =
			super::decode_text(&mut Cursor::new(le_with_bom), utf16_options).unwrap();

		assert_eq!(be_utf16_decode.content, le_utf16_decode.content);
		assert_eq!(be_utf16_decode.bytes_read, le_utf16_decode.bytes_read);
		assert_eq!(be_utf16_decode.content, TEST_STRING.to_string());

		// UTF-8 needs no conversion at all
		let utf8_options = TextDecodeOptions::new().encoding(TextEncoding::UTF8);
		let utf8_decode = super::decode_text(&mut TEST_STRING.as_bytes(), utf8_options).unwrap();

		assert_eq!(utf8_decode.content, TEST_STRING.to_string());
	}

	#[test_log::test]
	fn text_encode() {
		let expected_be_with_bom: &[u8] = &[
			0xFE, 0xFF, 0x00, 0x6C, 0x00, 0xF8, 0x00, 0x66, 0x00, 0x74, 0x00, 0xA5,
		];
		let expected_le_with_bom: &[u8] = &[
			0xFF, 0xFE, 0x6C, 0x00, 0xF8, 0x00, 0x66, 0x00, 0x74, 0x00, 0xA5, 0x00,
		];
		let expected_be_bomless: &[u8] =
			&[0x00, 0x6C, 0x00, 0xF8, 0x00, 0x66, 0x00, 0x74, 0x00, 0xA5];

		// Big endian, with a BOM requested
		let utf16_encode = super::utf16_encode(TEST_STRING, u16::to_be_bytes, true, false);

		assert_eq!(utf16_encode.as_slice(), expected_be_with_bom);

		// Compare the BOM-less `TextEncoding::UTF16BE` against both byte orders with a BOM
		let be_utf16_encode = super::encode_text(TEST_STRING, TextEncoding::UTF16BE, false);
		let le_utf16_encode = super::utf16_encode(TEST_STRING, u16::to_le_bytes, true, false);
		let be_utf16_encode_bom = super::utf16_encode(TEST_STRING, u16::to_be_bytes, true, false);

		assert_ne!(be_utf16_encode.as_slice(), le_utf16_encode.as_slice());
		// TextEncoding::UTF16BE has no BOM
		assert_eq!(be_utf16_encode.as_slice(), expected_be_bomless);
		assert_eq!(le_utf16_encode.as_slice(), expected_le_with_bom);
		assert_eq!(be_utf16_encode_bom.as_slice(), expected_be_with_bom);

		let utf8_encode = super::encode_text(TEST_STRING, TextEncoding::UTF8, false);

		assert_eq!(utf8_encode.as_slice(), TEST_STRING.as_bytes());
	}
}