use std::io::{Error, ErrorKind, Read};
use crate::{buffer::ByteBuffer, decoder::ByteStreamCharDecoder};
/// Selects the six payload bits (`xxxxxx`) of a UTF-8 continuation byte of the form `10xxxxxx`.
const CONT_MASK: u8 = 0x3F;
/**
A decoder for a byte stream which is using UTF-8 character encoding.
The decoder owns the `ByteBuffer` it reads from; decoded characters are obtained through this
type's `ByteStreamCharDecoder` implementation.
*/
pub struct Utf8Decoder<R> {
// Source of the raw bytes which are decoded into characters.
byte_buffer: ByteBuffer<R>,
}
impl<R: Read> ByteStreamCharDecoder<R> for Utf8Decoder<R> {
    /**
    Creates a decoder which pulls bytes from the given `Read` type and interprets them as UTF-8
    encoded characters.
    # Examples
    A hardcoded string (which Rust encodes using UTF-8) can be taken as a byte slice and passed
    to `wrap`, after which the characters can be read out one at a time.
    ```
    // Important: you need to `use` both Utf8Decoder and the trait ByteStreamCharDecoder!
    use sipp::decoder::{Utf8Decoder, ByteStreamCharDecoder};
    use std::io::Error;
    fn main() -> Result<(), Error> {
        let input = "Some input text!";
        let mut decoder = Utf8Decoder::wrap(input.as_bytes());
        while let Some(c) = decoder.decode_char()? {
            println!("Found character {}", c);
        }
        println!("No more input available!");
        Ok(())
    }
    ```
    */
    fn wrap(reader: R) -> Utf8Decoder<R> {
        let byte_buffer = ByteBuffer::wrap(reader);
        Utf8Decoder { byte_buffer }
    }

    /**
    Creates a decoder on top of an existing `ByteBuffer`, interpreting its bytes as UTF-8 encoded
    characters.
    # Examples
    When reading a UTF-8 file (or any other `Read` type) which may start with a BOM (byte-order
    mark), wrap the `File` in a `ByteBuffer` first, check for the BOM and skip past it, and only
    then hand the `ByteBuffer` to a `Utf8Decoder` to read the actual content one character at a
    time.
    ```
    // Important: you need to `use` both Utf8Decoder and the trait ByteStreamCharDecoder!
    use sipp::{buffer::ByteBuffer, decoder::{Utf8Decoder, ByteStreamCharDecoder}};
    use std::fs::File;
    use std::io::Error;
    // A UTF-8 BOM is three bytes: 0xEF 0xBB 0xBF
    const BOM_UTF8: &[u8] = &[0xEF, 0xBB, 0xBF];
    fn main() -> Result<(), Error> {
        let file = File::open("test_resources/xml_utf8_BOM.xml")?;
        let mut byte_buffer = ByteBuffer::wrap(file);
        let first_bytes = byte_buffer.peek()?;
        # let mut found_byte_order_mark = false;
        if first_bytes.len() > 2 && first_bytes[0..3] == *BOM_UTF8 {
            println!("Found input which starts with UTF-8 BOM!");
            # found_byte_order_mark = true;
            // Now read past the three bytes which make up the UTF-8 BOM.
            assert_eq!(byte_buffer.read_next()?, Some(0xEF));
            assert_eq!(byte_buffer.read_next()?, Some(0xBB));
            assert_eq!(byte_buffer.read_next()?, Some(0xBF));
        } else {
            println!("No BOM found!");
        }
        # assert!(found_byte_order_mark);
        // Now the BOM is out of the way, you can wrap the ByteBuffer with Utf8Decoder so that
        // it's ready to decode actual character content.
        let mut decoder = Utf8Decoder::wrap_buffer(byte_buffer);
        # assert_eq!(decoder.decode_char()?, Some('<'));
        Ok(())
    }
    ```
    */
    fn wrap_buffer(byte_buffer: ByteBuffer<R>) -> Self {
        Utf8Decoder { byte_buffer }
    }

    /**
    Decodes and returns the next character from the byte stream, or `None` once the stream has
    no bytes left.
    This method will not (must not) return Unicode surrogate codepoint characters.
    # Errors
    An `std::io::Error` is returned if the upcoming bytes do not form a valid character under
    UTF-8 encoding, or if reading from the underlying byte stream fails.
    # Examples
    Using a hardcoded short string as an example, you can see how `decode_char` works.
    ```
    // Important: you need to `use` both Utf8Decoder and the trait ByteStreamCharDecoder!
    use sipp::decoder::{Utf8Decoder, ByteStreamCharDecoder};
    use std::io::Error;
    fn main() -> Result<(), Error> {
        let input = "Short";
        let mut decoder = Utf8Decoder::wrap(input.as_bytes());
        // While there is content, Some(c) will be returned.
        assert_eq!(decoder.decode_char()?, Some('S'));
        assert_eq!(decoder.decode_char()?, Some('h'));
        assert_eq!(decoder.decode_char()?, Some('o'));
        assert_eq!(decoder.decode_char()?, Some('r'));
        assert_eq!(decoder.decode_char()?, Some('t'));
        // Once we've consumed all of the content, None will be returned.
        assert_eq!(decoder.decode_char()?, None);
        Ok(())
    }
    ```
    As an example of what might cause an error to be returned, see what happens if we ask
    `Utf8Decoder` to decode a byte sequence which is not a valid character.
    ```
    // Important: you need to `use` both Utf8Decoder and the trait ByteStreamCharDecoder!
    use sipp::decoder::{Utf8Decoder, ByteStreamCharDecoder};
    use std::io::Error;
    fn main() -> Result<(), Error> {
        // UTF-8 representation of "Hello" followed by Unicode high surrogate codepoint U+D800
        let bytes: &[u8] = &[0x48, 0x65, 0x6C, 0x6C, 0x6F, 0xED, 0xA0, 0x80];
        let mut decoder = Utf8Decoder::wrap(bytes);
        // Reading works fine while we have valid UTF-8 bytes to decode:
        assert_eq!(decoder.decode_char()?, Some('H'));
        assert_eq!(decoder.decode_char()?, Some('e'));
        assert_eq!(decoder.decode_char()?, Some('l'));
        assert_eq!(decoder.decode_char()?, Some('l'));
        assert_eq!(decoder.decode_char()?, Some('o'));
        // But once the decoder reaches the byte sequence for a high surrogate codepoint, which
        // is not a valid character, then an error will be returned.
        let invalid_read = decoder.decode_char();
        assert!(invalid_read.is_err());
        # let invalid_read = decoder.decode_char()?;
        # assert!(invalid_read.is_none());
        Ok(())
    }
    ```
    While you may be able to keep reading after an error has been returned, it is recommended
    that an error is considered to indicate an invalid or corrupt UTF-8 stream, and no further
    reading should be attempted.
    */
    fn decode_char(&mut self) -> Result<Option<char>, Error> {
        // This decode algorithm is based closely on the method next_code_point found in the Rust source code:
        // https://doc.rust-lang.org/src/core/str/validations.rs.html#36-70
        let lead_byte = match self.byte_buffer.read_next()? {
            Some(byte) => byte,
            // End of stream: there is nothing left to decode.
            None => return Ok(None),
        };

        let width = Self::determine_utf8_byte_count(lead_byte)?;
        if width == 1 {
            // A single-byte sequence is its own codepoint.
            return Self::u32_to_char(u32::from(lead_byte));
        }

        // Keep the low five bits of the start byte as the initial payload of the codepoint.
        let init = u32::from(lead_byte & 0x1F);
        match self.byte_buffer.read_next()? {
            Some(second_byte) => self.accumulate_multibyte_sequence(width, init, second_byte),
            None => Err(Error::new(
                ErrorKind::InvalidData,
                "Invalid UTF-8: second byte missing.",
            )),
        }
    }
}
impl<R: Read> Utf8Decoder<R> {
    /**
    Works out how many bytes make up the UTF-8 sequence introduced by `start_byte`.
    Based on table 3-7 in <https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf#G7404>
    # Errors
    Returns an `InvalidData` error if `start_byte` lies in one of the intervals (0x80 to 0xC1,
    or 0xF5 to 0xFF) which can never start a well-formed UTF-8 sequence.
    */
    fn determine_utf8_byte_count(start_byte: u8) -> Result<usize, Error> {
        match start_byte {
            0x00..=0x7F => Ok(1),
            0xC2..=0xDF => Ok(2),
            0xE0..=0xEF => Ok(3),
            0xF0..=0xF4 => Ok(4),
            0x80..=0xC1 => Err(Error::new(
                ErrorKind::InvalidData,
                "Invalid UTF-8: start byte from illegal 0x80 to 0xC1 interval.",
            )),
            0xF5..=0xFF => Err(Error::new(
                ErrorKind::InvalidData,
                "Invalid UTF-8: start byte from illegal 0xF5 to 0xFF interval.",
            )),
        }
    }

    /**
    Converts a decoded codepoint into a `char`.
    # Errors
    Returns an `InvalidData` error if `codepoint` is not a valid `char`, which is the case for
    surrogate codepoints and for values above U+10FFFF.
    */
    fn u32_to_char(codepoint: u32) -> Result<Option<char>, Error> {
        char::from_u32(codepoint).map(Some).ok_or_else(|| {
            Error::new(
                ErrorKind::InvalidData,
                "Invalid UTF-8: byte sequence maps to illegal codepoint.",
            )
        })
    }

    /// Reports whether `byte` has the `10xxxxxx` form required of every continuation byte.
    fn is_continuation_byte(byte: u8) -> bool {
        (byte & !CONT_MASK) == 0b1000_0000
    }

    /// Reads the next byte of the stream, failing with `missing_message` if the stream has ended.
    fn read_required_byte(&mut self, missing_message: &'static str) -> Result<u8, Error> {
        self.byte_buffer
            .read_next()?
            .ok_or_else(|| Error::new(ErrorKind::InvalidData, missing_message))
    }

    /**
    Finishes decoding a sequence of two, three or four bytes whose first two bytes have already
    been read from the stream.

    * `width` - total number of bytes in the sequence, as announced by the start byte. Any value
      other than 2 or 3 is treated as 4.
    * `init` - the low five bits of the start byte.
    * `second_byte` - the byte which followed the start byte.

    Every byte announced by `width` is read from the stream before any validation takes place,
    so a malformed sequence consumes exactly as many bytes as a well-formed one would.
    # Errors
    Returns an `InvalidData` error if the stream ends before the sequence is complete, if any
    byte after the start byte is not a continuation byte (0x80 to 0xBF), if the sequence is an
    overlong encoding of a codepoint which fits in fewer bytes, or if the decoded codepoint is
    not a valid `char`. Errors from reading the underlying stream are passed on unchanged.
    */
    fn accumulate_multibyte_sequence(
        &mut self,
        width: usize,
        init: u32,
        second_byte: u8,
    ) -> Result<Option<char>, Error> {
        let mut trailing = [second_byte, 0, 0];
        let trailing_len = match width {
            2 => 1,
            3 => {
                trailing[1] = self.read_required_byte("Invalid UTF-8: third byte missing.")?;
                2
            }
            _ => {
                trailing[1] = self.read_required_byte("Invalid UTF-8: third byte missing.")?;
                trailing[2] = self.read_required_byte("Invalid UTF-8: fourth byte missing.")?;
                3
            }
        };
        let trailing = &trailing[..trailing_len];

        if !trailing.iter().all(|&byte| Self::is_continuation_byte(byte)) {
            return Err(Error::new(
                ErrorKind::InvalidData,
                "Invalid UTF-8: continuation byte outside 0x80 to 0xBF interval.",
            ));
        }

        // The start byte of a four-byte sequence (11110xxx) still carries its marker bit 0x10 in
        // `init`; only its low three bits are payload.
        let lead_bits = if trailing_len == 3 { init & 0x07 } else { init };
        let codepoint = trailing
            .iter()
            .fold(lead_bits, |acc, &byte| (acc << 6) | u32::from(byte & CONT_MASK));

        // A codepoint must use the shortest possible encoding, so anything below these
        // thresholds would have fitted in fewer bytes and is an overlong (ill-formed) encoding.
        let smallest_allowed: u32 = match trailing_len {
            1 => 0x80,
            2 => 0x800,
            _ => 0x1_0000,
        };
        if codepoint < smallest_allowed {
            return Err(Error::new(
                ErrorKind::InvalidData,
                "Invalid UTF-8: overlong encoding of codepoint.",
            ));
        }

        Self::u32_to_char(codepoint)
    }
}
#[cfg(test)]
mod tests {
    // Note this useful idiom: importing names from outer (for mod tests) scope.
    use super::*;

    /// Decodes the UTF-8 bytes of `original` one character at a time and checks that the
    /// collected characters reproduce `original` exactly.
    fn assert_round_trip(original: &str) -> Result<(), Error> {
        let mut decoder = Utf8Decoder::wrap(original.as_bytes());
        let mut decoded = String::new();
        while let Some(c) = decoder.decode_char()? {
            decoded.push(c);
        }
        assert_eq!(decoded, original);
        Ok(())
    }

    /// Checks that the first attempt to decode a character from `bytes` returns an error.
    fn assert_first_decode_fails(bytes: &[u8]) {
        let mut decoder = Utf8Decoder::wrap(bytes);
        assert!(decoder.decode_char().is_err());
    }

    #[test]
    fn test_utf8_empty() -> Result<(), Error> {
        assert_round_trip("")
    }

    #[test]
    fn test_ascii() -> Result<(), Error> {
        assert_round_trip("simple ASCII string")
    }

    #[test]
    fn test_utf8_two_bytes() -> Result<(), Error> {
        assert_round_trip(
            "Swedish: Svenska är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige.
Ukrainian: Украї́нська мо́ва - національна мова українців. Належить до східнослов'янської групи слов'янських мов, що входять до індоєвропейської мовної сім'ї, поряд з романськими, германськими, кельтськими, грецькою, албанською, вірменською та найближче спорідненими зі слов'янськими балтійськими мовами.
Greek: Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια και αποτελεί το μοναδικό μέλος του ελληνικού κλάδου, ενώ είναι η επίσημη γλώσσα της Ελλάδας και της Κύπρου. Ανήκει επίσης στο βαλκανικό γλωσσικό δεσμό.",
        )
    }

    #[test]
    fn test_utf8_three_bytes() -> Result<(), Error> {
        assert_round_trip(
            "Japanese: 日本語 は、日本国内や、かつての日本領だった国、そして国外移民や移住者を含む日本人同士の間で使用されている言語。
Mathematical symbols: ∀ x ∃ ∅ ∌ x",
        )
    }

    #[test]
    fn test_utf8_four_bytes() -> Result<(), Error> {
        assert_round_trip("Emoticons: 😀 😄 😌 🙄")
    }

    #[test]
    fn test_utf8_right_to_left_script() -> Result<(), Error> {
        assert_round_trip(
            "Arabic: هذه المقالة عن اللغة العربية. لمعانٍ أخرى، طالع عربية (توضيح).
Uyghur: ھەممە ئادەم زانىدىنلا ئەركىن، ئىززەت-ھۆرمەت ۋە ھوقۇقتا باپباراۋەر بولۇپ تۇغۇلغان.",
        )
    }

    #[test]
    fn test_utf8_combining_diacritical_marks() -> Result<(), Error> {
        assert_round_trip("c\u{30C} = č ≍ č ≠ ć ≍ ć = c\u{301}")
    }

    #[test]
    fn invalid_unicode_codepoint_high_surrogate_first() -> Result<(), Error> {
        // UTF-8 representation of first high surrogate codepoint U+D800
        assert_first_decode_fails(&[0xED, 0xA0, 0x80]);
        Ok(())
    }

    #[test]
    fn invalid_unicode_codepoint_high_surrogate_last() -> Result<(), Error> {
        // UTF-8 representation of last high surrogate codepoint U+DBFF
        assert_first_decode_fails(&[0xED, 0xAF, 0xBF]);
        Ok(())
    }

    #[test]
    fn invalid_unicode_codepoint_low_surrogate_first() -> Result<(), Error> {
        // UTF-8 representation of first low surrogate codepoint U+DC00
        assert_first_decode_fails(&[0xED, 0xB0, 0x80]);
        Ok(())
    }

    #[test]
    fn invalid_unicode_codepoint_low_surrogate_last() -> Result<(), Error> {
        // UTF-8 representation of last low surrogate codepoint U+DFFF
        assert_first_decode_fails(&[0xED, 0xBF, 0xBF]);
        Ok(())
    }
}