/// The Unicode replacement character U+FFFD as a UTF-32 value.
pub const REPLACE_UTF32:u32 = 0xFFFD;
/// First byte of the three-byte UTF-8 form emitted for replaced input.
pub const REPLACE_PART1:u8 = 0xEFu8;
/// Second byte of the three-byte UTF-8 form emitted for replaced input.
pub const REPLACE_PART2:u8 = 0xBFu8;
/// Third byte of the three-byte UTF-8 form emitted for replaced input.
pub const REPLACE_PART3:u8 = 0xBDu8;
// Marker bits added to the lead byte of a two-byte sequence (110x_xxxx).
const TYPE2_PREFIX:u32 = 0b1100_0000u32;
// Marker bits added to the lead byte of a three-byte sequence (1110_xxxx).
const TYPE3_PREFIX:u32 = 0b1110_0000u32;
// Marker bits added to the lead byte of a four-byte sequence (1111_0xxx).
const TYPE4_PREFIX:u32 = 0b1111_0000u32;
// Marker bits added to every non-lead byte (10xx_xxxx).
const BYTE2_PREFIX:u32 = 0b1000_0000u32;
// Character dropped by the BOM/CR filter when it is the first one of the stream.
const BOM:char = '\u{FEFF}';
// Carriage return; the BOM/CR filter emits `NL` in its place.
const CR:char = '\r';
// Line feed; the BOM/CR filter swallows it when it directly follows a `CR`.
const NL:char = '\n';
// Mask selecting bits 6..=11 of a code point (payload of the next-to-last byte).
const SIX_ONES_SHIFTED:u32 = 0b111111000000u32;
// Mask selecting the low six bits of a code point (payload of the last byte).
const SIX_ONES:u32 = 0b111111u32;
/// Error value returned by the slice-based parser functions when they cannot
/// produce another output item from the data they have.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[cfg_attr(nightly, warn(rustdoc::missing_doc_code_examples))]
pub enum MoreEnum {
// In this file the payload is 0 when the parser is in last-buffer mode and
// the input is exhausted, and 1024 or 4096 otherwise.
// NOTE(review): presumably a suggested amount of further input — confirm.
More(u32),
}
/// Result of `classify_utf32`: the UTF-8 bytes for one UTF-32 code point,
/// tagged by the length of the encoding.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[cfg_attr(nightly, warn(rustdoc::missing_doc_code_examples))]
pub enum Utf8TypeEnum {
// One-byte encoding (code points below 0x80).
Type1(u8),
// Two-byte encoding (code points below 0x800).
Type2((u8,u8)),
// Three-byte encoding (code points below 0x10000).
Type3((u8,u8,u8)),
// Four-byte encoding (code points below 0x110000).
Type4((u8,u8,u8,u8)),
// Replacement bytes; `classify_utf32` returns this for surrogates, for
// values of 0x110000 and above, and for U+FFFD itself.
Type0((u8,u8,u8)),
}
/// Outcome of one `utf8_decode` attempt.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[cfg_attr(nightly, warn(rustdoc::missing_doc_code_examples))]
pub enum Utf8EndEnum {
// The buffered bytes do not form an acceptable sequence. The decoder in this
// file produces payloads 1, 2 and 3; the callers in this file ignore the payload.
BadDecode(u32),
// A code point was decoded; the payload is its UTF-32 value.
Finish(u32),
// The decoder ran out of buffered bytes before it could decide.
TypeUnknown,
}
/// Converts one UTF-32 value into its UTF-8 bytes.
///
/// Returns `Type1` .. `Type4` according to the encoded length. Surrogates
/// (`0xD800..0xE000`), values of `0x110000` and above, and U+FFFD itself all
/// yield `Type0` carrying the three replacement bytes.
#[inline]
pub fn classify_utf32(code: u32) -> Utf8TypeEnum {
    if code < 0x80u32 {
        return Utf8TypeEnum::Type1(code as u8);
    }

    // Every multi-byte form ends with the low six bits in a continuation byte.
    let last: u8 = ((code & SIX_ONES) + BYTE2_PREFIX) as u8;
    if code < 0x800u32 {
        let lead: u8 = ((code >> 6) + TYPE2_PREFIX) as u8;
        return Utf8TypeEnum::Type2((lead, last));
    }

    let is_surrogate = (0xD800u32..0xE000u32).contains(&code);
    if is_surrogate || code == REPLACE_UTF32 || code >= 0x110000u32 {
        return Utf8TypeEnum::Type0((REPLACE_PART1, REPLACE_PART2, REPLACE_PART3));
    }

    // Bits 6..=11 go into the next-to-last byte of both remaining forms.
    let next_to_last: u8 = (((code & SIX_ONES_SHIFTED) >> 6) + BYTE2_PREFIX) as u8;
    if code < 0x10000u32 {
        let lead: u8 = ((code >> 12) + TYPE3_PREFIX) as u8;
        Utf8TypeEnum::Type3((lead, next_to_last, last))
    } else {
        let lead: u8 = ((code >> 18) + TYPE4_PREFIX) as u8;
        let second: u8 = (((code >> 12) & SIX_ONES) + BYTE2_PREFIX) as u8;
        Utf8TypeEnum::Type4((lead, second, next_to_last, last))
    }
}
use core::iter::Iterator;
use crate::utf8conv::buf::EightBytes;
/// Handles the second and final byte of a two-byte sequence.
///
/// `arg` carries the payload bits already taken from the lead byte. The front
/// byte of `mybuf` is removed only when it lies in `0x80..=0xBF`.
///
/// Returns `Finish` with the assembled value, `BadDecode(1)` when the front
/// byte is outside that range, or `TypeUnknown` when `mybuf` is empty.
#[inline]
fn byte2_action9(mybuf: &mut EightBytes, arg: u32) -> Utf8EndEnum {
    let second = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };
    if !(0x80..=0xBF).contains(&second) {
        return Utf8EndEnum::BadDecode(1);
    }
    mybuf.pop_front();
    Utf8EndEnum::Finish((arg << 6) + (second & 0x3F))
}
/// Handles the second byte of a three-byte sequence whose second byte may be
/// anything in `0x80..=0xBF` (`utf8_decode` uses it for lead bytes `0xE1..=0xEC`).
///
/// On success the byte is removed from `mybuf` and decoding continues in
/// `byte3_action17`. Returns `BadDecode(1)` when the front byte is out of
/// range, or `TypeUnknown` when `mybuf` is empty.
fn byte2_action10(mybuf: &mut EightBytes, arg: u32) -> Utf8EndEnum {
    let second = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };
    if !(0x80..=0xBF).contains(&second) {
        return Utf8EndEnum::BadDecode(1);
    }
    mybuf.pop_front();
    byte3_action17(mybuf, (arg << 6) + (second & 0x3F))
}
/// Handles the second byte of a three-byte sequence on the path that must
/// later reject U+FFFD (`utf8_decode` uses it for lead bytes `0xEE` and `0xEF`).
///
/// Accepts a front byte in `0x80..=0xBF`, removes it from `mybuf` and
/// continues in `byte3_action20`. Returns `BadDecode(1)` when the front byte
/// is out of range, or `TypeUnknown` when `mybuf` is empty.
fn byte2_action11(mybuf: &mut EightBytes, arg: u32) -> Utf8EndEnum {
    let second = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };
    if !(0x80..=0xBF).contains(&second) {
        return Utf8EndEnum::BadDecode(1);
    }
    mybuf.pop_front();
    byte3_action20(mybuf, (arg << 6) + (second & 0x3F))
}
/// Handles the second byte of a four-byte sequence whose second byte may be
/// anything in `0x80..=0xBF` (`utf8_decode` uses it for lead bytes `0xF1..=0xF3`).
///
/// On success the byte is removed from `mybuf` and decoding continues in
/// `byte3_action21`. Returns `BadDecode(1)` when the front byte is out of
/// range, or `TypeUnknown` when `mybuf` is empty.
fn byte2_action12(mybuf: &mut EightBytes, arg: u32) -> Utf8EndEnum {
    let second = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };
    if !(0x80..=0xBF).contains(&second) {
        return Utf8EndEnum::BadDecode(1);
    }
    mybuf.pop_front();
    byte3_action21(mybuf, (arg << 6) + (second & 0x3F))
}
/// Handles the second byte of a four-byte sequence with the narrowed range
/// `0x80..=0x8F` (`utf8_decode` uses it for lead byte `0xF4`).
///
/// On success the byte is removed from `mybuf` and decoding continues in
/// `byte3_action21`. Returns `BadDecode(1)` when the front byte is out of
/// range, or `TypeUnknown` when `mybuf` is empty.
fn byte2_action13(mybuf: &mut EightBytes, arg: u32) -> Utf8EndEnum {
    let second = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };
    if !(0x80..=0x8F).contains(&second) {
        return Utf8EndEnum::BadDecode(1);
    }
    mybuf.pop_front();
    byte3_action21(mybuf, (arg << 6) + (second & 0x3F))
}
/// Handles the second byte of a three-byte sequence with the narrowed range
/// `0xA0..=0xBF` (`utf8_decode` uses it for lead byte `0xE0`).
///
/// On success the byte is removed from `mybuf` and decoding continues in
/// `byte3_action17`. Returns `BadDecode(1)` when the front byte is out of
/// range, or `TypeUnknown` when `mybuf` is empty.
#[inline]
fn byte2_action14(mybuf: &mut EightBytes, arg: u32) -> Utf8EndEnum {
    let second = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };
    if !(0xA0..=0xBF).contains(&second) {
        return Utf8EndEnum::BadDecode(1);
    }
    mybuf.pop_front();
    byte3_action17(mybuf, (arg << 6) + (second & 0x3F))
}
/// Handles the second byte of a three-byte sequence with the narrowed range
/// `0x80..=0x9F` (`utf8_decode` uses it for lead byte `0xED`).
///
/// On success the byte is removed from `mybuf` and decoding continues in
/// `byte3_action17`. Returns `BadDecode(1)` when the front byte is out of
/// range, or `TypeUnknown` when `mybuf` is empty.
fn byte2_action15(mybuf: &mut EightBytes, arg: u32) -> Utf8EndEnum {
    let second = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };
    if !(0x80..=0x9F).contains(&second) {
        return Utf8EndEnum::BadDecode(1);
    }
    mybuf.pop_front();
    byte3_action17(mybuf, (arg << 6) + (second & 0x3F))
}
/// Handles the second byte of a four-byte sequence with the narrowed range
/// `0x90..=0xBF` (`utf8_decode` uses it for lead byte `0xF0`).
///
/// On success the byte is removed from `mybuf` and decoding continues in
/// `byte3_action21`. Returns `BadDecode(1)` when the front byte is out of
/// range, or `TypeUnknown` when `mybuf` is empty.
fn byte2_action16(mybuf: &mut EightBytes, arg: u32) -> Utf8EndEnum {
    let second = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };
    if !(0x90..=0xBF).contains(&second) {
        return Utf8EndEnum::BadDecode(1);
    }
    mybuf.pop_front();
    byte3_action21(mybuf, (arg << 6) + (second & 0x3F))
}
/// Handles the third and final byte of a three-byte sequence.
///
/// `arg` carries the payload bits accumulated from the first two bytes. The
/// front byte of `mybuf` is removed only when it lies in `0x80..=0xBF`.
///
/// Returns `Finish` with the assembled value, `BadDecode(2)` when the front
/// byte is out of range, or `TypeUnknown` when `mybuf` is empty.
#[inline]
fn byte3_action17(mybuf: &mut EightBytes, arg: u32) -> Utf8EndEnum {
    let third = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };
    if !(0x80..=0xBF).contains(&third) {
        return Utf8EndEnum::BadDecode(2);
    }
    mybuf.pop_front();
    Utf8EndEnum::Finish((arg << 6) + (third & 0x3F))
}
/// Handles the third and final byte of a three-byte sequence, additionally
/// rejecting an encoded U+FFFD.
///
/// The front byte of `mybuf` is removed when it lies in `0x80..=0xBF`; the
/// assembled value is then compared with `REPLACE_UTF32`.
///
/// Returns `Finish` with the assembled value, `BadDecode(3)` when that value
/// is U+FFFD (the byte has already been removed at that point),
/// `BadDecode(2)` when the front byte is out of range, or `TypeUnknown` when
/// `mybuf` is empty.
#[inline]
fn byte3_action20(mybuf: &mut EightBytes, arg: u32) -> Utf8EndEnum {
    let third = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };
    if !(0x80..=0xBF).contains(&third) {
        return Utf8EndEnum::BadDecode(2);
    }
    mybuf.pop_front();
    match (arg << 6) + (third & 0x3F) {
        REPLACE_UTF32 => Utf8EndEnum::BadDecode(3),
        codepoint => Utf8EndEnum::Finish(codepoint),
    }
}
/// Handles the third byte of a four-byte sequence.
///
/// Accepts a front byte in `0x80..=0xBF`, removes it from `mybuf` and
/// continues in `byte4_action24`. Returns `BadDecode(2)` when the front byte
/// is out of range, or `TypeUnknown` when `mybuf` is empty.
#[inline]
fn byte3_action21(mybuf: &mut EightBytes, arg: u32) -> Utf8EndEnum {
    let third = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };
    if !(0x80..=0xBF).contains(&third) {
        return Utf8EndEnum::BadDecode(2);
    }
    mybuf.pop_front();
    byte4_action24(mybuf, (arg << 6) + (third & 0x3F))
}
/// Handles the fourth and final byte of a four-byte sequence.
///
/// `arg` carries the payload bits accumulated from the first three bytes. The
/// front byte of `mybuf` is removed only when it lies in `0x80..=0xBF`.
///
/// Returns `Finish` with the assembled value, `BadDecode(3)` when the front
/// byte is out of range, or `TypeUnknown` when `mybuf` is empty.
#[inline]
fn byte4_action24(mybuf: &mut EightBytes, arg: u32) -> Utf8EndEnum {
    let fourth = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };
    if !(0x80..=0xBF).contains(&fourth) {
        return Utf8EndEnum::BadDecode(3);
    }
    mybuf.pop_front();
    Utf8EndEnum::Finish((arg << 6) + (fourth & 0x3F))
}
/// Decodes one code point from the front of `mybuf`.
///
/// The lead byte selects the sequence length and the helper that validates
/// the following bytes. When `last_buffer` is false and fewer bytes are
/// buffered than the lead byte calls for, nothing is consumed and
/// `TypeUnknown` is returned. When `last_buffer` is true, decoding proceeds
/// with whatever is buffered.
///
/// Returns `Finish(code)` on success, `BadDecode(_)` for an unacceptable
/// sequence, or `TypeUnknown` when the buffered bytes ran out (including an
/// empty `mybuf`).
pub fn utf8_decode(mybuf: &mut EightBytes, last_buffer: bool) -> Utf8EndEnum {
    let lead = match mybuf.front() {
        Some(byte) => byte as u32,
        None => return Utf8EndEnum::TypeUnknown,
    };

    // Lead bytes that settle the outcome on their own are consumed right
    // away; the others report how many bytes their sequence spans.
    let needed = match lead {
        0x00..=0x7F => {
            mybuf.pop_front();
            return Utf8EndEnum::Finish(lead);
        }
        0x80..=0xC1 => {
            mybuf.pop_front();
            return Utf8EndEnum::BadDecode(1);
        }
        0xC2..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF4 => 4,
        _ => {
            mybuf.pop_front();
            return Utf8EndEnum::BadDecode(1);
        }
    };

    // More input may still arrive, so leave the partial sequence buffered.
    if (mybuf.len() < needed) && !last_buffer {
        return Utf8EndEnum::TypeUnknown;
    }

    mybuf.pop_front();
    match lead {
        0xC2..=0xDF => byte2_action9(mybuf, lead & 0x1F),
        0xE0 => byte2_action14(mybuf, lead & 0xF),
        0xE1..=0xEC => byte2_action10(mybuf, lead & 0xF),
        0xED => byte2_action15(mybuf, lead & 0xF),
        0xEE..=0xEF => byte2_action11(mybuf, lead & 0xF),
        0xF0 => byte2_action16(mybuf, lead & 0x7),
        0xF1..=0xF3 => byte2_action12(mybuf, lead & 0x7),
        // Only 0xF4 is left at this point.
        _ => byte2_action13(mybuf, lead & 0x7),
    }
}
/// Iterator adapter that yields `char` values copied out of an iterator of
/// `&char`.
pub struct CharRefToCharStruct<'b> {
    my_borrow_mut_iter: &'b mut dyn Iterator<Item = &'b char>,
}

impl<'b> Iterator for CharRefToCharStruct<'b> {
    type Item = char;

    /// Pulls the next reference from the wrapped iterator and copies it.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.my_borrow_mut_iter.next().copied()
    }

    /// Same bounds as the wrapped iterator: the mapping is one-to-one.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.my_borrow_mut_iter.size_hint()
    }
}

/// Wraps `input` so that it yields `char` instead of `&char`.
#[inline]
pub fn char_ref_iter_to_char_iter<'a, I: 'a + Iterator>(input: &'a mut I)
    -> CharRefToCharStruct<'a>
where
    I: Iterator<Item = &'a char>,
{
    CharRefToCharStruct { my_borrow_mut_iter: input }
}
/// Iterator adapter that yields `u32` values copied out of an iterator of
/// `&u32`.
pub struct Utf32RefToUtf32Struct<'b> {
    my_borrow_mut_iter: &'b mut dyn Iterator<Item = &'b u32>,
}

impl<'b> Iterator for Utf32RefToUtf32Struct<'b> {
    type Item = u32;

    /// Pulls the next reference from the wrapped iterator and copies it.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.my_borrow_mut_iter.next().copied()
    }

    /// Same bounds as the wrapped iterator: the mapping is one-to-one.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.my_borrow_mut_iter.size_hint()
    }
}

/// Wraps `input` so that it yields `u32` instead of `&u32`.
#[inline]
pub fn utf32_ref_iter_to_utf32_iter<'a, I: 'a + Iterator>(input: &'a mut I)
    -> Utf32RefToUtf32Struct<'a>
where
    I: Iterator<Item = &'a u32>,
{
    Utf32RefToUtf32Struct { my_borrow_mut_iter: input }
}
/// Iterator adapter that yields `u8` values copied out of an iterator of
/// `&u8`.
pub struct Utf8RefToUtf8Struct<'b> {
    my_borrow_mut_iter: &'b mut dyn Iterator<Item = &'b u8>,
}

impl<'b> Iterator for Utf8RefToUtf8Struct<'b> {
    type Item = u8;

    /// Pulls the next reference from the wrapped iterator and copies it.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.my_borrow_mut_iter.next().copied()
    }

    /// Same bounds as the wrapped iterator: the mapping is one-to-one.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.my_borrow_mut_iter.size_hint()
    }
}

/// Wraps `input` so that it yields `u8` instead of `&u8`.
#[inline]
pub fn utf8_ref_iter_to_utf8_iter<'a, I: 'a + Iterator>(input: &'a mut I)
    -> Utf8RefToUtf8Struct<'a>
where
    I: Iterator<Item = &'a u8>,
{
    Utf8RefToUtf8Struct { my_borrow_mut_iter: input }
}
/// Iterator adapter that yields the `u32` value of each `char` produced by
/// the wrapped iterator.
pub struct CharToUtf32Struct<'b> {
    my_borrow_mut_iter: &'b mut dyn Iterator<Item = char>,
}

impl<'b> Iterator for CharToUtf32Struct<'b> {
    type Item = u32;

    /// Pulls the next `char` from the wrapped iterator and casts it to `u32`.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.my_borrow_mut_iter.next().map(|ch| ch as u32)
    }

    /// Same bounds as the wrapped iterator: the mapping is one-to-one.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.my_borrow_mut_iter.size_hint()
    }
}

/// Wraps `input` so that it yields `u32` values instead of `char`.
#[inline]
pub fn char_iter_to_utf32_iter<'a, I: 'a + Iterator>(input: &'a mut I)
    -> CharToUtf32Struct<'a>
where
    I: Iterator<Item = char>,
{
    CharToUtf32Struct { my_borrow_mut_iter: input }
}
/// Iterator adapter that drops a leading byte order mark and normalises line
/// endings: every `'\r'` is emitted as `'\n'`, and a `'\n'` that directly
/// follows a `'\r'` is swallowed.
pub struct BomAndCarriageReturnFilterStruct<'b> {
    my_borrow_mut_iter: &'b mut dyn Iterator<Item = char>,
    // True until the first character of the stream has been examined.
    my_start_stream: bool,
    // True when the previously examined character was a carriage return.
    my_prev_cr: bool,
}

impl<'b> Iterator for BomAndCarriageReturnFilterStruct<'b> {
    type Item = char;

    /// Returns the next filtered character, or `None` once the wrapped
    /// iterator returns `None`.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            let ch = self.my_borrow_mut_iter.next()?;

            // Only the very first character of the stream may be dropped as a BOM.
            if self.my_start_stream {
                self.my_start_stream = false;
                if ch == BOM {
                    continue;
                }
            }

            if ch == CR {
                self.my_prev_cr = true;
                return Some(NL);
            }

            // A line feed right after a carriage return was already reported.
            let follows_cr = self.my_prev_cr;
            self.my_prev_cr = false;
            if follows_cr && ch == NL {
                continue;
            }
            return Some(ch);
        }
    }

    /// The filter may drop characters (a leading BOM, a `'\n'` after `'\r'`),
    /// so only the upper bound of the wrapped iterator carries over; the
    /// lower bound is zero.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.my_borrow_mut_iter.size_hint();
        (0, upper)
    }
}

/// Wraps `input` in a filter that removes a leading BOM and converts
/// `"\r\n"` and lone `'\r'` to `'\n'`.
#[inline]
pub fn filter_bom_and_cr_iter<'a, I: 'a + Iterator>(input: &'a mut I)
    -> BomAndCarriageReturnFilterStruct<'a>
where
    I: Iterator<Item = char>,
{
    BomAndCarriageReturnFilterStruct {
        my_borrow_mut_iter: input,
        my_start_stream: true,
        my_prev_cr: false,
    }
}
/// Operations shared by the parser state structs and the iterator adapters
/// built on top of them.
pub trait UtfParserCommon {
// Returns the parser to its initial state. The implementations in this file
// clear the byte buffer, set the last-buffer flag to true and clear the
// invalid-sequence flag.
fn reset_parser(&mut self);
// Sets the last-buffer flag.
fn set_is_last_buffer(&mut self, b: bool);
// Returns the last-buffer flag.
fn is_last_buffer(&self) -> bool;
// Sets the invalid-sequence flag.
fn signal_invalid_sequence(& mut self);
// Returns the invalid-sequence flag.
fn has_invalid_sequence(&self) -> bool;
// Clears the invalid-sequence flag.
fn reset_invalid_sequence(& mut self);
}
/// Parser state for converting UTF-8 bytes into `char` / UTF-32 values.
#[derive(Debug, Clone, Copy)]
pub struct FromUtf8 {
// Input bytes that have been taken from the caller but not yet decoded.
my_buf: EightBytes,
// When true, a sequence that runs out of bytes is reported as a replacement
// character instead of a request for more input.
my_last_buffer: bool,
// Set whenever a replacement character has been produced.
my_invalid_sequence: bool,
}
/// Parser state for converting `char` / UTF-32 values into UTF-8 bytes.
#[derive(Debug, Clone, Copy)]
pub struct FromUnicode {
// Trailing bytes of the current encoding that are still waiting to be returned.
my_buf: EightBytes,
// Selects which `MoreEnum` payload is returned on empty input (0 when true).
my_last_buffer: bool,
// Set whenever the replacement bytes have been emitted.
my_invalid_sequence: bool,
}
/// Iterator that decodes `char` values from an iterator of `u8`, keeping its
/// state in a borrowed `FromUtf8`.
pub struct Utf8IterToCharIter<'p> {
// Source of UTF-8 bytes.
my_borrow_mut_iter: &'p mut dyn Iterator<Item = u8>,
// Borrowed parser state (byte buffer and flags).
my_info: &'p mut FromUtf8,
}
/// Iterator that produces UTF-8 bytes from an iterator of `u32` code points,
/// keeping its state in a borrowed `FromUnicode`.
pub struct Utf32IterToUtf8Iter<'q> {
// Source of UTF-32 values.
my_borrow_mut_iter: &'q mut dyn Iterator<Item = u32>,
// Borrowed parser state (pending bytes and flags).
my_info: &'q mut FromUnicode,
}
/// Iterator that decodes `char` values from an iterator of `&u8`, keeping its
/// state in a borrowed `FromUtf8`.
pub struct Utf8RefIterToCharIter<'r> {
// Source of references to UTF-8 bytes.
my_borrow_mut_iter: &'r mut dyn Iterator<Item = &'r u8>,
// Borrowed parser state (byte buffer and flags).
my_info: &'r mut FromUtf8,
}
/// Iterator that produces UTF-8 bytes from an iterator of `&char`, keeping
/// its state in a borrowed `FromUnicode`.
pub struct CharRefIterToUtf8Iter<'s> {
// Source of references to characters.
my_borrow_mut_iter: &'s mut dyn Iterator<Item = &'s char>,
// Borrowed parser state (pending bytes and flags).
my_info: &'s mut FromUnicode,
}
/// Flag accessors and reset for the UTF-8 decoding state.
///
/// The impl header no longer declares a lifetime parameter: the original
/// `'b` was never used by the trait, the type, or any method.
impl UtfParserCommon for FromUtf8 {
    /// Clears the byte buffer, switches back to last-buffer mode and clears
    /// the invalid-sequence flag.
    #[inline]
    fn reset_parser(&mut self) {
        self.my_buf.clear();
        self.set_is_last_buffer(true);
        self.reset_invalid_sequence();
    }

    /// Stores `b` as the last-buffer flag.
    #[inline]
    fn set_is_last_buffer(&mut self, b: bool) {
        self.my_last_buffer = b;
    }

    /// Returns the last-buffer flag.
    #[inline]
    fn is_last_buffer(&self) -> bool {
        self.my_last_buffer
    }

    /// Sets the invalid-sequence flag.
    #[inline]
    fn signal_invalid_sequence(&mut self) {
        self.my_invalid_sequence = true;
    }

    /// Returns the invalid-sequence flag.
    #[inline]
    fn has_invalid_sequence(&self) -> bool {
        self.my_invalid_sequence
    }

    /// Clears the invalid-sequence flag.
    #[inline]
    fn reset_invalid_sequence(&mut self) {
        self.my_invalid_sequence = false;
    }
}
/// Flag accessors and reset for the UTF-8 encoding state.
///
/// The impl header no longer declares a lifetime parameter: the original
/// `'b` was never used by the trait, the type, or any method.
impl UtfParserCommon for FromUnicode {
    /// Clears the pending-byte buffer, switches back to last-buffer mode and
    /// clears the invalid-sequence flag.
    #[inline]
    fn reset_parser(&mut self) {
        self.my_buf.clear();
        self.set_is_last_buffer(true);
        self.reset_invalid_sequence();
    }

    /// Stores `b` as the last-buffer flag.
    #[inline]
    fn set_is_last_buffer(&mut self, b: bool) {
        self.my_last_buffer = b;
    }

    /// Returns the last-buffer flag.
    #[inline]
    fn is_last_buffer(&self) -> bool {
        self.my_last_buffer
    }

    /// Sets the invalid-sequence flag.
    #[inline]
    fn signal_invalid_sequence(&mut self) {
        self.my_invalid_sequence = true;
    }

    /// Returns the invalid-sequence flag.
    #[inline]
    fn has_invalid_sequence(&self) -> bool {
        self.my_invalid_sequence
    }

    /// Clears the invalid-sequence flag.
    #[inline]
    fn reset_invalid_sequence(&mut self) {
        self.my_invalid_sequence = false;
    }
}
/// Converts the `char` inside a successful parse result to its `u32` value.
///
/// The remaining-input slice and any `Err` value pass through unchanged.
pub fn parse_mapper_char_to_utf32(input: Result<(& [u8], char), MoreEnum>)
    -> Result<(& [u8], u32), MoreEnum> {
    input.map(|(new_spot, ch)| (new_spot, ch as u32))
}
impl FromUtf8 {
    /// Creates a parser with an empty buffer, in last-buffer mode, with the
    /// invalid-sequence flag cleared.
    pub fn new() -> FromUtf8 {
        FromUtf8 {
            my_buf: EightBytes::new(),
            my_last_buffer: true,
            my_invalid_sequence: false,
        }
    }

    /// Decodes one `char` from `input`.
    ///
    /// Bytes are first moved from `input` into the internal buffer until the
    /// buffer is full or `input` is exhausted; decoding then runs on the
    /// buffer.
    ///
    /// Returns `Ok((rest, ch))`, where `rest` is the part of `input` not yet
    /// moved into the buffer. Undecodable input yields
    /// `char::REPLACEMENT_CHARACTER` and sets the invalid-sequence flag.
    ///
    /// # Errors
    ///
    /// * `MoreEnum::More(0)` — the buffer is empty and the parser is in
    ///   last-buffer mode.
    /// * `MoreEnum::More(4096)` — the parser is not in last-buffer mode and
    ///   the buffer is empty or ends in the middle of a sequence.
    pub fn utf8_to_char<'b>(&mut self, input: &'b [u8])
        -> Result<(&'b [u8], char), MoreEnum> {
        let last_buffer = self.my_last_buffer;

        // Top up the internal buffer from the caller's slice.
        let mut remaining: &[u8] = input;
        while !self.my_buf.is_full() {
            match remaining.split_first() {
                Some((&byte, tail)) => {
                    self.my_buf.push_back(byte);
                    remaining = tail;
                }
                None => break,
            }
        }

        if self.my_buf.is_empty() {
            let wanted = if last_buffer { 0 } else { 4096 };
            return Err(MoreEnum::More(wanted));
        }

        match utf8_decode(&mut self.my_buf, last_buffer) {
            Utf8EndEnum::Finish(code) => {
                // SAFETY: `utf8_decode` only returns `Finish` for sequences
                // whose byte ranges exclude surrogates and values above
                // 0x10FFFF, so `code` is a valid `char`.
                let ch = unsafe { char::from_u32_unchecked(code) };
                Ok((remaining, ch))
            }
            // A truncated sequence is only an error once no more input will come.
            Utf8EndEnum::TypeUnknown if !last_buffer => Err(MoreEnum::More(4096)),
            Utf8EndEnum::BadDecode(_) | Utf8EndEnum::TypeUnknown => {
                self.signal_invalid_sequence();
                Ok((remaining, char::REPLACEMENT_CHARACTER))
            }
        }
    }

    /// Same as `utf8_to_char`, but returns the decoded value as a `u32`.
    pub fn utf8_to_utf32<'c>(&mut self, input: &'c [u8])
        -> Result<(&'c [u8], u32), MoreEnum> {
        parse_mapper_char_to_utf32(self.utf8_to_char(input))
    }

    /// Builds an iterator that decodes `char` values from `iter`, using this
    /// parser for its buffer and flags.
    pub fn utf8_to_char_with_iter<'d>(&'d mut self, iter: &'d mut dyn Iterator<Item = u8>)
        -> Utf8IterToCharIter {
        Utf8IterToCharIter {
            my_borrow_mut_iter: iter,
            my_info: self,
        }
    }

    /// Builds an iterator that decodes `char` values from an iterator of
    /// `&u8`, using this parser for its buffer and flags.
    pub fn utf8_ref_to_char_with_iter<'d>(&'d mut self, iter: &'d mut dyn Iterator<Item = &'d u8>)
        -> Utf8RefIterToCharIter {
        Utf8RefIterToCharIter {
            my_borrow_mut_iter: iter,
            my_info: self,
        }
    }
}
impl FromUnicode {
    /// Creates a parser with an empty buffer, in last-buffer mode, with the
    /// invalid-sequence flag cleared.
    pub fn new() -> FromUnicode {
        FromUnicode {
            my_buf: EightBytes::new(),
            my_last_buffer: true,
            my_invalid_sequence: false,
        }
    }

    /// Encodes `code`, queues every byte after the first in the internal
    /// buffer and returns the first byte. Values that `classify_utf32`
    /// reports as `Type0` set the invalid-sequence flag and produce the
    /// replacement bytes.
    fn begin_encoding(&mut self, code: u32) -> u8 {
        match classify_utf32(code) {
            Utf8TypeEnum::Type1(v1) => v1,
            Utf8TypeEnum::Type2((v1, v2)) => {
                self.my_buf.push_back(v2);
                v1
            }
            Utf8TypeEnum::Type3((v1, v2, v3)) => {
                self.my_buf.push_back(v2);
                self.my_buf.push_back(v3);
                v1
            }
            Utf8TypeEnum::Type4((v1, v2, v3, v4)) => {
                self.my_buf.push_back(v2);
                self.my_buf.push_back(v3);
                self.my_buf.push_back(v4);
                v1
            }
            Utf8TypeEnum::Type0(_) => {
                self.signal_invalid_sequence();
                self.my_buf.push_back(REPLACE_PART2);
                self.my_buf.push_back(REPLACE_PART3);
                REPLACE_PART1
            }
        }
    }

    /// Error value reported when the input slice is empty and no byte is pending.
    fn input_exhausted(&self) -> MoreEnum {
        if self.is_last_buffer() {
            MoreEnum::More(0)
        } else {
            MoreEnum::More(1024)
        }
    }

    /// Produces the next UTF-8 byte for a slice of `char`.
    ///
    /// If bytes of a previously started character are still pending, the
    /// next one is returned and `input` is handed back unchanged. Otherwise
    /// the first `char` of `input` is encoded, its first byte returned and
    /// the rest of the slice handed back.
    ///
    /// # Errors
    ///
    /// With nothing pending and `input` empty: `MoreEnum::More(0)` in
    /// last-buffer mode, `MoreEnum::More(1024)` otherwise.
    pub fn char_to_utf8<'b>(&mut self, input: &'b [char])
        -> Result<(&'b [char], u8), MoreEnum> {
        if let Some(pending) = self.my_buf.pop_front() {
            return Ok((input, pending));
        }
        match input.split_first() {
            Some((&ch, rest)) => Ok((rest, self.begin_encoding(ch as u32))),
            None => Err(self.input_exhausted()),
        }
    }

    /// Produces the next UTF-8 byte for a slice of UTF-32 values.
    ///
    /// Behaves like `char_to_utf8`, except that the input values are `u32`,
    /// so surrogates and out-of-range values can occur; they are encoded as
    /// the replacement bytes and set the invalid-sequence flag.
    ///
    /// # Errors
    ///
    /// With nothing pending and `input` empty: `MoreEnum::More(0)` in
    /// last-buffer mode, `MoreEnum::More(1024)` otherwise.
    pub fn utf32_to_utf8<'c>(&mut self, input: &'c [u32])
        -> Result<(&'c [u32], u8), MoreEnum> {
        if let Some(pending) = self.my_buf.pop_front() {
            return Ok((input, pending));
        }
        match input.split_first() {
            Some((&code, rest)) => Ok((rest, self.begin_encoding(code))),
            None => Err(self.input_exhausted()),
        }
    }

    /// Builds an iterator that encodes the `u32` values from `iter` as UTF-8
    /// bytes, using this parser for its buffer and flags.
    pub fn utf32_to_utf8_with_iter<'d>(&'d mut self, iter: &'d mut dyn Iterator<Item = u32>)
        -> Utf32IterToUtf8Iter {
        Utf32IterToUtf8Iter {
            my_borrow_mut_iter: iter,
            my_info: self,
        }
    }

    /// Builds an iterator that encodes the `&char` values from `iter` as
    /// UTF-8 bytes, using this parser for its buffer and flags.
    pub fn char_ref_to_utf8_with_iter<'d>(&'d mut self, iter: &'d mut dyn Iterator<Item = &'d char>)
        -> CharRefIterToUtf8Iter {
        CharRefIterToUtf8Iter {
            my_borrow_mut_iter: iter,
            my_info: self,
        }
    }
}
/// Forwards every parser-state operation to the borrowed `FromUtf8`.
impl UtfParserCommon for Utf8IterToCharIter<'_> {
    /// Resets the borrowed parser.
    #[inline]
    fn reset_parser(&mut self) {
        self.my_info.reset_parser();
    }

    /// Sets the last-buffer flag of the borrowed parser.
    #[inline]
    fn set_is_last_buffer(&mut self, b: bool) {
        self.my_info.set_is_last_buffer(b);
    }

    /// Reads the last-buffer flag of the borrowed parser.
    #[inline]
    fn is_last_buffer(&self) -> bool {
        self.my_info.is_last_buffer()
    }

    /// Sets the invalid-sequence flag of the borrowed parser.
    #[inline]
    fn signal_invalid_sequence(&mut self) {
        self.my_info.signal_invalid_sequence();
    }

    /// Reads the invalid-sequence flag of the borrowed parser.
    #[inline]
    fn has_invalid_sequence(&self) -> bool {
        self.my_info.has_invalid_sequence()
    }

    /// Clears the invalid-sequence flag of the borrowed parser.
    #[inline]
    fn reset_invalid_sequence(&mut self) {
        self.my_info.reset_invalid_sequence();
    }
}
impl<'g> Iterator for Utf8IterToCharIter<'g> {
    type Item = char;

    /// Decodes the next `char`.
    ///
    /// The parser's buffer is first topped up from the byte iterator. Returns
    /// `None` when the buffer is empty, or when it ends in the middle of a
    /// sequence while the parser is not in last-buffer mode. Undecodable
    /// input yields `char::REPLACEMENT_CHARACTER` and sets the
    /// invalid-sequence flag.
    fn next(&mut self) -> Option<Self::Item> {
        while !self.my_info.my_buf.is_full() {
            match self.my_borrow_mut_iter.next() {
                Some(utf8) => {
                    self.my_info.my_buf.push_back(utf8);
                }
                None => break,
            }
        }
        if self.my_info.my_buf.is_empty() {
            return None;
        }

        let last_buffer = self.my_info.is_last_buffer();
        match utf8_decode(&mut self.my_info.my_buf, last_buffer) {
            Utf8EndEnum::Finish(code) => {
                // SAFETY: `utf8_decode` only returns `Finish` for sequences
                // whose byte ranges exclude surrogates and values above
                // 0x10FFFF, so `code` is a valid `char`.
                Some(unsafe { char::from_u32_unchecked(code) })
            }
            // More bytes may arrive later; leave the partial sequence buffered.
            Utf8EndEnum::TypeUnknown if !last_buffer => None,
            Utf8EndEnum::BadDecode(_) | Utf8EndEnum::TypeUnknown => {
                self.my_info.signal_invalid_sequence();
                Some(char::REPLACEMENT_CHARACTER)
            }
        }
    }

    /// Bounds on the number of characters still to come.
    ///
    /// Every returned character consumes between one and four bytes, and
    /// bytes already buffered count as input, so the byte bounds of the
    /// wrapped iterator are converted rather than passed through.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // The cast only normalises whatever integer type `len` returns.
        let buffered = self.my_info.my_buf.len() as usize;
        let (lower, upper) = self.my_borrow_mut_iter.size_hint();
        let min_chars = lower.saturating_add(buffered) / 4;
        let max_chars = upper.and_then(|bytes| bytes.checked_add(buffered));
        (min_chars, max_chars)
    }
}
/// Forwards every parser-state operation to the borrowed `FromUtf8`.
impl UtfParserCommon for Utf8RefIterToCharIter<'_> {
    /// Resets the borrowed parser.
    #[inline]
    fn reset_parser(&mut self) {
        self.my_info.reset_parser();
    }

    /// Sets the last-buffer flag of the borrowed parser.
    #[inline]
    fn set_is_last_buffer(&mut self, b: bool) {
        self.my_info.set_is_last_buffer(b);
    }

    /// Reads the last-buffer flag of the borrowed parser.
    #[inline]
    fn is_last_buffer(&self) -> bool {
        self.my_info.is_last_buffer()
    }

    /// Sets the invalid-sequence flag of the borrowed parser.
    #[inline]
    fn signal_invalid_sequence(&mut self) {
        self.my_info.signal_invalid_sequence();
    }

    /// Reads the invalid-sequence flag of the borrowed parser.
    #[inline]
    fn has_invalid_sequence(&self) -> bool {
        self.my_info.has_invalid_sequence()
    }

    /// Clears the invalid-sequence flag of the borrowed parser.
    #[inline]
    fn reset_invalid_sequence(&mut self) {
        self.my_info.reset_invalid_sequence();
    }
}
impl<'g> Iterator for Utf8RefIterToCharIter<'g> {
    type Item = char;

    /// Decodes the next `char`.
    ///
    /// The parser's buffer is first topped up from the byte-reference
    /// iterator. Returns `None` when the buffer is empty, or when it ends in
    /// the middle of a sequence while the parser is not in last-buffer mode.
    /// Undecodable input yields `char::REPLACEMENT_CHARACTER` and sets the
    /// invalid-sequence flag.
    fn next(&mut self) -> Option<Self::Item> {
        while !self.my_info.my_buf.is_full() {
            match self.my_borrow_mut_iter.next() {
                Some(utf8) => {
                    self.my_info.my_buf.push_back(*utf8);
                }
                None => break,
            }
        }
        if self.my_info.my_buf.is_empty() {
            return None;
        }

        let last_buffer = self.my_info.is_last_buffer();
        match utf8_decode(&mut self.my_info.my_buf, last_buffer) {
            Utf8EndEnum::Finish(code) => {
                // SAFETY: `utf8_decode` only returns `Finish` for sequences
                // whose byte ranges exclude surrogates and values above
                // 0x10FFFF, so `code` is a valid `char`.
                Some(unsafe { char::from_u32_unchecked(code) })
            }
            // More bytes may arrive later; leave the partial sequence buffered.
            Utf8EndEnum::TypeUnknown if !last_buffer => None,
            Utf8EndEnum::BadDecode(_) | Utf8EndEnum::TypeUnknown => {
                self.my_info.signal_invalid_sequence();
                Some(char::REPLACEMENT_CHARACTER)
            }
        }
    }

    /// Bounds on the number of characters still to come.
    ///
    /// Every returned character consumes between one and four bytes, and
    /// bytes already buffered count as input, so the byte bounds of the
    /// wrapped iterator are converted rather than passed through.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // The cast only normalises whatever integer type `len` returns.
        let buffered = self.my_info.my_buf.len() as usize;
        let (lower, upper) = self.my_borrow_mut_iter.size_hint();
        let min_chars = lower.saturating_add(buffered) / 4;
        let max_chars = upper.and_then(|bytes| bytes.checked_add(buffered));
        (min_chars, max_chars)
    }
}
/// Forwards every parser-state operation to the borrowed `FromUnicode`.
impl UtfParserCommon for Utf32IterToUtf8Iter<'_> {
    /// Resets the borrowed parser.
    #[inline]
    fn reset_parser(&mut self) {
        self.my_info.reset_parser();
    }

    /// Sets the last-buffer flag of the borrowed parser.
    #[inline]
    fn set_is_last_buffer(&mut self, b: bool) {
        self.my_info.set_is_last_buffer(b);
    }

    /// Reads the last-buffer flag of the borrowed parser.
    #[inline]
    fn is_last_buffer(&self) -> bool {
        self.my_info.is_last_buffer()
    }

    /// Sets the invalid-sequence flag of the borrowed parser.
    #[inline]
    fn signal_invalid_sequence(&mut self) {
        self.my_info.signal_invalid_sequence();
    }

    /// Reads the invalid-sequence flag of the borrowed parser.
    #[inline]
    fn has_invalid_sequence(&self) -> bool {
        self.my_info.has_invalid_sequence()
    }

    /// Clears the invalid-sequence flag of the borrowed parser.
    #[inline]
    fn reset_invalid_sequence(&mut self) {
        self.my_info.reset_invalid_sequence();
    }
}
impl<'h> Iterator for Utf32IterToUtf8Iter<'h> {
    type Item = u8;

    /// Returns the next UTF-8 byte.
    ///
    /// Bytes pending from a previously started code point are returned first.
    /// Otherwise the next `u32` is pulled from the wrapped iterator and
    /// encoded; its first byte is returned and the rest are queued. Values
    /// that `classify_utf32` reports as `Type0` set the invalid-sequence flag
    /// and produce the replacement bytes.
    fn next(&mut self) -> Option<Self::Item> {
        if let Some(pending) = self.my_info.my_buf.pop_front() {
            return Some(pending);
        }
        let utf32 = self.my_borrow_mut_iter.next()?;
        let first = match classify_utf32(utf32) {
            Utf8TypeEnum::Type1(v1) => v1,
            Utf8TypeEnum::Type2((v1, v2)) => {
                self.my_info.my_buf.push_back(v2);
                v1
            }
            Utf8TypeEnum::Type3((v1, v2, v3)) => {
                self.my_info.my_buf.push_back(v2);
                self.my_info.my_buf.push_back(v3);
                v1
            }
            Utf8TypeEnum::Type4((v1, v2, v3, v4)) => {
                self.my_info.my_buf.push_back(v2);
                self.my_info.my_buf.push_back(v3);
                self.my_info.my_buf.push_back(v4);
                v1
            }
            Utf8TypeEnum::Type0(_) => {
                self.my_info.signal_invalid_sequence();
                self.my_info.my_buf.push_back(REPLACE_PART2);
                self.my_info.my_buf.push_back(REPLACE_PART3);
                REPLACE_PART1
            }
        };
        Some(first)
    }

    /// Bounds on the number of bytes still to come.
    ///
    /// Each remaining code point produces between one and four bytes, and
    /// bytes already queued will be returned as well, so the bounds of the
    /// wrapped iterator are converted rather than passed through.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // The cast only normalises whatever integer type `len` returns.
        let pending = self.my_info.my_buf.len() as usize;
        let (lower, upper) = self.my_borrow_mut_iter.size_hint();
        let min_bytes = lower.saturating_add(pending);
        let max_bytes = upper
            .and_then(|count| count.checked_mul(4))
            .and_then(|bytes| bytes.checked_add(pending));
        (min_bytes, max_bytes)
    }
}
/// Forwards every parser-state operation to the borrowed `FromUnicode`.
impl UtfParserCommon for CharRefIterToUtf8Iter<'_> {
    /// Resets the borrowed parser.
    #[inline]
    fn reset_parser(&mut self) {
        self.my_info.reset_parser();
    }

    /// Sets the last-buffer flag of the borrowed parser.
    #[inline]
    fn set_is_last_buffer(&mut self, b: bool) {
        self.my_info.set_is_last_buffer(b);
    }

    /// Reads the last-buffer flag of the borrowed parser.
    #[inline]
    fn is_last_buffer(&self) -> bool {
        self.my_info.is_last_buffer()
    }

    /// Sets the invalid-sequence flag of the borrowed parser.
    #[inline]
    fn signal_invalid_sequence(&mut self) {
        self.my_info.signal_invalid_sequence();
    }

    /// Reads the invalid-sequence flag of the borrowed parser.
    #[inline]
    fn has_invalid_sequence(&self) -> bool {
        self.my_info.has_invalid_sequence()
    }

    /// Clears the invalid-sequence flag of the borrowed parser.
    #[inline]
    fn reset_invalid_sequence(&mut self) {
        self.my_info.reset_invalid_sequence();
    }
}
impl<'h> Iterator for CharRefIterToUtf8Iter<'h> {
    type Item = u8;

    /// Returns the next UTF-8 byte.
    ///
    /// Bytes pending from a previously started character are returned first.
    /// Otherwise the next `&char` is pulled from the wrapped iterator and
    /// encoded; its first byte is returned and the rest are queued. Values
    /// that `classify_utf32` reports as `Type0` set the invalid-sequence flag
    /// and produce the replacement bytes.
    fn next(&mut self) -> Option<Self::Item> {
        if let Some(pending) = self.my_info.my_buf.pop_front() {
            return Some(pending);
        }
        let ch_ref = self.my_borrow_mut_iter.next()?;
        let utf32 = (*ch_ref) as u32;
        let first = match classify_utf32(utf32) {
            Utf8TypeEnum::Type1(v1) => v1,
            Utf8TypeEnum::Type2((v1, v2)) => {
                self.my_info.my_buf.push_back(v2);
                v1
            }
            Utf8TypeEnum::Type3((v1, v2, v3)) => {
                self.my_info.my_buf.push_back(v2);
                self.my_info.my_buf.push_back(v3);
                v1
            }
            Utf8TypeEnum::Type4((v1, v2, v3, v4)) => {
                self.my_info.my_buf.push_back(v2);
                self.my_info.my_buf.push_back(v3);
                self.my_info.my_buf.push_back(v4);
                v1
            }
            Utf8TypeEnum::Type0(_) => {
                self.my_info.signal_invalid_sequence();
                self.my_info.my_buf.push_back(REPLACE_PART2);
                self.my_info.my_buf.push_back(REPLACE_PART3);
                REPLACE_PART1
            }
        };
        Some(first)
    }

    /// Bounds on the number of bytes still to come.
    ///
    /// Each remaining character produces between one and four bytes, and
    /// bytes already queued will be returned as well, so the bounds of the
    /// wrapped iterator are converted rather than passed through.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // The cast only normalises whatever integer type `len` returns.
        let pending = self.my_info.my_buf.len() as usize;
        let (lower, upper) = self.my_borrow_mut_iter.size_hint();
        let min_bytes = lower.saturating_add(pending);
        let max_bytes = upper
            .and_then(|count| count.checked_mul(4))
            .and_then(|bytes| bytes.checked_add(pending));
        (min_bytes, max_bytes)
    }
}
// Unit tests: BOM / line-ending filtering and an encode-then-decode round
// trip over every Unicode scalar value.
#[cfg(test)]
mod tests {
extern crate std;
use crate::*;
// Debug helper that prints each byte of a slice in hex; the leading
// underscore keeps it from triggering an unused-function warning.
fn _print_bytes(u8_slice: & [u8]) {
for indx in 0 .. u8_slice.len() {
let b = u8_slice[indx] as u32;
print!(" {:#02x}", b);
}
println!("");
}
// Decodes a UTF-8 string through `FromUtf8` and the BOM/CR filter, then
// checks the filtered characters one by one.
#[test]
fn test_filter_bom_and_cr() {
let byte_slice = "\u{FEFF}\u{FEFF}\r\nA\r\rB\rCD\r\n\nEF\r\n\r\nG\n\r".as_bytes();
let mut byte_ref_iter = byte_slice.iter();
let mut from_utf8 = FromUtf8::new();
let mut utf8_to_char_iter = from_utf8.utf8_ref_to_char_with_iter(& mut byte_ref_iter);
let mut filter_iter = filter_bom_and_cr_iter(& mut utf8_to_char_iter);
// Only the first BOM is dropped; the second one is passed through.
assert_eq!(Some('\u{FEFF}'), filter_iter.next());
// "\r\n" collapses to a single '\n'.
assert_eq!(Some('\n'), filter_iter.next()); assert_eq!(Some('A'), filter_iter.next());
// "\r\r" gives two '\n'.
assert_eq!(Some('\n'), filter_iter.next()); assert_eq!(Some('\n'), filter_iter.next());
assert_eq!(Some('B'), filter_iter.next());
// A lone '\r' gives '\n'.
assert_eq!(Some('\n'), filter_iter.next()); assert_eq!(Some('C'), filter_iter.next());
assert_eq!(Some('D'), filter_iter.next());
// "\r\n\n" gives two '\n'.
assert_eq!(Some('\n'), filter_iter.next()); assert_eq!(Some('\n'), filter_iter.next()); assert_eq!(Some('E'), filter_iter.next());
assert_eq!(Some('F'), filter_iter.next());
// "\r\n\r\n" gives two '\n'.
assert_eq!(Some('\n'), filter_iter.next()); assert_eq!(Some('\n'), filter_iter.next()); assert_eq!(Some('G'), filter_iter.next());
// "\n\r" gives two '\n', then the stream ends.
assert_eq!(Some('\n'), filter_iter.next()); assert_eq!(Some('\n'), filter_iter.next()); assert_eq!(Option::None, filter_iter.next());
}
// Encodes one `char` to UTF-8 with `FromUnicode::char_to_utf8`, decodes the
// bytes with `FromUtf8::utf8_to_char`, and expects the same `char` back.
fn round_trip_parsing1(char_val: char) {
let char_box: [char; 1] = [char_val; 1];
let mut utf8_box: [u8; 4] = [0; 4];
let mut utf8_len: usize = 0;
let mut char_ref = & char_box[..];
let mut utf32_parser = FromUnicode::new();
// Collect output bytes until the encoder reports that it needs more input.
loop {
match utf32_parser.char_to_utf8(char_ref) {
Result::Ok((char_pos, b)) => {
// Encoding U+FFFD itself is expected to set the invalid-sequence flag.
if char_val == char::REPLACEMENT_CHARACTER {
assert_eq!(true, utf32_parser.has_invalid_sequence());
}
utf8_box[utf8_len] = b;
utf8_len += 1;
char_ref = char_pos;
}
Result::Err(MoreEnum::More(_)) => {
break;
}
}
}
let mut utf8_ref = & utf8_box[0 .. utf8_len];
let mut char_box2: [char; 1] = [char::MAX; 1];
let mut char_len: usize = 0;
let mut utf8_parser = FromUtf8::new();
// Decode until the decoder reports that it needs more input.
loop {
match utf8_parser.utf8_to_char(utf8_ref) {
Result::Ok((utf8_pos, ch)) => {
// Decoding an encoded U+FFFD is expected to set the invalid-sequence flag.
if char_val == char::REPLACEMENT_CHARACTER {
assert_eq!(true, utf8_parser.has_invalid_sequence());
}
char_box2[char_len] = ch;
char_len += 1;
utf8_ref = utf8_pos;
}
Result::Err(MoreEnum::More(_)) => {
break;
}
}
}
assert_eq!(1, char_len);
assert_eq!(char_val, char_box2[0]);
}
// Same round trip as `round_trip_parsing1`, using the `u32` entry points
// `utf32_to_utf8` and `utf8_to_utf32`.
fn round_trip_parsing2(code_val: u32) {
let utf32_box: [u32; 1] = [code_val; 1];
let mut utf8_box: [u8; 4] = [0; 4];
let mut utf8_len: usize = 0;
let mut utf32_ref = & utf32_box[..];
let mut utf32_parser = FromUnicode::new();
// Collect output bytes until the encoder reports that it needs more input.
loop {
match utf32_parser.utf32_to_utf8(utf32_ref) {
Result::Ok((utf32_pos, b)) => {
// Encoding U+FFFD itself is expected to set the invalid-sequence flag.
if code_val == REPLACE_UTF32 {
assert_eq!(true, utf32_parser.has_invalid_sequence());
}
utf8_box[utf8_len] = b;
utf8_len += 1;
utf32_ref = utf32_pos;
}
Result::Err(MoreEnum::More(_)) => {
break;
}
}
}
let mut utf8_ref = & utf8_box[0 .. utf8_len];
let mut utf32_box2: [u32; 1] = [0; 1];
let mut utf32_len: usize = 0;
let mut utf8_parser = FromUtf8::new();
// Decode until the decoder reports that it needs more input.
loop {
match utf8_parser.utf8_to_utf32(utf8_ref) {
Result::Ok((utf8_pos, co)) => {
// Decoding an encoded U+FFFD is expected to set the invalid-sequence flag.
if code_val == REPLACE_UTF32 {
assert_eq!(true, utf8_parser.has_invalid_sequence());
}
utf32_box2[utf32_len] = co;
utf32_len += 1;
utf8_ref = utf8_pos;
}
Result::Err(MoreEnum::More(_)) => {
break;
}
}
}
assert_eq!(1, utf32_len);
assert_eq!(code_val, utf32_box2[0]);
}
// Runs both round-trip helpers for every code point from 0 up to 0x10FFFF,
// skipping the surrogate range 0xD800..0xE000.
#[test]
pub fn test_round_trip_parsing() {
let mut code:u32 = 0;
loop {
let ch = char::from_u32(code).unwrap();
round_trip_parsing1(ch);
round_trip_parsing2(code);
code += 1;
// Jump over the surrogates, which `char::from_u32` would reject.
if code == 0xD800 {
code = 0xE000; }
if code == 0x110000 {
break;
}
}
}
}
pub mod buf;