Trait encode_unicode::IterExt

pub trait IterExt: Iterator + Sized {
    fn to_bytes(self) -> Utf8CharSplitter<Self::Item, Self>
    where
        Self::Item: Borrow<Utf8Char>
;
fn to_units(self) -> Utf16CharSplitter<Self::Item, Self>
    where
        Self::Item: Borrow<Utf16Char>
;
fn to_utf8chars(self) -> Utf8CharMerger<Self::Item, Self>
    where
        Self::Item: Borrow<u8>
;
fn to_utf16chars(self) -> Utf16CharMerger<Self::Item, Self>
    where
        Self::Item: Borrow<u16>
; }

Iterator methods that convert between u8s and Utf8Chars, or between u16s and Utf16Chars.

All the iterator adapters also accept iterators that produce references to the type they convert from.

Required methods

Important traits for Utf8CharSplitter<U, I>
fn to_bytes(self) -> Utf8CharSplitter<Self::Item, Self> where
    Self::Item: Borrow<Utf8Char>, 

Converts an iterator of Utf8Chars or &Utf8Chars to an iterator of u8s.

Has the same effect as .flat_map() or .flatten(), but the returned iterator is ~40% faster.

The iterator also implements Read (when the std feature isn't disabled).
Reading will never produce an error, and calls to .read() and .next() can be mixed.

The exact number of bytes cannot be known in advance, but size_hint() gives the possible range. (min: all remaining characters are ASCII, max: all require four bytes)

Examples

From iterator of values:

use encode_unicode::{IterExt, StrExt};

let iterator = "foo".utf8chars();
let mut bytes = [0; 4];
for (u,dst) in iterator.to_bytes().zip(&mut bytes) {*dst=u;}
assert_eq!(&bytes, b"foo\0");

From iterator of references:

use encode_unicode::{IterExt, StrExt, Utf8Char};

let chars: Vec<Utf8Char> = "💣 bomb 💣".utf8chars().collect();
let bytes: Vec<u8> = chars.iter().to_bytes().collect();
let flat_map: Vec<u8> = chars.iter().flat_map(|u8c| *u8c ).collect();
assert_eq!(bytes, flat_map);

Reading from it:

use encode_unicode::{IterExt, StrExt};
use std::io::Read;

let s = "Ååh‽";
assert_eq!(s.len(), 8);
let mut buf = [b'E'; 9];
let mut reader = s.utf8chars().to_bytes();
assert_eq!(reader.read(&mut buf[..]).unwrap(), 8);
assert_eq!(reader.read(&mut buf[..]).unwrap(), 0);
assert_eq!(&buf[..8], s.as_bytes());
assert_eq!(buf[8], b'E');

Important traits for Utf16CharSplitter<U, I>
fn to_units(self) -> Utf16CharSplitter<Self::Item, Self> where
    Self::Item: Borrow<Utf16Char>, 

Converts an iterator of Utf16Char (or &Utf16Char) to an iterator of u16s.

Has the same effect as .flat_map() or .flatten(), but the returned iterator is about twice as fast.

The exact number of units cannot be known in advance, but size_hint() gives the possible range.

Examples

From iterator of values:

use encode_unicode::{IterExt, StrExt};

let iterator = "foo".utf16chars();
let mut units = [0; 4];
for (u,dst) in iterator.to_units().zip(&mut units) {*dst=u;}

assert_eq!(units, ['f' as u16, 'o' as u16, 'o' as u16, 0]);

From iterator of references:

use encode_unicode::{IterExt, StrExt, Utf16Char};

// (💣 takes two units)
let chars: Vec<Utf16Char> = "💣 bomb 💣".utf16chars().collect();
let units: Vec<u16> = chars.iter().to_units().collect();
let flat_map: Vec<u16> = chars.iter().flat_map(|u16c| *u16c ).collect();

assert_eq!(units, flat_map);

Important traits for Utf8CharMerger<B, I>
fn to_utf8chars(self) -> Utf8CharMerger<Self::Item, Self> where
    Self::Item: Borrow<u8>, 

Decodes bytes as UTF-8 and groups them into Utf8Chars.

When errors (invalid values or sequences) are encountered, it continues with the byte right after the start of the error sequence.
This is neither the most intelligent choice (sometimes it is guaranteed to produce another error), nor the easiest to implement, but I believe it to be the most predictable. It also means that ASCII characters are never hidden by errors.

Examples

Replace all errors with U+FFFD REPLACEMENT CHARACTER:

use encode_unicode::{Utf8Char, IterExt};

let mut buf = [b'\0'; 255];
let len = b"foo\xCFbar".iter()
    .to_utf8chars()
    .flat_map(|r| r.unwrap_or(Utf8Char::from('\u{FFFD}')).into_iter() )
    .zip(&mut buf[..])
    .map(|(byte, dst)| *dst = byte )
    .count();

assert_eq!(&buf[..len], "foo\u{FFFD}bar".as_bytes());

Collect everything up until the first error into a string:

use encode_unicode::iterator::Utf8CharMerger;
let mut good = String::new();
for r in Utf8CharMerger::from(b"foo\xcc\xbbbar\xcc\xddbaz") {
    if let Ok(uc) = r {
        good.push_str(uc.as_str());
    } else {
        break;
    }
}
assert_eq!(good, "foo̻bar");

Abort decoding on error:

use encode_unicode::{IterExt, Utf8Char};
use encode_unicode::error::{InvalidUtf8Slice, InvalidUtf8};

let result = b"ab\0\xe0\xbc\xa9 \xf3\x80\x77".iter()
    .to_utf8chars()
    .collect::<Result<String,InvalidUtf8Slice>>();

assert_eq!(result, Err(InvalidUtf8Slice::Utf8(InvalidUtf8::NotAContinuationByte(2))));

Important traits for Utf16CharMerger<B, I>
fn to_utf16chars(self) -> Utf16CharMerger<Self::Item, Self> where
    Self::Item: Borrow<u16>, 

Decodes u16 units as UTF-16 and groups them into Utf16Chars.

When errors (unmatched leading surrogates or unexpected trailing surrogates) are encountered, an error is produced for every unit.

Examples

Replace errors with '�':

use encode_unicode::{IterExt, Utf16Char};

let slice = &['a' as u16, 0xdf00, 0xd83c, 0xdca0][..];
let string = slice.iter()
    .to_utf16chars()
    .map(|r| r.unwrap_or(Utf16Char::from('\u{fffd}')) ) // REPLACEMENT_CHARACTER
    .collect::<String>();

assert_eq!(string, "a�🂠");

Inspect the individual results, including the error produced for each invalid unit:

use encode_unicode::{IterExt, Utf16Char};
use encode_unicode::error::Utf16PairError::*;

let slice = [0xdcba, 0xdeff, 0xd8be, 0xdeee, 'Y' as u16, 0xdab1, 0xdab1];
let mut iter = slice.iter().to_utf16chars();
assert_eq!(iter.size_hint(), (3, Some(7)));
assert_eq!(iter.next(), Some(Err(UnexpectedTrailingSurrogate)));
assert_eq!(iter.next(), Some(Err(UnexpectedTrailingSurrogate)));
assert_eq!(iter.next(), Some(Ok(Utf16Char::from('\u{3faee}'))));
assert_eq!(iter.next(), Some(Ok(Utf16Char::from('Y'))));
assert_eq!(iter.next(), Some(Err(UnmatchedLeadingSurrogate)));
assert_eq!(iter.next(), Some(Err(Incomplete)));
assert_eq!(iter.into_remaining_units().next(), None);

Search for a codepoint and return the codepoint index of the first match:

use encode_unicode::{IterExt, Utf16Char};

let position = [0xd875, 0xdd4f, '≈' as u16, '2' as u16].iter()
    .to_utf16chars()
    .position(|r| r == Ok(Utf16Char::from('≈')) );

assert_eq!(position, Some(1));
Loading content...

Implementors

impl<I: Iterator> IterExt for I[src]

Loading content...