Trait encode_unicode::SliceExt

source · [−]

pub trait SliceExt: Index<RangeFull> {
    fn utf8char_indices(&self) -> Utf8CharDecoder<'_> // Notable traits: impl<'a> Iterator for Utf8CharDecoder<'a>, Item = (usize, Result<Utf8Char, Utf8Error>, usize)
    where
        Self::Output: Borrow<[u8]>;
    fn utf16char_indices(&self) -> Utf16CharDecoder<'_> // Notable traits: impl<'a> Iterator for Utf16CharDecoder<'a>, Item = (usize, Result<Utf16Char, Utf16PairError>, usize)
    where
        Self::Output: Borrow<[u16]>;
}

Expand description

Methods for iterating over u8 and u16 slices as UTF-8 or UTF-16 characters.

The iterators are slightly faster than the similar methods in IterExt because they can “push back” items for free after errors and don’t need a separate buffer that must be checked on every call to .next().

Required Methods

source

fn utf8char_indices(&self) -> Utf8CharDecoder<'_>
where
    Self::Output: Borrow<[u8]>,

Notable traits for Utf8CharDecoder<'a>: `impl<'a> Iterator for Utf8CharDecoder<'a>`, with `type Item = (usize, Result<Utf8Char, Utf8Error>, usize);`

Decode u8 slices as UTF-8 and iterate over the codepoints as Utf8Chars, together with the offset and length of each codepoint or error.

Examples

Get the index and error type of the first error:

use encode_unicode::{SliceExt, Utf8Char, error::Utf8ErrorKind};

let slice = b"ab\0\xe0\xbc\xa9 \xf3\x80\x77";
let result = slice.utf8char_indices()
   .map(|(offset,r,length)| r.map_err(|e| (offset,e.kind(),length) ) )
   .collect::<Result<String,(usize,Utf8ErrorKind,usize)>>();

assert_eq!(result, Err((7, Utf8ErrorKind::TooFewBytes, 1)));

use encode_unicode::{SliceExt, Utf8Char};
use std::error::Error;

let slice = b"\xf0\xbf\xbf\xbfXY\xdd\xbb\xe1\x80\x99quux123";
let mut fixed_size = [Utf8Char::default(); 8];
for (cp_i, (byte_index, r, _)) in slice.utf8char_indices().enumerate().take(8) {
    match r {
        Ok(u8c) => fixed_size[cp_i] = u8c,
        Err(e) => panic!("Invalid codepoint at index {} ({})", cp_i, e),
    }
}
let chars = ['\u{3ffff}', 'X', 'Y', '\u{77b}', '\u{1019}', 'q', 'u', 'u'];
assert_eq!(fixed_size, chars);

use encode_unicode::{SliceExt, Utf8Char, error::Utf8ErrorKind};

let bytes = b"\xfa-\xf4\x8f\xee\xa1\x8f-\xed\xa9\x87\xf0\xcc\xbb";
let mut errors = Vec::new();
let mut lengths = Vec::new();
let mut string = String::new();
for (offset,result,length) in bytes.utf8char_indices() {
   lengths.push((offset,length));
   let c = result.unwrap_or_else(|error| {
       errors.push((offset, error.kind()));
       Utf8Char::from('\u{fffd}') // replacement character
   });
   string.push_str(c.as_str());
}

assert_eq!(string, "�-��\u{e84f}-����\u{33b}");
assert_eq!(lengths, [(0,1), (1,1), (2,1), (3,1), (4,3), (7,1),
                    (8,1), (9,1), (10,1), (11,1), (12,2)]);
assert_eq!(errors, [
   ( 0, Utf8ErrorKind::NonUtf8Byte),
   ( 2, Utf8ErrorKind::InterruptedSequence),
   ( 3, Utf8ErrorKind::UnexpectedContinuationByte),
   ( 8, Utf8ErrorKind::Utf16ReservedCodepoint),
   ( 9, Utf8ErrorKind::UnexpectedContinuationByte),
   (10, Utf8ErrorKind::UnexpectedContinuationByte),
   (11, Utf8ErrorKind::TooFewBytes), // (but it was not the last element returned!)
]);

source

fn utf16char_indices(&self) -> Utf16CharDecoder<'_>
where
    Self::Output: Borrow<[u16]>,

Notable traits for Utf16CharDecoder<'a>: `impl<'a> Iterator for Utf16CharDecoder<'a>`, with `type Item = (usize, Result<Utf16Char, Utf16PairError>, usize);`

Decode u16 slices as UTF-16 and iterate over the codepoints as Utf16Chars.

The iterator produces (usize,Result<Utf16Char,Utf16PairError>,usize), and the slice is validated as you go.

The first usize contains the offset from the start of the slice and the last usize contains the length of the codepoint or error. The length is either 1 or 2, and always 1 for errors.

Examples

use encode_unicode::{SliceExt, Utf8Char};

let slice = &['a' as u16, 0xdf00, 0xd83c, 0xdca0][..];
let mut errors = Vec::new();
let string = slice.utf16char_indices().map(|(offset,r,_)| match r {
   Ok(u16c) => Utf8Char::from(u16c),
   Err(_) => {
       errors.push(offset);
       Utf8Char::from('\u{fffd}') // REPLACEMENT_CHARACTER
   }
}).collect::<String>();

assert_eq!(string, "a�🂠");
assert_eq!(errors, [1]);

Search for a codepoint and return its unit and codepoint index.

use encode_unicode::{SliceExt, Utf16Char};

let slice = [0xd875,/*'𝕏'*/ 0xdd4f, '≈' as u16, '2' as u16];
let position = slice.utf16char_indices()
    .enumerate()
    .find(|&(_,(_,r,_))| r == Ok(Utf16Char::from('≈')) )
    .map(|(codepoint, (offset, _, _))| (codepoint, offset) );

assert_eq!(position, Some((1,2)));

Error types:

use encode_unicode::{SliceExt, Utf16Char};
use encode_unicode::error::Utf16PairError::*;

let slice = [0xdcba, 0xdeff, 0xd8be, 0xdeee, 'λ' as u16, 0xdab1, 0xdab1];
let mut iter = slice.utf16char_indices();
assert_eq!(iter.next(), Some((0, Err(UnexpectedTrailingSurrogate), 1)));
assert_eq!(iter.next(), Some((1, Err(UnexpectedTrailingSurrogate), 1)));
assert_eq!(iter.next(), Some((2, Ok(Utf16Char::from('\u{3faee}')), 2)));
assert_eq!(iter.next(), Some((4, Ok(Utf16Char::from('λ')), 1)));
assert_eq!(iter.next(), Some((5, Err(UnmatchedLeadingSurrogate), 1)));
assert_eq!(iter.next(), Some((6, Err(Incomplete), 1)));
assert_eq!(iter.next(), None);
assert_eq!(iter.as_slice(), [])

Implementors

source

Trait encode_unicode::SliceExt

Required Methods

fn utf8char_indices(&self) -> Utf8CharDecoder<'_>ⓘNotable traits for Utf8CharDecoder<'a>impl<'a> Iterator for Utf8CharDecoder<'a> type Item = (usize, Result<Utf8Char, Utf8Error>, usize); where Self::Output: Borrow<[u8]>,

Examples

fn utf16char_indices(&self) -> Utf16CharDecoder<'_>ⓘNotable traits for Utf16CharDecoder<'a>impl<'a> Iterator for Utf16CharDecoder<'a> type Item = (usize, Result<Utf16Char, Utf16PairError>, usize); where Self::Output: Borrow<[u16]>,

Examples

Implementors

impl<S: ?Sized + Index<RangeFull>> SliceExt for S

fn utf8char_indices(&self) -> Utf8CharDecoder<'_>ⓘNotable traits for Utf8CharDecoder<'a>`impl<'a> Iterator for Utf8CharDecoder<'a> type Item = (usize, Result<Utf8Char, Utf8Error>, usize);` where
Self::Output: Borrow<[u8]>,

fn utf16char_indices(&self) -> Utf16CharDecoder<'_>ⓘNotable traits for Utf16CharDecoder<'a>`impl<'a> Iterator for Utf16CharDecoder<'a> type Item = (usize, Result<Utf16Char, Utf16PairError>, usize);` where
Self::Output: Borrow<[u16]>,