sp-ropey 0.2.0 - Docs.rs

//! Utility functions for utf8 string slices.
//!
//! This module provides various utility functions that operate on string
//! slices in ways compatible with Ropey.  They may be useful when building
//! additional functionality on top of Ropey.

/// Converts from byte-index to char-index in a string slice.
///
/// If the byte is in the middle of a multi-byte char, returns the index of
/// the char that the byte belongs to.
///
/// Any past-the-end index will return the one-past-the-end char index.
#[inline]
pub fn byte_to_char_idx(text: &str, byte_idx: usize) -> usize {
    let count = count_chars_in_bytes(&text.as_bytes()[0..(byte_idx + 1).min(text.len())]);
    if byte_idx < text.len() {
        count - 1
    } else {
        count
    }
}

/// Converts from byte-index to line-index in a string slice.
///
/// This is equivalent to counting the line endings before the given byte.
///
/// Any past-the-end index will return the last line index.
#[inline]
pub fn byte_to_line_idx(text: &str, byte_idx: usize) -> usize {
    use crate::crlf;
    let mut byte_idx = byte_idx.min(text.len());
    while !text.is_char_boundary(byte_idx) {
        byte_idx -= 1;
    }
    let nl_count = count_line_breaks(&text[..byte_idx]);
    if crlf::is_break(byte_idx, text.as_bytes()) {
        nl_count
    } else {
        nl_count - 1
    }
}

/// Converts from char-index to byte-index in a string slice.
///
/// Any past-the-end index will return the one-past-the-end byte index.
#[inline]
pub fn char_to_byte_idx(text: &str, char_idx: usize) -> usize {
    // Fallback for non-sse2 platforms.
    char_to_byte_idx_inner::<usize>(text, char_idx)
}

#[inline(always)]
fn char_to_byte_idx_inner<T: ByteChunk>(text: &str, char_idx: usize) -> usize {
    // Get `middle` so we can do more efficient chunk-based counting.
    // We can't use this to get `end`, however, because the start index of
    // `end` actually depends on the accumulating char counts during the
    // counting process.
    let (start, middle, _) = unsafe { text.as_bytes().align_to::<T>() };

    let mut byte_count = 0;
    let mut char_count = 0;

    // Take care of any unaligned bytes at the beginning.
    let mut i = 0;
    while i < start.len() && char_count <= char_idx {
        char_count += ((start[i] & 0xC0) != 0x80) as usize;
        i += 1;
    }
    byte_count += i;

    // Use chunks to count multiple bytes at once, using bit-fiddling magic.
    let mut i = 0;
    let mut acc = T::splat(0);
    let mut acc_i = 0;
    while i < middle.len() && (char_count + (T::size() * (acc_i + 1))) <= char_idx {
        acc = acc.add(middle[i].bitand(T::splat(0xc0)).cmp_eq_byte(0x80));
        acc_i += 1;
        if acc_i == T::max_acc() || (char_count + (T::size() * (acc_i + 1))) >= char_idx {
            char_count += (T::size() * acc_i) - acc.sum_bytes();
            acc_i = 0;
            acc = T::splat(0);
        }
        i += 1;
    }
    char_count += (T::size() * acc_i) - acc.sum_bytes();
    byte_count += i * T::size();

    // Take care of any unaligned bytes at the end.
    let end = &text.as_bytes()[byte_count..];
    let mut i = 0;
    while i < end.len() && char_count <= char_idx {
        char_count += ((end[i] & 0xC0) != 0x80) as usize;
        i += 1;
    }
    byte_count += i;

    // Finish up
    if byte_count == text.len() && char_count <= char_idx {
        byte_count
    } else {
        byte_count - 1
    }
}

/// Converts from char-index to line-index in a string slice.
///
/// This is equivalent to counting the line endings before the given char.
///
/// Any past-the-end index will return the last line index.
#[inline]
pub fn char_to_line_idx(text: &str, char_idx: usize) -> usize {
    byte_to_line_idx(text, char_to_byte_idx(text, char_idx))
}

/// Converts from line-index to byte-index in a string slice.
///
/// More specifically, this returns the index of the first byte of the given
/// line.
///
/// Any past-the-end index will return the one-past-the-end byte index.
#[inline]
pub fn line_to_byte_idx(text: &str, line_idx: usize) -> usize {
    // Fallback for non-sse2 platforms.
    line_to_byte_idx_inner::<usize>(text, line_idx)
}

#[inline(always)]
fn line_to_byte_idx_inner<T: ByteChunk>(text: &str, line_idx: usize) -> usize {
    let mut bytes = text.as_bytes();
    let mut line_break_count = 0;

    // Handle unaligned bytes at the start.
    let aligned_idx = alignment_diff::<T>(bytes);
    if aligned_idx > 0 {
        let result = count_line_breaks_up_to(bytes, aligned_idx, line_idx);
        line_break_count += result.0;
        bytes = &bytes[result.1..];
    }

    // Count line breaks in big chunks.
    if alignment_diff::<T>(bytes) == 0 {
        while bytes.len() >= T::size() {
            // Unsafe because the called function depends on correct alignment.
            let tmp = unsafe { count_line_breaks_in_chunk_from_ptr::<T>(bytes) }.sum_bytes();
            if tmp + line_break_count >= line_idx {
                break;
            }
            line_break_count += tmp;

            bytes = &bytes[T::size()..];
        }
    }

    // Handle unaligned bytes at the end.
    let result = count_line_breaks_up_to(bytes, bytes.len(), line_idx - line_break_count);
    bytes = &bytes[result.1..];

    // Finish up
    let mut byte_idx = text.len() - bytes.len();
    while !text.is_char_boundary(byte_idx) {
        byte_idx += 1;
    }
    byte_idx
}

/// Converts from line-index to char-index in a string slice.
///
/// More specifically, this returns the index of the first char of the given
/// line.
///
/// Any past-the-end index will return the one-past-the-end char index.
#[inline]
pub fn line_to_char_idx(text: &str, line_idx: usize) -> usize {
    byte_to_char_idx(text, line_to_byte_idx(text, line_idx))
}

// /// Counts the utf16 surrogate pairs that would be in `text` if it were encoded
// /// as utf16.
// pub(crate) fn count_utf16_surrogates_slow(text: &str) -> usize {
//     let mut utf16_surrogate_count = 0;
//
//     for byte in text.bytes() {
//         utf16_surrogate_count += ((byte & 0xf0) == 0xf0) as usize;
//     }
//
//     utf16_surrogate_count
// }

/// Counts the utf16 surrogate pairs that would be in `text` if it were encoded
/// as utf16.
#[inline]
pub(crate) fn count_utf16_surrogates(text: &str) -> usize {
    count_utf16_surrogates_in_bytes(text.as_bytes())
}

#[inline]
pub(crate) fn count_utf16_surrogates_in_bytes(text: &[u8]) -> usize {
    // Fallback for non-sse2 platforms.
    count_utf16_surrogates_internal::<usize>(text)
}

#[inline(always)]
fn count_utf16_surrogates_internal<T: ByteChunk>(text: &[u8]) -> usize {
    // Get `middle` for more efficient chunk-based counting.
    let (start, middle, end) = unsafe { text.align_to::<T>() };

    let mut utf16_surrogate_count = 0;

    // Take care of unaligned bytes at the beginning.
    for byte in start.iter() {
        utf16_surrogate_count += ((byte & 0xf0) == 0xf0) as usize;
    }

    // Take care of the middle bytes in big chunks.
    let mut i = 0;
    let mut acc = T::splat(0);
    for chunk in middle.iter() {
        let tmp = chunk.bitand(T::splat(0xf0)).cmp_eq_byte(0xf0);
        acc = acc.add(tmp);
        i += 1;
        if i == T::max_acc() {
            i = 0;
            utf16_surrogate_count += acc.sum_bytes();
            acc = T::splat(0);
        }
    }
    utf16_surrogate_count += acc.sum_bytes();

    // Take care of unaligned bytes at the end.
    for byte in end.iter() {
        utf16_surrogate_count += ((byte & 0xf0) == 0xf0) as usize;
    }

    utf16_surrogate_count
}

#[inline(always)]
pub(crate) fn byte_to_utf16_surrogate_idx(text: &str, byte_idx: usize) -> usize {
    count_utf16_surrogates(&text[..byte_idx])
}

#[inline(always)]
pub(crate) fn utf16_code_unit_to_char_idx(text: &str, utf16_idx: usize) -> usize {
    // TODO: optimized version.  This is pretty slow.  It isn't expected to be
    // used in performance critical functionality, so this isn't urgent.  But
    // might as well make it faster when we get the chance.
    let mut char_i = 0;
    let mut utf16_i = 0;
    for c in text.chars() {
        if utf16_idx <= utf16_i {
            break;
        }
        char_i += 1;
        utf16_i += c.len_utf16();
    }

    if utf16_idx < utf16_i {
        char_i -= 1;
    }

    char_i
}

//===========================================================================
// Internal
//===========================================================================

/// Returns the byte position just after the second-to-last line break
/// in `text`, or zero of there is no second-to-last line break.
///
/// This function is narrow in scope, only being used for iterating
/// backwards over the lines of a `str`.
pub(crate) fn prev_line_end_char_idx(text: &str) -> usize {
    let mut itr = text.bytes().enumerate().rev();

    let first_byte = if let Some((_, byte)) = itr.next() {
        byte
    } else {
        return 0;
    };

    while let Some((idx, byte)) = itr.next() {
        match byte {
            0x0A | 0x0B | 0x0C => {
                return idx + 1;
            }
            0x0D => {
                if first_byte != 0x0A {
                    return idx + 1;
                }
            }
            0x85 => {
                if let Some((_, 0xC2)) = itr.next() {
                    return idx + 1;
                }
            }
            0xA8 | 0xA9 => {
                if let Some((_, 0x80)) = itr.next() {
                    if let Some((_, 0xE2)) = itr.next() {
                        return idx + 1;
                    }
                }
            }
            _ => {}
        }
    }

    return 0;
}

/// Returns whether the given string ends in a line break or not.
#[inline]
pub(crate) fn ends_with_line_break(text: &str) -> bool {
    if text.is_empty() {
        return false;
    }
    // Find the starting boundary of the last codepoint.
    let mut i = text.len() - 1;
    while !text.is_char_boundary(i) {
        i -= 1;
    }

    // Check if the last codepoint is a line break.
    match &text[i..] {
        "\u{000A}" | "\u{000B}" | "\u{000C}" | "\u{000D}" | "\u{0085}" | "\u{2028}"
        | "\u{2029}" => true,
        _ => false,
    }
}

/// Uses bit-fiddling magic to count utf8 chars really quickly.
/// We actually count the number of non-starting utf8 bytes, since
/// they have a consistent starting two-bit pattern.  We then
/// subtract from the byte length of the text to get the final
/// count.
#[inline]
pub(crate) fn count_chars(text: &str) -> usize {
    count_chars_in_bytes(text.as_bytes())
}

#[inline]
pub(crate) fn count_chars_in_bytes(text: &[u8]) -> usize {
    // Fallback for non-sse2 platforms.
    count_chars_internal::<usize>(text)
}

#[inline(always)]
fn count_chars_internal<T: ByteChunk>(text: &[u8]) -> usize {
    // Get `middle` for more efficient chunk-based counting.
    let (start, middle, end) = unsafe { text.align_to::<T>() };

    let mut inv_count = 0;

    // Take care of unaligned bytes at the beginning.
    for byte in start.iter() {
        inv_count += ((byte & 0xC0) == 0x80) as usize;
    }

    // Take care of the middle bytes in big chunks.
    let mut i = 0;
    let mut acc = T::splat(0);
    for chunk in middle.iter() {
        let tmp = chunk.bitand(T::splat(0xc0)).cmp_eq_byte(0x80);
        acc = acc.add(tmp);
        i += 1;
        if i == T::max_acc() {
            i = 0;
            inv_count += acc.sum_bytes();
            acc = T::splat(0);
        }
    }
    inv_count += acc.sum_bytes();

    // Take care of unaligned bytes at the end.
    for byte in end.iter() {
        inv_count += ((byte & 0xC0) == 0x80) as usize;
    }

    text.len() - inv_count
}

/// Uses bit-fiddling magic to count line breaks really quickly.
///
/// The following unicode sequences are considered newlines by this function:
/// - u{000A}        (Line Feed)
/// - u{000B}        (Vertical Tab)
/// - u{000C}        (Form Feed)
/// - u{000D}        (Carriage Return)
/// - u{000D}u{000A} (Carriage Return + Line Feed)
/// - u{0085}        (Next Line)
/// - u{2028}        (Line Separator)
/// - u{2029}        (Paragraph Separator)
#[inline]
pub(crate) fn count_line_breaks(text: &str) -> usize {
    // Fallback for non-sse2 platforms.
    count_line_breaks_internal::<usize>(text)
}

#[inline(always)]
fn count_line_breaks_internal<T: ByteChunk>(text: &str) -> usize {
    let mut bytes = text.as_bytes();
    let mut count = 0;

    // Handle unaligned bytes at the start.
    let aligned_idx = alignment_diff::<T>(bytes);
    if aligned_idx > 0 {
        let result = count_line_breaks_up_to(bytes, aligned_idx, bytes.len());
        count += result.0;
        bytes = &bytes[result.1..];
    }

    // Count line breaks in big chunks.
    let mut i = 0;
    let mut acc = T::splat(0);
    while bytes.len() >= T::size() {
        // Unsafe because the called function depends on correct alignment.
        acc = acc.add(unsafe { count_line_breaks_in_chunk_from_ptr::<T>(bytes) });
        i += 1;
        if i == T::max_acc() {
            i = 0;
            count += acc.sum_bytes();
            acc = T::splat(0);
        }
        bytes = &bytes[T::size()..];
    }
    count += acc.sum_bytes();

    // Handle unaligned bytes at the end.
    count += count_line_breaks_up_to(bytes, bytes.len(), bytes.len()).0;

    count
}

/// Used internally in the line-break counting functions.
///
/// Counts line breaks a byte at a time up to a maximum number of bytes and
/// line breaks, and returns the counted lines and how many bytes were processed.
#[inline(always)]
#[allow(clippy::if_same_then_else)]
fn count_line_breaks_up_to(bytes: &[u8], max_bytes: usize, max_breaks: usize) -> (usize, usize) {
    let mut ptr = 0;
    let mut count = 0;
    while ptr < max_bytes && count < max_breaks {
        let byte = bytes[ptr];

        // Handle u{000A}, u{000B}, u{000C}, and u{000D}
        if (byte <= 0x0D) && (byte >= 0x0A) {
            count += 1;

            // Check for CRLF and and subtract 1 if it is,
            // since it will be caught in the next iteration
            // with the LF.
            if byte == 0x0D && (ptr + 1) < bytes.len() && bytes[ptr + 1] == 0x0A {
                count -= 1;
            }
        }
        // Handle u{0085}
        else if byte == 0xC2 && (ptr + 1) < bytes.len() && bytes[ptr + 1] == 0x85 {
            count += 1;
        }
        // Handle u{2028} and u{2029}
        else if byte == 0xE2
            && (ptr + 2) < bytes.len()
            && bytes[ptr + 1] == 0x80
            && (bytes[ptr + 2] >> 1) == 0x54
        {
            count += 1;
        }

        ptr += 1;
    }

    (count, ptr)
}

/// Used internally in the line-break counting functions.
///
/// The start of `bytes` MUST be aligned as type T, and `bytes` MUST be at
/// least as large (in bytes) as T.  If these invariants are not met, bad
/// things could potentially happen.  Hence why this function is unsafe.
#[inline(always)]
unsafe fn count_line_breaks_in_chunk_from_ptr<T: ByteChunk>(bytes: &[u8]) -> T {
    let c = {
        // The only unsafe bits of the function are in this block.
        debug_assert_eq!(bytes.align_to::<T>().0.len(), 0);
        debug_assert!(bytes.len() >= T::size());
        // This unsafe cast is for performance reasons: going through e.g.
        // `align_to()` results in a significant drop in performance.
        *(bytes.as_ptr() as *const T)
    };
    let end_i = T::size();

    let mut acc = T::splat(0);

    // Calculate the flags we're going to be working with.
    let nl_1_flags = c.cmp_eq_byte(0xC2);
    let sp_1_flags = c.cmp_eq_byte(0xE2);
    let all_flags = c.bytes_between_127(0x09, 0x0E);
    let cr_flags = c.cmp_eq_byte(0x0D);

    // Next Line: u{0085}
    if !nl_1_flags.is_zero() {
        let nl_2_flags = c.cmp_eq_byte(0x85).shift_back_lex(1);
        let flags = nl_1_flags.bitand(nl_2_flags);
        acc = acc.add(flags);

        // Handle ending boundary
        if bytes.len() > end_i && bytes[end_i - 1] == 0xC2 && bytes[end_i] == 0x85 {
            acc = acc.inc_nth_from_end_lex_byte(0);
        }
    }

    // Line Separator:      u{2028}
    // Paragraph Separator: u{2029}
    if !sp_1_flags.is_zero() {
        let sp_2_flags = c.cmp_eq_byte(0x80).shift_back_lex(1).bitand(sp_1_flags);
        if !sp_2_flags.is_zero() {
            let sp_3_flags = c
                .shr(1)
                .bitand(T::splat(!0x80))
                .cmp_eq_byte(0x54)
                .shift_back_lex(2);
            let sp_flags = sp_2_flags.bitand(sp_3_flags);
            acc = acc.add(sp_flags);
        }

        // Handle ending boundary
        if bytes.len() > end_i
            && bytes[end_i - 2] == 0xE2
            && bytes[end_i - 1] == 0x80
            && (bytes[end_i] >> 1) == 0x54
        {
            acc = acc.inc_nth_from_end_lex_byte(1);
        } else if bytes.len() > (end_i + 1)
            && bytes[end_i - 1] == 0xE2
            && bytes[end_i] == 0x80
            && (bytes[end_i + 1] >> 1) == 0x54
        {
            acc = acc.inc_nth_from_end_lex_byte(0);
        }
    }

    // Line Feed:                   u{000A}
    // Vertical Tab:                u{000B}
    // Form Feed:                   u{000C}
    // Carriage Return:             u{000D}
    // Carriage Return + Line Feed: u{000D}u{000A}
    acc = acc.add(all_flags);
    if !cr_flags.is_zero() {
        // Handle CRLF
        let lf_flags = c.cmp_eq_byte(0x0A);
        let crlf_flags = cr_flags.bitand(lf_flags.shift_back_lex(1));
        acc = acc.sub(crlf_flags);
        if bytes.len() > end_i && bytes[end_i - 1] == 0x0D && bytes[end_i] == 0x0A {
            acc = acc.dec_last_lex_byte();
        }
    }

    acc
}

/// Returns the alignment difference between the start of `bytes` and the
/// type `T`.
///
/// Or put differently: returns how many bytes into `bytes` you need to walk
/// to reach the alignment of `T` in memory.
///
/// Will return 0 if already aligned at the start, and will return the length
/// of `bytes` if alignment is beyond the end of `bytes`.
#[inline(always)]
fn alignment_diff<T>(bytes: &[u8]) -> usize {
    let alignment = core::mem::align_of::<T>();
    let ptr = bytes.as_ptr() as usize;
    (alignment - ((ptr - 1) & (alignment - 1)) - 1).min(bytes.len())
}

//======================================================================

/// Interface for working with chunks of bytes at a time, providing the
/// operations needed for the functionality in str_utils.
trait ByteChunk: Copy + Clone + core::fmt::Debug {
    /// Returns the size of the chunk in bytes.
    fn size() -> usize;

    /// Returns the maximum number of iterations the chunk can accumulate
    /// before sum_bytes() becomes inaccurate.
    fn max_acc() -> usize;

    /// Creates a new chunk with all bytes set to n.
    fn splat(n: u8) -> Self;

    /// Returns whether all bytes are zero or not.
    fn is_zero(&self) -> bool;

    /// Shifts bytes back lexographically by n bytes.
    fn shift_back_lex(&self, n: usize) -> Self;

    /// Shifts bits to the right by n bits.
    fn shr(&self, n: usize) -> Self;

    /// Compares bytes for equality with the given byte.
    ///
    /// Bytes that are equal are set to 1, bytes that are not
    /// are set to 0.
    fn cmp_eq_byte(&self, byte: u8) -> Self;

    /// Compares bytes to see if they're in the non-inclusive range (a, b),
    /// where a < b <= 127.
    ///
    /// Bytes in the range are set to 1, bytes not in the range are set to 0.
    fn bytes_between_127(&self, a: u8, b: u8) -> Self;

    /// Performs a bitwise and on two chunks.
    fn bitand(&self, other: Self) -> Self;

    /// Adds the bytes of two chunks together.
    fn add(&self, other: Self) -> Self;

    /// Subtracts other's bytes from this chunk.
    fn sub(&self, other: Self) -> Self;

    /// Increments the nth-from-last lexographic byte by 1.
    fn inc_nth_from_end_lex_byte(&self, n: usize) -> Self;

    /// Decrements the last lexographic byte by 1.
    fn dec_last_lex_byte(&self) -> Self;

    /// Returns the sum of all bytes in the chunk.
    fn sum_bytes(&self) -> usize;
}

impl ByteChunk for usize {
    #[inline(always)]
    fn size() -> usize {
        core::mem::size_of::<usize>()
    }

    #[inline(always)]
    fn max_acc() -> usize {
        (256 / core::mem::size_of::<usize>()) - 1
    }

    #[inline(always)]
    fn splat(n: u8) -> Self {
        const ONES: usize = core::usize::MAX / 0xFF;
        ONES * n as usize
    }

    #[inline(always)]
    fn is_zero(&self) -> bool {
        *self == 0
    }

    #[inline(always)]
    fn shift_back_lex(&self, n: usize) -> Self {
        if cfg!(target_endian = "little") {
            *self >> (n * 8)
        } else {
            *self << (n * 8)
        }
    }

    #[inline(always)]
    fn shr(&self, n: usize) -> Self {
        *self >> n
    }

    #[inline(always)]
    fn cmp_eq_byte(&self, byte: u8) -> Self {
        const ONES: usize = core::usize::MAX / 0xFF;
        const ONES_HIGH: usize = ONES << 7;
        let word = *self ^ (byte as usize * ONES);
        (!(((word & !ONES_HIGH) + !ONES_HIGH) | word) & ONES_HIGH) >> 7
    }

    #[inline(always)]
    fn bytes_between_127(&self, a: u8, b: u8) -> Self {
        const ONES: usize = core::usize::MAX / 0xFF;
        const ONES_HIGH: usize = ONES << 7;
        let tmp = *self & (ONES * 127);
        (((ONES * (127 + b as usize) - tmp) & !*self & (tmp + (ONES * (127 - a as usize))))
            & ONES_HIGH)
            >> 7
    }

    #[inline(always)]
    fn bitand(&self, other: Self) -> Self {
        *self & other
    }

    #[inline(always)]
    fn add(&self, other: Self) -> Self {
        *self + other
    }

    #[inline(always)]
    fn sub(&self, other: Self) -> Self {
        *self - other
    }

    #[inline(always)]
    fn inc_nth_from_end_lex_byte(&self, n: usize) -> Self {
        if cfg!(target_endian = "little") {
            *self + (1 << ((Self::size() - 1 - n) * 8))
        } else {
            *self + (1 << (n * 8))
        }
    }

    #[inline(always)]
    fn dec_last_lex_byte(&self) -> Self {
        if cfg!(target_endian = "little") {
            *self - (1 << ((Self::size() - 1) * 8))
        } else {
            *self - 1
        }
    }

    #[inline(always)]
    fn sum_bytes(&self) -> usize {
        const ONES: usize = core::usize::MAX / 0xFF;
        self.wrapping_mul(ONES) >> ((Self::size() - 1) * 8)
    }
}

/// An iterator that yields the byte indices of line breaks in a string.
/// A line break in this case is the point immediately *after* a newline
/// character.
///
/// The following unicode sequences are considered newlines by this function:
/// - u{000A}        (Line Feed)
/// - u{000B}        (Vertical Tab)
/// - u{000C}        (Form Feed)
/// - u{000D}        (Carriage Return)
/// - u{000D}u{000A} (Carriage Return + Line Feed)
/// - u{0085}        (Next Line)
/// - u{2028}        (Line Separator)
/// - u{2029}        (Paragraph Separator)
#[allow(unused)] // Used in tests, as reference solution.
struct LineBreakIter<'a> {
    byte_itr: alloc::str::Bytes<'a>,
    byte_idx: usize,
}

#[allow(unused)]
impl<'a> LineBreakIter<'a> {
    #[inline]
    fn new(text: &str) -> LineBreakIter {
        LineBreakIter {
            byte_itr: text.bytes(),
            byte_idx: 0,
        }
    }
}

impl<'a> Iterator for LineBreakIter<'a> {
    type Item = usize;

    #[inline]
    fn next(&mut self) -> Option<usize> {
        while let Some(byte) = self.byte_itr.next() {
            self.byte_idx += 1;
            // Handle u{000A}, u{000B}, u{000C}, and u{000D}
            if (byte <= 0x0D) && (byte >= 0x0A) {
                if byte == 0x0D {
                    // We're basically "peeking" here.
                    if let Some(0x0A) = self.byte_itr.clone().next() {
                        self.byte_itr.next();
                        self.byte_idx += 1;
                    }
                }
                return Some(self.byte_idx);
            }
            // Handle u{0085}
            else if byte == 0xC2 {
                self.byte_idx += 1;
                if let Some(0x85) = self.byte_itr.next() {
                    return Some(self.byte_idx);
                }
            }
            // Handle u{2028} and u{2029}
            else if byte == 0xE2 {
                self.byte_idx += 2;
                let byte2 = self.byte_itr.next().unwrap();
                let byte3 = self.byte_itr.next().unwrap() >> 1;
                if byte2 == 0x80 && byte3 == 0x54 {
                    return Some(self.byte_idx);
                }
            }
        }

        return None;
    }
}

//======================================================================

#[cfg(test)]
mod tests {
    use super::*;

    // 124 bytes, 100 chars, 4 lines
    const TEXT_LINES: &str = "Hello there!  How're you doing?\nIt's \
                              a fine day, isn't it?\nAren't you glad \
                              we're alive?\nこんにちは、みんなさん！";

    #[test]
    fn count_chars_01() {
        let text = "Hello せかい! Hello せかい! Hello せかい! Hello せかい! Hello せかい!";

        assert_eq!(54, count_chars(text));
    }

    #[test]
    fn count_chars_02() {
        assert_eq!(100, count_chars(TEXT_LINES));
    }

    #[test]
    fn line_breaks_iter_01() {
        let text = "\u{000A}Hello\u{000D}\u{000A}\u{000D}せ\u{000B}か\u{000C}い\u{0085}. \
                    There\u{2028}is something.\u{2029}";
        let mut itr = LineBreakIter::new(text);
        assert_eq!(48, text.len());
        assert_eq!(Some(1), itr.next());
        assert_eq!(Some(8), itr.next());
        assert_eq!(Some(9), itr.next());
        assert_eq!(Some(13), itr.next());
        assert_eq!(Some(17), itr.next());
        assert_eq!(Some(22), itr.next());
        assert_eq!(Some(32), itr.next());
        assert_eq!(Some(48), itr.next());
        assert_eq!(None, itr.next());
    }

    #[test]
    fn prev_line_end_char_idx_01() {
        let mut text = "\u{000A}Hello\u{000D}\u{000A}\u{000D}せ\u{000B}か\u{000C}い\u{0085}. \
                        There\u{2028}is something.\u{2029}";

        assert_eq!(48, text.len());
        text = &text[..prev_line_end_char_idx(text)];
        assert_eq!(32, text.len());
        text = &text[..prev_line_end_char_idx(text)];
        assert_eq!(22, text.len());
        text = &text[..prev_line_end_char_idx(text)];
        assert_eq!(17, text.len());
        text = &text[..prev_line_end_char_idx(text)];
        assert_eq!(13, text.len());
        text = &text[..prev_line_end_char_idx(text)];
        assert_eq!(9, text.len());
        text = &text[..prev_line_end_char_idx(text)];
        assert_eq!(8, text.len());
        text = &text[..prev_line_end_char_idx(text)];
        assert_eq!(1, text.len());
        text = &text[..prev_line_end_char_idx(text)];
        assert_eq!(0, text.len());
    }

    #[test]
    fn count_line_breaks_01() {
        let text = "\u{000A}Hello\u{000D}\u{000A}\u{000D}せ\u{000B}か\u{000C}い\u{0085}. \
                    There\u{2028}is something.\u{2029}";
        assert_eq!(48, text.len());
        assert_eq!(8, count_line_breaks(text));
    }

    #[test]
    fn count_line_breaks_02() {
        let text = "\u{000A}Hello world!  This is a longer text.\u{000D}\u{000A}\u{000D}To better test that skipping by usize doesn't mess things up.\u{000B}Hello せかい!\u{000C}\u{0085}Yet more text.  How boring.\u{2028}Hi.\u{2029}\u{000A}Hello world!  This is a longer text.\u{000D}\u{000A}\u{000D}To better test that skipping by usize doesn't mess things up.\u{000B}Hello せかい!\u{000C}\u{0085}Yet more text.  How boring.\u{2028}Hi.\u{2029}\u{000A}Hello world!  This is a longer text.\u{000D}\u{000A}\u{000D}To better test that skipping by usize doesn't mess things up.\u{000B}Hello せかい!\u{000C}\u{0085}Yet more text.  How boring.\u{2028}Hi.\u{2029}\u{000A}Hello world!  This is a longer text.\u{000D}\u{000A}\u{000D}To better test that skipping by usize doesn't mess things up.\u{000B}Hello せかい!\u{000C}\u{0085}Yet more text.  How boring.\u{2028}Hi.\u{2029}";
        assert_eq!(count_line_breaks(text), LineBreakIter::new(text).count());
    }

    #[test]
    fn byte_to_char_idx_01() {
        let text = "Hello せかい!";
        assert_eq!(0, byte_to_char_idx(text, 0));
        assert_eq!(1, byte_to_char_idx(text, 1));
        assert_eq!(6, byte_to_char_idx(text, 6));
        assert_eq!(6, byte_to_char_idx(text, 7));
        assert_eq!(6, byte_to_char_idx(text, 8));
        assert_eq!(7, byte_to_char_idx(text, 9));
        assert_eq!(7, byte_to_char_idx(text, 10));
        assert_eq!(7, byte_to_char_idx(text, 11));
        assert_eq!(8, byte_to_char_idx(text, 12));
        assert_eq!(8, byte_to_char_idx(text, 13));
        assert_eq!(8, byte_to_char_idx(text, 14));
        assert_eq!(9, byte_to_char_idx(text, 15));
        assert_eq!(10, byte_to_char_idx(text, 16));
        assert_eq!(10, byte_to_char_idx(text, 17));
        assert_eq!(10, byte_to_char_idx(text, 18));
        assert_eq!(10, byte_to_char_idx(text, 19));
    }

    #[test]
    fn byte_to_char_idx_02() {
        let text = "せかい";
        assert_eq!(0, byte_to_char_idx(text, 0));
        assert_eq!(0, byte_to_char_idx(text, 1));
        assert_eq!(0, byte_to_char_idx(text, 2));
        assert_eq!(1, byte_to_char_idx(text, 3));
        assert_eq!(1, byte_to_char_idx(text, 4));
        assert_eq!(1, byte_to_char_idx(text, 5));
        assert_eq!(2, byte_to_char_idx(text, 6));
        assert_eq!(2, byte_to_char_idx(text, 7));
        assert_eq!(2, byte_to_char_idx(text, 8));
        assert_eq!(3, byte_to_char_idx(text, 9));
        assert_eq!(3, byte_to_char_idx(text, 10));
        assert_eq!(3, byte_to_char_idx(text, 11));
        assert_eq!(3, byte_to_char_idx(text, 12));
    }

    #[test]
    fn byte_to_char_idx_03() {
        // Ascii range
        for i in 0..88 {
            assert_eq!(i, byte_to_char_idx(TEXT_LINES, i));
        }

        // Hiragana characters
        for i in 88..125 {
            assert_eq!(88 + ((i - 88) / 3), byte_to_char_idx(TEXT_LINES, i));
        }

        // Past the end
        for i in 125..130 {
            assert_eq!(100, byte_to_char_idx(TEXT_LINES, i));
        }
    }

    #[test]
    fn byte_to_line_idx_01() {
        let text = "Here\nare\nsome\nwords";
        assert_eq!(0, byte_to_line_idx(text, 0));
        assert_eq!(0, byte_to_line_idx(text, 4));
        assert_eq!(1, byte_to_line_idx(text, 5));
        assert_eq!(1, byte_to_line_idx(text, 8));
        assert_eq!(2, byte_to_line_idx(text, 9));
        assert_eq!(2, byte_to_line_idx(text, 13));
        assert_eq!(3, byte_to_line_idx(text, 14));
        assert_eq!(3, byte_to_line_idx(text, 19));
    }

    #[test]
    fn byte_to_line_idx_02() {
        let text = "\nHere\nare\nsome\nwords\n";
        assert_eq!(0, byte_to_line_idx(text, 0));
        assert_eq!(1, byte_to_line_idx(text, 1));
        assert_eq!(1, byte_to_line_idx(text, 5));
        assert_eq!(2, byte_to_line_idx(text, 6));
        assert_eq!(2, byte_to_line_idx(text, 9));
        assert_eq!(3, byte_to_line_idx(text, 10));
        assert_eq!(3, byte_to_line_idx(text, 14));
        assert_eq!(4, byte_to_line_idx(text, 15));
        assert_eq!(4, byte_to_line_idx(text, 20));
        assert_eq!(5, byte_to_line_idx(text, 21));
    }

    #[test]
    fn byte_to_line_idx_03() {
        let text = "Here\r\nare\r\nsome\r\nwords";
        assert_eq!(0, byte_to_line_idx(text, 0));
        assert_eq!(0, byte_to_line_idx(text, 4));
        assert_eq!(0, byte_to_line_idx(text, 5));
        assert_eq!(1, byte_to_line_idx(text, 6));
        assert_eq!(1, byte_to_line_idx(text, 9));
        assert_eq!(1, byte_to_line_idx(text, 10));
        assert_eq!(2, byte_to_line_idx(text, 11));
        assert_eq!(2, byte_to_line_idx(text, 15));
        assert_eq!(2, byte_to_line_idx(text, 16));
        assert_eq!(3, byte_to_line_idx(text, 17));
    }

    #[test]
    fn byte_to_line_idx_04() {
        // Line 0
        for i in 0..32 {
            assert_eq!(0, byte_to_line_idx(TEXT_LINES, i));
        }

        // Line 1
        for i in 32..59 {
            assert_eq!(1, byte_to_line_idx(TEXT_LINES, i));
        }

        // Line 2
        for i in 59..88 {
            assert_eq!(2, byte_to_line_idx(TEXT_LINES, i));
        }

        // Line 3
        for i in 88..125 {
            assert_eq!(3, byte_to_line_idx(TEXT_LINES, i));
        }

        // Past the end
        for i in 125..130 {
            assert_eq!(3, byte_to_line_idx(TEXT_LINES, i));
        }
    }

    #[test]
    fn char_to_byte_idx_01() {
        let text = "Hello せかい!";
        assert_eq!(0, char_to_byte_idx(text, 0));
        assert_eq!(1, char_to_byte_idx(text, 1));
        assert_eq!(2, char_to_byte_idx(text, 2));
        assert_eq!(5, char_to_byte_idx(text, 5));
        assert_eq!(6, char_to_byte_idx(text, 6));
        assert_eq!(12, char_to_byte_idx(text, 8));
        assert_eq!(15, char_to_byte_idx(text, 9));
        assert_eq!(16, char_to_byte_idx(text, 10));
    }

    #[test]
    fn char_to_byte_idx_02() {
        let text = "せかい";
        assert_eq!(0, char_to_byte_idx(text, 0));
        assert_eq!(3, char_to_byte_idx(text, 1));
        assert_eq!(6, char_to_byte_idx(text, 2));
        assert_eq!(9, char_to_byte_idx(text, 3));
    }

    #[test]
    fn char_to_byte_idx_03() {
        let text = "Hello world!";
        assert_eq!(0, char_to_byte_idx(text, 0));
        assert_eq!(1, char_to_byte_idx(text, 1));
        assert_eq!(8, char_to_byte_idx(text, 8));
        assert_eq!(11, char_to_byte_idx(text, 11));
        assert_eq!(12, char_to_byte_idx(text, 12));
    }

    #[test]
    fn char_to_byte_idx_04() {
        let text = "Hello world! Hello せかい! Hello world! Hello せかい! \
                    Hello world! Hello せかい! Hello world! Hello せかい! \
                    Hello world! Hello せかい! Hello world! Hello せかい! \
                    Hello world! Hello せかい! Hello world! Hello せかい!";
        assert_eq!(0, char_to_byte_idx(text, 0));
        assert_eq!(30, char_to_byte_idx(text, 24));
        assert_eq!(60, char_to_byte_idx(text, 48));
        assert_eq!(90, char_to_byte_idx(text, 72));
        assert_eq!(115, char_to_byte_idx(text, 93));
        assert_eq!(120, char_to_byte_idx(text, 96));
        assert_eq!(150, char_to_byte_idx(text, 120));
        assert_eq!(180, char_to_byte_idx(text, 144));
        assert_eq!(210, char_to_byte_idx(text, 168));
        assert_eq!(239, char_to_byte_idx(text, 191));
    }

    #[test]
    fn char_to_byte_idx_05() {
        // Ascii range
        for i in 0..88 {
            assert_eq!(i, char_to_byte_idx(TEXT_LINES, i));
        }

        // Hiragana characters
        for i in 88..100 {
            assert_eq!(88 + ((i - 88) * 3), char_to_byte_idx(TEXT_LINES, i));
        }

        // Past the end
        for i in 100..110 {
            assert_eq!(124, char_to_byte_idx(TEXT_LINES, i));
        }
    }

    #[test]
    fn char_to_line_idx_01() {
        let text = "Hello せ\nか\nい!";
        assert_eq!(0, char_to_line_idx(text, 0));
        assert_eq!(0, char_to_line_idx(text, 7));
        assert_eq!(1, char_to_line_idx(text, 8));
        assert_eq!(1, char_to_line_idx(text, 9));
        assert_eq!(2, char_to_line_idx(text, 10));
    }

    #[test]
    fn char_to_line_idx_02() {
        // Line 0
        for i in 0..32 {
            assert_eq!(0, char_to_line_idx(TEXT_LINES, i));
        }

        // Line 1
        for i in 32..59 {
            assert_eq!(1, char_to_line_idx(TEXT_LINES, i));
        }

        // Line 2
        for i in 59..88 {
            assert_eq!(2, char_to_line_idx(TEXT_LINES, i));
        }

        // Line 3
        for i in 88..100 {
            assert_eq!(3, char_to_line_idx(TEXT_LINES, i));
        }

        // Past the end
        for i in 100..110 {
            assert_eq!(3, char_to_line_idx(TEXT_LINES, i));
        }
    }

    #[test]
    fn line_to_byte_idx_01() {
        let text = "Here\r\nare\r\nsome\r\nwords";
        assert_eq!(0, line_to_byte_idx(text, 0));
        assert_eq!(6, line_to_byte_idx(text, 1));
        assert_eq!(11, line_to_byte_idx(text, 2));
        assert_eq!(17, line_to_byte_idx(text, 3));
    }

    #[test]
    fn line_to_byte_idx_02() {
        let text = "\nHere\nare\nsome\nwords\n";
        assert_eq!(0, line_to_byte_idx(text, 0));
        assert_eq!(1, line_to_byte_idx(text, 1));
        assert_eq!(6, line_to_byte_idx(text, 2));
        assert_eq!(10, line_to_byte_idx(text, 3));
        assert_eq!(15, line_to_byte_idx(text, 4));
        assert_eq!(21, line_to_byte_idx(text, 5));
    }

    #[test]
    fn line_to_byte_idx_03() {
        assert_eq!(0, line_to_byte_idx(TEXT_LINES, 0));
        assert_eq!(32, line_to_byte_idx(TEXT_LINES, 1));
        assert_eq!(59, line_to_byte_idx(TEXT_LINES, 2));
        assert_eq!(88, line_to_byte_idx(TEXT_LINES, 3));

        // Past end
        assert_eq!(124, line_to_byte_idx(TEXT_LINES, 4));
        assert_eq!(124, line_to_byte_idx(TEXT_LINES, 5));
        assert_eq!(124, line_to_byte_idx(TEXT_LINES, 6));
    }

    #[test]
    fn line_to_char_idx_01() {
        let text = "Hello せ\nか\nい!";
        assert_eq!(0, line_to_char_idx(text, 0));
        assert_eq!(8, line_to_char_idx(text, 1));
        assert_eq!(10, line_to_char_idx(text, 2));
    }

    #[test]
    fn line_to_char_idx_02() {
        assert_eq!(0, line_to_char_idx(TEXT_LINES, 0));
        assert_eq!(32, line_to_char_idx(TEXT_LINES, 1));
        assert_eq!(59, line_to_char_idx(TEXT_LINES, 2));
        assert_eq!(88, line_to_char_idx(TEXT_LINES, 3));

        // Past end
        assert_eq!(100, line_to_char_idx(TEXT_LINES, 4));
        assert_eq!(100, line_to_char_idx(TEXT_LINES, 5));
        assert_eq!(100, line_to_char_idx(TEXT_LINES, 6));
    }

    #[test]
    fn line_byte_round_trip() {
        let text = "\nHere\nare\nsome\nwords\n";
        assert_eq!(6, line_to_byte_idx(text, byte_to_line_idx(text, 6)));
        assert_eq!(2, byte_to_line_idx(text, line_to_byte_idx(text, 2)));

        assert_eq!(0, line_to_byte_idx(text, byte_to_line_idx(text, 0)));
        assert_eq!(0, byte_to_line_idx(text, line_to_byte_idx(text, 0)));

        assert_eq!(21, line_to_byte_idx(text, byte_to_line_idx(text, 21)));
        assert_eq!(5, byte_to_line_idx(text, line_to_byte_idx(text, 5)));
    }

    #[test]
    fn line_char_round_trip() {
        let text = "\nHere\nare\nsome\nwords\n";
        assert_eq!(6, line_to_char_idx(text, char_to_line_idx(text, 6)));
        assert_eq!(2, char_to_line_idx(text, line_to_char_idx(text, 2)));

        assert_eq!(0, line_to_char_idx(text, char_to_line_idx(text, 0)));
        assert_eq!(0, char_to_line_idx(text, line_to_char_idx(text, 0)));

        assert_eq!(21, line_to_char_idx(text, char_to_line_idx(text, 21)));
        assert_eq!(5, char_to_line_idx(text, line_to_char_idx(text, 5)));
    }

    #[test]
    fn usize_flag_bytes_01() {
        let v: usize = 0xE2_09_08_A6_E2_A6_E2_09;
        assert_eq!(0x00_00_00_00_00_00_00_00, v.cmp_eq_byte(0x07));
        assert_eq!(0x00_00_01_00_00_00_00_00, v.cmp_eq_byte(0x08));
        assert_eq!(0x00_01_00_00_00_00_00_01, v.cmp_eq_byte(0x09));
        assert_eq!(0x00_00_00_01_00_01_00_00, v.cmp_eq_byte(0xA6));
        assert_eq!(0x01_00_00_00_01_00_01_00, v.cmp_eq_byte(0xE2));
    }

    #[test]
    fn usize_bytes_between_127_01() {
        let v: usize = 0x7E_09_00_A6_FF_7F_08_07;
        assert_eq!(0x01_01_00_00_00_00_01_01, v.bytes_between_127(0x00, 0x7F));
        assert_eq!(0x00_01_00_00_00_00_01_00, v.bytes_between_127(0x07, 0x7E));
        assert_eq!(0x00_01_00_00_00_00_00_00, v.bytes_between_127(0x08, 0x7E));
    }

    #[test]
    fn ends_with_line_break_01() {
        assert_eq!(true, ends_with_line_break("\n"));
        assert_eq!(true, ends_with_line_break("\r"));
        assert_eq!(true, ends_with_line_break("\u{000A}"));
        assert_eq!(true, ends_with_line_break("\u{000B}"));
        assert_eq!(true, ends_with_line_break("\u{000C}"));
        assert_eq!(true, ends_with_line_break("\u{000D}"));
        assert_eq!(true, ends_with_line_break("\u{0085}"));
        assert_eq!(true, ends_with_line_break("\u{2028}"));
        assert_eq!(true, ends_with_line_break("\u{2029}"));
    }

    #[test]
    fn ends_with_line_break_02() {
        assert_eq!(true, ends_with_line_break("Hi there!\n"));
        assert_eq!(true, ends_with_line_break("Hi there!\r"));
        assert_eq!(true, ends_with_line_break("Hi there!\u{000A}"));
        assert_eq!(true, ends_with_line_break("Hi there!\u{000B}"));
        assert_eq!(true, ends_with_line_break("Hi there!\u{000C}"));
        assert_eq!(true, ends_with_line_break("Hi there!\u{000D}"));
        assert_eq!(true, ends_with_line_break("Hi there!\u{0085}"));
        assert_eq!(true, ends_with_line_break("Hi there!\u{2028}"));
        assert_eq!(true, ends_with_line_break("Hi there!\u{2029}"));
    }

    #[test]
    fn ends_with_line_break_03() {
        assert_eq!(false, ends_with_line_break(""));
        assert_eq!(false, ends_with_line_break("a"));
        assert_eq!(false, ends_with_line_break("Hi there!"));
    }

    #[test]
    fn ends_with_line_break_04() {
        assert_eq!(false, ends_with_line_break("\na"));
        assert_eq!(false, ends_with_line_break("\ra"));
        assert_eq!(false, ends_with_line_break("\u{000A}a"));
        assert_eq!(false, ends_with_line_break("\u{000B}a"));
        assert_eq!(false, ends_with_line_break("\u{000C}a"));
        assert_eq!(false, ends_with_line_break("\u{000D}a"));
        assert_eq!(false, ends_with_line_break("\u{0085}a"));
        assert_eq!(false, ends_with_line_break("\u{2028}a"));
        assert_eq!(false, ends_with_line_break("\u{2029}a"));
    }
}