bb 0.4.4 - Docs.rs

// bb
//
// Copyright 2019 Manos Pitsidianakis
//
// This file is part of bb.
//
// bb is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// bb is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with bb. If not, see <http://www.gnu.org/licenses/>.

extern crate unicode_segmentation;
use core::{cmp::Ordering, iter::Peekable, str::FromStr};

use LineBreakClass::*;

use self::unicode_segmentation::UnicodeSegmentation;
use crate::text_processing::{tables::LINE_BREAK_RULES, types::LineBreakClass};

#[derive(Debug, PartialEq)]
pub enum LineBreakCandidate {
    MandatoryBreak,
    BreakAllowed,
    // NoBreak, Not used.
}

use LineBreakCandidate::*;

pub struct LineBreakCandidateIter<'a> {
    text: &'a str,
    iter: Peekable<unicode_segmentation::GraphemeIndices<'a>>,
    pos: usize,
    /// Needed for rule `LB30a`
    reg_ind_streak: u32,
}

impl<'a> LineBreakCandidateIter<'a> {
    pub fn new(text: &'a str) -> Self {
        LineBreakCandidateIter {
            text,
            pos: 0,
            iter: UnicodeSegmentation::grapheme_indices(text, true).peekable(),
            reg_ind_streak: 0,
        }
    }
}

macro_rules! get_base_character {
    ($grapheme:ident) => {{
        char::from_str($grapheme.get(0..1).unwrap_or_else(|| {
            $grapheme.get(0..2).unwrap_or_else(|| {
                $grapheme
                    .get(0..3)
                    .unwrap_or_else(|| $grapheme.get(0..4).unwrap())
            })
        }))
    }};
    ($grapheme:expr) => {{
        char::from_str($grapheme.get(0..1).unwrap_or_else(|| {
            $grapheme.get(0..2).unwrap_or_else(|| {
                $grapheme
                    .get(0..3)
                    .unwrap_or_else(|| $grapheme.get(0..4).unwrap())
            })
        }))
    }};
}

/// Side effects: none
macro_rules! get_class {
    ($grapheme:ident) => {{
        get_base_character!($grapheme)
            .map(|char| search_table(char as u32, LINE_BREAK_RULES))
            .unwrap_or(XX)
    }};
    ($grapheme:expr) => {{
        get_base_character!($grapheme)
            .map(|char| search_table(char as u32, LINE_BREAK_RULES))
            .unwrap_or(XX)
    }};
}

/// Side effects: Updates $graph_iter and potentially $idx and $grapheme
macro_rules! next_grapheme_class {
    ($graph_iter:ident, $grapheme:ident) => ({
        if let Some((_, g)) = $graph_iter.next() {
            $grapheme = g;
            Some(get_class!(g))
        } else { None }
    });
    (($next_char:ident is $class:expr)) => ({
        $next_char.is_some() && get_class!(($next_char.unwrap().1)) == $class
    });
    (($next_char:ident is $($class:ident),+)) => ({
        $next_char.is_some() && ($(get_class!(($next_char.unwrap().1)) == $class)||+)
    });
}

/// Returns positions where breaks can happen
/// Examples:
/// ```
/// use text_processing::{
///     self,
///     line_break::LineBreakCandidateIter,
///     LineBreakCandidate::{self, *},
/// };
///
/// assert!(LineBreakCandidateIter::new("")
///     .collect::<Vec<(usize, LineBreakCandidate)>>()
///     .is_empty());
/// assert_eq!(
///     &[(7, BreakAllowed), (12, MandatoryBreak)],
///     LineBreakCandidateIter::new("Sample Text.")
///         .collect::<Vec<(usize, LineBreakCandidate)>>()
///         .as_slice()
/// );
/// assert_eq!(
///     &[
///         (3, MandatoryBreak),
///         (7, MandatoryBreak),
///         (10, BreakAllowed),
///         (17, MandatoryBreak)
///     ],
///     LineBreakCandidateIter::new("Sa\nmp\r\nle T(e)xt.")
///         .collect::<Vec<(usize, LineBreakCandidate)>>()
///         .as_slice()
/// );
/// ```
impl<'a> Iterator for LineBreakCandidateIter<'a> {
    type Item = (usize, LineBreakCandidate);
    fn next(&mut self) -> Option<Self::Item> {
        // After end of text, there are no breaks.
        if self.pos >= self.text.len() {
            return None;
        }
        // LB3 Always break at the end of text
        if self.pos + 1 == self.text.len() {
            self.pos += 1;
            return Some((self.pos, MandatoryBreak));
        }

        let (idx, mut grapheme) = self.iter.next().unwrap();
        let LineBreakCandidateIter {
            ref mut iter,
            text,
            ref mut reg_ind_streak,
            ref mut pos,
        } = self;
        let iter = iter.by_ref();

        debug_assert_eq!(idx, *pos);

        // LB2 Never break at the start of text
        if idx == 0 {
            *pos += grapheme.len();
            return self.next();
        }

        let class = get_class!(grapheme);

        if class != RI {
            *reg_ind_streak = 0;
        }

        // LB1 Assign a line breaking class to each code point of the input. Resolve
        // AI, CB, CJ, SA, SG, and XX into other line breaking classes
        // depending on criteria outside the scope of this algorithm.
        //
        // In the absence of such criteria all characters with a specific combination
        // of original class and General_Category property value are resolved
        // as follows: Resolved Original     General_Category
        // AL       AI, SG, XX   Any
        // CM       SA           Only Mn or Mc
        // AL       SA           Any except Mn and Mc
        // NS       SJ           Any

        // TODO: LB1

        // Check if next character class allows breaks before it
        let next_char: Option<&(usize, &str)> = iter.peek();

        match class {
            BK => {
                // LB4 Always Break after hard line breaks.
                *pos += grapheme.len();
                return Some((*pos, MandatoryBreak));
            }
            // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks
            CR if next_grapheme_class!((next_char is LF)) => {
                *pos += grapheme.len();
                assert!(Some(LF) == next_grapheme_class!(iter, grapheme));
                *pos += grapheme.len();
                return Some((*pos, MandatoryBreak));
            }
            CR | LF | NL => {
                *pos += grapheme.len();
                return Some((*pos, MandatoryBreak));
            }
            _ => {}
        }
        if let Some((_, next_grapheme)) = next_char {
            let next_class = get_class!(next_grapheme);
            match next_class {
                // LB6 Do not break before hard line breaks.  × ( BK | CR | LF | NL )
                BK | CR | LF | NL => {
                    *pos += grapheme.len();
                    return self.next();
                }
                // LB7 Do not break before spaces or zero width
                // space. × SP × ZW
                SP | ZW => {
                    *pos += grapheme.len();
                    return self.next();
                }
                _ => {}
            }
        }
        match class {
            ZW => {
                // LB8 Break before any character following a zero-width space, even if one or
                // more spaces intervene
                // ZW SP* ÷
                *pos += grapheme.len();
                while Some(SP) == next_grapheme_class!(iter, grapheme) {
                    *pos += grapheme.len();
                }
                return Some((*pos, MandatoryBreak));
            }
            ZWJ => {
                // LB8a Do not break after a zero width joiner.
                *pos += grapheme.len();
                return self.next();
            }

            CM => {
                // LB9 Do not break a combining character sequence; treat it as if it has the
                // line breaking class of the base character in all of the
                // following rules. Treat ZWJ as if it were CM.
                // Treat X (CM | ZWJ)* as if it were X.
                // where X is any line break class except BK, CR, LF, NL, SP, or ZW.

                // Unreachable since we break lines based on graphemes, not characters
                unreachable!();
            }
            WJ => {
                // : LB11 Do not break before or after Word joiner and related characters.
                *pos += grapheme.len();
                // Get next grapheme
                if next_grapheme_class!(iter, grapheme).is_some() {
                    *pos += grapheme.len();
                }
                return self.next();
            }
            GL => {
                // LB12 Non-breaking characters: LB12 Do not break after NBSP and related
                // characters.
                *pos += grapheme.len();
                return self.next();
            }
            _ => {}
        }
        if let Some((next_idx, next_grapheme)) = next_char {
            let next_class = get_class!(next_grapheme);
            match next_class {
                GL if ![SP, BA, HY].contains(&class) => {
                    // LB12a Do not break before NBSP and related characters, except after spaces
                    // and hyphens.  [^SP BA HY] × GL
                    // Also LB12 Do not break after NBSP and related characters
                    *pos += grapheme.len();
                    return self.next();
                }
                // LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.
                CL | CP | EX | IS | SY => {
                    *pos = *next_idx;
                    return self.next();
                }
                _ => {}
            }
        }

        match class {
            // LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.
            SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => {
                *pos += grapheme.len();
                while ![CL, CP, EX, IS, SY].contains(&next_grapheme_class!(iter, grapheme).unwrap())
                {
                    *pos += grapheme.len();
                }
                *pos += grapheme.len();
                return self.next();
            }
            OP => {
                /* LB14 Do not break after ‘[’, even after spaces.
                 * OP SP* ×
                 */
                for (idx, grapheme) in self.iter.by_ref() {
                    *pos = idx + grapheme.len();
                    if !(get_class!(grapheme) == SP) {
                        break;
                    }
                }
                return self.next();
            }
            QU if get_class!(text[idx..].trim_start()) == OP => {
                /* LB15 Do not break within ‘”[’, even with intervening spaces.
                 * QU SP* × OP */
                *pos += grapheme.len();
                while Some(SP) == next_grapheme_class!(iter, grapheme) {
                    *pos += grapheme.len();
                }
                *pos = idx;
                return self.next();
            }
            QU => {
                // LB19 Do not break before or after quotation marks, such as ‘ ” ’.
                *pos += grapheme.len();
                if let Some((_, g)) = self.iter.next() {
                    *pos += g.len();
                }
                return self.next();
            }
            LineBreakClass::CL | LineBreakClass::CP
                if get_class!(text[idx..].trim_start()) == NS =>
            {
                /* LB16 Do not break between closing punctuation and a nonstarter (lb=NS),
                 * even with intervening spaces.
                 * (CL | CP) SP* × NS */
                *pos += grapheme.len();
                while Some(SP) == next_grapheme_class!(iter, grapheme) {
                    *pos += grapheme.len();
                }
                return self.next();
            }
            B2 if get_class!(text[idx..].trim_start()) == B2 => {
                *pos += grapheme.len();
                while Some(SP) == next_grapheme_class!(iter, grapheme) {
                    *pos += grapheme.len();
                }
                return self.next();
            }
            SP => {
                // LB18 Break after spaces.  SP ÷
                // Space 0x20 is 1 byte long.
                *pos += 1;
                return Some((*pos, BreakAllowed));
            }
            _ => {}
        }
        if let Some((next_idx, next_grapheme)) = next_char {
            let next_class = get_class!(next_grapheme);
            match next_class {
                QU if class != SP => {
                    // LB19 Do not break before or after quotation marks, such as ‘ ” ’.
                    *pos = *next_idx + next_grapheme.len();
                    self.iter.next();
                    return self.next();
                }
                _ => {}
            }
        }
        match class {
            CB => {
                // LB20 Break before and after unresolved CB.
                *pos += grapheme.len();
                return Some((*pos - 1, BreakAllowed));
            }
            // LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
            // kana, and other non-starters, or after acute accents.  × BA,  × HY, × NS,  BB ×
            BB => {
                *pos += grapheme.len();
                return self.next();
            }
            _ => {}
        }

        if let Some((_, next_grapheme)) = next_char {
            let next_class = get_class!(next_grapheme);
            match next_class {
                BA | HY | NS => {
                    // LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces,
                    // small kana, and other non-starters, or after acute
                    // accents.  × BA,  × HY, × NS,  BB ×
                    *pos += grapheme.len();
                    return self.next();
                }
                _ => {}
            }
        }
        match class {
            HL if next_grapheme_class!((next_char is HY, BA)) => {
                // LB21a Don’t break after Hebrew + Hyphen.  HL (HY | BA) ×
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // LB21b Don’t break between ,Solidus and Hebrew letters.  SY × HL
            SY if next_grapheme_class!((next_char is HL)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                // bypass next_char
                self.iter.next().unwrap();
                if let Some((idx, next_grapheme)) = self.iter.next() {
                    *pos = idx + next_grapheme.len();
                }
                self.next()
            }
            //  LB22 Do not break between two ellipses, or between letters, numbers or excla-
            //  mations and ellipsis.
            //  Examples: ‘9...’, ‘a...’, ‘H...’
            //  (AL | HL) × IN
            AL | HL if next_grapheme_class!((next_char is IN)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // EX × IN
            EX if next_grapheme_class!((next_char is IN)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            EX => {
                // LB13
                *pos += grapheme.len();
                self.next()
            }
            // (ID | EB | EM) × IN
            ID | EB | EM if next_grapheme_class!((next_char is IN)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // IN × IN
            IN if next_grapheme_class!((next_char is IN)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // NU × IN
            NU if next_grapheme_class!((next_char is IN)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // LB23 Do not break between digits and letters.
            // (AL | HL) × NU
            AL | HL if next_grapheme_class!((next_char is NU)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // NU × (AL | HL)
            NU if next_grapheme_class!((next_char is AL, HL)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // LB23a Do not break between numeric prefixes and ideographs, or between ideographs
            // and numeric postfixes.
            // PR × (ID | EB | EM)
            PR if next_grapheme_class!((next_char is ID, EB, EM)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // (ID | EB | EM) × PO
            ID | EB | EM if next_grapheme_class!((next_char is PO)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // B24 Do not break between numeric prefix/postfix and letters, or between
            // letters and prefix/postfix.
            // (PR | PO) × (AL | HL)
            PR | PO if next_grapheme_class!((next_char is AL, HL)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // (AL | HL) × (PR | PO)
            AL | HL if next_grapheme_class!((next_char is PR, PO)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // LB25 Do not break between the following pairs of classes relevant to numbers:
            // CL × PO
            CL if next_grapheme_class!((next_char is PO)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // CP × PO
            CP if next_grapheme_class!((next_char is PO)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // CL × PR
            CL if next_grapheme_class!((next_char is PR)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // CP × PR
            CP if next_grapheme_class!((next_char is PR)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // NU × PO
            NU if next_grapheme_class!((next_char is PO)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // NU × PR
            NU if next_grapheme_class!((next_char is PR)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // PO × OP
            PO if next_grapheme_class!((next_char is OP)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // PO × NU
            PO if next_grapheme_class!((next_char is NU)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // PR × OP
            PR if next_grapheme_class!((next_char is OP)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // PR × NU
            PR if next_grapheme_class!((next_char is NU)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // HY × NU
            HY if next_grapheme_class!((next_char is NU)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // IS × NU
            IS if next_grapheme_class!((next_char is NU)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // NU × NU
            NU if next_grapheme_class!((next_char is NU)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // SY × NU
            SY if next_grapheme_class!((next_char is NU)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // LB26 Do not break a Korean syllable.
            // JL × (JL | JV | H2 | H3)
            JL if next_grapheme_class!((next_char is JL, JV, H2, H3)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // (JV | H2) × (JV | JT)
            JV | H2 if next_grapheme_class!((next_char is JV, JT)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // (JT | H3) × JT
            JT | H3 if next_grapheme_class!((next_char is JT)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // LB27 Treat a Korean Syllable Block the same as ID.
            // (JL | JV | JT | H2 | H3) × IN
            JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is IN)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // (JL | JV | JT | H2 | H3) × PO
            JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is PO)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // PR × (JL | JV | JT | H2 | H3)
            PR if next_grapheme_class!((next_char is JL, JV, JT, H2, H3)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // LB28 Do not break between alphabetics (“at”).
            // (AL | HL) × (AL | HL)
            AL | HL if next_grapheme_class!((next_char is AL, HL)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
            // IS × (AL | HL)
            IS if next_grapheme_class!((next_char is AL, HL)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // LB30 Do not break between letters, numbers, or ordinary symbols and opening
            // or closing parentheses.
            // (AL | HL | NU) × OP
            AL | HL | NU if next_grapheme_class!((next_char is OP)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // CP × (AL | HL | NU)
            CP if next_grapheme_class!((next_char is AL, HL , NU)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            // LB30b Do not break between an emoji base and an emoji modifier.
            // EB × EM
            EB if next_grapheme_class!((next_char is EM)) => {
                let (idx, next_grapheme) = next_char.unwrap();
                *pos = idx + next_grapheme.len();
                self.iter.next();
                self.next()
            }
            RI => {
                /* LB30a Break between two regional indicator symbols if and only if there
                 * are an even number of regional indicators preceding the
                 * position of the break. sot (RI RI)* RI × RI
                 * [^RI] (RI RI)* RI × RI */
                *reg_ind_streak += 1;
                *pos += grapheme.len();
                if *reg_ind_streak % 2 == 1 {
                    return Some((*pos - grapheme.len(), BreakAllowed));
                }
                self.iter.next();
                self.next()
            }
            _ => {
                *pos += grapheme.len();
                Some((*pos - grapheme.len(), BreakAllowed))
            }
        }
    }
}

fn search_table(c: u32, t: &'static [(u32, u32, LineBreakClass)]) -> LineBreakClass {
    match t.binary_search_by(|&(lo, hi, _)| {
        if lo <= c && c <= hi {
            Ordering::Equal
        } else if hi < c {
            Ordering::Less
        } else {
            Ordering::Greater
        }
    }) {
        Ok(idx) => t[idx].2,
        Err(_) => XX,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_line_breaks() {
        let s = "Fell past it.\n\n‘Well!’ thought Alice to herself.";
        let breaks = LineBreakCandidateIter::new(s).collect::<Vec<(usize, LineBreakCandidate)>>();
        let mut prev = 0;
        for b in breaks {
            println!("{:?}", &s[prev..b.0]);
            prev = b.0;
        }
        println!("{:?}", &s[prev..]);
    }
}