1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
//! Hyphenating iterators.

use std::borrow::Cow;

use unicode_segmentation::UnicodeSegmentation;

use language::{Corpus};
use utilia::{Interspersable, Intersperse};

/// Text that can be broken at the hyphenation points permitted by a `Corpus`.
///
/// `Hyphenator` is the iterator type handed back by `hyphenate`.
pub trait Hyphenation<Hyphenator: Iterator> {
    /// Lists the positions in the text at which a hyphen may validly be
    /// placed, according to `corp`.
    fn opportunities(self, corp: &Corpus) -> Vec<usize>;

    /// Produces an iterator over the pieces of the text that lie between
    /// valid hyphenation points.
    ///
    /// For a single word, these pieces coincide with orthographic syllables.
    /// Be aware that some orthographies alter spelling on hyphenation, so the
    /// syllables of a hyphenated word need not be substrings of the word as
    /// originally written.
    fn hyphenate(self, corp: &Corpus) -> Hyphenator;
}


/// Iterator that walks a string and hands out consecutive slices of it, cut
/// at the recorded hyphenation points and at the two ends of the string.
///
/// When the string is a single word, the slices coincide with orthographic
/// syllables.
#[derive(Debug, Clone)]
pub struct Standard<'a> {
    /// The string being cut into segments.
    text: &'a str,
    /// Offsets into `text` at which one segment ends and the next begins,
    /// consumed front to back.
    opportunities: Vec<usize>,
    /// Offset into `text` where the next segment starts.
    prior: usize,
    /// Position within `opportunities` of the boundary that ends the next
    /// segment.
    i: usize,
}

impl<'a> Standard<'a> {
    /// Marks every hyphenation point with a soft hyphen (U+00AD).
    ///
    /// Shorthand for `punctuate_with("\u{ad}")`.
    pub fn punctuate(self) -> Intersperse<Self> {
        self.punctuate_with("\u{ad}")
    }

    /// Marks every hyphenation point with the caller-supplied `mark`.
    ///
    /// Consumes the hyphenator and wraps it in an `Intersperse` adaptor.
    pub fn punctuate_with(self, mark: &'a str) -> Intersperse<Self> {
        self.intersperse(mark)
    }
}


impl<'a> Iterator for Standard<'a> {
    type Item = &'a str;

    /// Returns the next segment of the text, or `None` once the trailing
    /// segment has been handed out.
    ///
    /// Each recorded opportunity ends one segment; after the last opportunity
    /// a final segment running to the end of the text is yielded, so the
    /// iterator produces `opportunities.len() + 1` items in total.
    fn next(&mut self) -> Option<&'a str> {
        let start = self.prior;
        let i = self.i;

        match self.opportunities.get(i) {
            Some(&end) => {
                self.prior = end;
                self.i = i + 1;
                Some(&self.text[start..end])
            }
            // Every opportunity is consumed but the tail has not been
            // yielded yet; bump `i` past the end to mark exhaustion.
            None if i == self.opportunities.len() => {
                self.i = i + 1;
                Some(&self.text[start..])
            }
            None => None,
        }
    }

    /// Reports the exact number of segments still to come, letting consumers
    /// such as `collect` reserve the right capacity up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // One segment per remaining opportunity plus the trailing segment;
        // saturates to zero once `i` has moved past the end.
        let remaining = (self.opportunities.len() + 1).saturating_sub(self.i);
        (remaining, Some(remaining))
    }
}


impl<'a> Hyphenation<Standard<'a>> for &'a str {
    /// Collects the byte offsets within the string at which a hyphen may be
    /// inserted.
    ///
    /// A string with fewer characters than `corp.left_min + corp.right_min`
    /// yields no offsets. Otherwise the string is split at word boundaries
    /// and every piece is scored: the first exception entry that knows the
    /// piece wins, and the corpus patterns are used when none does. Offsets
    /// of characters paired with an odd score are reported, shifted so that
    /// they are relative to the start of the whole string.
    fn opportunities(self, corp: &Corpus) -> Vec<usize> {
        let left = corp.left_min;
        let right = corp.right_min;
        let shortest = left + right;

        if self.chars().count() < shortest {
            return Vec::new();
        }

        self.split_word_bound_indices()
            .flat_map(|(offset, word)| {
                // Exceptions take precedence over pattern-derived scores.
                let scores = match corp.exceptions.iter()
                                       .filter_map(|exs| exs.score(word))
                                       .next() {
                    Some(listed) => Cow::Borrowed(listed),
                    None => Cow::Owned(corp.patterns.score(word))
                }.into_owned();

                // Number of scores to inspect once the first `left` have been
                // skipped; zero when there are fewer scores than `shortest`.
                let total = scores.len();
                let window = if total >= shortest { total - left - right + 1 } else { 0 };

                word.char_indices()
                    .skip(left)
                    .zip(scores.into_iter().skip(left).take(window))
                    .filter_map(move |((at, _), score)| {
                        if score % 2 != 0 { Some(at + offset) } else { None }
                    })
            })
            .collect()
    }

    /// Builds a `Standard` iterator over the slices of this string that lie
    /// between the hyphenation points found by `opportunities`.
    fn hyphenate(self, corp: &Corpus) -> Standard<'a> {
        let breaks = self.opportunities(corp);

        Standard {
            text: self,
            opportunities: breaks,
            prior: 0,
            i: 0
        }
    }
}