hypher/
lib.rs

1/*!
2_hypher_ separates words into syllables.
3
4# Features
5- All-inclusive: Hyphenation patterns are embedded into the binary as
6  efficiently encoded finite automata at build time.
7- Zero load time: Hyphenation automata operate directly over the embedded
8  binary data with no up-front decoding.
9- No allocations unless when hyphenating very long words (> 45 bytes). You can
10  disable the `alloc` feature, but then overly long words lead to a panic.
11- Support for many languages.
12- No unsafe code, no dependencies, no std.
13
14# Example
15*/
16#![cfg_attr(
17    feature = "alloc",
18    doc = r##"
19```rust
20use hypher::{hyphenate, Lang};
21
22let mut syllables = hyphenate("extensive", Lang::English);
23assert_eq!(syllables.join("-"), "ex-ten-sive");
24```
25"##
26)]
27#![cfg_attr(
28    not(feature = "alloc"),
29    doc = r##"
30```rust
31use hypher::{hyphenate, Lang};
32
33let mut syllables = hyphenate("extensive", Lang::English);
34assert_eq!(syllables.next(), Some("ex"));
35assert_eq!(syllables.next(), Some("ten"));
36assert_eq!(syllables.next(), Some("sive"));
37assert_eq!(syllables.next(), None);
38```
39"##
40)]
41/*!
42# Languages
43By default, this crate supports hyphenating more than 30 languages.
44Embedding automata for all these languages will add ~1.1 MiB to your binary.
45Alternatively, you can disable support for all languages and manually choose
46which ones get added:
47
48```toml
49[dependencies]
50hypher = { version = "0.1", default-features = false, features = ["english", "greek"] }
51```
52*/
53
54#![no_std]
55#![forbid(unsafe_code)]
56#![deny(missing_docs)]
57
58#[cfg(any(feature = "alloc", test))]
59extern crate alloc;
60
61use core::fmt::{self, Debug, Formatter};
62use core::iter::FusedIterator;
63use core::num::NonZeroU8;
64
65// Include language data.
66include!("lang.rs");
67
68/// Segment a word into syllables.
69///
70/// Returns an iterator over the syllables.
71///
72/// This uses the default [bounds](Lang::bounds) for the language.
73///
74/// # Panics
75/// Panics if the word is more than [`MAX_INLINE_SIZE`] bytes long and the `alloc`
76/// feature is disabled.
77///
78/// # Example
79/// ```
80/// # use hypher::{hyphenate, Lang};
81/// let mut syllables = hyphenate("extensive", Lang::English);
82/// assert_eq!(syllables.next(), Some("ex"));
83/// assert_eq!(syllables.next(), Some("ten"));
84/// assert_eq!(syllables.next(), Some("sive"));
85/// assert_eq!(syllables.next(), None);
86/// # assert_eq!(syllables.next(), None);
87/// ```
88pub fn hyphenate(word: &str, lang: Lang) -> Syllables<'_> {
89    let (left_min, right_min) = lang.bounds();
90    hyphenate_bounded(word, lang, left_min, right_min)
91}
92
93/// Segment a word into syllables, but forbid breaking between the given number
94/// of chars to each side.
95///
96/// Returns an iterator over the syllables.
97///
98/// # Panics
99/// Panics if the word is more than [`MAX_INLINE_SIZE`] bytes long and the `alloc`
100/// feature is disabled.
101///
102/// # Example
103/// By setting the left bound to three, we forbid the possible break between
104/// `ex` and `ten`.
105/// ```
106/// # use hypher::{hyphenate_bounded, Lang};
107/// let mut syllables = hyphenate_bounded("extensive", Lang::English, 3, 1);
108/// assert_eq!(syllables.next(), Some("exten"));
109/// assert_eq!(syllables.next(), Some("sive"));
110/// assert_eq!(syllables.next(), None);
111/// ```
112pub fn hyphenate_bounded(
113    word: &str,
114    lang: Lang,
115    left_min: usize,
116    right_min: usize,
117) -> Syllables<'_> {
118    // Initialize the trie state for the language.
119    let root = lang.root();
120
121    // Lowercase and add dots before and after the word..
122    let dotted = lowercase_and_dot(word);
123    let dotted = dotted.as_slice();
124
125    // Convert char bounds to byte bounds in the dotted word.
126    let (min_idx, max_idx) = char_to_byte_bounds(word, left_min, right_min);
127
128    // The levels between each two inner bytes of the word.
129    let mut levels = Bytes::zeros(word.len().saturating_sub(1));
130    let levels_mut = levels.as_mut_slice();
131
132    // Start pattern matching at each character boundary.
133    for start in 0..dotted.len() {
134        if !is_char_boundary(dotted[start]) {
135            continue;
136        }
137
138        let mut state = root;
139        for &b in &dotted[start..] {
140            if let Some(next) = state.transition(b) {
141                state = next;
142                for (offset, level) in state.levels() {
143                    let split = start + offset;
144
145                    // Example
146                    //
147                    // Dotted: . h e l l o .
148                    // Levels:    0 2 3 0
149                    if split >= min_idx && split <= max_idx {
150                        let slot = &mut levels_mut[split - 2];
151                        *slot = (*slot).max(level);
152                    }
153                }
154            } else {
155                break;
156            }
157        }
158    }
159
160    // Break into segments at odd levels.
161    Syllables { word, cursor: 0, levels }
162}
163
164/// Lowercase a word and add dots before and after it.
165///
166/// The dots enable patterns that match based on whether they are at the edges
167/// of the word.
168fn lowercase_and_dot(word: &str) -> Bytes {
169    let mut dotted = Bytes::zeros(word.len() + 2);
170    let dotted_mut = dotted.as_mut_slice();
171    dotted_mut[0] = b'.';
172
173    // Add the lowercased chars.
174    let mut offset = 1;
175    for mut c in word.chars() {
176        let mut lower = c.to_lowercase();
177        if let (Some(l), None) = (lower.next(), lower.next()) {
178            if l.len_utf8() == c.len_utf8() {
179                c = l;
180            }
181        }
182        offset += c.encode_utf8(&mut dotted_mut[offset..]).len();
183    }
184
185    debug_assert_eq!(offset, word.len() + 1);
186    dotted_mut[offset] = b'.';
187    dotted
188}
189
190/// Convert char bounds to byte bounds in the dotted word.
191fn char_to_byte_bounds(word: &str, left_min: usize, right_min: usize) -> (usize, usize) {
192    // It makes no sense to split outside the word.
193    let left_min = left_min.max(1);
194    let right_min = right_min.max(1);
195
196    // Convert from chars to byte indices in the dotted word.
197    let min_idx = 1 + word.chars().take(left_min).map(char::len_utf8).sum::<usize>();
198    let max_idx = 1 + word.len()
199        - word.chars().rev().take(right_min).map(char::len_utf8).sum::<usize>();
200
201    (min_idx, max_idx)
202}
203
204/// An iterator over the syllables of a word.
205///
206/// This struct is created by [`hyphenate`] and [`hyphenate_bounded`].
207#[derive(Debug, Clone)]
208pub struct Syllables<'a> {
209    word: &'a str,
210    cursor: usize,
211    levels: Bytes,
212}
213
214impl Syllables<'_> {
215    /// Join the syllables with a separator like a hyphen or soft hyphen.
216    ///
217    /// This is only available when the `alloc` feature is enabled.
218    ///
219    /// # Example
220    /// Adding soft hyphens at every opportunity.
221    /// ```
222    /// # use hypher::{hyphenate, Lang};
223    /// # let joined =
224    /// hyphenate("wonderful", Lang::English).join("\u{ad}");
225    /// # assert_eq!(joined, "won\u{ad}der\u{ad}ful")
226    /// ```
227    #[cfg(any(feature = "alloc", test))]
228    pub fn join(mut self, sep: &str) -> alloc::string::String {
229        let extra = self.splits() * sep.len();
230        let mut s = alloc::string::String::with_capacity(self.word.len() + extra);
231        s.extend(self.next());
232        for syllable in self {
233            s.push_str(sep);
234            s.push_str(syllable);
235        }
236        s
237    }
238
239    /// The remaining number of splits in the word.
240    fn splits(&self) -> usize {
241        self.levels.as_slice().iter().filter(|&lvl| lvl % 2 == 1).count()
242    }
243}
244
245impl<'a> Iterator for Syllables<'a> {
246    type Item = &'a str;
247
248    fn next(&mut self) -> Option<Self::Item> {
249        let found = self.levels.any(|lvl| lvl % 2 == 1);
250        let start = self.cursor;
251        let end = self.word.len() - self.levels.len() - found as usize;
252        self.cursor = end;
253        (start < end).then(|| &self.word[start..end])
254    }
255
256    fn size_hint(&self) -> (usize, Option<usize>) {
257        let len = if self.word.is_empty() { 0 } else { 1 + self.splits() };
258        (len, Some(len))
259    }
260}
261
262impl ExactSizeIterator for Syllables<'_> {}
263
264impl FusedIterator for Syllables<'_> {}
265
266/// The maximum size (in bytes) of words that may be hyphenated without
267/// allocating.
268pub const MAX_INLINE_SIZE: usize = 45;
269const INLINE_BUF_SIZE: usize = MAX_INLINE_SIZE + 2; // +2 for dots
270
271/// Storage for and iterator over bytes.
272#[derive(Clone)]
273enum Bytes {
274    Array([u8; INLINE_BUF_SIZE], NonZeroU8),
275    #[cfg(feature = "alloc")]
276    Vec(alloc::vec::IntoIter<u8>),
277}
278
279impl Bytes {
280    /// Create zero-initialized bytes.
281    fn zeros(len: usize) -> Self {
282        if len <= INLINE_BUF_SIZE {
283            // MAX+1-MAX is still nonzero, we can unwrap
284            let start = NonZeroU8::new(INLINE_BUF_SIZE as u8 + 1 - len as u8).unwrap();
285            Self::Array([0; INLINE_BUF_SIZE], start)
286        } else {
287            #[cfg(not(feature = "alloc"))]
288            panic!(
289                "hypher: maximum word length is {MAX_INLINE_SIZE} bytes when `alloc` is disabled"
290            );
291
292            #[cfg(feature = "alloc")]
293            Self::Vec(alloc::vec![0; len].into_iter())
294        }
295    }
296
297    /// Access the bytes as a slice.
298    fn as_slice(&self) -> &[u8] {
299        match self {
300            Self::Array(arr, start) => &arr[start.get() as usize - 1..],
301            #[cfg(feature = "alloc")]
302            Self::Vec(iter) => iter.as_slice(),
303        }
304    }
305
306    /// Access the bytes as a mutable slice.
307    fn as_mut_slice(&mut self) -> &mut [u8] {
308        match self {
309            Self::Array(arr, start) => &mut arr[start.get() as usize - 1..],
310            #[cfg(feature = "alloc")]
311            Self::Vec(iter) => iter.as_mut_slice(),
312        }
313    }
314}
315
316impl Iterator for Bytes {
317    type Item = u8;
318
319    fn next(&mut self) -> Option<Self::Item> {
320        match self {
321            Self::Array(arr, start) => {
322                let index = start.get() as usize - 1;
323                if index < INLINE_BUF_SIZE {
324                    *start = start.saturating_add(1); // Will never reach 255 anyways.
325                    Some(arr[index])
326                } else {
327                    None
328                }
329            }
330            #[cfg(feature = "alloc")]
331            Self::Vec(iter) => iter.next(),
332        }
333    }
334
335    fn size_hint(&self) -> (usize, Option<usize>) {
336        match self {
337            Self::Array(..) => (self.as_slice().len(), Some(self.as_slice().len())),
338            #[cfg(feature = "alloc")]
339            Self::Vec(iter) => iter.size_hint(),
340        }
341    }
342}
343
344impl ExactSizeIterator for Bytes {}
345
346impl Debug for Bytes {
347    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
348        self.as_slice().fmt(f)
349    }
350}
351
352/// A state in a trie traversal.
353#[derive(Copy, Clone)]
354struct State<'a> {
355    data: &'a [u8],
356    addr: usize,
357    stride: usize,
358    levels: &'a [u8],
359    trans: &'a [u8],
360    targets: &'a [u8],
361}
362
363impl<'a> State<'a> {
364    /// Create a new state at the root node.
365    #[allow(unused)]
366    fn root(data: &'a [u8]) -> Self {
367        let bytes = data[..4].try_into().unwrap();
368        let addr = u32::from_be_bytes(bytes) as usize;
369        Self::at(data, addr)
370    }
371
372    /// Create a new state at the given node address.
373    fn at(data: &'a [u8], addr: usize) -> Self {
374        let node = &data[addr..];
375        let mut pos = 0;
376
377        // Decode whether the state has levels and the transition count.
378        let has_levels = node[pos] >> 7 != 0;
379        let stride = usize::from((node[pos] >> 5) & 3);
380        let mut count = usize::from(node[pos] & 31);
381        pos += 1;
382
383        // Possibly decode high transition count.
384        if count == 31 {
385            count = usize::from(node[pos]);
386            pos += 1;
387        }
388
389        // Decode the levels.
390        let mut levels: &[u8] = &[];
391        if has_levels {
392            let offset_hi = usize::from(node[pos]) << 4;
393            let offset_lo = usize::from(node[pos + 1]) >> 4;
394            let offset = offset_hi | offset_lo;
395            let len = usize::from(node[pos + 1] & 15);
396            levels = &data[offset..offset + len];
397            pos += 2;
398        }
399
400        // Decode the transitions.
401        let trans = &node[pos..pos + count];
402        pos += count;
403
404        // Decode the targets.
405        let targets = &node[pos..pos + stride * count];
406        Self { data, addr, stride, levels, trans, targets }
407    }
408
409    /// Return the state reached by following the transition labelled `b`.
410    /// Returns `None` if there is no such state.
411    fn transition(self, b: u8) -> Option<Self> {
412        self.trans.iter().position(|&x| x == b).map(|idx| {
413            let offset = self.stride * idx;
414            let delta = from_be_bytes(&self.targets[offset..offset + self.stride]);
415            let next = (self.addr as isize + delta) as usize;
416            Self::at(self.data, next)
417        })
418    }
419
420    /// Returns the levels contained in the state.
421    fn levels(self) -> impl Iterator<Item = (usize, u8)> + 'a {
422        let mut offset = 0;
423        self.levels.iter().map(move |&packed| {
424            let dist = usize::from(packed / 10);
425            let level = packed % 10;
426            offset += dist;
427            (offset, level)
428        })
429    }
430}
431
432/// Decode a signed number with 1, 2 or 3 bytes.
433fn from_be_bytes(buf: &[u8]) -> isize {
434    if let Ok(array) = buf.try_into() {
435        i8::from_be_bytes(array) as isize
436    } else if let Ok(array) = buf.try_into() {
437        i16::from_be_bytes(array) as isize
438    } else if buf.len() == 3 {
439        let first = usize::from(buf[0]) << 16;
440        let second = usize::from(buf[1]) << 8;
441        let third = usize::from(buf[2]);
442        let unsigned = first | second | third;
443        unsigned as isize - (1 << 23)
444    } else {
445        panic!("invalid stride");
446    }
447}
448
449/// Whether a byte is a character boundary.
450fn is_char_boundary(b: u8) -> bool {
451    (b as i8) >= -0x40
452}
453
454#[cfg(test)]
455mod tests {
456    use super::{hyphenate, Lang, MAX_INLINE_SIZE};
457
458    #[allow(unused)]
459    use Lang::*;
460
461    #[allow(unused)]
462    fn test(lang: Lang, hyphenated: &str) {
463        let word = hyphenated.replace('-', "");
464        let syllables = hyphenate(&word, lang);
465        assert_eq!(syllables.join("-"), hyphenated);
466    }
467
468    #[test]
469    #[cfg(feature = "english")]
470    fn test_empty() {
471        let mut syllables = hyphenate("", English);
472        assert_eq!(syllables.next(), None);
473    }
474
475    #[test]
476    #[cfg(feature = "english")]
477    fn test_exact() {
478        assert_eq!(hyphenate("", English).len(), 0);
479        assert_eq!(hyphenate("hello", English).len(), 1);
480        assert_eq!(hyphenate("extensive", English).len(), 3);
481    }
482
483    const LONG_WORD: &str = "thisisaverylongstringwithanunrealisticwordlengthforenglishbutitmightbepossibleinanotherlanguage";
484
485    #[test]
486    #[cfg(all(feature = "english", feature = "alloc"))]
487    fn test_alloc() {
488        assert_eq!(hyphenate(&LONG_WORD[..MAX_INLINE_SIZE - 1], English).len(), 13);
489        assert_eq!(hyphenate(&LONG_WORD[..MAX_INLINE_SIZE], English).len(), 12);
490        assert_eq!(hyphenate(&LONG_WORD[..MAX_INLINE_SIZE + 1], English).len(), 12);
491        assert_eq!(hyphenate(LONG_WORD, English).len(), 25);
492    }
493
494    #[test]
495    #[cfg(all(feature = "english", not(feature = "alloc")))]
496    fn test_nonalloc() {
497        _ = hyphenate(&LONG_WORD[..MAX_INLINE_SIZE], English).count();
498    }
499    #[test]
500    #[should_panic]
501    #[cfg(all(feature = "english", not(feature = "alloc")))]
502    fn test_nonalloc_fail() {
503        _ = hyphenate(&LONG_WORD[..MAX_INLINE_SIZE + 1], English).count();
504    }
505
506    #[test]
507    #[cfg(feature = "english")]
508    fn test_english() {
509        test(English, "");
510        test(English, "hi");
511        test(English, "wel-come");
512        test(English, "walk-ing");
513        test(English, "cap-tiVe");
514        test(English, "pur-sue");
515        test(English, "wHaT-eVeR");
516        test(English, "bro-ken");
517        test(English, "ex-ten-sive");
518        test(English, "Prob-a-bil-ity");
519        test(English, "rec-og-nize");
520    }
521
522    #[test]
523    #[cfg(feature = "german")]
524    fn test_german() {
525        test(German, "");
526        test(German, "Baum");
527        test(German, "ge-hen");
528        test(German, "Ap-fel");
529        test(German, "To-ma-te");
530        test(German, "Ein-ga-be-auf-for-de-rung");
531        test(German, "Fort-pflan-zungs-lem-ma");
532        test(German, "stra-te-gie-er-hal-ten-den");
533        test(German, "hübsch");
534        test(German, "häss-lich");
535        test(German, "über-zeu-gen-der");
536    }
537
538    #[test]
539    #[cfg(feature = "greek")]
540    fn test_greek() {
541        test(Greek, "δια-με-ρί-σμα-τα");
542        test(Greek, "λα-τρευ-τός");
543        test(Greek, "κά-τοι-κος");
544    }
545
546    #[test]
547    #[cfg(feature = "georgian")]
548    fn test_georgian() {
549        test(Georgian, "თა-რო");
550        test(Georgian, "შეყ-ვა-ნა");
551        test(Georgian, "კარ-ტო-ფი-ლი");
552    }
553
554    #[test]
555    #[cfg(feature = "polish")]
556    fn test_polish() {
557        test(Polish, "wy-kształ-ciu-chy");
558    }
559
560    #[test]
561    #[cfg(feature = "czech")]
562    fn test_czech() {
563        test(Czech, "po-ví-dá-me");
564        test(Czech, "nej-ja-s-něj-ší");
565        test(Czech, "br-něn-ský");
566    }
567}