1#![cfg_attr(
17 feature = "alloc",
18 doc = r##"
19```rust
20use hypher::{hyphenate, Lang};
21
22let mut syllables = hyphenate("extensive", Lang::English);
23assert_eq!(syllables.join("-"), "ex-ten-sive");
24```
25"##
26)]
27#![cfg_attr(
28 not(feature = "alloc"),
29 doc = r##"
30```rust
31use hypher::{hyphenate, Lang};
32
33let mut syllables = hyphenate("extensive", Lang::English);
34assert_eq!(syllables.next(), Some("ex"));
35assert_eq!(syllables.next(), Some("ten"));
36assert_eq!(syllables.next(), Some("sive"));
37assert_eq!(syllables.next(), None);
38```
39"##
40)]
41#![no_std]
55#![forbid(unsafe_code)]
56#![deny(missing_docs)]
57
58#[cfg(any(feature = "alloc", test))]
59extern crate alloc;
60
61use core::fmt::{self, Debug, Formatter};
62use core::iter::FusedIterator;
63use core::num::NonZeroU8;
64
65include!("lang.rs");
67
68pub fn hyphenate(word: &str, lang: Lang) -> Syllables<'_> {
89 let (left_min, right_min) = lang.bounds();
90 hyphenate_bounded(word, lang, left_min, right_min)
91}
92
93pub fn hyphenate_bounded(
113 word: &str,
114 lang: Lang,
115 left_min: usize,
116 right_min: usize,
117) -> Syllables<'_> {
118 let root = lang.root();
120
121 let dotted = lowercase_and_dot(word);
123 let dotted = dotted.as_slice();
124
125 let (min_idx, max_idx) = char_to_byte_bounds(word, left_min, right_min);
127
128 let mut levels = Bytes::zeros(word.len().saturating_sub(1));
130 let levels_mut = levels.as_mut_slice();
131
132 for start in 0..dotted.len() {
134 if !is_char_boundary(dotted[start]) {
135 continue;
136 }
137
138 let mut state = root;
139 for &b in &dotted[start..] {
140 if let Some(next) = state.transition(b) {
141 state = next;
142 for (offset, level) in state.levels() {
143 let split = start + offset;
144
145 if split >= min_idx && split <= max_idx {
150 let slot = &mut levels_mut[split - 2];
151 *slot = (*slot).max(level);
152 }
153 }
154 } else {
155 break;
156 }
157 }
158 }
159
160 Syllables { word, cursor: 0, levels }
162}
163
164fn lowercase_and_dot(word: &str) -> Bytes {
169 let mut dotted = Bytes::zeros(word.len() + 2);
170 let dotted_mut = dotted.as_mut_slice();
171 dotted_mut[0] = b'.';
172
173 let mut offset = 1;
175 for mut c in word.chars() {
176 let mut lower = c.to_lowercase();
177 if let (Some(l), None) = (lower.next(), lower.next()) {
178 if l.len_utf8() == c.len_utf8() {
179 c = l;
180 }
181 }
182 offset += c.encode_utf8(&mut dotted_mut[offset..]).len();
183 }
184
185 debug_assert_eq!(offset, word.len() + 1);
186 dotted_mut[offset] = b'.';
187 dotted
188}
189
190fn char_to_byte_bounds(word: &str, left_min: usize, right_min: usize) -> (usize, usize) {
192 let left_min = left_min.max(1);
194 let right_min = right_min.max(1);
195
196 let min_idx = 1 + word.chars().take(left_min).map(char::len_utf8).sum::<usize>();
198 let max_idx = 1 + word.len()
199 - word.chars().rev().take(right_min).map(char::len_utf8).sum::<usize>();
200
201 (min_idx, max_idx)
202}
203
204#[derive(Debug, Clone)]
208pub struct Syllables<'a> {
209 word: &'a str,
210 cursor: usize,
211 levels: Bytes,
212}
213
214impl Syllables<'_> {
215 #[cfg(any(feature = "alloc", test))]
228 pub fn join(mut self, sep: &str) -> alloc::string::String {
229 let extra = self.splits() * sep.len();
230 let mut s = alloc::string::String::with_capacity(self.word.len() + extra);
231 s.extend(self.next());
232 for syllable in self {
233 s.push_str(sep);
234 s.push_str(syllable);
235 }
236 s
237 }
238
239 fn splits(&self) -> usize {
241 self.levels.as_slice().iter().filter(|&lvl| lvl % 2 == 1).count()
242 }
243}
244
245impl<'a> Iterator for Syllables<'a> {
246 type Item = &'a str;
247
248 fn next(&mut self) -> Option<Self::Item> {
249 let found = self.levels.any(|lvl| lvl % 2 == 1);
250 let start = self.cursor;
251 let end = self.word.len() - self.levels.len() - found as usize;
252 self.cursor = end;
253 (start < end).then(|| &self.word[start..end])
254 }
255
256 fn size_hint(&self) -> (usize, Option<usize>) {
257 let len = if self.word.is_empty() { 0 } else { 1 + self.splits() };
258 (len, Some(len))
259 }
260}
261
262impl ExactSizeIterator for Syllables<'_> {}
263
264impl FusedIterator for Syllables<'_> {}
265
266pub const MAX_INLINE_SIZE: usize = 45;
269const INLINE_BUF_SIZE: usize = MAX_INLINE_SIZE + 2; #[derive(Clone)]
273enum Bytes {
274 Array([u8; INLINE_BUF_SIZE], NonZeroU8),
275 #[cfg(feature = "alloc")]
276 Vec(alloc::vec::IntoIter<u8>),
277}
278
279impl Bytes {
280 fn zeros(len: usize) -> Self {
282 if len <= INLINE_BUF_SIZE {
283 let start = NonZeroU8::new(INLINE_BUF_SIZE as u8 + 1 - len as u8).unwrap();
285 Self::Array([0; INLINE_BUF_SIZE], start)
286 } else {
287 #[cfg(not(feature = "alloc"))]
288 panic!(
289 "hypher: maximum word length is {MAX_INLINE_SIZE} bytes when `alloc` is disabled"
290 );
291
292 #[cfg(feature = "alloc")]
293 Self::Vec(alloc::vec![0; len].into_iter())
294 }
295 }
296
297 fn as_slice(&self) -> &[u8] {
299 match self {
300 Self::Array(arr, start) => &arr[start.get() as usize - 1..],
301 #[cfg(feature = "alloc")]
302 Self::Vec(iter) => iter.as_slice(),
303 }
304 }
305
306 fn as_mut_slice(&mut self) -> &mut [u8] {
308 match self {
309 Self::Array(arr, start) => &mut arr[start.get() as usize - 1..],
310 #[cfg(feature = "alloc")]
311 Self::Vec(iter) => iter.as_mut_slice(),
312 }
313 }
314}
315
316impl Iterator for Bytes {
317 type Item = u8;
318
319 fn next(&mut self) -> Option<Self::Item> {
320 match self {
321 Self::Array(arr, start) => {
322 let index = start.get() as usize - 1;
323 if index < INLINE_BUF_SIZE {
324 *start = start.saturating_add(1); Some(arr[index])
326 } else {
327 None
328 }
329 }
330 #[cfg(feature = "alloc")]
331 Self::Vec(iter) => iter.next(),
332 }
333 }
334
335 fn size_hint(&self) -> (usize, Option<usize>) {
336 match self {
337 Self::Array(..) => (self.as_slice().len(), Some(self.as_slice().len())),
338 #[cfg(feature = "alloc")]
339 Self::Vec(iter) => iter.size_hint(),
340 }
341 }
342}
343
344impl ExactSizeIterator for Bytes {}
345
346impl Debug for Bytes {
347 fn fmt(&self, f: &mut Formatter) -> fmt::Result {
348 self.as_slice().fmt(f)
349 }
350}
351
352#[derive(Copy, Clone)]
354struct State<'a> {
355 data: &'a [u8],
356 addr: usize,
357 stride: usize,
358 levels: &'a [u8],
359 trans: &'a [u8],
360 targets: &'a [u8],
361}
362
363impl<'a> State<'a> {
364 #[allow(unused)]
366 fn root(data: &'a [u8]) -> Self {
367 let bytes = data[..4].try_into().unwrap();
368 let addr = u32::from_be_bytes(bytes) as usize;
369 Self::at(data, addr)
370 }
371
372 fn at(data: &'a [u8], addr: usize) -> Self {
374 let node = &data[addr..];
375 let mut pos = 0;
376
377 let has_levels = node[pos] >> 7 != 0;
379 let stride = usize::from((node[pos] >> 5) & 3);
380 let mut count = usize::from(node[pos] & 31);
381 pos += 1;
382
383 if count == 31 {
385 count = usize::from(node[pos]);
386 pos += 1;
387 }
388
389 let mut levels: &[u8] = &[];
391 if has_levels {
392 let offset_hi = usize::from(node[pos]) << 4;
393 let offset_lo = usize::from(node[pos + 1]) >> 4;
394 let offset = offset_hi | offset_lo;
395 let len = usize::from(node[pos + 1] & 15);
396 levels = &data[offset..offset + len];
397 pos += 2;
398 }
399
400 let trans = &node[pos..pos + count];
402 pos += count;
403
404 let targets = &node[pos..pos + stride * count];
406 Self { data, addr, stride, levels, trans, targets }
407 }
408
409 fn transition(self, b: u8) -> Option<Self> {
412 self.trans.iter().position(|&x| x == b).map(|idx| {
413 let offset = self.stride * idx;
414 let delta = from_be_bytes(&self.targets[offset..offset + self.stride]);
415 let next = (self.addr as isize + delta) as usize;
416 Self::at(self.data, next)
417 })
418 }
419
420 fn levels(self) -> impl Iterator<Item = (usize, u8)> + 'a {
422 let mut offset = 0;
423 self.levels.iter().map(move |&packed| {
424 let dist = usize::from(packed / 10);
425 let level = packed % 10;
426 offset += dist;
427 (offset, level)
428 })
429 }
430}
431
432fn from_be_bytes(buf: &[u8]) -> isize {
434 if let Ok(array) = buf.try_into() {
435 i8::from_be_bytes(array) as isize
436 } else if let Ok(array) = buf.try_into() {
437 i16::from_be_bytes(array) as isize
438 } else if buf.len() == 3 {
439 let first = usize::from(buf[0]) << 16;
440 let second = usize::from(buf[1]) << 8;
441 let third = usize::from(buf[2]);
442 let unsigned = first | second | third;
443 unsigned as isize - (1 << 23)
444 } else {
445 panic!("invalid stride");
446 }
447}
448
449fn is_char_boundary(b: u8) -> bool {
451 (b as i8) >= -0x40
452}
453
454#[cfg(test)]
455mod tests {
456 use super::{hyphenate, Lang, MAX_INLINE_SIZE};
457
458 #[allow(unused)]
459 use Lang::*;
460
461 #[allow(unused)]
462 fn test(lang: Lang, hyphenated: &str) {
463 let word = hyphenated.replace('-', "");
464 let syllables = hyphenate(&word, lang);
465 assert_eq!(syllables.join("-"), hyphenated);
466 }
467
468 #[test]
469 #[cfg(feature = "english")]
470 fn test_empty() {
471 let mut syllables = hyphenate("", English);
472 assert_eq!(syllables.next(), None);
473 }
474
475 #[test]
476 #[cfg(feature = "english")]
477 fn test_exact() {
478 assert_eq!(hyphenate("", English).len(), 0);
479 assert_eq!(hyphenate("hello", English).len(), 1);
480 assert_eq!(hyphenate("extensive", English).len(), 3);
481 }
482
483 const LONG_WORD: &str = "thisisaverylongstringwithanunrealisticwordlengthforenglishbutitmightbepossibleinanotherlanguage";
484
485 #[test]
486 #[cfg(all(feature = "english", feature = "alloc"))]
487 fn test_alloc() {
488 assert_eq!(hyphenate(&LONG_WORD[..MAX_INLINE_SIZE - 1], English).len(), 13);
489 assert_eq!(hyphenate(&LONG_WORD[..MAX_INLINE_SIZE], English).len(), 12);
490 assert_eq!(hyphenate(&LONG_WORD[..MAX_INLINE_SIZE + 1], English).len(), 12);
491 assert_eq!(hyphenate(LONG_WORD, English).len(), 25);
492 }
493
494 #[test]
495 #[cfg(all(feature = "english", not(feature = "alloc")))]
496 fn test_nonalloc() {
497 _ = hyphenate(&LONG_WORD[..MAX_INLINE_SIZE], English).count();
498 }
499 #[test]
500 #[should_panic]
501 #[cfg(all(feature = "english", not(feature = "alloc")))]
502 fn test_nonalloc_fail() {
503 _ = hyphenate(&LONG_WORD[..MAX_INLINE_SIZE + 1], English).count();
504 }
505
506 #[test]
507 #[cfg(feature = "english")]
508 fn test_english() {
509 test(English, "");
510 test(English, "hi");
511 test(English, "wel-come");
512 test(English, "walk-ing");
513 test(English, "cap-tiVe");
514 test(English, "pur-sue");
515 test(English, "wHaT-eVeR");
516 test(English, "bro-ken");
517 test(English, "ex-ten-sive");
518 test(English, "Prob-a-bil-ity");
519 test(English, "rec-og-nize");
520 }
521
522 #[test]
523 #[cfg(feature = "german")]
524 fn test_german() {
525 test(German, "");
526 test(German, "Baum");
527 test(German, "ge-hen");
528 test(German, "Ap-fel");
529 test(German, "To-ma-te");
530 test(German, "Ein-ga-be-auf-for-de-rung");
531 test(German, "Fort-pflan-zungs-lem-ma");
532 test(German, "stra-te-gie-er-hal-ten-den");
533 test(German, "hübsch");
534 test(German, "häss-lich");
535 test(German, "über-zeu-gen-der");
536 }
537
538 #[test]
539 #[cfg(feature = "greek")]
540 fn test_greek() {
541 test(Greek, "δια-με-ρί-σμα-τα");
542 test(Greek, "λα-τρευ-τός");
543 test(Greek, "κά-τοι-κος");
544 }
545
546 #[test]
547 #[cfg(feature = "georgian")]
548 fn test_georgian() {
549 test(Georgian, "თა-რო");
550 test(Georgian, "შეყ-ვა-ნა");
551 test(Georgian, "კარ-ტო-ფი-ლი");
552 }
553
554 #[test]
555 #[cfg(feature = "polish")]
556 fn test_polish() {
557 test(Polish, "wy-kształ-ciu-chy");
558 }
559
560 #[test]
561 #[cfg(feature = "czech")]
562 fn test_czech() {
563 test(Czech, "po-ví-dá-me");
564 test(Czech, "nej-ja-s-něj-ší");
565 test(Czech, "br-něn-ský");
566 }
567}