unic_segment/
lib.rs

1// Copyright 2012-2015 The Rust Project Developers.
2// Copyright 2017 The UNIC Project Developers.
3//
4// See the COPYRIGHT file at the top-level directory of this distribution.
5//
6// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9// option. This file may not be copied, modified, or distributed
10// except according to those terms.
11
12#![warn(
13    bad_style,
14    missing_debug_implementations,
15    missing_docs,
16    unconditional_recursion
17)]
18#![forbid(unsafe_code)]
19
20//! # UNIC — Unicode Text Segmentation Algorithms
21//!
22//! A component of [`unic`: Unicode and Internationalization Crates for Rust](/unic/).
23//!
24//! This UNIC component implements algorithms from [Unicode® Standard Annex #29 -
25//! Unicode Text Segmentation](http://unicode.org/reports/tr29/), used for detecting
26//! boundaries of text elements, such as user-perceived characters (a.k.a.
27//! *Grapheme Clusters*), *Words*, and *Sentences* (last one not implemented yet).
28//!
29//! # Examples
30//!
31//! ```rust
32//! # use unic_segment::{GraphemeIndices, Graphemes, WordBoundIndices, WordBounds, Words};
33//! assert_eq!(
34//!     Graphemes::new("a\u{310}e\u{301}o\u{308}\u{332}").collect::<Vec<&str>>(),
35//!     &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"]
36//! );
37//!
38//! assert_eq!(
39//!     Graphemes::new("a\r\nb🇺🇳🇮🇨").collect::<Vec<&str>>(),
40//!     &["a", "\r\n", "b", "🇺🇳", "🇮🇨"]
41//! );
42//!
43//! assert_eq!(
44//!     GraphemeIndices::new("a̐éö̲\r\n").collect::<Vec<(usize, &str)>>(),
45//!     &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]
46//! );
47//!
48//! fn has_alphanumeric(s: &&str) -> bool {
49//!     s.chars().any(|ch| ch.is_alphanumeric())
50//! }
51//!
52//! assert_eq!(
53//!     Words::new(
54//!         "The quick (\"brown\") fox can't jump 32.3 feet, right?",
55//!         has_alphanumeric,
56//!     ).collect::<Vec<&str>>(),
57//!     &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"]
58//! );
59//!
60//! assert_eq!(
61//!     WordBounds::new("The quick (\"brown\")  fox").collect::<Vec<&str>>(),
62//!     &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"]
63//! );
64//!
65//! assert_eq!(
66//!     WordBoundIndices::new("Brr, it's 29.3°F!").collect::<Vec<(usize, &str)>>(),
67//!     &[
68//!         (0, "Brr"),
69//!         (3, ","),
70//!         (4, " "),
71//!         (5, "it's"),
72//!         (9, " "),
73//!         (10, "29.3"),
74//!         (14, "°"),
75//!         (16, "F"),
76//!         (17, "!")
77//!     ]
78//! );
79//! ```
80
81pub use unic_ucd_segment::UNICODE_VERSION;
82
83mod pkg_info;
84pub use crate::pkg_info::{PKG_DESCRIPTION, PKG_NAME, PKG_VERSION};
85
86mod grapheme;
87pub use crate::grapheme::{GraphemeCursor, GraphemeIncomplete, GraphemeIndices, Graphemes};
88
89mod word;
90pub use crate::word::{WordBoundIndices, WordBounds, Words};