charabia/
lib.rs

1//! The Charabia library tokenizes text, detecting the Script/Language, segmenting, normalizing, and classifying it.
2//!
3//! Examples
4//! --------
5//! #### Tokenization
6//! ```
7//! use charabia::Tokenize;
8//!
9//! let orig = "Thé quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
10//!
11//! // tokenize the text.
12//! let mut tokens = orig.tokenize();
13//!
14//! let token = tokens.next().unwrap();
15//! // the lemma in the token is normalized: `Thé` became `the`.
16//! assert_eq!(token.lemma(), "the");
17//! // token is classified as a word
18//! assert!(token.is_word());
19//!
20//! let token = tokens.next().unwrap();
21//! assert_eq!(token.lemma(), " ");
22//! // token is classified as a separator
23//! assert!(token.is_separator());
24//! ```
25//!
26//! #### Segmentation
27//! ```
28//! use charabia::Segment;
29//!
30//! let orig = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
31//!
32//! let mut segments = orig.segment_str();
33//!
34//! assert_eq!(segments.next(), Some("The"));
35//! assert_eq!(segments.next(), Some(" "));
36//! assert_eq!(segments.next(), Some("quick"));
37//! ```
38//!
39//! Build features
40//! --------
41//! Charabia comes with default features that can be deactivated at compile time.
42//! These features are additional language supports that require downloading and/or building a specialized dictionary, which impacts compilation time.
43//! These features are listed in Charabia's `Cargo.toml` and can be deactivated via [dependency features](https://doc.rust-lang.org/cargo/reference/features.html#dependency-features).
44
45#[cfg(test)]
46extern crate quickcheck;
47#[cfg(test)]
48#[macro_use(quickcheck)]
49extern crate quickcheck_macros;
50
51pub mod normalizer;
52pub mod segmenter;
53pub mod separators;
54
55mod detection;
56mod token;
57mod tokenizer;
58
59pub use detection::{Language, Script, StrDetection};
60pub use normalizer::Normalize;
61pub use segmenter::Segment;
62#[cfg(test)]
63pub use token::StaticToken;
64pub use token::{SeparatorKind, Token, TokenKind};
65
66pub use crate::tokenizer::{ReconstructedTokenIter, Tokenize, Tokenizer, TokenizerBuilder};