//! Usage examples for the `sudachiclone` crate.
//!
//! ```
//! use sudachiclone::prelude::*;
//!
//! let dictionary = Dictionary::new(None, None).unwrap();
//! let tokenizer = dictionary.create();
//!
//! // Multi-granular tokenization
//! // using `system_core.dic` or `system_full.dic` version 20190718
//! // (your output may differ from this example, depending on the dictionary you use)
//!
//! for m in tokenizer.tokenize("国家公務員", &Some(SplitMode::C), None).unwrap() {
//!     println!("{}", m.surface());
//! };
//! // => 国家公務員
//!
//! for m in tokenizer.tokenize("国家公務員", &Some(SplitMode::B), None).unwrap() {
//!     println!("{}", m.surface());
//! };
//! // => 国家
//! // => 公務員
//!
//! for m in tokenizer.tokenize("国家公務員", &Some(SplitMode::A), None).unwrap() {
//!     println!("{}", m.surface());
//! };
//! // => 国家
//! // => 公務
//! // => 員
//!
//! // Morpheme information
//!
//! let m = tokenizer.tokenize("食べ", &Some(SplitMode::A), None).unwrap().get(0).unwrap();
//! println!("{}", m.surface());
//! // => 食べ
//! println!("{}", m.dictionary_form());
//! // => 食べる
//! println!("{}", m.reading_form());
//! // => タベ
//! println!("{:?}", m.part_of_speech());
//! // => ["動詞", "一般", "*", "*", "下一段-バ行", "連用形-一般"]
//!
//! // Normalization
//!
//! println!("{}", tokenizer.tokenize("附属", &Some(SplitMode::A), None).unwrap().get(0).unwrap().normalized_form());
//! // => 付属
//!
//! println!("{}", tokenizer.tokenize("SUMMER", &Some(SplitMode::A), None).unwrap().get(0).unwrap().normalized_form());
//! // => サマー
//!
//! println!("{}", tokenizer.tokenize("シュミレーション", &Some(SplitMode::A), None).unwrap().get(0).unwrap().normalized_form());
//! // => シミュレーション
//! ```

// Crate-level (inner) attributes; these have to stay ahead of every item in this file.
// Name under which this crate is compiled.
#![crate_name = "sudachiclone"]
// Requested output kinds: the compiler's default library form (`lib`), a dynamic
// Rust library (`dylib`) and a static Rust library (`rlib`).
// NOTE(review): when building through Cargo, the manifest's `[lib] crate-type`
// setting presumably takes precedence over these attributes — confirm in Cargo.toml.
#![crate_type = "lib"]
#![crate_type = "dylib"]
#![crate_type = "rlib"]

// Modules exported as part of the crate's public API.
pub mod config;
pub mod darts;
pub mod dictionary;
pub mod dictionary_lib;
pub mod lattice;
pub mod lattice_node;
pub mod morpheme;
pub mod morpheme_list;
pub mod plugin;
pub mod tokenizer;
pub mod utf8_input_text;
pub mod utf8_input_text_builder;

// Crate-private module: declared without `pub`, so it is not reachable from outside.
mod resources;

pub mod prelude {
  pub use crate::dictionary::Dictionary;
  pub use crate::tokenizer::{CanTokenize, SplitMode, Tokenizer};
}