1#![warn(missing_docs)]
26#![deny(unsafe_code)]
27#![cfg_attr(feature = "simd", feature(portable_simd))]
28
// Shared fixtures and helpers, compiled for unit tests and for downstream
// crates that opt in via the `test-utils` feature.
#[cfg(any(test, feature = "test-utils"))]
pub mod test_utils;

// Core analysis pipeline and supporting modules (alphabetical).
pub mod analysis_mode;
pub mod batch;
pub mod cache;
pub mod evaluate;
pub mod kiwi_compat;
pub mod lattice;
pub mod lattice_viz;
pub mod nbest;
pub mod nori_compat;
pub mod normalizer;
pub mod pool;
pub mod pos_tag;
pub mod streaming;
pub mod tokenizer;
pub mod unknown;
pub mod viterbi;

// Async tokenization front-end, gated behind the `async` feature.
#[cfg(feature = "async")]
pub mod async_tokenizer;
51
52pub use batch::{BatchTokenizer, ParallelStreamProcessor};
53pub use error::{Error, Result};
54pub use evaluate::{
55 evaluate_dataset, evaluate_tokens, EvaluateError, EvaluationResult, GoldSentence, GoldToken,
56 PosStats, TestDataset,
57};
58pub use kiwi_compat::{from_kiwi_tag, to_kiwi_tag, KiwiPosTag, KiwiToken};
59pub use lattice::{Lattice, Node, NodeBuilder, NodeType};
60pub use nori_compat::{
61 mecab_to_nori_tag, nori_to_mecab_tag, DecompoundMode, NoriAnalyzer, NoriToken, NoriTokenizer,
62 WordType,
63};
64pub use normalizer::{NormalizationConfig, NormalizationRule, Normalizer, RuleType};
65pub use pool::{
66 IdVecPool, NodeVecPool, PoolManager, PoolStats, SharedStringInterner, Symbol, TokenPool,
67};
68pub use pos_tag::PosTag;
69pub use streaming::{StreamingTokenizer, TokenStream};
70pub use tokenizer::{Token, Tokenizer};
71pub use unknown::{CharCategoryMap, UnknownDictionary, UnknownHandler};
72pub use analysis_mode::{
73 extract_adjectives, extract_content_words, extract_lemmas, extract_nouns, extract_verbs,
74 AnalysisMode, AnalyzedToken, AnalyzerConfig, LemmatizationMode, PosFilter,
75};
76pub use nbest::{ImprovedNbestSearcher, NbestPath, NbestResult};
77pub use viterbi::{ConnectionCost, NbestSearcher, SpacePenalty, ViterbiSearcher};
78pub use lattice_viz::{
79 lattice_to_dot, lattice_to_html, lattice_to_json, lattice_to_text, LatticeViz, VizFormat,
80 VizOptions,
81};
82pub use cache::{CacheConfig, CacheStats, CachedToken, CachingTokenizer, TokenCache};
83
84#[cfg(feature = "async")]
85pub use async_tokenizer::{AsyncStreamingTokenizer, AsyncTokenizer};
86
/// Crate-wide error and result types.
pub mod error {
    use thiserror::Error;

    /// The error type returned by fallible operations in this crate.
    ///
    /// Dictionary failures from the underlying `mecab_ko_dict` crate are
    /// converted automatically via `#[from]`; the remaining variants carry a
    /// human-readable message for the pipeline stage that failed.
    #[derive(Error, Debug)]
    pub enum Error {
        /// Failure loading or querying the MeCab-ko dictionary.
        #[error("Dictionary error: {0}")]
        Dict(#[from] mecab_ko_dict::error::DictError),

        /// Failure during morphological analysis.
        #[error("Analysis error: {0}")]
        Analysis(String),

        /// Failure while initializing a component.
        #[error("Initialization error: {0}")]
        Init(String),

        /// Failure while building or traversing the lattice.
        #[error("Lattice error: {0}")]
        Lattice(String),

        /// Failure during Viterbi path search.
        #[error("Viterbi error: {0}")]
        Viterbi(String),
    }

    /// Convenience alias for `std::result::Result` specialized to [`Error`].
    pub type Result<T> = std::result::Result<T, Error>;
}
118
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    // Smoke test: constructing a tokenizer with default settings succeeds.
    #[test]
    fn test_tokenizer_creation() {
        assert!(Tokenizer::new().is_ok());
    }

    // Smoke test: a short Korean input produces at least one token.
    #[test]
    fn test_basic_tokenize() {
        let mut tok = Tokenizer::new().unwrap();
        assert!(!tok.tokenize("안녕").is_empty());
    }
}