//! Korean morphological analysis on top of the mecab-ko dictionary format.
//!
//! The crate centers on [`Tokenizer`] (module [`tokenizer`]), which drives a
//! [`lattice`]-based [`viterbi`] search. Surrounding modules provide caching,
//! batching, streaming, evaluation, and compatibility layers for other Korean
//! analyzers (Kiwi, Nori).
#![warn(missing_docs)]
#![deny(unsafe_code)]
// `portable_simd` is nightly-only; gate it behind the optional "simd" feature
// so stable builds are unaffected.
#![cfg_attr(feature = "simd", feature(portable_simd))]
// Crate-wide lint relaxations. These are pedantic/restriction lints that were
// judged to add noise rather than catch real defects in this codebase.
#![allow(
    clippy::similar_names,
    clippy::too_many_lines,
    clippy::needless_range_loop,
    clippy::inline_always,
    clippy::cast_precision_loss,
    clippy::cast_possible_truncation,
    clippy::option_if_let_else,
    clippy::missing_panics_doc,
    clippy::unwrap_used
)]
/// Shared helpers for tests; also exposed via the `test-utils` feature.
#[cfg(any(test, feature = "test-utils"))]
pub mod test_utils;
/// Analysis configuration: modes, POS filtering, lemmatization, and
/// convenience extractors (nouns, verbs, adjectives, content words).
pub mod analysis_mode;
/// Batch and parallel tokenization, including large-file processing.
pub mod batch;
/// Token caching layered over a tokenizer (see `CachingTokenizer`).
pub mod cache;
/// Evaluation of tokenizer output against gold-standard datasets.
pub mod evaluate;
/// Tag/token conversions for compatibility with the Kiwi analyzer.
pub mod kiwi_compat;
/// The analysis lattice: nodes, node builders, and the lattice itself.
pub mod lattice;
/// Lattice visualization in DOT, HTML, JSON, and plain-text formats.
pub mod lattice_viz;
/// Memory accounting and interning utilities (stats, POS-tag interner).
pub mod memory;
/// N-best path search over the lattice.
pub mod nbest;
/// Tag/token conversions for compatibility with the Nori analyzer.
pub mod nori_compat;
/// Rule-based text normalization applied around tokenization.
pub mod normalizer;
/// Object pools and a shared string interner for allocation reuse.
pub mod pool;
/// Part-of-speech tag type.
pub mod pos_tag;
/// Conversion to/from Sejong-style tokens and ending rules.
pub mod sejong;
/// Streaming tokenization over readers, with progress reporting.
pub mod streaming;
/// The core [`Tokenizer`] and its [`Token`] output type.
pub mod tokenizer;
/// Unknown-word handling via character categories.
pub mod unknown;
/// Viterbi search, connection costs, and space penalties.
pub mod viterbi;
/// Async tokenizer front-end; only built with the `async` feature.
#[cfg(feature = "async")]
pub mod async_tokenizer;
// Flat re-exports so common types are reachable from the crate root.
pub use analysis_mode::{
    extract_adjectives, extract_content_words, extract_lemmas, extract_nouns, extract_verbs,
    AnalysisMode, AnalyzedToken, AnalyzerConfig, LemmatizationMode, PosFilter,
};
pub use batch::{BatchTokenizer, LargeFileProcessor, LargeFileProgress, ParallelStreamProcessor};
pub use cache::{CacheConfig, CacheStats, CachedToken, CachingTokenizer, TokenCache};
pub use error::{Error, Result};
pub use evaluate::{
    evaluate_dataset, evaluate_dataset_sejong, evaluate_tokens, EvaluateError, EvaluationResult,
    GoldSentence, GoldToken, PosStats, TestDataset,
};
pub use kiwi_compat::{from_kiwi_tag, to_kiwi_tag, KiwiPosTag, KiwiToken};
pub use lattice::{Lattice, Node, NodeBuilder, NodeType};
pub use lattice_viz::{
    lattice_to_dot, lattice_to_html, lattice_to_json, lattice_to_text, LatticeViz, VizFormat,
    VizOptions,
};
pub use memory::{
    estimate_tokens_memory, FeatureCache, InternerStats, MemoryStats, PosTagInterner,
};
pub use nbest::{ImprovedNbestSearcher, NbestPath, NbestResult};
pub use nori_compat::{
    mecab_to_nori_tag, nori_to_mecab_tag, DecompoundMode, NoriAnalyzer, NoriToken, NoriTokenizer,
    WordType,
};
pub use normalizer::{NormalizationConfig, NormalizationRule, Normalizer, RuleType};
pub use pool::{
    IdVecPool, NodeVecPool, PoolManager, PoolStats, SharedStringInterner, Symbol, TokenPool,
};
pub use pos_tag::PosTag;
pub use sejong::{EndingRule, SejongConverter, SejongToken};
pub use streaming::{
    ChunkedTokenIterator, ProgressCallback, ProgressStreamingTokenizer, SentenceReader,
    StreamingProgress, StreamingTokenizer, TokenStream,
};
pub use tokenizer::{Token, Tokenizer};
pub use unknown::{CharCategoryMap, UnknownDictionary, UnknownHandler};
pub use viterbi::{ConnectionCost, NbestSearcher, SpacePenalty, ViterbiSearcher};
#[cfg(feature = "async")]
pub use async_tokenizer::{AsyncStreamingTokenizer, AsyncTokenizer};
pub mod error {
    //! Crate-wide error and result types.
    use thiserror::Error;

    /// Errors produced while loading dictionaries or analyzing text.
    ///
    /// All variants carry a human-readable message; [`Dict`](Error::Dict)
    /// additionally wraps the underlying `mecab_ko_dict` error and supports
    /// `?`-conversion via `#[from]`.
    #[derive(Error, Debug)]
    pub enum Error {
        /// Failure loading or reading the mecab-ko dictionary.
        #[error("Dictionary error: {0}")]
        Dict(#[from] mecab_ko_dict::error::DictError),
        /// A morphological analysis step failed.
        #[error("Analysis error: {0}")]
        Analysis(String),
        /// The analyzer could not be initialized.
        #[error("Initialization error: {0}")]
        Init(String),
        /// Lattice construction or traversal failed.
        #[error("Lattice error: {0}")]
        Lattice(String),
        /// The Viterbi search failed.
        #[error("Viterbi error: {0}")]
        Viterbi(String),
    }

    /// Convenience alias: `std::result::Result` specialized to this crate's [`Error`].
    pub type Result<T> = std::result::Result<T, Error>;
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Constructing a tokenizer via `Tokenizer::new` returns `Ok`.
    #[test]
    fn test_tokenizer_creation() {
        assert!(Tokenizer::new().is_ok());
    }

    /// A short Korean greeting yields at least one token.
    #[test]
    fn test_basic_tokenize() {
        let mut analyzer = Tokenizer::new().unwrap();
        let result = analyzer.tokenize("안녕");
        assert!(!result.is_empty());
    }
}