// mecab_ko_core — lib.rs
//! # mecab-ko-core
//!
//! 한국어 형태소 분석 핵심 엔진
//!
//! ## 주요 기능
//!
//! - Lattice 구축
//! - Viterbi 알고리즘
//! - N-best 경로 탐색
//! - 미등록어 처리
//!
//! ## 예제
//!
//! ```rust,no_run
//! use mecab_ko_core::tokenizer::Tokenizer;
//!
//! let mut tokenizer = Tokenizer::new().unwrap();
//! let tokens = tokenizer.tokenize("안녕하세요");
//!
//! for token in tokens {
//!     println!("{}: {}", token.surface, token.pos);
//! }
//! ```

25#![warn(missing_docs)]
26#![deny(unsafe_code)]
27#![cfg_attr(feature = "simd", feature(portable_simd))]
28
29#[cfg(any(test, feature = "test-utils"))]
30pub mod test_utils;
31
32pub mod analysis_mode;
33pub mod batch;
34pub mod cache;
35pub mod evaluate;
36pub mod kiwi_compat;
37pub mod lattice;
38pub mod lattice_viz;
39pub mod nbest;
40pub mod nori_compat;
41pub mod normalizer;
42pub mod pool;
43pub mod pos_tag;
44pub mod streaming;
45pub mod tokenizer;
46pub mod unknown;
47pub mod viterbi;
48
49#[cfg(feature = "async")]
50pub mod async_tokenizer;
51
52pub use batch::{BatchTokenizer, ParallelStreamProcessor};
53pub use error::{Error, Result};
54pub use evaluate::{
55    evaluate_dataset, evaluate_tokens, EvaluateError, EvaluationResult, GoldSentence, GoldToken,
56    PosStats, TestDataset,
57};
58pub use kiwi_compat::{from_kiwi_tag, to_kiwi_tag, KiwiPosTag, KiwiToken};
59pub use lattice::{Lattice, Node, NodeBuilder, NodeType};
60pub use nori_compat::{
61    mecab_to_nori_tag, nori_to_mecab_tag, DecompoundMode, NoriAnalyzer, NoriToken, NoriTokenizer,
62    WordType,
63};
64pub use normalizer::{NormalizationConfig, NormalizationRule, Normalizer, RuleType};
65pub use pool::{
66    IdVecPool, NodeVecPool, PoolManager, PoolStats, SharedStringInterner, Symbol, TokenPool,
67};
68pub use pos_tag::PosTag;
69pub use streaming::{StreamingTokenizer, TokenStream};
70pub use tokenizer::{Token, Tokenizer};
71pub use unknown::{CharCategoryMap, UnknownDictionary, UnknownHandler};
72pub use analysis_mode::{
73    extract_adjectives, extract_content_words, extract_lemmas, extract_nouns, extract_verbs,
74    AnalysisMode, AnalyzedToken, AnalyzerConfig, LemmatizationMode, PosFilter,
75};
76pub use nbest::{ImprovedNbestSearcher, NbestPath, NbestResult};
77pub use viterbi::{ConnectionCost, NbestSearcher, SpacePenalty, ViterbiSearcher};
78pub use lattice_viz::{
79    lattice_to_dot, lattice_to_html, lattice_to_json, lattice_to_text, LatticeViz, VizFormat,
80    VizOptions,
81};
82pub use cache::{CacheConfig, CacheStats, CachedToken, CachingTokenizer, TokenCache};
83
84#[cfg(feature = "async")]
85pub use async_tokenizer::{AsyncStreamingTokenizer, AsyncTokenizer};
86
87/// 에러 모듈
88pub mod error {
89    use thiserror::Error;
90
91    /// 핵심 엔진 에러 타입
92    #[derive(Error, Debug)]
93    pub enum Error {
94        /// 사전 에러
95        #[error("Dictionary error: {0}")]
96        Dict(#[from] mecab_ko_dict::error::DictError),
97
98        /// 분석 에러
99        #[error("Analysis error: {0}")]
100        Analysis(String),
101
102        /// 초기화 에러
103        #[error("Initialization error: {0}")]
104        Init(String),
105
106        /// Lattice 에러
107        #[error("Lattice error: {0}")]
108        Lattice(String),
109
110        /// Viterbi 에러
111        #[error("Viterbi error: {0}")]
112        Viterbi(String),
113    }
114
115    /// Result 타입 별칭
116    pub type Result<T> = std::result::Result<T, Error>;
117}
118
#[cfg(test)]
#[allow(clippy::unwrap_used)] // unwrap on a known-good setup is fine in tests
mod tests {
    use super::*;

    // Smoke test: a tokenizer can be constructed with default settings.
    #[test]
    fn test_tokenizer_creation() {
        let tokenizer = Tokenizer::new();
        assert!(tokenizer.is_ok());
    }

    // Smoke test: tokenizing a non-empty Korean string yields at least one token.
    #[test]
    fn test_basic_tokenize() {
        let mut tokenizer = Tokenizer::new().unwrap();
        let tokens = tokenizer.tokenize("안녕");
        assert!(!tokens.is_empty());
    }
}