//! # mecab-ko-core
//!
//! Core engine for Korean morphological analysis.
//!
//! ## Key Features
//!
//! - **Morphological analysis**: lattice-based Viterbi decoding
//! - **N-best search**: returns the K best analysis paths
//! - **Unknown-word handling**: automatic handling of words missing from the dictionary
//! - **Streaming**: chunk-by-chunk processing of large files
//! - **Tokenization caching**: LRU cache to speed up repeated inputs
//! - **Analysis modes**: targeted extraction of nouns, verbs, lemmas, and more
//! - **Lattice visualization**: DOT, HTML, and JSON output
//! - **Memory optimization**: string interning and object pooling
//!
//! ## Quick Start
//!
//! ```rust,no_run
//! use mecab_ko_core::Tokenizer;
//!
//! let mut tokenizer = Tokenizer::new()?;
//! let tokens = tokenizer.tokenize("아버지가방에들어가신다");
//!
//! for token in tokens {
//!     println!("{}: {}", token.surface, token.pos);
//! }
//! # Ok::<(), mecab_ko_core::Error>(())
//! ```
//!
//! ## Advanced Features
//!
//! ### Extracting Nouns Only
//!
//! ```rust,no_run
//! use mecab_ko_core::{Tokenizer, extract_nouns};
//!
//! let mut tokenizer = Tokenizer::new()?;
//! let nouns = extract_nouns(&mut tokenizer, "오늘 서울 날씨가 좋습니다");
//! // ["오늘", "서울", "날씨"]
//! # Ok::<(), mecab_ko_core::Error>(())
//! ```
//!
//! ### Streaming
//!
//! ```rust,no_run
//! use mecab_ko_core::{Tokenizer, StreamingTokenizer};
//!
//! let tokenizer = Tokenizer::new()?;
//! let mut stream = StreamingTokenizer::new(tokenizer)
//!     .with_chunk_size(8192);
//!
//! let tokens = stream.process_chunk("첫 번째 청크. ");
//! let more_tokens = stream.process_chunk("두 번째 청크.");
//! let remaining = stream.flush();
//! # Ok::<(), mecab_ko_core::Error>(())
//! ```
//!
//! ### Tokenization Caching
//!
//! ```rust,no_run
//! use mecab_ko_core::{Tokenizer, TokenCache, CacheConfig};
//!
//! let mut tokenizer = Tokenizer::new()?;
//! let cache = TokenCache::new(CacheConfig::default());
//!
//! let text = "반복되는 입력";
//! let key = cache.make_key(text);
//!
//! // Look up the cache, computing the tokens on a miss.
//! let tokens = cache.get_or_insert(key, || {
//!     tokenizer.tokenize(text)
//!         .into_iter()
//!         .map(|t| mecab_ko_core::CachedToken {
//!             surface: t.surface,
//!             pos: t.pos,
//!             start_byte: t.start_byte,
//!             end_byte: t.end_byte,
//!         })
//!         .collect()
//! });
//!
//! println!("Cache hit rate: {:.1}%", cache.stats().hit_rate() * 100.0);
//! # Ok::<(), mecab_ko_core::Error>(())
//! ```
//!
//! ### N-best Path Search
//!
//! ```rust,no_run
//! use mecab_ko_core::{Tokenizer, ImprovedNbestSearcher};
//! use mecab_ko_dict::matrix::Matrix;
//!
//! let mut tokenizer = Tokenizer::new()?;
//! let lattice = tokenizer.tokenize_to_lattice("한국어");
//! // ... perform the N-best search ...
//! # Ok::<(), mecab_ko_core::Error>(())
//! ```
//!
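//! ### Lattice Visualization
//!
//! A minimal sketch of exporting a lattice for inspection. The exact
//! signature of [`lattice_to_dot`] is an assumption here (lattice in,
//! DOT `String` out); see the [`lattice_viz`] module for the real API.
//!
//! ```rust,ignore
//! use mecab_ko_core::{lattice_to_dot, Tokenizer};
//!
//! let mut tokenizer = Tokenizer::new()?;
//! let lattice = tokenizer.tokenize_to_lattice("한국어");
//!
//! // Render the lattice as Graphviz DOT text (assumed signature).
//! let dot = lattice_to_dot(&lattice);
//! std::fs::write("lattice.dot", dot)?;
//! # Ok::<(), Box<dyn std::error::Error>>(())
//! ```
//!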
//! ## Module Layout
//!
//! | Module | Description |
//! |--------|-------------|
//! | [`tokenizer`] | Main morphological-analysis interface |
//! | [`lattice`] | Lattice graph structure |
//! | [`viterbi`] | Viterbi algorithm |
//! | [`nbest`] | N-best path search |
//! | [`streaming`] | Streaming tokenization |
//! | [`cache`] | Tokenization caching |
//! | [`batch`] | Batch / parallel processing |
//! | [`analysis_mode`] | Analysis modes (noun/verb extraction, etc.) |
//! | [`memory`] | Memory-optimization utilities |
//! | [`lattice_viz`] | Lattice visualization |
//! | [`nori_compat`] | Elasticsearch Nori compatibility |
//! | [`kiwi_compat`] | Kiwi analyzer compatibility |
//!
//! ## Feature Flags
//!
//! - `default`: zstd compression support
//! - `async`: asynchronous tokenizer (requires `tokio`)
//! - `simd`: SIMD optimizations (requires a nightly compiler)
//! - `test-utils`: exposes test utilities

#![warn(missing_docs)]
#![deny(unsafe_code)]
#![cfg_attr(feature = "simd", feature(portable_simd))]
#![allow(
    clippy::similar_names,
    clippy::too_many_lines,
    clippy::needless_range_loop,
    clippy::inline_always,
    clippy::cast_precision_loss,
    clippy::cast_possible_truncation,
    clippy::option_if_let_else,
    clippy::missing_panics_doc,
    clippy::unwrap_used
)]

#[cfg(any(test, feature = "test-utils"))]
pub mod test_utils;

pub mod analysis_mode;
pub mod batch;
pub mod cache;
pub mod evaluate;
pub mod kiwi_compat;
pub mod lattice;
pub mod lattice_viz;
pub mod memory;
pub mod nbest;
pub mod nori_compat;
pub mod normalizer;
pub mod pool;
pub mod pos_tag;
pub mod sejong;
pub mod streaming;
pub mod tokenizer;
pub mod unknown;
pub mod viterbi;

#[cfg(feature = "async")]
pub mod async_tokenizer;

pub use analysis_mode::{
    extract_adjectives, extract_content_words, extract_lemmas, extract_nouns, extract_verbs,
    AnalysisMode, AnalyzedToken, AnalyzerConfig, LemmatizationMode, PosFilter,
};
pub use batch::{BatchTokenizer, LargeFileProcessor, LargeFileProgress, ParallelStreamProcessor};
pub use cache::{CacheConfig, CacheStats, CachedToken, CachingTokenizer, TokenCache};
pub use error::{Error, Result};
pub use evaluate::{
    evaluate_dataset, evaluate_dataset_sejong, evaluate_tokens, EvaluateError, EvaluationResult,
    GoldSentence, GoldToken, PosStats, TestDataset,
};
pub use kiwi_compat::{from_kiwi_tag, to_kiwi_tag, KiwiPosTag, KiwiToken};
pub use lattice::{Lattice, Node, NodeBuilder, NodeType};
pub use lattice_viz::{
    lattice_to_dot, lattice_to_html, lattice_to_json, lattice_to_text, LatticeViz, VizFormat,
    VizOptions,
};
pub use memory::{
    estimate_tokens_memory, FeatureCache, InternerStats, MemoryStats, PosTagInterner,
};
pub use nbest::{ImprovedNbestSearcher, NbestPath, NbestResult};
pub use nori_compat::{
    mecab_to_nori_tag, nori_to_mecab_tag, DecompoundMode, NoriAnalyzer, NoriToken, NoriTokenizer,
    WordType,
};
pub use normalizer::{NormalizationConfig, NormalizationRule, Normalizer, RuleType};
pub use pool::{
    IdVecPool, NodeVecPool, PoolManager, PoolStats, SharedStringInterner, Symbol, TokenPool,
};
pub use pos_tag::PosTag;
pub use sejong::{EndingRule, SejongConverter, SejongToken};
pub use streaming::{
    ChunkedTokenIterator, ProgressCallback, ProgressStreamingTokenizer, SentenceReader,
    StreamingProgress, StreamingTokenizer, TokenStream,
};
pub use tokenizer::{Token, Tokenizer};
pub use unknown::{CharCategoryMap, UnknownDictionary, UnknownHandler};
pub use viterbi::{ConnectionCost, NbestSearcher, SpacePenalty, ViterbiSearcher};

#[cfg(feature = "async")]
pub use async_tokenizer::{AsyncStreamingTokenizer, AsyncTokenizer};

/// Error module.
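///
/// A minimal sketch of matching on [`Error`] variants, assuming
/// `Tokenizer::new` returns this crate's [`Result`]:
///
/// ```rust,no_run
/// use mecab_ko_core::{Error, Tokenizer};
///
/// match Tokenizer::new() {
///     Ok(_tokenizer) => println!("tokenizer ready"),
///     // Dictionary-loading failures are wrapped as `Error::Dict`.
///     Err(Error::Dict(e)) => eprintln!("dictionary error: {e}"),
///     Err(e) => eprintln!("engine error: {e}"),
/// }
/// ```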
pub mod error {
    use thiserror::Error;

    /// Core engine error type.
    #[derive(Error, Debug)]
    pub enum Error {
        /// Dictionary error.
        #[error("Dictionary error: {0}")]
        Dict(#[from] mecab_ko_dict::error::DictError),

        /// Analysis error.
        #[error("Analysis error: {0}")]
        Analysis(String),

        /// Initialization error.
        #[error("Initialization error: {0}")]
        Init(String),

        /// Lattice error.
        #[error("Lattice error: {0}")]
        Lattice(String),

        /// Viterbi error.
        #[error("Viterbi error: {0}")]
        Viterbi(String),
    }

    /// Alias for `std::result::Result` specialized to [`Error`].
    pub type Result<T> = std::result::Result<T, Error>;
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenizer_creation() {
        let tokenizer = Tokenizer::new();
        assert!(tokenizer.is_ok());
    }

    #[test]
    fn test_basic_tokenize() {
        let mut tokenizer = Tokenizer::new().unwrap();
        let tokens = tokenizer.tokenize("안녕");
        assert!(!tokens.is_empty());
    }
}