1use crate::tokenizer::{Token, Tokenizer};
36
/// Preset deciding which tokens an [`AnalyzerConfig`] keeps and how their
/// surface forms are rendered.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum AnalysisMode {
    /// Keep every token unchanged (the default).
    #[default]
    Full,

    /// Keep only nominal tags (`NN*`, `NR*`, `NP*` prefixes).
    NounsOnly,

    /// Keep only verbs (exact tag `VV`).
    VerbsOnly,

    /// Keep only adjectives (exact tag `VA`).
    AdjectivesOnly,

    /// Keep only predicates (`VV` or `VA`).
    PredicatesOnly,

    /// Keep nouns, verbs, adjectives and adverbs (`MA*`).
    ContentWordsOnly,

    /// Keep every token; downstream consumers use surfaces only.
    SurfaceOnly,

    /// Keep every token; surfaces are meant to be lemmatized.
    Lemmatized,

    /// Keep every token; downstream consumers use the POS tags.
    PosTagsOnly,

    /// Filtering is delegated to a user-supplied [`PosFilter`].
    Custom,
}

impl AnalysisMode {
    /// Returns `true` for the modes that restrict output by
    /// part-of-speech tag.
    #[must_use]
    pub const fn uses_pos_filter(&self) -> bool {
        // Exhaustive match: adding a variant forces a decision here.
        match self {
            Self::NounsOnly
            | Self::VerbsOnly
            | Self::AdjectivesOnly
            | Self::PredicatesOnly
            | Self::ContentWordsOnly
            | Self::Custom => true,
            Self::Full | Self::SurfaceOnly | Self::Lemmatized | Self::PosTagsOnly => false,
        }
    }

    /// Returns `true` when the mode implies lemma substitution.
    #[must_use]
    pub const fn uses_lemmatization(&self) -> bool {
        match self {
            Self::Lemmatized => true,
            Self::Full
            | Self::NounsOnly
            | Self::VerbsOnly
            | Self::AdjectivesOnly
            | Self::PredicatesOnly
            | Self::ContentWordsOnly
            | Self::SurfaceOnly
            | Self::PosTagsOnly
            | Self::Custom => false,
        }
    }
}
95
/// A builder-style include/exclude filter over part-of-speech tags.
///
/// Exclusion rules are checked first and always win. When no include
/// rule is configured, every tag that survives the exclusions matches.
#[derive(Debug, Clone, Default)]
pub struct PosFilter {
    // Tags accepted when they start with any of these prefixes.
    include_prefixes: Vec<String>,
    // Tags rejected when they start with any of these prefixes.
    exclude_prefixes: Vec<String>,
    // Tags accepted on exact equality.
    include_exact: Vec<String>,
    // Tags rejected on exact equality.
    exclude_exact: Vec<String>,
}

impl PosFilter {
    /// Creates an empty filter, which matches every tag.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Accepts every nominal tag (prefixes `NN`, `NR`, `NP`).
    #[must_use]
    pub fn include_nouns(mut self) -> Self {
        for prefix in ["NN", "NR", "NP"] {
            self.include_prefixes.push(prefix.to_owned());
        }
        self
    }

    /// Accepts common nouns (exact tag `NNG`).
    #[must_use]
    pub fn include_common_nouns(mut self) -> Self {
        self.include_exact.push("NNG".to_owned());
        self
    }

    /// Accepts proper nouns (exact tag `NNP`).
    #[must_use]
    pub fn include_proper_nouns(mut self) -> Self {
        self.include_exact.push("NNP".to_owned());
        self
    }

    /// Accepts verbs (exact tag `VV`).
    #[must_use]
    pub fn include_verbs(mut self) -> Self {
        self.include_exact.push("VV".to_owned());
        self
    }

    /// Accepts adjectives (exact tag `VA`).
    #[must_use]
    pub fn include_adjectives(mut self) -> Self {
        self.include_exact.push("VA".to_owned());
        self
    }

    /// Accepts all predicate tags via the `V` prefix (so `VV`, `VA`,
    /// `VX`, `VC…` all match).
    #[must_use]
    pub fn include_predicates(mut self) -> Self {
        self.include_prefixes.push("V".to_owned());
        self
    }

    /// Accepts adverb tags via the `MA` prefix.
    #[must_use]
    pub fn include_adverbs(mut self) -> Self {
        self.include_prefixes.push("MA".to_owned());
        self
    }

    /// Rejects particle tags via the `J` prefix.
    #[must_use]
    pub fn exclude_particles(mut self) -> Self {
        self.exclude_prefixes.push("J".to_owned());
        self
    }

    /// Rejects ending tags via the `E` prefix.
    #[must_use]
    pub fn exclude_endings(mut self) -> Self {
        self.exclude_prefixes.push("E".to_owned());
        self
    }

    /// Rejects affix tags via the `X` prefix.
    #[must_use]
    pub fn exclude_affixes(mut self) -> Self {
        self.exclude_prefixes.push("X".to_owned());
        self
    }

    /// Rejects symbol tags via the `S` prefix.
    #[must_use]
    pub fn exclude_symbols(mut self) -> Self {
        self.exclude_prefixes.push("S".to_owned());
        self
    }

    /// Accepts any tag starting with `prefix`.
    #[must_use]
    pub fn include_prefix(mut self, prefix: &str) -> Self {
        self.include_prefixes.push(prefix.to_owned());
        self
    }

    /// Rejects any tag starting with `prefix`.
    #[must_use]
    pub fn exclude_prefix(mut self, prefix: &str) -> Self {
        self.exclude_prefixes.push(prefix.to_owned());
        self
    }

    /// Accepts `tag` exactly.
    #[must_use]
    pub fn include_tag(mut self, tag: &str) -> Self {
        self.include_exact.push(tag.to_owned());
        self
    }

    /// Rejects `tag` exactly.
    #[must_use]
    pub fn exclude_tag(mut self, tag: &str) -> Self {
        self.exclude_exact.push(tag.to_owned());
        self
    }

    /// Ready-made filter for content words: nouns, verbs, adjectives
    /// and adverbs.
    #[must_use]
    pub fn content_words() -> Self {
        Self::new()
            .include_nouns()
            .include_verbs()
            .include_adjectives()
            .include_adverbs()
    }

    /// Tests whether `pos` passes this filter.
    ///
    /// Evaluation order: any exclusion rejects immediately; with no
    /// include rules configured everything else passes; otherwise at
    /// least one include rule must hit.
    #[must_use]
    pub fn matches(&self, pos: &str) -> bool {
        let rejected = self.exclude_exact.iter().any(|tag| tag.as_str() == pos)
            || self
                .exclude_prefixes
                .iter()
                .any(|prefix| pos.starts_with(prefix.as_str()));
        if rejected {
            return false;
        }

        // An empty include list means "accept everything not excluded".
        if self.include_exact.is_empty() && self.include_prefixes.is_empty() {
            return true;
        }

        self.include_exact.iter().any(|tag| tag.as_str() == pos)
            || self
                .include_prefixes
                .iter()
                .any(|prefix| pos.starts_with(prefix.as_str()))
    }
}
270
/// Controls whether analyzed tokens expose their dictionary (lemma) form
/// instead of the inflected surface form.
// `Hash` is derived for parity with `AnalysisMode`: both are fieldless
// `Copy` config enums, and only this one was missing it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum LemmatizationMode {
    /// Keep surfaces untouched (the default).
    #[default]
    None,

    /// Replace surfaces with lemmas for predicates (`VV`/`VA`) only.
    PredicatesOnly,

    /// Replace surfaces with lemmas for every token that has one.
    All,
}
284
/// Configuration describing how tokenizer output is filtered and
/// transformed into [`AnalyzedToken`]s.
#[derive(Debug, Clone)]
pub struct AnalyzerConfig {
    /// High-level preset deciding which tokens survive filtering.
    pub mode: AnalysisMode,
    /// Custom tag filter; only consulted when `mode` is `Custom`.
    pub pos_filter: Option<PosFilter>,
    /// Whether (and for which tags) surfaces are replaced by lemmas.
    pub lemmatization: LemmatizationMode,
    /// Minimum token length in characters; `0` disables the check.
    pub min_length: usize,
    /// Maximum token length in characters; `0` disables the check.
    pub max_length: usize,
}
301
302impl Default for AnalyzerConfig {
303 fn default() -> Self {
304 Self {
305 mode: AnalysisMode::Full,
306 pos_filter: None,
307 lemmatization: LemmatizationMode::None,
308 min_length: 0,
309 max_length: 0,
310 }
311 }
312}
313
314impl AnalyzerConfig {
315 #[must_use]
317 pub fn new(mode: AnalysisMode) -> Self {
318 Self {
319 mode,
320 ..Self::default()
321 }
322 }
323
324 #[must_use]
326 pub fn with_filter(filter: PosFilter) -> Self {
327 Self {
328 mode: AnalysisMode::Custom,
329 pos_filter: Some(filter),
330 ..Self::default()
331 }
332 }
333
334 #[must_use]
336 pub const fn with_lemmatization(mut self, mode: LemmatizationMode) -> Self {
337 self.lemmatization = mode;
338 self
339 }
340
341 #[must_use]
343 pub const fn with_min_length(mut self, len: usize) -> Self {
344 self.min_length = len;
345 self
346 }
347
348 #[must_use]
350 pub const fn with_max_length(mut self, len: usize) -> Self {
351 self.max_length = len;
352 self
353 }
354
355 pub fn analyze(&self, tokenizer: &mut Tokenizer, text: &str) -> Vec<AnalyzedToken> {
360 let tokens = tokenizer.tokenize(text);
361 self.process_tokens(tokens)
362 }
363
364 #[must_use]
368 pub fn process_tokens(&self, tokens: Vec<Token>) -> Vec<AnalyzedToken> {
369 tokens
370 .into_iter()
371 .filter(|t| self.filter_token(t))
372 .map(|t| self.transform_token(t))
373 .collect()
374 }
375
376 fn filter_token(&self, token: &Token) -> bool {
378 let char_len = token.char_len();
380 if self.min_length > 0 && char_len < self.min_length {
381 return false;
382 }
383 if self.max_length > 0 && char_len > self.max_length {
384 return false;
385 }
386
387 match self.mode {
389 AnalysisMode::Full
390 | AnalysisMode::SurfaceOnly
391 | AnalysisMode::Lemmatized
392 | AnalysisMode::PosTagsOnly => true,
393 AnalysisMode::NounsOnly => {
394 token.pos.starts_with("NN")
395 || token.pos.starts_with("NR")
396 || token.pos.starts_with("NP")
397 }
398 AnalysisMode::VerbsOnly => token.pos == "VV",
399 AnalysisMode::AdjectivesOnly => token.pos == "VA",
400 AnalysisMode::PredicatesOnly => token.pos == "VV" || token.pos == "VA",
401 AnalysisMode::ContentWordsOnly => {
402 token.pos.starts_with("NN")
403 || token.pos.starts_with("NR")
404 || token.pos.starts_with("NP")
405 || token.pos == "VV"
406 || token.pos == "VA"
407 || token.pos.starts_with("MA")
408 }
409 AnalysisMode::Custom => self
410 .pos_filter
411 .as_ref()
412 .map_or(true, |f| f.matches(&token.pos)),
413 }
414 }
415
416 fn transform_token(&self, token: Token) -> AnalyzedToken {
418 let surface = match self.lemmatization {
419 LemmatizationMode::None => token.surface.clone(),
420 LemmatizationMode::PredicatesOnly => {
421 if token.pos == "VV" || token.pos == "VA" {
422 token.lemma.clone().unwrap_or_else(|| token.surface.clone())
423 } else {
424 token.surface.clone()
425 }
426 }
427 LemmatizationMode::All => token.lemma.clone().unwrap_or_else(|| token.surface.clone()),
428 };
429
430 AnalyzedToken {
431 surface,
432 original_surface: token.surface,
433 pos: token.pos,
434 start_pos: token.start_pos,
435 end_pos: token.end_pos,
436 lemma: token.lemma,
437 is_lemmatized: self.lemmatization != LemmatizationMode::None,
438 }
439 }
440}
441
/// A token after filtering/transformation by an [`AnalyzerConfig`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AnalyzedToken {
    /// Output form: the lemma when lemmatization applied and one was
    /// available, otherwise the original surface.
    pub surface: String,
    /// The surface exactly as produced by the tokenizer.
    pub original_surface: String,
    /// Part-of-speech tag.
    pub pos: String,
    /// Start offset in characters (byte offsets live on `Token`).
    pub start_pos: usize,
    /// End offset in characters.
    pub end_pos: usize,
    /// Lemma carried over from the tokenizer, if any.
    pub lemma: Option<String>,
    /// True when the config had any lemmatization mode enabled — even
    /// if this particular token kept its surface.
    pub is_lemmatized: bool,
}
462
impl AnalyzedToken {
    /// Length of the token in characters (`end_pos - start_pos`).
    #[must_use]
    pub const fn char_len(&self) -> usize {
        self.end_pos - self.start_pos
    }
}
470
471pub fn extract_nouns(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
473 AnalyzerConfig::new(AnalysisMode::NounsOnly)
474 .analyze(tokenizer, text)
475 .into_iter()
476 .map(|t| t.surface)
477 .collect()
478}
479
480pub fn extract_verbs(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
482 AnalyzerConfig::new(AnalysisMode::VerbsOnly)
483 .analyze(tokenizer, text)
484 .into_iter()
485 .map(|t| t.surface)
486 .collect()
487}
488
489pub fn extract_adjectives(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
491 AnalyzerConfig::new(AnalysisMode::AdjectivesOnly)
492 .analyze(tokenizer, text)
493 .into_iter()
494 .map(|t| t.surface)
495 .collect()
496}
497
498pub fn extract_content_words(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
500 AnalyzerConfig::new(AnalysisMode::ContentWordsOnly)
501 .analyze(tokenizer, text)
502 .into_iter()
503 .map(|t| t.surface)
504 .collect()
505}
506
507pub fn extract_lemmas(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
509 AnalyzerConfig::new(AnalysisMode::Lemmatized)
510 .with_lemmatization(LemmatizationMode::All)
511 .analyze(tokenizer, text)
512 .into_iter()
513 .map(|t| t.surface)
514 .collect()
515}
516
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_analysis_mode_uses_filter() {
        assert!(!AnalysisMode::Full.uses_pos_filter());
        assert!(AnalysisMode::NounsOnly.uses_pos_filter());
        assert!(AnalysisMode::Custom.uses_pos_filter());
    }

    #[test]
    fn test_pos_filter_matches_nouns() {
        let filter = PosFilter::new().include_nouns();

        // NNB matches via the "NN" prefix even though it is never
        // listed exactly.
        assert!(filter.matches("NNG"));
        assert!(filter.matches("NNP"));
        assert!(filter.matches("NNB"));
        assert!(filter.matches("NR"));
        assert!(filter.matches("NP"));
        assert!(!filter.matches("VV"));
        assert!(!filter.matches("JKS"));
    }

    #[test]
    fn test_pos_filter_matches_verbs() {
        // include_verbs() registers the exact tag "VV", so "VA" must
        // not match.
        let filter = PosFilter::new().include_verbs();

        assert!(filter.matches("VV"));
        assert!(!filter.matches("VA"));
        assert!(!filter.matches("NNG"));
    }

    #[test]
    fn test_pos_filter_matches_predicates() {
        // include_predicates() registers the "V" prefix, so "VX" and
        // "VCP" match as well.
        let filter = PosFilter::new().include_predicates();

        assert!(filter.matches("VV"));
        assert!(filter.matches("VA"));
        assert!(filter.matches("VX"));
        assert!(filter.matches("VCP"));
        assert!(!filter.matches("NNG"));
    }

    #[test]
    fn test_pos_filter_content_words() {
        let filter = PosFilter::content_words();

        assert!(filter.matches("NNG"));
        assert!(filter.matches("VV"));
        assert!(filter.matches("VA"));
        assert!(filter.matches("MAG"));
        assert!(!filter.matches("JKS"));
        assert!(!filter.matches("EC"));
    }

    #[test]
    fn test_pos_filter_exclude() {
        // Exclusions take precedence over a matching include prefix.
        let filter = PosFilter::new().include_prefix("N").exclude_tag("NNB");

        assert!(filter.matches("NNG"));
        assert!(filter.matches("NNP"));
        assert!(!filter.matches("NNB"));
        assert!(!filter.matches("VV"));
    }

    #[test]
    fn test_pos_filter_empty_includes_all() {
        // A filter with no rules at all accepts every tag.
        let filter = PosFilter::new();

        assert!(filter.matches("NNG"));
        assert!(filter.matches("VV"));
        assert!(filter.matches("JKS"));
    }

    #[test]
    fn test_analyzer_config_default() {
        let config = AnalyzerConfig::default();

        assert_eq!(config.mode, AnalysisMode::Full);
        assert!(config.pos_filter.is_none());
        assert_eq!(config.lemmatization, LemmatizationMode::None);
    }

    #[test]
    fn test_analyzer_config_with_filter() {
        let filter = PosFilter::new().include_nouns();
        let config = AnalyzerConfig::with_filter(filter);

        assert_eq!(config.mode, AnalysisMode::Custom);
        assert!(config.pos_filter.is_some());
    }

    #[test]
    fn test_analyzer_config_process_tokens() {
        // One noun and one particle: NounsOnly must drop the particle.
        let tokens = vec![
            Token {
                surface: "한국어".to_string(),
                pos: "NNG".to_string(),
                start_pos: 0,
                end_pos: 3,
                start_byte: 0,
                end_byte: 9,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
            Token {
                surface: "가".to_string(),
                pos: "JKS".to_string(),
                start_pos: 3,
                end_pos: 4,
                start_byte: 9,
                end_byte: 12,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
        ];

        let config = AnalyzerConfig::new(AnalysisMode::NounsOnly);
        let result = config.process_tokens(tokens);

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].surface, "한국어");
    }

    #[test]
    fn test_analyzer_config_min_length() {
        // Both tokens are nouns; min_length(2) must drop the
        // single-character one.
        let tokens = vec![
            Token {
                surface: "가".to_string(),
                pos: "NNG".to_string(),
                start_pos: 0,
                end_pos: 1,
                start_byte: 0,
                end_byte: 3,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
            Token {
                surface: "한국어".to_string(),
                pos: "NNG".to_string(),
                start_pos: 1,
                end_pos: 4,
                start_byte: 3,
                end_byte: 12,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
        ];

        let config = AnalyzerConfig::new(AnalysisMode::NounsOnly).with_min_length(2);
        let result = config.process_tokens(tokens);

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].surface, "한국어");
    }

    #[test]
    fn test_lemmatization_mode() {
        let tokens = vec![Token {
            surface: "먹었".to_string(),
            pos: "VV".to_string(),
            start_pos: 0,
            end_pos: 2,
            start_byte: 0,
            end_byte: 6,
            reading: Some("먹".to_string()),
            lemma: Some("먹다".to_string()),
            cost: 0,
            features: String::new(),
            normalized: None,
        }];

        // Full mode keeps the inflected surface ...
        let config = AnalyzerConfig::new(AnalysisMode::Full);
        let result = config.process_tokens(tokens.clone());
        assert_eq!(result[0].surface, "먹었");

        // ... while predicate lemmatization swaps in the lemma for VV.
        let config = AnalyzerConfig::new(AnalysisMode::Lemmatized)
            .with_lemmatization(LemmatizationMode::PredicatesOnly);
        let result = config.process_tokens(tokens);
        assert_eq!(result[0].surface, "먹다");
    }
}