1use crate::tokenizer::{Token, Tokenizer};
36
/// Preset deciding which tokens an [`AnalyzerConfig`] keeps and how their
/// surface forms are rendered.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum AnalysisMode {
    /// Keep every token unchanged (the default).
    #[default]
    Full,

    /// Keep only nominal tags (`NN*`, `NR*`, `NP*` prefixes).
    NounsOnly,

    /// Keep only verbs (exact tag `VV`).
    VerbsOnly,

    /// Keep only adjectives (exact tag `VA`).
    AdjectivesOnly,

    /// Keep only predicates (`VV` or `VA`).
    PredicatesOnly,

    /// Keep nouns, verbs, adjectives and adverbs (`MA*`).
    ContentWordsOnly,

    /// Keep every token; downstream consumers use surfaces only.
    SurfaceOnly,

    /// Keep every token; surfaces are meant to be lemmatized.
    Lemmatized,

    /// Keep every token; downstream consumers use the POS tags.
    PosTagsOnly,

    /// Filtering is delegated to a user-supplied [`PosFilter`].
    Custom,
}

impl AnalysisMode {
    /// Returns `true` for the modes that restrict output by
    /// part-of-speech tag.
    #[must_use]
    pub const fn uses_pos_filter(&self) -> bool {
        // Exhaustive match: adding a variant forces a decision here.
        match self {
            Self::NounsOnly
            | Self::VerbsOnly
            | Self::AdjectivesOnly
            | Self::PredicatesOnly
            | Self::ContentWordsOnly
            | Self::Custom => true,
            Self::Full | Self::SurfaceOnly | Self::Lemmatized | Self::PosTagsOnly => false,
        }
    }

    /// Returns `true` when the mode implies lemma substitution.
    #[must_use]
    pub const fn uses_lemmatization(&self) -> bool {
        match self {
            Self::Lemmatized => true,
            Self::Full
            | Self::NounsOnly
            | Self::VerbsOnly
            | Self::AdjectivesOnly
            | Self::PredicatesOnly
            | Self::ContentWordsOnly
            | Self::SurfaceOnly
            | Self::PosTagsOnly
            | Self::Custom => false,
        }
    }
}
95
/// A builder-style include/exclude filter over part-of-speech tags.
///
/// Exclusion rules are checked first and always win. When no include
/// rule is configured, every tag that survives the exclusions matches.
#[derive(Debug, Clone, Default)]
pub struct PosFilter {
    // Tags accepted when they start with any of these prefixes.
    include_prefixes: Vec<String>,
    // Tags rejected when they start with any of these prefixes.
    exclude_prefixes: Vec<String>,
    // Tags accepted on exact equality.
    include_exact: Vec<String>,
    // Tags rejected on exact equality.
    exclude_exact: Vec<String>,
}

impl PosFilter {
    /// Creates an empty filter, which matches every tag.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Accepts every nominal tag (prefixes `NN`, `NR`, `NP`).
    #[must_use]
    pub fn include_nouns(mut self) -> Self {
        for prefix in ["NN", "NR", "NP"] {
            self.include_prefixes.push(prefix.to_owned());
        }
        self
    }

    /// Accepts common nouns (exact tag `NNG`).
    #[must_use]
    pub fn include_common_nouns(mut self) -> Self {
        self.include_exact.push("NNG".to_owned());
        self
    }

    /// Accepts proper nouns (exact tag `NNP`).
    #[must_use]
    pub fn include_proper_nouns(mut self) -> Self {
        self.include_exact.push("NNP".to_owned());
        self
    }

    /// Accepts verbs (exact tag `VV`).
    #[must_use]
    pub fn include_verbs(mut self) -> Self {
        self.include_exact.push("VV".to_owned());
        self
    }

    /// Accepts adjectives (exact tag `VA`).
    #[must_use]
    pub fn include_adjectives(mut self) -> Self {
        self.include_exact.push("VA".to_owned());
        self
    }

    /// Accepts all predicate tags via the `V` prefix (so `VV`, `VA`,
    /// `VX`, `VC…` all match).
    #[must_use]
    pub fn include_predicates(mut self) -> Self {
        self.include_prefixes.push("V".to_owned());
        self
    }

    /// Accepts adverb tags via the `MA` prefix.
    #[must_use]
    pub fn include_adverbs(mut self) -> Self {
        self.include_prefixes.push("MA".to_owned());
        self
    }

    /// Rejects particle tags via the `J` prefix.
    #[must_use]
    pub fn exclude_particles(mut self) -> Self {
        self.exclude_prefixes.push("J".to_owned());
        self
    }

    /// Rejects ending tags via the `E` prefix.
    #[must_use]
    pub fn exclude_endings(mut self) -> Self {
        self.exclude_prefixes.push("E".to_owned());
        self
    }

    /// Rejects affix tags via the `X` prefix.
    #[must_use]
    pub fn exclude_affixes(mut self) -> Self {
        self.exclude_prefixes.push("X".to_owned());
        self
    }

    /// Rejects symbol tags via the `S` prefix.
    #[must_use]
    pub fn exclude_symbols(mut self) -> Self {
        self.exclude_prefixes.push("S".to_owned());
        self
    }

    /// Accepts any tag starting with `prefix`.
    #[must_use]
    pub fn include_prefix(mut self, prefix: &str) -> Self {
        self.include_prefixes.push(prefix.to_owned());
        self
    }

    /// Rejects any tag starting with `prefix`.
    #[must_use]
    pub fn exclude_prefix(mut self, prefix: &str) -> Self {
        self.exclude_prefixes.push(prefix.to_owned());
        self
    }

    /// Accepts `tag` exactly.
    #[must_use]
    pub fn include_tag(mut self, tag: &str) -> Self {
        self.include_exact.push(tag.to_owned());
        self
    }

    /// Rejects `tag` exactly.
    #[must_use]
    pub fn exclude_tag(mut self, tag: &str) -> Self {
        self.exclude_exact.push(tag.to_owned());
        self
    }

    /// Ready-made filter for content words: nouns, verbs, adjectives
    /// and adverbs.
    #[must_use]
    pub fn content_words() -> Self {
        Self::new()
            .include_nouns()
            .include_verbs()
            .include_adjectives()
            .include_adverbs()
    }

    /// Tests whether `pos` passes this filter.
    ///
    /// Evaluation order: any exclusion rejects immediately; with no
    /// include rules configured everything else passes; otherwise at
    /// least one include rule must hit.
    #[must_use]
    pub fn matches(&self, pos: &str) -> bool {
        let rejected = self.exclude_exact.iter().any(|tag| tag.as_str() == pos)
            || self
                .exclude_prefixes
                .iter()
                .any(|prefix| pos.starts_with(prefix.as_str()));
        if rejected {
            return false;
        }

        // An empty include list means "accept everything not excluded".
        if self.include_exact.is_empty() && self.include_prefixes.is_empty() {
            return true;
        }

        self.include_exact.iter().any(|tag| tag.as_str() == pos)
            || self
                .include_prefixes
                .iter()
                .any(|prefix| pos.starts_with(prefix.as_str()))
    }
}
270
/// Controls whether analyzed tokens expose their dictionary (lemma) form
/// instead of the inflected surface form.
// `Hash` is derived for parity with `AnalysisMode`: both are fieldless
// `Copy` config enums, and only this one was missing it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum LemmatizationMode {
    /// Keep surfaces untouched (the default).
    #[default]
    None,

    /// Replace surfaces with lemmas for predicates (`VV`/`VA`) only.
    PredicatesOnly,

    /// Replace surfaces with lemmas for every token that has one.
    All,
}
284
/// Configuration describing how tokenizer output is filtered and
/// transformed into [`AnalyzedToken`]s.
#[derive(Debug, Clone)]
pub struct AnalyzerConfig {
    /// High-level preset deciding which tokens survive filtering.
    pub mode: AnalysisMode,
    /// Custom tag filter; only consulted when `mode` is `Custom`.
    pub pos_filter: Option<PosFilter>,
    /// Whether (and for which tags) surfaces are replaced by lemmas.
    pub lemmatization: LemmatizationMode,
    /// Minimum token length in characters; `0` disables the check.
    pub min_length: usize,
    /// Maximum token length in characters; `0` disables the check.
    pub max_length: usize,
}
301
302impl Default for AnalyzerConfig {
303 fn default() -> Self {
304 Self {
305 mode: AnalysisMode::Full,
306 pos_filter: None,
307 lemmatization: LemmatizationMode::None,
308 min_length: 0,
309 max_length: 0,
310 }
311 }
312}
313
314impl AnalyzerConfig {
315 #[must_use]
317 pub fn new(mode: AnalysisMode) -> Self {
318 Self {
319 mode,
320 ..Self::default()
321 }
322 }
323
324 #[must_use]
326 pub fn with_filter(filter: PosFilter) -> Self {
327 Self {
328 mode: AnalysisMode::Custom,
329 pos_filter: Some(filter),
330 ..Self::default()
331 }
332 }
333
334 #[must_use]
336 pub const fn with_lemmatization(mut self, mode: LemmatizationMode) -> Self {
337 self.lemmatization = mode;
338 self
339 }
340
341 #[must_use]
343 pub const fn with_min_length(mut self, len: usize) -> Self {
344 self.min_length = len;
345 self
346 }
347
348 #[must_use]
350 pub const fn with_max_length(mut self, len: usize) -> Self {
351 self.max_length = len;
352 self
353 }
354
355 pub fn analyze(&self, tokenizer: &mut Tokenizer, text: &str) -> Vec<AnalyzedToken> {
360 let tokens = tokenizer.tokenize(text);
361 self.process_tokens(tokens)
362 }
363
364 #[must_use]
368 pub fn process_tokens(&self, tokens: Vec<Token>) -> Vec<AnalyzedToken> {
369 tokens
370 .into_iter()
371 .filter(|t| self.filter_token(t))
372 .map(|t| self.transform_token(t))
373 .collect()
374 }
375
376 fn filter_token(&self, token: &Token) -> bool {
378 let char_len = token.char_len();
380 if self.min_length > 0 && char_len < self.min_length {
381 return false;
382 }
383 if self.max_length > 0 && char_len > self.max_length {
384 return false;
385 }
386
387 match self.mode {
389 AnalysisMode::Full
390 | AnalysisMode::SurfaceOnly
391 | AnalysisMode::Lemmatized
392 | AnalysisMode::PosTagsOnly => true,
393 AnalysisMode::NounsOnly => {
394 token.pos.starts_with("NN")
395 || token.pos.starts_with("NR")
396 || token.pos.starts_with("NP")
397 }
398 AnalysisMode::VerbsOnly => token.pos == "VV",
399 AnalysisMode::AdjectivesOnly => token.pos == "VA",
400 AnalysisMode::PredicatesOnly => token.pos == "VV" || token.pos == "VA",
401 AnalysisMode::ContentWordsOnly => {
402 token.pos.starts_with("NN")
403 || token.pos.starts_with("NR")
404 || token.pos.starts_with("NP")
405 || token.pos == "VV"
406 || token.pos == "VA"
407 || token.pos.starts_with("MA")
408 }
409 AnalysisMode::Custom => self
410 .pos_filter
411 .as_ref()
412 .map_or(true, |f| f.matches(&token.pos)),
413 }
414 }
415
416 fn transform_token(&self, token: Token) -> AnalyzedToken {
418 let surface = match self.lemmatization {
419 LemmatizationMode::None => token.surface.clone(),
420 LemmatizationMode::PredicatesOnly => {
421 if token.pos == "VV" || token.pos == "VA" {
422 token.lemma.clone().unwrap_or_else(|| token.surface.clone())
423 } else {
424 token.surface.clone()
425 }
426 }
427 LemmatizationMode::All => token.lemma.clone().unwrap_or_else(|| token.surface.clone()),
428 };
429
430 AnalyzedToken {
431 surface,
432 original_surface: token.surface,
433 pos: token.pos,
434 start_pos: token.start_pos,
435 end_pos: token.end_pos,
436 lemma: token.lemma,
437 is_lemmatized: self.lemmatization != LemmatizationMode::None,
438 }
439 }
440}
441
/// A token after filtering/transformation by an [`AnalyzerConfig`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AnalyzedToken {
    /// Output form: the lemma when lemmatization applied and one was
    /// available, otherwise the original surface.
    pub surface: String,
    /// The surface exactly as produced by the tokenizer.
    pub original_surface: String,
    /// Part-of-speech tag.
    pub pos: String,
    /// Start offset in characters (byte offsets live on `Token`).
    pub start_pos: usize,
    /// End offset in characters.
    pub end_pos: usize,
    /// Lemma carried over from the tokenizer, if any.
    pub lemma: Option<String>,
    /// True when the config had any lemmatization mode enabled — even
    /// if this particular token kept its surface.
    pub is_lemmatized: bool,
}
462
impl AnalyzedToken {
    /// Length of the token in characters (`end_pos - start_pos`).
    #[must_use]
    pub const fn char_len(&self) -> usize {
        self.end_pos - self.start_pos
    }
}
470
471pub fn extract_nouns(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
473 AnalyzerConfig::new(AnalysisMode::NounsOnly)
474 .analyze(tokenizer, text)
475 .into_iter()
476 .map(|t| t.surface)
477 .collect()
478}
479
480pub fn extract_verbs(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
482 AnalyzerConfig::new(AnalysisMode::VerbsOnly)
483 .analyze(tokenizer, text)
484 .into_iter()
485 .map(|t| t.surface)
486 .collect()
487}
488
489pub fn extract_adjectives(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
491 AnalyzerConfig::new(AnalysisMode::AdjectivesOnly)
492 .analyze(tokenizer, text)
493 .into_iter()
494 .map(|t| t.surface)
495 .collect()
496}
497
498pub fn extract_content_words(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
500 AnalyzerConfig::new(AnalysisMode::ContentWordsOnly)
501 .analyze(tokenizer, text)
502 .into_iter()
503 .map(|t| t.surface)
504 .collect()
505}
506
507pub fn extract_lemmas(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
509 AnalyzerConfig::new(AnalysisMode::Lemmatized)
510 .with_lemmatization(LemmatizationMode::All)
511 .analyze(tokenizer, text)
512 .into_iter()
513 .map(|t| t.surface)
514 .collect()
515}
516
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_analysis_mode_uses_filter() {
        assert!(!AnalysisMode::Full.uses_pos_filter());
        assert!(AnalysisMode::NounsOnly.uses_pos_filter());
        assert!(AnalysisMode::Custom.uses_pos_filter());
    }

    #[test]
    fn test_pos_filter_matches_nouns() {
        let filter = PosFilter::new().include_nouns();

        // NNB matches via the "NN" prefix even though it is never
        // listed exactly.
        assert!(filter.matches("NNG"));
        assert!(filter.matches("NNP"));
        assert!(filter.matches("NNB"));
        assert!(filter.matches("NR"));
        assert!(filter.matches("NP"));
        assert!(!filter.matches("VV"));
        assert!(!filter.matches("JKS"));
    }

    #[test]
    fn test_pos_filter_matches_verbs() {
        // include_verbs() registers the exact tag "VV", so "VA" must
        // not match.
        let filter = PosFilter::new().include_verbs();

        assert!(filter.matches("VV"));
        assert!(!filter.matches("VA"));
        assert!(!filter.matches("NNG"));
    }

    #[test]
    fn test_pos_filter_matches_predicates() {
        // include_predicates() registers the "V" prefix, so "VX" and
        // "VCP" match as well.
        let filter = PosFilter::new().include_predicates();

        assert!(filter.matches("VV"));
        assert!(filter.matches("VA"));
        assert!(filter.matches("VX"));
        assert!(filter.matches("VCP"));
        assert!(!filter.matches("NNG"));
    }

    #[test]
    fn test_pos_filter_content_words() {
        let filter = PosFilter::content_words();

        assert!(filter.matches("NNG"));
        assert!(filter.matches("VV"));
        assert!(filter.matches("VA"));
        assert!(filter.matches("MAG"));
        assert!(!filter.matches("JKS"));
        assert!(!filter.matches("EC"));
    }

    #[test]
    fn test_pos_filter_exclude() {
        // Exclusions take precedence over a matching include prefix.
        let filter = PosFilter::new().include_prefix("N").exclude_tag("NNB");

        assert!(filter.matches("NNG"));
        assert!(filter.matches("NNP"));
        assert!(!filter.matches("NNB"));
        assert!(!filter.matches("VV"));
    }

    #[test]
    fn test_pos_filter_empty_includes_all() {
        // A filter with no rules at all accepts every tag.
        let filter = PosFilter::new();

        assert!(filter.matches("NNG"));
        assert!(filter.matches("VV"));
        assert!(filter.matches("JKS"));
    }

    #[test]
    fn test_analyzer_config_default() {
        let config = AnalyzerConfig::default();

        assert_eq!(config.mode, AnalysisMode::Full);
        assert!(config.pos_filter.is_none());
        assert_eq!(config.lemmatization, LemmatizationMode::None);
    }

    #[test]
    fn test_analyzer_config_with_filter() {
        let filter = PosFilter::new().include_nouns();
        let config = AnalyzerConfig::with_filter(filter);

        assert_eq!(config.mode, AnalysisMode::Custom);
        assert!(config.pos_filter.is_some());
    }

    #[test]
    fn test_analyzer_config_process_tokens() {
        // One noun and one particle: NounsOnly must drop the particle.
        let tokens = vec![
            Token {
                surface: "한국어".to_string(),
                pos: "NNG".to_string(),
                start_pos: 0,
                end_pos: 3,
                start_byte: 0,
                end_byte: 9,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
            Token {
                surface: "가".to_string(),
                pos: "JKS".to_string(),
                start_pos: 3,
                end_pos: 4,
                start_byte: 9,
                end_byte: 12,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
        ];

        let config = AnalyzerConfig::new(AnalysisMode::NounsOnly);
        let result = config.process_tokens(tokens);

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].surface, "한국어");
    }

    #[test]
    fn test_analyzer_config_min_length() {
        // Both tokens are nouns; min_length(2) must drop the
        // single-character one.
        let tokens = vec![
            Token {
                surface: "가".to_string(),
                pos: "NNG".to_string(),
                start_pos: 0,
                end_pos: 1,
                start_byte: 0,
                end_byte: 3,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
            Token {
                surface: "한국어".to_string(),
                pos: "NNG".to_string(),
                start_pos: 1,
                end_pos: 4,
                start_byte: 3,
                end_byte: 12,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
        ];

        let config = AnalyzerConfig::new(AnalysisMode::NounsOnly).with_min_length(2);
        let result = config.process_tokens(tokens);

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].surface, "한국어");
    }

    #[test]
    fn test_lemmatization_mode() {
        let tokens = vec![Token {
            surface: "먹었".to_string(),
            pos: "VV".to_string(),
            start_pos: 0,
            end_pos: 2,
            start_byte: 0,
            end_byte: 6,
            reading: Some("먹".to_string()),
            lemma: Some("먹다".to_string()),
            cost: 0,
            features: String::new(),
            normalized: None,
        }];

        // Full mode keeps the inflected surface ...
        let config = AnalyzerConfig::new(AnalysisMode::Full);
        let result = config.process_tokens(tokens.clone());
        assert_eq!(result[0].surface, "먹었");

        // ... while predicate lemmatization swaps in the lemma for VV.
        let config = AnalyzerConfig::new(AnalysisMode::Lemmatized)
            .with_lemmatization(LemmatizationMode::PredicatesOnly);
        let result = config.process_tokens(tokens);
        assert_eq!(result[0].surface, "먹다");
    }
}