1use crate::tokenizer::{Token, Tokenizer};
36
/// Selects which tokens an analyzer keeps and how they are reported.
///
/// The POS tags referenced below (`NN*`, `VV`, `VA`, `MA*`, …) follow a
/// Korean Sejong-style tagset — presumably from the project's tokenizer
/// dictionary; confirm against `crate::tokenizer`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum AnalysisMode {
    /// Keep every token unchanged (default).
    #[default]
    Full,

    /// Keep only nouns (tags starting with `NN`, `NR`, or `NP`).
    NounsOnly,

    /// Keep only verbs (exact tag `VV`).
    VerbsOnly,

    /// Keep only adjectives (exact tag `VA`).
    AdjectivesOnly,

    /// Keep verbs and adjectives (`VV` or `VA`).
    PredicatesOnly,

    /// Keep nouns, verbs, adjectives, and adverbs (`MA*`).
    ContentWordsOnly,

    /// Keep every token. NOTE(review): no surface-only projection is
    /// visible in this file — it filters like `Full`; confirm where the
    /// projection happens.
    SurfaceOnly,

    /// Keep every token; intended to be paired with a `LemmatizationMode`
    /// so surfaces are replaced by lemmas (see `extract_lemmas`).
    Lemmatized,

    /// Keep every token. NOTE(review): no tag-only projection is visible
    /// in this file — it filters like `Full`; confirm.
    PosTagsOnly,

    /// Keep tokens accepted by a user-supplied `PosFilter`.
    Custom,
}
73
74impl AnalysisMode {
75 #[must_use]
77 pub const fn uses_pos_filter(&self) -> bool {
78 matches!(
79 self,
80 Self::NounsOnly
81 | Self::VerbsOnly
82 | Self::AdjectivesOnly
83 | Self::PredicatesOnly
84 | Self::ContentWordsOnly
85 | Self::Custom
86 )
87 }
88
89 #[must_use]
91 pub const fn uses_lemmatization(&self) -> bool {
92 matches!(self, Self::Lemmatized)
93 }
94}
95
/// Prefix/exact include–exclude filter over POS tag strings.
///
/// Matching rules (see [`PosFilter::matches`]): exclusions always win, and
/// an empty include set accepts every tag that was not excluded.
#[derive(Debug, Clone, Default)]
pub struct PosFilter {
    // Tags accepted when they start with one of these prefixes.
    include_prefixes: Vec<String>,
    // Tags rejected when they start with one of these prefixes.
    exclude_prefixes: Vec<String>,
    // Tags accepted on exact equality.
    include_exact: Vec<String>,
    // Tags rejected on exact equality.
    exclude_exact: Vec<String>,
}

impl PosFilter {
    /// Creates an empty filter, which matches every tag.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Accepts every noun tag (prefixes `NN`, `NR`, `NP`).
    #[must_use]
    pub fn include_nouns(self) -> Self {
        self.include_prefix("NN")
            .include_prefix("NR")
            .include_prefix("NP")
    }

    /// Accepts common nouns (exact tag `NNG`).
    #[must_use]
    pub fn include_common_nouns(self) -> Self {
        self.include_tag("NNG")
    }

    /// Accepts proper nouns (exact tag `NNP`).
    #[must_use]
    pub fn include_proper_nouns(self) -> Self {
        self.include_tag("NNP")
    }

    /// Accepts verbs (exact tag `VV`).
    #[must_use]
    pub fn include_verbs(self) -> Self {
        self.include_tag("VV")
    }

    /// Accepts adjectives (exact tag `VA`).
    #[must_use]
    pub fn include_adjectives(self) -> Self {
        self.include_tag("VA")
    }

    /// Accepts all predicates (prefix `V`, so `VV`, `VA`, `VX`, `VC*`, …).
    #[must_use]
    pub fn include_predicates(self) -> Self {
        self.include_prefix("V")
    }

    /// Accepts adverbs (prefix `MA`).
    #[must_use]
    pub fn include_adverbs(self) -> Self {
        self.include_prefix("MA")
    }

    /// Rejects particles (prefix `J`).
    #[must_use]
    pub fn exclude_particles(self) -> Self {
        self.exclude_prefix("J")
    }

    /// Rejects endings (prefix `E`).
    #[must_use]
    pub fn exclude_endings(self) -> Self {
        self.exclude_prefix("E")
    }

    /// Rejects affixes (prefix `X`).
    #[must_use]
    pub fn exclude_affixes(self) -> Self {
        self.exclude_prefix("X")
    }

    /// Rejects symbols (prefix `S`).
    #[must_use]
    pub fn exclude_symbols(self) -> Self {
        self.exclude_prefix("S")
    }

    /// Accepts any tag starting with `prefix`.
    #[must_use]
    pub fn include_prefix(mut self, prefix: &str) -> Self {
        self.include_prefixes.push(prefix.to_owned());
        self
    }

    /// Rejects any tag starting with `prefix`.
    #[must_use]
    pub fn exclude_prefix(mut self, prefix: &str) -> Self {
        self.exclude_prefixes.push(prefix.to_owned());
        self
    }

    /// Accepts exactly `tag`.
    #[must_use]
    pub fn include_tag(mut self, tag: &str) -> Self {
        self.include_exact.push(tag.to_owned());
        self
    }

    /// Rejects exactly `tag`.
    #[must_use]
    pub fn exclude_tag(mut self, tag: &str) -> Self {
        self.exclude_exact.push(tag.to_owned());
        self
    }

    /// Preset matching content words: nouns, verbs, adjectives, adverbs.
    #[must_use]
    pub fn content_words() -> Self {
        Self::new()
            .include_nouns()
            .include_verbs()
            .include_adjectives()
            .include_adverbs()
    }

    /// Tests `pos` against the filter.
    ///
    /// Evaluation order: exact exclusions, prefix exclusions, then the
    /// include rules. When no include rules are configured at all, every
    /// tag that survives the exclusions matches.
    #[must_use]
    pub fn matches(&self, pos: &str) -> bool {
        if self.exclude_exact.iter().any(|tag| pos == tag.as_str()) {
            return false;
        }
        if self.exclude_prefixes.iter().any(|p| pos.starts_with(p.as_str())) {
            return false;
        }

        // No include rules configured: accept everything not excluded.
        if self.include_exact.is_empty() && self.include_prefixes.is_empty() {
            return true;
        }

        self.include_exact.iter().any(|tag| pos == tag.as_str())
            || self.include_prefixes.iter().any(|p| pos.starts_with(p.as_str()))
    }
}
270
/// Controls whether (and for which tokens) the analyzer replaces the
/// surface form with the token's lemma.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LemmatizationMode {
    /// Keep surfaces unchanged (default).
    #[default]
    None,

    /// Lemmatize only predicates (POS `VV` or `VA`).
    PredicatesOnly,

    /// Lemmatize every token that carries a lemma.
    All,
}
284
/// Configuration for token analysis: selection mode, optional custom POS
/// filter, lemmatization behavior, and character-length bounds.
#[derive(Debug, Clone)]
pub struct AnalyzerConfig {
    /// Which tokens to keep (see [`AnalysisMode`]).
    pub mode: AnalysisMode,
    /// Custom filter, consulted only when `mode` is [`AnalysisMode::Custom`].
    pub pos_filter: Option<PosFilter>,
    /// Whether/when surfaces are replaced by lemmas.
    pub lemmatization: LemmatizationMode,
    /// Minimum token length in characters; `0` disables the lower bound.
    pub min_length: usize,
    /// Maximum token length in characters; `0` disables the upper bound.
    pub max_length: usize,
}
301
302impl Default for AnalyzerConfig {
303 fn default() -> Self {
304 Self {
305 mode: AnalysisMode::Full,
306 pos_filter: None,
307 lemmatization: LemmatizationMode::None,
308 min_length: 0,
309 max_length: 0,
310 }
311 }
312}
313
314impl AnalyzerConfig {
315 #[must_use]
317 pub fn new(mode: AnalysisMode) -> Self {
318 Self {
319 mode,
320 ..Self::default()
321 }
322 }
323
324 #[must_use]
326 pub fn with_filter(filter: PosFilter) -> Self {
327 Self {
328 mode: AnalysisMode::Custom,
329 pos_filter: Some(filter),
330 ..Self::default()
331 }
332 }
333
334 #[must_use]
336 pub const fn with_lemmatization(mut self, mode: LemmatizationMode) -> Self {
337 self.lemmatization = mode;
338 self
339 }
340
341 #[must_use]
343 pub const fn with_min_length(mut self, len: usize) -> Self {
344 self.min_length = len;
345 self
346 }
347
348 #[must_use]
350 pub const fn with_max_length(mut self, len: usize) -> Self {
351 self.max_length = len;
352 self
353 }
354
355 pub fn analyze(&self, tokenizer: &mut Tokenizer, text: &str) -> Vec<AnalyzedToken> {
360 let tokens = tokenizer.tokenize(text);
361 self.process_tokens(tokens)
362 }
363
364 #[must_use]
368 pub fn process_tokens(&self, tokens: Vec<Token>) -> Vec<AnalyzedToken> {
369 tokens
370 .into_iter()
371 .filter(|t| self.filter_token(t))
372 .map(|t| self.transform_token(t))
373 .collect()
374 }
375
376 fn filter_token(&self, token: &Token) -> bool {
378 let char_len = token.char_len();
380 if self.min_length > 0 && char_len < self.min_length {
381 return false;
382 }
383 if self.max_length > 0 && char_len > self.max_length {
384 return false;
385 }
386
387 match self.mode {
389 AnalysisMode::Full | AnalysisMode::SurfaceOnly | AnalysisMode::Lemmatized | AnalysisMode::PosTagsOnly => true,
390 AnalysisMode::NounsOnly => {
391 token.pos.starts_with("NN")
392 || token.pos.starts_with("NR")
393 || token.pos.starts_with("NP")
394 }
395 AnalysisMode::VerbsOnly => token.pos == "VV",
396 AnalysisMode::AdjectivesOnly => token.pos == "VA",
397 AnalysisMode::PredicatesOnly => {
398 token.pos == "VV" || token.pos == "VA"
399 }
400 AnalysisMode::ContentWordsOnly => {
401 token.pos.starts_with("NN")
402 || token.pos.starts_with("NR")
403 || token.pos.starts_with("NP")
404 || token.pos == "VV"
405 || token.pos == "VA"
406 || token.pos.starts_with("MA")
407 }
408 AnalysisMode::Custom => {
409 self.pos_filter
410 .as_ref()
411 .map_or(true, |f| f.matches(&token.pos))
412 }
413 }
414 }
415
416 fn transform_token(&self, token: Token) -> AnalyzedToken {
418 let surface = match self.lemmatization {
419 LemmatizationMode::None => token.surface.clone(),
420 LemmatizationMode::PredicatesOnly => {
421 if token.pos == "VV" || token.pos == "VA" {
422 token.lemma.clone().unwrap_or_else(|| token.surface.clone())
423 } else {
424 token.surface.clone()
425 }
426 }
427 LemmatizationMode::All => {
428 token.lemma.clone().unwrap_or_else(|| token.surface.clone())
429 }
430 };
431
432 AnalyzedToken {
433 surface,
434 original_surface: token.surface,
435 pos: token.pos,
436 start_pos: token.start_pos,
437 end_pos: token.end_pos,
438 lemma: token.lemma,
439 is_lemmatized: self.lemmatization != LemmatizationMode::None,
440 }
441 }
442}
443
/// A token produced by [`AnalyzerConfig`] processing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AnalyzedToken {
    /// Surface form, possibly replaced by the lemma (see `is_lemmatized`).
    pub surface: String,
    /// The surface exactly as it appeared in the input text.
    pub original_surface: String,
    /// POS tag of the token.
    pub pos: String,
    /// Start offset in characters (not bytes — see `Token`'s separate
    /// `start_byte` field in the tests).
    pub start_pos: usize,
    /// Exclusive end offset in characters.
    pub end_pos: usize,
    /// Dictionary lemma, if the tokenizer provided one.
    pub lemma: Option<String>,
    /// Marks tokens processed with lemmatization in effect; see
    /// [`LemmatizationMode`].
    pub is_lemmatized: bool,
}
464
impl AnalyzedToken {
    /// Length of the token in characters (difference of the char offsets).
    // NOTE(review): assumes end_pos >= start_pos; underflows (debug-panic)
    // otherwise — the tokenizer presumably guarantees this. Confirm.
    #[must_use]
    pub const fn char_len(&self) -> usize {
        self.end_pos - self.start_pos
    }
}
472
473pub fn extract_nouns(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
475 AnalyzerConfig::new(AnalysisMode::NounsOnly)
476 .analyze(tokenizer, text)
477 .into_iter()
478 .map(|t| t.surface)
479 .collect()
480}
481
482pub fn extract_verbs(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
484 AnalyzerConfig::new(AnalysisMode::VerbsOnly)
485 .analyze(tokenizer, text)
486 .into_iter()
487 .map(|t| t.surface)
488 .collect()
489}
490
491pub fn extract_adjectives(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
493 AnalyzerConfig::new(AnalysisMode::AdjectivesOnly)
494 .analyze(tokenizer, text)
495 .into_iter()
496 .map(|t| t.surface)
497 .collect()
498}
499
500pub fn extract_content_words(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
502 AnalyzerConfig::new(AnalysisMode::ContentWordsOnly)
503 .analyze(tokenizer, text)
504 .into_iter()
505 .map(|t| t.surface)
506 .collect()
507}
508
509pub fn extract_lemmas(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
511 AnalyzerConfig::new(AnalysisMode::Lemmatized)
512 .with_lemmatization(LemmatizationMode::All)
513 .analyze(tokenizer, text)
514 .into_iter()
515 .map(|t| t.surface)
516 .collect()
517}
518
#[cfg(test)]
mod tests {
    use super::*;

    // POS-restricting modes report `uses_pos_filter`; pass-through modes don't.
    #[test]
    fn test_analysis_mode_uses_filter() {
        assert!(!AnalysisMode::Full.uses_pos_filter());
        assert!(AnalysisMode::NounsOnly.uses_pos_filter());
        assert!(AnalysisMode::Custom.uses_pos_filter());
    }

    // `include_nouns` matches by prefix, so NNG/NNP/NNB all pass via "NN".
    #[test]
    fn test_pos_filter_matches_nouns() {
        let filter = PosFilter::new().include_nouns();

        assert!(filter.matches("NNG"));
        assert!(filter.matches("NNP"));
        assert!(filter.matches("NNB"));
        assert!(filter.matches("NR"));
        assert!(filter.matches("NP"));
        assert!(!filter.matches("VV"));
        assert!(!filter.matches("JKS"));
    }

    // `include_verbs` is an exact-tag rule: VA must not match.
    #[test]
    fn test_pos_filter_matches_verbs() {
        let filter = PosFilter::new().include_verbs();

        assert!(filter.matches("VV"));
        assert!(!filter.matches("VA"));
        assert!(!filter.matches("NNG"));
    }

    // `include_predicates` uses the "V" prefix, so VX and VCP also match.
    #[test]
    fn test_pos_filter_matches_predicates() {
        let filter = PosFilter::new().include_predicates();

        assert!(filter.matches("VV"));
        assert!(filter.matches("VA"));
        assert!(filter.matches("VX"));
        assert!(filter.matches("VCP"));
        assert!(!filter.matches("NNG"));
    }

    // The content-words preset accepts nouns/verbs/adjectives/adverbs only.
    #[test]
    fn test_pos_filter_content_words() {
        let filter = PosFilter::content_words();

        assert!(filter.matches("NNG"));
        assert!(filter.matches("VV"));
        assert!(filter.matches("VA"));
        assert!(filter.matches("MAG"));
        assert!(!filter.matches("JKS"));
        assert!(!filter.matches("EC"));
    }

    // An exact exclusion wins over a matching include prefix.
    #[test]
    fn test_pos_filter_exclude() {
        let filter = PosFilter::new()
            .include_prefix("N")
            .exclude_tag("NNB");

        assert!(filter.matches("NNG"));
        assert!(filter.matches("NNP"));
        assert!(!filter.matches("NNB")); assert!(!filter.matches("VV"));
    }

    // A filter with no rules at all matches every tag.
    #[test]
    fn test_pos_filter_empty_includes_all() {
        let filter = PosFilter::new();

        assert!(filter.matches("NNG"));
        assert!(filter.matches("VV"));
        assert!(filter.matches("JKS"));
    }

    // Default config: Full mode, no filter, no lemmatization.
    #[test]
    fn test_analyzer_config_default() {
        let config = AnalyzerConfig::default();

        assert_eq!(config.mode, AnalysisMode::Full);
        assert!(config.pos_filter.is_none());
        assert_eq!(config.lemmatization, LemmatizationMode::None);
    }

    // `with_filter` forces the mode to Custom.
    #[test]
    fn test_analyzer_config_with_filter() {
        let filter = PosFilter::new().include_nouns();
        let config = AnalyzerConfig::with_filter(filter);

        assert_eq!(config.mode, AnalysisMode::Custom);
        assert!(config.pos_filter.is_some());
    }

    // NounsOnly keeps the NNG token and drops the JKS particle.
    #[test]
    fn test_analyzer_config_process_tokens() {
        let tokens = vec![
            Token {
                surface: "한국어".to_string(),
                pos: "NNG".to_string(),
                start_pos: 0,
                end_pos: 3,
                start_byte: 0,
                end_byte: 9,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
            Token {
                surface: "가".to_string(),
                pos: "JKS".to_string(),
                start_pos: 3,
                end_pos: 4,
                start_byte: 9,
                end_byte: 12,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
        ];

        let config = AnalyzerConfig::new(AnalysisMode::NounsOnly);
        let result = config.process_tokens(tokens);

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].surface, "한국어");
    }

    // min_length is measured in characters: the 1-char noun is dropped.
    #[test]
    fn test_analyzer_config_min_length() {
        let tokens = vec![
            Token {
                surface: "가".to_string(),
                pos: "NNG".to_string(),
                start_pos: 0,
                end_pos: 1,
                start_byte: 0,
                end_byte: 3,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
            Token {
                surface: "한국어".to_string(),
                pos: "NNG".to_string(),
                start_pos: 1,
                end_pos: 4,
                start_byte: 3,
                end_byte: 12,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
        ];

        let config = AnalyzerConfig::new(AnalysisMode::NounsOnly).with_min_length(2);
        let result = config.process_tokens(tokens);

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].surface, "한국어");
    }

    // Without lemmatization the surface stays; with PredicatesOnly the VV
    // token's surface is replaced by its lemma ("먹다").
    #[test]
    fn test_lemmatization_mode() {
        let tokens = vec![
            Token {
                surface: "먹었".to_string(),
                pos: "VV".to_string(),
                start_pos: 0,
                end_pos: 2,
                start_byte: 0,
                end_byte: 6,
                reading: Some("먹".to_string()),
                lemma: Some("먹다".to_string()),
                cost: 0,
                features: String::new(),
                normalized: None,
            },
        ];

        let config = AnalyzerConfig::new(AnalysisMode::Full);
        let result = config.process_tokens(tokens.clone());
        assert_eq!(result[0].surface, "먹었");

        let config = AnalyzerConfig::new(AnalysisMode::Lemmatized)
            .with_lemmatization(LemmatizationMode::PredicatesOnly);
        let result = config.process_tokens(tokens);
        assert_eq!(result[0].surface, "먹다");
    }
}