use crate::tokenizer::{Token, Tokenizer};
/// Selects which tokens an analysis keeps and how their surface forms are
/// post-processed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum AnalysisMode {
    /// Keep every token unchanged (the default).
    #[default]
    Full,
    /// Keep tokens whose POS tag starts with "NN", "NR", or "NP".
    NounsOnly,
    /// Keep only verbs (POS tag "VV").
    VerbsOnly,
    /// Keep only adjectives (POS tag "VA").
    AdjectivesOnly,
    /// Keep verbs and adjectives ("VV" or "VA").
    PredicatesOnly,
    /// Keep nouns ("NN"/"NR"/"NP"), verbs, adjectives, and adverbs ("MA").
    ContentWordsOnly,
    /// No POS filtering; surface-oriented output.
    SurfaceOnly,
    /// No POS filtering; meant to be paired with lemmatization.
    Lemmatized,
    /// No POS filtering; tag-oriented output.
    PosTagsOnly,
    /// Filtering is delegated to a user-supplied `PosFilter`.
    Custom,
}

impl AnalysisMode {
    /// Returns `true` for the modes that restrict tokens by POS tag
    /// (including `Custom`, which delegates to a `PosFilter`).
    #[must_use]
    pub const fn uses_pos_filter(&self) -> bool {
        // Exhaustive match: adding a variant forces this list to be revisited.
        match self {
            Self::NounsOnly
            | Self::VerbsOnly
            | Self::AdjectivesOnly
            | Self::PredicatesOnly
            | Self::ContentWordsOnly
            | Self::Custom => true,
            Self::Full | Self::SurfaceOnly | Self::Lemmatized | Self::PosTagsOnly => false,
        }
    }

    /// Returns `true` only for [`AnalysisMode::Lemmatized`].
    #[must_use]
    pub const fn uses_lemmatization(&self) -> bool {
        match self {
            Self::Lemmatized => true,
            Self::Full
            | Self::NounsOnly
            | Self::VerbsOnly
            | Self::AdjectivesOnly
            | Self::PredicatesOnly
            | Self::ContentWordsOnly
            | Self::SurfaceOnly
            | Self::PosTagsOnly
            | Self::Custom => false,
        }
    }
}
/// Builder-style include/exclude filter over POS tags.
///
/// Exclusion rules always win; when no include rule is configured, every
/// non-excluded tag matches.
#[derive(Debug, Clone, Default)]
pub struct PosFilter {
    include_prefixes: Vec<String>,
    exclude_prefixes: Vec<String>,
    include_exact: Vec<String>,
    exclude_exact: Vec<String>,
}

impl PosFilter {
    /// Creates an empty filter, which matches every tag.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Includes all noun classes ("NN", "NR", and "NP" prefixes).
    #[must_use]
    pub fn include_nouns(self) -> Self {
        self.include_prefix("NN")
            .include_prefix("NR")
            .include_prefix("NP")
    }

    /// Includes common nouns (exact tag "NNG").
    #[must_use]
    pub fn include_common_nouns(self) -> Self {
        self.include_tag("NNG")
    }

    /// Includes proper nouns (exact tag "NNP").
    #[must_use]
    pub fn include_proper_nouns(self) -> Self {
        self.include_tag("NNP")
    }

    /// Includes verbs (exact tag "VV").
    #[must_use]
    pub fn include_verbs(self) -> Self {
        self.include_tag("VV")
    }

    /// Includes adjectives (exact tag "VA").
    #[must_use]
    pub fn include_adjectives(self) -> Self {
        self.include_tag("VA")
    }

    /// Includes every predicate class ("V" prefix: VV, VA, VX, VC…).
    #[must_use]
    pub fn include_predicates(self) -> Self {
        self.include_prefix("V")
    }

    /// Includes adverbs ("MA" prefix).
    #[must_use]
    pub fn include_adverbs(self) -> Self {
        self.include_prefix("MA")
    }

    /// Excludes particles ("J" prefix).
    #[must_use]
    pub fn exclude_particles(self) -> Self {
        self.exclude_prefix("J")
    }

    /// Excludes endings ("E" prefix).
    #[must_use]
    pub fn exclude_endings(self) -> Self {
        self.exclude_prefix("E")
    }

    /// Excludes affixes ("X" prefix).
    #[must_use]
    pub fn exclude_affixes(self) -> Self {
        self.exclude_prefix("X")
    }

    /// Excludes symbols ("S" prefix).
    #[must_use]
    pub fn exclude_symbols(self) -> Self {
        self.exclude_prefix("S")
    }

    /// Adds an arbitrary include-by-prefix rule.
    #[must_use]
    pub fn include_prefix(mut self, prefix: &str) -> Self {
        self.include_prefixes.push(prefix.to_owned());
        self
    }

    /// Adds an arbitrary exclude-by-prefix rule.
    #[must_use]
    pub fn exclude_prefix(mut self, prefix: &str) -> Self {
        self.exclude_prefixes.push(prefix.to_owned());
        self
    }

    /// Adds an arbitrary include-exact-tag rule.
    #[must_use]
    pub fn include_tag(mut self, tag: &str) -> Self {
        self.include_exact.push(tag.to_owned());
        self
    }

    /// Adds an arbitrary exclude-exact-tag rule.
    #[must_use]
    pub fn exclude_tag(mut self, tag: &str) -> Self {
        self.exclude_exact.push(tag.to_owned());
        self
    }

    /// Preset matching content words: nouns, verbs, adjectives, adverbs.
    #[must_use]
    pub fn content_words() -> Self {
        Self::new()
            .include_nouns()
            .include_verbs()
            .include_adjectives()
            .include_adverbs()
    }

    /// Tests `pos` against the configured rules.
    ///
    /// Order of evaluation: exclusions first (exact, then prefix); then, if
    /// no include rule exists at all, the tag matches; otherwise at least
    /// one include rule (exact or prefix) must hit.
    #[must_use]
    pub fn matches(&self, pos: &str) -> bool {
        let excluded = self.exclude_exact.iter().any(|tag| tag == pos)
            || self
                .exclude_prefixes
                .iter()
                .any(|prefix| pos.starts_with(prefix.as_str()));
        if excluded {
            return false;
        }
        // No include rules configured: everything not excluded passes.
        if self.include_exact.is_empty() && self.include_prefixes.is_empty() {
            return true;
        }
        self.include_exact.iter().any(|tag| tag == pos)
            || self
                .include_prefixes
                .iter()
                .any(|prefix| pos.starts_with(prefix.as_str()))
    }
}
/// Controls which tokens have their surface form replaced by their lemma
/// during transformation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LemmatizationMode {
    /// Keep every surface form as-is (the default).
    #[default]
    None,
    /// Lemmatize only verbs ("VV") and adjectives ("VA").
    PredicatesOnly,
    /// Lemmatize every token that carries a lemma; tokens without one keep
    /// their surface form.
    All,
}
/// Configuration for token post-processing: mode-based POS filtering, an
/// optional custom filter, lemmatization, and character-length bounds.
#[derive(Debug, Clone)]
pub struct AnalyzerConfig {
    /// Which built-in filtering/processing mode to apply.
    pub mode: AnalysisMode,
    /// Custom POS filter; only consulted when `mode` is `AnalysisMode::Custom`.
    pub pos_filter: Option<PosFilter>,
    /// Whether and how surface forms are replaced by lemmas.
    pub lemmatization: LemmatizationMode,
    /// Minimum token length in characters; 0 disables the lower bound.
    pub min_length: usize,
    /// Maximum token length in characters; 0 disables the upper bound.
    pub max_length: usize,
}
impl Default for AnalyzerConfig {
    /// Full analysis: no POS filter, no lemmatization, and no length
    /// restrictions (a bound of 0 disables it).
    fn default() -> Self {
        Self {
            mode: AnalysisMode::default(),
            pos_filter: None,
            lemmatization: LemmatizationMode::default(),
            min_length: 0,
            max_length: 0,
        }
    }
}
impl AnalyzerConfig {
    /// Creates a configuration for `mode`, leaving every other setting at
    /// its default (no filter, no lemmatization, no length bounds).
    #[must_use]
    pub fn new(mode: AnalysisMode) -> Self {
        Self {
            mode,
            ..Default::default()
        }
    }

    /// Creates a `Custom`-mode configuration driven by `filter`.
    #[must_use]
    pub fn with_filter(filter: PosFilter) -> Self {
        Self {
            mode: AnalysisMode::Custom,
            pos_filter: Some(filter),
            ..Default::default()
        }
    }

    /// Sets the lemmatization behavior (builder style).
    #[must_use]
    pub const fn with_lemmatization(mut self, mode: LemmatizationMode) -> Self {
        self.lemmatization = mode;
        self
    }

    /// Sets the minimum token length in characters; 0 disables the bound.
    #[must_use]
    pub const fn with_min_length(mut self, len: usize) -> Self {
        self.min_length = len;
        self
    }

    /// Sets the maximum token length in characters; 0 disables the bound.
    #[must_use]
    pub const fn with_max_length(mut self, len: usize) -> Self {
        self.max_length = len;
        self
    }

    /// Tokenizes `text` and post-processes the result in one step.
    pub fn analyze(&self, tokenizer: &mut Tokenizer, text: &str) -> Vec<AnalyzedToken> {
        self.process_tokens(tokenizer.tokenize(text))
    }

    /// Filters and transforms already-tokenized input according to this
    /// configuration, preserving token order.
    #[must_use]
    pub fn process_tokens(&self, tokens: Vec<Token>) -> Vec<AnalyzedToken> {
        let mut analyzed = Vec::new();
        for token in tokens {
            if self.filter_token(&token) {
                analyzed.push(self.transform_token(token));
            }
        }
        analyzed
    }

    /// Length and POS gate: returns `true` when `token` should be kept.
    fn filter_token(&self, token: &Token) -> bool {
        let len = token.char_len();
        // A bound of 0 means "unbounded" on that side.
        if (self.min_length > 0 && len < self.min_length)
            || (self.max_length > 0 && len > self.max_length)
        {
            return false;
        }
        let pos = token.pos.as_str();
        match self.mode {
            // These modes never drop tokens by POS.
            AnalysisMode::Full
            | AnalysisMode::SurfaceOnly
            | AnalysisMode::Lemmatized
            | AnalysisMode::PosTagsOnly => true,
            AnalysisMode::NounsOnly => {
                pos.starts_with("NN") || pos.starts_with("NR") || pos.starts_with("NP")
            }
            AnalysisMode::VerbsOnly => pos == "VV",
            AnalysisMode::AdjectivesOnly => pos == "VA",
            AnalysisMode::PredicatesOnly => matches!(pos, "VV" | "VA"),
            AnalysisMode::ContentWordsOnly => {
                pos.starts_with("NN")
                    || pos.starts_with("NR")
                    || pos.starts_with("NP")
                    || matches!(pos, "VV" | "VA")
                    || pos.starts_with("MA")
            }
            // Custom mode without a configured filter keeps everything.
            AnalysisMode::Custom => match &self.pos_filter {
                Some(filter) => filter.matches(pos),
                None => true,
            },
        }
    }

    /// Builds the output token, substituting the lemma for the surface form
    /// where the lemmatization mode asks for it (falling back to the surface
    /// form when no lemma is available).
    fn transform_token(&self, token: Token) -> AnalyzedToken {
        let lemmatize = match self.lemmatization {
            LemmatizationMode::None => false,
            LemmatizationMode::PredicatesOnly => token.pos == "VV" || token.pos == "VA",
            LemmatizationMode::All => true,
        };
        let surface = if lemmatize {
            token.lemma.clone().unwrap_or_else(|| token.surface.clone())
        } else {
            token.surface.clone()
        };
        AnalyzedToken {
            surface,
            original_surface: token.surface,
            pos: token.pos,
            start_pos: token.start_pos,
            end_pos: token.end_pos,
            lemma: token.lemma,
            // NOTE(review): this mirrors the configured mode, not whether this
            // particular token's surface actually changed — confirm intended.
            is_lemmatized: self.lemmatization != LemmatizationMode::None,
        }
    }
}
/// A token after filtering and transformation by an `AnalyzerConfig`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AnalyzedToken {
    /// Output surface form; equals the lemma when lemmatization applied.
    pub surface: String,
    /// The surface form exactly as produced by the tokenizer.
    pub original_surface: String,
    /// Part-of-speech tag carried over from the source token.
    pub pos: String,
    /// Start offset in characters (carried over from the source token).
    pub start_pos: usize,
    /// End offset in characters (carried over from the source token).
    pub end_pos: usize,
    /// Dictionary form, when the tokenizer provided one.
    pub lemma: Option<String>,
    /// True when the configuration's lemmatization mode was not `None`.
    pub is_lemmatized: bool,
}

impl AnalyzedToken {
    /// Token length in characters, derived from the character offsets.
    #[must_use]
    pub const fn char_len(&self) -> usize {
        self.end_pos - self.start_pos
    }
}
/// Convenience wrapper: tokenizes `text` and returns the surface forms of
/// all noun tokens ("NN"/"NR"/"NP" tag prefixes).
pub fn extract_nouns(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
    let config = AnalyzerConfig::new(AnalysisMode::NounsOnly);
    let analyzed = config.analyze(tokenizer, text);
    analyzed.into_iter().map(|token| token.surface).collect()
}
/// Convenience wrapper: tokenizes `text` and returns the surface forms of
/// all verb tokens (tag "VV").
pub fn extract_verbs(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
    let config = AnalyzerConfig::new(AnalysisMode::VerbsOnly);
    let analyzed = config.analyze(tokenizer, text);
    analyzed.into_iter().map(|token| token.surface).collect()
}
/// Convenience wrapper: tokenizes `text` and returns the surface forms of
/// all adjective tokens (tag "VA").
pub fn extract_adjectives(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
    let config = AnalyzerConfig::new(AnalysisMode::AdjectivesOnly);
    let analyzed = config.analyze(tokenizer, text);
    analyzed.into_iter().map(|token| token.surface).collect()
}
/// Convenience wrapper: tokenizes `text` and returns the surface forms of
/// all content words (nouns, verbs, adjectives, adverbs).
pub fn extract_content_words(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
    let config = AnalyzerConfig::new(AnalysisMode::ContentWordsOnly);
    let analyzed = config.analyze(tokenizer, text);
    analyzed.into_iter().map(|token| token.surface).collect()
}
/// Convenience wrapper: tokenizes `text` and returns every token's lemma
/// (falling back to the surface form for tokens without one).
pub fn extract_lemmas(tokenizer: &mut Tokenizer, text: &str) -> Vec<String> {
    let config =
        AnalyzerConfig::new(AnalysisMode::Lemmatized).with_lemmatization(LemmatizationMode::All);
    let analyzed = config.analyze(tokenizer, text);
    analyzed.into_iter().map(|token| token.surface).collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    // Modes that restrict by POS report it; pass-through modes do not.
    #[test]
    fn test_analysis_mode_uses_filter() {
        assert!(!AnalysisMode::Full.uses_pos_filter());
        assert!(AnalysisMode::NounsOnly.uses_pos_filter());
        assert!(AnalysisMode::Custom.uses_pos_filter());
    }

    // include_nouns() covers all noun prefixes (NN*, NR, NP) and nothing else.
    #[test]
    fn test_pos_filter_matches_nouns() {
        let filter = PosFilter::new().include_nouns();
        assert!(filter.matches("NNG"));
        assert!(filter.matches("NNP"));
        assert!(filter.matches("NNB"));
        assert!(filter.matches("NR"));
        assert!(filter.matches("NP"));
        assert!(!filter.matches("VV"));
        assert!(!filter.matches("JKS"));
    }

    // include_verbs() is exact-tag: "VV" only, not adjectives or nouns.
    #[test]
    fn test_pos_filter_matches_verbs() {
        let filter = PosFilter::new().include_verbs();
        assert!(filter.matches("VV"));
        assert!(!filter.matches("VA"));
        assert!(!filter.matches("NNG"));
    }

    // include_predicates() is prefix-based: matches the whole V* family.
    #[test]
    fn test_pos_filter_matches_predicates() {
        let filter = PosFilter::new().include_predicates();
        assert!(filter.matches("VV"));
        assert!(filter.matches("VA"));
        assert!(filter.matches("VX"));
        assert!(filter.matches("VCP"));
        assert!(!filter.matches("NNG"));
    }

    // The content_words() preset keeps nouns/verbs/adjectives/adverbs and
    // rejects particles and endings.
    #[test]
    fn test_pos_filter_content_words() {
        let filter = PosFilter::content_words();
        assert!(filter.matches("NNG"));
        assert!(filter.matches("VV"));
        assert!(filter.matches("VA"));
        assert!(filter.matches("MAG"));
        assert!(!filter.matches("JKS"));
        assert!(!filter.matches("EC"));
    }

    // Exclusion rules win over a matching include rule.
    #[test]
    fn test_pos_filter_exclude() {
        let filter = PosFilter::new().include_prefix("N").exclude_tag("NNB");
        assert!(filter.matches("NNG"));
        assert!(filter.matches("NNP"));
        assert!(!filter.matches("NNB")); assert!(!filter.matches("VV"));
    }

    // A filter with no rules at all matches every tag.
    #[test]
    fn test_pos_filter_empty_includes_all() {
        let filter = PosFilter::new();
        assert!(filter.matches("NNG"));
        assert!(filter.matches("VV"));
        assert!(filter.matches("JKS"));
    }

    // Default config: Full mode, no filter, no lemmatization.
    #[test]
    fn test_analyzer_config_default() {
        let config = AnalyzerConfig::default();
        assert_eq!(config.mode, AnalysisMode::Full);
        assert!(config.pos_filter.is_none());
        assert_eq!(config.lemmatization, LemmatizationMode::None);
    }

    // with_filter() switches the mode to Custom and stores the filter.
    #[test]
    fn test_analyzer_config_with_filter() {
        let filter = PosFilter::new().include_nouns();
        let config = AnalyzerConfig::with_filter(filter);
        assert_eq!(config.mode, AnalysisMode::Custom);
        assert!(config.pos_filter.is_some());
    }

    // NounsOnly keeps the noun ("한국어"/NNG) and drops the particle ("가"/JKS).
    #[test]
    fn test_analyzer_config_process_tokens() {
        let tokens = vec![
            Token {
                surface: "한국어".to_string(),
                pos: "NNG".to_string(),
                start_pos: 0,
                end_pos: 3,
                start_byte: 0,
                end_byte: 9,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
            Token {
                surface: "가".to_string(),
                pos: "JKS".to_string(),
                start_pos: 3,
                end_pos: 4,
                start_byte: 9,
                end_byte: 12,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
        ];
        let config = AnalyzerConfig::new(AnalysisMode::NounsOnly);
        let result = config.process_tokens(tokens);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].surface, "한국어");
    }

    // min_length is measured in characters: the 1-char noun is dropped.
    #[test]
    fn test_analyzer_config_min_length() {
        let tokens = vec![
            Token {
                surface: "가".to_string(),
                pos: "NNG".to_string(),
                start_pos: 0,
                end_pos: 1,
                start_byte: 0,
                end_byte: 3,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
            Token {
                surface: "한국어".to_string(),
                pos: "NNG".to_string(),
                start_pos: 1,
                end_pos: 4,
                start_byte: 3,
                end_byte: 12,
                reading: None,
                lemma: None,
                cost: 0,
                features: String::new(),
                normalized: None,
            },
        ];
        let config = AnalyzerConfig::new(AnalysisMode::NounsOnly).with_min_length(2);
        let result = config.process_tokens(tokens);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].surface, "한국어");
    }

    // Without lemmatization the surface stays "먹었"; with PredicatesOnly the
    // VV token's surface is replaced by its lemma "먹다".
    #[test]
    fn test_lemmatization_mode() {
        let tokens = vec![Token {
            surface: "먹었".to_string(),
            pos: "VV".to_string(),
            start_pos: 0,
            end_pos: 2,
            start_byte: 0,
            end_byte: 6,
            reading: Some("먹".to_string()),
            lemma: Some("먹다".to_string()),
            cost: 0,
            features: String::new(),
            normalized: None,
        }];
        let config = AnalyzerConfig::new(AnalysisMode::Full);
        let result = config.process_tokens(tokens.clone());
        assert_eq!(result[0].surface, "먹었");
        let config = AnalyzerConfig::new(AnalysisMode::Lemmatized)
            .with_lemmatization(LemmatizationMode::PredicatesOnly);
        let result = config.process_tokens(tokens);
        assert_eq!(result[0].surface, "먹다");
    }
}