//! Tokenizer: builds a lattice of dictionary and unknown-word candidates over
//! the input text and decodes the lowest-cost path with Viterbi search.
use std::borrow::Cow;
use std::path::Path;
use mecab_ko_dict::{SystemDictionary, UserDictionary};
use crate::error::Result;
use crate::lattice::{Lattice, Node, NodeBuilder, NodeType};
use crate::normalizer::{NormalizationConfig, Normalizer};
use crate::pool::{PoolManager, PoolStats};
use crate::pos_tag::PosTag;
use crate::unknown::UnknownHandler;
use crate::viterbi::{SpacePenalty, ViterbiSearcher};
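/// A single morpheme produced by [`Tokenizer`]: surface form, POS tag,
/// character and byte offsets into the input, optional reading/lemma, the
/// Viterbi path cost, and the raw dictionary feature string.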
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
pub surface: String,
pub pos: String,
pub start_pos: usize,
pub end_pos: usize,
pub start_byte: usize,
pub end_byte: usize,
pub reading: Option<String>,
pub lemma: Option<String>,
pub cost: i32,
pub features: String,
pub normalized: Option<String>,
}
impl Token {
#[must_use]
pub const fn new(
surface: String,
pos: String,
start_pos: usize,
end_pos: usize,
start_byte: usize,
end_byte: usize,
) -> Self {
Self {
surface,
pos,
start_pos,
end_pos,
start_byte,
end_byte,
reading: None,
lemma: None,
cost: 0,
features: String::new(),
normalized: None,
}
}
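    /// Builds a `Token` from a lattice [`Node`], parsing the node's feature
    /// string into POS, reading, and lemma.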
#[must_use]
pub fn from_node(node: &Node) -> Self {
let features = node.feature.to_string();
let (pos, reading, lemma) = parse_features(&features);
Self {
surface: node.surface.to_string(),
pos: pos.to_string(),
start_pos: node.start_pos,
end_pos: node.end_pos,
start_byte: node.start_byte,
end_byte: node.end_byte,
reading,
lemma,
cost: node.total_cost,
features,
normalized: None,
}
}
#[inline]
#[must_use]
pub const fn char_len(&self) -> usize {
self.end_pos - self.start_pos
}
#[inline]
#[must_use]
pub const fn byte_len(&self) -> usize {
self.end_byte - self.start_byte
}
#[must_use]
pub fn pos_tag(&self) -> Option<PosTag> {
self.pos.parse().ok()
}
}
/// Splits a mecab-ko-dic style feature string (`POS,semantic,jongseong,reading,...`)
/// into the POS tag plus optional reading and lemma. Field 3 carries the reading;
/// the dictionary format has no separate lemma field, so the reading is reused.
fn parse_features(features: &str) -> (Cow<'_, str>, Option<String>, Option<String>) {
    let mut split = features.splitn(5, ',');
    let pos = split.next().unwrap_or("*");
    // `nth(2)` skips the semantic-class and jongseong fields, landing on field 3.
    let reading = split
        .nth(2)
        .filter(|s| !s.is_empty() && *s != "*")
        .map(std::string::ToString::to_string);
    let lemma = reading.clone();
    (Cow::Borrowed(pos), reading, lemma)
}
pub struct Tokenizer {
dictionary: SystemDictionary,
unknown_handler: UnknownHandler,
viterbi_searcher: ViterbiSearcher,
lattice: Lattice,
normalizer: Option<Normalizer>,
enable_normalization: bool,
pool_manager: PoolManager,
}
impl Tokenizer {
    /// Creates a tokenizer backed by the default system dictionary.
    pub fn new() -> Result<Self> {
        Ok(Self::from_dictionary(SystemDictionary::load_default()?))
    }
    /// Creates a tokenizer backed by the dictionary at `dict_path`.
    pub fn with_dict<P: AsRef<Path>>(dict_path: P) -> Result<Self> {
        Ok(Self::from_dictionary(SystemDictionary::load(dict_path)?))
    }
    /// Shared constructor: wires the dictionary up with default unknown-word
    /// handling, Viterbi search, an empty lattice, and fresh pools.
    fn from_dictionary(dictionary: SystemDictionary) -> Self {
        Self {
            dictionary,
            unknown_handler: UnknownHandler::korean_default(),
            viterbi_searcher: ViterbiSearcher::new(),
            lattice: Lattice::new(""),
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        }
    }
#[must_use]
pub fn with_user_dict(mut self, user_dict: UserDictionary) -> Self {
self.dictionary.set_user_dictionary(user_dict);
self
}
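    /// Replaces the user dictionary in place
    /// ([`with_user_dict`](Self::with_user_dict) is the builder-style
    /// equivalent). A sketch, mirroring the test at the bottom of this file:
    ///
    /// ```ignore
    /// let mut user_dict = UserDictionary::new();
    /// user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);
    /// tokenizer.set_user_dict(user_dict);
    /// ```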
pub fn set_user_dict(&mut self, user_dict: UserDictionary) {
self.dictionary.set_user_dictionary(user_dict);
}
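    /// Attaches a hot-reloadable dictionary handle; only available with the
    /// `hot-reload-v2` feature.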
#[cfg(feature = "hot-reload-v2")]
pub fn set_hot_reload(
&mut self,
hr: std::sync::Arc<mecab_ko_dict::hot_reload_v2::HotReloadDictV2>,
) {
self.dictionary.set_hot_reload(hr);
}
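    /// Builder-style setter for the space penalty applied during Viterbi
    /// search. Note that this swaps in a freshly constructed
    /// [`ViterbiSearcher`].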
#[must_use]
pub fn with_space_penalty(mut self, penalty: SpacePenalty) -> Self {
self.viterbi_searcher = ViterbiSearcher::new().with_space_penalty(penalty);
self
}
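    /// Tokenizes `text`: resets the internal lattice, fills it with dictionary
    /// and unknown-word nodes, runs the Viterbi search against the connection
    /// matrix, and converts the best path's nodes into [`Token`]s. Empty input
    /// yields an empty `Vec`.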
pub fn tokenize(&mut self, text: &str) -> Vec<Token> {
if text.is_empty() {
return Vec::new();
}
self.lattice.reset(text);
self.build_lattice();
let path = self
.viterbi_searcher
.search(&mut self.lattice, self.dictionary.matrix());
path.iter()
.filter_map(|&node_id| self.lattice.node(node_id))
.map(Token::from_node)
.collect()
}
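    /// Fills the lattice: at every character position, dictionary matches are
    /// added first, then the unknown-word handler is invoked with a flag
    /// indicating whether any dictionary entry started there.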
fn build_lattice(&mut self) {
let char_len = self.lattice.char_len();
for pos in 0..char_len {
let has_dict_entry = self.add_dict_nodes(pos);
self.unknown_handler
.add_unknown_nodes(&mut self.lattice, pos, has_dict_entry);
}
}
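    /// Runs a common-prefix search over the system dictionary (and the user
    /// dictionary, if one is set) for the text starting at `start_pos`, adding
    /// one lattice node per match. Returns `true` if any node was added.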
fn add_dict_nodes(&mut self, start_pos: usize) -> bool {
let char_len = self.lattice.char_len();
let search_text: &str = self.lattice.substring(start_pos, char_len);
if search_text.is_empty() {
return false;
}
let dict_entries: Vec<_> = self
.dictionary
.common_prefix_search(search_text)
.unwrap_or_default();
let user_entries: Vec<_> = self
.dictionary
.user_dictionary()
.map(|ud| ud.common_prefix_search(search_text))
.unwrap_or_default();
let mut found = false;
for (entry, byte_len) in dict_entries {
let end_pos = self
.lattice
.char_pos_from_start_and_byte_len(start_pos, byte_len);
self.lattice.add_node(
NodeBuilder::new(&entry.surface, start_pos, end_pos)
.left_id(entry.left_id)
.right_id(entry.right_id)
.word_cost(i32::from(entry.cost))
.node_type(NodeType::Known)
.feature(&entry.feature),
);
found = true;
}
for user_entry in user_entries {
let surface_char_len = user_entry.surface.chars().count();
let end_pos = start_pos + surface_char_len;
self.lattice.add_node(
NodeBuilder::new(&user_entry.surface, start_pos, end_pos)
.left_id(user_entry.left_id)
.right_id(user_entry.right_id)
.word_cost(i32::from(user_entry.cost))
.node_type(NodeType::User)
.feature(&user_entry.feature),
);
found = true;
}
found
}
pub fn tokenize_to_lattice(&mut self, text: &str) -> &Lattice {
if !text.is_empty() {
self.lattice.reset(text);
self.build_lattice();
}
&self.lattice
}
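    /// Returns just the surface forms in order (MeCab's "wakati" output
    /// style).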
pub fn wakati(&mut self, text: &str) -> Vec<String> {
self.tokenize(text).into_iter().map(|t| t.surface).collect()
}
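    /// Returns the surface forms of noun tokens, i.e. those whose POS tag
    /// starts with `NN` (NNG, NNP, NNB, ...).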
pub fn nouns(&mut self, text: &str) -> Vec<String> {
self.tokenize(text)
.into_iter()
.filter(|t| t.pos.starts_with("NN"))
.map(|t| t.surface)
.collect()
}
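    /// Alias for [`wakati`](Self::wakati), matching the KoNLPy-style
    /// `morphs`/`nouns`/`pos` API.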
pub fn morphs(&mut self, text: &str) -> Vec<String> {
self.wakati(text)
}
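    /// Returns `(surface, POS)` pairs for every token. A sketch of the
    /// expected shape (assuming a dictionary with 아버지/NNG and 가/JKS, as
    /// in the tests below):
    ///
    /// ```ignore
    /// let pairs = tokenizer.pos("아버지가");
    /// assert_eq!(pairs[0], ("아버지".to_string(), "NNG".to_string()));
    /// ```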
pub fn pos(&mut self, text: &str) -> Vec<(String, String)> {
self.tokenize(text)
.into_iter()
.map(|t| (t.surface, t.pos))
.collect()
}
#[must_use]
pub const fn dictionary(&self) -> &SystemDictionary {
&self.dictionary
}
#[must_use]
pub fn lattice_stats(&self) -> crate::lattice::LatticeStats {
self.lattice.stats()
}
#[must_use]
pub fn pool_stats(&self) -> PoolStats {
self.pool_manager.stats()
}
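    /// Approximate memory usage. Only lattice and pool sizes are measured
    /// here; dictionary, cache, interner, and token byte counts are reported
    /// as zero.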
#[must_use]
pub fn memory_stats(&self) -> crate::memory::MemoryStats {
crate::memory::MemoryStats {
            dictionary_bytes: 0,
            lattice_bytes: self.lattice.memory_usage(),
pool_bytes: self.pool_manager.total_memory_usage(),
cache_bytes: 0,
interner_bytes: 0,
token_bytes: 0,
}
}
pub fn clear_pools(&self) {
self.pool_manager.clear_all();
}
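    /// Enables or disables surface-form normalization. Enabling builds a
    /// [`Normalizer`] from `config` (falling back to the default
    /// configuration); disabling drops any existing normalizer.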
pub fn set_normalization(
&mut self,
enable: bool,
config: Option<NormalizationConfig>,
) -> Result<()> {
self.enable_normalization = enable;
if enable {
let normalizer_config = config.unwrap_or_default();
self.normalizer = Some(Normalizer::new(normalizer_config)?);
} else {
self.normalizer = None;
}
Ok(())
}
#[must_use]
pub const fn normalizer(&self) -> Option<&Normalizer> {
self.normalizer.as_ref()
}
#[must_use]
pub const fn is_normalization_enabled(&self) -> bool {
self.enable_normalization
}
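    /// Like [`tokenize`](Self::tokenize), but also fills each token's
    /// `normalized` field when a normalizer is configured. The lattice itself
    /// is built from the original text; normalization is applied to each
    /// token's surface afterwards.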
pub fn tokenize_with_normalization(&mut self, text: &str) -> Vec<Token> {
let mut tokens = self.tokenize(text);
if let Some(normalizer) = &self.normalizer {
for token in &mut tokens {
token.normalized = Some(normalizer.normalize(&token.surface));
}
}
tokens
}
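    /// Maps `word` to its normalized standard form plus the known variants of
    /// that form. Without a configured normalizer, the word is returned
    /// unchanged with no variants.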
#[must_use]
pub fn get_word_variants(&self, word: &str) -> (String, Vec<String>) {
self.normalizer.as_ref().map_or_else(
|| (word.to_string(), Vec::new()),
|normalizer| {
let standard = normalizer.normalize(word);
let variants = normalizer.get_variants(&standard);
(standard, variants)
},
)
}
}
#[cfg(test)]
#[allow(clippy::expect_used)]
mod tests {
use super::*;
use mecab_ko_dict::{matrix::DenseMatrix, trie::TrieBuilder, DictEntry};
fn create_test_tokenizer() -> Tokenizer {
let mut trie_entries = vec![
("아버지", 0u32),
("가", 1),
("방", 2),
("에", 3),
("들어가", 4),
("신다", 5),
];
let trie_bytes = TrieBuilder::build_unsorted(&mut trie_entries).expect("should build trie");
let trie = mecab_ko_dict::Trie::from_vec(trie_bytes);
let matrix = DenseMatrix::new(10, 10, 100);
let matrix = mecab_ko_dict::matrix::ConnectionMatrix::Dense(matrix);
        let entries = vec![
            DictEntry::new("아버지", 1, 1, 1000, "NNG,*,T,아버지,*,*,*,*"),
            DictEntry::new("가", 5, 5, 500, "JKS,*,F,가,*,*,*,*"),
            DictEntry::new("방", 2, 2, 2000, "NNG,*,T,방,*,*,*,*"),
            DictEntry::new("에", 6, 6, 400, "JKB,*,F,에,*,*,*,*"),
            DictEntry::new("들어가", 3, 3, 1500, "VV,*,F,들어가다,*,*,*,*"),
            DictEntry::new("신다", 4, 4, 1800, "VV+EP,*,F,신다,*,*,*,*"),
        ];
let dictionary = SystemDictionary::new_test(
std::path::PathBuf::from("./test_dic"),
trie,
matrix,
entries,
);
        Tokenizer::from_dictionary(dictionary)
    }
#[test]
fn test_token_creation() {
let token = Token::new("안녕".to_string(), "NNG".to_string(), 0, 2, 0, 6);
assert_eq!(token.surface, "안녕");
assert_eq!(token.pos, "NNG");
assert_eq!(token.start_pos, 0);
assert_eq!(token.end_pos, 2);
assert_eq!(token.char_len(), 2);
assert_eq!(token.byte_len(), 6);
}
#[test]
fn test_parse_features() {
let features = "NNG,*,T,안녕,*,*,*,*";
let (pos, reading, lemma) = parse_features(features);
assert_eq!(pos, "NNG");
assert_eq!(reading, Some("안녕".to_string()));
assert_eq!(lemma, Some("안녕".to_string()));
}
#[test]
fn test_parse_features_no_reading() {
let features = "JKS,*,F,*,*,*,*,*";
let (pos, reading, _lemma) = parse_features(features);
assert_eq!(pos, "JKS");
assert_eq!(reading, None);
}
#[test]
fn test_tokenize_simple() {
let mut tokenizer = create_test_tokenizer();
let tokens = tokenizer.tokenize("아버지");
assert!(!tokens.is_empty());
assert_eq!(tokens[0].surface, "아버지");
assert_eq!(tokens[0].pos, "NNG");
}
#[test]
fn test_tokenize_with_particle() {
let mut tokenizer = create_test_tokenizer();
let tokens = tokenizer.tokenize("아버지가");
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].surface, "아버지");
assert_eq!(tokens[0].pos, "NNG");
assert_eq!(tokens[1].surface, "가");
assert_eq!(tokens[1].pos, "JKS");
}
#[test]
fn test_tokenize_complex() {
let mut tokenizer = create_test_tokenizer();
let tokens = tokenizer.tokenize("아버지가방에들어가신다");
assert!(!tokens.is_empty());
assert_eq!(tokens[0].surface, "아버지");
}
#[test]
fn test_tokenize_empty() {
let mut tokenizer = create_test_tokenizer();
let tokens = tokenizer.tokenize("");
assert!(tokens.is_empty());
}
#[test]
fn test_tokenize_with_spaces() {
let mut tokenizer = create_test_tokenizer();
let tokens = tokenizer.tokenize("아버지 가방");
assert!(!tokens.is_empty());
}
#[test]
fn test_wakati() {
let mut tokenizer = create_test_tokenizer();
let surfaces = tokenizer.wakati("아버지가");
assert_eq!(surfaces.len(), 2);
assert_eq!(surfaces[0], "아버지");
assert_eq!(surfaces[1], "가");
}
#[test]
fn test_nouns() {
let mut tokenizer = create_test_tokenizer();
let nouns = tokenizer.nouns("아버지가방에");
assert!(nouns.contains(&"아버지".to_string()));
assert!(nouns.contains(&"방".to_string()));
        assert!(!nouns.contains(&"가".to_string()));
    }
#[test]
fn test_pos() {
let mut tokenizer = create_test_tokenizer();
let pos_tags = tokenizer.pos("아버지가");
assert_eq!(pos_tags.len(), 2);
assert_eq!(pos_tags[0], ("아버지".to_string(), "NNG".to_string()));
assert_eq!(pos_tags[1], ("가".to_string(), "JKS".to_string()));
}
#[test]
fn test_tokenize_to_lattice() {
let mut tokenizer = create_test_tokenizer();
let lattice = tokenizer.tokenize_to_lattice("아버지가");
assert!(lattice.node_count() > 2);
let stats = lattice.stats();
assert!(stats.total_nodes > 2);
}
#[test]
fn test_lattice_stats() {
let mut tokenizer = create_test_tokenizer();
tokenizer.tokenize("아버지가");
let stats = tokenizer.lattice_stats();
assert!(stats.total_nodes > 0);
assert!(stats.char_length > 0);
}
#[test]
fn test_token_positions() {
let mut tokenizer = create_test_tokenizer();
let tokens = tokenizer.tokenize("아버지가");
assert_eq!(tokens[0].start_pos, 0);
assert_eq!(tokens[0].end_pos, 3);
assert_eq!(tokens[1].start_pos, 3);
assert_eq!(tokens[1].end_pos, 4);
}
#[test]
fn test_multiple_tokenize_calls() {
let mut tokenizer = create_test_tokenizer();
let tokens1 = tokenizer.tokenize("아버지");
assert!(!tokens1.is_empty());
let tokens2 = tokenizer.tokenize("가방");
assert!(!tokens2.is_empty());
assert_ne!(tokens1[0].surface, tokens2[0].surface);
}
#[test]
fn test_token_from_node() {
use crate::lattice::Node;
use std::borrow::Cow;
let node = Node {
id: 1,
surface: Cow::Borrowed("테스트"),
start_pos: 0,
end_pos: 3,
start_byte: 0,
end_byte: 9,
left_id: 1,
right_id: 1,
word_cost: 1000,
total_cost: 1500,
prev_node_id: 0,
node_type: NodeType::Known,
feature: Cow::Borrowed("NNG,*,T,테스트,*,*,*,*"),
has_space_before: false,
};
let token = Token::from_node(&node);
assert_eq!(token.surface, "테스트");
assert_eq!(token.pos, "NNG");
assert_eq!(token.start_pos, 0);
assert_eq!(token.end_pos, 3);
assert_eq!(token.reading, Some("테스트".to_string()));
assert_eq!(token.cost, 1500);
}
#[test]
fn test_with_user_dict() {
let mut tokenizer = create_test_tokenizer();
let mut user_dict = UserDictionary::new();
user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);
tokenizer.set_user_dict(user_dict);
assert!(tokenizer.dictionary().user_dictionary().is_some());
}
}