#![allow(
clippy::expect_used,
clippy::unwrap_used,
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
clippy::cast_precision_loss,
clippy::uninlined_format_args,
clippy::useless_vec,
clippy::doc_markdown
)]
mod common;
/// Smoke test: each sample input must produce at least one token.
///
/// The expected morpheme lists are printed beside the actual surfaces for
/// manual comparison only; they are not asserted against.
#[test]
fn test_e2e_basic_tokenization() {
    use mecab_ko::Tokenizer;

    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    // (input text, morphemes we would like to see) pairs.
    let samples = vec![
        ("안녕하세요", vec!["안녕", "하", "세요"]),
        ("감사합니다", vec!["감사", "합니다"]),
        ("한국어", vec!["한국어"]),
        ("사람", vec!["사람"]),
    ];

    for (input, expected_morphs) in samples {
        let tokens = analyzer.tokenize(input);
        assert!(
            !tokens.is_empty(),
            "Tokenization should produce tokens for '{input}'"
        );

        // Gather the surface form of every token, in order.
        let mut actual_morphs: Vec<String> = Vec::with_capacity(tokens.len());
        for tok in tokens.iter() {
            actual_morphs.push(tok.surface.clone());
        }

        println!("Input: {input}");
        println!("Expected: {expected_morphs:?}");
        println!("Actual: {actual_morphs:?}");
        println!();
    }
}
/// Tokenizes several short inputs and validates the basic invariants of
/// every returned token: a non-empty char range, a POS tag and a surface.
///
/// All invariants are checked for a sentence before its tokens are printed.
#[test]
fn test_e2e_sentence_types() {
    use mecab_ko::Tokenizer;

    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    let sentences: Vec<String> = ["안녕하세요", "감사합니다", "한국어", "나"]
        .iter()
        .map(|text| (*text).to_string())
        .collect();

    for sentence in sentences {
        let tokens = analyzer.tokenize(&sentence);
        assert!(!tokens.is_empty(), "Should tokenize sentence: '{sentence}'");

        // Validation pass.
        for (i, token) in tokens.iter().enumerate() {
            assert!(
                token.end_pos > token.start_pos,
                "Token {i} should have valid position range"
            );
            assert!(
                !token.pos.is_empty(),
                "Token {i} should have POS tag: {token:?}"
            );
            assert!(
                !token.surface.is_empty(),
                "Token {i} should have non-empty surface"
            );
        }

        // Reporting pass.
        println!("Sentence: {sentence}");
        println!("Token count: {}", tokens.len());
        tokens.iter().for_each(|token| {
            println!(
                " {} [{}] ({}-{})",
                token.surface, token.pos, token.start_pos, token.end_pos
            );
        });
        println!();
    }
}
/// `wakati()` must return a non-empty result for a single Korean word.
#[test]
fn test_e2e_wakati_mode() {
    use mecab_ko::Tokenizer;

    let input = "한국어";
    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    let words = analyzer.wakati(input);
    assert!(!words.is_empty(), "Wakati should produce words");

    println!("Input: {input}");
    println!("Words: {words:?}");
}
/// `nouns()` must be non-empty and must equal the surfaces of the tokens
/// from `tokenize()` whose POS tag starts with `"NN"`.
#[test]
fn test_e2e_noun_extraction() {
    use mecab_ko::Tokenizer;

    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");
    let input = "안녕감사";

    let nouns = analyzer.nouns(input);
    assert!(!nouns.is_empty(), "Should extract nouns");
    println!("Input: {input}");
    println!("Nouns: {nouns:?}");

    // Rebuild the noun list by hand from the full tokenization.
    let all_tokens = analyzer.tokenize(input);
    let mut noun_surfaces: Vec<String> = Vec::new();
    for tok in all_tokens.iter() {
        if tok.pos.starts_with("NN") {
            noun_surfaces.push(tok.surface.clone());
        }
    }

    assert_eq!(
        nouns, noun_surfaces,
        "Nouns() should match tokens with NN* POS"
    );
}
/// `pos()` must return at least one (surface, tag) pair for a one-character
/// input; the pairs are printed as `surface/tag`.
#[test]
fn test_e2e_pos_tagging() {
    use mecab_ko::Tokenizer;

    let input = "나";
    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    let pos_tags = analyzer.pos(input);
    assert!(!pos_tags.is_empty(), "Should produce POS tags");

    println!("Input: {input}");
    println!("POS tags:");
    for (surface, pos) in &pos_tags {
        println!(" {surface}/{pos}");
    }
}
/// Tokenizing the same input three times in a row with one tokenizer must
/// give equal results each time.
#[test]
fn test_e2e_consistency() {
    use mecab_ko::Tokenizer;

    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    let inputs = ["안녕하세요", "대한민국 만세", "인공지능 기술", "형태소 분석기"];

    for input in inputs {
        // Run all three tokenizations first, then compare neighbours:
        // (run 1, run 2) followed by (run 2, run 3).
        let runs: Vec<_> = (0..3).map(|_| analyzer.tokenize(input)).collect();
        for pair in runs.windows(2) {
            assert_eq!(
                pair[0], pair[1],
                "Tokenization should be consistent for '{input}'"
            );
        }
    }
}
/// Checks the offsets of every token: char and byte ranges are non-empty,
/// and both byte offsets lie on UTF-8 character boundaries of the input.
#[test]
fn test_e2e_token_positions() {
    use mecab_ko::Tokenizer;

    let input = "안녕하세요 반갑습니다";
    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    for (i, token) in analyzer.tokenize(input).iter().enumerate() {
        let (byte_from, byte_to) = (token.start_byte, token.end_byte);

        assert!(
            token.end_pos > token.start_pos,
            "Token {i}: start must be less than end"
        );
        assert!(
            byte_to > byte_from,
            "Token {i}: start_byte must be less than end_byte"
        );
        assert!(
            input.is_char_boundary(byte_from),
            "Token {i}: start_byte must be on char boundary"
        );
        assert!(
            input.is_char_boundary(byte_to),
            "Token {i}: end_byte must be on char boundary"
        );

        println!(
            "Token {i}: '{}' pos={}..{} bytes={}..{}",
            token.surface, token.start_pos, token.end_pos, byte_from, byte_to
        );
    }
}
/// Installs a user dictionary with two entries, then checks that tokenizing
/// "딥러닝" yields a token whose surface is exactly that entry.
#[test]
fn test_e2e_with_user_dictionary() {
    use mecab_ko::{dict::UserDictionary, Tokenizer};

    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    // Build the user dictionary and attach it to the tokenizer.
    let mut custom_dict = UserDictionary::new();
    custom_dict.add_entry("딥러닝", "NNG", Some(-1000), Some("딥러닝".to_string()));
    custom_dict.add_entry("머신러닝", "NNG", Some(-1000), Some("머신러닝".to_string()));
    analyzer.set_user_dict(custom_dict);

    let input = "딥러닝";
    let tokens = analyzer.tokenize(input);

    println!("Input: {input}");
    println!("Tokens:");
    tokens
        .iter()
        .for_each(|token| println!(" {} [{}]", token.surface, token.pos));

    let surfaces: Vec<String> = tokens.iter().map(|token| token.surface.clone()).collect();
    let found_entry = surfaces.iter().any(|surface| surface == "딥러닝");
    assert!(
        found_entry,
        "Should recognize user dictionary entry '딥러닝', got: {surfaces:?}"
    );
}
/// Builds a lattice for an unspaced sentence and checks its statistics:
/// more than two nodes in total, and a char length equal to the input's.
#[test]
fn test_e2e_lattice_construction() {
    use mecab_ko::Tokenizer;

    let input = "아버지가방에들어가신다";
    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    let stats = analyzer.tokenize_to_lattice(input).stats();

    assert!(
        stats.total_nodes > 2,
        "Lattice should contain nodes beyond BOS and EOS"
    );

    let input_chars = input.chars().count();
    assert_eq!(
        stats.char_length, input_chars,
        "Lattice char length should match input"
    );

    println!("Input: {input}");
    println!("Lattice stats: {stats:?}");
}
/// Reuses one tokenizer for several inputs in a row and checks that no
/// token's end position exceeds the char length of the current input.
#[test]
fn test_e2e_sequential_tokenizations() {
    use mecab_ko::Tokenizer;

    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");
    let inputs = ["안녕", "감사", "한국어", "사람", "나"];

    for (i, input) in inputs.iter().copied().enumerate() {
        let tokens = analyzer.tokenize(input);
        assert!(!tokens.is_empty(), "Tokenization {i} should produce tokens");

        // Upper bound for any char offset in the current input.
        let char_limit = input.chars().count();
        for token in tokens.iter() {
            assert!(
                token.end_pos <= char_limit,
                "Token position should be within current input bounds"
            );
        }

        println!(
            "Tokenization {}: {} -> {} tokens",
            i + 1,
            input,
            tokens.len()
        );
    }
}
/// Coarse timing check: after 10 warm-up calls, 1000 tokenizations of a
/// three-sentence text are timed and the mean must stay below 10 000 µs.
///
/// The threshold is measured in wall-clock time, so the outcome depends on
/// the machine and build profile the test runs under.
#[test]
fn test_e2e_basic_performance() {
    use mecab_ko::Tokenizer;
    use std::time::Instant;

    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");
    let input = "오늘은 날씨가 정말 좋습니다. 밖에 나가서 산책을 하고 싶어요. \
                 친구들과 함께 공원에서 즐거운 시간을 보내고 싶습니다.";

    // Warm-up rounds, excluded from the measurement.
    let warmup_rounds = 10;
    for _ in 0..warmup_rounds {
        let _ = analyzer.tokenize(input);
    }

    // `u128` matches the return type of `Duration::as_micros`.
    let iterations: u128 = 1000;
    let timer = Instant::now();
    for _ in 0..iterations {
        let _ = analyzer.tokenize(input);
    }
    let elapsed = timer.elapsed();

    let avg_micros = elapsed.as_micros() / iterations;
    let per_second = iterations as f64 / elapsed.as_secs_f64();
    let throughput = per_second as u64;

    println!("Performance:");
    println!(" Total: {:?}", elapsed);
    println!(" Average: {} μs/iteration", avg_micros);
    println!(" Throughput: {} tokenizations/sec", throughput);

    assert!(
        avg_micros < 10_000,
        "Average tokenization should be under 10ms"
    );
}
/// Tokenizes a handful of short words and requires a non-empty result.
///
/// NOTE(review): despite the test name, every sample in this block is
/// Korean-only; no English or mixed-script input is exercised here.
#[test]
fn test_e2e_mixed_korean_english() {
    use mecab_ko::Tokenizer;

    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    for input in ["안녕", "감사", "한국어", "사람"] {
        let tokens = analyzer.tokenize(input);
        assert!(!tokens.is_empty(), "Should tokenize Korean text: '{input}'");

        println!("Input: {input}");
        println!("Tokens:");
        tokens
            .iter()
            .for_each(|token| println!(" {} [{}]", token.surface, token.pos));
        println!();
    }
}
/// Inputs containing digits, punctuation and an e-mail address must each
/// produce at least one token.
///
/// `skip_without_system_dict!()` runs before the tokenizer is created; it is
/// defined outside this block — presumably it returns early when no system
/// dictionary is installed (confirm against its definition).
#[test]
fn test_e2e_numbers_and_symbols() {
    use mecab_ko::Tokenizer;
    skip_without_system_dict!();

    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    let inputs = [
        "2024년 1월 15일",
        "가격은 10,000원입니다",
        "전화번호: 010-1234-5678",
        "이메일은 test@example.com입니다",
    ];

    for input in inputs {
        let tokens = analyzer.tokenize(input);
        assert!(
            !tokens.is_empty(),
            "Should tokenize text with numbers and symbols: '{input}'"
        );

        println!("Input: {input}");
        println!("Tokens:");
        tokens
            .iter()
            .for_each(|token| println!(" {} [{}]", token.surface, token.pos));
        println!();
    }
}
/// `morphs()` and `wakati()` must return equal results for the same input.
#[test]
fn test_e2e_morphs_method() {
    use mecab_ko::Tokenizer;

    let input = "형태소 분석을 합니다";
    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    // Call order matters for a stateful tokenizer: morphs first, then wakati.
    let morphs = analyzer.morphs(input);
    let wakati = analyzer.wakati(input);

    assert_eq!(morphs, wakati, "morphs() should be equivalent to wakati()");

    println!("Input: {input}");
    println!("Morphs: {morphs:?}");
}
/// Checks the metadata of every token for one input: surface, POS tag and
/// features are non-empty, and `char_len()` / `byte_len()` agree with the
/// surface string. Each token's fields are then printed.
#[test]
fn test_e2e_token_metadata() {
    use mecab_ko::Tokenizer;

    let input = "안녕하세요";
    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");
    let tokens = analyzer.tokenize(input);

    for (i, token) in tokens.iter().enumerate() {
        // Mandatory fields.
        assert!(!token.surface.is_empty(), "Token {i} should have surface");
        assert!(!token.pos.is_empty(), "Token {i} should have POS tag");
        assert!(!token.features.is_empty(), "Token {i} should have features");

        // Length helpers must agree with the surface string.
        assert_eq!(
            token.char_len(),
            token.surface.chars().count(),
            "Token {i} char_len should match surface length"
        );
        assert_eq!(
            token.byte_len(),
            token.surface.len(),
            "Token {i} byte_len should match surface byte length"
        );

        println!("Token {i}:");
        println!(" Surface: {}", token.surface);
        println!(" POS: {}", token.pos);
        println!(" Position: {}..{} (chars)", token.start_pos, token.end_pos);
        println!(" Bytes: {}..{}", token.start_byte, token.end_byte);
        println!(" Cost: {}", token.cost);
        println!(" Features: {}", token.features);

        // Optional fields are printed only when present.
        if let Some(reading) = token.reading.as_ref() {
            println!(" Reading: {reading}");
        }
        if let Some(lemma) = token.lemma.as_ref() {
            println!(" Lemma: {lemma}");
        }
        println!();
    }
}
/// Inputs containing punctuation, brackets and quotes must each produce at
/// least one token; the token surfaces are printed for inspection.
///
/// `skip_without_system_dict!()` runs before the tokenizer is created; it is
/// defined outside this block — presumably it returns early when no system
/// dictionary is installed (confirm against its definition).
#[test]
fn test_e2e_special_characters() {
    use mecab_ko::Tokenizer;
    skip_without_system_dict!();

    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    let inputs = [
        "한글!",
        "문장.",
        "질문?",
        "감탄사!!!",
        "쉼표,쉼표",
        "괄호(내용)괄호",
        "인용\"문\"인용",
    ];

    for input in inputs {
        let tokens = analyzer.tokenize(input);
        assert!(
            !tokens.is_empty(),
            "Should tokenize text with special characters: '{input}'"
        );

        println!("Input: {input}");
        let surface_refs = tokens.iter().map(|tok| &tok.surface).collect::<Vec<_>>();
        println!("Tokens: {:?}", surface_refs);
        println!();
    }
}
/// Tokenizes "한국어" repeated 100 times and checks that the tokens come
/// back in order: each token starts at or after the end of the previous one.
#[test]
fn test_e2e_long_text() {
    use mecab_ko::Tokenizer;

    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");
    let long_text = "한국어".repeat(100);

    let tokens = analyzer.tokenize(&long_text);
    assert!(!tokens.is_empty(), "Should tokenize long text");

    // `cursor` tracks the char offset where the previous token ended.
    let mut cursor = 0;
    for token in tokens.iter() {
        assert!(
            cursor <= token.start_pos,
            "Token positions should not overlap or go backwards"
        );
        cursor = token.end_pos;
    }

    println!("Long text length: {} chars", long_text.chars().count());
    println!("Token count: {}", tokens.len());
}
/// Runs ten tokenizations on one tokenizer and prints `pool_stats()`.
///
/// Nothing is asserted about the statistics; the test only fails if one of
/// the calls panics.
#[test]
fn test_e2e_pool_statistics() {
    use mecab_ko::Tokenizer;

    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    (0..10).for_each(|i| {
        let input = format!("이것은 {i}번째 테스트입니다");
        let _ = analyzer.tokenize(&input);
    });

    let stats = analyzer.pool_stats();
    println!("Pool statistics after 10 tokenizations:");
    println!(" {stats:?}");
}
/// After one tokenization, `lattice_stats()` must report at least one node
/// and a char length equal to the input's character count.
#[test]
fn test_e2e_lattice_statistics() {
    use mecab_ko::Tokenizer;

    let input = "안녕하세요";
    let mut analyzer = Tokenizer::new().expect("Failed to create tokenizer");

    // Only the tokenizer's state after this call is inspected below.
    let _ = analyzer.tokenize(input);
    let stats = analyzer.lattice_stats();

    println!("Lattice statistics for '{input}':");
    println!(" {stats:?}");

    assert!(stats.total_nodes > 0, "Should have nodes in lattice");

    let input_chars = input.chars().count();
    assert_eq!(
        stats.char_length, input_chars,
        "Lattice char length should match input"
    );
}