use std::borrow::Cow;
use std::str::FromStr;

use lindera::dictionary::{load_dictionary, load_user_dictionary};
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;

use crate::analysis::token::{Token, TokenStream, TokenType};
use crate::analysis::tokenizer::Tokenizer;
use crate::error::{LaurusError, Result};
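
/// Tokenizer that segments Japanese, Korean, or Chinese text with Lindera's
/// morphological analyzer.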
pub struct LinderaTokenizer {
    inner: Segmenter,
}

impl LinderaTokenizer {
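    /// Builds a segmenter from a Lindera mode string and dictionary URIs.
    ///
    /// The tests below use mode `"normal"` with embedded dictionaries such as
    /// `"embedded://ipadic"`; values are passed straight through to Lindera's
    /// `Mode::from_str` and `load_dictionary`, so anything they accept works.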
    pub fn new(mode_str: &str, dict_uri: &str, user_dict_uri: Option<&str>) -> Result<Self> {
        let mode = Mode::from_str(mode_str)
            .map_err(|e| LaurusError::analysis(format!("Invalid mode '{}': {}", mode_str, e)))?;
        let dict = load_dictionary(dict_uri)
            .map_err(|e| LaurusError::analysis(format!("Failed to load dictionary: {}", e)))?;
        // Lindera needs the main dictionary's metadata to load a user dictionary.
        let metadata = &dict.metadata;
        let user_dict = match user_dict_uri {
            Some(uri) => Some(load_user_dictionary(uri, metadata).map_err(|e| {
                LaurusError::analysis(format!("Failed to load user dictionary: {}", e))
            })?),
            None => None,
        };
        let inner = Segmenter::new(mode, dict, user_dict);
        Ok(Self { inner })
    }

    /// Classifies a surface form by the script of its characters.
    ///
    /// The single-script checks require every character to match, while the
    /// Hangul and CJK checks fire on any match so that mixed-script tokens
    /// are still classified.
    fn detect_token_type(text: &str) -> TokenType {
        if text.is_empty() {
            return TokenType::Other;
        }
        if text.chars().all(|c| c.is_numeric()) {
            return TokenType::Num;
        }
        if text.chars().all(|c| matches!(c, '\u{3040}'..='\u{309F}')) {
            return TokenType::Hiragana;
        }
        if text.chars().all(|c| matches!(c, '\u{30A0}'..='\u{30FF}')) {
            return TokenType::Katakana;
        }
        // Hangul syllables and jamo.
        if text
            .chars()
            .any(|c| matches!(c, '\u{AC00}'..='\u{D7AF}' | '\u{1100}'..='\u{11FF}'))
        {
            return TokenType::Hangul;
        }
        // CJK Unified Ideographs plus Extensions A through E.
        if text.chars().any(|c| {
            matches!(c,
                '\u{4E00}'..='\u{9FFF}'
                    | '\u{3400}'..='\u{4DBF}'
                    | '\u{20000}'..='\u{2A6DF}'
                    | '\u{2A700}'..='\u{2B73F}'
                    | '\u{2B740}'..='\u{2B81F}'
                    | '\u{2B820}'..='\u{2CEAF}'
            )
        }) {
            return TokenType::Cjk;
        }
        if text
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
        {
            return TokenType::Alphanum;
        }
        if text.chars().all(|c| c.is_ascii_punctuation()) {
            return TokenType::Punctuation;
        }
        TokenType::Other
    }
}

impl Tokenizer for LinderaTokenizer {
    fn tokenize(&self, text: &str) -> Result<TokenStream> {
        let mut tokens = Vec::new();
        for token in self
            .inner
            .segment(Cow::Borrowed(text))
            .map_err(|e| LaurusError::analysis(format!("Failed to segment text: {}", e)))?
        {
            let token_type = Self::detect_token_type(&token.surface);
            tokens.push(
                Token::with_offsets(
                    token.surface,
                    token.position,
                    token.byte_start,
                    token.byte_end,
                )
                .with_token_type(token_type),
            );
        }
        Ok(Box::new(tokens.into_iter()))
    }

    fn name(&self) -> &'static str {
        "lindera"
    }
}
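
// A minimal usage sketch, assuming the embedded IPADIC dictionary is enabled
// (as in the tests below); only the `text` field read here is confirmed by
// those tests:
//
//     let tokenizer = LinderaTokenizer::new("normal", "embedded://ipadic", None)?;
//     for token in tokenizer.tokenize("形態素解析を行う")? {
//         println!("{}", token.text);
//     }
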
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_japanese() {
        let tokenizer = LinderaTokenizer::new("normal", "embedded://ipadic", None).unwrap();
        // "Japanese morphological analysis can be performed."
        let text = "日本語の形態素解析を行うことができます。";
        let tokens: Vec<Token> = tokenizer.tokenize(text).unwrap().collect();
        assert_eq!(tokens.len(), 11);
        assert_eq!(tokens[0].text, "日本語");
        assert_eq!(tokens[1].text, "の");
        assert_eq!(tokens[2].text, "形態素");
        assert_eq!(tokens[3].text, "解析");
        assert_eq!(tokens[4].text, "を");
        assert_eq!(tokens[5].text, "行う");
        assert_eq!(tokens[6].text, "こと");
        assert_eq!(tokens[7].text, "が");
        assert_eq!(tokens[8].text, "でき");
        assert_eq!(tokens[9].text, "ます");
        assert_eq!(tokens[10].text, "。");
    }

    #[test]
    fn test_tokenize_korean() {
        let tokenizer = LinderaTokenizer::new("normal", "embedded://ko-dic", None).unwrap();
        // "Korean morphological analysis can be performed." (written without spaces)
        let text = "한국어의형태해석을실시할수있습니다.";
        let tokens: Vec<Token> = tokenizer.tokenize(text).unwrap().collect();
        assert_eq!(tokens.len(), 11);
        assert_eq!(tokens[0].text, "한국어");
        assert_eq!(tokens[1].text, "의");
        assert_eq!(tokens[2].text, "형태");
        assert_eq!(tokens[3].text, "해석");
        assert_eq!(tokens[4].text, "을");
        assert_eq!(tokens[5].text, "실시");
        assert_eq!(tokens[6].text, "할");
        assert_eq!(tokens[7].text, "수");
        assert_eq!(tokens[8].text, "있");
        assert_eq!(tokens[9].text, "습니다");
        assert_eq!(tokens[10].text, ".");
    }

    #[test]
    fn test_tokenize_chinese() {
        let tokenizer = LinderaTokenizer::new("normal", "embedded://cc-cedict", None).unwrap();
        // "Chinese morphological analysis can be performed."
        let text = "能够进行汉语的形态素解析。";
        let tokens: Vec<Token> = tokenizer.tokenize(text).unwrap().collect();
        assert!(!tokens.is_empty());
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"能够"));
        assert!(texts.contains(&"进行"));
        assert!(texts.contains(&"汉语"));
        assert!(texts.contains(&"解析"));
    }

    #[test]
    fn test_tokenizer_name() {
        let tokenizer = LinderaTokenizer::new("normal", "embedded://ipadic", None).unwrap();
        assert_eq!(tokenizer.name(), "lindera");
    }
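
    // A minimal sketch exercising `detect_token_type` directly; it assumes
    // nothing beyond the `TokenType` variants already used above.
    #[test]
    fn test_detect_token_type() {
        assert!(matches!(LinderaTokenizer::detect_token_type("2024"), TokenType::Num));
        assert!(matches!(LinderaTokenizer::detect_token_type("ひらがな"), TokenType::Hiragana));
        assert!(matches!(LinderaTokenizer::detect_token_type("カタカナ"), TokenType::Katakana));
        assert!(matches!(LinderaTokenizer::detect_token_type("한국어"), TokenType::Hangul));
        // Mixed scripts still classify as CJK because the ideograph check
        // only needs a single match.
        assert!(matches!(LinderaTokenizer::detect_token_type("漢字かな"), TokenType::Cjk));
        assert!(matches!(LinderaTokenizer::detect_token_type("foo-bar_2"), TokenType::Alphanum));
        assert!(matches!(LinderaTokenizer::detect_token_type("..."), TokenType::Punctuation));
        assert!(matches!(LinderaTokenizer::detect_token_type(""), TokenType::Other));
    }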
}