use std::borrow::Cow;
use std::str::FromStr;
use lindera::dictionary::{load_dictionary, load_user_dictionary};
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use crate::analysis::token::{Token, TokenStream, TokenType};
use crate::analysis::tokenizer::Tokenizer;
use crate::error::{LaurusError, Result};
/// Tokenizer backed by a lindera `Segmenter` for morphological segmentation
/// of CJK text (Japanese / Korean / Chinese, depending on the loaded
/// dictionary).
pub struct LinderaTokenizer {
    // Fully configured segmenter (mode + system/user dictionaries) that
    // performs the actual text segmentation.
    inner: Segmenter,
}
impl LinderaTokenizer {
    /// Builds a tokenizer from a segmentation mode name and dictionary URIs.
    ///
    /// `mode_str` is parsed into a lindera `Mode` (e.g. "normal");
    /// `dict_uri` selects the system dictionary (e.g. `embedded://ipadic`),
    /// and `user_dict_uri` optionally layers a user dictionary on top.
    ///
    /// # Errors
    /// Returns an analysis error when the mode string is invalid or either
    /// dictionary fails to load.
    pub fn new(mode_str: &str, dict_uri: &str, user_dict_uri: Option<&str>) -> Result<Self> {
        let mode = Mode::from_str(mode_str)
            .map_err(|e| LaurusError::analysis(format!("Invalid mode '{}': {}", mode_str, e)))?;
        let dict = load_dictionary(dict_uri)
            .map_err(|e| LaurusError::analysis(format!("Failed to load dictionary: {}", e)))?;
        // The user dictionary is loaded against the system dictionary's
        // metadata; `transpose` turns Option<Result<_>> into Result<Option<_>>.
        let user_dict = user_dict_uri
            .map(|uri| {
                load_user_dictionary(uri, &dict.metadata).map_err(|e| {
                    LaurusError::analysis(format!("Failed to load user dictionary: {}", e))
                })
            })
            .transpose()?;
        Ok(Self {
            inner: Segmenter::new(mode, dict, user_dict),
        })
    }

    /// Builds a tokenizer from the raw byte contents of a pre-compiled
    /// lindera dictionary, one buffer per dictionary component. No user
    /// dictionary is attached.
    ///
    /// # Errors
    /// Returns an analysis error when the mode string is invalid or any
    /// component buffer fails to deserialize.
    #[allow(clippy::too_many_arguments)]
    pub fn from_bytes(
        mode_str: &str,
        metadata: &[u8],
        dict_da: &[u8],
        dict_vals: &[u8],
        dict_words_idx: &[u8],
        dict_words: &[u8],
        matrix_mtx: &[u8],
        char_def: &[u8],
        unk: &[u8],
    ) -> Result<Self> {
        use lindera::dictionary::Dictionary;
        use lindera_dictionary::dictionary::character_definition::CharacterDefinition;
        use lindera_dictionary::dictionary::connection_cost_matrix::ConnectionCostMatrix;
        use lindera_dictionary::dictionary::metadata::Metadata;
        use lindera_dictionary::dictionary::prefix_dictionary::PrefixDictionary;
        use lindera_dictionary::dictionary::unknown_dictionary::UnknownDictionary;

        let mode = Mode::from_str(mode_str)
            .map_err(|e| LaurusError::analysis(format!("Invalid mode '{}': {}", mode_str, e)))?;
        let meta = Metadata::load(metadata)
            .map_err(|e| LaurusError::analysis(format!("Failed to load metadata: {}", e)))?;
        let prefix = PrefixDictionary::load(
            dict_da.to_vec(),
            dict_vals.to_vec(),
            dict_words_idx.to_vec(),
            dict_words.to_vec(),
            true,
        )
        .map_err(|e| LaurusError::analysis(format!("Failed to load prefix dictionary: {}", e)))?;
        let matrix = ConnectionCostMatrix::load(matrix_mtx.to_vec())
            .map_err(|e| LaurusError::analysis(format!("Failed to load cost matrix: {}", e)))?;
        let chars = CharacterDefinition::load(char_def).map_err(|e| {
            LaurusError::analysis(format!("Failed to load character definition: {}", e))
        })?;
        let unknown = UnknownDictionary::load(unk).map_err(|e| {
            LaurusError::analysis(format!("Failed to load unknown dictionary: {}", e))
        })?;
        let dict = Dictionary {
            prefix_dictionary: prefix,
            connection_cost_matrix: matrix,
            character_definition: chars,
            unknown_dictionary: unknown,
            metadata: meta,
        };
        Ok(Self {
            inner: Segmenter::new(mode, dict, None),
        })
    }

    /// Classifies a surface form by inspecting its characters.
    ///
    /// Whole-token classes (numeric, hiragana, katakana, ASCII alphanumeric,
    /// punctuation) require *every* character to match, while Hangul and CJK
    /// ideographs are reported when *any* character falls in those ranges.
    /// The first matching class wins, in the order checked below.
    fn detect_token_type(text: &str) -> TokenType {
        // `all` is vacuously true on an empty iterator, so bail out first.
        if text.is_empty() {
            return TokenType::Other;
        }
        let hiragana = |c: char| ('\u{3040}'..='\u{309F}').contains(&c);
        let katakana = |c: char| ('\u{30A0}'..='\u{30FF}').contains(&c);
        let hangul = |c: char| {
            ('\u{AC00}'..='\u{D7AF}').contains(&c) || ('\u{1100}'..='\u{11FF}').contains(&c)
        };
        // CJK Unified Ideographs plus extensions A through E.
        let cjk = |c: char| {
            ('\u{4E00}'..='\u{9FFF}').contains(&c)
                || ('\u{3400}'..='\u{4DBF}').contains(&c)
                || ('\u{20000}'..='\u{2A6DF}').contains(&c)
                || ('\u{2A700}'..='\u{2B73F}').contains(&c)
                || ('\u{2B740}'..='\u{2B81F}').contains(&c)
                || ('\u{2B820}'..='\u{2CEAF}').contains(&c)
        };

        if text.chars().all(char::is_numeric) {
            TokenType::Num
        } else if text.chars().all(hiragana) {
            TokenType::Hiragana
        } else if text.chars().all(katakana) {
            TokenType::Katakana
        } else if text.chars().any(hangul) {
            TokenType::Hangul
        } else if text.chars().any(cjk) {
            TokenType::Cjk
        } else if text
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
        {
            TokenType::Alphanum
        } else if text.chars().all(|c| c.is_ascii_punctuation()) {
            TokenType::Punctuation
        } else {
            TokenType::Other
        }
    }
}
impl Tokenizer for LinderaTokenizer {
fn tokenize(&self, text: &str) -> Result<TokenStream> {
let mut tokens = Vec::new();
for token in self
.inner
.segment(Cow::Borrowed(text))
.map_err(|e| LaurusError::analysis(format!("Failed to segment text: {}", e)))?
{
let token_type = Self::detect_token_type(&token.surface);
tokens.push(
Token::with_offsets(
token.surface,
token.position,
token.byte_start,
token.byte_end,
)
.with_token_type(token_type),
);
}
Ok(Box::new(tokens.into_iter()))
}
fn name(&self) -> &'static str {
"lindera"
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `text` and collects the resulting token stream.
    fn collect_tokens(tokenizer: &LinderaTokenizer, text: &str) -> Vec<Token> {
        tokenizer.tokenize(text).unwrap().collect()
    }

    /// Runs `from_bytes` with empty component buffers and the given mode and
    /// metadata, asserting failure and returning the error message.
    fn from_bytes_error(mode: &str, metadata: &[u8]) -> String {
        let empty: &[u8] = &[];
        let result = LinderaTokenizer::from_bytes(
            mode, metadata, empty, empty, empty, empty, empty, empty, empty,
        );
        assert!(result.is_err());
        format!("{}", result.err().unwrap())
    }

    #[test]
    fn test_tokenize_japanese() {
        let tokenizer = LinderaTokenizer::new("normal", "embedded://ipadic", None).unwrap();
        let tokens = collect_tokens(&tokenizer, "日本語の形態素解析を行うことができます。");
        let expected = [
            "日本語", "の", "形態素", "解析", "を", "行う", "こと", "が", "でき", "ます", "。",
        ];
        assert_eq!(tokens.len(), expected.len());
        for (token, surface) in tokens.iter().zip(expected) {
            assert_eq!(token.text, surface);
        }
    }

    #[test]
    fn test_tokenize_korean() {
        let tokenizer = LinderaTokenizer::new("normal", "embedded://ko-dic", None).unwrap();
        let tokens = collect_tokens(&tokenizer, "한국어의형태해석을실시할수있습니다.");
        let expected = [
            "한국어", "의", "형태", "해석", "을", "실시", "할", "수", "있", "습니다", ".",
        ];
        assert_eq!(tokens.len(), expected.len());
        for (token, surface) in tokens.iter().zip(expected) {
            assert_eq!(token.text, surface);
        }
    }

    #[test]
    fn test_tokenize_chinese() {
        let tokenizer = LinderaTokenizer::new("normal", "embedded://cc-cedict", None).unwrap();
        let tokens = collect_tokens(&tokenizer, "能够进行汉语的形态素解析。");
        assert!(!tokens.is_empty());
        let surfaces: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        for surface in ["能够", "进行", "汉语", "解析"] {
            assert!(surfaces.contains(&surface));
        }
    }

    #[test]
    fn test_tokenizer_name() {
        let tokenizer = LinderaTokenizer::new("normal", "embedded://ipadic", None).unwrap();
        assert_eq!(tokenizer.name(), "lindera");
    }

    #[test]
    fn test_from_bytes_invalid_metadata_errors() {
        // Metadata parsing fails before any other component is touched.
        let msg = from_bytes_error("normal", b"not valid json");
        assert!(
            msg.contains("metadata"),
            "expected metadata error, got: {msg}"
        );
    }

    #[test]
    fn test_from_bytes_invalid_mode_errors() {
        // Mode is validated first, so metadata content is irrelevant here.
        let msg = from_bytes_error("not-a-mode", b"{}");
        assert!(msg.contains("mode"), "expected mode error, got: {msg}");
    }
}