use anyhow::{Context, Result};
use std::collections::HashMap;
use std::fs::File;
use std::path::Path;
use vibrato::{Dictionary, Tokenizer};
#[cfg(feature = "zstd-support")]
use zstd::Decoder;
/// Computes similarity scores between Japanese texts based on morphological
/// analysis (tokenization + part-of-speech tagging) using the `vibrato`
/// tokenizer with a MeCab-format dictionary.
pub struct MorphologicalSimilarityCalculator {
    // Tokenizer built from a system dictionary; see `new` / `load_dictionary`.
    tokenizer: Tokenizer,
}
impl MorphologicalSimilarityCalculator {
pub fn new(dict_path: Option<&str>) -> Result<Self> {
let dict = if let Some(path) = dict_path {
Self::load_dictionary(path)?
} else {
let default_paths = [
"/usr/share/mecab/dic/ipadic/system.dic",
"/usr/share/mecab/dic/ipadic/system.dic.zst",
"/opt/homebrew/lib/mecab/dic/ipadic/system.dic",
"/opt/homebrew/lib/mecab/dic/ipadic/system.dic.zst",
"./dict/system.dic",
"./dict/system.dic.zst",
"./system.dic",
"./system.dic.zst",
"./ipadic-mecab-2_7_0/system.dic.zst",
];
let mut dict = None;
for path in &default_paths {
if Path::new(path).exists() {
if let Ok(d) = Self::load_dictionary(path) {
dict = Some(d);
break;
}
}
}
dict.context("形態素解析辞書が見つかりません。辞書パスを指定してください。\n利用可能な辞書をダウンロードするには:\nwget https://github.com/daac-tools/vibrato/releases/download/v0.5.0/ipadic-mecab-2_7_0.tar.xz\ntar xf ipadic-mecab-2_7_0.tar.xz")?
};
let tokenizer = Tokenizer::new(dict);
Ok(Self { tokenizer })
}
fn load_dictionary(path: &str) -> Result<Dictionary> {
let file =
File::open(path).with_context(|| format!("辞書ファイルを開けませんでした: {path}"))?;
if path.ends_with(".zst") {
#[cfg(feature = "zstd-support")]
{
let decoder = Decoder::new(file)
.with_context(|| format!("zstdデコーダーの初期化に失敗しました: {path}"))?;
Dictionary::read(decoder)
.with_context(|| format!("圧縮辞書ファイルの読み込みに失敗しました: {path}"))
}
#[cfg(not(feature = "zstd-support"))]
{
Err(anyhow::anyhow!(
"圧縮辞書ファイル(.zst)を読み込むには、zstd-supportフィーチャーを有効にしてください: {}",
path
))
}
} else {
Dictionary::read(file)
.with_context(|| format!("辞書ファイルの読み込みに失敗しました: {path}"))
}
}
pub fn tokenize(&self, text: &str) -> Result<Vec<MorphemeToken>> {
let mut worker = self.tokenizer.new_worker();
worker.reset_sentence(text);
worker.tokenize();
let mut tokens = Vec::new();
for i in 0..worker.num_tokens() {
let token = worker.token(i);
let surface = token.surface();
let features = token.feature();
let pos_parts: Vec<&str> = features.split(',').collect();
let pos_main = pos_parts.first().unwrap_or(&"").to_string();
let pos_sub1 = pos_parts.get(1).unwrap_or(&"").to_string();
let pos_sub2 = pos_parts.get(2).unwrap_or(&"").to_string();
let base_form = pos_parts.get(6).unwrap_or(&surface).to_string();
tokens.push(MorphemeToken {
surface: surface.to_string(),
base_form,
pos_main,
pos_sub1,
pos_sub2,
features: features.to_string(),
});
}
Ok(tokens)
}
pub fn calculate_morpheme_similarity(&self, text1: &str, text2: &str) -> Result<f64> {
let tokens1 = self.tokenize(text1)?;
let tokens2 = self.tokenize(text2)?;
let content_words1 = self.extract_content_words(&tokens1);
let content_words2 = self.extract_content_words(&tokens2);
let similarity = self.calculate_jaccard_similarity(&content_words1, &content_words2);
Ok(similarity)
}
pub fn calculate_pos_similarity(&self, text1: &str, text2: &str) -> Result<PosSimilarity> {
let tokens1 = self.tokenize(text1)?;
let tokens2 = self.tokenize(text2)?;
let nouns1 = self.extract_by_pos(&tokens1, "名詞");
let nouns2 = self.extract_by_pos(&tokens2, "名詞");
let noun_similarity = self.calculate_jaccard_similarity(&nouns1, &nouns2);
let verbs1 = self.extract_by_pos(&tokens1, "動詞");
let verbs2 = self.extract_by_pos(&tokens2, "動詞");
let verb_similarity = self.calculate_jaccard_similarity(&verbs1, &verbs2);
let adjectives1 = self.extract_by_pos(&tokens1, "形容詞");
let adjectives2 = self.extract_by_pos(&tokens2, "形容詞");
let adjective_similarity = self.calculate_jaccard_similarity(&adjectives1, &adjectives2);
Ok(PosSimilarity { noun_similarity, verb_similarity, adjective_similarity })
}
fn extract_content_words(&self, tokens: &[MorphemeToken]) -> Vec<String> {
tokens
.iter()
.filter(|token| matches!(token.pos_main.as_str(), "名詞" | "動詞" | "形容詞" | "副詞"))
.map(|token| token.base_form.clone())
.collect()
}
fn extract_by_pos(&self, tokens: &[MorphemeToken], pos: &str) -> Vec<String> {
tokens
.iter()
.filter(|token| token.pos_main == pos)
.map(|token| token.base_form.clone())
.collect()
}
fn calculate_jaccard_similarity(&self, words1: &[String], words2: &[String]) -> f64 {
if words1.is_empty() && words2.is_empty() {
return 1.0;
}
let set1: std::collections::HashSet<_> = words1.iter().collect();
let set2: std::collections::HashSet<_> = words2.iter().collect();
let intersection = set1.intersection(&set2).count();
let union = set1.union(&set2).count();
if union == 0 {
0.0
} else {
intersection as f64 / union as f64
}
}
pub fn calculate_weighted_similarity(
&self,
text1: &str,
text2: &str,
corpus: &[String],
) -> Result<f64> {
let tokens1 = self.tokenize(text1)?;
let tokens2 = self.tokenize(text2)?;
let content_words1 = self.extract_content_words(&tokens1);
let content_words2 = self.extract_content_words(&tokens2);
let freq1 = self.calculate_word_frequency(&content_words1);
let freq2 = self.calculate_word_frequency(&content_words2);
let corpus_freq = self.calculate_corpus_frequency(corpus)?;
let similarity = self.calculate_weighted_cosine_similarity(&freq1, &freq2, &corpus_freq);
Ok(similarity)
}
fn calculate_word_frequency(&self, words: &[String]) -> HashMap<String, f64> {
let mut freq = HashMap::new();
let total = words.len() as f64;
for word in words {
*freq.entry(word.clone()).or_insert(0.0) += 1.0;
}
for value in freq.values_mut() {
*value /= total;
}
freq
}
fn calculate_corpus_frequency(&self, corpus: &[String]) -> Result<HashMap<String, f64>> {
let mut doc_count = HashMap::new();
let total_docs = corpus.len() as f64;
for text in corpus {
let tokens = self.tokenize(text)?;
let content_words = self.extract_content_words(&tokens);
let unique_words: std::collections::HashSet<_> = content_words.into_iter().collect();
for word in unique_words {
*doc_count.entry(word).or_insert(0.0) += 1.0;
}
}
let mut idf = HashMap::new();
for (word, count) in doc_count {
idf.insert(word, (total_docs / count).ln());
}
Ok(idf)
}
fn calculate_weighted_cosine_similarity(
&self,
freq1: &HashMap<String, f64>,
freq2: &HashMap<String, f64>,
idf: &HashMap<String, f64>,
) -> f64 {
let mut dot_product = 0.0;
let mut norm1 = 0.0;
let mut norm2 = 0.0;
let all_words: std::collections::HashSet<_> = freq1.keys().chain(freq2.keys()).collect();
for word in all_words {
let tf1 = freq1.get(word).unwrap_or(&0.0);
let tf2 = freq2.get(word).unwrap_or(&0.0);
let idf_weight = idf.get(word).unwrap_or(&1.0);
let weight1 = tf1 * idf_weight;
let weight2 = tf2 * idf_weight;
dot_product += weight1 * weight2;
norm1 += weight1 * weight1;
norm2 += weight2 * weight2;
}
if norm1 == 0.0 || norm2 == 0.0 {
0.0
} else {
dot_product / (norm1.sqrt() * norm2.sqrt())
}
}
}
/// A single morpheme produced by `MorphologicalSimilarityCalculator::tokenize`.
#[derive(Debug, Clone)]
pub struct MorphemeToken {
    // Surface form as it appeared in the input text.
    pub surface: String,
    // Dictionary base form (feature field 7); falls back to the surface form
    // when the feature CSV is shorter than 7 fields.
    pub base_form: String,
    // Main part of speech (feature field 1), e.g. 名詞 / 動詞 / 形容詞.
    pub pos_main: String,
    // First POS sub-category (feature field 2), empty if absent.
    pub pos_sub1: String,
    // Second POS sub-category (feature field 3), empty if absent.
    pub pos_sub2: String,
    // Full raw feature CSV string from the dictionary entry.
    pub features: String,
}
/// Per-part-of-speech Jaccard similarities between two texts, as produced by
/// `MorphologicalSimilarityCalculator::calculate_pos_similarity`.
/// Each field is in the range 0.0..=1.0.
#[derive(Debug, Clone)]
pub struct PosSimilarity {
    // Similarity over noun (名詞) base forms.
    pub noun_similarity: f64,
    // Similarity over verb (動詞) base forms.
    pub verb_similarity: f64,
    // Similarity over adjective (形容詞) base forms.
    pub adjective_similarity: f64,
}
impl PosSimilarity {
    /// Collapses the three per-POS similarities into one score using the
    /// given weights (weighted arithmetic mean). Returns 0.0 when the weights
    /// sum to zero, avoiding a division by zero.
    pub fn weighted_average(&self, noun_weight: f64, verb_weight: f64, adj_weight: f64) -> f64 {
        let total_weight = noun_weight + verb_weight + adj_weight;
        if total_weight == 0.0 {
            return 0.0;
        }
        let weighted_sum = self.noun_similarity * noun_weight
            + self.verb_similarity * verb_weight
            + self.adjective_similarity * adj_weight;
        weighted_sum / total_weight
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // These tests are #[ignore]d because they require a MeCab-format
    // dictionary to be present at one of the default probe paths; run them
    // explicitly with `cargo test -- --ignored` after downloading one.
    #[test]
    #[ignore] fn test_tokenize() {
        let calculator = MorphologicalSimilarityCalculator::new(None).unwrap();
        let tokens = calculator.tokenize("これは日本語のテストです。").unwrap();
        assert!(!tokens.is_empty());
    }
    #[test]
    #[ignore] fn test_morpheme_similarity() {
        let calculator = MorphologicalSimilarityCalculator::new(None).unwrap();
        let text1 = "これは日本語の文書です。";
        let text2 = "これは日本語のドキュメントです。";
        // Texts share most content words, so similarity must be in (0.0, 1.0].
        let similarity = calculator.calculate_morpheme_similarity(text1, text2).unwrap();
        assert!(similarity > 0.0);
        assert!(similarity <= 1.0);
    }
}