use pyo3::exceptions::PyRuntimeError;
use pyo3::prelude::*;
use pyo3::types::PyDict;
use std::path::PathBuf;
/// Python-facing wrapper around [`crate::MeCrab`], exported to Python under the name `MeCrab`.
#[pyclass(name = "MeCrab")]
pub struct PyMeCrab {
/// The wrapped analyzer that the methods of this class delegate to.
inner: crate::MeCrab,
/// Value of the `with_ipa` constructor argument; `to_ipa` refuses to run when it is false.
with_ipa: bool,
/// True when a `vector_path` was passed to the constructor; `similarity` refuses to run otherwise.
with_vector: bool,
}
#[pymethods]
impl PyMeCrab {
/// Build a new analyzer from the given options.
///
/// # Arguments
/// * `dicdir` - Optional dictionary directory. Accepts a `str` or any `os.PathLike`
///   (e.g. `pathlib.Path`). When omitted, the builder is left at its own default.
/// * `with_ipa` - When true, `with_ipa(true)` is requested from the builder.
/// * `vector_path` - Optional path (`str` or `os.PathLike`) handed to the builder as the
///   vector pool; supplying it also turns on `with_vector(true)`.
///
/// # Errors
/// Raises `RuntimeError` ("Failed to load MeCrab: ...") when the builder fails.
#[new]
#[pyo3(signature = (dicdir=None, with_ipa=false, vector_path=None))]
fn new(dicdir: Option<PathBuf>, with_ipa: bool, vector_path: Option<PathBuf>) -> PyResult<Self> {
    let mut builder = crate::MeCrab::builder();

    // Only touch the builder for options the caller actually supplied, so the
    // builder's own defaults stay in effect otherwise.
    if let Some(dir) = dicdir {
        builder = builder.dicdir(Some(dir));
    }
    if with_ipa {
        builder = builder.with_ipa(true);
    }

    // Remember whether vectors were requested before the path is moved into the builder.
    let with_vector = vector_path.is_some();
    if let Some(pool) = vector_path {
        builder = builder.vector_pool(Some(pool)).with_vector(true);
    }

    builder
        .build()
        .map(|inner| Self {
            inner,
            with_ipa,
            with_vector,
        })
        .map_err(|e| PyRuntimeError::new_err(format!("Failed to load MeCrab: {e}")))
}
fn parse(&self, text: &str) -> PyResult<String> {
match self.inner.parse(text) {
Ok(result) => Ok(result.to_string()),
Err(e) => Err(PyRuntimeError::new_err(format!("Parse error: {e}"))),
}
}
fn wakati(&self, text: &str) -> PyResult<String> {
match self.inner.wakati(text) {
Ok(result) => Ok(result),
Err(e) => Err(PyRuntimeError::new_err(format!("Parse error: {e}"))),
}
}
fn parse_to_list(&self, text: &str) -> PyResult<Vec<(String, String)>> {
match self.inner.parse(text) {
Ok(result) => Ok(result
.morphemes
.iter()
.map(|m| (m.surface.clone(), m.feature.clone()))
.collect()),
Err(e) => Err(PyRuntimeError::new_err(format!("Parse error: {e}"))),
}
}
#[allow(clippy::doc_link_with_quotes)]
fn parse_to_dict<'py>(&self, py: Python<'py>, text: &str) -> PyResult<Vec<Bound<'py, PyDict>>> {
match self.inner.parse(text) {
Ok(result) => {
let dicts: Vec<Bound<'_, PyDict>> = result
.morphemes
.iter()
.map(|m| {
let dict = PyDict::new(py);
let _ = dict.set_item("surface", &m.surface);
let _ = dict.set_item("feature", &m.feature);
let parts: Vec<&str> = m.feature.split(',').collect();
if !parts.is_empty() {
let _ = dict.set_item("pos", parts[0]);
if parts.len() > 1 {
let _ = dict.set_item("pos1", parts[1]);
}
if parts.len() > 2 {
let _ = dict.set_item("pos2", parts[2]);
}
if parts.len() > 3 {
let _ = dict.set_item("pos3", parts[3]);
}
if parts.len() > 4 && parts[4] != "*" {
let _ = dict.set_item("inflection", parts[4]);
}
if parts.len() > 5 && parts[5] != "*" {
let _ = dict.set_item("conjugation", parts[5]);
}
if parts.len() > 6 && parts[6] != "*" {
let _ = dict.set_item("base", parts[6]);
}
if parts.len() > 7 && parts[7] != "*" {
let _ = dict.set_item("reading", parts[7]);
}
if parts.len() > 8 && parts[8] != "*" {
let _ = dict.set_item("pronunciation", parts[8]);
}
}
if let Some(ref ipa) = m.pronunciation {
let _ = dict.set_item("ipa", ipa.as_str());
}
if let Some(ref embedding) = m.embedding {
let _ = dict.set_item("embedding", embedding.clone());
}
dict
})
.collect();
Ok(dicts)
}
Err(e) => Err(PyRuntimeError::new_err(format!("Parse error: {e}"))),
}
}
fn parse_batch(&self, texts: Vec<String>) -> PyResult<Vec<String>> {
let refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
let results: Result<Vec<String>, _> = self
.inner
.parse_batch(&refs)
.into_iter()
.map(|r| r.map(|result| result.to_string()))
.collect();
results.map_err(|e| PyRuntimeError::new_err(format!("Parse error: {e}")))
}
fn wakati_batch(&self, texts: Vec<String>) -> PyResult<Vec<String>> {
let refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
let results: Result<Vec<String>, _> = self.inner.wakati_batch(&refs).into_iter().collect();
results.map_err(|e| PyRuntimeError::new_err(format!("Parse error: {e}")))
}
/// Pass a word entry to the wrapped analyzer's `add_word`.
///
/// All four arguments (`surface`, `reading`, `pronunciation`, `wcost`) are forwarded
/// unchanged, and nothing is returned.
// NOTE(review): judging by `overlay_size`, entries presumably land in an overlay
// dictionary — confirm against `crate::MeCrab::add_word`.
fn add_word(&self, surface: &str, reading: &str, pronunciation: &str, wcost: i16) {
self.inner.add_word(surface, reading, pronunciation, wcost);
}
/// Forward `surface` to the wrapped analyzer's `remove_word` and return its boolean result.
// NOTE(review): the boolean presumably reports whether an entry was removed —
// confirm against `crate::MeCrab::remove_word`.
fn remove_word(&self, surface: &str) -> bool {
self.inner.remove_word(surface)
}
/// Return the value reported by the wrapped analyzer's `overlay_size`.
// NOTE(review): presumably the number of words added via `add_word` — confirm.
fn overlay_size(&self) -> usize {
self.inner.overlay_size()
}
#[allow(clippy::doc_link_with_quotes)]
fn to_ipa(&self, text: &str) -> PyResult<Vec<String>> {
if !self.with_ipa {
return Err(PyRuntimeError::new_err(
"IPA support not enabled. Create MeCrab with with_ipa=True",
));
}
match self.inner.parse(text) {
Ok(result) => {
let ipas: Vec<String> = result
.morphemes
.iter()
.filter_map(|m| m.pronunciation.clone())
.collect();
Ok(ipas)
}
Err(e) => Err(PyRuntimeError::new_err(format!("Parse error: {e}"))),
}
}
/// Join the output of `to_ipa(text)` into a single string using `separator`
/// (a single space by default).
///
/// # Errors
/// Propagates any error raised by `to_ipa`.
#[pyo3(signature = (text, separator=" "))]
fn to_ipa_text(&self, text: &str, separator: &str) -> PyResult<String> {
    self.to_ipa(text).map(|segments| segments.join(separator))
}
/// Cosine similarity between the embeddings of `word1` and `word2`.
///
/// Each word is parsed on its own and the embedding of its *first* morpheme is used.
///
/// # Errors
/// Raises `RuntimeError` when:
/// * the instance was created without `vector_path`;
/// * either word fails to parse;
/// * the first morpheme of either word has no embedding (or there is no morpheme);
/// * the cosine similarity cannot be computed.
fn similarity(&self, word1: &str, word2: &str) -> PyResult<f32> {
    if !self.with_vector {
        return Err(PyRuntimeError::new_err(
            "Vector support not enabled. Create MeCrab with vector_path parameter",
        ));
    }

    // Both words are parsed before either embedding is looked up, so a parse
    // failure on word2 is reported ahead of a missing embedding for word1.
    let parsed1 = match self.inner.parse(word1) {
        Ok(parsed) => parsed,
        Err(err) => {
            return Err(PyRuntimeError::new_err(format!(
                "Parse error for word1: {err}"
            )))
        }
    };
    let parsed2 = match self.inner.parse(word2) {
        Ok(parsed) => parsed,
        Err(err) => {
            return Err(PyRuntimeError::new_err(format!(
                "Parse error for word2: {err}"
            )))
        }
    };

    let vec1 = match parsed1.morphemes.first().and_then(|m| m.embedding.as_ref()) {
        Some(embedding) => embedding,
        None => {
            return Err(PyRuntimeError::new_err(format!(
                "No embedding found for word1: '{word1}' (may be out-of-vocabulary)"
            )))
        }
    };
    let vec2 = match parsed2.morphemes.first().and_then(|m| m.embedding.as_ref()) {
        Some(embedding) => embedding,
        None => {
            return Err(PyRuntimeError::new_err(format!(
                "No embedding found for word2: '{word2}' (may be out-of-vocabulary)"
            )))
        }
    };

    match crate::vectors::VectorStore::cosine_similarity(vec1, vec2) {
        Some(score) => Ok(score),
        None => Err(PyRuntimeError::new_err(
            "Failed to compute cosine similarity (zero vectors?)",
        )),
    }
}
}
/// Morpheme record exposed to Python.
///
/// Each field has a `#[pyo3(get)]` getter and no setter, so all four are readable but
/// not assignable from Python.
// NOTE(review): `pos_id` and `wcost` presumably mirror the dictionary's part-of-speech
// id and word cost — confirm against the core morpheme type.
#[pyclass]
pub struct PyMorpheme {
/// Surface string; used as the first component of `__repr__` and `__str__`.
#[pyo3(get)]
pub surface: String,
/// Feature string; used as the second component of `__repr__` and `__str__`.
#[pyo3(get)]
pub feature: String,
/// Unsigned 16-bit id, readable from Python as `pos_id`.
#[pyo3(get)]
pub pos_id: u16,
/// Signed 16-bit cost, readable from Python as `wcost`.
#[pyo3(get)]
pub wcost: i16,
}
#[pymethods]
impl PyMorpheme {
    /// Debug representation: `Morpheme('<surface>', '<feature>')`.
    fn __repr__(&self) -> String {
        let Self {
            surface, feature, ..
        } = self;
        format!("Morpheme('{surface}', '{feature}')")
    }

    /// Display form: surface and feature separated by a single tab.
    fn __str__(&self) -> String {
        let Self {
            surface, feature, ..
        } = self;
        format!("{surface}\t{feature}")
    }
}
/// Return the crate version, taken from `CARGO_PKG_VERSION` at compile time.
#[pyfunction]
fn version() -> &'static str {
env!("CARGO_PKG_VERSION")
}
/// Initializer for the `mecrab` Python module.
///
/// Registers the `PyMeCrab` and `PyMorpheme` classes and the `version` function on the
/// module; any registration error is propagated to the caller.
#[pymodule]
fn mecrab(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<PyMeCrab>()?;
m.add_class::<PyMorpheme>()?;
m.add_function(wrap_pyfunction!(version, m)?)?;
Ok(())
}
// Unit tests that run without a Python interpreter.
#[cfg(test)]
mod tests {
use super::*;
// `version()` must return a non-empty string.
#[test]
fn test_version() {
assert!(!version().is_empty());
}
}