use std::path::{Path, PathBuf};
use std::process::Command;
use tentoku::database_path::find_database_path;
use tentoku::sqlite_dict::SqliteDictionary;
use tentoku::tokenizer::tokenize;
fn python_comparison_db_path() -> PathBuf {
if let Ok(path) = std::env::var("TENTOKU_PYTHON_DB") {
return PathBuf::from(path);
}
Path::new(env!("CARGO_MANIFEST_DIR"))
.join("reference")
.join("tentoku")
.join("data")
.join("jmdict.python.db")
}
/// Pick the JMDict database both tokenizers should read.
///
/// Precedence: `TENTOKU_COMPARE_DB` (only if the file exists), then the
/// Python reference database, then whatever `find_database_path`
/// discovers. Returns `None` when nothing is available.
fn shared_comparison_db_path() -> Option<PathBuf> {
    let explicit = std::env::var("TENTOKU_COMPARE_DB")
        .ok()
        .map(PathBuf::from)
        .filter(|candidate| candidate.exists());
    if explicit.is_some() {
        return explicit;
    }
    let python_db = python_comparison_db_path();
    if python_db.exists() {
        Some(python_db)
    } else {
        find_database_path()
    }
}
/// Ensure the Python reference checkout and its database are present,
/// running `scripts/setup_python_reference.sh` through `sh` when they
/// are not.
///
/// Returns `true` only when `reference/tentoku` and the Python DB both
/// exist after the (possible) setup run.
#[cfg(unix)]
fn ensure_python_reference() -> bool {
    let manifest = env!("CARGO_MANIFEST_DIR");
    let reference = Path::new(manifest).join("reference");
    let python_db = python_comparison_db_path();
    // Single readiness predicate, reused after the setup attempt.
    let ready = || reference.join("tentoku").exists() && python_db.exists();
    if ready() {
        return true;
    }
    let setup_script = Path::new(manifest)
        .join("scripts")
        .join("setup_python_reference.sh");
    if !setup_script.exists() {
        return false;
    }
    let script_succeeded = Command::new("sh")
        .arg(&setup_script)
        .arg(manifest)
        .current_dir(manifest)
        .env("TENTOKU_PYTHON_DB", &python_db)
        .status()
        .map(|status| status.success())
        .unwrap_or(false);
    // Trust the filesystem state, not just the script's exit code.
    script_succeeded && ready()
}
/// Non-Unix variant: there is no setup script to run, so only report
/// whether the reference checkout and the Python DB already exist.
#[cfg(not(unix))]
fn ensure_python_reference() -> bool {
    let reference_checkout = Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("reference")
        .join("tentoku");
    reference_checkout.exists() && python_comparison_db_path().exists()
}
/// Minimal token view shared by the Rust and Python tokenizer outputs:
/// the surface text plus start/end offsets in the input.
///
/// Derives `Clone` and `Eq` in addition to the original `Debug` and
/// `PartialEq` (all fields are `Eq`-capable), per the derive-eagerly
/// convention; existing comparisons are unaffected.
#[derive(Debug, Clone, PartialEq, Eq)]
struct SimpleToken {
    // Surface form of the token.
    text: String,
    // Start offset of the token in the input (unit — bytes vs
    // characters — depends on the tokenizer; TODO confirm both sides
    // agree).
    start: usize,
    // End offset of the token (one past the last position).
    end: usize,
}
/// Short Japanese sentences used as cross-validation inputs, covering
/// a spread of inflection patterns (polite/plain forms, past tense,
/// progressive, negatives, causative-passive, i-adjectives).
const SENTENCES: &[&str] = &[
    "私は学生です",
    "食べました",
    "食べています",
    "食べない",
    "食べた",
    "日本語を勉強しています",
    "高かった",
    "読みます",
    "見てください",
    "行かなかった",
    "東京へ行く",
    "本を読んでいます",
    "食べさせられた",
    "こんにちは",
    "走っている",
];
/// A multi-sentence paragraph exercising longer inputs than the
/// single-sentence cases; assembled with `concat!` so each clause stays
/// on its own source line.
const MEDIUM_PARAGRAPH: &str = concat!(
    "私は毎日日本語を勉強しています。図書館で本を読んだり、音楽を聞いたりします。",
    "友達と話すことが好きです。先生に質問することができます。",
    "日本の文化について学んでいます。食べ物の中でお寿司が一番好きです。",
    "東京に行きたいと思っています。日本語がだんだん上手になってきました。",
    "毎朝六時に起きて、学校まで電車で通っています。",
);
/// Tokenize `text` with the reference Python implementation by spawning
/// a Python interpreter (`python3`, overridable via `PYTHON_BIN`) on a
/// generated inline script.
///
/// Returns `None` when the interpreter is unavailable, the script
/// exits nonzero (stderr is echoed first), or the JSON output cannot
/// be parsed — callers treat `None` as "skip the comparison".
fn python_tokenize(text: &str, python_db_path: &std::path::Path) -> Option<Vec<SimpleToken>> {
    let py_bin = std::env::var("PYTHON_BIN").unwrap_or_else(|_| "python3".into());
    let manifest = env!("CARGO_MANIFEST_DIR");
    let reference = format!("{manifest}/reference");
    // `to_str` fails on non-UTF-8 paths; treat that as "skip" too.
    let db_str = python_db_path.to_str()?;
    // The `{:?}` specifiers render the Rust strings as quoted, escaped
    // literals which, for the inputs used here (UTF-8 paths and plain
    // Japanese text without control characters), are also valid Python
    // string literals, so they can be spliced into the script directly.
    let script = format!(
        "import sys, json\n\
         sys.path.insert(0, {reference:?})\n\
         from tentoku.sqlite_dict_optimized import OptimizedSQLiteDictionary\n\
         from tentoku.tokenizer import tokenize as py_tok\n\
         d = OptimizedSQLiteDictionary({db_str:?}, auto_build=False)\n\
         tokens = py_tok({text:?}, d)\n\
         print(json.dumps([{{\"text\": t.text, \"start\": t.start, \"end\": t.end}} for t in tokens]))\n",
    );
    let out = Command::new(&py_bin).arg("-c").arg(&script).output().ok()?;
    if !out.status.success() {
        eprintln!("python stderr: {}", String::from_utf8_lossy(&out.stderr));
        return None;
    }
    let parsed: Vec<serde_json::Value> = serde_json::from_slice(&out.stdout).ok()?;
    // Defaults ("" / 0) only kick in if a field is missing or mistyped;
    // the resulting token would then fail the Rust-vs-Python comparison.
    let tokens = parsed
        .into_iter()
        .map(|v| SimpleToken {
            text: v["text"].as_str().unwrap_or("").to_owned(),
            start: v["start"].as_u64().unwrap_or(0) as usize,
            end: v["end"].as_u64().unwrap_or(0) as usize,
        })
        .collect();
    Some(tokens)
}
/// Run the Rust tokenizer and project each token down to the fields
/// shared with the Python side.
fn rust_tokenize(text: &str, dict: &SqliteDictionary) -> Vec<SimpleToken> {
    let mut simplified = Vec::new();
    for token in tokenize(text, dict, 10) {
        simplified.push(SimpleToken {
            text: token.text,
            start: token.start,
            end: token.end,
        });
    }
    simplified
}
/// End-to-end comparison of the Rust tokenizer against the Python
/// reference over a fixed sentence set plus one medium paragraph.
///
/// Any missing piece of the shared environment (reference checkout,
/// database, Python interpreter) downgrades the test to a skip with an
/// explanatory message instead of failing.
#[test]
fn cross_validate_tokenization() {
    if !ensure_python_reference() {
        eprintln!(
            "SKIP cross_validate: Python reference not ready \
             (needs reference/tentoku and dedicated Python DB; \
             on Unix, scripts/setup_python_reference.sh prepares both)"
        );
        return;
    }
    let Some(db_path) = shared_comparison_db_path() else {
        eprintln!(
            "SKIP cross_validate: no shared JMDict database found \
             (set TENTOKU_COMPARE_DB, or prepare reference/tentoku/data/jmdict.python.db, \
             or set TENTOKU_DB)"
        );
        return;
    };
    let dict = match SqliteDictionary::open(&db_path) {
        Ok(dict) => dict,
        Err(err) => {
            eprintln!(
                "SKIP cross_validate: failed to open shared DB at {}: {err}",
                db_path.display()
            );
            return;
        }
    };
    // Both call sites emit the same skip message on Python failure, so
    // the fetch + message is shared.
    let fetch_python = |input: &str| {
        let tokens = python_tokenize(input, &db_path);
        if tokens.is_none() {
            eprintln!(
                "SKIP cross_validate: Python tokenizer unavailable or errored \
                 (set PYTHON_BIN if needed)"
            );
        }
        tokens
    };
    let mut mismatch_count = 0;
    for &sentence in SENTENCES {
        let rust_tokens = rust_tokenize(sentence, &dict);
        let Some(python_tokens) = fetch_python(sentence) else {
            return;
        };
        if rust_tokens != python_tokens {
            eprintln!("MISMATCH for {:?}", sentence);
            eprintln!("  Rust: {:?}", rust_tokens);
            eprintln!("  Python: {:?}", python_tokens);
            mismatch_count += 1;
        }
    }
    let rust_para = rust_tokenize(MEDIUM_PARAGRAPH, &dict);
    let Some(python_para) = fetch_python(MEDIUM_PARAGRAPH) else {
        return;
    };
    if rust_para != python_para {
        eprintln!("MISMATCH for MEDIUM_PARAGRAPH");
        eprintln!("  Rust: {} tokens", rust_para.len());
        eprintln!("  Python: {} tokens", python_para.len());
        mismatch_count += 1;
    }
    assert_eq!(
        mismatch_count, 0,
        "{mismatch_count} sentence(s) had token boundary mismatches between Rust and Python"
    );
}