use anyhow::{Context, Result};
use clap::{Args, Parser, ValueEnum};
use haqumei::{Haqumei, HaqumeiOptions, UnicodeNormalization};
use std::fs::File;
use std::io::{self, BufRead, IsTerminal, Write};
use std::path::PathBuf;
// Command-line interface definition, parsed with clap's derive API in `main`.
//
// NOTE: plain `//` comments are used deliberately in this struct. clap's
// derive macros turn `///` doc comments into user-visible help text, so
// switching comment styles here would change the program's `--help` output.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
#[command(propagate_version = true)]
struct Cli {
// Positional text to process as a single input. Mutually exclusive with
// `--input`; when present, `main` processes it once and exits.
#[arg(value_name = "TEXT", conflicts_with = "input")]
text: Option<String>,
// Input file, read and processed one line at a time. When neither TEXT nor
// this option is given, `main` reads from stdin instead.
#[arg(short, long, value_name = "FILE")]
input: Option<PathBuf>,
// Output file (created/truncated via `File::create`). Defaults to stdout.
#[arg(short, long, value_name = "FILE")]
output: Option<PathBuf>,
// Which conversion `process_line` runs for every input; defaults to `G2p`.
#[arg(short, long, value_enum, default_value_t = OutputMode::G2p)]
mode: OutputMode,
// Output encoding (`-f`): human-readable text (default) or JSON.
#[arg(short = 'f', long, value_enum, default_value_t = OutputFormat::Text)]
format: OutputFormat,
// Raises the fallback log filter from "error" to "info" (see `main`).
#[arg(short, long)]
verbose: bool,
// Dictionary location options, flattened into the top-level argument list.
#[command(flatten)]
dict: DictArgs,
// Engine tuning switches, flattened into the top-level argument list.
#[command(flatten)]
options: HaqumeiConfigArgs,
}
// Conversion selected with `--mode`. Each variant corresponds to exactly one
// `Haqumei` method invoked by `process_line`.
// (Plain `//` comments on purpose: clap would expose `///` text as help.)
#[derive(ValueEnum, Clone, Copy, Debug)]
enum OutputMode {
// `Haqumei::g2p`; text output joins the items with spaces.
G2p,
// `Haqumei::g2p_detailed`; text output joins the items with spaces.
G2pDetailed,
// `Haqumei::g2p_kana`; text output prints the result as-is.
Kana,
// `Haqumei::g2p_kana_per_word`; text output joins the items with spaces.
KanaPerWord,
// `Haqumei::g2p_per_word`; text output prints one `[a, b, ...]` group per word.
PerWord,
// `Haqumei::g2p_pairs`; text output prints `word<TAB>phonemes`, one per line.
Pairs,
// `Haqumei::g2p_mapping`; like `Pairs` with a `[UNK]`/`[IGN]`/`[OK] ` prefix.
Mapping,
// `Haqumei::g2p_mapping_detailed`; adds POS, pronunciation and accent columns.
MappingDetailed,
// `Haqumei::extract_fullcontext`; text output prints one label per line.
Fullcontext,
}
// Output encoding selected with `--format` / `-f`.
// (Plain `//` comments on purpose: clap would expose `///` text as help.)
#[derive(ValueEnum, Clone, Copy, Debug)]
enum OutputFormat {
// Mode-specific human-readable text (see `process_line`).
Text,
// The raw result serialised by `write_json`, one JSON document per line.
Json,
}
// Dictionary selection options, flattened into `Cli`.
// (Plain `//` comments on purpose: clap would expose `///` text as help.)
#[derive(Args, Debug)]
struct DictArgs {
// Directory of a custom system dictionary. When absent, `main` falls back to
// `Haqumei::with_options`, i.e. the built-in dictionary.
#[arg(long, value_name = "DIR")]
dict_dir: Option<PathBuf>,
// User dictionary file. NOTE: `main` only passes this to the engine when
// `dict_dir` is also given (via `Haqumei::from_path_with_userdict`).
#[arg(long, value_name = "FILE")]
user_dict: Option<PathBuf>,
}
// Engine tuning switches, flattened into `Cli` and copied into
// `HaqumeiOptions` by `main`.
//
// Flags without a `no_` prefix are forwarded unchanged to the option of the
// same name. Flags with a `no_` prefix are negated before being stored, so
// the corresponding option is `true` unless the flag is passed. The precise
// effect of each option is defined by the `haqumei` crate, not here.
// (Plain `//` comments on purpose: clap would expose `///` text as help.)
#[derive(Args, Debug)]
struct HaqumeiConfigArgs {
// Converted into `haqumei::UnicodeNormalization`; defaults to `None`.
#[arg(long, value_enum, default_value_t = UnicodeNorm::None)]
normalize_unicode: UnicodeNorm,
// Forwarded as `HaqumeiOptions::use_read_as_pron`.
#[arg(long)]
use_read_as_pron: bool,
// Forwarded as `HaqumeiOptions::revert_long_vowels`.
#[arg(long)]
revert_long_vowels: bool,
// Forwarded as `HaqumeiOptions::revert_yotsugana`.
#[arg(long)]
revert_yotsugana: bool,
// Negated into `HaqumeiOptions::modify_filler_accent`.
#[arg(long)]
no_modify_filler_accent: bool,
// Negated into `HaqumeiOptions::predict_nani`.
#[arg(long)]
no_predict_nani: bool,
// Forwarded as `HaqumeiOptions::use_unidic_yomi`.
#[arg(long)]
use_unidic_yomi: bool,
// Negated into `HaqumeiOptions::retreat_acc_nuc`.
#[arg(long)]
no_retreat_acc_nuc: bool,
// Negated into `HaqumeiOptions::modify_acc_after_chaining`.
#[arg(long)]
no_modify_acc_after_chaining: bool,
// Negated into `HaqumeiOptions::process_odoriji`.
#[arg(long)]
no_process_odoriji: bool,
}
// CLI-side mirror of `haqumei::UnicodeNormalization`, so the choice can be
// parsed by clap (`ValueEnum`) without requiring the library type to
// implement clap traits. Converted with the `From` impl in this file.
// (Plain `//` comments on purpose: clap would expose `///` text as help.)
#[derive(ValueEnum, Clone, Debug)]
enum UnicodeNorm {
// Maps to `UnicodeNormalization::None`.
None,
// Maps to `UnicodeNormalization::Nfc`.
Nfc,
// Maps to `UnicodeNormalization::Nfkc`.
Nfkc,
}
impl From<UnicodeNorm> for UnicodeNormalization {
fn from(norm: UnicodeNorm) -> Self {
match norm {
UnicodeNorm::None => UnicodeNormalization::None,
UnicodeNorm::Nfc => UnicodeNormalization::Nfc,
UnicodeNorm::Nfkc => UnicodeNormalization::Nfkc,
}
}
}
fn main() -> Result<()> {
let cli = Cli::parse();
let default_log_level = if cli.verbose { "info" } else { "error" };
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(default_log_level))
.target(env_logger::Target::Stderr)
.init();
let haqumei_options = HaqumeiOptions {
normalize_unicode: cli.options.normalize_unicode.into(),
use_read_as_pron: cli.options.use_read_as_pron,
revert_long_vowels: cli.options.revert_long_vowels,
revert_yotsugana: cli.options.revert_yotsugana,
modify_filler_accent: !cli.options.no_modify_filler_accent,
predict_nani: !cli.options.no_predict_nani,
use_unidic_yomi: cli.options.use_unidic_yomi,
retreat_acc_nuc: !cli.options.no_retreat_acc_nuc,
modify_acc_after_chaining: !cli.options.no_modify_acc_after_chaining,
process_odoriji: !cli.options.no_process_odoriji,
..Default::default()
};
let mut haqumei = match (cli.dict.dict_dir, cli.dict.user_dict) {
(Some(dict), Some(user_dict)) => {
Haqumei::from_path_with_userdict(dict, user_dict, haqumei_options)
.context("Failed to load dictionary and user dictionary")?
}
(Some(dict), None) => {
Haqumei::from_path(dict, haqumei_options).context("Failed to load custom dictionary")?
}
_ => Haqumei::with_options(haqumei_options)
.context("Failed to initialize with built-in dictionary")?,
};
let mut writer: Box<dyn Write> = match cli.output {
Some(path) => {
let file = File::create(&path)
.with_context(|| format!("Failed to create output file: {:?}", path))?;
Box::new(io::BufWriter::new(file))
}
None => Box::new(io::BufWriter::new(io::stdout())),
};
if let Some(text) = cli.text.as_deref() {
process_line(&mut haqumei, text, &cli.mode, &cli.format, &mut writer)?;
} else if let Some(input_path) = cli.input {
let file = File::open(&input_path)
.with_context(|| format!("Failed to open input file: {:?}", input_path))?;
let reader = io::BufReader::new(file);
for line in reader.lines() {
let line = line.context("Failed to read line from file")?;
if line.trim().is_empty() {
writeln!(writer)?;
continue;
}
process_line(&mut haqumei, &line, &cli.mode, &cli.format, &mut writer)?;
}
} else {
let stdin = io::stdin();
let stdout = io::stdout();
let is_repl = stdin.is_terminal() && stdout.is_terminal();
if is_repl {
eprintln!("Enter text to process (Ctrl+C or Ctrl+D to exit):");
loop {
eprint!("> ");
io::stderr().flush()?;
let mut line = String::new();
let bytes = stdin.read_line(&mut line)?;
if bytes == 0 {
break; }
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
process_line(&mut haqumei, trimmed, &cli.mode, &cli.format, &mut writer)?;
writer.flush()?;
}
} else {
for line in stdin.lock().lines() {
let line = line.context("Failed to read line from stdin")?;
if line.trim().is_empty() {
writeln!(writer)?;
continue;
}
process_line(&mut haqumei, &line, &cli.mode, &cli.format, &mut writer)?;
}
}
}
writer.flush()?;
Ok(())
}
/// Serialises `data` as a single compact JSON document onto `writer` and
/// terminates it with a newline, giving one JSON value per output line.
///
/// `T` may be unsized (e.g. `str` or a slice), matching what
/// `serde_json::to_writer` accepts.
///
/// # Errors
///
/// Returns an error if serialisation fails or if writing to `writer` fails.
fn write_json<T>(writer: &mut dyn Write, data: &T) -> Result<()>
where
    T: serde::Serialize + ?Sized,
{
    // Reborrow so that `writer` is still usable for the trailing newline.
    serde_json::to_writer(&mut *writer, data).context("Failed to write JSON output")?;
    writeln!(writer)?;
    Ok(())
}
fn process_line(
haqumei: &mut Haqumei,
text: &str,
mode: &OutputMode,
format: &OutputFormat,
writer: &mut dyn Write,
) -> Result<()> {
match mode {
OutputMode::G2p => {
let res = haqumei.g2p(text)?;
match format {
OutputFormat::Text => writeln!(writer, "{}", res.join(" "))?,
OutputFormat::Json => write_json(writer, &res)?,
}
}
OutputMode::G2pDetailed => {
let res = haqumei.g2p_detailed(text)?;
match format {
OutputFormat::Text => writeln!(writer, "{}", res.join(" "))?,
OutputFormat::Json => write_json(writer, &res)?,
}
}
OutputMode::Kana => {
let res = haqumei.g2p_kana(text)?;
match format {
OutputFormat::Text => writeln!(writer, "{}", res)?,
OutputFormat::Json => write_json(writer, &res)?,
}
}
OutputMode::KanaPerWord => {
let res = haqumei.g2p_kana_per_word(text)?;
match format {
OutputFormat::Text => writeln!(writer, "{}", res.join(" "))?,
OutputFormat::Json => write_json(writer, &res)?,
}
}
OutputMode::PerWord => {
let res = haqumei.g2p_per_word(text)?;
match format {
OutputFormat::Text => {
let formatted: Vec<String> = res
.into_iter()
.map(|phonemes| format!("[{}]", phonemes.join(", ")))
.collect();
writeln!(writer, "{}", formatted.join(" "))?;
}
OutputFormat::Json => write_json(writer, &res)?,
}
}
OutputMode::Pairs => {
let res = haqumei.g2p_pairs(text)?;
match format {
OutputFormat::Text => {
for pair in res {
writeln!(writer, "{}\t{}", pair.word, pair.phonemes.join(" "))?;
}
}
OutputFormat::Json => write_json(writer, &res)?,
}
}
OutputMode::Mapping => {
let res = haqumei.g2p_mapping(text)?;
match format {
OutputFormat::Text => {
for map in res {
let status = if map.is_unknown {
"[UNK]"
} else if map.is_ignored {
"[IGN]"
} else {
"[OK] "
};
writeln!(
writer,
"{} {}\t{}",
status,
map.word,
map.phonemes.join(" ")
)?;
}
}
OutputFormat::Json => write_json(writer, &res)?,
}
}
OutputMode::MappingDetailed => {
let res = haqumei.g2p_mapping_detailed(text)?;
match format {
OutputFormat::Text => {
for detail in res {
let status = if detail.is_unknown {
"[UNK]"
} else if detail.is_ignored {
"[IGN]"
} else {
"[OK] "
};
writeln!(
writer,
"{} {}\tPOS:{}\tPRON:{}\tACC:{}/{}",
status,
detail.word,
detail.pos,
detail.pron,
detail.accent_nucleus,
detail.mora_count
)?;
}
}
OutputFormat::Json => write_json(writer, &res)?,
}
}
OutputMode::Fullcontext => {
let res = haqumei.extract_fullcontext(text)?;
match format {
OutputFormat::Text => {
for label in res {
writeln!(writer, "{}", label)?;
}
}
OutputFormat::Json => write_json(writer, &res)?,
}
}
}
Ok(())
}