1pub mod raw;
4mod rusty;
5
6pub use self::rusty::*;
7use std::{path::PathBuf, process::Command};
8use thiserror::Error;
9
10#[derive(Debug, Error)]
12pub enum IchiranError {
13 #[error("Error while trying to run ichiran-cli")]
14 CommandError(#[source] std::io::Error),
15 #[error("ichiran-cli output invalid utf-8")]
16 InvalidUtf8(#[from] std::string::FromUtf8Error),
17 #[error("Unexpected output from ichiran-cli")]
18 UnexpectedOutput(String),
19 #[error("ichiran-cli returned a non-zero exit code")]
20 IchiranError { stdout: String, stderr: String },
21 #[error("Error while deserializing ichiran-cli output")]
22 Deserialization(#[from] serde_path_to_error::Error<serde_json::Error>),
23}
24
/// Handle to an `ichiran-cli` executable on disk.
///
/// The handle only remembers where the executable lives; a new process is started for every
/// request made through it.
#[derive(Debug)]
pub struct IchiranCli {
    /// Location of the `ichiran-cli` executable that is launched for each request.
    cli_path: PathBuf,
}
30
31impl IchiranCli {
32 pub fn new(cli_path: PathBuf) -> Self {
34 Self { cli_path }
35 }
36
37 pub fn segment(&self, input: &str, limit: Option<u32>) -> Result<Vec<Segment>, IchiranError> {
40 let (stdout, _stderr) = if let Some(limit) = limit {
41 self.run(&["-f", "-l", &limit.to_string(), input])?
42 } else {
43 self.run(&["-f", input])?
44 };
45 let jd = &mut serde_json::Deserializer::from_str(&stdout);
46 let info: raw::FullSplitInfo = serde_path_to_error::deserialize(jd)?;
47 Ok(info.into())
48 }
49
50 pub fn romanize_with_info(&self, input: &str) -> Result<RomanizedWithInfo, IchiranError> {
52 let (stdout, _stderr) = self.run(&["-i", input])?;
53 let mut lines = stdout.lines();
54 let mut romanized = lines
55 .next()
56 .ok_or_else(|| IchiranError::UnexpectedOutput(stdout.clone()))?
57 .to_string();
58 let trimmed = romanized.trim_end().len();
59 romanized.truncate(trimmed);
60 let mut entries = vec![];
61 let mut word = None;
62 let mut alternatives = vec![];
63 while let Some(line) = lines.next() {
64 if line.is_empty() {
65 if let Some(w) = word {
66 entries.push(RomanizedWithInfoEntry {
67 word: w,
68 alternatives,
69 });
70 word = None;
71 alternatives = vec![];
72 }
73 } else {
74 if word.is_some() {
75 alternatives.push(line.to_string());
76 } else {
77 word = Some(line.to_string());
78 }
79 }
80 }
81 Ok(RomanizedWithInfo { romanized, entries })
82 }
83
84 pub fn romanize(&self, input: &str) -> Result<String, IchiranError> {
86 let (mut stdout, _stderr) = self.run(&[input])?;
87 let trimmed = stdout.trim_end().len();
89 stdout.truncate(trimmed);
90 Ok(stdout)
91 }
92
93 fn run(&self, args: &[&str]) -> Result<(String, String), IchiranError> {
94 let out = Command::new(&self.cli_path)
95 .args(args)
96 .output()
97 .map_err(IchiranError::CommandError)?;
98 let stdout = String::from_utf8(out.stdout)?;
99 let stderr = String::from_utf8(out.stderr)?;
100 if out.status.success() {
101 Ok((stdout, stderr))
102 } else {
103 Err(IchiranError::IchiranError { stdout, stderr })
104 }
105 }
106}
107
108#[derive(Debug, Clone, PartialEq, Eq)]
109pub struct RomanizedWithInfo {
110 pub romanized: String,
111 pub entries: Vec<RomanizedWithInfoEntry>,
112}
113
114#[derive(Debug, Clone, PartialEq, Eq)]
115pub struct RomanizedWithInfoEntry {
116 pub word: String,
117 pub alternatives: Vec<String>,
118}
119
120#[cfg(test)]
121mod test {
122 use super::*;
123
124 fn ichiran() -> IchiranCli {
125 IchiranCli::new(PathBuf::from("./data/ichiran-cli"))
126 }
127
128 #[test]
129 fn romanizes() {
130 let out = ichiran().romanize("").unwrap();
131 assert!(out.is_empty());
132
133 let out = ichiran().romanize("test").unwrap();
134 assert_eq!(out, "test");
135
136 let out = ichiran().romanize("一覧は最高だぞ").unwrap();
137 assert_eq!(out, "ichiran wa saikō da zo");
138 }
139
140 #[test]
141 fn romanizes_with_info() {
142 let out = ichiran().romanize_with_info("一覧は最高だぞ").unwrap();
143 assert_eq!(out.romanized, "ichiran wa saikō da zo");
144 assert_eq!(
145 out.entries[0],
146 RomanizedWithInfoEntry {
147 word: "* ichiran 一覧 【いちらん】".to_string(),
148 alternatives: vec![
149 "1. [n,vs,vt] look; glance; sight; having a look at; looking over; glancing through; running one's eyes over".to_string(),
150 "2. [n] summary; list; table; catalog; catalogue".to_string()
151 ]
152 }
153 );
154 assert_eq!(
155 out.entries[1],
156 RomanizedWithInfoEntry {
157 word: "* wa は".to_string(),
158 alternatives: vec![
159 "1. [prt] 《pronounced わ in modern Japanese》 indicates sentence topic"
160 .to_string(),
161 "2. [prt] indicates contrast with another option (stated or unstated)"
162 .to_string(),
163 "3. [prt] adds emphasis".to_string()
164 ]
165 }
166 );
167 assert_eq!(
168 out.entries[2],
169 RomanizedWithInfoEntry {
170 word: "* saikō 最高 【さいこう】".to_string(),
171 alternatives: vec![
172 "1. [adj-no,adj-na,n] best; supreme; wonderful; finest".to_string(),
173 "2. [n,adj-na,adj-no] highest; maximum; most; uppermost; supreme".to_string()
174 ]
175 }
176 );
177 assert_eq!(
178 out.entries[3],
179 RomanizedWithInfoEntry {
180 word: "* da だ".to_string(),
181 alternatives: vec!["1. [aux-v,cop-da,cop] 《plain copula》 be; is".to_string()]
182 }
183 );
184 assert_eq!(out.entries.len(), 4);
185 }
186
187 #[test]
188 fn gets_full_split_info() {
189 let ichiran = ichiran();
190 let _segmented = ichiran.segment("一覧は最高だぞ", None).unwrap();
191 }
192
193 #[test]
194 fn uses_limit() {
195 let ichiran = ichiran();
196 let segmented = ichiran.segment("一人目", None).unwrap();
197 let Segment::Segmentations(segmentations) = &segmented[0] else {
198 panic!();
199 };
200 assert_eq!(segmentations.len(), 1);
201 let segmented = ichiran.segment("一人目", Some(2)).unwrap();
202 let Segment::Segmentations(segmentations) = &segmented[0] else {
203 panic!();
204 };
205 assert_eq!(segmentations.len(), 2);
206 }
207
208 #[test]
209 #[ignore = "takes a very long time, requires a book to test with from aozora bunko"]
210 fn book() {
211 let ichiran = ichiran();
212 let file = std::fs::read_to_string("./data/book").unwrap();
213 for (idx, line) in file.lines().enumerate() {
214 println!("{idx} {line}");
215 ichiran.segment(line, None).unwrap();
216 }
217 }
218}