ichiran/
lib.rs

1//! Bindings for ichiran-cli.
2
3pub mod raw;
4mod rusty;
5
6pub use self::rusty::*;
7use std::{path::PathBuf, process::Command};
8use thiserror::Error;
9
10/// Crate error type.
11#[derive(Debug, Error)]
12pub enum IchiranError {
13    #[error("Error while trying to run ichiran-cli")]
14    CommandError(#[source] std::io::Error),
15    #[error("ichiran-cli output invalid utf-8")]
16    InvalidUtf8(#[from] std::string::FromUtf8Error),
17    #[error("Unexpected output from ichiran-cli")]
18    UnexpectedOutput(String),
19    #[error("ichiran-cli returned a non-zero exit code")]
20    IchiranError { stdout: String, stderr: String },
21    #[error("Error while deserializing ichiran-cli output")]
22    Deserialization(#[from] serde_path_to_error::Error<serde_json::Error>),
23}
24
/// Wrapper for ichiran-cli.
///
/// The wrapper only remembers where the binary lives; every query spawns a
/// fresh process. It is `Clone` so that several owners (for example worker
/// threads) can each hold their own handle.
#[derive(Debug, Clone)]
pub struct IchiranCli {
    /// Path of the `ichiran-cli` executable that is spawned for each query.
    cli_path: PathBuf,
}
30
31impl IchiranCli {
32    /// Takes a path to the `ichiran-cli` binary.
33    pub fn new(cli_path: PathBuf) -> Self {
34        Self { cli_path }
35    }
36
37    /// Calls and parses the output of `ichiran-cli -f`.
38    /// The optional limit argument defines the max number of alternative segmentations that are returned for each segment.
39    pub fn segment(&self, input: &str, limit: Option<u32>) -> Result<Vec<Segment>, IchiranError> {
40        let (stdout, _stderr) = if let Some(limit) = limit {
41            self.run(&["-f", "-l", &limit.to_string(), input])?
42        } else {
43            self.run(&["-f", input])?
44        };
45        let jd = &mut serde_json::Deserializer::from_str(&stdout);
46        let info: raw::FullSplitInfo = serde_path_to_error::deserialize(jd)?;
47        Ok(info.into())
48    }
49
50    /// Calls and parses the output of `ichiran-cli -i`.
51    pub fn romanize_with_info(&self, input: &str) -> Result<RomanizedWithInfo, IchiranError> {
52        let (stdout, _stderr) = self.run(&["-i", input])?;
53        let mut lines = stdout.lines();
54        let mut romanized = lines
55            .next()
56            .ok_or_else(|| IchiranError::UnexpectedOutput(stdout.clone()))?
57            .to_string();
58        let trimmed = romanized.trim_end().len();
59        romanized.truncate(trimmed);
60        let mut entries = vec![];
61        let mut word = None;
62        let mut alternatives = vec![];
63        while let Some(line) = lines.next() {
64            if line.is_empty() {
65                if let Some(w) = word {
66                    entries.push(RomanizedWithInfoEntry {
67                        word: w,
68                        alternatives,
69                    });
70                    word = None;
71                    alternatives = vec![];
72                }
73            } else {
74                if word.is_some() {
75                    alternatives.push(line.to_string());
76                } else {
77                    word = Some(line.to_string());
78                }
79            }
80        }
81        Ok(RomanizedWithInfo { romanized, entries })
82    }
83
84    /// Calls and parses the output of `ichiran-cli` without any flags.
85    pub fn romanize(&self, input: &str) -> Result<String, IchiranError> {
86        let (mut stdout, _stderr) = self.run(&[input])?;
87        // truncate to cut off the newline
88        let trimmed = stdout.trim_end().len();
89        stdout.truncate(trimmed);
90        Ok(stdout)
91    }
92
93    fn run(&self, args: &[&str]) -> Result<(String, String), IchiranError> {
94        let out = Command::new(&self.cli_path)
95            .args(args)
96            .output()
97            .map_err(IchiranError::CommandError)?;
98        let stdout = String::from_utf8(out.stdout)?;
99        let stderr = String::from_utf8(out.stderr)?;
100        if out.status.success() {
101            Ok((stdout, stderr))
102        } else {
103            Err(IchiranError::IchiranError { stdout, stderr })
104        }
105    }
106}
107
108#[derive(Debug, Clone, PartialEq, Eq)]
109pub struct RomanizedWithInfo {
110    pub romanized: String,
111    pub entries: Vec<RomanizedWithInfoEntry>,
112}
113
114#[derive(Debug, Clone, PartialEq, Eq)]
115pub struct RomanizedWithInfoEntry {
116    pub word: String,
117    pub alternatives: Vec<String>,
118}
119
120#[cfg(test)]
121mod test {
122    use super::*;
123
124    fn ichiran() -> IchiranCli {
125        IchiranCli::new(PathBuf::from("./data/ichiran-cli"))
126    }
127
128    #[test]
129    fn romanizes() {
130        let out = ichiran().romanize("").unwrap();
131        assert!(out.is_empty());
132
133        let out = ichiran().romanize("test").unwrap();
134        assert_eq!(out, "test");
135
136        let out = ichiran().romanize("一覧は最高だぞ").unwrap();
137        assert_eq!(out, "ichiran wa saikō da zo");
138    }
139
140    #[test]
141    fn romanizes_with_info() {
142        let out = ichiran().romanize_with_info("一覧は最高だぞ").unwrap();
143        assert_eq!(out.romanized, "ichiran wa saikō da zo");
144        assert_eq!(
145            out.entries[0],
146            RomanizedWithInfoEntry {
147                word: "* ichiran  一覧 【いちらん】".to_string(),
148                alternatives: vec![
149                    "1. [n,vs,vt] look; glance; sight; having a look at; looking over; glancing through; running one's eyes over".to_string(),
150                    "2. [n] summary; list; table; catalog; catalogue".to_string()
151                ]
152            }
153        );
154        assert_eq!(
155            out.entries[1],
156            RomanizedWithInfoEntry {
157                word: "* wa  は".to_string(),
158                alternatives: vec![
159                    "1. [prt] 《pronounced わ in modern Japanese》 indicates sentence topic"
160                        .to_string(),
161                    "2. [prt] indicates contrast with another option (stated or unstated)"
162                        .to_string(),
163                    "3. [prt] adds emphasis".to_string()
164                ]
165            }
166        );
167        assert_eq!(
168            out.entries[2],
169            RomanizedWithInfoEntry {
170                word: "* saikō  最高 【さいこう】".to_string(),
171                alternatives: vec![
172                    "1. [adj-no,adj-na,n] best; supreme; wonderful; finest".to_string(),
173                    "2. [n,adj-na,adj-no] highest; maximum; most; uppermost; supreme".to_string()
174                ]
175            }
176        );
177        assert_eq!(
178            out.entries[3],
179            RomanizedWithInfoEntry {
180                word: "* da  だ".to_string(),
181                alternatives: vec!["1. [aux-v,cop-da,cop] 《plain copula》 be; is".to_string()]
182            }
183        );
184        assert_eq!(out.entries.len(), 4);
185    }
186
187    #[test]
188    fn gets_full_split_info() {
189        let ichiran = ichiran();
190        let _segmented = ichiran.segment("一覧は最高だぞ", None).unwrap();
191    }
192
193    #[test]
194    fn uses_limit() {
195        let ichiran = ichiran();
196        let segmented = ichiran.segment("一人目", None).unwrap();
197        let Segment::Segmentations(segmentations) = &segmented[0] else {
198            panic!();
199        };
200        assert_eq!(segmentations.len(), 1);
201        let segmented = ichiran.segment("一人目", Some(2)).unwrap();
202        let Segment::Segmentations(segmentations) = &segmented[0] else {
203            panic!();
204        };
205        assert_eq!(segmentations.len(), 2);
206    }
207
208    #[test]
209    #[ignore = "takes a very long time, requires a book to test with from aozora bunko"]
210    fn book() {
211        let ichiran = ichiran();
212        let file = std::fs::read_to_string("./data/book").unwrap();
213        for (idx, line) in file.lines().enumerate() {
214            println!("{idx} {line}");
215            ichiran.segment(line, None).unwrap();
216        }
217    }
218}