use std::fs;
use std::io::{self, BufRead, BufReader};

use clap::ArgMatches;
use lindera::tokenizer::tokenizer::{Token, Tokenizer};
use stringreader::StringReader;
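
/// Formats tokens as MeCab-compatible output: one `surface\tfeatures` line
/// per token, followed by a terminating `EOS` line. With the IPADIC feature
/// layout used here, a token line looks roughly like (illustrative):
///
///   すもも	名詞,一般,*,*,*,*,すもも,スモモ,スモモ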
fn format_mecab(tokens: Vec<Token>) -> String {
    let mut lines = Vec::new();
    for token in tokens {
        let line = format!(
            "{}\t{},{},{},{},{},{},{},{},{}",
            token.text,
            token.detail.pos_level1,
            token.detail.pos_level2,
            token.detail.pos_level3,
            token.detail.pos_level4,
            token.detail.conjugation_type,
            token.detail.conjugate_form,
            token.detail.base_form,
            token.detail.reading,
            token.detail.pronunciation
        );
        lines.push(line);
    }
    lines.push(String::from("EOS"));
    lines.join("\n")
}
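
/// Formats tokens as wakati-gaki: surface forms joined by single spaces,
/// with all part-of-speech detail dropped.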
fn format_wakati(tokens: Vec<Token>) -> String {
    let mut lines = Vec::new();
    for token in tokens {
        lines.push(token.text.to_string());
    }
    lines.join(" ")
}
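
/// Serializes the token list as pretty-printed JSON. The `unwrap` should be
/// safe in practice: serializing these plain data structures is not expected
/// to fail.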
fn format_json(tokens: Vec<Token>) -> String {
    serde_json::to_string_pretty(&tokens).unwrap()
}
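
/// Renders tokens in the requested output format (`mecab`, `wakati`, or
/// `json`); any other format name is rejected with an error.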
fn output(tokens: Vec<Token>, output_format: &str) -> Result<String, String> {
    match output_format {
        "mecab" => Ok(format_mecab(tokens)),
        "wakati" => Ok(format_wakati(tokens)),
        "json" => Ok(format_json(tokens)),
        _ => Err(format!("unsupported output format: {}", output_format)),
    }
}
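
/// Runs the `tokenize` subcommand: builds a tokenizer for the requested
/// MODE (`normal` or `search`), reads text from INPUT_FILE if given or from
/// stdin otherwise, and prints each input line's tokens in OUTPUT format.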
pub fn run_tokenize_cli(matches: &ArgMatches) -> Result<(), String> {
    let mode = matches.value_of("MODE").unwrap();
    let mut tokenizer = match mode {
        "normal" => Tokenizer::normal(),
        "search" => Tokenizer::for_search(),
        _ => return Err(format!("unsupported mode: {}", mode)),
    };
    let output_format = matches.value_of("OUTPUT").unwrap();
    if let Some(path) = matches.value_of("INPUT_FILE") {
        // Read the whole file up front, then tokenize it line by line.
        let input_text = fs::read_to_string(path).map_err(|e| e.to_string())?;
        let str_reader = StringReader::new(&input_text);
        let mut buf_reader = BufReader::new(str_reader);
        loop {
            let mut text = String::new();
            match buf_reader.read_line(&mut text) {
                Ok(size) => {
                    // `read_line` returns Ok(0) at end of input.
                    if size == 0 {
                        break;
                    }
                    text = text.trim().to_string();
                }
                Err(e) => return Err(e.to_string()),
            }
            let tokens = tokenizer.tokenize(&text);
            match output(tokens, output_format) {
                Ok(output) => println!("{}", output),
                Err(msg) => return Err(msg),
            }
        }
    } else {
        // No input file given: tokenize lines from stdin until EOF.
        loop {
            let mut text = String::new();
            match io::stdin().read_line(&mut text) {
                Ok(size) => {
                    if size == 0 {
                        break;
                    }
                    text = text.trim().to_string();
                }
                Err(e) => return Err(e.to_string()),
            }
            let tokens = tokenizer.tokenize(&text);
            match output(tokens, output_format) {
                Ok(output) => println!("{}", output),
                Err(msg) => return Err(msg),
            }
        }
    }

    Ok(())
}