use std::fs;
use std::io;
use std::io::{BufRead, BufReader};

use clap::ArgMatches;
use lindera::tokenizer::tokenizer::{Token, Tokenizer};
use stringreader::StringReader;

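/// Formats tokens in MeCab-compatible output: one line per token consisting of
/// the surface form, a tab, and the nine comma-separated morphological details
/// (part-of-speech levels 1-4, conjugation type, conjugated form, base form,
/// reading, pronunciation), terminated by an "EOS" line.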
fn format_mecab(tokens: Vec<Token>) -> String {
    let mut lines = Vec::new();
    for token in tokens {
        let line = format!(
            "{}\t{},{},{},{},{},{},{},{},{}",
            token.text,
            token.detail.pos_level1,
            token.detail.pos_level2,
            token.detail.pos_level3,
            token.detail.pos_level4,
            token.detail.conjugation_type,
            token.detail.conjugate_form,
            token.detail.base_form,
            token.detail.reading,
            token.detail.pronunciation
        );
        lines.push(line);
    }
    lines.push(String::from("EOS"));

    lines.join("\n")
}

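/// Formats tokens as wakati-gaki (word-segmented) output: the surface forms
/// joined by single spaces.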
fn format_wakati(tokens: Vec<Token>) -> String {
    let mut lines = Vec::new();
    for token in tokens {
        let line = token.text.to_string();
        lines.push(line);
    }

    lines.join(" ")
}

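/// Serializes the token list as pretty-printed JSON.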
fn format_json(tokens: Vec<Token>) -> String {
    serde_json::to_string_pretty(&tokens).unwrap()
}

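/// Dispatches to the formatter selected by `output_format` ("mecab", "wakati"
/// or "json"); any other value yields an error message.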
fn output(tokens: Vec<Token>, output_format: &str) -> Result<String, String> {
    match output_format {
        "mecab" => Ok(format_mecab(tokens)),
        "wakati" => Ok(format_wakati(tokens)),
        "json" => Ok(format_json(tokens)),
        _ => Err(format!("unsupported output format: {}", output_format)),
    }
}

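/// Runs the tokenize CLI: builds a tokenizer for the requested mode, reads the
/// text to be tokenized either from the given input file or from stdin line by
/// line, and prints each result in the selected output format.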
pub fn run_tokenize_cli(matches: &ArgMatches) -> Result<(), String> {
    // create tokenizer
    let mode = matches.value_of("MODE").unwrap();
    let mut tokenizer = match mode {
        "normal" => Tokenizer::normal(),
        "search" => Tokenizer::for_search(),
        _ => return Err(format!("unsupported mode: {}", mode)),
    };

    // output format
    let output_format = matches.value_of("OUTPUT").unwrap();

    if matches.is_present("INPUT_FILE") {
        let mut input_text = String::new();
        if let Some(f) = matches.value_of("INPUT_FILE") {
            match fs::read_to_string(f) {
                Ok(t) => {
                    input_text = t;
                }
                Err(e) => {
                    return Err(e.to_string());
                }
            }
        }

        let str_reader = StringReader::new(&input_text);
        let mut buf_reader = BufReader::new(str_reader);

        loop {
            // read a line of text from the input file
            let mut text = String::new();
            match buf_reader.read_line(&mut text) {
                Ok(size) => {
                    if size == 0 {
                        // EOF
                        break;
                    }
                    text = text.trim().to_string();
                }
                Err(e) => {
                    // return error message
                    return Err(e.to_string());
                }
            }

            // tokenize
            let tokens = tokenizer.tokenize(&text);

            // output result
            match output(tokens, output_format) {
                Ok(output) => println!("{}", output),
                Err(msg) => println!("{}", msg),
            };
        }
    } else {
        loop {
            // read the text to be tokenized from stdin
            let mut text = String::new();
            match io::stdin().read_line(&mut text) {
                Ok(size) => {
                    if size == 0 {
                        // EOF
                        break;
                    }
                    text = text.trim().to_string();
                }
                Err(e) => {
                    // return error message
                    return Err(e.to_string());
                }
            }

            // tokenize
            let tokens = tokenizer.tokenize(&text);

            // output result
            match output(tokens, output_format) {
                Ok(output) => println!("{}", output),
                Err(msg) => println!("{}", msg),
            };
        }
    }

    Ok(())
}