1use std::io;
2
3use io::BufWriter;
4use io::Write;
5
6use io::BufRead;
7
8use lindera::token::Token;
9
10use lindera::dictionary::Dictionary;
11
12use lindera::mode::Mode;
13use lindera::segmenter::Segmenter;
14
15pub fn tokens2writer<W>(tokens: &[Token], wtr: &mut W) -> Result<(), io::Error>
16where
17 W: Write,
18{
19 for tok in tokens {
20 let surface: &str = &tok.surface;
21 writeln!(wtr, "{surface}")?;
22 }
23 wtr.flush()
24}
25
26pub struct Tokenizer(pub lindera::tokenizer::Tokenizer);
27
28impl Tokenizer {
29 pub fn text2tokens2writer<W>(&self, txt: &str, wtr: &mut W) -> Result<(), io::Error>
30 where
31 W: Write,
32 {
33 let tokens: Vec<Token> = self.0.tokenize(txt).map_err(io::Error::other)?;
34 tokens2writer(&tokens, wtr)
35 }
36}
37
38impl Tokenizer {
39 pub fn lines2tokens2writer<I, W>(&self, lines: I, wtr: &mut W) -> Result<(), io::Error>
40 where
41 I: Iterator<Item = Result<String, io::Error>>,
42 W: Write,
43 {
44 for rline in lines {
45 let line: String = rline?;
46 self.text2tokens2writer(&line, wtr)?;
47 }
48 wtr.flush()
49 }
50}
51
52impl Tokenizer {
53 pub fn stdin2tokens2stdout(&self) -> Result<(), io::Error> {
54 let o = io::stdout();
55 let mut ol = o.lock();
56 self.lines2tokens2writer(io::stdin().lock().lines(), &mut BufWriter::new(&mut ol))?;
57 ol.flush()
58 }
59}
60
61impl Tokenizer {
62 pub fn new_default() -> Result<Self, io::Error> {
63 let dict: Dictionary =
64 lindera::dictionary::load_dictionary("embedded://ipadic").map_err(io::Error::other)?;
65 let segmenter: Segmenter = Segmenter::new(Mode::Normal, dict, None);
66 Ok(Self(lindera::tokenizer::Tokenizer::new(segmenter)))
67 }
68}
69
70pub fn stdin2tokens2stdout_default() -> Result<(), io::Error> {
71 let tok: Tokenizer = Tokenizer::new_default()?;
72 tok.stdin2tokens2stdout()
73}