Skip to main content

rs_jptxt2tokens/
lib.rs

1use std::io;
2
3use io::BufWriter;
4use io::Write;
5
6use io::BufRead;
7
8use lindera::token::Token;
9
10use lindera::dictionary::Dictionary;
11
12use lindera::mode::Mode;
13use lindera::segmenter::Segmenter;
14
15pub fn tokens2writer<W>(tokens: &[Token], wtr: &mut W) -> Result<(), io::Error>
16where
17    W: Write,
18{
19    for tok in tokens {
20        let surface: &str = &tok.surface;
21        writeln!(wtr, "{surface}")?;
22    }
23    wtr.flush()
24}
25
26pub struct Tokenizer(pub lindera::tokenizer::Tokenizer);
27
28impl Tokenizer {
29    pub fn text2tokens2writer<W>(&self, txt: &str, wtr: &mut W) -> Result<(), io::Error>
30    where
31        W: Write,
32    {
33        let tokens: Vec<Token> = self.0.tokenize(txt).map_err(io::Error::other)?;
34        tokens2writer(&tokens, wtr)
35    }
36}
37
38impl Tokenizer {
39    pub fn lines2tokens2writer<I, W>(&self, lines: I, wtr: &mut W) -> Result<(), io::Error>
40    where
41        I: Iterator<Item = Result<String, io::Error>>,
42        W: Write,
43    {
44        for rline in lines {
45            let line: String = rline?;
46            self.text2tokens2writer(&line, wtr)?;
47        }
48        wtr.flush()
49    }
50}
51
52impl Tokenizer {
53    pub fn stdin2tokens2stdout(&self) -> Result<(), io::Error> {
54        let o = io::stdout();
55        let mut ol = o.lock();
56        self.lines2tokens2writer(io::stdin().lock().lines(), &mut BufWriter::new(&mut ol))?;
57        ol.flush()
58    }
59}
60
61impl Tokenizer {
62    pub fn new_default() -> Result<Self, io::Error> {
63        let dict: Dictionary =
64            lindera::dictionary::load_dictionary("embedded://ipadic").map_err(io::Error::other)?;
65        let segmenter: Segmenter = Segmenter::new(Mode::Normal, dict, None);
66        Ok(Self(lindera::tokenizer::Tokenizer::new(segmenter)))
67    }
68}
69
70pub fn stdin2tokens2stdout_default() -> Result<(), io::Error> {
71    let tok: Tokenizer = Tokenizer::new_default()?;
72    tok.stdin2tokens2stdout()
73}