Skip to main content

rs_jsontxt2token/
lib.rs

1use std::io;
2
3use io::BufWriter;
4use io::Write;
5
6use io::BufRead;
7
8use serde::ser::{Serialize, SerializeSeq, Serializer};
9
10use lindera::token::Token;
11
12use lindera::dictionary::Dictionary;
13use lindera::dictionary::load_dictionary;
14
15use lindera::mode::Mode;
16use lindera::segmenter::Segmenter;
17
18pub struct Tokenizer(pub lindera::tokenizer::Tokenizer);
19
20pub struct Tokens<'a>(pub &'a [Token<'a>]);
21
22impl<'a> Serialize for Tokens<'a> {
23    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
24    where
25        S: Serializer,
26    {
27        let mut seq: S::SerializeSeq = serializer.serialize_seq(Some(self.0.len()))?;
28
29        for tok in self.0 {
30            let sur: &str = &tok.surface;
31            seq.serialize_element(sur)?;
32        }
33
34        seq.end()
35    }
36}
37
38impl<'a> Tokens<'a> {
39    pub fn to_writer<W>(&self, wtr: &mut W) -> Result<(), io::Error>
40    where
41        W: Write,
42    {
43        serde_json::to_writer(wtr.by_ref(), self)?;
44        writeln!(wtr)?;
45        Ok(())
46    }
47}
48
49impl Tokenizer {
50    pub fn txt2tokens2writer<W>(&self, txt: &str, wtr: &mut W) -> Result<(), io::Error>
51    where
52        W: Write,
53    {
54        let toks: Vec<Token> = self.0.tokenize(txt).map_err(io::Error::other)?;
55        Tokens(&toks).to_writer(wtr)?;
56        Ok(())
57    }
58}
59
60impl Tokenizer {
61    pub fn json2tokens2writer<W>(&self, jline: &[u8], wtr: &mut W) -> Result<(), io::Error>
62    where
63        W: Write,
64    {
65        let parsed: String = serde_json::from_slice(jline)?;
66        self.txt2tokens2writer(&parsed, wtr)
67    }
68}
69
70impl Tokenizer {
71    pub fn jsonl2tokens2writer<I, W>(&self, jsonl: I, wtr: &mut W) -> Result<(), io::Error>
72    where
73        I: Iterator<Item = Result<Vec<u8>, io::Error>>,
74        W: Write,
75    {
76        for rline in jsonl {
77            let line: Vec<u8> = rline?;
78            self.json2tokens2writer(&line, wtr)?;
79        }
80        wtr.flush()
81    }
82}
83
84impl Tokenizer {
85    pub fn stdin2jsonl2tokens2stdout(&self) -> Result<(), io::Error> {
86        let o = io::stdout();
87        let mut ol = o.lock();
88        self.jsonl2tokens2writer(
89            io::stdin().lock().split(b'\n'),
90            &mut BufWriter::new(&mut ol),
91        )?;
92        ol.flush()
93    }
94}
95
96impl Tokenizer {
97    pub fn new_default() -> Result<Self, io::Error> {
98        let dict: Dictionary = load_dictionary("embedded://ipadic").map_err(io::Error::other)?;
99        let segm: Segmenter = Segmenter::new(Mode::Normal, dict, None);
100        Ok(Self(lindera::tokenizer::Tokenizer::new(segm)))
101    }
102}
103
104pub fn stdin2jsonl2tokens2stdout_default() -> Result<(), io::Error> {
105    let tok: Tokenizer = Tokenizer::new_default()?;
106    tok.stdin2jsonl2tokens2stdout()
107}