1use std::io;
2
3use io::BufWriter;
4use io::Write;
5
6use io::BufRead;
7
8use serde::ser::{Serialize, SerializeSeq, Serializer};
9
10use lindera::token::Token;
11
12use lindera::dictionary::Dictionary;
13use lindera::dictionary::load_dictionary;
14
15use lindera::mode::Mode;
16use lindera::segmenter::Segmenter;
17
/// Newtype wrapper around [`lindera::tokenizer::Tokenizer`].
///
/// The wrapped tokenizer is exposed as the public field `0`; the inherent
/// methods on this type tokenize text and write the result as JSON.
pub struct Tokenizer(pub lindera::tokenizer::Tokenizer);
19
20pub struct Tokens<'a>(pub &'a [Token<'a>]);
21
22impl<'a> Serialize for Tokens<'a> {
23 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
24 where
25 S: Serializer,
26 {
27 let mut seq: S::SerializeSeq = serializer.serialize_seq(Some(self.0.len()))?;
28
29 for tok in self.0 {
30 let sur: &str = &tok.surface;
31 seq.serialize_element(sur)?;
32 }
33
34 seq.end()
35 }
36}
37
38impl<'a> Tokens<'a> {
39 pub fn to_writer<W>(&self, wtr: &mut W) -> Result<(), io::Error>
40 where
41 W: Write,
42 {
43 serde_json::to_writer(wtr.by_ref(), self)?;
44 writeln!(wtr)?;
45 Ok(())
46 }
47}
48
49impl Tokenizer {
50 pub fn txt2tokens2writer<W>(&self, txt: &str, wtr: &mut W) -> Result<(), io::Error>
51 where
52 W: Write,
53 {
54 let toks: Vec<Token> = self.0.tokenize(txt).map_err(io::Error::other)?;
55 Tokens(&toks).to_writer(wtr)?;
56 Ok(())
57 }
58}
59
60impl Tokenizer {
61 pub fn json2tokens2writer<W>(&self, jline: &[u8], wtr: &mut W) -> Result<(), io::Error>
62 where
63 W: Write,
64 {
65 let parsed: String = serde_json::from_slice(jline)?;
66 self.txt2tokens2writer(&parsed, wtr)
67 }
68}
69
70impl Tokenizer {
71 pub fn jsonl2tokens2writer<I, W>(&self, jsonl: I, wtr: &mut W) -> Result<(), io::Error>
72 where
73 I: Iterator<Item = Result<Vec<u8>, io::Error>>,
74 W: Write,
75 {
76 for rline in jsonl {
77 let line: Vec<u8> = rline?;
78 self.json2tokens2writer(&line, wtr)?;
79 }
80 wtr.flush()
81 }
82}
83
84impl Tokenizer {
85 pub fn stdin2jsonl2tokens2stdout(&self) -> Result<(), io::Error> {
86 let o = io::stdout();
87 let mut ol = o.lock();
88 self.jsonl2tokens2writer(
89 io::stdin().lock().split(b'\n'),
90 &mut BufWriter::new(&mut ol),
91 )?;
92 ol.flush()
93 }
94}
95
96impl Tokenizer {
97 pub fn new_default() -> Result<Self, io::Error> {
98 let dict: Dictionary = load_dictionary("embedded://ipadic").map_err(io::Error::other)?;
99 let segm: Segmenter = Segmenter::new(Mode::Normal, dict, None);
100 Ok(Self(lindera::tokenizer::Tokenizer::new(segm)))
101 }
102}
103
104pub fn stdin2jsonl2tokens2stdout_default() -> Result<(), io::Error> {
105 let tok: Tokenizer = Tokenizer::new_default()?;
106 tok.stdin2jsonl2tokens2stdout()
107}