1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
use conllx::graph::Sentence;
use conllx::io::WriteSentence;
use failure::Error;
use crate::TaggerWrapper;
/// Buffers sentences, tags them in batches, and writes them out.
///
/// Sentences handed to the processor are collected in `buffer` and are
/// passed to the tagger in groups of at most `batch_size` sentences once
/// enough of them have accumulated.
pub struct SentProcessor<'a, W: WriteSentence> {
    /// Tagger used to annotate the buffered sentences.
    tagger: &'a TaggerWrapper,
    /// Destination for sentences after they have been tagged.
    writer: W,
    /// Maximum number of sentences passed to the tagger in one call.
    batch_size: usize,
    /// Number of batches to accumulate before the buffer is tagged.
    read_ahead: usize,
    /// Sentences waiting to be tagged, in the order they were received.
    buffer: Vec<Sentence>,
}
impl<'a, W> SentProcessor<'a, W>
where
    W: WriteSentence,
{
    /// Construct a processor that tags with `tagger` and writes to `writer`.
    ///
    /// Sentences are buffered until `batch_size * read_ahead` of them are
    /// available; they are then tagged in batches of at most `batch_size`
    /// sentences and written out.
    ///
    /// # Panics
    ///
    /// Panics if `batch_size` or `read_ahead` is zero.
    pub fn new(tagger: &'a TaggerWrapper, writer: W, batch_size: usize, read_ahead: usize) -> Self {
        assert!(batch_size > 0, "Batch size should at least be 1.");
        assert!(read_ahead > 0, "Read ahead should at least be 1.");

        SentProcessor {
            tagger,
            writer,
            batch_size,
            read_ahead,
            buffer: Vec::new(),
        }
    }

    /// Queue `sent` for tagging, flushing the buffer once it is full.
    ///
    /// # Errors
    ///
    /// Returns an error if tagging or writing the buffered sentences fails.
    pub fn process(&mut self, sent: Sentence) -> Result<(), Error> {
        self.buffer.push(sent);

        // Use `>=` rather than `==`: when a previous tagging attempt failed,
        // the buffer is retained and its length can exceed the threshold, in
        // which case an equality test would never trigger another flush.
        // The saturating product avoids overflow for very large settings.
        let threshold = self.batch_size.saturating_mul(self.read_ahead);
        if self.buffer.len() >= threshold {
            self.tag_buffered_sentences()?;
        }

        Ok(())
    }

    /// Tag every buffered sentence and write the results to the writer.
    ///
    /// If tagging fails, the buffer is left in place. If writing fails, the
    /// sentences that were not yet written are discarded.
    fn tag_buffered_sentences(&mut self) -> Result<(), Error> {
        // Sort references rather than the buffer itself: batches then hold
        // sentences of similar length, while the buffer keeps its original
        // order for writing.
        let mut sent_refs: Vec<_> = self.buffer.iter_mut().collect();
        sent_refs.sort_unstable_by_key(|s| s.len());

        for batch in sent_refs.chunks_mut(self.batch_size) {
            self.tagger.tag_sentences(batch)?;
        }

        // Draining empties the buffer (also on early return) but keeps its
        // allocation for the next round of sentences.
        for sent in self.buffer.drain(..) {
            self.writer.write_sentence(&sent)?;
        }

        Ok(())
    }
}
impl<'a, W> Drop for SentProcessor<'a, W>
where
    W: WriteSentence,
{
    /// Tag and write any sentences that are still buffered.
    ///
    /// `drop` cannot return an error, so a failure is reported on standard
    /// error instead.
    fn drop(&mut self) {
        // Nothing left to flush.
        if self.buffer.is_empty() {
            return;
        }

        let flushed = self.tag_buffered_sentences();
        if let Err(err) = flushed {
            eprintln!("Error tagging sentences: {}", err);
        }
    }
}