//! Provider of a routine for tokenization.
use crate::dictionary::mapper::{ConnIdCounter, ConnIdProbs};
use crate::errors::Result;
use crate::sentence::Sentence;
use crate::token::{Token, TokenIter};
use crate::tokenizer::lattice::{Lattice, Node};
use crate::tokenizer::Tokenizer;

/// Provider of a routine for tokenization.
///
/// It holds the internal data structures used in tokenization,
/// which can be reused to avoid unnecessary memory reallocation.
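///
/// # Examples
///
/// A minimal sketch of the intended reuse pattern. It assumes a
/// `tokenizer` built elsewhere and a public `Tokenizer::new_worker()`
/// constructor (since [`Worker::new`] is crate-private):
///
/// ```ignore
/// let mut worker = tokenizer.new_worker();
/// // The sentence buffer and lattice are reused across iterations,
/// // avoiding repeated allocation.
/// for text in &["京都東京都", "自然言語処理"] {
///     worker.reset_sentence(text)?;
///     worker.tokenize();
///     println!("{} tokens", worker.num_tokens());
/// }
/// ```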
pub struct Worker<'a> {
    pub(crate) tokenizer: &'a Tokenizer,
    pub(crate) sent: Sentence,
    pub(crate) lattice: Lattice,
    pub(crate) top_nodes: Vec<(u16, Node)>,
    pub(crate) counter: Option<ConnIdCounter>,
}

impl<'a> Worker<'a> {
    /// Creates a new instance.
    pub(crate) fn new(tokenizer: &'a Tokenizer) -> Self {
        Self {
            tokenizer,
            sent: Sentence::new(),
            lattice: Lattice::default(),
            top_nodes: vec![],
            counter: None,
        }
    }

    /// Resets the input sentence to be tokenized.
    ///
    /// # Errors
    ///
    /// When the input sentence contains more characters than
    /// [`MAX_SENTENCE_LENGTH`](crate::common::MAX_SENTENCE_LENGTH),
    /// an error is returned.
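    ///
    /// # Examples
    ///
    /// A small sketch of the clearing behavior visible in this file:
    /// resetting to an empty input discards the previous result, so a
    /// subsequent [`Self::tokenize()`] is a no-op (`worker` obtained elsewhere):
    ///
    /// ```ignore
    /// worker.reset_sentence("")?;
    /// worker.tokenize();
    /// assert_eq!(worker.num_tokens(), 0);
    /// ```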
    pub fn reset_sentence<S>(&mut self, input: S) -> Result<()>
    where
        S: AsRef<str>,
    {
        self.sent.clear();
        self.top_nodes.clear();
        let input = input.as_ref();
        if !input.is_empty() {
            self.sent.set_sentence(input);
            self.sent.compile(self.tokenizer.dictionary().char_prop())?;
        }
        Ok(())
    }

    /// Tokenizes the input sentence set by [`Self::reset_sentence()`],
    /// storing the resultant tokens in this worker.
    pub fn tokenize(&mut self) {
        if self.sent.chars().is_empty() {
            return;
        }
        self.tokenizer.build_lattice(&self.sent, &mut self.lattice);
        self.lattice.append_top_nodes(&mut self.top_nodes);
    }

    /// Gets the number of resultant tokens.
    #[inline(always)]
    pub fn num_tokens(&self) -> usize {
        self.top_nodes.len()
    }

    /// Gets the `i`-th resultant token.
    ///
    /// `i` must be less than [`Self::num_tokens()`].
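    ///
    /// A sketch of positional access, assuming tokenization has already
    /// run and that [`Token`] exposes a `surface()` accessor:
    ///
    /// ```ignore
    /// let token = worker.token(0);
    /// println!("{}", token.surface());
    /// ```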
    #[inline(always)]
    pub fn token(&self, i: usize) -> Token {
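        // `top_nodes` stores the resultant nodes in reverse order,
        // so map `i` to the corresponding position from the end.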
        let index = self.num_tokens() - i - 1;
        Token::new(self, index)
    }

    /// Creates an iterator of resultant tokens.
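    ///
    /// A sketch of iterating over the result of the last tokenization
    /// (assuming `surface()` and `feature()` accessors on [`Token`]):
    ///
    /// ```ignore
    /// for token in worker.token_iter() {
    ///     println!("{}\t{}", token.surface(), token.feature());
    /// }
    /// ```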
    #[inline(always)]
    pub const fn token_iter(&'a self) -> TokenIter<'a> {
        TokenIter::new(self, 0)
    }

    /// Initializes a counter to compute occurrence probabilities of connection ids.
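    ///
    /// # Examples
    ///
    /// A sketch of the full counting workflow using only methods defined
    /// in this file (`worker` and the `texts` corpus obtained elsewhere):
    ///
    /// ```ignore
    /// worker.init_connid_counter();
    /// for text in texts {
    ///     worker.reset_sentence(text)?;
    ///     worker.tokenize();
    ///     worker.update_connid_counts();
    /// }
    /// let (left_probs, right_probs) = worker.compute_connid_probs();
    /// ```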
    pub fn init_connid_counter(&mut self) {
        let connector = self.tokenizer.dictionary().connector();
        self.counter = Some(ConnIdCounter::new(
            connector.num_left(),
            connector.num_right(),
        ));
    }

    /// Updates the frequencies of connection ids with the result of the
    /// last tokenization.
    ///
    /// # Panics
    ///
    /// It will panic when [`Self::init_connid_counter()`] has never been called.
    pub fn update_connid_counts(&mut self) {
        self.lattice
            .add_connid_counts(self.counter.as_mut().unwrap());
    }

    /// Computes the occurrence probabilities of connection ids,
    /// returning those for left- and right-ids.
    ///
    /// # Panics
    ///
    /// It will panic when [`Self::init_connid_counter()`] has never been called.
    pub fn compute_connid_probs(&self) -> (ConnIdProbs, ConnIdProbs) {
        self.counter.as_ref().unwrap().compute_probs()
    }
}