1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
// Copyright 2019 Google LLC. All Rights Reserved.
// Copyright 2019-2020 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//     http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::preprocessing::error::TokenizerError;
use crate::preprocessing::tokenizer::base_tokenizer::{Token, TokenRef};
use crate::preprocessing::tokenizer::tokenization_utils::{_clean_text, lowercase};
use crate::preprocessing::vocab::sentence_piece_vocab::{SentencePieceModel, SentencePieceVocab};
use crate::tokenization_utils::{decompose_nfkc, is_whitespace};
use crate::{MultiThreadedTokenizer, Tokenizer, Vocab};

pub struct SentencePieceTokenizer {
    model: SentencePieceModel,
    vocab: SentencePieceVocab,
    lower_case: bool,
}

impl SentencePieceTokenizer {
    pub fn from_file(
        path: &str,
        lower_case: bool,
    ) -> Result<SentencePieceTokenizer, TokenizerError> {
        let model = SentencePieceModel::from_file(path)?;
        let vocab = SentencePieceVocab::from_file(path)?;
        Ok(SentencePieceTokenizer {
            model,
            vocab,
            lower_case,
        })
    }

    pub fn from_existing_vocab_and_model(
        vocab: SentencePieceVocab,
        model: SentencePieceModel,
        lower_case: bool,
    ) -> SentencePieceTokenizer {
        SentencePieceTokenizer {
            model,
            vocab,
            lower_case,
        }
    }
}

impl Tokenizer<SentencePieceVocab> for SentencePieceTokenizer {
    fn vocab(&self) -> &SentencePieceVocab {
        &self.vocab
    }

    fn tokenize_to_tokens(&self, text: TokenRef) -> Vec<Token> {
        let mut token = text.to_owned();
        _clean_text(&mut token, true);
        decompose_nfkc(&mut token);
        if self.lower_case {
            lowercase(&mut token);
        }
        token.text = token.text.replace(|c: char| is_whitespace(&c), "\u{2581}");
        if !token.text.starts_with('\u{2581}') {
            token.text.insert(0, '\u{2581}');
            token.reference_offsets.insert(0, 0);
        };
        let output = self.model.decode_forward_token_ref(token.as_ref());
        let decoded = self.model.decode_backward(&output);
        self.model.parse_nodes_to_tokens(decoded)
    }

    fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String {
        tokens
            .into_iter()
            .map(|v| v.replace('\u{2581}', " "))
            .collect::<Vec<String>>()
            .join("")
    }
}

impl MultiThreadedTokenizer<SentencePieceVocab> for SentencePieceTokenizer {}