use conllu::graph::Sentence;
use ndarray::Array1;

mod albert;
pub use albert::AlbertTokenizer;

mod bert;
pub use bert::BertTokenizer;

mod error;
pub use error::TokenizerError;

mod xlm_roberta;
pub use xlm_roberta::XlmRobertaTokenizer;

/// Trait for tokenizers that turn a sentence into word pieces.
pub trait Tokenize: Send + Sync {
    /// Tokenize the tokens in a sentence into word pieces.
    fn tokenize(&self, sentence: Sentence) -> SentenceWithPieces;
}
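
// A minimal sketch (not part of the original module) of a `Tokenize`
// implementation that maps every token to a single, fixed piece id.
// The name `FixedPieceTokenizer` is hypothetical, and the conllu
// accessors used here (`Sentence::iter`, `Node::token`) are assumptions
// about the conllu crate's API; real implementations such as
// `BertTokenizer` emit one or more pieces per token and may add
// special pieces.
#[cfg(test)]
#[allow(dead_code)]
mod tokenize_sketch {
    use conllu::graph::{Node, Sentence};
    use ndarray::Array1;

    use super::{SentenceWithPieces, Tokenize};

    struct FixedPieceTokenizer {
        piece_id: i64,
    }

    impl Tokenize for FixedPieceTokenizer {
        fn tokenize(&self, sentence: Sentence) -> SentenceWithPieces {
            let mut pieces = Vec::new();
            let mut token_offsets = Vec::new();

            // Record where each token's pieces start, then emit the
            // piece(s) for that token.
            for _token in sentence.iter().filter_map(Node::token) {
                token_offsets.push(pieces.len());
                pieces.push(self.piece_id);
            }

            SentenceWithPieces {
                pieces: Array1::from(pieces),
                sentence,
                token_offsets,
            }
        }
    }
}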
/// A sentence together with its word pieces.
pub struct SentenceWithPieces {
    /// Word piece identifiers of the sentence.
    pub pieces: Array1<i64>,
    /// The sentence itself.
    pub sentence: Sentence,
    /// The offset of each token in the piece list.
    pub token_offsets: Vec<usize>,
}
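
// Illustrative sketch (not part of the original module) showing how
// `token_offsets` is typically interpreted: entry `i` is assumed to be
// the index of the first piece of token `i`, so token `i` covers
// `token_offsets[i]..token_offsets[i + 1]` (or `..pieces.len()` for the
// last token). The helper name `token_piece_ranges` is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
mod offsets_sketch {
    use std::ops::Range;

    use super::SentenceWithPieces;

    /// Compute, for each token, the range of indices into `pieces`
    /// that holds the token's word pieces.
    fn token_piece_ranges(sentence: &SentenceWithPieces) -> Vec<Range<usize>> {
        let n_pieces = sentence.pieces.len();
        let offsets = &sentence.token_offsets;

        offsets
            .iter()
            .enumerate()
            .map(|(i, &start)| {
                // The next token's first piece (or the end of the piece
                // array) delimits the current token's pieces.
                let end = offsets.get(i + 1).copied().unwrap_or(n_pieces);
                start..end
            })
            .collect()
    }
}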