// syntaxdot_tokenizers/lib.rs
1use ndarray::Array1;
2use udgraph::graph::Sentence;
3
4mod albert;
5pub use albert::AlbertTokenizer;
6
7mod bert;
8pub use bert::BertTokenizer;
9
10mod error;
11pub use error::TokenizerError;
12
13mod xlm_roberta;
14pub use xlm_roberta::XlmRobertaTokenizer;
15
16/// Trait for wordpiece tokenizers.
17pub trait Tokenize: Send + Sync {
18 /// Tokenize the tokens in a sentence into word pieces.
19 ///
20 /// Implementations **must** prefix the first piece corresponding to a
21 /// token by one or more special pieces marking the beginning of the
22 /// sentence. The representation of this piece can be used for special
23 /// purposes, such as classification or acting is the pseudo-root in
24 /// dependency parsing.
25 fn tokenize(&self, sentence: Sentence) -> SentenceWithPieces;
26}
27
28/// A sentence and its word pieces.
29#[derive(Debug, Eq, PartialEq)]
30pub struct SentenceWithPieces {
31 /// Word pieces in a sentence.
32 pub pieces: Array1<i64>,
33
34 /// Sentence graph.
35 pub sentence: Sentence,
36
37 /// The the offsets of tokens in `pieces`.
38 pub token_offsets: Vec<usize>,
39}