syntaxdot_tokenizers/
lib.rs

1use ndarray::Array1;
2use udgraph::graph::Sentence;
3
4mod albert;
5pub use albert::AlbertTokenizer;
6
7mod bert;
8pub use bert::BertTokenizer;
9
10mod error;
11pub use error::TokenizerError;
12
13mod xlm_roberta;
14pub use xlm_roberta::XlmRobertaTokenizer;
15
16/// Trait for wordpiece tokenizers.
17pub trait Tokenize: Send + Sync {
18    /// Tokenize the tokens in a sentence into word pieces.
19    ///
20    /// Implementations **must** prefix the first piece corresponding to a
21    /// token by one or more special pieces marking the beginning of the
22    /// sentence. The representation of this piece can be used for special
23    /// purposes, such as classification or acting is the pseudo-root in
24    /// dependency parsing.
25    fn tokenize(&self, sentence: Sentence) -> SentenceWithPieces;
26}
27
28/// A sentence and its word pieces.
29#[derive(Debug, Eq, PartialEq)]
30pub struct SentenceWithPieces {
31    /// Word pieces in a sentence.
32    pub pieces: Array1<i64>,
33
34    /// Sentence graph.
35    pub sentence: Sentence,
36
37    /// The the offsets of tokens in `pieces`.
38    pub token_offsets: Vec<usize>,
39}