bpe-tokenizer 0.1.4

A BPE Tokenizer library.
Documentation
1
2
3
4
5
6
7
8
9
10
11
/// The character used to denote word breaks in the tokenized output.
///
/// The value is U+2581 (LOWER ONE EIGHTH BLOCK), the word-boundary marker
/// used by SentencePiece-style BPE vocabularies. It is written as a Unicode
/// escape so the constant cannot be silently emptied by an encoding
/// round-trip.
///
/// NOTE(review): the literal was empty in the reviewed copy, most likely a
/// non-ASCII character that was lost; confirm U+2581 against the bundled
/// vocabulary files.
pub(crate) const WORD_BREAK_CHAR: &str = "\u{2581}";

/// The token used to mark the start of a sentence.
///
/// Crate-private; the value is the literal string `<s>`.
pub(crate) const SENTENCE_START_TOKEN: &str = "<s>";

/// The token used to mark the end of a sentence.
///
/// Crate-private; the value is the literal string `</s>`.
pub(crate) const SENTENCE_END_TOKEN: &str = "</s>";

/// The token used to represent unknown words or subwords.
///
/// Crate-private; the value is the literal string `<unk>`.
///
/// NOTE(review): presumably "unknown" means absent from the loaded
/// vocabulary; confirm at the use site.
pub(crate) const UNKNOWN_TOKEN: &str = "<unk>";