tokenizers/decoders/
bpe.rs

1use crate::tokenizer::{Decoder, Result};
2
3use serde::{Deserialize, Serialize};
4
5#[derive(Deserialize, Clone, Debug, Serialize)]
6/// Allows decoding Original BPE by joining all the tokens and then replacing
7/// the suffix used to identify end-of-words by whitespaces
8#[serde(tag = "type")]
9#[non_exhaustive]
10pub struct BPEDecoder {
11    pub suffix: String,
12}
13
14impl BPEDecoder {
15    pub fn new(suffix: String) -> Self {
16        Self { suffix }
17    }
18}
19
20impl Default for BPEDecoder {
21    fn default() -> Self {
22        Self::new("</w>".into())
23    }
24}
25
26impl Decoder for BPEDecoder {
27    fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>> {
28        let n = tokens.len() - 1;
29        Ok(tokens
30            .into_iter()
31            .enumerate()
32            .map(|(i, token)| {
33                let replacement = if i == n { "" } else { " " };
34                token.replace(&self.suffix, replacement)
35            })
36            .collect())
37    }
38}