active_call/offline/sensevoice/
tokenizer.rs

1use std::fs::File;
2use std::io::{BufRead, BufReader};
3use std::path::Path;
4
5use anyhow::{anyhow, Context, Result};
6
/// Lookup table that turns token ids into their text pieces.
///
/// The table is dense: the piece for token `id` is stored at `pieces[id]`.
#[derive(Debug, Clone)]
pub struct TokenDecoder {
    /// Token text indexed by token id.
    pieces: Vec<String>,
}
10
11impl TokenDecoder {
12    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self> {
13        let file = File::open(path.as_ref())
14            .with_context(|| format!("open tokens file {}", path.as_ref().display()))?;
15        let mut pieces: Vec<String> = Vec::new();
16        for (line_idx, line) in BufReader::new(file).lines().enumerate() {
17            let line = line.with_context(|| format!("read line {}", line_idx + 1))?;
18            let trimmed = line.trim();
19            if trimmed.is_empty() {
20                continue;
21            }
22            let (token, id_str) = trimmed
23                .rsplit_once(' ')
24                .ok_or_else(|| anyhow!("invalid tokens entry: '{}'", trimmed))?;
25            let id: usize = id_str
26                .parse()
27                .with_context(|| format!("parse token id at line {}", line_idx + 1))?;
28            if id >= pieces.len() {
29                pieces.resize(id + 1, String::new());
30            }
31            pieces[id] = token.to_owned();
32        }
33        if pieces.is_empty() {
34            anyhow::bail!("tokens list is empty");
35        }
36        for (idx, piece) in pieces.iter().enumerate() {
37            if piece.is_empty() {
38                anyhow::bail!("missing token for id {}", idx);
39            }
40        }
41        Ok(Self { pieces })
42    }
43
44    pub fn decode_ids(&self, ids: &[i32]) -> String {
45        let mut text = String::new();
46        for &id in ids {
47            if id < 0 {
48                continue;
49            }
50            let idx = id as usize;
51            if idx >= self.pieces.len() {
52                continue;
53            }
54            let piece = &self.pieces[idx];
55            if piece == "<unk>" || piece == "<s>" || piece == "</s>" {
56                continue;
57            }
58            if piece.starts_with('<') && piece.ends_with('>') {
59                continue;
60            }
61            if let Some(stripped) = piece.strip_prefix('▁') {
62                if !text.is_empty() {
63                    text.push(' ');
64                }
65                text.push_str(stripped);
66            } else {
67                text.push_str(piece);
68            }
69        }
70        text.trim().to_string()
71    }
72}