active_call/offline/sensevoice/
tokenizer.rs1use std::fs::File;
2use std::io::{BufRead, BufReader};
3use std::path::Path;
4
5use anyhow::{anyhow, Context, Result};
6
/// Maps integer token ids to their textual pieces.
///
/// The table is indexed directly by token id: `pieces[id]` is the piece for
/// that id.
#[derive(Debug, Clone)]
pub struct TokenDecoder {
    /// Token pieces, stored at the index equal to their id.
    pieces: Vec<String>,
}
10
11impl TokenDecoder {
12 pub fn new<P: AsRef<Path>>(path: P) -> Result<Self> {
13 let file = File::open(path.as_ref())
14 .with_context(|| format!("open tokens file {}", path.as_ref().display()))?;
15 let mut pieces: Vec<String> = Vec::new();
16 for (line_idx, line) in BufReader::new(file).lines().enumerate() {
17 let line = line.with_context(|| format!("read line {}", line_idx + 1))?;
18 let trimmed = line.trim();
19 if trimmed.is_empty() {
20 continue;
21 }
22 let (token, id_str) = trimmed
23 .rsplit_once(' ')
24 .ok_or_else(|| anyhow!("invalid tokens entry: '{}'", trimmed))?;
25 let id: usize = id_str
26 .parse()
27 .with_context(|| format!("parse token id at line {}", line_idx + 1))?;
28 if id >= pieces.len() {
29 pieces.resize(id + 1, String::new());
30 }
31 pieces[id] = token.to_owned();
32 }
33 if pieces.is_empty() {
34 anyhow::bail!("tokens list is empty");
35 }
36 for (idx, piece) in pieces.iter().enumerate() {
37 if piece.is_empty() {
38 anyhow::bail!("missing token for id {}", idx);
39 }
40 }
41 Ok(Self { pieces })
42 }
43
44 pub fn decode_ids(&self, ids: &[i32]) -> String {
45 let mut text = String::new();
46 for &id in ids {
47 if id < 0 {
48 continue;
49 }
50 let idx = id as usize;
51 if idx >= self.pieces.len() {
52 continue;
53 }
54 let piece = &self.pieces[idx];
55 if piece == "<unk>" || piece == "<s>" || piece == "</s>" {
56 continue;
57 }
58 if piece.starts_with('<') && piece.ends_with('>') {
59 continue;
60 }
61 if let Some(stripped) = piece.strip_prefix('▁') {
62 if !text.is_empty() {
63 text.push(' ');
64 }
65 text.push_str(stripped);
66 } else {
67 text.push_str(piece);
68 }
69 }
70 text.trim().to_string()
71 }
72}