//! `claude_tokenizer` — lib.rs

use std::fs::{self, File};
use std::io::{Read, Write};
use std::path::Path;
use std::str::FromStr;
use std::sync::Mutex;

use anyhow::Result;
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::tokenizer::{EncodeInput, Tokenizer};
use tokenizers::InputSequence;
11
12pub fn get_tokenizer() -> Tokenizer {
13    let tokenizer_data = include_bytes!("claude-v3-tokenizer.json");
14
15    Tokenizer::from_bytes(tokenizer_data)
16        .unwrap()
17}
18
19pub fn tokenize(text: &str) -> Result<Vec<(u32, String)>> {
20    let tokenizer = get_tokenizer();
21
22    let val = EncodeInput::Single(InputSequence::Raw(text.into()));
23
24    let encoded_text =
25        tokenizer.encode(
26            val,
27            false,
28        );
29
30    match encoded_text {
31        Ok(encoded_text) =>
32            Ok(
33                encoded_text
34                    .get_ids()
35                    .iter()
36                    .zip(
37                        encoded_text
38                            .get_tokens()
39                            .iter()
40                            .cloned(),
41                    )
42                    .map(|(id, token)| (*id, token.to_string()))
43                    .collect()
44            ),
45        Err(err) =>
46            Err(
47                anyhow::Error::msg(
48                    err.to_string(),
49                )
50            )
51    }
52}
53
54pub fn count_tokens(text: &str) -> Result<usize> {
55    let tokenizer = get_tokenizer();
56
57    let val = EncodeInput::Single(InputSequence::Raw(text.into()));
58
59    let encoded_text =
60        tokenizer.encode(
61            val,
62            false,
63        );
64
65    match encoded_text {
66        Ok(encoded_text) => {
67            Ok(
68                encoded_text.len()
69            )
70        }
71        Err(err) =>
72            Err(
73                anyhow::Error::msg(
74                    err.to_string(),
75                )
76            ),
77    }
78}