use anyhow::Result;
use tokenizers::tokenizer::{EncodeInput, Tokenizer};
use tokenizers::InputSequence;
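
/// Builds a [`Tokenizer`] from the Claude v3 tokenizer definition embedded
/// in the binary at compile time.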
pub fn get_tokenizer() -> Tokenizer {
    // The tokenizer definition ships with the crate and is parsed on every call.
    let tokenizer_data = include_bytes!("claude-v3-tokenizer.json");

    Tokenizer::from_bytes(tokenizer_data)
        .expect("embedded tokenizer definition should be valid")
}
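
/// Encodes `text` and returns each token id paired with its string form.
/// Special tokens are not added.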
pub fn tokenize(text: &str) -> Result<Vec<(u32, String)>> {
    let tokenizer = get_tokenizer();

    let input = EncodeInput::Single(InputSequence::Raw(text.into()));

    // Encode without adding special tokens.
    let encoding = tokenizer
        .encode(input, false)
        .map_err(|err| anyhow::Error::msg(err.to_string()))?;

    Ok(encoding
        .get_ids()
        .iter()
        .zip(encoding.get_tokens().iter())
        .map(|(id, token)| (*id, token.clone()))
        .collect())
}
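
/// Returns the number of tokens produced by encoding `text`, without adding
/// special tokens.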
pub fn count_tokens(text: &str) -> Result<usize> {
    let tokenizer = get_tokenizer();

    let input = EncodeInput::Single(InputSequence::Raw(text.into()));

    // Encode without adding special tokens and report how many tokens were produced.
    let encoding = tokenizer
        .encode(input, false)
        .map_err(|err| anyhow::Error::msg(err.to_string()))?;

    Ok(encoding.len())
}
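
// A minimal smoke-test sketch. It only checks structural invariants (non-empty
// output and agreement between the two helpers), since the exact token ids
// depend on the bundled tokenizer definition.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenize_and_count_agree() {
        let text = "Hello, world!";

        let pairs = tokenize(text).expect("tokenization should succeed");
        let count = count_tokens(text).expect("counting should succeed");

        // A non-empty input should yield at least one token, and both helpers
        // encode the same way, so their lengths should match.
        assert!(!pairs.is_empty());
        assert_eq!(pairs.len(), count);
    }
}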