use once_cell::sync::Lazy;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use trustformers_core::errors::Result;
use trustformers_core::traits::{TokenizedInput, Tokenizer};
/// Configuration for [`ChemicalTokenizer`]: which chemical notations are
/// recognized and how encoded output is sized.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChemicalTokenizerConfig {
/// Maximum number of ids kept by `encode`/`encode_pair`; `None` disables truncation.
pub max_length: Option<usize>,
/// When true, the vocabulary is seeded with markers such as `[CLS]`, `[SEP]`, `[UNK]`.
pub include_special_tokens: bool,
/// Enable SMILES detection and tokenization.
pub tokenize_smiles: bool,
/// Enable InChI detection and tokenization.
pub tokenize_inchi: bool,
/// Enable molecular-formula detection and tokenization.
pub tokenize_formulas: bool,
/// Enable whitespace tokenization of chemical names.
pub tokenize_names: bool,
/// Target vocabulary budget. NOTE(review): never enforced anywhere in this file — confirm intent.
pub vocab_size: Option<usize>,
/// NOTE(review): never read anywhere in this file; presumably reserved for future use.
pub preserve_structure: bool,
/// NOTE(review): never read anywhere in this file; matching is effectively case-sensitive today.
pub case_sensitive: bool,
}
impl Default for ChemicalTokenizerConfig {
fn default() -> Self {
Self {
max_length: Some(512),
include_special_tokens: true,
tokenize_smiles: true,
tokenize_inchi: true,
tokenize_formulas: true,
tokenize_names: true,
vocab_size: Some(10000),
preserve_structure: true,
case_sensitive: false,
}
}
}
/// Coarse category assigned to each [`ChemicalToken`].
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ChemicalTokenType {
/// An element symbol (or a `[`/`]` bracket-atom delimiter in SMILES).
AtomicSymbol,
/// A bond symbol: `-`, `=`, `#`, `:`.
Bond,
/// A SMILES ring-closure digit or `%NN` pair.
RingClosure,
/// A SMILES branch parenthesis.
Branch,
/// A stereocenter marker `@` / `@@`.
Stereochemistry,
/// A formal-charge run such as `+`, `++`, `--`.
Charge,
/// A recognized functional-group abbreviation (e.g. `COOH`).
FunctionalGroup,
/// A fragment of a molecular formula.
FormulaComponent,
/// Any token lexed from an InChI string.
InChILayer,
/// A whitespace-delimited word of a chemical name.
NameFragment,
/// Reserved for special marker tokens (not produced by the lexers in this file).
Special,
/// Anything unrecognized.
Unknown,
}
/// A single lexed token together with its span in the source text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChemicalToken {
/// The matched text.
pub text: String,
/// Coarse category of the token.
pub token_type: ChemicalTokenType,
/// Start byte offset of the token in the input.
pub start: usize,
/// End byte offset (exclusive) of the token in the input.
pub end: usize,
/// Optional chemistry metadata (atomic number, bond order, group name, ...).
pub metadata: Option<ChemicalTokenMetadata>,
}
/// Chemistry annotations attached to a token when they can be derived.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChemicalTokenMetadata {
/// Proton number for atomic-symbol tokens.
pub atomic_number: Option<u8>,
/// Bond order for bond tokens (1, 2, or 3).
pub bond_order: Option<u8>,
/// NOTE(review): never populated anywhere in this file.
pub ring_size: Option<u8>,
/// NOTE(review): never populated anywhere in this file.
pub formal_charge: Option<i8>,
/// Standard atomic weight for atomic-symbol tokens, when tabulated.
pub molecular_weight: Option<f64>,
/// Human-readable name for functional-group tokens.
pub iupac_info: Option<String>,
}
/// Tokenizer for chemical notations: SMILES, InChI, molecular formulas, and
/// free-text chemical names.
pub struct ChemicalTokenizer {
/// Behavior switches.
config: ChemicalTokenizerConfig,
/// token text -> id.
vocab: HashMap<String, u32>,
/// id -> token text (inverse of `vocab`).
id_to_token: HashMap<u32, String>,
/// Next unassigned id.
next_id: u32,
/// Element symbol -> atomic number (cloned from `ATOMIC_SYMBOLS`).
atomic_symbols: HashMap<String, u8>,
/// Abbreviation -> functional-group name (cloned from `FUNCTIONAL_GROUPS`).
functional_groups: HashMap<String, String>,
/// Compiled lexing patterns for SMILES input.
smiles_patterns: Vec<Regex>,
/// Compiled lexing patterns for InChI input.
inchi_patterns: Vec<Regex>,
/// Compiled lexing patterns for molecular formulas.
formula_patterns: Vec<Regex>,
}
/// Atomic numbers for elements 1–36 (H through Kr), keyed by element symbol.
static ATOMIC_SYMBOLS: Lazy<HashMap<String, u8>> = Lazy::new(|| {
    [
        ("H", 1), ("He", 2), ("Li", 3), ("Be", 4), ("B", 5), ("C", 6),
        ("N", 7), ("O", 8), ("F", 9), ("Ne", 10), ("Na", 11), ("Mg", 12),
        ("Al", 13), ("Si", 14), ("P", 15), ("S", 16), ("Cl", 17), ("Ar", 18),
        ("K", 19), ("Ca", 20), ("Sc", 21), ("Ti", 22), ("V", 23), ("Cr", 24),
        ("Mn", 25), ("Fe", 26), ("Co", 27), ("Ni", 28), ("Cu", 29), ("Zn", 30),
        ("Ga", 31), ("Ge", 32), ("As", 33), ("Se", 34), ("Br", 35), ("Kr", 36),
    ]
    .iter()
    .map(|&(symbol, number)| (symbol.to_string(), number))
    .collect()
});
/// Human-readable names for common functional-group / substituent abbreviations.
static FUNCTIONAL_GROUPS: Lazy<HashMap<String, String>> = Lazy::new(|| {
    [
        ("OH", "hydroxyl"),
        ("NH2", "amino"),
        ("COOH", "carboxyl"),
        ("CHO", "aldehyde"),
        ("CO", "carbonyl"),
        ("SH", "thiol"),
        ("PO4", "phosphate"),
        ("SO4", "sulfate"),
        ("NO2", "nitro"),
        ("CN", "cyano"),
        ("CF3", "trifluoromethyl"),
        ("Ph", "phenyl"),
        ("Me", "methyl"),
        ("Et", "ethyl"),
        ("Pr", "propyl"),
        ("Bu", "butyl"),
    ]
    .iter()
    .map(|&(abbrev, name)| (abbrev.to_string(), name.to_string()))
    .collect()
});
impl Default for ChemicalTokenizer {
fn default() -> Self {
Self::new()
}
}
impl ChemicalTokenizer {
    /// Creates a tokenizer with the default configuration.
    pub fn new() -> Self {
        Self::with_config(ChemicalTokenizerConfig::default())
    }

    /// Creates a tokenizer from `config`: clones the element and
    /// functional-group tables, compiles the notation regexes, and seeds the
    /// vocabulary.
    pub fn with_config(config: ChemicalTokenizerConfig) -> Self {
        let mut tokenizer = Self {
            config,
            vocab: HashMap::new(),
            id_to_token: HashMap::new(),
            next_id: 0,
            atomic_symbols: ATOMIC_SYMBOLS.clone(),
            functional_groups: FUNCTIONAL_GROUPS.clone(),
            smiles_patterns: Self::create_smiles_patterns(),
            inchi_patterns: Self::create_inchi_patterns(),
            formula_patterns: Self::create_formula_patterns(),
        };
        tokenizer.initialize_vocab();
        tokenizer
    }

    /// Seeds the vocabulary with special markers, element symbols, bond and
    /// branch punctuation, ring-closure digits, functional-group
    /// abbreviations, and formula glyphs.
    ///
    /// Element and functional-group keys are sorted before interning so the
    /// assigned ids are deterministic across runs (HashMap iteration order is
    /// randomized).
    fn initialize_vocab(&mut self) {
        if self.config.include_special_tokens {
            for special in [
                "[CLS]", "[SEP]", "[PAD]", "[UNK]", "[MASK]",
                "[START_CHEM]", "[END_CHEM]", "[SMILES]", "[INCHI]", "[FORMULA]",
            ] {
                self.add_token(special);
            }
        }
        let mut atomic_symbols: Vec<String> = self.atomic_symbols.keys().cloned().collect();
        atomic_symbols.sort();
        for symbol in atomic_symbols {
            self.add_token(&symbol);
        }
        // Bond symbols: single, double, triple, aromatic.
        for bond in ["-", "=", "#", ":"] {
            self.add_token(bond);
        }
        if self.config.tokenize_smiles {
            // Branch/bracket delimiters, stereocenters, and charge runs.
            // BUG FIX: the original seeded "/-", which is not a SMILES token;
            // the intended negative-charge series is "-", "--", "---".
            for sym in ["(", ")", "[", "]", "@", "@@", "+", "++", "+++", "-", "--", "---"] {
                self.add_token(sym);
            }
            // Single-digit ring closures 0-9.
            for i in 0..10 {
                self.add_token(&i.to_string());
            }
            // Two-digit ring closures use SMILES %NN notation.
            for i in 10..100 {
                self.add_token(&format!("%{}", i));
            }
        }
        let mut functional_groups: Vec<String> = self.functional_groups.keys().cloned().collect();
        functional_groups.sort();
        for group in functional_groups {
            self.add_token(&group);
        }
        if self.config.tokenize_formulas {
            // Common subscript counts plus the hydrate dot (e.g. CaCl2·2H2O).
            // BUG FIX: the dot was mojibake ("ยท": U+00B7 misdecoded as TIS-620).
            for glyph in ["2", "3", "4", "5", "6", "·", "H2O"] {
                self.add_token(glyph);
            }
        }
    }

    /// Interns `token` and returns its id; already-known tokens keep their id.
    fn add_token(&mut self, token: &str) -> u32 {
        if let Some(&id) = self.vocab.get(token) {
            return id;
        }
        let id = self.next_id;
        self.vocab.insert(token.to_string(), id);
        self.id_to_token.insert(id, token.to_string());
        self.next_id += 1;
        id
    }

    /// SMILES lexing patterns. Order encodes priority: earlier (more
    /// specific) patterns win when two matches start at the same offset.
    fn create_smiles_patterns() -> Vec<Regex> {
        vec![
            Regex::new(r"\[[^]]+\]").expect("valid regex"), // bracket atoms, e.g. [NH4+]
            Regex::new(r"COOH|CHO|NH2|CF3|PO4|SO4|NO2").expect("valid regex"), // group abbreviations
            Regex::new(r"Br|Cl").expect("valid regex"),     // two-letter halogens
            Regex::new(r"[BCNOPSFIbcnops]").expect("valid regex"), // organic-subset atoms
            Regex::new(r"[=#:]").expect("valid regex"),     // bond symbols
            Regex::new(r"%\d+|\d").expect("valid regex"),   // ring closures
            Regex::new(r"[()]").expect("valid regex"),      // branches
            Regex::new(r"@@?").expect("valid regex"),       // stereocenters
        ]
    }

    /// InChI lexing patterns: prefix, version, layer markers, element counts,
    /// numeric ranges, punctuation.
    fn create_inchi_patterns() -> Vec<Regex> {
        vec![
            Regex::new(r"InChI=").expect("valid regex"),
            Regex::new(r"1S?").expect("valid regex"),
            Regex::new(r"/[a-z]").expect("valid regex"),
            Regex::new(r"[A-Z][a-z]?\d*").expect("valid regex"),
            Regex::new(r"\d+-\d+").expect("valid regex"),
            Regex::new(r"[(),;-]").expect("valid regex"),
        ]
    }

    /// Molecular-formula patterns: element+count, hydrate suffix, ionic
    /// charge, parenthesized groups.
    /// BUG FIX: the hydrate pattern now uses U+00B7 instead of mojibake.
    fn create_formula_patterns() -> Vec<Regex> {
        vec![
            Regex::new(r"[A-Z][a-z]?\d*").expect("valid regex"),
            Regex::new(r"·\d*H2O").expect("valid regex"),
            Regex::new(r"\d*[+-]").expect("valid regex"),
            Regex::new(r"\([^)]+\)\d*").expect("valid regex"),
        ]
    }

    /// Tokenizes `text`, dispatching on the detected notation with priority
    /// InChI > SMILES > formula > name; falls back to per-character tokens
    /// when name tokenization is disabled.
    pub fn tokenize_chemical(&self, text: &str) -> Result<Vec<ChemicalToken>> {
        let mut pos = 0;
        if text.starts_with("InChI=") && self.config.tokenize_inchi {
            self.tokenize_inchi(text, &mut pos)
        } else if self.is_smiles(text) && self.config.tokenize_smiles {
            self.tokenize_smiles(text, &mut pos)
        } else if self.is_chemical_formula(text) && self.config.tokenize_formulas {
            self.tokenize_formula(text, &mut pos)
        } else if self.config.tokenize_names {
            self.tokenize_chemical_name(text, &mut pos)
        } else {
            self.tokenize_fallback(text, &mut pos)
        }
    }

    /// Heuristic SMILES detector: the text contains at least one character of
    /// the SMILES alphabet and is not a common chemistry English word.
    fn is_smiles(&self, text: &str) -> bool {
        const SMILES_ALPHABET: &str = "CNOPSFBrClIcnops()[]=-#@+%1234567890";
        let has_chemical_chars = text.chars().any(|c| SMILES_ALPHABET.contains(c));
        let is_common_word = matches!(
            text.to_lowercase().as_str(),
            "water" | "salt" | "acid" | "base" | "metal" | "gas" | "liquid" | "solid"
        );
        has_chemical_chars && !is_common_word
    }

    /// Heuristic formula detector: starts with an uppercase letter and
    /// contains a digit, a parenthesis, or the hydrate dot.
    fn is_chemical_formula(&self, text: &str) -> bool {
        text.chars().next().map(|c| c.is_ascii_uppercase()).unwrap_or(false)
            && text.chars().any(|c| c.is_ascii_digit() || "()·".contains(c))
    }

    /// Runs every pattern over `text`, then merges all matches into a single
    /// non-overlapping, position-ordered token stream.
    ///
    /// BUG FIX: the original scanned one pattern at a time with a shared
    /// cursor, which silently dropped tokens of later patterns occurring
    /// earlier in the text (e.g. "=" and the parens in "CC(=O)O") and emitted
    /// tokens out of positional order. Candidates are now sorted by start
    /// offset, tie-broken by pattern priority (pattern lists are ordered
    /// most-specific-first), and accepted greedily left to right.
    fn scan_with_patterns<F>(
        &self,
        text: &str,
        pos: &mut usize,
        patterns: &[Regex],
        classify: F,
    ) -> Result<Vec<ChemicalToken>>
    where
        F: Fn(&str) -> ChemicalTokenType,
    {
        // (start, end, pattern priority, matched text)
        let mut candidates: Vec<(usize, usize, usize, String)> = Vec::new();
        for (priority, pattern) in patterns.iter().enumerate() {
            for mat in pattern.find_iter(text) {
                candidates.push((mat.start(), mat.end(), priority, mat.as_str().to_string()));
            }
        }
        candidates.sort_by_key(|c| (c.0, c.2));
        let mut tokens = Vec::new();
        let mut cursor = *pos;
        for (start, end, _, token_text) in candidates {
            if start < cursor {
                continue; // overlaps an already-accepted token
            }
            let token_type = classify(&token_text);
            let metadata = self.create_token_metadata(&token_text, &token_type);
            tokens.push(ChemicalToken {
                text: token_text,
                token_type,
                start,
                end,
                metadata,
            });
            cursor = end;
        }
        *pos = cursor;
        Ok(tokens)
    }

    /// Lexes SMILES, classifying each token individually.
    fn tokenize_smiles(&self, text: &str, pos: &mut usize) -> Result<Vec<ChemicalToken>> {
        self.scan_with_patterns(text, pos, &self.smiles_patterns, |t| {
            self.classify_smiles_token(t)
        })
    }

    /// Lexes an InChI string; every token is tagged as an InChI layer piece.
    fn tokenize_inchi(&self, text: &str, pos: &mut usize) -> Result<Vec<ChemicalToken>> {
        self.scan_with_patterns(text, pos, &self.inchi_patterns, |_| {
            ChemicalTokenType::InChILayer
        })
    }

    /// Lexes a molecular formula; every token is a formula component.
    fn tokenize_formula(&self, text: &str, pos: &mut usize) -> Result<Vec<ChemicalToken>> {
        self.scan_with_patterns(text, pos, &self.formula_patterns, |_| {
            ChemicalTokenType::FormulaComponent
        })
    }

    /// Whitespace-tokenizes a chemical name, tagging known functional-group
    /// abbreviations. Spans are byte offsets relative to `*pos`.
    ///
    /// BUG FIX: offsets are now located with `find` instead of assuming a
    /// single space between words, so runs of whitespace no longer skew spans.
    fn tokenize_chemical_name(&self, text: &str, pos: &mut usize) -> Result<Vec<ChemicalToken>> {
        let mut tokens = Vec::new();
        let base = *pos;
        let mut search_from = 0usize;
        for part in text.split_whitespace() {
            let rel = text[search_from..]
                .find(part)
                .expect("split_whitespace yields substrings of text");
            let start = search_from + rel;
            let end = start + part.len();
            let token_type = if self.functional_groups.contains_key(part) {
                ChemicalTokenType::FunctionalGroup
            } else {
                ChemicalTokenType::NameFragment
            };
            let metadata = self.create_token_metadata(part, &token_type);
            tokens.push(ChemicalToken {
                text: part.to_string(),
                token_type,
                start: base + start,
                end: base + end,
                metadata,
            });
            search_from = end;
        }
        *pos = base + text.len();
        Ok(tokens)
    }

    /// Last-resort tokenizer: one `Unknown` token per character.
    fn tokenize_fallback(&self, text: &str, pos: &mut usize) -> Result<Vec<ChemicalToken>> {
        let mut tokens = Vec::new();
        for (i, ch) in text.char_indices() {
            tokens.push(ChemicalToken {
                text: ch.to_string(),
                token_type: ChemicalTokenType::Unknown,
                start: *pos + i,
                end: *pos + i + ch.len_utf8(),
                metadata: None,
            });
        }
        *pos += text.len();
        Ok(tokens)
    }

    /// Maps a lexed SMILES token to its coarse category.
    fn classify_smiles_token(&self, token: &str) -> ChemicalTokenType {
        match token {
            "(" | ")" => ChemicalTokenType::Branch,
            "[" | "]" => ChemicalTokenType::AtomicSymbol,
            "@" | "@@" => ChemicalTokenType::Stereochemistry,
            "+" | "++" | "+++" | "-" | "--" | "---" => ChemicalTokenType::Charge,
            "=" | "#" | ":" => ChemicalTokenType::Bond,
            _ if token.chars().all(|c| c.is_ascii_digit()) => ChemicalTokenType::RingClosure,
            _ if token.starts_with('%') => ChemicalTokenType::RingClosure,
            // BUG FIX: bracket atoms such as [NH4+] previously fell through to
            // Unknown even though metadata extraction already strips brackets.
            _ if token.starts_with('[') && token.ends_with(']') => {
                ChemicalTokenType::AtomicSymbol
            },
            _ if self.atomic_symbols.contains_key(token) => ChemicalTokenType::AtomicSymbol,
            _ if self.functional_groups.contains_key(token) => ChemicalTokenType::FunctionalGroup,
            _ => ChemicalTokenType::Unknown,
        }
    }

    /// Builds chemistry metadata for a token when derivable from its type;
    /// returns `None` for categories without metadata.
    fn create_token_metadata(
        &self,
        token: &str,
        token_type: &ChemicalTokenType,
    ) -> Option<ChemicalTokenMetadata> {
        match token_type {
            ChemicalTokenType::AtomicSymbol => {
                // Bracket atoms carry their delimiters; strip before lookup.
                let clean_symbol = token.trim_matches(['[', ']']);
                let &atomic_number = self.atomic_symbols.get(clean_symbol)?;
                Some(ChemicalTokenMetadata {
                    atomic_number: Some(atomic_number),
                    bond_order: None,
                    ring_size: None,
                    formal_charge: None,
                    molecular_weight: self.get_atomic_weight(atomic_number),
                    iupac_info: None,
                })
            },
            ChemicalTokenType::Bond => {
                // ":" (aromatic) is approximated as order 1, as is any
                // unrecognized bond symbol.
                let bond_order = match token {
                    "=" => 2,
                    "#" => 3,
                    _ => 1,
                };
                Some(ChemicalTokenMetadata {
                    atomic_number: None,
                    bond_order: Some(bond_order),
                    ring_size: None,
                    formal_charge: None,
                    molecular_weight: None,
                    iupac_info: None,
                })
            },
            ChemicalTokenType::FunctionalGroup => Some(ChemicalTokenMetadata {
                atomic_number: None,
                bond_order: None,
                ring_size: None,
                formal_charge: None,
                molecular_weight: None,
                iupac_info: self.functional_groups.get(token).cloned(),
            }),
            _ => None,
        }
    }

    /// Standard atomic weights for a subset of elements; elements without an
    /// entry yield `None`.
    fn get_atomic_weight(&self, atomic_number: u8) -> Option<f64> {
        match atomic_number {
            1 => Some(1.008),
            6 => Some(12.011),
            7 => Some(14.007),
            8 => Some(15.999),
            9 => Some(18.998),
            15 => Some(30.974),
            16 => Some(32.06),
            17 => Some(35.45),
            35 => Some(79.904),
            53 => Some(126.904),
            _ => None,
        }
    }

    /// Read-only view of the vocabulary (token text -> id).
    pub fn get_vocab(&self) -> &HashMap<String, u32> {
        &self.vocab
    }

    /// Looks up the text for `id`, if assigned.
    pub fn id_to_token(&self, id: u32) -> Option<&String> {
        self.id_to_token.get(&id)
    }

    /// The active configuration.
    pub fn config(&self) -> &ChemicalTokenizerConfig {
        &self.config
    }
}
impl Tokenizer for ChemicalTokenizer {
    /// Encodes `text` into ids, mapping out-of-vocabulary tokens to `[UNK]`
    /// (or id 0 when special tokens are disabled), then truncating to
    /// `config.max_length`. The attention mask covers every kept id.
    fn encode(&self, text: &str) -> Result<TokenizedInput> {
        let chemical_tokens = self.tokenize_chemical(text)?;
        // Fallback id for out-of-vocabulary tokens.
        let unk_id = self.vocab.get("[UNK]").copied().unwrap_or(0);
        let mut input_ids: Vec<u32> = chemical_tokens
            .iter()
            .map(|token| self.vocab.get(&token.text).copied().unwrap_or(unk_id))
            .collect();
        if let Some(max_len) = self.config.max_length {
            input_ids.truncate(max_len);
        }
        let len = input_ids.len();
        Ok(TokenizedInput {
            input_ids,
            attention_mask: vec![1u8; len],
            token_type_ids: None,
            special_tokens_mask: None,
            offset_mapping: None,
            overflowing_tokens: None,
        })
    }

    /// Decodes ids back to text, concatenating token strings and skipping
    /// bracketed marker tokens such as `[CLS]`; unknown ids are ignored.
    fn decode(&self, token_ids: &[u32]) -> Result<String> {
        let mut result = String::new();
        for &id in token_ids {
            if let Some(token) = self.id_to_token.get(&id) {
                if !(token.starts_with('[') && token.ends_with(']')) {
                    result.push_str(token);
                }
            }
        }
        Ok(result)
    }

    /// Encodes two texts joined by `[SEP]` (when present in the vocabulary),
    /// truncating the merged sequence to `config.max_length`.
    fn encode_pair(&self, text_a: &str, text_b: &str) -> Result<TokenizedInput> {
        let mut tokenized_a = self.encode(text_a)?;
        let tokenized_b = self.encode(text_b)?;
        if let Some(&sep_id) = self.vocab.get("[SEP]") {
            tokenized_a.input_ids.push(sep_id);
        }
        tokenized_a.input_ids.extend(tokenized_b.input_ids);
        if let Some(max_len) = self.config.max_length {
            tokenized_a.input_ids.truncate(max_len);
        }
        // BUG FIX: the mask previously kept text_a's original length and fell
        // out of sync with the merged/truncated ids.
        tokenized_a.attention_mask = vec![1u8; tokenized_a.input_ids.len()];
        Ok(tokenized_a)
    }

    /// Number of interned tokens.
    fn vocab_size(&self) -> usize {
        self.vocab.len()
    }

    /// Owned copy of the vocabulary (token text -> id).
    fn get_vocab(&self) -> HashMap<String, u32> {
        self.vocab.clone()
    }

    /// Looks up the id for `token`, if interned.
    fn token_to_id(&self, token: &str) -> Option<u32> {
        self.vocab.get(token).copied()
    }

    /// Looks up the text for `id`, if assigned.
    fn id_to_token(&self, id: u32) -> Option<String> {
        self.id_to_token.get(&id).cloned()
    }
}
/// Aggregate statistics produced by [`ChemicalTokenizer::analyze`].
pub struct ChemicalAnalysis {
/// Number of tokens per category.
pub token_types: HashMap<ChemicalTokenType, usize>,
/// Number of atomic-symbol tokens per element symbol (brackets stripped).
pub atomic_composition: HashMap<String, usize>,
/// Number of functional-group tokens per abbreviation.
pub functional_groups: HashMap<String, usize>,
/// Mean token text length in bytes (0.0 for empty input).
pub avg_token_length: f64,
/// Sum of tabulated atomic weights; `None` when no token carried a weight.
pub total_molecular_weight: Option<f64>,
/// Heuristic structural-complexity score.
pub complexity_score: f64,
}
impl ChemicalTokenizer {
pub fn analyze(&self, text: &str) -> Result<ChemicalAnalysis> {
let tokens = self.tokenize_chemical(text)?;
let mut token_types = HashMap::new();
let mut atomic_composition = HashMap::new();
let mut functional_groups = HashMap::new();
let mut total_length = 0;
let mut total_molecular_weight = 0.0;
let mut has_molecular_weight = false;
for token in &tokens {
*token_types.entry(token.token_type.clone()).or_insert(0) += 1;
total_length += token.text.len();
match &token.token_type {
ChemicalTokenType::AtomicSymbol => {
let clean_symbol = token.text.trim_matches(['[', ']']);
*atomic_composition.entry(clean_symbol.to_string()).or_insert(0) += 1;
if let Some(ref metadata) = token.metadata {
if let Some(weight) = metadata.molecular_weight {
total_molecular_weight += weight;
has_molecular_weight = true;
}
}
},
ChemicalTokenType::FunctionalGroup => {
*functional_groups.entry(token.text.clone()).or_insert(0) += 1;
},
_ => {},
}
}
let avg_token_length =
if tokens.is_empty() { 0.0 } else { total_length as f64 / tokens.len() as f64 };
let complexity_score = self.calculate_complexity_score(&tokens);
Ok(ChemicalAnalysis {
token_types,
atomic_composition,
functional_groups,
avg_token_length,
total_molecular_weight: if has_molecular_weight {
Some(total_molecular_weight)
} else {
None
},
complexity_score,
})
}
fn calculate_complexity_score(&self, tokens: &[ChemicalToken]) -> f64 {
let mut score = 0.0;
score += tokens.len() as f64 * 0.1;
let mut token_type_count = HashMap::new();
for token in tokens {
*token_type_count.entry(&token.token_type).or_insert(0) += 1;
}
score += token_type_count.len() as f64 * 0.5;
for token in tokens {
match token.token_type {
ChemicalTokenType::Branch => score += 1.0,
ChemicalTokenType::RingClosure => score += 1.5,
ChemicalTokenType::Stereochemistry => score += 2.0,
ChemicalTokenType::FunctionalGroup => score += 0.8,
_ => {},
}
}
score
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Construction seeds the vocabulary with core element symbols.
    #[test]
    fn test_chemical_tokenizer_creation() {
        let tokenizer = ChemicalTokenizer::new();
        let vocab = tokenizer.get_vocab();
        assert!(!vocab.is_empty());
        assert!(vocab.contains_key("C"));
        assert!(vocab.contains_key("O"));
    }

    // SMILES heuristic accepts typical strings and rejects common words.
    #[test]
    fn test_smiles_detection() {
        let tokenizer = ChemicalTokenizer::new();
        for smiles in ["CCO", "c1ccccc1", "CC(=O)O"] {
            assert!(tokenizer.is_smiles(smiles));
        }
        assert!(!tokenizer.is_smiles("water"));
    }

    // Formula heuristic accepts element+digit strings and rejects plain words.
    #[test]
    fn test_formula_detection() {
        let tokenizer = ChemicalTokenizer::new();
        for formula in ["H2O", "C6H12O6", "CaCl2ยท2H2O"] {
            assert!(tokenizer.is_chemical_formula(formula));
        }
        assert!(!tokenizer.is_chemical_formula("hello"));
    }

    // Encoding a SMILES string yields at least one id.
    #[test]
    fn test_smiles_tokenization() {
        let tokenized = ChemicalTokenizer::new()
            .encode("CCO")
            .expect("Operation failed in test");
        assert!(!tokenized.input_ids.is_empty());
    }

    // Encoding a molecular formula yields at least one id.
    #[test]
    fn test_chemical_formula_encoding() {
        let tokenized = ChemicalTokenizer::new()
            .encode("H2O")
            .expect("Operation failed in test");
        assert!(!tokenized.input_ids.is_empty());
    }

    // InChI input routes through the InChI lexer.
    #[test]
    fn test_inchi_detection() {
        let tokenizer = ChemicalTokenizer::new();
        let inchi = "InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3";
        let token_list = tokenizer
            .tokenize_chemical(inchi)
            .expect("Operation failed in test");
        assert!(token_list
            .iter()
            .any(|t| t.token_type == ChemicalTokenType::InChILayer));
    }

    // One- and two-letter element symbols classify as atomic symbols.
    #[test]
    fn test_atomic_symbol_classification() {
        let tokenizer = ChemicalTokenizer::new();
        for symbol in ["C", "Br"] {
            assert_eq!(
                tokenizer.classify_smiles_token(symbol),
                ChemicalTokenType::AtomicSymbol
            );
        }
    }

    // Double and triple bond symbols classify as bonds.
    #[test]
    fn test_bond_classification() {
        let tokenizer = ChemicalTokenizer::new();
        for bond in ["=", "#"] {
            assert_eq!(
                tokenizer.classify_smiles_token(bond),
                ChemicalTokenType::Bond
            );
        }
    }

    // Analysis reports elemental composition and a positive complexity score.
    #[test]
    fn test_chemical_analysis() {
        let result = ChemicalTokenizer::new()
            .analyze("CCO")
            .expect("Operation failed in test");
        assert!(result.atomic_composition.contains_key("C"));
        assert!(result.atomic_composition.contains_key("O"));
        assert!(result.complexity_score > 0.0);
    }

    // Known functional-group abbreviations are counted by analysis.
    #[test]
    fn test_functional_group_recognition() {
        let result = ChemicalTokenizer::new()
            .analyze("COOH")
            .expect("Operation failed in test");
        assert!(result.functional_groups.contains_key("COOH"));
    }

    // Round-tripping a SMILES string through encode/decode yields text.
    #[test]
    fn test_encoding_decoding_consistency() {
        let tokenizer = ChemicalTokenizer::new();
        let encoded = tokenizer.encode("CCO").expect("Encoding failed");
        let decoded = tokenizer.decode(&encoded.input_ids).expect("Decoding failed");
        assert!(!decoded.is_empty());
    }

    // max_length truncates the encoded sequence.
    #[test]
    fn test_max_length_constraint() {
        let config = ChemicalTokenizerConfig {
            max_length: Some(5),
            ..Default::default()
        };
        let tokenizer = ChemicalTokenizer::with_config(config);
        let tokenized = tokenizer
            .encode("CCCCCCCCCCCCCCCC")
            .expect("Operation failed in test");
        assert!(tokenized.input_ids.len() <= 5);
    }
}