//! ticker_sniffer/structs/token_mapper.rs
1use crate::types::{TokenId, TokenRef, TokenVector};
2use crate::Tokenizer;
3use std::collections::HashMap;
4
/// A struct to map tokens to unique identifiers and vice versa.
///
/// This structure is responsible for maintaining a bidirectional mapping
/// between tokens (represented as character vectors) and their unique IDs.
/// It also provides utility methods to query and manage these mappings.
///
/// Invariant: `token_map` and `reverse_token_map` mirror each other — every
/// `(vector, id)` entry in one has a matching `(id, vector)` entry in the
/// other. Both maps are only ever written together by `upsert_token`.
pub struct TokenMapper {
    /// A map of token character vectors to their unique IDs.
    pub token_map: HashMap<TokenVector, TokenId>,

    /// A reverse map of unique IDs back to their token character vectors.
    pub reverse_token_map: HashMap<TokenId, TokenVector>,

    /// Tracks the next available unique ID for new tokens.
    /// Monotonically increasing; no method in this file removes entries,
    /// so IDs are never reused.
    next_id: TokenId,
}
20
21impl TokenMapper {
22 /// Creates a new instance of `TokenMapper`.
23 ///
24 /// Initializes empty maps for tokens and reverse lookups, and sets the
25 /// starting `next_id` to 0.
26 #[allow(clippy::new_without_default)]
27 pub fn new() -> Self {
28 TokenMapper {
29 token_map: HashMap::new(),
30 reverse_token_map: HashMap::new(),
31 next_id: 0,
32 }
33 }
34
35 /// Adds a token to the map if it doesn't already exist, and returns its unique ID.
36 ///
37 /// If the token is already present in the `token_map`, its existing ID is returned.
38 /// Otherwise, a new ID is generated, stored, and returned.
39 ///
40 /// # Arguments
41 /// * `token` - A reference to the token string to add or look up.
42 ///
43 /// # Returns
44 /// * A unique ID for the token.
45 pub fn upsert_token(&mut self, token: &str) -> TokenId {
46 let token_vector = Tokenizer::token_to_charcode_vector(token);
47
48 if let Some(&id) = self.token_map.get(&token_vector) {
49 id
50 } else {
51 let id = self.next_id;
52 self.token_map.insert(token_vector.clone(), id);
53 self.reverse_token_map.insert(id, token_vector.clone());
54 self.next_id += 1;
55 id
56 }
57 }
58
59 /// Gets the unique ID for a token if it exists in the map.
60 ///
61 /// # Arguments
62 /// * `token` - A reference to the token string to look up.
63 ///
64 /// # Returns
65 /// * `Some(TokenId)` if the token is present, or `None` if it is not found.
66 pub fn get_token_id(&self, token: &TokenRef) -> Option<TokenId> {
67 let token_vector = Tokenizer::token_to_charcode_vector(token);
68
69 self.token_map.get(&token_vector).copied()
70 }
71
72 /// Filters and returns tokens that are present in the map.
73 ///
74 /// # Arguments
75 /// * `tokens` - A vector of borrowed token references.
76 ///
77 /// # Returns
78 /// * A vector of borrowed token references that exist in the map.
79 pub fn get_filtered_tokens<'a>(&'a self, tokens: Vec<&'a TokenRef>) -> Vec<&'a TokenRef> {
80 tokens
81 .into_iter()
82 .filter(|token| self.get_token_id(token).is_some())
83 .collect()
84 }
85
86 /// Filters and returns token IDs for tokens that exist in the map.
87 ///
88 /// # Arguments
89 /// * `tokens` - A vector of borrowed token references.
90 ///
91 /// # Returns
92 /// * A vector of token IDs corresponding to the tokens found in the map.
93 pub fn get_filtered_token_ids<'a>(&'a self, tokens: Vec<&'a TokenRef>) -> Vec<TokenId> {
94 tokens
95 .into_iter()
96 .filter_map(|token| self.get_token_id(token))
97 .collect()
98 }
99
100 /// Retrieves the token string for a given unique ID.
101 ///
102 /// # Arguments
103 /// * `token_id` - The unique ID of the token to look up.
104 ///
105 /// # Returns
106 /// * `Some(String)` containing the token if the ID is found, or `None` otherwise.
107 pub fn get_token_by_id(&self, token_id: TokenId) -> Option<String> {
108 self.reverse_token_map
109 .get(&token_id)
110 .map(Tokenizer::charcode_vector_to_token)
111 }
112
113 /// Retrieves token strings for a list of token IDs.
114 ///
115 /// # Arguments
116 /// * `token_ids` - A slice of token IDs to look up.
117 ///
118 /// # Returns
119 /// * A vector of `Option<String>` where each entry corresponds to the token
120 /// string for the given ID, or `None` if the ID is not found.
121 pub fn get_tokens_by_ids(&self, token_ids: &[TokenId]) -> Vec<Option<String>> {
122 token_ids
123 .iter()
124 .map(|&token_id| self.get_token_by_id(token_id))
125 .collect()
126 }
127
128 /// Gets the total number of unique tokens in the map.
129 ///
130 /// # Returns
131 /// * The number of unique tokens as a `usize`.
132 pub fn get_token_count(&self) -> usize {
133 self.token_map.len()
134 }
135}