// ticker_sniffer/structs/token_mapper.rs

use std::collections::hash_map::Entry;
use std::collections::HashMap;

use crate::types::{TokenId, TokenRef, TokenVector};
use crate::Tokenizer;

5/// A struct to map tokens to unique identifiers and vice versa.
6///
7/// This structure is responsible for maintaining a bidirectional mapping
8/// between tokens (represented as character vectors) and their unique IDs.
9/// It also provides utility methods to query and manage these mappings.
10pub struct TokenMapper {
11    /// A map of token character vectors to their unique IDs.
12    pub token_map: HashMap<TokenVector, TokenId>,
13
14    /// A reverse map of unique IDs back to their token character vectors.
15    pub reverse_token_map: HashMap<TokenId, TokenVector>,
16
17    /// Tracks the next available unique ID for new tokens.
18    next_id: TokenId,
19}
21impl TokenMapper {
22    /// Creates a new instance of `TokenMapper`.
23    ///
24    /// Initializes empty maps for tokens and reverse lookups, and sets the
25    /// starting `next_id` to 0.
26    #[allow(clippy::new_without_default)]
27    pub fn new() -> Self {
28        TokenMapper {
29            token_map: HashMap::new(),
30            reverse_token_map: HashMap::new(),
31            next_id: 0,
32        }
33    }
34
35    /// Adds a token to the map if it doesn't already exist, and returns its unique ID.
36    ///
37    /// If the token is already present in the `token_map`, its existing ID is returned.
38    /// Otherwise, a new ID is generated, stored, and returned.
39    ///
40    /// # Arguments
41    /// * `token` - A reference to the token string to add or look up.
42    ///
43    /// # Returns
44    /// * A unique ID for the token.
45    pub fn upsert_token(&mut self, token: &str) -> TokenId {
46        let token_vector = Tokenizer::token_to_charcode_vector(token);
47
48        if let Some(&id) = self.token_map.get(&token_vector) {
49            id
50        } else {
51            let id = self.next_id;
52            self.token_map.insert(token_vector.clone(), id);
53            self.reverse_token_map.insert(id, token_vector.clone());
54            self.next_id += 1;
55            id
56        }
57    }
58
59    /// Gets the unique ID for a token if it exists in the map.
60    ///
61    /// # Arguments
62    /// * `token` - A reference to the token string to look up.
63    ///
64    /// # Returns
65    /// * `Some(TokenId)` if the token is present, or `None` if it is not found.
66    pub fn get_token_id(&self, token: &TokenRef) -> Option<TokenId> {
67        let token_vector = Tokenizer::token_to_charcode_vector(token);
68
69        self.token_map.get(&token_vector).copied()
70    }
71
72    /// Filters and returns tokens that are present in the map.
73    ///
74    /// # Arguments
75    /// * `tokens` - A vector of borrowed token references.
76    ///
77    /// # Returns
78    /// * A vector of borrowed token references that exist in the map.
79    pub fn get_filtered_tokens<'a>(&'a self, tokens: Vec<&'a TokenRef>) -> Vec<&'a TokenRef> {
80        tokens
81            .into_iter()
82            .filter(|token| self.get_token_id(token).is_some())
83            .collect()
84    }
85
86    /// Filters and returns token IDs for tokens that exist in the map.
87    ///
88    /// # Arguments
89    /// * `tokens` - A vector of borrowed token references.
90    ///
91    /// # Returns
92    /// * A vector of token IDs corresponding to the tokens found in the map.
93    pub fn get_filtered_token_ids<'a>(&'a self, tokens: Vec<&'a TokenRef>) -> Vec<TokenId> {
94        tokens
95            .into_iter()
96            .filter_map(|token| self.get_token_id(token))
97            .collect()
98    }
99
100    /// Retrieves the token string for a given unique ID.
101    ///
102    /// # Arguments
103    /// * `token_id` - The unique ID of the token to look up.
104    ///
105    /// # Returns
106    /// * `Some(String)` containing the token if the ID is found, or `None` otherwise.
107    pub fn get_token_by_id(&self, token_id: TokenId) -> Option<String> {
108        self.reverse_token_map
109            .get(&token_id)
110            .map(Tokenizer::charcode_vector_to_token)
111    }
112
113    /// Retrieves token strings for a list of token IDs.
114    ///
115    /// # Arguments
116    /// * `token_ids` - A slice of token IDs to look up.
117    ///
118    /// # Returns
119    /// * A vector of `Option<String>` where each entry corresponds to the token
120    ///   string for the given ID, or `None` if the ID is not found.
121    pub fn get_tokens_by_ids(&self, token_ids: &[TokenId]) -> Vec<Option<String>> {
122        token_ids
123            .iter()
124            .map(|&token_id| self.get_token_by_id(token_id))
125            .collect()
126    }
127
128    /// Gets the total number of unique tokens in the map.
129    ///
130    /// # Returns
131    /// * The number of unique tokens as a `usize`.
132    pub fn get_token_count(&self) -> usize {
133        self.token_map.len()
134    }
135}