//! `st/st_tokenizer.rs` — semantic token registry for path patterns.
1use once_cell::sync::Lazy;
2use std::collections::HashMap;
3use std::sync::RwLock;
4
/// Process-wide token registry. Guarded by an `RwLock` so the read-heavy
/// helpers (`tokenize_path`, `paths_equivalent`) can share it while
/// `record_usage` / `get_or_create_token` take the write lock.
pub static TOKEN_REGISTRY: Lazy<RwLock<TokenRegistry>> =
    Lazy::new(|| RwLock::new(TokenRegistry::new()));

/// Compact token identifier. 0 is used as a "no token" sentinel by
/// `TokenRegistry::get_or_create_token`.
pub type TokenId = u16;
9
/// Broad classification of what a [`SemanticToken`] describes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// File-type tokens keyed by extension (e.g. ".rs" -> "code.rust").
    FileType,
    Permission,
    Size,
    Time,
    /// Path-component tokens (directory names such as "src" or ".git").
    Path,
    Owner,
    Content,
    /// Higher-level file roles (e.g. package manifests and lockfiles).
    Semantic,
}
21
/// A registered pattern: a stable id plus its canonical name, aliases,
/// and a usage counter.
#[derive(Debug, Clone)]
pub struct SemanticToken {
    /// Stable identifier; also the key in `TokenRegistry::tokens`.
    pub id: TokenId,
    /// What kind of pattern this token describes.
    pub category: TokenCategory,
    /// Canonical spelling, e.g. "code.rust".
    pub canonical: String,
    /// Alternative spellings that resolve to the same id, e.g. ".rs".
    pub aliases: Vec<String>,
    /// Usages recorded via `TokenRegistry::record_usage`.
    pub frequency: u64,
}
30
/// Bidirectional registry: pattern strings -> ids, and ids -> token metadata.
pub struct TokenRegistry {
    /// Token metadata keyed by id.
    tokens: HashMap<TokenId, SemanticToken>,
    /// Every known pattern (canonical name or alias) mapped to its id.
    pattern_map: HashMap<String, TokenId>,
    /// Next id handed out by `get_or_create_token`; starts above the
    /// reserved built-in id range.
    next_token_id: TokenId,
    /// Raw usage counts per pattern, including patterns with no token yet.
    pattern_frequency: HashMap<String, u64>,
}
37
impl Default for TokenRegistry {
    /// Equivalent to [`TokenRegistry::new`].
    fn default() -> Self {
        Self::new()
    }
}
43
44impl TokenRegistry {
45    pub fn new() -> Self {
46        let mut registry = Self {
47            tokens: HashMap::new(),
48            pattern_map: HashMap::new(),
49            next_token_id: 0x0100,
50            pattern_frequency: HashMap::new(),
51        };
52        registry.init_common_tokens();
53        registry
54    }
55
56    fn init_common_tokens(&mut self) {
57        self.add_token(0x0021, TokenCategory::FileType, "code.rust", vec![".rs"]);
58        self.add_token(
59            0x0022,
60            TokenCategory::FileType,
61            "code.python",
62            vec![".py", ".pyw", ".pyi"],
63        );
64        self.add_token(
65            0x0024,
66            TokenCategory::FileType,
67            "doc.markdown",
68            vec![".md", ".markdown", ".mdown"],
69        );
70        self.add_token(0x0081, TokenCategory::Path, "vcs.git", vec![".git"]);
71        self.add_token(
72            0x0082,
73            TokenCategory::Path,
74            "dir.source",
75            vec!["src", "source", "sources"],
76        );
77        self.add_token(
78            0x0083,
79            TokenCategory::Path,
80            "dir.build.rust",
81            vec!["target"],
82        );
83        self.add_token(
84            0x0086,
85            TokenCategory::Path,
86            "dir.docs",
87            vec!["docs", "doc", "documentation"],
88        );
89        self.add_token(
90            0x00B0,
91            TokenCategory::Semantic,
92            "pkg.manifest",
93            vec!["package.json", "Cargo.toml", "go.mod"],
94        );
95        self.add_token(
96            0x00B1,
97            TokenCategory::Semantic,
98            "pkg.lock",
99            vec!["package-lock.json", "Cargo.lock", "go.sum", "yarn.lock"],
100        );
101    }
102
103    fn add_token(
104        &mut self,
105        id: TokenId,
106        category: TokenCategory,
107        canonical: &str,
108        aliases: Vec<&str>,
109    ) {
110        let token = SemanticToken {
111            id,
112            category,
113            canonical: canonical.to_string(),
114            aliases: aliases.iter().map(|&s| s.to_string()).collect(),
115            frequency: 0,
116        };
117        for alias in &token.aliases {
118            self.pattern_map.insert(alias.clone(), id);
119        }
120        self.pattern_map.insert(canonical.to_string(), id);
121        self.tokens.insert(id, token);
122    }
123
124    pub fn get_token(&self, pattern: &str) -> Option<TokenId> {
125        self.pattern_map.get(pattern).copied()
126    }
127
128    pub fn get_semantic_token(&self, id: TokenId) -> Option<&SemanticToken> {
129        self.tokens.get(&id)
130    }
131
132    pub fn record_usage(&mut self, pattern: &str) {
133        *self
134            .pattern_frequency
135            .entry(pattern.to_string())
136            .or_insert(0) += 1;
137        if let Some(&token_id) = self.pattern_map.get(pattern) {
138            if let Some(token) = self.tokens.get_mut(&token_id) {
139                token.frequency += 1;
140            }
141        }
142    }
143
144    pub fn get_or_create_token(&mut self, pattern: &str, category: TokenCategory) -> TokenId {
145        if let Some(&token_id) = self.pattern_map.get(pattern) {
146            self.record_usage(pattern);
147            return token_id;
148        }
149        let frequency = self.pattern_frequency.get(pattern).copied().unwrap_or(0);
150        if frequency < 10 {
151            self.record_usage(pattern);
152            return 0;
153        }
154        let token_id = self.next_token_id;
155        self.next_token_id += 1;
156        let token = SemanticToken {
157            id: token_id,
158            category,
159            canonical: pattern.to_string(),
160            aliases: vec![],
161            frequency,
162        };
163        self.pattern_map.insert(pattern.to_string(), token_id);
164        self.tokens.insert(token_id, token);
165        token_id
166    }
167}
168
169impl TokenRegistry {
170    pub fn export_tokens(&self) -> Vec<(TokenId, String)> {
171        let mut tokens: Vec<_> = self
172            .tokens
173            .iter()
174            .map(|(&id, token)| (id, token.canonical.clone()))
175            .collect();
176        tokens.sort_by_key(|(id, _)| *id);
177        tokens
178    }
179
180    pub fn are_equivalent(&self, pattern1: &str, pattern2: &str) -> bool {
181        if pattern1 == pattern2 {
182            return true;
183        }
184        match (self.get_token(pattern1), self.get_token(pattern2)) {
185            (Some(id1), Some(id2)) => id1 == id2,
186            _ => false,
187        }
188    }
189
190    pub fn semantic_signature(&self, components: &[&str]) -> u64 {
191        use std::collections::hash_map::DefaultHasher;
192        use std::hash::{Hash, Hasher};
193        let mut hasher = DefaultHasher::new();
194        for component in components {
195            if let Some(token_id) = self.get_token(component) {
196                token_id.hash(&mut hasher);
197            } else {
198                component.hash(&mut hasher);
199            }
200        }
201        hasher.finish()
202    }
203}
204
205pub fn tokenize_path(path: &str) -> Vec<TokenId> {
206    let registry = TOKEN_REGISTRY.read().unwrap();
207    let mut tokens = Vec::new();
208    for component in path.split('/').filter(|s| !s.is_empty()) {
209        if let Some(token) = registry.get_token(component) {
210            tokens.push(token);
211        }
212    }
213    tokens
214}
215
216pub fn paths_equivalent(path1: &str, path2: &str) -> bool {
217    let registry = TOKEN_REGISTRY.read().unwrap();
218    let components1: Vec<_> = path1.split('/').filter(|s| !s.is_empty()).collect();
219    let components2: Vec<_> = path2.split('/').filter(|s| !s.is_empty()).collect();
220    if components1.len() != components2.len() {
221        return false;
222    }
223    for (i, (c1, c2)) in components1.iter().zip(components2.iter()).enumerate() {
224        if registry.are_equivalent(c1, c2) {
225            continue;
226        }
227        if i == components1.len() - 1 {
228            let (base1, ext1) = split_basename_ext(c1);
229            let (base2, ext2) = split_basename_ext(c2);
230            if base1 == base2 && registry.are_equivalent(ext1, ext2) {
231                continue;
232            }
233        }
234        return false;
235    }
236    true
237}
238
/// Splits a component at its last '.': the returned extension keeps the
/// leading dot ("main.rs" -> ("main", ".rs")); no dot yields ("name", "").
fn split_basename_ext(s: &str) -> (&str, &str) {
    s.rfind('.').map_or((s, ""), |idx| s.split_at(idx))
}