1use once_cell::sync::Lazy;
2use std::collections::HashMap;
3use std::sync::RwLock;
4
/// Process-wide token registry, initialized on first access.
///
/// Readers take the `RwLock` in shared mode (see `tokenize_path` /
/// `paths_equivalent`); mutation (e.g. `record_usage`,
/// `get_or_create_token`) requires the write lock.
pub static TOKEN_REGISTRY: Lazy<RwLock<TokenRegistry>> =
    Lazy::new(|| RwLock::new(TokenRegistry::new()));
7
8pub type TokenId = u16;
9
/// Broad classification of what aspect of a filesystem entry a
/// `SemanticToken` pattern describes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    FileType,   // file-format patterns keyed by extension (e.g. ".rs", ".py")
    Permission,
    Size,
    Time,
    Path,       // path-component patterns (e.g. "src", ".git", "docs")
    Owner,
    Content,
    Semantic,   // higher-level markers (e.g. "pkg.manifest", "pkg.lock")
}
21
/// A registered pattern: a compact id, its canonical name, and the
/// alternative spellings that resolve to the same id.
#[derive(Debug, Clone)]
pub struct SemanticToken {
    pub id: TokenId,             // unique id; also the key in the registry's token map
    pub category: TokenCategory,
    pub canonical: String,       // primary name, e.g. "code.rust"
    pub aliases: Vec<String>,    // alternate patterns mapping to this id, e.g. ".rs"
    pub frequency: u64,          // usage counter maintained by record_usage
}
30
/// Bidirectional token store: id -> full token record and pattern string
/// (canonical name or alias) -> id, plus per-pattern usage statistics used
/// to decide when a pattern earns its own dynamic token.
pub struct TokenRegistry {
    tokens: HashMap<TokenId, SemanticToken>,  // id -> token record
    pattern_map: HashMap<String, TokenId>,    // canonical name or alias -> id
    next_token_id: TokenId,                   // next dynamic id to hand out (starts at 0x0100)
    pattern_frequency: HashMap<String, u64>,  // usage counts for all patterns, tokenized or not
}
37
impl Default for TokenRegistry {
    /// Same as `TokenRegistry::new`: a registry pre-loaded with the common
    /// static tokens.
    fn default() -> Self {
        Self::new()
    }
}
43
44impl TokenRegistry {
45 pub fn new() -> Self {
46 let mut registry = Self {
47 tokens: HashMap::new(),
48 pattern_map: HashMap::new(),
49 next_token_id: 0x0100,
50 pattern_frequency: HashMap::new(),
51 };
52 registry.init_common_tokens();
53 registry
54 }
55
56 fn init_common_tokens(&mut self) {
57 self.add_token(0x0021, TokenCategory::FileType, "code.rust", vec![".rs"]);
58 self.add_token(
59 0x0022,
60 TokenCategory::FileType,
61 "code.python",
62 vec![".py", ".pyw", ".pyi"],
63 );
64 self.add_token(
65 0x0024,
66 TokenCategory::FileType,
67 "doc.markdown",
68 vec![".md", ".markdown", ".mdown"],
69 );
70 self.add_token(0x0081, TokenCategory::Path, "vcs.git", vec![".git"]);
71 self.add_token(
72 0x0082,
73 TokenCategory::Path,
74 "dir.source",
75 vec!["src", "source", "sources"],
76 );
77 self.add_token(
78 0x0083,
79 TokenCategory::Path,
80 "dir.build.rust",
81 vec!["target"],
82 );
83 self.add_token(
84 0x0086,
85 TokenCategory::Path,
86 "dir.docs",
87 vec!["docs", "doc", "documentation"],
88 );
89 self.add_token(
90 0x00B0,
91 TokenCategory::Semantic,
92 "pkg.manifest",
93 vec!["package.json", "Cargo.toml", "go.mod"],
94 );
95 self.add_token(
96 0x00B1,
97 TokenCategory::Semantic,
98 "pkg.lock",
99 vec!["package-lock.json", "Cargo.lock", "go.sum", "yarn.lock"],
100 );
101 }
102
103 fn add_token(
104 &mut self,
105 id: TokenId,
106 category: TokenCategory,
107 canonical: &str,
108 aliases: Vec<&str>,
109 ) {
110 let token = SemanticToken {
111 id,
112 category,
113 canonical: canonical.to_string(),
114 aliases: aliases.iter().map(|&s| s.to_string()).collect(),
115 frequency: 0,
116 };
117 for alias in &token.aliases {
118 self.pattern_map.insert(alias.clone(), id);
119 }
120 self.pattern_map.insert(canonical.to_string(), id);
121 self.tokens.insert(id, token);
122 }
123
124 pub fn get_token(&self, pattern: &str) -> Option<TokenId> {
125 self.pattern_map.get(pattern).copied()
126 }
127
128 pub fn get_semantic_token(&self, id: TokenId) -> Option<&SemanticToken> {
129 self.tokens.get(&id)
130 }
131
132 pub fn record_usage(&mut self, pattern: &str) {
133 *self
134 .pattern_frequency
135 .entry(pattern.to_string())
136 .or_insert(0) += 1;
137 if let Some(&token_id) = self.pattern_map.get(pattern) {
138 if let Some(token) = self.tokens.get_mut(&token_id) {
139 token.frequency += 1;
140 }
141 }
142 }
143
144 pub fn get_or_create_token(&mut self, pattern: &str, category: TokenCategory) -> TokenId {
145 if let Some(&token_id) = self.pattern_map.get(pattern) {
146 self.record_usage(pattern);
147 return token_id;
148 }
149 let frequency = self.pattern_frequency.get(pattern).copied().unwrap_or(0);
150 if frequency < 10 {
151 self.record_usage(pattern);
152 return 0;
153 }
154 let token_id = self.next_token_id;
155 self.next_token_id += 1;
156 let token = SemanticToken {
157 id: token_id,
158 category,
159 canonical: pattern.to_string(),
160 aliases: vec![],
161 frequency,
162 };
163 self.pattern_map.insert(pattern.to_string(), token_id);
164 self.tokens.insert(token_id, token);
165 token_id
166 }
167}
168
169impl TokenRegistry {
170 pub fn export_tokens(&self) -> Vec<(TokenId, String)> {
171 let mut tokens: Vec<_> = self
172 .tokens
173 .iter()
174 .map(|(&id, token)| (id, token.canonical.clone()))
175 .collect();
176 tokens.sort_by_key(|(id, _)| *id);
177 tokens
178 }
179
180 pub fn are_equivalent(&self, pattern1: &str, pattern2: &str) -> bool {
181 if pattern1 == pattern2 {
182 return true;
183 }
184 match (self.get_token(pattern1), self.get_token(pattern2)) {
185 (Some(id1), Some(id2)) => id1 == id2,
186 _ => false,
187 }
188 }
189
190 pub fn semantic_signature(&self, components: &[&str]) -> u64 {
191 use std::collections::hash_map::DefaultHasher;
192 use std::hash::{Hash, Hasher};
193 let mut hasher = DefaultHasher::new();
194 for component in components {
195 if let Some(token_id) = self.get_token(component) {
196 token_id.hash(&mut hasher);
197 } else {
198 component.hash(&mut hasher);
199 }
200 }
201 hasher.finish()
202 }
203}
204
205pub fn tokenize_path(path: &str) -> Vec<TokenId> {
206 let registry = TOKEN_REGISTRY.read().unwrap();
207 let mut tokens = Vec::new();
208 for component in path.split('/').filter(|s| !s.is_empty()) {
209 if let Some(token) = registry.get_token(component) {
210 tokens.push(token);
211 }
212 }
213 tokens
214}
215
216pub fn paths_equivalent(path1: &str, path2: &str) -> bool {
217 let registry = TOKEN_REGISTRY.read().unwrap();
218 let components1: Vec<_> = path1.split('/').filter(|s| !s.is_empty()).collect();
219 let components2: Vec<_> = path2.split('/').filter(|s| !s.is_empty()).collect();
220 if components1.len() != components2.len() {
221 return false;
222 }
223 for (i, (c1, c2)) in components1.iter().zip(components2.iter()).enumerate() {
224 if registry.are_equivalent(c1, c2) {
225 continue;
226 }
227 if i == components1.len() - 1 {
228 let (base1, ext1) = split_basename_ext(c1);
229 let (base2, ext2) = split_basename_ext(c2);
230 if base1 == base2 && registry.are_equivalent(ext1, ext2) {
231 continue;
232 }
233 }
234 return false;
235 }
236 true
237}
238
/// Splits a path component at the last `.`, returning `(basename, ext)`
/// with the dot kept in the extension (e.g. `"main.rs"` -> `("main", ".rs")`).
/// Components without a dot yield an empty extension.
fn split_basename_ext(s: &str) -> (&str, &str) {
    s.rfind('.').map_or((s, ""), |dot| s.split_at(dot))
}