quantum_scanner.rs

// Quantum Scanner - The native tree walker that speaks in quantum format
// This is where the magic happens - no intermediate representation, just pure quantum output

#![allow(dead_code)] // Many constants and fields are reserved for future use

use anyhow::Result;
use std::collections::HashMap;
use std::fs;
use std::io::Write;
use std::path::Path;
use std::time::SystemTime;

// Token ID ranges: 0x0000-0x00FF is reserved for built-in tokens, 0x0100+ for dynamically assigned ones
const TOKEN_RESERVED_START: u16 = 0x0000;
const TOKEN_RESERVED_END: u16 = 0x00FF;
const TOKEN_USER_START: u16 = 0x0100;

// Pre-defined tokens for common filesystem terms
const TOKEN_DIR: u16 = 0x0001;
const TOKEN_FILE: u16 = 0x0002;
const TOKEN_LINK: u16 = 0x0003;
const TOKEN_PERM_755: u16 = 0x0010;
const TOKEN_PERM_644: u16 = 0x0011;
const TOKEN_PERM_777: u16 = 0x0012;
const TOKEN_PERM_600: u16 = 0x0013;

// Common extensions (0x20-0x7F)
const TOKEN_EXT_JS: u16 = 0x0020;
const TOKEN_EXT_RS: u16 = 0x0021;
const TOKEN_EXT_PY: u16 = 0x0022;
const TOKEN_EXT_GO: u16 = 0x0023;
const TOKEN_EXT_MD: u16 = 0x0024;
const TOKEN_EXT_JSON: u16 = 0x0025;
const TOKEN_EXT_YAML: u16 = 0x0026;
const TOKEN_EXT_TXT: u16 = 0x0027;

// Common directory names (0x80-0xFF)
const TOKEN_NODE_MODULES: u16 = 0x0080;
const TOKEN_GIT: u16 = 0x0081;
const TOKEN_SRC: u16 = 0x0082;
const TOKEN_TARGET: u16 = 0x0083;
const TOKEN_BUILD: u16 = 0x0084;
const TOKEN_DIST: u16 = 0x0085;
const TOKEN_DOCS: u16 = 0x0086;
const TOKEN_TESTS: u16 = 0x0087;

// Size tokens for common ranges
const TOKEN_SIZE_ZERO: u16 = 0x00A0;
const TOKEN_SIZE_TINY: u16 = 0x00A1; // 1 B - 1 KB
const TOKEN_SIZE_SMALL: u16 = 0x00A2; // 1 KB - 100 KB
const TOKEN_SIZE_MEDIUM: u16 = 0x00A3; // 100 KB - 10 MB
const TOKEN_SIZE_LARGE: u16 = 0x00A4; // 10 MB+

// ASCII control codes for tree traversal
const TRAVERSE_SAME: u8 = 0x0B; // Vertical Tab
const TRAVERSE_DEEPER: u8 = 0x0E; // Shift Out
const TRAVERSE_BACK: u8 = 0x0F; // Shift In
const TRAVERSE_SUMMARY: u8 = 0x0C; // Form Feed

// Header bit flags
const HDR_HAS_SIZE: u8 = 0b00000001;
const HDR_HAS_PERMS: u8 = 0b00000010;
const HDR_HAS_TIME: u8 = 0b00000100;
const HDR_HAS_OWNER: u8 = 0b00001000;
const HDR_IS_DIR: u8 = 0b00010000;
const HDR_IS_LINK: u8 = 0b00100000;
const HDR_HAS_XATTR: u8 = 0b01000000;
const HDR_TOKENIZED: u8 = 0b10000000;

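// Rough sketch of a single emitted entry, as produced by emit_file/emit_directory
// below (illustrative, not a formal spec): one header byte built from the flags
// above, then the size encoding (when HDR_HAS_SIZE is set), then a 2-byte
// permission XOR delta against the parent (when HDR_HAS_PERMS is set), then the
// name as raw bytes and/or a u16 little-endian token, followed by the traversal
// control codes that link entries into a tree.
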
pub struct QuantumScanner<W: Write> {
    writer: W,
    token_map: HashMap<String, u16>,
    #[allow(dead_code)]
    next_dynamic_token: u16,

    // Context for delta encoding
    parent_perms: u32,
    #[allow(dead_code)]
    parent_uid: u32,
    #[allow(dead_code)]
    parent_gid: u32,
    #[allow(dead_code)]
    parent_time: SystemTime,

    // Stats tracking
    total_files: u64,
    total_dirs: u64,
    total_size: u64,
}

impl<W: Write> QuantumScanner<W> {
    // Cross-platform permission handling
    #[cfg(unix)]
    fn get_permissions(metadata: &fs::Metadata) -> u32 {
        use std::os::unix::fs::PermissionsExt;
        metadata.permissions().mode() & 0o777
    }

    #[cfg(not(unix))]
    fn get_permissions(_metadata: &fs::Metadata) -> u32 {
        0o755 // Default permissions for non-Unix
    }

    pub fn new(writer: W) -> Self {
        let mut token_map = HashMap::new();

        // Initialize with predefined tokens
        token_map.insert("node_modules".to_string(), TOKEN_NODE_MODULES);
        token_map.insert(".git".to_string(), TOKEN_GIT);
        token_map.insert("src".to_string(), TOKEN_SRC);
        token_map.insert("target".to_string(), TOKEN_TARGET);
        token_map.insert("build".to_string(), TOKEN_BUILD);
        token_map.insert("dist".to_string(), TOKEN_DIST);
        token_map.insert("docs".to_string(), TOKEN_DOCS);
        token_map.insert("tests".to_string(), TOKEN_TESTS);

        // Extension tokens
        token_map.insert(".js".to_string(), TOKEN_EXT_JS);
        token_map.insert(".rs".to_string(), TOKEN_EXT_RS);
        token_map.insert(".py".to_string(), TOKEN_EXT_PY);
        token_map.insert(".go".to_string(), TOKEN_EXT_GO);
        token_map.insert(".md".to_string(), TOKEN_EXT_MD);
        token_map.insert(".json".to_string(), TOKEN_EXT_JSON);
        token_map.insert(".yaml".to_string(), TOKEN_EXT_YAML);
        token_map.insert(".txt".to_string(), TOKEN_EXT_TXT);

        Self {
            writer,
            token_map,
            next_dynamic_token: TOKEN_USER_START,
            parent_perms: 0o755,
            parent_uid: 1000,
            parent_gid: 1000,
            parent_time: SystemTime::UNIX_EPOCH,
            total_files: 0,
            total_dirs: 0,
            total_size: 0,
        }
    }

    /// Write the quantum format header
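    ///
    /// With the default token map the emitted text header looks roughly like
    /// this (token list truncated):
    ///
    /// ```text
    /// QUANTUM_NATIVE_V1:
    /// TOKENS:
    ///   0020=.js
    ///   0021=.rs
    ///   ...
    ///   0087=tests
    /// DATA:
    /// ```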
    pub fn write_header(&mut self) -> Result<()> {
        writeln!(self.writer, "QUANTUM_NATIVE_V1:")?;
        writeln!(self.writer, "TOKENS:")?;

        // Write token map in sorted order
        let mut tokens: Vec<_> = self.token_map.iter().collect();
        tokens.sort_by_key(|(_, &token)| token);

        for (name, token) in tokens {
            writeln!(self.writer, "  {:04X}={}", token, name)?;
        }

        writeln!(self.writer, "DATA:")?;
        Ok(())
    }

    /// Scan a path and emit quantum format directly
    pub fn scan(&mut self, path: &Path) -> Result<()> {
        self.write_header()?;
        self.scan_recursive(path, 0)?;
        self.write_summary()?;
        Ok(())
    }

    fn scan_recursive(&mut self, path: &Path, depth: usize) -> Result<()> {
        let metadata = fs::metadata(path)?;

        // Emit quantum entry
        if metadata.is_dir() {
            self.emit_directory(path, &metadata, depth)?;

            // Update parent context
            let old_perms = self.parent_perms;
            self.parent_perms = Self::get_permissions(&metadata);

            // Scan children
            let mut entries: Vec<_> = fs::read_dir(path)?.filter_map(|e| e.ok()).collect();

            // Sort for consistent output
            entries.sort_by_key(|e| e.file_name());

            for (i, entry) in entries.iter().enumerate() {
                let child_path = entry.path();
                self.scan_recursive(&child_path, depth + 1)?;

                // Separate sibling entries with the same-level traversal code
                if i < entries.len() - 1 {
                    self.writer.write_all(&[TRAVERSE_SAME])?;
                }
            }

            // Restore parent context
            self.parent_perms = old_perms;

            // Emit back traversal if not at root
            if depth > 0 {
                self.writer.write_all(&[TRAVERSE_BACK])?;
            }

            self.total_dirs += 1;
        } else {
            self.emit_file(path, &metadata)?;
            self.total_files += 1;
            self.total_size += metadata.len();
        }

        Ok(())
    }

    fn emit_directory(&mut self, path: &Path, metadata: &fs::Metadata, depth: usize) -> Result<()> {
        let mut header = HDR_IS_DIR;
        let mut data = Vec::new();

        // Size (for directories, this is the entry size)
        header |= HDR_HAS_SIZE;
        data.extend(&self.encode_size(metadata.len()));

        // Permissions if different
        let perms = Self::get_permissions(metadata);
        if perms != self.parent_perms {
            header |= HDR_HAS_PERMS;
            let delta = perms ^ self.parent_perms;
            data.push((delta >> 8) as u8);
            data.push(delta as u8);
        }

        // Emit header and data
        self.writer.write_all(&[header])?;
        self.writer.write_all(&data)?;

        // Emit name (tokenized if possible)
        self.emit_name(path)?;

        // Emit traversal code
        if depth == 0 {
            // Root directory
            self.writer.write_all(&[TRAVERSE_DEEPER])?;
        }

        Ok(())
    }

    fn emit_file(&mut self, path: &Path, metadata: &fs::Metadata) -> Result<()> {
        let mut header = 0u8;
        let mut data = Vec::new();

        // Size
        header |= HDR_HAS_SIZE;
        data.extend(&self.encode_size(metadata.len()));

        // Permissions if different
        let perms = Self::get_permissions(metadata);
        if perms != self.parent_perms {
            header |= HDR_HAS_PERMS;
            let delta = perms ^ self.parent_perms;
            data.push((delta >> 8) as u8);
            data.push(delta as u8);
        }

        // Emit header and data
        self.writer.write_all(&[header])?;
        self.writer.write_all(&data)?;

        // Emit name
        self.emit_name(path)?;

        Ok(())
    }

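    /// Emit an entry name, tokenized where possible.
    ///
    /// For example (given the built-in token map above): "src" is written as
    /// the two-byte token 0x0082 (little-endian), "main.rs" as the raw bytes
    /// `main` followed by the ".rs" token 0x0021, and a name with no matching
    /// token is written verbatim. The file name "main.rs" here is only an
    /// illustration.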
    fn emit_name(&mut self, path: &Path) -> Result<()> {
        let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");

        // Check for exact token match
        if let Some(&token) = self.token_map.get(name) {
            self.writer.write_all(&token.to_le_bytes())?;
            return Ok(());
        }

        // Check for extension token
        if let Some(dot_pos) = name.rfind('.') {
            let ext = &name[dot_pos..];
            if let Some(&token) = self.token_map.get(ext) {
                // Write base name + extension token
                self.writer.write_all(&name.as_bytes()[..dot_pos])?;
                self.writer.write_all(&token.to_le_bytes())?;
                return Ok(());
            }
        }

        // No token found - consider adding dynamically for frequently seen patterns
        // For now, just write the raw name
        self.writer.write_all(name.as_bytes())?;
        Ok(())
    }

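    /// Encode a file size into the compact quantum representation.
    ///
    /// A few illustrative encodings (multi-byte values little-endian):
    /// 0 bytes -> [A0, 00]; 200 bytes -> [A1, 00, C8] (tiny token + 1-byte size);
    /// 50_000 bytes -> [A2, 00, 30, 00] (small token + KB count as u16);
    /// anything above 100 KB -> [02] followed by the size as a u32.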
    fn encode_size(&self, size: u64) -> Vec<u8> {
        // Size-based tokenization
        match size {
            0 => vec![TOKEN_SIZE_ZERO as u8, (TOKEN_SIZE_ZERO >> 8) as u8],
            1..=1024 => vec![
                TOKEN_SIZE_TINY as u8,
                (TOKEN_SIZE_TINY >> 8) as u8,
                size as u8, // stored in one byte; values above 255 wrap
            ],
            1025..=102400 => {
                let kb = (size / 1024) as u16;
                vec![
                    TOKEN_SIZE_SMALL as u8,
                    (TOKEN_SIZE_SMALL >> 8) as u8,
                    kb as u8,
                    (kb >> 8) as u8,
                ]
            }
            _ => {
                // Larger sizes use a length-prefixed standard encoding. Since this
                // arm only sees sizes above 100 KB, only the u32 form (prefix 0x02)
                // is currently reachable, and sizes above u32::MAX are truncated.
                match size {
                    0..=255 => vec![0x00, size as u8],
                    256..=65535 => {
                        let bytes = (size as u16).to_le_bytes();
                        vec![0x01, bytes[0], bytes[1]]
                    }
                    _ => {
                        let bytes = (size as u32).to_le_bytes();
                        vec![0x02, bytes[0], bytes[1], bytes[2], bytes[3]]
                    }
                }
            }
        }
    }

    fn write_summary(&mut self) -> Result<()> {
        writeln!(self.writer, "\nSUMMARY:")?;
        writeln!(self.writer, "FILES: {}", self.total_files)?;
        writeln!(self.writer, "DIRS: {}", self.total_dirs)?;
        writeln!(self.writer, "SIZE: {}", self.total_size)?;
        Ok(())
    }
}

// Note: PermissionsExt is imported locally inside get_permissions() on Unix,
// so no module-level import is needed.
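
// Minimal usage sketch: exercises the public API (new + scan) against a small
// temporary directory and checks only the text framing of the output. The
// directory name used below is arbitrary.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn emits_header_and_summary() -> anyhow::Result<()> {
        // Scan a freshly created (possibly empty) directory so the walk is
        // fast and does not depend on the surrounding filesystem layout.
        let dir = std::env::temp_dir().join("quantum_scanner_smoke");
        std::fs::create_dir_all(&dir)?;

        // Vec<u8> implements Write, so it can serve as the output sink.
        let mut buf = Vec::new();
        QuantumScanner::new(&mut buf).scan(&dir)?;

        let text = String::from_utf8_lossy(&buf);
        assert!(text.starts_with("QUANTUM_NATIVE_V1:"));
        assert!(text.contains("SUMMARY:"));
        Ok(())
    }
}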