Skip to main content

cqlite_core/discovery/
scanner.rs

1//! Filesystem scanner for SSTable discovery
2//!
3//! This module provides functionality for scanning a Cassandra data directory
4//! and discovering SSTables, keyspaces, and tables.
5
6use std::path::{Path, PathBuf};
7
8use crate::error::{Error, Result};
9
/// Keyspace information
///
/// Describes one keyspace directory found under the data directory together
/// with the table directories discovered directly beneath it.
#[derive(Debug, Clone)]
pub struct KeyspaceInfo {
    /// Keyspace name (the file name of the keyspace directory)
    pub name: String,
    /// Tables in this keyspace, one entry per table subdirectory
    pub tables: Vec<TableInfo>,
}
18
/// Table information
///
/// Describes one table directory found inside a keyspace directory.
#[derive(Debug, Clone)]
pub struct TableInfo {
    /// Fully qualified table name (keyspace.table)
    pub qualified_name: String,
    /// Keyspace name
    pub keyspace: String,
    /// Table name, derived from the table directory name with its
    /// `-table_id` part removed
    pub name: String,
    /// SSTable count: the number of entries directly inside the table
    /// directory whose name ends with `-Data.db` or is exactly `Data.db`
    pub sstable_count: usize,
    /// Table directory path
    pub path: PathBuf,
}
33
/// Result of scanning a data directory
///
/// `keyspaces`, `tables` and `sstable_count` are flat summaries;
/// `keyspace_info` carries the same data grouped per keyspace.
#[derive(Debug, Clone)]
pub struct ScanResult {
    /// Keyspace names discovered (excluding system keyspaces).
    /// A keyspace is listed here even when no table directory was found in it.
    pub keyspaces: Vec<String>,
    /// Fully qualified table names discovered (excluding system tables)
    pub tables: Vec<String>,
    /// Total number of SSTables found
    pub sstable_count: usize,
    /// Detailed keyspace information; only keyspaces that contain at least
    /// one table directory get an entry
    pub keyspace_info: Vec<KeyspaceInfo>,
    /// Warnings about potential issues with the directory structure
    /// (empty when nothing suspicious was detected)
    pub warnings: Vec<String>,
}
48
/// Report whether a directory name ends in a Cassandra table-id suffix.
///
/// Cassandra names table directories `table_name-table_id`, where `table_id`
/// is the table UUID written as 32 hexadecimal characters without hyphens.
/// Only the text after the last `-` is inspected, and hex digits may be in
/// either case.
///
/// # Examples
/// - `simple_table-6aa08200a25111f0a3fef1a551383fb9` -> true
/// - `users-0123456789abcdef0123456789abcdef` -> true
/// - `test_basic` -> false (no hyphen, so no suffix at all)
/// - `my-table` -> false (suffix is not 32 characters long)
fn has_cassandra_table_uuid_suffix(dir_name: &str) -> bool {
    /// Length of a table id as it appears in a directory name.
    const TABLE_ID_HEX_LEN: usize = 32;

    match dir_name.rsplit_once('-') {
        Some((_, table_id)) => {
            table_id.len() == TABLE_ID_HEX_LEN
                && table_id.bytes().all(|b| b.is_ascii_hexdigit())
        }
        None => false,
    }
}
68
/// Scanner for discovering SSTables in a data directory
///
/// Holds the directory to scan plus an optional caller-supplied version.
pub struct Scanner {
    /// Root data directory; its immediate subdirectories are treated as keyspaces
    data_dir: PathBuf,
    /// Optional Cassandra version supplied by the caller; when present it is
    /// returned as-is by `resolve_version`
    version_hint: Option<String>,
}
74
75impl Scanner {
76    /// Create a new scanner for the given data directory
77    pub fn new(data_dir: &Path, version_hint: Option<String>) -> Self {
78        Self {
79            data_dir: data_dir.to_path_buf(),
80            version_hint,
81        }
82    }
83
84    /// Scan the data directory for SSTables
85    ///
86    /// This method scans the data directory structure and discovers:
87    /// - Keyspaces (excluding system keyspaces)
88    /// - Tables (excluding system tables)
89    /// - SSTable files (Data.db files)
90    ///
91    /// Cassandra data directory structure is:
92    /// data_dir/keyspace_name/table_name-table_id/sstable_files
93    pub fn scan(&self) -> Result<ScanResult> {
94        let mut keyspaces = Vec::new();
95        let mut tables = Vec::new();
96        let mut sstable_count = 0;
97        let mut keyspace_info = Vec::new();
98
99        // Read top-level directory entries (keyspaces)
100        let entries = std::fs::read_dir(&self.data_dir).map_err(|e| {
101            Error::Io(std::io::Error::new(
102                e.kind(),
103                format!(
104                    "Failed to read data directory {}: {}",
105                    self.data_dir.display(),
106                    e
107                ),
108            ))
109        })?;
110
111        for entry in entries.flatten() {
112            if !entry.path().is_dir() {
113                continue;
114            }
115
116            let keyspace_name = entry.file_name().to_string_lossy().to_string();
117
118            // Skip system keyspaces
119            if keyspace_name.starts_with("system") {
120                continue;
121            }
122
123            keyspaces.push(keyspace_name.clone());
124
125            // Scan tables in this keyspace
126            let mut keyspace_tables = Vec::new();
127            if let Ok(table_entries) = std::fs::read_dir(entry.path()) {
128                for table_entry in table_entries.flatten() {
129                    if !table_entry.path().is_dir() {
130                        continue;
131                    }
132
133                    let table_dir_name = table_entry.file_name().to_string_lossy().to_string();
134
135                    // Extract table name (format: table_name-table_id)
136                    let table_name = table_dir_name
137                        .split('-')
138                        .next()
139                        .unwrap_or(&table_dir_name)
140                        .to_string();
141
142                    let qualified_name = format!("{}.{}", keyspace_name, table_name);
143
144                    // Count SSTable files (Data.db files)
145                    let mut table_sstable_count = 0;
146                    if let Ok(sstable_files) = std::fs::read_dir(table_entry.path()) {
147                        for sstable_file in sstable_files.flatten() {
148                            let file_name = sstable_file.file_name().to_string_lossy().to_string();
149                            // Match both old and new SSTable naming conventions
150                            if file_name.ends_with("-Data.db") || file_name == "Data.db" {
151                                table_sstable_count += 1;
152                                sstable_count += 1;
153                            }
154                        }
155                    }
156
157                    tables.push(qualified_name.clone());
158                    keyspace_tables.push(TableInfo {
159                        qualified_name,
160                        keyspace: keyspace_name.clone(),
161                        name: table_name,
162                        sstable_count: table_sstable_count,
163                        path: table_entry.path(),
164                    });
165                }
166            }
167
168            if !keyspace_tables.is_empty() {
169                keyspace_info.push(KeyspaceInfo {
170                    name: keyspace_name,
171                    tables: keyspace_tables,
172                });
173            }
174        }
175
176        // Validate directory structure: check if table directories have expected UUID format
177        let mut warnings = Vec::new();
178        if !tables.is_empty() {
179            let valid_table_dir_count = keyspace_info
180                .iter()
181                .flat_map(|k| &k.tables)
182                .filter(|t| {
183                    t.path
184                        .file_name()
185                        .map(|n| has_cassandra_table_uuid_suffix(&n.to_string_lossy()))
186                        .unwrap_or(false)
187                })
188                .count();
189
190            if valid_table_dir_count == 0 {
191                warnings.push(format!(
192                    "Warning: No table directories with expected 'name-uuid' format found.\n\
193                     The --data-dir may be pointing to the wrong directory level.\n\
194                     Current path: {}\n\
195                     Expected structure: <data-dir>/<keyspace>/<table>-<uuid>/\n\
196                     Hint: Try using a subdirectory like: {}/sstables or {}/data",
197                    self.data_dir.display(),
198                    self.data_dir.display(),
199                    self.data_dir.display()
200                ));
201            }
202        }
203
204        Ok(ScanResult {
205            keyspaces,
206            tables,
207            sstable_count,
208            keyspace_info,
209            warnings,
210        })
211    }
212
213    /// Resolve Cassandra version using precedence:
214    /// 1. version_hint (if provided)
215    /// 2. SSTable metadata (from Data.db headers)
216    /// 3. metadata.yml (cluster metadata)
217    /// 4. "unknown" (fallback)
218    pub fn resolve_version(&self, _scan_result: &ScanResult) -> Result<Option<String>> {
219        // Precedence 1: Use version hint if provided
220        if let Some(hint) = &self.version_hint {
221            return Ok(Some(hint.clone()));
222        }
223
224        // Precedence 2: Try to read version from SSTable metadata
225        // TODO: Implement SSTable header version detection
226        // This would require reading the first few bytes of a Data.db file
227
228        // Precedence 3: Try to read metadata.yml
229        let metadata_path = self.data_dir.join("metadata.yml");
230        if metadata_path.exists() {
231            if let Ok(content) = std::fs::read_to_string(&metadata_path) {
232                // Parse YAML for version field (simple string search, not full YAML parsing)
233                for line in content.lines() {
234                    if line.trim().starts_with("version:") {
235                        let version = line
236                            .trim()
237                            .strip_prefix("version:")
238                            .unwrap_or("")
239                            .trim()
240                            .trim_matches('"')
241                            .trim_matches('\'')
242                            .to_string();
243                        if !version.is_empty() {
244                            return Ok(Some(version));
245                        }
246                    }
247                }
248            }
249        }
250
251        // Precedence 4: Unknown
252        Ok(Some("unknown".to_string()))
253    }
254}
255
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Well-formed 32-character hexadecimal table ids for fixture directories.
    const TABLE_IDS: [&str; 3] = [
        "6aa08200a25111f0a3fef1a551383fb9",
        "7bb09311b36222f1b4fef2b662494fc0",
        "8cc0a422c47333f2c5fef3c773505fd1",
    ];

    /// Create the directory `parent/name` and hand back its path.
    fn make_dir(parent: &Path, name: &str) -> PathBuf {
        let dir = parent.join(name);
        fs::create_dir(&dir).unwrap();
        dir
    }

    /// Write a placeholder file standing in for an SSTable component.
    fn write_file(dir: &Path, file_name: &str, contents: &[u8]) {
        fs::write(dir.join(file_name), contents).unwrap();
    }

    /// Run a scan without a version hint and unwrap the result.
    fn scan_dir(root: &Path) -> ScanResult {
        Scanner::new(root, None).scan().unwrap()
    }

    #[test]
    fn test_scanner_empty_directory() {
        let root = TempDir::new().unwrap();
        let scan = scan_dir(root.path());

        assert_eq!(scan.sstable_count, 0);
        assert!(scan.keyspaces.is_empty());
        assert!(scan.tables.is_empty());
        assert!(scan.keyspace_info.is_empty());
    }

    #[test]
    fn test_scanner_with_structure() {
        let root = TempDir::new().unwrap();

        // One keyspace holding one properly named table directory with two SSTables
        let keyspace_dir = make_dir(root.path(), "test_ks");
        let table_dir = make_dir(&keyspace_dir, "users-6aa08200a25111f0a3fef1a551383fb9");
        for file_name in ["na-1-big-Data.db", "na-2-big-Data.db"].iter() {
            write_file(&table_dir, file_name, b"mock data");
        }

        let scan = scan_dir(root.path());

        assert_eq!(scan.sstable_count, 2);
        assert_eq!(scan.keyspaces.len(), 1);
        assert!(scan.keyspaces.iter().any(|k| k == "test_ks"));
        assert_eq!(scan.tables.len(), 1);
        assert!(scan.tables.iter().any(|t| t.starts_with("test_ks.users")));
        assert_eq!(scan.keyspace_info.len(), 1);

        let info = &scan.keyspace_info[0];
        assert_eq!(info.name, "test_ks");
        assert_eq!(info.tables.len(), 1);
        assert_eq!(info.tables[0].sstable_count, 2);

        // A well-formed layout must not produce warnings
        assert!(scan.warnings.is_empty());
    }

    #[test]
    fn test_scanner_skips_system_keyspaces() {
        let root = TempDir::new().unwrap();

        // System keyspace that the scanner is expected to ignore
        let system_dir = make_dir(root.path(), "system");
        let system_table_dir = make_dir(&system_dir, "local-6aa08200a25111f0a3fef1a551383fb9");
        write_file(&system_table_dir, "Data.db", b"mock");

        // Ordinary user keyspace with a properly named table directory
        let user_dir = make_dir(root.path(), "user_ks");
        let user_table_dir = make_dir(&user_dir, "table-7bb09311b36222f1b4fef2b662494fc0");
        write_file(&user_table_dir, "na-1-big-Data.db", b"mock");

        let scan = scan_dir(root.path());

        // Only the user keyspace and its single SSTable are reported
        assert_eq!(scan.keyspaces.len(), 1);
        assert!(scan.keyspaces.iter().any(|k| k == "user_ks"));
        assert!(scan.keyspaces.iter().all(|k| !k.starts_with("system")));
        assert_eq!(scan.sstable_count, 1);
        assert!(scan.warnings.is_empty());
    }

    #[test]
    fn test_resolve_version_with_hint() {
        let root = TempDir::new().unwrap();
        let scanner = Scanner::new(root.path(), Some("5.0".to_string()));
        let scan = scanner.scan().unwrap();

        let resolved = scanner.resolve_version(&scan).unwrap();
        assert_eq!(resolved, Some("5.0".to_string()));
    }

    #[test]
    fn test_resolve_version_from_metadata_yml() {
        let root = TempDir::new().unwrap();
        write_file(
            root.path(),
            "metadata.yml",
            "version: 5.0.1\nother: field\n".as_bytes(),
        );

        let scanner = Scanner::new(root.path(), None);
        let scan = scanner.scan().unwrap();

        let resolved = scanner.resolve_version(&scan).unwrap();
        assert_eq!(resolved, Some("5.0.1".to_string()));
    }

    #[test]
    fn test_resolve_version_unknown() {
        let root = TempDir::new().unwrap();
        let scanner = Scanner::new(root.path(), None);
        let scan = scanner.scan().unwrap();

        let resolved = scanner.resolve_version(&scan).unwrap();
        assert_eq!(resolved, Some("unknown".to_string()));
    }

    #[test]
    fn test_scanner_multiple_keyspaces() {
        let root = TempDir::new().unwrap();

        // Three keyspaces, each with one table directory using its own table id
        let keyspace_names = ["keyspace1", "keyspace2", "keyspace3"];
        for (ks_name, table_id) in keyspace_names.iter().zip(TABLE_IDS.iter()) {
            let ks_dir = make_dir(root.path(), ks_name);
            let table_dir = make_dir(&ks_dir, &format!("{}_table-{}", ks_name, table_id));
            write_file(&table_dir, "na-1-big-Data.db", b"mock");
        }

        let scan = scan_dir(root.path());

        assert_eq!(scan.keyspaces.len(), 3);
        assert_eq!(scan.tables.len(), 3);
        assert_eq!(scan.sstable_count, 3);
        assert!(scan.warnings.is_empty());
    }

    #[test]
    fn test_scanner_warns_on_invalid_table_directory_format() {
        let root = TempDir::new().unwrap();

        // Simulate pointing one level too high: the scanner sees `sstables` as a
        // keyspace and the directories below it as tables without a UUID suffix.
        let sstables_dir = make_dir(root.path(), "sstables");
        for dir_name in ["test_basic", "test_collections"].iter() {
            let dir = make_dir(&sstables_dir, dir_name);
            write_file(&dir, "na-1-big-Data.db", b"mock");
        }

        let scan = scan_dir(root.path());

        // Tables are still reported even though the layout is suspicious...
        assert!(!scan.tables.is_empty());
        // ...but the result carries a warning describing the problem
        assert!(!scan.warnings.is_empty());
        let first_warning = &scan.warnings[0];
        assert!(first_warning.contains("name-uuid"));
        assert!(first_warning.contains("wrong directory level"));
    }

    #[test]
    fn test_scanner_invalid_directory() {
        let outcome = Scanner::new(Path::new("/nonexistent/path"), None).scan();

        assert!(outcome.is_err());
        match outcome {
            Err(Error::Io(io_err)) => {
                assert_eq!(io_err.kind(), std::io::ErrorKind::NotFound)
            }
            _ => panic!("Expected Io error"),
        }
    }

    #[test]
    fn test_has_cassandra_table_uuid_suffix() {
        // 32 hex characters after the hyphen (upper case allowed)
        let accepted = [
            "simple_table-6aa08200a25111f0a3fef1a551383fb9",
            "users-0123456789abcdef0123456789abcdef",
            "my_table-ABCDEF0123456789ABCDEF0123456789",
        ];
        for name in accepted.iter() {
            assert!(has_cassandra_table_uuid_suffix(name));
        }

        let rejected = [
            // no hyphen
            "test_basic",
            "users",
            // suffix too short
            "users-abc123",
            "table-456",
            // suffix too long
            "table-6aa08200a25111f0a3fef1a551383fb9extra",
            // suffix contains non-hex characters ('g' and 'z')
            "table-6aa08200a25111f0a3fef1a551383fgz",
        ];
        for name in rejected.iter() {
            assert!(!has_cassandra_table_uuid_suffix(name));
        }
    }
}