cqlite-core 0.11.0

Core engine for CQLite — read Apache Cassandra 5.0 SSTables locally without a cluster
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
//! Execution Path Parity Tests for Issue #253
//!
//! This test suite checks that both the LEGACY and ADVANCED execution paths
//! run and return well-formed query results, while documenting the known
//! divergence in their key generation strategies. Exact result-set parity is
//! intentionally not asserted.
//!
//! **Background**: CQLite has two execution paths:
//! - **LEGACY path** (`executor.rs`): Uses `format!("user_key_{}", id)` for simple point lookups
//! - **ADVANCED path** (`select_executor.rs`): Uses schema-aware key decoding based on CQL types
//!
//! **Routing Logic** (`engine.rs:132-142`):
//! - Simple "WHERE id = <value>" queries with ≤8 tokens → LEGACY path
//! - All other SELECT queries → ADVANCED path
//!
//! **Key Generation Divergence** (documented; the LEGACY synthetic keys are a known defect — see `test_key_generation_divergence_documented`):
//! - LEGACY: Generates keys as `format!("user_key_{}", id)` (text-based)
//! - ADVANCED: Decodes partition keys from RowKey bytes using CQL type system
//!
//! **Requirements**:
//! - CQLITE_DATASETS_ROOT environment variable pointing to test-data/datasets
//! - test_basic dataset with simple_table SSTable files
//! - basic-types.cql schema file
//!
//! **Coverage**:
//! - Key generation strategy documentation
//! - Routing logic validation
//! - Execution path consistency checks

#![cfg(all(feature = "state_machine", feature = "cli-helpers"))]

use std::path::{Path, PathBuf};

use cqlite_core::ingestion::{ingest, IngestionConfig};
use cqlite_core::Database;

// Test constants
// Fully qualified `keyspace.table` name interpolated into every query in this suite.
const TEST_QUALIFIED_TABLE: &str = "test_basic.simple_table";
// Handed to ingestion as `table_directory_filter`; presumably limits loading to
// table directories under the `test_basic` keyspace — confirm against `IngestionConfig`.
const KEYSPACE_FILTER: &str = "/test_basic/";

/// Resolve the datasets root from the `CQLITE_DATASETS_ROOT` environment variable.
///
/// Returns `None` when the variable is unset or names a path that does not
/// exist; there is no built-in default location. The value is read with
/// `var_os` so that a valid path which is not UTF-8 is still honoured instead
/// of being silently treated as "unset".
fn get_datasets_root() -> Option<PathBuf> {
    std::env::var_os("CQLITE_DATASETS_ROOT")
        .map(PathBuf::from)
        .filter(|root| root.exists())
}

/// Locate the `schemas` directory that accompanies the test datasets.
///
/// Candidates, in priority order:
/// 1. the `schemas` sibling of the datasets root (datasets live in
///    `test-data/datasets`, schemas in `test-data/schemas`);
/// 2. `<parent of CARGO_MANIFEST_DIR>/test-data/schemas`.
///
/// Returns the first candidate that exists on disk, or `None` if neither does.
/// A datasets root without a parent component does not abort the lookup: the
/// manifest-relative fallback is still tried.
fn get_schemas_dir() -> Option<PathBuf> {
    let beside_datasets = get_datasets_root()
        .and_then(|root| root.parent().map(|parent| parent.join("schemas")))
        .filter(|dir| dir.exists());

    // Only consult the compile-time manifest location when the env-derived
    // candidate is missing.
    beside_datasets.or_else(|| {
        Path::new(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .map(|crate_parent| crate_parent.join("test-data").join("schemas"))
            .filter(|dir| dir.exists())
    })
}

/// Build a [`Database`] by ingesting the on-disk SSTables for this suite.
///
/// Any unmet precondition (missing datasets root, schemas directory, schema
/// file or `sstables` directory), an ingestion failure, or an ingestion that
/// loaded zero schemas is reported as `Err(reason)`; callers use that reason
/// to skip the test rather than fail it.
async fn setup_test_database() -> Result<Database, String> {
    let datasets_root = match get_datasets_root() {
        Some(root) => root,
        None => return Err("CQLITE_DATASETS_ROOT not set or path doesn't exist".to_string()),
    };
    let schemas_dir = match get_schemas_dir() {
        Some(dir) => dir,
        None => return Err("schemas directory not found".to_string()),
    };

    let schema_file = schemas_dir.join("basic-types.cql");
    if !schema_file.exists() {
        return Err(format!(
            "basic-types.cql schema not found at {:?}",
            schema_file
        ));
    }

    let sstables_dir = datasets_root.join("sstables");
    if !sstables_dir.exists() {
        return Err(format!(
            "sstables directory not found at {:?}",
            sstables_dir
        ));
    }

    let config = IngestionConfig {
        schema_paths: vec![schema_file],
        data_dir: sstables_dir,
        version_hint: None,
        core_config: cqlite_core::Config::default(),
        table_directory_filter: Some(KEYSPACE_FILTER.to_string()),
    };

    match ingest(config).await {
        Err(e) => Err(format!("ingestion failed: {}", e)),
        // An ingestion that loaded no schema cannot serve the queries below.
        Ok(outcome) if outcome.schema_load_result.schemas_loaded == 0 => {
            Err("No schemas loaded during ingestion".to_string())
        }
        Ok(outcome) => Ok(outcome.database),
    }
}

#[tokio::test]
async fn test_simple_point_lookup_routing_to_legacy() {
    //! Exercises the LEGACY executor with a minimal point lookup.
    //!
    //! **Approach**:
    //! - Build `SELECT * FROM <table> WHERE id = 1` and assert it is at most
    //!   8 whitespace-separated tokens.
    //! - A query of that shape is expected to be routed to the LEGACY path
    //!   (`executor.rs`).
    //! - The outcome is only reported: `Ok` and `Err` are both printed, and an
    //!   error does not fail the test.

    let db = match setup_test_database().await {
        Ok(db) => db,
        Err(reason) => {
            eprintln!("Skipping test: {}", reason);
            return;
        }
    };

    // Point lookup that satisfies the LEGACY routing condition (≤8 tokens).
    let sql = format!("SELECT * FROM {} WHERE id = 1", TEST_QUALIFIED_TABLE);
    let token_count = sql.split_whitespace().count();
    assert!(
        token_count <= 8,
        "Query should be ≤8 tokens to route to LEGACY path, got {}",
        token_count
    );

    match db.execute(&sql).await {
        Ok(outcome) => println!(
            "Issue #253: Simple point lookup routed correctly, returned {} rows",
            outcome.rows.len()
        ),
        // Tolerated: the lookup may legitimately fail when there is no data
        // for id = 1, so the error is surfaced without failing the test.
        Err(err) => println!(
            "Issue #253: Query returned error (may be expected if no data): {}",
            err
        ),
    }
}

#[tokio::test]
async fn test_complex_query_routing_to_advanced() {
    //! Verify that complex queries route to the ADVANCED executor.
    //!
    //! **Test Strategy**:
    //! - Query: a SELECT with several columns, a non-`id` predicate and a LIMIT
    //! - Expected: routes to the ADVANCED path (`select_executor.rs`)
    //! - Validation: the query must succeed, and every returned row must carry
    //!   at least one value
    //!
    //! The routing precondition asserted here is the absence of `WHERE id =`;
    //! the token count is deliberately not checked because that absence alone
    //! selects the ADVANCED path.

    let db = match setup_test_database().await {
        Ok(db) => db,
        Err(e) => {
            eprintln!("Skipping test: {}", e);
            return;
        }
    };

    // Complex query: should route to the ADVANCED executor.
    let query = format!(
        "SELECT id, name, age FROM {} WHERE age > 25 LIMIT 10",
        TEST_QUALIFIED_TABLE
    );

    // Note: even if ≤8 tokens, absence of "WHERE id =" ensures ADVANCED path.
    assert!(
        !query.contains("WHERE id ="),
        "Query should not contain 'WHERE id =' to ensure ADVANCED path routing"
    );

    match db.execute(&query).await {
        Ok(query_result) => {
            println!(
                "Issue #253: Complex query routed to ADVANCED path, returned {} rows",
                query_result.rows.len()
            );

            // Verify we got column data (schema-aware decoding).
            for row in &query_result.rows {
                assert!(
                    !row.values.is_empty(),
                    "ADVANCED path should use schema-aware decoding, rows should have values"
                );
            }
        }
        Err(e) => {
            panic!("Issue #253: ADVANCED path query failed: {}", e);
        }
    }
}

#[tokio::test]
async fn test_key_generation_divergence_documented() {
    //! Pins, at source-text level, how each execution path produces keys.
    //!
    //! ## Root Cause (Issue #253)
    //!
    //! The two paths were built for different jobs.
    //!
    //! ### LEGACY (`executor.rs:794-805`)
    //! - **Role**: synthetic INSERT/SELECT testing against in-memory storage
    //! - **Keys**: text built with `format!("user_key_{}", id)`
    //! - **Limit**: only handles a column named "id" of Integer type
    //! - **Issue**: conflicts with the No-Heuristics Mandate (Issue #28)
    //!
    //! ### ADVANCED (`select_executor.rs` → `storage::partition_key_codec`)
    //! - **Role**: reading partition keys from real Cassandra SSTables
    //! - **Keys**: schema-aware binary decoding through the canonical
    //!   `storage::partition_key_codec::decode_partition_key_columns()`.
    //!   `select_executor::build_row_from_scan` delegates to it and the write
    //!   engine's `PartitionKey::from_bytes` shares it. Before Issue #586 the
    //!   logic sat inline in `select_executor.rs` as
    //!   `decode_partition_key_value()` and mishandled single-component TEXT keys.
    //! - **Types**: uuid, timeuuid, text, int, bigint, counter, blob, date, …
    //! - **Correct for**: real SSTable data
    //!
    //! ## Origin of the 8-Token Rule
    //!
    //! `SELECT * FROM ks.table WHERE id = 1` splits into exactly 8
    //! whitespace-separated tokens. The router sends queries of at most 8 tokens
    //! that contain "WHERE id =" to the LEGACY path so synthetic INSERT testing
    //! keeps working. That is a workaround, not a feature.
    //!
    //! ## Which Path Is Right
    //!
    //! For SSTable reading the ADVANCED path is correct: keys produced by the
    //! LEGACY INSERT feature will never match real Cassandra partition keys.

    // The probes below are deliberately shallow text searches. They pin the
    // *architecture* behind Issue #253 rather than the exact file a function
    // lives in, so a legitimate refactor (such as Issue #586 moving
    // partition-key decoding into the shared `partition_key_codec` module) is
    // not reported as a regression.
    let crate_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
    let read = |rel: &str| match std::fs::read_to_string(crate_root.join(rel)) {
        Ok(text) => text,
        Err(e) => panic!("read {rel}: {e}"),
    };

    // LEGACY: the synthetic `user_key_{id}` generation is still in executor.rs.
    let legacy_pattern = "user_key_";
    let has_legacy_keys = read("src/query/executor.rs").contains(legacy_pattern);
    assert!(
        has_legacy_keys,
        "LEGACY path should contain '{legacy_pattern}' pattern in executor.rs",
    );

    // ADVANCED: the canonical decoder is `decode_partition_key_columns` in
    // `partition_key_codec`, and `select_executor` must delegate to it
    // (Issue #586). Both halves of that contract are checked, instead of
    // searching for the retired inline `decode_partition_key_value`.
    let codec_defines_decoder =
        read("src/storage/partition_key_codec.rs").contains("fn decode_partition_key_columns");
    assert!(
        codec_defines_decoder,
        "ADVANCED path: canonical decoder 'decode_partition_key_columns' should live in partition_key_codec.rs",
    );
    let executor_delegates = read("src/query/select_executor.rs")
        .contains("partition_key_codec::decode_partition_key_columns");
    assert!(
        executor_delegates,
        "ADVANCED path: select_executor.rs should delegate partition-key decoding to partition_key_codec",
    );

    // The routing workaround itself must still be present in engine.rs.
    let engine_src = read("src/query/engine.rs");
    let has_routing_hack = ["WHERE id =", "count() <= 8"]
        .iter()
        .all(|needle| engine_src.contains(*needle));
    assert!(
        has_routing_hack,
        "Routing hack should exist in engine.rs (WHERE id = with 8-token check)"
    );

    println!("Issue #253 ROOT CAUSE VERIFIED:");
    println!("  LEGACY:   executor.rs contains 'user_key_' synthetic key pattern");
    println!("  ADVANCED: select_executor.rs delegates to partition_key_codec::decode_partition_key_columns (Issue #586)");
    println!("  HACK:     engine.rs contains 8-token routing workaround");
    println!();
    println!("  This IS a bug - LEGACY key generation violates No-Heuristics Mandate.");
    println!("  The routing hack exists to maintain compatibility with broken INSERT feature.");
}

#[tokio::test]
async fn test_routing_logic_token_count_boundary() {
    //! Probes the router on both sides of the 8-token threshold.
    //!
    //! **Routing Decision** (`engine.rs:132-142`):
    //! ```rust,ignore
    //! if sql.contains("WHERE id =") && sql.split_whitespace().count() <= 8 {
    //!     // Fall through to LEGACY executor
    //! } else {
    //!     return self.execute_select_query(sql, start_time).await; // ADVANCED
    //! }
    //! ```
    //!
    //! Three queries are issued. The first two only have their outcome printed;
    //! the third must succeed and return at least one row.

    let db = match setup_test_database().await {
        Ok(db) => db,
        Err(reason) => {
            eprintln!("Skipping test: {}", reason);
            return;
        }
    };

    // Case 1: exactly 8 tokens and "WHERE id =" -> LEGACY expected.
    let at_threshold = format!("SELECT * FROM {} WHERE id = 1", TEST_QUALIFIED_TABLE);
    assert_eq!(
        at_threshold.split_whitespace().count(),
        8,
        "Query should have exactly 8 tokens"
    );
    assert!(
        at_threshold.contains("WHERE id ="),
        "Query should contain 'WHERE id ='"
    );
    let at_threshold_outcome = db.execute(&at_threshold).await;
    println!(
        "Issue #253: 8-token query result: {:?}",
        at_threshold_outcome.as_ref().map(|r| r.rows.len())
    );

    // Case 2: "WHERE id =" but more than 8 tokens -> ADVANCED expected.
    let over_threshold = format!(
        "SELECT id, name, age FROM {} WHERE id = 1",
        TEST_QUALIFIED_TABLE
    );
    let over_threshold_tokens = over_threshold.split_whitespace().count();
    assert!(
        over_threshold_tokens > 8,
        "Query should have more than 8 tokens to exceed threshold, got {}",
        over_threshold_tokens
    );
    assert!(
        over_threshold.contains("WHERE id ="),
        "Query should contain 'WHERE id ='"
    );
    let over_threshold_outcome = db.execute(&over_threshold).await;
    println!(
        "Issue #253: {}-token query result: {:?}",
        over_threshold_tokens,
        over_threshold_outcome.as_ref().map(|r| r.rows.len())
    );

    // Case 3: within the token budget but without "WHERE id =" -> ADVANCED expected.
    let without_id = format!("SELECT * FROM {} LIMIT 5", TEST_QUALIFIED_TABLE);
    assert!(
        without_id.split_whitespace().count() <= 8,
        "Query should be ≤8 tokens"
    );
    assert!(
        !without_id.contains("WHERE id ="),
        "Query should NOT contain 'WHERE id ='"
    );
    let scan = db
        .execute(&without_id)
        .await
        .unwrap_or_else(|e| panic!("Issue #253: Query without 'WHERE id =' failed: {}", e));
    assert!(
        !scan.rows.is_empty(),
        "Issue #253: Non-id query should return rows from ADVANCED path"
    );
    println!(
        "Issue #253: Query without 'WHERE id =' routed to ADVANCED path, returned {} rows",
        scan.rows.len()
    );
}

#[tokio::test]
async fn test_both_paths_produce_valid_results() {
    //! Runs one query per execution path and checks the shape of the results.
    //!
    //! **Note**: exact result sets are not compared, because:
    //! 1. the differing key generation strategies may reach different data;
    //! 2. LEGACY targets simple point lookups;
    //! 3. ADVANCED targets complex, schema-aware queries.
    //!
    //! The LEGACY outcome is only reported (an error is printed, not fatal).
    //! The ADVANCED query must succeed, return at least one row, and every row
    //! must carry values.

    let db = match setup_test_database().await {
        Ok(db) => db,
        Err(reason) => {
            eprintln!("Skipping test: {}", reason);
            return;
        }
    };

    // LEGACY path: simple point lookup.
    let point_lookup = format!("SELECT * FROM {} WHERE id = 1", TEST_QUALIFIED_TABLE);
    match db.execute(&point_lookup).await {
        Ok(outcome) => {
            println!(
                "Issue #253: LEGACY path executed successfully, {} rows",
                outcome.rows.len()
            );
            // Touch the timing field; there is nothing to assert on its value.
            let _ = outcome.execution_time_ms;
        }
        Err(err) => println!(
            "Issue #253: LEGACY path returned error (may be expected): {}",
            err
        ),
    }

    // ADVANCED path: LIMIT scan without a "WHERE id =" predicate.
    let limited_scan = format!("SELECT * FROM {} LIMIT 5", TEST_QUALIFIED_TABLE);
    let outcome = db
        .execute(&limited_scan)
        .await
        .unwrap_or_else(|err| panic!("Issue #253: ADVANCED path failed: {}", err));

    println!(
        "Issue #253: ADVANCED path executed successfully, {} rows",
        outcome.rows.len()
    );
    assert!(
        !outcome.rows.is_empty(),
        "ADVANCED path should return rows for LIMIT query"
    );
    // Touch the timing field; there is nothing to assert on its value.
    let _ = outcome.execution_time_ms;

    // Every returned row must expose at least one decoded value.
    assert!(
        outcome.rows.iter().all(|row| !row.values.is_empty()),
        "Rows should have values with schema-aware decoding"
    );
}

#[tokio::test]
async fn test_execution_path_logging() {
    //! Smoke test that drives one query down each execution path.
    //!
    //! **Purpose**: give developers a convenient entry point for watching the
    //! routing decision in debug logs. The test does not inspect log output;
    //! it only prints whether each query returned `Ok` or `Err`.
    //!
    //! **Implementation Note**: the routing logic includes:
    //! ```rust,ignore
    //! #[cfg(debug_assertions)]
    //! log::debug!(
    //!     "Routing simple SELECT through normal executor for consistent key handling"
    //! );
    //! ```

    let db = match setup_test_database().await {
        Ok(db) => db,
        Err(reason) => {
            eprintln!("Skipping test: {}", reason);
            return;
        }
    };

    // One query per path; errors are reported, never fatal.
    let legacy_sql = format!("SELECT * FROM {} WHERE id = 1", TEST_QUALIFIED_TABLE);
    let advanced_sql = format!("SELECT * FROM {} LIMIT 5", TEST_QUALIFIED_TABLE);

    for (number, sql) in (1usize..).zip([legacy_sql, advanced_sql].iter()) {
        match db.execute(sql).await {
            Ok(_) => println!("Issue #253: Query {} executed successfully", number),
            Err(err) => println!("Issue #253: Query {} returned: {}", number, err),
        }
    }

    println!("Issue #253: Execution path logging test completed");
    println!("  Tip: Run with RUST_LOG=debug to see routing decisions");
}

#[tokio::test]
async fn test_parity_key_generation_assumptions() {
    //! Print a summary of the key-generation assumptions behind Issue #253.
    //!
    //! **Assumptions summarised**:
    //! 1. LEGACY path uses text-based key generation for simple WHERE id = X
    //! 2. ADVANCED path uses binary key decoding from RowKey bytes
    //! 3. Routing logic selects the path from the query text
    //! 4. The divergence is documented; the LEGACY side is a known defect
    //!
    //! **What this test does NOT validate**:
    //! - Exact result set parity (the paths diverge)
    //! - Performance characteristics
    //! - Memory usage patterns
    //!
    //! The summary is kept in line with `test_key_generation_divergence_documented`,
    //! which holds the root-cause analysis and the source-text assertions.

    // This is purely a documentation test: it asserts nothing and only prints.
    println!("Issue #253 Key Generation Assumptions:");
    println!();
    println!("1. LEGACY Path (executor.rs:794-805):");
    println!("   - Trigger: WHERE id = <value> with ≤8 tokens");
    println!("   - Key Gen: format!(\"user_key_{{}}\", id) for id column");
    println!("   - Purpose: Consistent INSERT/SELECT key handling");
    println!();
    // Decoding moved out of select_executor.rs in Issue #586, so no line range is quoted.
    println!("2. ADVANCED Path (select_executor.rs -> storage::partition_key_codec):");
    println!("   - Trigger: Complex queries OR >8 tokens OR no 'WHERE id ='");
    println!("   - Key Gen: Schema-aware binary decoding from RowKey");
    println!("   - Purpose: Full CQL type system support");
    println!();
    println!("3. Routing Logic (engine.rs:132-142):");
    println!("   - Checks: sql.contains(\"WHERE id =\") && token_count <= 8");
    println!("   - LEGACY: Condition met → normal executor");
    println!("   - ADVANCED: Condition not met → select_executor");
    println!();
    println!("4. Divergence Status: DOCUMENTED, KNOWN DEFECT");
    println!("   - LEGACY key generation violates the No-Heuristics Mandate (Issue #28)");
    println!("   - ADVANCED path is the correct one for real SSTable data");
}