1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
//! Data access methods for SSTableReader
//!
//! This module contains all methods related to reading data from SSTables,
//! including point lookups, range scans, and sequential access.
use super::SSTableReader;
use crate::parser::DataFormat;
use crate::types::{TableId, Value};
use crate::util::cassandra_murmur3::cassandra_murmur3_token;
use crate::{Error, Result, RowKey};
use log::{debug, warn};
use std::io::SeekFrom;
use tokio::io::AsyncSeekExt;
/// Decide whether an entry's table ID and a query's table ID name the same table.
///
/// Table names may be qualified (`keyspace.table`) or bare (`table`). Two IDs match when
/// either of the following holds:
/// - their full names are identical (`"keyspace.table"` vs `"keyspace.table"`,
///   `"table"` vs `"table"`), or
/// - the segments after the last `.` are identical (`"table"` vs `"keyspace.table"`,
///   in either direction).
///
/// The lenient comparison is needed because:
/// - Dataset mode SSTables store qualified table_ids (e.g., "test_basic.simple_table")
/// - Queries can use either qualified ("test_basic.simple_table") or unqualified
///   ("simple_table") names
/// - Production SSTables may use unqualified table_ids
///
/// NOTE(review): only the final segment is compared, so two qualified names with
/// different keyspaces but the same table (`"ks1.t"` vs `"ks2.t"`) also match —
/// confirm this is intended.
fn table_ids_match(entry_table_id: &TableId, query_table_id: &TableId) -> bool {
    /// Return the part of `name` after its last `.`, or all of `name` if it has no `.`.
    fn bare_table_name(name: &str) -> &str {
        name.rsplit_once('.').map_or(name, |(_, table)| table)
    }

    let entry_full = entry_table_id.name();
    let query_full = query_table_id.name();

    // The exact comparison runs first so identical names never pay for the split.
    entry_full == query_full || bare_table_name(entry_full) == bare_table_name(query_full)
}
/// Reorder `results` in place into ascending Cassandra token order.
///
/// Entries are ordered by the Murmur3 token of the key's bytes; entries with equal tokens
/// are ordered by `RowKey`'s own `Ord` (documented by the original author as raw key
/// bytes, lexicographic, matching the on-disk physical order — spec §5, Appendix B §313 —
/// and the write engine's `PartitionPosition::cmp`).
///
/// Each key's token is computed exactly once, up front, rather than on every comparison.
/// The sort is stable, so entries that compare equal keep their relative order.
fn sort_by_token_order(results: &mut Vec<(RowKey, Value)>) {
    // Move every entry out of `results`, pairing it with its precomputed token.
    let mut keyed: Vec<(i64, RowKey, Value)> = Vec::with_capacity(results.len());
    for (key, value) in results.drain(..) {
        let token = cassandra_murmur3_token(key.as_bytes());
        keyed.push((token, key, value));
    }

    // Primary order: token. Tie-break: the key itself.
    keyed.sort_by(|lhs, rhs| match lhs.0.cmp(&rhs.0) {
        std::cmp::Ordering::Equal => lhs.1.cmp(&rhs.1),
        unequal => unequal,
    });

    // Drop the tokens and hand the entries back to the caller's vector.
    for (_, key, value) in keyed {
        results.push((key, value));
    }
}
impl SSTableReader {
/// Report whether Data.db must have all of its chunks stitched together before parsing.
///
/// The answer is `true` only when both conditions hold:
///   data_format == V5CompressedLegacy AND is_nb_format()
///
/// Why both are required (per the original design notes):
/// - `V5CompressedLegacy` identifies the row serialization format (u16 length
///   prefixes, legacy encoding) used by all Cassandra 5 'nb' SSTables.
/// - `is_nb_format()` identifies the chunked-compression read path. It intentionally
///   EXCLUDES `V5_0Uncompressed`, which uses the same row format but stores data as
///   a single contiguous block (no chunk boundaries, no stitching needed).
/// - Checking `compression_reader.is_some()` instead would be wrong for NB format:
///   per-chunk decompression happens inside `stitch_and_parse_all_chunks`, and the
///   "is compressed" flag may differ from `is_nb_format` for edge-case versions.
fn requires_chunk_stitching(&self) -> bool {
    let version = &self.header.cassandra_version;
    match version.data_format() {
        // `is_nb_format()` is only consulted once the row format already matches.
        DataFormat::V5CompressedLegacy => version.is_nb_format(),
        _ => false,
    }
}
/// Look up a single key in the SSTable.
///
/// Lookup order:
/// 1. If a bloom filter is loaded and it rules the key out, return `Ok(None)` immediately.
/// 2. If no index is loaded, fall back to [`Self::scan_for_key`].
/// 3. Otherwise ask the index for an entry:
///    - entry with `size == 0` (Cassandra 5.0 Index.db) → sequential scan fallback;
///    - entry with a real size → read the value directly at `offset + header size`;
///    - no entry → sequential scan fallback (Issue #517, see below).
///
/// # Errors
/// Propagates any error from the index lookup, the offset read, or the sequential scan.
pub async fn get(&self, table_id: &TableId, key: &RowKey) -> Result<Option<Value>> {
    // A negative bloom-filter answer is definitive; a positive one is only a "maybe".
    if let Some(filter) = &self.bloom_filter {
        if !filter.might_contain(key.as_bytes()) {
            return Ok(None);
        }
    }

    // Without an index the only option is a sequential scan.
    let index = match &self.index {
        Some(index) => index,
        None => return self.scan_for_key(table_id, key).await,
    };

    let lookup = index.find_entry(table_id, key).await?;
    match lookup {
        Some(entry) => {
            if entry.size == 0 {
                // Index.db reports size=0 (Cassandra 5.0): the entry cannot drive an
                // offset-based read, so scan instead.
                log::debug!(
                    "Index reports size=0 for key {:?}, using sequential scan fallback",
                    key
                );
                self.scan_for_key(table_id, key).await
            } else {
                // Index offsets are relative to the data section; add the header size
                // to obtain an absolute file offset.
                let file_offset = entry.offset + self.actual_header_size as u64;
                self.read_value_at_offset(file_offset, entry.size).await
            }
        }
        None => {
            // Issue #517: the SSTableIndex is built from Index.db key *digests* (16-byte
            // Murmur3 hashes), not raw partition key bytes, so a raw-key lookup via
            // find_entry() always misses. Scanning keeps get() and scan() in agreement
            // about which partitions exist.
            log::debug!(
                "Index lookup returned no entry for key {:?} (possible digest/raw-key mismatch), \
                 falling back to sequential scan",
                key
            );
            self.scan_for_key(table_id, key).await
        }
    }
}
/// Scan a key range and return the matching `(key, value)` pairs.
///
/// # Arguments
/// * `table_id` - The table to scan
/// * `start_key` - Optional start key for the range
/// * `end_key` - Optional end key for the range
/// * `limit` - Optional cap on the number of results
/// * `schema` - Optional table schema for schema-aware parsing. It is forwarded to the
///   sequential-scan fallbacks only; the index-driven path does not use it. Strongly
///   recommended for Cassandra 5.0+ formats, where it avoids heuristic-based parsing.
///
/// # Strategy
/// * No index loaded → delegate to `sequential_scan` and return its result as-is.
/// * Index returns zero entries, or any entry has `size == 0` → delegate to
///   `sequential_scan`.
/// * Otherwise read every index entry by offset, sort by Murmur3 token order, and only
///   then apply `limit`.
///
/// # Errors
/// Propagates errors from the index range query, offset reads, and sequential scan.
pub async fn scan(
    &self,
    table_id: &TableId,
    start_key: Option<&RowKey>,
    end_key: Option<&RowKey>,
    limit: Option<usize>,
    schema: Option<&crate::schema::TableSchema>,
) -> Result<Vec<(RowKey, Value)>> {
    log::debug!("SSTableReader::scan - Starting scan");
    log::debug!("SSTableReader::scan - File path: {:?}", self.file_path);
    log::debug!("SSTableReader::scan - Table ID: {}", table_id);
    log::debug!("SSTableReader::scan - Start key: {:?}", start_key);
    log::debug!("SSTableReader::scan - End key: {:?}", end_key);
    log::debug!("SSTableReader::scan - Limit: {:?}", limit);
    log::debug!("SSTableReader::scan - Has schema: {}", schema.is_some());
    log::debug!("SSTableReader::scan - Has index: {}", self.index.is_some());
    log::debug!(
        "SSTableReader::scan - Has bloom filter: {}",
        self.bloom_filter.is_some()
    );

    let index = match &self.index {
        Some(index) => index,
        None => {
            // No index: sequential_scan() already returns results in token order
            // (NON-BLOCKING-1), so hand them back without sorting again.
            log::debug!("SSTableReader::scan - No index, falling back to sequential scan");
            let seq_results = self
                .sequential_scan(table_id, start_key, end_key, limit, schema)
                .await?;
            log::debug!(
                "SSTableReader::scan - Sequential scan returned {} results",
                seq_results.len()
            );
            log::debug!(
                "SSTableReader::scan - Returning {} final results",
                seq_results.len()
            );
            return Ok(seq_results);
        }
    };

    log::debug!("SSTableReader::scan - Using index-based scan");
    let entries = index.get_range(table_id, start_key, end_key)?;
    log::debug!(
        "SSTableReader::scan - Index returned {} entries",
        entries.len()
    );

    // Issue #256: an index that exists but yields nothing (e.g. BTI / Big Trie Index
    // format whose parsing is incomplete) would otherwise produce an empty result,
    // because the size=0 check below has no entries to inspect and the read loop
    // runs zero times. Sequential scan parses Data.db directly and sidesteps that.
    if entries.is_empty() {
        log::debug!(
            "SSTableReader::scan - Index returned 0 entries (BTI format or incomplete parsing), falling back to sequential scan"
        );
        return self
            .sequential_scan(table_id, start_key, end_key, limit, schema)
            .await;
    }

    // A size of 0 (Cassandra 5.0 format) cannot drive an offset-based read.
    if entries.iter().any(|e| e.size == 0) {
        log::debug!("SSTableReader::scan - Index reports size=0 for some entries, using sequential scan fallback");
        return self
            .sequential_scan(table_id, start_key, end_key, limit, schema)
            .await;
    }

    // Read ALL index entries; the limit is applied after sorting (BLOCKING-1).
    let mut results = Vec::new();
    for (position, entry) in entries.iter().enumerate() {
        // Index offsets are relative to the data section; add the header size.
        let file_offset = entry.offset + self.actual_header_size as u64;
        log::debug!(
            "SSTableReader::scan - Processing index entry {}: index_offset={}, file_offset={}, size={}",
            position, entry.offset, file_offset, entry.size
        );

        match self.read_value_at_offset(file_offset, entry.size).await? {
            Some(value) => {
                log::debug!(
                    "SSTableReader::scan - Successfully read value at offset {}",
                    entry.offset
                );
                results.push((entry.key.clone(), value));
            }
            None => {
                log::debug!("SSTableReader::scan - Value at offset {} was filtered out (tombstone or expired)", entry.offset);
            }
        }
    }

    // Sort by Murmur3 token order (ascending token, then key). This matches the on-disk
    // physical order (spec §5, Appendix B §313) and the write engine's
    // PartitionPosition::cmp.
    sort_by_token_order(&mut results);

    // Truncate AFTER sorting so LIMIT N yields the N token-smallest partitions.
    if let Some(max_rows) = limit {
        results.truncate(max_rows);
    }

    log::debug!(
        "SSTableReader::scan - Returning {} final results",
        results.len()
    );
    Ok(results)
}
/// Return every live entry stored in the SSTable.
///
/// The reader is rewound to the start of the data section (file position and chunk
/// index), the data is parsed — via the stitched-buffer path when
/// [`Self::requires_chunk_stitching`] is true, block by block otherwise — and the
/// parsed entries are then filtered.
///
/// # Tombstone contract (Issue #505)
///
/// This is a **user-facing** accessor: every entry is passed through
/// [`Self::filter_tombstone`], and entries it rejects never reach the caller. Per the
/// original notes, the parsing path emits `Value::Tombstone(RowTombstone)` for deleted
/// rows; suppressing them here keeps the output limited to live rows (matching the
/// previous `Value::Null` suppression behaviour).
///
/// The compaction k-way merger must instead use
/// [`Self::iterate_all_partitions_for_compaction`], which keeps `Value::Tombstone`
/// entries (with their deletion timestamps) so shadowing can be applied during merge.
///
/// # Errors
/// Propagates I/O, decompression, and parse errors.
pub async fn get_all_entries(&self) -> Result<Vec<(TableId, RowKey, Value)>> {
    // Rewind: file cursor to the first data byte, chunk cursor to the first chunk.
    let data_start = self.calculate_header_size() as u64;
    {
        // Scoped so the file lock is released before any block is read.
        let mut file = self.file.lock().await;
        file.seek(SeekFrom::Start(data_start)).await?;
    }
    self.current_chunk_index
        .store(0, std::sync::atomic::Ordering::Relaxed);

    let mut entries = if self.requires_chunk_stitching() {
        // V5CompressedLegacy: row payloads can span multiple compressed chunks, so all
        // chunks are decompressed and concatenated before parsing.
        log::debug!(
            "V5CompressedLegacy format detected, decompressing and stitching all chunks before parsing"
        );
        self.stitch_and_parse_all_chunks(None).await?
    } else {
        // Other formats: each block is self-contained and parsed on its own.
        let mut collected = Vec::new();
        while let Some(block) = self.read_next_block().await? {
            collected.extend(self.parse_block_entries(&block, None)?);
        }
        collected
    };

    // Issue #505: hide row tombstones from user-facing output.
    entries.retain(|(_, _, value)| self.filter_tombstone(value));
    Ok(entries)
}
/// Stitch all compressed chunks and parse as a single buffer (V5CompressedLegacy)
///
/// This helper method extracts the stitching logic from get_all_entries so it can be
/// reused by sequential_scan and other methods that need to handle V5CompressedLegacy
/// format where partitions can span chunk boundaries.
async fn stitch_and_parse_all_chunks(
&self,
schema: Option<&crate::schema::TableSchema>,
) -> Result<Vec<(TableId, RowKey, Value)>> {
log::debug!("stitch_and_parse_all_chunks: Decompressing and stitching all chunks");
// Pre-allocate buffer for ~2.5MB (estimated max size for test data)
let mut stitched_buffer = Vec::with_capacity(2_500_000);
// Read, decompress, and concatenate all chunks
let mut chunk_count = 0;
while let Some(compressed_chunk) = self.read_next_block().await? {
// Decompress this chunk before stitching
use crate::storage::sstable::compression::Compression;
let decompressed_chunk = if let Some(compression_reader) = &self.compression_reader {
let compression = Compression::new(*compression_reader.algorithm())?;
match compression.decompress(&compressed_chunk) {
Ok(decompressed) => {
log::debug!(
"stitch_and_parse_all_chunks: Chunk {} decompressed {} bytes to {} bytes",
chunk_count,
compressed_chunk.len(),
decompressed.len()
);
decompressed
}
Err(e) => {
return Err(Error::corruption(format!(
"stitch_and_parse_all_chunks: Failed to decompress chunk {}: {}",
chunk_count, e
)));
}
}
} else {
// No compression (should not happen for V5CompressedLegacy)
log::warn!(
"stitch_and_parse_all_chunks: No compression reader, using raw chunk data"
);
compressed_chunk
};
stitched_buffer.extend_from_slice(&decompressed_chunk);
chunk_count += 1;
log::debug!(
"stitch_and_parse_all_chunks: Stitched chunk {}, total buffer size: {} bytes",
chunk_count,
stitched_buffer.len()
);
}
log::debug!(
"stitch_and_parse_all_chunks: Finished stitching {} chunks, total buffer: {} bytes",
chunk_count,
stitched_buffer.len()
);
// Extract keyspace/table from header
let keyspace = self.header.keyspace.clone();
let table_name = self.header.table_name.clone();
log::debug!(
"stitch_and_parse_all_chunks: Using keyspace='{}', table_name='{}'",
keyspace,
table_name
);
// Extract EncodingStats from statistics_reader (if available)
let (min_timestamp, min_local_deletion_time, min_ttl) =
if let Some(stats_reader) = &self.statistics_reader {
let ts_stats = &stats_reader.statistics().timestamp_stats;
(
ts_stats.min_timestamp,
ts_stats.min_deletion_time,
ts_stats.min_ttl,
)
} else {
(0, 0, None)
};
let parser = crate::storage::sstable::reader::parsing::V5CompressedLegacyParser::new(
keyspace,
table_name,
min_timestamp,
min_local_deletion_time,
min_ttl,
)
// VG1: thread VersionGates from SSTableReader down to row parser so
// that VG3 can flip gate-sensitive code paths without re-deriving gates.
.with_version_gates(self.version_gates.clone());
// Add UDT registry if available for UDT-aware collection parsing (Issue #238)
let parser = if let Some(ref registry) = self.udt_registry {
parser.with_udt_registry(registry.clone())
} else {
parser
};
// Get schema (use provided schema or reader's schema)
let reader_schema;
let table_schema = if let Some(s) = schema {
Some(s)
} else {
reader_schema = self.get_table_schema(None);
reader_schema.as_ref()
};
// Parse the stitched decompressed buffer
let entries = parser.parse_block(&stitched_buffer, table_schema, self)?;
log::debug!(
"stitch_and_parse_all_chunks: Parsed {} entries from stitched buffer",
entries.len()
);
Ok(entries)
}
/// Stitch all compressed chunks and parse with per-row timestamps (for compaction).
///
/// Identical to [`stitch_and_parse_all_chunks`] but delegates to
/// [`V5CompressedLegacyParser::parse_block_with_timestamps`] so that each
/// entry carries its actual row-level write timestamp rather than
/// `SystemTime::now()`. Row and cell tombstones are emitted as
/// `Value::Tombstone` with their authoritative deletion timestamps.
///
/// Used exclusively by the compaction k-way merger path (Issue #505).
async fn stitch_and_parse_all_chunks_for_compaction(
&self,
schema: Option<&crate::schema::TableSchema>,
) -> Result<Vec<(TableId, RowKey, Value, i64)>> {
log::debug!("stitch_and_parse_all_chunks_for_compaction: stitching chunks");
let mut stitched_buffer = Vec::with_capacity(2_500_000);
let mut chunk_count = 0;
while let Some(compressed_chunk) = self.read_next_block().await? {
use crate::storage::sstable::compression::Compression;
let decompressed_chunk = if let Some(compression_reader) = &self.compression_reader {
let compression = Compression::new(*compression_reader.algorithm())?;
compression.decompress(&compressed_chunk).map_err(|e| {
Error::corruption(format!(
"stitch_and_parse_all_chunks_for_compaction: Failed to decompress chunk {}: {}",
chunk_count, e
))
})?
} else {
compressed_chunk
};
stitched_buffer.extend_from_slice(&decompressed_chunk);
chunk_count += 1;
}
log::debug!(
"stitch_and_parse_all_chunks_for_compaction: {} chunks, {} bytes total",
chunk_count,
stitched_buffer.len()
);
let keyspace = self.header.keyspace.clone();
let table_name = self.header.table_name.clone();
let (min_timestamp, min_local_deletion_time, min_ttl) =
if let Some(stats_reader) = &self.statistics_reader {
let ts_stats = &stats_reader.statistics().timestamp_stats;
(
ts_stats.min_timestamp,
ts_stats.min_deletion_time,
ts_stats.min_ttl,
)
} else {
(0, 0, None)
};
let parser = crate::storage::sstable::reader::parsing::V5CompressedLegacyParser::new(
keyspace,
table_name,
min_timestamp,
min_local_deletion_time,
min_ttl,
)
// VG1: thread VersionGates from SSTableReader down to row parser.
.with_version_gates(self.version_gates.clone());
let parser = if let Some(ref registry) = self.udt_registry {
parser.with_udt_registry(registry.clone())
} else {
parser
};
let reader_schema;
let table_schema = if let Some(s) = schema {
Some(s)
} else {
reader_schema = self.get_table_schema(None);
reader_schema.as_ref()
};
let entries = parser.parse_block_with_timestamps(&stitched_buffer, table_schema, self)?;
log::debug!(
"stitch_and_parse_all_chunks_for_compaction: parsed {} entries",
entries.len()
);
Ok(entries)
}
/// Iterate all partitions with per-row timestamps, for use by the compaction merger.
///
/// Returns `(RowKey, Value, row_timestamp_micros)` for every row in the SSTable.
/// Unlike [`iterate_all_partitions`]:
///
/// - Row tombstones are returned as `Value::Tombstone(RowTombstone)` carrying
/// the actual deletion timestamp extracted from the on-disk row header.
/// - Cell tombstones within live rows are stored as `Value::Tombstone(CellTombstone)`
/// inside the `Value::Map`, also carrying the actual cell-level deletion timestamp.
/// - The third tuple element is the decoded row-level write timestamp, so the
/// merger can perform timestamp-accurate last-write-wins comparisons.
///
/// Normal user-facing reads use [`scan`] / [`get`] / [`iterate_all_partitions`],
/// which apply tombstone filtering. Do NOT use this method for user-visible queries.
///
/// (Issue #505)
pub async fn iterate_all_partitions_for_compaction(
&self,
schema: Option<&crate::schema::TableSchema>,
) -> Result<Vec<(RowKey, Value, i64)>> {
// Only the V5CompressedLegacy NB chunk-stitching path is supported here
// (that is the format the WriteEngine produces). For other formats, fall
// back to iterate_all_partitions and attach timestamp 0 as a conservative
// default (LWW ordering then relies solely on run_index).
if self.requires_chunk_stitching() {
// We need schema; retrieve it once.
// `schema` is Option<&TableSchema>; clone it into an owned value so we
// can pass it to the async helper without borrow-checker issues.
let owned_schema = schema.cloned().or_else(|| self.get_table_schema(None));
// Reset chunk reader to start of data section.
let header_size = self.calculate_header_size();
{
let mut file_guard = self.file.lock().await;
use tokio::io::AsyncSeekExt;
file_guard
.seek(std::io::SeekFrom::Start(header_size as u64))
.await?;
}
self.current_chunk_index
.store(0, std::sync::atomic::Ordering::Relaxed);
let entries = self
.stitch_and_parse_all_chunks_for_compaction(owned_schema.as_ref())
.await?;
return Ok(entries
.into_iter()
.map(|(_tid, key, value, ts)| (key, value, ts))
.collect());
}
// Non-stitching fallback: use iterate_all_partitions and attach ts=0.
let entries = self.iterate_all_partitions().await?;
Ok(entries
.into_iter()
.map(|(key, value)| (key, value, 0))
.collect())
}
/// Read the value stored at an absolute file offset.
///
/// The bytes are fetched through `get_cached_data`, which already decompresses them
/// when a compression reader is configured (and, for legacy formats only, falls back to
/// the raw bytes if decompression fails). This method therefore must NOT decompress
/// again: a second pass over already-decompressed bytes fails, and for modern formats
/// that failure was reported as corruption on every compressed read.
///
/// The payload is currently returned as an opaque `Value::Blob`; schema-driven decoding
/// is still a TODO.
///
/// # Arguments
/// * `offset` - Absolute offset in the data file (header already accounted for).
/// * `size` - Number of on-disk bytes to read; must be non-zero.
///
/// # Returns
/// * `Ok(Some(value))` when the value passes `filter_tombstone`.
/// * `Ok(None)` when `filter_tombstone` rejects the value (tombstone or expired data).
///
/// # Errors
/// * `Error::corruption` when `size == 0` — such entries must be handled by a
///   sequential scan before reaching this method.
/// * Any I/O or decompression error surfaced by `get_cached_data`.
pub async fn read_value_at_offset(&self, offset: u64, size: u32) -> Result<Option<Value>> {
    // Size must be non-zero for offset-based reading
    if size == 0 {
        return Err(Error::corruption(format!(
            "Cannot read value at offset {} with size=0. This should have been caught earlier and handled via sequential scan.",
            offset
        )));
    }

    // `get_cached_data` returns bytes that are already decompressed (if applicable).
    let data = self.get_cached_data(offset, size).await?;
    debug!(
        "Read {} bytes (after any decompression) for offset={}, on-disk size={}",
        data.len(),
        offset,
        size
    );

    // TODO: Parse value using schema-driven type information
    // For now, preserve raw data until schema is available. `data` is moved into the
    // blob directly; no extra copy is needed.
    let value = Value::Blob(data);

    // Placeholder kept from the original implementation: the write time is not yet
    // parsed from the SSTable, so the current wall-clock time is computed and discarded.
    let _write_time = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_micros() as i64)
        .unwrap_or_else(|e| {
            warn!("Failed to get system time: {}; using fallback value 0", e);
            0
        });

    // Filter out tombstones and expired data
    if !self.filter_tombstone(&value) {
        return Ok(None);
    }
    Ok(Some(value))
}
/// Read block with caching support and hit/miss tracking
async fn get_cached_data(&self, block_offset: u64, size: u32) -> Result<Vec<u8>> {
use crate::parser::header::CassandraVersion;
use crate::storage::sstable::compression::Compression;
use tokio::io::AsyncReadExt;
// Calculate block identifier based on offset and size
let _block_id = block_offset;
// For now, always read from disk and track as cache miss
self.record_cache_miss();
// Read from disk
let mut file = self.file.lock().await;
file.seek(SeekFrom::Start(block_offset)).await?;
let mut buffer = vec![0u8; size as usize];
file.read_exact(&mut buffer).await?;
drop(file); // Release file lock early
// Decompress if needed
let data = if let Some(compression_reader) = &self.compression_reader {
let compression = Compression::new(*compression_reader.algorithm())?;
match compression.decompress(&buffer) {
Ok(decompressed) => decompressed,
Err(e) => {
// Handle decompression errors based on format
if self.header.cassandra_version != CassandraVersion::Legacy {
return Err(Error::corruption(format!(
"Decompression failed at offset={}, size={}: {}",
block_offset, size, e
)));
} else {
buffer // Fall back to raw data for legacy formats
}
}
}
} else {
buffer
};
Ok(data)
}
/// Find `key` by scanning the data section from its beginning.
///
/// This is the fallback used by `get()` when the index cannot answer a lookup. The
/// reader is always rewound first (file position and chunk index), and then one of two
/// paths runs:
///
/// * **Stitching path** ([`Self::requires_chunk_stitching`] is true): all chunks are
///   decompressed, stitched, and parsed via `stitch_and_parse_all_chunks` using the
///   reader's own schema, and the parsed entries are searched for `key`. `table_id` is
///   intentionally not checked on this path. If stitching/parsing fails, the error is
///   logged at debug level and `Ok(None)` is returned so the caller can try the next
///   reader.
/// * **Block path** (all other formats): blocks are read and parsed one at a time, and
///   the first entry whose table id matches (see `table_ids_match`) and whose key
///   equals `key` is used.
///
/// # Returns
/// * `Ok(Some(value))` for the first matching entry that passes `filter_tombstone`.
/// * `Ok(None)` if no entry matches, or the first match is rejected by
///   `filter_tombstone` (tombstone or expired data).
///
/// # Errors
/// I/O errors from the rewind, and read/parse errors on the block path.
async fn scan_for_key(&self, table_id: &TableId, key: &RowKey) -> Result<Option<Value>> {
    // Rewind to the start of the data section for BOTH paths. The stitching path used
    // to reset only the chunk index; every other full-scan entry point in this reader
    // also seeks the file, so do the same here to guarantee reading starts at the
    // first data byte regardless of where a previous read left the file cursor.
    let header_size = self.calculate_header_size();
    {
        // Scoped so the file lock is released before any block is read.
        let mut file_guard = self.file.lock().await;
        file_guard.seek(SeekFrom::Start(header_size as u64)).await?;
    }
    self.current_chunk_index
        .store(0, std::sync::atomic::Ordering::Relaxed);

    // For V5CompressedLegacy NB format, partitions can span chunk boundaries.
    // The block-by-block parser will miss any partition whose bytes cross a
    // chunk boundary, so use the stitched-buffer path to keep get() and scan()
    // consistent. (Issue #517)
    if self.requires_chunk_stitching() {
        log::debug!(
            "scan_for_key: V5CompressedLegacy NB detected, using stitched buffer for key lookup"
        );

        // Pass the reader's own schema so that V5CompressedLegacy rows can be fully
        // parsed and their partition RowKeys emitted. Without a schema, row parsing
        // fails, no entries are produced, and the key comparison would always miss.
        let schema_opt = self.get_table_schema(None);
        let all_entries = match self.stitch_and_parse_all_chunks(schema_opt.as_ref()).await {
            Ok(entries) => entries,
            Err(e) => {
                // Deliberate best-effort: the schema may not be available for this
                // reader (e.g., wrong table type). Report "not found" so the caller
                // can try the next reader.
                log::debug!(
                    "scan_for_key: stitch_and_parse_all_chunks failed (schema missing?): {}",
                    e
                );
                return Ok(None);
            }
        };

        // NOTE: The SSTableIndex is built from 16-byte Murmur3 *digests*, not raw keys,
        // so find_entry() always misses and lookups land here. The whole stitched
        // buffer has already been decompressed and parsed at this point, so both hits
        // and misses cost O(file size); stopping at the first match only shortens the
        // in-memory search. This is a known limitation of the digest-index design,
        // tracked separately.
        //
        // NON-BLOCKING-2: Table-id matching is intentionally skipped here (consistent
        // with sequential_scan's stitching path). The parser tags entries with the
        // table_id from the SSTable header, which may not match the query's
        // fully-qualified form; since every entry comes from the single SSTable being
        // queried, skipping the check is safe.
        let first_match = all_entries
            .into_iter()
            .find(|(_, entry_key, _)| entry_key == key)
            .map(|(_, _, entry_value)| entry_value);

        // A match that is a tombstone / expired resolves to "not found".
        return Ok(first_match.filter(|value| self.filter_tombstone(value)));
    }

    // Sequential scan through blocks
    while let Some(block) = self.read_next_block().await? {
        let entries = self.parse_block_entries(&block, None)?;
        for (entry_table_id, entry_key, entry_value) in entries {
            if table_ids_match(&entry_table_id, table_id) && entry_key == *key {
                // Write time is extracted but not yet used (kept in case the helper
                // has side effects).
                let _write_time = self.extract_write_time_from_entry(&entry_key, &entry_value);

                // Filter out tombstones and expired data
                if !self.filter_tombstone(&entry_value) {
                    return Ok(None);
                }
                return Ok(Some(entry_value));
            }
        }
    }
    Ok(None)
}
/// Scans the whole data section and returns the live rows within a key range.
///
/// # Parameters
/// * `table_id` - table the caller is querying. Only applied on the
///   block-by-block path; the stitching path skips the check (see below).
/// * `start_key` / `end_key` - optional inclusive bounds, compared with
///   `RowKey`'s ordering. Entries below `start_key` or above `end_key` are
///   dropped.
/// * `limit` - optional maximum number of rows, applied AFTER sorting so that
///   `LIMIT N` yields the N token-smallest partitions rather than the first N
///   in parse order (BLOCKING-1: limit-after-order).
/// * `schema` - optional table schema forwarded to the parsers.
///
/// # Returns
/// Rows that pass `filter_tombstone`, sorted by `sort_by_token_order`
/// (Murmur3 token order, spec §5, Appendix B §313) and truncated to `limit`.
///
/// # Errors
/// Propagates failures from seeking, `stitch_and_parse_all_chunks`,
/// `read_next_block` and `parse_block_entries_with_schema`.
pub(super) async fn sequential_scan(
    &self,
    table_id: &TableId,
    start_key: Option<&RowKey>,
    end_key: Option<&RowKey>,
    limit: Option<usize>,
    schema: Option<&crate::schema::TableSchema>,
) -> Result<Vec<(RowKey, Value)>> {
    log::debug!("SSTableReader::sequential_scan - Starting sequential scan");
    log::debug!("SSTableReader::sequential_scan - Table ID: {}", table_id);
    log::debug!(
        "SSTableReader::sequential_scan - Has schema: {}",
        schema.is_some()
    );

    let header_size = self.calculate_header_size();
    log::debug!(
        "SSTableReader::sequential_scan - Header size: {} bytes",
        header_size
    );
    {
        let mut file_guard = self.file.lock().await;
        file_guard.seek(SeekFrom::Start(header_size as u64)).await?;
        log::debug!(
            "SSTableReader::sequential_scan - Seeked to start of data section at offset {}",
            header_size
        );
    }
    // Reset chunk index when seeking to start
    self.current_chunk_index
        .store(0, std::sync::atomic::Ordering::Relaxed);

    // Range predicates shared by both scan strategies. `move` copies the
    // `Option<&RowKey>` bounds so the closures borrow nothing local.
    let below_start = move |k: &RowKey| matches!(start_key, Some(start) if k < start);
    let above_end = move |k: &RowKey| matches!(end_key, Some(end) if k > end);

    let mut results: Vec<(RowKey, Value)> = Vec::new();

    // V5CompressedLegacy partitions can span chunk boundaries, so all chunks
    // must be stitched together before parsing or partitions get dropped.
    // `requires_chunk_stitching()` is the single source of truth for whether
    // stitching is needed (BLOCKING-3: unified predicate).
    if self.requires_chunk_stitching() {
        log::debug!(
            "SSTableReader::sequential_scan - V5CompressedLegacy NB detected, using stitched buffer"
        );
        let all_entries = self.stitch_and_parse_all_chunks(schema).await?;
        log::debug!(
            "SSTableReader::sequential_scan - Stitched parsing returned {} total entries",
            all_entries.len()
        );
        // Table-id matching is intentionally skipped here: the parser may tag
        // entries with table_ids taken from header defaults, and every entry
        // in this SSTable belongs to the table being scanned.
        //
        // Collect ALL entries that are in range and not tombstoned; the limit
        // is applied only after sorting (BLOCKING-1: limit-after-order).
        results.extend(
            all_entries
                .into_iter()
                .filter(|(_, k, v)| {
                    !below_start(k) && !above_end(k) && self.filter_tombstone(v)
                })
                .map(|(_, k, v)| (k, v)),
        );
        log::debug!(
            "SSTableReader::sequential_scan - Filtered to {} results before limit (limit: {:?})",
            results.len(),
            limit
        );
    } else {
        // Non-stitching path for other formats: parse block by block.
        let mut block_count = 0;
        while let Some(block) = self.read_next_block().await? {
            block_count += 1;
            log::debug!(
                "SSTableReader::sequential_scan - Read block {}, size {} bytes",
                block_count,
                block.len()
            );
            let entries = self.parse_block_entries_with_schema(&block, schema)?;
            log::debug!(
                "SSTableReader::sequential_scan - Block {} contains {} entries",
                block_count,
                entries.len()
            );
            // Entries are consumed by value so that accepted rows move into
            // `results` without cloning their key and value.
            for (i, (entry_table_id, entry_key, entry_value)) in entries.into_iter().enumerate() {
                log::debug!(
                    "SSTableReader::sequential_scan - Block {} entry {}: table_id='{}', key={:?}",
                    block_count,
                    i,
                    entry_table_id,
                    entry_key
                );
                // Match table IDs - supports both qualified (keyspace.table) and
                // unqualified (table) formats, so queries in either form match
                // SSTables stored in either form.
                if !table_ids_match(&entry_table_id, table_id) {
                    log::debug!("SSTableReader::sequential_scan - Skipping entry: table_id mismatch ('{}' != '{}')",
                        entry_table_id, table_id);
                    continue;
                }
                // Inclusive key-range check.
                if below_start(&entry_key) {
                    log::debug!(
                        "SSTableReader::sequential_scan - Skipping entry: key < start_key"
                    );
                    continue;
                }
                if above_end(&entry_key) {
                    log::debug!(
                        "SSTableReader::sequential_scan - Skipping entry: key > end_key"
                    );
                    continue;
                }
                // The write time is extracted but not used by this function.
                let _write_time = self.extract_write_time_from_entry(&entry_key, &entry_value);
                // Filter out tombstones and expired data
                if !self.filter_tombstone(&entry_value) {
                    log::debug!("SSTableReader::sequential_scan - Skipping entry: filtered out (tombstone or expired)");
                    continue;
                }
                log::debug!("SSTableReader::sequential_scan - Including entry in results");
                results.push((entry_key, entry_value));
            }
        }
        log::debug!(
            "SSTableReader::sequential_scan - Finished scanning {} blocks",
            block_count
        );
        log::debug!(
            "SSTableReader::sequential_scan - {} results before sort+limit",
            results.len()
        );
    }

    // Shared tail: sort by Murmur3 token order (spec §5, Appendix B §313),
    // then truncate to the limit (BLOCKING-1: limit-after-order).
    sort_by_token_order(&mut results);
    if let Some(lim) = limit {
        results.truncate(lim);
    }
    log::debug!(
        "SSTableReader::sequential_scan - Returning {} results after sort+limit",
        results.len()
    );
    Ok(results)
}
/// Fetches the next data block by delegating to `block_io::read_next_block`.
///
/// Passes the shared file handle, the header's Cassandra version, the reader
/// config, the compression info, the chunk cursor and the header size (cast
/// to `u64`) to the block I/O layer and returns its result unchanged. The
/// scan loops in this file treat `Ok(None)` as "no more blocks".
pub(super) async fn read_next_block(&self) -> Result<Option<Vec<u8>>> {
    let header_len = self.actual_header_size as u64;
    super::block_io::read_next_block(
        &self.file,
        &self.header.cassandra_version,
        &self.config,
        &self.compression_info,
        &self.current_chunk_index,
        header_len,
    )
    .await
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::{Path, PathBuf};
    use std::sync::Arc;

    // =========================================================================
    // Shared fixtures for the dataset-backed integration tests
    // =========================================================================

    /// Returns the path of the first entry in `dir` whose (UTF-8) file name
    /// satisfies `accept`, or `None` if the directory cannot be read or no
    /// entry matches.
    fn find_dir_entry(dir: &Path, accept: impl Fn(&str) -> bool) -> Option<PathBuf> {
        std::fs::read_dir(dir)
            .ok()?
            .filter_map(|entry| entry.ok())
            .find(|entry| {
                entry
                    .file_name()
                    .to_str()
                    .map(|name| accept(name))
                    .unwrap_or(false)
            })
            .map(|entry| entry.path())
    }

    /// Locates the `-Data.db` file of the `test_basic` table whose directory
    /// name starts with `table_prefix`, under `$CQLITE_DATASETS_ROOT`.
    ///
    /// Returns `None` (after printing why) when the dataset is not available,
    /// so callers can skip the test. macOS `._*` resource-fork companions are
    /// never returned: they also end in `-Data.db` but are not real SSTables.
    fn locate_test_basic_data_file(table_prefix: &str) -> Option<PathBuf> {
        let datasets_root = match std::env::var("CQLITE_DATASETS_ROOT") {
            Ok(root) => PathBuf::from(root),
            Err(_) => {
                eprintln!("CQLITE_DATASETS_ROOT not set, skipping test");
                return None;
            }
        };
        let keyspace_dir = datasets_root.join("sstables/test_basic");
        if !keyspace_dir.exists() {
            eprintln!("test_basic not found, skipping test");
            return None;
        }
        let Some(table_path) =
            find_dir_entry(&keyspace_dir, |name| name.starts_with(table_prefix))
        else {
            eprintln!("{} not found, skipping", table_prefix);
            return None;
        };
        let data_file = find_dir_entry(&table_path, |name| {
            name.ends_with("-Data.db") && !name.starts_with("._")
        });
        if data_file.is_none() {
            eprintln!("Data.db not found in {} dir, skipping", table_prefix);
        }
        data_file
    }

    /// Opens an `SSTableReader` over `data_path` with the default config,
    /// panicking (test failure) if the platform or the reader cannot be built.
    async fn open_reader(data_path: PathBuf) -> SSTableReader {
        let config = crate::Config::default();
        let platform = Arc::new(
            crate::Platform::new(&config)
                .await
                .expect("Failed to create platform"),
        );
        SSTableReader::open(&data_path, &config, platform)
            .await
            .unwrap_or_else(|e| {
                panic!("Failed to open SSTable {}: {:?}", data_path.display(), e)
            })
    }

    // =========================================================================
    // table_ids_match tests
    // =========================================================================
    #[test]
    fn test_table_ids_match_exact() {
        // Exact match cases
        let id1 = TableId::new("simple_table".to_string());
        let id2 = TableId::new("simple_table".to_string());
        assert!(table_ids_match(&id1, &id2));
        let id3 = TableId::new("test_basic.simple_table".to_string());
        let id4 = TableId::new("test_basic.simple_table".to_string());
        assert!(table_ids_match(&id3, &id4));
    }

    #[test]
    fn test_table_ids_match_qualified_vs_unqualified() {
        // Qualified matches unqualified, in both argument orders
        let qualified = TableId::new("test_basic.simple_table".to_string());
        let unqualified = TableId::new("simple_table".to_string());
        assert!(table_ids_match(&qualified, &unqualified));
        assert!(table_ids_match(&unqualified, &qualified));
    }

    #[test]
    fn test_table_ids_match_different_keyspaces() {
        // Different keyspaces but same table name - should match on table name
        let id1 = TableId::new("keyspace1.users".to_string());
        let id2 = TableId::new("keyspace2.users".to_string());
        assert!(
            table_ids_match(&id1, &id2),
            "Same table name should match across keyspaces"
        );
    }

    #[test]
    fn test_table_ids_match_completely_different() {
        // Completely different tables - should not match
        let id1 = TableId::new("users".to_string());
        let id2 = TableId::new("orders".to_string());
        assert!(!table_ids_match(&id1, &id2));
        let id3 = TableId::new("test.users".to_string());
        let id4 = TableId::new("test.orders".to_string());
        assert!(!table_ids_match(&id3, &id4));
    }

    #[test]
    fn test_table_ids_match_edge_cases() {
        // Table names with dots (unusual but possible)
        let id1 = TableId::new("schema.table.subtable".to_string());
        let id2 = TableId::new("subtable".to_string());
        assert!(
            table_ids_match(&id1, &id2),
            "Should match on last component"
        );
    }

    #[test]
    fn test_table_ids_match_empty() {
        // Empty table IDs
        let id1 = TableId::new("".to_string());
        let id2 = TableId::new("".to_string());
        assert!(table_ids_match(&id1, &id2), "Empty IDs should match");
    }

    // =========================================================================
    // Key comparison tests
    // =========================================================================
    #[test]
    fn test_row_key_comparison() {
        let key1 = RowKey::new(vec![1, 2, 3]);
        let key2 = RowKey::new(vec![1, 2, 3]);
        let key3 = RowKey::new(vec![1, 2, 4]);
        assert_eq!(key1, key2);
        assert_ne!(key1, key3);
        assert!(key1 < key3);
    }

    #[test]
    fn test_row_key_ordering() {
        let key_a = RowKey::new(vec![0x01]);
        let key_b = RowKey::new(vec![0x02]);
        let key_c = RowKey::new(vec![0x01, 0x00]); // Longer but starts with 0x01
        assert!(key_a < key_b);
        assert!(key_a < key_c); // Shorter prefix comes first in lexicographic order
    }

    // =========================================================================
    // Value tests
    // =========================================================================
    #[test]
    fn test_value_blob_creation() {
        let data = vec![1, 2, 3, 4, 5];
        let value = Value::Blob(data.clone());
        if let Value::Blob(v) = value {
            assert_eq!(v, data);
        } else {
            panic!("Expected Value::Blob");
        }
    }

    // =========================================================================
    // Integration tests with real SSTable data
    //
    // Each test returns early (and passes) when the dataset is unavailable.
    // =========================================================================
    #[tokio::test]
    async fn test_get_nonexistent_key() {
        let Some(data_path) = locate_test_basic_data_file("simple_table") else {
            return;
        };
        let reader = open_reader(data_path).await;
        // Try to get a key that doesn't exist
        let table_id = TableId::new("test_basic.simple_table".to_string());
        let nonexistent_key = RowKey::new(vec![0xFF, 0xFF, 0xFF, 0xFF]); // Very unlikely to exist
        let result = reader.get(&table_id, &nonexistent_key).await;
        assert!(
            result.is_ok(),
            "get() should succeed even for nonexistent key"
        );
        assert!(
            result.unwrap().is_none(),
            "Nonexistent key should return None"
        );
    }

    #[tokio::test]
    async fn test_scan_with_limit() {
        let Some(data_path) = locate_test_basic_data_file("simple_table") else {
            return;
        };
        let reader = open_reader(data_path).await;
        let table_id = TableId::new("test_basic.simple_table".to_string());
        // Test scan with limit
        let result = reader.scan(&table_id, None, None, Some(5), None).await;
        assert!(result.is_ok(), "scan() should succeed");
        let entries = result.unwrap();
        assert!(
            entries.len() <= 5,
            "Scan with limit 5 should return at most 5 entries, got {}",
            entries.len()
        );
        eprintln!("Scan with limit 5 returned {} entries", entries.len());
    }

    #[tokio::test]
    async fn test_scan_full_table() {
        let Some(data_path) = locate_test_basic_data_file("simple_table") else {
            return;
        };
        let reader = open_reader(data_path).await;
        let table_id = TableId::new("test_basic.simple_table".to_string());
        // Full table scan (no limit)
        let result = reader.scan(&table_id, None, None, None, None).await;
        assert!(result.is_ok(), "Full scan should succeed");
        let entries = result.unwrap();
        eprintln!("Full scan returned {} entries", entries.len());
    }

    #[tokio::test]
    async fn test_get_all_entries() {
        let Some(data_path) = locate_test_basic_data_file("simple_table") else {
            return;
        };
        let reader = open_reader(data_path).await;
        // Get all entries (for compaction use case)
        let result = reader.get_all_entries().await;
        assert!(result.is_ok(), "get_all_entries() should succeed");
        let entries = result.unwrap();
        eprintln!("get_all_entries() returned {} entries", entries.len());
    }

    /// Regression test for Issue #480: static cell duplication on read.
    ///
    /// static_columns_table has 100 partitions, each containing one static_block
    /// and one clustering row. CQLite should return exactly 100 result rows — one
    /// per partition — not 200 (which would occur if static rows were emitted as
    /// separate result entries).
    ///
    /// Two bugs were fixed:
    /// 1. Snappy varint collision: bytes `0xC0 0x51` at the start of the Snappy
    ///    stream were misidentified as the V5_0StaticColumns magic number, causing
    ///    the file pointer to advance past part of the compressed data before
    ///    decompression, resulting in "corrupt input" errors.
    /// 2. Static row duplication: static rows were pushed into `results` just like
    ///    clustering rows. They should be accumulated per-partition and merged into
    ///    each subsequent clustering row instead.
    #[tokio::test]
    async fn test_static_columns_table_row_count_issue480() {
        let Some(data_path) = locate_test_basic_data_file("static_columns_table") else {
            eprintln!("skipping Issue #480 regression test");
            return;
        };
        let reader = open_reader(data_path).await;
        let table_id = TableId::new("test_basic.static_columns_table".to_string());
        let result = reader.scan(&table_id, None, None, None, None).await;
        assert!(
            result.is_ok(),
            "Scan of static_columns_table should succeed: {:?}",
            result.err()
        );
        let entries = result.unwrap();
        eprintln!(
            "Issue #480 regression: static_columns_table scan returned {} rows",
            entries.len()
        );
        // Expected: 100 rows (one per partition, static data merged into clustering row)
        // Before fix: 0 rows (Snappy decompression failure)
        // After fixing only decompression: 200 rows (static rows emitted separately)
        // After full fix: 100 rows
        assert_eq!(
            entries.len(),
            100,
            "static_columns_table should return 100 rows (one per partition), \
             got {}. Regression for Issue #480: static cell duplication on read.",
            entries.len()
        );
    }
}