1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
// SPDX-License-Identifier: Apache-2.0
// Copyright (c) 2024-present, fjall-rs
// Copyright (c) 2026-present, Structured World Foundation
#[cfg(feature = "metrics")]
use crate::metrics::Metrics;
use super::{
block_index::BlockIndexImpl, block_layout::BlockLayoutMap, meta::ParsedMeta,
regions::ParsedRegions,
};
use crate::deletion_pause::DeletionPause;
use crate::{
Checksum, GlobalTableId, SeqNo,
cache::Cache,
comparator::SharedComparator,
encryption::EncryptionProvider,
file_accessor::FileAccessor,
fs::Fs,
range_tombstone::RangeTombstone,
table::{IndexBlock, filter::block::FilterBlock},
tree::inner::TreeId,
};
use std::{
path::PathBuf,
sync::{Arc, OnceLock, atomic::AtomicBool},
};
/// Internal state of a single table.
///
/// Bundles the table's file location, parsed metadata and region handles,
/// pinned index/filter blocks, and the hooks consulted by the [`Drop`] impl
/// to physically remove the file once [`Inner::is_deleted`] has been set.
pub struct Inner {
/// Path of the table's file. This is the path the [`Drop`] impl truncates
/// and/or removes when the table is flagged as deleted.
pub path: Arc<PathBuf>,
/// Tree ID half of the global table ID (paired with `metadata.id` in
/// `global_id`).
pub(crate) tree_id: TreeId,
/// Accessor for the table's file handle(s). The [`Drop`] impl swaps it for
/// [`FileAccessor::Closed`] so every handle is released before the file is
/// deleted.
#[doc(hidden)]
pub(crate) file_accessor: FileAccessor,
/// Filesystem backend for file operations (open, remove, etc.).
pub(crate) fs: Arc<dyn Fs>,
/// Parsed metadata
#[doc(hidden)]
pub metadata: ParsedMeta,
/// Parsed region block handles
#[doc(hidden)]
pub regions: ParsedRegions,
/// Translates key (first item of a block) to block offset (address inside file) and (compressed) size
// Replaced with a `BlockIndexImpl::Closed` placeholder in the Drop impl,
// because the block index may hold clones of the table's file handles.
#[doc(hidden)]
pub block_index: Arc<BlockIndexImpl>,
/// Block cache
///
/// Stores index and data blocks
#[doc(hidden)]
pub cache: Arc<Cache>,
/// Pinned filter index (in case of partitioned filters)
pub(super) pinned_filter_index: Option<IndexBlock>,
/// Pinned AMQ filter
pub pinned_filter_block: Option<FilterBlock>,
/// True when the table was compacted away or dropped
///
/// May be kept alive until all Arcs to the table have been dropped (to facilitate snapshots)
// Read with `Acquire` ordering in the Drop impl; file cleanup only runs
// when this flag is set.
pub is_deleted: AtomicBool,
/// Checksum associated with this table.
// NOTE(review): presumably the checksum of the whole table file — confirm
// against the writer / recovery code.
pub(super) checksum: Checksum,
/// Global sequence number associated with this table.
// NOTE(review): exact semantics are not visible from this file — confirm
// against the code that populates it.
pub(super) global_seqno: SeqNo,
/// Shared comparator.
// NOTE(review): presumably the key ordering used when reading this table —
// confirm against the read path.
pub(crate) comparator: SharedComparator,
/// Metrics handle; only present when the `metrics` feature is enabled.
#[cfg(feature = "metrics")]
pub(crate) metrics: Arc<Metrics>,
/// Cached sum of referenced blob file bytes for this table.
/// Lazily computed on first access to avoid repeated I/O in compaction decisions.
pub(crate) cached_blob_bytes: OnceLock<u64>,
/// Range tombstones stored in this table. Loaded on open.
pub(crate) range_tombstones: Vec<RangeTombstone>,
/// Inner zstd-block layout index, loaded on open from the optional
/// `block_layout` section. Empty (no entries) when the table has no
/// multi-inner-block data blocks. Lets a range query partial-decode only
/// the inner blocks covering a key range in a large cold block.
// Read only by the zstd partial-decode path; in a no-zstd build it is still
// loaded (always empty there, since no zstd blocks ever split) but unread.
#[cfg_attr(
not(feature = "zstd"),
expect(
dead_code,
reason = "consumed only by the zstd partial-decode read path"
)
)]
pub(crate) block_layout: BlockLayoutMap,
/// Block encryption provider for encryption at rest.
pub(crate) encryption: Option<Arc<dyn EncryptionProvider>>,
/// Pre-trained zstd dictionary for dictionary decompression.
#[cfg(zstd_any)]
pub(crate) zstd_dictionary: Option<Arc<crate::compression::ZstdDictionary>>,
/// Tree-wide file-deletion gate. Installed once by
/// [`Table::install_deletion_pause`](super::Table::install_deletion_pause)
/// after the table is registered with a tree. When `Some` and active,
/// the [`Drop`] impl defers the underlying `remove_file` call so that
/// an in-progress [`Tree::create_checkpoint`](crate::Tree::create_checkpoint)
/// can hard-link the file before it disappears.
// `once_cell::race::OnceBox` rather than `std::sync::OnceLock` so
// this field doesn't pin the type to `std` — OnceBox is no-std +
// alloc by construction. The slot is set once after recovery /
// compaction and read on every Drop; CAS-based race semantics are
// fine for this single-publisher, many-reader pattern.
pub(crate) deletion_pause: once_cell::race::OnceBox<Arc<DeletionPause>>,
/// Tree-wide background file deleter. Installed once by
/// [`Table::install_background_deleter`](super::Table::install_background_deleter)
/// after the table is registered with a tree. When present (and no
/// checkpoint pause is active), the [`Drop`] impl reclaims the SST's blocks
/// synchronously via [`Fs::truncate_file`](crate::fs::Fs::truncate_file) and
/// hands the directory-entry `unlink` to this deleter's worker, off the
/// foreground path. Absent (e.g. orphan cleanup before a tree owns the
/// table) the Drop falls back to a synchronous `remove_file`.
// std-only (the deleter spawns a thread); `no_std` builds never install
// one and keep the synchronous Drop path. OnceBox keeps the field itself
// alloc-friendly, matching `deletion_pause`.
#[cfg(feature = "std")]
pub(crate) background_deleter: once_cell::race::OnceBox<Arc<crate::BackgroundDeleter>>,
}
impl Inner {
    /// Builds the global table ID by pairing the tree ID with the table ID
    /// stored in the parsed metadata.
    #[must_use]
    pub(super) fn global_id(&self) -> GlobalTableId {
        let tree_id = self.tree_id;
        let table_id = self.metadata.id;

        (tree_id, table_id).into()
    }
}
impl Drop for Inner {
    /// Reclaims the table's file, but only if the table was flagged as deleted.
    ///
    /// Steps, in order:
    /// 1. Release every file handle (accessor, block index, cached descriptor).
    /// 2. If a checkpoint pause is installed and active, queue the removal
    ///    with the pause and stop.
    /// 3. With the `std` feature, if a background deleter is installed,
    ///    truncate the file when it is not hard-linked elsewhere and hand the
    ///    unlink to the deleter, then stop.
    /// 4. Otherwise remove the file synchronously.
    ///
    /// Failures are logged and never propagated.
    fn drop(&mut self) {
        use std::sync::atomic::Ordering;

        let global_id = self.global_id();

        // Tables that were never flagged as deleted keep their file.
        if !self.is_deleted.load(Ordering::Acquire) {
            return;
        }

        log::trace!("Cleanup deleted table {global_id:?} at {:?}", self.path);

        // Take ownership of the accessor and the block index, leaving
        // `Closed` placeholders behind. Both may own handles to the file
        // (the block index holds clones), and on Windows `remove_file`
        // fails while any handle is still open.
        let file_accessor = std::mem::replace(&mut self.file_accessor, FileAccessor::Closed);
        let block_index =
            std::mem::replace(&mut self.block_index, Arc::new(BlockIndexImpl::Closed));

        // Drop the cached FD from the descriptor table, if one is in use.
        if let Some(descriptor_table) = file_accessor.as_descriptor_table() {
            descriptor_table.remove_for_table(&global_id);
        }

        // Release all remaining handles before touching the file itself.
        drop(file_accessor);
        drop(block_index);

        // Checkpoint in progress: park the removal with the pause so the
        // file stays hard-linkable until the checkpoint releases it.
        //
        // The `is_active()` test is purely a fast-path gate for the common
        // no-checkpoint case; it avoids the `Arc<dyn Fs>` bump and the
        // `PathBuf` clone. `try_enqueue` re-checks activity under the queue
        // lock, so correctness does not depend on this outer check.
        let deferred = self.deletion_pause.get().is_some_and(|pause| {
            pause.is_active() && pause.try_enqueue(Arc::clone(&self.fs), (*self.path).clone())
        });

        if deferred {
            log::trace!(
                "Deferred deletion of table {global_id:?} at {:?} (checkpoint active)",
                self.path,
            );
            return;
        }

        // Background reclaim: free the blocks right away (so a footprint
        // scan sees the reclaim immediately) and leave the directory-entry
        // unlink to the deleter's worker. Without an installed deleter
        // (e.g. orphan cleanup before a tree owns the table) we fall
        // through to the synchronous removal below.
        #[cfg(feature = "std")]
        if let Some(deleter) = self.background_deleter.get() {
            // Truncating is only safe when ours is the sole hard link: a
            // completed checkpoint may share the inode, and truncating it
            // would zero the checkpoint's copy as well. A shared link or an
            // unknown link count means no truncate, just the unlink; the
            // blocks are freed once the last link goes away. (An
            // in-progress checkpoint was already handled above.)
            let sole_link = self
                .fs
                .hard_link_count(&self.path)
                .is_ok_and(|links| links <= 1);

            if sole_link && let Err(e) = self.fs.truncate_file(&self.path) {
                log::warn!(
                    "Failed to truncate deleted table {global_id:?} at {:?}: {e:?}",
                    self.path,
                );
            }

            deleter.enqueue(Arc::clone(&self.fs), (*self.path).clone());
            return;
        }

        // Synchronous fallback.
        if let Err(e) = self.fs.remove_file(&self.path) {
            log::warn!(
                "Failed to cleanup deleted table {global_id:?} at {:?}: {e:?}",
                self.path,
            );
        }
    }
}