reifydb_store_multi/tier/mod.rs
1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2025 ReifyDB
3
4use std::{collections::HashMap, ops::Bound};
5
6use reifydb_core::{common::CommitVersion, interface::store::EntryKind};
7use reifydb_type::{Result, util::cowvec::CowVec};
8
9/// A batch of key-value entries grouped by entry kind, used for atomic multi-table writes.
10pub type TierBatch = HashMap<EntryKind, Vec<(CowVec<u8>, Option<CowVec<u8>>)>>;
11
12/// A raw storage entry with version.
13///
14/// Value is None for tombstones (deletions).
15#[derive(Debug, Clone)]
16pub struct RawEntry {
17 pub key: CowVec<u8>,
18 pub version: CommitVersion,
19 pub value: Option<CowVec<u8>>,
20}
21
22/// A batch of range results with continuation info for pagination.
23#[derive(Debug, Clone)]
24pub struct RangeBatch {
25 /// The entries in this batch.
26 pub entries: Vec<RawEntry>,
27 /// Whether there are more entries after this batch.
28 pub has_more: bool,
29}
30
31impl RangeBatch {
32 /// Creates an empty batch with no more results.
33 pub fn empty() -> Self {
34 Self {
35 entries: Vec::new(),
36 has_more: false,
37 }
38 }
39
40 /// Returns true if this batch contains no entries.
41 pub fn is_empty(&self) -> bool {
42 self.entries.is_empty()
43 }
44}
45
46/// Cursor state for streaming range queries.
47///
48/// Tracks position within a range scan, enabling efficient continuation
49/// across multiple batches without re-scanning from the beginning.
50#[derive(Debug, Clone)]
51pub struct RangeCursor {
52 /// Last key seen in the previous batch (for Bound::Excluded continuation)
53 pub last_key: Option<CowVec<u8>>,
54 /// Whether this stream is exhausted
55 pub exhausted: bool,
56}
57
58impl RangeCursor {
59 /// Create a new cursor at the start of a range.
60 pub fn new() -> Self {
61 Self {
62 last_key: None,
63 exhausted: false,
64 }
65 }
66
67 /// Check if the stream is exhausted.
68 pub fn is_exhausted(&self) -> bool {
69 self.exhausted
70 }
71}
72
73impl Default for RangeCursor {
74 fn default() -> Self {
75 Self::new()
76 }
77}
78
79/// The tier storage trait.
80///
81/// This is intentionally minimal - just raw bytes in/out.
82/// Version is a first-class parameter for all operations.
83/// All MVCC, CDC, and routing logic belongs in the store layer above.
84///
85/// Implementations must be thread-safe and cloneable.
86pub trait TierStorage: Send + Sync + Clone + 'static {
87 /// Get the value for a key at or before the given version.
88 fn get(&self, table: EntryKind, key: &[u8], version: CommitVersion) -> Result<Option<CowVec<u8>>>;
89
90 /// Check if a key exists at or before the given version.
91 fn contains(&self, table: EntryKind, key: &[u8], version: CommitVersion) -> Result<bool> {
92 Ok(self.get(table, key, version)?.is_some())
93 }
94
95 /// Write entries to multiple tables atomically at the given version.
96 ///
97 /// All entries across all tables are written in a single transaction.
98 /// This ensures durability and atomicity for multi-table commits.
99 fn set(&self, version: CommitVersion, batches: TierBatch) -> Result<()>;
100
101 /// Fetch the next batch of entries in key order at or before version.
102 ///
103 /// Uses the cursor to track position. On first call, cursor should be new.
104 /// On subsequent calls, pass the same cursor to continue from where left off.
105 /// Returns up to `batch_size` entries. The cursor is updated with the last
106 /// key seen, and `exhausted` is set to true when no more entries remain.
107 fn range_next(
108 &self,
109 table: EntryKind,
110 cursor: &mut RangeCursor,
111 start: Bound<&[u8]>,
112 end: Bound<&[u8]>,
113 version: CommitVersion,
114 batch_size: usize,
115 ) -> Result<RangeBatch>;
116
117 /// Fetch the next batch of entries in reverse key order at or before version.
118 ///
119 /// Uses the cursor to track position. On first call, cursor should be new.
120 /// On subsequent calls, pass the same cursor to continue from where left off.
121 /// Returns up to `batch_size` entries. The cursor is updated with the last
122 /// key seen, and `exhausted` is set to true when no more entries remain.
123 fn range_rev_next(
124 &self,
125 table: EntryKind,
126 cursor: &mut RangeCursor,
127 start: Bound<&[u8]>,
128 end: Bound<&[u8]>,
129 version: CommitVersion,
130 batch_size: usize,
131 ) -> Result<RangeBatch>;
132
133 /// Ensure a table exists (creates if needed).
134 ///
135 /// For memory backends this is typically a no-op.
136 /// For SQL backends this may create tables.
137 fn ensure_table(&self, table: EntryKind) -> Result<()>;
138
139 /// Delete all entries in a table.
140 fn clear_table(&self, table: EntryKind) -> Result<()>;
141
142 /// Physically drop specific versions of entries from storage.
143 ///
144 /// Unlike `set()` with None values which inserts tombstones for MVCC,
145 /// this method actually removes entries from storage to reclaim memory.
146 /// Used by the drop worker to erase old versions after they're no longer needed.
147 ///
148 /// Each entry in the batch is a (key, version) pair identifying the specific
149 /// version of the key to remove.
150 fn drop(&self, batches: HashMap<EntryKind, Vec<(CowVec<u8>, CommitVersion)>>) -> Result<()>;
151
152 /// Get all versions of a specific key (for internal cleanup operations).
153 ///
154 /// Unlike `get()` which does MVCC resolution, this returns ALL stored versions
155 /// of the key with their values. Used by the drop worker to discover which
156 /// versions exist before deciding which to clean up.
157 ///
158 /// Returns a vector of (version, value) pairs, sorted by version descending.
159 fn get_all_versions(&self, table: EntryKind, key: &[u8]) -> Result<Vec<(CommitVersion, Option<CowVec<u8>>)>>;
160}
161
162/// Marker trait for storage tiers that support the tier storage interface.
163pub trait TierBackend: TierStorage {}