1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
//! File-backed arena allocator using `pread` for random access.
//!
//! Write data to a temporary file, then read it back by location.
//! The data stays on disk instead of in memory, so your process doesn't use extra RAM.
//!
//! `pread` lets us read from any offset without seeking, which means:
//! - No file position to manage between reads
//! - Thread-safe: multiple threads can read concurrently without locking
//!
//! Use this when you need scratch space for bytes but can't afford to keep everything in memory.
//!
//! # Limitations
//!
//! - Each file is limited to 4 GiB (offsets are stored as `u32`). For larger data, use multiple files.
//! - `FileArena` is immutable once built. To add more data, create a new writer,
//! then build a new `FileArena` containing all files.
//! - Temp files use your system's temp directory (`TMPDIR`). This crate doesn't check
//! whether that directory is on a real disk - make sure it's not a ramdisk like `tmpfs` or `ramfs`.
//! - This crate does many random reads. Use fast storage for best performance.
//!
//! # Ordering contract
//!
//! When using multiple files, the file at position `i` in the `files` vec must come
//! from a `FileArenaWriter` created with index `i`. If the files are passed in the
//! wrong order, reads silently return incorrect data instead of an error.
//!
//! Correct:
//! ```text
//! writer0 -> file0, writer1 -> file1
//! FileArena::new(vec![file0, file1]) // file at index 0 from writer 0
//! ```
//!
//! Incorrect:
//! ```text
//! writer0 -> file0, writer1 -> file1
//! FileArena::new(vec![file1, file0]) // WRONG ORDER — will read garbage
//! ```
//!
//! # Usage
//!
//! ```rust
//! use farena::{FileArenaWriter, Location};
//!
//! // Write phase
//! let mut writer = FileArenaWriter::new(0)?;
//! let loc1 = writer.push("hello")?;
//! let loc2 = writer.push(" world")?;
//!
//! // Read phase — into_arena() is a convenience for single-file arenas
//! let arena = writer.into_arena()?;
//!
//! assert_eq!(arena.get(loc1)?, b"hello");
//! assert_eq!(arena.get(loc2)?, b" world");
//! # Ok::<_, std::io::Error>(())
//! ```
//!
//! # Multiple files
//!
//! Each writer gets a unique index. Collect files in index order:
//!
//! ```rust
//! # use farena::{FileArena, FileArenaWriter, Location};
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let mut w1 = FileArenaWriter::new(0)?;
//! let loc1 = w1.push("data1")?;
//! let f1 = w1.finish()?;
//!
//! let mut w2 = FileArenaWriter::new(1)?;
//! let loc2 = w2.push("data2")?;
//! let f2 = w2.finish()?;
//!
//! let arena = FileArena::new(vec![f1, f2])?;
//! assert_eq!(arena.get(loc1)?, b"data1");
//! assert_eq!(arena.get(loc2)?, b"data2");
//! # Ok(())
//! # }
//! ```
//!
//! # Parallel writing
//!
//! The design supports parallel writing. Each writer gets a unique index,
//! and you collect both the locations and files:
//!
//! ```rust,no_run
//! # use farena::{FileArena, FileArenaWriter, Location};
//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
//! let items = vec!["item1", "item2", "item3", "item4"];
//!
//! // Each task returns (location, file)
//! // Use .into_par_iter() with rayon for parallel execution
//! let results: Vec<(Location, std::fs::File)> = (0..items.len())
//!     .into_iter()
//!     .map(|i| {
//!         let mut writer = FileArenaWriter::new(i as u16).unwrap();
//!         let loc = writer.push(items[i]).unwrap();
//!         let file = writer.finish().unwrap();
//!         (loc, file)
//!     })
//!     .collect();
//!
//! // Split into locations and files
//! let (locations, files): (Vec<_>, Vec<_>) = results.into_iter().unzip();
//! let arena = FileArena::new(files)?;
//!
//! // Now you can read back using the locations
//! for loc in &locations {
//!     let data = arena.get(*loc)?;
//!     println!("Got: {}", String::from_utf8_lossy(&data));
//! }
//! # Ok(())
//! # }
//! ```
//!
//! # Graph/tree structures
//!
//! A common pattern is storing node metadata in memory while keeping
//! large payloads on disk. This is useful when:
//!
//! - Payloads are large and would consume too much memory
//! - You need to traverse the structure without loading all data at once
//! - You construct long text by concatenating payloads (e.g., thread content)
//!
//! For example, a tree where each node has an ID and a text payload:
//!
//! ```rust,no_run
//! # use farena::{FileArena, FileArenaWriter, Location};
//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
//! #[derive(Clone)]
//! struct Node {
//!     id: u64,
//!     payload_loc: Location, // Text stored on disk
//!     children: Vec<u64>,
//! }
//!
//! // Build your tree with Locations instead of storing text directly
//! let mut nodes = Vec::new();
//! let mut writer = FileArenaWriter::new(0)?;
//!
//! // Write payloads, store locations
//! for (id, text) in &[("root", "root text"), ("child1", "child text")] {
//!     let loc = writer.push(*text)?;
//!     nodes.push(Node {
//!         id: hash(id), // Your own hash function
//!         payload_loc: loc,
//!         children: vec![],
//!     });
//! }
//!
//! let arena = writer.into_arena()?;
//!
//! // Traverse and read payloads as needed
//! // Note: get_str_into appends, so we create a fresh buffer each iteration
//! for node in &nodes {
//!     let mut buf = String::new();
//!     arena.get_str_into(node.payload_loc, &mut buf)?;
//!     println!("Node {}: {}", node.id, buf);
//! }
//!
//! // Or concatenate payloads into a single buffer
//! let mut full_text = String::new();
//! for node in &nodes {
//!     arena.get_str_into(node.payload_loc, &mut full_text)?;
//! }
//! // full_text now contains all payloads concatenated
//! # fn hash(_: &str) -> u64 { 0 }
//! # Ok(())
//! # }
//! ```
//!
//! # Buffer reuse
//!
//! Reuse the same buffer across multiple reads to avoid allocations:
//!
//! ```rust
//! # use farena::{FileArenaWriter, Location};
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! # let mut writer = FileArenaWriter::new(0)?;
//! # let loc1 = writer.push("hello")?;
//! # let loc2 = writer.push(" world")?;
//! # let arena = writer.into_arena()?;
//! let mut buf = Vec::new();
//!
//! arena.get_into(loc1, &mut buf)?;
//! assert_eq!(buf, b"hello");
//!
//! buf.clear(); // Reuse without reallocating
//! arena.get_into(loc2, &mut buf)?;
//! assert_eq!(buf, b" world");
//! # Ok(())
//! # }
//! ```
//!
//! # Unsafe reads
//!
//! If you know your stored data is valid UTF-8, use `get_str_into_unchecked`
//! to skip the UTF-8 validation:
//!
//! ```rust
//! # use farena::{FileArenaWriter, Location};
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! # let mut writer = FileArenaWriter::new(0)?;
//! # let loc = writer.push("known utf8")?;
//! # let arena = writer.into_arena()?;
//! let mut buf = String::new();
//!
//! // SAFETY: we pushed valid UTF-8 above
//! unsafe { arena.get_str_into_unchecked(loc, &mut buf) }?;
//! assert_eq!(buf, "known utf8");
//! # Ok(())
//! # }
//! ```
//!
//! # Temp directory
//!
//! Temp files are created in your system's temp directory (respects `TMPDIR`).
//! Check that your temp directory is on a real disk with:
//!
//! ```text
//! df -h ${TMPDIR:-/tmp}
//! ```
//!
//! The filesystem should not be `tmpfs` or `ramfs`.
pub use FileArena;
pub use Location;
pub use FileArenaWriter;