farena/lib.rs
1//! File-backed arena allocator using `pread` for random access.
2//!
3//! Write data to a temporary file, then read it back by location.
4//! The data stays on disk instead of in memory, so your process doesn't use extra RAM.
5//!
6//! `pread` lets us read from any offset without seeking, which means:
7//! - No file position to manage between reads
8//! - Thread-safe: multiple threads can read concurrently without locking
9//!
10//! Use this when you need scratch space for bytes but can't afford to keep everything in memory.
11//!
12//! # Limitations
13//!
14//! - Each file is limited to 4 GiB (`u32` offsets). For larger data, use multiple files.
15//! - `FileArena` is immutable once built. To add more data, create a new writer,
16//! then build a new `FileArena` containing all files.
17//! - Temp files use your system's temp directory (`TMPDIR`). This crate doesn't check
18//! whether that directory is on a real disk — make sure it's not RAM-backed (`tmpfs` or `ramfs`).
19//! - This crate does many random reads. Use fast storage (e.g. an SSD) for best performance.
20//! - Each file in a `FileArena` keeps one file descriptor open for its lifetime.
21//! Creating arenas with thousands of files may hit your open-file limit (`ulimit -n`).
22//! Monitor usage with `lsof -p <PID> | wc -l`, where `<PID>` is your process's ID (not `$$`,
23//! which is the shell's own PID). Increase the limit or reduce the file count if needed.
24//!
25//! # Building multi-file arenas
26//!
27//! Use [`FileArenaBuilder`] to assemble arenas from multiple writers.
28//! It puts the files in the correct order automatically, so you don't need to
29//! pass them in index order yourself (as the low-level `FileArena::new` requires):
30//!
31//! ```rust
32//! # use farena::{FileArenaWriter, FileArenaBuilder};
33//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
34//! let mut w0 = FileArenaWriter::new(0)?;
35//! let loc0 = w0.push("data0")?;
36//! let f0 = w0.finish()?;
37//!
38//! let mut w1 = FileArenaWriter::new(1)?;
39//! let loc1 = w1.push("data1")?;
40//! let f1 = w1.finish()?;
41//!
42//! let mut builder = FileArenaBuilder::new();
43//! builder.add(f1, loc1); // Order doesn't matter
44//! builder.add(f0, loc0);
45//! let arena = builder.build()?;
46//!
47//! assert_eq!(arena.get(loc0)?, b"data0");
48//! assert_eq!(arena.get(loc1)?, b"data1");
49//! # Ok(())
50//! # }
51//! ```
52//!
53//! # Usage
54//!
55//! ```rust
56//! use farena::{FileArenaWriter, Location};
57//!
58//! // Write phase
59//! let mut writer = FileArenaWriter::new(0)?;
60//! let loc1 = writer.push("hello")?;
61//! let loc2 = writer.push(" world")?;
62//!
63//! // Read phase — into_arena() is a convenience for single-file arenas
64//! let arena = writer.into_arena()?;
65//!
66//! assert_eq!(arena.get(loc1)?, b"hello");
67//! assert_eq!(arena.get(loc2)?, b" world");
68//! # Ok::<_, std::io::Error>(())
69//! ```
70//!
71//! # Multiple files (low-level)
72//!
73//! **Prefer [`FileArenaBuilder`] above** — it enforces the ordering
74//! contract automatically. `FileArena::new` is the low-level alternative.
75//!
76//! Each writer gets a unique index. Files must be passed to
77//! `FileArena::new` in index order:
78//!
79//! ```rust
80//! # use farena::{FileArena, FileArenaWriter, Location};
81//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
82//! let mut w0 = FileArenaWriter::new(0)?;
83//! let loc0 = w0.push("data0")?;
84//! let f0 = w0.finish()?;
85//!
86//! let mut w1 = FileArenaWriter::new(1)?;
87//! let loc1 = w1.push("data1")?;
88//! let f1 = w1.finish()?;
89//!
90//! let arena = FileArena::new(vec![f0, f1])?;
91//! assert_eq!(arena.get(loc0)?, b"data0");
92//! assert_eq!(arena.get(loc1)?, b"data1");
93//! # Ok(())
94//! # }
95//! ```
96//!
97//! # Parallel writing
98//!
99//! The design supports parallel writing. Each writer gets a unique index,
100//! and [`FileArenaBuilder`] handles assembling the arena:
101//!
102//! ```rust,no_run
103//! # use farena::{FileArenaWriter, FileArenaBuilder, Location};
104//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
105//! let items = vec!["item1", "item2", "item3", "item4"];
106//!
107//! // Sequential here; swap `.into_iter()` for rayon's `.into_par_iter()` to run in parallel
108//! let results: Vec<(Location, std::fs::File)> = (0..items.len())
109//! .into_iter()
110//! .map(|i| {
111//! let mut writer = FileArenaWriter::new(i as u16).unwrap();
112//! let loc = writer.push(items[i]).unwrap();
113//! let file = writer.finish().unwrap();
114//! (loc, file)
115//! })
116//! .collect();
117//!
118//! // Builder places files in the correct order automatically
119//! let mut builder = FileArenaBuilder::new();
120//! for (loc, file) in results {
121//! builder.add(file, loc);
122//! }
123//! let arena = builder.build()?;
124//! # Ok(())
125//! # }
126//! ```
127//!
128//! # Graph/tree structures
129//!
130//! A common pattern is storing node metadata in memory while keeping
131//! large payloads on disk. This is useful when:
132//!
133//! - Payloads are large and would consume too much memory
134//! - You need to traverse the structure without loading all data at once
135//! - You construct long text by concatenating payloads (e.g., thread content)
136//!
137//! For example, a tree where each node has an ID and a text payload:
138//!
139//! ```rust,no_run
140//! # use farena::{FileArena, FileArenaWriter, Location};
141//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
142//! #[derive(Clone)]
143//! struct Node {
144//! id: u64,
145//! payload_loc: Location, // Text stored on disk
146//! children: Vec<u64>,
147//! }
148//!
149//! // Build your tree with Locations instead of storing text directly
150//! let mut nodes = Vec::new();
151//! let mut writer = FileArenaWriter::new(0)?;
152//!
153//! // Write payloads, store locations
154//! for (id, text) in &[("root", "root text"), ("child1", "child text")] {
155//! let loc = writer.push(*text)?;
156//! nodes.push(Node {
157//! id: hash(id), // Your own hash function
158//! payload_loc: loc,
159//! children: vec![],
160//! });
161//! }
162//!
163//! let arena = writer.into_arena()?;
164//!
165//! // Traverse and read payloads as needed
166//! // Note: get_str_into appends, so we create a fresh buffer each iteration
167//! for node in &nodes {
168//! let mut buf = String::new();
169//! arena.get_str_into(node.payload_loc, &mut buf)?;
170//! println!("Node {}: {}", node.id, buf);
171//! }
172//!
173//! // Or concatenate payloads into a single buffer
174//! let mut full_text = String::new();
175//! for node in &nodes {
176//! arena.get_str_into(node.payload_loc, &mut full_text)?;
177//! }
178//! // full_text now contains all payloads concatenated
179//! # fn hash(_: &str) -> u64 { 0 }
180//! # Ok(())
181//! # }
182//! ```
183//!
184//! # Buffer reuse
185//!
186//! Reuse the same buffer across multiple reads to avoid allocations:
187//!
188//! ```rust
189//! # use farena::{FileArenaWriter, Location};
190//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
191//! # let mut writer = FileArenaWriter::new(0)?;
192//! # let loc1 = writer.push("hello")?;
193//! # let loc2 = writer.push(" world")?;
194//! # let arena = writer.into_arena()?;
195//! let mut buf = Vec::new();
196//!
197//! arena.get_into(loc1, &mut buf)?;
198//! assert_eq!(buf, b"hello");
199//!
200//! buf.clear(); // Reuse without reallocating
201//! arena.get_into(loc2, &mut buf)?;
202//! assert_eq!(buf, b" world");
203//! # Ok(())
204//! # }
205//! ```
206//!
207//! # Unsafe reads
208//!
209//! If you know your stored data is valid UTF-8, use `get_str_into_unchecked`
210//! to skip the UTF-8 validation:
211//!
212//! ```rust
213//! # use farena::{FileArenaWriter, Location};
214//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
215//! # let mut writer = FileArenaWriter::new(0)?;
216//! # let loc = writer.push("known utf8")?;
217//! # let arena = writer.into_arena()?;
218//! let mut buf = String::new();
219//!
220//! // SAFETY: we pushed valid UTF-8 above
221//! unsafe { arena.get_str_into_unchecked(loc, &mut buf) }?;
222//! assert_eq!(buf, "known utf8");
223//! # Ok(())
224//! # }
225//! ```
226//!
227//! # Temp directory
228//!
229//! Temp files are created in your system's temp directory (respects `TMPDIR`).
230//! Check that your temp directory is on a real disk with:
231//!
232//! ```text
233//! df -h ${TMPDIR:-/tmp}
234//! ```
235//!
236//! The filesystem should not be `tmpfs` or `ramfs`.
237
238mod arena;
239mod builder;
240mod location;
241mod writer;
242
243pub use arena::FileArena;
244pub use builder::FileArenaBuilder;
245pub use location::Location;
246pub use writer::FileArenaWriter;