farena/lib.rs
1//! File-backed arena allocator using `pread` for random access.
2//!
3//! Write data to a temporary file, then read it back by location.
4//! The data stays on disk instead of in memory, so your process doesn't use extra RAM.
5//!
6//! `pread` lets us read from any offset without seeking, which means:
7//! - No file position to manage between reads
8//! - Thread-safe: multiple threads can read concurrently without locking
9//!
10//! Use this when you need scratch space for bytes but can't afford to keep everything in memory.
11//!
12//! # Limitations
13//!
14//! - Each file is limited to 4 GiB (offsets are stored as `u32`). For larger data, use multiple files.
15//! - `FileArena` is immutable once built. To add more data, create a new writer,
16//! then build a new `FileArena` containing all files.
17//! - Temp files use your system's temp directory (`TMPDIR`). This crate does not check
18//! whether that directory is on a real disk — make sure it is not a RAM disk such as `tmpfs` or `ramfs`.
19//! - This crate performs many random reads. Use fast storage (e.g. an SSD) for best performance.
20//!
21//! # Ordering contract
22//!
23//! When using multiple files, the file at position `i` in the `files` vec must come
24//! from a `FileArenaWriter` created with index `i`. If the files are passed in the wrong
25//! order, reads silently return incorrect data instead of an error.
26//!
27//! Correct:
28//! ```text
29//! writer0 -> file0, writer1 -> file1
30//! FileArena::new(vec![file0, file1]) // file at index 0 from writer 0
31//! ```
32//!
33//! Incorrect:
34//! ```text
35//! writer0 -> file0, writer1 -> file1
36//! FileArena::new(vec![file1, file0]) // WRONG ORDER — will read garbage
37//! ```
38//!
39//! # Usage
40//!
41//! ```rust
42//! use farena::{FileArenaWriter, Location};
43//!
44//! // Write phase
45//! let mut writer = FileArenaWriter::new(0)?;
46//! let loc1 = writer.push("hello")?;
47//! let loc2 = writer.push(" world")?;
48//!
49//! // Read phase — into_arena() is a convenience for single-file arenas
50//! let arena = writer.into_arena()?;
51//!
52//! assert_eq!(arena.get(loc1)?, b"hello");
53//! assert_eq!(arena.get(loc2)?, b" world");
54//! # Ok::<_, std::io::Error>(())
55//! ```
56//!
57//! # Multiple files
58//!
59//! Each writer gets a unique index. Collect files in index order:
60//!
61//! ```rust
62//! # use farena::{FileArena, FileArenaWriter, Location};
63//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
64//! let mut w1 = FileArenaWriter::new(0)?;
65//! let loc1 = w1.push("data1")?;
66//! let f1 = w1.finish()?;
67//!
68//! let mut w2 = FileArenaWriter::new(1)?;
69//! let loc2 = w2.push("data2")?;
70//! let f2 = w2.finish()?;
71//!
72//! let arena = FileArena::new(vec![f1, f2])?;
73//! assert_eq!(arena.get(loc1)?, b"data1");
74//! assert_eq!(arena.get(loc2)?, b"data2");
75//! # Ok(())
76//! # }
77//! ```
78//!
79//! # Parallel writing
80//!
81//! The design supports parallel writing. Each writer gets a unique index,
82//! and you collect both the locations and files:
83//!
84//! ```rust,no_run
85//! # use farena::{FileArena, FileArenaWriter, Location};
86//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
87//! let items = vec!["item1", "item2", "item3", "item4"];
88//!
89//! // Each task returns (location, file)
90//! // Swap `.into_iter()` for rayon's `.into_par_iter()` to run the tasks in parallel
91//! let results: Vec<(Location, std::fs::File)> = (0..items.len())
92//! .into_iter()
93//! .map(|i| {
94//! let mut writer = FileArenaWriter::new(i as u16).unwrap();
95//! let loc = writer.push(items[i]).unwrap();
96//! let file = writer.finish().unwrap();
97//! (loc, file)
98//! })
99//! .collect();
100//!
101//! // Split into locations and files
102//! let (locations, files): (Vec<_>, Vec<_>) = results.into_iter().unzip();
103//! let arena = FileArena::new(files)?;
104//!
105//! // Now you can read back using the locations
106//! for loc in &locations {
107//! let data = arena.get(*loc)?;
108//! println!("Got: {}", String::from_utf8_lossy(&data));
109//! }
110//! # Ok(())
111//! # }
112//! ```
113//!
114//! # Graph/tree structures
115//!
116//! A common pattern is storing node metadata in memory while keeping
117//! large payloads on disk. This is useful when:
118//!
119//! - Payloads are large and would consume too much memory
120//! - You need to traverse the structure without loading all data at once
121//! - You construct long text by concatenating payloads (e.g., thread content)
122//!
123//! For example, a tree where each node has an ID and a text payload:
124//!
125//! ```rust,no_run
126//! # use farena::{FileArena, FileArenaWriter, Location};
127//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
128//! #[derive(Clone)]
129//! struct Node {
130//! id: u64,
131//! payload_loc: Location, // Text stored on disk
132//! children: Vec<u64>,
133//! }
134//!
135//! // Build your tree with Locations instead of storing text directly
136//! let mut nodes = Vec::new();
137//! let mut writer = FileArenaWriter::new(0)?;
138//!
139//! // Write payloads, store locations
140//! for (id, text) in &[("root", "root text"), ("child1", "child text")] {
141//! let loc = writer.push(*text)?;
142//! nodes.push(Node {
143//! id: hash(id), // Your own hash function
144//! payload_loc: loc,
145//! children: vec![],
146//! });
147//! }
148//!
149//! let arena = writer.into_arena()?;
150//!
151//! // Traverse and read payloads as needed
152//! // Note: get_str_into appends, so we create a fresh buffer each iteration
153//! for node in &nodes {
154//! let mut buf = String::new();
155//! arena.get_str_into(node.payload_loc, &mut buf)?;
156//! println!("Node {}: {}", node.id, buf);
157//! }
158//!
159//! // Or concatenate payloads into a single buffer
160//! let mut full_text = String::new();
161//! for node in &nodes {
162//! arena.get_str_into(node.payload_loc, &mut full_text)?;
163//! }
164//! // full_text now contains all payloads concatenated
165//! # fn hash(_: &str) -> u64 { 0 }
166//! # Ok(())
167//! # }
168//! ```
169//!
170//! # Buffer reuse
171//!
172//! Reuse the same buffer across multiple reads to avoid allocations:
173//!
174//! ```rust
175//! # use farena::{FileArenaWriter, Location};
176//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
177//! # let mut writer = FileArenaWriter::new(0)?;
178//! # let loc1 = writer.push("hello")?;
179//! # let loc2 = writer.push(" world")?;
180//! # let arena = writer.into_arena()?;
181//! let mut buf = Vec::new();
182//!
183//! arena.get_into(loc1, &mut buf)?;
184//! assert_eq!(buf, b"hello");
185//!
186//! buf.clear(); // Reuse without reallocating
187//! arena.get_into(loc2, &mut buf)?;
188//! assert_eq!(buf, b" world");
189//! # Ok(())
190//! # }
191//! ```
192//!
193//! # Unsafe reads
194//!
195//! If you know your stored data is valid UTF-8, you can use the `unsafe` method
196//! `get_str_into_unchecked` to skip UTF-8 validation. The caller must guarantee the bytes are valid UTF-8:
197//!
198//! ```rust
199//! # use farena::{FileArenaWriter, Location};
200//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
201//! # let mut writer = FileArenaWriter::new(0)?;
202//! # let loc = writer.push("known utf8")?;
203//! # let arena = writer.into_arena()?;
204//! let mut buf = String::new();
205//!
206//! // SAFETY: we pushed valid UTF-8 above
207//! unsafe { arena.get_str_into_unchecked(loc, &mut buf) }?;
208//! assert_eq!(buf, "known utf8");
209//! # Ok(())
210//! # }
211//! ```
212//!
213//! # Temp directory
214//!
215//! Temp files are created in your system's temp directory (respects `TMPDIR`).
216//! Check that your temp directory is on a real disk with:
217//!
218//! ```text
219//! df -h ${TMPDIR:-/tmp}
220//! ```
221//!
222//! The filesystem should not be `tmpfs` or `ramfs`.
223
224mod arena;
225mod location;
226mod writer;
227
228pub use arena::FileArena;
229pub use location::Location;
230pub use writer::FileArenaWriter;