json_archive/write_strategy.rs
1// json-archive is a tool for tracking JSON file changes over time
2// Copyright (C) 2025 Peoples Grocers LLC
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU Affero General Public License as published
6// by the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU Affero General Public License for more details.
13//
14// You should have received a copy of the GNU Affero General Public License
15// along with this program. If not, see <https://www.gnu.org/licenses/>.
16//
17// To purchase a license under different terms contact admin@peoplesgrocers.com
18// To request changes, report bugs, or give user feedback contact
19// marxism@peoplesgrocers.com
20//
21
22//! Write strategy for archive operations.
23//!
24//! There are exactly two questions:
25//! 1. Where do we write? (dest_path)
26//! 2. Can we write there directly, or do we need to dance?
27//!
28//! The dance (temp file + atomic swap) is required when:
29//! - source_path == dest_path, AND
30//! - the file is compressed
31//!
32//! Why? Compressed streams don't support append. To add one
33//! record to a gzip file, you decompress everything, add the
34//! record, recompress everything. If you write to the same
35//! file you're reading, you corrupt it mid-operation.
36//!
37//! So: write to temp, swap when done. See atomic_file.rs.
38//!
39//! When source != dest, there is no conflict. Read from source,
40//! write to dest. Even if source is compressed. Even if dest
41//! is compressed. Even if they use different compression.
42//! The source is never modified.
43//!
44//! When source == dest AND uncompressed, just append. Seek to
45//! end, write new records. Simple.
46//!
47//! The output compression format is determined by dest_path's
48//! extension, not the source's format. That's a separate concern.
49//!
50//! ## Truth Table
51//!
52//! ```text
//! INPUTS                   OUTPUT FLAG         STRATEGY
//! ──────────────────────────────────────────────────────────────────────────────────────────────
//! [A.json, B.json]         (none)              Create { out: A.json.archive, fmt: None }
//! [A.json, B.json]         -o X.archive.gz     Create { out: X.archive.gz, fmt: Gzip }
//!
//! [A.archive, B.json]      (none)              Append { path: A.archive }
//! [A.archive, B.json]      -o X.archive        CopyOnWrite { in: A.archive/None, out: X.archive/None }
//!
//! [A.archive.gz, B.json]   (none)              AtomicSwap { path: A.archive.gz, fmt: Gzip, temp: .A.archive.gz.xxx }
//! [A.archive.gz, B.json]   -o A.archive.gz     AtomicSwap { path: A.archive.gz, fmt: Gzip, temp: .A.archive.gz.xxx }
//! [A.archive.gz, B.json]   -o X.archive        CopyOnWrite { in: A.archive.gz/Gzip, out: X.archive/None }
//! [A.archive.gz, B.json]   -o X.archive.br     CopyOnWrite { in: A.archive.gz/Gzip, out: X.archive.br/Brotli }
65//! ```
66//!
67//! The rule:
68//! ```text
//! if creating new archive:
//!     Create
//! else if source != dest:
//!     CopyOnWrite (read from source, write to dest, transcoding as needed)
//! else if source == dest AND uncompressed:
//!     Append (seek to end, write)
//! else if source == dest AND compressed:
//!     AtomicSwap (read all, write to temp, swap)
77//! ```
78
79use std::path::{Path, PathBuf};
80
81use crate::atomic_file::generate_temp_filename;
82use crate::detection::CompressionFormat;
83
/// A path with its compression format.
///
/// The first element is the file location; the second is the
/// [`CompressionFormat`] associated with that file.
pub type CompressedPath = (PathBuf, CompressionFormat);
86
/// Describes how to write archive data based on input/output paths and compression.
///
/// Values are produced by [`determine_strategy`]. Each variant carries the
/// paths and compression formats relevant to that way of writing.
#[derive(Debug, Clone)]
pub enum WriteStrategy {
    /// Create a new archive from scratch. No existing archive to read.
    ///
    /// `output` is the destination path paired with its compression format.
    Create { output: CompressedPath },

    /// Append to an existing uncompressed archive in-place.
    /// Just seek to end and write new records.
    ///
    /// `path` is the archive that is both read and extended.
    Append { path: PathBuf },

    /// Read from one location, write to another.
    /// Handles transcoding between compression formats.
    CopyOnWrite {
        /// Existing archive to read, with its compression format.
        input: CompressedPath,
        /// Destination to write, with its compression format.
        output: CompressedPath,
    },

    /// Read compressed archive, write to temp, atomic swap.
    /// Required when source == dest AND compressed.
    AtomicSwap {
        /// The archive path (both input and output)
        path: PathBuf,
        /// Compression format (same for input and output in this case)
        compression: CompressionFormat,
        /// Temp file to write to before swapping
        temp_path: PathBuf,
    },
}
115
116/// Determine compression format from file extension.
117///
118/// Returns `CompressionFormat::None` for uncompressed files.
119pub fn compression_from_extension(path: &Path) -> CompressionFormat {
120 let s = path.to_string_lossy();
121 if s.ends_with(".gz") {
122 CompressionFormat::Gzip
123 } else if s.ends_with(".br") {
124 CompressionFormat::Brotli
125 } else if s.ends_with(".zst") {
126 CompressionFormat::Zstd
127 } else if s.ends_with(".zlib") {
128 CompressionFormat::Zlib
129 } else {
130 CompressionFormat::None
131 }
132}
133
134/// Determine write strategy from parsed arguments.
135///
136/// # Arguments
137///
138/// * `source_archive` - Path to existing archive if appending, None if creating new
139/// * `dest_path` - Where to write the output
140/// * `source_compression` - Compression format of source (from magic bytes). Pass
141/// `CompressionFormat::None` if unknown or uncompressed.
142///
143/// # Returns
144///
145/// The appropriate `WriteStrategy` for this operation.
146pub fn determine_strategy(
147 source_archive: Option<&Path>,
148 dest_path: &Path,
149 source_compression: CompressionFormat,
150) -> WriteStrategy {
151 let dest_compression = compression_from_extension(dest_path);
152
153 // No source archive? Creating new.
154 let Some(source) = source_archive else {
155 return WriteStrategy::Create {
156 output: (dest_path.to_path_buf(), dest_compression),
157 };
158 };
159
160 // Check if source and dest are the same file
161 let same_file = match (source.canonicalize(), dest_path.canonicalize()) {
162 (Ok(s), Ok(d)) => s == d,
163 // dest doesn't exist yet, or other error - not same file
164 _ => false,
165 };
166
167 if !same_file {
168 // Different files: read from source, write to dest
169 let source_fmt = if source_compression == CompressionFormat::None {
170 compression_from_extension(source)
171 } else {
172 source_compression
173 };
174 return WriteStrategy::CopyOnWrite {
175 input: (source.to_path_buf(), source_fmt),
176 output: (dest_path.to_path_buf(), dest_compression),
177 };
178 }
179
180 // Same file - check if compressed
181 let compression = if source_compression == CompressionFormat::None {
182 compression_from_extension(source)
183 } else {
184 source_compression
185 };
186
187 match compression {
188 CompressionFormat::None => {
189 // Uncompressed: can append in-place
190 WriteStrategy::Append {
191 path: dest_path.to_path_buf(),
192 }
193 }
194 fmt => {
195 // Compressed: need atomic swap
196 WriteStrategy::AtomicSwap {
197 path: dest_path.to_path_buf(),
198 compression: fmt,
199 temp_path: generate_temp_filename(dest_path),
200 }
201 }
202 }
203}
204
/// Unit tests for extension-based format detection and strategy selection.
///
/// Tests that need "source == dest" use real temp files so that both paths
/// can be canonicalized; tests for distinct files use a destination under
/// `/tmp` that is only named, never created.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write as IoWrite;
    use tempfile::NamedTempFile;

    /// Every recognised suffix maps to its format; anything else is `None`.
    #[test]
    fn test_compression_from_extension() {
        assert_eq!(
            compression_from_extension(Path::new("foo.json.archive.gz")),
            CompressionFormat::Gzip
        );
        assert_eq!(
            compression_from_extension(Path::new("foo.json.archive.br")),
            CompressionFormat::Brotli
        );
        assert_eq!(
            compression_from_extension(Path::new("foo.json.archive.zst")),
            CompressionFormat::Zstd
        );
        assert_eq!(
            compression_from_extension(Path::new("foo.json.archive.zlib")),
            CompressionFormat::Zlib
        );
        assert_eq!(
            compression_from_extension(Path::new("foo.json.archive")),
            CompressionFormat::None
        );
        assert_eq!(
            compression_from_extension(Path::new("foo.json")),
            CompressionFormat::None
        );
    }

    /// No source archive and an uncompressed destination yields `Create`.
    #[test]
    fn test_create_new_archive() {
        let dest = Path::new("/tmp/new.json.archive");
        let strategy = determine_strategy(None, dest, CompressionFormat::None);

        match strategy {
            WriteStrategy::Create { output } => {
                assert_eq!(output.0, PathBuf::from("/tmp/new.json.archive"));
                assert_eq!(output.1, CompressionFormat::None);
            }
            _ => panic!("Expected Create strategy"),
        }
    }

    /// `Create` takes its output format from the destination extension.
    #[test]
    fn test_create_new_compressed_archive() {
        let dest = Path::new("/tmp/new.json.archive.gz");
        let strategy = determine_strategy(None, dest, CompressionFormat::None);

        match strategy {
            WriteStrategy::Create { output } => {
                assert_eq!(output.0, PathBuf::from("/tmp/new.json.archive.gz"));
                assert_eq!(output.1, CompressionFormat::Gzip);
            }
            _ => panic!("Expected Create strategy"),
        }
    }

    /// Same uncompressed file as source and dest yields `Append`.
    #[test]
    fn test_append_uncompressed_same_file() -> Result<(), Box<dyn std::error::Error>> {
        let mut temp = NamedTempFile::with_suffix(".json.archive")?;
        writeln!(temp, "test")?;
        temp.flush()?;

        let path = temp.path();
        let strategy = determine_strategy(Some(path), path, CompressionFormat::None);

        match strategy {
            WriteStrategy::Append { path: p } => {
                assert_eq!(p, path);
            }
            _ => panic!("Expected Append strategy, got {:?}", strategy),
        }

        Ok(())
    }

    /// Same compressed file as source and dest yields `AtomicSwap`.
    #[test]
    fn test_atomic_swap_compressed_same_file() -> Result<(), Box<dyn std::error::Error>> {
        let mut temp = NamedTempFile::with_suffix(".json.archive.gz")?;
        writeln!(temp, "test")?;
        temp.flush()?;

        let path = temp.path();
        let strategy = determine_strategy(Some(path), path, CompressionFormat::Gzip);

        match strategy {
            WriteStrategy::AtomicSwap {
                path: p,
                compression,
                temp_path,
            } => {
                assert_eq!(p, path);
                assert_eq!(compression, CompressionFormat::Gzip);
                assert!(temp_path.to_string_lossy().contains(".json.archive.gz"));
            }
            _ => panic!("Expected AtomicSwap strategy, got {:?}", strategy),
        }

        Ok(())
    }

    /// When the caller reports `None` for the source format, the extension
    /// decides whether the same-file case needs an atomic swap.
    #[test]
    fn test_atomic_swap_falls_back_to_extension() -> Result<(), Box<dyn std::error::Error>> {
        let mut temp = NamedTempFile::with_suffix(".json.archive.gz")?;
        writeln!(temp, "test")?;
        temp.flush()?;

        let path = temp.path();
        let strategy = determine_strategy(Some(path), path, CompressionFormat::None);

        match strategy {
            WriteStrategy::AtomicSwap { compression, .. } => {
                assert_eq!(compression, CompressionFormat::Gzip);
            }
            _ => panic!("Expected AtomicSwap strategy, got {:?}", strategy),
        }

        Ok(())
    }

    /// Distinct source and dest yield `CopyOnWrite` with both paths preserved.
    #[test]
    fn test_copy_on_write_different_files() -> Result<(), Box<dyn std::error::Error>> {
        let mut source = NamedTempFile::with_suffix(".json.archive")?;
        writeln!(source, "test")?;
        source.flush()?;

        let dest = Path::new("/tmp/different.json.archive");
        let strategy = determine_strategy(Some(source.path()), dest, CompressionFormat::None);

        match strategy {
            WriteStrategy::CopyOnWrite { input, output } => {
                assert_eq!(input.0, source.path());
                assert_eq!(input.1, CompressionFormat::None);
                assert_eq!(output.0, PathBuf::from("/tmp/different.json.archive"));
                assert_eq!(output.1, CompressionFormat::None);
            }
            _ => panic!("Expected CopyOnWrite strategy, got {:?}", strategy),
        }

        Ok(())
    }

    /// `CopyOnWrite` keeps the source format and takes the output format from
    /// the destination extension, so the two may differ.
    #[test]
    fn test_copy_on_write_transcode_compression() -> Result<(), Box<dyn std::error::Error>> {
        let mut source = NamedTempFile::with_suffix(".json.archive.gz")?;
        writeln!(source, "test")?;
        source.flush()?;

        let dest = Path::new("/tmp/output.json.archive.br");
        let strategy = determine_strategy(Some(source.path()), dest, CompressionFormat::Gzip);

        match strategy {
            WriteStrategy::CopyOnWrite { input, output } => {
                assert_eq!(input.1, CompressionFormat::Gzip);
                assert_eq!(output.1, CompressionFormat::Brotli);
            }
            _ => panic!("Expected CopyOnWrite strategy, got {:?}", strategy),
        }

        Ok(())
    }
}