json_archive/
detection.rs

1// json-archive is a tool for tracking JSON file changes over time
2// Copyright (C) 2025  Peoples Grocers LLC
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU Affero General Public License as published
6// by the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU Affero General Public License for more details.
13//
14// You should have received a copy of the GNU Affero General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16//
17// To purchase a license under different terms contact admin@peoplesgrocers.com
18// To request changes, report bugs, or give user feedback contact
19// marxism@peoplesgrocers.com
20//
21
22//! File type detection for JSON archives.
23//!
24//! This module exists to support ergonomic command-line usage without requiring
25//! `--archive=filename` flags. The goal is to infer intent just from filenames:
26//!
27//! - `json-archive data.json.archive data.json` -> append data.json to existing archive
28//! - `json-archive data.json` -> create new archive from data.json
29//! - `json-archive data.json.archive.tmp foo.json bar.json` -> append to archive with .tmp suffix
30//!
31//! Design choice by @nobody. No user requests for this, just seemed nice.
32
33use std::fs::File;
34use std::io::{BufRead, BufReader, Read};
35use std::path::Path;
36
37#[cfg(feature = "compression")]
38use brotli::Decompressor;
39#[cfg(feature = "compression")]
40use flate2::read::{DeflateDecoder, GzDecoder, ZlibDecoder};
41#[cfg(feature = "compression")]
42use zstd::stream::read::Decoder as ZstdDecoder;
43
44/// Detects if a file is a JSON archive by checking file extension or inspecting the header.
45///
46/// Detection strategy:
47/// 1. Check if filename ends with .json.archive
48/// 2. Inspect first line for type field as first key with value "@peoplesgrocers/json-archive"
49///
50/// Strategy 2 was added by @nobody based on frustration with the Elm compiler,
51/// which requires specific file extensions (like .js) while build systems often generate
52/// temporary files with arbitrary suffixes like .tmp. @nobody thought it would be nice if the CLI
53/// was robust enough to handle this.
54///
55///
56/// The magic value "@peoplesgrocers/json-archive" in the type field works as a file
57/// signature for cases where the extension isn't what we expect. Not requested by anyone,
58/// just anticipating potential tooling conflicts.
59pub fn is_json_archive<P: AsRef<Path>>(path: P) -> Result<bool, std::io::Error> {
60    let path = path.as_ref();
61
62    // Check file extension first (fast path)
63    if let Some(filename) = path.file_name() {
64        if let Some(filename_str) = filename.to_str() {
65            // Match .json.archive with any compression suffix
66            if filename_str.ends_with(".json.archive")
67                || filename_str.ends_with(".json.archive.gz")
68                || filename_str.ends_with(".json.archive.br")
69                || filename_str.ends_with(".json.archive.zst")
70                || filename_str.ends_with(".json.archive.zlib")
71            {
72                return Ok(true);
73            }
74        }
75    }
76
77    // Open file and detect compression
78    let mut file = File::open(path)?;
79    let mut magic_bytes = [0u8; 4];
80    let bytes_read = file.read(&mut magic_bytes)?;
81    let compression = detect_compression_format(path, &magic_bytes[..bytes_read]);
82
83    // Reopen file to reset position
84    file = File::open(path)?;
85
86    // Create appropriate reader based on compression format
87    let reader: Box<dyn BufRead> = create_reader(file, compression)?;
88
89    check_header_line(reader)
90}
91
/// Wraps `file` in a buffered reader, layering on the decoder that matches `compression`.
///
/// Compiled only when the `compression` feature is enabled.
///
/// # Errors
///
/// Propagates the error from constructing the zstd decoder. The other readers are
/// constructed without a fallible step here.
#[cfg(feature = "compression")]
fn create_reader(
    file: File,
    compression: CompressionFormat,
) -> Result<Box<dyn BufRead>, std::io::Error> {
    let reader: Box<dyn BufRead> = match compression {
        CompressionFormat::None => Box::new(BufReader::new(file)),
        CompressionFormat::Gzip => Box::new(BufReader::new(GzDecoder::new(file))),
        CompressionFormat::Deflate => Box::new(BufReader::new(DeflateDecoder::new(file))),
        CompressionFormat::Zlib => Box::new(BufReader::new(ZlibDecoder::new(file))),
        CompressionFormat::Brotli => {
            // Second argument is the buffer size handed to the brotli decompressor.
            Box::new(BufReader::new(Decompressor::new(file, 4096)))
        }
        CompressionFormat::Zstd => {
            let decoder = ZstdDecoder::new(file)?;
            Box::new(BufReader::new(decoder))
        }
    };

    Ok(reader)
}
107
108#[cfg(not(feature = "compression"))]
109fn create_reader(
110    file: File,
111    compression: CompressionFormat,
112) -> Result<Box<dyn BufRead>, std::io::Error> {
113    if compression != CompressionFormat::None {
114        // Without compression support, we can't decompress to check the header.
115        // Return false by returning an empty reader that will fail header check.
116        return Ok(Box::new(BufReader::new(std::io::empty())));
117    }
118    Ok(Box::new(BufReader::new(file)))
119}
120
121/// Check if the first line of the reader contains a valid archive header.
122fn check_header_line(mut reader: Box<dyn BufRead>) -> Result<bool, std::io::Error> {
123    let mut first_line = String::new();
124
125    match reader.read_line(&mut first_line) {
126        Ok(0) => Ok(false), // Empty file
127        Ok(_) => {
128            // Try to parse as JSON and check if it has our type field as the first key
129            if let Ok(value) = serde_json::from_str::<serde_json::Value>(&first_line) {
130                if let Some(obj) = value.as_object() {
131                    // Check if the first key is "type" with our expected value
132                    // Note: serde_json::Map preserves insertion order
133                    if let Some((first_key, first_value)) = obj.iter().next() {
134                        if first_key == "type" {
135                            if let Some(type_str) = first_value.as_str() {
136                                return Ok(type_str == "@peoplesgrocers/json-archive");
137                            }
138                        }
139                    }
140                }
141            }
142            Ok(false)
143        }
144        Err(e) => Err(e),
145    }
146}
147
/// Compression format of a file, as reported by [`detect_compression_format`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionFormat {
    /// gzip stream.
    Gzip,
    /// Raw DEFLATE stream.
    Deflate,
    /// zlib stream.
    Zlib,
    /// Brotli stream.
    Brotli,
    /// Zstandard stream.
    Zstd,
    /// No compression detected; the file is read as-is.
    None,
}
157
158impl std::fmt::Display for CompressionFormat {
159    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
160        match self {
161            CompressionFormat::Gzip => write!(f, "gzip"),
162            CompressionFormat::Deflate => write!(f, "deflate"),
163            CompressionFormat::Zlib => write!(f, "zlib"),
164            CompressionFormat::Brotli => write!(f, "brotli"),
165            CompressionFormat::Zstd => write!(f, "zstd"),
166            CompressionFormat::None => write!(f, "none"),
167        }
168    }
169}
170
171pub fn detect_compression_format(path: &Path, bytes: &[u8]) -> CompressionFormat {
172    if bytes.len() < 4 {
173        return CompressionFormat::None;
174    }
175
176    // Gzip magic number: 0x1f 0x8b
177    if bytes[0] == 0x1f && bytes[1] == 0x8b {
178        return CompressionFormat::Gzip;
179    }
180
181    // Zlib magic number: 0x78 followed by 0x01, 0x5e, 0x9c, or 0xda
182    if bytes[0] == 0x78
183        && (bytes[1] == 0x01 || bytes[1] == 0x5e || bytes[1] == 0x9c || bytes[1] == 0xda)
184    {
185        return CompressionFormat::Zlib;
186    }
187
188    // Zstd magic number: 0x28 0xb5 0x2f 0xfd
189    if bytes.len() >= 4
190        && bytes[0] == 0x28
191        && bytes[1] == 0xb5
192        && bytes[2] == 0x2f
193        && bytes[3] == 0xfd
194    {
195        return CompressionFormat::Zstd;
196    }
197
198    // Check file extension for brotli (no reliable magic number) and deflate
199    if let Some(ext) = path.extension() {
200        let ext_str = ext.to_string_lossy();
201        if ext_str == "br" || path.to_string_lossy().contains(".br.") {
202            return CompressionFormat::Brotli;
203        }
204        if ext_str == "deflate" {
205            return CompressionFormat::Deflate;
206        }
207    }
208
209    CompressionFormat::None
210}
211
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Shared return type so every test can use `?` on I/O calls.
    type TestResult = Result<(), Box<dyn std::error::Error>>;

    /// A `.json.archive` file name is accepted on the name alone, whatever the contents.
    #[test]
    fn test_detect_by_json_archive_extension() -> TestResult {
        let mut candidate = NamedTempFile::with_suffix(".json.archive")?;
        writeln!(candidate, r#"{{"some": "json"}}"#)?;
        candidate.flush()?;

        let detected = is_json_archive(candidate.path())?;
        assert!(detected);
        Ok(())
    }

    /// An unrelated extension is fine as long as the header carries the magic type value.
    #[test]
    fn test_detect_by_type_field() -> TestResult {
        let mut candidate = NamedTempFile::with_suffix(".weird-extension")?;
        writeln!(
            candidate,
            r#"{{"type":"@peoplesgrocers/json-archive","version":1}}"#
        )?;
        candidate.flush()?;

        let detected = is_json_archive(candidate.path())?;
        assert!(detected);
        Ok(())
    }

    /// Same as above, for the `.json.tmp` style names build tools tend to produce.
    #[test]
    fn test_detect_by_type_field_with_tmp_extension() -> TestResult {
        let mut candidate = NamedTempFile::with_suffix(".json.tmp")?;
        writeln!(
            candidate,
            r#"{{"type":"@peoplesgrocers/json-archive","version":1}}"#
        )?;
        candidate.flush()?;

        let detected = is_json_archive(candidate.path())?;
        assert!(detected);
        Ok(())
    }

    /// Ordinary JSON under a `.json` name is not an archive.
    #[test]
    fn test_not_archive_regular_json() -> TestResult {
        let mut candidate = NamedTempFile::with_suffix(".json")?;
        writeln!(candidate, r#"{{"some": "json"}}"#)?;
        candidate.flush()?;

        let detected = is_json_archive(candidate.path())?;
        assert!(!detected);
        Ok(())
    }

    /// A leading `type` key with some other value is rejected.
    #[test]
    fn test_not_archive_wrong_type_field() -> TestResult {
        let mut candidate = NamedTempFile::with_suffix(".tmp")?;
        writeln!(candidate, r#"{{"type":"something-else","version":1}}"#)?;
        candidate.flush()?;

        let detected = is_json_archive(candidate.path())?;
        assert!(!detected);
        Ok(())
    }

    /// The magic value stored under a key other than `type` is rejected.
    #[test]
    fn test_not_archive_type_not_first_field() -> TestResult {
        let mut candidate = NamedTempFile::with_suffix(".tmp")?;
        // Neither key is "type" and both sort after it alphabetically, so the first key
        // is not "type" whether the JSON map iterates in insertion order or sorted order.
        writeln!(
            candidate,
            r#"{{"version":1,"zzz":"@peoplesgrocers/json-archive"}}"#
        )?;
        candidate.flush()?;

        // Must NOT be detected as an archive: there is no "type" field at all.
        let detected = is_json_archive(candidate.path())?;
        assert!(!detected);
        Ok(())
    }

    /// A zero-byte file has no header line to inspect.
    #[test]
    fn test_not_archive_empty_file() -> TestResult {
        let candidate = NamedTempFile::with_suffix(".json")?;

        let detected = is_json_archive(candidate.path())?;
        assert!(!detected);
        Ok(())
    }

    /// A first line that is not JSON yields `false`, not an error.
    #[test]
    fn test_not_archive_invalid_json() -> TestResult {
        let mut candidate = NamedTempFile::with_suffix(".tmp")?;
        writeln!(candidate, "not valid json")?;
        candidate.flush()?;

        let detected = is_json_archive(candidate.path())?;
        assert!(!detected);
        Ok(())
    }
}