json_archive/
detection.rs

1// json-archive is a tool for tracking JSON file changes over time
2// Copyright (C) 2025  Peoples Grocers LLC
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU Affero General Public License as published
6// by the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU Affero General Public License for more details.
13//
14// You should have received a copy of the GNU Affero General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16//
17// To purchase a license under different terms contact admin@peoplesgrocers.com
18// To request changes, report bugs, or give user feedback contact
19// marxism@peoplesgrocers.com
20//
21
22//! File type detection for JSON archives.
23//!
24//! This module exists to support ergonomic command-line usage without requiring
25//! `--archive=filename` flags. The goal is to infer intent just from filenames:
26//!
27//! - `json-archive data.json.archive data.json` -> append data.json to existing archive
28//! - `json-archive data.json` -> create new archive from data.json
29//! - `json-archive data.json.archive.tmp foo.json bar.json` -> append to archive with .tmp suffix
30//!
31//! Design choice by @nobody. No user requests for this, just seemed nice.
32
33use std::fs::File;
34use std::io::{BufRead, BufReader, Read};
35use std::path::Path;
36
37#[cfg(feature = "compression")]
38use brotli::Decompressor;
39#[cfg(feature = "compression")]
40use flate2::read::{DeflateDecoder, GzDecoder, ZlibDecoder};
41#[cfg(feature = "compression")]
42use zstd::stream::read::Decoder as ZstdDecoder;
43
44/// Detects if a file is a JSON archive by checking file extension or inspecting the header.
45///
46/// Detection strategy:
47/// 1. Check if filename ends with .json.archive
48/// 2. Inspect first line for type field as first key with value "@peoplesgrocers/json-archive"
49///
50/// Strategy 2 was added by @nobody based on frustration with the Elm compiler,
51/// which requires specific file extensions (like .js) while build systems often generate
52/// temporary files with arbitrary suffixes like .tmp. @nobody thought it would be nice if the CLI
53/// was robust enough to handle this.
54///
55///
56/// The magic value "@peoplesgrocers/json-archive" in the type field works as a file
57/// signature for cases where the extension isn't what we expect. Not requested by anyone,
58/// just anticipating potential tooling conflicts.
59pub fn is_json_archive<P: AsRef<Path>>(path: P) -> Result<bool, std::io::Error> {
60    let path = path.as_ref();
61
62    // Check file extension first (fast path)
63    if let Some(filename) = path.file_name() {
64        if let Some(filename_str) = filename.to_str() {
65            // Match .json.archive with any compression suffix
66            if filename_str.ends_with(".json.archive")
67                || filename_str.ends_with(".json.archive.gz")
68                || filename_str.ends_with(".json.archive.br")
69                || filename_str.ends_with(".json.archive.zst")
70                || filename_str.ends_with(".json.archive.zlib")
71            {
72                return Ok(true);
73            }
74        }
75    }
76
77    // Open file and detect compression
78    let mut file = File::open(path)?;
79    let mut magic_bytes = [0u8; 4];
80    let bytes_read = file.read(&mut magic_bytes)?;
81    let compression = detect_compression_format(path, &magic_bytes[..bytes_read]);
82
83    // Reopen file to reset position
84    file = File::open(path)?;
85
86    // Create appropriate reader based on compression format
87    let reader: Box<dyn BufRead> = create_reader(file, compression)?;
88
89    check_header_line(reader)
90}
91
/// Build a buffered, line-oriented reader over `file`, inserting the decoder
/// that corresponds to `compression` (or none for `CompressionFormat::None`).
///
/// Returns an error only if constructing the zstd decoder fails; the other
/// decoders are created infallibly here.
#[cfg(feature = "compression")]
fn create_reader(file: File, compression: CompressionFormat) -> Result<Box<dyn BufRead>, std::io::Error> {
    let reader: Box<dyn BufRead> = match compression {
        CompressionFormat::None => Box::new(BufReader::new(file)),
        CompressionFormat::Gzip => {
            let decoder = GzDecoder::new(file);
            Box::new(BufReader::new(decoder))
        }
        CompressionFormat::Zlib => {
            let decoder = ZlibDecoder::new(file);
            Box::new(BufReader::new(decoder))
        }
        CompressionFormat::Deflate => {
            let decoder = DeflateDecoder::new(file);
            Box::new(BufReader::new(decoder))
        }
        CompressionFormat::Zstd => {
            // The only fallible constructor in this match.
            let decoder = ZstdDecoder::new(file)?;
            Box::new(BufReader::new(decoder))
        }
        CompressionFormat::Brotli => {
            // Second argument is the buffer size passed to the brotli decompressor.
            let decoder = Decompressor::new(file, 4096);
            Box::new(BufReader::new(decoder))
        }
    };
    Ok(reader)
}
104
105#[cfg(not(feature = "compression"))]
106fn create_reader(file: File, compression: CompressionFormat) -> Result<Box<dyn BufRead>, std::io::Error> {
107    if compression != CompressionFormat::None {
108        // Without compression support, we can't decompress to check the header.
109        // Return false by returning an empty reader that will fail header check.
110        return Ok(Box::new(BufReader::new(std::io::empty())));
111    }
112    Ok(Box::new(BufReader::new(file)))
113}
114
115/// Check if the first line of the reader contains a valid archive header.
116fn check_header_line(mut reader: Box<dyn BufRead>) -> Result<bool, std::io::Error> {
117    let mut first_line = String::new();
118
119    match reader.read_line(&mut first_line) {
120        Ok(0) => Ok(false), // Empty file
121        Ok(_) => {
122            // Try to parse as JSON and check if it has our type field as the first key
123            if let Ok(value) = serde_json::from_str::<serde_json::Value>(&first_line) {
124                if let Some(obj) = value.as_object() {
125                    // Check if the first key is "type" with our expected value
126                    // Note: serde_json::Map preserves insertion order
127                    if let Some((first_key, first_value)) = obj.iter().next() {
128                        if first_key == "type" {
129                            if let Some(type_str) = first_value.as_str() {
130                                return Ok(type_str == "@peoplesgrocers/json-archive");
131                            }
132                        }
133                    }
134                }
135            }
136            Ok(false)
137        }
138        Err(e) => Err(e),
139    }
140}
141
/// Compression wrapper that `detect_compression_format` can recognise around a file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionFormat {
    /// Gzip stream, recognised by the leading bytes `1f 8b`.
    Gzip,
    /// Raw deflate stream, recognised only by a `.deflate` file extension.
    Deflate,
    /// Zlib stream, recognised by `78` followed by `01`, `5e`, `9c` or `da`.
    Zlib,
    /// Brotli stream, recognised only by file name (`.br` extension or a `.br.` segment).
    Brotli,
    /// Zstandard stream, recognised by the leading bytes `28 b5 2f fd`.
    Zstd,
    /// No compression detected.
    None,
}

/// Guess the compression format of a file from its leading bytes and its name.
///
/// `bytes` should hold the first bytes of the file (at least four for any
/// detection to happen); `path` is consulted only for formats without a usable
/// magic number.
///
/// Order of checks:
/// 1. Fewer than four bytes available: `CompressionFormat::None`.
/// 2. Magic numbers for gzip, zlib and zstd.
/// 3. File name: a `.br` extension, or `.br.` inside the file name (e.g.
///    `data.json.archive.br.tmp`), selects brotli; a `.deflate` extension selects
///    deflate. Only the final path component is examined, so a directory such as
///    `site.br.d/` does not make its contents look brotli-compressed.
/// 4. Otherwise `CompressionFormat::None`.
pub fn detect_compression_format(path: &Path, bytes: &[u8]) -> CompressionFormat {
    if bytes.len() < 4 {
        return CompressionFormat::None;
    }

    match bytes {
        // Gzip magic number: 0x1f 0x8b
        [0x1f, 0x8b, ..] => return CompressionFormat::Gzip,
        // Zlib magic number: 0x78 followed by 0x01, 0x5e, 0x9c, or 0xda
        [0x78, 0x01 | 0x5e | 0x9c | 0xda, ..] => return CompressionFormat::Zlib,
        // Zstd magic number: 0x28 0xb5 0x2f 0xfd
        [0x28, 0xb5, 0x2f, 0xfd, ..] => return CompressionFormat::Zstd,
        _ => {}
    }

    // Brotli has no reliable magic number and deflate has none at all, so fall
    // back to the file name for those two.
    if let Some(ext) = path.extension() {
        let ext_str = ext.to_string_lossy();

        // Look for `.br.` in the file name only; matching against the whole
        // path would also fire on parent directories.
        let name_has_br_segment = path
            .file_name()
            .map_or(false, |name| name.to_string_lossy().contains(".br."));

        if ext_str == "br" || name_has_br_segment {
            return CompressionFormat::Brotli;
        }
        if ext_str == "deflate" {
            return CompressionFormat::Deflate;
        }
    }

    CompressionFormat::None
}
185
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    type TestResult = Result<(), Box<dyn std::error::Error>>;

    /// Create a temporary file whose name ends in `suffix` and whose content is
    /// `first_line` followed by a newline. The file is deleted when the returned
    /// handle is dropped.
    fn temp_file_with_line(suffix: &str, first_line: &str) -> std::io::Result<NamedTempFile> {
        let mut temp_file = NamedTempFile::with_suffix(suffix)?;
        writeln!(temp_file, "{}", first_line)?;
        temp_file.flush()?;
        Ok(temp_file)
    }

    #[test]
    fn test_detect_by_json_archive_extension() -> TestResult {
        // The extension alone is enough; the content has no archive header.
        let temp_file = temp_file_with_line(".json.archive", r#"{"some": "json"}"#)?;

        assert!(is_json_archive(temp_file.path())?);
        Ok(())
    }

    #[test]
    fn test_detect_by_type_field() -> TestResult {
        let temp_file = temp_file_with_line(
            ".weird-extension",
            r#"{"type":"@peoplesgrocers/json-archive","version":1}"#,
        )?;

        assert!(is_json_archive(temp_file.path())?);
        Ok(())
    }

    #[test]
    fn test_detect_by_type_field_with_tmp_extension() -> TestResult {
        let temp_file = temp_file_with_line(
            ".json.tmp",
            r#"{"type":"@peoplesgrocers/json-archive","version":1}"#,
        )?;

        assert!(is_json_archive(temp_file.path())?);
        Ok(())
    }

    #[test]
    fn test_not_archive_regular_json() -> TestResult {
        let temp_file = temp_file_with_line(".json", r#"{"some": "json"}"#)?;

        assert!(!is_json_archive(temp_file.path())?);
        Ok(())
    }

    #[test]
    fn test_not_archive_wrong_type_field() -> TestResult {
        let temp_file =
            temp_file_with_line(".tmp", r#"{"type":"something-else","version":1}"#)?;

        assert!(!is_json_archive(temp_file.path())?);
        Ok(())
    }

    #[test]
    fn test_not_archive_type_not_first_field() -> TestResult {
        // "aaa" is written before "type" AND sorts before it, so "type" is not
        // the first key whether serde_json keeps insertion order or sorts keys.
        let temp_file = temp_file_with_line(
            ".tmp",
            r#"{"aaa":1,"type":"@peoplesgrocers/json-archive"}"#,
        )?;

        // The correct type value is present, but not as the first field.
        assert!(!is_json_archive(temp_file.path())?);
        Ok(())
    }

    #[test]
    fn test_not_archive_magic_value_under_other_key() -> TestResult {
        // The magic value appears, but there is no "type" field at all.
        let temp_file = temp_file_with_line(
            ".tmp",
            r#"{"version":1,"zzz":"@peoplesgrocers/json-archive"}"#,
        )?;

        assert!(!is_json_archive(temp_file.path())?);
        Ok(())
    }

    #[test]
    fn test_not_archive_empty_file() -> TestResult {
        let temp_file = NamedTempFile::with_suffix(".json")?;

        assert!(!is_json_archive(temp_file.path())?);
        Ok(())
    }

    #[test]
    fn test_not_archive_invalid_json() -> TestResult {
        let temp_file = temp_file_with_line(".tmp", "not valid json")?;

        assert!(!is_json_archive(temp_file.path())?);
        Ok(())
    }

    #[test]
    fn test_detect_compression_by_magic_bytes() {
        let path = std::path::Path::new("data.bin");

        assert_eq!(
            detect_compression_format(path, &[0x1f, 0x8b, 0x08, 0x00]),
            CompressionFormat::Gzip
        );
        assert_eq!(
            detect_compression_format(path, &[0x78, 0x9c, 0x00, 0x00]),
            CompressionFormat::Zlib
        );
        assert_eq!(
            detect_compression_format(path, &[0x28, 0xb5, 0x2f, 0xfd]),
            CompressionFormat::Zstd
        );
    }

    #[test]
    fn test_detect_compression_by_extension() {
        let plain = b"{\"a\":1}";

        assert_eq!(
            detect_compression_format(std::path::Path::new("data.json.archive.br"), plain),
            CompressionFormat::Brotli
        );
        assert_eq!(
            detect_compression_format(std::path::Path::new("data.deflate"), plain),
            CompressionFormat::Deflate
        );
        assert_eq!(
            detect_compression_format(std::path::Path::new("data.json"), plain),
            CompressionFormat::None
        );
    }

    #[test]
    fn test_detect_compression_needs_four_bytes() {
        // With fewer than four bytes nothing is detected, not even by extension.
        assert_eq!(
            detect_compression_format(std::path::Path::new("data.br"), &[0x1f, 0x8b]),
            CompressionFormat::None
        );
    }
}