sett 0.3.0

Rust port of sett (data compression, encryption and transfer tool).
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
//! Types and constants related to the data package format

use std::{
    collections::BTreeMap,
    path::{Path, PathBuf},
    str::FromStr,
};

use anyhow::Context;
use chrono::{DateTime, Utc};
use sequoia_openpgp::parse::{stream::DetachedVerifierBuilder, Parse};
use serde::{de::Visitor, Deserialize, Serialize, Serializer};
use tokio::io::AsyncReadExt as _;
use tracing::instrument;

/// Date format used for generating the default data package file name.
///
/// This is a `strftime`-style pattern; it renders a timestamp in the form
/// `20240131T235959`.
pub const DATETIME_FORMAT: &str = "%Y%m%dT%H%M%S";
/// Directory where decrypted and decompressed files are stored.
pub const CONTENT_FOLDER: &str = "content";
/// File containing checksums of individual input files.
pub const CHECKSUM_FILE: &str = "checksum.sha256";
/// Archive file containing all input files.
pub const DATA_FILE: &str = "data.tar.gz";
/// Encrypted archive file.
///
/// One of the three entries a valid data package must contain.
pub const DATA_FILE_ENCRYPTED: &str = "data.tar.gz.gpg";
/// File containing package metadata.
///
/// One of the three entries a valid data package must contain.
pub const METADATA_FILE: &str = "metadata.json";
/// Detached signature of the metadata file.
///
/// One of the three entries a valid data package must contain.
pub const METADATA_SIG_FILE: &str = "metadata.json.sig";

/// Type-level markers describing whether a data package has been verified.
pub mod state {
    // The supertrait lives in a private module so that `State` cannot be
    // implemented for any type other than the two markers below.
    mod sealed {
        pub trait Sealed {}
        impl Sealed for super::Verified {}
        impl Sealed for super::Unverified {}
    }

    /// Trait implemented by every verification state marker.
    ///
    /// This is a sealed trait, it cannot be implemented outside of this crate.
    pub trait State: sealed::Sealed {}

    /// A marker type for a data package that has not been verified.
    #[derive(Clone, Debug)]
    pub struct Unverified;
    impl State for Unverified {}

    /// A marker type for a data package that has been verified.
    #[derive(Clone, Debug)]
    pub struct Verified;
    impl State for Verified {}
}

/// Location a data package is read from.
#[derive(Debug, Clone)]
pub(crate) enum Source {
    /// A package file on the local file system, identified by its path.
    Local(PathBuf),
    /// A package stored as an object in an S3 object store.
    S3(S3Source),
}

/// Coordinates of a data package stored in an S3 object store.
#[derive(Debug, Clone)]
pub(crate) struct S3Source {
    /// S3 client handle associated with this source.
    pub(crate) client: crate::remote::s3::Client,
    /// Name of the bucket holding the package.
    pub(crate) bucket: String,
    /// Name of the package object within the bucket.
    pub(crate) object: String,
}

/// A type for securely working with the data package format.
///
/// A data package is a ZIP file containing the following files:
/// - `metadata.json`: metadata about the package
/// - `metadata.json.sig`: detached signature of the metadata file
/// - `data.tar.gz.gpg`: encrypted archive of the input files
///
/// This type is generic over the verification state of the package.  A
/// package that has been verified is in the [`Verified`] state, and a
/// package that has not been verified is in the [`Unverified`] state.
/// [`Package::verify`] can be used to transition a package from the
/// [`Unverified`] state to the [`Verified`] state.
///
/// [`Verified`]: state::Verified
/// [`Unverified`]: state::Unverified
#[derive(Clone)]
pub struct Package<State: state::State = state::Unverified> {
    /// Zero-sized marker carrying the verification state at the type level.
    state: std::marker::PhantomData<State>,
    /// Reader over the package's Zip archive; it also records the package source.
    zip: crate::zip::ZipReader,
    /// Package name: the file name for local packages, the object name for
    /// packages opened from an object store.
    name: String,
}

impl<State: state::State> std::fmt::Debug for Package<State> {
    /// Formats the package as a `Package` struct with `name` and `source` fields.
    ///
    /// The verification state marker is not part of the output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("Package");
        builder.field("name", &self.name);
        builder.field("source", &self.zip.source);
        builder.finish()
    }
}

impl<State: state::State> Package<State> {
    /// Returns the name of the package.
    pub(crate) fn name(&self) -> &str {
        &self.name
    }

    /// Returns the local file system path of the package.
    ///
    /// # Errors
    ///
    /// Returns an error if the package was not opened from the local file
    /// system (e.g. it is read from an S3 object store).
    pub(crate) fn path(&self) -> anyhow::Result<&Path> {
        // Exhaustive on purpose: adding a new `Source` variant must force a
        // decision here instead of silently falling into the error branch.
        match &self.zip.source {
            Source::Local(path) => Ok(path.as_path()),
            Source::S3(_) => Err(anyhow::anyhow!(
                "package path is not available for non-local sources"
            )),
        }
    }
}

// Reads the metadata file from a package's Zip archive and deserializes it
// from JSON.
//
// The expansion is a block that uses `.await` and `?`, so it can only be
// invoked inside an `async` function whose error type accepts the errors
// raised here. The block evaluates to `Ok(..)`; the deserialization target is
// inferred from the caller's return type.
macro_rules! read_metadata {
    ($package:expr) => {{
        let (mut reader, size) = $package.zip.get_file_reader(METADATA_FILE).await?;
        // `size` is only used as a capacity hint; `read_to_end` reads until EOF.
        // NOTE(review): `size as usize` silently truncates if `size` is wider
        // than `usize` on the target - confirm this is acceptable.
        let mut buf = Vec::with_capacity(size as usize);
        reader.read_to_end(&mut buf).await?;
        Ok(serde_json::from_slice(&buf)?)
    }};
}

impl Package<state::Unverified> {
    /// Opens a data package from the local file system.
    #[instrument(fields(path = %path.as_ref().display()), err(Debug, level=tracing::Level::ERROR))]
    pub async fn open(path: impl AsRef<Path>) -> anyhow::Result<Self> {
        let path = path.as_ref();
        anyhow::ensure!(path.exists(), "package '{}' does not exist", path.display());
        let path = path.canonicalize()?;
        let name = path
            .file_name()
            .context("Unable to get package file name")?
            .to_string_lossy()
            .to_string();
        Ok(Package {
            state: Default::default(),
            name,
            zip: crate::zip::ZipReader::open(Source::Local(path)).await?,
        })
    }

    /// Opens a data package from an object store.
    #[instrument(err(Debug, level=tracing::Level::ERROR))]
    pub async fn open_s3(
        client: &crate::remote::s3::Client,
        bucket: String,
        object: String,
    ) -> anyhow::Result<Self> {
        let source = Source::S3(S3Source {
            client: client.clone(),
            bucket,
            object: object.clone(),
        });
        Ok(Package {
            state: Default::default(),
            zip: crate::zip::ZipReader::open(source).await?,
            name: object,
        })
    }

    /// Verifies the Zip archive has the correct structure for a data package.
    ///
    /// A data package must only contain exactly the expected files.
    fn verify_format(&self) -> anyhow::Result<()> {
        const EXPECTED_FILES: [&str; 3] = [DATA_FILE_ENCRYPTED, METADATA_FILE, METADATA_SIG_FILE];
        let error_msg_base = format!(
            "A valid data package must contain exactly the following {} files: {}",
            EXPECTED_FILES.len(),
            EXPECTED_FILES.join(", ")
        );

        // Search the .zip archive for the expected files.
        let mut actual_files = Vec::new();
        for file_name in self.zip.file_names() {
            if EXPECTED_FILES.contains(&file_name) {
                actual_files.push(file_name);
            } else {
                anyhow::bail!(
                    "invalid data package. Zip archive contains unexpected \
                    files. {error_msg_base}."
                );
            }
        }

        // If exactly all expected files are present, verification is complete.
        if actual_files.len() == EXPECTED_FILES.len() {
            return Ok(());
        }

        Err(anyhow::anyhow!(
            "invalid data package. Zip archive is missing the following \
            files: {}. {}.",
            EXPECTED_FILES
                .into_iter()
                .filter(|f| !actual_files.contains(f))
                .collect::<Vec<&str>>()
                .join(", "),
            error_msg_base
        ))
    }

    /// Verify the signature of a data package.
    ///
    /// Verifies the signature of the metadata file contained in the data
    /// package. If the signature is valid, the function returns a [`Package`]
    /// in [`Verified`] state. Otherwise, the function returns an error.
    ///
    /// [`Verified`]: state::Verified
    pub async fn verify(
        self,
        cert_store: &crate::openpgp::certstore::CertStore<'_>,
    ) -> anyhow::Result<Package<state::Verified>> {
        macro_rules! read_inner {
            ($file:expr) => {{
                let (mut reader, size) = self
                    .zip
                    .get_file_reader($file)
                    .await
                    .with_context(|| format!("{} not found", $file))?;
                let mut buffer = Vec::with_capacity(size as usize);
                reader.read_to_end(&mut buffer).await?;
                buffer
            }};
        }

        self.verify_format()?;
        DetachedVerifierBuilder::from_bytes(&read_inner!(METADATA_SIG_FILE))?
            .with_policy(
                &sequoia_openpgp::policy::StandardPolicy::new(),
                None,
                crate::openpgp::crypto::VerificationHelper { cert_store },
            )?
            .verify_bytes(read_inner!(METADATA_FILE))?;
        Ok(Package {
            state: Default::default(),
            zip: self.zip,
            name: self.name,
        })
    }

    /// Reads metadata from a data package file without verification.
    ///
    /// Use this method only for low-level inspection. Prefer using the
    /// `metadata` method from [`Package<state::Verified>`].
    pub async fn metadata_unverified(&self) -> anyhow::Result<Metadata> {
        read_metadata!(self)
    }
}

impl Package<state::Verified> {
    /// Reads metadata from a data package file.
    ///
    /// # Errors
    ///
    /// Returns an error if the metadata file cannot be read from the archive
    /// or cannot be deserialized into [`Metadata`].
    pub async fn metadata(&self) -> anyhow::Result<Metadata> {
        read_metadata!(self)
    }

    /// Reads the encrypted data file from a data package.
    ///
    /// Returns a reader for the encrypted data file and its size.
    ///
    /// # Errors
    ///
    /// Returns an error if a reader for the encrypted data file cannot be
    /// obtained from the archive.
    pub(crate) async fn data(
        &self,
    ) -> anyhow::Result<(Box<dyn tokio::io::AsyncBufRead + Unpin + Sync + Send>, u64)> {
        self.zip.get_file_reader(DATA_FILE_ENCRYPTED).await
    }
}

/// Metadata describing a data package.
///
/// This is the (de)serialized content of the package's metadata file.
///
/// Note: the derived [`Default`] yields an empty `version` string; the
/// `default_version` fallback only applies when deserializing input that
/// lacks the `version` field.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct Metadata {
    /// Fingerprint of the data package sender.
    pub sender: String,
    /// Data package recipients fingerprints.
    pub recipients: Vec<String>,
    /// Checksum of the encrypted data file.
    pub checksum: String,
    /// Creation time of the data package.
    ///
    /// (De)serialized using the RFC 3339 format.
    pub timestamp: DateTime<Utc>,
    /// Metadata version.
    ///
    /// Falls back to `default_version()` when absent from the input.
    #[serde(default = "default_version")]
    pub version: String,
    /// Algorithm used to compute the checksum of the encrypted data file.
    #[serde(default)]
    pub checksum_algorithm: ChecksumAlgorithm,
    /// Algorithm used to compress input data files.
    #[serde(default)]
    pub compression_algorithm: CompressionAlgorithm,
    /// Data transfer ID (DTR).
    #[serde(default)]
    pub transfer_id: Option<u32>,
    /// Data package purpose.
    #[serde(default)]
    pub purpose: Option<Purpose>,
    /// Extra metadata key-value fields.
    #[serde(default)]
    pub extra: BTreeMap<String, String>,
}

impl Metadata {
    /// Renders the metadata as a JSON string, falling back to its `Debug`
    /// representation if JSON serialization fails.
    ///
    /// Serialization is very unlikely to fail; the fallback exists so that
    /// this method always produces a string.
    pub(crate) fn to_json_or_debug(&self) -> String {
        match serde_json::to_string(self) {
            Ok(json) => json,
            Err(_) => format!("{self:?}"),
        }
    }
}

/// Possible checksum algorithms for the encrypted data file.
///
/// With the derived serde implementations and no rename attribute, a variant
/// is (de)serialized under its Rust name (`SHA256`).
#[derive(Deserialize, Serialize, Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum ChecksumAlgorithm {
    /// sha256 (the default algorithm).
    #[default]
    SHA256,
}

/// Possible compression algorithms for data.
///
/// Note: compression is applied before encryption.
///
/// Only the algorithm name is part of the serialized form: the optional
/// compression level is dropped on serialization and is always `None` after
/// deserialization.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum CompressionAlgorithm {
    /// No compression
    Stored,
    /// Use gzip compression (level 1-9)
    // NOTE(review): `None` presumably selects the compressor's default level - confirm.
    Gzip(Option<u32>),
    /// Use zstandard compression (level 1-21)
    // NOTE(review): `None` presumably selects the compressor's default level - confirm.
    Zstandard(Option<i32>),
}

impl Default for CompressionAlgorithm {
    /// Returns zstandard compression with no explicit compression level.
    fn default() -> Self {
        Self::Zstandard(None)
    }
}

impl Serialize for CompressionAlgorithm {
    /// Serializes the algorithm as a unit variant named `stored`, `gzip` or
    /// `zstandard`.
    ///
    /// The compression level carried by `Gzip` and `Zstandard` is not
    /// serialized.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // Pick the variant index and its serialized name, then emit once.
        let (index, label) = match self {
            Self::Stored => (0, "stored"),
            Self::Gzip(_) => (1, "gzip"),
            Self::Zstandard(_) => (2, "zstandard"),
        };
        serializer.serialize_unit_variant("CompressionAlgorithm", index, label)
    }
}

impl<'de> Deserialize<'de> for CompressionAlgorithm {
    /// Deserializes an algorithm from its name.
    ///
    /// The name is matched case-insensitively against `stored`, `gzip` and
    /// `zstandard`; the resulting `Gzip`/`Zstandard` value carries no
    /// compression level.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // Visitor that maps an algorithm name onto the matching variant.
        struct NameVisitor;

        impl<'de> Visitor<'de> for NameVisitor {
            type Value = CompressionAlgorithm;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                formatter.write_str("one of `stored`, `gzip`, `zstandard`")
            }

            fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                let normalized = value.to_lowercase();
                let algorithm = match normalized.as_str() {
                    "stored" => CompressionAlgorithm::Stored,
                    "gzip" => CompressionAlgorithm::Gzip(None),
                    "zstandard" => CompressionAlgorithm::Zstandard(None),
                    // Report the name as it was given, not the lowercased form.
                    _ => return Err(E::custom(format!("unknown variant `{}`", value))),
                };
                Ok(algorithm)
            }
        }

        deserializer.deserialize_str(NameVisitor)
    }
}

/// Returns the current default metadata version as an owned string.
///
/// Used as the fallback for the `version` field when metadata is
/// deserialized from input that does not specify one.
pub fn default_version() -> String {
    String::from("0.7.2")
}

/// Data package purpose determines if data is meant for testing or production.
///
/// With the derived serde implementations and no rename attribute, variants
/// are (de)serialized under their Rust names (`PRODUCTION`, `TEST`). Parsing
/// through [`FromStr`] is case-insensitive.
#[derive(Copy, Clone, Deserialize, Serialize, Debug)]
pub enum Purpose {
    /// For sensitive data
    PRODUCTION,
    /// Only for testing
    TEST,
}

impl FromStr for Purpose {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.to_lowercase().as_str() {
            "production" => Ok(Self::PRODUCTION),
            "test" => Ok(Self::TEST),
            _ => Err(anyhow::anyhow!("Invalid purpose: {}", s)),
        }
    }
}

/// Builds the default data package file name from a timestamp.
///
/// The timestamp is rendered with [`DATETIME_FORMAT`] and given a `.zip`
/// extension. When `prefix` is provided, it is prepended to the timestamp,
/// separated by an underscore.
pub(crate) fn generate_package_name(timestamp: &DateTime<Utc>, prefix: Option<&str>) -> String {
    let stamp = timestamp.format(DATETIME_FORMAT);
    match prefix {
        Some(prefix) => format!("{prefix}_{stamp}.zip"),
        None => format!("{stamp}.zip"),
    }
}