omry-archiving 0.13.0

Archiving abstractions for the Omry project.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
use chrono::Local;
use oxilangtag::LanguageTag;
use thiserror::Error;
use tracing::instrument;
use url::Url;

use crate::Document;
use datetime::ClientDateTimeRecord;

/// Possible errors that might occur when processing records.
#[derive(Debug, Error)]
pub enum RecordError {
    /// Error when parsing the url.
    #[error("Couldn't parse the URL {raw}: {msg}")]
    UrlParse {
        /// The original raw URL.
        raw: String,

        /// Error message from the parsing code.
        msg: String,
    },

    /// Error parsing the language tag.
    #[error("Couldn't parse the language")]
    LanguageParse {
        /// The original language record as given by the client.
        raw: String,

        /// Error message from the parsing code.
        msg: String,
    },
}

impl RecordError {
    /// Borrows the unparsed client input that this error was raised for.
    ///
    /// Every variant stores that input in its `raw` field, so the lookup
    /// cannot fail.
    #[must_use]
    pub fn raw_record(&self) -> &str {
        // Irrefutable or-pattern: it stops compiling if a variant without a
        // `raw` field is ever added, which is exactly when this needs a rethink.
        let (Self::UrlParse { raw, .. } | Self::LanguageParse { raw, .. }) = self;
        raw
    }
}

/// Result alias for fallible operations when constructing
/// or using [`Record`].
///
/// The error type is always [`RecordError`].
pub type Result<T> = std::result::Result<T, RecordError>;

/// Enum to encode the possible outcomes of parsing a language tag.
///
/// A failed parse is not discarded: the client's original value is kept
/// alongside the parser's error message in [`LanguageRecord::Raw`].
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum LanguageRecord {
    /// Variant containing a successfully parsed language tag.
    Parsed(LanguageTag<String>),

    /// Variant containing the raw value of the language tag if parsing failed.
    Raw {
        /// The original raw value for the language tag, as given by the client.
        value: String,

        /// The error message from the parsing code.
        error_msg: String,
    },
}

impl LanguageRecord {
    /// Wraps a client-supplied language tag, whether or not [`oxilangtag`]
    /// accepts it.
    ///
    /// A tag that parses becomes [`LanguageRecord::Parsed`]; anything else is
    /// preserved verbatim in [`LanguageRecord::Raw`] together with the parser's
    /// error message.
    fn new(language_tag: String) -> Self {
        match language_tag.parse::<LanguageTag<String>>() {
            Ok(tag) => Self::Parsed(tag),
            Err(error) => Self::Raw {
                error_msg: error.to_string(),
                value: language_tag,
            },
        }
    }

    /// Returns the [primary language subtag](https://datatracker.ietf.org/doc/html/rfc5646#section-2.2.1),
    /// as extracted by [`oxilangtag`].
    ///
    /// See [oxilangtag::LanguageTag::primary_language](https://docs.rs/oxilangtag/latest/oxilangtag/struct.LanguageTag.html#method.primary_language)
    ///
    /// ## Errors
    /// Returns [`RecordError::LanguageParse`] if the language tag could not be
    /// parsed when this value was created. The error carries the tag exactly
    /// as the client specified it, plus the parser's error message.
    pub fn primary(&self) -> Result<String> {
        match self {
            Self::Raw { value, error_msg } => Err(RecordError::LanguageParse {
                raw: value.clone(),
                msg: error_msg.clone(),
            }),
            Self::Parsed(tag) => Ok(tag.primary_language().to_owned()),
        }
    }
}

/// Url in the Record.
///
/// Keeps either the parsed [`Url`] or, when parsing failed, the client's
/// original string so that no input is lost.
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
enum UrlRecord {
    /// If we could successfully parse the URL.
    Parsed(Url),
    /// Original Url, if we could not parse it (and the error).
    ///
    /// The first field is the original string as given by the client, the
    /// second is the parser's error message.
    Raw(String, String),
}

impl UrlRecord {
    /// Wraps a client-supplied URL, whether or not it parses as a [`Url`].
    ///
    /// On failure the original string and the parser's error message are
    /// stored in [`UrlRecord::Raw`].
    fn new(url: &str) -> Self {
        match url.parse::<Url>() {
            Ok(parsed) => Self::Parsed(parsed),
            Err(error) => Self::Raw(url.to_string(), error.to_string()),
        }
    }

    /// Borrows the URL as text: the parsed URL's string form when parsing
    /// succeeded, otherwise the original string as given by the client.
    fn as_str(&self) -> &str {
        match self {
            Self::Raw(original, _) => original,
            Self::Parsed(parsed) => parsed.as_str(),
        }
    }
}

/// Date time module for flora records.
pub mod datetime {
    use std::fmt::{Display, Formatter};

    use chrono::{DateTime, Datelike, FixedOffset, Local};
    use serde_with::serde_as;
    use tracing::instrument;

    /// A Record to store the local date/time, as provided by a flora client.
    ///
    /// Both variants hold a [`DateTime<FixedOffset>`], i.e. a date/time together
    /// with its UTC offset.
    #[serde_as]
    #[derive(Debug, PartialEq, Clone)]
    #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
    pub enum ClientDateTimeRecord {
        /// The date/time successfully parsed from the client's RFC 3339 string,
        /// including the UTC offset the client specified.
        Parsed(DateTime<FixedOffset>),

        /// The local time of the flora instance, if we couldn't parse the one
        /// given by the client.
        Interpolated {
            /// The interpolated date time value.
            value: DateTime<FixedOffset>,
            /// The original string, as given by the client.
            raw: String,
        },
    }

    impl ClientDateTimeRecord {
        /// Creates a new [`ClientDateTimeRecord`] from the given RFC 3339 date/time string.
        ///
        /// This never fails: if the string cannot be parsed, a warning is traced and
        /// the current local time of this machine is stored instead, together with
        /// the original string.
        #[instrument]
        pub fn new(local_datetime_client: &str) -> Self {
            match DateTime::parse_from_rfc3339(local_datetime_client) {
                Ok(parsed) => Self::Parsed(parsed),
                Err(e) => {
                    tracing::warn!("couldn't parse {local_datetime_client}: {e}");
                    Self::Interpolated {
                        value: Local::now().into(),
                        raw: local_datetime_client.to_string(),
                    }
                }
            }
        }

        /// Returns the year of the stored date/time.
        ///
        /// It makes no difference whether the value was parsed from the client's
        /// input or interpolated on the server.
        #[must_use]
        pub fn year(&self) -> i32 {
            // Both variants hold a `DateTime<FixedOffset>`, so bind whichever is present.
            let (Self::Parsed(moment) | Self::Interpolated { value: moment, .. }) = self;
            moment.year()
        }
    }

    impl Display for ClientDateTimeRecord {
        /// Writes the stored date/time (parsed or interpolated) in RFC 3339 format.
        fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
            let (Self::Parsed(moment) | Self::Interpolated { value: moment, .. }) = self;
            f.write_str(&moment.to_rfc3339())
        }
    }
}

/// Parameters used to construct the [`Record`].
///
/// All values are taken as given; [`Record::new`] parses the URL, language and
/// date/time strings and keeps the raw input for any that fail to parse.
#[derive(Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct RecordParams {
    /// SQLite/Typesense ID.
    pub id: Option<i64>,

    /// Original URL where the client got this record.
    pub url: String,

    /// Title for this record (may not match the web page title, if any).
    pub title: String,

    /// RFC 3339 date and time in local time zone, with UTC offset.
    pub client_datetime: String,

    /// Unix timestamp (seconds) of the flora server when the record was first added.
    ///
    /// If `None`, [`Record::new`] uses the current time.
    pub timestamp_flora: Option<i64>,

    /// Web archive holder.
    pub document: Document,

    /// The record's language (if any was specified).
    pub language: Option<String>,
}

/// Data from the original record.
///
/// Built from [`RecordParams`] via [`Record::new`]; fields are private and
/// exposed through accessor methods.
#[derive(Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Record {
    /// SQLite/Typesense ID. Assigned by SQLite, so records that haven't yet
    /// been added to SQLite will have it as `None`.
    id: Option<i64>,

    /// Internal [`UrlRecord`] value.
    url: UrlRecord,

    /// Title for this record (may not match the web page title, if any).
    title: String,

    /// Local date/time as given by the client, or the server's local time
    /// if the client's value could not be parsed.
    client_datetime: ClientDateTimeRecord,

    /// Unix timestamp from the flora server when the record was first added.
    timestamp_flora: i64,

    /// The record's language (if any was specified).
    language: Option<LanguageRecord>,

    /// Web archive
    document: Document,
}

impl Record {
    /// Construct a new Record from the given params.
    #[instrument]
    pub fn new(params: RecordParams) -> Self {
        let url = UrlRecord::new(&params.url);
        let client_datetime = ClientDateTimeRecord::new(params.client_datetime.as_str());
        let language = params.language.map(LanguageRecord::new);
        let timestamp_flora = params
            .timestamp_flora
            .unwrap_or_else(|| Local::now().timestamp());

        Self {
            id: params.id,
            title: params.title,
            url,
            client_datetime,
            timestamp_flora,
            language,
            document: params.document,
        }
    }

    /// This record's ID in SQLite and Typesense.
    #[must_use]
    pub fn id(&self) -> Option<i64> {
        self.id
    }

    /// Get the url. Not guaranteed to be valid (the client may have provided an invalid one).
    #[must_use]
    pub fn url(&self) -> &str {
        self.url.as_str()
    }

    /// This record's title, as set by the client
    /// (may not match the web page title, if any).
    #[must_use]
    pub fn title(&self) -> &str {
        &self.title
    }

    /// Local date/time as set by the client (or interpolated,
    /// if we failed to parse the value set by the client).
    #[must_use]
    pub fn client_datetime(&self) -> &ClientDateTimeRecord {
        &self.client_datetime
    }

    /// Unix timestamp from the flora server when the record was first added.
    #[must_use]
    pub fn timestamp_archived(&self) -> i64 {
        self.timestamp_flora
    }

    /// Returns the host component of `Self::url`.
    ///
    /// ## Errors
    /// Returns [`RecordError::UrlParse`] if this record's URL couldn't be parsed.
    pub fn host(&self) -> Result<Option<String>> {
        match &self.url {
            UrlRecord::Parsed(url) => Ok(url.host().map(|h| h.to_string())),
            UrlRecord::Raw(raw, error) => Err(RecordError::UrlParse {
                raw: raw.clone(),
                msg: error.clone(),
            }),
        }
    }

    /// The record's language (if any was specified).
    #[must_use]
    pub fn language(&self) -> Option<&LanguageRecord> {
        self.language.as_ref()
    }

    /// The archived contents stored in this record.
    #[must_use]
    pub fn document(&self) -> &Document {
        &self.document
    }
}

impl PartialEq for Record {
    /// Records with identical data compare equal if both have the same id,
    /// if one has an id that is `None`. If the ids are different, the records
    /// will compare unequal.
    fn eq(&self, other: &Self) -> bool {
        if let (Some(this_id), Some(other_id)) = (self.id, other.id)
            && this_id != other_id
        {
            return false;
        }

        // idea: this may result in false positives; should eventually go back
        // to computing and verifying checksums
        self.url == other.url
            && self.client_datetime == other.client_datetime
            && self.timestamp_flora == other.timestamp_flora
    }
}

/// Convert to a [`Record`].
///
/// Implementors choose their own error type through the associated
/// `Error` type.
pub trait ToRecord {
    /// The associated error which can be returned from converting to [`Record`].
    type Error: std::error::Error;

    /// Converts a value of this type to a flora record.
    ///
    /// ## Errors
    /// Returns `Self::Error` if the [`Record`] couldn't be created.
    fn to_record(&self) -> std::result::Result<Record, Self::Error>;
}

/// Lets a shared reference to any [`ToRecord`] type be converted as well,
/// by forwarding to the referenced value's implementation.
impl<T: ToRecord> ToRecord for &T {
    type Error = T::Error;

    fn to_record(&self) -> std::result::Result<Record, Self::Error> {
        // `self` is `&&T`; dereference once and call `T`'s implementation explicitly.
        T::to_record(*self)
    }
}

// Unit tests for record construction, equality, language parsing and
// client date/time handling.
#[cfg(test)]
mod tests {
    use super::*;

    use pretty_assertions::{assert_eq, assert_ne};
    use proptest::prelude::*;
    use tracing_test::traced_test;

    use crate::document::tests::load_test_web_doc;

    /// Builds a fixed set of [`RecordParams`] (no id, parseable URL,
    /// date/time and language) around the shared test web document.
    fn test_record_params() -> anyhow::Result<RecordParams> {
        let document = load_test_web_doc()?;
        let url = "https://example.org/".to_string();
        let title = "Example Domain";
        let client_datetime = "2024-10-11T13:49:46-05:00";

        #[allow(clippy::unreadable_literal)]
        let timestamp_flora = Some(1728695243i64);
        let language = Some("en".to_string());

        Ok(RecordParams {
            id: None,
            url,
            title: title.to_string(),
            client_datetime: client_datetime.to_string(),
            timestamp_flora,
            language,
            document,
        })
    }

    /// Snapshot of a record built from the fixed test parameters.
    #[test]
    fn can_create_record() -> anyhow::Result<()> {
        let record_params = test_record_params()?;
        let record = Record::new(record_params);
        insta::assert_debug_snapshot!(record);
        Ok(())
    }

    /// A record without an id equals an otherwise identical record with one.
    #[test]
    fn record_with_some_id_equals_record_with_none_id() -> anyhow::Result<()> {
        let record = Record::new(test_record_params()?);
        let mut also_record = Record::new(test_record_params()?);
        also_record.id = Some(37);
        assert_eq!(record, also_record);
        Ok(())
    }

    /// Otherwise identical records with two different ids are not equal.
    #[test]
    fn records_with_different_ids_compare_unequal() -> anyhow::Result<()> {
        let mut record = Record::new(test_record_params()?);
        record.id = Some(37);
        let mut also_record = Record::new(test_record_params()?);
        also_record.id = Some(42);
        assert_ne!(record, also_record);
        Ok(())
    }

    proptest! {
        // Any string of 2 to 6 lowercase ASCII letters must yield a primary
        // language subtag.
        #[test]
        fn parse_language(s in "[a-z]{2,6}") {
            // https://tools.ietf.org/html/rfc5646
            let language = LanguageRecord::new(s.clone());
            prop_assert!(language.primary().is_ok());
        }
    }

    /// Snapshot of a well-formed RFC 3339 date/time with a UTC offset.
    #[test]
    fn can_parse_rfc3339_datetime() {
        let dt = "2025-08-08T15:28:02-05:00";
        let parsed = datetime::ClientDateTimeRecord::new(dt);
        insta::assert_debug_snapshot!(parsed);
    }

    // The input below has no UTC offset; the test expects the first traced
    // line to be a warning and snapshots it.
    #[traced_test]
    #[test]
    fn malformed_datetime_traces_warning() {
        let dt = "2025-08-11T21:49:15.031404172";
        let _ = datetime::ClientDateTimeRecord::new(dt);

        logs_assert(|lines: &[&str]| {
            let line = lines
                .first()
                .ok_or_else(|| "No tracing lines".to_string())?;
            let warning_start = line
                .find("WARN")
                .ok_or_else(|| "trace contains no warning".to_string())?;

            // Drop everything before "WARN" (the timestamp) so the snapshot is stable.
            let (_timestamp, warning) = line.split_at(warning_start);
            insta::assert_snapshot!(warning);
            Ok(())
        });
    }
}