Skip to main content

omry_archiving/
record.rs

1//! Data structures for archived records.
2use chrono::{DateTime, FixedOffset, Local, Utc};
3use oxilangtag::LanguageTag;
4use std::fmt;
5use thiserror::Error;
6use tracing::instrument;
7use url::Url;
8
9use crate::Document;
10use datetime::ClientDateTimeRecord;
11
12/// Error type returned by fallible operations on records.
13#[derive(Debug, Error)]
14#[error(transparent)]
15pub struct RecordError(#[from] RecordErrorRepr);
16
17/// Internal error representation for failures that we can run into when working with records.
18#[derive(Debug, Error)]
19enum RecordErrorRepr {
20    /// Error getting data from the record (such as values that failed parsing).
21    #[error("Error getting the record data: {0}")]
22    Data(#[from] RecordDataError),
23}
24
25/// Possible errors that might occur when processing data from records.
26#[derive(Debug, Error)]
27pub enum RecordDataError {
28    /// Error when parsing the url.
29    #[error("Couldn't parse the URL {raw}: {msg}")]
30    UrlParse {
31        /// The original raw URL.
32        raw: String,
33
34        /// Error message from the parsing code.
35        msg: String,
36    },
37
38    /// Error parsing the language tag.
39    #[error("Couldn't parse the language")]
40    LanguageParse {
41        /// The original language record as given by the client.
42        raw: String,
43
44        /// Error message from the parsing code.
45        msg: String,
46    },
47}
48
49impl RecordDataError {
50    /// Gets the original 'raw' data stored with this error.
51    #[must_use]
52    pub fn raw_data(&self) -> &str {
53        match self {
54            Self::UrlParse { raw, .. } | Self::LanguageParse { raw, .. } => raw.as_str(),
55        }
56    }
57}
58
59/// Result alias for fallible operations when constructing
60/// or using [`Record`] data that needs parsing, or might be missing.
61type DataResult<T> = std::result::Result<T, RecordDataError>;
62
63/// Enum to encode the possible outcomes of parsing a language tag.
64#[derive(Debug, Clone, PartialEq)]
65#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
66pub enum LanguageRecord {
67    /// Variant containing a successfully parsed language tag.
68    Parsed(LanguageTag<String>),
69
70    /// Variant containing the raw value of the language tag if parsing failed.
71    Raw {
72        /// The original raw value for the language tag, as given by the client.
73        value: String,
74
75        /// The error message from the parsing code.
76        error_msg: String,
77    },
78}
79
80impl LanguageRecord {
81    /// Constructs a new [`LanguageRecord`], regardless of whether [`oxilangtag`] parsed
82    /// it successfully.
83    pub fn new(language_tag: String) -> Self {
84        language_tag.parse::<LanguageTag<_>>().map_or_else(
85            |error| Self::Raw {
86                value: language_tag,
87                error_msg: error.to_string(),
88            },
89            Self::Parsed,
90        )
91    }
92
93    /// Returns the [primary language subtag](https://datatracker.ietf.org/doc/html/rfc5646#section-2.2.1),
94    /// as extracted by [`oxilangtag`].
95    ///
96    /// See [oxilangtag::LanguageTag::primary_language](https://docs.rs/oxilangtag/latest/oxilangtag/struct.LanguageTag.html#method.primary_language)
97    ///
98    /// ## Errors
99    /// If oxilangtag cannot parse the original record's language tag,
100    /// the error value will be the original record's language tag as it was specified in the record
101    /// (or None if the record didn't have one).
102    pub fn primary(&self) -> DataResult<String> {
103        match self {
104            LanguageRecord::Parsed(language_tag) => Ok(language_tag.primary_language().to_string()),
105            LanguageRecord::Raw { value, error_msg } => Err(RecordDataError::LanguageParse {
106                raw: value.to_owned(),
107                msg: error_msg.to_owned(),
108            }),
109        }
110    }
111}
112
113impl fmt::Display for LanguageRecord {
114    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
115        match self.primary() {
116            Ok(tag) => write!(f, "{tag}"),
117            Err(RecordDataError::LanguageParse { raw, .. }) => write!(f, "{raw}"),
118            Err(_) => unreachable!("there are no other possible error states"),
119        }
120    }
121}
122
123/// Url in the Record.
124#[derive(Debug, PartialEq, Clone)]
125#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
126enum UrlRecord {
127    /// If we could successfully parse the URL.
128    Parsed(Url),
129    /// Original Url, if we could not parse it (and the error).
130    Raw(String, String),
131}
132
133impl UrlRecord {
134    /// Constructs a new [`UrlRecord`] regardless of whether [`Url`]
135    /// parsed it successfully.
136    fn new(url: &str) -> Self {
137        url.parse::<Url>()
138            .map_or_else(|e| Self::Raw(url.to_string(), e.to_string()), Self::Parsed)
139    }
140
141    /// Returns either the string representation of the url if it
142    /// had been successfully parsed, or the original string
143    /// if it was not.
144    fn as_str(&self) -> &str {
145        match self {
146            Self::Parsed(url) => url.as_str(),
147            Self::Raw(raw, _) => raw.as_str(),
148        }
149    }
150}
151
152/// Date time module for flora records.
153pub mod datetime {
154    use std::fmt::{Display, Formatter};
155
156    use chrono::{DateTime, Datelike, FixedOffset, Local};
157    use serde_with::serde_as;
158    use tracing::instrument;
159
160    /// A Record to store the local date/time, as provided by a flora client.
161    #[serde_as]
162    #[derive(Debug, PartialEq, Clone)]
163    #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
164    pub enum ClientDateTimeRecord {
165        /// The naive date/time (without any time zone information).
166        Parsed(DateTime<FixedOffset>),
167
168        /// The local time of the flora instance, if we couldn't parse the one
169        /// given by the client.
170        Interpolated {
171            /// The interpolated date time value.
172            value: DateTime<FixedOffset>,
173            /// The original string, as given by the client.
174            raw: String,
175        },
176    }
177
178    impl ClientDateTimeRecord {
179        /// Creates a new instance of `DateTimeRecord` from the given RFC 3339 local time string.
180        /// Note: If we get an error trying to parse the string, we will use the current local time.
181        #[instrument]
182        pub fn new(local_datetime_client: &str) -> Self {
183            DateTime::parse_from_rfc3339(local_datetime_client)
184                .inspect_err(|e| tracing::warn!("couldn't parse {local_datetime_client}: {e}"))
185                .map_or_else(
186                    |_| Self::Interpolated {
187                        value: Local::now().into(),
188                        raw: local_datetime_client.to_string(),
189                    },
190                    Self::Parsed,
191                )
192        }
193
194        /// Retrieve the year component of the date/time value, regardless
195        /// of whether we originally successfully parsed this from the date/time data
196        /// submitted by the client, or interpolated it on the server.
197        #[must_use]
198        pub fn year(&self) -> i32 {
199            match self {
200                Self::Parsed(dt) => dt.year(),
201                Self::Interpolated { value, .. } => value.year(),
202            }
203        }
204    }
205
206    impl Display for ClientDateTimeRecord {
207        fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
208            let dt = match self {
209                Self::Parsed(dt) => dt,
210                Self::Interpolated { value, .. } => value,
211            };
212            write!(f, "{}", dt.to_rfc3339())
213        }
214    }
215}
216
217/// Parameters used to construct the [`Record`].
218#[derive(Debug)]
219#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
220pub struct RecordParams {
221    /// SQLite database ID (if already imported).
222    pub id: Option<i64>,
223
224    /// Original URL from which the client got this record.
225    pub url: String,
226
227    /// Title for this record (may not match the web page title, if any).
228    pub title: String,
229
230    /// RFC 3339 date and time in local time zone, with UTC offset.
231    pub client_datetime: String,
232
233    /// Unix timestamp (seconds) of the flora server when the record was first added.
234    pub timestamp_flora: Option<i64>,
235
236    /// Web archive holder.
237    pub document: Document,
238
239    /// The record's language (if any was specified).
240    pub language: Option<String>,
241
242    /// Unix timestamp from the server, present if the record was updated.
243    pub updated_at: Option<i64>,
244}
245
246impl From<Record> for RecordParams {
247    fn from(rec: Record) -> Self {
248        let Record {
249            id,
250            url,
251            title,
252            client_datetime,
253            timestamp_flora,
254            language,
255            document,
256            updated_at,
257        } = rec;
258        let url = url.as_str().to_string();
259        let language = language.map(|lang| lang.to_string());
260        // this will return the RFC 3339 representation
261        let client_datetime = client_datetime.to_string();
262
263        Self {
264            id,
265            url,
266            title,
267            client_datetime,
268            timestamp_flora: Some(timestamp_flora),
269            document,
270            language,
271            updated_at,
272        }
273    }
274}
275
276/// Data from the original record.
277#[must_use]
278#[derive(Debug)]
279#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
280pub struct Record {
281    /// Record's ID in SQLite. Assigned by SQLite, so records that haven't yet
282    /// been added to SQLite will have it as `None`.
283    id: Option<i64>,
284
285    /// Internal [`UrlRecord`] value.
286    url: UrlRecord,
287
288    /// Title for this record (may not match the web page title, if any).
289    title: String,
290
291    /// Local date/time.
292    client_datetime: ClientDateTimeRecord,
293
294    /// Unix timestamp from the flora server when the record was first added.
295    timestamp_flora: i64,
296
297    /// The record's language (if any was specified).
298    language: Option<LanguageRecord>,
299
300    /// Web archive
301    document: Document,
302
303    /// Unix timestamp from the server, present if the record was updated.
304    updated_at: Option<i64>,
305}
306
307impl Record {
308    /// Construct a new Record from the given params.
309    #[instrument]
310    pub fn new(params: RecordParams) -> Self {
311        let RecordParams {
312            id,
313            url,
314            title,
315            client_datetime,
316            timestamp_flora,
317            language,
318            document,
319            updated_at,
320        } = params;
321        let url = UrlRecord::new(&url);
322        let client_datetime = ClientDateTimeRecord::new(client_datetime.as_str());
323        let language = language.map(LanguageRecord::new);
324        let timestamp_flora = timestamp_flora.unwrap_or_else(|| Local::now().timestamp());
325
326        Self {
327            id,
328            url,
329            title,
330            client_datetime,
331            timestamp_flora,
332            language,
333            document,
334            updated_at,
335        }
336    }
337
338    /// This record's ID in SQLite and Typesense.
339    #[must_use]
340    pub fn id(&self) -> Option<i64> {
341        self.id
342    }
343
344    /// Get the url. Not guaranteed to be valid (the client may have provided an invalid one).
345    #[must_use]
346    pub fn url(&self) -> &str {
347        self.url.as_str()
348    }
349
350    /// This record's title, as set by the client
351    /// (may not match the web page title, if any).
352    #[must_use]
353    pub fn title(&self) -> &str {
354        &self.title
355    }
356
357    /// Local date/time as set by the client (or interpolated,
358    /// if we failed to parse the value set by the client).
359    #[must_use]
360    pub fn client_datetime(&self) -> &ClientDateTimeRecord {
361        &self.client_datetime
362    }
363
364    /// Unix timestamp from the flora server when the record was first added.
365    #[must_use]
366    pub fn timestamp_archived(&self) -> i64 {
367        self.timestamp_flora
368    }
369
370    /// Unix timestamp from the server, present if the record was updated.
371    #[must_use]
372    pub fn updated_at(&self) -> Option<i64> {
373        self.updated_at
374    }
375
376    /// Returns a new record with `updated_at` set to current timestamp.
377    ///
378    /// # Notes
379    /// This method is used by the server. It is not necessary
380    /// to call it on the client, and any timestamp set by the client
381    /// may be overwritten by the server.
382    pub fn with_updated_at_now(mut self) -> Self {
383        self.updated_at = Some(Utc::now().timestamp());
384        self
385    }
386
387    /// Returns the host component of `Self::url`.
388    ///
389    /// ## Errors
390    /// Returns [`RecordDataError`] if this record's URL couldn't be parsed.
391    pub fn host(&self) -> DataResult<Option<String>> {
392        match &self.url {
393            UrlRecord::Parsed(url) => Ok(url.host().map(|h| h.to_string())),
394            UrlRecord::Raw(raw, error) => Err(RecordDataError::UrlParse {
395                raw: raw.clone(),
396                msg: error.clone(),
397            }),
398        }
399    }
400
401    /// The record's language (if any was specified).
402    #[must_use]
403    pub fn language(&self) -> Option<&LanguageRecord> {
404        self.language.as_ref()
405    }
406
407    /// The archived contents stored in this record.
408    #[must_use]
409    pub fn document(&self) -> &Document {
410        &self.document
411    }
412
413    /// Returns the document (web page or other data) contained in this [`Record`],
414    /// consuming it.
415    #[must_use]
416    pub fn into_document(self) -> Document {
417        self.document
418    }
419}
420
421impl PartialEq for Record {
422    /// Records with identical data compare equal if both have the same id,
423    /// if one has an id that is `None`. If the ids are different, the records
424    /// will compare unequal.
425    fn eq(&self, other: &Self) -> bool {
426        if let (Some(this_id), Some(other_id)) = (self.id, other.id)
427            && this_id != other_id
428        {
429            return false;
430        }
431
432        // idea: this may result in false positives; should eventually go back
433        // to computing and verifying checksums
434        self.url == other.url
435            && self.client_datetime == other.client_datetime
436            && self.timestamp_flora == other.timestamp_flora
437    }
438}
439
440/// Convert to a [`Record`].
441pub trait ToRecord {
442    /// The associated error which can be returned from converting to [`Record`].
443    type Error: std::error::Error;
444
445    /// Converts a value of this type to a flora record.
446    ///
447    /// ## Errors
448    /// Returns [`std::error::Error`] if the [`Record`] couldn't be created.
449    fn to_record(&self) -> std::result::Result<Record, Self::Error>;
450}
451
452impl<T> ToRecord for &T
453where
454    T: ToRecord,
455{
456    type Error = <T as ToRecord>::Error;
457
458    fn to_record(&self) -> std::result::Result<Record, Self::Error> {
459        (*self).to_record()
460    }
461}
462
463/// A [`Record`]'s metadata.
464///
465/// Includes all fields except the content.
466#[derive(Debug, Clone)]
467#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
468pub struct RecordMeta {
469    /// SQLite ID.
470    pub id: i64,
471
472    /// Original URL from which the client got this record.
473    pub url: String,
474
475    /// Title for this record (may not match the web page title, if any).
476    pub title: String,
477
478    /// Local date/time.
479    pub client_datetime: ClientDateTimeRecord,
480
481    /// Unix timestamp from the Omry server when the record was first added.
482    pub timestamp_flora: i64,
483
484    /// The record's language (if any was specified).
485    pub language: Option<LanguageRecord>,
486
487    /// Unix timestamp from the server, present if the record was updated.
488    pub updated_at: Option<i64>,
489}
490
491/// A record with the raw document data (not deserialized).
492///
493/// Nearly 1:1 mapping with `DbRecord`, except this requires
494/// `id` to be set.
495#[derive(Debug)]
496#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
497pub struct RawRecord {
498    /// The SQLite row ID of this record.
499    pub id: i64,
500
501    /// The original URL that corresponds to this record.
502    pub url: String,
503
504    /// The date and time when the client saved or submitted the record.
505    pub client_datetime: DateTime<FixedOffset>,
506
507    /// Raw date time, in case we couldn't parse it.
508    pub client_datetime_raw: Option<String>,
509
510    /// The timestamp when omry server first got this record.
511    pub timestamp_flora: i64,
512
513    /// The page title that the client provided for this record.
514    pub title: String,
515
516    /// The language we extracted from this record's metadata, if any.
517    pub language: Option<String>,
518
519    /// The binary serialized content of this record's document.
520    pub document: Vec<u8>,
521
522    /// Unix timestamp from the server, present if the record was updated.
523    pub updated_at: Option<i64>,
524}
525
#[cfg(test)]
mod tests {
    use super::*;

    use pretty_assertions::{assert_eq, assert_ne};
    use proptest::prelude::*;
    use tracing_test::traced_test;

    use crate::document::tests::load_test_web_doc;

    /// Builds the parameter set shared by the tests below: a record for
    /// `https://example.org/` with no id and no update timestamp.
    fn test_record_params() -> anyhow::Result<RecordParams> {
        let document = load_test_web_doc()?;

        #[allow(clippy::unreadable_literal)]
        let timestamp_flora = Some(1728695243i64);

        Ok(RecordParams {
            id: None,
            url: String::from("https://example.org/"),
            title: String::from("Example Domain"),
            client_datetime: String::from("2024-10-11T13:49:46-05:00"),
            timestamp_flora,
            language: Some(String::from("en")),
            document,
            updated_at: None,
        })
    }

    /// A record built from the shared params matches its stored snapshot.
    #[test]
    fn can_create_record() -> anyhow::Result<()> {
        let record = Record::new(test_record_params()?);
        insta::assert_debug_snapshot!(record);
        Ok(())
    }

    /// An id of `None` on one side is compatible with any id on the other.
    #[test]
    fn record_with_some_id_equals_record_with_none_id() -> anyhow::Result<()> {
        let without_id = Record::new(test_record_params()?);
        let with_id = Record {
            id: Some(37),
            ..Record::new(test_record_params()?)
        };
        assert_eq!(without_id, with_id);
        Ok(())
    }

    /// Two set-but-different ids make otherwise identical records unequal.
    #[test]
    fn records_with_different_ids_compare_unequal() -> anyhow::Result<()> {
        let first = Record {
            id: Some(37),
            ..Record::new(test_record_params()?)
        };
        let second = Record {
            id: Some(42),
            ..Record::new(test_record_params()?)
        };
        assert_ne!(first, second);
        Ok(())
    }

    proptest! {
        /// Any 2-6 letter lowercase string yields a primary language subtag.
        #[test]
        fn parse_language(s in "[a-z]{2,6}") {
            // https://tools.ietf.org/html/rfc5646
            let primary = LanguageRecord::new(s.clone()).primary();
            prop_assert!(primary.is_ok());
        }
    }

    /// A well-formed RFC 3339 string parses to the snapshotted value.
    #[test]
    fn can_parse_rfc3339_datetime() {
        let parsed = datetime::ClientDateTimeRecord::new("2025-08-08T15:28:02-05:00");
        insta::assert_debug_snapshot!(parsed);
    }

    /// A date/time without a UTC offset is rejected and a warning is traced.
    #[traced_test]
    #[test]
    fn malformed_datetime_traces_warning() {
        let _ = datetime::ClientDateTimeRecord::new("2025-08-11T21:49:15.031404172");

        logs_assert(|lines: &[&str]| {
            let Some(line) = lines.first() else {
                return Err("No tracing lines".to_string());
            };
            let Some(warning_start) = line.find("WARN") else {
                return Err("trace contains no warning".to_string());
            };

            // Drop the leading timestamp so the snapshot stays stable.
            let warning = &line[warning_start..];
            insta::assert_snapshot!(warning);
            Ok(())
        });
    }

    /// `into_document` hands back the document the record was built with.
    #[test]
    fn can_take_owned_document_from_record() -> anyhow::Result<()> {
        let record = Record::new(test_record_params()?);
        insta::assert_debug_snapshot!(record.into_document());
        Ok(())
    }
}