warc/
record.rs

1use chrono::prelude::*;
2use std::borrow::Cow;
3use std::collections::HashMap;
4use std::fmt;
5use std::io::Read;
6
7use uuid::Uuid;
8
9use crate::header::WarcHeader;
10use crate::record_type::RecordType;
11use crate::truncated_type::TruncatedType;
12use crate::Error as WarcError;
13
14use streaming_trait::BodyKind;
15pub use streaming_trait::{BufferedBody, EmptyBody, StreamingBody};
16
mod streaming_trait {
    use std::io::Read;

    /// Describes how the body of a record is represented.
    ///
    /// Every body representation can report the length of the body it stands for.
    pub trait BodyKind {
        /// Return the length of the body in bytes.
        fn content_length(&self) -> u64;
    }

    /// A body that is buffered in memory within the record.
    #[derive(Clone, Debug, PartialEq)]
    pub struct BufferedBody(pub Vec<u8>);
    impl BodyKind for BufferedBody {
        fn content_length(&self) -> u64 {
            self.0.len() as u64
        }
    }

    /// A body that is streamed from a reader.
    ///
    /// Field `0` is the underlying reader; field `1` is the number of bytes that may still be
    /// read from it. The counter is decremented by every successful read.
    pub struct StreamingBody<'t, T: Read + 't>(&'t mut T, &'t mut u64);
    impl<'t, T: Read + 't> StreamingBody<'t, T> {
        /// Wrap `stream`, allowing at most `*max_len` further bytes to be read from it.
        pub(crate) fn new(stream: &'t mut T, max_len: &'t mut u64) -> StreamingBody<'t, T> {
            StreamingBody(stream, max_len)
        }

        /// Return the number of bytes that may still be read.
        pub(crate) fn len(&self) -> u64 {
            *self.1
        }
    }
    impl<'t, T: Read + 't> BodyKind for StreamingBody<'t, T> {
        fn content_length(&self) -> u64 {
            *self.1
        }
    }

    impl<'t, T: Read + 't> Read for StreamingBody<'t, T> {
        /// Read from the underlying stream, never returning more than the remaining length.
        fn read(&mut self, data: &mut [u8]) -> std::io::Result<usize> {
            // Take the minimum in `u64`: casting the remaining count to `usize` first would
            // truncate it on targets where `usize` is narrower than 64 bits. The result is
            // bounded by `data.len()`, so narrowing it back to `usize` is lossless.
            let max_read = std::cmp::min(data.len() as u64, *self.1) as usize;
            self.0.read(&mut data[..max_read]).inspect(|&n| {
                *self.1 -= n as u64;
            })
        }
    }

    /// A marker indicating the record has a zero-length body.
    #[derive(Clone, Copy, Debug)]
    pub struct EmptyBody();
    impl BodyKind for EmptyBody {
        fn content_length(&self) -> u64 {
            0
        }
    }
}
69
/// A header block of a single WARC record as parsed from a data stream.
///
/// It is guaranteed to be well-formed, but may not be valid according to the specification.
/// Header values are kept as raw bytes and are not required to be valid UTF-8.
///
/// Use the `Display` trait to generate the formatted representation.
#[derive(Clone, Debug, PartialEq)]
pub struct RawRecordHeader {
    /// The WARC standard version this record reports conformance to.
    pub version: String,
    /// All headers that are part of this record, keyed by header name.
    pub headers: HashMap<WarcHeader, Vec<u8>>,
}
82
83impl AsRef<HashMap<WarcHeader, Vec<u8>>> for RawRecordHeader {
84    fn as_ref(&self) -> &HashMap<WarcHeader, Vec<u8>> {
85        &self.headers
86    }
87}
88
89impl AsMut<HashMap<WarcHeader, Vec<u8>>> for RawRecordHeader {
90    fn as_mut(&mut self) -> &mut HashMap<WarcHeader, Vec<u8>> {
91        &mut self.headers
92    }
93}
94
95impl std::convert::TryFrom<RawRecordHeader> for Record<EmptyBody> {
96    type Error = WarcError;
97    fn try_from(mut headers: RawRecordHeader) -> Result<Self, WarcError> {
98        headers
99            .as_mut()
100            .remove(&WarcHeader::ContentLength)
101            .ok_or(WarcError::MissingHeader(WarcHeader::ContentLength))
102            .and_then(|vec| {
103                String::from_utf8(vec).map_err(|_| {
104                    WarcError::MalformedHeader(WarcHeader::Date, "not a UTF-8 string".to_string())
105                })
106            })?;
107
108        let record_type = headers
109            .as_mut()
110            .remove(&WarcHeader::WarcType)
111            .ok_or(WarcError::MissingHeader(WarcHeader::WarcType))
112            .and_then(|vec| {
113                String::from_utf8(vec).map_err(|_| {
114                    WarcError::MalformedHeader(
115                        WarcHeader::WarcType,
116                        "not a UTF-8 string".to_string(),
117                    )
118                })
119            })
120            .map(|rtype| rtype.into())?;
121
122        let record_id = headers
123            .as_mut()
124            .remove(&WarcHeader::RecordID)
125            .ok_or(WarcError::MissingHeader(WarcHeader::RecordID))
126            .and_then(|vec| {
127                String::from_utf8(vec).map_err(|_| {
128                    WarcError::MalformedHeader(WarcHeader::Date, "not a UTF-8 string".to_string())
129                })
130            })?;
131
132        let record_date = headers
133            .as_mut()
134            .remove(&WarcHeader::Date)
135            .ok_or(WarcError::MissingHeader(WarcHeader::Date))
136            .and_then(|vec| {
137                String::from_utf8(vec).map_err(|_| {
138                    WarcError::MalformedHeader(WarcHeader::Date, "not a UTF-8 string".to_string())
139                })
140            })
141            .and_then(|date| Record::<BufferedBody>::parse_record_date(&date))?;
142
143        Ok(Record {
144            headers,
145            record_date,
146            record_id,
147            record_type,
148            body: EmptyBody(),
149            ..Default::default()
150        })
151    }
152}
153
154impl std::fmt::Display for RawRecordHeader {
155    fn fmt(&self, w: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
156        writeln!(w, "WARC/{}", self.version)?;
157        for (key, value) in self.as_ref().iter() {
158            writeln!(w, "{}: {}", key, String::from_utf8_lossy(value))?;
159        }
160        writeln!(w)?;
161
162        Ok(())
163    }
164}
165
/// A builder for WARC records from data.
#[derive(Default)]
pub struct RecordBuilder {
    // The record under construction; the builder's setters forward to it.
    value: Record<BufferedBody>,
    // Raw values of headers that `header()` could not apply to `value`.
    broken_headers: HashMap<WarcHeader, Vec<u8>>,
    // The most recent error recorded by `header()`; `build()` returns it if set.
    last_error: Option<WarcError>,
}
173
174// HACK: std::io::Error doesn't implement Clone, this is the next best thing
175// see: https://github.com/rust-lang/rust/issues/24135
176impl Clone for RecordBuilder {
177    fn clone(&self) -> Self {
178        let err: Option<&WarcError> = self.last_error.as_ref();
179        let last_error: Option<WarcError> = err.map(|err| match err {
180            WarcError::ReadData(e) => WarcError::ReadData(std::io::Error::from(e.kind())),
181            WarcError::ParseHeaders(e) => WarcError::ParseHeaders(e.clone()),
182            WarcError::MissingHeader(e) => WarcError::MissingHeader(e.clone()),
183            WarcError::MalformedHeader(h, e) => WarcError::MalformedHeader(h.clone(), e.clone()),
184            WarcError::ReadOverflow => WarcError::ReadOverflow,
185            WarcError::UnexpectedEOB => WarcError::UnexpectedEOB,
186        });
187        RecordBuilder {
188            value: self.value.clone(),
189            broken_headers: self.broken_headers.clone(),
190            last_error,
191        }
192    }
193}
194
/// A single WARC record.
///
/// A record can be constructed by a `RecordBuilder`, or by reading from a stream.
///
/// The associated type `T` indicates the representation of this record's body.
///
/// A record is guaranteed to be valid according to the specification it conforms to, except:
/// * The validity of the WARC-Record-ID header is not checked
/// * Date information not in the UTC timezone will be silently converted to UTC
///
/// Use the `Display` trait to generate the formatted representation.
#[derive(Debug, PartialEq)]
pub struct Record<T: BodyKind> {
    // NB: invariant: does not contain the headers stored in the struct
    // Holds the WARC version string and every header without a dedicated field below.
    headers: RawRecordHeader,
    // Value of the WARC-Date header.
    record_date: DateTime<Utc>,
    // Value of the WARC-Record-ID header; stored as given, not validated.
    record_id: String,
    // Value of the WARC-Type header.
    record_type: RecordType,
    // Value of the WARC-Truncated header, if any.
    truncated_type: Option<TruncatedType>,
    // The body representation; also the source of the Content-Length value.
    body: T,
}
216
217impl<T: BodyKind> Record<T> {
218    /// Create a new empty record with default values.
219    ///
220    /// Using a `RecordBuilder` is more efficient when creating records from known data.
221    ///
222    /// The record returned contains an empty body, and the following fields:
223    /// * WARC-Record-ID: generated by `generate_record_id()`
224    /// * WARC-Date: the current moment in time
225    /// * WARC-Type: resource
226    /// * WARC-Content-Length: 0
227    pub fn new() -> Record<EmptyBody> {
228        Record::default()
229    }
230
231    /// Create a new empty record with a known body.
232    ///
233    /// Using a `RecordBuilder` is more efficient when creating records from known data.
234    ///
235    /// The record returned contains the passed body buffer, and the following fields:
236    /// * WARC-Record-ID: generated by `generate_record_id()`
237    /// * WARC-Date: the current moment in time
238    /// * WARC-Type: resource
239    /// * WARC-Content-Length: `body.len()`
240    pub fn with_body<B: Into<Vec<u8>>>(body: B) -> Record<BufferedBody> {
241        Record {
242            body: BufferedBody(body.into()),
243            ..Record::default()
244        }
245    }
246
247    /// Generate and return a new value suitable for use in the WARC-Record-ID header.
248    ///
249    /// # Compatibility
250    /// The standard only places a small number of constraints on this field:
251    /// 1. This value is globally unique "for its period of use"
252    /// 1. This value is a valid URI
253    /// 1. This value "clearly indicate\[s\] a documented and registered scheme to which it conforms."
254    ///
255    /// These guarantees will be upheld by all generated outputs, where the "period of use" is
256    /// presumed to be indefinite and unlimited.
257    ///
258    /// However, any *specific algorithm* used to generate values is **not** part of the crate's
259    /// public API for purposes of semantic versioning.
260    ///
261    /// # Implementation
262    /// The current implementation generates random values based on UUID version 4.
263    ///
264    pub fn generate_record_id() -> String {
265        format!("<{}>", Uuid::new_v4().to_urn())
266    }
267
268    fn parse_content_length(len: &str) -> Result<u64, WarcError> {
269        (len).parse::<u64>().map_err(|_| {
270            WarcError::MalformedHeader(
271                WarcHeader::ContentLength,
272                "not an integer between 0 and 2^64-1".to_string(),
273            )
274        })
275    }
276
277    fn parse_record_date(date: &str) -> Result<DateTime<Utc>, WarcError> {
278        DateTime::parse_from_rfc3339(date)
279            .map_err(|_| {
280                WarcError::MalformedHeader(
281                    WarcHeader::Date,
282                    "not an ISO 8601 datestamp".to_string(),
283                )
284            })
285            .map(|date| date.into())
286    }
287
288    /// Return the WARC version string of this record.
289    pub fn warc_version(&self) -> &str {
290        &self.headers.version
291    }
292
293    /// Set the WARC version string of this record.
294    pub fn set_warc_version<S: Into<String>>(&mut self, id: S) {
295        self.headers.version = id.into();
296    }
297
298    /// Return the WARC-Record-ID header for this record.
299    pub fn warc_id(&self) -> &str {
300        &self.record_id
301    }
302
303    /// Set the WARC-Record-ID header for this record.
304    ///
305    /// Note that this value is **not** checked for validity.
306    pub fn set_warc_id<S: Into<String>>(&mut self, id: S) {
307        self.record_id = id.into();
308    }
309
310    /// Return the WARC-Type header for this record.
311    pub fn warc_type(&self) -> &RecordType {
312        &self.record_type
313    }
314
315    /// Set the WARC-Type header for this record.
316    pub fn set_warc_type(&mut self, type_: RecordType) {
317        self.record_type = type_;
318    }
319
320    /// Return the WARC-Date header for this record.
321    pub fn date(&self) -> &DateTime<Utc> {
322        &self.record_date
323    }
324
325    /// Set the WARC-Date header for this record.
326    pub fn set_date(&mut self, date: DateTime<Utc>) {
327        self.record_date = date;
328    }
329
330    /// Return the WARC-Truncated header for this record.
331    pub fn truncated_type(&self) -> &Option<TruncatedType> {
332        &self.truncated_type
333    }
334
335    /// Set the WARC-Truncated header for this record.
336    pub fn set_truncated_type(&mut self, truncated_type: TruncatedType) {
337        self.truncated_type = Some(truncated_type);
338    }
339
340    /// Remove the WARC-Truncated header for this record.
341    pub fn clear_truncated_type(&mut self) {
342        self.truncated_type = None;
343    }
344
345    /// Return the WARC header requested if present in this record, or `None`.
346    pub fn header(&self, header: WarcHeader) -> Option<Cow<'_, str>> {
347        match &header {
348            WarcHeader::ContentLength => {
349                Some(Cow::Owned(format!("{}", self.body.content_length())))
350            }
351            WarcHeader::RecordID => Some(Cow::Borrowed(self.warc_id())),
352            WarcHeader::WarcType => Some(Cow::Owned(self.record_type.to_string())),
353            WarcHeader::Date => Some(Cow::Owned(
354                self.date().to_rfc3339_opts(SecondsFormat::Secs, true),
355            )),
356            _ => self
357                .headers
358                .as_ref()
359                .get(&header)
360                .map(|h| Cow::Owned(String::from_utf8(h.clone()).unwrap())),
361        }
362    }
363
364    /// Set a WARC header in this record, returning the previous value if present.
365    ///
366    /// # Errors
367    ///
368    /// If setting a header whose value has a well-formedness test, an error is returned if the
369    /// value is not well-formed.
370    pub fn set_header<V>(
371        &mut self,
372        header: WarcHeader,
373        value: V,
374    ) -> Result<Option<Cow<'_, str>>, WarcError>
375    where
376        V: Into<String>,
377    {
378        let value = value.into();
379        match &header {
380            WarcHeader::Date => {
381                let old_date = std::mem::replace(
382                    &mut self.record_date,
383                    Record::<T>::parse_record_date(&value)?,
384                );
385                Ok(Some(Cow::Owned(
386                    old_date.to_rfc3339_opts(SecondsFormat::Secs, true),
387                )))
388            }
389            WarcHeader::RecordID => {
390                let old_id = std::mem::replace(&mut self.record_id, value);
391                Ok(Some(Cow::Owned(old_id)))
392            }
393            WarcHeader::WarcType => {
394                let old_type = std::mem::replace(&mut self.record_type, RecordType::from(&value));
395                Ok(Some(Cow::Owned(old_type.to_string())))
396            }
397            WarcHeader::Truncated => {
398                let old_type = self.truncated_type.take();
399                self.truncated_type = Some(TruncatedType::from(&value));
400                Ok(old_type.map(|old| (Cow::Owned(old.to_string()))))
401            }
402            WarcHeader::ContentLength => {
403                if Record::<T>::parse_content_length(&value)? != self.body.content_length() {
404                    Err(WarcError::MalformedHeader(
405                        WarcHeader::ContentLength,
406                        "content length != body size".to_string(),
407                    ))
408                } else {
409                    Ok(Some(Cow::Owned(value)))
410                }
411            }
412            _ => Ok(self
413                .headers
414                .as_mut()
415                .insert(header, Vec::from(value))
416                .map(|v| Cow::Owned(String::from_utf8(v).unwrap()))),
417        }
418    }
419
420    /// Return the Content-Length header for this record.
421    ///
422    /// This value is guaranteed to match the actual length of the body.
423    pub fn content_length(&self) -> u64 {
424        self.body.content_length()
425    }
426}
427
428impl Record<EmptyBody> {
429    /// Add a known body to this record, transforming it into a buffered body record.
430    pub fn add_body<B: Into<Vec<u8>>>(self, body: B) -> Record<BufferedBody> {
431        let Self {
432            headers,
433            record_date,
434            record_id,
435            record_type,
436            truncated_type,
437            body: _,
438        } = self;
439        Record {
440            headers,
441            record_date,
442            record_id,
443            record_type,
444            truncated_type,
445            body: BufferedBody(body.into()),
446        }
447    }
448
449    /// Add a streaming body to this record, whose expected size may not match the actual stream
450    /// length.
451    pub fn add_fixed_stream<'r, R: Read + 'r>(
452        self,
453        stream: &'r mut R,
454        len: &'r mut u64,
455    ) -> std::io::Result<Record<StreamingBody<'r, R>>> {
456        let Record {
457            headers,
458            record_date,
459            record_id,
460            record_type,
461            truncated_type,
462            ..
463        } = self;
464
465        Ok(Record {
466            headers,
467            record_date,
468            record_id,
469            record_type,
470            truncated_type,
471            body: StreamingBody::new(stream, len),
472        })
473    }
474}
475
476impl Record<BufferedBody> {
477    /// Strip the body from this record.
478    pub fn strip_body(self) -> Record<EmptyBody> {
479        let Self {
480            headers,
481            record_date,
482            record_id,
483            record_type,
484            truncated_type,
485            body: _,
486        } = self;
487        Record {
488            headers,
489            record_date,
490            record_id,
491            record_type,
492            truncated_type,
493            body: EmptyBody(),
494        }
495    }
496
497    /// Return the body of this record.
498    pub fn body(&self) -> &[u8] {
499        self.body.0.as_slice()
500    }
501
502    /// Return a reference to mutate the body of this record, but without changing its length.
503    ///
504    /// To update the body of the record or change its length, use the `replace_body` method
505    /// instead.
506    pub fn body_mut(&mut self) -> &mut [u8] {
507        self.body.0.as_mut_slice()
508    }
509
510    /// Replace the body of this record with the given body.
511    pub fn replace_body<V: Into<Vec<u8>>>(&mut self, new_body: V) {
512        let _: Vec<u8> = std::mem::replace(&mut self.body.0, new_body.into());
513    }
514
515    /// Transform this record into a raw record containing the same data.
516    pub fn into_raw_parts(self) -> (RawRecordHeader, Vec<u8>) {
517        let Record {
518            mut headers,
519            record_date,
520            record_id,
521            record_type,
522            body,
523            ..
524        } = self;
525        let insert1 = headers.as_mut().insert(
526            WarcHeader::ContentLength,
527            format!("{}", body.0.len()).into(),
528        );
529        let insert2 = headers
530            .as_mut()
531            .insert(WarcHeader::WarcType, record_type.to_string().into());
532        let insert3 = headers
533            .as_mut()
534            .insert(WarcHeader::RecordID, record_id.into());
535        let insert4 = if let Some(ref truncated_type) = self.truncated_type {
536            headers
537                .as_mut()
538                .insert(WarcHeader::Truncated, truncated_type.to_string().into())
539        } else {
540            None
541        };
542        let insert5 = headers.as_mut().insert(
543            WarcHeader::Date,
544            record_date
545                .to_rfc3339_opts(SecondsFormat::Secs, true)
546                .into(),
547        );
548
549        debug_assert!(
550            insert1.is_none()
551                && insert2.is_none()
552                && insert3.is_none()
553                && insert4.is_none()
554                && insert5.is_none(),
555            "invariant violation: raw struct contains externally stored fields"
556        );
557
558        (headers, body.0)
559    }
560}
561
562impl<'t, T: Read + 't> Record<StreamingBody<'t, T>> {
563    /// Returns a record with a buffered body by collecting the streaming body.
564    ///
565    /// # Errors
566    ///
567    /// This method can fail if the underlying stream returns an error. If this happens, the
568    /// state of the stream is not guaranteed.
569    pub fn into_buffered(self) -> std::io::Result<Record<BufferedBody>> {
570        let Record {
571            headers,
572            record_date,
573            record_id,
574            record_type,
575            truncated_type,
576            mut body,
577        } = self;
578
579        let buf = {
580            let mut body_vec = Vec::with_capacity(body.len() as usize);
581            body.read_to_end(&mut body_vec)?;
582            body_vec
583        };
584
585        let empty_record = Record {
586            headers,
587            record_date,
588            record_id,
589            record_type,
590            truncated_type,
591            ..Default::default()
592        };
593
594        Ok(empty_record.add_body(buf))
595    }
596}
597
598impl<'t, T: Read + 't> Read for Record<StreamingBody<'t, T>> {
599    fn read(&mut self, dst: &mut [u8]) -> Result<usize, std::io::Error> {
600        self.body.read(dst)
601    }
602}
603
604impl Default for Record<BufferedBody> {
605    fn default() -> Record<BufferedBody> {
606        Record {
607            headers: RawRecordHeader {
608                version: "1.0".to_string(),
609                headers: HashMap::new(),
610            },
611            record_date: Utc::now(),
612            record_id: Record::<BufferedBody>::generate_record_id(),
613            record_type: RecordType::Resource,
614            truncated_type: None,
615            body: BufferedBody(vec![]),
616        }
617    }
618}
619
620impl Default for Record<EmptyBody> {
621    fn default() -> Record<EmptyBody> {
622        Record {
623            headers: RawRecordHeader {
624                version: "1.0".to_string(),
625                headers: HashMap::new(),
626            },
627            record_date: Utc::now(),
628            record_id: Record::<EmptyBody>::generate_record_id(),
629            record_type: RecordType::Resource,
630            truncated_type: None,
631            body: EmptyBody(),
632        }
633    }
634}
635
636impl fmt::Display for Record<BufferedBody> {
637    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
638        let (headers, body) = self.clone().into_raw_parts();
639        write!(f, "Record({}, {:?})", headers, body)
640    }
641}
642impl fmt::Display for Record<EmptyBody> {
643    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
644        write!(f, "Record({:?}, Empty)", self.headers)
645    }
646}
647
648impl Clone for Record<EmptyBody> {
649    fn clone(&self) -> Self {
650        Record {
651            headers: self.headers.clone(),
652            record_type: self.record_type.clone(),
653            record_date: self.record_date,
654            record_id: self.record_id.clone(),
655            truncated_type: self.truncated_type.clone(),
656            body: self.body,
657        }
658    }
659}
660
661impl Clone for Record<BufferedBody> {
662    fn clone(&self) -> Self {
663        Record {
664            headers: self.headers.clone(),
665            record_type: self.record_type.clone(),
666            record_date: self.record_date,
667            record_id: self.record_id.clone(),
668            truncated_type: self.truncated_type.clone(),
669            body: self.body.clone(),
670        }
671    }
672}
673
674impl RecordBuilder {
675    /// Set the body of the record under construction.
676    pub fn body(mut self, body: Vec<u8>) -> Self {
677        self.value.replace_body(body);
678
679        self
680    }
681
682    /// Set the record date header of the record under construction.
683    pub fn date(mut self, date: DateTime<Utc>) -> Self {
684        self.value.set_date(date);
685
686        self
687    }
688
689    /// Set the record ID header of the record under construction.
690    pub fn warc_id<S: Into<String>>(mut self, id: S) -> Self {
691        self.value.set_warc_id(id);
692
693        self
694    }
695
696    /// Set the WARC version of the record under construction.
697    pub fn version(mut self, version: String) -> Self {
698        self.value.set_warc_version(version);
699
700        self
701    }
702
703    /// Set the WARC record type header field of the record under construction.
704    pub fn warc_type(mut self, warc_type: RecordType) -> Self {
705        self.value.set_warc_type(warc_type);
706
707        self
708    }
709
710    /// Set the truncated type header of the record under construction.
711    pub fn truncated_type(mut self, trunc_type: TruncatedType) -> Self {
712        self.value.set_truncated_type(trunc_type);
713
714        self
715    }
716
717    /// Create or replace an arbitrary header of the record under construction.
718    pub fn header<V: Into<Vec<u8>>>(mut self, key: WarcHeader, value: V) -> Self {
719        self.broken_headers.insert(key.clone(), value.into());
720
721        let is_ok;
722        match std::str::from_utf8(self.broken_headers.get(&key).unwrap()) {
723            Ok(string) => {
724                if let Err(e) = self.value.set_header(key.clone(), string) {
725                    self.last_error = Some(e);
726                    is_ok = false;
727                } else {
728                    is_ok = true;
729                }
730            }
731            Err(_) => {
732                is_ok = false;
733                self.last_error = Some(WarcError::MalformedHeader(
734                    key.clone(),
735                    "not a UTF-8 string".to_string(),
736                ));
737            }
738        }
739
740        if is_ok {
741            self.broken_headers.remove(&key);
742        }
743
744        self
745    }
746
747    /// Build a raw record header from the data collected in this builder.
748    ///
749    /// A body set in this builder will be returned raw.
750    pub fn build_raw(self) -> (RawRecordHeader, Vec<u8>) {
751        let RecordBuilder {
752            value,
753            broken_headers,
754            ..
755        } = self;
756        let (mut headers, body) = value.into_raw_parts();
757        headers.as_mut().extend(broken_headers);
758
759        (headers, body)
760    }
761
762    /// Build a record from the data collected in this builder.
763    pub fn build(self) -> Result<Record<BufferedBody>, WarcError> {
764        let RecordBuilder {
765            value,
766            broken_headers,
767            last_error,
768        } = self;
769
770        if let Some(e) = last_error {
771            Err(e)
772        } else {
773            debug_assert!(
774                broken_headers.is_empty(),
775                "invariant violation: broken headers without last error"
776            );
777            Ok(value)
778        }
779    }
780}
781
#[cfg(test)]
mod record_tests {
    use crate::header::WarcHeader;
    use crate::{BufferedBody, Record, RecordType};

    use chrono::prelude::*;

    /// Build the default buffered record every test starts from.
    fn fresh_record() -> Record<BufferedBody> {
        Record::<BufferedBody>::default()
    }

    /// A default record is an empty version "1.0" resource record stamped with the current time.
    #[test]
    fn default() {
        let pause = std::time::Duration::from_millis(10);

        let before = Utc::now();
        std::thread::sleep(pause);
        let record = fresh_record();
        std::thread::sleep(pause);
        let after = Utc::now();

        assert_eq!(record.content_length(), 0);
        assert_eq!(record.warc_version(), "1.0");
        assert_eq!(record.warc_type(), &RecordType::Resource);
        assert!(record.date() > &before);
        assert!(record.date() < &after);
    }

    /// A clone compares equal to the record it was cloned from.
    #[test]
    fn impl_eq() {
        let original = fresh_record();
        let copy = original.clone();
        assert_eq!(original, copy);
    }

    /// Replacing and mutating the body keeps the content length in sync.
    #[test]
    fn body() {
        let mut record = fresh_record();
        assert_eq!(record.content_length(), 0);
        assert_eq!(record.body(), &[]);

        record.replace_body(b"hello!!".to_vec());
        assert_eq!(record.content_length(), 7);
        assert_eq!(record.body(), b"hello!!");

        record.body_mut().copy_from_slice(b"goodbye");
        assert_eq!(record.content_length(), 7);
        assert_eq!(record.body(), b"goodbye");
    }

    /// Setting a free-form header returns the previous value and stores the new one.
    #[test]
    fn add_header() {
        let mut record = fresh_record();
        assert!(record.header(WarcHeader::TargetURI).is_none());

        let first_set = record
            .set_header(WarcHeader::TargetURI, "https://www.rust-lang.org")
            .unwrap();
        assert!(first_set.is_none());
        assert_eq!(
            record.header(WarcHeader::TargetURI).unwrap(),
            "https://www.rust-lang.org"
        );

        let second_set = record
            .set_header(WarcHeader::TargetURI, "https://docs.rs")
            .unwrap();
        assert_eq!(second_set.unwrap(), "https://www.rust-lang.org");
        assert_eq!(
            record.header(WarcHeader::TargetURI).unwrap(),
            "https://docs.rs"
        );
    }

    /// Content-Length only accepts a value equal to the actual body length.
    #[test]
    fn set_header_override_content_length() {
        let mut record = fresh_record();
        assert_eq!(record.header(WarcHeader::ContentLength).unwrap(), "0");

        assert!(record
            .set_header(WarcHeader::ContentLength, "really short")
            .is_err());
        assert!(record.set_header(WarcHeader::ContentLength, "50").is_err());

        let echoed = record.set_header(WarcHeader::ContentLength, "0").unwrap();
        assert_eq!(echoed.unwrap(), "0");
    }

    /// WARC-Date rejects unparseable values and returns the old date when replaced.
    #[test]
    fn set_header_override_warc_date() {
        let mut record = fresh_record();
        let old_date = record.date().to_rfc3339_opts(SecondsFormat::Secs, true);
        assert_eq!(record.header(WarcHeader::Date).unwrap(), old_date);

        assert!(record.set_header(WarcHeader::Date, "yesterday").is_err());

        let previous = record
            .set_header(WarcHeader::Date, "2020-07-21T22:00:00Z")
            .unwrap();
        assert_eq!(previous.unwrap(), old_date);
        assert_eq!(
            record.header(WarcHeader::Date).unwrap(),
            "2020-07-21T22:00:00Z"
        );
    }

    /// WARC-Record-ID returns the old ID when replaced.
    #[test]
    fn set_header_override_warc_record_id() {
        let mut record = fresh_record();
        let old_id = record.warc_id().to_string();
        assert_eq!(
            record.header(WarcHeader::RecordID).unwrap(),
            old_id.as_str()
        );

        let previous = record
            .set_header(WarcHeader::RecordID, "urn:http:www.rust-lang.org")
            .unwrap();
        assert_eq!(previous.unwrap(), old_id.as_str());
        assert_eq!(
            record.header(WarcHeader::RecordID).unwrap(),
            "urn:http:www.rust-lang.org"
        );
    }

    /// WARC-Type returns the old type when replaced.
    #[test]
    fn set_header_override_warc_type() {
        let mut record = fresh_record();
        assert_eq!(record.header(WarcHeader::WarcType).unwrap(), "resource");

        let previous = record.set_header(WarcHeader::WarcType, "revisit").unwrap();
        assert_eq!(previous.unwrap(), "resource");
        assert_eq!(record.header(WarcHeader::WarcType).unwrap(), "revisit");
    }
}
919
/// Tests that assemble `RawRecordHeader` values by hand, convert them into
/// `Record<EmptyBody>`, and check how a raw header block is rendered.
#[cfg(test)]
mod raw_tests {
    use std::collections::HashMap;
    use std::convert::TryFrom;

    use crate::header::WarcHeader;
    use crate::{EmptyBody, RawRecordHeader, Record, RecordType};

    /// Wraps `entries` in a raw header block carrying version "1.0".
    fn raw_header(entries: HashMap<WarcHeader, Vec<u8>>) -> RawRecordHeader {
        RawRecordHeader {
            version: "1.0".to_owned(),
            headers: entries,
        }
    }

    /// The four headers that `verify_ok` expects to convert successfully.
    fn complete_entries() -> HashMap<WarcHeader, Vec<u8>> {
        let mut entries = HashMap::new();
        entries.insert(WarcHeader::WarcType, b"dunno".to_vec());
        entries.insert(WarcHeader::ContentLength, b"5".to_vec());
        entries.insert(
            WarcHeader::RecordID,
            b"<urn:test:basic-record:record-0>".to_vec(),
        );
        entries.insert(WarcHeader::Date, b"2020-07-08T02:52:55Z".to_vec());
        entries
    }

    /// Drops `missing` from the complete header set and reports whether the
    /// conversion into a record is rejected.
    fn conversion_fails_without(missing: &WarcHeader) -> bool {
        let mut entries = complete_entries();
        entries.remove(missing);
        Record::<EmptyBody>::try_from(raw_header(entries)).is_err()
    }

    /// A header block created from an empty map exposes no headers.
    #[test]
    fn create() {
        let raw = raw_header(HashMap::new());

        assert_eq!(raw.as_ref().len(), 0);
    }

    /// A header block created from one entry exposes exactly one header.
    #[test]
    fn create_with_headers() {
        let mut entries = HashMap::new();
        entries.insert(
            WarcHeader::WarcType,
            RecordType::WarcInfo.to_string().into_bytes(),
        );
        let raw = raw_header(entries);

        assert_eq!(raw.as_ref().len(), 1);
    }

    /// Type, length, id and date together are enough to build a record.
    #[test]
    fn verify_ok() {
        let raw = raw_header(complete_entries());

        assert!(Record::<EmptyBody>::try_from(raw).is_ok());
    }

    /// Conversion fails when the record type is absent.
    #[test]
    fn verify_missing_type() {
        assert!(conversion_fails_without(&WarcHeader::WarcType));
    }

    /// Conversion fails when the content length is absent.
    #[test]
    fn verify_missing_content_length() {
        assert!(conversion_fails_without(&WarcHeader::ContentLength));
    }

    /// Conversion fails when the record id is absent.
    #[test]
    fn verify_missing_record_id() {
        assert!(conversion_fails_without(&WarcHeader::RecordID));
    }

    /// Conversion fails when the date is absent.
    #[test]
    fn verify_missing_date() {
        assert!(conversion_fails_without(&WarcHeader::Date));
    }

    /// The rendered block is a version line, the header lines, then an empty
    /// line. Header lines are compared after sorting because the headers live
    /// in a `HashMap`, whose iteration order is not fixed.
    #[test]
    fn verify_display() {
        let mut entries = HashMap::new();
        entries.insert(WarcHeader::WarcType, b"dunno".to_vec());
        entries.insert(WarcHeader::Date, b"2024-01-01T00:00:00Z".to_vec());

        let rendered = raw_header(entries).to_string();
        let lines: Vec<&str> = rendered.lines().collect();

        // Peel off the first and last line; what remains are the headers.
        let (version_line, rest) = lines.split_first().unwrap();
        let (terminator, header_lines) = rest.split_last().unwrap();

        let mut seen = header_lines.to_vec();
        seen.sort_unstable();

        let mut wanted = vec!["warc-type: dunno", "warc-date: 2024-01-01T00:00:00Z"];
        wanted.sort_unstable();

        assert_eq!(*version_line, "WARC/1.0"); // WARC version
        assert_eq!(seen, wanted); // headers (sorted)
        assert_eq!(*terminator, ""); // empty line
    }
}
1080
/// Tests for `RecordBuilder`: the headers it produces through `build()` and
/// `build_raw()`, and how later `header(..)` calls override typed setters.
#[cfg(test)]
mod builder_tests {
    use std::convert::TryFrom;

    use crate::header::WarcHeader;
    use crate::{
        BufferedBody, EmptyBody, RawRecordHeader, Record, RecordBuilder, RecordType, TruncatedType,
    };

    /// Copies the value stored under `name`; panics (failing the test) when
    /// the header is absent.
    fn header_bytes(raw: &RawRecordHeader, name: &WarcHeader) -> Vec<u8> {
        raw.as_ref().get(name).unwrap().clone()
    }

    /// Value of `name` in the headers of the record produced by `build()` on
    /// a clone of `builder`. Panics if `build()` fails.
    fn built_value(builder: &RecordBuilder, name: &WarcHeader) -> Vec<u8> {
        let (raw, _body) = builder.clone().build().unwrap().into_raw_parts();
        header_bytes(&raw, name)
    }

    /// Value of `name` in the headers returned by `build_raw()` on a clone of
    /// `builder`.
    fn raw_value(builder: &RecordBuilder, name: &WarcHeader) -> Vec<u8> {
        let (raw, _body) = builder.clone().build_raw();
        header_bytes(&raw, name)
    }

    /// A default builder yields version "1.0", a zero content length and an
    /// empty body.
    #[test]
    fn default() {
        let (raw, payload) = RecordBuilder::default().build_raw();
        assert_eq!(raw.version, "1.0");
        assert_eq!(header_bytes(&raw, &WarcHeader::ContentLength), b"0");
        assert!(payload.is_empty());

        let record = RecordBuilder::default().build().unwrap();
        assert_eq!(record.content_length(), 0);
    }

    /// Supplying a body sets the content length to the body's size.
    #[test]
    fn default_with_body() {
        const BODY: &[u8] = b"abcdef";

        let (raw, payload) = RecordBuilder::default().body(BODY.to_vec()).build_raw();
        assert_eq!(raw.version, "1.0");
        assert_eq!(header_bytes(&raw, &WarcHeader::ContentLength), b"6");
        assert_eq!(payload.as_slice(), BODY);

        let record = RecordBuilder::default()
            .body(BODY.to_vec())
            .build()
            .unwrap();
        assert_eq!(record.content_length(), 6);
    }

    /// Raw output built from a builder and from its clone compares equal.
    #[test]
    fn impl_eq_raw() {
        let original = RecordBuilder::default();
        let from_clone = original.clone().build_raw();
        let from_original = original.build_raw();

        assert_eq!(from_clone, from_original);
    }

    /// Records built from a builder and from its clone compare equal.
    #[test]
    fn impl_eq_record() {
        let original = RecordBuilder::default();
        let from_clone = original.clone().build().unwrap();
        let from_original = original.build().unwrap();

        assert_eq!(from_clone, from_original);
    }

    /// A hand-made header block with one entry exposes exactly one header.
    #[test]
    fn create_with_headers() {
        let type_entry = (
            WarcHeader::WarcType,
            RecordType::WarcInfo.to_string().into_bytes(),
        );
        let raw = RawRecordHeader {
            version: "1.0".to_owned(),
            headers: std::iter::once(type_entry).collect(),
        };

        assert_eq!(raw.as_ref().len(), 1);
    }

    /// Type, length, id and date together are enough to build a record.
    #[test]
    fn verify_ok() {
        let mut raw = RawRecordHeader {
            version: "1.0".to_owned(),
            headers: Default::default(),
        };
        raw.headers
            .insert(WarcHeader::WarcType, b"dunno".to_vec());
        raw.headers
            .insert(WarcHeader::ContentLength, b"5".to_vec());
        raw.headers.insert(
            WarcHeader::RecordID,
            b"<urn:test:basic-record:record-0>".to_vec(),
        );
        raw.headers
            .insert(WarcHeader::Date, b"2020-07-08T02:52:55Z".to_vec());

        assert!(Record::<EmptyBody>::try_from(raw).is_ok());
    }

    /// The content length follows the body; an explicit mismatching header
    /// passes through `build_raw()` but makes `build()` fail.
    #[test]
    fn verify_content_length() {
        let builder = RecordBuilder::default().body(b"12345".to_vec());

        assert_eq!(built_value(&builder, &WarcHeader::ContentLength), b"5");
        assert_eq!(raw_value(&builder, &WarcHeader::ContentLength), b"5");

        // Declare a length that disagrees with the five-byte body.
        let mismatched = builder.header(WarcHeader::ContentLength, "1");
        assert_eq!(raw_value(&mismatched, &WarcHeader::ContentLength), b"1");

        assert!(mismatched.build().is_err());
    }

    /// Setting the type through a raw header or through `warc_type` gives
    /// equal records whose type header reads "request".
    #[test]
    fn verify_build_record_type() {
        let via_header = RecordBuilder::default().header(WarcHeader::WarcType, "request");
        let via_setter = via_header.clone().warc_type(RecordType::Request);

        let from_header = via_header.build().unwrap();
        let from_setter = via_setter.build().unwrap();
        assert_eq!(from_header, from_setter);

        let (raw, _body) = from_header.into_raw_parts();
        assert_eq!(header_bytes(&raw, &WarcHeader::WarcType), b"request");
    }

    /// A date given through `date` is emitted as-is, a later raw header
    /// replaces it, and a malformed raw date makes `build()` fail.
    #[test]
    fn verify_build_date() {
        const FIRST_DATE: &str = "2020-07-08T02:52:55Z";
        const SECOND_DATE: &[u8] = b"2020-07-18T02:12:45Z";

        let parsed = Record::<BufferedBody>::parse_record_date(FIRST_DATE).unwrap();
        let builder = RecordBuilder::default().date(parsed);

        assert_eq!(
            built_value(&builder, &WarcHeader::Date),
            FIRST_DATE.as_bytes()
        );
        assert_eq!(raw_value(&builder, &WarcHeader::Date), FIRST_DATE.as_bytes());

        // A raw header set afterwards takes the place of the typed date.
        let builder = builder.header(WarcHeader::Date, SECOND_DATE.to_vec());
        assert_eq!(built_value(&builder, &WarcHeader::Date), SECOND_DATE);
        assert_eq!(raw_value(&builder, &WarcHeader::Date), SECOND_DATE);

        let builder = builder.header(WarcHeader::Date, b"not-a-dayTor:a:time".to_vec());
        assert!(builder.build().is_err());
    }

    /// An id given through `warc_id` is emitted as-is and a later raw header
    /// replaces it.
    #[test]
    fn verify_build_record_id() {
        const FIRST_ID: &[u8] = b"<urn:test:verify-build-id:record-0>";
        const SECOND_ID: &[u8] = b"<urn:test:verify-build-id:record-1>";

        let first_as_text = std::str::from_utf8(FIRST_ID).unwrap();
        let builder = RecordBuilder::default().warc_id(first_as_text);

        assert_eq!(built_value(&builder, &WarcHeader::RecordID), FIRST_ID);
        assert_eq!(raw_value(&builder, &WarcHeader::RecordID), FIRST_ID);

        let builder = builder.header(WarcHeader::RecordID, SECOND_ID.to_vec());
        assert_eq!(built_value(&builder, &WarcHeader::RecordID), SECOND_ID);
        assert_eq!(raw_value(&builder, &WarcHeader::RecordID), SECOND_ID);
    }

    /// The truncation reason can be set through `truncated_type` or as a raw
    /// header; a value outside the two named here still builds and is
    /// preserved verbatim.
    #[test]
    fn verify_build_truncated_type() {
        let builder = RecordBuilder::default().truncated_type(TruncatedType::Length);
        assert_eq!(built_value(&builder, &WarcHeader::Truncated), b"length");
        assert_eq!(raw_value(&builder, &WarcHeader::Truncated), b"length");

        let builder = builder.header(WarcHeader::Truncated, "disconnect");
        assert_eq!(built_value(&builder, &WarcHeader::Truncated), b"disconnect");
        assert_eq!(raw_value(&builder, &WarcHeader::Truncated), b"disconnect");

        let builder = builder.header(WarcHeader::Truncated, "foreign-intervention");
        assert_eq!(
            built_value(&builder, &WarcHeader::Truncated),
            b"foreign-intervention"
        );
        assert_eq!(
            raw_value(&builder, &WarcHeader::Truncated),
            b"foreign-intervention"
        );
    }
}