1use chrono::prelude::*;
2use std::borrow::Cow;
3use std::collections::HashMap;
4use std::fmt;
5use std::io::Read;
6
7use uuid::Uuid;
8
9use crate::header::WarcHeader;
10use crate::record_type::RecordType;
11use crate::truncated_type::TruncatedType;
12use crate::Error as WarcError;
13
14use streaming_trait::BodyKind;
15pub use streaming_trait::{BufferedBody, EmptyBody, StreamingBody};
16
mod streaming_trait {
    use std::convert::TryFrom;
    use std::io::{self, Read};

    /// Common interface over the different ways a record body can be held.
    pub trait BodyKind {
        /// Number of body bytes this body currently accounts for.
        fn content_length(&self) -> u64;
    }

    /// A body that is held entirely in memory.
    #[derive(Clone, Debug, PartialEq)]
    pub struct BufferedBody(pub Vec<u8>);

    impl BodyKind for BufferedBody {
        fn content_length(&self) -> u64 {
            self.0.len() as u64
        }
    }

    /// A body that is read on demand from a borrowed stream.
    ///
    /// Field `0` is the underlying reader; field `1` is the number of bytes
    /// that may still be read from it. The counter is borrowed, so its owner
    /// observes how much has been consumed.
    pub struct StreamingBody<'t, T: Read + 't>(&'t mut T, &'t mut u64);

    impl<'t, T: Read + 't> StreamingBody<'t, T> {
        /// Wrap `stream`, allowing at most `*max_len` bytes to be read from it.
        pub(crate) fn new(stream: &'t mut T, max_len: &'t mut u64) -> StreamingBody<'t, T> {
            StreamingBody(stream, max_len)
        }

        /// Number of bytes that may still be read.
        pub(crate) fn len(&self) -> u64 {
            *self.1
        }
    }

    impl<'t, T: Read + 't> BodyKind for StreamingBody<'t, T> {
        /// Reports the bytes remaining, not the bytes already consumed.
        fn content_length(&self) -> u64 {
            *self.1
        }
    }

    impl<'t, T: Read + 't> Read for StreamingBody<'t, T> {
        /// Read from the underlying stream without exceeding the remaining
        /// byte budget, and decrease the budget by the number of bytes read.
        fn read(&mut self, data: &mut [u8]) -> io::Result<usize> {
            // A plain `as usize` cast would truncate on targets where usize is
            // narrower than u64 (e.g. 2^32 would become 0 and look like EOF);
            // saturate instead, since the buffer length caps the read anyway.
            let remaining = usize::try_from(*self.1).unwrap_or(usize::MAX);
            let max_read = data.len().min(remaining);
            let n = self.0.read(&mut data[..max_read])?;
            // A conforming reader returns n <= max_read <= *self.1.
            *self.1 -= n as u64;
            Ok(n)
        }
    }

    /// Marker for a record that carries no body.
    #[derive(Clone, Copy, Debug)]
    pub struct EmptyBody();

    impl BodyKind for EmptyBody {
        fn content_length(&self) -> u64 {
            0
        }
    }
}
69
70#[derive(Clone, Debug, PartialEq)]
76pub struct RawRecordHeader {
77 pub version: String,
79 pub headers: HashMap<WarcHeader, Vec<u8>>,
81}
82
83impl AsRef<HashMap<WarcHeader, Vec<u8>>> for RawRecordHeader {
84 fn as_ref(&self) -> &HashMap<WarcHeader, Vec<u8>> {
85 &self.headers
86 }
87}
88
89impl AsMut<HashMap<WarcHeader, Vec<u8>>> for RawRecordHeader {
90 fn as_mut(&mut self) -> &mut HashMap<WarcHeader, Vec<u8>> {
91 &mut self.headers
92 }
93}
94
95impl std::convert::TryFrom<RawRecordHeader> for Record<EmptyBody> {
96 type Error = WarcError;
97 fn try_from(mut headers: RawRecordHeader) -> Result<Self, WarcError> {
98 headers
99 .as_mut()
100 .remove(&WarcHeader::ContentLength)
101 .ok_or(WarcError::MissingHeader(WarcHeader::ContentLength))
102 .and_then(|vec| {
103 String::from_utf8(vec).map_err(|_| {
104 WarcError::MalformedHeader(WarcHeader::Date, "not a UTF-8 string".to_string())
105 })
106 })?;
107
108 let record_type = headers
109 .as_mut()
110 .remove(&WarcHeader::WarcType)
111 .ok_or(WarcError::MissingHeader(WarcHeader::WarcType))
112 .and_then(|vec| {
113 String::from_utf8(vec).map_err(|_| {
114 WarcError::MalformedHeader(
115 WarcHeader::WarcType,
116 "not a UTF-8 string".to_string(),
117 )
118 })
119 })
120 .map(|rtype| rtype.into())?;
121
122 let record_id = headers
123 .as_mut()
124 .remove(&WarcHeader::RecordID)
125 .ok_or(WarcError::MissingHeader(WarcHeader::RecordID))
126 .and_then(|vec| {
127 String::from_utf8(vec).map_err(|_| {
128 WarcError::MalformedHeader(WarcHeader::Date, "not a UTF-8 string".to_string())
129 })
130 })?;
131
132 let record_date = headers
133 .as_mut()
134 .remove(&WarcHeader::Date)
135 .ok_or(WarcError::MissingHeader(WarcHeader::Date))
136 .and_then(|vec| {
137 String::from_utf8(vec).map_err(|_| {
138 WarcError::MalformedHeader(WarcHeader::Date, "not a UTF-8 string".to_string())
139 })
140 })
141 .and_then(|date| Record::<BufferedBody>::parse_record_date(&date))?;
142
143 Ok(Record {
144 headers,
145 record_date,
146 record_id,
147 record_type,
148 body: EmptyBody(),
149 ..Default::default()
150 })
151 }
152}
153
154impl std::fmt::Display for RawRecordHeader {
155 fn fmt(&self, w: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
156 writeln!(w, "WARC/{}", self.version)?;
157 for (key, value) in self.as_ref().iter() {
158 writeln!(w, "{}: {}", key, String::from_utf8_lossy(value))?;
159 }
160 writeln!(w)?;
161
162 Ok(())
163 }
164}
165
166#[derive(Default)]
168pub struct RecordBuilder {
169 value: Record<BufferedBody>,
170 broken_headers: HashMap<WarcHeader, Vec<u8>>,
171 last_error: Option<WarcError>,
172}
173
174impl Clone for RecordBuilder {
177 fn clone(&self) -> Self {
178 let err: Option<&WarcError> = self.last_error.as_ref();
179 let last_error: Option<WarcError> = err.map(|err| match err {
180 WarcError::ReadData(e) => WarcError::ReadData(std::io::Error::from(e.kind())),
181 WarcError::ParseHeaders(e) => WarcError::ParseHeaders(e.clone()),
182 WarcError::MissingHeader(e) => WarcError::MissingHeader(e.clone()),
183 WarcError::MalformedHeader(h, e) => WarcError::MalformedHeader(h.clone(), e.clone()),
184 WarcError::ReadOverflow => WarcError::ReadOverflow,
185 WarcError::UnexpectedEOB => WarcError::UnexpectedEOB,
186 });
187 RecordBuilder {
188 value: self.value.clone(),
189 broken_headers: self.broken_headers.clone(),
190 last_error,
191 }
192 }
193}
194
195#[derive(Debug, PartialEq)]
207pub struct Record<T: BodyKind> {
208 headers: RawRecordHeader,
210 record_date: DateTime<Utc>,
211 record_id: String,
212 record_type: RecordType,
213 truncated_type: Option<TruncatedType>,
214 body: T,
215}
216
217impl<T: BodyKind> Record<T> {
218 pub fn new() -> Record<EmptyBody> {
228 Record::default()
229 }
230
231 pub fn with_body<B: Into<Vec<u8>>>(body: B) -> Record<BufferedBody> {
241 Record {
242 body: BufferedBody(body.into()),
243 ..Record::default()
244 }
245 }
246
247 pub fn generate_record_id() -> String {
265 format!("<{}>", Uuid::new_v4().to_urn())
266 }
267
268 fn parse_content_length(len: &str) -> Result<u64, WarcError> {
269 (len).parse::<u64>().map_err(|_| {
270 WarcError::MalformedHeader(
271 WarcHeader::ContentLength,
272 "not an integer between 0 and 2^64-1".to_string(),
273 )
274 })
275 }
276
277 fn parse_record_date(date: &str) -> Result<DateTime<Utc>, WarcError> {
278 DateTime::parse_from_rfc3339(date)
279 .map_err(|_| {
280 WarcError::MalformedHeader(
281 WarcHeader::Date,
282 "not an ISO 8601 datestamp".to_string(),
283 )
284 })
285 .map(|date| date.into())
286 }
287
288 pub fn warc_version(&self) -> &str {
290 &self.headers.version
291 }
292
293 pub fn set_warc_version<S: Into<String>>(&mut self, id: S) {
295 self.headers.version = id.into();
296 }
297
298 pub fn warc_id(&self) -> &str {
300 &self.record_id
301 }
302
303 pub fn set_warc_id<S: Into<String>>(&mut self, id: S) {
307 self.record_id = id.into();
308 }
309
310 pub fn warc_type(&self) -> &RecordType {
312 &self.record_type
313 }
314
315 pub fn set_warc_type(&mut self, type_: RecordType) {
317 self.record_type = type_;
318 }
319
320 pub fn date(&self) -> &DateTime<Utc> {
322 &self.record_date
323 }
324
325 pub fn set_date(&mut self, date: DateTime<Utc>) {
327 self.record_date = date;
328 }
329
330 pub fn truncated_type(&self) -> &Option<TruncatedType> {
332 &self.truncated_type
333 }
334
335 pub fn set_truncated_type(&mut self, truncated_type: TruncatedType) {
337 self.truncated_type = Some(truncated_type);
338 }
339
340 pub fn clear_truncated_type(&mut self) {
342 self.truncated_type = None;
343 }
344
345 pub fn header(&self, header: WarcHeader) -> Option<Cow<'_, str>> {
347 match &header {
348 WarcHeader::ContentLength => {
349 Some(Cow::Owned(format!("{}", self.body.content_length())))
350 }
351 WarcHeader::RecordID => Some(Cow::Borrowed(self.warc_id())),
352 WarcHeader::WarcType => Some(Cow::Owned(self.record_type.to_string())),
353 WarcHeader::Date => Some(Cow::Owned(
354 self.date().to_rfc3339_opts(SecondsFormat::Secs, true),
355 )),
356 _ => self
357 .headers
358 .as_ref()
359 .get(&header)
360 .map(|h| Cow::Owned(String::from_utf8(h.clone()).unwrap())),
361 }
362 }
363
364 pub fn set_header<V>(
371 &mut self,
372 header: WarcHeader,
373 value: V,
374 ) -> Result<Option<Cow<'_, str>>, WarcError>
375 where
376 V: Into<String>,
377 {
378 let value = value.into();
379 match &header {
380 WarcHeader::Date => {
381 let old_date = std::mem::replace(
382 &mut self.record_date,
383 Record::<T>::parse_record_date(&value)?,
384 );
385 Ok(Some(Cow::Owned(
386 old_date.to_rfc3339_opts(SecondsFormat::Secs, true),
387 )))
388 }
389 WarcHeader::RecordID => {
390 let old_id = std::mem::replace(&mut self.record_id, value);
391 Ok(Some(Cow::Owned(old_id)))
392 }
393 WarcHeader::WarcType => {
394 let old_type = std::mem::replace(&mut self.record_type, RecordType::from(&value));
395 Ok(Some(Cow::Owned(old_type.to_string())))
396 }
397 WarcHeader::Truncated => {
398 let old_type = self.truncated_type.take();
399 self.truncated_type = Some(TruncatedType::from(&value));
400 Ok(old_type.map(|old| (Cow::Owned(old.to_string()))))
401 }
402 WarcHeader::ContentLength => {
403 if Record::<T>::parse_content_length(&value)? != self.body.content_length() {
404 Err(WarcError::MalformedHeader(
405 WarcHeader::ContentLength,
406 "content length != body size".to_string(),
407 ))
408 } else {
409 Ok(Some(Cow::Owned(value)))
410 }
411 }
412 _ => Ok(self
413 .headers
414 .as_mut()
415 .insert(header, Vec::from(value))
416 .map(|v| Cow::Owned(String::from_utf8(v).unwrap()))),
417 }
418 }
419
420 pub fn content_length(&self) -> u64 {
424 self.body.content_length()
425 }
426}
427
428impl Record<EmptyBody> {
429 pub fn add_body<B: Into<Vec<u8>>>(self, body: B) -> Record<BufferedBody> {
431 let Self {
432 headers,
433 record_date,
434 record_id,
435 record_type,
436 truncated_type,
437 body: _,
438 } = self;
439 Record {
440 headers,
441 record_date,
442 record_id,
443 record_type,
444 truncated_type,
445 body: BufferedBody(body.into()),
446 }
447 }
448
449 pub fn add_fixed_stream<'r, R: Read + 'r>(
452 self,
453 stream: &'r mut R,
454 len: &'r mut u64,
455 ) -> std::io::Result<Record<StreamingBody<'r, R>>> {
456 let Record {
457 headers,
458 record_date,
459 record_id,
460 record_type,
461 truncated_type,
462 ..
463 } = self;
464
465 Ok(Record {
466 headers,
467 record_date,
468 record_id,
469 record_type,
470 truncated_type,
471 body: StreamingBody::new(stream, len),
472 })
473 }
474}
475
476impl Record<BufferedBody> {
477 pub fn strip_body(self) -> Record<EmptyBody> {
479 let Self {
480 headers,
481 record_date,
482 record_id,
483 record_type,
484 truncated_type,
485 body: _,
486 } = self;
487 Record {
488 headers,
489 record_date,
490 record_id,
491 record_type,
492 truncated_type,
493 body: EmptyBody(),
494 }
495 }
496
497 pub fn body(&self) -> &[u8] {
499 self.body.0.as_slice()
500 }
501
502 pub fn body_mut(&mut self) -> &mut [u8] {
507 self.body.0.as_mut_slice()
508 }
509
510 pub fn replace_body<V: Into<Vec<u8>>>(&mut self, new_body: V) {
512 let _: Vec<u8> = std::mem::replace(&mut self.body.0, new_body.into());
513 }
514
515 pub fn into_raw_parts(self) -> (RawRecordHeader, Vec<u8>) {
517 let Record {
518 mut headers,
519 record_date,
520 record_id,
521 record_type,
522 body,
523 ..
524 } = self;
525 let insert1 = headers.as_mut().insert(
526 WarcHeader::ContentLength,
527 format!("{}", body.0.len()).into(),
528 );
529 let insert2 = headers
530 .as_mut()
531 .insert(WarcHeader::WarcType, record_type.to_string().into());
532 let insert3 = headers
533 .as_mut()
534 .insert(WarcHeader::RecordID, record_id.into());
535 let insert4 = if let Some(ref truncated_type) = self.truncated_type {
536 headers
537 .as_mut()
538 .insert(WarcHeader::Truncated, truncated_type.to_string().into())
539 } else {
540 None
541 };
542 let insert5 = headers.as_mut().insert(
543 WarcHeader::Date,
544 record_date
545 .to_rfc3339_opts(SecondsFormat::Secs, true)
546 .into(),
547 );
548
549 debug_assert!(
550 insert1.is_none()
551 && insert2.is_none()
552 && insert3.is_none()
553 && insert4.is_none()
554 && insert5.is_none(),
555 "invariant violation: raw struct contains externally stored fields"
556 );
557
558 (headers, body.0)
559 }
560}
561
562impl<'t, T: Read + 't> Record<StreamingBody<'t, T>> {
563 pub fn into_buffered(self) -> std::io::Result<Record<BufferedBody>> {
570 let Record {
571 headers,
572 record_date,
573 record_id,
574 record_type,
575 truncated_type,
576 mut body,
577 } = self;
578
579 let buf = {
580 let mut body_vec = Vec::with_capacity(body.len() as usize);
581 body.read_to_end(&mut body_vec)?;
582 body_vec
583 };
584
585 let empty_record = Record {
586 headers,
587 record_date,
588 record_id,
589 record_type,
590 truncated_type,
591 ..Default::default()
592 };
593
594 Ok(empty_record.add_body(buf))
595 }
596}
597
598impl<'t, T: Read + 't> Read for Record<StreamingBody<'t, T>> {
599 fn read(&mut self, dst: &mut [u8]) -> Result<usize, std::io::Error> {
600 self.body.read(dst)
601 }
602}
603
604impl Default for Record<BufferedBody> {
605 fn default() -> Record<BufferedBody> {
606 Record {
607 headers: RawRecordHeader {
608 version: "1.0".to_string(),
609 headers: HashMap::new(),
610 },
611 record_date: Utc::now(),
612 record_id: Record::<BufferedBody>::generate_record_id(),
613 record_type: RecordType::Resource,
614 truncated_type: None,
615 body: BufferedBody(vec![]),
616 }
617 }
618}
619
620impl Default for Record<EmptyBody> {
621 fn default() -> Record<EmptyBody> {
622 Record {
623 headers: RawRecordHeader {
624 version: "1.0".to_string(),
625 headers: HashMap::new(),
626 },
627 record_date: Utc::now(),
628 record_id: Record::<EmptyBody>::generate_record_id(),
629 record_type: RecordType::Resource,
630 truncated_type: None,
631 body: EmptyBody(),
632 }
633 }
634}
635
636impl fmt::Display for Record<BufferedBody> {
637 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
638 let (headers, body) = self.clone().into_raw_parts();
639 write!(f, "Record({}, {:?})", headers, body)
640 }
641}
642impl fmt::Display for Record<EmptyBody> {
643 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
644 write!(f, "Record({:?}, Empty)", self.headers)
645 }
646}
647
648impl Clone for Record<EmptyBody> {
649 fn clone(&self) -> Self {
650 Record {
651 headers: self.headers.clone(),
652 record_type: self.record_type.clone(),
653 record_date: self.record_date,
654 record_id: self.record_id.clone(),
655 truncated_type: self.truncated_type.clone(),
656 body: self.body,
657 }
658 }
659}
660
661impl Clone for Record<BufferedBody> {
662 fn clone(&self) -> Self {
663 Record {
664 headers: self.headers.clone(),
665 record_type: self.record_type.clone(),
666 record_date: self.record_date,
667 record_id: self.record_id.clone(),
668 truncated_type: self.truncated_type.clone(),
669 body: self.body.clone(),
670 }
671 }
672}
673
674impl RecordBuilder {
675 pub fn body(mut self, body: Vec<u8>) -> Self {
677 self.value.replace_body(body);
678
679 self
680 }
681
682 pub fn date(mut self, date: DateTime<Utc>) -> Self {
684 self.value.set_date(date);
685
686 self
687 }
688
689 pub fn warc_id<S: Into<String>>(mut self, id: S) -> Self {
691 self.value.set_warc_id(id);
692
693 self
694 }
695
696 pub fn version(mut self, version: String) -> Self {
698 self.value.set_warc_version(version);
699
700 self
701 }
702
703 pub fn warc_type(mut self, warc_type: RecordType) -> Self {
705 self.value.set_warc_type(warc_type);
706
707 self
708 }
709
710 pub fn truncated_type(mut self, trunc_type: TruncatedType) -> Self {
712 self.value.set_truncated_type(trunc_type);
713
714 self
715 }
716
717 pub fn header<V: Into<Vec<u8>>>(mut self, key: WarcHeader, value: V) -> Self {
719 self.broken_headers.insert(key.clone(), value.into());
720
721 let is_ok;
722 match std::str::from_utf8(self.broken_headers.get(&key).unwrap()) {
723 Ok(string) => {
724 if let Err(e) = self.value.set_header(key.clone(), string) {
725 self.last_error = Some(e);
726 is_ok = false;
727 } else {
728 is_ok = true;
729 }
730 }
731 Err(_) => {
732 is_ok = false;
733 self.last_error = Some(WarcError::MalformedHeader(
734 key.clone(),
735 "not a UTF-8 string".to_string(),
736 ));
737 }
738 }
739
740 if is_ok {
741 self.broken_headers.remove(&key);
742 }
743
744 self
745 }
746
747 pub fn build_raw(self) -> (RawRecordHeader, Vec<u8>) {
751 let RecordBuilder {
752 value,
753 broken_headers,
754 ..
755 } = self;
756 let (mut headers, body) = value.into_raw_parts();
757 headers.as_mut().extend(broken_headers);
758
759 (headers, body)
760 }
761
762 pub fn build(self) -> Result<Record<BufferedBody>, WarcError> {
764 let RecordBuilder {
765 value,
766 broken_headers,
767 last_error,
768 } = self;
769
770 if let Some(e) = last_error {
771 Err(e)
772 } else {
773 debug_assert!(
774 broken_headers.is_empty(),
775 "invariant violation: broken headers without last error"
776 );
777 Ok(value)
778 }
779 }
780}
781
#[cfg(test)]
mod record_tests {
    use crate::header::WarcHeader;
    use crate::{BufferedBody, Record, RecordType};

    use chrono::prelude::*;

    /// A default record has an empty body, version 1.0, type `resource`, and
    /// a date taken while it was constructed.
    #[test]
    fn default() {
        // Inclusive bounds make the check valid even when the clock does not
        // advance between the calls, so no sleeping is required.
        let before = Utc::now();
        let record = Record::<BufferedBody>::default();
        let after = Utc::now();
        assert_eq!(record.content_length(), 0);
        assert_eq!(record.warc_version(), "1.0");
        assert_eq!(record.warc_type(), &RecordType::Resource);
        assert!(record.date() >= &before);
        assert!(record.date() <= &after);
    }

    /// A clone compares equal to the record it was cloned from.
    #[test]
    fn impl_eq() {
        let record1 = Record::<BufferedBody>::default();
        let record2 = record1.clone();
        assert_eq!(record1, record2);
    }

    /// Replacing and mutating the body is reflected in the content length.
    #[test]
    fn body() {
        let mut record = Record::<BufferedBody>::default();
        assert_eq!(record.content_length(), 0);
        assert_eq!(record.body(), &[]);
        record.replace_body(b"hello!!".to_vec());
        assert_eq!(record.content_length(), 7);
        assert_eq!(record.body(), b"hello!!");
        record.body_mut().copy_from_slice(b"goodbye");
        assert_eq!(record.content_length(), 7);
        assert_eq!(record.body(), b"goodbye");
    }

    /// Setting an ordinary header returns the previous value, if any.
    #[test]
    fn add_header() {
        let mut record = Record::<BufferedBody>::default();
        assert!(record.header(WarcHeader::TargetURI).is_none());
        assert!(record
            .set_header(WarcHeader::TargetURI, "https://www.rust-lang.org")
            .unwrap()
            .is_none());
        assert_eq!(
            record.header(WarcHeader::TargetURI).unwrap(),
            "https://www.rust-lang.org"
        );
        assert_eq!(
            record
                .set_header(WarcHeader::TargetURI, "https://docs.rs")
                .unwrap()
                .unwrap(),
            "https://www.rust-lang.org"
        );
        assert_eq!(
            record.header(WarcHeader::TargetURI).unwrap(),
            "https://docs.rs"
        );
    }

    /// The content length can only be "set" to the actual body size.
    #[test]
    fn set_header_override_content_length() {
        let mut record = Record::<BufferedBody>::default();
        assert_eq!(record.header(WarcHeader::ContentLength).unwrap(), "0");
        assert!(record
            .set_header(WarcHeader::ContentLength, "really short")
            .is_err());
        assert!(record.set_header(WarcHeader::ContentLength, "50").is_err());
        assert_eq!(
            record
                .set_header(WarcHeader::ContentLength, "0")
                .unwrap()
                .unwrap(),
            "0"
        );
    }

    /// Setting the date validates the value and returns the old date.
    #[test]
    fn set_header_override_warc_date() {
        let mut record = Record::<BufferedBody>::default();
        let old_date = record.date().to_rfc3339_opts(SecondsFormat::Secs, true);
        assert_eq!(record.header(WarcHeader::Date).unwrap(), old_date);
        assert!(record.set_header(WarcHeader::Date, "yesterday").is_err());
        assert_eq!(
            record
                .set_header(WarcHeader::Date, "2020-07-21T22:00:00Z")
                .unwrap()
                .unwrap(),
            old_date
        );
        assert_eq!(
            record.header(WarcHeader::Date).unwrap(),
            "2020-07-21T22:00:00Z"
        );
    }

    /// Setting the record id returns the old id.
    #[test]
    fn set_header_override_warc_record_id() {
        let mut record = Record::<BufferedBody>::default();
        let old_id = record.warc_id().to_string();
        assert_eq!(
            record.header(WarcHeader::RecordID).unwrap(),
            old_id.as_str()
        );
        assert_eq!(
            record
                .set_header(WarcHeader::RecordID, "urn:http:www.rust-lang.org")
                .unwrap()
                .unwrap(),
            old_id.as_str()
        );
        assert_eq!(
            record.header(WarcHeader::RecordID).unwrap(),
            "urn:http:www.rust-lang.org"
        );
    }

    /// Setting the record type returns the old type.
    #[test]
    fn set_header_override_warc_type() {
        let mut record = Record::<BufferedBody>::default();
        assert_eq!(record.header(WarcHeader::WarcType).unwrap(), "resource");
        assert_eq!(
            record
                .set_header(WarcHeader::WarcType, "revisit")
                .unwrap()
                .unwrap(),
            "resource"
        );
        assert_eq!(record.header(WarcHeader::WarcType).unwrap(), "revisit");
    }
}
919
#[cfg(test)]
mod raw_tests {
    use crate::header::WarcHeader;
    use crate::{EmptyBody, RawRecordHeader, Record, RecordType};

    use std::collections::HashMap;
    use std::convert::TryFrom;

    /// A version-1.0 raw header holding exactly `entries`.
    fn raw_header(entries: Vec<(WarcHeader, Vec<u8>)>) -> RawRecordHeader {
        RawRecordHeader {
            version: "1.0".to_owned(),
            headers: entries.into_iter().collect(),
        }
    }

    // One fixture entry per header required by the conversion.

    fn type_entry() -> (WarcHeader, Vec<u8>) {
        (WarcHeader::WarcType, b"dunno".to_vec())
    }

    fn length_entry() -> (WarcHeader, Vec<u8>) {
        (WarcHeader::ContentLength, b"5".to_vec())
    }

    fn id_entry() -> (WarcHeader, Vec<u8>) {
        (
            WarcHeader::RecordID,
            b"<urn:test:basic-record:record-0>".to_vec(),
        )
    }

    fn date_entry() -> (WarcHeader, Vec<u8>) {
        (WarcHeader::Date, b"2020-07-08T02:52:55Z".to_vec())
    }

    #[test]
    fn create() {
        let headers = RawRecordHeader {
            version: "1.0".to_owned(),
            headers: HashMap::new(),
        };

        assert_eq!(headers.as_ref().len(), 0);
    }

    #[test]
    fn create_with_headers() {
        let headers = raw_header(vec![(
            WarcHeader::WarcType,
            RecordType::WarcInfo.to_string().into_bytes(),
        )]);

        assert_eq!(headers.as_ref().len(), 1);
    }

    #[test]
    fn verify_ok() {
        let headers = raw_header(vec![
            type_entry(),
            length_entry(),
            id_entry(),
            date_entry(),
        ]);

        assert!(Record::<EmptyBody>::try_from(headers).is_ok());
    }

    #[test]
    fn verify_missing_type() {
        let headers = raw_header(vec![length_entry(), id_entry(), date_entry()]);

        assert!(Record::<EmptyBody>::try_from(headers).is_err());
    }

    #[test]
    fn verify_missing_content_length() {
        let headers = raw_header(vec![type_entry(), id_entry(), date_entry()]);

        assert!(Record::<EmptyBody>::try_from(headers).is_err());
    }

    #[test]
    fn verify_missing_record_id() {
        let headers = raw_header(vec![type_entry(), length_entry(), date_entry()]);

        assert!(Record::<EmptyBody>::try_from(headers).is_err());
    }

    #[test]
    fn verify_missing_date() {
        let headers = raw_header(vec![type_entry(), length_entry(), id_entry()]);

        assert!(Record::<EmptyBody>::try_from(headers).is_err());
    }

    #[test]
    fn verify_display() {
        let output = raw_header(vec![
            type_entry(),
            (WarcHeader::Date, b"2024-01-01T00:00:00Z".to_vec()),
        ])
        .to_string();
        let lines: Vec<&str> = output.lines().collect();

        // The version line comes first and a blank line comes last.
        assert_eq!(lines.first(), Some(&"WARC/1.0"));
        assert_eq!(lines.last(), Some(&""));

        // The map's iteration order is unspecified, so compare sorted lines.
        let mut header_lines = lines[1..lines.len() - 1].to_vec();
        header_lines.sort();
        assert_eq!(
            header_lines,
            ["warc-date: 2024-01-01T00:00:00Z", "warc-type: dunno"]
        );
    }
}
1080
#[cfg(test)]
mod builder_tests {
    use crate::header::WarcHeader;
    use crate::{
        BufferedBody, EmptyBody, RawRecordHeader, Record, RecordBuilder, RecordType, TruncatedType,
    };

    use std::convert::TryFrom;

    /// Value of `key` in the raw headers produced by `build_raw` on a copy of
    /// `builder`.
    fn raw_value(builder: &RecordBuilder, key: WarcHeader) -> Option<Vec<u8>> {
        builder.clone().build_raw().0.as_ref().get(&key).cloned()
    }

    /// Value of `key` in the raw headers of the record produced by `build` on
    /// a copy of `builder`; panics if `build` fails.
    fn built_value(builder: &RecordBuilder, key: WarcHeader) -> Option<Vec<u8>> {
        builder
            .clone()
            .build()
            .unwrap()
            .into_raw_parts()
            .0
            .as_ref()
            .get(&key)
            .cloned()
    }

    #[test]
    fn default() {
        let (headers, body) = RecordBuilder::default().build_raw();
        assert_eq!(headers.version, "1.0".to_string());
        assert_eq!(
            headers.as_ref().get(&WarcHeader::ContentLength),
            Some(&b"0".to_vec())
        );
        assert!(body.is_empty());

        let record = RecordBuilder::default().build().unwrap();
        assert_eq!(record.content_length(), 0);
    }

    #[test]
    fn default_with_body() {
        let with_body = || RecordBuilder::default().body(b"abcdef".to_vec());

        let (headers, body) = with_body().build_raw();
        assert_eq!(headers.version, "1.0".to_string());
        assert_eq!(
            headers.as_ref().get(&WarcHeader::ContentLength),
            Some(&b"6".to_vec())
        );
        assert_eq!(body.as_slice(), b"abcdef");

        assert_eq!(with_body().build().unwrap().content_length(), 6);
    }

    #[test]
    fn impl_eq_raw() {
        let builder = RecordBuilder::default();
        let from_clone = builder.clone().build_raw();
        let from_original = builder.build_raw();
        assert_eq!(from_clone, from_original);
    }

    #[test]
    fn impl_eq_record() {
        let builder = RecordBuilder::default();
        let from_clone = builder.clone().build().unwrap();
        let from_original = builder.build().unwrap();
        assert_eq!(from_clone, from_original);
    }

    #[test]
    fn create_with_headers() {
        let type_entry = (
            WarcHeader::WarcType,
            RecordType::WarcInfo.to_string().into_bytes(),
        );
        let headers = RawRecordHeader {
            version: "1.0".to_owned(),
            headers: std::iter::once(type_entry).collect(),
        };

        assert_eq!(headers.as_ref().len(), 1);
    }

    #[test]
    fn verify_ok() {
        let entries = vec![
            (WarcHeader::WarcType, b"dunno".to_vec()),
            (WarcHeader::ContentLength, b"5".to_vec()),
            (
                WarcHeader::RecordID,
                b"<urn:test:basic-record:record-0>".to_vec(),
            ),
            (WarcHeader::Date, b"2020-07-08T02:52:55Z".to_vec()),
        ];
        let headers = RawRecordHeader {
            version: "1.0".to_owned(),
            headers: entries.into_iter().collect(),
        };

        assert!(Record::<EmptyBody>::try_from(headers).is_ok());
    }

    #[test]
    fn verify_content_length() {
        let builder = RecordBuilder::default().body(b"12345".to_vec());
        assert_eq!(
            built_value(&builder, WarcHeader::ContentLength),
            Some(b"5".to_vec())
        );
        assert_eq!(
            raw_value(&builder, WarcHeader::ContentLength),
            Some(b"5".to_vec())
        );

        // A mismatching length is kept verbatim in the raw output but makes
        // the checked build fail.
        let builder = builder.header(WarcHeader::ContentLength, "1");
        assert_eq!(
            raw_value(&builder, WarcHeader::ContentLength),
            Some(b"1".to_vec())
        );
        assert!(builder.build().is_err());
    }

    #[test]
    fn verify_build_record_type() {
        let via_header = RecordBuilder::default().header(WarcHeader::WarcType, "request");
        let via_setter = via_header.clone().warc_type(RecordType::Request);

        let record1 = via_header.build().unwrap();
        let record2 = via_setter.build().unwrap();

        assert_eq!(record1, record2);
        let (raw, _) = record1.into_raw_parts();
        assert_eq!(
            raw.as_ref().get(&WarcHeader::WarcType),
            Some(&b"request".to_vec())
        );
    }

    #[test]
    fn verify_build_date() {
        const DATE_STRING_0: &str = "2020-07-08T02:52:55Z";
        const DATE_STRING_1: &[u8] = b"2020-07-18T02:12:45Z";

        let parsed = Record::<BufferedBody>::parse_record_date(DATE_STRING_0).unwrap();
        let builder = RecordBuilder::default().date(parsed);
        assert_eq!(
            built_value(&builder, WarcHeader::Date),
            Some(DATE_STRING_0.as_bytes().to_vec())
        );
        assert_eq!(
            raw_value(&builder, WarcHeader::Date),
            Some(DATE_STRING_0.as_bytes().to_vec())
        );

        let builder = builder.header(WarcHeader::Date, DATE_STRING_1.to_vec());
        assert_eq!(
            built_value(&builder, WarcHeader::Date),
            Some(DATE_STRING_1.to_vec())
        );
        assert_eq!(
            raw_value(&builder, WarcHeader::Date),
            Some(DATE_STRING_1.to_vec())
        );

        let builder = builder.header(WarcHeader::Date, b"not-a-dayTor:a:time".to_vec());
        assert!(builder.build().is_err());
    }

    #[test]
    fn verify_build_record_id() {
        const RECORD_ID_0: &[u8] = b"<urn:test:verify-build-id:record-0>";
        const RECORD_ID_1: &[u8] = b"<urn:test:verify-build-id:record-1>";

        let builder = RecordBuilder::default().warc_id(std::str::from_utf8(RECORD_ID_0).unwrap());
        assert_eq!(
            built_value(&builder, WarcHeader::RecordID),
            Some(RECORD_ID_0.to_vec())
        );
        assert_eq!(
            raw_value(&builder, WarcHeader::RecordID),
            Some(RECORD_ID_0.to_vec())
        );

        let builder = builder.header(WarcHeader::RecordID, RECORD_ID_1.to_vec());
        assert_eq!(
            built_value(&builder, WarcHeader::RecordID),
            Some(RECORD_ID_1.to_vec())
        );
        assert_eq!(
            raw_value(&builder, WarcHeader::RecordID),
            Some(RECORD_ID_1.to_vec())
        );
    }

    #[test]
    fn verify_build_truncated_type() {
        const TRUNCATED_TYPE_0: &[u8] = b"length";
        const TRUNCATED_TYPE_1: &[u8] = b"disconnect";

        let builder = RecordBuilder::default().truncated_type(TruncatedType::Length);
        assert_eq!(
            built_value(&builder, WarcHeader::Truncated),
            Some(TRUNCATED_TYPE_0.to_vec())
        );
        assert_eq!(
            raw_value(&builder, WarcHeader::Truncated),
            Some(TRUNCATED_TYPE_0.to_vec())
        );

        let builder = builder.header(WarcHeader::Truncated, "disconnect");
        assert_eq!(
            built_value(&builder, WarcHeader::Truncated),
            Some(TRUNCATED_TYPE_1.to_vec())
        );
        assert_eq!(
            raw_value(&builder, WarcHeader::Truncated),
            Some(TRUNCATED_TYPE_1.to_vec())
        );

        // A reason outside the named ones is still accepted and round-trips.
        let builder = builder.header(WarcHeader::Truncated, "foreign-intervention");
        assert_eq!(
            built_value(&builder, WarcHeader::Truncated),
            Some(b"foreign-intervention".to_vec())
        );
        assert_eq!(
            raw_value(&builder, WarcHeader::Truncated),
            Some(b"foreign-intervention".to_vec())
        );
    }
}