wayback_urls/
timemap.rs

1use std::{error::Error, fmt, marker::PhantomData, str::FromStr};
2
3mod sealed {
4    /// Trait to mark that all possible values have a fixed string representation
5    pub trait ToStaticStr {
6        fn to_static_str(&self) -> &'static str;
7    }
8
9    /// Marker Trait for the typestate pattern in a [`RequestBuilder`]
10    ///
11    /// [`RequestBuilder`]: ../struct.RequestBuilder.html
12    pub trait RequestState {}
13    pub trait RequestField: ToStaticStr + Into<super::AnyField> {}
14}
15
/// Base URL of the timemap endpoint of the Internet Archive's Wayback Machine
///
/// The value is `https://web.archive.org/web/timemap/?`; it already ends with
/// the `?` that introduces the query string.
pub const TIMEMAP_BASE: &str = "https://web.archive.org/web/timemap/?";
20
21/// Restriction of the returned items **before** collapsing/grouping
22///
23/// The notation for a filter is `"!"? FIELD ":" REGEX`, i.e.
24/// an optional negation, the field that it applies to and a regex
25/// that the server will used to match on the value.
26pub struct Filter<'a> {
27    invert: bool,
28    field: Field,
29    regex: &'a str,
30}
31
32impl<'a> Filter<'a> {
33    /// Parses a filter from a borrowed string
34    pub fn parse_from_str(s: &'a str) -> Result<Self, ParseFilterError> {
35        let (invert, input) = if s.starts_with('!') {
36            (true, s /*.split_at(1).1*/)
37        } else {
38            (false, s)
39        };
40
41        let mut split = input.splitn(2, ':');
42        let field: Field = if let Some(f) = split.next() {
43            f.parse().map_err(ParseFilterError::UnknownField)?
44        } else {
45            panic!("splitn should always return at least one item");
46        };
47
48        if let Some(regex) = split.next() {
49            Ok(Filter {
50                invert,
51                field,
52                regex,
53            })
54        } else {
55            Err(ParseFilterError::MissingColon)
56        }
57    }
58
59    /// Create an owned ('static) version of this filter
60    pub fn to_owned(&self) -> FilterBuf {
61        FilterBuf {
62            invert: self.invert,
63            field: self.field,
64            regex: self.regex.to_owned(),
65        }
66    }
67}
68
69/// An owned (`'static`) variant of [`Filter`](struct.Filter.html)
70pub struct FilterBuf {
71    invert: bool,
72    field: Field,
73    regex: String,
74}
75
76impl FilterBuf {
77    /// Ìnverse of [`Filter::to_owned`]
78    ///
79    /// [`Filter::to_owned`]: struct.Filter.html#method.to_owned
80    pub fn as_ref(&self) -> Filter {
81        Filter {
82            invert: self.invert,
83            field: self.field,
84            regex: &self.regex,
85        }
86    }
87}
88
89#[derive(Debug)]
90/// An error returned from the `parse_from_str` function on [`Filter`].
91///
92/// [`Filter`]: struct.Filter.html
93pub enum ParseFilterError {
94    MissingColon,
95    UnknownField(UnknownFieldError),
96}
97
98impl Error for ParseFilterError {}
99
100impl fmt::Display for ParseFilterError {
101    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
102        match self {
103            Self::MissingColon => write!(f, "Missing `:` between field name and regex"),
104            Self::UnknownField(e) => write!(f, "{}", e),
105        }
106    }
107}
108
109impl FromStr for FilterBuf {
110    type Err = ParseFilterError;
111
112    fn from_str(s: &str) -> Result<Self, Self::Err> {
113        Filter::parse_from_str(s).map(|f| f.to_owned())
114    }
115}
116
117impl<'a> Filter<'a> {
118    /// Creates a new filter that requires the given field to match the regex
119    pub fn new(field: Field, regex: &'a str) -> Self {
120        Self {
121            invert: false,
122            field,
123            regex,
124        }
125    }
126
127    /// Creates a new filter that prohibits the given field to match the regex
128    pub fn inverted(field: Field, regex: &'a str) -> Self {
129        Self {
130            invert: true,
131            field,
132            regex,
133        }
134    }
135}
136
137/// A configured URL for a request to the wayback machines' timemap
138///
139/// ```
140/// use wayback_urls::timemap::{Request, Field::{Timestamp, StatusCode, UrlKey}};
141///
142/// let r = Request::builder("nexushq.universe.lego.com/en-us/character/details")
143///                 .match_prefix()
144///                 .with_field(Timestamp)
145///                 .with_field(UrlKey)
146///                 .filter_inverted(StatusCode, "[45]..")
147///                 .collapse(UrlKey)
148///                 .done().to_url();
149/// assert_eq!(&r, "https://web.archive.org/web/timemap/\
150/// ?url=nexushq.universe.lego.com%2Fen-us%2Fcharacter%2Fdetails\
151/// &fl=timestamp,urlkey\
152/// &matchType=prefix\
153/// &collapse=urlkey\
154/// &filter=!statuscode:[45]..");
155/// ```
156pub struct Request<'a> {
157    /// URL
158    url: &'a str,
159    /// Output format
160    output: Output,
161    /// Fields
162    fl: Vec<AnyField>,
163    /// Filter
164    filter: Vec<Filter<'a>>,
165    /// Match Type
166    match_type: MatchType,
167    /// Collapse to groups
168    collapse: Option<Field>,
169}
170
171impl<'a> Request<'a> {
172    /// Create a new builder in the basic (non-grouped) state
173    pub fn builder(url: &'a str) -> RequestBuilder<'a, BasicRequest> {
174        RequestBuilder::new(url)
175    }
176
177    /// Return the URL that this request represents as an owned string
178    pub fn to_url(&self) -> String {
179        use sealed::ToStaticStr;
180
181        let mut url = TIMEMAP_BASE.to_string();
182        url.push_str("url=");
183        url.push_str(&urlencoding::encode(self.url));
184        let mut fl_iter = self.fl.iter();
185        if let Some(s) = fl_iter.next() {
186            url.push_str("&fl=");
187            url.push_str(s.to_static_str());
188            for f in fl_iter {
189                url.push_str(",");
190                url.push_str(f.to_static_str());
191            }
192        }
193        if let Some(match_type) = self.match_type.opt_static_str() {
194            url.push_str("&matchType=");
195            url.push_str(match_type);
196        }
197        if let Some(output) = self.output.opt_static_str() {
198            url.push_str("&output=");
199            url.push_str(output);
200        }
201        if let Some(collapse) = self.collapse {
202            url.push_str("&collapse=");
203            url.push_str(collapse.to_static_str());
204        }
205        let filter_iter = self.filter.iter();
206        for filter in filter_iter {
207            url.push_str("&filter=");
208            if filter.invert {
209                url.push_str("!");
210            }
211            url.push_str(filter.field.to_static_str());
212            url.push_str(":");
213            url.push_str(filter.regex);
214        }
215        url
216    }
217}
218
219/// Struct implementing the builder \& typestate patterns for
220/// creating a request
221pub struct RequestBuilder<'a, S: sealed::RequestState> {
222    /// The request under construction
223    request: Request<'a>,
224    /// The current state
225    _state: PhantomData<S>,
226}
227
228/// Marker type for a basic (non-grouped) request
229pub struct BasicRequest;
230/// Marker type for a grouped (collapsed) request
231pub struct GroupedRequest;
232
233impl sealed::RequestState for BasicRequest {}
234impl sealed::RequestState for GroupedRequest {}
235
236impl<'a, S: sealed::RequestState> RequestBuilder<'a, S> {
237    /// Drop the builder and extract the finished request.
238    pub fn done(self) -> Request<'a> {
239        self.request
240    }
241
242    /// Set the match type of the request
243    pub fn match_type(mut self, match_type: MatchType) -> Self {
244        self.request.match_type = match_type;
245        self
246    }
247
248    /// Shorthand for `match_type(MatchType::Prefix)`
249    pub fn match_prefix(self) -> Self {
250        self.match_type(MatchType::Prefix)
251    }
252
253    /// Set the output format of the request
254    pub fn output(mut self, output: Output) -> Self {
255        self.request.output = output;
256        self
257    }
258
259    /// Add a filter to be applied to the whole set of entries before any grouping takes place
260    ///
261    /// Normal filters are requirements, inverted filters exclude matching results.
262    pub fn with_filter(mut self, filter: Filter<'a>) -> Self {
263        self.request.filter.push(filter);
264        self
265    }
266
267    /// Shorthand for `with_filter(Filter::new(...))`
268    pub fn filter(self, field: Field, regex: &'a str) -> Self {
269        self.with_filter(Filter::new(field, regex))
270    }
271
272    /// Shorthand for `with_filter(Filter::inverted(...))`
273    pub fn filter_inverted(self, field: Field, regex: &'a str) -> Self {
274        self.with_filter(Filter::inverted(field, regex))
275    }
276}
277
278impl<'a> RequestBuilder<'a, BasicRequest> {
279    /// Create a new basic request
280    pub fn new(url: &'a str) -> Self {
281        Self {
282            request: Request {
283                url,
284                output: Output::Default,
285                fl: Vec::new(),
286                match_type: MatchType::Exact,
287                collapse: None,
288                filter: Vec::new(),
289            },
290            _state: PhantomData,
291        }
292    }
293
294    /// Explicitly add a [`Field`] to the list of fields in the result.
295    ///
296    /// Fields can be added multiple times. When no field is added
297    /// to a request, the server chooses a default set of fields
298    ///
299    /// [`Field`]: enum.Field.html
300    pub fn with_field(mut self, field: Field) -> Self {
301        self.request.fl.push(field.into());
302        self
303    }
304
305    /// Collapse the result set by grouping entries for which the given field
306    /// has the same value.
307    ///
308    /// This is generally used to group results by [`UrlKey`]
309    ///
310    /// [`UrlKey`]: enum.Field.html#variant.UrlKey
311    pub fn collapse(mut self, field: Field) -> RequestBuilder<'a, GroupedRequest> {
312        self.request.collapse = Some(field);
313        RequestBuilder {
314            request: self.request,
315            _state: PhantomData,
316        }
317    }
318}
319
320impl<'a> RequestBuilder<'a, GroupedRequest> {
321    /// Explicitly add a [`Field`] or [`GroupField`] to the list of fields in the result.
322    ///
323    /// Fields can be added multiple times. When no field is added to a request,
324    /// the server chooses a default set of fields
325    ///
326    /// [`Field`]: enum.Field.html
327    /// [`GroupField`]: enum.GroupField.html
328    pub fn with_field<F: sealed::RequestField>(mut self, field: F) -> Self {
329        self.request.fl.push(field.into());
330        self
331    }
332}
333
/// All fields that single entries in the archive's database have
#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)]
pub enum Field {
    /// The original location/URL of the memento
    Original,
    /// The timestamp when the memento was stored
    Timestamp,
    /// A simplified variant of a URL that represents semantic equality of URLs,
    /// e.g. `http://example.com` and `http://www.example.com:80/` have the same key
    UrlKey,
    /// The content type of the resource, e.g. `text/html`
    MimeType,
    /// The HTTP status code at crawl time, e.g. `200`, `404`, `302`
    StatusCode,
    /// Presumably a hash value of the content (not confirmed in this file)
    Digest,
    /// The `redirect` field (exact meaning not documented in this file)
    Redirect,
    /// The `robotflags` field (exact meaning not documented in this file)
    RobotFlags,
    /// The length of the memento
    Length,
    /// The offset of the memento within the containing archive
    Offset,
    /// The filename of a compressed archive that contains the memento
    Filename,
}
361
362impl sealed::RequestField for Field {}
363
/// Like `ToStaticStr`, but for types where one value is the default choice
/// and may therefore be omitted
trait OptStaticStr {
    /// Returns the fixed string for `self`, or `None` when it may be left out
    fn opt_static_str(&self) -> Option<&'static str>;
}
370
/// Implemented by types in which one possible value counts as the default
trait HasDefaultVariant {
    /// Returns `true` when `self` is that default value
    fn is_default(&self) -> bool;
}
375
/// Fields that are available when the result is collapsed/grouped
#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)]
pub enum GroupField {
    /// The `endtimestamp` field; presumably the last timestamp of a group (TODO confirm)
    EndTimestamp,
    /// The `groupcount` field; presumably the number of entries in a group (TODO confirm)
    GroupCount,
    /// The `uniqcount` field; presumably a count of unique entries in a group (TODO confirm)
    UniqCount,
}
383
384/// All fields that can be set in a request
385#[derive(Copy, Clone, Debug, Eq, PartialEq)]
386pub enum AnyField {
387    Basic(Field),
388    Group(GroupField),
389}
390
391impl sealed::ToStaticStr for AnyField {
392    fn to_static_str(&self) -> &'static str {
393        match self {
394            Self::Basic(f) => f.to_static_str(),
395            Self::Group(f) => f.to_static_str(),
396        }
397    }
398}
399
400impl From<Field> for AnyField {
401    fn from(f: Field) -> Self {
402        Self::Basic(f)
403    }
404}
405
406impl From<GroupField> for AnyField {
407    fn from(f: GroupField) -> Self {
408        Self::Group(f)
409    }
410}
411
412impl sealed::RequestField for GroupField {}
413
414impl sealed::ToStaticStr for Field {
415    fn to_static_str(&self) -> &'static str {
416        match self {
417            Self::Original => "original",
418            Self::Timestamp => "timestamp",
419            Self::UrlKey => "urlkey",
420            Self::MimeType => "mimetype",
421            Self::StatusCode => "statuscode",
422            Self::Digest => "digest",
423            Self::Redirect => "redirect",
424            Self::RobotFlags => "robotflags",
425            Self::Length => "length",
426            Self::Offset => "offset",
427            Self::Filename => "filename",
428        }
429    }
430}
431
/// An error returned from the `from_str` function on [`Field`](enum.Field.html).
///
/// Carries the string that was not recognised as a field name.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct UnknownFieldError(String);
435
436impl fmt::Display for UnknownFieldError {
437    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
438        write!(f, "Unknown field name `{}`", self.0)
439    }
440}
441
442impl Error for UnknownFieldError {}
443
444impl FromStr for Field {
445    type Err = UnknownFieldError;
446    fn from_str(s: &str) -> Result<Self, Self::Err> {
447        match s {
448            "original" => Ok(Self::Original),
449            "timestamp" => Ok(Self::Timestamp),
450            "urlkey" => Ok(Self::UrlKey),
451            "mimetype" => Ok(Self::MimeType),
452            "statuscode" => Ok(Self::StatusCode),
453            "digest" => Ok(Self::Digest),
454            "redirect" => Ok(Self::Redirect),
455            "robotflags" => Ok(Self::RobotFlags),
456            "length" => Ok(Self::Length),
457            "offset" => Ok(Self::Offset),
458            "filename" => Ok(Self::Filename),
459            _ => Err(UnknownFieldError(s.to_owned())),
460        }
461    }
462}
463
464impl sealed::ToStaticStr for GroupField {
465    fn to_static_str(&self) -> &'static str {
466        match self {
467            Self::EndTimestamp => "endtimestamp",
468            Self::GroupCount => "groupcount",
469            Self::UniqCount => "uniqcount",
470        }
471    }
472}
473
/// How to match an entry to the `url` key of a request.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum MatchType {
    /// The result URL must match the query exactly
    Exact,
    /// The result URL must start with the query
    Prefix,
}
482
483impl Default for MatchType {
484    fn default() -> Self {
485        Self::Exact
486    }
487}
488
489impl HasDefaultVariant for MatchType {
490    fn is_default(&self) -> bool {
491        *self == Self::Exact
492    }
493}
494
495impl OptStaticStr for MatchType {
496    fn opt_static_str(&self) -> Option<&'static str> {
497        match self {
498            Self::Exact => None,
499            Self::Prefix => Some("prefix"),
500        }
501    }
502}
503
504impl sealed::ToStaticStr for MatchType {
505    fn to_static_str(&self) -> &'static str {
506        match self {
507            Self::Exact => "exact",
508            Self::Prefix => "prefix",
509        }
510    }
511}
512
/// Different available output formats
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Output {
    /// Default, space-separated, line-based format
    Default,
    /// Array of arrays of strings; the first inner array holds the column names
    Json,
    /// Memento link format
    Link,
}
523
524impl HasDefaultVariant for Output {
525    fn is_default(&self) -> bool {
526        *self == Self::Default
527    }
528}
529
530impl OptStaticStr for Output {
531    fn opt_static_str(&self) -> Option<&'static str> {
532        match self {
533            Self::Default => None,
534            Self::Json => Some("json"),
535            Self::Link => Some("link"),
536        }
537    }
538}
539
540impl sealed::ToStaticStr for Output {
541    fn to_static_str(&self) -> &'static str {
542        match self {
543            Self::Default => "",
544            Self::Json => "json",
545            Self::Link => "link",
546        }
547    }
548}