//! # archiveis - Rust API Wrapper for Archive.is
//! This crate provides simple access to the Archive.is Capturing Service.
//! ## Quick Start
//! ### Creating a Client
//! To create a client to access the Archive.is Capturing Service, use the `ArchiveClient`
//! struct. You can pass a specific user agent or `None` to use a default one.
//! To capture a specific url, all you need to do is call the client's `capture` function
//! with the desired url.
//!
//! ### Archive a url
//! The `ArchiveClient` is built on `hyper` and therefore uses futures for its services.
//!
//! ```rust,no_run
//! extern crate archiveis;
//! extern crate futures;
//!
//! use archiveis::ArchiveClient;
//! use futures::future::Future;
//!
//! let client = ArchiveClient::new(Some("archiveis (https://github.com/MattsSe/archiveis-rs)"));
//! let url = "http://example.com/";
//! let capture = client.capture(url).and_then(|archived| {
//!     println!("targeted url: {}", archived.target_url);
//!     println!("url of archived site: {}", archived.archived_url);
//!     println!("archive.is submit token: {}", archived.submit_token);
//!     Ok(())
//! });
//! ```
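//!
//! Note that the future is only constructed here; it still has to be driven to completion by a
//! runtime. Because the future borrows the client, it cannot be handed to an executor that
//! requires `'static` futures. A minimal sketch, assuming the `tokio` (0.1) crate is added as a
//! dependency, blocks on it with a current-thread runtime:
//!
//! ```rust,ignore
//! // hypothetical runner, assuming `tokio` is available as a dependency
//! let mut rt = tokio::runtime::current_thread::Runtime::new().unwrap();
//! rt.block_on(capture).unwrap();
//! ```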
//! ### Archive multiple urls
//! archive.is uses a temporary token to validate an archive request.
//! The `ArchiveClient`'s `capture` function first obtains a new submit token via a GET request.
//! The token is usually valid for several minutes, and even if archive.is switches to a new
//! token in the meantime, the older ones remain valid. So if we need to archive multiple links,
//! we only need to obtain the token once and can then invoke the capturing service directly with
//! `capture_with_token` for each url. `capture_all` returns a `Vec` of the `Result`s of every
//! capturing request, so every single capture request gets executed regardless of the success of prior requests.
//!
//! ```rust,no_run
//! extern crate archiveis;
//! extern crate futures;
//!
//! use archiveis::ArchiveClient;
//! use futures::future::{join_all, Future};
//!
//! let client = ArchiveClient::new(Some("archiveis (https://github.com/MattsSe/archiveis-rs)"));
//!
//! // the urls to capture
//! let urls = vec![
//!     "http://example.com/",
//!     "https://github.com/MattsSe/archiveis-rs",
//!     "https://crates.io",
//! ];
//!
//! let capture = client.capture_all(urls, None).and_then(|archives| {
//!         let failures: Vec<_> = archives
//!             .iter()
//!             .map(Result::as_ref)
//!             .filter(Result::is_err)
//!             .map(Result::unwrap_err)
//!             .collect();
//!         if failures.is_empty() {
//!             println!("all links successfully archived.");
//!         } else {
//!            for err in failures {
//!                 if let archiveis::Error::MissingUrl(url) = err {
//!                     println!("Failed to archive url: {}", url);
//!                 }
//!             }
//!         }
//!        Ok(())
//!    });
//! ```
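//!
//! If you want to manage the token yourself, you can obtain it once via `get_unique_token` and
//! reuse it with `capture_with_token`. A minimal sketch of that flow:
//!
//! ```rust,no_run
//! extern crate archiveis;
//! extern crate futures;
//!
//! use archiveis::ArchiveClient;
//! use futures::future::Future;
//!
//! let client = ArchiveClient::new(None);
//! let capture = client.get_unique_token().and_then(|token| {
//!     client.capture_with_token("http://example.com/", &token)
//!         .map(|archived| println!("url of archived site: {}", archived.archived_url))
//! });
//! ```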
//!

#![deny(warnings)]
extern crate chrono;
extern crate futures;
extern crate hyper;
extern crate url;

use chrono::DateTime;
use futures::future;
use hyper::rt::{Future, Stream};
use hyper::Client;
use hyper::Request;

/// The Error Type used in this crate
///
#[derive(Debug)]
pub enum Error {
    /// Represents an error originating from hyper
    Hyper(hyper::Error),
    /// Means that no token could be obtained from archive.is
    MissingToken,
    /// Means that the POST was successful but no archive url for the requested
    /// url, which `MissingUrl` stores, could be obtained from the HTTP response
    MissingUrl(String),
}

/// Represents a result of the capture service
#[derive(Debug, Clone)]
pub struct Archived {
    /// The requested url to archive with the archive.is capture service
    pub target_url: String,
    /// The archive.is url that archives the `target_url`
    pub archived_url: String,
    /// The time stamp when the site was archived
    pub time_stamp: Option<DateTime<chrono::Utc>>,
    /// The submitid token used to authorize access on the archive.is server
    pub submit_token: String,
}

/// A Client that serves as a wrapper around the archive.is capture service
pub struct ArchiveClient {
    /// The internal Hyper Http Client.
    client: Client<hyper::client::HttpConnector, hyper::Body>,
    /// The user agent used for the HTTP Requests
    user_agent: String,
}

impl ArchiveClient {
    /// Creates a new instance of the `ArchiveClient` using the provided user agent or a dummy one.
    pub fn new(user_agent: Option<&str>) -> Self {
        ArchiveClient {
            client: Client::new(),
            user_agent: user_agent.map(|x| x.to_owned()).unwrap_or_else(|| {
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36".to_owned()
            }),
        }
    }

    /// Invokes the archive.is capture service on each url supplied.
    /// If no token was passed, a fresh token is obtained via `get_unique_token`,
    /// afterwards all capture requests are joined in a single future that returns
    /// a `Vec<Result<Archived, Error>>` which holds every result of the individual
    /// capturing requests, so every single capture request gets executed regardless
    /// of the success of prior requests.
    pub fn capture_all<'a>(
        &'a self,
        urls: Vec<&'a str>,
        token: Option<String>,
    ) -> impl Future<Item = Vec<Result<Archived, Error>>, Error = Error> + 'a {
        use futures::future::join_all;
        let get_token: Box<Future<Item = String, Error = Error>> = match token {
            Some(t) => Box::new(future::ok(t)),
            _ => Box::new(self.get_unique_token()),
        };
        get_token.and_then(move |token| {
            let mut futures = Vec::new();
            for url in urls {
                futures.push(self.capture_with_token(url, &token).then(Ok));
            }
            join_all(futures)
        })
    }

    /// Invokes the archive.is capture service.
    /// First it gets the current valid unique `submitid` by calling `get_unique_token`.
    /// Then it sends a new POST request to the archive.is submit endpoint with the `url` and the
    /// `submitid` encoded as `x-www-form-urlencoded` in the body.
    /// The link to the archived page is then contained in the `Refresh` header of the Response.
    /// It also tries to parse the timestamp from the `Date` header and packs it together with the url
    /// in a new `Archived` instance.
    pub fn capture<'a>(&'a self, url: &str) -> impl Future<Item = Archived, Error = Error> + 'a {
        // TODO add lifetime constraints to url instead?
        let u = url.to_owned();
        // TODO The id is usually valid a couple minutes, perhaps caching it instead?
        self.get_unique_token()
            .and_then(move |id| self.capture_with_token(&u, id.as_str()))
    }

    /// Invokes the archive.is capture service directly without retrieving a submit token first.
    /// This has the advantage that no additional request is necessary, but poses potential
    /// drawbacks when the token is not valid. Generally the temporary tokens are still valid
    /// even when the archive.is server has switched to a new one in the meantime. But it might be
    /// the case that the server returns a `Server Error`; then an `Error::MissingUrl(url)` is
    /// returned containing the requested url.
    /// Switching to the ordinary `capture` method would also be possible, but that could result in
    /// undesired cyclic behavior.
    /// There is also the possibility that the response body already
    /// contains the html of the archived `url`. In that case we read the archive.is url from the
    /// html's meta information instead.
    pub fn capture_with_token<'a>(
        &'a self,
        url: &str,
        submit_token: &str,
    ) -> impl Future<Item = Archived, Error = Error> + 'a {
        use chrono::TimeZone;
        use url::form_urlencoded;

        let target_url = url.to_owned();
        let body: String = form_urlencoded::Serializer::new(String::new())
            .append_pair("url", target_url.as_str())
            .append_pair("anyway", "1")
            .append_pair("submitid", submit_token)
            .finish();
        let submit_token = submit_token.to_owned();
        // prepare the POST request
        let req = Request::post("http://archive.is/submit/")
            .header("User-Agent", self.user_agent.as_str())
            .header("Content-Type", "application/x-www-form-urlencoded")
            .body(body.into())
            .unwrap();
        let capture = self.client.request(req).map_err(Error::Hyper).and_then(move |resp| {
            // get the url of the archived page from the `Refresh` header ("0;url=<archived url>")
            let refresh = resp.headers().get("Refresh").and_then(|x| {
                x.to_str()
                    .ok()
                    // split only on the first '=' so archived urls containing '=' stay intact
                    .and_then(|x| x.splitn(2, '=').nth(1).map(str::to_owned))
            });
            let archived: Box<Future<Item = Archived, Error = Error>> = match refresh {
                Some(archived_url) => {
                    // parse the timestamp from the `Date` header
                    let time_stamp = resp.headers().get("Date").and_then(|x| {
                        x.to_str().ok().and_then(|x| {
                            chrono::Utc.datetime_from_str(x, "%a, %e %b %Y %T GMT").ok()
                        })
                    });
                    let archived = Archived {
                        target_url,
                        archived_url,
                        time_stamp,
                        submit_token,
                    };
                    Box::new(future::ok(archived))
                }
                _ => {
                    // an error response body can be empty, contain `Server Error`, or
                    // directly contain the archived site; in that case we extract the archived_url
                    let err_resp_handling = resp.into_body().concat2().map_err(Error::Hyper).and_then(move |ch| {
                        if let Ok(html) = ::std::str::from_utf8(&ch) {
                            if html.starts_with("<h1>Server Error</h1>") {
                                return Box::new(self.capture(target_url.as_str()))
                                    as Box<Future<Item = Archived, Error = Error>>;
                            }
                            let archived_url = html
                                .splitn(2, "<meta property=\"og:url\"")
                                .nth(1)
                                .and_then(|x| {
                                    x.splitn(2, "content=\"")
                                        .nth(1)
                                        .and_then(|id| id.splitn(2, '\"').next().map(str::to_owned))
                                });
                            if let Some(archived_url) = archived_url {
                                let archived = Archived {
                                    target_url,
                                    archived_url,
                                    time_stamp: None,
                                    submit_token,
                                };
                                return Box::new(future::ok(archived));
                            }
                        }
                        // TODO possible cycle: calling self.capture can cause an undesired loop
                        // Box::new(self.capture(target_url.as_str()))
                        // return an Error instead
                        Box::new(future::err(Error::MissingUrl(target_url)))
                    });
                    Box::new(err_resp_handling)
                }
            };
            archived
        });
        Box::new(capture)
    }

    /// In order to submit an authorized capture request we need to first obtain a temporarily
    /// valid unique token.
    /// This is achieved by sending a GET request to the archive.is domain and parsing the
    /// `submitid` from the response html.
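    ///
    /// A minimal sketch of fetching a token on its own:
    ///
    /// ```rust,no_run
    /// extern crate archiveis;
    /// extern crate futures;
    ///
    /// use archiveis::ArchiveClient;
    /// use futures::future::Future;
    ///
    /// let client = ArchiveClient::new(None);
    /// let token = client.get_unique_token().map(|token| println!("token: {}", token));
    /// ```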
    pub fn get_unique_token(&self) -> impl Future<Item = String, Error = Error> {
        let req = Request::get("http://archive.is/")
            .header("User-Agent", self.user_agent.as_str())
            .body(hyper::Body::empty())
            .unwrap();

        self.client
            .request(req)
            .map_err(Error::Hyper)
            .and_then(|res| {
                res.into_body()
                    .concat2()
                    .map_err(Error::Hyper)
                    .and_then(|ch| {
                        ::std::str::from_utf8(&ch)
                            .map_err(|_| Error::MissingToken)
                            .and_then(|html| {
                                html.rsplitn(2, "name=\"submitid")
                                    .next()
                                    .and_then(|x| {
                                        x.splitn(2, "value=\"").nth(1).and_then(|token| {
                                            token.splitn(2, '\"').next().map(str::to_owned)
                                        })
                                    }).ok_or(Error::MissingToken)
                            })
                    })
            })
    }
}

#[cfg(test)]
mod tests {
    #[test]
    fn extract_unique_token() {
        let html = r###"type="hidden" name="submitid" value="1yPA39C6QcM84Dzspl+7s28rrAFOnliPMCiJtoP+OlTKmd5kJd21G4ucgTkx0mnZ"/>"###;

        let split = html.rsplitn(2, "name=\"submitid").next().and_then(|x| {
            x.splitn(2, "value=\"")
                .nth(1)
                .and_then(|id| id.splitn(2, '\"').next().map(str::to_owned))
        });
        assert_eq!(
            Some("1yPA39C6QcM84Dzspl+7s28rrAFOnliPMCiJtoP+OlTKmd5kJd21G4ucgTkx0mnZ".to_owned()),
            split
        );
    }
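
    #[test]
    fn extract_archived_url_from_refresh() {
        // mirrors the `Refresh` header handling in `capture_with_token`;
        // the header value is a hypothetical example of the "0;url=<archived url>" format
        let refresh = "0;url=http://archive.is/newest/http://example.com/?q=1";
        let archived_url = refresh.splitn(2, '=').nth(1).map(str::to_owned);
        assert_eq!(
            Some("http://archive.is/newest/http://example.com/?q=1".to_owned()),
            archived_url
        );
    }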
}