1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
//! # archiveis - Rust API Wrapper for Archive.is
//! This crate provides simple access to the Archive.is Capturing Service.
//! ## Quick Start
//! ### Creating a Client
//! To create a client to access the Archive.is Capturing Service, you should use the `ArchiveClient`
//! struct. You can pass a specific user agent or none to use a default one.
//! To capture a specific url all you need to do is call the `capture` function of the client provided
//! with the desired url.
//!
//! ### Archive a url
//! The `ArchiveClient` is built with `hyper` and therefore uses futures for its services.
//!
//! ```rust,no_run
//! extern crate archiveis;
//! extern crate futures;
//!
//! use archiveis::ArchiveClient;
//! use futures::future::Future;
//!
//! let client = ArchiveClient::new(Some("archiveis (https://github.com/MattsSe/archiveis-rs)"));
//! let url = "http://example.com/";
//! let capture = client.capture(url).and_then(|res| {
//!     if let Some(archived) = res {
//!         println!("targeted url: {}", archived.target_url);
//!         println!("url of archived site: {}", archived.archived_url);
//!         println!("archive.is submit token: {}", archived.submit_id);
//!     }
//!     Ok(())
//! });
//! ```

//#![deny(warnings)]
extern crate chrono;
extern crate futures;
extern crate hyper;
extern crate url;

use chrono::DateTime;
use futures::future;
use hyper::rt::{Future, Stream};
use hyper::Client;
use hyper::Request;

/// Represents a result of the capture service
///
/// Returned by `ArchiveClient::capture` and `ArchiveClient::capture_with_id` on a
/// successful capture.
#[derive(Debug, Clone)]
pub struct Archived {
    /// The requested url to archive with the archive.is capture service
    pub target_url: String,
    /// The archive.is url that archives the `target_url`
    pub archived_url: String,
    /// The time stamp when the site was archived, parsed from the response's `Date`
    /// header; `None` when the header was absent or could not be parsed
    pub time_stamp: Option<DateTime<chrono::Utc>>,
    /// The submitid used to authorize access on the archive.is server
    pub submit_id: String,
}

/// A Client that serves as a wrapper around the archive.is capture service
///
/// Construct it via `ArchiveClient::new`; all capture methods return futures
/// driven by the wrapped hyper client.
pub struct ArchiveClient {
    /// The internal Hyper Http Client.
    client: Client<hyper::client::HttpConnector, hyper::Body>,
    /// The user agent used for the HTTP Requests
    user_agent: String,
}

impl ArchiveClient {
    /// Creates a new instance of the `ArchiveClient` using the provided user agent or a dummy one.
    pub fn new(user_agent: Option<&str>) -> Self {
        ArchiveClient {
            client: Client::new(),
            user_agent: user_agent.map(|x| x.to_owned()).unwrap_or_else(|| {
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36".to_owned()
            }),
        }
    }

    /// Invokes the archive.is capture service.
    /// First it get's the current valid unique `submitid` by calling `get_unique_id`.
    /// Then it sends a new POST request to the archive.is submit endpoint with the `url` and the
    /// `submitid` encoded as `x-www-form-urlencoded` in the body.
    /// The link to the archived page is then contained in the `Refresh` header of the Response.
    /// It also tries to parse the timemap from the `Date` header and packs it together with the url
    /// in a new `Archived` instance.
    pub fn capture<'a>(
        &'a self,
        url: &str,
    ) -> impl Future<Item = Option<Archived>, Error = hyper::Error> + 'a {
        // TODO add lifetime constraints to url instead?
        let u = url.to_owned();
        // TODO The id is usually valid a couple minutes, perhaps caching it instead?
        self.get_unique_id().and_then(move |resp| {
            let res: Box<Future<Item = Option<Archived>, Error = hyper::Error>> = match resp {
                Some(id) => Box::new(self.capture_with_id(&u, id.as_str())),
                _ => Box::new(future::ok(None)),
            };
            res
        })
    }

    /// Invokes the archive.is capture service directly without retrieving a submit id first.
    /// This can have the advantage that no additional request is necessary, but poses potential
    /// drawbacks when the `id` is not valid. Generally the temporarily ``` are still valid
    /// even when the archiv.is server switched to a new one in the meantime. But it might be the
    /// case, that the server returns a `Server Error`, then the function catches a new `submit_id`
    /// and then tries to capture the `url` again. This is done by switching to the general `capture`
    /// function in case of error.
    /// There might also be the possibility, where the response body already
    /// contains the html of the archived `url`. In that case we read the archive.is url from the
    /// html's meta information instead.
    pub fn capture_with_id<'a>(
        &'a self,
        url: &str,
        submit_id: &str,
    ) -> impl Future<Item = Option<Archived>, Error = hyper::Error> + 'a {
        use chrono::TimeZone;
        use url::form_urlencoded;

        let target_url = url.to_owned();
        let body: String = form_urlencoded::Serializer::new(String::new())
            .append_pair("url", target_url.as_str())
            .append_pair("anyway", "1")
            .append_pair("submitid", submit_id)
            .finish();
        let submit_id = submit_id.to_owned();
        // prepare the POST request
        let req = Request::post("http://archive.is/submit/")
            .header("User-Agent", self.user_agent.as_str())
            .header("Content-Type", "application/x-www-form-urlencoded")
            .body(body.into())
            .unwrap();
        let capture = self.client.request(req).and_then(move |resp| {
            // get the url of the archived page
            let refresh = resp.headers().get("Refresh").and_then(|x| {
                x.to_str()
                    .ok()
                    .and_then(|x| x.split('=').nth(1).map(str::to_owned))
            });
            let archived: Box<Future<Item = Option<Archived>, Error = hyper::Error>> = match refresh
            {
                Some(archived_url) => {
                    // parse the timemap from the Date header
                    let time_stamp = resp.headers().get("Date").and_then(|x| {
                        x.to_str().ok().and_then(|x| {
                            chrono::Utc.datetime_from_str(x, "%a, %e %b %Y %T GMT").ok()
                        })
                    });
                    let archived = Archived {
                        target_url,
                        archived_url,
                        time_stamp,
                        submit_id,
                    };
                    Box::new(future::ok(Some(archived)))
                }
                _ => {
                    // an err response body can be empty, contain Server Error or
                    // can directly contain the archived site, in that case we extract the archived_url
                    let err_resp_handling = resp.into_body().concat2().and_then(move |ch| {
                        if let Ok(html) = ::std::str::from_utf8(&ch) {
                            if html.starts_with("<h1>Server Error</h1>") {
                                println!("here3");
                                return Box::new(self.capture(target_url.as_str()))
                                    as Box<Future<Item = Option<Archived>, Error = hyper::Error>>;
                            }
                            let archived_url = html
                                .splitn(2, "<meta property=\"og:url\"")
                                .nth(1)
                                .and_then(|x| {
                                    x.splitn(2, "content=\"")
                                        .nth(1)
                                        .and_then(|id| id.splitn(2, '\"').next().map(str::to_owned))
                                });
                            if let Some(archived_url) = archived_url {
                                let archived = Archived {
                                    target_url,
                                    archived_url,
                                    time_stamp: None,
                                    submit_id,
                                };
                                return Box::new(future::ok(Some(archived)));
                            }
                        }
                        Box::new(self.capture(target_url.as_str()))
                    });
                    Box::new(err_resp_handling)
                }
            };
            archived
        });
        Box::new(capture)
    }

    /// In order to submit an authorized capture request we need to first obtain a temporarily valid
    /// unique identifier, or none could be found.
    /// This is achieved by sending a GET request to the archive.is domain and parsing the `
    /// `submitid` from the responding html.
    pub fn get_unique_id(&self) -> impl Future<Item = Option<String>, Error = hyper::Error> {
        let req = Request::get("http://archive.is/")
            .header("User-Agent", self.user_agent.as_str())
            .body(hyper::Body::empty())
            .unwrap();

        self.client
            .request(req)
            .and_then(|res| {
                res.into_body().concat2().map(|ch| {
                    ::std::str::from_utf8(&ch).and_then(|html| {
                        Ok(html.rsplitn(2, "name=\"submitid").next().and_then(|x| {
                            x.splitn(2, "value=\"")
                                .nth(1)
                                .and_then(|id| id.splitn(2, '\"').next().map(str::to_owned))
                        }))
                    })
                })
            }).and_then(|x| Ok(x.unwrap_or(None)))
    }
}

#[cfg(test)]
mod tests {
    /// Verifies the `submitid` extraction logic used by `get_unique_id`:
    /// take the text after the last `name="submitid` marker, then the text
    /// after `value="`, then everything up to the closing quote.
    #[test]
    fn extract_unique_id() {
        let html = r###"type="hidden" name="submitid" value="1yPA39C6QcM84Dzspl+7s28rrAFOnliPMCiJtoP+OlTKmd5kJd21G4ucgTkx0mnZ"/>"###;

        let extracted = html
            .rsplitn(2, "name=\"submitid")
            .next()
            .and_then(|tail| tail.splitn(2, "value=\"").nth(1))
            .and_then(|rest| rest.splitn(2, '\"').next())
            .map(str::to_owned);

        let expected = "1yPA39C6QcM84Dzspl+7s28rrAFOnliPMCiJtoP+OlTKmd5kJd21G4ucgTkx0mnZ";
        assert_eq!(extracted, Some(expected.to_owned()));
    }
}