keyhunter 0.2.0

Check for leaked API keys and secrets on public websites
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
use std::{borrow::Cow, num::NonZeroUsize, sync::mpsc, time::Duration};

use miette::{Error, MietteDiagnostic, Result};
use ureq::{Agent, AgentBuilder};

use super::{walk::ScriptSender, walk_cache::WalkCache, Script};
use crate::{http::random_ua, ScriptReceiver, WebsiteWalker};

/// Configuration for website walks.
///
/// Collects request settings (user agent, headers, cookies, timeouts), crawl
/// limits and caching options, and is then used to construct a
/// [`WebsiteWalker`].
#[derive(Debug, Clone)]
#[must_use]
#[non_exhaustive]
pub struct WebsiteWalkBuilder {
    /// Maximum number of pages that can be visited.
    ///
    /// [`None`] means there is no limit.
    ///
    /// Default [`None`]
    pub(crate) max_walks: Option<NonZeroUsize>,
    /// `User-Agent` header to use when making requests. It is kept separate
    /// from `headers` and is emitted before them.
    ///
    /// [`None`] means no `User-Agent` header is added by the builder.
    ///
    /// Default [`Some`] randomly chosen user agent
    pub(crate) ua: Option<Cow<'static, str>>,
    /// Extra headers to add to requests
    ///
    /// By default, the following headers are added:
    /// - `Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8`
    /// - `Keep-Alive: timeout=5, max=100`
    /// - `Connection: keep-alive`
    /// - `Accept-Language: en-US,en;q=0.5`
    /// - `DNT: 1`
    ///
    /// `Accept-Encoding: gzip, deflate, br` is intentionally not part of the
    /// defaults yet; it is disabled until responses can be decompressed.
    pub(crate) headers: Vec<(String, String)>,
    /// Domains that can be visited (and have their scripts extracted)
    ///
    /// When a walk begins, the domain of the URL is checked against this list.
    ///
    /// Default `[]`
    pub(crate) domain_whitelist: Vec<String>,
    /// When `true`, [`None`] will be sent over the script channel to close it.
    ///
    /// Default `true`
    pub(crate) close_channel_when_done: bool,
    /// When `true`, cookies will be stored and used across requests.
    ///
    /// Only read when building the HTTP agent, hence private to this module.
    ///
    /// Default `true`
    store_cookies: bool,
    /// Shared cache across walks
    ///
    /// [`None`] means no cache is shared.
    ///
    /// Default [`None`]
    pub(crate) cache: Option<WalkCache>,
    /// Timeout for requests
    ///
    /// See: [`AgentBuilder::timeout`]
    ///
    /// Default [`None`]
    pub(crate) timeout: Option<Duration>,
    /// Timeout for connecting to a server
    ///
    /// See: [`AgentBuilder::timeout_connect`]
    ///
    /// Default [`None`]
    pub(crate) timeout_connect: Option<Duration>,
}

impl Default for WebsiteWalkBuilder {
    /// Builds the default configuration: no walk limit, a randomly chosen
    /// user agent, a browser-like set of request headers, an empty domain
    /// whitelist, cookies enabled, no shared cache, no timeouts, and the
    /// script channel closed once a walk finishes.
    fn default() -> Self {
        // Browser-like request headers sent with every request.
        //
        // TODO: use flat2 (presumably the `flate2` crate) to decompress
        // responses, then also send `Accept-Encoding: gzip, deflate, br`.
        const BROWSER_HEADERS: [(&str, &str); 5] = [
            (
                "Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            ),
            ("Keep-Alive", "timeout=5, max=100"),
            ("Connection", "keep-alive"),
            ("Accept-Language", "en-US,en;q=0.5"),
            ("DNT", "1"),
        ];

        let headers: Vec<(String, String)> = BROWSER_HEADERS
            .iter()
            .map(|&(name, value)| (String::from(name), String::from(value)))
            .collect();

        // Pick one of the known user agents at random.
        let mut rng = rand::thread_rng();
        let random_agent = random_ua(&mut rng);

        Self {
            max_walks: None,
            ua: Some(Cow::Borrowed(random_agent)),
            headers,
            domain_whitelist: Vec::new(),
            close_channel_when_done: true,
            timeout: None,
            timeout_connect: None,
            store_cookies: true,
            cache: None,
        }
    }
}

impl WebsiteWalkBuilder {
    const USER_AGENT: &'static str = "User-Agent";

    /// Create a new builder with default settings
    pub fn new() -> Self {
        Default::default()
    }

    /// Set the maximum number of pages that can be visited.
    ///
    /// Use [`WebsiteWalkBuilder::with_unlimited_walks`] to remove the limit.
    ///
    /// By default, there is no limit.
    ///
    /// # Panics
    /// if `max_walks` is zero.
    pub fn with_max_walks(mut self, max_walks: usize) -> Self {
        let max_walks = NonZeroUsize::new(max_walks)
            .ok_or_else(|| {
                Error::msg(
                    "max_walks must be greater than zero, otherwise no pages will be checked.",
                )
                .context("Failed to configure WebsiteWalkBuilder")
            })
            .unwrap();
        self.max_walks = Some(max_walks);
        self
    }

    /// Do not limit the number of pages that can be visited.
    ///
    /// Use [`WebsiteWalkBuilder::with_max_walks`] to set a walk limit.
    ///
    /// By default, there is no limit. Using this method on
    /// [`WebsiteWalkBuilder::default()`] will have no effect.
    pub fn with_unlimited_walks(mut self) -> Self {
        self.max_walks = None;
        self
    }

    /// Use a random, browser-like `User-Agent` header when making requests.
    ///
    /// Using a mock UA can help bypass bot detection on some websites. However,
    /// there are some cases where specific browsers are prevented from
    /// accessing websites, and so using a random UA may not be ideal.
    ///
    /// This is a semi-specific case of
    /// [`WebsiteWalkBuilder::with_header`]. `User-Agent`s set with this
    /// method will take precedence.
    ///
    /// By default, no `User-Agent` header is set.
    pub fn with_random_ua(mut self, yes: bool) -> Self {
        if yes && self.ua.is_none() {
            let mut rng = rand::thread_rng();
            self.ua = Some(Cow::Borrowed(random_ua(&mut rng)));
        } else if !yes {
            self.ua = None;
        }

        self
    }

    /// Add an extra header to all requests.
    ///
    /// Use [`WebsiteWalkBuilder::with_headers`] for adding multiple headers.
    #[inline]
    pub fn with_header<S: Into<String>>(mut self, key: S, value: S) -> Self {
        let key = key.into();
        if key == Self::USER_AGENT {
            self.ua = Some(Cow::Owned(value.into()));
        } else {
            self.headers.push((key, value.into()));
        }

        self
    }

    /// Add extra headers to all requests
    ///
    /// Use [`WebsiteWalkBuilder::with_header`] for adding a single header.
    pub fn with_headers<I>(mut self, headers: I) -> Self
    where
        I: IntoIterator<Item = (String, String)>,
    {
        self.headers.extend(headers);
        self
    }

    /// Whitelist a domain for crawling. Only domains in this list will have
    /// their pages scanned for scripts.
    ///
    /// This setting does not affect what scripts will be checked; cross-origin
    /// scripts will still be sent to the script channel.
    ///
    /// Use [`WebsiteWalkBuilder::with_whitelisted_domains`] to add multiple
    /// domains.
    #[inline]
    pub fn with_whitelisted_domain<S: Into<String>>(mut self, domain: S) -> Self {
        self.domain_whitelist.push(domain.into());
        self
    }

    /// Whitelist multiple domains for crawling. Only domains in this list will have
    /// their pages scanned for scripts.
    ///
    /// This setting does not affect what scripts will be checked; cross-origin
    /// scripts will still be sent to the script channel.
    ///
    /// Use [`WebsiteWalkBuilder::with_whitelisted_domain`] to add a single
    /// domain.
    pub fn with_whitelisted_domains<I, S>(mut self, domains: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: Into<String>,
    {
        self.domain_whitelist
            .extend(domains.into_iter().map(|s| s.into()));
        self
    }

    /// Close the script channel when the walk is done. If you plan on
    /// performing multiple walks, leave the channel open.
    ///
    /// By default, the script channel will be closed when the walk is done.
    pub fn with_close_channel(mut self, yes: bool) -> Self {
        self.close_channel_when_done = yes;
        self
    }

    /// Store cookies and use them across requests.
    ///
    /// This is enabled by default.
    pub fn with_cookie_jar(mut self, yes: bool) -> Self {
        self.store_cookies = yes;
        self
    }

    /// Share a URL and script cache across walks.
    ///
    /// Useful for avoiding duplicate work when performing multiple walks.
    ///
    /// By default, each walk has its own cache.
    pub fn with_shared_cache(mut self, yes: bool) -> Self {
        if yes && self.cache.is_none() {
            self.cache = Some(WalkCache::default());
        } else if !yes {
            self.cache = None;
        }
        self
    }

    pub fn clear_cache(&mut self) {
        self.cache.as_mut().map(WalkCache::clear);
    }

    /// Overall timeout for page requests. You can override socket connection
    /// timeouts using [`WebsiteWalkBuilder::with_timeout_connect`].
    ///
    /// See: [`AgentBuilder::timeout`]
    pub fn with_timeout(mut self, timeout: Duration) -> Self {
        self.timeout = Some(timeout);
        self
    }

    /// Timeout for socket connection to a server. Overrides [`WebsiteWalkBuilder::with_timeout`].
    ///
    /// See: [`AgentBuilder::timeout_connect`]
    pub fn with_timeout_connect(mut self, timeout: Duration) -> Self {
        self.timeout_connect = Some(timeout);
        self
    }

    pub(crate) fn build_agent(&self) -> Agent {
        let mut builder = AgentBuilder::new();

        // enable/disable cookie jar
        if self.store_cookies {
            builder = builder.cookie_store(Default::default());
        }

        // set default timeout
        if let Some(timeout) = self.timeout {
            builder = builder.timeout(timeout);
        }

        // set connect timeout override
        if let Some(connect_timeout) = self.timeout_connect {
            builder = builder.timeout_connect(connect_timeout);
        }

        builder.build()
    }

    pub(crate) fn headers(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
        self.ua
            .as_ref()
            .map(|ua| (Self::USER_AGENT, ua.as_ref()))
            .into_iter()
            .chain(self.headers.iter().map(|(k, v)| (k.as_str(), v.as_str())))
    }

    pub fn build(&self, sender: ScriptSender) -> WebsiteWalker {
        WebsiteWalker::new(self, sender)
    }

    pub fn build_with_channel(&self) -> (WebsiteWalker, ScriptReceiver) {
        let (tx, rx) = mpsc::channel();
        let walker = WebsiteWalker::new(self, tx);
        (walker, rx)
    }

    pub fn collect<S: AsRef<str>>(&self, entrypoint: S) -> Result<Vec<Script>> {
        const ACC_INITIAL_CAPACITY: usize = 32;

        let (walker, receiver) = self.build_with_channel();
        let recv_handle = std::thread::spawn(move || {
            receiver
                .into_iter()
                .fold(Vec::with_capacity(ACC_INITIAL_CAPACITY), |mut acc, el| {
                    acc.extend(el);
                    acc
                })
        });
        walker.walk(entrypoint.as_ref())?;

        recv_handle.join().map_err(|e| {
            match e.downcast::<MietteDiagnostic>() {
                Ok(e) => {
                    Error::new_boxed(e)
                },
                Err(e) => {
                    match e.downcast::<String> () {
                        Ok(e) => {
                            Error::msg(e).context(format!("Failed to join script receiver handle while walking '{}'", entrypoint.as_ref()))
                        },
                        Err(_) => {
                            Error::msg(format!("Failed to join script receiver handle while walking '{}': an unknown error occurred", entrypoint.as_ref()))
                        }

                    }
                }
            }
        })
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// A fully configured builder can produce a walker.
    #[test]
    fn test_builder() {
        let (tx, _rx) = mpsc::channel();
        let configured = WebsiteWalkBuilder::default()
            .with_max_walks(20)
            .with_shared_cache(true)
            .with_cookie_jar(true);

        let _walker: WebsiteWalker = configured.build(tx);
    }

    /// The default header set contains a user agent, and setting the user
    /// agent through `with_header` replaces it instead of adding a second one.
    #[test]
    fn test_headers() {
        let builder = WebsiteWalkBuilder::default();
        let default_headers: Vec<_> = builder.headers().collect();

        assert_eq!(default_headers.len(), 6);
        assert!(default_headers
            .iter()
            .any(|&(name, _)| name == "User-Agent"));

        // FIXME: `builder.with_headers` duplicates the UA header
        let builder = builder.with_header("User-Agent", "test");
        assert_eq!(builder.headers().count(), 6);

        let (_, ua_value) = builder
            .headers()
            .find(|&(name, _)| name == "User-Agent")
            .expect("No UA header");
        assert_eq!(ua_value, "test");
    }

    /// `with_random_ua` keeps an existing user agent, removes it on `false`
    /// and generates one when none is present.
    #[test]
    fn test_ua() {
        let builder = WebsiteWalkBuilder::default();

        // by default, walker starts with a random user agent
        assert!(builder.ua.is_some());
        let initial_ua = builder
            .ua
            .clone()
            .expect("Walk builder should start with a random user agent");

        // setting a random ua when one exists is a no-op
        let builder = builder.with_random_ua(true);
        let kept_ua = builder.ua.as_ref().unwrap();
        assert_eq!(
            &initial_ua, kept_ua,
            "with_random_ua should not replace an existing user agent"
        );

        // disabling the random ua clears it
        let builder = builder.with_random_ua(false);
        assert!(
            builder.ua.is_none(),
            "with_random_ua(false) should remove the user agent"
        );

        // setting a random ua when none exists adds one
        let builder = builder.with_random_ua(true);
        assert!(
            builder.ua.is_some(),
            "with_random_ua(true) should add a user agent"
        );
    }
}