1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
use crate::{
    client,
    config::{Configuration, CONFIGURATION},
    scanner::SCANNED_URLS,
    statistics::{
        StatCommand::{self, UpdateUsizeField},
        StatField::{LinksExtracted, TotalExpected},
    },
    utils::{format_url, make_request},
    FeroxResponse,
};
use lazy_static::lazy_static;
use regex::Regex;
use reqwest::Url;
use std::collections::HashSet;
use tokio::sync::mpsc::UnboundedSender;

/// Regular expression used in [LinkFinder](https://github.com/GerbenJavado/LinkFinder)
///
/// Incorporates change from this [Pull Request](https://github.com/GerbenJavado/LinkFinder/pull/66/files)
const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-.]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')"#;

/// Regular expression to pull url paths from robots.txt
///
/// Captures the path portion of `Allow:`/`Disallow:` directives into the
/// `url_path` named group, one directive per line
///
/// ref: https://developers.google.com/search/reference/robots_txt
const ROBOTS_TXT_REGEX: &str =
    r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // multi-line (?m)

lazy_static! {
    /// `LINKFINDER_REGEX` as a regex::Regex type
    ///
    /// the `unwrap` is on a hard-coded pattern; a panic here indicates a typo
    /// in the constant above, not a runtime condition
    static ref LINKS_REGEX: Regex = Regex::new(LINKFINDER_REGEX).unwrap();

    /// `ROBOTS_TXT_REGEX` as a regex::Regex type
    static ref ROBOTS_REGEX: Regex = Regex::new(ROBOTS_TXT_REGEX).unwrap();
}

/// Iterate over a given path, return a list of every sub-path found
///
/// example: `path` contains a link fragment `homepage/assets/img/icons/handshake.svg`
/// the following fragments would be returned:
///   - homepage/assets/img/icons/handshake.svg
///   - homepage/assets/img/icons/
///   - homepage/assets/img/
///   - homepage/assets/
///   - homepage/
///
/// An empty `path` (or one consisting solely of slashes) yields an empty Vec.
fn get_sub_paths_from_path(path: &str) -> Vec<String> {
    log::trace!("enter: get_sub_paths_from_path({})", path);

    // filter out any empty strings caused by .split (leading/trailing/double slashes)
    let parts: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();

    let mut paths = Vec::with_capacity(parts.len());

    // walk from the longest prefix (the full path) down to the shortest (the
    // top-level folder); original implementation popped from the back of `parts`
    // and re-joined each time, guarded by two unreachable emptiness checks —
    // prefix slices express the same result without the dead branches
    for end in (1..=parts.len()).rev() {
        let sub_path = parts[..end].join("/");

        if end == parts.len() {
            // deepest entry may be a file (ex: /buried/misc/stupidfile.php);
            // emit it as-is, without a trailing slash
            paths.push(sub_path);
        } else {
            // every parent folder gets a trailing slash
            paths.push(format!("{}/", sub_path));
        }
    }

    log::trace!("exit: get_sub_paths_from_path -> {:?}", paths);
    paths
}

/// simple helper to stay DRY; tries to join `url` + `link` and, on success,
/// inserts the resulting absolute url (as a String) into the `links` HashSet
///
/// join failures (ex: a fragment that is not a valid relative reference) are
/// logged and otherwise ignored — the set is simply left unchanged
fn add_link_to_set_of_links(link: &str, url: &Url, links: &mut HashSet<String>) {
    log::trace!(
        "enter: add_link_to_set_of_links({}, {}, {:?})",
        link,
        url.to_string(),
        links
    );

    // `link` is already a &str; the original passed `&link` (a needless &&str borrow)
    match url.join(link) {
        Ok(new_url) => {
            links.insert(new_url.to_string());
        }
        Err(e) => {
            log::error!("Could not join given url to the base url: {}", e);
        }
    }

    log::trace!("exit: add_link_to_set_of_links");
}

/// Given a `FeroxResponse`, perform the following actions
///   - parse the response's text for links using the linkfinder regex
///   - for every link found take its url path and parse each sub-path
///     - example: Response contains a link fragment `homepage/assets/img/icons/handshake.svg`
///       with a base url of http://localhost, the following urls would be returned:
///         - homepage/assets/img/icons/handshake.svg
///         - homepage/assets/img/icons/
///         - homepage/assets/img/
///         - homepage/assets/
///         - homepage/
///
/// Also reports the number of extracted links (and the number of expected
/// requests derived from them) to the statistics handler via `tx_stats`.
pub async fn get_links(
    response: &FeroxResponse,
    tx_stats: UnboundedSender<StatCommand>,
) -> HashSet<String> {
    log::trace!(
        "enter: get_links({}, {:?})",
        response.url().as_str(),
        tx_stats
    );

    let mut links = HashSet::<String>::new();

    let body = response.text();

    for capture in LINKS_REGEX.captures_iter(&body) {
        // remove single & double quotes from both ends of the capture
        // capture[0] is the entire match, additional capture groups start at [1]
        let link = capture[0].trim_matches(|c| c == '\'' || c == '"');

        match Url::parse(link) {
            Ok(absolute) => {
                if absolute.domain() != response.url().domain()
                    || absolute.host() != response.url().host()
                {
                    // domains/ips are not the same, don't scan things that aren't part of the original
                    // target url
                    continue;
                }

                // same host as the target; add the path and all of its parents
                // (`response` is already a reference; no extra borrow needed)
                add_all_sub_paths(absolute.path(), response, &mut links);
            }
            Err(e) => {
                // this is the expected error that happens when we try to parse a url fragment
                //     ex: Url::parse("/login") -> Err("relative URL without a base")
                // while this is technically an error, these are good results for us
                if e.to_string().contains("relative URL without a base") {
                    add_all_sub_paths(link, response, &mut links);
                } else {
                    // unexpected error has occurred
                    log::error!("Could not parse given url: {}", e);
                }
            }
        }
    }

    // each extracted link will be requested once per configured extension,
    // or exactly once when no extensions are configured
    let multiplier = CONFIGURATION.extensions.len().max(1);

    update_stat!(tx_stats, UpdateUsizeField(LinksExtracted, links.len()));
    update_stat!(
        tx_stats,
        UpdateUsizeField(TotalExpected, links.len() * multiplier)
    );

    log::trace!("exit: get_links -> {:?}", links);

    links
}

/// take a url fragment like homepage/assets/img/icons/handshake.svg and
/// incrementally add
///     - homepage/assets/img/icons/handshake.svg
///     - homepage/assets/img/icons/
///     - homepage/assets/img/
///     - homepage/assets/
///     - homepage/
/// to `links`, joining each fragment against the response's base url
fn add_all_sub_paths(url_path: &str, response: &FeroxResponse, links: &mut HashSet<String>) {
    log::trace!(
        "enter: add_all_sub_paths({}, {}, {:?})",
        url_path,
        response,
        links
    );

    // `links` is already a &mut reference; the original re-declared the binding
    // `mut` and re-borrowed `&mut links` at the call site — both unnecessary
    for sub_path in get_sub_paths_from_path(url_path) {
        log::debug!("Adding {} to {:?}", sub_path, links);
        add_link_to_set_of_links(&sub_path, &response.url(), links);
    }

    log::trace!("exit: add_all_sub_paths");
}

/// Wrapper around link extraction logic
/// currently used in two places:
///   - links from response bodys
///   - links from robots.txt responses
///
/// general steps taken:
///   - create a new Url object based on cli options/args
///   - check if the new Url has already been seen/scanned -> None
///   - make a request to the new Url ? -> Some(response) : None
pub async fn request_feroxresponse_from_new_link(
    url: &str,
    tx_stats: UnboundedSender<StatCommand>,
) -> Option<FeroxResponse> {
    log::trace!(
        "enter: request_feroxresponse_from_new_link({}, {:?})",
        url,
        tx_stats
    );

    // create a url based on the given command line options, return None on error
    // (`url` is already a &str and "" needs no borrow — dropped the needless `&`s)
    let new_url = match format_url(
        url,
        "",
        CONFIGURATION.add_slash,
        &CONFIGURATION.queries,
        None,
        tx_stats.clone(),
    ) {
        Ok(url) => url,
        Err(_) => {
            log::trace!("exit: request_feroxresponse_from_new_link -> None");
            return None;
        }
    };

    if SCANNED_URLS.get_scan_by_url(&new_url.to_string()).is_some() {
        // we've seen the url before and don't need to scan again
        log::trace!("exit: request_feroxresponse_from_new_link -> None");
        return None;
    }

    // make the request and store the response
    let new_response = match make_request(&CONFIGURATION.client, &new_url, tx_stats).await {
        Ok(resp) => resp,
        Err(_) => {
            log::trace!("exit: request_feroxresponse_from_new_link -> None");
            return None;
        }
    };

    let new_ferox_response = FeroxResponse::from(new_response, true).await;

    log::trace!(
        "exit: request_feroxresponse_from_new_link -> {:?}",
        new_ferox_response
    );
    Some(new_ferox_response)
}

/// helper function that simply requests /robots.txt on the given url's base url
///
/// example:
///     http://localhost/api/users -> http://localhost/robots.txt
///
/// The length of the given path has no effect on what's requested; it's always
/// base url + /robots.txt
///
/// Returns None when `base_url` doesn't parse or the request itself fails.
pub async fn request_robots_txt(
    base_url: &str,
    config: &Configuration,
    tx_stats: UnboundedSender<StatCommand>,
) -> Option<FeroxResponse> {
    // NOTE: trace messages previously referenced this function's old name
    // (`get_robots_file`); corrected to match the actual function name
    log::trace!(
        "enter: request_robots_txt({}, CONFIGURATION, {:?})",
        base_url,
        tx_stats
    );

    // more often than not, domain/robots.txt will redirect to www.domain/robots.txt or something
    // similar; to account for that, create a client that will follow redirects, regardless of
    // what the user specified for the scanning client. Other than redirects, it will respect
    // all other user specified settings
    let follow_redirects = true;

    let proxy = if config.proxy.is_empty() {
        None
    } else {
        Some(config.proxy.as_str())
    };

    let client = client::initialize(
        config.timeout,
        &config.user_agent,
        follow_redirects,
        config.insecure,
        &config.headers,
        proxy,
    );

    if let Ok(mut url) = Url::parse(base_url) {
        url.set_path("/robots.txt"); // overwrite existing path with /robots.txt

        if let Ok(response) = make_request(&client, &url, tx_stats).await {
            let ferox_response = FeroxResponse::from(response, true).await;

            log::trace!("exit: request_robots_txt -> {}", ferox_response);
            return Some(ferox_response);
        }
    }

    // either the url failed to parse or the request failed; trace the exit so
    // both return paths are observable (the None path previously logged nothing)
    log::trace!("exit: request_robots_txt -> None");
    None
}

/// Entry point to perform link extraction from robots.txt
///
/// `base_url` can have paths and subpaths, however robots.txt will be requested from the
/// root of the url
/// given the url:
///     http://localhost/stuff/things
/// this function requests:
///     http://localhost/robots.txt
///
/// Every Allow/Disallow path found (and each of its parent folders) is joined
/// against `base_url` and returned; link/expected-request counts are reported
/// to the statistics handler via `tx_stats`.
pub async fn extract_robots_txt(
    base_url: &str,
    config: &Configuration,
    tx_stats: UnboundedSender<StatCommand>,
) -> HashSet<String> {
    log::trace!(
        "enter: extract_robots_txt({}, CONFIGURATION, {:?})",
        base_url,
        tx_stats
    );
    let mut links = HashSet::new();

    // `base_url` and `config` are already references; dropped the needless `&`s
    if let Some(response) = request_robots_txt(base_url, config, tx_stats.clone()).await {
        for capture in ROBOTS_REGEX.captures_iter(response.text.as_str()) {
            if let Some(new_path) = capture.name("url_path") {
                if let Ok(mut new_url) = Url::parse(base_url) {
                    new_url.set_path(new_path.as_str());
                    add_all_sub_paths(new_url.path(), &response, &mut links);
                }
            }
        }
    }

    // each extracted link will be requested once per configured extension,
    // or exactly once when no extensions are configured
    let multiplier = CONFIGURATION.extensions.len().max(1);

    update_stat!(tx_stats, UpdateUsizeField(LinksExtracted, links.len()));
    update_stat!(
        tx_stats,
        UpdateUsizeField(TotalExpected, links.len() * multiplier)
    );

    log::trace!("exit: extract_robots_txt -> {:?}", links);
    links
}

#[cfg(test)]
mod tests {
    //! Unit tests for the extractor helpers; the async tests stand up a local
    //! `httpmock::MockServer`, so no external network access is required
    use super::*;
    use crate::utils::make_request;
    use crate::FeroxChannel;
    use httpmock::Method::GET;
    use httpmock::MockServer;
    use reqwest::Client;
    use tokio::sync::mpsc;

    #[test]
    /// extract sub paths from the given url fragment; expect 5 sub paths and that all are
    /// in the expected array
    fn extractor_get_sub_paths_from_path_with_multiple_paths() {
        let path = "homepage/assets/img/icons/handshake.svg";
        let paths = get_sub_paths_from_path(&path);
        // the deepest entry (the file) has no trailing slash; parent folders do
        let expected = vec![
            "homepage/",
            "homepage/assets/",
            "homepage/assets/img/",
            "homepage/assets/img/icons/",
            "homepage/assets/img/icons/handshake.svg",
        ];

        assert_eq!(paths.len(), expected.len());
        for expected_path in expected {
            assert_eq!(paths.contains(&expected_path.to_string()), true);
        }
    }

    #[test]
    /// extract sub paths from the given url fragment; expect 2 sub paths and that all are
    /// in the expected array. the fragment is wrapped in slashes to ensure no empty strings are
    /// returned
    fn extractor_get_sub_paths_from_path_with_enclosing_slashes() {
        let path = "/homepage/assets/";
        let paths = get_sub_paths_from_path(&path);
        let expected = vec!["homepage/", "homepage/assets"];

        assert_eq!(paths.len(), expected.len());
        for expected_path in expected {
            assert_eq!(paths.contains(&expected_path.to_string()), true);
        }
    }

    #[test]
    /// extract sub paths from the given url fragment; expect 1 sub path, no forward slashes are
    /// included
    fn extractor_get_sub_paths_from_path_with_only_a_word() {
        let path = "homepage";
        let paths = get_sub_paths_from_path(&path);
        let expected = vec!["homepage"];

        assert_eq!(paths.len(), expected.len());
        for expected_path in expected {
            assert_eq!(paths.contains(&expected_path.to_string()), true);
        }
    }

    #[test]
    /// extract sub paths from the given url fragment; expect 1 sub path, forward slash removed
    fn extractor_get_sub_paths_from_path_with_an_absolute_word() {
        let path = "/homepage";
        let paths = get_sub_paths_from_path(&path);
        let expected = vec!["homepage"];

        assert_eq!(paths.len(), expected.len());
        for expected_path in expected {
            assert_eq!(paths.contains(&expected_path.to_string()), true);
        }
    }

    #[test]
    /// test that a full url and fragment are joined correctly, then added to the given list
    /// i.e. the happy path
    fn extractor_add_link_to_set_of_links_happy_path() {
        let url = Url::parse("https://localhost").unwrap();
        let mut links = HashSet::<String>::new();
        let link = "admin";

        assert_eq!(links.len(), 0);
        add_link_to_set_of_links(link, &url, &mut links);

        assert_eq!(links.len(), 1);
        assert!(links.contains("https://localhost/admin"));
    }

    #[test]
    /// test that an invalid path fragment doesn't add anything to the set of links
    fn extractor_add_link_to_set_of_links_with_non_base_url() {
        let url = Url::parse("https://localhost").unwrap();
        let mut links = HashSet::<String>::new();
        // backslashes are not a valid relative url reference, so the join fails
        let link = "\\\\\\\\";

        assert_eq!(links.len(), 0);
        add_link_to_set_of_links(link, &url, &mut links);

        assert_eq!(links.len(), 0);
        assert!(links.is_empty());
    }

    #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
    /// use make_request to generate a Response, and use the Response to test get_links;
    /// the response will contain an absolute path to a domain that is not part of the scanned
    /// domain; expect an empty set returned
    async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain(
    ) -> Result<(), Box<dyn std::error::Error>> {
        let srv = MockServer::start();

        let mock = srv.mock(|when, then|{
            when.method(GET)
                .path("/some-path");
            then.status(200)
                .body("\"http://defintely.not.a.thing.probably.com/homepage/assets/img/icons/handshake.svg\"");
        });

        let client = Client::new();
        let url = Url::parse(&srv.url("/some-path")).unwrap();
        let (tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();

        let response = make_request(&client, &url, tx.clone()).await.unwrap();

        let ferox_response = FeroxResponse::from(response, true).await;

        let links = get_links(&ferox_response, tx).await;

        // the only link in the body is on a foreign host, so nothing is extracted
        assert!(links.is_empty());

        assert_eq!(mock.hits(), 1);
        Ok(())
    }

    #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
    /// test that /robots.txt is correctly requested given a base url (happy path)
    async fn request_robots_txt_with_and_without_proxy() {
        let srv = MockServer::start();

        let mock = srv.mock(|when, then| {
            when.method(GET).path("/robots.txt");
            then.status(200).body("this is a test");
        });

        let mut config = Configuration::default();

        let (tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();

        request_robots_txt(&srv.url("/api/users/stuff/things"), &config, tx.clone()).await;

        // note: the proxy doesn't actually do anything other than hit a different code branch
        // in this unit test; it would however have an effect on an integration test
        config.proxy = srv.url("/ima-proxy");

        request_robots_txt(&srv.url("/api/different/path"), &config, tx).await;

        assert_eq!(mock.hits(), 2);
    }
}