use super::*;
use crate::event_handlers::Handles;
use anyhow::{bail, Result};
pub(super) const LINKFINDER_REGEX: &str = r#"(?x)
(?:"|') # Start newline delimiter
(
((?:[a-zA-Z]{1,10}://|//) # Match a scheme [a-Z]*1-10 or //
[^"'/]{1,}\. # Match a domainname (any character + dot)
[a-zA-Z]{2,}[^"']{0,}) # The domainextension and/or path
|
((?:/|\.\./|\./) # Start with /,../,./
[^"'><,;| *()(%%$^/\\\[\]] # Next character can't be...
[^"'><,;|()]{1,}) # Rest of the characters can't be
|
([a-zA-Z0-9_\-/]{1,}/ # Relative endpoint with /
[a-zA-Z0-9_\-/.]{1,} # Resource name
\.(?:[a-zA-Z]{1,4}|action) # Rest + extension (length 1-4 or action)
(?:[\?|\#][^"|']{0,}|)) # ? or # mark with parameters
|
([a-zA-Z0-9_\-/]{1,}/ # REST API (no extension) with /
[a-zA-Z0-9_\-/]{3,} # Proper REST endpoints usually have 3+ chars
(?:[\?|\#][^"|']{0,}|)) # ? or # mark with parameters
|
([a-zA-Z0-9_\-]{1,} # filename
\.(?:php|asp|aspx|jsp|json|
action|html|js|txt|xml) # . + extension
(?:[\?|\#][^"|']{0,}|)) # ? or # mark with parameters
)
(?:"|') # End newline delimiter
"#;
pub(super) const ROBOTS_TXT_REGEX: &str =
r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#;
pub(super) const URL_CHARS_REGEX: &str = r#"["<>\\^`{|} ]"#;
#[derive(Debug, Copy, Clone)]
pub enum ExtractionTarget {
ResponseBody,
RobotsTxt,
DirectoryListing,
}
pub struct ExtractorBuilder<'a> {
response: Option<&'a FeroxResponse>,
url: String,
handles: Option<Arc<Handles>>,
target: ExtractionTarget,
}
impl<'a> Default for ExtractorBuilder<'a> {
fn default() -> Self {
Self {
response: None,
url: "".to_string(),
handles: None,
target: ExtractionTarget::ResponseBody,
}
}
}
impl<'a> ExtractorBuilder<'a> {
pub fn handles(&mut self, handles: Arc<Handles>) -> &mut Self {
self.handles = Some(handles);
self
}
pub fn url(&mut self, url: &str) -> &mut Self {
self.url = url.to_string();
self
}
pub fn target(&mut self, target: ExtractionTarget) -> &mut Self {
self.target = target;
self
}
pub fn response(&mut self, response: &'a FeroxResponse) -> &mut Self {
self.response = Some(response);
self
}
pub fn build(&self) -> Result<Extractor<'a>> {
if (self.url.is_empty() && self.response.is_none()) || self.handles.is_none() {
bail!("Extractor requires a URL or a FeroxResponse be specified as well as a Handles object")
}
Ok(Extractor {
links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
url_regex: Regex::new(URL_CHARS_REGEX).unwrap(),
response: if self.response.is_some() {
Some(self.response.unwrap())
} else {
None
},
url: self.url.to_owned(),
handles: self.handles.as_ref().unwrap().clone(),
target: self.target,
})
}
}