use fancy_regex::Regex;
use idna::domain_to_ascii;
// Lazily-initialised lookup tables and compiled patterns used by the validators in this file.
lazy_static! {
// Dotted identifiers that `is_domain_valid` rejects; the comparison there is
// case-insensitive.
static ref NOT_DOMAINS: Vec<&'static str> = vec![
"System.Collections.IComparer.Compare",
"System.IO",
"System.Management",
"System.Net",
"System.Security"
];
// Default whitelist used by `is_email` when the caller passes `None`; a domain found in
// the whitelist is accepted even if the e-mail regexes do not match.
static ref DOMAIN_WHITELIST: Vec<&'static str> = vec!["localhost"];
// Pattern used by `is_domain`, built by concatenating the fragments below.
// Case-insensitive and anchored at both ends: one or more dot-terminated labels
// (alphanumeric first and last character, up to 61 characters from `[a-zA-Z0-9-_]`
// in between), followed by a final label that starts alphanumeric and ends with a letter.
static ref DOMAIN: Regex = Regex::new(
&[
r"(?i)^(?:[a-zA-Z0-9]", r"(?:[a-zA-Z0-9-_]{0,61}[A-Za-z0-9])?\.)", r"+[A-Za-z0-9][A-Za-z0-9-_]{0,61}", r"[A-Za-z]$", ].join("")
).unwrap();
// Suffix list returned by `tld_download::from_db()`; `is_domain` requires the candidate to
// end with one of these strings.
// NOTE(review): presumably the known TLDs / public suffixes — confirm against `tld_download`.
static ref DOMAINS_EXT: Vec<String> = tld_download::from_db();
// Whole-address pattern used by `is_email`: a local part of `[A-Za-z0-9.+_-]`, an `@`, a host
// of `[A-Za-z0-9._-]` containing at least one dot, and a final label of `[a-zA-Z0-9-]`
// (which may be empty because of the `*`).
static ref EMAIL: Regex = Regex::new(r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z0-9\-]*$").unwrap();
// Host-part pattern used by `is_email` (case-insensitive). Two top-level alternatives:
//   1. one or more dot-terminated labels followed by a final label;
//   2. an IPv4 address in square brackets, anchored with `^` and `$`.
// NOTE(review): the first alternative has no leading `^`, and only its second
// sub-alternative carries `$`, so it can match inside a longer string.
static ref EMAIL_DOMAIN: Regex = Regex::new(
&[
r"(?i)",
r"(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+",
r"(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?$)",
r"|^\[(25[0-5]|2[0-4]\d|[0-1]?\d?\d)",
r"(\.(25[0-5]|2[0-4]\d|[0-1]?\d?\d)){3}\]$",
].join("")
).unwrap();
// Pattern fragment (not a compiled regex): a dot followed by an octet in 0-255.
// Interpolated into `URL` below.
static ref IP_MIDDLE_OCTET: &'static str = r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5]))";
// Pattern fragment (not a compiled regex): a dot followed by an octet in 1-254.
// Interpolated into `URL` below as the final IPv4 octet.
static ref IP_LAST_OCTET: &'static str = r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))";
// URL pattern used by `is_url` and `get_url`, assembled from the fragments below.
// It ends with `$` but has no leading `^`, so a match may start anywhere in the input
// and always runs to the end of it.
static ref URL: Regex = Regex::new(
&[
r"(?i)",
// Mandatory scheme: http, https or ftp.
r"(?:(?:https?|ftp)://)",
// Optional `user[:password]@` prefix.
r"(?:[-a-z\u00a1-\uffff0-9._~%!$&'()*+,;=:]+",
r"(?::[-a-z0-9._~%!$&'()*+,;=:]*)?@)?",
// Host: exactly one of the alternatives up to the closing `)` further down.
r"(?:",
// Named group `private_ip`: 10.x.x.x / 127.x.x.x, 169.254.x.x / 192.168.x.x,
// and 172.16.x.x - 172.31.x.x.
r"(?P<private_ip>",
format!(r"(?:(?:10|127){}{}{})|", *IP_MIDDLE_OCTET, r"{2}", *IP_LAST_OCTET).as_str(),
format!(r"(?:(?:169\.254|192\.168){}{})|", *IP_MIDDLE_OCTET, *IP_LAST_OCTET).as_str(),
format!(r"(?:172\.(?:1[6-9]|2\d|3[0-1]){}{}))", *IP_MIDDLE_OCTET, *IP_LAST_OCTET).as_str(),
r"|",
// Named group `private_host`: the literal `localhost`.
r"(?P<private_host>",
r"(?:localhost))",
r"|",
// Named group `public_ip`: dotted IPv4 whose first octet is in 1-223.
r"(?P<public_ip>",
r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])",
format!("{}{}", *IP_MIDDLE_OCTET, r"{2}").as_str(),
format!("{})", *IP_LAST_OCTET).as_str(),
r"|",
// Bracketed IPv6 literal: full form, `::`-compressed forms, a `fe80:` form with a
// `%zone` suffix, and forms with an embedded dotted IPv4 tail.
r"\[(",
r"([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|",
r"([0-9a-fA-F]{1,4}:){1,7}:|",
r"([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|",
r"([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|",
r"([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|",
r"([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|",
r"([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|",
r"[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|",
r":((:[0-9a-fA-F]{1,4}){1,7}|:)|",
r"fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|",
r"::(ffff(:0{1,4}){0,1}:){0,1}",
r"((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}",
r"(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|",
r"([0-9a-fA-F]{1,4}:){1,4}:",
r"((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}",
r"(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])",
r")\]|",
// Host name: a first label, optional further dot-separated labels, then a final
// label of at least two letters. `\u00a1-\uffff` admits non-ASCII characters.
r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)",
r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*",
r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))",
r")",
// Optional port of two to five digits.
r"(?::\d{2,5})?",
// Optional path, query string and fragment.
r"(?:/[-a-z\u00a1-\uffff0-9._~%!$&'()*+,;=:@/]*)?",
r"(?:\?\S*)?",
r"(?:#\S*)?",
r"$",
].join("")
).unwrap();
}
/// Returns `true` unless `domain` equals one of the entries in `NOT_DOMAINS`.
///
/// Both sides are lower-cased before comparing, so the check is case-insensitive.
fn is_domain_valid(domain: &str) -> bool {
    // Lower-case the candidate once rather than once per deny-list entry.
    let candidate = domain.to_lowercase();
    !NOT_DOMAINS
        .iter()
        .any(|blocked| blocked.to_lowercase() == candidate)
}
/// Checks the part of `domain` that follows its first `.`.
///
/// Returns `false` when that remainder, once lower-cased, contains any character that is
/// neither alphabetic nor a `.`. A value without any `.` is accepted.
fn is_tld_valid(domain: &str) -> bool {
    match domain.splitn(2, '.').nth(1) {
        // Everything after the first dot must consist of letters and dots only.
        Some(suffix) => suffix
            .to_lowercase()
            .chars()
            .all(|ch| ch == '.' || ch.is_alphabetic()),
        // No dot at all: nothing to reject.
        None => true,
    }
}
/// Checks whether `value` looks like an e-mail address.
///
/// The value is split at its last `@`. Both halves are passed through `domain_to_ascii`,
/// and the converted local part may be at most 64 bytes long. The address is accepted when
/// it matches `EMAIL` and its domain matches `EMAIL_DOMAIN`, or when the converted domain
/// is listed in `whitelist`. When `whitelist` is `None`, `DOMAIN_WHITELIST` is used.
pub fn is_email(value: &str, whitelist: Option<Vec<&str>>) -> bool {
    // No '@' (which includes the empty string) can never be an address.
    let at = match value.rfind('@') {
        Some(pos) => pos,
        None => return false,
    };
    let (local, host) = (&value[..at], &value[at + 1..]);
    let allowed = whitelist.unwrap_or_else(|| DOMAIN_WHITELIST.to_vec());

    if !is_tld_valid(host) {
        return false;
    }

    // Convert the local part first; conversion failure or an over-long result rejects.
    let local = match domain_to_ascii(local).ok().filter(|ascii| ascii.len() <= 64) {
        Some(ascii) => ascii,
        None => return false,
    };
    let host = match domain_to_ascii(host) {
        Ok(ascii) => ascii,
        Err(_) => return false,
    };

    let address = format!("{}@{}", local, host);
    let matches_patterns = EMAIL.is_match(&address).unwrap_or(false)
        && EMAIL_DOMAIN.is_match(&host).unwrap_or(false);
    // A whitelisted domain is accepted even when the patterns do not match.
    matches_patterns || allowed.contains(&host.as_str())
}
/// Checks whether `value` looks like a domain name.
///
/// The value is first converted with `domain_to_ascii`. The converted name must match
/// `DOMAIN`, end with one of the `DOMAINS_EXT` suffixes, and pass `is_domain_valid`.
/// A name containing at most one `.` must additionally be longer than seven bytes.
pub fn is_domain(value: &str) -> bool {
    let ascii = match domain_to_ascii(value) {
        Ok(converted) => converted,
        Err(_) => return false,
    };

    let well_formed = DOMAIN.is_match(&ascii).unwrap_or(false)
        && DOMAINS_EXT.iter().any(|ext| ascii.ends_with(ext.as_str()));
    if !well_formed {
        return false;
    }

    // Names with two or more dots skip the length check; shorter ones must exceed 7 bytes.
    let dots = ascii.matches('.').count();
    (dots > 1 || ascii.len() > 7) && is_domain_valid(&ascii)
}
/// Returns `true` when the `URL` pattern matches `value`.
///
/// A regex engine error is treated as "no match".
/// NOTE(review): `URL` has no leading `^`, so a URL preceded by other text also matches —
/// confirm this is intended.
pub fn is_url(value: &str) -> bool {
    match URL.is_match(value) {
        Ok(matched) => matched,
        Err(_) => false,
    }
}
/// Returns the first substring of `value` matched by the `URL` pattern.
///
/// Returns `None` when nothing matches. A regex engine error is also reported as `None`
/// rather than panicking, which mirrors how `is_url` treats such errors.
pub fn get_url(value: &str) -> Option<String> {
    URL.find(value)
        // Runtime regex failure on caller-supplied input is not a bug in this crate.
        .ok()
        .and_then(|found| found)
        .map(|hit| hit.as_str().to_string())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `check` yields `expected` for every entry of `inputs`.
    fn assert_all<F: Fn(&str) -> bool>(check: F, inputs: &[&str], expected: bool) {
        for &input in inputs {
            assert_eq!(check(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_is_tld_valid() {
        assert_all(
            is_tld_valid,
            &["example.com", "example.co.uk", "example.io", "清华大学.cn"],
            true,
        );
        assert_all(
            is_tld_valid,
            &["example.co9.uk", "example.i1o", "清华大学.cn9"],
            false,
        );
    }

    #[test]
    fn test_is_email() {
        assert_all(
            |addr| is_email(addr, None),
            &[
                "doe_john@example.com",
                "doe-john@example.com",
                "doe.john@example.com",
                "johndoe@example.com",
                "doe.john1940@example.com",
                "johndoe@example.co.uk",
                "johndoe@транспорт.com",
                "johndoe@清华大学.cn",
                "Маниш@Ашок.Индия",
                "अ@अशोका.भारत",
                "johndoe@localhost",
            ],
            true,
        );
        assert_all(
            |addr| is_email(addr, None),
            &[
                "johndoe@nonsupporteddomain",
                "johndoe@example.c9om",
                "johndoe@example.co9.uk",
                "johndoe@example.co1.u9k",
                "johndoe@清华大学.cn9",
                "john doe@example.com",
            ],
            false,
        );
    }

    #[test]
    fn test_is_domain() {
        assert_all(
            is_domain,
            &[
                "example.com",
                "www.example.co.uk",
                "www.v2.example.co.uk",
                "清华大学.cn",
                "अशोका.भारत",
            ],
            true,
        );
        assert_all(
            is_domain,
            &[
                "example.c1om",
                "@example.com",
                "http://www.транспорт.com",
                "https://www.example.com",
                "example.com invalid",
                "kernel32.DLL",
            ],
            false,
        );
    }

    #[test]
    fn test_is_url() {
        assert_all(
            is_url,
            &[
                "https://www.example.com",
                "https://example.com",
                "https://example.co.uk",
                "http://www.example.co.uk",
                "https://localhost:8443",
                "http://localhost",
                "http://清华大学.cn",
            ],
            true,
        );
        assert_all(is_url, &["abc.com", "localhost", "localhost:9455"], false);
    }

    #[test]
    fn test_get_url() {
        // Inputs that are URLs in their entirety come back unchanged.
        let unchanged = [
            "https://www.example.com",
            "https://example.com",
            "https://example.co.uk",
            "http://www.example.co.uk",
            "https://localhost:8443",
            "http://localhost",
            "http://清华大学.cn",
        ];
        for &url in unchanged.iter() {
            assert_eq!(get_url(url), Some(url.to_string()), "input: {:?}", url);
        }
        // Leading non-URL text is skipped.
        assert_eq!(
            get_url("foo:https://example.com"),
            Some("https://example.com".to_string())
        );
        assert_eq!(get_url("abc.com"), None);
    }
}