use std::char;
use std::ops::Range;
use crate::domains::find_authority_end;
use crate::scanner::Scanner;
/// Minimum byte length the input must have before `DomainScanner::scan` attempts a match;
/// shorter inputs are rejected up front.
const MIN_URL_LENGTH: usize = 4;
/// Quote characters that are remembered when found directly before a scheme or domain, so that
/// `find_url_end` can stop the URL at the next occurrence of the same character.
const QUOTES: &[char] = &['\'', '\"'];
/// Scanner for URLs that carry an explicit scheme.
///
/// Its `Scanner::scan` implementation only matches when the text at the separator index starts
/// with `://`.
pub struct UrlScanner {
/// Forwarded to `find_authority_end` and `find_url_end`. When `false`, `find_url_end` never
/// lets a URL end on a non-ASCII character.
pub iri_parsing_enabled: bool,
}
/// Scanner for URLs written without a scheme, located by walking back from the separator index
/// to the start of a domain.
///
/// NOTE(review): the separator is presumably a `.` inside the domain name; confirm against the
/// code that invokes the scanner.
pub struct DomainScanner {
/// Forwarded to `find_domain_start`, `find_authority_end` and `find_url_end`. When `true`,
/// non-ASCII characters are accepted as part of the domain by `find_domain_start`.
pub iri_parsing_enabled: bool,
}
impl Scanner for UrlScanner {
    /// Scans for a URL whose `://` separator starts at byte index `separator` of `s`.
    ///
    /// Returns the byte range of the URL (from the first character of the scheme to the end of
    /// the path/query), or `None` when no URL is found at this position.
    fn scan(&self, s: &str, separator: usize) -> Option<Range<usize>> {
        const SCHEME_SEPARATOR: &str = "://";

        // The scheme needs at least one character in front of the separator, and only the
        // `scheme://` form is handled here.
        if separator == 0 || !s[separator..].starts_with(SCHEME_SEPARATOR) {
            return None;
        }

        // Something has to follow the separator.
        let rest_offset = separator + SCHEME_SEPARATOR.len();
        if rest_offset >= s.len() {
            return None;
        }

        let (scheme_start, quote) = match find_scheme_start(&s[..separator]) {
            (Some(scheme_start), quote) => (scheme_start, quote),
            (None, _) => return None,
        };
        let host_required = scheme_requires_host(&s[scheme_start..separator]);
        let rest = &s[rest_offset..];

        // Only the first element of the result matters for scheme URLs.
        let authority_len =
            match find_authority_end(rest, true, host_required, true, self.iri_parsing_enabled) {
                (Some(authority_len), _) => authority_len,
                (None, _) => return None,
            };
        let tail_len = find_url_end(&rest[authority_len..], quote, self.iri_parsing_enabled)?;

        // `scheme://` followed by neither an authority nor a path is not a URL.
        if authority_len == 0 && tail_len == 0 {
            return None;
        }

        Some(scheme_start..rest_offset + authority_len + tail_len)
    }
}
impl Scanner for DomainScanner {
    /// Scans for a scheme-less URL around byte index `separator` of `s`.
    ///
    /// The domain start is searched backwards from `separator`; the authority and the optional
    /// path/query are then measured forwards from that start. Returns the byte range of the
    /// match, or `None` when there is none.
    fn scan(&self, s: &str, separator: usize) -> Option<Range<usize>> {
        // Nothing can precede a separator at index 0, and very short inputs are never matched.
        if separator == 0 || s.len() < MIN_URL_LENGTH {
            return None;
        }

        let (start, quote) = match find_domain_start(&s[..separator], self.iri_parsing_enabled) {
            (Some(start), quote) => (start, quote),
            (None, _) => return None,
        };
        let candidate = &s[start..];

        // Unlike the scheme scanner, both elements of the result must be present here.
        let domain_end =
            match find_authority_end(candidate, false, true, true, self.iri_parsing_enabled) {
                (Some(domain_end), Some(_)) => domain_end,
                _ => return None,
            };
        let tail_len = find_url_end(&candidate[domain_end..], quote, self.iri_parsing_enabled)?;

        Some(start..start + domain_end + tail_len)
    }
}
/// Finds the start of a URL scheme by scanning `s` (the text before the `://` separator)
/// backwards.
///
/// Returns `(start, quote)`:
/// * `start` is the byte index of the leftmost ASCII letter in the trailing run of scheme
///   characters (letters, digits, `+`, `-`, `.`), or `None` if there is no usable scheme.
/// * `quote` is the quote character (see `QUOTES`) that terminated the backwards scan, if any.
///
/// An `@` anywhere in the run aborts with `(None, None)`.
///
/// A scheme has to begin with a letter, so a candidate that is directly preceded by a digit is
/// rejected: neither `1abc` nor `12abc` may yield the scheme `abc`. A separator such as `.` in
/// between (`1.http`) is fine.
fn find_scheme_start(s: &str) -> (Option<usize>, Option<char>) {
    let mut first = None;
    let mut quote = None;

    for (i, c) in s.char_indices().rev() {
        match c {
            'a'..='z' | 'A'..='Z' => first = Some(i),
            // Allowed inside a scheme, but a scheme cannot start with any of these.
            '0'..='9' | '+' | '-' | '.' => {}
            '@' => return (None, None),
            c if QUOTES.contains(&c) => {
                // Remember the quote in front of the scheme and stop scanning there.
                quote = Some(c);
                break;
            }
            _ => break,
        }
    }

    if let Some(start) = first {
        // Look at the character right in front of the scheme. Checking only the leftmost digit
        // of the run would miss inputs with several digits such as `12abc`.
        // `start` comes from `char_indices`, so slicing at it is always on a char boundary.
        if matches!(s[..start].chars().next_back(), Some('0'..='9')) {
            return (None, quote);
        }
    }

    (first, quote)
}
/// Returns `true` for the schemes whose URLs must contain a host: `https`, `http`, `ftp` and
/// `ssh`. The comparison is exact and case-sensitive.
fn scheme_requires_host(scheme: &str) -> bool {
    const HOST_REQUIRED_SCHEMES: [&str; 4] = ["https", "http", "ftp", "ssh"];
    HOST_REQUIRED_SCHEMES.iter().any(|&known| known == scheme)
}
/// Finds the start of a domain by scanning `s` (the text before the separator) backwards.
///
/// Returns `(start, quote)`:
/// * `start` is the byte index where the domain label begins, or `None` if there is none.
/// * `quote` is the quote character (see `QUOTES`) that terminated the backwards scan, if any.
///
/// ASCII letters and digits extend the label, as do non-ASCII characters when
/// `iri_parsing_enabled` is set. A `/`, `@` or `.` anywhere in the run aborts with
/// `(None, None)`, and so does a label that ends or starts with `-`.
fn find_domain_start(s: &str, iri_parsing_enabled: bool) -> (Option<usize>, Option<char>) {
    const NOT_FOUND: (Option<usize>, Option<char>) = (None, None);

    let mut label_start: Option<usize> = None;
    let mut opening_quote = None;

    for (idx, ch) in s.char_indices().rev() {
        if ch.is_ascii_alphanumeric() || (iri_parsing_enabled && !ch.is_ascii()) {
            label_start = Some(idx);
        } else if ch == '/' || ch == '@' || ch == '.' {
            return NOT_FOUND;
        } else if ch == '-' {
            // Seen before any label character, the hyphen is the label's last character.
            if label_start.is_none() {
                return NOT_FOUND;
            }
            label_start = Some(idx);
        } else {
            // Any other character ends the scan; a quote is additionally remembered.
            if QUOTES.contains(&ch) {
                opening_quote = Some(ch);
            }
            break;
        }
    }

    match label_start {
        // The label must not begin with a hyphen either.
        Some(idx) if s[idx..].starts_with('-') => NOT_FOUND,
        _ => (label_start, opening_quote),
    }
}
/// Measures how far the path/query part of a URL extends.
///
/// `s` is the text directly after the authority. The result is the byte length of the prefix of
/// `s` that belongs to the URL; it is always `Some`, and `Some(0)` when `s` does not start with
/// `/` or `?`.
///
/// Rules applied while walking over `s`:
/// * Control characters, space, `|`, `"`, `<`, `>`, a backtick and U+007F..=U+009F stop the scan.
/// * The character given as `quote` stops the scan as well.
/// * `?`, `!`, `.`, `,`, `:`, `;` and `*` may appear inside the URL but never as its last
///   character.
/// * `/` may be the last character if the character before it was ASCII, or if
///   `iri_parsing_enabled` is set.
/// * Opening brackets cannot be last; a closing bracket without a matching opening one stops
///   the scan.
/// * A `'` can only be last when it closes a previously opened `'`.
/// * Non-ASCII characters can only be last when `iri_parsing_enabled` is set.
fn find_url_end(s: &str, quote: Option<char>, iri_parsing_enabled: bool) -> Option<usize> {
    /// Maps a bracket character to its slot in the depth table.
    fn bracket_slot(bracket: char) -> usize {
        match bracket {
            '(' | ')' => 0,
            '[' | ']' => 1,
            _ => 2,
        }
    }

    // Only a path or a query directly after the authority is considered part of the URL.
    if !(s.starts_with('/') || s.starts_with('?')) {
        return Some(0);
    }

    // Nesting depth of `()`, `[]` and `{}`, in that order.
    let mut depth = [0i32; 3];
    let mut inside_single_quotes = false;
    let mut prev_is_url_char = true;
    let mut url_end = 0;

    for (offset, ch) in s.char_indices() {
        let may_end_here = match ch {
            '\u{00}'..='\u{1F}' | ' ' | '|' | '\"' | '<' | '>' | '`' | '\u{7F}'..='\u{9F}' => break,
            '?' | '!' | '.' | ',' | ':' | ';' | '*' => false,
            '/' => prev_is_url_char,
            '(' | '[' | '{' => {
                depth[bracket_slot(ch)] += 1;
                false
            }
            ')' | ']' | '}' => {
                let level = &mut depth[bracket_slot(ch)];
                *level -= 1;
                if *level < 0 {
                    // Unbalanced closing bracket: it belongs to the surrounding text.
                    break;
                }
                true
            }
            // Must come before the `'` arm so that a surrounding single quote ends the URL.
            _ if quote == Some(ch) => break,
            '\'' => {
                inside_single_quotes = !inside_single_quotes;
                !inside_single_quotes
            }
            '\u{80}'..=char::MAX if !iri_parsing_enabled => false,
            _ => true,
        };

        if may_end_here {
            url_end = offset + ch.len_utf8();
        }
        prev_is_url_char = ch.is_ascii() || iri_parsing_enabled;
    }

    Some(url_end)
}