#![cfg_attr(all(test, feature = "bench"), feature(test))]
#[cfg(test)]
mod tests;
/// URL schemes the parser is able to recognize.
///
/// `Parser::scheme_indices` holds one progress counter per entry, in the same order, so the
/// length of this array and of that field must stay in sync.
pub(crate) const SCHEMES: [&str; 8] =
["http", "https", "mailto", "news", "file", "git", "ssh", "ftp"];
/// Characters which are tracked as pairs while parsing: round brackets, square brackets and
/// single quotes.
///
/// `Parser::advance` checks every incoming character against these entries before it runs the
/// URL state machine.
const SURROUND_CHARACTERS: [SurroundCharacter; 3] = [
SurroundCharacter::Bracket('(', ')'),
SurroundCharacter::Bracket('[', ']'),
SurroundCharacter::Quote('\''),
];
/// Internal position of the URL state machine.
///
/// The parser consumes its input back to front, so the states are ordered from the end of a
/// URL towards its scheme.
#[derive(Debug, PartialEq)]
enum State {
    /// Nothing has been accepted yet, or the parser has just been reset.
    Default,
    /// At least one character has been accepted; waiting for a `/`.
    Path,
    /// The most recently consumed character was a `/`.
    SchemeFirstSlash,
    /// The two most recently consumed characters were both `/`.
    SchemeSecondSlash,
    /// A `:` was consumed right after the slashes; incoming letters are now compared against
    /// the known schemes.
    Scheme,
}

impl Default for State {
    /// A fresh state machine starts out in [`State::Default`].
    #[inline]
    fn default() -> Self {
        Self::Default
    }
}
/// Result reported by `Parser::advance` after each consumed character.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ParserState {
/// A URL has been recognized. The payload is its length, counted in characters (one per
/// `advance` call), not in bytes.
Url(usize),
/// No URL has been recognized yet, but the characters consumed so far have not ruled one out.
MaybeUrl,
/// The consumed characters cannot be part of a URL; the parser has been reset.
NoUrl,
}
/// Incremental URL parser which is fed one character at a time through `advance`.
#[derive(Default)]
pub struct Parser {
// Per-scheme match progress, indexed like `SCHEMES`: the number of scheme characters matched
// so far, or a value greater than the scheme's length once that scheme has been ruled out.
pub(crate) scheme_indices: [u8; 8],
// Current position of the URL state machine.
pub(crate) state: State,
// Pending surround characters: the character that would complete the pair, together with the
// value of `len` at the moment its counterpart was consumed.
surround_states: Vec<(char, usize)>,
// Number of characters consumed since the last reset.
len: usize,
}
impl Parser {
    /// Creates a parser in its initial state.
    #[inline]
    pub fn new() -> Self {
        Self::default()
    }

    /// Consumes one character and reports what is known about the input so far.
    ///
    /// Characters have to be supplied back to front: the last character of a potential URL
    /// first, the first letter of its scheme last. Schemes are therefore matched in reverse.
    ///
    /// Returns
    /// - `ParserState::Url(n)` as soon as one of the `SCHEMES` has been completed. `n` is the
    ///   number of characters consumed since the last reset, minus everything consumed up to
    ///   and including the most recent surround character that is still unmatched. The parser
    ///   is reset afterwards.
    /// - `ParserState::NoUrl` whenever the parser had to reset itself.
    /// - `ParserState::MaybeUrl` in every other case.
    #[inline]
    pub fn advance(&mut self, c: char) -> ParserState {
        self.len += 1;

        if is_illegal(c) {
            self.reset();
            return ParserState::NoUrl;
        }

        // Whether `c` completes a pending pair does not depend on which surround character is
        // being inspected, so the lookup is done once instead of once per loop iteration.
        let pending = self.surround_states.iter().rposition(|&(expected, _)| expected == c);

        if let Some(index) = pending {
            // Only accept the pair if at least one character sits between its two halves.
            if self.surround_states[index].1 + 1 < self.len {
                self.surround_states.remove(index);
                return ParserState::MaybeUrl;
            }
        }

        for surround_char in &SURROUND_CHARACTERS {
            // First half of a new pair: remember which character completes it.
            if pending.is_none() && *surround_char.start() == c {
                self.surround_states.push((*surround_char.end(), self.len));
                return ParserState::MaybeUrl;
            }

            // Second half without a usable first half: nothing before it can belong to a URL.
            if *surround_char.end() == c {
                self.reset();
                return ParserState::NoUrl;
            }
        }

        match self.state {
            State::Default => self.advance_default(c),
            State::Path => self.advance_path(c),
            State::SchemeFirstSlash => self.advance_scheme_first_slash(c),
            State::SchemeSecondSlash => self.advance_scheme_second_slash(c),
            State::Scheme => {
                if let Some(length) = self.advance_scheme(c) {
                    self.reset();
                    return ParserState::Url(length);
                }
            },
        }

        // `len` is zero exactly when one of the state handlers has reset the parser.
        if self.len == 0 {
            ParserState::NoUrl
        } else {
            ParserState::MaybeUrl
        }
    }

    /// Discards all progress and returns the parser to its initial state.
    #[inline]
    pub fn reset(&mut self) {
        self.surround_states.clear();
        self.scheme_indices = [0; 8];
        self.state = State::Default;
        self.len = 0;
    }

    /// Handles the first character after a reset.
    ///
    /// Punctuation that is not accepted as the final character of a URL resets the parser;
    /// anything else starts the path.
    #[inline]
    fn advance_default(&mut self, c: char) {
        match c {
            '.' | ',' | ':'..=';' | '?' | '!' | '(' => self.reset(),
            _ => self.state = State::Path,
        }
    }

    /// Stays in the path until a `/` shows up.
    #[inline]
    fn advance_path(&mut self, c: char) {
        if c == '/' {
            self.state = State::SchemeFirstSlash
        }
    }

    /// One `/` has been seen: a second one moves on, anything else is ordinary path content.
    #[inline]
    fn advance_scheme_first_slash(&mut self, c: char) {
        if c == '/' {
            self.state = State::SchemeSecondSlash;
        } else {
            self.state = State::Path;
        }
    }

    /// Two or more consecutive `/` have been seen: a `:` starts the scheme.
    ///
    /// Additional slashes keep the parser in this state, because the two most recent
    /// characters are still `//`. Falling back to `Path` here would make it impossible to
    /// recognize URLs with an empty authority such as `file:///etc/hosts`.
    #[inline]
    fn advance_scheme_second_slash(&mut self, c: char) {
        match c {
            ':' => self.state = State::Scheme,
            '/' => (),
            _ => self.state = State::Path,
        }
    }

    /// Matches `c` against every scheme which is still a candidate.
    ///
    /// Returns the length of the URL once a scheme has been matched completely. A character
    /// that is not an ASCII letter resets the parser.
    #[inline]
    fn advance_scheme(&mut self, c: char) -> Option<usize> {
        if !c.is_ascii_alphabetic() {
            self.reset();
            return None;
        }

        // Schemes are case-insensitive, while `SCHEMES` only lists the lowercase spelling.
        let c = c.to_ascii_lowercase();

        for (scheme, index) in SCHEMES.iter().zip(self.scheme_indices.iter_mut()) {
            let scheme_len = scheme.len() as u8;

            // Either already ruled out or (theoretically) already complete.
            if *index >= scheme_len {
                continue;
            }

            if scheme.chars().rev().nth(*index as usize) != Some(c) {
                // Mark the scheme as ruled out.
                *index = scheme_len + 1;
            } else {
                *index += 1;
            }

            // Returning on the first completed scheme means leading extra letters are ignored.
            if *index == scheme_len {
                // Drop everything up to and including the last unmatched surround character.
                self.len -= self.surround_states.last().map(|s| s.1).unwrap_or(0);
                return Some(self.len);
            }
        }

        None
    }
}
/// Reports whether `c` belongs to the set of characters that are rejected outright.
///
/// The set consists of the C0 and C1 control characters together with DEL, the space, and a
/// handful of ASCII and Unicode delimiters.
#[inline]
fn is_illegal(c: char) -> bool {
    match c {
        // C0 controls, DEL and C1 controls.
        '\u{00}'..='\u{1F}' | '\u{7F}'..='\u{9F}' => true,
        // Whitespace and quoting characters.
        ' ' | '"' | '`' => true,
        // Angle brackets, ASCII and mathematical.
        '<' | '>' | '⟨' | '⟩' => true,
        // Curly braces and the vertical bar between them.
        '{' | '|' | '}' => true,
        // Remaining ASCII symbols.
        '\\' | '^' => true,
        _ => false,
    }
}
/// A character, or pair of characters, that can enclose a piece of text.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum SurroundCharacter {
    /// Two distinct characters. `end()` returns the first field and `start()` the second, so
    /// for `Bracket('(', ')')` the start is `)` and the end is `(`.
    Bracket(char, char),
    /// A single character that serves as both start and end.
    Quote(char),
}

impl SurroundCharacter {
    /// Returns the character that begins the pair: the second field of a `Bracket`, or the
    /// quote character itself.
    #[inline]
    fn start(&self) -> &char {
        match self {
            SurroundCharacter::Bracket(_end, start) => start,
            SurroundCharacter::Quote(quote) => quote,
        }
    }

    /// Returns the character that completes the pair: the first field of a `Bracket`, or the
    /// quote character itself.
    #[inline]
    fn end(&self) -> &char {
        match self {
            SurroundCharacter::Bracket(end, _start) => end,
            SurroundCharacter::Quote(quote) => quote,
        }
    }
}