use std::ops::RangeInclusive;
use nom::{
branch::alt,
bytes::complete::{tag, take_while, take_while1, take_while_m_n},
character::complete::char,
combinator::{opt, recognize},
multi::{many0, many1},
sequence::{pair, tuple},
IResult, Slice,
};
use crate::parser::{
link_url::{
ip::{ip_literal::ip_literal, ipv4::ipv4},
LinkDestination,
},
parse_from_text::base_parsers::CustomError,
utils::{
is_alpha, is_digit, is_hex_digit, is_in_one_of_ranges, is_not_white_space, is_sub_delim,
is_unreserved,
},
};
use super::{
allowed_tlds::check_if_tld_is_allowed,
parenthesis_counter::count_chars_in_complete_parenthesis,
punycode_warning::get_puny_code_warning,
};
/// Returns `true` when `scheme` is one of the whitelisted generic schemes.
///
/// The comparison is ASCII case-insensitive, so `"MAILTO"` and `"mailto"`
/// are treated alike. The scheme must be passed without its separator.
fn is_allowed_generic_scheme(scheme: &str) -> bool {
    const ALLOWED_GENERIC_SCHEMES: [&str; 12] = [
        "mailto",
        "news",
        "feed",
        "tel",
        "sms",
        "geo",
        "maps",
        "bitcoin",
        "bitcoincash",
        "eth",
        "ethereum",
        "magnet",
    ];
    // `eq_ignore_ascii_case` compares in place, so no lowercased copy of
    // `scheme` has to be allocated for every candidate link.
    ALLOWED_GENERIC_SCHEMES
        .iter()
        .any(|allowed| scheme.eq_ignore_ascii_case(allowed))
}
/// Code point ranges accepted by [`is_ucschar`].
///
/// The values correspond to the `ucschar` production of RFC 3987.
const UCSCHAR_RANGES: [RangeInclusive<u32>; 17] = [
    0x00A0..=0xD7FF,
    0xF900..=0xFDCF,
    0xFDF0..=0xFFEF,
    0x1_0000..=0x1_FFFD,
    0x2_0000..=0x2_FFFD,
    0x3_0000..=0x3_FFFD,
    0x4_0000..=0x4_FFFD,
    0x5_0000..=0x5_FFFD,
    0x6_0000..=0x6_FFFD,
    0x7_0000..=0x7_FFFD,
    0x8_0000..=0x8_FFFD,
    0x9_0000..=0x9_FFFD,
    0xA_0000..=0xA_FFFD,
    0xB_0000..=0xB_FFFD,
    0xC_0000..=0xC_FFFD,
    0xD_0000..=0xD_FFFD,
    0xE_1000..=0xE_FFFD,
];
/// Checks the code point of `c` against [`UCSCHAR_RANGES`] via
/// `is_in_one_of_ranges`.
fn is_ucschar(c: char) -> bool {
    let code_point = u32::from(c);
    is_in_one_of_ranges(code_point, &UCSCHAR_RANGES[..])
}
/// `iunreserved` test: accepts `c` when `is_unreserved` or `is_ucschar` does.
fn is_iunreserved(c: char) -> bool {
    // `is_unreserved` is consulted first; `is_ucschar` only runs when it says no.
    if is_unreserved(c) {
        return true;
    }
    is_ucschar(c)
}
/// Returns `true` for characters allowed after the first character of a
/// scheme: anything accepted by `is_alpha`, `is_digit` or `is_other_scheme`.
fn is_scheme(c: char) -> bool {
    // Checked in this order: alpha, digit, then the punctuation set.
    if is_alpha(c) || is_digit(c) {
        return true;
    }
    is_other_scheme(c)
}
/// Returns `true` for the punctuation allowed inside a scheme: `+`, `-`, `.`.
fn is_other_scheme(c: char) -> bool {
    ['+', '-', '.'].contains(&c)
}
/// Character test for the non-percent-encoded part of an ireg-name (the
/// registered-name form of a host); currently identical to `is_iunreserved`.
fn is_ireg_name_not_pct_encoded(c: char) -> bool {
is_iunreserved(c)
}
/// Parse the host part of an authority.
///
/// `ip_literal` is tried first; if it fails, `ipv4` and then
/// `take_while_ireg` are tried on the same input.
///
/// # Return value
///  - `(host, true)` when `ip_literal` matched
///  - `(host, false)` when `ipv4` or `take_while_ireg` matched
fn parse_host(input: &str) -> IResult<&str, (&str, bool), CustomError<&str>> {
    if let Ok((rest, literal)) = ip_literal(input) {
        return Ok((rest, (literal, true)));
    }
    // Any error from `ip_literal` is discarded and parsing restarts on `input`.
    let (rest, name) = alt((ipv4, take_while_ireg))(input)?;
    Ok((rest, (name, false)))
}
/// Consume an ireg-name: any mix of percent-encoded blocks and characters
/// accepted by `is_ireg_name_not_pct_encoded`.
///
/// Trailing `.` characters are not part of the returned host, and they are
/// left in the unconsumed input as well. Returning a host shorter than what
/// was consumed would make callers that add up component lengths (as
/// `parse_iri` does) compute a link that is too short whenever a port, path,
/// query or fragment follows the dots.
///
/// Like the underlying `many0`, this succeeds with an empty host when nothing
/// matches.
fn take_while_ireg(input: &str) -> IResult<&str, &str, CustomError<&str>> {
    let (_, matched) = recognize(many0(alt((
        recognize(many1(take_while_pct_encoded)),
        take_while1(is_ireg_name_not_pct_encoded),
    ))))(input)?;
    // `matched` is a prefix of `input` and '.' is one byte, so the trimmed
    // length is always a valid split point.
    let host_len = matched.trim_end_matches('.').len();
    let (host, rest) = input.split_at(host_len);
    Ok((rest, host))
}
/// Parse the authority block: `[iuserinfo "@"] host [":" port]`.
///
/// # Return value
/// The unconsumed input and `(authority, host, is_ip_literal)`:
///  - `authority` is a string whose **byte length** equals the length of the
///    userinfo (including `@`) plus the port (including `:`). Only that length
///    is meaningful; the caller in this file, `parse_iri`, adds it to the host
///    length and never reads the text.
///  - `host` and `is_ip_literal` are passed through from `parse_host`.
///
/// # Errors
/// Propagates errors from `parse_host`. Returns `Failure(NoContent)` only
/// when both userinfo and port are present and their combined length does not
/// fall on a character boundary of `input`.
fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str>> {
    let (rest, userinfo) = opt(recognize(tuple((take_while_iuserinfo, char('@')))))(input)?;
    let (rest, (host, is_ipv6_or_future)) = parse_host(rest)?;
    let (rest, port) = opt(recognize(tuple((char(':'), take_while(is_digit)))))(rest)?;
    let userinfo = userinfo.unwrap_or("");
    let port = port.unwrap_or("");
    // Slicing `input` by `userinfo.len() + port.len()` reaches into the host
    // bytes and can miss a char boundary when the host is not ASCII
    // (e.g. a CJK host followed by `:8080`). When only one of the two parts
    // is present it already has the right length, so hand it back directly.
    let length_carrier = if userinfo.is_empty() {
        Some(port)
    } else if port.is_empty() {
        Some(userinfo)
    } else {
        input.get(0..userinfo.len().saturating_add(port.len()))
    };
    match length_carrier {
        Some(out) => Ok((rest, (out, host, is_ipv6_or_future))),
        None => Err(nom::Err::Failure(CustomError::NoContent)),
    }
}
fn take_while_iuserinfo(input: &str) -> IResult<&str, &str, CustomError<&str>> {
alt((
recognize(many0(take_while_pct_encoded)),
take_while(is_iuserinfo_not_pct_encoded),
))(input)
}
/// Character test for the non-percent-encoded part of an iuserinfo:
/// `is_iunreserved`, `is_sub_delim`, or a literal `:`.
fn is_iuserinfo_not_pct_encoded(c: char) -> bool {
    is_iunreserved(c) || is_sub_delim(c) || matches!(c, ':')
}
/// Character test for the non-percent-encoded part of an ipchar:
/// `is_iunreserved`, `is_sub_delim`, or one of `:` and `@`.
fn is_ipchar_not_pct_encoded(c: char) -> bool {
    is_iunreserved(c) || is_sub_delim(c) || c == ':' || c == '@'
}
fn take_while_ipchar(input: &str) -> IResult<&str, &str, CustomError<&str>> {
recognize(many0(alt((
take_while(is_ipchar_not_pct_encoded),
take_while_pct_encoded,
))))(input)
}
/// Consume one or more ipchars: any mix of characters accepted by
/// `is_ipchar_not_pct_encoded` and percent-encoded blocks. Fails when the
/// input starts with neither.
fn take_while_ipchar1(input: &str) -> IResult<&str, &str, CustomError<&str>> {
    let ipchar_run = alt((
        take_while1(is_ipchar_not_pct_encoded),
        take_while_pct_encoded,
    ));
    recognize(many1(ipchar_run))(input)
}
/// Code point ranges accepted by [`is_iprivate`].
///
/// The values correspond to the `iprivate` production of RFC 3987.
const IPRIVATE_RANGES: [RangeInclusive<u32>; 3] = [
    0xE000..=0xF8FF,
    0xF_0000..=0xF_FFFD,
    0x10_0000..=0x10_FFFD,
];
/// Checks the code point of `c` against [`IPRIVATE_RANGES`] via
/// `is_in_one_of_ranges`.
fn is_iprivate(c: char) -> bool {
    let code_point = u32::from(c);
    is_in_one_of_ranges(code_point, &IPRIVATE_RANGES[..])
}
/// Character test for the non-percent-encoded part of an iquery:
/// `is_iprivate`, `is_ipchar_not_pct_encoded`, or one of `/` and `?`.
fn is_iquery_not_pct_encoded(c: char) -> bool {
    is_iprivate(c) || is_ipchar_not_pct_encoded(c) || c == '/' || c == '?'
}
/// Consume an iquery body (without the leading `?`): zero or more runs of
/// characters accepted by `is_iquery_not_pct_encoded` or percent-encoded
/// blocks. Succeeds with an empty match when nothing fits.
fn iquery(input: &str) -> IResult<&str, &str, CustomError<&str>> {
    let query_run = alt((
        take_while1(is_iquery_not_pct_encoded),
        take_while_pct_encoded,
    ));
    recognize(many0(query_run))(input)
}
/// Consume a fragment body (without the leading `#`): zero or more ipchar
/// runs, `/` or `?`. Succeeds with an empty match when nothing fits.
fn take_while_ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> {
    let fragment_piece = alt((take_while_ipchar1, tag("/"), tag("?")));
    recognize(many0(fragment_piece))(input)
}
fn scheme_and_separator(input: &str) -> IResult<&str, (&str, &str), CustomError<&str>> {
let _input = <&str>::clone(&input);
let (input, (_first, second)) =
pair(take_while_m_n(1, 1, is_alpha), take_while(is_scheme))(input)?;
let len = 1usize.saturating_add(second.len());
if let Some(scheme) = _input.get(0..len) {
let (input, separator) = alt((tag("://"), tag(":")))(input)?;
return Ok((input, (scheme, separator)));
}
Err(nom::Err::Failure(CustomError::NoContent))
}
/// `scheme_and_separator` wrapped in `opt`: both separators are recognised,
/// and input without a scheme is left untouched.
#[test]
fn scheme_with_separator() {
    let cases = [
        ("scheme:host/path", ("host/path", Some(("scheme", ":")))),
        ("scheme://host/path", ("host/path", Some(("scheme", "://")))),
        ("no_scheme/host/path", ("no_scheme/host/path", None)),
    ];
    for (text, expected) in cases {
        let result = opt(scheme_and_separator)(text);
        assert_eq!(Ok(expected), result);
    }
}
fn take_while_pct_encoded(input: &str) -> IResult<&str, &str, CustomError<&str>> {
recognize(many1(tuple((
char('%'),
take_while_m_n(2, 2, is_hex_digit),
))))(input)
}
fn ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> {
recognize(many0(tuple((char('#'), take_while_ifragment))))(input)
}
fn parse_ipath_abempty(input: &str) -> IResult<&str, &str, CustomError<&str>> {
recognize(many0(tuple((char('/'), opt(take_while_ipchar1)))))(input)
}
/// Leading empty segments (`///`) are consumed together with the rest of the path.
#[test]
fn test_ipath_abempty() {
    let parsed = parse_ipath_abempty("///foo/bar");
    assert_eq!(parsed, Ok(("", "///foo/bar")));
}
fn parse_ipath_absolute(input: &str) -> IResult<&str, &str, CustomError<&str>> {
recognize(tuple((
char('/'),
opt(tuple((
take_while_ipchar1,
many0(tuple((char('/'), opt(take_while_ipchar1)))),
))),
)))(input)
}
/// Each sample path is consumed completely, including an empty inner segment.
#[test]
fn test_ipath_absolute() {
    for path in ["/foo", "/foo/bar", "/foo//bar"] {
        assert_eq!(parse_ipath_absolute(path), Ok(("", path)));
    }
}
fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> {
let input_ = <&str>::clone(&input);
let (input, scheme_parts) = opt(scheme_and_separator)(input)?;
let (scheme, separator) = scheme_parts.unwrap_or(("", ""));
let (input, (authority, mut host, is_ipv6_or_future)) = iauthority(input)?;
if scheme.is_empty() {
if !host.contains('.') {
return Err(nom::Err::Failure(CustomError::<&str>::InvalidLink));
}
let tld = host
.split('.')
.last()
.ok_or(nom::Err::Failure(CustomError::<&str>::InvalidLinkNoTLD))?;
if !check_if_tld_is_allowed(tld) {
return Err(nom::Err::Failure(CustomError::<&str>::InvalidLink));
}
}
let (input, path) = opt(alt((
parse_ipath_abempty,
parse_ipath_absolute,
recognize(tuple((
take_while_ipchar,
many0(tuple((char('/'), opt(take_while_ipchar1)))),
))), )))(input)?;
let path = path.unwrap_or("");
let (input, query) = opt(recognize(tuple((char('?'), iquery))))(input)?;
let query = query.unwrap_or("");
let (_, fragment) = opt(ifragment)(input)?;
let fragment = fragment.unwrap_or("");
let ihier_len = authority
.len()
.saturating_add(host.len())
.saturating_add(path.len());
if ihier_len == 0 {
return Err(nom::Err::Error(CustomError::InvalidLink));
}
let mut len = scheme
.len()
.saturating_add(separator.len())
.saturating_add(ihier_len)
.saturating_add(query.len())
.saturating_add(fragment.len());
if let Some(link) = input_.get(0..len) {
if link.ends_with([':', ';', '.', ',', '!']) {
len = len.saturating_sub(1);
if path.is_empty() && query.is_empty() && fragment.is_empty() {
host = input_.slice(
scheme.len().saturating_add(separator.len())..input_.len().saturating_sub(1),
);
}
}
len = count_chars_in_complete_parenthesis(link).unwrap_or(len);
let link = input_.slice(0..len);
let input = input_.slice(len..);
return Ok((
input,
LinkDestination {
target: link,
hostname: if host.is_empty() { None } else { Some(host) },
punycode: if is_ipv6_or_future {
None
} else {
get_puny_code_warning(link, host)
},
scheme: if scheme.is_empty() {
None
} else {
Some(scheme)
},
},
));
}
Err(nom::Err::Failure(CustomError::NoContent))
}
/// Parse a whitelisted generic link of the form `scheme<separator>rest`,
/// e.g. `tel:+123456`.
///
/// The scheme must pass `is_allowed_generic_scheme`; everything up to the
/// next whitespace (per `is_not_white_space`) is taken as the rest of the
/// link.
///
/// # Return value
/// The unconsumed input and a `LinkDestination` with `scheme` and `target`
/// set; `hostname` and `punycode` are always `None`.
///
/// # Errors
///  - `Error(InvalidLink)` when there is no scheme or it is not whitelisted
///  - an error from `take_while1` when nothing follows the separator
///  - `Failure(NoContent)` when the computed length does not slice the input
fn parse_generic(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> {
    let original = input;
    let (input, scheme_parts) = opt(scheme_and_separator)(input)?;
    let (scheme, separator) = scheme_parts.unwrap_or(("", ""));
    if !is_allowed_generic_scheme(scheme) {
        return Err(nom::Err::Error(CustomError::InvalidLink));
    }
    let (input, rest) = take_while1(is_not_white_space)(input)?;
    // Use the real separator length: `scheme_and_separator` may have matched
    // `://` (3 bytes), so assuming a single `:` would cut the target short.
    let len = scheme
        .len()
        .saturating_add(separator.len())
        .saturating_add(rest.len());
    match original.get(0..len) {
        Some(target) => Ok((
            input,
            LinkDestination {
                scheme: Some(scheme),
                target,
                hostname: None,
                punycode: None,
            },
        )),
        None => Err(nom::Err::Failure(CustomError::NoContent)),
    }
}
/// Parse a link destination from the start of `input`.
///
/// `parse_generic` (whitelisted `scheme:rest` links) is tried first; when it
/// returns a recoverable error, `parse_iri` is tried on the same input.
pub(super) fn parse_link(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> {
alt((parse_generic, parse_iri))(input)
}