#![allow(non_snake_case)]
use std::ops::Range;
use nom::{
branch::alt,
bytes::streaming::{is_not, tag, take_while1},
character::{
complete::{char, multispace1},
is_digit, is_hex_digit,
streaming::{alpha1, alphanumeric1, digit1, multispace0},
},
combinator::{map, opt, recognize},
error::{Error, ErrorKind, ParseError},
multi::many0,
sequence::{delimited, pair, preceded, separated_pair, terminated, tuple},
Err, IResult, InputLength, Needed, Offset, Parser,
};
#[allow(unused_imports)]
use nom::error_position;
/// Lookup table indexed by a byte value, giving the length in bytes of the
/// UTF-8 sequence that byte would start: 1 for 0x00-0x7F, 2 for 0xC2-0xDF,
/// 3 for 0xE0-0xEF and 4 for 0xF0-0xF4. Bytes that cannot start a sequence
/// (0x80-0xC1 and 0xF5-0xFF) map to 0.
static UTF8_CHAR_WIDTH: [u8; 256] = [
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
/// Returns the UTF-8 sequence length recorded in `UTF8_CHAR_WIDTH` for the
/// lead byte `b` (0 when the table marks `b` as not starting a sequence).
#[inline]
fn utf8_char_width(b: u8) -> usize {
    usize::from(UTF8_CHAR_WIDTH[usize::from(b)])
}
/// Returns `true` when `chr` is allowed by the XML 1.0 `Char` production:
/// `#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]`.
///
/// Vertical tab (U+000B) and form feed (U+000C) are *not* XML characters, so
/// only tab, line feed and carriage return are accepted below U+0020.
#[inline]
fn is_xml_char_t(chr: char) -> bool {
    matches!(
        chr,
        '\u{9}'
            | '\u{A}'
            | '\u{D}'
            | '\u{20}'..='\u{D7FF}'
            | '\u{E000}'..='\u{FFFD}'
            | '\u{10000}'..='\u{10FFFF}'
    )
}
/// Returns `true` when `chr` may begin an XML name: `:`, `_`, an ASCII letter,
/// or a character in one of the non-ASCII ranges listed below.
#[inline]
fn is_namestart_char_t(chr: char) -> bool {
    matches!(
        chr,
        ':' | '_'
            | 'A'..='Z'
            | 'a'..='z'
            | '\u{C0}'..='\u{D6}'
            | '\u{D8}'..='\u{F6}'
            | '\u{F8}'..='\u{2FF}'
            | '\u{370}'..='\u{37D}'
            | '\u{37F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}
fn namestart_char(input: &[u8]) -> IResult<&[u8], &[u8]> {
if input.len() == 0 {
return Err(Err::Incomplete(Needed::new(1)));
}
let width = utf8_char_width(input[0]);
if input.len() < width {
return Err(Err::Incomplete(Needed::new(width - input.len())));
}
let c = match std::str::from_utf8(&input[..width]).ok() {
Some(s) => s.chars().next().unwrap(),
None => return Err(Err::Error(Error::new(input, ErrorKind::Char))),
};
if is_namestart_char_t(c) {
return Ok((&input[width..], &input[0..width]));
} else {
return Err(Err::Error(Error::new(input, ErrorKind::Char)));
}
}
/// Returns `true` when `chr` may appear inside an XML name after the first
/// character: any name-start character, an ASCII digit, `-`, `.`, U+00B7,
/// the combining marks U+0300-U+036F, or U+203F-U+2040.
#[inline]
fn is_namechar_t(chr: char) -> bool {
    is_namestart_char_t(chr)
        || matches!(
            chr,
            '0'..='9'
                // The upper bound is U+036F; comparing against 'z' made this
                // range empty and rejected every combining mark.
                | '\u{0300}'..='\u{036F}'
                | '\u{203F}'..='\u{2040}'
                | '-'
                | '.'
                | '\u{B7}'
        )
}
fn namechar(input: &[u8]) -> IResult<&[u8], &[u8]> {
if input.len() == 0 {
return Err(Err::Incomplete(Needed::new(1)));
}
let width = utf8_char_width(input[0]);
if input.len() < width {
return Err(Err::Incomplete(Needed::new(width - input.len())));
}
let c = match std::str::from_utf8(&input[..width]).ok() {
Some(s) => s.chars().next().unwrap(),
None => return Err(Err::Error(Error::new(input, ErrorKind::Char))),
};
if is_namechar_t(c) {
return Ok((&input[width..], &input[0..width]));
} else {
return Err(Err::Error(Error::new(input, ErrorKind::Char)));
}
}
/// Applies `f` zero or more times, discarding its outputs.
///
/// Repetition stops at the first failure of `f` of *any* kind (recoverable
/// error, failure or incomplete), and the input consumed so far is returned
/// as a success. If `f` succeeds without consuming input, an `Err::Error`
/// with kind `Many0` is returned to avoid looping forever.
fn many0_custom_chardata<I, O, E, F>(mut f: F) -> impl FnMut(I) -> IResult<I, (), E>
where
    I: Clone + InputLength,
    F: Parser<I, O, E>,
    E: ParseError<I>,
{
    move |mut i: I| loop {
        let before = i.input_len();
        match f.parse(i.clone()) {
            Ok((rest, _)) => {
                // No progress: bail out instead of spinning.
                if rest.input_len() == before {
                    return Err(Err::Error(E::from_error_kind(i, ErrorKind::Many0)));
                }
                i = rest;
            }
            // Every error flavour simply ends the repetition.
            Err(_) => return Ok((i, ())),
        }
    }
}
/// Applies `f` zero or more times, discarding its outputs.
///
/// A recoverable `Err::Error` from `f` ends the repetition successfully;
/// any other error (incomplete or failure) is propagated to the caller.
/// If `f` succeeds without consuming input, an `Err::Error` with kind
/// `Many0` is returned to avoid looping forever.
fn many0_custom_trycomplete<I, O, E, F>(mut f: F) -> impl FnMut(I) -> IResult<I, (), E>
where
    I: Clone + InputLength,
    F: Parser<I, O, E>,
    E: ParseError<I>,
{
    move |mut i: I| loop {
        let before = i.input_len();
        match f.parse(i.clone()) {
            Ok((rest, _)) => {
                // No progress: bail out instead of spinning.
                if rest.input_len() == before {
                    return Err(Err::Error(E::from_error_kind(i, ErrorKind::Many0)));
                }
                i = rest;
            }
            Err(Err::Error(_)) => return Ok((i, ())),
            Err(other) => return Err(other),
        }
    }
}
fn many1_custom<I, O, E, F>(mut f: F) -> impl FnMut(I) -> IResult<I, (), E>
where
I: Clone + InputLength,
F: Parser<I, O, E>,
E: ParseError<I>,
{
move |mut i: I| match f.parse(i.clone()) {
Err(Err::Error(err)) => Err(Err::Error(E::append(i, ErrorKind::Many1, err))),
Err(e) => Err(e),
Ok((i1, _o)) => {
i = i1;
loop {
let len = i.input_len();
match f.parse(i.clone()) {
Err(Err::Error(_)) => return Ok((i, ())),
Err(e) => return Err(e),
Ok((i1, _o)) => {
if i1.input_len() == len {
return Err(Err::Error(E::from_error_kind(i, ErrorKind::Many1)));
}
i = i1;
}
}
}
}
}
}
/// Recognizes an XML name: one name-start character followed by any number
/// of name characters. Returns the matched bytes.
fn name(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let tail = many0_custom_trycomplete(namechar);
    let mut parser = recognize(pair(namestart_char, tail));
    parser(input)
}
/// Recognizes a character reference: `&#` + decimal digits + `;`, or
/// `&#x` + hexadecimal digits + `;`. Returns the matched bytes.
fn CharRef(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let decimal = recognize(tuple((tag("&#"), take_while1(is_digit), char(';'))));
    let hexadecimal = recognize(tuple((tag("&#x"), take_while1(is_hex_digit), char(';'))));
    // The decimal form is tried first, as before.
    alt((decimal, hexadecimal))(input)
}
/// Recognizes an entity reference: `&`, a name, then `;`. Returns the
/// matched bytes.
fn EntityRef(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let mut parser = recognize(tuple((tag("&"), name, char(';'))));
    parser(input)
}
/// Recognizes either an entity reference or a character reference, trying
/// the entity form first. Returns the matched bytes.
fn Reference(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let mut either = alt((EntityRef, CharRef));
    either(input)
}
/// A reference as it appears in the input; `content_relaxed_Reference` fills
/// it with the text matched by the `Reference` parser.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Reference<'a> {
/// The matched reference text, unmodified.
pub initial: &'a str,
}
/// Parses a quoted attribute value and returns the bytes between the quotes.
///
/// The value may be delimited by double or single quotes; inside, any run of
/// bytes other than `<`, `&` and the active quote is accepted, as are
/// references.
fn AttValue(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let double_quoted = delimited(
        char('"'),
        recognize(many0_custom_trycomplete(alt((is_not(r#"<&""#), Reference)))),
        char('"'),
    );
    let single_quoted = delimited(
        char('\''),
        recognize(many0_custom_trycomplete(alt((is_not(r#"<&'"#), Reference)))),
        char('\''),
    );
    alt((double_quoted, single_quoted))(input)
}
/// Recognizes `=` with optional whitespace on either side and returns the
/// whole matched span.
fn Eq(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let mut parser = recognize(tuple((multispace0, char('='), multispace0)));
    parser(input)
}
fn Attribute(input: &[u8]) -> IResult<&[u8], SAXAttribute> {
match tuple((name, Eq, AttValue))(input) {
Ok((i, o)) => {
return Ok((
i,
SAXAttribute {
value: unsafe { std::str::from_utf8_unchecked(o.2) },
qualified_name: unsafe { std::str::from_utf8_unchecked(o.0) },
},
));
}
Err(e) => Err(e),
}
}
/// Index ranges describing one attribute. `Attribute2` fills `name` and
/// `value` with offsets relative to the input it was given and leaves the
/// other ranges as `0..0`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct AttributeRange {
/// Range of the attribute value, without the surrounding quotes.
pub value: Range<usize>,
/// Range of the attribute name as written.
pub name: Range<usize>,
/// Not populated by `Attribute2` (set to `0..0`).
pub local_name: Range<usize>,
/// Not populated by `Attribute2` (set to `0..0`).
pub prefix: Range<usize>,
/// Not populated by `Attribute2` (set to `0..0`).
pub namespace: Range<usize>,
}
/// A list of `AttributeRange` entries.
pub struct AttributeRanges {
/// The collected ranges.
pub data: Vec<AttributeRange>,
}
/// Parses one attribute, skipping leading whitespace, and reports where its
/// name and value lie inside `input`.
///
/// The `name` and `value` ranges are byte offsets into `input`; the
/// `local_name`, `prefix` and `namespace` ranges are returned as `0..0`.
pub(crate) fn Attribute2(input: &[u8]) -> IResult<&[u8], AttributeRange> {
    preceded(multispace0, tuple((name, Eq, AttValue)))(input).map(
        |(rest, (attr_name, _, attr_value))| {
            let name_start = input.offset(attr_name);
            let value_start = input.offset(attr_value);
            let range = AttributeRange {
                name: name_start..name_start + attr_name.len(),
                value: value_start..value_start + attr_value.len(),
                local_name: 0..0,
                prefix: 0..0,
                namespace: 0..0,
            };
            (rest, range)
        },
    )
}
// Checks that `Attribute2` skips leading whitespace and that the returned
// ranges index the name and the unquoted value inside the original input.
#[test]
fn test_attribute2() {
let data = r#" a:b12='val2'"#.as_bytes();
let res = Attribute2(&data);
println!("{:?}", res);
let range = res.unwrap().1;
assert_eq!("a:b12".as_bytes(), &data[range.name.clone()]);
assert_eq!("val2".as_bytes(), &data[range.value.clone()]);
}
/// An attribute as borrowed string slices, as produced by the `Attribute`
/// parser.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SAXAttribute<'a> {
/// The attribute value without its surrounding quotes.
pub value: &'a str,
/// The attribute name as written.
pub qualified_name: &'a str,
}
/// An attribute described by index ranges instead of borrowed strings.
/// NOTE(review): not constructed in the code visible here; the buffer the
/// ranges refer to should be confirmed at the use site.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SAXAttribute2 {
/// Range of the attribute value.
pub value: std::ops::Range<usize>,
/// Range of the qualified attribute name.
pub qualified_name: std::ops::Range<usize>,
}
/// A range-based attribute description with additional slots for namespace
/// information.
/// NOTE(review): not constructed in the code visible here; field meanings
/// are taken from their names and should be confirmed at the use site.
#[derive(Clone, Debug, Eq, PartialEq)]
struct SAXAttributeNsAware {
/// Range of the attribute value.
pub value: std::ops::Range<usize>,
/// Range of the qualified attribute name.
pub qualified_name: std::ops::Range<usize>,
/// Range of the name prefix.
pub prefix: std::ops::Range<usize>,
/// Range of the local part of the name.
pub local_name: std::ops::Range<usize>,
/// Range of the namespace.
pub namespace: std::ops::Range<usize>,
}
/// A start tag or empty-element tag, as produced by `STag` and
/// `EmptyElemTag`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct StartElement<'a> {
/// The element name as written.
pub name: &'a str,
/// The raw bytes matched for the attribute list (whitespace included),
/// left unparsed.
pub attributes_chunk: &'a [u8],
}
/// An end tag, as produced by `ETag`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct EndElement<'a> {
/// The element name as written.
pub name: &'a str,
}
/// Parses a start tag: `<`, a name, zero or more whitespace-separated
/// attributes, optional whitespace and `>`.
///
/// The attributes are only recognized, not decoded; their raw bytes are
/// returned in `attributes_chunk`.
fn STag<'a>(input: &'a [u8]) -> IResult<&[u8], StartElement<'a>> {
    let attributes = recognize(many0_custom_trycomplete(preceded(multispace0, Attribute)));
    let mut parser = tuple((char('<'), name, attributes, multispace0, char('>')));
    parser(input).map(|(rest, (_, tag_name, attributes_chunk, _, _))| {
        let element = StartElement {
            // SAFETY: `name` decodes every character with `std::str::from_utf8`
            // before accepting it.
            name: unsafe { std::str::from_utf8_unchecked(tag_name) },
            attributes_chunk,
        };
        (rest, element)
    })
}
/// Parses an empty-element tag: `<`, a name, zero or more
/// whitespace-separated attributes, optional whitespace and `/>`.
///
/// The attributes are only recognized, not decoded; their raw bytes are
/// returned in `attributes_chunk`.
fn EmptyElemTag(input: &[u8]) -> IResult<&[u8], StartElement> {
    let attributes = recognize(many0_custom_trycomplete(preceded(multispace0, Attribute)));
    let mut parser = tuple((char('<'), name, attributes, multispace0, tag("/>")));
    parser(input).map(|(rest, (_, tag_name, attributes_chunk, _, _))| {
        let element = StartElement {
            // SAFETY: `name` decodes every character with `std::str::from_utf8`
            // before accepting it.
            name: unsafe { std::str::from_utf8_unchecked(tag_name) },
            attributes_chunk,
        };
        (rest, element)
    })
}
/// Parses an end tag: `</`, a name, optional whitespace and `>`.
fn ETag(input: &[u8]) -> IResult<&[u8], EndElement> {
    let mut parser = tuple((tag("</"), name, multispace0, char('>')));
    parser(input).map(|(rest, (_, tag_name, _, _))| {
        let element = EndElement {
            // SAFETY: `name` decodes every character with `std::str::from_utf8`
            // before accepting it.
            name: unsafe { std::str::from_utf8_unchecked(tag_name) },
        };
        (rest, element)
    })
}
// Smoke test: prints the result of parsing a simple end tag.
#[test]
fn test_etag() {
    let input: &[u8] = br#"</A>"#;
    let parsed = ETag(input);
    println!("{:?}", parsed);
}
// Smoke test: prints the result of parsing a start tag whose name contains
// `.`, `-`, a digit and a non-ASCII letter.
#[test]
fn test_namestart_char_t() {
    let input: &[u8] = "<a.abc-ab1çroot><A/><B/><C/></root>".as_bytes();
    let parsed = STag(input);
    println!("{:?}", parsed);
}
// Smoke test: prints the result of parsing several start tags with
// attributes in different quoting and spacing styles.
#[test]
fn test_stag() {
    let inputs: [&[u8]; 3] = [
        br#"<A a="b" c = "d"></A>"#,
        br#"<A a='x'>"#,
        br#"<B b="val" >"#,
    ];
    for &input in inputs.iter() {
        let parsed = STag(input);
        println!("{:?}", parsed);
    }
}
/// Returns `true` when `chr` is an XML character other than `<` and `&`.
#[inline]
fn is_CharData_single_pure_t(chr: char) -> bool {
    !matches!(chr, '<' | '&') && is_xml_char_t(chr)
}
fn CharData_single_pure(input: &[u8]) -> IResult<&[u8], &[u8]> {
if input.len() == 0 {
return Err(Err::Incomplete(Needed::new(1)));
}
let width = utf8_char_width(input[0]);
if input.len() < width {
return Err(Err::Incomplete(Needed::new(width - input.len())));
}
let c = match std::str::from_utf8(&input[..width]).ok() {
Some(s) => s.chars().next().unwrap(),
None => return Err(Err::Error(Error::new(input, ErrorKind::Char))),
};
if is_CharData_single_pure_t(c) {
return Ok((&input[width..], &input[0..width]));
} else {
return Err(Err::Error(Error::new(input, ErrorKind::Char)));
}
}
fn CharData_single(input: &[u8]) -> IResult<&[u8], &[u8]> {
if input.len() == 0 {
return Err(Err::Error(Error::new(input, ErrorKind::Char)));
}
match tag::<&str, &[u8], Error<&[u8]>>("]]>")(input) {
Ok(_r) => return Err(Err::Error(Error::new(input, ErrorKind::Char))),
Err(Err::Incomplete(_n)) => return Err(Err::Incomplete(Needed::Unknown)),
_ => (),
};
CharData_single_pure(input)
}
// Pins the behaviour of `CharData_single`: prefixes of `]]>` are incomplete,
// `]]>` itself, `&` and `<` are errors, and otherwise exactly one character
// is consumed.
#[test]
fn test_chardata_single() {
let _data = "]]".as_bytes();
assert_eq!(
CharData_single("]".as_bytes()),
Err(Err::Incomplete(Needed::Unknown))
);
assert_eq!(
CharData_single("]]".as_bytes()),
Err(Err::Incomplete(Needed::Unknown))
);
assert_eq!(
CharData_single("]]>".as_bytes()),
Err(Err::Error(error_position!(
"]]>".as_bytes(),
ErrorKind::Char
)))
);
assert_eq!(
CharData_single("]]<".as_bytes()),
Ok((&b"]<"[..], &b"]"[..]))
);
assert_eq!(
CharData_single("&".as_bytes()),
Err(Err::Error(error_position!("&".as_bytes(), ErrorKind::Char)))
);
assert_eq!(
CharData_single("<".as_bytes()),
Err(Err::Error(error_position!("<".as_bytes(), ErrorKind::Char)))
);
assert_eq!(
CharData_single("abc".as_bytes()),
Ok((&b"bc"[..], &b"a"[..]))
);
}
fn CharData(input: &[u8]) -> IResult<&[u8], &[u8]> {
recognize(tuple((
CharData_single,
many0_custom_chardata(CharData_single),
)))(input)
}
// Pins the behaviour of `CharData`: a run stops before `<`, `&`, a possible
// `]]>` prefix, or a truncated multi-byte character, leaving those bytes in
// the remainder.
#[test]
fn test_chardata() {
assert_eq!(CharData("abc]".as_bytes()), Ok((&b"]"[..], &b"abc"[..])));
assert_eq!(
CharData("]]".as_bytes()),
Err(Err::Incomplete(Needed::Unknown))
);
assert_eq!(
CharData("]]>".as_bytes()),
Err(Err::Error(error_position!(
"]]>".as_bytes(),
ErrorKind::Char
)))
);
assert_eq!(CharData("]]<".as_bytes()), Ok((&b"<"[..], &b"]]"[..])));
assert_eq!(CharData("a&".as_bytes()), Ok((&b"&"[..], &b"a"[..])));
assert_eq!(CharData("a<".as_bytes()), Ok((&b"<"[..], &b"a"[..])));
assert_eq!(CharData("abc".as_bytes()), Ok((&b""[..], &b"abc"[..])));
// The fixture ends with 196, the first byte of a two-byte sequence whose
// second byte is missing.
let data: Vec<u8> = [
65, 108, 99, 104, 101, 109, 121, 32, 40, 102, 114, 111, 109, 32, 65, 114, 97, 98, 105, 99,
58, 32, 97, 108, 45, 107, 196, 171, 109, 105, 121, 196,
]
.to_vec();
let remainder: Vec<u8> = [196].to_vec();
println!("try to read: {:?}", unsafe {
std::str::from_utf8_unchecked(&data[0..31])
});
assert_eq!(
CharData(&data),
Ok((
&remainder[0..1],
&"Alchemy (from Arabic: al-kīmiy".as_bytes()[..]
))
);
}
/// One item of element content, as returned by `content_relaxed`.
pub enum ContentRelaxed<'a> {
/// A run of character data (raw bytes).
CharData(&'a [u8]),
/// A start tag.
StartElement(StartElement<'a>),
/// An empty-element tag (`<.../>`).
EmptyElemTag(StartElement<'a>),
/// An end tag.
EndElement(EndElement<'a>),
/// An entity or character reference.
Reference(Reference<'a>),
/// The `<![CDATA[` opener; the section body is not included.
CdataStart,
/// The `<!--` opener; the comment body is not included.
CommentStart,
}
/// Runs `CharData` and wraps the matched bytes in `ContentRelaxed::CharData`.
fn content_relaxed_CharData(input: &[u8]) -> IResult<&[u8], ContentRelaxed> {
    CharData(input).map(|(rest, text)| (rest, ContentRelaxed::CharData(text)))
}
/// Runs `STag` and wraps the result in `ContentRelaxed::StartElement`.
fn content_relaxed_STag(input: &[u8]) -> IResult<&[u8], ContentRelaxed> {
    STag(input).map(|(rest, element)| (rest, ContentRelaxed::StartElement(element)))
}
/// Runs `ETag` and wraps the result in `ContentRelaxed::EndElement`.
fn content_relaxed_ETag(input: &[u8]) -> IResult<&[u8], ContentRelaxed> {
    ETag(input).map(|(rest, element)| (rest, ContentRelaxed::EndElement(element)))
}
/// Runs `EmptyElemTag` and wraps the result in `ContentRelaxed::EmptyElemTag`.
fn content_relaxed_EmptyElemTag(input: &[u8]) -> IResult<&[u8], ContentRelaxed> {
    EmptyElemTag(input).map(|(rest, element)| (rest, ContentRelaxed::EmptyElemTag(element)))
}
/// Runs the `Reference` parser and wraps the matched text in
/// `ContentRelaxed::Reference`.
fn content_relaxed_Reference(input: &[u8]) -> IResult<&[u8], ContentRelaxed> {
    Reference(input).map(|(rest, matched)| {
        let reference = Reference {
            // SAFETY: a matched reference is `&` + name + `;` (the name is
            // decoded character by character) or an ASCII character reference.
            initial: unsafe { std::str::from_utf8_unchecked(matched) },
        };
        (rest, ContentRelaxed::Reference(reference))
    })
}
/// Matches the CDATA opener and reports it as `ContentRelaxed::CdataStart`.
fn content_relaxed_CdataStart(input: &[u8]) -> IResult<&[u8], ContentRelaxed> {
    CDATASection_start(input).map(|(rest, _)| (rest, ContentRelaxed::CdataStart))
}
/// Matches the comment opener and reports it as `ContentRelaxed::CommentStart`.
fn content_relaxed_CommentStart(input: &[u8]) -> IResult<&[u8], ContentRelaxed> {
    Comment_start(input).map(|(rest, _)| (rest, ContentRelaxed::CommentStart))
}
/// Parses the next item of element content.
///
/// Alternatives are tried in this order: character data, start tag,
/// empty-element tag, end tag, reference, CDATA opener, comment opener.
pub fn content_relaxed(input: &[u8]) -> IResult<&[u8], ContentRelaxed> {
    let mut next_item = alt((
        content_relaxed_CharData,
        content_relaxed_STag,
        content_relaxed_EmptyElemTag,
        content_relaxed_ETag,
        content_relaxed_Reference,
        content_relaxed_CdataStart,
        content_relaxed_CommentStart,
    ));
    next_item(input)
}
// Smoke test: prints the result of matching a literal `<root>` tag.
#[test]
fn test_xml3() {
    fn parser(s: &[u8]) -> IResult<&[u8], &[u8]> {
        let root_open = tag("<root>");
        root_open(s)
    }
    let input: &[u8] = b"<root><A/><B/><C/></root>";
    let parsed = parser(input);
    println!("{:?}", parsed);
}
fn VersionNum(input: &[u8]) -> IResult<&[u8], &[u8]> {
recognize(tuple((tag("1."), digit1)))(input)
}
// Smoke test: prints the result of parsing a version number.
#[test]
fn test_VersionNum() {
    let input: &[u8] = br#"1.123 "#;
    let parsed = VersionNum(input);
    println!("{:?}", parsed);
}
/// Recognizes mandatory whitespace, `version`, `=` and a version number in
/// double or single quotes. Returns the whole matched span.
fn VersionInfo(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let double_quoted = delimited(char('"'), VersionNum, char('"'));
    let single_quoted = delimited(char('\''), VersionNum, char('\''));
    let mut parser = recognize(tuple((
        multispace1,
        tag("version"),
        Eq,
        alt((double_quoted, single_quoted)),
    )));
    parser(input)
}
// Smoke test: prints the result of parsing a version pseudo-attribute.
#[test]
fn test_VersionInfo() {
    let input: &[u8] = br#" version="1.0" "#;
    let parsed = VersionInfo(input);
    println!("{:?}", parsed);
}
fn EncName(input: &[u8]) -> IResult<&[u8], &[u8]> {
recognize(tuple((
alpha1,
many0_custom_trycomplete(alt((alphanumeric1, tag("-"), tag("."), tag("_")))),
)))(input)
}
// Smoke test: prints the result of parsing an encoding name.
#[test]
fn test_EncName() {
    let input: &[u8] = br#"UTF-8 "#;
    let parsed = EncName(input);
    println!("{:?}", parsed);
}
/// Recognizes mandatory whitespace, `encoding`, `=` and an encoding name in
/// double or single quotes. Returns the whole matched span.
fn EncodingDecl(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let double_quoted = delimited(char('"'), EncName, char('"'));
    let single_quoted = delimited(char('\''), EncName, char('\''));
    let mut parser = recognize(tuple((
        multispace1,
        tag("encoding"),
        Eq,
        alt((double_quoted, single_quoted)),
    )));
    parser(input)
}
// Smoke test: prints the result of parsing an encoding declaration.
#[test]
fn test_EncodingDecl() {
    let input: &[u8] = br#" encoding='EUC-JP' "#;
    let parsed = EncodingDecl(input);
    println!("{:?}", parsed);
}
/// Matches the literal `yes` or the literal `no`.
fn yes_mi_no_mu(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let mut yes_or_no = alt((tag("yes"), tag("no")));
    yes_or_no(input)
}
/// Recognizes mandatory whitespace, `standalone`, `=` and `yes`/`no` in
/// double or single quotes. Returns the whole matched span.
fn SDDecl(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let double_quoted = delimited(char('"'), yes_mi_no_mu, char('"'));
    let single_quoted = delimited(char('\''), yes_mi_no_mu, char('\''));
    let mut parser = recognize(tuple((
        multispace1,
        tag("standalone"),
        Eq,
        alt((double_quoted, single_quoted)),
    )));
    parser(input)
}
// Smoke test: prints the result of parsing a standalone declaration.
#[test]
fn test_SDDecl() {
    let input: &[u8] = br#" standalone='yes' "#;
    let parsed = SDDecl(input);
    println!("{:?}", parsed);
}
/// Recognizes an XML declaration: `<?xml`, the version info, an optional
/// encoding declaration, an optional standalone declaration, optional
/// whitespace and `?>`. Returns the whole matched span.
fn XMLDecl(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let mut declaration = recognize(tuple((
        tag("<?xml"),
        VersionInfo,
        opt(EncodingDecl),
        opt(SDDecl),
        multispace0,
        tag("?>"),
    )));
    declaration(input)
}
// Smoke test: prints the result of parsing a full XML declaration.
#[test]
fn test_XMLDecl() {
    let input: &[u8] = br#"<?xml version="1.0" encoding="UTF-8" standalone='yes'?>"#;
    let parsed = XMLDecl(input);
    println!("{:?}", parsed);
}
/// Matches the comment opener `<!--`.
fn Comment_start(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let opener = tag("<!--");
    opener(input)
}
/// Matches the comment terminator `-->`.
fn Comment_end(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let closer = tag("-->");
    closer(input)
}
fn inside_Comment_or_CDATA_single_pure(input: &[u8]) -> IResult<&[u8], &[u8]> {
if input.len() == 0 {
return Err(Err::Incomplete(Needed::new(1)));
}
let width = utf8_char_width(input[0]);
if input.len() < width {
return Err(Err::Incomplete(Needed::new(width - input.len())));
}
let c = match std::str::from_utf8(&input[..width]).ok() {
Some(s) => s.chars().next().unwrap(),
None => return Err(Err::Error(Error::new(input, ErrorKind::Char))),
};
if is_xml_char_t(c) {
return Ok((&input[width..], &input[0..width]));
} else {
return Err(Err::Error(Error::new(input, ErrorKind::Char)));
}
}
fn inside_Comment_single(input: &[u8]) -> IResult<&[u8], &[u8]> {
if input.len() == 0 {
return Err(Err::Error(Error::new(input, ErrorKind::Char)));
}
match tag::<&str, &[u8], Error<&[u8]>>("--")(input) {
Ok(_r) => return Err(Err::Error(Error::new(input, ErrorKind::Char))),
Err(Err::Incomplete(_n)) => return Err(Err::Incomplete(Needed::new(1))),
_ => (),
};
inside_Comment_or_CDATA_single_pure(input)
}
/// Recognizes a complete comment: `<!--`, comment text without `--`, then
/// `-->`. Returns the whole matched span.
fn Comment(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let body = many0_custom_chardata(inside_Comment_single);
    let mut parser = recognize(tuple((Comment_start, body, Comment_end)));
    parser(input)
}
// Pins the behaviour of `Comment`: well-formed comments are returned whole,
// `--` inside the body makes the terminator match fail at that position, and
// truncated terminators report how many bytes are still needed.
#[test]
fn test_comment() {
assert_eq!(
Comment("<!-- comment -->a".as_bytes()),
Ok((&b"a"[..], &b"<!-- comment -->"[..]))
);
assert_eq!(
Comment("<!---->cc".as_bytes()),
Ok((&b"cc"[..], &b"<!---->"[..]))
);
assert_eq!(
Comment("<!-- comment --->a".as_bytes()),
Err(Err::Error(error_position!(
"--->a".as_bytes(),
ErrorKind::Tag
)))
);
assert_eq!(
Comment("<!-- com--ment -->a".as_bytes()),
Err(Err::Error(error_position!(
"--ment -->a".as_bytes(),
ErrorKind::Tag
)))
);
assert_eq!(
Comment("<!--ok-".as_bytes()),
Err(Err::Incomplete(Needed::new(2)))
);
assert_eq!(
Comment("<!--ok--".as_bytes()),
Err(Err::Incomplete(Needed::new(1)))
);
}
/// One item found while reading inside a comment, as returned by
/// `insidecomment`.
pub enum InsideComment<'a> {
/// A run of comment text (raw bytes).
Characters(&'a [u8]),
/// The `-->` terminator.
CommentEnd,
}
fn insidecomment_characters(input: &[u8]) -> IResult<&[u8], InsideComment> {
match recognize(tuple((
inside_Comment_single,
many0_custom_chardata(inside_Comment_single),
)))(input)
{
Ok(succ) => Ok((succ.0, InsideComment::Characters(succ.1))),
Err(err) => return Err(err),
}
}
/// Matches the comment terminator and reports it as
/// `InsideComment::CommentEnd`.
fn insidecomment_comment_end(input: &[u8]) -> IResult<&[u8], InsideComment> {
    Comment_end(input).map(|(rest, _)| (rest, InsideComment::CommentEnd))
}
/// Parses the next item inside a comment: a run of text, or else the
/// terminator.
pub fn insidecomment(input: &[u8]) -> IResult<&[u8], InsideComment> {
    let mut next_item = alt((insidecomment_characters, insidecomment_comment_end));
    next_item(input)
}
/// Matches the CDATA section opener `<![CDATA[`.
fn CDATASection_start(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let opener = tag("<![CDATA[");
    opener(input)
}
/// Matches the CDATA section terminator `]]>`.
fn CDATASection_end(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let closer = tag("]]>");
    closer(input)
}
fn inside_CDATASection_single(input: &[u8]) -> IResult<&[u8], &[u8]> {
if input.len() == 0 {
return Err(Err::Error(Error::new(input, ErrorKind::Char)));
}
match tag::<&str, &[u8], Error<&[u8]>>("]]>")(input) {
Ok(_r) => return Err(Err::Error(Error::new(input, ErrorKind::Char))),
Err(Err::Incomplete(_n)) => return Err(Err::Incomplete(Needed::Unknown)),
_ => (),
};
inside_Comment_or_CDATA_single_pure(input)
}
/// Recognizes a complete CDATA section: `<![CDATA[`, text without `]]>`,
/// then `]]>`. Returns the whole matched span.
#[allow(dead_code)]
fn CDATASection(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let body = many0_custom_chardata(inside_CDATASection_single);
    let mut parser = recognize(tuple((CDATASection_start, body, CDATASection_end)));
    parser(input)
}
// Pins the behaviour of `CDATASection`: complete sections are returned whole
// and truncated terminators report how many bytes are still needed.
#[test]
fn test_cdata() {
assert_eq!(
CDATASection("<![CDATA[abc]]>a".as_bytes()),
Ok((&b"a"[..], &b"<![CDATA[abc]]>"[..]))
);
assert_eq!(
CDATASection("<![CDATA[]]>".as_bytes()),
Ok((&b""[..], &b"<![CDATA[]]>"[..]))
);
assert_eq!(
CDATASection("<![CDATA[ ]]".as_bytes()),
Err(Err::Incomplete(Needed::new(1)))
);
assert_eq!(
CDATASection("<![CDATA[ ]".as_bytes()),
Err(Err::Incomplete(Needed::new(2)))
);
}
/// Matches the processing-instruction opener `<?`.
fn PI_start(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let opener = tag("<?");
    opener(input)
}
/// Matches the processing-instruction terminator `?>`.
fn PI_end(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let closer = tag("?>");
    closer(input)
}
fn inside_PI_single(input: &[u8]) -> IResult<&[u8], &[u8]> {
if input.len() == 0 {
return Err(Err::Error(Error::new(input, ErrorKind::Char)));
}
match tag::<&str, &[u8], Error<&[u8]>>("?>")(input) {
Ok(_r) => return Err(Err::Error(Error::new(input, ErrorKind::Char))),
Err(Err::Incomplete(_n)) => return Err(Err::Incomplete(Needed::Unknown)),
_ => (),
};
inside_Comment_or_CDATA_single_pure(input)
}
/// Recognizes a complete processing instruction: `<?`, text without `?>`,
/// then `?>`. Returns the whole matched span.
fn PI(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let body = many0_custom_chardata(inside_PI_single);
    let mut parser = recognize(tuple((PI_start, body, PI_end)));
    parser(input)
}
// Pins the behaviour of `PI` for an empty and a non-empty processing
// instruction.
#[test]
fn test_pi() {
assert_eq!(PI("<??>a".as_bytes()), Ok((&b"a"[..], &b"<??>"[..])));
assert_eq!(
PI("<?dummmy?>".as_bytes()),
Ok((&b""[..], &b"<?dummmy?>"[..]))
);
}
/// Matches the document type declaration opener `<!DOCTYPE`.
fn doctypedecl_start(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let opener = tag("<!DOCTYPE");
    opener(input)
}
/// Matches the `>` that closes a document type declaration.
fn doctypedecl_end(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let closer = tag(">");
    closer(input)
}
fn inside_doctypedecl_single_pure(input: &[u8]) -> IResult<&[u8], &[u8]> {
if input.len() == 0 {
return Err(Err::Incomplete(Needed::new(1)));
}
let width = utf8_char_width(input[0]);
if input.len() < width {
return Err(Err::Incomplete(Needed::new(width - input.len())));
}
let c = match std::str::from_utf8(&input[..width]).ok() {
Some(s) => s.chars().next().unwrap(),
None => return Err(Err::Error(Error::new(input, ErrorKind::Char))),
};
if is_xml_char_t(c) && c != '<' && c != '>' {
return Ok((&input[width..], &input[0..width]));
} else {
return Err(Err::Error(Error::new(input, ErrorKind::Char)));
}
}
/// Recognizes a balanced `<` ... `>` group inside a document type
/// declaration without interpreting it.
///
/// Between the angle brackets it accepts runs of ordinary characters,
/// comments and nested groups (handled recursively).
fn doctypedecl_dummy_internal(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let item = alt((
        recognize(many1_custom(inside_doctypedecl_single_pure)),
        Comment,
        doctypedecl_dummy_internal,
    ));
    let mut group = recognize(tuple((tag("<"), many0_custom_trycomplete(item), tag(">"))));
    group(input)
}
/// Recognizes a document type declaration from `<!DOCTYPE` to its closing
/// `>` without interpreting the contents.
///
/// The body may contain runs of ordinary characters, comments and balanced
/// `<` ... `>` groups. Returns the whole matched span.
fn doctypedecl(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let item = alt((
        recognize(many1_custom(inside_doctypedecl_single_pure)),
        Comment,
        doctypedecl_dummy_internal,
    ));
    let mut declaration = recognize(tuple((
        doctypedecl_start,
        many0_custom_trycomplete(item),
        doctypedecl_end,
    )));
    declaration(input)
}
// Pins the behaviour of `doctypedecl` for empty, simple and nested
// declarations, including a `>` that appears inside a comment.
#[test]
fn test_doctypedecl() {
assert_eq!(
doctypedecl(r#"<!DOCTYPE>a"#.as_bytes()),
Ok((&b"a"[..], &br#"<!DOCTYPE>"#[..]))
);
assert_eq!(
doctypedecl(r#"<!DOCTYPE greeting SYSTEM "hello.dtd">a"#.as_bytes()),
Ok((&b"a"[..], &br#"<!DOCTYPE greeting SYSTEM "hello.dtd">"#[..]))
);
assert_eq!(
doctypedecl(r#"<!DOCTYPE dummy>"#.as_bytes()),
Ok((&b""[..], &br#"<!DOCTYPE dummy>"#[..]))
);
assert_eq!(
doctypedecl(r#"<!DOCTYPE <!-- --> <[]>dummy>"#.as_bytes()),
Ok((&b""[..], &br#"<!DOCTYPE <!-- --> <[]>dummy>"#[..]))
);
assert_eq!(
doctypedecl(r#"<!DOCTYPE <!-- > --> <[]>dummy>"#.as_bytes()),
Ok((&b""[..], &br#"<!DOCTYPE <!-- > --> <[]>dummy>"#[..]))
);
}
/// One item found while reading inside a CDATA section, as returned by
/// `insidecdata`.
pub enum InsideCdata<'a> {
/// A run of CDATA text (raw bytes).
Characters(&'a [u8]),
/// The `]]>` terminator.
CdataEnd,
}
fn insidecdata_characters(input: &[u8]) -> IResult<&[u8], InsideCdata> {
match recognize(tuple((
inside_CDATASection_single,
many0_custom_chardata(inside_CDATASection_single),
)))(input)
{
Ok(succ) => Ok((succ.0, InsideCdata::Characters(succ.1))),
Err(err) => return Err(err),
}
}
/// Matches the CDATA terminator and reports it as `InsideCdata::CdataEnd`.
fn insidecdata_cdata_end(input: &[u8]) -> IResult<&[u8], InsideCdata> {
    CDATASection_end(input).map(|(rest, _)| (rest, InsideCdata::CdataEnd))
}
/// Parses the next item inside a CDATA section: a run of text, or else the
/// terminator.
pub fn insidecdata(input: &[u8]) -> IResult<&[u8], InsideCdata> {
    let mut next_item = alt((insidecdata_characters, insidecdata_cdata_end));
    next_item(input)
}
/// One item returned by `misc_before_xmldecl`.
pub enum MiscBeforeXmlDecl<'a> {
/// A complete processing instruction (raw bytes).
PI(&'a [u8]),
/// A run of whitespace.
Whitespace(&'a [u8]),
/// The `<!--` opener; the comment body is not included.
CommentStart,
/// A complete document type declaration (raw bytes).
DocType(&'a [u8]),
/// A complete XML declaration (raw bytes).
XmlDecl(&'a [u8]),
}
/// One item returned by `misc_before_doctype`.
pub enum MiscBeforeDoctype<'a> {
/// A complete processing instruction (raw bytes).
PI(&'a [u8]),
/// A run of whitespace.
Whitespace(&'a [u8]),
/// The `<!--` opener; the comment body is not included.
CommentStart,
/// A complete document type declaration (raw bytes).
DocType(&'a [u8]),
}
/// One item returned by `misc`.
pub enum Misc<'a> {
/// A complete processing instruction (raw bytes).
PI(&'a [u8]),
/// A run of whitespace.
Whitespace(&'a [u8]),
/// The `<!--` opener; the comment body is not included.
CommentStart,
}
/// Parses one miscellaneous item: a processing instruction, a run of
/// whitespace, or a comment opener (tried in that order).
pub fn misc(input: &[u8]) -> IResult<&[u8], Misc> {
    let processing_instruction = map(PI, Misc::PI);
    let whitespace = map(multispace1, Misc::Whitespace);
    let comment_open = map(Comment_start, |_| Misc::CommentStart);
    alt((processing_instruction, whitespace, comment_open))(input)
}
/// Parses one item allowed before the document type declaration has been
/// seen: a processing instruction, whitespace, a comment opener, or the
/// document type declaration itself (tried in that order).
pub fn misc_before_doctype(input: &[u8]) -> IResult<&[u8], MiscBeforeDoctype> {
    let processing_instruction = map(PI, MiscBeforeDoctype::PI);
    let whitespace = map(multispace1, MiscBeforeDoctype::Whitespace);
    let comment_open = map(Comment_start, |_| MiscBeforeDoctype::CommentStart);
    let doctype = map(doctypedecl, MiscBeforeDoctype::DocType);
    alt((processing_instruction, whitespace, comment_open, doctype))(input)
}
/// Parses one item allowed before the XML declaration has been seen: the XML
/// declaration, a processing instruction, whitespace, a comment opener, or a
/// document type declaration (tried in that order).
pub fn misc_before_xmldecl(input: &[u8]) -> IResult<&[u8], MiscBeforeXmlDecl> {
    // The XML declaration is tried before `PI` because it also starts with `<?`.
    let xml_decl = map(XMLDecl, MiscBeforeXmlDecl::XmlDecl);
    let processing_instruction = map(PI, MiscBeforeXmlDecl::PI);
    let whitespace = map(multispace1, MiscBeforeXmlDecl::Whitespace);
    let comment_open = map(Comment_start, |_| MiscBeforeXmlDecl::CommentStart);
    let doctype = map(doctypedecl, MiscBeforeXmlDecl::DocType);
    alt((xml_decl, processing_instruction, whitespace, comment_open, doctype))(input)
}
/// Returns `true` when `chr` may begin a colon-free name: `_`, an ASCII
/// letter, or a character in one of the non-ASCII ranges listed below.
/// Unlike `is_namestart_char_t`, `:` is not accepted.
#[inline]
fn is_nc_namestart_char_t(chr: char) -> bool {
    matches!(
        chr,
        '_' | 'A'..='Z'
            | 'a'..='z'
            | '\u{C0}'..='\u{D6}'
            | '\u{D8}'..='\u{F6}'
            | '\u{F8}'..='\u{2FF}'
            | '\u{370}'..='\u{37D}'
            | '\u{37F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}
fn nc_namestart_char(input: &[u8]) -> IResult<&[u8], &[u8]> {
if input.len() == 0 {
return Err(Err::Incomplete(Needed::new(1)));
}
let width = utf8_char_width(input[0]);
if input.len() < width {
return Err(Err::Incomplete(Needed::new(width - input.len())));
}
let c = match std::str::from_utf8(&input[..width]).ok() {
Some(s) => s.chars().next().unwrap(),
None => return Err(Err::Error(Error::new(input, ErrorKind::Char))),
};
if is_nc_namestart_char_t(c) {
return Ok((&input[width..], &input[0..width]));
} else {
return Err(Err::Error(Error::new(input, ErrorKind::Char)));
}
}
/// Returns `true` when `chr` may appear inside a colon-free name after the
/// first character: any colon-free name-start character, an ASCII digit,
/// `-`, `.`, U+00B7, the combining marks U+0300-U+036F, or U+203F-U+2040.
#[inline]
fn is_nc_namechar_t(chr: char) -> bool {
    is_nc_namestart_char_t(chr)
        || matches!(
            chr,
            '0'..='9'
                // The upper bound is U+036F; comparing against 'z' made this
                // range empty and rejected every combining mark.
                | '\u{0300}'..='\u{036F}'
                | '\u{203F}'..='\u{2040}'
                | '-'
                | '.'
                | '\u{B7}'
        )
}
fn nc_namechar(input: &[u8]) -> IResult<&[u8], &[u8]> {
if input.len() == 0 {
return Err(Err::Incomplete(Needed::new(1)));
}
let width = utf8_char_width(input[0]);
if input.len() < width {
return Err(Err::Incomplete(Needed::new(width - input.len())));
}
let c = match std::str::from_utf8(&input[..width]).ok() {
Some(s) => s.chars().next().unwrap(),
None => return Err(Err::Error(Error::new(input, ErrorKind::Char))),
};
if is_nc_namechar_t(c) {
return Ok((&input[width..], &input[0..width]));
} else {
return Err(Err::Error(Error::new(input, ErrorKind::Char)));
}
}
/// Recognizes a colon-free name: one start character followed by any number
/// of name characters. The tail stops at the first position where a name
/// character does not parse, including the end of the input.
fn nc_name(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let tail = many0_custom_chardata(nc_namechar);
    let mut parser = recognize(pair(nc_namestart_char, tail));
    parser(input)
}
/// A qualified name split into prefix and local part, as produced by the
/// `QName` parser function.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct QName<'a> {
/// The part before `:`, or `""` when the name has no prefix.
pub prefix: &'a str,
/// The part after `:`, or the whole name when there is no prefix.
pub local_name: &'a str,
/// Byte range of the prefix within the parsed input (`0..0` when absent).
pub prefix_range: Range<usize>,
/// Byte range of the local name within the parsed input.
pub local_name_range: Range<usize>,
}
pub fn QName(input: &[u8]) -> IResult<&[u8], QName> {
alt((
(map(
terminated(
separated_pair(nc_name, char(':'), nc_name),
nom::combinator::eof,
),
|(pre, loc)| {
let pre_start = input.offset(pre);
let local_start = input.offset(loc);
QName {
prefix: unsafe { std::str::from_utf8_unchecked(pre) },
local_name: unsafe { std::str::from_utf8_unchecked(loc) },
prefix_range: Range {
start: pre_start,
end: pre_start + pre.len(),
},
local_name_range: Range {
start: local_start,
end: local_start + loc.len(),
},
}
},
)),
map(terminated(nc_name, nom::combinator::eof), |loc| {
let local_start = input.offset(loc);
QName {
prefix: "",
local_name: unsafe { std::str::from_utf8_unchecked(loc) },
prefix_range: 0..0,
local_name_range: Range {
start: local_start,
end: local_start + loc.len(),
},
}
}),
))(input)
}
// Pins the behaviour of `QName`: a leading colon, a trailing colon and a
// local part starting with a digit are rejected, while `a:b` yields the
// prefix, the local name and their byte ranges.
#[test]
fn test_qname() {
assert_eq!(
QName(":no".as_bytes()),
Err(Err::Error(error_position!(
":no".as_bytes(),
ErrorKind::Char
)))
);
assert_eq!(
QName("a:b:".as_bytes()),
Err(Err::Error(error_position!(
":b:".as_bytes(),
ErrorKind::Eof
)))
);
assert_eq!(
QName("a:b".as_bytes()),
Ok((
&b""[..],
QName {
prefix: &"a",
local_name: &"b",
prefix_range: 0..1,
local_name_range: 2..3
}
))
);
assert_eq!(
QName("a:123".as_bytes()),
Err(Err::Error(error_position!(
":123".as_bytes(),
ErrorKind::Eof
)))
);
}