use std::{
borrow::Cow,
cell::RefCell,
io::{Read, Write},
mem::take,
ptr::null_mut,
rc::Rc,
str::{from_utf8, from_utf8_unchecked},
sync::atomic::{AtomicI32, Ordering},
};
use libc::size_t;
use crate::{
chvalid::XmlCharValid,
encoding::{EncodingError, XmlCharEncoding, detect_encoding, find_encoding_handler},
error::{
__xml_raise_error, XmlErrorDomain, XmlErrorLevel, XmlParserErrors, parser_validity_error,
parser_validity_warning,
},
globals::{GenericErrorContext, get_keep_blanks_default_value, get_line_numbers_default_value},
io::XmlParserInputBuffer,
libxml::{
sax2::{xml_sax2_ignorable_whitespace, xml_sax2_init_html_default_sax_handler},
xmlstring::XmlChar,
},
parser::{
INPUT_CHUNK, XML_MAX_HUGE_LENGTH, XML_MAX_NAME_LENGTH, XML_MAX_TEXT_LENGTH,
XML_VCTXT_USE_PCTXT, XmlParserCtxt, XmlParserInput, XmlParserInputState, XmlParserOption,
XmlSAXHandler, XmlSAXLocator, xml_init_parser, xml_is_letter, xml_load_external_entity,
},
tree::{NodeCommon, XmlElementType, XmlNodePtr, xml_create_int_subset, xml_free_doc},
uri::canonic_path,
};
#[cfg(feature = "libxml_push")]
use super::HtmlParserInput;
use super::{
HtmlDocPtr, HtmlNodePtr, HtmlParserCtxt, HtmlParserNodeInfo, HtmlSAXHandler, taginfo::*,
};
/// Look up the element description for `tag`, comparing names without regard
/// to ASCII case.
///
/// The lookup is a binary search over `HTML40_ELEMENT_TABLE` keyed on the
/// lowercased element name. Returns `None` when no entry compares equal.
#[doc(alias = "htmlTagLookup")]
pub fn html_tag_lookup(tag: &str) -> Option<&'static HtmlElemDesc> {
    let wanted = tag.to_ascii_lowercase();
    let index = HTML40_ELEMENT_TABLE
        .binary_search_by(|candidate| {
            candidate
                .name
                .to_ascii_lowercase()
                .as_str()
                .cmp(wanted.as_str())
        })
        .ok()?;
    HTML40_ELEMENT_TABLE.get(index)
}
/// Find the entity description whose name is exactly `name`.
///
/// Scans `HTML40_ENTITIES_TABLE` front to back and returns the first entry
/// with a matching name, or `None` when there is none.
#[doc(alias = "htmlEntityLookup")]
pub fn html_entity_lookup(name: &str) -> Option<&'static HtmlEntityDesc> {
    for entry in HTML40_ENTITIES_TABLE.iter() {
        if entry.name == name {
            return Some(entry);
        }
    }
    None
}
/// Find the entity description whose code point equals `value`.
///
/// Uses a binary search over `HTML40_ENTITIES_TABLE` keyed on each entry's
/// `value` field. Returns `None` when no entry has that value.
#[doc(alias = "htmlEntityValueLookup")]
pub fn html_entity_value_lookup(value: u32) -> Option<&'static HtmlEntityDesc> {
    match HTML40_ENTITIES_TABLE.binary_search_by(|entry| entry.value.cmp(&value)) {
        Ok(index) => HTML40_ENTITIES_TABLE.get(index),
        Err(_) => None,
    }
}
/// Report whether any direct child of `elem` auto-closes `elem`.
///
/// Each child is tested with [`html_auto_close_tag`] against `elem`'s own
/// name. Returns `1` as soon as one child reports an auto-close and `0` when
/// the child list is exhausted.
#[doc(alias = "htmlIsAutoClosed")]
pub fn html_is_auto_closed(doc: HtmlDocPtr, elem: HtmlNodePtr) -> i32 {
    let mut cursor = elem.children().map(|c| XmlNodePtr::try_from(c).unwrap());
    loop {
        let Some(current) = cursor else {
            return 0;
        };
        if html_auto_close_tag(doc, elem.name().as_deref().unwrap(), current) != 0 {
            return 1;
        }
        // Step to the following sibling.
        cursor = current.next().map(|n| XmlNodePtr::try_from(n).unwrap());
    }
}
/// Report whether opening `newtag` implicitly closes a currently open `oldtag`.
///
/// The answer comes from a binary search of `HTML_START_CLOSE`, keyed first on
/// the old tag and then on the new tag.
#[doc(alias = "htmlCheckAutoClose")]
fn html_check_auto_close(newtag: &str, oldtag: &str) -> bool {
    let found = HTML_START_CLOSE.binary_search_by(|entry| {
        entry
            .old_tag
            .cmp(oldtag)
            .then_with(|| entry.new_tag.cmp(newtag))
    });
    found.is_ok()
}
#[doc(alias = "htmlAutoCloseTag")]
pub fn html_auto_close_tag(_doc: HtmlDocPtr, name: &str, elem: HtmlNodePtr) -> i32 {
if name == elem.name().as_deref().unwrap() {
return 0;
}
if html_check_auto_close(elem.name().as_deref().unwrap(), name) {
return 1;
}
let mut child = elem.children().map(|c| XmlNodePtr::try_from(c).unwrap());
while let Some(now) = child {
if html_auto_close_tag(_doc, name, now) != 0 {
return 1;
}
child = now.next().map(|n| XmlNodePtr::try_from(n).unwrap());
}
0
}
/// Crate-visible entry point for content parsing.
///
/// This simply forwards to `html_parse_content_internal` with the same context.
#[doc(alias = "htmlParseContent")]
pub(crate) fn html_parse_content(ctxt: &mut HtmlParserCtxt) {
html_parse_content_internal(ctxt);
}
/// Skip consecutive blank bytes (as judged by `is_xml_blank_char`) at the
/// current input position.
///
/// The input cursor, line and column are updated to reflect the skipped bytes:
/// a `\n` increments the line and resets the column to 1, any other blank
/// increments the column. Whenever the visible content is exhausted the input
/// is grown and scanning continues.
///
/// Returns the number of bytes skipped, cast to `i32`.
///
/// Panics if the context has no current input (`input().unwrap()`).
#[doc(alias = "htmlSkipBlankChars")]
fn html_skip_blank_chars(ctxt: &mut XmlParserCtxt) -> i32 {
let input = ctxt.input().unwrap();
// Work on local copies of the position; they are written back below.
let mut line = input.line;
let mut col = input.col;
ctxt.force_grow();
let mut buffer = ctxt.content_bytes();
let mut res = 0;
while !buffer.is_empty() && buffer[0].is_xml_blank_char() {
if buffer[0] == b'\n' {
line += 1;
col = 1;
} else {
col += 1;
}
buffer = &buffer[1..];
if buffer.is_empty() {
// Everything currently visible was blank: commit the progress, then try
// to pull in more input and keep scanning.
let len = ctxt.content_bytes().len();
res += len;
let input = ctxt.input_mut().unwrap();
input.cur += len;
input.line = line;
input.col = col;
ctxt.force_grow();
buffer = ctxt.content_bytes();
}
}
// Commit whatever part of the current window was consumed.
let diff = ctxt.content_bytes().len() - buffer.len();
res += diff;
let input = ctxt.input_mut().unwrap();
input.cur += diff;
input.line = line;
input.col = col;
res as i32
}
/// Report an HTML parser error whose message embeds one integer value.
///
/// Arguments: the parser context expression, the `XmlParserErrors` code, a
/// format-string literal with one placeholder, and the value to format.
///
/// Nothing happens when SAX is disabled *and* the parser state is
/// `XmlParserEOF`. Otherwise the macro stores the error code in `err_no`,
/// raises an `XmlFromHTML` error at `XmlErrError` level with the formatted
/// message, and clears `well_formed`.
///
/// Note that `$ctxt` and `$val` are expanded more than once, so the argument
/// expressions should be free of side effects.
#[doc(alias = "htmlParseErrInt")]
macro_rules! html_parse_err_int {
($ctxt:expr, $error:expr, $msg:literal, $val:expr) => {
// Equivalent to "not (disable_sax and EOF)".
if !$ctxt.disable_sax || !matches!($ctxt.instate, XmlParserInputState::XmlParserEOF) {
$ctxt.err_no = $error as i32;
__xml_raise_error!(
None,
None,
None,
$ctxt as *mut XmlParserCtxt as _,
None,
XmlErrorDomain::XmlFromHTML,
$error,
XmlErrorLevel::XmlErrError,
None,
0,
None,
None,
None,
$val,
0,
Some(format!($msg, $val).as_str()),
);
$ctxt.well_formed = false;
}
};
}
#[doc(alias = "htmlFindEncoding")]
fn html_find_encoding(ctxt: &mut XmlParserCtxt) -> Option<String> {
if ctxt.input().is_none_or(|input| {
input.encoding.is_some() || input.buf.as_ref().is_none_or(|buf| buf.encoder.is_some())
}) {
return None;
}
if ctxt.content_bytes().is_empty() {
return None;
}
const HTTP_EQUIV: &[u8] = b"HTTP-EQUIV";
const CONTENT: &[u8] = b"CONTENT";
const CHARSET: &[u8] = b"CHARSET=";
let start = ctxt.content_bytes();
let cur = start
.windows(HTTP_EQUIV.len())
.position(|chunk| chunk.eq_ignore_ascii_case(HTTP_EQUIV))
.map(|pos| &start[pos..])?;
let cur = cur
.windows(CONTENT.len())
.position(|chunk| chunk.eq_ignore_ascii_case(CONTENT))
.map(|pos| &start[pos..])?;
let cur = cur
.windows(CHARSET.len())
.position(|chunk| chunk.eq_ignore_ascii_case(CHARSET))
.map(|pos| &start[pos..])?;
let cur = &cur[CHARSET.len()..];
let start = cur;
let count = cur
.iter()
.position(|c| !c.is_ascii_alphanumeric() && !matches!(c, b'-' | b'_' | b':' | b'/'))
.unwrap_or(cur.len());
if count == 0 {
return None;
}
unsafe {
Some(from_utf8_unchecked(&start[..count]).to_owned())
}
}
/// Report an HTML parser error with up to two string arguments.
///
/// - `ctxt`: the parser context, if any. When present, its `err_no` is set to
///   `error` and `well_formed` is cleared.
/// - `error`: the error code to report.
/// - `msg`: the already-formatted message.
/// - `str1`, `str2`: extra strings attached to the raised error.
///
/// The error is suppressed only when a context is present, SAX is disabled and
/// the parser has reached `XmlParserEOF`. A missing context does not suppress
/// the report: the error is raised with a null context pointer.
#[doc(alias = "htmlParseErr")]
fn html_parse_err(
    mut ctxt: Option<&mut XmlParserCtxt>,
    error: XmlParserErrors,
    msg: &str,
    str1: Option<&str>,
    str2: Option<&str>,
) {
    // Only an existing, already-stopped parser silences the error; with
    // `is_none_or` the `None` handling below could never run.
    if ctxt.as_ref().is_some_and(|ctxt| {
        ctxt.disable_sax && matches!(ctxt.instate, XmlParserInputState::XmlParserEOF)
    }) {
        return;
    }
    if let Some(ctxt) = ctxt.as_mut() {
        ctxt.err_no = error as i32;
    }
    let ptr = ctxt
        .as_mut()
        .map_or(null_mut(), |ctxt| *ctxt as *mut XmlParserCtxt);
    __xml_raise_error!(
        None,
        None,
        None,
        ptr as _,
        None,
        XmlErrorDomain::XmlFromHTML,
        error,
        XmlErrorLevel::XmlErrError,
        None,
        0,
        str1.map(|s| s.to_owned().into()),
        str2.map(|s| s.to_owned().into()),
        None,
        0,
        0,
        Some(msg),
    );
    if let Some(ctxt) = ctxt {
        ctxt.well_formed = false;
    }
}
/// Decode the character at the current input position without consuming it.
///
/// On return `*len` holds the number of input bytes the character occupies
/// (0 when a pending `ctxt.token` is returned or no input is available).
///
/// Behaviour, in order:
/// - returns 0 when the parser state is `XmlParserEOF`;
/// - returns `ctxt.token` (with `*len = 0`) when a token is pending;
/// - grows the input when fewer than `INPUT_CHUNK` bytes remain, returning 0
///   if growing fails;
/// - while the context charset is not yet UTF-8: an ASCII byte is returned
///   directly (a NUL byte inside the content is reported and replaced by a
///   space); on the first non-ASCII byte an encoding is guessed with
///   `html_find_encoding`, falling back to ISO-8859-1, and the charset is
///   marked as UTF-8;
/// - otherwise decodes one UTF-8 character from at most four bytes. On a
///   malformed sequence an encoding error is reported, the input is switched
///   to ISO-8859-1 if no encoder is attached, and the raw byte is returned with
///   `*len = 1`.
///
/// Panics if the context has no current input (`input().unwrap()`).
#[doc(alias = "htmlCurrentChar")]
fn html_current_char(ctxt: &mut XmlParserCtxt, len: &mut i32) -> i32 {
if matches!(ctxt.instate, XmlParserInputState::XmlParserEOF) {
return 0;
}
if ctxt.token != 0 {
*len = 0;
return ctxt.token;
}
if ctxt.input().unwrap().remainder_len() < INPUT_CHUNK && ctxt.force_grow() < 0 {
return 0;
}
if ctxt.charset != XmlCharEncoding::UTF8 {
// Until an encoding is settled, ASCII bytes are passed through unchanged.
if ctxt.current_byte() < 0x80 {
*len = 1;
if ctxt.current_byte() == 0 && !ctxt.content_bytes().is_empty() {
html_parse_err_int!(
ctxt,
XmlParserErrors::XmlErrInvalidChar,
"Char 0x{:X} out of allowed range\n",
0
);
return b' ' as _;
}
return ctxt.current_byte() as _;
}
// First non-ASCII byte: try to find a declared charset, otherwise assume
// ISO-8859-1.
if let Some(guess) = html_find_encoding(ctxt) {
ctxt.input_mut().unwrap().encoding = Some(guess.clone());
if let Some(handler) = find_encoding_handler(&guess) {
if handler.name() != "UTF-8" {
ctxt.switch_to_encoding(handler);
}
} else {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInvalidEncoding,
format!("Unsupported encoding {guess}").as_str(),
Some(&guess),
None,
);
}
} else {
ctxt.switch_encoding(XmlCharEncoding::ISO8859_1);
}
ctxt.charset = XmlCharEncoding::UTF8;
}
let content = ctxt.content_bytes();
// A UTF-8 scalar value is at most four bytes long.
let l = 4.min(content.len());
if l == 0 {
*len = 0;
return 0;
}
let c = match from_utf8(&content[..l]) {
Ok(s) => {
let c = s.chars().next().unwrap();
*len = c.len_utf8() as i32;
c
}
// The window starts with at least one valid character followed by an
// invalid or truncated sequence; only the first character matters here.
Err(e) if e.valid_up_to() > 0 => {
// SAFETY: `valid_up_to()` bytes were just validated as UTF-8 by `from_utf8`.
let s = unsafe { from_utf8_unchecked(&content[..e.valid_up_to()]) };
let c = s.chars().next().unwrap();
*len = c.len_utf8() as i32;
c
}
Err(e) => {
match e.error_len() {
// The very first byte starts an invalid sequence.
Some(l) => {
*len = l as i32;
use std::fmt::Write as _;
let mut buffer = String::new();
if ctxt.input().unwrap().remainder_len() >= 4 {
let content = ctxt.content_bytes();
writeln!(
buffer,
"Bytes: 0x{:02X} 0x{:02X} 0x{:02X} 0x{:02X}",
content[0] as u32,
content[1] as u32,
content[2] as u32,
content[3] as u32,
)
.ok();
} else {
writeln!(buffer, "Bytes: 0x{:02X}", ctxt.current_byte() as u32,).ok();
}
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInvalidEncoding,
"Input is not proper UTF-8, indicate encoding !\n",
Some(&buffer),
None,
);
if ctxt.input().unwrap().buf.is_some()
&& ctxt
.input()
.unwrap()
.buf
.as_ref()
.unwrap()
.encoder
.is_none()
{
ctxt.switch_encoding(XmlCharEncoding::ISO8859_1);
}
// Hand back the raw byte so the caller can make progress.
*len = 1;
return ctxt.current_byte() as i32;
}
// The sequence is incomplete (input ended mid-character).
None => {
*len = 0;
return 0;
}
}
}
};
if (*len > 1 && !c.is_xml_char())
|| (*len == 1 && c == '\0' && !ctxt.content_bytes().is_empty())
{
html_parse_err_int!(
ctxt,
XmlParserErrors::XmlErrInvalidChar,
"Char 0x{:X} out of allowed range\n",
c as i32
);
}
c as i32
}
/// Parse a name one decoded character at a time.
///
/// The first character must be a letter, `_` or `:`; following characters may
/// also be digits, `.`, `-`, combining characters or extenders. A space, `>`
/// or `/` always ends the name.
///
/// Returns `None` when no name starts at the current position, when the name
/// exceeds the length limit (`XML_MAX_TEXT_LENGTH` with the `XmlParseHuge`
/// option, `XML_MAX_NAME_LENGTH` otherwise; a "name too long" error is
/// reported), or when the parser reached `XmlParserEOF` while reading.
#[doc(alias = "htmlParseNameComplex")]
fn html_parse_name_complex(ctxt: &mut XmlParserCtxt) -> Option<String> {
let mut l: i32 = 0;
let mut c: i32;
let max_length = if ctxt.options & XmlParserOption::XmlParseHuge as i32 != 0 {
XML_MAX_TEXT_LENGTH
} else {
XML_MAX_NAME_LENGTH
};
// Remember the charset so an encoding switch during decoding can be detected.
let charset = ctxt.charset;
c = html_current_char(ctxt, &mut l);
if c == b' ' as i32
|| c == b'>' as i32
|| c == b'/' as i32
|| (!xml_is_letter(c as u32) && c != b'_' as i32 && c != b':' as i32)
{
return None;
}
let mut ret = String::new();
while c != b' ' as i32
&& c != b'>' as i32
&& c != b'/' as i32
&& (xml_is_letter(c as u32)
|| (c as u32).is_xml_digit()
|| c == b'.' as i32
|| c == b'-' as i32
|| c == b'_' as i32
|| c == b':' as i32
|| (c as u32).is_xml_combining()
|| (c as u32).is_xml_extender())
{
ret.push(char::from_u32(c as u32).unwrap());
if ret.len() > max_length {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrNameTooLong,
"name too long",
None,
None,
);
return None;
}
ctxt.advance_with_line_handling(l as usize);
ctxt.token = 0;
c = html_current_char(ctxt, &mut l);
if ctxt.charset != charset {
// The charset changed while reading the next character: parse again via a
// recursive call and return its result instead of `ret`.
return html_parse_name_complex(ctxt);
}
}
if matches!(ctxt.instate, XmlParserInputState::XmlParserEOF) {
return None;
}
Some(ret)
}
/// Parse a name at the current position, with a fast path for pure-ASCII names.
///
/// Fast path: if the input starts with an ASCII letter, `_` or `:` and is
/// followed by ASCII alphanumerics or `_`, `-`, `:`, `.`, and the byte that
/// terminates the run is a non-NUL ASCII byte, the name is taken directly from
/// the buffer and the cursor and column are advanced by its length.
///
/// Returns `None` when the ASCII run extends to the very end of the buffered
/// content. In all other cases the work is delegated to
/// [`html_parse_name_complex`].
#[doc(alias = "htmlParseName")]
fn html_parse_name(ctxt: &mut HtmlParserCtxt) -> Option<String> {
ctxt.grow();
let mut input = ctxt.content_bytes();
if !input.is_empty() && (input[0].is_ascii_alphabetic() || matches!(input[0], b'_' | b':')) {
input = &input[1..];
// `count` is the length of the ASCII name, including the first byte.
let count = input
.iter()
.position(|&b| !b.is_ascii_alphanumeric() && !matches!(b, b'_' | b'-' | b':' | b'.'))
.unwrap_or(input.len())
+ 1;
if count == ctxt.content_bytes().len() {
return None;
}
if (0x01..0x80).contains(&ctxt.content_bytes()[count]) {
// SAFETY: the first `count` bytes were all checked above to be ASCII
// (letters, digits or one of `_ - : .`), hence valid UTF-8.
let ret = unsafe {
String::from_utf8_unchecked(ctxt.content_bytes()[..count].to_vec())
};
ctxt.input_mut().unwrap().cur += count;
ctxt.input_mut().unwrap().col += count as i32;
return Some(ret);
}
}
html_parse_name_complex(ctxt)
}
/// Parse an entity reference of the form `&name;` at the current position.
///
/// `*str` is reset to `None` on entry and set to the parsed name whenever a
/// name could be read, whether or not the reference is terminated by `;` and
/// whether or not the entity is known.
///
/// Returns the matching entity description when the reference is terminated
/// by `;` and the name is found by [`html_entity_lookup`]; only in that case is
/// the `;` consumed. Returns `None` otherwise, reporting an error when the name
/// or the `;` is missing.
#[doc(alias = "htmlParseEntityRef")]
pub(crate) fn html_parse_entity_ref(
    ctxt: &mut HtmlParserCtxt,
    str: &mut Option<String>,
) -> Option<&'static HtmlEntityDesc> {
    *str = None;
    ctxt.input()?;
    if ctxt.current_byte() != b'&' {
        return None;
    }
    ctxt.skip_char();

    let Some(name) = html_parse_name(ctxt) else {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrNameRequired,
            "htmlParseEntityRef: no name\n",
            None,
            None,
        );
        return None;
    };

    ctxt.grow();
    let entity = if ctxt.current_byte() == b';' {
        let found = html_entity_lookup(&name);
        // The terminator is only consumed for a known entity.
        if found.is_some() {
            ctxt.skip_char();
        }
        found
    } else {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrEntityRefSemicolMissing,
            "htmlParseEntityRef: expecting ';'\n",
            None,
            None,
        );
        None
    };
    *str = Some(name);
    entity
}
/// Parse a numeric character reference (`&#NNN;` or `&#xHHH;`) at the current
/// position and return its value.
///
/// Digits are accumulated until a `;` is found; accumulation stops growing
/// once the value reaches `0x110000`. A non-digit before the `;` is reported
/// as a missing semicolon and ends the scan without consuming that byte.
///
/// Returns the code point when it is a valid XML character, and `0` otherwise
/// (after reporting "value too large" or "invalid xmlChar value"). Also returns
/// `0` with an internal error when the context has no input.
#[doc(alias = "htmlParseCharRef")]
pub(crate) fn html_parse_char_ref(ctxt: &mut HtmlParserCtxt) -> i32 {
    if ctxt.input().is_none() {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrInternalError,
            "htmlParseCharRef: context error\n",
            None,
            None,
        );
        return 0;
    }

    // Recognise the prefix, consume it, and remember the radix to use.
    let radix = if ctxt.current_byte() == b'&'
        && ctxt.nth_byte(1) == b'#'
        && matches!(ctxt.nth_byte(2), b'x' | b'X')
    {
        ctxt.advance(3);
        Some(16u32)
    } else if ctxt.content_bytes().starts_with(b"&#") {
        ctxt.advance(2);
        Some(10u32)
    } else {
        None
    };

    let mut val: i32 = 0;
    match radix {
        Some(radix) => {
            let bad_digit = if radix == 16 {
                XmlParserErrors::XmlErrInvalidHexCharRef
            } else {
                XmlParserErrors::XmlErrInvalidDecCharRef
            };
            while ctxt.current_byte() != b';' {
                let Some(digit) = (ctxt.current_byte() as char).to_digit(radix) else {
                    html_parse_err(
                        Some(ctxt),
                        bad_digit,
                        "htmlParseCharRef: missing semicolon\n",
                        None,
                        None,
                    );
                    break;
                };
                // Stop accumulating once the value is already out of range.
                if val < 0x110000 {
                    val = val * radix as i32 + digit as i32;
                }
                ctxt.skip_char();
            }
            if ctxt.current_byte() == b';' {
                ctxt.skip_char();
            }
        }
        None => {
            html_parse_err(
                Some(ctxt),
                XmlParserErrors::XmlErrInvalidCharRef,
                "htmlParseCharRef: invalid value\n",
                None,
                None,
            );
        }
    }

    if (val as u32).is_xml_char() {
        return val;
    }
    if val >= 0x110000 {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrInvalidChar,
            "htmlParseCharRef: value too large\n",
            None,
            None,
        );
    } else {
        html_parse_err_int!(
            &mut *ctxt,
            XmlParserErrors::XmlErrInvalidChar,
            "htmlParseCharRef: invalid xmlChar value {}\n",
            val
        );
    }
    0
}
/// Flush threshold, in bytes, for the chunk buffer used by `html_parse_script`.
const HTML_PARSER_BIG_BUFFER_SIZE: usize = 1000;
/// Maximum number of bytes collected for a tag or attribute name by
/// `html_parse_html_name` and `html_parse_html_name_non_invasive`.
const HTML_PARSER_BUFFER_SIZE: usize = 100;
/// Report an out-of-memory condition.
///
/// - `ctxt`: the parser context, if any. When present it is put into a stopped
///   state: `err_no` is set to `XmlErrNoMemory`, the state becomes
///   `XmlParserEOF` and SAX callbacks are disabled.
/// - `extra`: optional text appended to the message.
///
/// The report is suppressed only when a context is present that already has
/// SAX disabled and is at `XmlParserEOF`. Without a context the error is still
/// raised, with a null context pointer.
#[doc(alias = "htmlErrMemory")]
pub(crate) fn html_err_memory(ctxt: Option<&mut XmlParserCtxt>, extra: Option<&str>) {
    // Only an existing, already-stopped parser silences the error; with
    // `is_none_or` the null-pointer path below could never run.
    if ctxt.as_ref().is_some_and(|ctxt| {
        ctxt.disable_sax && matches!(ctxt.instate, XmlParserInputState::XmlParserEOF)
    }) {
        return;
    }
    let mut ptr = null_mut();
    if let Some(ctxt) = ctxt {
        ctxt.err_no = XmlParserErrors::XmlErrNoMemory as i32;
        ctxt.instate = XmlParserInputState::XmlParserEOF;
        ctxt.disable_sax = true;
        ptr = ctxt as *mut XmlParserCtxt;
    }
    if let Some(extra) = extra {
        __xml_raise_error!(
            None,
            None,
            None,
            ptr as _,
            None,
            XmlErrorDomain::XmlFromParser,
            XmlParserErrors::XmlErrNoMemory,
            XmlErrorLevel::XmlErrFatal,
            None,
            0,
            Some(extra.to_owned().into()),
            None,
            None,
            0,
            0,
            "Memory allocation failed : {}\n",
            extra
        );
    } else {
        __xml_raise_error!(
            None,
            None,
            None,
            ptr as _,
            None,
            XmlErrorDomain::XmlFromParser,
            XmlParserErrors::XmlErrNoMemory,
            XmlErrorLevel::XmlErrFatal,
            None,
            0,
            None,
            None,
            None,
            0,
            0,
            "Memory allocation failed\n",
        );
    }
}
/// Parse an HTML tag or attribute name at the current position, consuming it
/// and folding ASCII upper-case letters to lower case.
///
/// The first byte must be an ASCII letter, `_`, `:` or `.`; subsequent bytes
/// may also be ASCII digits or `-`. At most `HTML_PARSER_BUFFER_SIZE` bytes are
/// consumed.
///
/// Returns `None` (consuming nothing) when the first byte cannot start a name.
#[doc(alias = "htmlParseHTMLName")]
fn html_parse_html_name(ctxt: &mut HtmlParserCtxt) -> Option<String> {
    let first = ctxt.current_byte();
    if !(first.is_ascii_alphabetic() || matches!(first, b'_' | b':' | b'.')) {
        return None;
    }

    // Every accepted byte is ASCII, so the string length equals the byte count.
    let mut name = String::new();
    while name.len() < HTML_PARSER_BUFFER_SIZE {
        let byte = ctxt.current_byte();
        if !(byte.is_ascii_alphanumeric() || matches!(byte, b':' | b'-' | b'_' | b'.')) {
            break;
        }
        name.push(byte.to_ascii_lowercase() as char);
        ctxt.skip_char();
    }
    Some(name)
}
/// Pop the innermost open element name from the name stack.
///
/// After the pop, `ctxt.name` is refreshed to the new top of the stack (or
/// `None` when the stack is empty). Returns the popped name, if any.
#[doc(alias = "htmlnamePop")]
fn html_name_pop(ctxt: &mut HtmlParserCtxt) -> Option<Rc<str>> {
    let popped = ctxt.name_tab.pop();
    ctxt.name = ctxt.name_tab.last().map(Rc::clone);
    popped
}
/// Close every element that is still open.
///
/// For as many entries as the name stack holds on entry, the SAX `end_element`
/// callback (when installed) is invoked with the current name, and the name is
/// popped.
#[doc(alias = "htmlAutoCloseOnEnd")]
fn html_auto_close_on_end(ctxt: &mut HtmlParserCtxt) {
    // The iteration count is fixed up front, independent of callback effects.
    let pending = ctxt.name_tab.len();
    for _ in 0..pending {
        let handler = ctxt.sax.as_deref_mut().and_then(|sax| sax.end_element);
        if let Some(end_element) = handler {
            let current = ctxt.name.clone().unwrap();
            end_element(ctxt, &current);
        }
        html_name_pop(ctxt);
    }
}
/// Close the elements that are implicitly terminated by the arrival of
/// `newtag`.
///
/// - With `Some(newtag)`: while the current element is auto-closed by `newtag`
///   (per [`html_check_auto_close`]), the SAX `end_element` callback is invoked
///   for it (when installed) and it is popped from the name stack.
/// - With `None`: every open element is closed via
///   [`html_auto_close_on_end`].
#[doc(alias = "htmlAutoClose")]
fn html_auto_close(ctxt: &mut HtmlParserCtxt, newtag: Option<&str>) {
    let Some(newtag) = newtag else {
        html_auto_close_on_end(ctxt);
        return;
    };
    while ctxt
        .name
        .as_deref()
        .is_some_and(|name| html_check_auto_close(newtag, name))
    {
        if let Some(end_element) = ctxt.sax.as_deref_mut().and_then(|sax| sax.end_element) {
            // Clone the handle so the name outlives the mutable borrow of `ctxt`.
            let name = ctxt.name.clone();
            let name = name.as_deref().unwrap();
            end_element(ctxt, name);
        }
        html_name_pop(ctxt);
    }
}
/// Global switch read by `html_check_implied`: when it is 0, no implied
/// `html`/`head`/`body` elements are inserted. Initialised to 1.
static HTML_OMITTED_DEFAULT_VALUE: AtomicI32 = AtomicI32::new(1);
/// Push `value` onto the stack of open element names and make it the current
/// name.
///
/// Pushing `head` raises `ctxt.html` to at least 3 and pushing `body` raises it
/// to at least 10. Returns the index of the new entry in the stack.
#[doc(alias = "htmlnamePush")]
fn html_name_push(ctxt: &mut HtmlParserCtxt, value: &str) -> i32 {
    match value {
        "head" if ctxt.html < 3 => ctxt.html = 3,
        "body" if ctxt.html < 10 => ctxt.html = 10,
        _ => {}
    }
    let entry = Rc::<str>::from(value);
    ctxt.name_tab.push(Rc::clone(&entry));
    ctxt.name = Some(entry);
    ctxt.name_tab.len() as i32 - 1
}
/// Insert the `html`, `head` or `body` elements that are implied by the
/// arrival of `newtag`.
///
/// Nothing is inserted when the `HtmlParseNoimplied` option is set, when
/// `HTML_OMITTED_DEFAULT_VALUE` is 0, or when `newtag` is `html`.
///
/// Otherwise:
/// - an `html` element is opened if no element is open yet;
/// - `body` and `head` tags imply nothing further;
/// - `script`, `style`, `meta`, `link`, `title` and `base` arriving with at
///   most one open element imply `head`, unless `ctxt.html >= 3`;
/// - any other tag except `noframes`, `frame` and `frameset` implies `body`,
///   unless `ctxt.html >= 10` or a `body`/`head` is already open.
///
/// Each implied element is pushed on the name stack and announced through the
/// SAX `start_element` callback (when installed) with no attributes.
#[doc(alias = "htmlCheckImplied")]
fn html_check_implied(ctxt: &mut HtmlParserCtxt, newtag: &str) {
    /// Push `tag` and fire `start_element` for it with an empty attribute list.
    fn open_implied(ctxt: &mut HtmlParserCtxt, tag: &str) {
        html_name_push(ctxt, tag);
        if let Some(start_element) = ctxt.sax.as_deref_mut().and_then(|sax| sax.start_element) {
            start_element(ctxt, tag, &[]);
        }
    }

    let implied_disabled = ctxt.options & HtmlParserOption::HtmlParseNoimplied as i32 != 0
        || HTML_OMITTED_DEFAULT_VALUE.load(Ordering::Relaxed) == 0;
    if implied_disabled || newtag == "html" {
        return;
    }

    if ctxt.name_tab.is_empty() {
        open_implied(ctxt, "html");
    }
    if matches!(newtag, "body" | "head") {
        return;
    }

    let belongs_in_head = matches!(
        newtag,
        "script" | "style" | "meta" | "link" | "title" | "base"
    );
    if ctxt.name_tab.len() <= 1 && belongs_in_head {
        if ctxt.html >= 3 {
            return;
        }
        open_implied(ctxt, "head");
    } else if !matches!(newtag, "noframes" | "frame" | "frameset") {
        if ctxt.html >= 10 {
            return;
        }
        let section_open = ctxt
            .name_tab
            .iter()
            .any(|open| matches!(&**open, "body" | "head"));
        if section_open {
            return;
        }
        open_implied(ctxt, "body");
    }
}
/// Parse an attribute value up to (but not including) the terminator `stop`,
/// expanding character and entity references.
///
/// `stop` is the closing quote byte, or 0 for an unquoted value, in which case
/// the value ends at the first blank or `>`. A NUL byte always ends the value.
///
/// References are handled as follows: `&#...` goes through
/// [`html_parse_char_ref`]; `&name` goes through [`html_parse_entity_ref`],
/// with unknown entities copied through as `&name` and a bare `&` kept as is.
///
/// Returns `None` when the parser reaches `XmlParserEOF` while decoding, or when
/// the value grows beyond the limit (`XML_MAX_HUGE_LENGTH` with `XmlParseHuge`,
/// `XML_MAX_TEXT_LENGTH` otherwise), in which case an error is reported.
#[doc(alias = "htmlParseHTMLAttribute")]
fn html_parse_html_attribute(ctxt: &mut HtmlParserCtxt, stop: u8) -> Option<String> {
let max_length = if ctxt.options & XmlParserOption::XmlParseHuge as i32 != 0 {
XML_MAX_HUGE_LENGTH
} else {
XML_MAX_TEXT_LENGTH
};
let mut out = String::new();
while ctxt.current_byte() != 0 && ctxt.current_byte() != stop {
// Unquoted values end at `>` or at the first blank.
if stop == 0 && ctxt.current_byte() == b'>' {
break;
}
if stop == 0 && ctxt.current_byte().is_xml_blank_char() {
break;
}
if ctxt.current_byte() == b'&' {
if ctxt.nth_byte(1) == b'#' {
// NOTE(review): `html_parse_char_ref` returns 0 on failure, so a NUL
// character is pushed in that case — confirm this is intended.
let c = html_parse_char_ref(ctxt) as u32;
out.push(char::from_u32(c).unwrap());
} else {
let mut name = None;
let ent = html_parse_entity_ref(ctxt, &mut name);
if let Some(name) = name {
if let Some(ent) = ent {
let c = ent.value;
out.push(char::from_u32(c).unwrap());
} else {
// Unknown entity: keep the reference text verbatim.
out.push('&');
out.push_str(&name);
}
} else {
out.push('&');
}
}
} else {
let mut l: i32 = 0;
let c = html_current_char(&mut *ctxt, &mut l) as u32;
if matches!(ctxt.instate, XmlParserInputState::XmlParserEOF) {
return None;
}
out.push(char::from_u32(c).unwrap());
ctxt.advance_with_line_handling(l as usize);
ctxt.token = 0;
}
if out.len() > max_length {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrAttributeNotFinished,
"attribute value too long\n",
None,
None,
);
return None;
}
}
Some(out)
}
/// Parse an attribute value, which may be double-quoted, single-quoted or
/// unquoted.
///
/// For a quoted value the opening quote is consumed, the content is parsed up
/// to the matching quote, and the closing quote is consumed; a missing closing
/// quote is reported but the parsed content is still returned. For an unquoted
/// value an error is reported when no value could be parsed.
#[doc(alias = "htmlParseAttValue")]
fn html_parse_att_value(ctxt: &mut HtmlParserCtxt) -> Option<String> {
    match ctxt.current_byte() {
        quote @ (b'"' | b'\'') => {
            ctxt.skip_char();
            let value = html_parse_html_attribute(ctxt, quote);
            if ctxt.current_byte() == quote {
                ctxt.skip_char();
            } else {
                let message = if quote == b'"' {
                    "AttValue: \" expected\n"
                } else {
                    "AttValue: ' expected\n"
                };
                html_parse_err(
                    Some(ctxt),
                    XmlParserErrors::XmlErrAttributeNotFinished,
                    message,
                    None,
                    None,
                );
            }
            value
        }
        _ => {
            let value = html_parse_html_attribute(ctxt, 0);
            if value.is_none() {
                html_parse_err(
                    Some(ctxt),
                    XmlParserErrors::XmlErrAttributeWithoutValue,
                    "AttValue: no value found\n",
                    None,
                    None,
                );
            }
            value
        }
    }
}
/// Parse one attribute (`name` or `name = value`) at the current position.
///
/// Returns `(Some(name), value)` on success, where `value` is `None` when the
/// attribute has no `=` part or its value could not be parsed. Returns
/// `(None, None)` and reports an error when no attribute name is found.
#[doc(alias = "htmlParseAttribute")]
fn html_parse_attribute(ctxt: &mut HtmlParserCtxt) -> (Option<String>, Option<String>) {
    match html_parse_html_name(ctxt) {
        None => {
            html_parse_err(
                Some(ctxt),
                XmlParserErrors::XmlErrNameRequired,
                "error parsing attribute name\n",
                None,
                None,
            );
            (None, None)
        }
        Some(name) => {
            html_skip_blank_chars(ctxt);
            let value = if ctxt.current_byte() == b'=' {
                ctxt.skip_char();
                html_skip_blank_chars(ctxt);
                html_parse_att_value(ctxt)
            } else {
                None
            };
            (Some(name), value)
        }
    }
}
/// Apply an encoding name found in the document (for example from a `<meta>`
/// element) to the current input.
///
/// Does nothing when `encoding` is `None`, when the `HtmlParseIgnoreEnc` option
/// is set, or when the input already has an encoding recorded.
///
/// Otherwise leading spaces and tabs are stripped, the name is stored on the
/// input, and the parser switches to it:
/// - a name that parses as a known `XmlCharEncoding` is applied with
///   `switch_encoding`, except that UTF-16/UCS-4 variants are rejected with an
///   error when the input buffer has no encoder attached;
/// - any other name is resolved with `find_encoding_handler`; an unknown name
///   is reported as an unsupported encoding.
///
/// Finally, if the input buffer has an encoder, the already-processed bytes are
/// trimmed from the buffer and the remaining input is decoded; a decoding
/// failure is reported as an encoder error.
///
/// Panics if the context has no current input (`input().unwrap()`).
#[doc(alias = "htmlCheckEncodingDirect")]
fn html_check_encoding_direct(ctxt: &mut HtmlParserCtxt, encoding: Option<&str>) {
if encoding.is_none() || ctxt.options & HtmlParserOption::HtmlParseIgnoreEnc as i32 != 0 {
return;
}
// An encoding that was already recorded on the input is never overridden.
if ctxt.input().unwrap().encoding.is_some() {
return;
}
if let Some(mut encoding) = encoding {
encoding = encoding.trim_start_matches([' ', '\t']);
ctxt.input_mut().unwrap().encoding = Some(encoding.to_owned());
let enc = encoding
.parse::<XmlCharEncoding>()
.unwrap_or(XmlCharEncoding::Error);
if !matches!(enc, XmlCharEncoding::Error) {
// A UTF-16/UCS-4 declaration is rejected when the buffer has no encoder.
if matches!(
enc,
XmlCharEncoding::UTF16LE
| XmlCharEncoding::UTF16BE
| XmlCharEncoding::UCS4LE
| XmlCharEncoding::UCS4BE
) && ctxt.input().unwrap().buf.is_some()
&& ctxt
.input()
.unwrap()
.buf
.as_ref()
.unwrap()
.encoder
.is_none()
{
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInvalidEncoding,
"htmlCheckEncoding: wrong encoding meta\n",
None,
None,
);
} else {
ctxt.switch_encoding(enc);
}
ctxt.charset = XmlCharEncoding::UTF8;
} else {
// Not one of the built-in encodings: look for a registered handler.
if let Some(handler) = find_encoding_handler(encoding) {
ctxt.switch_to_encoding(handler);
ctxt.charset = XmlCharEncoding::UTF8;
} else {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrUnsupportedEncoding,
format!("htmlCheckEncoding: unknown encoding {encoding}\n").as_str(),
Some(encoding),
None,
);
}
}
if ctxt
.input()
.unwrap()
.buf
.as_ref()
.is_some_and(|buf| buf.encoder.is_some())
{
// Drop what has been consumed so far, then decode the rest of the buffer
// with the encoder that is now attached.
let processed = ctxt.input().unwrap().offset_from_base();
ctxt.input_mut()
.unwrap()
.buf
.as_mut()
.unwrap()
.trim_head(processed);
let res = ctxt.input_mut().unwrap().buf.as_mut().unwrap().decode(true);
ctxt.input_mut().unwrap().reset_base();
if res.is_err() {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInvalidEncoding,
"htmlCheckEncoding: encoder error\n",
None,
None,
);
}
}
}
}
/// Extract the charset from a `Content-Type` style attribute value such as
/// `text/html; charset=ISO-8859-1` and apply it.
///
/// The first case-insensitive occurrence of `charset` is located; blanks after
/// it are skipped, and if an `=` follows, everything after the `=` is passed
/// to [`html_check_encoding_direct`]. Nothing happens when `charset` or the
/// `=` is missing.
#[doc(alias = "htmlCheckEncoding")]
fn html_check_encoding(ctxt: &mut HtmlParserCtxt, attvalue: &str) {
    const KEYWORD: &[u8] = b"charset";
    let found = attvalue
        .as_bytes()
        .windows(KEYWORD.len())
        .position(|window| window.eq_ignore_ascii_case(KEYWORD));
    if let Some(start) = found {
        let rest = attvalue[start + KEYWORD.len()..]
            .trim_start_matches(|c: char| c.is_xml_blank_char());
        if let Some(value) = rest.strip_prefix('=') {
            html_check_encoding_direct(ctxt, Some(value));
        }
    }
}
/// Inspect the attributes of a `<meta>` element for an encoding declaration.
///
/// Attributes without a value are ignored. For the others, names and the
/// `Content-Type` value are compared without regard to ASCII case:
/// - `http-equiv="Content-Type"` marks the element as a content-type
///   declaration;
/// - `charset="..."` is applied immediately via
///   [`html_check_encoding_direct`];
/// - `content="..."` is remembered (the last one wins).
///
/// After all attributes are seen, a remembered `content` value is handed to
/// [`html_check_encoding`] if the element was a content-type declaration.
#[doc(alias = "htmlCheckMeta")]
fn html_check_meta(ctxt: &mut HtmlParserCtxt, atts: &[(String, Option<String>)]) {
    let mut is_content_type = false;
    let mut content: Option<&str> = None;
    for (att, value) in atts {
        let Some(value) = value.as_deref() else {
            continue;
        };
        if value.eq_ignore_ascii_case("Content-Type") && att.eq_ignore_ascii_case("http-equiv") {
            is_content_type = true;
        } else if att.eq_ignore_ascii_case("charset") {
            html_check_encoding_direct(ctxt, Some(value));
        } else if att.eq_ignore_ascii_case("content") {
            content = Some(value);
        }
    }
    if is_content_type {
        if let Some(content) = content {
            html_check_encoding(ctxt, content);
        }
    }
}
/// Parse a start tag (`<name attr=value ...`) at the current position.
///
/// The closing `>` or `/>` is *not* consumed. Elements auto-closed or implied
/// by the new tag are handled first, then the attributes are parsed into
/// `ctxt.atts`; a repeated attribute name is reported and dropped. For `meta`
/// elements the attributes are checked for an encoding declaration.
///
/// Misplaced `<html>`, `<head>` and `<body>` tags are reported and discarded:
/// the element is not pushed, no `start_element` callback fires, and
/// `ctxt.depth` is incremented so that the matching end tag can be ignored.
///
/// Returns
/// - `-1` when there is no input, the parser is at `XmlParserEOF`, the current
///   byte is not `<`, or no element name follows;
/// - `1` when the tag was discarded;
/// - `0` when the element was pushed and announced via `start_element`.
#[doc(alias = "htmlParseStartTag")]
fn html_parse_start_tag(ctxt: &mut HtmlParserCtxt) -> i32 {
let mut meta: i32 = 0;
let mut discardtag: i32 = 0;
if ctxt.input().is_none() {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"htmlParseStartTag: context error\n",
None,
None,
);
return -1;
}
if matches!(ctxt.instate, XmlParserInputState::XmlParserEOF) {
return -1;
}
if ctxt.current_byte() != b'<' {
return -1;
}
ctxt.skip_char();
ctxt.grow();
let Some(name) = html_parse_html_name(&mut *ctxt) else {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrNameRequired,
"htmlParseStartTag: invalid element name\n",
None,
None,
);
// Skip the rest of the bogus tag, up to (not including) the next `>`.
while ctxt.current_byte() != 0
&& ctxt.current_byte() != b'>'
&& !matches!(ctxt.instate, XmlParserInputState::XmlParserEOF)
{
ctxt.skip_char();
}
return -1;
};
if name == "meta" {
meta = 1;
}
// Close what the new tag terminates, then open what it implies.
html_auto_close(ctxt, Some(&name));
html_check_implied(ctxt, &name);
// A second <html>, a <head> anywhere but directly under <html>, or a nested
// <body> is reported and discarded.
if !ctxt.name_tab.is_empty() && name == "html" {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlHTMLStrucureError,
"htmlParseStartTag: misplaced <html> tag\n",
Some(&name),
None,
);
discardtag = 1;
ctxt.depth += 1;
}
if ctxt.name_tab.len() != 1 && name == "head" {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlHTMLStrucureError,
"htmlParseStartTag: misplaced <head> tag\n",
Some(&name),
None,
);
discardtag = 1;
ctxt.depth += 1;
}
if name == "body" {
for indx in 0..ctxt.name_tab.len() {
if ctxt.name_tab[indx].as_ref() == "body" {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlHTMLStrucureError,
"htmlParseStartTag: misplaced <body> tag\n",
Some(&name),
None,
);
discardtag = 1;
ctxt.depth += 1;
}
}
}
html_skip_blank_chars(ctxt);
// Attribute loop: runs until the end of the tag (`>` or `/>`) or end of input.
'failed: while ctxt.current_byte() != 0
&& ctxt.current_byte() != b'>'
&& !ctxt.content_bytes().starts_with(b"/>")
&& !matches!(ctxt.instate, XmlParserInputState::XmlParserEOF)
{
ctxt.grow();
if let (Some(attname), attvalue) = html_parse_attribute(ctxt) {
for i in 0..ctxt.atts.len() {
let (name, _) = &ctxt.atts[i];
if name.as_str() == attname {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrAttributeRedefined,
format!("Attribute {attname} redefined\n").as_str(),
Some(&attname),
None,
);
// Drop the duplicate and move on to the next attribute.
html_skip_blank_chars(ctxt);
continue 'failed;
}
}
ctxt.atts.push((attname, attvalue));
} else {
// No attribute name here: skip the bogus text up to the next blank or the
// end of the tag.
while ctxt.current_byte() != 0
&& !ctxt.current_byte().is_xml_blank_char()
&& ctxt.current_byte() != b'>'
&& !ctxt.content_bytes().starts_with(b"/>")
&& !matches!(ctxt.instate, XmlParserInputState::XmlParserEOF)
{
ctxt.skip_char();
}
}
html_skip_blank_chars(ctxt);
}
// The attribute list is moved out temporarily so `ctxt` can be borrowed
// mutably by the callee, then put back.
if meta != 0 && !ctxt.atts.is_empty() {
let atts = take(&mut ctxt.atts);
html_check_meta(ctxt, &atts);
ctxt.atts = atts;
}
if discardtag == 0 {
html_name_push(ctxt, &name);
if let Some(start_element) = ctxt.sax.as_deref_mut().and_then(|sax| sax.start_element) {
let atts = take(&mut ctxt.atts);
start_element(ctxt, &name, &atts);
ctxt.atts = atts;
}
}
ctxt.atts.clear();
discardtag
}
/// Return the end-tag priority of the element called `name`.
///
/// `HTML_END_PRIORITY` is scanned in order; the first entry whose name equals
/// `name`, or whose name is empty (the catch-all entry), supplies the result.
///
/// Panics if the table contains neither a match nor an entry with an empty name.
#[doc(alias = "htmlGetEndPriority")]
fn html_get_end_priority(name: &str) -> i32 {
    HTML_END_PRIORITY
        .iter()
        .find(|entry| entry.name == name || entry.name.is_empty())
        .map(|entry| entry.priority)
        .unwrap()
}
/// Close the open elements that sit above `newtag` on the name stack when an
/// end tag for `newtag` is seen.
///
/// The stack is searched from the innermost element outwards. If an element
/// with a higher end priority than `newtag` is met before `newtag` itself,
/// nothing is closed. Once `newtag` is found, every element above it is closed
/// in turn: a tag mismatch is reported for elements whose table entry has
/// `end_tag == 3`, the SAX `end_element` callback is invoked (when installed),
/// and the name is popped. The `newtag` element itself is left open.
#[doc(alias = "htmlAutoCloseOnClose")]
fn html_auto_close_on_close(ctxt: &mut HtmlParserCtxt, newtag: &str) {
let priority = html_get_end_priority(newtag);
for i in (0..ctxt.name_tab.len()).rev() {
if newtag == ctxt.name_tab[i].as_ref() {
// Found the element being closed: pop everything above it.
while Some(newtag) != ctxt.name.as_deref() {
let info = ctxt.name.as_deref().and_then(|name| html_tag_lookup(name));
if info.filter(|info| info.end_tag == 3).is_some() {
let name = ctxt.name.as_deref().unwrap().to_owned();
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrTagNameMismatch,
format!("Opening and ending tag mismatch: {newtag} and {name}\n").as_str(),
Some(newtag),
Some(&name),
);
}
if let Some(end_element) = ctxt.sax.as_deref_mut().and_then(|sax| sax.end_element) {
let name = ctxt.name.as_deref().unwrap().to_owned();
end_element(ctxt, &name);
}
html_name_pop(ctxt);
}
return;
}
// An end tag may not close through an element of higher priority.
if html_get_end_priority(&ctxt.name_tab[i]) > priority {
return;
}
}
}
/// Pop and return the last entry of `ctxt.node_info_tab`, or `None` when it is
/// empty.
#[doc(alias = "htmlNodeInfoPop")]
fn html_node_info_pop(ctxt: &mut HtmlParserCtxt) -> Option<Rc<RefCell<HtmlParserNodeInfo>>> {
ctxt.node_info_tab.pop()
}
/// Parse an end tag (`</name>`) at the current position.
///
/// After the name, blanks are skipped and everything up to and including the
/// next `>` is consumed (with an error if anything other than `>` follows the
/// name).
///
/// End tags for `html`, `body` and `head` are swallowed while `ctxt.depth` is
/// positive, decrementing it. An end tag for an element that is not open is
/// reported as unexpected. Otherwise the elements above it are closed via
/// [`html_auto_close_on_close`], and if the named element is then the current
/// one, `end_element` is fired (when installed) and it is popped.
///
/// Returns `1` when the named element was closed and popped, `0` in every
/// other case.
#[doc(alias = "htmlParseEndTag")]
fn html_parse_end_tag(ctxt: &mut HtmlParserCtxt) -> i32 {
    if !ctxt.content_bytes().starts_with(b"</") {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrLtSlashRequired,
            "htmlParseEndTag: '</' not found\n",
            None,
            None,
        );
        return 0;
    }
    ctxt.advance(2);

    let name = match html_parse_html_name(ctxt) {
        Some(name) => name,
        None => return 0,
    };

    html_skip_blank_chars(ctxt);
    if ctxt.current_byte() != b'>' {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrGtRequired,
            "End tag : expected '>'\n",
            None,
            None,
        );
        // Skip to the next `>` (or the end of the input).
        #[allow(clippy::while_immutable_condition)]
        while !matches!(ctxt.current_byte(), 0 | b'>') {
            ctxt.skip_char();
        }
    }
    if ctxt.current_byte() == b'>' {
        ctxt.skip_char();
    }

    // Ignore the end tags that pair with start tags discarded earlier.
    if ctxt.depth > 0 && matches!(name.as_str(), "html" | "body" | "head") {
        ctxt.depth -= 1;
        return 0;
    }

    let is_open = ctxt.name_tab.iter().any(|open| &**open == name.as_str());
    if !is_open {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrTagNameMismatch,
            format!("Unexpected end tag : {name}\n").as_str(),
            Some(&name),
            None,
        );
        return 0;
    }

    html_auto_close_on_close(ctxt, &name);

    // Auto-closing may have stopped short of the named element.
    let mismatched = ctxt
        .name
        .as_deref()
        .filter(|&current| current != name)
        .map(str::to_owned);
    if let Some(current) = mismatched {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrTagNameMismatch,
            format!("Opening and ending tag mismatch: {name} and {current}\n").as_str(),
            Some(&name),
            Some(&current),
        );
    }

    if ctxt.name.as_deref() != Some(name.as_str()) {
        return 0;
    }
    if let Some(end_element) = ctxt.sax.as_deref_mut().and_then(|sax| sax.end_element) {
        end_element(ctxt, &name);
    }
    html_node_info_pop(ctxt);
    html_name_pop(ctxt);
    1
}
/// Read the HTML name that starts one byte *after* the current position,
/// without consuming any input, folding ASCII upper-case letters to lower case.
///
/// The first byte must be an ASCII letter, `_` or `:`; subsequent bytes may
/// also be ASCII digits or `-`. At most `HTML_PARSER_BUFFER_SIZE` bytes are
/// read.
///
/// Returns `None` when the byte after the current one cannot start a name.
#[doc(alias = "htmlParseHTMLName_nonInvasive")]
fn html_parse_html_name_non_invasive(ctxt: &mut HtmlParserCtxt) -> Option<String> {
    let first = ctxt.nth_byte(1);
    if !(first.is_ascii_alphabetic() || matches!(first, b'_' | b':')) {
        return None;
    }

    // Every accepted byte is ASCII, so `name.len()` doubles as the look-ahead
    // offset.
    let mut name = String::new();
    while name.len() < HTML_PARSER_BUFFER_SIZE {
        let byte = ctxt.nth_byte(1 + name.len());
        if !(byte.is_ascii_alphanumeric() || matches!(byte, b':' | b'-' | b'_')) {
            break;
        }
        name.push(byte.to_ascii_lowercase() as char);
    }
    Some(name)
}
/// Parse the raw text content of the current element (the element named by
/// `ctxt.name`) and deliver it through SAX.
///
/// Characters are copied verbatim until the input ends or a closing tag is
/// met:
/// - in recovery mode, only `</` followed by the current element's name
///   (compared without regard to ASCII case) ends the content; any other `</`
///   is reported and kept as text;
/// - otherwise, `</` followed by an ASCII letter ends the content.
///
/// Text is delivered in chunks through the `cdata_block` callback, or through
/// `characters` when `cdata_block` is not installed. Invalid characters are
/// reported and dropped. Nothing more is delivered once the parser reaches
/// `XmlParserEOF`.
#[doc(alias = "htmlParseScript")]
fn html_parse_script(ctxt: &mut HtmlParserCtxt) {
// The buffer is flushed once it holds HTML_PARSER_BIG_BUFFER_SIZE bytes; the
// extra 5 bytes leave room for one more encoded character plus the NUL
// written before each flush.
let mut buf: [XmlChar; HTML_PARSER_BIG_BUFFER_SIZE + 5] = [0; HTML_PARSER_BIG_BUFFER_SIZE + 5];
let mut nbchar: i32 = 0;
let mut cur: i32;
let mut l: i32 = 0;
cur = html_current_char(ctxt, &mut l);
while cur != 0 {
if cur == b'<' as i32 && ctxt.nth_byte(1) == b'/' {
if ctxt.recovery {
let context_name = ctxt.name.as_deref().unwrap();
// The bytes following `</`.
let content = &ctxt.content_bytes()[2..];
if context_name.len() <= content.len()
&& content[..context_name.len()].eq_ignore_ascii_case(context_name.as_bytes())
{
break;
} else {
let name = ctxt.name.as_deref().unwrap().to_owned();
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrTagNameMismatch,
format!("Element {name} embeds close tag\n").as_str(),
Some(&name),
None,
);
}
} else if (ctxt.nth_byte(2) >= b'A' && ctxt.nth_byte(2) <= b'Z')
|| (ctxt.nth_byte(2) >= b'a' && ctxt.nth_byte(2) <= b'z')
{
break;
}
}
if (cur as u32).is_xml_char() {
let c = char::from_u32(cur as u32).unwrap();
let s = c.encode_utf8(&mut buf[nbchar as usize..]);
nbchar += s.len() as i32;
} else {
html_parse_err_int!(
ctxt,
XmlParserErrors::XmlErrInvalidChar,
"Invalid char in CDATA 0x{:X}\n",
cur
);
}
ctxt.advance_with_line_handling(l as usize);
ctxt.token = 0;
if nbchar >= HTML_PARSER_BIG_BUFFER_SIZE as i32 {
// Buffer full: hand the chunk to SAX and start over.
buf[nbchar as usize] = 0;
let s = from_utf8(&buf[..nbchar as usize]).expect("Internal Error");
if let Some(sax) = ctxt.sax.as_deref_mut() {
if let Some(cdata_block) = sax.cdata_block {
cdata_block(ctxt, s);
} else if let Some(characters) = sax.characters {
characters(ctxt, s);
}
}
nbchar = 0;
ctxt.shrink();
}
cur = html_current_char(ctxt, &mut l);
}
if matches!(ctxt.instate, XmlParserInputState::XmlParserEOF) {
return;
}
// Deliver whatever is left in the buffer.
if nbchar != 0 && !ctxt.disable_sax {
if let Some(sax) = ctxt.sax.as_deref_mut() {
buf[nbchar as usize] = 0;
let s = from_utf8(&buf[..nbchar as usize]).expect("Internal Error");
if let Some(cdata_block) = sax.cdata_block {
cdata_block(ctxt, s);
} else if let Some(characters) = sax.characters {
characters(ctxt, s);
}
}
}
}
/// Parses a quoted system literal (`"..."` or `'...'`) at the current position.
///
/// Returns the literal's content without the quotes. Returns `None` when the literal
/// does not start with a quote, is not closed by the same quote before the input
/// ends, or contains a byte rejected by `is_xml_char` (each case is reported through
/// the parser error channel).
///
/// The scan is byte-oriented, and `is_xml_char` is applied to single bytes, so bytes
/// that do not form valid UTF-8 can reach the conversion. They are therefore converted
/// lossily (invalid sequences become U+FFFD) instead of panicking on malformed input.
#[doc(alias = "htmlParseSystemLiteral")]
fn html_parse_system_literal(ctxt: &mut HtmlParserCtxt) -> Option<String> {
    let mut len = 0;
    let mut err = 0;
    if !matches!(ctxt.current_byte(), b'"' | b'\'') {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrLiteralNotStarted,
            "SystemLiteral \" or ' expected\n",
            None,
            None,
        );
        return None;
    }
    let quote = ctxt.current_byte() as i32;
    ctxt.skip_char();
    // Remember where the content starts, relative to the input base.
    let start_position = ctxt.input().unwrap().offset_from_base();
    while ctxt.current_byte() != 0 && ctxt.current_byte() as i32 != quote {
        if !ctxt.current_byte().is_xml_char() {
            html_parse_err_int!(
                &mut *ctxt,
                XmlParserErrors::XmlErrInvalidChar,
                "Invalid char in SystemLiteral 0x{:X}\n",
                ctxt.current_byte() as i32
            );
            err = 1;
        }
        ctxt.skip_char();
        len += 1;
    }
    if ctxt.current_byte() as i32 != quote {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrLiteralNotFinished,
            "Unfinished SystemLiteral\n",
            None,
            None,
        );
        None
    } else {
        let mut ret = None;
        if err == 0 {
            let content =
                &ctxt.input().unwrap().base_contents()[start_position..start_position + len];
            // The document controls these bytes; never panic on invalid UTF-8.
            ret = Some(String::from_utf8_lossy(content).into_owned());
        }
        // Consume the closing quote.
        ctxt.skip_char();
        ret
    }
}
/// Parses a quoted public identifier literal (`"..."` or `'...'`) at the current position.
///
/// Returns the content between the quotes, or `None` when the opening quote is
/// missing, the literal is not closed by the same quote, or any byte fails
/// `is_xml_pubid_char`. Every failure is reported through the parser error channel.
#[doc(alias = "htmlParsePubidLiteral")]
fn html_parse_pubid_literal(ctxt: &mut HtmlParserCtxt) -> Option<String> {
    let quote = ctxt.current_byte();
    if !matches!(quote, b'"' | b'\'') {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrLiteralNotStarted,
            "PubidLiteral \" or ' expected\n",
            None,
            None,
        );
        return None;
    }
    ctxt.skip_char();
    let start = ctxt.input().unwrap().offset_from_base();
    let mut count = 0usize;
    let mut all_valid = true;
    loop {
        let byte = ctxt.current_byte();
        if byte == 0 || byte == quote {
            break;
        }
        if !byte.is_xml_pubid_char() {
            html_parse_err_int!(
                &mut *ctxt,
                XmlParserErrors::XmlErrInvalidChar,
                "Invalid char in PubidLiteral 0x{:X}\n",
                ctxt.current_byte() as i32
            );
            all_valid = false;
        }
        count += 1;
        ctxt.skip_char();
    }
    if ctxt.current_byte() != quote {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrLiteralNotFinished,
            "Unfinished PubidLiteral\n",
            None,
            None,
        );
        return None;
    }
    let literal = if all_valid {
        let bytes = &ctxt.input().unwrap().base_contents()[start..start + count];
        Some(String::from_utf8(bytes.to_vec()).unwrap())
    } else {
        None
    };
    // Step over the closing quote.
    ctxt.skip_char();
    literal
}
/// Parses an external identifier introduced by `SYSTEM` or `PUBLIC` (ASCII case-insensitive).
///
/// Returns `(public_id, system_uri)`:
/// - `SYSTEM "uri"` yields `(None, uri)`; a missing URI is reported as an error;
/// - `PUBLIC "pubid" ["uri"]` yields `(pubid, uri)`; the URI is parsed only if a quote follows;
/// - anything else yields `(None, None)` and consumes nothing.
#[doc(alias = "htmlParseExternalID")]
fn html_parse_external_id(ctxt: &mut HtmlParserCtxt) -> (Option<String>, Option<String>) {
    let is_system = ctxt
        .content_bytes()
        .get(..6)
        .is_some_and(|head| head.eq_ignore_ascii_case(b"SYSTEM"));
    if is_system {
        ctxt.advance(6);
        if !ctxt.current_byte().is_xml_blank_char() {
            html_parse_err(
                Some(ctxt),
                XmlParserErrors::XmlErrSpaceRequired,
                "Space required after 'SYSTEM'\n",
                None,
                None,
            );
        }
        html_skip_blank_chars(ctxt);
        let system_uri = html_parse_system_literal(ctxt);
        if system_uri.is_none() {
            html_parse_err(
                Some(ctxt),
                XmlParserErrors::XmlErrURIRequired,
                "htmlParseExternalID: SYSTEM, no URI\n",
                None,
                None,
            );
        }
        return (None, system_uri);
    }

    let is_public = ctxt
        .content_bytes()
        .get(..6)
        .is_some_and(|head| head.eq_ignore_ascii_case(b"PUBLIC"));
    if !is_public {
        return (None, None);
    }

    ctxt.advance(6);
    if !ctxt.current_byte().is_xml_blank_char() {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrSpaceRequired,
            "Space required after 'PUBLIC'\n",
            None,
            None,
        );
    }
    html_skip_blank_chars(ctxt);
    let public_id = html_parse_pubid_literal(ctxt);
    if public_id.is_none() {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrPubidRequired,
            "htmlParseExternalID: PUBLIC, no Public Identifier\n",
            None,
            None,
        );
    }
    html_skip_blank_chars(ctxt);
    // The system URI is optional after a public identifier.
    let system_uri = if matches!(ctxt.current_byte(), b'"' | b'\'') {
        html_parse_system_literal(ctxt)
    } else {
        None
    };
    (public_id, system_uri)
}
/// Parses a DOCTYPE declaration and reports it through the SAX `internal_subset` callback.
///
/// The first 9 bytes (the `<!DOCTYPE` keyword the callers have already matched) are
/// skipped unconditionally. A missing name or missing closing `>` is reported as an
/// error; in the latter case input is skipped up to the next `>`.
#[doc(alias = "htmlParseDocTypeDecl")]
fn html_parse_doc_type_decl(ctxt: &mut HtmlParserCtxt) {
    ctxt.advance(9);
    html_skip_blank_chars(ctxt);

    let name = html_parse_name(ctxt);
    if name.is_none() {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrNameRequired,
            "htmlParseDocTypeDecl : no DOCTYPE name !\n",
            None,
            None,
        );
    }
    html_skip_blank_chars(ctxt);

    let (external_id, uri) = html_parse_external_id(ctxt);
    html_skip_blank_chars(ctxt);

    if ctxt.current_byte() != b'>' {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrDoctypeNotFinished,
            "DOCTYPE improperly terminated\n",
            None,
            None,
        );
        // Resynchronise on the next '>' (or stop at end of input).
        loop {
            if ctxt.content_bytes().is_empty()
                || ctxt.current_byte() == b'>'
                || matches!(ctxt.instate, XmlParserInputState::XmlParserEOF)
            {
                break;
            }
            ctxt.skip_char();
        }
    }
    if ctxt.current_byte() == b'>' {
        ctxt.skip_char();
    }

    if ctxt.disable_sax {
        return;
    }
    let callback = ctxt.sax.as_deref_mut().and_then(|sax| sax.internal_subset);
    if let Some(internal_subset) = callback {
        internal_subset(
            ctxt,
            name.as_deref(),
            external_id.as_deref(),
            uri.as_deref(),
        );
    }
}
/// Parses an HTML comment (`<!-- ... -->`) and passes its text to the SAX `comment` callback.
///
/// Does nothing unless no token is pending and the input starts with `<!--`
/// (the same precondition style as `html_parse_pi`).
///
/// Besides the regular `-->` terminator, the abrupt endings `<!-->` and `<!--->` and
/// the incorrect terminator `--!>` also close the comment, each with an error report.
/// Reaching end of input first reports "Comment not terminated". Comments longer than
/// the configured maximum (`XML_MAX_HUGE_LENGTH` with `XmlParseHuge`, otherwise
/// `XML_MAX_TEXT_LENGTH`) are rejected.
#[doc(alias = "htmlParseComment")]
fn html_parse_comment(ctxt: &mut HtmlParserCtxt) {
    // Sliding window of three characters: `q`, `r`, `cur` (plus `next` as look-ahead).
    let mut q: i32;
    let mut ql: i32 = 0;
    let mut r: i32;
    let mut rl: i32 = 0;
    let mut cur: i32;
    let mut l: i32 = 0;
    let mut next: i32;
    let mut nl: i32 = 0;
    let max_length = if ctxt.options & XmlParserOption::XmlParseHuge as i32 != 0 {
        XML_MAX_HUGE_LENGTH
    } else {
        XML_MAX_TEXT_LENGTH
    };
    // Check that there is a comment right here. A pending token means the raw bytes
    // must not be interpreted, so bail out in that case as well.
    if ctxt.token != 0 || !ctxt.content_bytes().starts_with(b"<!--") {
        return;
    }
    let state: XmlParserInputState = ctxt.instate;
    ctxt.instate = XmlParserInputState::XmlParserComment;
    ctxt.advance(4);
    let mut buf = String::new();
    q = html_current_char(&mut *ctxt, &mut ql);
    if q == 0 {
        // End of input right after `<!--`: fall through to the "not terminated" error.
    } else {
        if q == b'>' as i32 {
            // `<!-->`
            html_parse_err(
                Some(ctxt),
                XmlParserErrors::XmlErrCommentAbruptlyEnded,
                "Comment abruptly ended",
                None,
                None,
            );
            cur = b'>' as i32;
        } else {
            ctxt.advance_with_line_handling(ql as usize);
            ctxt.token = 0;
            r = html_current_char(&mut *ctxt, &mut rl);
            if r == 0 {
                html_parse_err(
                    Some(ctxt),
                    XmlParserErrors::XmlErrCommentNotFinished,
                    format!("Comment not terminated \n<!--{buf}\n").as_str(),
                    Some(&buf),
                    None,
                );
                return;
            }
            if q == b'-' as i32 && r == b'>' as i32 {
                // `<!--->`
                html_parse_err(
                    Some(ctxt),
                    XmlParserErrors::XmlErrCommentAbruptlyEnded,
                    "Comment abruptly ended",
                    None,
                    None,
                );
                cur = b'>' as i32;
            } else {
                ctxt.advance_with_line_handling(rl as usize);
                ctxt.token = 0;
                cur = html_current_char(&mut *ctxt, &mut l);
                // Loop until the window reads `-->` or the input ends.
                while cur != 0 && (cur != b'>' as i32 || r != b'-' as i32 || q != b'-' as i32) {
                    ctxt.advance_with_line_handling(l as usize);
                    ctxt.token = 0;
                    next = html_current_char(&mut *ctxt, &mut nl);
                    if q == b'-' as i32
                        && r == b'-' as i32
                        && cur == b'!' as i32
                        && next == b'>' as i32
                    {
                        html_parse_err(
                            Some(ctxt),
                            XmlParserErrors::XmlErrCommentNotFinished,
                            "Comment incorrectly closed by '--!>'",
                            None,
                            None,
                        );
                        cur = b'>' as i32;
                        break;
                    }
                    // The oldest character of the window is now known to be content.
                    if (q as u32).is_xml_char() {
                        buf.push(char::from_u32(q as u32).unwrap());
                    } else {
                        html_parse_err_int!(
                            &mut *ctxt,
                            XmlParserErrors::XmlErrInvalidChar,
                            "Invalid char in comment 0x{:X}\n",
                            q
                        );
                    }
                    if buf.len() > max_length {
                        html_parse_err(
                            Some(ctxt),
                            XmlParserErrors::XmlErrCommentNotFinished,
                            "comment too long",
                            None,
                            None,
                        );
                        ctxt.instate = state;
                        return;
                    }
                    q = r;
                    r = cur;
                    cur = next;
                    l = nl;
                }
            }
        }
        if matches!(ctxt.instate, XmlParserInputState::XmlParserEOF) {
            return;
        }
        if cur == b'>' as i32 {
            // Consume the closing '>' and deliver the comment.
            ctxt.skip_char();
            if !ctxt.disable_sax {
                if let Some(comment) = ctxt.sax.as_deref_mut().and_then(|sax| sax.comment) {
                    comment(&mut *ctxt, &buf);
                }
            }
            ctxt.instate = state;
            return;
        }
    }
    html_parse_err(
        Some(ctxt),
        XmlParserErrors::XmlErrCommentNotFinished,
        format!("Comment not terminated \n<!--{buf}\n").as_str(),
        Some(&buf),
        None,
    );
}
/// Reports an incorrectly opened comment and skips input up to and including the
/// next `>`, or up to the first 0 byte if no `>` is found before it.
fn html_skip_bogus_comment(ctxt: &mut HtmlParserCtxt) {
    html_parse_err(
        Some(ctxt),
        XmlParserErrors::XmlHTMLIncorrectlyOpenedComment,
        "Incorrectly opened comment\n",
        None,
        None,
    );
    loop {
        let byte = ctxt.current_byte();
        if byte == 0 {
            break;
        }
        ctxt.skip_char();
        if byte == b'>' {
            break;
        }
    }
}
/// Parses a processing instruction (`<?target data>`) and reports it through the SAX
/// `processing_instruction` callback.
///
/// Does nothing unless no token is pending and the input starts with `<?`.
/// The instruction ends at the first `>`; `<?target>` is reported with no data.
/// Errors are raised for a missing target name, missing whitespace after the target,
/// data longer than the configured maximum, and a missing terminating `>`.
///
/// NOTE(review): the doc alias below names `xmlParsePI`; the HTML-specific C name
/// would presumably be `htmlParsePI` — confirm before changing.
#[doc(alias = "xmlParsePI")]
fn html_parse_pi(ctxt: &mut HtmlParserCtxt) {
let mut cur: i32;
let mut l: i32 = 0;
// Upper bound for the PI data, depending on the XmlParseHuge option.
let max_length = if ctxt.options & XmlParserOption::XmlParseHuge as i32 != 0 {
XML_MAX_HUGE_LENGTH
} else {
XML_MAX_TEXT_LENGTH
};
let state: XmlParserInputState;
if ctxt.token == 0 && ctxt.content_bytes().starts_with(b"<?") {
// Remember the current state so it can be restored once the PI is handled.
state = ctxt.instate;
ctxt.instate = XmlParserInputState::XmlParserPI;
ctxt.advance(2);
if let Some(target) = html_parse_name(ctxt) {
// `<?target>`: a PI without any data.
if ctxt.token == 0 && ctxt.current_byte() == b'>' {
ctxt.advance(1);
if !ctxt.disable_sax {
if let Some(processing_instruction) = ctxt
.sax
.as_deref_mut()
.and_then(|sax| sax.processing_instruction)
{
processing_instruction(ctxt, &target, None);
}
}
ctxt.instate = state;
return;
}
let mut buf = String::new();
cur = ctxt.current_byte() as _;
// The target must be separated from the data by whitespace; this is only reported,
// parsing continues regardless.
if !(cur as u32).is_xml_blank_char() {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrSpaceRequired,
format!("ParsePI: PI {target} space expected\n").as_str(),
Some(&target),
None,
);
}
html_skip_blank_chars(ctxt);
cur = html_current_char(ctxt, &mut l);
// Collect the data up to (not including) the closing '>'.
while cur != 0 && cur != b'>' as i32 {
if (cur as u32).is_xml_char() {
buf.push(char::from_u32(cur as u32).unwrap());
} else {
html_parse_err_int!(
ctxt,
XmlParserErrors::XmlErrInvalidChar,
"Invalid char in processing instruction 0x{:X}\n",
cur
);
}
if buf.len() > max_length {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrPINotFinished,
format!("PI {target} too long").as_str(),
Some(&target),
None,
);
ctxt.instate = state;
return;
}
ctxt.advance_with_line_handling(l as usize);
ctxt.token = 0;
cur = html_current_char(ctxt, &mut l);
}
// If the parser reached EOF state, return without restoring `state`.
if matches!(ctxt.instate, XmlParserInputState::XmlParserEOF) {
return;
}
if cur != b'>' as i32 {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrPINotFinished,
format!("ParsePI: PI {target} never end ...\n").as_str(),
Some(&target),
None,
);
} else {
// Consume the closing '>' and deliver the PI.
ctxt.advance(1);
if !ctxt.disable_sax {
if let Some(processing_instruction) = ctxt
.sax
.as_deref_mut()
.and_then(|sax| sax.processing_instruction)
{
processing_instruction(ctxt, &target, Some(&buf));
}
}
}
} else {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrPINotStarted,
"PI is not started correctly",
None,
None,
);
}
ctxt.instate = state;
}
}
/// Elements that must not directly contain character data; text found there gets an
/// implied `<p>` (see `html_check_paragraph`).
const HTML_NO_CONTENT_ELEMENTS: &[&str] = &["html", "head"];

/// Opens an implied `<p>` element if character data is about to be emitted where none
/// is allowed.
///
/// A paragraph is inserted when no element is open, or when the open element is listed
/// in `HTML_NO_CONTENT_ELEMENTS` and `HTML_OMITTED_DEFAULT_VALUE` is non-zero.
/// Returns 1 if a paragraph was inserted, 0 otherwise.
#[doc(alias = "htmlCheckParagraph")]
fn html_check_paragraph(ctxt: &mut HtmlParserCtxt) -> i32 {
    let needs_paragraph = match ctxt.name.as_deref() {
        None => true,
        Some(current) => {
            HTML_OMITTED_DEFAULT_VALUE.load(Ordering::Relaxed) != 0
                && HTML_NO_CONTENT_ELEMENTS
                    .iter()
                    .any(|&elem| current == elem)
        }
    };
    if !needs_paragraph {
        return 0;
    }

    html_auto_close(ctxt, Some("p"));
    html_check_implied(ctxt, "p");
    html_name_push(ctxt, "p");
    let callback = ctxt.sax.as_deref_mut().and_then(|sax| sax.start_element);
    if let Some(start_element) = callback {
        start_element(ctxt, "p", &[]);
    }
    1
}
/// Parses a character or entity reference starting at `&` and emits the result through
/// the SAX `characters` callback.
///
/// - `&#...;` references and known entities with a non-zero value are emitted as the
///   UTF-8 encoding of their code point.
/// - If no entity name could be parsed, a literal `&` is emitted.
/// - For a name that is unknown (or maps to value 0), `&` followed by the name is emitted.
///
/// `html_check_paragraph` is called before every emission. A character reference that
/// parses to 0 emits nothing.
#[doc(alias = "htmlParseReference")]
fn html_parse_reference(ctxt: &mut HtmlParserCtxt) {
    if ctxt.current_byte() != b'&' {
        return;
    }

    // Resolve the reference to a code point, or emit the literal fallback and return.
    let code: u32 = if ctxt.nth_byte(1) == b'#' {
        let value: u32 = html_parse_char_ref(ctxt) as _;
        if value == 0 {
            return;
        }
        value
    } else {
        let mut name = None;
        let ent = html_parse_entity_ref(ctxt, &mut name);
        let Some(name) = name else {
            html_check_paragraph(ctxt);
            if let Some(characters) = ctxt.sax.as_deref_mut().and_then(|sax| sax.characters) {
                characters(ctxt, "&");
            }
            return;
        };
        match ent.filter(|ent| ent.value != 0) {
            Some(ent) => ent.value,
            None => {
                html_check_paragraph(ctxt);
                if let Some(characters) = ctxt.sax.as_deref_mut().and_then(|sax| sax.characters)
                {
                    characters(ctxt, "&");
                    characters(ctxt, &name);
                }
                return;
            }
        }
    };

    // Hand-rolled UTF-8 encoding: a lead byte followed by 6-bit continuation bytes.
    let (lead, mut shift): (u8, i32) = if code < 0x80 {
        (code as u8, -6)
    } else if code < 0x800 {
        (((code >> 6) & 0x1F) as u8 | 0xC0, 0)
    } else if code < 0x10000 {
        (((code >> 12) & 0x0F) as u8 | 0xE0, 6)
    } else {
        (((code >> 18) & 0x07) as u8 | 0xF0, 12)
    };
    let mut out = [0u8; 6];
    out[0] = lead;
    let mut len = 1usize;
    while shift >= 0 {
        out[len] = ((code >> shift) & 0x3F) as u8 | 0x80;
        len += 1;
        shift -= 6;
    }

    html_check_paragraph(ctxt);
    if let Some(characters) = ctxt.sax.as_deref_mut().and_then(|sax| sax.characters) {
        let text = from_utf8(&out[..len]).expect("Internal Error");
        characters(ctxt, text);
    }
}
/// Decides whether the text chunk `s` is whitespace that may be treated as ignorable.
///
/// Returns 1 when it is, 0 otherwise. The checks, in order:
/// 1. any non-blank character in `s` → 0;
/// 2. at end of input → 1; if the next byte is not `<` → 0;
/// 3. no open element, or the open element is `html` or `head` → 1;
/// 4. open element is `body` and the document's internal subset has an external ID of
///    HTML 4.01 or HTML 4 strict (ASCII case-insensitive) → 1;
/// 5. otherwise the decision depends on the current node's last non-comment child
///    and on the `ALLOW_PCDATA` list (see inline comments).
#[doc(alias = "areBlanks")]
fn are_blanks(ctxt: &mut HtmlParserCtxt, s: &str) -> i32 {
if s.chars().any(|c| !c.is_xml_blank_char()) {
return 0;
}
if ctxt.current_byte() == 0 {
return 1;
}
if ctxt.current_byte() != b'<' {
return 0;
}
let Some(name) = ctxt.name.as_deref() else {
return 1;
};
if name == "html" {
return 1;
}
if name == "head" {
return 1;
}
// Directly inside `body`, whitespace is ignorable only for the two DTDs checked here.
if name == "body" {
if let Some(my_doc) = ctxt.my_doc {
let dtd = my_doc.get_int_subset();
if dtd.is_some_and(|dtd| {
dtd.external_id
.as_deref()
.filter(|e| {
e.eq_ignore_ascii_case("-//W3C//DTD HTML 4.01//EN")
|| e.eq_ignore_ascii_case("-//W3C//DTD HTML 4//EN")
})
.is_some()
}) {
return 1;
}
}
}
// Without a current node, the whitespace is kept as significant.
let Some(context_node) = ctxt.node else {
return 0;
};
// Find the last child of the current node, skipping trailing comment nodes.
let mut last_child = context_node.get_last_child();
while let Some(now) =
last_child.filter(|last_child| last_child.element_type() == XmlElementType::XmlCommentNode)
{
last_child = now.prev();
}
if let Some(last_child) = last_child {
// Whitespace following a text node, or following an element listed in
// ALLOW_PCDATA, is significant.
if last_child.is_text_node() {
return 0;
}
for &pcdata in ALLOW_PCDATA {
if last_child.name().as_deref() == Some(pcdata) {
return 0;
}
}
} else {
// No (non-comment) children: significant if the current node is a non-element
// node that already has content, or if the open element is listed in ALLOW_PCDATA.
if context_node.element_type() != XmlElementType::XmlElementNode
&& context_node.content.is_some()
{
return 0;
}
for &pcdata in ALLOW_PCDATA {
if name == pcdata {
return 0;
}
}
}
1
}
/// Parses character data up to the next `<`, `&` or end of input and delivers it
/// through SAX.
///
/// `readahead`, when non-zero, is stored as the first byte of the output before any
/// input is read. Text is delivered in chunks of at least `HTML_PARSER_BIG_BUFFER_SIZE`
/// bytes plus a final remainder. For each chunk:
/// - if `are_blanks` says it is ignorable whitespace, it goes to `characters` when
///   `ctxt.keep_blanks` is set, otherwise to `ignorable_whitespace`;
/// - otherwise `html_check_paragraph` runs first and the chunk goes to `characters`.
///
/// Nothing is delivered when SAX is disabled or no SAX handler is installed.
#[doc(alias = "htmlParseCharDataInternal")]
fn html_parse_char_data_internal(ctxt: &mut HtmlParserCtxt, readahead: i32) {
// Staging buffer; the 6 spare bytes cover the optional readahead byte, one more
// encoded character past the flush threshold, and the terminating 0.
let mut buf = [0; HTML_PARSER_BIG_BUFFER_SIZE + 6];
let mut nbchar: i32 = 0;
let mut cur: i32;
let mut l: i32 = 0;
if readahead != 0 {
// NOTE(review): stored as a single byte; presumably callers only pass ASCII — confirm.
buf[nbchar as usize] = readahead as _;
nbchar += 1;
}
cur = html_current_char(ctxt, &mut l);
// A `<` or `&` only terminates the data when it is not mirrored in `ctxt.token`.
while (cur != b'<' as i32 || ctxt.token == b'<' as i32)
&& (cur != b'&' as i32 || ctxt.token == b'&' as i32)
&& cur != 0
{
if !(cur as u32).is_xml_char() {
// Invalid characters are reported and dropped from the output.
html_parse_err_int!(
ctxt,
XmlParserErrors::XmlErrInvalidChar,
"Invalid char in CDATA 0x{:X}\n",
cur
);
} else {
let c = char::from_u32(cur as u32).unwrap();
let s = c.encode_utf8(&mut buf[nbchar as usize..]);
nbchar += s.len() as i32;
}
ctxt.advance_with_line_handling(l as usize);
ctxt.token = 0;
if nbchar >= HTML_PARSER_BIG_BUFFER_SIZE as i32 {
// Buffer is full: deliver this chunk and start over.
buf[nbchar as usize] = 0;
if !ctxt.disable_sax && ctxt.sax.is_some() {
let s = from_utf8(&buf[..nbchar as usize]).expect("Internal Error");
if are_blanks(ctxt, s) != 0 {
if ctxt.keep_blanks {
if let Some(characters) = ctxt.sax.as_deref_mut().unwrap().characters {
characters(ctxt, s);
}
} else if let Some(ignorable_whitespace) =
ctxt.sax.as_deref_mut().unwrap().ignorable_whitespace
{
ignorable_whitespace(ctxt, s);
}
} else {
html_check_paragraph(ctxt);
if let Some(characters) = ctxt.sax.as_deref_mut().unwrap().characters {
characters(ctxt, s);
}
}
}
nbchar = 0;
ctxt.shrink();
}
cur = html_current_char(ctxt, &mut l);
}
if matches!(ctxt.instate, XmlParserInputState::XmlParserEOF) {
return;
}
// Deliver the remainder, using the same dispatch as the in-loop flush above.
if nbchar != 0 {
buf[nbchar as usize] = 0;
if !ctxt.disable_sax && ctxt.sax.is_some() {
let s = from_utf8(&buf[..nbchar as usize]).expect("Internal Error");
if are_blanks(ctxt, s) != 0 {
if ctxt.keep_blanks {
if let Some(characters) = ctxt.sax.as_deref_mut().unwrap().characters {
characters(ctxt, s);
}
} else if let Some(ignorable_whitespace) =
ctxt.sax.as_deref_mut().unwrap().ignorable_whitespace
{
ignorable_whitespace(ctxt, s);
}
} else {
html_check_paragraph(ctxt);
if let Some(characters) = ctxt.sax.as_deref_mut().unwrap().characters {
characters(ctxt, s);
}
}
}
}
}
/// Parses character data at the current position with no read-ahead byte.
///
/// Thin wrapper around `html_parse_char_data_internal`.
#[doc(alias = "htmlParseCharData")]
fn html_parse_char_data(ctxt: &mut HtmlParserCtxt) {
    let no_readahead = 0;
    html_parse_char_data_internal(ctxt, no_readahead);
}
/// Creates a new HTML parser context with no caller-supplied SAX handler and no user data.
///
/// Returns `None` if the context could not be initialised.
#[doc(alias = "htmlNewParserCtxt")]
pub fn html_new_parser_ctxt<'a>() -> Option<XmlParserCtxt<'a>> {
    let (sax, user_data) = (None, None);
    html_new_sax_parser_ctxt(sax, user_data)
}
/// Resets `ctxt` to its default state and configures it for HTML parsing.
///
/// When `sax` is `None`, a default HTML SAX handler is installed and `user_data` is
/// ignored (the context's user data is cleared). Otherwise the supplied handler and
/// user data are stored as given.
///
/// Always returns 0 in the visible code; callers treat a negative value as failure.
#[doc(alias = "htmlInitParserCtxt")]
fn html_init_parser_ctxt(
    ctxt: &mut HtmlParserCtxt,
    sax: Option<Box<HtmlSAXHandler>>,
    user_data: Option<GenericErrorContext>,
) -> i32 {
    *ctxt = HtmlParserCtxt::default();
    if let Some(handler) = sax {
        ctxt.sax = Some(handler);
        ctxt.user_data = user_data;
    } else {
        let mut handler = HtmlSAXHandler::default();
        xml_sax2_init_html_default_sax_handler(&mut handler);
        ctxt.sax = Some(Box::new(handler));
        ctxt.user_data = None;
    }

    // Input stack.
    ctxt.input_tab.clear();
    ctxt.input_tab.shrink_to(5);
    ctxt.version = None;
    ctxt.encoding = None;
    ctxt.standalone = -1;
    ctxt.instate = XmlParserInputState::XmlParserStart;

    // Node stack. (Previously `input_tab` was cleared a second time here instead.)
    ctxt.node_tab.clear();
    ctxt.node_tab.shrink_to(10);
    ctxt.node = None;

    // Name stack.
    ctxt.name_tab.clear();
    ctxt.name_tab.shrink_to(10);
    ctxt.name = None;

    ctxt.node_info_tab.clear();
    ctxt.my_doc = None;
    ctxt.well_formed = true;
    ctxt.replace_entities = false;
    ctxt.linenumbers = get_line_numbers_default_value();
    ctxt.keep_blanks = get_keep_blanks_default_value();
    ctxt.html = 1;

    // Validation context: route validity errors and warnings through the parser context.
    ctxt.vctxt.flags = XML_VCTXT_USE_PCTXT as _;
    ctxt.vctxt.user_data = None;
    ctxt.vctxt.error = Some(parser_validity_error);
    ctxt.vctxt.warning = Some(parser_validity_warning);

    ctxt.record_info = false;
    ctxt.validate = false;
    ctxt.check_index = 0;
    #[cfg(feature = "catalog")]
    {
        ctxt.catalogs = None;
    }
    ctxt.node_seq.clear();
    0
}
/// Creates a new HTML parser context using the given SAX handler and user data.
///
/// With `sax == None` the default HTML SAX handler is installed by
/// `html_init_parser_ctxt`. Returns `None` if initialisation reports a negative status.
#[doc(alias = "htmlNewSAXParserCtxt")]
pub fn html_new_sax_parser_ctxt<'a>(
    sax: Option<Box<XmlSAXHandler>>,
    user_data: Option<GenericErrorContext>,
) -> Option<HtmlParserCtxt<'a>> {
    let mut ctxt = XmlParserCtxt::default();
    let status = html_init_parser_ctxt(&mut ctxt, sax, user_data);
    (status >= 0).then_some(ctxt)
}
/// Creates an HTML parser context that reads from an in-memory byte buffer.
///
/// The buffer is wrapped with `XmlCharEncoding::None` and pushed as the context's
/// first input. Returns `None` for an empty buffer or if any of the context, the
/// input buffer or the input stream cannot be created.
#[doc(alias = "htmlCreateMemoryParserCtxt")]
pub fn html_create_memory_parser_ctxt(buffer: &[u8]) -> Option<HtmlParserCtxt> {
    if buffer.is_empty() {
        return None;
    }
    let mut ctxt = html_new_parser_ctxt()?;
    let memory_buf = XmlParserInputBuffer::from_memory(buffer, XmlCharEncoding::None)?;

    // Build the input stream around the memory buffer and hand it to the context.
    let mut stream = XmlParserInput::new(Some(&mut ctxt))?;
    stream.filename = None;
    stream.buf = Some(memory_buf);
    stream.reset_base();
    ctxt.input_push(stream);

    Some(ctxt)
}
#[doc(alias = "htmlParserFinishElementParsing")]
fn html_parser_finish_element_parsing(ctxt: &mut HtmlParserCtxt) {
if let Some(node) = ctxt.node {
if ctxt.record_info {
let end_pos =
ctxt.input().unwrap().consumed + ctxt.input().unwrap().offset_from_base() as u64;
let end_line = ctxt.input().unwrap().line as u64;
let node_info = ctxt.node_info_tab.last_mut().unwrap();
let mut info = node_info.borrow_mut();
info.end_pos = end_pos;
info.end_line = end_line;
info.node = Some(node);
drop(info);
let node_info = node_info.clone();
ctxt.add_node_info(node_info);
html_node_info_pop(ctxt);
}
}
if ctxt.current_byte() == 0 {
html_auto_close_on_end(ctxt);
}
}
/// Pushes `value` onto the context's node-info stack and returns the new stack depth.
#[doc(alias = "htmlNodeInfoPush")]
fn html_node_info_push(ctxt: &mut HtmlParserCtxt, value: Rc<RefCell<HtmlParserNodeInfo>>) -> usize {
    let stack = &mut ctxt.node_info_tab;
    stack.push(value);
    stack.len()
}
/// Parses the start tag of an element and performs the bookkeeping that follows it.
///
/// After `html_parse_start_tag`:
/// - on failure (or no current element name) a stray `>` is consumed and the function returns;
/// - an unknown tag name is reported but parsing continues;
/// - `/>` and elements whose descriptor is marked `empty` are closed immediately
///   (SAX `end_element`, then the name is popped);
/// - a start tag not terminated by `>` is reported, the element is popped again if it
///   is still the current one, and element parsing is finished right away.
///
/// When `ctxt.record_info` is set, the start position and line are captured and
/// pushed on the node-info stack for elements that remain open, and on the
/// missing-`>` path.
#[doc(alias = "htmlParseElementInternal")]
fn html_parse_element_internal(ctxt: &mut HtmlParserCtxt) {
let mut node_info = HtmlParserNodeInfo::default();
if ctxt.input().is_none() {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"htmlParseElementInternal: context error\n",
None,
None,
);
return;
}
if matches!(ctxt.instate, XmlParserInputState::XmlParserEOF) {
return;
}
// Capture the start position before the tag is consumed.
if ctxt.record_info {
node_info.begin_pos =
ctxt.input().unwrap().consumed + ctxt.input().unwrap().offset_from_base() as u64;
node_info.begin_line = ctxt.input().unwrap().line as _;
}
let failed: i32 = html_parse_start_tag(ctxt);
// Bail out when the start tag failed to parse or left no current element name.
let Some(name) = ctxt.name.clone().filter(|_| failed != -1) else {
if ctxt.current_byte() == b'>' {
ctxt.skip_char();
}
return;
};
// Look up the element descriptor; unknown tags are reported but still processed.
let info = html_tag_lookup(&name);
if info.is_none() {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlHTMLUnknownTag,
format!("Tag {name} invalid\n").as_str(),
Some(&name),
None,
);
}
// Empty element written the XML way: `<name/>`.
if ctxt.current_byte() == b'/' && ctxt.nth_byte(1) == b'>' {
ctxt.advance(2);
if let Some(end_element) = ctxt.sax.as_deref_mut().and_then(|sax| sax.end_element) {
end_element(&mut *ctxt, &name);
}
html_name_pop(ctxt);
return;
}
if ctxt.current_byte() == b'>' {
ctxt.skip_char();
} else {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrGtRequired,
format!("Couldn't find end of Start Tag {name}\n").as_str(),
Some(name.as_ref()),
None,
);
// Give up on this element: pop it if it is still the current one.
if Some(name.as_ref()) == ctxt.name.as_deref() {
ctxt.node_pop();
html_name_pop(ctxt);
}
if ctxt.record_info {
html_node_info_push(ctxt, Rc::new(RefCell::new(node_info)));
}
html_parser_finish_element_parsing(ctxt);
return;
}
// Element declared empty by its descriptor: close it immediately.
if info.is_some_and(|info| info.empty != 0) {
if let Some(end_element) = ctxt.sax.as_deref_mut().and_then(|sax| sax.end_element) {
end_element(&mut *ctxt, &name);
}
html_name_pop(ctxt);
return;
}
// The element stays open: remember where it started.
if ctxt.record_info {
html_node_info_push(ctxt, Rc::new(RefCell::new(node_info)));
}
}
/// Main content loop of the HTML parser: dispatches on the upcoming input until the
/// end of the document.
///
/// `current_node` and `depth` track the element that was open on the previous
/// iteration, so that elements popped implicitly (auto-close) are finalised with
/// `html_parser_finish_element_parsing`.
///
/// Per iteration, in order: end tags; auto-closing triggered by an upcoming start tag;
/// finalising an implicitly popped element; raw text of `script`/`style`; DOCTYPE,
/// comments and bogus comments; processing instructions; start tags; a literal `<`;
/// references; end of input (auto-close everything and stop); plain character data.
#[doc(alias = "htmlParseContentInternal")]
fn html_parse_content_internal(ctxt: &mut HtmlParserCtxt) {
    let mut depth = ctxt.name_tab.len();
    let mut current_node = if depth == 0 { None } else { ctxt.name.clone() };
    loop {
        ctxt.grow();
        if matches!(ctxt.instate, XmlParserInputState::XmlParserEOF) {
            break;
        }
        // Our tag, or one of its parents or children, is ending.
        if ctxt.current_byte() == b'<' && ctxt.nth_byte(1) == b'/' {
            if html_parse_end_tag(ctxt) != 0 && (current_node.is_some() || ctxt.name_tab.is_empty())
            {
                depth = ctxt.name_tab.len();
                if depth == 0 {
                    current_node = None;
                } else {
                    current_node = ctxt.name.clone();
                }
            }
            continue;
        } else if ctxt.current_byte() == b'<'
            && (ctxt.nth_byte(1).is_ascii_alphabetic()
                || ctxt.nth_byte(1) == b'_'
                || ctxt.nth_byte(1) == b':')
        {
            let Some(name) = html_parse_html_name_non_invasive(ctxt) else {
                html_parse_err(
                    Some(ctxt),
                    XmlParserErrors::XmlErrNameRequired,
                    "htmlParseStartTag: invalid element name\n",
                    None,
                    None,
                );
                // Dump the bogus tag: skip up to the closing '>' (or end of input).
                // The previous condition tested `== 0`, which could never skip the tag
                // and would leave the outer loop spinning on the same input.
                while ctxt.current_byte() != 0 && ctxt.current_byte() != b'>' {
                    ctxt.skip_char();
                }
                html_parser_finish_element_parsing(ctxt);
                current_node = ctxt.name.clone();
                depth = ctxt.name_tab.len();
                continue;
            };
            // The upcoming start tag implicitly closes the currently open element.
            if ctxt
                .name
                .as_deref()
                .is_some_and(|open| html_check_auto_close(&name, open))
            {
                html_auto_close(ctxt, Some(&name));
                continue;
            }
        }
        // Has the tracked node been popped while parsing the next element?
        if !ctxt.name_tab.is_empty() && depth >= ctxt.name_tab.len() && current_node != ctxt.name {
            html_parser_finish_element_parsing(ctxt);
            current_node = ctxt.name.clone();
            depth = ctxt.name_tab.len();
            continue;
        }
        if ctxt.current_byte() != 0
            && (current_node.as_deref() == Some("script")
                || current_node.as_deref() == Some("style"))
        {
            // Raw text elements: everything up to the closing tag is passed through.
            html_parse_script(ctxt);
        } else if ctxt.current_byte() == b'<' && ctxt.nth_byte(1) == b'!' {
            if ctxt.content_bytes().len() >= 9
                && ctxt.content_bytes()[2..9].eq_ignore_ascii_case(b"DOCTYPE")
            {
                // A DOCTYPE in the middle of the document is reported but still parsed.
                html_parse_err(
                    Some(ctxt),
                    XmlParserErrors::XmlHTMLStrucureError,
                    "Misplaced DOCTYPE declaration\n",
                    Some("DOCTYPE"),
                    None,
                );
                html_parse_doc_type_decl(ctxt);
            } else if ctxt.nth_byte(2) == b'-' && ctxt.nth_byte(3) == b'-' {
                html_parse_comment(ctxt);
            } else {
                html_skip_bogus_comment(ctxt);
            }
        } else if ctxt.current_byte() == b'<' && ctxt.nth_byte(1) == b'?' {
            html_parse_pi(ctxt);
        } else if ctxt.current_byte() == b'<' && ctxt.nth_byte(1).is_ascii_alphabetic() {
            html_parse_element_internal(ctxt);
            current_node = ctxt.name.clone();
            depth = ctxt.name_tab.len();
        } else if ctxt.current_byte() == b'<' {
            // A '<' that starts nothing recognisable is emitted as text.
            if !ctxt.disable_sax {
                if let Some(characters) = ctxt.sax.as_deref_mut().and_then(|sax| sax.characters) {
                    characters(ctxt, "<");
                }
            }
            ctxt.skip_char();
        } else if ctxt.current_byte() == b'&' {
            html_parse_reference(ctxt);
        } else if ctxt.current_byte() == 0 {
            html_auto_close_on_end(ctxt);
            break;
        } else {
            html_parse_char_data(ctxt);
        }
        ctxt.shrink();
        ctxt.grow();
    }
}
/// Parses a complete HTML document from the context's current input.
///
/// Returns `XmlErrInternalError` (as `i32`) when the context has no input, -1 when the
/// document was not well-formed, and 0 otherwise.
///
/// Steps: optional encoding detection from the first 4 bytes, leading blanks, SAX
/// `start_document`, leading comments/PIs, an optional DOCTYPE, more comments/PIs, the
/// document content, SAX `end_document`, and finally creation of a default HTML 4.0
/// Transitional internal subset if the document has none (unless `HtmlParseNodefdtd`
/// is set).
#[doc(alias = "htmlParseDocument")]
pub fn html_parse_document(ctxt: &mut HtmlParserCtxt) -> i32 {
xml_init_parser();
if ctxt.input().is_none() {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"htmlParseDocument: context error\n",
None,
None,
);
return XmlParserErrors::XmlErrInternalError as i32;
}
ctxt.grow();
if let Some(set_document_locator) = ctxt
.sax
.as_deref_mut()
.and_then(|sax| sax.set_document_locator)
{
set_document_locator(ctxt, XmlSAXLocator::default());
}
// Sniff the encoding from the first 4 bytes when none has been set yet.
if ctxt.encoding.is_none() && ctxt.input().unwrap().remainder_len() >= 4 {
if ctxt.token == 0 {
let enc = detect_encoding(&ctxt.content_bytes()[..4]);
if !matches!(enc, XmlCharEncoding::None) {
ctxt.switch_encoding(enc);
}
}
}
html_skip_blank_chars(ctxt);
// An empty document is reported, but processing continues.
if ctxt.current_byte() == 0 {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrDocumentEmpty,
"Document is empty\n",
None,
None,
);
}
if !ctxt.disable_sax {
if let Some(start_document) = ctxt.sax.as_deref_mut().and_then(|sax| sax.start_document) {
start_document(ctxt);
}
}
// Comments and PIs before the DOCTYPE. Both parsers are called on every pass; each
// returns immediately when its own opening marker is not at the current position.
while ctxt.content_bytes().starts_with(b"<!--") || ctxt.content_bytes().starts_with(b"<?") {
html_parse_comment(ctxt);
html_parse_pi(ctxt);
html_skip_blank_chars(ctxt);
}
if ctxt.content_bytes().len() >= 9
&& ctxt.content_bytes()[..9].eq_ignore_ascii_case(b"<!DOCTYPE")
{
html_parse_doc_type_decl(ctxt);
}
html_skip_blank_chars(ctxt);
// Comments and PIs after the DOCTYPE.
while ctxt.content_bytes().starts_with(b"<!--") || ctxt.content_bytes().starts_with(b"<?") {
html_parse_comment(ctxt);
html_parse_pi(ctxt);
html_skip_blank_chars(ctxt);
}
// The document body itself.
html_parse_content_internal(ctxt);
if ctxt.current_byte() == 0 {
html_auto_close_on_end(ctxt);
}
if let Some(end_document) = ctxt.sax.as_deref_mut().and_then(|sax| sax.end_document) {
end_document(ctxt);
}
// Give the document a default internal subset if it has none.
if ctxt.options & HtmlParserOption::HtmlParseNodefdtd as i32 == 0 {
if let Some(mut my_doc) = ctxt.my_doc {
let dtd = my_doc.get_int_subset();
if dtd.is_none() {
my_doc.int_subset = xml_create_int_subset(
Some(my_doc),
Some("html"),
Some("-//W3C//DTD HTML 4.0 Transitional//EN"),
Some("http://www.w3.org/TR/REC-html40/loose.dtd"),
);
}
}
}
if !ctxt.well_formed {
return -1;
}
0
}
#[doc(alias = "htmlCreateDocParserCtxt")]
fn html_create_doc_parser_ctxt<'a>(
cur: &'a [u8],
encoding: Option<&str>,
) -> Option<HtmlParserCtxt<'a>> {
let mut ctxt = html_create_memory_parser_ctxt(cur)?;
if let Some(encoding) = encoding {
ctxt.input_mut().unwrap().encoding = Some(encoding.to_owned());
let enc = encoding.parse().unwrap_or(XmlCharEncoding::Error);
if !matches!(enc, XmlCharEncoding::Error) {
ctxt.switch_encoding(enc);
if ctxt.err_no == XmlParserErrors::XmlErrUnsupportedEncoding as i32 {
html_parse_err(
Some(&mut ctxt),
XmlParserErrors::XmlErrUnsupportedEncoding,
format!("Unsupported encoding {encoding}\n").as_str(),
Some(encoding),
None,
);
}
} else {
if let Some(handler) = find_encoding_handler(encoding) {
ctxt.switch_to_encoding(handler);
} else {
html_parse_err(
Some(&mut ctxt),
XmlParserErrors::XmlErrUnsupportedEncoding,
format!("Unsupported encoding {encoding}\n").as_str(),
Some(encoding),
None,
);
}
}
}
Some(ctxt)
}
/// Parses an in-memory HTML document, optionally with a caller-supplied SAX handler.
///
/// When `sax` is given, it replaces the context's handler and `user_data` is installed
/// for the duration of the parse; both are cleared on the context afterwards.
/// Returns the document stored in the context after parsing, if any, or `None` if the
/// parser context cannot be created.
#[doc(alias = "htmlSAXParseDoc")]
#[deprecated = "Use htmlNewSAXParserCtxt and htmlCtxtReadDoc"]
pub fn html_sax_parse_doc(
    cur: &[u8],
    encoding: Option<&str>,
    sax: Option<Box<HtmlSAXHandler>>,
    user_data: Option<GenericErrorContext>,
) -> Option<HtmlDocPtr> {
    xml_init_parser();
    let mut ctxt = html_create_doc_parser_ctxt(cur, encoding)?;

    let custom_sax = match sax {
        Some(handler) => {
            ctxt.sax = Some(handler);
            ctxt.user_data = user_data;
            true
        }
        None => false,
    };

    html_parse_document(&mut ctxt);
    let doc = ctxt.my_doc;

    if custom_sax {
        ctxt.sax = None;
        ctxt.user_data = None;
    }
    doc
}
/// Parses an in-memory HTML document with the default SAX handler.
///
/// `encoding`, when given, forces the input encoding. Returns the resulting document, if any.
#[doc(alias = "htmlParseDoc")]
pub fn html_parse_doc(cur: &[u8], encoding: Option<&str>) -> Option<HtmlDocPtr> {
    let (sax, user_data) = (None, None);
    html_sax_parse_doc(cur, encoding, sax, user_data)
}
/// Creates an HTML parser context that reads from the file (or resource) `filename`.
///
/// The name is canonicalised with `canonic_path` and loaded through
/// `xml_load_external_entity`. When `encoding` is given and shorter than 1000 bytes, it
/// is applied via `html_check_encoding` as a `charset=<encoding>` declaration.
///
/// Returns `None` if the context or the input stream cannot be created.
#[doc(alias = "htmlCreateFileParserCtxt")]
pub fn html_create_file_parser_ctxt<'a>(
    filename: &str,
    encoding: Option<&str>,
) -> Option<HtmlParserCtxt<'a>> {
    let mut ctxt = html_new_parser_ctxt()?;

    let canonic_filename = canonic_path(filename);
    let stream = xml_load_external_entity(Some(&canonic_filename), None, &mut ctxt)?;
    ctxt.input_push(stream);

    // Overlong encoding names are silently ignored.
    if let Some(encoding) = encoding.filter(|name| name.len() < 1000) {
        let declaration = format!("charset={encoding}");
        html_check_encoding(&mut ctxt, &declaration);
    }
    Some(ctxt)
}
/// Parses an HTML file, optionally with a caller-supplied SAX handler.
///
/// When `sax` is given, it temporarily replaces the context's handler and `user_data`
/// is installed; after parsing the previous handler is put back and the user data is
/// cleared. Returns the document stored in the context after parsing, if any, or
/// `None` if the parser context cannot be created.
#[doc(alias = "htmlSAXParseFile")]
#[deprecated = "Use htmlNewSAXParserCtxt and htmlCtxtReadFile"]
pub fn html_sax_parse_file(
    filename: &str,
    encoding: Option<&str>,
    sax: Option<Box<HtmlSAXHandler>>,
    user_data: Option<GenericErrorContext>,
) -> Option<HtmlDocPtr> {
    xml_init_parser();
    let mut ctxt = html_create_file_parser_ctxt(filename, encoding)?;

    // `Some(previous)` means a custom handler was installed and `previous` must be restored.
    let restore = match sax {
        Some(handler) => {
            let previous = ctxt.sax.replace(handler);
            ctxt.user_data = user_data;
            Some(previous)
        }
        None => None,
    };

    html_parse_document(&mut ctxt);
    let doc = ctxt.my_doc;

    if let Some(previous) = restore {
        ctxt.sax = previous;
        ctxt.user_data = None;
    }
    doc
}
/// Parses an HTML file with the default SAX handler.
///
/// `encoding`, when given, forces the input encoding. Returns the resulting document, if any.
#[doc(alias = "htmlParseFile")]
pub fn html_parse_file(filename: &str, encoding: Option<&str>) -> Option<HtmlDocPtr> {
    let (sax, user_data) = (None, None);
    html_sax_parse_file(filename, encoding, sax, user_data)
}
#[doc(alias = "UTF8ToHtml")]
pub fn utf8_to_html(src: &str, dst: &mut [u8]) -> Result<(usize, usize), EncodingError> {
let mut read = 0;
let mut write = 0;
for c in src.chars() {
if c.len_utf8() == 1 {
if write == dst.len() {
break;
}
dst[write] = c as u8;
write += 1;
} else {
let cp = if let Some(ent) = html_entity_value_lookup(c as u32) {
Cow::Borrowed(ent.name)
} else {
Cow::Owned(format!("#{}", c as u32))
};
let len = cp.len();
if write + len + 2 > dst.len() {
break;
}
write!(&mut dst[write..], "&{cp};").ok();
write += len + 2;
}
read += c.len_utf8();
}
Ok((read, write))
}
#[doc(alias = "htmlEncodeEntities")]
pub fn html_encode_entities(src: &str, dst: &mut [u8], quote_char: Option<char>) -> (usize, usize) {
let mut read = 0;
let mut write = 0;
for c in src.chars() {
if c.len_utf8() == 1 && Some(c) != quote_char && c != '&' && c != '<' && c != '>' {
if write == dst.len() {
break;
}
dst[write] = c as u8;
write += 1;
} else {
let cp = if let Some(ent) = html_entity_value_lookup(c as u32) {
Cow::Borrowed(ent.name)
} else {
Cow::Owned(format!("#{}", c as u32))
};
let len = cp.len();
if write + len + 2 > dst.len() {
break;
}
write!(&mut dst[write..], "&{cp};").ok();
write += len + 2;
}
read += c.len_utf8();
}
(read, write)
}
/// Returns `true` if `name` is one of the attribute names listed in
/// `HTML_SCRIPT_ATTRIBUTES`.
///
/// Names that do not start with `on` are rejected without consulting the list.
/// The comparison is exact (case-sensitive).
#[doc(alias = "htmlIsScriptAttribute")]
pub fn html_is_script_attribute(name: &str) -> bool {
    name.starts_with("on") && HTML_SCRIPT_ATTRIBUTES.iter().any(|attr| *attr == name)
}
#[doc(alias = "htmlHandleOmittedElem")]
pub fn html_handle_omitted_elem(val: i32) -> i32 {
let old: i32 = HTML_OMITTED_DEFAULT_VALUE.load(Ordering::Acquire);
HTML_OMITTED_DEFAULT_VALUE.store(val, Ordering::Release);
old
}
/// Creates an empty parser input positioned at line 1, column 1, and assigns it the
/// context's next input id (incrementing the context's counter).
#[doc(alias = "htmlNewInputStream")]
#[cfg(feature = "libxml_push")]
fn html_new_input_stream<'a>(ctxt: &mut HtmlParserCtxt) -> HtmlParserInput<'a> {
    // Reserve the id first, then build the input in one expression.
    let id = ctxt.input_id;
    ctxt.input_id += 1;
    HtmlParserInput {
        id,
        filename: None,
        directory: None,
        base: 0,
        cur: 0,
        buf: None,
        line: 1,
        col: 1,
        version: None,
        consumed: 0,
        length: 0,
        ..Default::default()
    }
}
/// Creates a parser context for push-mode (progressive) HTML parsing.
///
/// - `sax` / `user_data`: optional SAX handler and its user data.
/// - `chunk`: initial bytes to feed; ignored when empty.
/// - `filename`: optional name used to derive the context directory and the input's
///   canonical filename.
/// - `enc`: encoding of the input; the context charset is set to UTF-8 when `enc` is
///   UTF-8 or the input buffer has an encoder.
///
/// Returns `None` if the underlying parser context cannot be created.
#[doc(alias = "htmlCreatePushParserCtxt")]
#[cfg(feature = "libxml_push")]
pub fn html_create_push_parser_ctxt<'a>(
    sax: Option<Box<XmlSAXHandler>>,
    user_data: Option<GenericErrorContext>,
    chunk: &[u8],
    filename: Option<&str>,
    enc: XmlCharEncoding,
) -> Option<HtmlParserCtxt<'a>> {
    use crate::io::{XmlParserInputBuffer, xml_parser_get_directory};

    xml_init_parser();
    let buf = XmlParserInputBuffer::new(enc);
    let mut ctxt = html_new_sax_parser_ctxt(sax, user_data)?;
    if matches!(enc, XmlCharEncoding::UTF8) || buf.encoder.is_some() {
        ctxt.charset = XmlCharEncoding::UTF8;
    }

    match filename {
        None => ctxt.directory = None,
        Some(path) => {
            // The directory is left untouched when it cannot be derived from the name.
            if let Some(dir) = xml_parser_get_directory(path) {
                ctxt.directory = Some(dir.to_string_lossy().into_owned());
            }
        }
    }

    let mut input_stream = html_new_input_stream(&mut ctxt);
    input_stream.filename = filename.map(|path| canonic_path(path).into_owned());
    input_stream.buf = Some(buf);
    input_stream.reset_base();
    ctxt.input_push(input_stream);

    // Feed the initial chunk, if any, into the freshly pushed input.
    if !chunk.is_empty() {
        if let Some(pending) = ctxt.input_mut().and_then(|input| input.buf.as_mut()) {
            pending.push_bytes(chunk);
        }
    }
    ctxt.progressive = true;
    Some(ctxt)
}
/// Searches the unread input for a one-, two- or three-byte sequence.
///
/// * `first` is the first byte of the sequence; `next` and `third` are the
///   optional following bytes (`0` means "not part of the sequence").
/// * `ignoreattrval`, when non-zero, makes the scan skip over text enclosed in
///   single or double quotes.
///
/// The scan resumes from `ctxt.check_index` with the quote state saved in
/// `ctxt.end_check_state`, so repeated calls on a growing buffer do not rescan
/// bytes that were already examined.
///
/// # Returns
///
/// The offset of the sequence from the current position when found (the saved
/// scan state is reset), or `-1` when there is no input or the sequence is not
/// in the buffered data yet (the scan state is saved for the next call). Once
/// the offset reaches `i32::MAX / 2` the scan gives up and returns `base - 2`.
#[doc(alias = "htmlParseLookupSequence")]
#[cfg(feature = "libxml_push")]
fn html_parse_lookup_sequence(
    ctxt: &mut HtmlParserCtxt,
    first: XmlChar,
    next: XmlChar,
    third: XmlChar,
    ignoreattrval: i32,
) -> i32 {
    if ctxt.input().is_none() {
        return -1;
    }
    let base: size_t = ctxt.check_index as _;
    let mut quote: i32 = ctxt.end_check_state;
    let buf = ctxt.content_bytes();
    // Leave room for the trailing bytes of the sequence. `saturating_sub`
    // keeps a buffer shorter than the sequence from underflowing `usize`
    // (a panic in debug builds, an out-of-bounds index otherwise); such a
    // buffer simply yields "not found yet".
    let len = if third != 0 {
        buf.len().saturating_sub(2)
    } else if next != 0 {
        buf.len().saturating_sub(1)
    } else {
        buf.len()
    };
    for base in base..len {
        // Guard against offsets that would not fit the `i32` return value.
        if base >= i32::MAX as usize / 2 {
            ctxt.check_index = 0;
            ctxt.end_check_state = 0;
            return base as i32 - 2;
        }
        if ignoreattrval != 0 {
            if quote != 0 {
                // Inside a quoted value: only the matching quote ends it.
                if buf[base] == quote as u8 {
                    quote = 0;
                }
                continue;
            }
            if buf[base] == b'"' || buf[base] == b'\'' {
                quote = buf[base] as _;
                continue;
            }
        }
        if buf[base] == first {
            if third != 0 {
                if buf[base + 1] != next || buf[base + 2] != third {
                    continue;
                }
            } else if next != 0 && buf[base + 1] != next {
                continue;
            }
            ctxt.check_index = 0;
            ctxt.end_check_state = 0;
            return base as _;
        }
    }
    // Not found: remember how far the scan got and the current quote state.
    ctxt.check_index = base.max(len) as _;
    ctxt.end_check_state = quote;
    -1
}
/// Looks for the end of a comment (`-->` or `--!>`) in the buffered input.
///
/// Each `--` found by `html_parse_lookup_sequence` is inspected: if it is
/// followed by `>` or `!>` the comment end has been located. If the bytes after
/// the dashes are not buffered yet, the position is remembered in
/// `ctxt.check_index` and the search is abandoned; otherwise the search resumes
/// one byte further.
///
/// # Returns
///
/// The offset of the terminating `--`, or a negative value when the end of the
/// comment is not available yet.
#[doc(alias = "htmlParseLookupCommentEnd")]
#[cfg(feature = "libxml_push")]
fn html_parse_lookup_comment_end(ctxt: &mut HtmlParserCtxt) -> i32 {
    loop {
        let mark = html_parse_lookup_sequence(ctxt, b'-', b'-', 0, 0);
        if mark < 0 {
            return mark;
        }
        let pos = mark as usize;
        let follower = ctxt.nth_byte(pos + 2);
        let has_bang = follower == b'!';
        if follower == b'>' || (has_bang && ctxt.nth_byte(pos + 3) == b'>') {
            ctxt.check_index = 0;
            return mark;
        }
        // Number of bytes after `mark` that must be buffered before deciding.
        let offset = if has_bang { 3 } else { 2 };
        if mark + offset >= ctxt.input().unwrap().remainder_len() as i32 {
            ctxt.check_index = mark as _;
            return -1;
        }
        ctxt.check_index = pos + 1;
    }
}
#[doc(alias = "htmlParseTryOrFinish")]
#[cfg(feature = "libxml_push")]
/// Runs the push-parser state machine over the data currently buffered in the
/// active input.
///
/// The loop dispatches on `ctxt.instate` and leaves (`break 'done`) whenever
/// the buffered bytes are not enough to finish the next construct. When
/// `terminate` is non-zero the look-ahead checks (`html_parse_lookup_*`) are
/// skipped, so the remaining data is parsed as-is.
///
/// After the loop, if no bytes are left and `terminate` is set, open elements
/// are auto-closed and `end_document` is fired; and unless
/// `HtmlParserOption::HtmlParseNodefdtd` is set, a default HTML 4.0
/// Transitional internal subset is created for a document that has none.
///
/// Always returns `0`: `ret` is never reassigned.
fn html_parse_try_or_finish(ctxt: &mut HtmlParserCtxt, terminate: i32) -> i32 {
let ret: i32 = 0;
let mut avail = 0;
'done: loop {
// No active input: nothing to parse.
let Some(input) = ctxt.input() else {
break;
};
avail = input.remainder_len();
// Input exhausted and the caller signalled the end of the document:
// close what is still open and report the end once.
if avail == 0 && terminate != 0 {
html_auto_close_on_end(ctxt);
if ctxt.name_tab.is_empty()
&& !matches!(ctxt.instate, XmlParserInputState::XmlParserEOF)
{
ctxt.instate = XmlParserInputState::XmlParserEOF;
if let Some(end_document) = ctxt.sax.as_deref_mut().and_then(|sax| sax.end_document)
{
end_document(ctxt);
}
}
}
// Re-borrowed here; this binding is read again in the Epilog arm below.
let input = ctxt.input().unwrap();
if avail < 1 {
break 'done;
}
// Skip a zero byte and restart the loop.
if ctxt.current_byte() == 0 {
ctxt.advance(1);
continue;
}
match ctxt.instate {
XmlParserInputState::XmlParserEOF => {
break 'done;
}
// Document start: skip blanks, fire the locator and start_document
// callbacks, then look for a DOCTYPE declaration.
XmlParserInputState::XmlParserStart => {
let cur = ctxt.current_byte();
if cur.is_xml_blank_char() {
html_skip_blank_chars(ctxt);
avail = ctxt.input().unwrap().remainder_len();
}
if let Some(set_document_locator) = ctxt
.sax
.as_deref_mut()
.and_then(|sax| sax.set_document_locator)
{
set_document_locator(&mut *ctxt, XmlSAXLocator::default());
}
if !ctxt.disable_sax {
if let Some(start_document) =
ctxt.sax.as_deref_mut().and_then(|sax| sax.start_document)
{
start_document(&mut *ctxt);
}
}
// "<!DOCTYPE" is matched case-insensitively on the keyword.
if ctxt.content_bytes().len() >= 9
&& ctxt.content_bytes().starts_with(b"<!")
&& ctxt.content_bytes()[2..9].eq_ignore_ascii_case(b"DOCTYPE")
{
// Wait until the closing '>' is buffered (quoted text is skipped).
if terminate == 0 && html_parse_lookup_sequence(ctxt, b'>', 0, 0, 1) < 0 {
break 'done;
}
html_parse_doc_type_decl(ctxt);
ctxt.instate = XmlParserInputState::XmlParserProlog;
} else {
ctxt.instate = XmlParserInputState::XmlParserMisc;
}
}
// Before the DOCTYPE: comments, processing instructions, or the
// DOCTYPE itself; anything else switches to content parsing.
XmlParserInputState::XmlParserMisc => {
html_skip_blank_chars(ctxt);
avail = ctxt.input().unwrap().remainder_len();
if avail < 1 {
break 'done;
}
if avail < 2 && terminate == 0 {
break 'done;
}
if ctxt.content_bytes().starts_with(b"<!--") {
if terminate == 0 && html_parse_lookup_comment_end(ctxt) < 0 {
break 'done;
}
html_parse_comment(ctxt);
ctxt.instate = XmlParserInputState::XmlParserMisc;
} else if ctxt.content_bytes().starts_with(b"<?") {
if terminate == 0 && html_parse_lookup_sequence(ctxt, b'>', 0, 0, 0) < 0 {
break 'done;
}
html_parse_pi(ctxt);
ctxt.instate = XmlParserInputState::XmlParserMisc;
} else if ctxt.content_bytes().len() >= 9
&& ctxt.content_bytes().starts_with(b"<!")
&& ctxt.content_bytes()[2..9].eq_ignore_ascii_case(b"DOCTYPE")
{
if terminate == 0 && html_parse_lookup_sequence(ctxt, b'>', 0, 0, 1) < 0 {
break 'done;
}
html_parse_doc_type_decl(ctxt);
ctxt.instate = XmlParserInputState::XmlParserProlog;
} else if ctxt.content_bytes().starts_with(b"<!") && avail < 9 {
// Too few bytes to tell whether this is a DOCTYPE: wait for more.
break 'done;
} else {
ctxt.instate = XmlParserInputState::XmlParserContent;
}
}
// After the DOCTYPE: comments and processing instructions only;
// anything else switches to content parsing.
XmlParserInputState::XmlParserProlog => {
html_skip_blank_chars(&mut *ctxt);
avail = ctxt.input().unwrap().remainder_len();
if avail < 2 {
break 'done;
}
if ctxt.content_bytes().starts_with(b"<!--") {
if terminate == 0 && html_parse_lookup_comment_end(ctxt) < 0 {
break 'done;
}
html_parse_comment(ctxt);
ctxt.instate = XmlParserInputState::XmlParserProlog;
} else if ctxt.content_bytes().starts_with(b"<?") {
if terminate == 0 && html_parse_lookup_sequence(ctxt, b'>', 0, 0, 0) < 0 {
break 'done;
}
html_parse_pi(ctxt);
ctxt.instate = XmlParserInputState::XmlParserProlog;
} else if ctxt.content_bytes().starts_with(b"<!") && avail < 4 {
break 'done;
} else {
ctxt.instate = XmlParserInputState::XmlParserContent;
}
}
// After the last element was closed: blanks, comments and processing
// instructions are accepted; anything else ends the document with
// XmlErrDocumentEnd.
XmlParserInputState::XmlParserEpilog => {
avail = input.remainder_len();
if avail < 1 {
break 'done;
}
if ctxt.current_byte().is_xml_blank_char() {
html_parse_char_data(ctxt);
break 'done;
}
if avail < 2 {
break 'done;
}
if ctxt.content_bytes().starts_with(b"<!--") {
if terminate == 0 && html_parse_lookup_comment_end(ctxt) < 0 {
break 'done;
}
html_parse_comment(ctxt);
ctxt.instate = XmlParserInputState::XmlParserEpilog;
} else if ctxt.content_bytes().starts_with(b"<?") {
if terminate == 0 && html_parse_lookup_sequence(ctxt, b'>', 0, 0, 0) < 0 {
break 'done;
}
html_parse_pi(ctxt);
ctxt.instate = XmlParserInputState::XmlParserEpilog;
} else if ctxt.content_bytes().starts_with(b"<!") && avail < 4 {
break 'done;
} else {
ctxt.err_no = XmlParserErrors::XmlErrDocumentEnd as i32;
ctxt.well_formed = false;
ctxt.instate = XmlParserInputState::XmlParserEOF;
if let Some(end_document) =
ctxt.sax.as_deref_mut().and_then(|sax| sax.end_document)
{
end_document(&mut *ctxt);
}
break 'done;
}
}
// Start tag. `break 'to_break` leaves only this arm and continues the
// outer loop; `break 'done` stops parsing for this call.
XmlParserInputState::XmlParserStartTag => 'to_break: {
if avail < 1 {
break 'done;
}
if avail < 2 && terminate == 0 {
break 'done;
}
if ctxt.current_byte() != b'<' {
ctxt.instate = XmlParserInputState::XmlParserContent;
break 'to_break;
}
if ctxt.nth_byte(1) == b'/' {
ctxt.instate = XmlParserInputState::XmlParserEndTag;
ctxt.check_index = 0;
break 'to_break;
}
// The whole tag must be buffered; '>' inside quoted values is ignored.
if terminate == 0 && html_parse_lookup_sequence(ctxt, b'>', 0, 0, 1) < 0 {
break 'done;
}
let mut node_info = HtmlParserNodeInfo::default();
if ctxt.record_info {
node_info.begin_pos = ctxt.input().unwrap().consumed
+ ctxt.input().unwrap().offset_from_base() as u64;
node_info.begin_line = ctxt.input().unwrap().line as _;
}
let failed: i32 = html_parse_start_tag(ctxt);
// Parsing failed or no current element name: consume a stray '>' and go on.
let Some(name) = ctxt.name.clone().filter(|_| failed != -1) else {
if ctxt.current_byte() == b'>' {
ctxt.skip_char();
}
break 'to_break;
};
let info = html_tag_lookup(&name);
if info.is_none() {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlHTMLUnknownTag,
format!("Tag {name} invalid\n").as_str(),
Some(&name),
None,
);
}
// Self-closing form "/>": report the end of the element immediately.
if ctxt.content_bytes().starts_with(b"/>") {
ctxt.advance(2);
if let Some(end_element) =
ctxt.sax.as_deref_mut().and_then(|sax| sax.end_element)
{
end_element(&mut *ctxt, &name);
}
html_name_pop(ctxt);
ctxt.instate = XmlParserInputState::XmlParserContent;
break 'to_break;
}
if ctxt.current_byte() == b'>' {
ctxt.skip_char();
} else {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrGtRequired,
format!("Couldn't find end of Start Tag {name}\n").as_str(),
Some(&name),
None,
);
// Drop the element again if it is still the current one.
if Some(name.as_ref()) == ctxt.name.as_deref() {
ctxt.node_pop();
html_name_pop(ctxt);
}
if ctxt.record_info {
html_node_info_push(ctxt, Rc::new(RefCell::new(node_info)));
}
ctxt.instate = XmlParserInputState::XmlParserContent;
break 'to_break;
}
// Elements whose descriptor has a non-zero `empty` field are closed right away.
if info.is_some_and(|info| info.empty != 0) {
if let Some(end_element) =
ctxt.sax.as_deref_mut().and_then(|sax| sax.end_element)
{
end_element(&mut *ctxt, &name);
}
html_name_pop(ctxt);
}
if ctxt.record_info {
html_node_info_push(ctxt, Rc::new(RefCell::new(node_info)));
}
ctxt.instate = XmlParserInputState::XmlParserContent;
}
// Element content: text, references, comments, processing
// instructions and the transitions to start/end tag states.
XmlParserInputState::XmlParserContent => 'to_break: {
let mut chr: [XmlChar; 2] = [0, 0];
// Deliver a pending single-byte token as character data first.
if ctxt.token != 0 {
chr[0] = ctxt.token as _;
html_check_paragraph(ctxt);
if let Some(characters) = ctxt.sax.as_deref_mut().and_then(|sax| sax.characters)
{
let s = from_utf8(&chr[..1]).expect("Internal Error");
characters(ctxt, s);
}
ctxt.token = 0;
ctxt.check_index = 0;
}
// Exactly one byte left at termination: emit it directly unless it
// starts markup ('<') or a reference ('&').
if avail == 1 && terminate != 0 {
let cur = ctxt.current_byte();
if cur != b'<' && cur != b'&' {
if let Some(sax) = ctxt.sax.as_deref_mut() {
chr[0] = cur;
let s = from_utf8(&chr[..1]).expect("Internal Error");
if cur.is_xml_blank_char() {
if ctxt.keep_blanks {
if let Some(characters) = sax.characters {
characters(ctxt, s);
}
} else if let Some(ignorable_whitespace) = sax.ignorable_whitespace
{
ignorable_whitespace(ctxt, s);
}
} else {
html_check_paragraph(ctxt);
if let Some(characters) =
ctxt.sax.as_deref_mut().unwrap().characters
{
characters(ctxt, s);
}
}
}
ctxt.token = 0;
ctxt.check_index = 0;
ctxt.input_mut().unwrap().cur += 1;
break 'to_break;
}
}
if avail < 2 {
break 'done;
}
// Inside <script>/<style>: wait for "</" plus one more byte before
// handing the raw text to html_parse_script.
if ctxt.name.as_deref() == Some("script") || ctxt.name.as_deref() == Some("style") {
if terminate == 0 {
let idx: i32 = html_parse_lookup_sequence(ctxt, b'<', b'/', 0, 0);
if idx < 0 {
break 'done;
}
let val = ctxt.nth_byte(idx as usize + 2);
if val == 0 {
ctxt.check_index = idx as _;
break 'done;
}
}
html_parse_script(ctxt);
if ctxt.content_bytes().starts_with(b"</") {
ctxt.instate = XmlParserInputState::XmlParserEndTag;
ctxt.check_index = 0;
break 'to_break;
}
} else if ctxt.content_bytes().starts_with(b"<!") {
if avail < 4 {
break 'done;
}
// A DOCTYPE inside content is reported as misplaced, then parsed.
if ctxt.content_bytes().len() >= 9
&& ctxt.content_bytes()[2..9].eq_ignore_ascii_case(b"DOCTYPE")
{
if terminate == 0 && html_parse_lookup_sequence(ctxt, b'>', 0, 0, 1) < 0 {
break 'done;
}
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlHTMLStrucureError,
"Misplaced DOCTYPE declaration\n",
Some("DOCTYPE"),
None,
);
html_parse_doc_type_decl(ctxt);
} else if ctxt.content_bytes()[2..].starts_with(b"--") {
if terminate == 0 && html_parse_lookup_comment_end(ctxt) < 0 {
break 'done;
}
html_parse_comment(ctxt);
ctxt.instate = XmlParserInputState::XmlParserContent;
} else {
// Any other "<!..." construct is skipped as a bogus comment.
if terminate == 0 && html_parse_lookup_sequence(ctxt, b'>', 0, 0, 0) < 0 {
break 'done;
}
html_skip_bogus_comment(ctxt);
}
} else if ctxt.content_bytes().starts_with(b"<?") {
if terminate == 0 && html_parse_lookup_sequence(ctxt, b'>', 0, 0, 0) < 0 {
break 'done;
}
html_parse_pi(ctxt);
ctxt.instate = XmlParserInputState::XmlParserContent;
} else if ctxt.content_bytes().starts_with(b"</") {
ctxt.instate = XmlParserInputState::XmlParserEndTag;
ctxt.check_index = 0;
break 'to_break;
} else if ctxt.current_byte() == b'<' && ctxt.nth_byte(1).is_ascii_alphabetic() {
if terminate == 0 && ctxt.nth_byte(1) == 0 {
break 'done;
}
ctxt.instate = XmlParserInputState::XmlParserStartTag;
ctxt.check_index = 0;
break 'to_break;
} else if ctxt.current_byte() == b'<' {
// A '<' that does not start a tag is reported as literal text.
if !ctxt.disable_sax {
if let Some(characters) =
ctxt.sax.as_deref_mut().and_then(|sax| sax.characters)
{
characters(&mut *ctxt, "<");
}
}
ctxt.skip_char();
} else {
// Plain text: wait until the next '<' is buffered, then consume
// references and character data up to it.
if terminate == 0 && html_parse_lookup_sequence(ctxt, b'<', 0, 0, 0) < 0 {
break 'done;
}
ctxt.check_index = 0;
while !matches!(ctxt.instate, XmlParserInputState::XmlParserEOF)
&& !ctxt.content_bytes().is_empty()
&& ctxt.current_byte() != b'<'
{
if ctxt.current_byte() == b'&' {
html_parse_reference(ctxt);
} else {
html_parse_char_data(ctxt);
}
}
}
}
// End tag: needs the closing '>' buffered; afterwards the state depends
// on whether any element is still open.
XmlParserInputState::XmlParserEndTag => {
if avail < 2 {
break 'done;
}
if terminate == 0 && html_parse_lookup_sequence(ctxt, b'>', 0, 0, 0) < 0 {
break 'done;
}
html_parse_end_tag(ctxt);
if ctxt.name_tab.is_empty() {
ctxt.instate = XmlParserInputState::XmlParserEpilog;
} else {
ctxt.instate = XmlParserInputState::XmlParserContent;
}
ctxt.check_index = 0;
}
// The remaining states are handled only by reporting an internal error
// and falling back to Content (StartTag for AttributeValue).
XmlParserInputState::XmlParserCDATASection => {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"HPP: internal error, state == CDATA\n",
None,
None,
);
ctxt.instate = XmlParserInputState::XmlParserContent;
ctxt.check_index = 0;
}
XmlParserInputState::XmlParserDTD => {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"HPP: internal error, state == DTD\n",
None,
None,
);
ctxt.instate = XmlParserInputState::XmlParserContent;
ctxt.check_index = 0;
}
XmlParserInputState::XmlParserComment => {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"HPP: internal error, state == COMMENT\n",
None,
None,
);
ctxt.instate = XmlParserInputState::XmlParserContent;
ctxt.check_index = 0;
}
XmlParserInputState::XmlParserPI => {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"HPP: internal error, state == PI\n",
None,
None,
);
ctxt.instate = XmlParserInputState::XmlParserContent;
ctxt.check_index = 0;
}
XmlParserInputState::XmlParserEntityDecl => {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"HPP: internal error, state == ENTITY_DECL\n",
None,
None,
);
ctxt.instate = XmlParserInputState::XmlParserContent;
ctxt.check_index = 0;
}
XmlParserInputState::XmlParserEntityValue => {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"HPP: internal error, state == ENTITY_VALUE\n",
None,
None,
);
ctxt.instate = XmlParserInputState::XmlParserContent;
ctxt.check_index = 0;
}
XmlParserInputState::XmlParserAttributeValue => {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"HPP: internal error, state == ATTRIBUTE_VALUE\n",
None,
None,
);
ctxt.instate = XmlParserInputState::XmlParserStartTag;
ctxt.check_index = 0;
}
XmlParserInputState::XmlParserSystemLiteral => {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
None,
None,
);
ctxt.instate = XmlParserInputState::XmlParserContent;
ctxt.check_index = 0;
}
XmlParserInputState::XmlParserIgnore => {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"HPP: internal error, state == XML_PARSER_IGNORE\n",
None,
None,
);
ctxt.instate = XmlParserInputState::XmlParserContent;
ctxt.check_index = 0;
}
XmlParserInputState::XmlParserPublicLiteral => {
html_parse_err(
Some(ctxt),
XmlParserErrors::XmlErrInternalError,
"HPP: internal error, state == XML_PARSER_LITERAL\n",
None,
None,
);
ctxt.instate = XmlParserInputState::XmlParserContent;
ctxt.check_index = 0;
}
}
}
// Same end-of-input handling as at the top of the loop, for the case where
// the loop was left with nothing buffered.
if avail == 0 && terminate != 0 {
html_auto_close_on_end(ctxt);
if ctxt.name_tab.is_empty() && !matches!(ctxt.instate, XmlParserInputState::XmlParserEOF) {
ctxt.instate = XmlParserInputState::XmlParserEOF;
if let Some(end_document) = ctxt.sax.as_deref_mut().and_then(|sax| sax.end_document) {
end_document(ctxt);
}
}
}
// Unless HtmlParseNodefdtd is set, give a finished (or finishing) document
// without an internal subset the default HTML 4.0 Transitional one.
if ctxt.options & HtmlParserOption::HtmlParseNodefdtd as i32 == 0
&& (terminate != 0
|| matches!(
ctxt.instate,
XmlParserInputState::XmlParserEOF | XmlParserInputState::XmlParserEpilog
))
{
if let Some(mut my_doc) = ctxt.my_doc {
let dtd = my_doc.get_int_subset();
if dtd.is_none() {
my_doc.int_subset = xml_create_int_subset(
ctxt.my_doc,
Some("html"),
Some("-//W3C//DTD HTML 4.0 Transitional//EN"),
Some("http://www.w3.org/TR/REC-html40/loose.dtd"),
);
}
}
}
ret
}
/// Feeds one block of data to a push parser context and parses as much as
/// possible.
///
/// * `chunk` is appended to the input buffer when it is non-empty; with an
///   empty chunk, a buffer that has an encoder is asked to decode instead.
///   Neither happens once the parser has reached the EOF state.
/// * `terminate`, when non-zero, marks the last call: the document end is
///   reported through `end_document` and the state becomes EOF. Ending in any
///   state other than EOF, Epilog or Misc records `XmlErrDocumentEnd` and
///   clears `well_formed`.
///
/// # Returns
///
/// `ctxt.err_no` after parsing, or the specific error code when the context
/// has no input, the push fails, or decoding fails.
#[doc(alias = "htmlParseChunk")]
#[cfg(feature = "libxml_push")]
pub fn html_parse_chunk(ctxt: &mut HtmlParserCtxt, chunk: &[u8], terminate: i32) -> i32 {
    if ctxt.input().is_none() {
        html_parse_err(
            Some(ctxt),
            XmlParserErrors::XmlErrInternalError,
            "htmlParseChunk: context error\n",
            None,
            None,
        );
        return XmlParserErrors::XmlErrInternalError as i32;
    }

    let finishing = terminate != 0;
    let at_eof = matches!(ctxt.instate, XmlParserInputState::XmlParserEOF);
    let has_buffer = ctxt.input().is_some_and(|input| input.buf.is_some());

    if !at_eof && has_buffer {
        if !chunk.is_empty() {
            // New data: append it to the input buffer.
            let pushed: i32 = ctxt
                .input_mut()
                .unwrap()
                .buf
                .as_mut()
                .unwrap()
                .push_bytes(chunk);
            if pushed < 0 {
                html_err_memory(Some(ctxt), None);
                return ctxt.err_no;
            }
        } else {
            // No new data: let an encoder, if any, process what is pending.
            let buffer = ctxt.input_mut().unwrap().buf.as_mut().unwrap();
            if buffer.encoder.is_some() && buffer.decode(finishing).is_err() {
                html_parse_err(
                    Some(ctxt),
                    XmlParserErrors::XmlErrInvalidEncoding,
                    "encoder error\n",
                    None,
                    None,
                );
                return XmlParserErrors::XmlErrInvalidEncoding as i32;
            }
        }
    }

    html_parse_try_or_finish(ctxt, terminate);

    if finishing {
        let acceptable_end = matches!(
            ctxt.instate,
            XmlParserInputState::XmlParserEOF
                | XmlParserInputState::XmlParserEpilog
                | XmlParserInputState::XmlParserMisc
        );
        if !acceptable_end {
            ctxt.err_no = XmlParserErrors::XmlErrDocumentEnd as i32;
            ctxt.well_formed = false;
        }
        if !matches!(ctxt.instate, XmlParserInputState::XmlParserEOF) {
            if let Some(end_document) = ctxt.sax.as_deref_mut().and_then(|sax| sax.end_document) {
                end_document(&mut *ctxt);
            }
        }
        ctxt.instate = XmlParserInputState::XmlParserEOF;
    }
    ctxt.err_no
}
/// Bit flags accepted by the HTML parser; combine them with `|` after casting
/// to `i32` and pass the result to `html_ctxt_use_options` or the `html_read_*`
/// functions.
// NOTE(review): the alias below says "xmlParserOption"; the C name of this
// enum is presumably "htmlParserOption" — confirm before changing.
#[doc(alias = "xmlParserOption")]
#[repr(C)]
pub enum HtmlParserOption {
/// Enables recovery mode (`ctxt.recovery` is set by `html_ctxt_use_options`).
HtmlParseRecover = 1 << 0,
/// Suppresses creation of the default internal subset for documents
/// that have none.
HtmlParseNodefdtd = 1 << 2,
/// Removes the SAX `error`/`fatal_error` handlers and the validation
/// error handler.
HtmlParseNoerror = 1 << 5,
/// Removes the SAX and validation warning handlers.
HtmlParseNowarning = 1 << 6,
/// Enables pedantic mode (`ctxt.pedantic` is set by `html_ctxt_use_options`).
HtmlParsePedantic = 1 << 7,
/// Presumably drops blank text nodes. `html_ctxt_use_options` tests
/// `XmlParserOption::XmlParseNoBlanks` rather than this variant — verify
/// that both share the same bit.
HtmlParseNoblanks = 1 << 8,
/// Presumably forbids network access; not consulted by
/// `html_ctxt_use_options` — confirm where it is honoured.
HtmlParseNonet = 1 << 11,
/// Recorded in `ctxt.options`; presumably disables insertion of implied
/// elements — confirm against the parser.
HtmlParseNoimplied = 1 << 13,
/// Recorded in `ctxt.options`; presumably compacts small text nodes —
/// confirm against the tree builder.
HtmlParseCompact = 1 << 16,
/// Recorded in `ctxt.options`; presumably ignores the encoding declared
/// inside the document — confirm against the parser.
HtmlParseIgnoreEnc = 1 << 21,
}
/// Puts a parser context back into its initial state so it can be reused for
/// another document.
///
/// All input, node, name and namespace stacks are emptied, the document built
/// so far (if any) is freed, and parser state, validation callbacks and error
/// counters are restored to their defaults.
#[doc(alias = "htmlCtxtReset")]
pub fn html_ctxt_reset(ctxt: &mut HtmlParserCtxt) {
    xml_init_parser();

    // Stacks and cached strings.
    ctxt.input_tab.clear();
    ctxt.space_tab.clear();
    ctxt.node_tab.clear();
    ctxt.node = None;
    ctxt.name_tab.clear();
    ctxt.name = None;
    ctxt.ns_tab.clear();
    ctxt.version = None;
    ctxt.encoding = None;
    ctxt.directory = None;
    ctxt.ext_sub_uri = None;
    ctxt.ext_sub_system = None;

    // Release the document of the previous run, if one is still attached.
    if let Some(previous_doc) = ctxt.my_doc.take() {
        unsafe {
            xml_free_doc(previous_doc);
        }
    }

    // Validation context.
    ctxt.vctxt.user_data = None;
    ctxt.vctxt.flags = XML_VCTXT_USE_PCTXT as _;
    ctxt.vctxt.error = Some(parser_validity_error);
    ctxt.vctxt.warning = Some(parser_validity_warning);

    // Parser state.
    ctxt.instate = XmlParserInputState::XmlParserStart;
    ctxt.charset = XmlCharEncoding::None;
    ctxt.err_no = XmlParserErrors::XmlErrOK as i32;
    ctxt.standalone = -1;
    ctxt.html = 1;
    ctxt.valid = 1;
    ctxt.external = 0;
    ctxt.token = 0;
    ctxt.check_index = 0;
    ctxt.end_check_state = 0;
    ctxt.in_subset = 0;
    ctxt.depth = 0;

    // Boolean flags.
    ctxt.well_formed = true;
    ctxt.ns_well_formed = true;
    ctxt.has_external_subset = false;
    ctxt.has_perefs = false;
    ctxt.disable_sax = false;
    ctxt.record_info = false;

    #[cfg(feature = "catalog")]
    {
        ctxt.catalogs = None;
    }

    // Bookkeeping tables and error statistics.
    ctxt.node_seq.clear();
    ctxt.atts_default.clear();
    ctxt.atts_special.clear();
    ctxt.nb_errors = 0;
    ctxt.nb_warnings = 0;
    if ctxt.last_error.is_err() {
        ctxt.last_error.reset();
    }
}
/// Applies a set of parser option flags to the context.
///
/// Each recognised flag updates the matching context field and is removed from
/// `options`. The pedantic, keep-blanks and recovery settings are switched off
/// again when their flag is absent. Line numbering is always enabled.
///
/// # Returns
///
/// The flags that were left over after all recognised ones were removed.
#[doc(alias = "htmlCtxtUseOptions")]
pub fn html_ctxt_use_options(ctxt: &mut HtmlParserCtxt, mut options: i32) -> i32 {
    // Silence warnings.
    if options & HtmlParserOption::HtmlParseNowarning as i32 != 0 {
        if let Some(handler) = ctxt.sax.as_deref_mut() {
            handler.warning = None;
        }
        ctxt.vctxt.warning = None;
        options -= XmlParserOption::XmlParseNoWarning as i32;
        ctxt.options |= XmlParserOption::XmlParseNoWarning as i32;
    }

    // Silence errors.
    if options & HtmlParserOption::HtmlParseNoerror as i32 != 0 {
        if let Some(handler) = ctxt.sax.as_deref_mut() {
            handler.error = None;
            handler.fatal_error = None;
        }
        ctxt.vctxt.error = None;
        options -= XmlParserOption::XmlParseNoError as i32;
        ctxt.options |= XmlParserOption::XmlParseNoError as i32;
    }

    ctxt.pedantic = options & HtmlParserOption::HtmlParsePedantic as i32 != 0;
    if ctxt.pedantic {
        options -= XmlParserOption::XmlParsePedantic as i32;
        ctxt.options |= XmlParserOption::XmlParsePedantic as i32;
    }

    ctxt.keep_blanks = options & XmlParserOption::XmlParseNoBlanks as i32 == 0;
    if !ctxt.keep_blanks {
        if let Some(handler) = ctxt.sax.as_deref_mut() {
            handler.ignorable_whitespace = Some(xml_sax2_ignorable_whitespace);
        }
        options -= XmlParserOption::XmlParseNoBlanks as i32;
        ctxt.options |= XmlParserOption::XmlParseNoBlanks as i32;
    }

    ctxt.recovery = options & HtmlParserOption::HtmlParseRecover as i32 != 0;
    if ctxt.recovery {
        options -= HtmlParserOption::HtmlParseRecover as i32;
    }

    // Flags that are only recorded in the context for later use.
    for flag in [
        HtmlParserOption::HtmlParseCompact as i32,
        XmlParserOption::XmlParseHuge as i32,
        HtmlParserOption::HtmlParseNodefdtd as i32,
        HtmlParserOption::HtmlParseIgnoreEnc as i32,
        HtmlParserOption::HtmlParseNoimplied as i32,
    ] {
        if options & flag != 0 {
            ctxt.options |= flag;
            options -= flag;
        }
    }

    ctxt.linenumbers = 1;
    options
}
/// Shared tail of the `html_read_*` / `html_ctxt_read_*` entry points.
///
/// Applies `options`, switches the input to `encoding` when a handler for it
/// exists, uses `url` as the file name of an input that has none, parses the
/// document and hands the resulting tree over to the caller.
#[doc(alias = "htmlDoRead")]
fn html_do_read(
    ctxt: &mut HtmlParserCtxt,
    url: Option<&str>,
    encoding: Option<&str>,
    options: i32,
) -> Option<HtmlDocPtr> {
    html_ctxt_use_options(ctxt, options);
    ctxt.html = 1;

    if let Some(handler) = encoding.and_then(|label| find_encoding_handler(label)) {
        ctxt.switch_to_encoding(handler);
        ctxt.input_mut().unwrap().encoding = encoding.map(|label| label.to_owned());
    }

    if let Some(location) = url {
        if let Some(input) = ctxt.input_mut() {
            if input.filename.is_none() {
                input.filename = Some(location.to_owned());
            }
        }
    }

    html_parse_document(ctxt);
    ctxt.my_doc.take()
}
/// Parses an in-memory HTML document and builds a tree.
///
/// `url` is the base location of the document, `encoding` an optional encoding
/// name and `options` a combination of `HtmlParserOption` flags.
///
/// Returns `None` when the parser context cannot be created or no document was
/// produced.
#[doc(alias = "htmlReadDoc")]
pub fn html_read_doc(
    cur: &[u8],
    url: Option<&str>,
    encoding: Option<&str>,
    options: i32,
) -> Option<HtmlDocPtr> {
    xml_init_parser();
    html_create_doc_parser_ctxt(cur, None)
        .and_then(|mut ctxt| html_do_read(&mut ctxt, url, encoding, options))
}
/// Parses the HTML file `filename` and builds a tree.
///
/// `encoding` is handed to the file parser context at creation time and
/// `options` is a combination of `HtmlParserOption` flags.
///
/// Returns `None` when the parser context cannot be created or no document was
/// produced.
#[doc(alias = "htmlReadFile")]
pub fn html_read_file(filename: &str, encoding: Option<&str>, options: i32) -> Option<HtmlDocPtr> {
    xml_init_parser();
    html_create_file_parser_ctxt(filename, encoding)
        .and_then(|mut ctxt| html_do_read(&mut ctxt, None, None, options))
}
/// Parses an HTML document held in `buffer` and builds a tree.
///
/// `url` is the base location of the document, `encoding` an optional encoding
/// name and `options` a combination of `HtmlParserOption` flags.
///
/// Returns `None` when the parser context cannot be created or no document was
/// produced.
#[doc(alias = "htmlReadMemory")]
pub fn html_read_memory(
    buffer: &[u8],
    url: Option<&str>,
    encoding: Option<&str>,
    options: i32,
) -> Option<HtmlDocPtr> {
    xml_init_parser();
    match html_create_memory_parser_ctxt(buffer) {
        Some(mut ctxt) => html_do_read(&mut ctxt, url, encoding, options),
        None => None,
    }
}
/// Parses an HTML document read from `ioctx` and builds a tree.
///
/// `url` is the base location of the document, `encoding` an optional encoding
/// name and `options` a combination of `HtmlParserOption` flags.
///
/// Returns `None` when the parser context or the input stream cannot be
/// created, or when no document was produced.
#[doc(alias = "htmlReadIO")]
pub fn html_read_io<'a>(
    ioctx: impl Read + 'a,
    url: Option<&str>,
    encoding: Option<&str>,
    options: i32,
) -> Option<HtmlDocPtr> {
    xml_init_parser();
    let reader_buffer = XmlParserInputBuffer::from_reader(ioctx, XmlCharEncoding::None);
    let Some(mut ctxt) = html_new_parser_ctxt() else {
        return None;
    };
    let Some(stream) = XmlParserInput::from_io(&mut ctxt, reader_buffer, XmlCharEncoding::None)
    else {
        return None;
    };
    ctxt.input_push(stream);
    html_do_read(&mut ctxt, url, encoding, options)
}
/// Parses an in-memory HTML document with an existing parser context, which is
/// reset before use.
///
/// This is a thin front end to [`html_ctxt_read_memory`]: `cur` is treated as a
/// plain memory buffer and every argument is forwarded unchanged.
#[doc(alias = "htmlCtxtReadDoc")]
pub fn html_ctxt_read_doc<'a>(
    ctxt: &mut XmlParserCtxt<'a>,
    cur: &'a [u8],
    url: Option<&str>,
    encoding: Option<&str>,
    options: i32,
) -> Option<HtmlDocPtr> {
    let document = html_ctxt_read_memory(ctxt, cur, url, encoding, options);
    document
}
/// Parses the HTML file `filename` with an existing parser context.
///
/// The context is reset first, then the file is loaded through
/// `xml_load_external_entity` and pushed as the current input.
///
/// Returns `None` when the file cannot be loaded or no document was produced.
#[doc(alias = "htmlCtxtReadFile")]
pub fn html_ctxt_read_file(
    ctxt: &mut XmlParserCtxt,
    filename: &str,
    encoding: Option<&str>,
    options: i32,
) -> Option<HtmlDocPtr> {
    xml_init_parser();
    html_ctxt_reset(ctxt);
    let Some(stream) = xml_load_external_entity(Some(filename), None, ctxt) else {
        return None;
    };
    ctxt.input_push(stream);
    html_do_read(ctxt, None, encoding, options)
}
/// Parses an HTML document held in `buffer` with an existing parser context.
///
/// The context is reset first; the buffer is then wrapped in an input stream
/// and pushed as the current input.
///
/// Returns `None` when the input buffer or stream cannot be created, or when
/// no document was produced.
#[doc(alias = "htmlCtxtReadMemory")]
pub fn html_ctxt_read_memory<'a>(
    ctxt: &mut XmlParserCtxt<'a>,
    buffer: &'a [u8],
    url: Option<&str>,
    encoding: Option<&str>,
    options: i32,
) -> Option<HtmlDocPtr> {
    xml_init_parser();
    html_ctxt_reset(ctxt);
    let Some(memory) = XmlParserInputBuffer::from_memory(buffer, XmlCharEncoding::None) else {
        return None;
    };
    let Some(stream) = XmlParserInput::from_io(ctxt, memory, XmlCharEncoding::None) else {
        return None;
    };
    ctxt.input_push(stream);
    html_do_read(ctxt, url, encoding, options)
}
/// Parses an HTML document read from `ioctx` with an existing parser context.
///
/// The context is reset first; the reader is then wrapped in an input stream
/// and pushed as the current input.
///
/// Returns `None` when the input stream cannot be created or no document was
/// produced.
#[doc(alias = "htmlCtxtReadIO")]
pub fn html_ctxt_read_io<'a>(
    ctxt: &mut XmlParserCtxt<'a>,
    ioctx: impl Read + 'a,
    url: Option<&str>,
    encoding: Option<&str>,
    options: i32,
) -> Option<HtmlDocPtr> {
    xml_init_parser();
    html_ctxt_reset(ctxt);
    let reader_buffer = XmlParserInputBuffer::from_reader(ioctx, XmlCharEncoding::None);
    let Some(stream) = XmlParserInput::from_io(ctxt, reader_buffer, XmlCharEncoding::None) else {
        return None;
    };
    ctxt.input_push(stream);
    html_do_read(ctxt, url, encoding, options)
}
/// Result of checking an element or attribute against the HTML element
/// descriptors (see `html_attr_allowed`, `html_element_status_here` and
/// `html_node_status`).
#[repr(C)]
pub enum HtmlStatus {
/// Not applicable: returned by `html_node_status` for nodes that are neither
/// elements nor attributes.
HtmlNa = 0,
/// The element or attribute is not allowed in this position.
HtmlInvalid = 0x1,
/// Allowed, but listed as deprecated.
HtmlDeprecated = 0x2,
/// Allowed.
HtmlValid = 0x4,
/// Allowed and required. The value `0xc` includes the `0x4` bit of
/// `HtmlValid`.
HtmlRequired = 0xc,
}
/// Classifies the attribute `attr` for the element described by `elt`.
///
/// The required list is consulted first, then the optional list and finally,
/// only when `legacy` is set, the deprecated list. An attribute found in none
/// of them is invalid.
#[doc(alias = "htmlAttrAllowed")]
pub fn html_attr_allowed(elt: &HtmlElemDesc, attr: &str, legacy: bool) -> HtmlStatus {
    if elt.attrs_req.iter().copied().any(|known| known == attr) {
        HtmlStatus::HtmlRequired
    } else if elt.attrs_opt.iter().copied().any(|known| known == attr) {
        HtmlStatus::HtmlValid
    } else if legacy && elt.attrs_depr.iter().copied().any(|known| known == attr) {
        HtmlStatus::HtmlDeprecated
    } else {
        HtmlStatus::HtmlInvalid
    }
}
/// Returns `true` when an element named `elt` is listed among the allowed
/// sub-elements of `parent`.
#[doc(alias = "htmlElementAllowedHere")]
pub fn html_element_allowed_here(parent: &HtmlElemDesc, elt: &str) -> bool {
    parent
        .subelts
        .iter()
        .copied()
        .any(|child_name| child_name == elt)
}
/// Classifies the element `elt` as a child of `parent`.
///
/// Returns `HtmlInvalid` when `parent` does not list it as a sub-element;
/// otherwise `HtmlValid` when the descriptor's `dtd` field is `0` and
/// `HtmlDeprecated` for any other value.
#[doc(alias = "htmlElementStatusHere")]
pub fn html_element_status_here(parent: &HtmlElemDesc, elt: &HtmlElemDesc) -> HtmlStatus {
    if html_element_allowed_here(parent, elt.name) {
        match elt.dtd {
            0 => HtmlStatus::HtmlValid,
            _ => HtmlStatus::HtmlDeprecated,
        }
    } else {
        HtmlStatus::HtmlInvalid
    }
}
/// Classifies a node of a parsed tree against the HTML element descriptors.
///
/// * Element nodes are checked against their parent element. With `legacy`
///   set the answer is only valid/invalid; otherwise
///   `html_element_status_here` decides, which can also report deprecation.
/// * Attribute nodes are checked with `html_attr_allowed` against the
///   descriptor of their parent.
/// * Every other node type yields `HtmlNa`.
///
/// # Panics
///
/// Panics when an element or attribute node has no parent, or when a name that
/// is needed is missing.
#[doc(alias = "htmlNodeStatus")]
pub fn html_node_status(node: HtmlNodePtr, legacy: bool) -> HtmlStatus {
    match node.element_type() {
        XmlElementType::XmlElementNode => {
            let parent_desc = html_tag_lookup(&node.parent().unwrap().name().unwrap());
            if legacy {
                // The node name is only read when the parent is a known tag.
                match parent_desc {
                    Some(desc) if html_element_allowed_here(desc, &node.name().unwrap()) => {
                        HtmlStatus::HtmlValid
                    }
                    _ => HtmlStatus::HtmlInvalid,
                }
            } else {
                let own_desc = html_tag_lookup(&node.name().unwrap());
                match (parent_desc, own_desc) {
                    (Some(par), Some(chi)) => html_element_status_here(par, chi),
                    _ => HtmlStatus::HtmlInvalid,
                }
            }
        }
        XmlElementType::XmlAttributeNode => {
            let owner_desc = html_tag_lookup(&node.parent().unwrap().name().unwrap());
            match owner_desc {
                Some(desc) => html_attr_allowed(desc, node.name().as_deref().unwrap(), legacy),
                None => HtmlStatus::HtmlInvalid,
            }
        }
        _ => HtmlStatus::HtmlNa,
    }
}
// Unit tests for this module.
#[cfg(test)]
mod tests {
use crate::{globals::reset_last_error, libxml::xmlmemory::xml_mem_blocks, test_util::*};
use super::*;
// Calls `html_handle_omitted_elem` with each generated integer and checks
// that the block count reported by `xml_mem_blocks` is unchanged afterwards.
#[test]
fn test_html_handle_omitted_elem() {
// The body is compiled only when the "html" feature is enabled.
#[cfg(feature = "html")]
unsafe {
let mut leaks = 0;
for n_val in 0..GEN_NB_INT {
// Block count before the call, used as the leak baseline.
let mem_base = xml_mem_blocks();
let val = gen_int(n_val, 0);
let ret_val = html_handle_omitted_elem(val);
desret_int(ret_val);
des_int(n_val, val, 0);
reset_last_error();
// Any difference from the baseline is counted as a leak.
if mem_base != xml_mem_blocks() {
leaks += 1;
eprint!(
"Leak of {} blocks found in htmlHandleOmittedElem",
xml_mem_blocks() - mem_base
);
eprintln!(" {}", n_val);
}
}
assert!(
leaks == 0,
"{leaks} Leaks are found in htmlHandleOmittedElem()"
);
}
}
}