use std::cell::{OnceCell};
use std::cmp::Ordering;
use std::collections::HashMap;
use std::fs;
use std::io::Read;
use std::path::Path;
use regex::Regex;
use crate::errors::KissXmlError;
pub mod errors;
pub mod dom;
mod parsing;
/// Escapes text so it can be placed between XML tags.
///
/// Replaces `&` with `&amp;`, `<` with `&lt;` and `>` with `&gt;`. Quote
/// characters are left untouched.
///
/// # Arguments
/// * `text` - the raw text to escape.
///
/// # Returns
/// A new `String` with the three characters above replaced by their
/// predefined XML entities.
pub fn text_escape(text: impl Into<String>) -> String {
    let buffer: String = text.into();
    // `&` must be handled first, otherwise the ampersands introduced by the
    // later replacements would themselves be escaped again.
    buffer
        .replace('&', "&amp;")
        .replace('<', "&lt;")
        .replace('>', "&gt;")
}
/// Escapes text for use inside an XML attribute value.
///
/// This is a thin alias: the input is converted to a `String` and handed to
/// [`escape`], whose result is returned unchanged.
pub fn attribute_escape(text: impl Into<String>) -> String {
    let raw: String = text.into();
    escape(raw)
}
/// Escapes all five characters that have predefined XML entities.
///
/// Replaces `&` with `&amp;`, `<` with `&lt;`, `>` with `&gt;`, `'` with
/// `&apos;` and `"` with `&quot;`, which makes the result usable both as
/// element text and inside a quoted attribute value.
///
/// # Arguments
/// * `text` - the raw text to escape.
///
/// # Returns
/// A new `String` with every occurrence of the five characters replaced.
pub fn escape(text: impl Into<String>) -> String {
    let buffer: String = text.into();
    // `&` must be handled first, otherwise the ampersands introduced by the
    // later replacements would themselves be escaped again.
    buffer
        .replace('&', "&amp;")
        .replace('<', "&lt;")
        .replace('>', "&gt;")
        .replace('\'', "&apos;")
        .replace('"', "&quot;")
}
/// Reverses XML escaping in a single left-to-right pass.
///
/// Recognised references are the five predefined entities (`&amp;`, `&lt;`,
/// `&gt;`, `&apos;`, `&quot;`) and numeric character references in decimal
/// (`&#65;`) or hexadecimal (`&#x41;`) form. Anything else that starts with
/// `&` is copied through unchanged.
///
/// Text produced by a replacement is never scanned again, so `&amp;lt;`
/// becomes `&lt;`, not `<`.
///
/// # Arguments
/// * `text` - the escaped text.
///
/// # Returns
/// A new `String` with every recognised reference replaced by the character
/// it stands for.
pub fn unescape(text: impl Into<String>) -> String {
    let input: String = text.into();
    let mut output = String::with_capacity(input.len());
    let mut rest = input.as_str();
    while let Some(amp) = rest.find('&') {
        // Everything before the ampersand is literal text.
        output.push_str(&rest[..amp]);
        let candidate = &rest[amp..];
        // A reference runs from the `&` to the first following `;`.
        let decoded = candidate.find(';').and_then(|semi| {
            decode_entity(&candidate[1..semi]).map(|ch| (ch, semi + 1))
        });
        match decoded {
            Some((ch, consumed)) => {
                output.push(ch);
                rest = &candidate[consumed..];
            }
            None => {
                // Not a reference we know: keep the `&` and continue right
                // after it (`&` is one byte, so this is a char boundary).
                output.push('&');
                rest = &candidate[1..];
            }
        }
    }
    output.push_str(rest);
    output
}

/// Maps the text between `&` and `;` of a reference to its character, or
/// `None` when it is neither a predefined entity nor a valid numeric reference.
fn decode_entity(name: &str) -> Option<char> {
    match name {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "apos" => Some('\''),
        "quot" => Some('"'),
        _ => {
            let number = name.strip_prefix('#')?;
            // `&#x...;` is hexadecimal, plain `&#...;` is decimal.
            let (digits, radix) = match number.strip_prefix('x') {
                Some(hex) => (hex, 16),
                None => (number, 10),
            };
            // `from_str_radix` would also accept a leading `+`; reject that.
            if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
                return None;
            }
            u32::from_str_radix(digits, radix)
                .ok()
                .and_then(char::from_u32)
        }
    }
}
/// Ordering used to sort `(name, value)` attribute pairs.
///
/// * Pairs with the same name are ordered by value.
/// * Otherwise a name starting with `xmlns` sorts before one that does not.
/// * Otherwise (both or neither start with `xmlns`) names are compared
///   lexicographically.
pub(crate) fn attribute_order(kv_tup1: &(&String, &String), kv_tup2: &(&String, &String)) -> Ordering {
    let (left_key, left_val) = *kv_tup1;
    let (right_key, right_val) = *kv_tup2;
    if left_key == right_key {
        return left_val.cmp(right_val);
    }
    let left_is_ns = left_key.starts_with("xmlns");
    let right_is_ns = right_key.starts_with("xmlns");
    match (left_is_ns, right_is_ns) {
        (true, false) => Ordering::Less,
        (false, true) => Ordering::Greater,
        _ => left_key.cmp(right_key),
    }
}
/// Replaces the byte range `indices.0..indices.1` of `buffer` with `insert`.
///
/// # Arguments
/// * `buffer` - the string edited in place.
/// * `indices` - `(start, end)` byte offsets of the span to replace; `end` is
///   exclusive.
/// * `insert` - the replacement text.
///
/// # Panics
/// Panics if `start > end`, if `end` is past the end of `buffer`, or if either
/// offset is not on a UTF-8 character boundary.
fn string_insert(buffer: &mut String, indices: (usize, usize), insert: &str) {
    let (start, end) = indices;
    // The standard-library splice avoids copying the tail into a temporary.
    buffer.replace_range(start..end, insert);
}
/// Reads the file at `path` into memory and parses it as XML.
///
/// # Errors
/// Returns the I/O error (converted to `KissXmlError`) if the file cannot be
/// read as a UTF-8 string, or whatever error `parse_str` reports for the
/// file's content.
pub fn parse_filepath(path: impl AsRef<Path>) -> Result<dom::Document, errors::KissXmlError> {
    let xml_text = fs::read_to_string(path.as_ref())?;
    parse_str(xml_text)
}
/// Drains `reader` to its end and parses everything it produced as XML.
///
/// # Errors
/// Returns the I/O error (converted to `KissXmlError`) if reading fails or the
/// data is not valid UTF-8, or whatever error `parse_str` reports for the
/// content.
pub fn parse_stream(mut reader: impl Read) -> Result<dom::Document, errors::KissXmlError> {
    let mut xml_text = String::new();
    reader.read_to_string(&mut xml_text)?;
    parse_str(xml_text)
}
/// Parses XML text into a `dom::Document`.
///
/// The parser works in two phases:
///
/// 1. **Prolog** - tags are scanned until the first element tag (the root).
///    A `<?xml ...?>` declaration and any `<!DOCTYPE ...>` are recorded,
///    comments and other `<!...>` constructs are skipped with a warning on
///    stderr, and non-blank text or a closing tag is an error.
/// 2. **Body** - every following tag is processed against a
///    `parsing::ParseTree`: text between tags, comments and CDATA are appended
///    to the currently open element, start tags are pushed (or appended
///    directly when self-closing) and end tags pop the matching element.
///
/// # Errors
/// Returns a parsing error (with line and column where available) for
/// malformed input, and a not-supported error for `<!...>` constructs other
/// than comments and CDATA inside the root element.
pub fn parse_str(xml_string: impl Into<String>) -> Result<dom::Document, errors::KissXmlError> {
let buffer = xml_string.into();
let mut decl: Option<dom::Declaration> = None;
let mut dtds: Vec<dom::DTD> = Vec::new();
// Counts comments seen before the root so the warning is printed only once.
let mut no_comment_warn = 0;
// Byte span (start, end-exclusive) of the most recently scanned tag.
let mut tag_span: (usize, usize) = (0, 0);
// ---- Phase 1: prolog, everything up to and including the root start tag ----
loop {
let (tag_start, tag_end) = next_tag(&buffer, tag_span.1);
if tag_start.is_none() {
return Err(errors::ParsingError::new(format!("no XML content")).into());
}
if tag_end.is_none(){
let (line, col) = line_and_column(&buffer, tag_start.unwrap());
return Err(errors::ParsingError::new(format!(
"'<' has not matching '>' (syntax error on line {line}, column {col})"
)).into());
}
let tag_start = tag_start.unwrap();
let tag_end = tag_end.unwrap();
// Only whitespace may appear between tags before the root element.
let text_between = &buffer[tag_span.1..tag_start];
if real_text(text_between).is_some() {
let (line, col) = line_and_column(&buffer, tag_span.1);
return Err(errors::ParsingError::new(format!(
"Text outside the root element is not supported (syntax error on line {line}, column {col})"
)).into());
}
let slice = &buffer[tag_start..tag_end];
if slice.starts_with("<?xml") {
// `tag_span` still describes the previous tag (or (0, 0) if there was
// none); a non-zero start means something preceded the declaration.
if tag_span.0 != 0 {
let (line, col) = line_and_column(&buffer, tag_start);
return Err(errors::ParsingError::new(format!(
"<?xml ...?> declaration must at start of XML (syntax error on line {line}, column {col})"
)).into());
}
decl = Some(dom::Declaration::from_str(slice)?);
} else if slice.starts_with("<!--") {
// Comments before the root are dropped; warn on the first one only.
if no_comment_warn == 0 {
eprintln!("WARNING: Encountered comment {} outside of root element. Comments outside of the root are not supported and will be ignored.", abbreviate(slice, 32));
}
no_comment_warn += 1;
} else if slice.starts_with("<!DOCTYPE") {
let dtd = dom::DTD::from_string(slice)?;
dtds.push(dtd);
} else if slice.starts_with("<!"){
eprintln!("WARNING: Ignoring {slice} (not supported outside root element)");
} else if slice.starts_with("</") {
let (line, col) = line_and_column(&buffer, tag_start);
return Err(errors::ParsingError::new(format!(
"cannot start with closing tag (syntax error on line {line}, column {col})"
)).into());
} else {
// First element tag: validate it, remember its span and leave the prolog.
check_element_tag(slice).map_err(|_e| {
let (line, col) = line_and_column(&buffer, tag_start);
errors::ParsingError::new(format!(
"invalid XML syntax on line {line}, column {col}"
))
})?;
tag_span = (tag_start, tag_end);
break;
}
tag_span = (tag_start, tag_end);
}
// ---- Phase 2: the root element and everything nested inside it ----
let mut parse_stack = parsing::ParseTree::new();
let root_slice = &buffer[tag_span.0 .. tag_span.1];
let root_element: dom::Element = parse_new_element(strip_tag(root_slice).as_str(), &buffer, &tag_span, None)?;
parse_stack.push(root_element);
// A self-closing root (`<root/>`) is popped right away; any further tag is
// then rejected inside the loop below.
let selfclosing_root = root_slice.ends_with("/>");
if selfclosing_root {parse_stack.pop()?;} let mut last_span: (usize, usize);
loop {
let next_span = next_tag(&buffer, tag_span.1);
if next_span.0.is_none() {
// No more '<' in the input: scanning is finished.
break
} else if next_span.1.is_none() {
let (line, col) = line_and_column(&buffer, next_span.0.unwrap());
return Err(errors::ParsingError::new(format!(
"invalid XML syntax on line {line}, column {col}"
)).into());
} else {
if selfclosing_root {
let (line, col) = line_and_column(&buffer, next_span.0.unwrap());
return Err(errors::ParsingError::new(format!(
"only 1 root element is allowed (syntax error on line {line}, column {col})"
)).into());
}
last_span = tag_span;
tag_span = (next_span.0.unwrap(), next_span.1.unwrap());
}
// Text between the previous tag and this one becomes a text node unless
// it is blank (`real_text` returns `None` for whitespace-only text).
let text = &buffer[last_span.1 .. tag_span.0];
match real_text(text) {
None => {},
Some(content) => {
parse_stack.append(dom::Text::new(content))
.map_err(|e|{
let (line, col) = line_and_column(&buffer, next_span.0.unwrap());
errors::ParsingError::new(format!(
"{} (syntax error on line {line}, column {col})", e
))
})?;
}
};
let slice = &buffer[tag_span.0 .. tag_span.1];
if slice.starts_with("<!--") && slice.ends_with("-->") {
// Comment body sits between the 4-byte "<!--" and the 3-byte "-->".
let begin = 4;
let end = slice.len().saturating_sub(3);
if begin < end {
parse_stack.append(dom::Comment::new(&slice[begin..end])?)
.map_err(|e| {
let (line, col) = line_and_column(&buffer, next_span.0.unwrap());
errors::ParsingError::new(format!(
"{} (syntax error on line {line}, column {col})", e
))
})?;
} else if begin == end {
// "<!---->": an empty comment.
parse_stack.append(dom::Comment::new(String::new())?)?;
} else {
// Opening and closing markers overlap (e.g. "<!-->").
let (line, col) = line_and_column(&buffer, next_span.0.unwrap());
return Err(errors::ParsingError::new(format!(
"invalid comment syntax on line {line}, column {col}"
)).into());
}
} else if slice.starts_with("<![CDATA["){
if !slice.ends_with("]]>") {
let (line, col) = line_and_column(&buffer, next_span.0.unwrap());
return Err(errors::ParsingError::new(format!(
"Unclosed CDATA. '<![CDATA[' must be followed by ']]>' (syntax error on line {line}, column {col})"
)).into());
}
// CDATA payload sits between the 9-byte "<![CDATA[" and the 3-byte "]]>".
parse_stack.append(dom::CData::new(&slice[9 .. slice.len().saturating_sub(3)])?)
.map_err(|e|{
let (line, col) = line_and_column(&buffer, next_span.0.unwrap());
errors::ParsingError::new(format!(
"{} (syntax error on line {line}, column {col})", e
))
})?;
} else if slice.starts_with("<!") {
// Any other "<!...>" construct inside the root is rejected.
let (line, col) = line_and_column(&buffer, tag_span.0);
return Err(errors::NotSupportedError::new(format!(
"kiss-xml does not support '{}' (error on line {line}, column {col})",
abbreviate(slice, 32)
)).into());
} else {
// Start tag, end tag or self-closing element tag.
let tag_def = strip_tag(slice);
check_element_tag(slice).map_err(|e| {
let (line, col) = line_and_column(&buffer, tag_span.0);
errors::ParsingError::new(format!(
"{} (syntax error on line {line}, column {col})", e
))
})?;
if slice.starts_with("</") {
// End tag: it must name the element currently on top of the stack.
let active_element = parse_stack.top_element()
.ok_or_else(||{
let (line, col) = line_and_column(&buffer, next_span.0.unwrap());
errors::ParsingError::new(format!(
"root element already closed (syntax error on line {line}, column {col})"
))
})?;
let open_tagname = active_element.tag_name();
if tag_def != open_tagname {
let (line, col) = line_and_column(&buffer, tag_span.0);
return Err(errors::ParsingError::new(format!(
"closing tag {slice} does not match <{open_tagname}> (syntax error on line {line}, column {col})"
)).into());
}
parse_stack.pop()?;
} else {
// Start tag: the element on top of the stack is passed as parent so the
// new element can resolve namespaces against it.
let new_element = parse_new_element(tag_def.as_str(), &buffer, &tag_span, parse_stack.top_element())?;
if slice.ends_with("/>") {
// Self-closing: append as a finished child instead of opening it.
parse_stack.append(new_element).map_err(|e| {
let (line, col) = line_and_column(&buffer, tag_span.0);
errors::ParsingError::new(format!(
"{} (syntax error on line {line}, column {col})", e
))
})?;
} else {
parse_stack.push(new_element);
}
}
}
}
// Elements still open at end of input mean the document was truncated.
if ! parse_stack.empty_stack() {
return Err(errors::ParsingError::new(format!(
"root element not closed"
)).into());
}
Ok(dom::Document::new_with_decl_dtd(
parse_stack.to_dom()?,
decl,
Some(&dtds)
))
}
/// Shortens `text` to roughly `limit` characters for use in messages.
///
/// When `text` has more than `limit` characters, the result is the first
/// `limit / 2 - 1` characters, an ellipsis (`…`), and the last `limit / 2`
/// characters. Text that already fits, or any text when `limit < 4`, is
/// returned unchanged.
///
/// Lengths are measured in characters rather than bytes so that a cut can
/// never land inside a multi-byte UTF-8 sequence.
fn abbreviate(text: &str, limit: usize) -> String {
    if limit < 4 || text.chars().count() <= limit {
        return text.to_string();
    }
    let head_chars = limit / 2 - 1;
    let tail_chars = limit / 2;
    // Byte offset just past the last character of the head.
    let head_end = text
        .char_indices()
        .nth(head_chars)
        .map_or(text.len(), |(idx, _)| idx);
    // Byte offset of the first character of the tail (`tail_chars >= 2` here).
    let tail_start = text
        .char_indices()
        .rev()
        .nth(tail_chars - 1)
        .map_or(0, |(idx, _)| idx);
    let mut shortened =
        String::with_capacity(head_end + '…'.len_utf8() + (text.len() - tail_start));
    shortened.push_str(&text[..head_end]);
    shortened.push('…');
    shortened.push_str(&text[tail_start..]);
    shortened
}
/// Builds a `dom::Element` from the inside of a start tag.
///
/// # Arguments
/// * `tag_content` - the tag text without its angle brackets and slashes,
///   i.e. the element name followed by whitespace-separated attributes.
/// * `buffer` / `tag_span` - the full input and the tag's span in it; only
///   used to compute line and column for error messages.
/// * `parent` - the enclosing element, if any; supplies the inherited default
///   namespace and namespace context.
///
/// # Errors
/// Returns a parsing error when the tag is empty, when an attribute is not of
/// the form `key="value"` with a double-quoted value, or when the element name
/// carries a namespace prefix that is not declared on the tag and there is no
/// parent to inherit a namespace context from.
fn parse_new_element(tag_content: &str, buffer: &String, tag_span: &(usize, usize), parent: Option<&dom::Element>) -> Result<dom::Element, KissXmlError> {
    // Every error raised here shares the same prefix and location lookup.
    let syntax_error = |detail: &str| -> KissXmlError {
        let (line, col) = line_and_column(buffer, tag_span.0);
        errors::ParsingError::new(format!(
            "invalid XML syntax on line {line}, column {col}: {detail}"
        )).into()
    };

    let components = quote_aware_split(tag_content);
    let mut parts = components.iter();
    // The first component is the (possibly prefixed) element name.
    let Some(tag_name) = parts.next() else {
        return Err(syntax_error("empty tags not supported"));
    };

    // All remaining components are attributes.
    let mut attrs: HashMap<String, String> = HashMap::new();
    for attribute in parts {
        let Some((key, quoted)) = attribute.split_once("=") else {
            return Err(syntax_error("attributes must be in the form 'key=\"value\"'"));
        };
        if quoted.is_empty() {
            return Err(syntax_error("no content after ="));
        }
        let double_quoted =
            quoted.len() >= 2 && quoted.starts_with('"') && quoted.ends_with('"');
        if !double_quoted {
            return Err(syntax_error("attribute value must be quoted with double-quotes"));
        }
        // Store the value without its surrounding quotes.
        attrs.insert(key.to_string(), quoted[1..quoted.len() - 1].to_string());
    }

    let (inherited_default_namespace, inherited_xmlns_context) = match parent {
        None => (None, None),
        Some(parent) => (parent.default_namespace(), Some(parent.get_namespace_context()))
    };

    // Split an optional `prefix:` off the name and resolve its namespace,
    // looking first at this tag's own attributes, then at the inherited context.
    let (local_name, xmlns, xmlns_prefix): (&str, Option<String>, Option<String>) =
        match tag_name.split_once(":") {
            None => (tag_name.as_str(), None, None),
            Some((prefix, local)) => {
                let prefix_key = format!("xmlns:{prefix}");
                let namespace = match attrs.get(&prefix_key) {
                    Some(uri) => Some(uri.clone()),
                    None => match &inherited_xmlns_context {
                        None => {
                            return Err(syntax_error(&format!(
                                "XML namespace prefix '{prefix}' has no defined namespace (missing 'xmlns:{prefix}=\"...\"')"
                            )));
                        }
                        Some(ctx) => ctx.get(prefix_key.as_str()).map(String::clone),
                    },
                };
                (local, namespace, Some(prefix.to_string()))
            }
        };

    let mut element = dom::Element::new(
        local_name, None, Some(attrs), xmlns, xmlns_prefix, None
    )?;
    element.set_namespace_context(inherited_default_namespace, inherited_xmlns_context);
    Ok(element)
}
/// Removes the tag delimiters from a raw tag and trims the remainder.
///
/// In order, at most one of each is removed: a leading `<`, a leading `/`,
/// a trailing `>`, and a trailing `/`. Surrounding whitespace is then trimmed.
fn strip_tag(tag: &str) -> String {
    let inner = tag.strip_prefix("<").unwrap_or(tag);
    let inner = inner.strip_prefix("/").unwrap_or(inner);
    let inner = inner.strip_suffix(">").unwrap_or(inner);
    let inner = inner.strip_suffix("/").unwrap_or(inner);
    String::from(inner.trim())
}
/// Lazily compiled regex used by `check_element_tag`.
///
/// This is a `static`, not a `const`: a `const` is re-instantiated at every
/// use site, so the cell would start out empty on each call and the regex
/// would be recompiled every time. `OnceLock` is used because a `static`
/// must be `Sync`.
static ELEM_MATCHER_SINGLETON: std::sync::OnceLock<Regex> = std::sync::OnceLock::new();
/// Checks that `text` looks like an element tag: `<` or `</`, a name with an
/// optional `:`-separated second part, zero or more `name="value"` /
/// `name='value'` attributes, optional whitespace, an optional `/`, and `>`.
///
/// # Errors
/// Returns a parsing error with the message `Invalid XML Element` when the
/// pattern does not match.
///
/// NOTE(review): the pattern has no anchors and `is_match` succeeds on a match
/// anywhere in `text`, so a tag-shaped substring (for instance inside a quoted
/// attribute value) is enough to pass - confirm whether that is intended.
fn check_element_tag(text: &str) -> Result<(), errors::KissXmlError> {
    let matcher = ELEM_MATCHER_SINGLETON.get_or_init(|| {
        let name_start_char = r#"[:A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]"#;
        let name_char = r#"[:A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}.\-0-9\xB7\x{0300}-\x{036F}\x{203F}-\x{2040}]"#;
        let pattern = format!(r#"(?ms)</?{name_start_char}{name_char}*(:{name_start_char}{name_char}*)?(\s+{name_start_char}{name_char}*=(".*?"|'.*?'))*\s*/?>"#);
        // The pattern is a fixed literal, so a compile failure is a programming error.
        Regex::new(pattern.as_str()).expect("element tag regex is a valid pattern")
    });
    if matcher.is_match(text) {
        Ok(())
    } else {
        Err(errors::ParsingError::new("Invalid XML Element").into())
    }
}
/// Locates the next tag at or after byte offset `from`.
///
/// Returns `(start, end)` where `start` is the offset of the next `<` and
/// `end` is the offset just past the tag's terminator. `start` is `None` when
/// there is no `<` left; `end` is `None` when the tag is never terminated.
///
/// The terminator depends on how the tag begins:
/// * `<!--` ends at the first `-->`;
/// * `<?` ends at the first `?>` outside quotes;
/// * `<![CDATA[` ends at the first `]]>`;
/// * any other `<!` ends at the `>` that balances nested `<`/`>` outside quotes;
/// * everything else ends at the first `>` outside quotes.
fn next_tag(buffer: &String, from: usize) -> (Option<usize>, Option<usize>) {
    let Some(open) = buffer[from..].find("<").map(|rel| rel + from) else {
        return (None, None);
    };
    let rest = &buffer[open..];
    // Offset just past the terminator, relative to `open`.
    let close_rel = if rest.starts_with("<!--") {
        rest.find("-->").map(|rel| rel + 3)
    } else if rest.starts_with("<?") {
        quote_aware_find(rest, "?>", 2).map(|rel| rel + 2)
    } else if rest.starts_with("<![CDATA[") {
        rest.find("]]>").map(|rel| rel + 3)
    } else if rest.starts_with("<!") {
        nested_quote_aware_find_close(rest, 2).map(|rel| rel + 1)
    } else {
        quote_aware_find(rest, ">", 1).map(|rel| rel + 1)
    };
    (Some(open), close_rel.map(|rel| rel + open))
}
/// Splits `text` on whitespace, treating quoted sections as unbreakable.
///
/// A `'` or `"` opens a quoted section that lasts until the same character
/// appears again; whitespace inside it does not split. Quote characters are
/// kept in the output. Runs of whitespace never produce empty items, and an
/// unterminated quote simply extends to the end of the text.
fn quote_aware_split(text: &str) -> Vec<String> {
    let mut parts: Vec<String> = Vec::new();
    let mut current = String::new();
    // `Some(q)` while inside a section opened by quote character `q`.
    let mut open_quote: Option<char> = None;
    for ch in text.chars() {
        match open_quote {
            Some(quote) => {
                current.push(ch);
                if ch == quote {
                    open_quote = None;
                }
            }
            None if ch == '\'' || ch == '"' => {
                open_quote = Some(ch);
                current.push(ch);
            }
            None if ch.is_whitespace() => {
                if !current.is_empty() {
                    parts.push(std::mem::take(&mut current));
                }
            }
            None => current.push(ch),
        }
    }
    if !current.is_empty() {
        parts.push(current);
    }
    parts
}
/// Finds the first occurrence of `pattern` in `text` at or after byte offset
/// `from` that is not inside a quoted section.
///
/// A `'` or `"` opens a quoted section that lasts until the same character
/// appears again. Returns the byte offset of the match within `text`, or
/// `None` if there is no match outside quotes.
fn quote_aware_find(text: &str, pattern: &str, from: usize) -> Option<usize> {
    // `Some(q)` while inside a section opened by quote character `q`.
    let mut open_quote: Option<char> = None;
    for (offset, ch) in text[from..].char_indices() {
        match open_quote {
            Some(quote) => {
                if ch == quote {
                    open_quote = None;
                }
            }
            None => match ch {
                '"' | '\'' => open_quote = Some(ch),
                _ => {
                    let position = from + offset;
                    if text[position..].starts_with(pattern) {
                        return Some(position);
                    }
                }
            },
        }
    }
    None
}
/// Finds the `>` that closes a construct which may itself contain nested
/// `<...>` pairs, ignoring anything inside quoted sections.
///
/// Scanning starts at byte offset `from`. Each unquoted `<` increases the
/// nesting depth and each unquoted `>` decreases it; the first `>` seen at
/// depth zero is the result. Returns its byte offset within `text`, or `None`
/// if no such `>` exists.
fn nested_quote_aware_find_close(text: &str, from: usize) -> Option<usize> {
    let mut nesting: i32 = 0;
    // `Some(q)` while inside a section opened by quote character `q`.
    let mut open_quote: Option<char> = None;
    for (offset, ch) in text[from..].char_indices() {
        if let Some(quote) = open_quote {
            if ch == quote {
                open_quote = None;
            }
            continue;
        }
        match ch {
            '"' | '\'' => open_quote = Some(ch),
            '<' => nesting += 1,
            '>' => {
                if nesting == 0 {
                    return Some(from + offset);
                }
                nesting -= 1;
            }
            _ => {}
        }
    }
    None
}
/// Lazily compiled regex used by `real_text` to detect whitespace-only text.
///
/// This is a `static`, not a `const`: a `const` is re-instantiated at every
/// use site, so the cell would start out empty on each call and the regex
/// would be recompiled every time. `OnceLock` is used because a `static`
/// must be `Sync`.
static IS_BLANK_MATCHER_SINGLETON: std::sync::OnceLock<Regex> = std::sync::OnceLock::new();
/// Returns the unescaped form of `text`, or `None` when `text` is empty or
/// consists only of whitespace.
fn real_text(text: &str) -> Option<String> {
    let matcher = IS_BLANK_MATCHER_SINGLETON.get_or_init(|| {
        // The pattern is a fixed literal, so a compile failure is a programming error.
        Regex::new(r#"^\s*$"#).expect("blank-text regex is a valid pattern")
    });
    if matcher.is_match(text) {
        return None;
    }
    Some(unescape(text))
}
/// Converts a byte offset into a 1-based `(line, column)` pair.
///
/// Lines are separated by `\n`; columns count characters, not bytes. The
/// first character of the text is at `(1, 1)`. An offset at or past the end
/// of the text yields the position just after the last character.
///
/// # Arguments
/// * `text` - the full text the offset refers to.
/// * `pos` - byte offset into `text`.
fn line_and_column(text: &String, pos: usize) -> (usize, usize) {
    let mut line = 1;
    let mut col = 1;
    for (i, c) in text.char_indices() {
        // Stop before counting the character at `pos` itself, so that only
        // the characters preceding it contribute to the position.
        if i >= pos {
            break;
        }
        if c == '\n' {
            line += 1;
            col = 1;
        } else {
            col += 1;
        }
    }
    (line, col)
}
/// Checks that `indent` is an acceptable indentation string.
///
/// Accepted values are exactly one tab character, or any number of spaces
/// (including none). Everything else yields `Err(())`.
pub(crate) fn validate_indent(indent: &str) -> Result<(), ()> {
    let single_tab = indent == "\t";
    let only_spaces = indent.chars().all(|ch| ch == ' ');
    if single_tab || only_spaces {
        Ok(())
    } else {
        Err(())
    }
}