use super::error::XPathError;
pub fn normalize_space(value: &str) -> String {
let mut result = String::with_capacity(value.len());
let mut prev_was_space = true;
for ch in value.chars() {
if is_xml_whitespace(ch) {
if !prev_was_space {
result.push(' ');
prev_was_space = true;
}
} else {
result.push(ch);
prev_was_space = false;
}
}
if result.ends_with(' ') {
result.pop();
}
result
}
/// Returns `true` when `ch` is one of the four XML whitespace characters:
/// space, tab, line feed or carriage return.
#[inline]
pub fn is_xml_whitespace(ch: char) -> bool {
    [' ', '\t', '\n', '\r'].contains(&ch)
}
/// Returns `true` when every byte of `s` is XML whitespace (space, tab, line
/// feed or carriage return). An empty string yields `true`.
#[inline]
pub fn is_xml_whitespace_str(s: &str) -> bool {
    // All four whitespace characters are single-byte ASCII, so a byte-level
    // scan is sufficient.
    s.as_bytes().iter().all(|byte| b" \t\n\r".contains(byte))
}
/// Expands entity references and normalizes whitespace in a raw string value.
///
/// * `&name;` and `&#...;` references are replaced by the character returned
///   from `resolve_entity`.
/// * When `is_attr` is `true`, each tab, line feed and carriage return is
///   replaced by a single space.
/// * Otherwise `\r\n` and a lone `\r` are both rewritten to `\n`.
///
/// # Errors
///
/// When `raise_on_error` is `true`, a syntax error is returned for a reference
/// that is not terminated by `;` or that cannot be resolved. When it is
/// `false`, the offending text is copied to the output unchanged instead.
pub fn normalize_string_value(
    value: &str,
    is_attr: bool,
    raise_on_error: bool,
) -> Result<String, XPathError> {
    let mut result = String::with_capacity(value.len());
    let mut chars = value.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            '&' => {
                // Collect everything up to the terminating ';'.
                let mut entity = String::new();
                let mut terminated = false;
                for c in chars.by_ref() {
                    if c == ';' {
                        terminated = true;
                        break;
                    }
                    entity.push(c);
                }

                if !terminated {
                    if raise_on_error {
                        return Err(XPathError::syntax_error(
                            "Entity reference not terminated by semicolon",
                        ));
                    }
                    // Lenient mode: keep the literal text exactly once. The
                    // input is exhausted here, so the reference must not also
                    // be passed to the resolver below.
                    result.push('&');
                    result.push_str(&entity);
                    break;
                }

                match resolve_entity(&entity) {
                    Some(resolved) => result.push(resolved),
                    None => {
                        if raise_on_error {
                            return Err(XPathError::syntax_error(format!(
                                "Unknown entity reference '&{};'",
                                entity
                            )));
                        }
                        // Lenient mode: re-emit the unresolved reference verbatim.
                        result.push('&');
                        result.push_str(&entity);
                        result.push(';');
                    }
                }
            }
            '\t' | '\n' | '\r' if is_attr => result.push(' '),
            '\r' => {
                // Fold "\r\n" and a lone "\r" into a single "\n".
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
                result.push('\n');
            }
            other => result.push(other),
        }
    }

    Ok(result)
}
/// Resolves the name of an entity reference (the text between `&` and `;`).
///
/// Names starting with `#` are handed to `resolve_numeric_entity`; otherwise
/// only the five predefined XML entities (`lt`, `gt`, `amp`, `quot`, `apos`)
/// are recognised. Returns `None` for anything else.
fn resolve_entity(entity: &str) -> Option<char> {
    if let Some(numeric) = entity.strip_prefix('#') {
        return resolve_numeric_entity(numeric);
    }

    let resolved = match entity {
        "lt" => '<',
        "gt" => '>',
        "amp" => '&',
        "quot" => '"',
        "apos" => '\'',
        _ => return None,
    };
    Some(resolved)
}
/// Resolves the body of a numeric character reference (the text after `&#`).
///
/// A leading `x` selects hexadecimal; otherwise the body is read as decimal.
/// Returns `None` when the body is empty, contains anything other than digits
/// of the selected radix, does not fit in a `u32`, or is not a valid Unicode
/// scalar value.
fn resolve_numeric_entity(entity: &str) -> Option<char> {
    let (digits, radix) = match entity.strip_prefix('x') {
        Some(hex) => (hex, 16),
        None => (entity, 10),
    };

    // The integer parsers accept a leading '+', which is not part of a
    // character reference, so validate the digits explicitly first.
    if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
        return None;
    }

    u32::from_str_radix(digits, radix)
        .ok()
        .and_then(char::from_u32)
}
/// Concatenates all `values` in order into one newly allocated string.
pub fn concat(values: &[&str]) -> String {
    let total_len: usize = values.iter().map(|piece| piece.len()).sum();
    let mut joined = String::with_capacity(total_len);
    for piece in values {
        joined.push_str(piece);
    }
    joined
}
/// Returns `true` when `value` begins with `prefix`.
///
/// An empty `prefix` matches every string.
pub fn starts_with(value: &str, prefix: &str) -> bool {
    value.strip_prefix(prefix).is_some()
}
/// Returns `true` when `value` finishes with `suffix`.
///
/// An empty `suffix` matches every string.
pub fn ends_with(value: &str, suffix: &str) -> bool {
    value.strip_suffix(suffix).is_some()
}
/// Returns `true` when `substring` occurs anywhere inside `value`.
///
/// An empty `substring` is found in every string.
pub fn contains(value: &str, substring: &str) -> bool {
    value.find(substring).is_some()
}
/// Returns the part of `value` that precedes the first occurrence of
/// `pattern`.
///
/// Yields an empty string when `pattern` does not occur, and also when
/// `pattern` is empty (it matches at offset zero).
pub fn substring_before(value: &str, pattern: &str) -> String {
    value
        .split_once(pattern)
        .map(|(head, _tail)| head.to_owned())
        .unwrap_or_default()
}
/// Returns the part of `value` that follows the first occurrence of
/// `pattern`.
///
/// Yields an empty string when `pattern` does not occur. An empty `pattern`
/// matches at offset zero, so the whole of `value` is returned.
pub fn substring_after(value: &str, pattern: &str) -> String {
    value
        .split_once(pattern)
        .map(|(_head, tail)| tail.to_owned())
        .unwrap_or_default()
}
/// Returns the length of `value` measured in Unicode scalar values (`char`s),
/// not in bytes.
pub fn string_length(value: &str) -> usize {
    value.chars().map(|_| 1usize).sum()
}
/// Returns the part of `value` selected by XPath `substring` semantics.
///
/// Positions are 1-based and counted in Unicode scalar values. The character
/// at position `p` is kept when `round(start) <= p` and, if `length` is
/// given, `p < round(start) + round(length)`.
///
/// Rounding follows the XPath `round` rule (ties go toward positive
/// infinity). The bounds are evaluated in `f64`, so infinite or very large
/// arguments cannot overflow an integer. Any NaN bound — a NaN argument, or
/// `-inf + inf` — selects nothing and yields an empty string.
pub fn substring(value: &str, start: f64, length: Option<f64>) -> String {
    /// XPath `round`: nearest integer, ties toward positive infinity.
    fn xpath_round(x: f64) -> f64 {
        let floor = x.floor();
        if x - floor >= 0.5 {
            floor + 1.0
        } else {
            floor
        }
    }

    let first = xpath_round(start);
    let end = match length {
        Some(len) => first + xpath_round(len),
        None => f64::INFINITY,
    };

    value
        .chars()
        .enumerate()
        .filter(|&(idx, _)| {
            // Comparisons against NaN are false, so a NaN bound keeps nothing.
            let pos = (idx + 1) as f64;
            pos >= first && pos < end
        })
        .map(|(_, ch)| ch)
        .collect()
}
/// Returns `value` with every character mapped to its uppercase form.
///
/// A single character may expand to several (for example `ß` becomes `SS`).
pub fn upper_case(value: &str) -> String {
    value.chars().flat_map(char::to_uppercase).collect()
}
/// Returns the lowercase equivalent of `value`, as produced by the standard
/// library's `str::to_lowercase`.
pub fn lower_case(value: &str) -> String {
    str::to_lowercase(value)
}
/// Replaces or removes individual characters of `value`.
///
/// Each character of `value` is looked up in `map_from`, where the first
/// occurrence wins:
/// * not found — the character is kept unchanged;
/// * found at an index that also exists in `map_to` — it is replaced by the
///   character at that index;
/// * found at an index beyond the end of `map_to` — it is dropped.
pub fn translate(value: &str, map_from: &str, map_to: &str) -> String {
    let sources: Vec<char> = map_from.chars().collect();
    let targets: Vec<char> = map_to.chars().collect();

    let mut output = String::new();
    for ch in value.chars() {
        match sources.iter().position(|&candidate| candidate == ch) {
            None => output.push(ch),
            Some(idx) => {
                // No counterpart in `map_to` means the character is removed.
                if let Some(&replacement) = targets.get(idx) {
                    output.push(replacement);
                }
            }
        }
    }
    output
}
/// Returns the Unicode code point of every character in `value`, in order.
pub fn string_to_codepoints(value: &str) -> Vec<u32> {
    value.chars().map(u32::from).collect()
}
/// Builds a string from a sequence of Unicode code points.
///
/// Returns `None` as soon as a value is not a valid Unicode scalar value
/// (for example a surrogate or a number above `0x10FFFF`).
pub fn codepoints_to_string(codepoints: &[u32]) -> Option<String> {
    let mut text = String::new();
    for &code in codepoints {
        text.push(char::from_u32(code)?);
    }
    Some(text)
}
/// Three-way comparison of two strings using `str`'s standard ordering.
///
/// Returns `-1` when `a` sorts before `b`, `0` when they are equal and `1`
/// when `a` sorts after `b`.
pub fn compare(a: &str, b: &str) -> i32 {
    // `Ordering` is defined with the discriminants -1, 0 and 1.
    a.cmp(b) as i32
}
/// Joins `values` into one string, inserting `separator` between consecutive
/// items. An empty slice produces an empty string.
pub fn string_join(values: &[&str], separator: &str) -> String {
    let mut joined = String::new();
    for (idx, item) in values.iter().enumerate() {
        if idx > 0 {
            joined.push_str(separator);
        }
        joined.push_str(item);
    }
    joined
}
/// The Unicode normalization forms that can be requested by name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnicodeNormalizationForm {
    /// Canonical composition.
    NFC,
    /// Canonical decomposition.
    NFD,
    /// Compatibility composition.
    NFKC,
    /// Compatibility decomposition.
    NFKD,
}

impl UnicodeNormalizationForm {
    /// Parses a normalization form name.
    ///
    /// Surrounding whitespace is ignored and the comparison is ASCII
    /// case-insensitive. Returns `None` for an empty (or whitespace-only)
    /// string and for any unrecognised name.
    pub fn parse(s: &str) -> Option<Self> {
        let name = s.trim();
        let known = [
            ("NFC", Self::NFC),
            ("NFD", Self::NFD),
            ("NFKC", Self::NFKC),
            ("NFKD", Self::NFKD),
        ];
        known
            .iter()
            .find(|&&(label, _)| name.eq_ignore_ascii_case(label))
            .map(|&(_, form)| form)
    }
}
/// Applies the requested Unicode normalization form to `value`.
///
/// With `form == None` the input is returned unchanged (as a new `String`).
/// This variant is compiled only when the `unicode-normalization` feature is
/// enabled.
#[cfg(feature = "unicode-normalization")]
pub fn normalize_unicode(value: &str, form: Option<UnicodeNormalizationForm>) -> String {
    use unicode_normalization::UnicodeNormalization;

    let selected = match form {
        Some(selected) => selected,
        None => return value.to_owned(),
    };

    match selected {
        UnicodeNormalizationForm::NFC => value.nfc().collect(),
        UnicodeNormalizationForm::NFD => value.nfd().collect(),
        UnicodeNormalizationForm::NFKC => value.nfkc().collect(),
        UnicodeNormalizationForm::NFKD => value.nfkd().collect(),
    }
}
#[cfg(not(feature = "unicode-normalization"))]
pub fn normalize_unicode(
value: &str,
form: Option<UnicodeNormalizationForm>,
) -> Result<String, super::error::XPathError> {
match form {
None => Ok(value.to_string()),
Some(f) => Err(super::error::XPathError::not_implemented(format!(
"Unicode normalization form {:?} requires unicode-normalization feature",
f
))),
}
}
/// Percent-encodes `value` so it can be used as a URI path segment.
///
/// ASCII letters, ASCII digits and `-`, `_`, `.`, `~` are copied as-is. Every
/// other byte of the UTF-8 encoding is written as `%XX` with uppercase
/// hexadecimal digits.
pub fn encode_for_uri(value: &str) -> String {
    let mut encoded = String::with_capacity(value.len() * 3);
    for byte in value.bytes() {
        match byte {
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => {
                encoded.push(char::from(byte));
            }
            _ => {
                encoded.push('%');
                encoded.push(to_hex_digit(byte >> 4));
                encoded.push(to_hex_digit(byte & 0x0F));
            }
        }
    }
    encoded
}
/// Converts an IRI to a URI by percent-encoding the bytes a URI may not carry.
///
/// Bytes in the printable ASCII range `0x20..=0x7E` are copied unchanged,
/// except for space and `<`, `>`, `"`, `{`, `}`, `|`, `\`, `^`, `` ` ``. Those
/// characters, and every byte outside the printable range, are written as
/// `%XX` with uppercase hexadecimal digits.
pub fn iri_to_uri(value: &str) -> String {
    // Printable ASCII characters that must nevertheless be escaped.
    const ESCAPED: &[u8] = b" <>\"{}|\\^`";

    let mut converted = String::with_capacity(value.len() * 3);
    for byte in value.bytes() {
        let keep = match byte {
            0x20..=0x7E => !ESCAPED.contains(&byte),
            _ => false,
        };
        if keep {
            converted.push(char::from(byte));
        } else {
            converted.push('%');
            converted.push(to_hex_digit(byte >> 4));
            converted.push(to_hex_digit(byte & 0x0F));
        }
    }
    converted
}
/// Escapes a URI for use in HTML.
///
/// Printable ASCII bytes (`0x20..=0x7E`) are copied unchanged. Every other
/// byte of the UTF-8 encoding is written as `%XX` with uppercase hexadecimal
/// digits.
pub fn escape_html_uri(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len() * 3);
    for byte in value.bytes() {
        match byte {
            0x20..=0x7E => escaped.push(char::from(byte)),
            _ => {
                escaped.push('%');
                escaped.push(to_hex_digit(byte >> 4));
                escaped.push(to_hex_digit(byte & 0x0F));
            }
        }
    }
    escaped
}
/// Maps a nibble to its uppercase hexadecimal digit: `0..=9` to `'0'..='9'`
/// and `10..=15` to `'A'..='F'`.
#[inline]
fn to_hex_digit(nibble: u8) -> char {
    match nibble {
        0..=9 => char::from(b'0' + nibble),
        _ => char::from(b'A' + nibble - 10),
    }
}
/// Returns `true` when `a` and `b` consist of exactly the same sequence of
/// code points. No normalization or case folding is applied.
pub fn codepoint_equal(a: &str, b: &str) -> bool {
    // Two valid UTF-8 strings have equal code points iff their bytes match.
    a.as_bytes() == b.as_bytes()
}
/// Unit tests for the string helpers in this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_space() {
        assert_eq!(normalize_space("  hello   world  "), "hello world");
        assert_eq!(normalize_space("\t\nhello\r\nworld\t"), "hello world");
        assert_eq!(normalize_space(""), "");
        assert_eq!(normalize_space("   "), "");
        assert_eq!(normalize_space("no extra spaces"), "no extra spaces");
    }

    #[test]
    fn test_is_xml_whitespace() {
        assert!(is_xml_whitespace(' '));
        assert!(is_xml_whitespace('\t'));
        assert!(is_xml_whitespace('\n'));
        assert!(is_xml_whitespace('\r'));
        assert!(!is_xml_whitespace('a'));
    }

    #[test]
    fn test_is_xml_whitespace_str() {
        assert!(is_xml_whitespace_str(""));
        assert!(is_xml_whitespace_str("   "));
        assert!(is_xml_whitespace_str(" \t\n\r"));
        assert!(!is_xml_whitespace_str("hello"));
        assert!(!is_xml_whitespace_str(" a "));
    }

    #[test]
    fn test_normalize_string_value_entities() {
        // The input must carry the escaped references; the expected value is
        // their expansion.
        assert_eq!(
            normalize_string_value("&lt;&gt;&amp;&quot;&apos;", false, true).unwrap(),
            "<>&\"'"
        );
    }

    #[test]
    fn test_normalize_string_value_numeric_entities() {
        // One decimal and one hexadecimal character reference.
        assert_eq!(
            normalize_string_value("&#65;&#x42;", false, true).unwrap(),
            "AB"
        );
    }

    #[test]
    fn test_normalize_string_value_attr() {
        assert_eq!(
            normalize_string_value("a\tb\nc", true, true).unwrap(),
            "a b c"
        );
    }

    #[test]
    fn test_normalize_string_value_newlines() {
        assert_eq!(
            normalize_string_value("a\r\nb\rc\n", false, true).unwrap(),
            "a\nb\nc\n"
        );
    }

    #[test]
    fn test_concat() {
        assert_eq!(concat(&["a", "b", "c"]), "abc");
        assert_eq!(concat(&[]), "");
    }

    #[test]
    fn test_starts_ends_with() {
        assert!(starts_with("hello", "he"));
        assert!(!starts_with("hello", "lo"));
        assert!(ends_with("hello", "lo"));
        assert!(!ends_with("hello", "he"));
    }

    #[test]
    fn test_substring_before_after() {
        assert_eq!(substring_before("hello world", " "), "hello");
        assert_eq!(substring_after("hello world", " "), "world");
        assert_eq!(substring_before("hello", " "), "");
        assert_eq!(substring_after("hello", " "), "");
    }

    #[test]
    fn test_string_length() {
        assert_eq!(string_length("hello"), 5);
        assert_eq!(string_length(""), 0);
        // Length is counted in characters, not bytes.
        assert_eq!(string_length("日本語"), 3);
    }

    #[test]
    fn test_substring() {
        assert_eq!(substring("hello", 2.0, Some(3.0)), "ell");
        assert_eq!(substring("hello", 2.0, None), "ello");
        assert_eq!(substring("hello", 1.0, Some(5.0)), "hello");
        assert_eq!(substring("hello", 0.0, Some(3.0)), "he");
    }

    #[test]
    fn test_case_conversion() {
        assert_eq!(upper_case("Hello World"), "HELLO WORLD");
        assert_eq!(lower_case("Hello World"), "hello world");
    }

    #[test]
    fn test_translate() {
        assert_eq!(translate("bar", "abc", "ABC"), "BAr");
        assert_eq!(translate("--aaa--", "abc-", "ABC"), "AAA");
    }

    #[test]
    fn test_codepoints() {
        assert_eq!(string_to_codepoints("ABC"), vec![65, 66, 67]);
        assert_eq!(codepoints_to_string(&[65, 66, 67]).unwrap(), "ABC");
    }

    #[test]
    fn test_compare() {
        assert_eq!(compare("abc", "abd"), -1);
        assert_eq!(compare("abc", "abc"), 0);
        assert_eq!(compare("abd", "abc"), 1);
    }
}