Skip to main content

xsd_schema/xpath/
string_ops.rs

1//! String operations for XPath evaluation.
2//!
3//! This module implements XPath 2.0 string functions and normalization:
4//! - `normalize-space`
5//! - `normalize-unicode`
6//! - Entity reference handling
7//! - Whitespace normalization
8
9use super::error::XPathError;
10
11/// Normalize whitespace in a string (XPath fn:normalize-space).
12///
13/// - Strips leading and trailing whitespace
14/// - Replaces sequences of whitespace with a single space
15///
16/// # Arguments
17///
18/// * `value` - The string to normalize
19///
20/// # Returns
21///
22/// The normalized string.
23pub fn normalize_space(value: &str) -> String {
24    let mut result = String::with_capacity(value.len());
25    let mut prev_was_space = true; // Start true to skip leading spaces
26
27    for ch in value.chars() {
28        if is_xml_whitespace(ch) {
29            if !prev_was_space {
30                result.push(' ');
31                prev_was_space = true;
32            }
33        } else {
34            result.push(ch);
35            prev_was_space = false;
36        }
37    }
38
39    // Remove trailing space if present
40    if result.ends_with(' ') {
41        result.pop();
42    }
43
44    result
45}
46
/// Report whether `ch` is one of the four XML whitespace characters.
///
/// Those are space (0x20), tab (0x09), newline (0x0A) and carriage return (0x0D);
/// any other character, including other Unicode whitespace, yields `false`.
#[inline]
pub fn is_xml_whitespace(ch: char) -> bool {
    ['\u{20}', '\u{09}', '\u{0A}', '\u{0D}'].contains(&ch)
}
54
/// Report whether every byte of `s` is an XML whitespace character
/// (space, tab, newline or carriage return).
///
/// The empty string has no offending byte, so it yields `true`.
#[inline]
pub fn is_xml_whitespace_str(s: &str) -> bool {
    // All four whitespace characters are single-byte ASCII, so a byte-level
    // scan is sufficient; any multi-byte character fails the check.
    s.as_bytes().iter().all(|byte| b" \t\n\r".contains(byte))
}
62
63/// Normalize a string value with entity reference handling.
64///
65/// Handles standard XML entity references:
66/// - `&lt;` -> `<`
67/// - `&gt;` -> `>`
68/// - `&amp;` -> `&`
69/// - `&quot;` -> `"`
70/// - `&apos;` -> `'`
71/// - `&#xNN;` -> character by hex code
72/// - `&#NN;` -> character by decimal code
73///
74/// # Arguments
75///
76/// * `value` - The string to normalize
77/// * `is_attr` - Whether this is an attribute value (applies additional normalization)
78/// * `raise_on_error` - Whether to raise an error on invalid entity references
79///
80/// # Returns
81///
82/// The normalized string, or an error for invalid entity references.
83pub fn normalize_string_value(
84    value: &str,
85    is_attr: bool,
86    raise_on_error: bool,
87) -> Result<String, XPathError> {
88    let mut result = String::with_capacity(value.len());
89    let mut chars = value.chars().peekable();
90
91    while let Some(ch) = chars.next() {
92        if ch == '&' {
93            // Parse entity reference
94            let mut entity = String::new();
95
96            loop {
97                match chars.next() {
98                    Some(';') => break,
99                    Some(c) => entity.push(c),
100                    None => {
101                        if raise_on_error {
102                            return Err(XPathError::syntax_error(
103                                "Entity reference not terminated by semicolon",
104                            ));
105                        }
106                        result.push('&');
107                        result.push_str(&entity);
108                        break;
109                    }
110                }
111            }
112
113            match resolve_entity(&entity) {
114                Some(resolved) => result.push(resolved),
115                None => {
116                    if raise_on_error {
117                        return Err(XPathError::syntax_error(format!(
118                            "Unknown entity reference '&{};'",
119                            entity
120                        )));
121                    }
122                    result.push('&');
123                    result.push_str(&entity);
124                    result.push(';');
125                }
126            }
127        } else if is_attr && (ch == '\t' || ch == '\n' || ch == '\r') {
128            // In attribute values, normalize newlines and tabs to space
129            result.push(' ');
130        } else if ch == '\r' {
131            // Normalize \r\n to \n, and standalone \r to \n
132            if chars.peek() == Some(&'\n') {
133                chars.next();
134            }
135            result.push('\n');
136        } else {
137            result.push(ch);
138        }
139    }
140
141    Ok(result)
142}
143
144/// Resolve an entity reference name to its character.
145fn resolve_entity(entity: &str) -> Option<char> {
146    match entity {
147        "lt" => Some('<'),
148        "gt" => Some('>'),
149        "amp" => Some('&'),
150        "quot" => Some('"'),
151        "apos" => Some('\''),
152        _ if entity.starts_with('#') => resolve_numeric_entity(&entity[1..]),
153        _ => None,
154    }
155}
156
/// Resolve a numeric entity reference (decimal or hex).
///
/// `entity` is the reference text with the leading `&#` and trailing `;`
/// already removed: either decimal digits (`65`) or `x` followed by hex
/// digits (`x41`).
///
/// Returns `None` when the text is empty, contains anything other than digits
/// of the selected radix (including a `+` or `-` sign, which the integer
/// parsers would otherwise accept or report differently), overflows `u32`, or
/// does not denote a Unicode scalar value (for example a surrogate code point).
fn resolve_numeric_entity(entity: &str) -> Option<char> {
    let (digits, radix) = match entity.strip_prefix('x') {
        Some(hex) => (hex, 16),
        None => (entity, 10),
    };

    // `from_str_radix` tolerates a leading `+`, which is not valid in a
    // character reference, so insist on digits only before parsing.
    if !digits.chars().all(|c| c.is_digit(radix)) {
        return None;
    }

    let code = u32::from_str_radix(digits, radix).ok()?;
    char::from_u32(code)
}
167
/// Concatenate all of `values`, in order, into one new string.
///
/// An empty slice yields the empty string.
pub fn concat(values: &[&str]) -> String {
    let total_len = values.iter().map(|part| part.len()).sum();
    let mut joined = String::with_capacity(total_len);
    for part in values {
        joined.push_str(part);
    }
    joined
}
172
/// Report whether `value` begins with `prefix`.
///
/// The empty prefix matches every string.
pub fn starts_with(value: &str, prefix: &str) -> bool {
    value.strip_prefix(prefix).is_some()
}
177
/// Report whether `value` finishes with `suffix`.
///
/// The empty suffix matches every string.
pub fn ends_with(value: &str, suffix: &str) -> bool {
    value.strip_suffix(suffix).is_some()
}
182
/// Report whether `substring` occurs anywhere inside `value`.
///
/// The empty substring is found in every string.
pub fn contains(value: &str, substring: &str) -> bool {
    value.find(substring).is_some()
}
187
/// Return the part of `value` that precedes the first occurrence of `pattern`.
///
/// Yields the empty string when `pattern` does not occur in `value` (and also
/// when `pattern` is empty, since it then matches at the very start).
pub fn substring_before(value: &str, pattern: &str) -> String {
    value
        .find(pattern)
        .map(|match_start| value[..match_start].to_owned())
        .unwrap_or_default()
}
195
/// Return the part of `value` that follows the first occurrence of `pattern`.
///
/// Yields the empty string when `pattern` does not occur in `value`. An empty
/// `pattern` matches at the very start, so the whole of `value` is returned.
pub fn substring_after(value: &str, pattern: &str) -> String {
    value.find(pattern).map_or_else(String::new, |match_start| {
        let tail_start = match_start + pattern.len();
        value[tail_start..].to_owned()
    })
}
203
/// Count the characters (Unicode scalar values) in `value`.
///
/// This is a character count, not a byte count: a multi-byte character
/// contributes one.
pub fn string_length(value: &str) -> usize {
    value.chars().map(|_| 1usize).sum()
}
208
/// Get a substring (XPath-style 1-based indexing).
///
/// XPath spec: returns the characters whose position `p` satisfies
/// `round(start) <= p < round(start) + round(length)`, or just
/// `round(start) <= p` when `length` is omitted.
///
/// `round` here is XPath `fn:round`, which resolves ties towards positive
/// infinity (`round(-1.5)` is `-1`); this differs from `f64::round`, which
/// rounds ties away from zero. All position arithmetic is carried out in
/// `f64`, so huge or infinite arguments cannot overflow, and `-INF + INF`
/// produces NaN, which selects no characters.
///
/// # Arguments
///
/// * `value` - The source string
/// * `start` - Start position (1-based, can be negative or fractional)
/// * `length` - Optional length
///
/// # Returns
///
/// The selected characters as a new string; empty when `start` or `length`
/// is NaN or when the selected range contains no position of `value`.
pub fn substring(value: &str, start: f64, length: Option<f64>) -> String {
    // XPath fn:round: nearest integer, ties towards positive infinity.
    // Infinities pass through unchanged (the NaN difference fails the test).
    fn xpath_round(x: f64) -> f64 {
        let floor = x.floor();
        if x - floor >= 0.5 {
            floor + 1.0
        } else {
            floor
        }
    }

    // `f64::max` below would silently discard a NaN start, so reject it first.
    if start.is_nan() {
        return String::new();
    }

    let first = xpath_round(start);
    // Exclusive end position (1-based); unbounded when no length is given.
    let end = match length {
        Some(len) => first + xpath_round(len),
        None => f64::INFINITY,
    };

    // Positions below 1 do not exist; they only shorten the selected range.
    let begin = first.max(1.0);
    if end.is_nan() || end <= begin {
        return String::new();
    }

    // Float-to-integer `as` casts saturate, so infinite or huge values turn
    // into `usize::MAX` and simply run off the end of the character iterator.
    let skip_count = (begin - 1.0) as usize;
    let take_count = (end - begin) as usize;

    value.chars().skip(skip_count).take(take_count).collect()
}
269
/// Return an uppercase copy of `value`.
///
/// Delegates to the standard library's `str::to_uppercase`, so the result can
/// be longer than the input (for example `ß` becomes `SS`).
pub fn upper_case(value: &str) -> String {
    str::to_uppercase(value)
}
274
/// Return a lowercase copy of `value`.
///
/// Delegates to the standard library's `str::to_lowercase`.
pub fn lower_case(value: &str) -> String {
    str::to_lowercase(value)
}
279
/// Replace or delete individual characters of a string.
///
/// Implements XPath `translate(string, map-from, map-to)`:
/// - a character of `value` that does not occur in `map_from` is copied as-is;
/// - a character found at position `i` of `map_from` is replaced by the
///   character at position `i` of `map_to`;
/// - if `map_to` has no character at that position, the character is removed.
///
/// When a character occurs more than once in `map_from`, its first occurrence
/// decides the outcome.
pub fn translate(value: &str, map_from: &str, map_to: &str) -> String {
    let sources: Vec<char> = map_from.chars().collect();
    let targets: Vec<char> = map_to.chars().collect();

    let mut translated = String::with_capacity(value.len());
    for ch in value.chars() {
        match sources.iter().position(|&candidate| candidate == ch) {
            // Not mentioned in the map: keep the character.
            None => translated.push(ch),
            // Mentioned: substitute if a counterpart exists, otherwise drop it.
            Some(index) => {
                if let Some(&replacement) = targets.get(index) {
                    translated.push(replacement);
                }
            }
        }
    }
    translated
}
303
/// List the Unicode code point of each character of `value`, in order.
///
/// The empty string yields an empty vector.
pub fn string_to_codepoints(value: &str) -> Vec<u32> {
    value.chars().map(u32::from).collect()
}
308
/// Build a string from a sequence of Unicode code points.
///
/// Returns `None` as soon as a value is not a valid Unicode scalar value
/// (a surrogate code point or a value above `0x10FFFF`). An empty slice
/// yields `Some` of the empty string.
pub fn codepoints_to_string(codepoints: &[u32]) -> Option<String> {
    let mut text = String::with_capacity(codepoints.len());
    for &codepoint in codepoints {
        text.push(char::from_u32(codepoint)?);
    }
    Some(text)
}
316
/// Order two strings using the standard `str` ordering.
///
/// Returns `-1` when `a` sorts before `b`, `0` when they are equal and `1`
/// when `a` sorts after `b`.
pub fn compare(a: &str, b: &str) -> i32 {
    // `Ordering` is represented as Less = -1, Equal = 0, Greater = 1, so the
    // numeric cast yields exactly the three documented values.
    a.cmp(b) as i32
}
327
/// Concatenate `values`, inserting `separator` between neighbouring items.
///
/// Implements XPath `fn:string-join($strings, $separator)`. An empty slice
/// yields the empty string; a single item is returned without any separator.
pub fn string_join(values: &[&str], separator: &str) -> String {
    let mut pieces = values.iter();

    let mut joined = match pieces.next() {
        Some(first) => String::from(*first),
        None => return String::new(),
    };

    for piece in pieces {
        joined.push_str(separator);
        joined.push_str(piece);
    }
    joined
}
334
/// The Unicode normalization forms that can be requested by name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnicodeNormalizationForm {
    /// NFC: canonical decomposition followed by canonical composition.
    NFC,
    /// NFD: canonical decomposition.
    NFD,
    /// NFKC: compatibility decomposition followed by canonical composition.
    NFKC,
    /// NFKD: compatibility decomposition.
    NFKD,
}

impl UnicodeNormalizationForm {
    /// Look up a normalization form by name.
    ///
    /// Surrounding whitespace is trimmed and ASCII letter case is ignored.
    ///
    /// Returns `None` both for the empty (or all-whitespace) string, which
    /// means "no normalization", and for any unrecognised name; callers that
    /// need to tell those two cases apart must inspect the input themselves.
    pub fn parse(s: &str) -> Option<Self> {
        const NAMED_FORMS: [(&str, UnicodeNormalizationForm); 4] = [
            ("NFC", UnicodeNormalizationForm::NFC),
            ("NFD", UnicodeNormalizationForm::NFD),
            ("NFKC", UnicodeNormalizationForm::NFKC),
            ("NFKD", UnicodeNormalizationForm::NFKD),
        ];

        let wanted = s.trim();
        NAMED_FORMS
            .iter()
            .find(|(name, _)| wanted.eq_ignore_ascii_case(name))
            .map(|&(_, form)| form)
    }
}
368
/// Apply a Unicode normalization form to a string.
///
/// Compiled only when the `unicode-normalization` feature is enabled; the
/// normalization itself is performed by the `unicode-normalization` crate.
/// When `form` is `None` (no normalization requested), a copy of `value` is
/// returned unchanged.
///
/// Note that the variant built without the feature returns a `Result`
/// instead of a plain `String`.
#[cfg(feature = "unicode-normalization")]
pub fn normalize_unicode(value: &str, form: Option<UnicodeNormalizationForm>) -> String {
    use unicode_normalization::UnicodeNormalization;

    let requested = match form {
        Some(requested) => requested,
        None => return value.to_string(),
    };

    match requested {
        UnicodeNormalizationForm::NFC => value.nfc().collect(),
        UnicodeNormalizationForm::NFD => value.nfd().collect(),
        UnicodeNormalizationForm::NFKC => value.nfkc().collect(),
        UnicodeNormalizationForm::NFKD => value.nfkd().collect(),
    }
}
385
386/// Normalize a string using Unicode normalization (fallback without feature).
387///
388/// Without the unicode-normalization feature, this only handles the no-op case.
389#[cfg(not(feature = "unicode-normalization"))]
390pub fn normalize_unicode(
391    value: &str,
392    form: Option<UnicodeNormalizationForm>,
393) -> Result<String, super::error::XPathError> {
394    match form {
395        None => Ok(value.to_string()),
396        Some(f) => Err(super::error::XPathError::not_implemented(format!(
397            "Unicode normalization form {:?} requires unicode-normalization feature",
398            f
399        ))),
400    }
401}
402
403/// Encode a string for use in a URI per RFC 3986.
404///
405/// Only alphanumeric characters and `-`, `_`, `.`, `~` are left unescaped.
406/// All other characters are percent-encoded using UTF-8.
407pub fn encode_for_uri(value: &str) -> String {
408    let mut result = String::with_capacity(value.len() * 3);
409    for byte in value.bytes() {
410        if byte.is_ascii_alphanumeric()
411            || byte == b'-'
412            || byte == b'_'
413            || byte == b'.'
414            || byte == b'~'
415        {
416            result.push(byte as char);
417        } else {
418            result.push('%');
419            result.push(to_hex_digit(byte >> 4));
420            result.push(to_hex_digit(byte & 0x0F));
421        }
422    }
423    result
424}
425
426/// Escape an IRI to produce a valid URI.
427///
428/// Less restrictive than encode-for-uri: allows most ASCII printable characters
429/// except space, `<`, `>`, `"`, `{`, `}`, `|`, `\`, `^`, and `` ` ``.
430pub fn iri_to_uri(value: &str) -> String {
431    let mut result = String::with_capacity(value.len() * 3);
432    for byte in value.bytes() {
433        // Space is always encoded
434        if byte == b' ' {
435            result.push_str("%20");
436        } else if (0x20..0x7F).contains(&byte)
437            && byte != b'<'
438            && byte != b'>'
439            && byte != b'"'
440            && byte != b'{'
441            && byte != b'}'
442            && byte != b'|'
443            && byte != b'\\'
444            && byte != b'^'
445            && byte != b'`'
446        {
447            result.push(byte as char);
448        } else {
449            result.push('%');
450            result.push(to_hex_digit(byte >> 4));
451            result.push(to_hex_digit(byte & 0x0F));
452        }
453    }
454    result
455}
456
457/// Escape a URI for use in HTML.
458///
459/// Escapes characters outside the ASCII printable range (0x20-0x7E).
460pub fn escape_html_uri(value: &str) -> String {
461    let mut result = String::with_capacity(value.len() * 3);
462    for byte in value.bytes() {
463        if (0x20..0x7F).contains(&byte) {
464            result.push(byte as char);
465        } else {
466            result.push('%');
467            result.push(to_hex_digit(byte >> 4));
468            result.push(to_hex_digit(byte & 0x0F));
469        }
470    }
471    result
472}
473
/// Convert a nibble (0-15) to its uppercase hexadecimal digit.
///
/// Values 0-9 map to `'0'`-`'9'` and values 10-15 map to `'A'`-`'F'`.
/// Callers are expected to pass a value in the range 0-15.
#[inline]
fn to_hex_digit(nibble: u8) -> char {
    match nibble {
        0..=9 => char::from(b'0' + nibble),
        _ => char::from(b'A' + nibble - 10),
    }
}
483
/// Test two strings for code-point-by-code-point equality.
///
/// This is a plain ordinal comparison: no collation, case folding or Unicode
/// normalization is applied. Returns `true` only when both strings hold
/// exactly the same sequence of characters.
pub fn codepoint_equal(a: &str, b: &str) -> bool {
    // Two valid UTF-8 strings have the same code points exactly when their
    // encoded bytes are the same.
    a.as_bytes() == b.as_bytes()
}
490
// Unit tests for the string helpers in this module. Each test pins the
// behaviour of one function (or a closely related pair) on small literal inputs.
#[cfg(test)]
mod tests {
    use super::*;

    // Leading/trailing whitespace is stripped and interior runs collapse to one space.
    #[test]
    fn test_normalize_space() {
        assert_eq!(normalize_space("  hello   world  "), "hello world");
        assert_eq!(normalize_space("\t\nhello\r\nworld\t"), "hello world");
        assert_eq!(normalize_space(""), "");
        assert_eq!(normalize_space("   "), "");
        assert_eq!(normalize_space("no extra spaces"), "no extra spaces");
    }

    // Space, tab, newline and carriage return are whitespace; a letter is not.
    #[test]
    fn test_is_xml_whitespace() {
        assert!(is_xml_whitespace(' '));
        assert!(is_xml_whitespace('\t'));
        assert!(is_xml_whitespace('\n'));
        assert!(is_xml_whitespace('\r'));
        assert!(!is_xml_whitespace('a'));
    }

    // The empty string counts as all-whitespace; any other character disqualifies.
    #[test]
    fn test_is_xml_whitespace_str() {
        assert!(is_xml_whitespace_str(""));
        assert!(is_xml_whitespace_str(" "));
        assert!(is_xml_whitespace_str(" \t\n\r"));
        assert!(!is_xml_whitespace_str("hello"));
        assert!(!is_xml_whitespace_str(" a "));
    }

    // The five predefined XML entities resolve to their characters.
    #[test]
    fn test_normalize_string_value_entities() {
        assert_eq!(
            normalize_string_value("&lt;&gt;&amp;&quot;&apos;", false, true).unwrap(),
            "<>&\"'"
        );
    }

    // Decimal (`&#65;`) and hexadecimal (`&#x42;`) character references.
    #[test]
    fn test_normalize_string_value_numeric_entities() {
        assert_eq!(
            normalize_string_value("&#65;&#x42;", false, true).unwrap(),
            "AB"
        );
    }

    // In attribute mode, tabs and newlines become spaces.
    #[test]
    fn test_normalize_string_value_attr() {
        assert_eq!(
            normalize_string_value("a\tb\nc", true, true).unwrap(),
            "a b c"
        );
    }

    // Outside attribute mode, `\r\n` and a lone `\r` both become `\n`.
    #[test]
    fn test_normalize_string_value_newlines() {
        assert_eq!(
            normalize_string_value("a\r\nb\rc\n", false, true).unwrap(),
            "a\nb\nc\n"
        );
    }

    // Concatenation, including the empty-slice case.
    #[test]
    fn test_concat() {
        assert_eq!(concat(&["a", "b", "c"]), "abc");
        assert_eq!(concat(&[]), "");
    }

    // Prefix and suffix checks, each with a positive and a negative case.
    #[test]
    fn test_starts_ends_with() {
        assert!(starts_with("hello", "he"));
        assert!(!starts_with("hello", "lo"));
        assert!(ends_with("hello", "lo"));
        assert!(!ends_with("hello", "he"));
    }

    // A missing pattern yields the empty string for both functions.
    #[test]
    fn test_substring_before_after() {
        assert_eq!(substring_before("hello world", " "), "hello");
        assert_eq!(substring_after("hello world", " "), "world");
        assert_eq!(substring_before("hello", " "), "");
        assert_eq!(substring_after("hello", " "), "");
    }

    // Length is counted in characters, not bytes.
    #[test]
    fn test_string_length() {
        assert_eq!(string_length("hello"), 5);
        assert_eq!(string_length(""), 0);
        assert_eq!(string_length("日本語"), 3); // Multi-byte chars
    }

    // 1-based positions; a start of 0 shortens the result by one character.
    #[test]
    fn test_substring() {
        assert_eq!(substring("hello", 2.0, Some(3.0)), "ell");
        assert_eq!(substring("hello", 2.0, None), "ello");
        assert_eq!(substring("hello", 1.0, Some(5.0)), "hello");
        assert_eq!(substring("hello", 0.0, Some(3.0)), "he");
    }

    // Upper- and lower-casing of a mixed-case ASCII string.
    #[test]
    fn test_case_conversion() {
        assert_eq!(upper_case("Hello World"), "HELLO WORLD");
        assert_eq!(lower_case("Hello World"), "hello world");
    }

    // Mapped characters are replaced; `-` has no counterpart in map-to and is removed.
    #[test]
    fn test_translate() {
        assert_eq!(translate("bar", "abc", "ABC"), "BAr");
        assert_eq!(translate("--aaa--", "abc-", "ABC"), "AAA");
    }

    // Conversion between strings and code point sequences in both directions.
    #[test]
    fn test_codepoints() {
        assert_eq!(string_to_codepoints("ABC"), vec![65, 66, 67]);
        assert_eq!(codepoints_to_string(&[65, 66, 67]).unwrap(), "ABC");
    }

    // Three-way comparison: less than, equal, greater than.
    #[test]
    fn test_compare() {
        assert_eq!(compare("abc", "abd"), -1);
        assert_eq!(compare("abc", "abc"), 0);
        assert_eq!(compare("abd", "abc"), 1);
    }
}