Skip to main content

xml_3dm/node/
xml_content.rs

//! XML content types for tree nodes.
//!
//! This module provides `XmlContent`, which represents the content of an XML node:
//! an element (tag with attributes), text, a comment, or a processing instruction.
5
6use crate::constants::{ATTR_INFO, ATTR_VALUE_THRESHOLD, ELEMENT_NAME_INFO, TEXT_THRESHOLD};
7use md5::{Digest, Md5};
8use std::collections::HashMap;
9
10use super::namespace::ExpandedName;
11
12/// Represents the content of an XML node.
13#[derive(Debug, Clone)]
14pub enum XmlContent {
15    /// An XML element with a qualified name and attributes.
16    Element(XmlElement),
17    /// XML text content.
18    Text(XmlText),
19    /// XML comment.
20    Comment(XmlComment),
21    /// XML processing instruction.
22    ProcessingInstruction(XmlProcessingInstruction),
23}
24
25impl XmlContent {
26    /// Returns the information size of this content.
27    ///
28    /// This metric is used for similarity calculations and copy detection.
29    pub fn info_size(&self) -> i32 {
30        match self {
31            XmlContent::Element(e) => e.info_size,
32            XmlContent::Text(t) => t.info_size,
33            XmlContent::Comment(c) => c.info_size,
34            XmlContent::ProcessingInstruction(pi) => pi.info_size,
35        }
36    }
37
38    /// Tests content equality using MD5 hash comparison.
39    pub fn content_equals(&self, other: &XmlContent) -> bool {
40        match (self, other) {
41            (XmlContent::Element(a), XmlContent::Element(b)) => a.content_equals(b),
42            (XmlContent::Text(a), XmlContent::Text(b)) => a.content_equals(b),
43            (XmlContent::Comment(a), XmlContent::Comment(b)) => a.content_equals(b),
44            (XmlContent::ProcessingInstruction(a), XmlContent::ProcessingInstruction(b)) => {
45                a.content_equals(b)
46            }
47            _ => false,
48        }
49    }
50
51    /// Returns a 32-bit hash code for this content.
52    ///
53    /// This is used for fast equality pre-checks and child list distance calculations.
54    pub fn content_hash(&self) -> i32 {
55        match self {
56            XmlContent::Element(e) => e.content_hash(),
57            XmlContent::Text(t) => t.content_hash(),
58            XmlContent::Comment(c) => c.content_hash(),
59            XmlContent::ProcessingInstruction(pi) => pi.content_hash(),
60        }
61    }
62
63    /// Returns true if this is an element node.
64    pub fn is_element(&self) -> bool {
65        matches!(self, XmlContent::Element(_))
66    }
67
68    /// Returns true if this is a text node.
69    pub fn is_text(&self) -> bool {
70        matches!(self, XmlContent::Text(_))
71    }
72
73    /// Returns true if this is a comment node.
74    pub fn is_comment(&self) -> bool {
75        matches!(self, XmlContent::Comment(_))
76    }
77
78    /// Returns true if this is a processing instruction node.
79    pub fn is_processing_instruction(&self) -> bool {
80        matches!(self, XmlContent::ProcessingInstruction(_))
81    }
82
83    /// Returns a reference to the element, if this is an element node.
84    pub fn as_element(&self) -> Option<&XmlElement> {
85        match self {
86            XmlContent::Element(e) => Some(e),
87            _ => None,
88        }
89    }
90
91    /// Returns a mutable reference to the element, if this is an element node.
92    pub fn as_element_mut(&mut self) -> Option<&mut XmlElement> {
93        match self {
94            XmlContent::Element(e) => Some(e),
95            _ => None,
96        }
97    }
98
99    /// Returns a reference to the text, if this is a text node.
100    pub fn as_text(&self) -> Option<&XmlText> {
101        match self {
102            XmlContent::Text(t) => Some(t),
103            _ => None,
104        }
105    }
106
107    /// Returns a mutable reference to the text, if this is a text node.
108    pub fn as_text_mut(&mut self) -> Option<&mut XmlText> {
109        match self {
110            XmlContent::Text(t) => Some(t),
111            _ => None,
112        }
113    }
114
115    /// Returns a reference to the PI, if this is a processing instruction node.
116    pub fn as_processing_instruction(&self) -> Option<&XmlProcessingInstruction> {
117        match self {
118            XmlContent::ProcessingInstruction(pi) => Some(pi),
119            _ => None,
120        }
121    }
122}
123
124/// Calculates MD5 hash of character data, matching Java's byte ordering.
125///
126/// Java's `MessageDigest.update()` is called with:
127/// - `(byte) (char & 0xff)` - low byte
128/// - `(byte) (char >> 8)` - high byte
129///
130/// This is little-endian byte order for each UTF-16 code unit.
131fn calculate_hash_chars(data: &[char]) -> [u8; 16] {
132    let mut hasher = Md5::new();
133    for &c in data {
134        let code = c as u16;
135        hasher.update([(code & 0xff) as u8, (code >> 8) as u8]);
136    }
137    hasher.finalize().into()
138}
139
140/// Calculates MD5 hash of a string, matching Java's byte ordering.
141fn calculate_hash_str(data: &str) -> [u8; 16] {
142    let mut hasher = Md5::new();
143    // Java strings are UTF-16, so we iterate over UTF-16 code units
144    for code in data.encode_utf16() {
145        hasher.update([(code & 0xff) as u8, (code >> 8) as u8]);
146    }
147    hasher.finalize().into()
148}
149
/// Converts the first 4 bytes of an MD5 hash to a 32-bit integer.
///
/// This matches Java's behavior where bytes are sign-extended when used
/// in arithmetic expressions. In Java:
/// ```java
/// hash[0] + (hash[1] << 8) + (hash[2] << 16) + (hash[3] << 24)
/// ```
/// Since Java bytes are signed, negative values get sign-extended.
///
/// Java `int` addition wraps silently on overflow, so the terms are combined
/// with `wrapping_add`. With plain `+`, an input whose fourth byte is `0x80`
/// and whose lower terms sum to a negative value would overflow `i32` and
/// panic in builds with overflow checks enabled.
fn hash_to_i32(hash: &[u8; 16]) -> i32 {
    // Reinterpret as a signed byte first (Java's `byte`), then widen to i32.
    let signed = |byte: u8| i32::from(byte as i8);
    signed(hash[0])
        .wrapping_add(signed(hash[1]) << 8)
        .wrapping_add(signed(hash[2]) << 16)
        .wrapping_add(signed(hash[3]) << 24)
}
166
167/// An XML element with a qualified name and attributes.
168#[derive(Debug, Clone)]
169pub struct XmlElement {
170    /// The qualified name of the element (e.g., "div", "ns:element").
171    name: String,
172    /// The expanded name (namespace URI + local name), if parsed with namespace awareness.
173    expanded_name: Option<ExpandedName>,
174    /// Namespace declarations on this element (prefix -> URI).
175    namespace_decls: HashMap<String, String>,
176    /// Attributes as key-value pairs. The key is the qualified attribute name.
177    attributes: HashMap<String, String>,
178    /// Cached hash code of the element name (matches Java's String.hashCode()).
179    name_hash: i32,
180    /// MD5 hash of the attributes.
181    attr_hash: [u8; 16],
182    /// Information size metric.
183    info_size: i32,
184}
185
186impl XmlElement {
187    /// Creates a new XML element with the given name and attributes.
188    pub fn new(name: String, attributes: HashMap<String, String>) -> Self {
189        Self::new_with_namespace(name, None, HashMap::new(), attributes)
190    }
191
192    /// Creates a new element with namespace information.
193    pub fn new_with_namespace(
194        name: String,
195        expanded_name: Option<ExpandedName>,
196        namespace_decls: HashMap<String, String>,
197        attributes: HashMap<String, String>,
198    ) -> Self {
199        let mut element = XmlElement {
200            name,
201            expanded_name,
202            namespace_decls,
203            attributes,
204            name_hash: 0,
205            attr_hash: [0; 16],
206            info_size: 0,
207        };
208        element.rehash();
209        element
210    }
211
212    /// Recalculates the hash values and info size.
213    ///
214    /// This should be called after modifying the name or attributes.
215    pub fn rehash(&mut self) {
216        self.name_hash = java_string_hash(&self.name);
217        self.info_size = ELEMENT_NAME_INFO;
218
219        let mut hasher = Md5::new();
220
221        // Sort attribute names for deterministic hashing
222        // Note: Java's AttributesImpl iteration order may differ, but for our
223        // purposes we need consistent ordering. The Java code iterates in
224        // insertion order, which we'll match by sorting.
225        let mut attr_names: Vec<&String> = self.attributes.keys().collect();
226        attr_names.sort();
227
228        for attr_name in attr_names {
229            let attr_value = &self.attributes[attr_name];
230            let vsize = attr_value.chars().count() as i32;
231            self.info_size += ATTR_INFO
232                + if vsize > ATTR_VALUE_THRESHOLD {
233                    vsize - ATTR_VALUE_THRESHOLD
234                } else {
235                    1
236                };
237            hasher.update(calculate_hash_str(attr_name));
238            hasher.update(calculate_hash_str(attr_value));
239        }
240
241        self.attr_hash = hasher.finalize().into();
242    }
243
244    /// Returns the qualified name of the element.
245    pub fn qname(&self) -> &str {
246        &self.name
247    }
248
249    /// Sets the qualified name of the element.
250    ///
251    /// Note: This does not automatically rehash. Call `rehash()` after
252    /// modifying if hash values are needed.
253    pub fn set_qname(&mut self, name: String) {
254        self.name = name;
255    }
256
257    /// Returns the attributes.
258    pub fn attributes(&self) -> &HashMap<String, String> {
259        &self.attributes
260    }
261
262    /// Returns a mutable reference to the attributes.
263    ///
264    /// Note: This does not automatically rehash. Call `rehash()` after
265    /// modifying if hash values are needed.
266    pub fn attributes_mut(&mut self) -> &mut HashMap<String, String> {
267        &mut self.attributes
268    }
269
270    /// Sets the attributes.
271    ///
272    /// Note: This does not automatically rehash. Call `rehash()` after
273    /// modifying if hash values are needed.
274    pub fn set_attributes(&mut self, attributes: HashMap<String, String>) {
275        self.attributes = attributes;
276    }
277
278    /// Returns the expanded name, if available.
279    pub fn expanded_name(&self) -> Option<&ExpandedName> {
280        self.expanded_name.as_ref()
281    }
282
283    /// Returns namespace declarations on this element.
284    pub fn namespace_decls(&self) -> &HashMap<String, String> {
285        &self.namespace_decls
286    }
287
288    /// Compares element names with namespace awareness.
289    /// Falls back to string comparison if neither has expanded names.
290    pub fn names_match(&self, other: &XmlElement) -> bool {
291        match (&self.expanded_name, &other.expanded_name) {
292            (Some(a), Some(b)) => a == b,
293            (None, None) => self.name == other.name,
294            _ => false,
295        }
296    }
297
298    /// Tests content equality using hash comparison.
299    ///
300    /// Note: This compares element name and attributes only, not namespace
301    /// declarations. Use `namespace_decls_equal` for namespace comparison.
302    pub fn content_equals(&self, other: &XmlElement) -> bool {
303        self.name_hash == other.name_hash && self.attr_hash == other.attr_hash
304    }
305
306    /// Tests whether namespace declarations are equal.
307    pub fn namespace_decls_equal(&self, other: &XmlElement) -> bool {
308        self.namespace_decls == other.namespace_decls
309    }
310
311    /// Returns a 32-bit hash code for this element.
312    pub fn content_hash(&self) -> i32 {
313        hash_to_i32(&self.attr_hash) ^ self.name_hash
314    }
315
316    /// Returns the information size.
317    pub fn info_size(&self) -> i32 {
318        self.info_size
319    }
320}
321
322impl std::fmt::Display for XmlElement {
323    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
324        write!(f, "{} {{", self.name)?;
325        let mut first = true;
326        // Sort for consistent output
327        let mut attr_names: Vec<&String> = self.attributes.keys().collect();
328        attr_names.sort();
329        for name in attr_names {
330            if !first {
331                write!(f, " ")?;
332            }
333            first = false;
334            write!(f, " {}={}", name, self.attributes[name])?;
335        }
336        write!(f, "}}")
337    }
338}
339
340/// XML text content.
341#[derive(Debug, Clone)]
342pub struct XmlText {
343    /// The text content as a character array (matching Java's char[]).
344    text: Vec<char>,
345    /// MD5 hash of the text content.
346    content_hash: [u8; 16],
347    /// Information size metric.
348    info_size: i32,
349}
350
351impl XmlText {
352    /// Creates a new text node from a string.
353    pub fn new(text: &str) -> Self {
354        let chars: Vec<char> = text.chars().collect();
355        Self::from_chars(chars)
356    }
357
358    /// Creates a new text node from a character array.
359    pub fn from_chars(text: Vec<char>) -> Self {
360        let content_hash = calculate_hash_chars(&text);
361        let len = text.len() as i32;
362        let info_size = if len > TEXT_THRESHOLD {
363            len - TEXT_THRESHOLD
364        } else {
365            1
366        };
367        XmlText {
368            text,
369            content_hash,
370            info_size,
371        }
372    }
373
374    /// Creates a new text node from a slice of characters.
375    pub fn from_char_slice(text: &[char], start: usize, length: usize) -> Self {
376        let chars: Vec<char> = text[start..start + length].to_vec();
377        Self::from_chars(chars)
378    }
379
380    /// Tests content equality using MD5 hash comparison.
381    pub fn content_equals(&self, other: &XmlText) -> bool {
382        self.content_hash == other.content_hash
383    }
384
385    /// Returns the text as a character slice.
386    pub fn text(&self) -> &[char] {
387        &self.text
388    }
389
390    /// Sets the text content.
391    ///
392    /// Note: This recalculates the hash and info size.
393    pub fn set_text(&mut self, text: Vec<char>) {
394        self.content_hash = calculate_hash_chars(&text);
395        let len = text.len() as i32;
396        self.info_size = if len > TEXT_THRESHOLD {
397            len - TEXT_THRESHOLD
398        } else {
399            1
400        };
401        self.text = text;
402    }
403
404    /// Returns a 32-bit hash code for this text node.
405    pub fn content_hash(&self) -> i32 {
406        hash_to_i32(&self.content_hash)
407    }
408
409    /// Returns the information size.
410    pub fn info_size(&self) -> i32 {
411        self.info_size
412    }
413}
414
415impl std::fmt::Display for XmlText {
416    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
417        let s: String = self.text.iter().collect();
418        write!(f, "{}", s)
419    }
420}
421
422/// XML comment content.
423#[derive(Debug, Clone)]
424pub struct XmlComment {
425    /// The comment text (without the <!-- and --> markers).
426    text: Vec<char>,
427    /// MD5 hash of the comment content.
428    content_hash: [u8; 16],
429    /// Information size metric (comments have minimal info size).
430    info_size: i32,
431}
432
433impl XmlComment {
434    /// Creates a new comment node from a string.
435    pub fn new(text: &str) -> Self {
436        let chars: Vec<char> = text.chars().collect();
437        Self::from_chars(chars)
438    }
439
440    /// Creates a new comment node from a character array.
441    pub fn from_chars(text: Vec<char>) -> Self {
442        let content_hash = calculate_hash_chars(&text);
443        // Comments have minimal info size (don't contribute much to structure)
444        let info_size = 1;
445        XmlComment {
446            text,
447            content_hash,
448            info_size,
449        }
450    }
451
452    /// Tests content equality using MD5 hash comparison.
453    pub fn content_equals(&self, other: &XmlComment) -> bool {
454        self.content_hash == other.content_hash
455    }
456
457    /// Returns the comment text as a character slice.
458    pub fn text(&self) -> &[char] {
459        &self.text
460    }
461
462    /// Sets the comment text.
463    pub fn set_text(&mut self, text: Vec<char>) {
464        self.content_hash = calculate_hash_chars(&text);
465        self.text = text;
466    }
467
468    /// Returns a 32-bit hash code for this comment node.
469    pub fn content_hash(&self) -> i32 {
470        hash_to_i32(&self.content_hash)
471    }
472
473    /// Returns the information size.
474    pub fn info_size(&self) -> i32 {
475        self.info_size
476    }
477}
478
479impl std::fmt::Display for XmlComment {
480    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
481        let s: String = self.text.iter().collect();
482        write!(f, "<!-- {} -->", s)
483    }
484}
485
486/// XML processing instruction content.
487#[derive(Debug, Clone)]
488pub struct XmlProcessingInstruction {
489    /// The target of the PI (e.g., "xml-stylesheet").
490    target: String,
491    /// The content/data of the PI (everything after the target).
492    content: String,
493    /// MD5 hash of combined target and content.
494    content_hash: [u8; 16],
495    /// Information size metric (PIs have minimal info size like comments).
496    info_size: i32,
497}
498
499impl XmlProcessingInstruction {
500    /// Creates a new PI from target and content strings.
501    pub fn new(target: &str, content: &str) -> Self {
502        let content_hash = Self::calculate_hash(target, content);
503        XmlProcessingInstruction {
504            target: target.to_string(),
505            content: content.to_string(),
506            content_hash,
507            info_size: 1,
508        }
509    }
510
511    fn calculate_hash(target: &str, content: &str) -> [u8; 16] {
512        use md5::{Digest, Md5};
513        let mut hasher = Md5::new();
514        for code in target.encode_utf16() {
515            hasher.update([(code & 0xff) as u8, (code >> 8) as u8]);
516        }
517        for code in content.encode_utf16() {
518            hasher.update([(code & 0xff) as u8, (code >> 8) as u8]);
519        }
520        hasher.finalize().into()
521    }
522
523    /// Tests content equality using MD5 hash comparison.
524    pub fn content_equals(&self, other: &XmlProcessingInstruction) -> bool {
525        self.content_hash == other.content_hash
526    }
527
528    /// Returns the PI target.
529    pub fn target(&self) -> &str {
530        &self.target
531    }
532
533    /// Returns the PI content.
534    pub fn content(&self) -> &str {
535        &self.content
536    }
537
538    /// Returns a 32-bit hash code for this PI node.
539    pub fn content_hash(&self) -> i32 {
540        hash_to_i32(&self.content_hash)
541    }
542
543    /// Returns the information size.
544    pub fn info_size(&self) -> i32 {
545        self.info_size
546    }
547}
548
549impl std::fmt::Display for XmlProcessingInstruction {
550    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
551        if self.content.is_empty() {
552            write!(f, "<?{}?>", self.target)
553        } else {
554            write!(f, "<?{} {}?>", self.target, self.content)
555        }
556    }
557}
558
/// Hashes a string the way Java's `String.hashCode()` does.
///
/// The string is treated as a sequence of UTF-16 code units `s[0..n]` and the
/// result is `s[0]*31^(n-1) + s[1]*31^(n-2) + ... + s[n-1]`, evaluated with
/// wrapping 32-bit arithmetic to reproduce Java's `int` overflow behavior.
/// The empty string hashes to 0.
pub fn java_string_hash(s: &str) -> i32 {
    s.encode_utf16().fold(0i32, |acc, unit| {
        acc.wrapping_mul(31).wrapping_add(i32::from(unit))
    })
}
572
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an attribute (or namespace) map from `(key, value)` pairs.
    fn map_of(pairs: &[(&str, &str)]) -> HashMap<String, String> {
        pairs
            .iter()
            .map(|&(key, value)| (key.to_string(), value.to_string()))
            .collect()
    }

    /// Builds a namespace-unaware element with the given attributes.
    fn element(name: &str, attrs: &[(&str, &str)]) -> XmlElement {
        XmlElement::new(name.to_string(), map_of(attrs))
    }

    #[test]
    fn test_java_string_hash() {
        // Known Java String.hashCode() values.
        let known = [
            ("", 0),
            ("a", 97),
            ("ab", 97 * 31 + 98),
            ("hello", 99162322),
        ];
        for &(input, expected) in known.iter() {
            assert_eq!(java_string_hash(input), expected);
        }
    }

    #[test]
    fn test_text_node_equality() {
        let base = XmlText::new("hello world");
        let twin = XmlText::new("hello world");
        let longer = XmlText::new("hello world!");

        assert!(base.content_equals(&twin));
        assert!(!base.content_equals(&longer));
    }

    #[test]
    fn test_element_equality() {
        let foo = element("div", &[("id", "foo")]);
        let foo_again = element("div", &[("id", "foo")]);
        let bar = element("div", &[("id", "bar")]);
        let span = element("span", &[]);

        assert!(foo.content_equals(&foo_again));
        assert!(!foo.content_equals(&bar));
        assert!(!foo.content_equals(&span));
    }

    #[test]
    fn test_info_size() {
        // Text no longer than the threshold counts as 1.
        assert_eq!(XmlText::new("hi").info_size(), 1);

        // Longer text counts the excess over the threshold.
        assert_eq!(XmlText::new("hello world").info_size(), 11 - TEXT_THRESHOLD);

        // An element without attributes only counts its name.
        assert_eq!(element("div", &[]).info_size(), ELEMENT_NAME_INFO);

        // A short attribute value adds the per-attribute cost plus 1.
        assert_eq!(
            element("div", &[("id", "x")]).info_size(),
            ELEMENT_NAME_INFO + ATTR_INFO + 1
        );
    }

    #[test]
    fn test_xml_content_enum() {
        let elem = XmlContent::Element(element("div", &[]));
        let text = XmlContent::Text(XmlText::new("hello"));

        assert!(elem.is_element());
        assert!(!elem.is_text());
        assert!(!text.is_element());
        assert!(text.is_text());

        assert!(elem.as_element().is_some());
        assert!(elem.as_text().is_none());
        assert!(text.as_text().is_some());
        assert!(text.as_element().is_none());
    }

    #[test]
    fn test_namespace_decls_affect_equality() {
        let with_ns = |decls: &[(&str, &str)]| {
            XmlElement::new_with_namespace(
                "root".to_string(),
                None,
                map_of(decls),
                HashMap::new(),
            )
        };

        let declares_a = with_ns(&[("a", "http://example.com/a")]);
        let declares_b = with_ns(&[("b", "http://example.com/b")]);
        let declares_nothing = element("root", &[]);

        // content_equals deliberately ignores namespace declarations.
        assert!(declares_a.content_equals(&declares_b));
        assert!(declares_a.content_equals(&declares_nothing));

        // namespace_decls_equal is what detects declaration differences.
        assert!(!declares_a.namespace_decls_equal(&declares_b));
        assert!(!declares_a.namespace_decls_equal(&declares_nothing));
        assert!(declares_a.namespace_decls_equal(&declares_a));
    }
}
677}