// xml_3dm/node/xml_content.rs

//! XML content types for tree nodes.
//!
//! This module provides `XmlContent`, which represents the content of an XML node:
//! an element (tag with attributes), text, a comment, or a processing instruction.
5
6use crate::constants::{ATTR_INFO, ATTR_VALUE_THRESHOLD, ELEMENT_NAME_INFO, TEXT_THRESHOLD};
7use md5::{Digest, Md5};
8use std::collections::HashMap;
9
10use super::namespace::ExpandedName;
11
12/// Represents the content of an XML node.
13#[derive(Debug, Clone)]
14pub enum XmlContent {
15    /// An XML element with a qualified name and attributes.
16    Element(XmlElement),
17    /// XML text content.
18    Text(XmlText),
19    /// XML comment.
20    Comment(XmlComment),
21    /// XML processing instruction.
22    ProcessingInstruction(XmlProcessingInstruction),
23}
24
25impl XmlContent {
26    /// Returns the information size of this content.
27    ///
28    /// This metric is used for similarity calculations and copy detection.
29    pub fn info_size(&self) -> i32 {
30        match self {
31            XmlContent::Element(e) => e.info_size,
32            XmlContent::Text(t) => t.info_size,
33            XmlContent::Comment(c) => c.info_size,
34            XmlContent::ProcessingInstruction(pi) => pi.info_size,
35        }
36    }
37
38    /// Tests content equality using MD5 hash comparison.
39    pub fn content_equals(&self, other: &XmlContent) -> bool {
40        match (self, other) {
41            (XmlContent::Element(a), XmlContent::Element(b)) => a.content_equals(b),
42            (XmlContent::Text(a), XmlContent::Text(b)) => a.content_equals(b),
43            (XmlContent::Comment(a), XmlContent::Comment(b)) => a.content_equals(b),
44            (XmlContent::ProcessingInstruction(a), XmlContent::ProcessingInstruction(b)) => {
45                a.content_equals(b)
46            }
47            _ => false,
48        }
49    }
50
51    /// Returns a 32-bit hash code for this content.
52    ///
53    /// This is used for fast equality pre-checks and child list distance calculations.
54    pub fn content_hash(&self) -> i32 {
55        match self {
56            XmlContent::Element(e) => e.content_hash(),
57            XmlContent::Text(t) => t.content_hash(),
58            XmlContent::Comment(c) => c.content_hash(),
59            XmlContent::ProcessingInstruction(pi) => pi.content_hash(),
60        }
61    }
62
63    /// Returns true if this is an element node.
64    pub fn is_element(&self) -> bool {
65        matches!(self, XmlContent::Element(_))
66    }
67
68    /// Returns true if this is a text node.
69    pub fn is_text(&self) -> bool {
70        matches!(self, XmlContent::Text(_))
71    }
72
73    /// Returns true if this is a comment node.
74    pub fn is_comment(&self) -> bool {
75        matches!(self, XmlContent::Comment(_))
76    }
77
78    /// Returns true if this is a processing instruction node.
79    pub fn is_processing_instruction(&self) -> bool {
80        matches!(self, XmlContent::ProcessingInstruction(_))
81    }
82
83    /// Returns a reference to the element, if this is an element node.
84    pub fn as_element(&self) -> Option<&XmlElement> {
85        match self {
86            XmlContent::Element(e) => Some(e),
87            _ => None,
88        }
89    }
90
91    /// Returns a mutable reference to the element, if this is an element node.
92    pub fn as_element_mut(&mut self) -> Option<&mut XmlElement> {
93        match self {
94            XmlContent::Element(e) => Some(e),
95            _ => None,
96        }
97    }
98
99    /// Returns a reference to the text, if this is a text node.
100    pub fn as_text(&self) -> Option<&XmlText> {
101        match self {
102            XmlContent::Text(t) => Some(t),
103            _ => None,
104        }
105    }
106
107    /// Returns a mutable reference to the text, if this is a text node.
108    pub fn as_text_mut(&mut self) -> Option<&mut XmlText> {
109        match self {
110            XmlContent::Text(t) => Some(t),
111            _ => None,
112        }
113    }
114
115    /// Returns a reference to the PI, if this is a processing instruction node.
116    pub fn as_processing_instruction(&self) -> Option<&XmlProcessingInstruction> {
117        match self {
118            XmlContent::ProcessingInstruction(pi) => Some(pi),
119            _ => None,
120        }
121    }
122}
123
124/// Calculates MD5 hash of character data, matching Java's byte ordering.
125///
126/// Java's `MessageDigest.update()` is called with:
127/// - `(byte) (char & 0xff)` - low byte
128/// - `(byte) (char >> 8)` - high byte
129///
130/// This is little-endian byte order for each UTF-16 code unit.
131fn calculate_hash_chars(data: &[char]) -> [u8; 16] {
132    let mut hasher = Md5::new();
133    for &c in data {
134        let code = c as u16;
135        hasher.update([(code & 0xff) as u8, (code >> 8) as u8]);
136    }
137    hasher.finalize().into()
138}
139
140/// Calculates MD5 hash of a string, matching Java's byte ordering.
141fn calculate_hash_str(data: &str) -> [u8; 16] {
142    let mut hasher = Md5::new();
143    // Java strings are UTF-16, so we iterate over UTF-16 code units
144    for code in data.encode_utf16() {
145        hasher.update([(code & 0xff) as u8, (code >> 8) as u8]);
146    }
147    hasher.finalize().into()
148}
149
/// Folds the first 4 bytes of an MD5 hash into a 32-bit integer.
///
/// This reproduces the Java expression
/// ```java
/// hash[0] + (hash[1] << 8) + (hash[2] << 16) + (hash[3] << 24)
/// ```
/// Java bytes are signed, so each byte is sign-extended to 32 bits before it is
/// shifted and added; the sum wraps on overflow like Java `int` arithmetic.
/// Bytes 4..16 of the hash do not contribute.
fn hash_to_i32(hash: &[u8; 16]) -> i32 {
    hash.iter()
        .take(4)
        .enumerate()
        .fold(0i32, |sum, (index, &byte)| {
            // Reinterpret as a signed byte first so the widening sign-extends.
            let signed = i32::from(byte as i8);
            sum.wrapping_add(signed << (8 * index))
        })
}
168
169/// An XML element with a qualified name and attributes.
170#[derive(Debug, Clone)]
171pub struct XmlElement {
172    /// The qualified name of the element (e.g., "div", "ns:element").
173    name: String,
174    /// The expanded name (namespace URI + local name), if parsed with namespace awareness.
175    expanded_name: Option<ExpandedName>,
176    /// Namespace declarations on this element (prefix -> URI).
177    namespace_decls: HashMap<String, String>,
178    /// Attributes as key-value pairs. The key is the qualified attribute name.
179    attributes: HashMap<String, String>,
180    /// Cached hash code of the element name (matches Java's String.hashCode()).
181    name_hash: i32,
182    /// MD5 hash of the attributes.
183    attr_hash: [u8; 16],
184    /// Information size metric.
185    info_size: i32,
186}
187
188impl XmlElement {
189    /// Creates a new XML element with the given name and attributes.
190    pub fn new(name: String, attributes: HashMap<String, String>) -> Self {
191        Self::new_with_namespace(name, None, HashMap::new(), attributes)
192    }
193
194    /// Creates a new element with namespace information.
195    pub fn new_with_namespace(
196        name: String,
197        expanded_name: Option<ExpandedName>,
198        namespace_decls: HashMap<String, String>,
199        attributes: HashMap<String, String>,
200    ) -> Self {
201        let mut element = XmlElement {
202            name,
203            expanded_name,
204            namespace_decls,
205            attributes,
206            name_hash: 0,
207            attr_hash: [0; 16],
208            info_size: 0,
209        };
210        element.rehash();
211        element
212    }
213
214    /// Recalculates the hash values and info size.
215    ///
216    /// This should be called after modifying the name or attributes.
217    pub fn rehash(&mut self) {
218        self.name_hash = java_string_hash(&self.name);
219        self.info_size = ELEMENT_NAME_INFO;
220
221        let mut hasher = Md5::new();
222
223        // Sort attribute names for deterministic hashing
224        // Note: Java's AttributesImpl iteration order may differ, but for our
225        // purposes we need consistent ordering. The Java code iterates in
226        // insertion order, which we'll match by sorting.
227        let mut attr_names: Vec<&String> = self.attributes.keys().collect();
228        attr_names.sort();
229
230        for attr_name in attr_names {
231            let attr_value = &self.attributes[attr_name];
232            let vsize = attr_value.chars().count() as i32;
233            self.info_size += ATTR_INFO
234                + if vsize > ATTR_VALUE_THRESHOLD {
235                    vsize - ATTR_VALUE_THRESHOLD
236                } else {
237                    1
238                };
239            hasher.update(calculate_hash_str(attr_name));
240            hasher.update(calculate_hash_str(attr_value));
241        }
242
243        self.attr_hash = hasher.finalize().into();
244    }
245
246    /// Returns the qualified name of the element.
247    pub fn qname(&self) -> &str {
248        &self.name
249    }
250
251    /// Sets the qualified name of the element.
252    ///
253    /// Note: This does not automatically rehash. Call `rehash()` after
254    /// modifying if hash values are needed.
255    pub fn set_qname(&mut self, name: String) {
256        self.name = name;
257    }
258
259    /// Returns the attributes.
260    pub fn attributes(&self) -> &HashMap<String, String> {
261        &self.attributes
262    }
263
264    /// Returns a mutable reference to the attributes.
265    ///
266    /// Note: This does not automatically rehash. Call `rehash()` after
267    /// modifying if hash values are needed.
268    pub fn attributes_mut(&mut self) -> &mut HashMap<String, String> {
269        &mut self.attributes
270    }
271
272    /// Sets the attributes.
273    ///
274    /// Note: This does not automatically rehash. Call `rehash()` after
275    /// modifying if hash values are needed.
276    pub fn set_attributes(&mut self, attributes: HashMap<String, String>) {
277        self.attributes = attributes;
278    }
279
280    /// Returns the expanded name, if available.
281    pub fn expanded_name(&self) -> Option<&ExpandedName> {
282        self.expanded_name.as_ref()
283    }
284
285    /// Returns namespace declarations on this element.
286    pub fn namespace_decls(&self) -> &HashMap<String, String> {
287        &self.namespace_decls
288    }
289
290    /// Compares element names with namespace awareness.
291    /// Falls back to string comparison if neither has expanded names.
292    pub fn names_match(&self, other: &XmlElement) -> bool {
293        match (&self.expanded_name, &other.expanded_name) {
294            (Some(a), Some(b)) => a == b,
295            (None, None) => self.name == other.name,
296            _ => false,
297        }
298    }
299
300    /// Tests content equality using hash comparison.
301    ///
302    /// Note: This compares element name and attributes only, not namespace
303    /// declarations. Use `namespace_decls_equal` for namespace comparison.
304    pub fn content_equals(&self, other: &XmlElement) -> bool {
305        self.name_hash == other.name_hash && self.attr_hash == other.attr_hash
306    }
307
308    /// Tests whether namespace declarations are equal.
309    pub fn namespace_decls_equal(&self, other: &XmlElement) -> bool {
310        self.namespace_decls == other.namespace_decls
311    }
312
313    /// Returns a 32-bit hash code for this element.
314    pub fn content_hash(&self) -> i32 {
315        hash_to_i32(&self.attr_hash) ^ self.name_hash
316    }
317
318    /// Returns the information size.
319    pub fn info_size(&self) -> i32 {
320        self.info_size
321    }
322}
323
324impl std::fmt::Display for XmlElement {
325    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
326        write!(f, "{} {{", self.name)?;
327        let mut first = true;
328        // Sort for consistent output
329        let mut attr_names: Vec<&String> = self.attributes.keys().collect();
330        attr_names.sort();
331        for name in attr_names {
332            if !first {
333                write!(f, " ")?;
334            }
335            first = false;
336            write!(f, " {}={}", name, self.attributes[name])?;
337        }
338        write!(f, "}}")
339    }
340}
341
342/// XML text content.
343#[derive(Debug, Clone)]
344pub struct XmlText {
345    /// The text content as a character array (matching Java's char[]).
346    text: Vec<char>,
347    /// MD5 hash of the text content.
348    content_hash: [u8; 16],
349    /// Information size metric.
350    info_size: i32,
351}
352
353impl XmlText {
354    /// Creates a new text node from a string.
355    pub fn new(text: &str) -> Self {
356        let chars: Vec<char> = text.chars().collect();
357        Self::from_chars(chars)
358    }
359
360    /// Creates a new text node from a character array.
361    pub fn from_chars(text: Vec<char>) -> Self {
362        let content_hash = calculate_hash_chars(&text);
363        let len = text.len() as i32;
364        let info_size = if len > TEXT_THRESHOLD {
365            len - TEXT_THRESHOLD
366        } else {
367            1
368        };
369        XmlText {
370            text,
371            content_hash,
372            info_size,
373        }
374    }
375
376    /// Creates a new text node from a slice of characters.
377    pub fn from_char_slice(text: &[char], start: usize, length: usize) -> Self {
378        let chars: Vec<char> = text[start..start + length].to_vec();
379        Self::from_chars(chars)
380    }
381
382    /// Tests content equality using MD5 hash comparison.
383    pub fn content_equals(&self, other: &XmlText) -> bool {
384        self.content_hash == other.content_hash
385    }
386
387    /// Returns the text as a character slice.
388    pub fn text(&self) -> &[char] {
389        &self.text
390    }
391
392    /// Sets the text content.
393    ///
394    /// Note: This recalculates the hash and info size.
395    pub fn set_text(&mut self, text: Vec<char>) {
396        self.content_hash = calculate_hash_chars(&text);
397        let len = text.len() as i32;
398        self.info_size = if len > TEXT_THRESHOLD {
399            len - TEXT_THRESHOLD
400        } else {
401            1
402        };
403        self.text = text;
404    }
405
406    /// Returns a 32-bit hash code for this text node.
407    pub fn content_hash(&self) -> i32 {
408        hash_to_i32(&self.content_hash)
409    }
410
411    /// Returns the information size.
412    pub fn info_size(&self) -> i32 {
413        self.info_size
414    }
415}
416
417impl std::fmt::Display for XmlText {
418    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
419        let s: String = self.text.iter().collect();
420        write!(f, "{}", s)
421    }
422}
423
424/// XML comment content.
425#[derive(Debug, Clone)]
426pub struct XmlComment {
427    /// The comment text (without the <!-- and --> markers).
428    text: Vec<char>,
429    /// MD5 hash of the comment content.
430    content_hash: [u8; 16],
431    /// Information size metric (comments have minimal info size).
432    info_size: i32,
433}
434
435impl XmlComment {
436    /// Creates a new comment node from a string.
437    pub fn new(text: &str) -> Self {
438        let chars: Vec<char> = text.chars().collect();
439        Self::from_chars(chars)
440    }
441
442    /// Creates a new comment node from a character array.
443    pub fn from_chars(text: Vec<char>) -> Self {
444        let content_hash = calculate_hash_chars(&text);
445        // Comments have minimal info size (don't contribute much to structure)
446        let info_size = 1;
447        XmlComment {
448            text,
449            content_hash,
450            info_size,
451        }
452    }
453
454    /// Tests content equality using MD5 hash comparison.
455    pub fn content_equals(&self, other: &XmlComment) -> bool {
456        self.content_hash == other.content_hash
457    }
458
459    /// Returns the comment text as a character slice.
460    pub fn text(&self) -> &[char] {
461        &self.text
462    }
463
464    /// Sets the comment text.
465    pub fn set_text(&mut self, text: Vec<char>) {
466        self.content_hash = calculate_hash_chars(&text);
467        self.text = text;
468    }
469
470    /// Returns a 32-bit hash code for this comment node.
471    pub fn content_hash(&self) -> i32 {
472        hash_to_i32(&self.content_hash)
473    }
474
475    /// Returns the information size.
476    pub fn info_size(&self) -> i32 {
477        self.info_size
478    }
479}
480
481impl std::fmt::Display for XmlComment {
482    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
483        let s: String = self.text.iter().collect();
484        write!(f, "<!-- {} -->", s)
485    }
486}
487
488/// XML processing instruction content.
489#[derive(Debug, Clone)]
490pub struct XmlProcessingInstruction {
491    /// The target of the PI (e.g., "xml-stylesheet").
492    target: String,
493    /// The content/data of the PI (everything after the target).
494    content: String,
495    /// MD5 hash of combined target and content.
496    content_hash: [u8; 16],
497    /// Information size metric (PIs have minimal info size like comments).
498    info_size: i32,
499}
500
501impl XmlProcessingInstruction {
502    /// Creates a new PI from target and content strings.
503    pub fn new(target: &str, content: &str) -> Self {
504        let content_hash = Self::calculate_hash(target, content);
505        XmlProcessingInstruction {
506            target: target.to_string(),
507            content: content.to_string(),
508            content_hash,
509            info_size: 1,
510        }
511    }
512
513    fn calculate_hash(target: &str, content: &str) -> [u8; 16] {
514        use md5::{Digest, Md5};
515        let mut hasher = Md5::new();
516        for code in target.encode_utf16() {
517            hasher.update([(code & 0xff) as u8, (code >> 8) as u8]);
518        }
519        for code in content.encode_utf16() {
520            hasher.update([(code & 0xff) as u8, (code >> 8) as u8]);
521        }
522        hasher.finalize().into()
523    }
524
525    /// Tests content equality using MD5 hash comparison.
526    pub fn content_equals(&self, other: &XmlProcessingInstruction) -> bool {
527        self.content_hash == other.content_hash
528    }
529
530    /// Returns the PI target.
531    pub fn target(&self) -> &str {
532        &self.target
533    }
534
535    /// Returns the PI content.
536    pub fn content(&self) -> &str {
537        &self.content
538    }
539
540    /// Returns a 32-bit hash code for this PI node.
541    pub fn content_hash(&self) -> i32 {
542        hash_to_i32(&self.content_hash)
543    }
544
545    /// Returns the information size.
546    pub fn info_size(&self) -> i32 {
547        self.info_size
548    }
549}
550
551impl std::fmt::Display for XmlProcessingInstruction {
552    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
553        if self.content.is_empty() {
554            write!(f, "<?{}?>", self.target)
555        } else {
556            write!(f, "<?{} {}?>", self.target, self.content)
557        }
558    }
559}
560
/// Computes a hash code compatible with Java's `String.hashCode()`.
///
/// Java defines the hash as `s[0]*31^(n-1) + s[1]*31^(n-2) + ... + s[n-1]` over the
/// string's `n` UTF-16 code units. It is evaluated here as a running
/// `hash * 31 + unit`, with wrapping arithmetic standing in for Java's `int`
/// overflow. The empty string hashes to 0.
pub fn java_string_hash(s: &str) -> i32 {
    s.encode_utf16().fold(0i32, |running, unit| {
        running.wrapping_mul(31).wrapping_add(i32::from(unit))
    })
}
574
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned string map from borrowed key/value pairs.
    fn string_map(pairs: &[(&str, &str)]) -> HashMap<String, String> {
        pairs
            .iter()
            .map(|&(key, value)| (key.to_string(), value.to_string()))
            .collect()
    }

    /// Builds a namespace-less element with the given attributes.
    fn element(name: &str, attrs: &[(&str, &str)]) -> XmlElement {
        XmlElement::new(name.to_string(), string_map(attrs))
    }

    #[test]
    fn test_java_string_hash() {
        // Known Java String.hashCode() values.
        let expectations = [
            ("", 0),
            ("a", 97),
            ("ab", 97 * 31 + 98),
            ("hello", 99162322),
        ];
        for &(input, expected) in expectations.iter() {
            assert_eq!(java_string_hash(input), expected);
        }
    }

    #[test]
    fn test_text_node_equality() {
        let original = XmlText::new("hello world");
        let twin = XmlText::new("hello world");
        let longer = XmlText::new("hello world!");

        assert!(original.content_equals(&twin));
        assert!(!original.content_equals(&longer));
    }

    #[test]
    fn test_element_equality() {
        let div_foo = element("div", &[("id", "foo")]);
        let div_foo_again = element("div", &[("id", "foo")]);
        let div_bar = element("div", &[("id", "bar")]);
        let bare_span = element("span", &[]);

        assert!(div_foo.content_equals(&div_foo_again));
        assert!(!div_foo.content_equals(&div_bar));
        assert!(!div_foo.content_equals(&bare_span));
    }

    #[test]
    fn test_info_size() {
        // Text shorter than threshold
        assert_eq!(XmlText::new("hi").info_size(), 1);

        // Text longer than threshold (5)
        assert_eq!(XmlText::new("hello world").info_size(), 11 - TEXT_THRESHOLD);

        // Element with no attributes
        assert_eq!(element("div", &[]).info_size(), ELEMENT_NAME_INFO);

        // Element with attribute (value shorter than threshold)
        assert_eq!(
            element("div", &[("id", "x")]).info_size(),
            ELEMENT_NAME_INFO + ATTR_INFO + 1
        );
    }

    #[test]
    fn test_xml_content_enum() {
        let elem = XmlContent::Element(element("div", &[]));
        let text = XmlContent::Text(XmlText::new("hello"));

        // Kind predicates
        assert!(elem.is_element());
        assert!(!elem.is_text());
        assert!(!text.is_element());
        assert!(text.is_text());

        // Borrowing accessors
        assert!(elem.as_element().is_some());
        assert!(elem.as_text().is_none());
        assert!(text.as_text().is_some());
        assert!(text.as_element().is_none());
    }

    #[test]
    fn test_namespace_decls_affect_equality() {
        let with_a = XmlElement::new_with_namespace(
            "root".to_string(),
            None,
            string_map(&[("a", "http://example.com/a")]),
            HashMap::new(),
        );
        let with_b = XmlElement::new_with_namespace(
            "root".to_string(),
            None,
            string_map(&[("b", "http://example.com/b")]),
            HashMap::new(),
        );
        let without_ns = element("root", &[]);

        // content_equals ignores namespace declarations (intentional)
        assert!(with_a.content_equals(&with_b));
        assert!(with_a.content_equals(&without_ns));

        // namespace_decls_equal detects namespace differences
        assert!(!with_a.namespace_decls_equal(&with_b));
        assert!(!with_a.namespace_decls_equal(&without_ns));
        assert!(with_a.namespace_decls_equal(&with_a));
    }
}
679}