xml_3dm/node/
xml_content.rs

1//! XML content types for tree nodes.
2//!
//! This module provides `XmlContent`, which represents the content of an XML node:
//! an element (tag with attributes), text content, or a comment.
5
6use crate::constants::{ATTR_INFO, ATTR_VALUE_THRESHOLD, ELEMENT_NAME_INFO, TEXT_THRESHOLD};
7use md5::{Digest, Md5};
8use std::collections::HashMap;
9
/// Represents the content of an XML node.
///
/// Each variant wraps a dedicated payload type that carries its own cached
/// hash and information-size metric; the methods on this enum dispatch to
/// the wrapped payload.
#[derive(Debug, Clone)]
pub enum XmlContent {
    /// An XML element with a qualified name and attributes.
    Element(XmlElement),
    /// XML text content.
    Text(XmlText),
    /// XML comment.
    Comment(XmlComment),
}
20
21impl XmlContent {
22    /// Returns the information size of this content.
23    ///
24    /// This metric is used for similarity calculations and copy detection.
25    pub fn info_size(&self) -> i32 {
26        match self {
27            XmlContent::Element(e) => e.info_size,
28            XmlContent::Text(t) => t.info_size,
29            XmlContent::Comment(c) => c.info_size,
30        }
31    }
32
33    /// Tests content equality using MD5 hash comparison.
34    pub fn content_equals(&self, other: &XmlContent) -> bool {
35        match (self, other) {
36            (XmlContent::Element(a), XmlContent::Element(b)) => a.content_equals(b),
37            (XmlContent::Text(a), XmlContent::Text(b)) => a.content_equals(b),
38            (XmlContent::Comment(a), XmlContent::Comment(b)) => a.content_equals(b),
39            _ => false,
40        }
41    }
42
43    /// Returns a 32-bit hash code for this content.
44    ///
45    /// This is used for fast equality pre-checks and child list distance calculations.
46    pub fn content_hash(&self) -> i32 {
47        match self {
48            XmlContent::Element(e) => e.content_hash(),
49            XmlContent::Text(t) => t.content_hash(),
50            XmlContent::Comment(c) => c.content_hash(),
51        }
52    }
53
54    /// Returns true if this is an element node.
55    pub fn is_element(&self) -> bool {
56        matches!(self, XmlContent::Element(_))
57    }
58
59    /// Returns true if this is a text node.
60    pub fn is_text(&self) -> bool {
61        matches!(self, XmlContent::Text(_))
62    }
63
64    /// Returns true if this is a comment node.
65    pub fn is_comment(&self) -> bool {
66        matches!(self, XmlContent::Comment(_))
67    }
68
69    /// Returns a reference to the element, if this is an element node.
70    pub fn as_element(&self) -> Option<&XmlElement> {
71        match self {
72            XmlContent::Element(e) => Some(e),
73            _ => None,
74        }
75    }
76
77    /// Returns a mutable reference to the element, if this is an element node.
78    pub fn as_element_mut(&mut self) -> Option<&mut XmlElement> {
79        match self {
80            XmlContent::Element(e) => Some(e),
81            _ => None,
82        }
83    }
84
85    /// Returns a reference to the text, if this is a text node.
86    pub fn as_text(&self) -> Option<&XmlText> {
87        match self {
88            XmlContent::Text(t) => Some(t),
89            _ => None,
90        }
91    }
92
93    /// Returns a mutable reference to the text, if this is a text node.
94    pub fn as_text_mut(&mut self) -> Option<&mut XmlText> {
95        match self {
96            XmlContent::Text(t) => Some(t),
97            _ => None,
98        }
99    }
100}
101
102/// Calculates MD5 hash of character data, matching Java's byte ordering.
103///
104/// Java's `MessageDigest.update()` is called with:
105/// - `(byte) (char & 0xff)` - low byte
106/// - `(byte) (char >> 8)` - high byte
107///
108/// This is little-endian byte order for each UTF-16 code unit.
109fn calculate_hash_chars(data: &[char]) -> [u8; 16] {
110    let mut hasher = Md5::new();
111    for &c in data {
112        let code = c as u16;
113        hasher.update([(code & 0xff) as u8, (code >> 8) as u8]);
114    }
115    hasher.finalize().into()
116}
117
118/// Calculates MD5 hash of a string, matching Java's byte ordering.
119fn calculate_hash_str(data: &str) -> [u8; 16] {
120    let mut hasher = Md5::new();
121    // Java strings are UTF-16, so we iterate over UTF-16 code units
122    for code in data.encode_utf16() {
123        hasher.update([(code & 0xff) as u8, (code >> 8) as u8]);
124    }
125    hasher.finalize().into()
126}
127
/// Converts the first 4 bytes of an MD5 hash to a 32-bit integer.
///
/// This matches Java's behavior where bytes are sign-extended when used
/// in arithmetic expressions. In Java:
/// ```java
/// hash[0] + (hash[1] << 8) + (hash[2] << 16) + (hash[3] << 24)
/// ```
/// Since Java bytes are signed, negative values get sign-extended.
///
/// Java `int` addition wraps silently on overflow. The sum of the four
/// sign-extended terms can fall below `i32::MIN` (for example when all four
/// bytes are `0x80`), so the terms are combined with `wrapping_add` rather
/// than `+`, which would panic in builds with overflow checks enabled.
fn hash_to_i32(hash: &[u8; 16]) -> i32 {
    // Reinterpret as i8 first to get Java's signed byte, then widen to i32.
    let signed = |byte: u8| i32::from(byte as i8);
    signed(hash[0])
        .wrapping_add(signed(hash[1]) << 8)
        .wrapping_add(signed(hash[2]) << 16)
        .wrapping_add(signed(hash[3]) << 24)
}
144
/// An XML element with a qualified name and attributes.
///
/// The hash and size fields are caches derived from `name` and `attributes`;
/// they are refreshed by `rehash()`, not by the individual setters.
#[derive(Debug, Clone)]
pub struct XmlElement {
    /// The qualified name of the element (e.g., "div", "ns:element").
    name: String,
    /// Attributes as key-value pairs. The key is the qualified attribute name.
    attributes: HashMap<String, String>,
    /// Cached hash code of the element name (matches Java's String.hashCode()).
    name_hash: i32,
    /// MD5 hash of the attributes, accumulated over names and values in
    /// sorted-name order.
    attr_hash: [u8; 16],
    /// Information size metric: a base amount for the element name plus a
    /// contribution per attribute.
    info_size: i32,
}
159
160impl XmlElement {
161    /// Creates a new XML element with the given name and attributes.
162    pub fn new(name: String, attributes: HashMap<String, String>) -> Self {
163        let mut element = XmlElement {
164            name,
165            attributes,
166            name_hash: 0,
167            attr_hash: [0; 16],
168            info_size: 0,
169        };
170        element.rehash();
171        element
172    }
173
174    /// Recalculates the hash values and info size.
175    ///
176    /// This should be called after modifying the name or attributes.
177    pub fn rehash(&mut self) {
178        self.name_hash = java_string_hash(&self.name);
179        self.info_size = ELEMENT_NAME_INFO;
180
181        let mut hasher = Md5::new();
182
183        // Sort attribute names for deterministic hashing
184        // Note: Java's AttributesImpl iteration order may differ, but for our
185        // purposes we need consistent ordering. The Java code iterates in
186        // insertion order, which we'll match by sorting.
187        let mut attr_names: Vec<&String> = self.attributes.keys().collect();
188        attr_names.sort();
189
190        for attr_name in attr_names {
191            let attr_value = &self.attributes[attr_name];
192            let vsize = attr_value.chars().count() as i32;
193            self.info_size += ATTR_INFO
194                + if vsize > ATTR_VALUE_THRESHOLD {
195                    vsize - ATTR_VALUE_THRESHOLD
196                } else {
197                    1
198                };
199            hasher.update(calculate_hash_str(attr_name));
200            hasher.update(calculate_hash_str(attr_value));
201        }
202
203        self.attr_hash = hasher.finalize().into();
204    }
205
206    /// Returns the qualified name of the element.
207    pub fn qname(&self) -> &str {
208        &self.name
209    }
210
211    /// Sets the qualified name of the element.
212    ///
213    /// Note: This does not automatically rehash. Call `rehash()` after
214    /// modifying if hash values are needed.
215    pub fn set_qname(&mut self, name: String) {
216        self.name = name;
217    }
218
219    /// Returns the attributes.
220    pub fn attributes(&self) -> &HashMap<String, String> {
221        &self.attributes
222    }
223
224    /// Returns a mutable reference to the attributes.
225    ///
226    /// Note: This does not automatically rehash. Call `rehash()` after
227    /// modifying if hash values are needed.
228    pub fn attributes_mut(&mut self) -> &mut HashMap<String, String> {
229        &mut self.attributes
230    }
231
232    /// Sets the attributes.
233    ///
234    /// Note: This does not automatically rehash. Call `rehash()` after
235    /// modifying if hash values are needed.
236    pub fn set_attributes(&mut self, attributes: HashMap<String, String>) {
237        self.attributes = attributes;
238    }
239
240    /// Tests content equality using hash comparison.
241    pub fn content_equals(&self, other: &XmlElement) -> bool {
242        self.name_hash == other.name_hash && self.attr_hash == other.attr_hash
243    }
244
245    /// Returns a 32-bit hash code for this element.
246    pub fn content_hash(&self) -> i32 {
247        hash_to_i32(&self.attr_hash) ^ self.name_hash
248    }
249
250    /// Returns the information size.
251    pub fn info_size(&self) -> i32 {
252        self.info_size
253    }
254}
255
256impl std::fmt::Display for XmlElement {
257    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
258        write!(f, "{} {{", self.name)?;
259        let mut first = true;
260        // Sort for consistent output
261        let mut attr_names: Vec<&String> = self.attributes.keys().collect();
262        attr_names.sort();
263        for name in attr_names {
264            if !first {
265                write!(f, " ")?;
266            }
267            first = false;
268            write!(f, " {}={}", name, self.attributes[name])?;
269        }
270        write!(f, "}}")
271    }
272}
273
/// XML text content.
///
/// The digest and size fields are caches derived from `text`; the
/// constructors and `set_text` keep them in sync.
#[derive(Debug, Clone)]
pub struct XmlText {
    /// The text content as a character array (matching Java's char[]).
    text: Vec<char>,
    /// MD5 hash of the text content, as produced by `calculate_hash_chars`.
    content_hash: [u8; 16],
    /// Information size metric: 1 for text no longer than `TEXT_THRESHOLD`,
    /// otherwise the number of characters beyond that threshold.
    info_size: i32,
}
284
285impl XmlText {
286    /// Creates a new text node from a string.
287    pub fn new(text: &str) -> Self {
288        let chars: Vec<char> = text.chars().collect();
289        Self::from_chars(chars)
290    }
291
292    /// Creates a new text node from a character array.
293    pub fn from_chars(text: Vec<char>) -> Self {
294        let content_hash = calculate_hash_chars(&text);
295        let len = text.len() as i32;
296        let info_size = if len > TEXT_THRESHOLD {
297            len - TEXT_THRESHOLD
298        } else {
299            1
300        };
301        XmlText {
302            text,
303            content_hash,
304            info_size,
305        }
306    }
307
308    /// Creates a new text node from a slice of characters.
309    pub fn from_char_slice(text: &[char], start: usize, length: usize) -> Self {
310        let chars: Vec<char> = text[start..start + length].to_vec();
311        Self::from_chars(chars)
312    }
313
314    /// Tests content equality using MD5 hash comparison.
315    pub fn content_equals(&self, other: &XmlText) -> bool {
316        self.content_hash == other.content_hash
317    }
318
319    /// Returns the text as a character slice.
320    pub fn text(&self) -> &[char] {
321        &self.text
322    }
323
324    /// Sets the text content.
325    ///
326    /// Note: This recalculates the hash and info size.
327    pub fn set_text(&mut self, text: Vec<char>) {
328        self.content_hash = calculate_hash_chars(&text);
329        let len = text.len() as i32;
330        self.info_size = if len > TEXT_THRESHOLD {
331            len - TEXT_THRESHOLD
332        } else {
333            1
334        };
335        self.text = text;
336    }
337
338    /// Returns a 32-bit hash code for this text node.
339    pub fn content_hash(&self) -> i32 {
340        hash_to_i32(&self.content_hash)
341    }
342
343    /// Returns the information size.
344    pub fn info_size(&self) -> i32 {
345        self.info_size
346    }
347}
348
349impl std::fmt::Display for XmlText {
350    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
351        let s: String = self.text.iter().collect();
352        write!(f, "{}", s)
353    }
354}
355
/// XML comment content.
///
/// The digest is a cache derived from `text`; the constructors and
/// `set_text` keep it in sync.
#[derive(Debug, Clone)]
pub struct XmlComment {
    /// The comment text (without the <!-- and --> markers).
    text: Vec<char>,
    /// MD5 hash of the comment content, as produced by `calculate_hash_chars`.
    content_hash: [u8; 16],
    /// Information size metric (comments have minimal info size); the
    /// constructors set it to 1 regardless of the comment's length.
    info_size: i32,
}
366
367impl XmlComment {
368    /// Creates a new comment node from a string.
369    pub fn new(text: &str) -> Self {
370        let chars: Vec<char> = text.chars().collect();
371        Self::from_chars(chars)
372    }
373
374    /// Creates a new comment node from a character array.
375    pub fn from_chars(text: Vec<char>) -> Self {
376        let content_hash = calculate_hash_chars(&text);
377        // Comments have minimal info size (don't contribute much to structure)
378        let info_size = 1;
379        XmlComment {
380            text,
381            content_hash,
382            info_size,
383        }
384    }
385
386    /// Tests content equality using MD5 hash comparison.
387    pub fn content_equals(&self, other: &XmlComment) -> bool {
388        self.content_hash == other.content_hash
389    }
390
391    /// Returns the comment text as a character slice.
392    pub fn text(&self) -> &[char] {
393        &self.text
394    }
395
396    /// Sets the comment text.
397    pub fn set_text(&mut self, text: Vec<char>) {
398        self.content_hash = calculate_hash_chars(&text);
399        self.text = text;
400    }
401
402    /// Returns a 32-bit hash code for this comment node.
403    pub fn content_hash(&self) -> i32 {
404        hash_to_i32(&self.content_hash)
405    }
406
407    /// Returns the information size.
408    pub fn info_size(&self) -> i32 {
409        self.info_size
410    }
411}
412
413impl std::fmt::Display for XmlComment {
414    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
415        let s: String = self.text.iter().collect();
416        write!(f, "<!-- {} -->", s)
417    }
418}
419
/// Computes the same value as Java's `String.hashCode()` for `s`.
///
/// Java defines the hash as `s[0]*31^(n-1) + s[1]*31^(n-2) + ... + s[n-1]`
/// over the string's `n` UTF-16 code units. The sum is evaluated here as a
/// running fold with wrapping multiplication and addition, matching Java's
/// silent `int` overflow.
pub fn java_string_hash(s: &str) -> i32 {
    s.encode_utf16().fold(0i32, |acc, unit| {
        acc.wrapping_mul(31).wrapping_add(i32::from(unit))
    })
}
433
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an attribute map from `(name, value)` pairs.
    fn attrs(pairs: &[(&str, &str)]) -> HashMap<String, String> {
        pairs
            .iter()
            .map(|&(key, value)| (key.to_string(), value.to_string()))
            .collect()
    }

    /// Builds an element with the given name and attribute pairs.
    fn element(name: &str, pairs: &[(&str, &str)]) -> XmlElement {
        XmlElement::new(name.to_string(), attrs(pairs))
    }

    #[test]
    fn test_java_string_hash() {
        // Known Java String.hashCode() values.
        assert_eq!(java_string_hash(""), 0);
        assert_eq!(java_string_hash("a"), 97);
        assert_eq!(java_string_hash("ab"), 97 * 31 + 98);
        assert_eq!(java_string_hash("hello"), 99162322);
    }

    #[test]
    fn test_text_node_equality() {
        let base = XmlText::new("hello world");
        let same = XmlText::new("hello world");
        let different = XmlText::new("hello world!");

        assert!(base.content_equals(&same));
        assert!(!base.content_equals(&different));
    }

    #[test]
    fn test_element_equality() {
        let base = element("div", &[("id", "foo")]);
        let same = element("div", &[("id", "foo")]);
        let other_value = element("div", &[("id", "bar")]);
        let other_name = element("span", &[]);

        assert!(base.content_equals(&same));
        assert!(!base.content_equals(&other_value));
        assert!(!base.content_equals(&other_name));
    }

    #[test]
    fn test_info_size() {
        // Short text counts as a single unit.
        let short_text = XmlText::new("hi");
        assert_eq!(short_text.info_size(), 1);

        // Longer text counts by how far it exceeds the threshold.
        let long_text = XmlText::new("hello world");
        assert_eq!(long_text.info_size(), 11 - TEXT_THRESHOLD);

        // An element without attributes carries only the name's share.
        let bare = element("div", &[]);
        assert_eq!(bare.info_size(), ELEMENT_NAME_INFO);

        // A short attribute value adds the per-attribute share plus one.
        let with_attr = element("div", &[("id", "x")]);
        assert_eq!(with_attr.info_size(), ELEMENT_NAME_INFO + ATTR_INFO + 1);
    }

    #[test]
    fn test_xml_content_enum() {
        let elem = XmlContent::Element(element("div", &[]));
        let text = XmlContent::Text(XmlText::new("hello"));

        // Kind predicates.
        assert!(elem.is_element());
        assert!(!elem.is_text());
        assert!(!text.is_element());
        assert!(text.is_text());

        // Accessors agree with the predicates.
        assert!(elem.as_element().is_some());
        assert!(elem.as_text().is_none());
        assert!(text.as_text().is_some());
        assert!(text.as_element().is_none());
    }
}
514}