// gistools/parsers/xml/mod.rs

1use alloc::{format, string::String, vec, vec::Vec};
2use regex::Regex;
3
/// Sentinel index meaning "no position": the value does not point to any location
/// in the searched text.
pub static NO_INDEX: usize = usize::MAX;
6
/// Optional settings accepted by the XML search functions.
///
/// Every field may be left as `None`, in which case the called function applies its own
/// default.
#[derive(Debug, Default, Clone)]
pub struct XMLOptions {
    // /// Set true if you want debug info reported
    // pub debug: Option<bool>,
    /// Byte offset into the XML string at which the search begins
    pub start_index: Option<usize>,
    /// Whether tags nested inside tags of the same name are taken into account
    pub nested: Option<bool>,
    /// Whether a path search may stop at the first matching tag
    pub return_on_first: Option<bool>,
}
19
/// A tag located in an XML string: its text together with the byte range it occupies
#[derive(Debug, Default, Clone, PartialEq)]
pub struct XMLTag {
    /// Text between the opening and the closing tag; `None` for a self-closing tag
    pub inner: Option<String>,
    /// Complete text of the tag, from its opening `<` through its final `>`
    pub outer: String,
    /// Byte index at which the tag begins
    pub start: usize,
    /// Byte index one past the last character of the tag
    pub end: usize,
}
32/// A PathItem is a String or a Step
33#[derive(Debug, Clone, PartialEq)]
34pub enum XMLTagItem {
35    /// A String item
36    String(String),
37    /// A Step item
38    XMLTag(XMLTag),
39}
/// One step of a path: a tag name plus an optional position among the matches
#[derive(Debug, Default, Clone, PartialEq)]
pub struct XMLStep {
    /// Tag name searched for at this step
    pub name: String,
    /// When set, only the match at this zero-based position is kept
    pub index: Option<usize>,
}
48/// A PathItem is a String or a Step
49#[derive(Debug, Clone, PartialEq)]
50pub enum XMLPathItem {
51    /// A String item
52    String(String),
53    /// A Step item
54    XMLStep(XMLStep),
55}
56/// A Path is an array of Steps or Strings
57pub type XMLPath = Vec<XMLPathItem>;
58
59/// Count the number of times a substring appears in a string
60pub fn xml_count_substring(string: &str, substring: &str) -> usize {
61    let re = Regex::new(substring).unwrap();
62    re.find_iter(string).count()
63}
64
65/// Find the first tag with the given name
66pub fn xml_find_tag_by_name(
67    xml: &str,
68    tag_name: &str,
69    options: Option<XMLOptions>,
70) -> Option<XMLTag> {
71    let options = options.unwrap_or_default();
72    let nested = options.nested == Some(true);
73
74    let start_index = options.start_index.unwrap_or(0);
75
76    // Find the starting index of the tag
77    let start = xml_index_of_match(xml, &format!("<{tag_name}[ \n>/]"), start_index);
78    if start == NO_INDEX {
79        return None;
80    }
81
82    let after_start = &xml[start + tag_name.len()..]; // Slice correctly here
83
84    let mut relative_end = xml_index_of_match_end(after_start, "^[^<]*[ /]>", 0);
85    let rel_end_char = after_start.chars().nth(relative_end - 1).unwrap_or('\0');
86    let self_closing = relative_end != NO_INDEX && rel_end_char == '/';
87
88    if !self_closing {
89        // check if tag has subtags with the same name
90        if nested {
91            let mut start_index = 0;
92            let mut openings = 1;
93            let mut closings = 0;
94            while {
95                relative_end =
96                    xml_index_of_match_end(after_start, &format!("[ /]{tag_name}>"), start_index);
97                relative_end != NO_INDEX
98            } {
99                let clip = &after_start[start_index..relative_end + 1];
100                openings += xml_count_substring(clip, &format!("<{tag_name}[ \n\t>]"));
101                closings += xml_count_substring(clip, &format!("</{tag_name}>"));
102                // we can't have more openings than closings
103                if closings >= openings {
104                    break;
105                }
106                start_index = relative_end;
107            }
108        } else {
109            relative_end = xml_index_of_match_end(after_start, &format!("[ /]{tag_name}>"), 0);
110        }
111    }
112
113    let end = start + tag_name.len() + relative_end + 1;
114    if end == NO_INDEX {
115        return None;
116    }
117
118    let outer = &xml[start..end]; // Get the full outer tag
119
120    // Extract inner text if it's not self-closing
121    let inner: Option<String> = if self_closing {
122        None
123    } else {
124        let start_pos = outer.find('>').unwrap_or(0);
125        let end_pos = outer.rfind('<').unwrap_or(outer.len());
126        Some(outer[start_pos + 1..end_pos].into())
127    };
128
129    Some(XMLTag { inner, outer: outer.into(), start, end })
130}
131
132/// Find the first tag with the given path
133pub fn xml_find_tag_by_path(
134    xml: &str,
135    path: &XMLPath,
136    options: Option<XMLOptions>,
137) -> Option<XMLTag> {
138    let found = xml_find_tags_by_path(
139        xml,
140        path,
141        Some(XMLOptions { return_on_first: Some(true), ..options.unwrap_or_default() }),
142    );
143
144    found.into_iter().next()
145}
146
147/// Find all tags with the given name
148///
149/// ## Parameters
150/// - `xml`: the xml string
151/// - `tag_name`: the tag name
152/// - `options`: user defined options
153///
154/// ## Returns
155/// All tags with the given name
156pub fn xml_find_tags_by_name(
157    xml: &str,
158    tag_name: &str,
159    options: Option<XMLOptions>,
160) -> Vec<XMLTag> {
161    let options = options.unwrap_or_default();
162    let nested = options.nested.unwrap_or(true);
163    let mut start_index = options.start_index.unwrap_or(0);
164    let mut tags = vec![];
165    loop {
166        let tag = xml_find_tag_by_name(
167            xml,
168            tag_name,
169            Some(XMLOptions { start_index: Some(start_index), ..options }),
170        );
171        if let Some(tag) = tag {
172            if nested {
173                start_index = tag.start + 1 + tag_name.len();
174            } else {
175                start_index = tag.end;
176            }
177            tags.push(tag);
178        } else {
179            break;
180        }
181    }
182
183    tags
184}
185
186/// Find all tags with the given path
187pub fn xml_find_tags_by_path(
188    xml: &str,
189    path: &XMLPath,
190    options: Option<XMLOptions>,
191) -> Vec<XMLTag> {
192    let options = options.unwrap_or_default();
193    let return_on_first = options.return_on_first.unwrap_or(false);
194
195    if path.is_empty() {
196        return vec![];
197    }
198
199    // Extract the first path step
200    let path0 = match &path[0] {
201        XMLPathItem::String(name) => XMLStep { name: name.clone(), index: None },
202        XMLPathItem::XMLStep(step) => step.clone(),
203    };
204
205    // Find initial tags
206    let mut tags = xml_find_tags_by_name(
207        xml,
208        &path0.name,
209        Some(XMLOptions { nested: Some(false), ..options }),
210    );
211
212    // Apply index filtering if present
213    if let Some(index) = path0.index {
214        tags = tags.get(index).cloned().into_iter().collect();
215    }
216
217    let path = &path[1..];
218
219    for (path_index, part) in path.iter().enumerate() {
220        let part = match part {
221            XMLPathItem::String(name) => XMLStep { name: name.clone(), index: None },
222            XMLPathItem::XMLStep(step) => step.clone(),
223        };
224
225        let mut all_sub_tags = Vec::new();
226
227        for tag in &tags {
228            let mut sub_tags = xml_find_tags_by_name(
229                &tag.outer,
230                &part.name,
231                Some(XMLOptions { start_index: Some(1), ..options }),
232            );
233
234            // Adjust tag start positions
235            for sub_tag in &mut sub_tags {
236                sub_tag.start += tag.start;
237                sub_tag.end += tag.start;
238            }
239
240            // Early return if return_on_first is set
241            if return_on_first && path_index == path.len() - 1 && !sub_tags.is_empty() {
242                return vec![sub_tags.remove(0)];
243            }
244
245            all_sub_tags.extend(sub_tags);
246        }
247
248        tags = all_sub_tags;
249
250        // Apply index filtering at each step if present
251        if let Some(index) = part.index {
252            tags = tags.get(index).cloned().into_iter().collect();
253        }
254    }
255
256    tags
257}
258
259/// Get the value of an attribute
260pub fn xml_get_attribute(tag: &XMLTagItem, attribute_name: &str) -> Option<String> {
261    let xml = match tag {
262        XMLTagItem::String(s) => s,
263        XMLTagItem::XMLTag(t) => &t.outer,
264    };
265
266    // Only search for attributes in the opening tag
267    if let Some(end) = xml.find('>') {
268        let opening = &xml[..=end];
269
270        let quote_chars = ['"', '\''];
271        for &quote in &quote_chars {
272            let pattern = format!(r#"{attribute_name}={quote}([^{quote}]*){quote}"#);
273            let re = Regex::new(&pattern).ok()?;
274            if let Some(captures) = re.captures(opening) {
275                return captures.get(1).map(|m| m.as_str().into());
276            }
277        }
278    }
279    None
280}
281
282/// Find the index of the last match
283pub fn xml_index_of_match_end(xml: &str, pattern: &str, start_index: usize) -> usize {
284    // let mtch: Vec<(usize, &str)> = xml[start_index..].match_indices(pattern).collect();
285    let re = Regex::new(pattern).unwrap();
286    let mtch: Vec<(usize, &str)> = re
287        .captures_iter(&xml[start_index..])
288        .map(|cap| (cap.get(0).unwrap().start(), cap.get(0).unwrap().as_str()))
289        .collect();
290    if !mtch.is_empty() { start_index + mtch[0].0 + mtch[0].1.len() - 1 } else { NO_INDEX }
291}
292
293/// Find the index of the first match
294///
295/// ## Parameters
296/// - `xml`: the xml string
297/// - `pattern`: the pattern
298/// - `start_index`: the start index
299///
300/// ## Returns
301/// The index of the first match
302pub fn xml_index_of_match(xml: &str, pattern: &str, start_index: usize) -> usize {
303    // let mtch: Vec<(usize, &str)> = xml[start_index..].match_indices(pattern).collect();
304    let re = Regex::new(pattern).unwrap();
305    let mtch: Vec<(usize, &str)> = re
306        .captures_iter(&xml[start_index..])
307        .map(|cap| (cap.get(0).unwrap().start(), cap.get(0).unwrap().as_str()))
308        .collect();
309    if !mtch.is_empty() { start_index + mtch[0].0 } else { NO_INDEX }
310}
311
/// Remove comments
///
/// Every `<!-- ... -->` span is dropped. A comment ends at the first `-->` that follows its
/// opener, and the dashes of the opener are never reused as part of the terminator.
///
/// ## Parameters
/// - `xml`: the xml string
///
/// ## Returns
/// The input without its comments. If a comment is never terminated, everything from its
/// opener to the end of the input is removed.
pub fn xml_remove_comments(xml: &str) -> String {
    const OPEN: &str = "<!--";
    const CLOSE: &str = "-->";

    let mut result = String::with_capacity(xml.len());
    let mut rest = xml;

    while let Some(open_at) = rest.find(OPEN) {
        // Keep the text that precedes the comment
        result.push_str(&rest[..open_at]);
        let body = &rest[open_at + OPEN.len()..];
        match body.find(CLOSE) {
            Some(close_at) => rest = &body[close_at + CLOSE.len()..],
            // Unterminated comment: nothing after the opener is kept
            None => return result,
        }
    }

    result.push_str(rest);
    result
}
345
346/// Remove tags
347pub fn xml_remove_tags_by_name(xml: &str, tag_name: &str, options: Option<XMLOptions>) -> String {
348    let mut res: String = xml.into();
349    loop {
350        let tag = xml_find_tag_by_name(&res, tag_name, options.as_ref().cloned());
351        if let Some(tag) = tag {
352            res = format!("{}{}", &res[0..tag.start], &res[tag.end..]);
353        } else {
354            break;
355        }
356    }
357    res
358}