fiscal_core/
xml_utils.rs

1//! Low-level XML building primitives used throughout the crate.
2//!
3//! These utilities are deliberately simple and allocation-efficient: they work
4//! on `&str` slices and return owned `String`s, with no external XML library
5//! dependency.
6
/// Encode the five XML-reserved characters as their predefined entities.
///
/// `&`, `<`, `>`, `"` and `'` become `&amp;`, `&lt;`, `&gt;`, `&quot;` and
/// `&apos;` respectively; every other character is copied through unchanged.
/// The same encoding is used for text content and for attribute values.
///
/// # Examples
///
/// ```
/// use fiscal_core::xml_utils::escape_xml;
/// assert_eq!(escape_xml("Tom & Jerry <cats>"), "Tom &amp; Jerry &lt;cats&gt;");
/// ```
pub fn escape_xml(s: &str) -> String {
    // The output is never shorter than the input, so reserve that much up front.
    let mut escaped = String::with_capacity(s.len());
    for c in s.chars() {
        let entity = match c {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            '"' => "&quot;",
            '\'' => "&apos;",
            _ => {
                // Ordinary character: copy it verbatim and move on.
                escaped.push(c);
                continue;
            }
        };
        escaped.push_str(entity);
    }
    escaped
}
30
/// Return the inner text of the first `<tag_name>…</tag_name>` pair found in
/// a raw XML string.
///
/// The search is purely textual: it looks for the literal opening tag
/// `<tag_name>` (no attributes) and then for the first literal closing tag
/// after it. Namespaced tags, nested tags of the same name and CDATA sections
/// are not handled.
///
/// Returns `None` when either the opening tag or a subsequent closing tag is
/// missing.
///
/// # Examples
///
/// ```
/// use fiscal_core::xml_utils::extract_xml_tag_value;
/// let xml = "<root><cStat>100</cStat></root>";
/// assert_eq!(extract_xml_tag_value(xml, "cStat"), Some("100".to_string()));
/// assert_eq!(extract_xml_tag_value(xml, "missing"), None);
/// ```
pub fn extract_xml_tag_value(xml: &str, tag_name: &str) -> Option<String> {
    let opening = format!("<{tag_name}>");
    let closing = format!("</{tag_name}>");

    // Everything after the first opening tag...
    let (_, after_opening) = xml.split_once(opening.as_str())?;
    // ...up to the first closing tag that follows it.
    let (value, _) = after_opening.split_once(closing.as_str())?;

    Some(value.to_owned())
}
54
55/// Build an XML tag with optional attributes and children.
56///
57/// If children is a string, it is escaped. If children is an array
58/// of pre-built strings, they are concatenated as-is.
59pub fn tag(name: &str, attrs: &[(&str, &str)], children: TagContent<'_>) -> String {
60    let attr_str: String = attrs
61        .iter()
62        .map(|(k, v)| format!(" {k}=\"{}\"", escape_xml(v)))
63        .collect();
64
65    match children {
66        TagContent::None => format!("<{name}{attr_str}></{name}>"),
67        TagContent::Text(text) => {
68            format!("<{name}{attr_str}>{}</{name}>", escape_xml(text))
69        }
70        TagContent::Children(kids) => {
71            let inner: String = kids.into_iter().collect();
72            format!("<{name}{attr_str}>{inner}</{name}>")
73        }
74    }
75}
76
/// Content variants for the [`tag`] builder function.
///
/// Use [`TagContent::None`] for an element without content (rendered as an
/// explicit `<name></name>` pair, not as a self-closing `<name/>`),
/// [`TagContent::Text`] for text nodes (automatically XML-escaped), and
/// [`TagContent::Children`] for pre-built child element strings.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum TagContent<'a> {
    /// Empty element: `<name></name>`.
    None,
    /// Text content (will be XML-escaped): `<name>text</name>`.
    Text(&'a str),
    /// Pre-built child elements concatenated verbatim: `<name><a/><b/></name>`.
    Children(Vec<String>),
}
91
92impl<'a> From<&'a str> for TagContent<'a> {
93    fn from(s: &'a str) -> Self {
94        TagContent::Text(s)
95    }
96}
97
98impl From<Vec<String>> for TagContent<'_> {
99    fn from(v: Vec<String>) -> Self {
100        TagContent::Children(v)
101    }
102}
103
104impl From<String> for TagContent<'_> {
105    fn from(s: String) -> Self {
106        TagContent::Text(Box::leak(s.into_boxed_str()))
107    }
108}
109
/// Pretty-print an XML string by adding indentation.
///
/// This is a lightweight formatter that does not parse XML semantically --
/// it splits the input on `<` / `>` boundaries and inserts newlines and a
/// two-space indentation per nesting level. It is meant for debugging and
/// display purposes and is modelled on the PHP `FakePretty::prettyPrint`
/// output (DOMDocument::formatOutput).
///
/// Formatting rules:
///
/// - `<? … ?>` declarations / processing instructions are printed without
///   indentation and do not affect nesting.
/// - `<!-- … -->` comments, `<![CDATA[ … ]]>` sections and other `<! … >`
///   markup are printed at the current depth and do not affect nesting
///   (they have no closing tag). A `>` inside a comment or CDATA section does
///   not end the token. A DOCTYPE with an internal subset is not supported.
/// - `<tag>text</tag>` is kept on a single line.
/// - Whitespace-only text between tags is dropped; other text is trimmed.
/// - The result has no trailing newline.
///
/// # Examples
///
/// ```
/// use fiscal_core::xml_utils::pretty_print_xml;
/// let compact = "<root><child>text</child></root>";
/// let pretty = pretty_print_xml(compact);
/// assert!(pretty.contains("  <child>"));
/// ```
pub fn pretty_print_xml(xml: &str) -> String {
    let tokens = tokenize_for_pretty_print(xml);

    let mut result = String::with_capacity(xml.len() * 2);
    let mut depth: usize = 0;
    let mut i = 0;

    while i < tokens.len() {
        match &tokens[i] {
            XmlToken::Tag(t) if t.starts_with("<?") => {
                // XML declaration / processing instruction: never indented.
                result.push_str(t);
                result.push('\n');
            }
            XmlToken::Tag(t) if t.starts_with("<!") => {
                // Comment, CDATA or DOCTYPE: there is no matching closing tag,
                // so it must not open a new nesting level.
                push_indented_line(&mut result, depth, &[t.as_str()]);
            }
            XmlToken::Tag(t) if t.starts_with("</") => {
                // Closing tag: dedent first; saturate so stray closers cannot underflow.
                depth = depth.saturating_sub(1);
                push_indented_line(&mut result, depth, &[t.as_str()]);
            }
            XmlToken::Tag(t) if t.ends_with("/>") => {
                // Self-closing tag: no change in depth.
                push_indented_line(&mut result, depth, &[t.as_str()]);
            }
            XmlToken::Tag(t) => {
                // Opening tag. If it is immediately followed by text and a
                // closing tag, keep `<tag>text</tag>` on one line.
                if let (Some(XmlToken::Text(text)), Some(XmlToken::Tag(close))) =
                    (tokens.get(i + 1), tokens.get(i + 2))
                {
                    if close.starts_with("</") {
                        push_indented_line(
                            &mut result,
                            depth,
                            &[t.as_str(), text.as_str(), close.as_str()],
                        );
                        i += 3;
                        continue;
                    }
                }
                push_indented_line(&mut result, depth, &[t.as_str()]);
                depth += 1;
            }
            XmlToken::Text(t) => {
                // Standalone text (mixed content); printed on its own line.
                push_indented_line(&mut result, depth, &[t.as_str()]);
            }
        }
        i += 1;
    }

    // Drop the trailing newline(s) left by the last line.
    let trimmed_len = result.trim_end_matches('\n').len();
    result.truncate(trimmed_len);
    result
}

/// Split raw XML into markup tokens (`<…>`) and trimmed, non-empty text tokens.
///
/// Comments and CDATA sections are scanned up to their own terminators
/// (`-->` / `]]>`) so that a `>` inside them does not split the token.
/// Unterminated markup extends to the end of the input.
fn tokenize_for_pretty_print(xml: &str) -> Vec<XmlToken> {
    let mut tokens = Vec::new();
    let mut rest = xml;

    while !rest.is_empty() {
        let consumed = if rest.starts_with('<') {
            let (opener, terminator) = if rest.starts_with("<!--") {
                ("<!--", "-->")
            } else if rest.starts_with("<![CDATA[") {
                ("<![CDATA[", "]]>")
            } else {
                ("<", ">")
            };
            // Search after the opener so the opener itself can never be
            // mistaken for (part of) the terminator.
            let end = rest[opener.len()..]
                .find(terminator)
                .map_or(rest.len(), |idx| opener.len() + idx + terminator.len());
            tokens.push(XmlToken::Tag(rest[..end].to_string()));
            end
        } else {
            // Text runs until the next '<' (or the end of the input).
            let end = rest.find('<').unwrap_or(rest.len());
            let text = rest[..end].trim();
            if !text.is_empty() {
                tokens.push(XmlToken::Text(text.to_string()));
            }
            end
        };
        rest = &rest[consumed..];
    }

    tokens
}

/// Append one output line: `depth` levels of two-space indentation, the
/// concatenated `parts`, and a newline.
fn push_indented_line(out: &mut String, depth: usize, parts: &[&str]) {
    const INDENT: &str = "  ";
    for _ in 0..depth {
        out.push_str(INDENT);
    }
    for part in parts {
        out.push_str(part);
    }
    out.push('\n');
}

/// Internal token type for XML pretty-printing.
enum XmlToken {
    /// Raw markup including its delimiters, e.g. `<a>`, `</a>`, `<a/>`, `<!-- c -->`.
    Tag(String),
    /// Trimmed, non-empty text found between markup tokens.
    Text(String),
}
232
/// Sanitise a field value so that SEFAZ will not reject it.
///
/// This is a **SEFAZ-level** clean-up, distinct from [`escape_xml`]: instead
/// of entity-encoding every reserved character, most of them are removed.
/// The sequence follows the PHP `Strings::replaceUnacceptableCharacters`
/// from `sped-common`:
///
/// 1. Drop `<`, `>`, `'` and `"`, and replace every `&` with ` & `
///    (space-padded).
/// 2. Collapse each run of whitespace into a single space.
/// 3. Encode every remaining `&` as `&amp;`.
/// 4. Remove `\r`, `\t` and `\n`, then collapse whitespace once more (the
///    "normalize" part of the PHP routine).
/// 5. Remove ASCII control characters (`0x00`–`0x1F`, `0x7F`).
/// 6. Trim leading and trailing whitespace.
///
/// Intended for user-provided field values (e.g. `xJust`, `xCorrecao`,
/// `xPag`) before they are placed into the NF-e XML.
///
/// # Examples
///
/// ```
/// use fiscal_core::xml_utils::replace_unacceptable_characters;
/// assert_eq!(
///     replace_unacceptable_characters("Tom & Jerry <cats>"),
///     "Tom &amp; Jerry cats"
/// );
/// assert_eq!(
///     replace_unacceptable_characters("  hello   world  "),
///     "hello world"
/// );
/// ```
pub fn replace_unacceptable_characters(input: &str) -> String {
    if input.is_empty() {
        return String::new();
    }

    // Single pass: drop angle brackets and quotes, pad ampersands with spaces.
    let mut cleaned = String::with_capacity(input.len());
    for c in input.chars() {
        match c {
            '<' | '>' | '\'' | '"' => {}
            '&' => cleaned.push_str(" & "),
            other => cleaned.push(other),
        }
    }

    // Squeeze whitespace, then encode the only entity that can still occur.
    let encoded = collapse_whitespace(&cleaned).replace('&', "&amp;");

    // "Normalize" stage: strip line breaks / tabs and squeeze again.
    let without_breaks: String = encoded
        .chars()
        .filter(|c| !matches!(c, '\r' | '\t' | '\n'))
        .collect();
    let normalized = collapse_whitespace(&without_breaks);

    // Remove control characters (0x00-0x1F and 0x7F); the space is kept.
    let printable: String = normalized
        .chars()
        .filter(|&c| c == ' ' || !c.is_ascii_control())
        .collect();

    printable.trim().to_string()
}

/// Replace every run of one or more whitespace characters with a single
/// ASCII space.
///
/// Modelled on the PHP `preg_replace('/(?:\s\s+)/', ' ', …)` pattern used
/// throughout `sped-common`. Note that a lone whitespace character (for
/// example a single tab) is also turned into a space.
fn collapse_whitespace(s: &str) -> String {
    let mut squeezed = String::with_capacity(s.len());
    let mut inside_run = false;
    for c in s.chars() {
        let is_ws = c.is_whitespace();
        if !is_ws {
            squeezed.push(c);
        } else if !inside_run {
            // First whitespace of a run: emit the single replacement space.
            squeezed.push(' ');
        }
        inside_run = is_ws;
    }
    squeezed
}
323
324/// Validate an NF-e XML string by checking for the presence of required tags.
325///
326/// This is a lightweight structural validator that checks for mandatory tags
327/// in the NF-e/NFC-e XML. It does **not** perform full XSD schema validation
328/// (which would require shipping XSD files and a full XML schema parser), but
329/// covers the most common errors that would cause SEFAZ rejection.
330///
331/// Validated items:
332/// - Required root structure (`<NFe>`, `<infNFe>`)
333/// - Required `<ide>` fields (cUF, cNF, natOp, mod, serie, nNF, dhEmi, tpNF, etc.)
334/// - Required `<emit>` fields (CNPJ/CPF, xNome, enderEmit, IE, CRT)
335/// - Required `<det>` with at least one item
336/// - Required `<total>` / `<ICMSTot>`
337/// - Required `<transp>` and `<pag>`
338/// - Access key format (44 digits)
339///
340/// # Errors
341///
342/// Returns [`FiscalError::XmlParsing`] with a description of all missing tags.
343///
344/// # Examples
345///
346/// ```
347/// use fiscal_core::xml_utils::validate_xml;
348/// let xml = "<NFe><infNFe>...</infNFe></NFe>";
349/// // Will return an error listing all missing required tags
350/// assert!(validate_xml(xml).is_err());
351/// ```
352pub fn validate_xml(xml: &str) -> Result<(), crate::FiscalError> {
353    let mut errors: Vec<String> = Vec::new();
354
355    // Check root structure
356    let required_structure = [
357        ("NFe", "Elemento raiz <NFe> ausente"),
358        ("infNFe", "Elemento <infNFe> ausente"),
359    ];
360    for (tag_name, msg) in &required_structure {
361        if !xml.contains(&format!("<{tag_name}")) {
362            errors.push(msg.to_string());
363        }
364    }
365
366    // Check IDE required tags
367    let ide_tags = [
368        "cUF", "cNF", "natOp", "mod", "serie", "nNF", "dhEmi", "tpNF", "idDest", "cMunFG", "tpImp",
369        "tpEmis", "cDV", "tpAmb", "finNFe", "indFinal", "indPres", "procEmi", "verProc",
370    ];
371    for tag_name in &ide_tags {
372        if extract_xml_tag_value(xml, tag_name).is_none() {
373            errors.push(format!("Tag obrigatória <{tag_name}> ausente em <ide>"));
374        }
375    }
376
377    // Check emit required tags
378    let emit_required = ["xNome", "IE", "CRT"];
379    for tag_name in &emit_required {
380        if extract_xml_tag_value(xml, tag_name).is_none() {
381            errors.push(format!("Tag obrigatória <{tag_name}> ausente em <emit>"));
382        }
383    }
384    // CNPJ or CPF must be present
385    if extract_xml_tag_value(xml, "CNPJ").is_none() && extract_xml_tag_value(xml, "CPF").is_none() {
386        errors.push("Tag <CNPJ> ou <CPF> ausente em <emit>".to_string());
387    }
388
389    // Check required blocks
390    let required_blocks = [
391        ("enderEmit", "Bloco <enderEmit> ausente"),
392        ("det ", "Nenhum item <det> encontrado"),
393        ("total", "Bloco <total> ausente"),
394        ("ICMSTot", "Bloco <ICMSTot> ausente"),
395        ("transp", "Bloco <transp> ausente"),
396        ("pag", "Bloco <pag> ausente"),
397    ];
398    for (fragment, msg) in &required_blocks {
399        if !xml.contains(&format!("<{fragment}")) {
400            errors.push(msg.to_string());
401        }
402    }
403
404    // Validate access key format (44 digits) from infNFe Id attribute
405    if let Some(id_start) = xml.find("Id=\"NFe") {
406        let after_id = &xml[id_start + 7..];
407        if let Some(quote_end) = after_id.find('"') {
408            let key = &after_id[..quote_end];
409            if key.len() != 44 || !key.chars().all(|c| c.is_ascii_digit()) {
410                errors.push(format!(
411                    "Chave de acesso inválida: esperado 44 dígitos, encontrado '{key}'"
412                ));
413            }
414        }
415    }
416
417    if errors.is_empty() {
418        Ok(())
419    } else {
420        Err(crate::FiscalError::XmlParsing(errors.join("; ")))
421    }
422}
423
/// Strip every character that may not appear in an XML 1.0 document.
///
/// XML 1.0 (Section 2.2) only permits:
///
/// - `#x9` (tab), `#xA` (line feed), `#xD` (carriage return)
/// - `#x20`–`#xD7FF`
/// - `#xE000`–`#xFFFD`
/// - `#x10000`–`#x10FFFF`
///
/// Anything else -- the control characters `\x00`–`\x08`, `\x0B`–`\x0C`,
/// `\x0E`–`\x1F`, and the non-characters `U+FFFE` / `U+FFFF` -- is dropped
/// from the output. All permitted characters are kept in their original order.
///
/// This mirrors the character-level cleaning portion of the PHP
/// `Strings::normalize()` function in `sped-common`.
///
/// # Examples
///
/// ```
/// use fiscal_core::xml_utils::remove_invalid_xml_chars;
/// assert_eq!(remove_invalid_xml_chars("hello\x00world"), "helloworld");
/// assert_eq!(remove_invalid_xml_chars("tab\there"), "tab\there");
/// assert_eq!(remove_invalid_xml_chars("line\nfeed"), "line\nfeed");
/// ```
pub fn remove_invalid_xml_chars(input: &str) -> String {
    input.chars().filter(|&c| is_valid_xml_char(c)).collect()
}

/// Report whether `ch` is allowed by the XML 1.0 `Char` production:
/// `#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]`.
fn is_valid_xml_char(ch: char) -> bool {
    let code = u32::from(ch);
    code == 0x09
        || code == 0x0A
        || code == 0x0D
        || (0x20..=0xD7FF).contains(&code)
        || (0xE000..=0xFFFD).contains(&code)
        || (0x1_0000..=0x10_FFFF).contains(&code)
}
470
/// Tidy an XML string: drop namespace artifacts, remove line breaks and tabs,
/// collapse whitespace between tags, and optionally strip the `<?xml … ?>`
/// declaration.
///
/// Port of the PHP `Strings::clearXmlString()` from `sped-common`. The
/// transformations are applied in this order:
///
/// 1. Remove the `xmlns:default="http://www.w3.org/2000/09/xmldsig#"` attribute.
/// 2. Remove the ` standalone="no"` attribute (including its leading space).
/// 3. Remove `default:` prefixes and `:default` suffixes.
/// 4. Remove every `\n`, `\r` and `\t`.
/// 5. Drop whitespace that sits between a `>` and the next `<`. Whitespace
///    followed by anything else (text, or the end of the string) is kept.
/// 6. If `remove_encoding_tag` is `true`, remove the first `<?xml … ?>`.
///
/// # Examples
///
/// ```
/// use fiscal_core::xml_utils::clear_xml_string;
///
/// let xml = "<root>\n  <child>text</child>\n</root>";
/// assert_eq!(clear_xml_string(xml, false), "<root><child>text</child></root>");
///
/// let xml2 = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root><a>1</a></root>";
/// assert_eq!(clear_xml_string(xml2, true), "<root><a>1</a></root>");
/// ```
pub fn clear_xml_string(input: &str, remove_encoding_tag: bool) -> String {
    // Literal fragments removed wholesale (the PHP `$aFind` array), in order.
    const ARTIFACTS: [&str; 7] = [
        "xmlns:default=\"http://www.w3.org/2000/09/xmldsig#\"",
        " standalone=\"no\"",
        "default:",
        ":default",
        "\n",
        "\r",
        "\t",
    ];
    let stripped = ARTIFACTS
        .iter()
        .fold(input.to_owned(), |acc, artifact| acc.replace(*artifact, ""));

    // Equivalent of preg_replace('/(\>)\s*(\<)/m', '$1$2', …): walk from one
    // '>' to the next and skip the whitespace after it only when a '<' follows.
    let mut compact = String::with_capacity(stripped.len());
    let mut rest = stripped.as_str();
    while let Some(gt) = rest.find('>') {
        let (through_gt, tail) = rest.split_at(gt + 1);
        compact.push_str(through_gt);
        let past_ws = tail.trim_start_matches(|c: char| c.is_ascii_whitespace());
        rest = if past_ws.starts_with('<') {
            past_ws
        } else {
            // Text or end of input follows: the whitespace is significant.
            tail
        };
    }
    compact.push_str(rest);

    if remove_encoding_tag {
        delete_all_between(&compact, "<?xml", "?>")
    } else {
        compact
    }
}

/// Cut out the first span that starts with `beginning` and ends with the next
/// `end` after it, delimiters included.
///
/// The input is returned unchanged when either delimiter cannot be found.
/// Port of PHP `Strings::deleteAllBetween()`.
fn delete_all_between(input: &str, beginning: &str, end: &str) -> String {
    let (before, after_beginning) = match input.split_once(beginning) {
        Some(parts) => parts,
        None => return input.to_string(),
    };
    match after_beginning.split_once(end) {
        Some((_, after_end)) => [before, after_end].concat(),
        None => input.to_string(),
    }
}
570
571#[cfg(test)]
572mod tests {
573    use super::*;
574
575    #[test]
576    fn pretty_print_simple_xml() {
577        let compact = "<root><child>text</child></root>";
578        let pretty = pretty_print_xml(compact);
579        assert!(pretty.contains("<root>"));
580        assert!(pretty.contains("  <child>text</child>"));
581        assert!(pretty.contains("</root>"));
582    }
583
584    #[test]
585    fn pretty_print_nested_xml() {
586        let compact = "<a><b><c>val</c></b></a>";
587        let pretty = pretty_print_xml(compact);
588        let lines: Vec<&str> = pretty.lines().collect();
589        assert_eq!(lines[0], "<a>");
590        assert_eq!(lines[1], "  <b>");
591        assert_eq!(lines[2], "    <c>val</c>");
592        assert_eq!(lines[3], "  </b>");
593        assert_eq!(lines[4], "</a>");
594    }
595
596    #[test]
597    fn pretty_print_with_declaration() {
598        let xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root><a>1</a></root>";
599        let pretty = pretty_print_xml(xml);
600        assert!(pretty.starts_with("<?xml"));
601        assert!(pretty.contains("  <a>1</a>"));
602    }
603
604    #[test]
605    fn pretty_print_empty_input() {
606        let pretty = pretty_print_xml("");
607        assert_eq!(pretty, "");
608    }
609
610    #[test]
611    fn validate_xml_valid_nfe() {
612        let xml = concat!(
613            r#"<NFe><infNFe versao="4.00" Id="NFe41260304123456000190550010000001231123456780">"#,
614            "<ide><cUF>41</cUF><cNF>12345678</cNF><natOp>VENDA</natOp>",
615            "<mod>55</mod><serie>1</serie><nNF>123</nNF>",
616            "<dhEmi>2026-03-11T10:30:00-03:00</dhEmi>",
617            "<tpNF>1</tpNF><idDest>1</idDest><cMunFG>4106902</cMunFG>",
618            "<tpImp>1</tpImp><tpEmis>1</tpEmis><cDV>0</cDV>",
619            "<tpAmb>2</tpAmb><finNFe>1</finNFe><indFinal>1</indFinal>",
620            "<indPres>1</indPres><procEmi>0</procEmi><verProc>1.0</verProc></ide>",
621            "<emit><CNPJ>04123456000190</CNPJ><xNome>Test</xNome>",
622            "<enderEmit><xLgr>Rua</xLgr></enderEmit>",
623            "<IE>9012345678</IE><CRT>3</CRT></emit>",
624            "<det nItem=\"1\"><prod><cProd>001</cProd></prod></det>",
625            "<total><ICMSTot><vNF>150.00</vNF></ICMSTot></total>",
626            "<transp><modFrete>9</modFrete></transp>",
627            "<pag><detPag><tPag>01</tPag><vPag>150.00</vPag></detPag></pag>",
628            "</infNFe></NFe>",
629        );
630        assert!(validate_xml(xml).is_ok());
631    }
632
633    #[test]
634    fn validate_xml_missing_tags() {
635        let xml = "<root><something>val</something></root>";
636        let err = validate_xml(xml).unwrap_err();
637        let msg = err.to_string();
638        assert!(msg.contains("NFe"));
639        assert!(msg.contains("infNFe"));
640    }
641
642    #[test]
643    fn validate_xml_invalid_access_key() {
644        let xml = concat!(
645            r#"<NFe><infNFe versao="4.00" Id="NFe123">"#,
646            "<ide><cUF>41</cUF><cNF>12345678</cNF><natOp>VENDA</natOp>",
647            "<mod>55</mod><serie>1</serie><nNF>123</nNF>",
648            "<dhEmi>2026-03-11T10:30:00-03:00</dhEmi>",
649            "<tpNF>1</tpNF><idDest>1</idDest><cMunFG>4106902</cMunFG>",
650            "<tpImp>1</tpImp><tpEmis>1</tpEmis><cDV>0</cDV>",
651            "<tpAmb>2</tpAmb><finNFe>1</finNFe><indFinal>1</indFinal>",
652            "<indPres>1</indPres><procEmi>0</procEmi><verProc>1.0</verProc></ide>",
653            "<emit><CNPJ>04123456000190</CNPJ><xNome>Test</xNome>",
654            "<enderEmit><xLgr>Rua</xLgr></enderEmit>",
655            "<IE>9012345678</IE><CRT>3</CRT></emit>",
656            "<det nItem=\"1\"><prod><cProd>001</cProd></prod></det>",
657            "<total><ICMSTot><vNF>150.00</vNF></ICMSTot></total>",
658            "<transp><modFrete>9</modFrete></transp>",
659            "<pag><detPag><tPag>01</tPag><vPag>150.00</vPag></detPag></pag>",
660            "</infNFe></NFe>",
661        );
662        let err = validate_xml(xml).unwrap_err();
663        let msg = err.to_string();
664        assert!(msg.contains("Chave de acesso"));
665    }
666
667    // ── remove_invalid_xml_chars tests ──────────────────────────────────
668
669    #[test]
670    fn remove_invalid_xml_chars_preserves_valid_text() {
671        assert_eq!(remove_invalid_xml_chars("Hello, World!"), "Hello, World!");
672    }
673
674    #[test]
675    fn remove_invalid_xml_chars_preserves_tab_lf_cr() {
676        // \x09 (tab), \x0A (line feed), \x0D (carriage return) are valid
677        assert_eq!(
678            remove_invalid_xml_chars("a\x09b\x0Ac\x0Dd"),
679            "a\x09b\x0Ac\x0Dd"
680        );
681    }
682
683    #[test]
684    fn remove_invalid_xml_chars_strips_null_and_low_controls() {
685        // \x00 through \x08 are invalid
686        assert_eq!(
687            remove_invalid_xml_chars("\x00\x01\x02\x03\x04\x05\x06\x07\x08hello"),
688            "hello"
689        );
690    }
691
692    #[test]
693    fn remove_invalid_xml_chars_strips_0b_0c() {
694        // \x0B (vertical tab) and \x0C (form feed) are invalid
695        assert_eq!(remove_invalid_xml_chars("a\x0Bb\x0Cc"), "abc");
696    }
697
698    #[test]
699    fn remove_invalid_xml_chars_strips_0e_to_1f() {
700        // \x0E through \x1F are invalid
701        let mut input = String::from("ok");
702        for byte in 0x0Eu8..=0x1F {
703            input.push(byte as char);
704        }
705        input.push_str("end");
706        assert_eq!(remove_invalid_xml_chars(&input), "okend");
707    }
708
709    #[test]
710    fn remove_invalid_xml_chars_strips_del() {
        // Despite this test's name, DEL (U+007F) is NOT stripped: it lies inside
        // the #x20-#xD7FF range, so XML 1.0 accepts it as a valid character and
        // the implementation, which follows the spec exactly, must preserve it.
718        assert_eq!(remove_invalid_xml_chars("a\x7Fb"), "a\x7Fb");
719    }
720
721    #[test]
722    fn remove_invalid_xml_chars_strips_fffe_ffff() {
723        // U+FFFE and U+FFFF are invalid
724        let input = format!("a{}b{}c", '\u{FFFE}', '\u{FFFF}');
725        assert_eq!(remove_invalid_xml_chars(&input), "abc");
726    }
727
728    #[test]
729    fn remove_invalid_xml_chars_preserves_bmp_and_supplementary() {
730        // Valid BMP characters (accented, CJK, etc.)
731        assert_eq!(
732            remove_invalid_xml_chars("café résumé 日本語"),
733            "café résumé 日本語"
734        );
735        // Valid supplementary plane characters (emoji, etc.)
736        let input = "hello \u{1F600} world"; // U+1F600 is valid (in #x10000-#x10FFFF)
737        assert_eq!(remove_invalid_xml_chars(input), input);
738    }
739
740    #[test]
741    fn remove_invalid_xml_chars_preserves_private_use_area() {
742        // U+E000-U+FFFD is valid
743        let input = "a\u{E000}b\u{FFFD}c";
744        assert_eq!(remove_invalid_xml_chars(input), input);
745    }
746
747    #[test]
748    fn remove_invalid_xml_chars_empty_string() {
749        assert_eq!(remove_invalid_xml_chars(""), "");
750    }
751
752    #[test]
753    fn remove_invalid_xml_chars_all_invalid() {
754        assert_eq!(remove_invalid_xml_chars("\x00\x01\x02\x03"), "");
755    }
756
757    #[test]
758    fn remove_invalid_xml_chars_mixed_xml_content() {
759        let input = "<tag>val\x00ue with \x0Bcontrol\x1F chars</tag>";
760        assert_eq!(
761            remove_invalid_xml_chars(input),
762            "<tag>value with control chars</tag>"
763        );
764    }
765
766    // ── clear_xml_string tests ──────────────────────────────────────────
767
768    #[test]
769    fn clear_xml_string_removes_whitespace_between_tags() {
770        let xml = "<root>\n  <child>text</child>\n</root>";
771        assert_eq!(
772            clear_xml_string(xml, false),
773            "<root><child>text</child></root>"
774        );
775    }
776
777    #[test]
778    fn clear_xml_string_removes_tabs_cr_lf() {
779        let xml = "<a>\t<b>\r\n<c>val</c>\n</b>\n</a>";
780        assert_eq!(clear_xml_string(xml, false), "<a><b><c>val</c></b></a>");
781    }
782
783    #[test]
784    fn clear_xml_string_removes_default_namespace() {
785        // Note: removing the xmlns attribute leaves a trailing space before '>',
786        // matching PHP str_replace behaviour exactly.
787        let xml = "<Signature xmlns:default=\"http://www.w3.org/2000/09/xmldsig#\"><default:SignedInfo>data</default:SignedInfo></Signature>";
788        assert_eq!(
789            clear_xml_string(xml, false),
790            "<Signature ><SignedInfo>data</SignedInfo></Signature>"
791        );
792    }
793
794    #[test]
795    fn clear_xml_string_removes_standalone_no() {
796        let xml = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?><root/>";
797        assert_eq!(
798            clear_xml_string(xml, false),
799            "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root/>"
800        );
801    }
802
803    #[test]
804    fn clear_xml_string_removes_encoding_tag() {
805        let xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root><a>1</a></root>";
806        assert_eq!(clear_xml_string(xml, true), "<root><a>1</a></root>");
807    }
808
809    #[test]
810    fn clear_xml_string_preserves_without_encoding_tag() {
811        let xml = "<?xml version=\"1.0\"?><root><a>1</a></root>";
812        assert_eq!(
813            clear_xml_string(xml, false),
814            "<?xml version=\"1.0\"?><root><a>1</a></root>"
815        );
816    }
817
818    #[test]
819    fn clear_xml_string_no_encoding_tag_present() {
820        let xml = "<root><a>1</a></root>";
821        assert_eq!(clear_xml_string(xml, true), "<root><a>1</a></root>");
822    }
823
824    #[test]
825    fn clear_xml_string_empty_input() {
826        assert_eq!(clear_xml_string("", false), "");
827        assert_eq!(clear_xml_string("", true), "");
828    }
829
830    #[test]
831    fn clear_xml_string_preserves_text_content_spaces() {
832        // Spaces inside text content (not between tags) should be preserved
833        let xml = "<tag>hello world</tag>";
834        assert_eq!(clear_xml_string(xml, false), "<tag>hello world</tag>");
835    }
836
837    #[test]
838    fn clear_xml_string_collapses_multiple_spaces_between_tags() {
839        let xml = "<a>   <b>text</b>   </a>";
840        assert_eq!(clear_xml_string(xml, false), "<a><b>text</b></a>");
841    }
842
843    #[test]
844    fn clear_xml_string_removes_colon_default_suffix() {
845        let xml = "<Signature:default><data/></Signature:default>";
846        assert_eq!(
847            clear_xml_string(xml, false),
848            "<Signature><data/></Signature>"
849        );
850    }
851
852    // ----- replace_unacceptable_characters tests -----
853
854    #[test]
855    fn replace_unacceptable_empty() {
856        assert_eq!(replace_unacceptable_characters(""), "");
857    }
858
859    #[test]
860    fn replace_unacceptable_plain_text() {
861        assert_eq!(
862            replace_unacceptable_characters("Venda de mercadorias"),
863            "Venda de mercadorias"
864        );
865    }
866
867    #[test]
868    fn replace_unacceptable_removes_angle_brackets() {
869        assert_eq!(replace_unacceptable_characters("foo<bar>baz"), "foobarbaz");
870    }
871
872    #[test]
873    fn replace_unacceptable_ampersand_encoding() {
874        assert_eq!(replace_unacceptable_characters("A&B"), "A &amp; B");
875    }
876
877    #[test]
878    fn replace_unacceptable_removes_quotes() {
879        assert_eq!(
880            replace_unacceptable_characters(r#"It's a "test""#),
881            "Its a test"
882        );
883    }
884
885    #[test]
886    fn replace_unacceptable_collapses_whitespace() {
887        assert_eq!(
888            replace_unacceptable_characters("hello    world"),
889            "hello world"
890        );
891    }
892
893    #[test]
894    fn replace_unacceptable_trims() {
895        assert_eq!(replace_unacceptable_characters("  hello  "), "hello");
896    }
897
898    #[test]
899    fn replace_unacceptable_removes_control_chars() {
900        assert_eq!(
901            replace_unacceptable_characters("abc\x00\x01\x02def"),
902            "abcdef"
903        );
904    }
905
906    #[test]
907    fn replace_unacceptable_removes_cr_lf_tab() {
908        assert_eq!(
909            replace_unacceptable_characters("line1\r\n\tline2"),
910            "line1 line2"
911        );
912    }
913
914    #[test]
915    fn replace_unacceptable_combined() {
916        assert_eq!(
917            replace_unacceptable_characters(
918                "  Cancelamento <por>  erro & \"duplicidade\"  na emissão\t\n  "
919            ),
920            "Cancelamento por erro &amp; duplicidade na emissão"
921        );
922    }
923
924    #[test]
925    fn replace_unacceptable_ampersand_already_spaced() {
926        assert_eq!(replace_unacceptable_characters("A & B"), "A &amp; B");
927    }
928
929    #[test]
930    fn replace_unacceptable_multiple_ampersands() {
931        assert_eq!(
932            replace_unacceptable_characters("A&B&C"),
933            "A &amp; B &amp; C"
934        );
935    }
936
937    #[test]
938    fn replace_unacceptable_preserves_accented_chars() {
939        assert_eq!(
940            replace_unacceptable_characters("São Paulo — café"),
941            "São Paulo — café"
942        );
943    }
944
945    #[test]
946    fn replace_unacceptable_only_special_chars() {
947        assert_eq!(replace_unacceptable_characters("<>\"'"), "");
948    }
949
950    #[test]
951    fn replace_unacceptable_del_char() {
952        assert_eq!(replace_unacceptable_characters("abc\x7Fdef"), "abcdef");
953    }
954
955    // ── TagContent::from impls ─────────────────────────────────────
956
957    #[test]
958    fn tag_content_from_string() {
959        let content: TagContent = String::from("hello").into();
960        match content {
961            TagContent::Text(t) => assert_eq!(t, "hello"),
962            _ => panic!("expected Text"),
963        }
964    }
965
966    #[test]
967    fn tag_content_from_vec_string() {
968        let content: TagContent = vec!["<a/>".to_string(), "<b/>".to_string()].into();
969        match content {
970            TagContent::Children(kids) => assert_eq!(kids.len(), 2),
971            _ => panic!("expected Children"),
972        }
973    }
974
975    // ── pretty_print_xml self-closing ──────────────────────────────
976
977    #[test]
978    fn pretty_print_self_closing_tag() {
979        let xml = "<root><empty/></root>";
980        let pretty = pretty_print_xml(xml);
981        assert!(pretty.contains("  <empty/>"));
982    }
983
984    #[test]
985    fn pretty_print_standalone_text() {
986        // This is unusual XML but the formatter should handle it
987        let xml = "<root><a><b>text</b></a></root>";
988        let pretty = pretty_print_xml(xml);
989        assert!(pretty.contains("    <b>text</b>"));
990    }
991
992    // ── clear_xml_string preserves trailing ws after non-tag ───────
993
994    #[test]
995    fn clear_xml_string_non_tag_after_whitespace() {
996        // After '>' if the next non-whitespace is NOT '<', preserve whitespace
997        let xml = "<a>text after close</a>";
998        let result = clear_xml_string(xml, false);
999        assert_eq!(result, "<a>text after close</a>");
1000    }
1001
1002    // ── delete_all_between ────────────────────────────────────────
1003
1004    #[test]
1005    fn delete_all_between_no_match() {
1006        let result = delete_all_between("hello world", "<?xml", "?>");
1007        assert_eq!(result, "hello world");
1008    }
1009
1010    #[test]
1011    fn delete_all_between_no_end_match() {
1012        let result = delete_all_between("<?xml version start", "<?xml", "?>");
1013        assert_eq!(result, "<?xml version start");
1014    }
1015}
fiscal_core/xml_utils.rs

fiscal_core/
xml_utils.rs