Skip to main content

serdes_ai_core/
format.rs

1//! Format data as XML for LLM prompts.
2//!
3//! LLMs often work better with XML-formatted data for structured examples.
4//! This module provides utilities to convert serializable Rust types into
5//! XML format suitable for inclusion in prompts.
6//!
7//! # Example
8//!
9//! ```rust
10//! use serdes_ai_core::format::{format_as_xml, XmlFormatOptions};
11//!
12//! let data = serde_json::json!({
13//!     "name": "John",
14//!     "age": 30,
15//!     "hobbies": ["reading", "coding"]
16//! });
17//!
18//! let xml = format_as_xml(&data, Some("user")).unwrap();
19//! // <user>
20//! //   <name>John</name>
21//! //   <age>30</age>
22//! //   <hobbies>
23//! //     <item>reading</item>
24//! //     <item>coding</item>
25//! //   </hobbies>
26//! // </user>
27//! ```
28
29use serde::Serialize;
30use thiserror::Error;
31
/// Configuration controlling how values are rendered as XML.
///
/// Build one with [`XmlFormatOptions::new`] (or [`Default::default`]) and
/// adjust individual settings through the `with_*` builder methods.
#[derive(Debug, Clone)]
pub struct XmlFormatOptions {
    /// Name of the element wrapping the whole output. With `None` the
    /// content is emitted without any wrapper element.
    pub root_tag: Option<String>,
    /// Element name used for every entry of a sequence.
    pub item_tag: String,
    /// Text written in place of a null / `None` value.
    pub none_str: String,
    /// Indentation unit, repeated once per nesting level. With `None` the
    /// output is compact: no indentation and no line breaks.
    pub indent: Option<String>,
}
44
45impl Default for XmlFormatOptions {
46    fn default() -> Self {
47        Self {
48            root_tag: None,
49            item_tag: "item".to_string(),
50            none_str: "null".to_string(),
51            indent: Some("  ".to_string()),
52        }
53    }
54}
55
56impl XmlFormatOptions {
57    /// Create new options with default values.
58    #[must_use]
59    pub fn new() -> Self {
60        Self::default()
61    }
62
63    /// Set the root tag.
64    #[must_use]
65    pub fn with_root_tag(mut self, tag: impl Into<String>) -> Self {
66        self.root_tag = Some(tag.into());
67        self
68    }
69
70    /// Set the item tag for sequences.
71    #[must_use]
72    pub fn with_item_tag(mut self, tag: impl Into<String>) -> Self {
73        self.item_tag = tag.into();
74        self
75    }
76
77    /// Set the none string representation.
78    #[must_use]
79    pub fn with_none_str(mut self, s: impl Into<String>) -> Self {
80        self.none_str = s.into();
81        self
82    }
83
84    /// Set the indentation string. None for compact output.
85    #[must_use]
86    pub fn with_indent(mut self, indent: Option<String>) -> Self {
87        self.indent = indent;
88        self
89    }
90
91    /// Disable indentation for compact output.
92    #[must_use]
93    pub fn compact(mut self) -> Self {
94        self.indent = None;
95        self
96    }
97}
98
99/// Error type for XML formatting operations.
100#[derive(Debug, Error)]
101pub enum XmlFormatError {
102    /// Serialization error when converting to JSON intermediate.
103    #[error("Serialization error: {0}")]
104    Serialization(#[from] serde_json::Error),
105}
106
107/// Format a serializable value as XML.
108///
109/// This is the simple API that uses default options with an optional root tag.
110///
111/// # Arguments
112///
113/// * `value` - Any serializable value
114/// * `root_tag` - Optional root tag name to wrap the output
115///
116/// # Example
117///
118/// ```rust
119/// use serdes_ai_core::format::format_as_xml;
120///
121/// let data = serde_json::json!({
122///     "name": "Alice",
123///     "age": 25
124/// });
125///
126/// let xml = format_as_xml(&data, Some("person")).unwrap();
127/// assert!(xml.contains("<person>"));
128/// assert!(xml.contains("<name>Alice</name>"));
129/// ```
130pub fn format_as_xml<T: Serialize>(
131    value: &T,
132    root_tag: Option<&str>,
133) -> Result<String, XmlFormatError> {
134    let options = XmlFormatOptions {
135        root_tag: root_tag.map(String::from),
136        ..Default::default()
137    };
138    format_as_xml_with_options(value, &options)
139}
140
141/// Format with full options control.
142///
143/// # Arguments
144///
145/// * `value` - Any serializable value
146/// * `options` - Formatting options
147///
148/// # Example
149///
150/// ```rust
151/// use serdes_ai_core::format::{format_as_xml_with_options, XmlFormatOptions};
152///
153/// let data = vec!["apple", "banana", "cherry"];
154///
155/// let options = XmlFormatOptions::new()
156///     .with_root_tag("fruits")
157///     .with_item_tag("fruit");
158///
159/// let xml = format_as_xml_with_options(&data, &options).unwrap();
160/// assert!(xml.contains("<fruit>apple</fruit>"));
161/// ```
162pub fn format_as_xml_with_options<T: Serialize>(
163    value: &T,
164    options: &XmlFormatOptions,
165) -> Result<String, XmlFormatError> {
166    // Convert to serde_json::Value first for uniform handling
167    let json_value = serde_json::to_value(value)?;
168
169    let mut output = String::new();
170
171    if let Some(ref root_tag) = options.root_tag {
172        // With root tag, wrap the content
173        output.push_str(&format!("<{root_tag}>"));
174        if options.indent.is_some() {
175            output.push('\n');
176        }
177        value_to_xml_inner(&json_value, options, 1, &mut output);
178        output.push_str(&format!("</{root_tag}>"));
179    } else {
180        // No root tag, just output the content
181        value_to_xml_inner(&json_value, options, 0, &mut output);
182    }
183
184    Ok(output)
185}
186
187/// Internal function to convert JSON value to XML string.
188fn value_to_xml_inner(
189    value: &serde_json::Value,
190    options: &XmlFormatOptions,
191    depth: usize,
192    output: &mut String,
193) {
194    let indent = get_indent(options, depth);
195
196    match value {
197        serde_json::Value::Null => {
198            output.push_str(&options.none_str);
199        }
200        serde_json::Value::Bool(b) => {
201            output.push_str(if *b { "true" } else { "false" });
202        }
203        serde_json::Value::Number(n) => {
204            output.push_str(&n.to_string());
205        }
206        serde_json::Value::String(s) => {
207            output.push_str(&escape_xml(s));
208        }
209        serde_json::Value::Array(arr) => {
210            for item in arr {
211                output.push_str(&indent);
212                output.push_str(&format!("<{}>", options.item_tag));
213
214                if is_complex_value(item) {
215                    if options.indent.is_some() {
216                        output.push('\n');
217                    }
218                    value_to_xml_inner(item, options, depth + 1, output);
219                    output.push_str(&indent);
220                } else {
221                    value_to_xml_inner(item, options, depth + 1, output);
222                }
223
224                output.push_str(&format!("</{}>", options.item_tag));
225                if options.indent.is_some() {
226                    output.push('\n');
227                }
228            }
229        }
230        serde_json::Value::Object(map) => {
231            for (key, val) in map {
232                let tag = sanitize_tag_name(key);
233                output.push_str(&indent);
234                output.push_str(&format!("<{tag}>"));
235
236                if is_complex_value(val) {
237                    if options.indent.is_some() {
238                        output.push('\n');
239                    }
240                    value_to_xml_inner(val, options, depth + 1, output);
241                    output.push_str(&indent);
242                } else {
243                    value_to_xml_inner(val, options, depth + 1, output);
244                }
245
246                output.push_str(&format!("</{tag}>"));
247                if options.indent.is_some() {
248                    output.push('\n');
249                }
250            }
251        }
252    }
253}
254
255/// Check if a value is complex (object or array) and needs nested formatting.
256fn is_complex_value(value: &serde_json::Value) -> bool {
257    matches!(
258        value,
259        serde_json::Value::Object(_) | serde_json::Value::Array(_)
260    )
261}
262
263/// Get the indentation string for a given depth.
264fn get_indent(options: &XmlFormatOptions, depth: usize) -> String {
265    options
266        .indent
267        .as_ref()
268        .map(|i| i.repeat(depth))
269        .unwrap_or_default()
270}
271
/// Sanitize a string to be a valid XML tag name.
///
/// The result obeys these rules (ASCII only; any other character, including
/// non-ASCII letters, is replaced by an underscore):
/// - It starts with a letter or underscore. A leading digit is kept but
///   prefixed with `_`; any other invalid leading character becomes `_`.
/// - Later characters are letters, digits, hyphens, underscores or periods.
/// - It does not start with "xml" (case-insensitive), a prefix XML reserves;
///   such names are prefixed with `_`.
/// - It is never empty: an empty input yields `_`.
fn sanitize_tag_name(name: &str) -> String {
    // One spare byte for the underscore that may be prepended.
    let mut result = String::with_capacity(name.len() + 1);

    for (i, c) in name.chars().enumerate() {
        let allowed = if i == 0 {
            c.is_ascii_alphabetic() || c == '_'
        } else {
            c.is_ascii_alphanumeric() || matches!(c, '-' | '_' | '.')
        };

        if allowed {
            result.push(c);
        } else {
            result.push('_');
            // A leading digit is legal anywhere but the first position, so
            // keep it behind the inserted underscore instead of dropping it.
            if i == 0 && c.is_ascii_alphanumeric() {
                result.push(c);
            }
        }
    }

    if result.is_empty() {
        return "_".to_string();
    }

    // `result` holds only ASCII at this point, so slicing by byte index is
    // safe; `get` simply returns `None` for names shorter than three bytes.
    if matches!(result.get(..3), Some(prefix) if prefix.eq_ignore_ascii_case("xml")) {
        result.insert(0, '_');
    }

    result
}
309
/// Replace the XML-reserved characters `&`, `<`, `>`, `"` and `'` in `s`
/// with their predefined entities; every other character is copied as is.
fn escape_xml(s: &str) -> String {
    // Entity for a reserved character, or `None` for an ordinary one.
    fn entity_for(ch: char) -> Option<&'static str> {
        Some(match ch {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            '"' => "&quot;",
            '\'' => "&apos;",
            _ => return None,
        })
    }

    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        if let Some(entity) = entity_for(ch) {
            escaped.push_str(entity);
        } else {
            escaped.push(ch);
        }
    }
    escaped
}
325
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Assert that every fragment occurs somewhere in `xml`.
    fn assert_contains_all(xml: &str, fragments: &[&str]) {
        for fragment in fragments {
            assert!(
                xml.contains(*fragment),
                "expected {fragment:?} in:\n{xml}"
            );
        }
    }

    #[test]
    fn test_simple_object() {
        let xml = format_as_xml(&json!({ "name": "John", "age": 30 }), Some("user")).unwrap();

        assert_contains_all(
            &xml,
            &["<user>", "</user>", "<name>John</name>", "<age>30</age>"],
        );
    }

    #[test]
    fn test_nested_object() {
        let address = json!({ "city": "NYC" });
        let data = json!({ "person": { "name": "Alice", "address": address } });

        let xml = format_as_xml(&data, Some("root")).unwrap();

        assert_contains_all(&xml, &["<city>NYC</city>"]);
    }

    #[test]
    fn test_array() {
        let data = json!({ "hobbies": ["reading", "coding", "gaming"] });

        let xml = format_as_xml(&data, None).unwrap();

        assert_contains_all(
            &xml,
            &[
                "<item>reading</item>",
                "<item>coding</item>",
                "<item>gaming</item>",
            ],
        );
    }

    #[test]
    fn test_custom_item_tag() {
        let options = XmlFormatOptions::new()
            .with_root_tag("fruits")
            .with_item_tag("fruit");

        let xml = format_as_xml_with_options(&vec!["apple", "banana"], &options).unwrap();

        assert_contains_all(&xml, &["<fruit>apple</fruit>", "<fruit>banana</fruit>"]);
    }

    #[test]
    fn test_compact_output() {
        let options = XmlFormatOptions::new().with_root_tag("data").compact();

        let xml = format_as_xml_with_options(&json!({ "a": 1, "b": 2 }), &options).unwrap();

        // Compact output must not contain any line break.
        assert!(!xml.contains('\n'));
    }

    #[test]
    fn test_null_value() {
        let xml = format_as_xml(&json!({ "value": null }), None).unwrap();

        assert_contains_all(&xml, &["<value>null</value>"]);
    }

    #[test]
    fn test_custom_none_str() {
        let options = XmlFormatOptions::new().with_none_str("N/A");

        let xml = format_as_xml_with_options(&json!({ "value": null }), &options).unwrap();

        assert_contains_all(&xml, &["<value>N/A</value>"]);
    }

    #[test]
    fn test_boolean_values() {
        let xml = format_as_xml(&json!({ "active": true, "disabled": false }), None).unwrap();

        assert_contains_all(
            &xml,
            &["<active>true</active>", "<disabled>false</disabled>"],
        );
    }

    #[test]
    fn test_xml_escape() {
        let data = json!({ "text": "<script>alert('xss')</script>" });

        let xml = format_as_xml(&data, None).unwrap();

        assert_contains_all(&xml, &["&lt;script&gt;", "&apos;"]);
    }

    #[test]
    fn test_sanitize_tag_name() {
        let cases = [
            ("valid_name", "valid_name"),
            ("123start", "_123start"),
            ("has space", "has_space"),
            ("special@char", "special_char"),
            ("", "_"),
        ];

        for (input, expected) in cases {
            assert_eq!(sanitize_tag_name(input), expected);
        }
    }

    #[test]
    fn test_escape_xml() {
        let cases = [
            ("hello", "hello"),
            ("a & b", "a &amp; b"),
            ("<tag>", "&lt;tag&gt;"),
            ("\"quoted\"", "&quot;quoted&quot;"),
            ("it's", "it&apos;s"),
        ];

        for (input, expected) in cases {
            assert_eq!(escape_xml(input), expected);
        }
    }

    #[test]
    fn test_no_root_tag() {
        let xml = format_as_xml(&json!({ "key": "value" }), None).unwrap();

        assert_contains_all(&xml, &["<key>value</key>"]);
        // A missing root tag must not be rendered as a literal wrapper.
        assert!(!xml.starts_with("<None"));
    }

    #[test]
    fn test_complex_nested_structure() {
        let users = json!([
            { "name": "Alice", "roles": ["admin", "user"] },
            { "name": "Bob", "roles": ["user"] }
        ]);
        let metadata = json!({ "version": "1.0", "count": 2 });
        let data = json!({ "users": users, "metadata": metadata });

        let xml = format_as_xml(&data, Some("response")).unwrap();

        assert_contains_all(
            &xml,
            &[
                "<response>",
                "</response>",
                "<name>Alice</name>",
                "<version>1.0</version>",
            ],
        );
    }

    #[test]
    fn test_options_builder() {
        let options = XmlFormatOptions::new()
            .with_root_tag("root")
            .with_item_tag("entry")
            .with_none_str("nil")
            .with_indent(Some("    ".to_string()));

        assert_eq!(options.root_tag.as_deref(), Some("root"));
        assert_eq!(options.item_tag, "entry");
        assert_eq!(options.none_str, "nil");
        assert_eq!(options.indent.as_deref(), Some("    "));
    }

    #[test]
    fn test_struct_serialization() {
        #[derive(Serialize)]
        struct Person {
            name: String,
            age: u32,
        }

        let person = Person {
            name: String::from("Charlie"),
            age: 35,
        };

        let xml = format_as_xml(&person, Some("person")).unwrap();

        assert_contains_all(&xml, &["<name>Charlie</name>", "<age>35</age>"]);
    }
}
507        let xml = format_as_xml(&person, Some("person")).unwrap();
508        assert!(xml.contains("<name>Charlie</name>"));
509        assert!(xml.contains("<age>35</age>"));
510    }
511}