use crate::core::item::{ItemReader, ItemReaderResult};
use crate::error::BatchError;
use log::{debug, error};
use quick_xml::de::from_str;
use quick_xml::events::Event;
use quick_xml::reader::Reader as XmlReader;
use serde::de::DeserializeOwned;
use std::any::type_name;
use std::cell::RefCell;
use std::fs::File;
use std::io::{BufReader, Read};
use std::marker::PhantomData;
use std::path::Path;
use std::str;
/// Fluent builder for [`XmlItemReader`].
///
/// `I` is the item type that each matching XML element is deserialized into.
pub struct XmlItemReaderBuilder<I>
where
    I: DeserializeOwned,
{
    /// Element name that delimits one item; `None` means "derive it from the
    /// type name of `I`" when the reader is built.
    tag_name: Option<String>,
    /// Capacity, in bytes, of the `BufReader` wrapped around the input.
    capacity: usize,
    /// Ties the builder to `I` without storing a value of that type.
    _marker: PhantomData<I>,
}
impl<I: DeserializeOwned> Default for XmlItemReaderBuilder<I> {
fn default() -> Self {
Self {
tag_name: None,
capacity: 1024,
_marker: PhantomData,
}
}
}
impl<I: DeserializeOwned> XmlItemReaderBuilder<I> {
pub fn new() -> Self {
Self::default()
}
pub fn capacity(mut self, capacity: usize) -> Self {
self.capacity = capacity;
self
}
pub fn tag<S: AsRef<str>>(mut self, tag_name: S) -> Self {
self.tag_name = Some(tag_name.as_ref().to_string());
self
}
pub fn from_reader<R: Read + 'static>(self, reader: R) -> XmlItemReader<R, I> {
let tag = match self.tag_name {
Some(tag) => tag.into_bytes(),
None => {
let type_str = type_name::<I>();
let tag_name = type_str.split("::").last().unwrap_or(type_str);
tag_name.as_bytes().to_vec()
}
};
XmlItemReader::with_tag(reader, self.capacity, tag)
}
pub fn from_path<P: AsRef<Path>>(self, path: P) -> Result<XmlItemReader<File, I>, BatchError> {
let file_path = path.as_ref();
let file = File::open(file_path).map_err(|e| {
error!("Failed to open XML file {}: {}", file_path.display(), e);
BatchError::ItemReader(format!(
"Failed to open XML file {}: {}",
file_path.display(),
e
))
})?;
Ok(self.from_reader(file))
}
}
/// Streaming reader that yields one deserialized `I` per matching XML element
/// read from `R`.
///
/// The parser and its scratch buffer sit behind `RefCell` because
/// `ItemReader::read` takes `&self` but has to mutate both.
pub struct XmlItemReader<R, I> {
// quick-xml event reader over the buffered input.
reader: RefCell<XmlReader<BufReader<R>>>,
// Scratch buffer handed to `read_event_into`; cleared before every event.
buffer: RefCell<Vec<u8>>,
// Raw bytes of the element name that delimits one item.
item_tag_name: Vec<u8>,
// Ties the reader to `I` without storing a value of that type.
_marker: PhantomData<I>,
}
impl<R: Read, I: DeserializeOwned> XmlItemReader<R, I> {
    /// Creates a reader over `rdr` that treats every `tag` element as one item.
    ///
    /// `capacity` is the size in bytes of the internal `BufReader`. A value of
    /// zero is raised to one: a zero-capacity `BufReader` always returns an
    /// empty slice from `fill_buf`, which the XML parser interprets as end of
    /// input, so no item would ever be read.
    fn with_tag<S: AsRef<[u8]>>(rdr: R, capacity: usize, tag: S) -> Self {
        let buf_reader = BufReader::with_capacity(capacity.max(1), rdr);
        let mut xml_reader = XmlReader::from_reader(buf_reader);
        // Keep text exactly as written; the deserializer decides what to trim.
        xml_reader.config_mut().trim_text(false);
        Self {
            reader: RefCell::new(xml_reader),
            buffer: RefCell::new(Vec::with_capacity(1024)),
            item_tag_name: tag.as_ref().to_vec(),
            _marker: PhantomData,
        }
    }
}
impl<R: Read, I: DeserializeOwned> ItemReader<I> for XmlItemReader<R, I> {
fn read(&self) -> ItemReaderResult<I> {
let mut reader = self.reader.borrow_mut();
let mut buffer = self.buffer.borrow_mut();
let tag_name_str = str::from_utf8(&self.item_tag_name).unwrap_or("<binary>");
debug!("Looking for tag: '{}'", tag_name_str);
loop {
buffer.clear();
let event = reader
.read_event_into(&mut buffer)
.map_err(|e| BatchError::ItemReader(format!("XML parsing error: {}", e)))?;
match event {
Event::Start(ref e) => {
let e_name = e.name();
let name_ref = e_name.as_ref();
let tag_name = str::from_utf8(name_ref).unwrap_or("<binary>");
if name_ref == self.item_tag_name.as_slice() {
debug!("Found start tag: '{}'", tag_name);
let mut xml_string = String::new();
xml_string.push('<');
if let Ok(name) = str::from_utf8(tag_name.as_ref()) {
xml_string.push_str(name);
}
for attr in e.attributes().flatten() {
xml_string.push(' ');
if let Ok(key) = str::from_utf8(attr.key.as_ref()) {
xml_string.push_str(key);
}
xml_string.push_str("=\"");
if let Ok(value) = str::from_utf8(attr.value.as_ref()) {
xml_string.push_str(value);
}
xml_string.push('"');
}
xml_string.push('>');
let mut depth = 1;
while depth > 0 {
buffer.clear();
match reader.read_event_into(&mut buffer) {
Ok(Event::Start(ref start)) => {
depth += 1;
let s_name = start.name();
if let Ok(name) = str::from_utf8(s_name.as_ref()) {
xml_string.push('<');
xml_string.push_str(name);
for attr in start.attributes().flatten() {
xml_string.push(' ');
if let Ok(key) = str::from_utf8(attr.key.as_ref()) {
xml_string.push_str(key);
}
xml_string.push_str("=\"");
if let Ok(value) = str::from_utf8(attr.value.as_ref()) {
xml_string.push_str(value);
}
xml_string.push('"');
}
xml_string.push('>');
}
}
Ok(Event::End(ref end)) => {
depth -= 1;
let e_name = end.name();
if let Ok(name) = str::from_utf8(e_name.as_ref()) {
xml_string.push_str("</");
xml_string.push_str(name);
xml_string.push('>');
}
}
Ok(Event::Text(ref text)) => {
let bytes = text.as_ref();
if let Ok(s) = str::from_utf8(bytes) {
xml_string.push_str(s);
}
}
Ok(Event::GeneralRef(ref entity_ref)) => {
let entity_name = entity_ref.as_ref();
if let Ok(name) = str::from_utf8(entity_name) {
xml_string.push('&');
xml_string.push_str(name);
xml_string.push(';');
}
}
Ok(Event::CData(ref cdata)) => {
let bytes = cdata.as_ref();
if let Ok(s) = str::from_utf8(bytes) {
xml_string.push_str("<![CDATA[");
xml_string.push_str(s);
xml_string.push_str("]]>");
}
}
Ok(Event::Eof) => {
return Err(BatchError::ItemReader(
"Unexpected end of file".to_string(),
));
}
Err(e) => {
return Err(BatchError::ItemReader(format!(
"Error reading XML: {}",
e
)));
}
_ => { }
}
}
debug!("Finished reading XML item: {}", xml_string);
match from_str(&xml_string) {
Ok(item) => return Ok(Some(item)),
Err(e) => {
error!(
"Failed to deserialize XML item: {} from: {}",
e, xml_string
);
return Err(BatchError::ItemReader(format!(
"Failed to deserialize XML item: {} from: {}",
e, xml_string
)));
}
}
}
}
Event::Eof => {
debug!("Reached end of file");
return Ok(None);
}
_ => continue, }
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde::{Deserialize, Serialize};
    use std::io::{Cursor, Write};
    use tempfile::NamedTempFile;

    // Flat item with two child elements.
    #[derive(Debug, Deserialize, Serialize, PartialEq, Clone)]
    #[serde(rename = "TestItem")]
    struct TestItem {
        name: String,
        value: i32,
    }

    // Nested element mixing attributes (`@...`) and child elements.
    #[derive(Debug, Deserialize, Serialize, PartialEq, Clone)]
    struct EngineSpecs {
        #[serde(rename = "@type")]
        engine_type: String,
        #[serde(rename = "@cylinders")]
        cylinders: i32,
        horsepower: i32,
        #[serde(rename = "fuelEfficiency")]
        fuel_efficiency: f32,
    }

    // Wrapper around a repeated `<feature>` child.
    #[derive(Debug, Deserialize, Serialize, PartialEq, Clone)]
    struct Features {
        #[serde(rename = "feature", default)]
        items: Vec<String>,
    }

    // Item with attributes, scalar children and nested structures.
    #[derive(Debug, Deserialize, Serialize, PartialEq, Clone)]
    #[serde(rename = "vehicle")]
    struct Vehicle {
        #[serde(rename = "@id")]
        id: String,
        #[serde(rename = "@category")]
        category: String,
        make: String,
        model: String,
        year: i32,
        engine: EngineSpecs,
        features: Features,
    }

    // Two items are read from a file, then the reader reports end of input.
    #[test]
    fn test_xml_reader() {
        let xml_content = r#"
<items>
<TestItem>
<name>test1</name>
<value>42</value>
</TestItem>
<TestItem>
<name>test2</name>
<value>43</value>
</TestItem>
</items>
"#;
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(xml_content.as_bytes()).unwrap();
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("TestItem")
            .capacity(1024)
            .from_path(temp_file.path())
            .unwrap();
        let item1 = reader.read().unwrap().unwrap();
        assert_eq!(
            item1,
            TestItem {
                name: "test1".to_string(),
                value: 42,
            }
        );
        let item2 = reader.read().unwrap().unwrap();
        assert_eq!(
            item2,
            TestItem {
                name: "test2".to_string(),
                value: 43,
            }
        );
        assert!(reader.read().unwrap().is_none());
    }

    // The configured tag may differ from the item's type name.
    #[test]
    fn test_xml_reader_with_custom_tag() {
        let xml_content = r#"
<root>
<car>
<name>test1</name>
<value>42</value>
</car>
<car>
<name>test2</name>
<value>43</value>
</car>
</root>
"#;
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(xml_content.as_bytes()).unwrap();
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("car")
            .capacity(1024)
            .from_path(temp_file.path())
            .unwrap();
        let item1 = reader.read().unwrap().unwrap();
        assert_eq!(
            item1,
            TestItem {
                name: "test1".to_string(),
                value: 42,
            }
        );
        let item2 = reader.read().unwrap().unwrap();
        assert_eq!(
            item2,
            TestItem {
                name: "test2".to_string(),
                value: 43,
            }
        );
        assert!(reader.read().unwrap().is_none());
    }

    // Attributes, nested elements and repeated children are all preserved.
    #[test]
    fn test_complex_nested_objects() {
        let xml_content = r#"
<root>
<vehicle id="v001" category="sedan">
<make>Toyota</make>
<model>Camry</model>
<year>2022</year>
<engine type="hybrid" cylinders="4">
<horsepower>208</horsepower>
<fuelEfficiency>4.5</fuelEfficiency>
</engine>
<features>
<feature>Bluetooth</feature>
<feature>Navigation</feature>
<feature>Leather Seats</feature>
</features>
</vehicle>
<vehicle id="v002" category="suv">
<make>Honda</make>
<model>CR-V</model>
<year>2023</year>
<engine type="gasoline" cylinders="4">
<horsepower>190</horsepower>
<fuelEfficiency>7.2</fuelEfficiency>
</engine>
<features>
<feature>All-wheel drive</feature>
<feature>Sunroof</feature>
</features>
</vehicle>
</root>
"#;
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(xml_content.as_bytes()).unwrap();
        let reader = XmlItemReaderBuilder::<Vehicle>::new()
            .tag("vehicle")
            .capacity(1024)
            .from_path(temp_file.path())
            .unwrap();
        let vehicle1 = reader.read().unwrap().unwrap();
        assert_eq!(vehicle1.id, "v001");
        assert_eq!(vehicle1.category, "sedan");
        assert_eq!(vehicle1.make, "Toyota");
        assert_eq!(vehicle1.model, "Camry");
        assert_eq!(vehicle1.year, 2022);
        assert_eq!(vehicle1.engine.engine_type, "hybrid");
        assert_eq!(vehicle1.engine.cylinders, 4);
        assert_eq!(vehicle1.engine.horsepower, 208);
        assert_eq!(vehicle1.engine.fuel_efficiency, 4.5);
        assert_eq!(vehicle1.features.items.len(), 3);
        assert_eq!(vehicle1.features.items[0], "Bluetooth");
        assert_eq!(vehicle1.features.items[1], "Navigation");
        assert_eq!(vehicle1.features.items[2], "Leather Seats");
        let vehicle2 = reader.read().unwrap().unwrap();
        assert_eq!(vehicle2.id, "v002");
        assert_eq!(vehicle2.category, "suv");
        assert_eq!(vehicle2.make, "Honda");
        assert_eq!(vehicle2.model, "CR-V");
        assert_eq!(vehicle2.year, 2023);
        assert_eq!(vehicle2.engine.engine_type, "gasoline");
        assert_eq!(vehicle2.engine.cylinders, 4);
        assert_eq!(vehicle2.engine.horsepower, 190);
        assert_eq!(vehicle2.engine.fuel_efficiency, 7.2);
        assert_eq!(vehicle2.features.items.len(), 2);
        assert_eq!(vehicle2.features.items[0], "All-wheel drive");
        assert_eq!(vehicle2.features.items[1], "Sunroof");
        assert!(reader.read().unwrap().is_none());
    }

    // Builder options (tag, capacity) are applied to a file-backed reader.
    #[test]
    fn test_xml_reader_builder() {
        let xml_content = r#"
<data>
<vehicle id="v001" category="sedan">
<make>Toyota</make>
<model>Camry</model>
<year>2022</year>
<engine type="hybrid" cylinders="4">
<horsepower>208</horsepower>
<fuelEfficiency>4.5</fuelEfficiency>
</engine>
<features>
<feature>Bluetooth</feature>
<feature>Navigation</feature>
</features>
</vehicle>
</data>
"#;
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(xml_content.as_bytes()).unwrap();
        let reader = XmlItemReaderBuilder::<Vehicle>::new()
            .tag("vehicle")
            .capacity(2048)
            .from_path(temp_file.path())
            .unwrap();
        let vehicle = reader.read().unwrap().unwrap();
        assert_eq!(vehicle.id, "v001");
        assert_eq!(vehicle.make, "Toyota");
        assert_eq!(vehicle.model, "Camry");
        assert_eq!(vehicle.year, 2022);
        assert!(reader.read().unwrap().is_none());
    }

    // A document without any item element yields `None` immediately.
    #[test]
    fn test_empty_xml_file() {
        let xml_content = "<root></root>";
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(xml_content.as_bytes()).unwrap();
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("TestItem")
            .from_path(temp_file.path())
            .unwrap();
        assert!(reader.read().unwrap().is_none());
    }

    // Elements with empty content deserialize to empty strings.
    #[test]
    fn test_xml_with_empty_tags() {
        let xml_content = r#"
<root>
<TestItem>
<name></name>
<value>0</value>
</TestItem>
<TestItem>
<name></name>
<value>0</value>
</TestItem>
</root>
"#;
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(xml_content.as_bytes()).unwrap();
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("TestItem")
            .from_path(temp_file.path())
            .unwrap();
        let item1 = reader.read().unwrap().unwrap();
        assert_eq!(item1.name, "");
        assert_eq!(item1.value, 0);
        let item2 = reader.read().unwrap().unwrap();
        assert_eq!(item2.name, "");
        assert_eq!(item2.value, 0);
        assert!(reader.read().unwrap().is_none());
    }

    // Attributes on the item element are mapped to `@`-renamed fields.
    #[test]
    fn test_xml_with_attributes() {
        #[derive(Debug, Deserialize, Serialize, PartialEq)]
        struct ItemWithAttrs {
            #[serde(rename = "@id")]
            id: String,
            #[serde(rename = "@type")]
            item_type: String,
            content: String,
        }
        let xml_content = r#"
<root>
<item id="1" type="normal">
<content>First item</content>
</item>
<item id="2" type="special">
<content>Second item</content>
</item>
</root>
"#;
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(xml_content.as_bytes()).unwrap();
        let reader = XmlItemReaderBuilder::<ItemWithAttrs>::new()
            .tag("item")
            .from_path(temp_file.path())
            .unwrap();
        let item1 = reader.read().unwrap().unwrap();
        assert_eq!(item1.id, "1");
        assert_eq!(item1.item_type, "normal");
        assert_eq!(item1.content, "First item");
        let item2 = reader.read().unwrap().unwrap();
        assert_eq!(item2.id, "2");
        assert_eq!(item2.item_type, "special");
        assert_eq!(item2.content, "Second item");
        assert!(reader.read().unwrap().is_none());
    }

    // CDATA sections pass through with their markup-like content intact.
    #[test]
    fn test_xml_with_cdata() {
        let xml_content = r#"
<root>
<TestItem>
<name><![CDATA[name with <special> & chars]]></name>
<value>42</value>
</TestItem>
<TestItem>
<name>regular name</name>
<value><![CDATA[55]]></value>
</TestItem>
</root>
"#;
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(xml_content.as_bytes()).unwrap();
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("TestItem")
            .from_path(temp_file.path())
            .unwrap();
        let item1 = reader.read().unwrap().unwrap();
        assert_eq!(item1.name, "name with <special> & chars");
        assert_eq!(item1.value, 42);
        let item2 = reader.read().unwrap().unwrap();
        assert_eq!(item2.name, "regular name");
        assert_eq!(item2.value, 55);
        assert!(reader.read().unwrap().is_none());
    }

    // An unclosed child element inside an item is reported as an error.
    #[test]
    fn test_malformed_xml() {
        let xml_content = r#"
<root>
<TestItem>
<name>test1</name>
<value>42
</TestItem>
</root>
"#;
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(xml_content.as_bytes()).unwrap();
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("TestItem")
            .from_path(temp_file.path())
            .unwrap();
        let result = reader.read();
        assert!(result.is_err());
    }

    // Well-formed XML whose content does not fit the target type is an error.
    #[test]
    fn test_xml_type_mismatch() {
        let xml_content = r#"
<root>
<TestItem>
<name>test1</name>
<value>not_a_number</value>
</TestItem>
</root>
"#;
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(xml_content.as_bytes()).unwrap();
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("TestItem")
            .from_path(temp_file.path())
            .unwrap();
        let result = reader.read();
        assert!(result.is_err());
    }

    // Without `.tag(..)` the tag is derived from the item's type name.
    #[test]
    fn test_default_tag_inference() {
        let xml_content = r#"
<root>
<TestItem>
<name>test1</name>
<value>42</value>
</TestItem>
</root>
"#;
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(xml_content.as_bytes()).unwrap();
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .from_path(temp_file.path())
            .unwrap();
        let item = reader.read().unwrap().unwrap();
        assert_eq!(item.name, "test1");
        assert_eq!(item.value, 42);
        assert!(reader.read().unwrap().is_none());
    }

    // A byte slice works as an input source.
    #[test]
    fn test_read_from_memory() {
        let xml_content = r#"
<root>
<TestItem>
<name>memory test</name>
<value>100</value>
</TestItem>
</root>
"#;
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("TestItem")
            .from_reader(xml_content.as_bytes());
        let item = reader.read().unwrap().unwrap();
        assert_eq!(item.name, "memory test");
        assert_eq!(item.value, 100);
        assert!(reader.read().unwrap().is_none());
    }

    // Same malformed-input check as above, through an in-memory cursor.
    #[test]
    fn test_xml_reader_with_invalid_xml() {
        let invalid_xml = r#"
<items>
<item>
<name>Invalid Item</name>
<value>123
</item>
</items>
"#;
        let cursor = Cursor::new(invalid_xml);
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("item")
            .from_reader(cursor);
        let result = reader.read();
        assert!(result.is_err());
    }

    // Completely empty input is end-of-input, not an error.
    #[test]
    fn test_xml_reader_with_empty_file() {
        let empty_xml = "";
        let cursor = Cursor::new(empty_xml);
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("item")
            .from_reader(cursor);
        let result = reader.read();
        assert!(result.is_ok());
        assert!(result.unwrap().is_none());
    }

    // Elements with other names are skipped.
    #[test]
    fn test_xml_reader_with_no_matching_tags() {
        let xml_data = r#"
<root>
<other>
<name>Not an item</name>
<value>123</value>
</other>
</root>
"#;
        let cursor = Cursor::new(xml_data);
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("item")
            .from_reader(cursor);
        let result = reader.read();
        assert!(result.is_ok());
        assert!(result.unwrap().is_none());
    }

    // A non-default buffer capacity does not affect parsing.
    #[test]
    fn test_xml_reader_builder_with_custom_capacity() {
        let xml_data = r#"
<items>
<item>
<name>Test Item</name>
<value>123</value>
</item>
</items>
"#;
        let cursor = Cursor::new(xml_data);
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("item")
            .capacity(2048)
            .from_reader(cursor);
        let result = reader.read();
        assert!(result.is_ok());
        assert!(result.unwrap().is_some());
    }

    // The item type may be declared locally and need not carry a serde rename.
    #[test]
    fn test_xml_reader_with_nested_elements() {
        #[derive(Debug, Deserialize, PartialEq)]
        struct NestedItem {
            name: String,
            value: i32,
        }
        let xml_data = r#"
<items>
<nested>
<name>Nested Item</name>
<value>456</value>
</nested>
</items>
"#;
        let cursor = Cursor::new(xml_data);
        let reader = XmlItemReaderBuilder::<NestedItem>::new()
            .tag("nested")
            .from_reader(cursor);
        let result = reader.read();
        assert!(result.is_ok());
        let item = result.unwrap().unwrap();
        assert_eq!(item.name, "Nested Item");
        assert_eq!(item.value, 456);
    }

    // Repeated `read` calls drain the items in document order.
    #[test]
    fn test_xml_reader_with_multiple_reads() {
        let xml_data = r#"
<items>
<item>
<name>First Item</name>
<value>100</value>
</item>
<item>
<name>Second Item</name>
<value>200</value>
</item>
<item>
<name>Third Item</name>
<value>300</value>
</item>
</items>
"#;
        let cursor = Cursor::new(xml_data);
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("item")
            .from_reader(cursor);
        let mut items = Vec::new();
        while let Some(item) = reader.read().unwrap() {
            items.push(item);
        }
        assert_eq!(items.len(), 3);
        assert_eq!(items[0].name, "First Item");
        assert_eq!(items[1].name, "Second Item");
        assert_eq!(items[2].name, "Third Item");
    }

    // Whitespace between elements does not disturb deserialization.
    #[test]
    fn test_xml_reader_with_whitespace_handling() {
        let xml_data = r#"
<items>
<item>
<name>Whitespace Item</name>
<value>789</value>
</item>
</items>
"#;
        let cursor = Cursor::new(xml_data);
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("item")
            .from_reader(cursor);
        let result = reader.read();
        assert!(result.is_ok());
        let item = result.unwrap().unwrap();
        assert_eq!(item.name, "Whitespace Item");
        assert_eq!(item.value, 789);
    }

    // A missing file is reported through `from_path`'s `Result`.
    #[test]
    fn test_xml_reader_from_path_error_handling() {
        let result = XmlItemReaderBuilder::<TestItem>::new()
            .tag("item")
            .from_path("/nonexistent/path/file.xml");
        assert!(result.is_err());
    }

    // Escaped special characters round-trip through the reader. The input
    // must use entity references: a bare `&` or `<` in character data is not
    // well-formed XML.
    #[test]
    fn test_xml_reader_with_special_characters() {
        let xml_data = r#"
<items>
<item>
<name>Special &amp; Characters &lt;&gt;</name>
<value>999</value>
</item>
</items>
"#;
        let cursor = Cursor::new(xml_data);
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("item")
            .from_reader(cursor);
        let result = reader.read();
        assert!(result.is_ok());
        let item = result.unwrap().unwrap();
        assert_eq!(item.name, "Special & Characters <>");
        assert_eq!(item.value, 999);
    }

    // `new()` and `default()` produce the same configuration.
    #[test]
    fn test_xml_reader_builder_default() {
        let builder1 = XmlItemReaderBuilder::<TestItem>::new();
        let builder2 = XmlItemReaderBuilder::<TestItem>::default();
        assert_eq!(builder1.capacity, builder2.capacity);
        assert_eq!(builder1.tag_name, builder2.tag_name);
    }

    // Tag inference also applies when building from an in-memory reader.
    #[test]
    fn should_derive_tag_from_type_name_when_not_set_in_from_reader() {
        let xml_content =
            r#"<root><TestItem><name>derived</name><value>7</value></TestItem></root>"#;
        let cursor = Cursor::new(xml_content);
        let reader = XmlItemReaderBuilder::<TestItem>::new().from_reader(cursor);
        let item = reader.read().unwrap().unwrap();
        assert_eq!(item.name, "derived");
        assert_eq!(item.value, 7);
    }

    // Input that ends in the middle of an item is an `ItemReader` error.
    #[test]
    fn should_return_error_on_unexpected_eof_inside_item() {
        let xml_content = r#"<root><TestItem><name>truncated"#;
        let cursor = Cursor::new(xml_content);
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("TestItem")
            .from_reader(cursor);
        let result = reader.read();
        assert!(result.is_err(), "expected error for truncated XML");
        match result {
            Err(BatchError::ItemReader(msg)) => {
                assert!(
                    msg.contains("Unexpected end of file") || msg.contains("XML"),
                    "unexpected error message: {msg}"
                );
            }
            other => panic!("expected ItemReader error, got {other:?}"),
        }
    }

    // Comments inside an item are dropped before deserialization.
    #[test]
    fn should_ignore_xml_comments_inside_items() {
        let xml_content = r#"<root><TestItem><!-- a comment --><name>commented</name><value>5</value></TestItem></root>"#;
        let cursor = Cursor::new(xml_content);
        let reader = XmlItemReaderBuilder::<TestItem>::new()
            .tag("TestItem")
            .from_reader(cursor);
        let item = reader.read().unwrap().unwrap();
        assert_eq!(item.name, "commented");
        assert_eq!(item.value, 5);
    }
}