use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::extraction::xml::{parse_xml, parse_xml_svg};
use crate::extractors::SyncExtractor;
use crate::plugins::{DocumentExtractor, Plugin};
use crate::types::internal::{ElementKind, InternalDocument, InternalElement};
use crate::types::metadata::Metadata;
use ahash::AHashMap;
use async_trait::async_trait;
fn build_internal_document(content: &[u8], mime_type: &str) -> InternalDocument {
use quick_xml::Reader;
use quick_xml::events::Event;
use std::borrow::Cow;
let mut doc = InternalDocument::new("xml");
let is_svg = mime_type == "image/svg+xml";
let mut reader = Reader::from_reader(content);
reader.config_mut().trim_text(true);
reader.config_mut().check_end_names = false;
let mut buf = Vec::new();
let mut depth: u16 = 0;
let mut element_stack: Vec<String> = Vec::new();
let mut index: u32 = 0;
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(e)) => {
let name_bytes = e.name().as_ref().to_vec();
let name_owned = String::from_utf8_lossy(&name_bytes).into_owned();
let mut attrs = AHashMap::new();
for attr in e.attributes().flatten() {
let key: Cow<str> = String::from_utf8_lossy(attr.key.as_ref());
let val: Cow<str> = String::from_utf8_lossy(&attr.value);
let trimmed_val = val.trim();
if !trimmed_val.is_empty() {
attrs.insert(key.to_string(), trimmed_val.to_string());
}
}
let level = ((depth as u8) + 1).min(6);
let mut elem =
InternalElement::text(ElementKind::Heading { level }, &name_owned, depth).with_index(index);
if !attrs.is_empty() {
elem = elem.with_attributes(attrs);
}
doc.push_element(elem);
index += 1;
element_stack.push(name_owned);
depth = depth.saturating_add(1);
}
Ok(Event::End(_)) => {
element_stack.pop();
depth = depth.saturating_sub(1);
}
Ok(Event::Text(e)) => {
if is_svg {
let in_text_elem = element_stack
.iter()
.any(|n| matches!(n.as_str(), "text" | "tspan" | "title" | "desc" | "textPath"));
if !in_text_elem {
buf.clear();
continue;
}
}
let text: std::borrow::Cow<str> = String::from_utf8_lossy(e.as_ref());
let trimmed = text.trim();
if !trimmed.is_empty() {
let elem = InternalElement::text(ElementKind::Paragraph, trimmed, depth).with_index(index);
doc.push_element(elem);
index += 1;
}
}
Ok(Event::Empty(e)) => {
let name_bytes = e.name().as_ref().to_vec();
let name = String::from_utf8_lossy(&name_bytes).into_owned();
let mut attrs = AHashMap::new();
for attr in e.attributes().flatten() {
let key: std::borrow::Cow<str> = String::from_utf8_lossy(attr.key.as_ref());
let val: std::borrow::Cow<str> = String::from_utf8_lossy(&attr.value);
let trimmed_val = val.trim();
if !trimmed_val.is_empty() {
attrs.insert(key.to_string(), trimmed_val.to_string());
}
}
let mut elem = InternalElement::text(ElementKind::Paragraph, &name, depth).with_index(index);
if !attrs.is_empty() {
elem = elem.with_attributes(attrs);
}
doc.push_element(elem);
index += 1;
}
Ok(Event::CData(e)) => {
let text: std::borrow::Cow<str> = String::from_utf8_lossy(&e);
let trimmed = text.trim();
if !trimmed.is_empty() {
let elem = InternalElement::text(ElementKind::Paragraph, trimmed, depth).with_index(index);
doc.push_element(elem);
index += 1;
}
}
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
buf.clear();
}
doc
}
pub struct XmlExtractor;
impl XmlExtractor {
pub fn new() -> Self {
Self
}
}
impl Default for XmlExtractor {
fn default() -> Self {
Self::new()
}
}
impl Plugin for XmlExtractor {
fn name(&self) -> &str {
"xml-extractor"
}
fn version(&self) -> String {
env!("CARGO_PKG_VERSION").to_string()
}
fn initialize(&self) -> Result<()> {
Ok(())
}
fn shutdown(&self) -> Result<()> {
Ok(())
}
fn description(&self) -> &str {
"Extracts text content from XML files with element metadata"
}
fn author(&self) -> &str {
"Kreuzberg Team"
}
}
impl SyncExtractor for XmlExtractor {
fn extract_sync(&self, content: &[u8], mime_type: &str, _config: &ExtractionConfig) -> Result<InternalDocument> {
let xml_result = if mime_type == "image/svg+xml" {
parse_xml_svg(content, false)?
} else {
parse_xml(content, false)?
};
let mut doc = build_internal_document(content, mime_type);
doc.mime_type = std::borrow::Cow::Owned(mime_type.to_string());
doc.metadata = Metadata {
format: Some(crate::types::FormatMetadata::Xml(crate::types::XmlMetadata {
element_count: xml_result.element_count,
unique_elements: xml_result.unique_elements,
})),
..Default::default()
};
Ok(doc)
}
}
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl DocumentExtractor for XmlExtractor {
async fn extract_bytes(
&self,
content: &[u8],
mime_type: &str,
config: &ExtractionConfig,
) -> Result<InternalDocument> {
self.extract_sync(content, mime_type, config)
}
fn supported_mime_types(&self) -> &[&str] {
&[
"application/xml",
"text/xml",
"image/svg+xml",
"application/x-endnote+xml",
]
}
fn priority(&self) -> i32 {
50
}
fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
Some(self)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[tokio::test]
async fn test_xml_extractor() {
let extractor = XmlExtractor::new();
let content = b"<root><item>Hello</item><item>World</item></root>";
let config = ExtractionConfig::default();
let result = extractor
.extract_bytes(content, "application/xml", &config)
.await
.unwrap();
assert!(result.metadata.format.is_some());
let xml_meta = match result.metadata.format.as_ref().unwrap() {
crate::types::FormatMetadata::Xml(meta) => meta,
_ => panic!("Expected Xml metadata"),
};
assert_eq!(xml_meta.element_count, 3);
assert!(xml_meta.unique_elements.contains(&"root".to_string()));
assert!(xml_meta.unique_elements.contains(&"item".to_string()));
}
#[test]
fn test_xml_plugin_interface() {
let extractor = XmlExtractor::new();
assert_eq!(extractor.name(), "xml-extractor");
assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
assert_eq!(
extractor.supported_mime_types(),
&[
"application/xml",
"text/xml",
"image/svg+xml",
"application/x-endnote+xml"
]
);
assert_eq!(extractor.priority(), 50);
}
}