use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::extraction::{cells_to_markdown, cells_to_text};
use crate::plugins::{DocumentExtractor, Plugin};
use crate::types::{ExtractionResult, Metadata, Table};
use async_trait::async_trait;
use quick_xml::Reader;
use quick_xml::events::Event;
#[cfg(feature = "tokio-runtime")]
use std::path::Path;
/// Strips a Clark-notation `{namespace}` prefix from an XML tag name,
/// returning the local part. Tags without a leading `{` (or with an
/// unterminated prefix) are returned unchanged.
fn strip_namespace(tag: &str) -> &str {
    if !tag.starts_with('{') {
        return tag;
    }
    match tag.find('}') {
        Some(pos) => &tag[pos + 1..],
        None => tag,
    }
}
/// Tracks which DocBook elements the single-pass parser is currently inside.
/// All flags start false and are toggled by matching Start/End events.
#[derive(Debug, Clone, Copy)]
struct ParsingState {
    /// Inside `<info>`/`<articleinfo>`/`<bookinfo>`/`<chapterinfo>`.
    in_info: bool,
    /// Inside `<table>` or `<informaltable>`.
    in_table: bool,
    /// Inside `<tgroup>` (requires `in_table`).
    in_tgroup: bool,
    /// Inside `<thead>` (requires `in_tgroup`).
    in_thead: bool,
    /// Inside `<tbody>` (requires `in_tgroup`).
    in_tbody: bool,
    /// Inside `<row>` within a thead or tbody.
    in_row: bool,
    /// Inside `<itemizedlist>` or `<orderedlist>`.
    in_list: bool,
    /// Inside `<listitem>` (set and cleared within the same Start arm).
    in_list_item: bool,
}
/// Stateless extractor for DocBook XML documents (articles, books, chapters).
pub struct DocbookExtractor;

impl Default for DocbookExtractor {
    fn default() -> Self {
        Self::new()
    }
}

impl DocbookExtractor {
    /// Creates a new extractor; the type carries no state.
    pub fn new() -> Self {
        Self
    }
}
type DocBookParseResult = (String, String, Option<String>, Option<String>, Vec<Table>);
fn parse_docbook_single_pass(content: &str, plain: bool) -> Result<DocBookParseResult> {
let mut reader = Reader::from_str(content);
let mut output = String::new();
let mut title = String::new();
let mut author = Option::None;
let mut date = Option::None;
let mut tables = Vec::new();
let mut table_index = 0;
let mut state = ParsingState {
in_info: false,
in_table: false,
in_tgroup: false,
in_thead: false,
in_tbody: false,
in_row: false,
in_list: false,
in_list_item: false,
};
let mut title_extracted = false;
let mut current_table: Vec<Vec<String>> = Vec::new();
let mut current_row: Vec<String> = Vec::new();
let mut list_type = "";
loop {
match reader.read_event() {
Ok(Event::Start(e)) => {
let tag = String::from_utf8_lossy(e.name().as_ref()).to_string();
let tag = strip_namespace(&tag);
match tag {
"info" | "articleinfo" | "bookinfo" | "chapterinfo" => {
state.in_info = true;
}
"title" if !title_extracted && state.in_info => {
title = extract_element_text(&mut reader)?;
title_extracted = true;
}
"title" if !title_extracted => {
title = extract_element_text(&mut reader)?;
title_extracted = true;
}
"title" if title_extracted => {
let section_title = extract_element_text(&mut reader)?;
if !section_title.is_empty() {
if !plain {
output.push_str("## ");
}
output.push_str(§ion_title);
output.push_str("\n\n");
}
}
"author" | "personname" if state.in_info && author.is_none() => {
author = Some(extract_element_text(&mut reader)?);
}
"date" if state.in_info && date.is_none() => {
let date_text = extract_element_text(&mut reader)?;
if !date_text.is_empty() {
date = Some(date_text);
}
}
"para" => {
let para_text = extract_element_text(&mut reader)?;
if !para_text.is_empty() {
output.push_str(¶_text);
output.push_str("\n\n");
}
}
"programlisting" | "screen" => {
let code_text = extract_element_text(&mut reader)?;
if !code_text.is_empty() {
if !plain {
output.push_str("```\n");
}
output.push_str(&code_text);
if !plain {
output.push_str("\n```");
}
output.push_str("\n\n");
}
}
"itemizedlist" => {
state.in_list = true;
list_type = "itemized";
}
"orderedlist" => {
state.in_list = true;
list_type = "ordered";
}
"listitem" if state.in_list => {
state.in_list_item = true;
if !plain {
let prefix = if list_type == "ordered" { "1. " } else { "- " };
output.push_str(prefix);
}
let item_text = extract_element_text(&mut reader)?;
if !item_text.is_empty() {
output.push_str(&item_text);
}
output.push('\n');
state.in_list_item = false;
}
"blockquote" => {
if !plain {
output.push_str("> ");
}
let quote_text = extract_element_text(&mut reader)?;
if !quote_text.is_empty() {
output.push_str("e_text);
}
output.push_str("\n\n");
}
"figure" => {
let figure_text = extract_element_text(&mut reader)?;
if !figure_text.is_empty() {
if !plain {
output.push_str("**Figure:** ");
} else {
output.push_str("Figure: ");
}
output.push_str(&figure_text);
output.push_str("\n\n");
}
}
"footnote" => {
output.push('[');
let footnote_text = extract_element_text(&mut reader)?;
if !footnote_text.is_empty() {
output.push_str(&footnote_text);
}
output.push(']');
}
"table" | "informaltable" => {
state.in_table = true;
current_table.clear();
}
"tgroup" if state.in_table => {
state.in_tgroup = true;
}
"thead" if state.in_tgroup => {
state.in_thead = true;
}
"tbody" if state.in_tgroup => {
state.in_tbody = true;
}
"row" if (state.in_thead || state.in_tbody) && state.in_tgroup => {
state.in_row = true;
current_row.clear();
}
"entry" if state.in_row => {
let entry_text = extract_element_text(&mut reader)?;
current_row.push(entry_text);
}
_ => {}
}
}
Ok(Event::End(e)) => {
let tag = String::from_utf8_lossy(e.name().as_ref()).to_string();
let tag = strip_namespace(&tag);
match tag {
"info" | "articleinfo" | "bookinfo" | "chapterinfo" => {
state.in_info = false;
}
"itemizedlist" | "orderedlist" if state.in_list => {
output.push('\n');
state.in_list = false;
}
"table" | "informaltable" if state.in_table => {
if !current_table.is_empty() {
let markdown = cells_to_markdown(¤t_table);
if plain {
output.push_str(&cells_to_text(¤t_table));
} else {
output.push_str(&markdown);
}
output.push('\n');
tables.push(Table {
cells: current_table.clone(),
markdown,
page_number: table_index + 1,
bounding_box: None,
});
table_index += 1;
current_table.clear();
}
state.in_table = false;
}
"tgroup" if state.in_tgroup => {
state.in_tgroup = false;
}
"thead" if state.in_thead => {
state.in_thead = false;
}
"tbody" if state.in_tbody => {
state.in_tbody = false;
}
"row" if state.in_row => {
if !current_row.is_empty() {
current_table.push(current_row.clone());
current_row.clear();
}
state.in_row = false;
}
_ => {}
}
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(crate::error::KreuzbergError::parsing(format!(
"XML parsing error: {}",
e
)));
}
_ => {}
}
}
let mut final_output = output;
if !title.is_empty() {
final_output = format!("{}\n\n{}", title, final_output);
}
Ok((final_output.trim().to_string(), title, author, date, tables))
}
/// Collects the text content of the element whose Start tag was just read,
/// consuming events up to (and including) its matching End tag.
///
/// Depth is tracked so text inside nested child elements is also included.
/// Whitespace-only nodes are dropped; the remaining pieces are joined with
/// single spaces, and the result is trimmed.
///
/// NOTE(review): text bytes go through `from_utf8_lossy` without XML entity
/// unescaping — confirm whether entities like `&amp;` should be decoded
/// here (quick-xml's `BytesText::unescape`). Also note the CData branch
/// inserts a space without the `ends_with` checks the Text branch uses —
/// presumed intentional, verify.
///
/// # Errors
///
/// Returns a parsing error if `quick_xml` fails to read an event.
fn extract_element_text(reader: &mut Reader<&[u8]>) -> Result<String> {
    let mut text = String::new();
    // Nesting depth relative to the element being collected; 0 means the
    // next End event closes that element.
    let mut depth = 0;
    loop {
        match reader.read_event() {
            Ok(Event::Start(_)) => {
                depth += 1;
            }
            Ok(Event::End(_)) => {
                if depth == 0 {
                    break;
                }
                depth -= 1;
            }
            Ok(Event::Text(t)) => {
                let decoded = String::from_utf8_lossy(t.as_ref()).to_string();
                if !decoded.trim().is_empty() {
                    // Separate adjacent text nodes with a single space.
                    if !text.is_empty() && !text.ends_with(' ') && !text.ends_with('\n') {
                        text.push(' ');
                    }
                    text.push_str(decoded.trim());
                }
            }
            Ok(Event::CData(t)) => {
                // CDATA is taken verbatim; invalid UTF-8 is dropped entirely.
                let decoded = std::str::from_utf8(t.as_ref()).unwrap_or("").to_string();
                if !decoded.trim().is_empty() {
                    if !text.is_empty() {
                        text.push(' ');
                    }
                    text.push_str(decoded.trim());
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(crate::error::KreuzbergError::parsing(format!(
                    "XML parsing error: {}",
                    e
                )));
            }
            _ => {}
        }
    }
    Ok(text.trim().to_string())
}
impl Plugin for DocbookExtractor {
    /// Stable plugin identifier used for registration and lookup.
    fn name(&self) -> &str {
        "docbook-extractor"
    }
    /// Reports the crate version as the plugin version.
    fn version(&self) -> String {
        env!("CARGO_PKG_VERSION").to_string()
    }
    /// No setup required: the extractor is stateless.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }
    /// No teardown required.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl DocumentExtractor for DocbookExtractor {
    /// Extracts text, metadata, and tables from in-memory DocBook XML bytes.
    ///
    /// Title/author are folded into `metadata.subject`; a date from the info
    /// block becomes `metadata.created_at`.
    #[cfg_attr(
        feature = "otel",
        tracing::instrument(
            skip(self, content, config),
            fields(
                extractor.name = self.name(),
                content.size_bytes = content.len(),
            )
        )
    )]
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        // Plain output (no Markdown markers) for Plain/Structured formats.
        let plain = matches!(
            config.output_format,
            crate::core::config::OutputFormat::Plain | crate::core::config::OutputFormat::Structured
        );
        // `from_utf8_lossy` borrows when the input is valid UTF-8 and only
        // allocates for invalid sequences — cheaper than the previous
        // unconditional `from_utf8(..).map(to_string)` round-trip.
        let docbook_content = String::from_utf8_lossy(content);
        let (extracted_content, title, author, date, tables) =
            parse_docbook_single_pass(&docbook_content, plain)?;
        let mut metadata = Metadata::default();
        let mut subject_parts = Vec::new();
        if !title.is_empty() {
            subject_parts.push(format!("Title: {}", title));
        }
        if let Some(author) = &author {
            subject_parts.push(format!("Author: {}", author));
        }
        if !subject_parts.is_empty() {
            metadata.subject = Some(subject_parts.join("; "));
        }
        if let Some(date_val) = date {
            metadata.created_at = Some(date_val);
        }
        Ok(ExtractionResult {
            content: extracted_content,
            mime_type: mime_type.to_string().into(),
            metadata,
            tables,
            detected_languages: None,
            chunks: None,
            images: None,
            pages: None,
            djot_content: None,
            elements: None,
            ocr_elements: None,
            document: None,
            #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
            extracted_keywords: None,
            quality_score: None,
            processing_warnings: Vec::new(),
            annotations: None,
        })
    }
    /// Reads the file asynchronously and delegates to [`Self::extract_bytes`].
    #[cfg(feature = "tokio-runtime")]
    #[cfg_attr(
        feature = "otel",
        tracing::instrument(
            skip(self, path, config),
            fields(
                extractor.name = self.name(),
            )
        )
    )]
    async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
        let content = tokio::fs::read(path).await?;
        self.extract_bytes(&content, mime_type, config).await
    }
    /// MIME types this extractor claims.
    fn supported_mime_types(&self) -> &[&str] {
        &["application/docbook+xml", "text/docbook"]
    }
    /// Mid-range priority; format-specific extractors may override.
    fn priority(&self) -> i32 {
        50
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    /// Plugin trait surface: name plus no-op lifecycle hooks.
    #[test]
    fn test_docbook_extractor_plugin_interface() {
        let extractor = DocbookExtractor::new();
        assert_eq!(extractor.name(), "docbook-extractor");
        assert!(extractor.initialize().is_ok());
        assert!(extractor.shutdown().is_ok());
    }
    /// Exactly the two DocBook MIME types are advertised.
    #[test]
    fn test_docbook_extractor_supported_mime_types() {
        let extractor = DocbookExtractor::new();
        let mime_types = extractor.supported_mime_types();
        assert_eq!(mime_types.len(), 2);
        assert!(mime_types.contains(&"application/docbook+xml"));
        assert!(mime_types.contains(&"text/docbook"));
    }
    #[test]
    fn test_docbook_extractor_priority() {
        let extractor = DocbookExtractor::new();
        assert_eq!(extractor.priority(), 50);
    }
    /// Title and paragraph text survive a minimal article parse.
    #[test]
    fn test_parse_simple_docbook() {
        let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<article>
<title>Test Article</title>
<para>Test content.</para>
</article>"#;
        let (content, title, _, _, _) = parse_docbook_single_pass(docbook, false).expect("Parse failed");
        assert_eq!(title, "Test Article");
        assert!(content.contains("Test content"));
    }
    /// A thead + tbody CALS table yields one Table with both rows captured.
    #[test]
    fn test_extract_docbook_tables_basic() {
        let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<table>
<tgroup cols="2">
<thead>
<row>
<entry>Col1</entry>
<entry>Col2</entry>
</row>
</thead>
<tbody>
<row>
<entry>Data1</entry>
<entry>Data2</entry>
</row>
</tbody>
</tgroup>
</table>
</article>"#;
        let (_, _, _, _, tables) = parse_docbook_single_pass(docbook, false).expect("Table extraction failed");
        assert_eq!(tables.len(), 1);
        assert_eq!(tables[0].cells.len(), 2);
        assert_eq!(tables[0].cells[0], vec!["Col1", "Col2"]);
    }
}