mod elements;
mod metadata;
mod parser;
use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::plugins::{DocumentExtractor, Plugin};
use crate::text::utf8_validation;
use crate::types::{ExtractionResult, Metadata};
use async_trait::async_trait;
use quick_xml::Reader;
use quick_xml::events::Event;
#[cfg(feature = "tokio-runtime")]
use std::path::Path;
use elements::extract_jats_all_in_one;
use parser::extract_text_content as jats_extract_text;
fn extract_para_with_annotations_jats(
reader: &mut Reader<&[u8]>,
) -> crate::Result<(String, Vec<crate::types::document_structure::TextAnnotation>)> {
use crate::types::builder;
let mut text = String::new();
let mut annotations = Vec::new();
let mut depth: u32 = 0;
let mut inline_stack: Vec<(&'static str, u32, u32, Option<String>)> = Vec::new();
loop {
match reader.read_event() {
Ok(Event::Start(e)) => {
depth += 1;
let name = e.name();
let tag = crate::utils::xml_tag_name(name.as_ref());
match tag.as_ref() {
"italic" => {
inline_stack.push(("italic", depth, text.len() as u32, None));
}
"bold" => {
inline_stack.push(("bold", depth, text.len() as u32, None));
}
"underline" => {
inline_stack.push(("underline", depth, text.len() as u32, None));
}
"sub" => {
inline_stack.push(("subscript", depth, text.len() as u32, None));
}
"sup" => {
inline_stack.push(("superscript", depth, text.len() as u32, None));
}
"ext-link" => {
let mut href = None;
for attr in e.attributes().flatten() {
let key = String::from_utf8_lossy(attr.key.as_ref());
if key == "xlink:href" || key.ends_with(":href") || key == "href" {
href = Some(String::from_utf8_lossy(attr.value.as_ref()).to_string());
}
}
inline_stack.push(("link", depth, text.len() as u32, href));
}
_ => {}
}
}
Ok(Event::End(_)) => {
if depth == 0 {
break;
}
if let Some(&(kind, open_depth, start, ref href)) = inline_stack.last()
&& open_depth == depth
{
let end = text.len() as u32;
let actual_start = if (start as usize) < text.len() {
let span = &text[start as usize..end as usize];
let trimmed = span.trim_start();
end - trimmed.len() as u32
} else {
start
};
if end > actual_start {
let href_clone = href.clone();
let annotation = match kind {
"bold" => builder::bold(actual_start, end),
"italic" => builder::italic(actual_start, end),
"underline" => builder::underline(actual_start, end),
"subscript" => builder::subscript(actual_start, end),
"superscript" => builder::superscript(actual_start, end),
"link" => {
let url = href_clone.as_deref().unwrap_or("");
builder::link(actual_start, end, url, None)
}
_ => unreachable!(),
};
annotations.push(annotation);
}
inline_stack.pop();
}
depth -= 1;
}
Ok(Event::Text(t)) => {
let decoded = String::from_utf8_lossy(t.as_ref()).to_string();
if !decoded.trim().is_empty() {
if !text.is_empty() && !text.ends_with(' ') && !text.ends_with('\n') {
text.push(' ');
}
text.push_str(decoded.trim());
}
}
Ok(Event::CData(t)) => {
let decoded = utf8_validation::from_utf8(t.as_ref()).unwrap_or("").to_string();
if !decoded.trim().is_empty() {
if !text.is_empty() {
text.push(' ');
}
text.push_str(decoded.trim());
}
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(crate::error::KreuzbergError::parsing(format!(
"XML parsing error: {}",
e
)));
}
_ => {}
}
}
Ok((text.trim().to_string(), annotations))
}
/// Builds a structured document representation from raw JATS XML.
///
/// Streams over `content` and feeds a `DocumentStructureBuilder` (source
/// format `"jats"`) as elements are encountered:
///
/// - `article-title` inside `article-meta` becomes a level-1 heading.
/// - `title` inside `body` (and outside `article-meta`) becomes a level-2
///   heading, regardless of how deeply the section is nested.
/// - `p` inside `body` becomes a paragraph with inline annotations.
/// - `fig` inside `body` becomes an image whose description is the figure's
///   text content, if any.
/// - `disp-formula` / `inline-formula` inside `body` become formulas.
/// - `table` rows found inside `thead` / `tbody` are collected cell by cell
///   and pushed as one table when the `table` element closes.
/// - `ref` inside `ref-list` becomes a citation keyed by its `id` attribute,
///   or by `"ref"` when no id is present.
///
/// Empty headings, paragraphs, formulas and citations are skipped.
///
/// # Errors
///
/// Returns a parsing error when the XML reader reports an error, and
/// propagates errors from the text-extraction helpers.
fn build_jats_document_structure(content: &str) -> crate::Result<crate::types::document_structure::DocumentStructure> {
    use crate::types::builder::DocumentStructureBuilder;

    let mut reader = Reader::from_str(content);
    let mut builder = DocumentStructureBuilder::new().source_format("jats");

    // Which container elements the reader is currently inside.
    let mut in_article_meta = false;
    let mut in_body = false;
    let mut in_ref_list = false;
    let mut in_table = false;
    let mut in_thead = false;
    let mut in_tbody = false;
    let mut in_row = false;

    // Rows collected for the table being read, and the cells of its current row.
    let mut current_table: Vec<Vec<String>> = Vec::new();
    let mut current_row: Vec<String> = Vec::new();

    loop {
        match reader.read_event() {
            Ok(Event::Start(e)) => {
                let name = e.name();
                let tag = crate::utils::xml_tag_name(name.as_ref());
                // Arms that call an extraction helper hand the reader over to
                // it and `continue`, since the helper reads on from there.
                match tag.as_ref() {
                    "article-meta" => {
                        in_article_meta = true;
                    }
                    "article-title" if in_article_meta => {
                        let text = jats_extract_text(&mut reader)?;
                        if !text.is_empty() {
                            builder.push_heading(1, &text, None, None);
                        }
                        continue;
                    }
                    "body" => {
                        in_body = true;
                    }
                    "sec" if in_body => {
                        // Sections carry no state of their own; their children
                        // are handled individually.
                    }
                    "title" if in_body && !in_article_meta => {
                        let text = jats_extract_text(&mut reader)?;
                        if !text.is_empty() {
                            builder.push_heading(2, &text, None, None);
                        }
                        continue;
                    }
                    "p" if in_body => {
                        let (text, annotations) = extract_para_with_annotations_jats(&mut reader)?;
                        if !text.is_empty() {
                            builder.push_paragraph(&text, annotations, None, None);
                        }
                        continue;
                    }
                    "fig" if in_body => {
                        let text = jats_extract_text(&mut reader)?;
                        let desc = if text.is_empty() { None } else { Some(text.as_str()) };
                        builder.push_image(desc, None, None, None);
                        continue;
                    }
                    "disp-formula" if in_body => {
                        let text = jats_extract_text(&mut reader)?;
                        if !text.is_empty() {
                            builder.push_formula(&text, None);
                        }
                        continue;
                    }
                    "inline-formula" if in_body => {
                        let text = jats_extract_text(&mut reader)?;
                        if !text.is_empty() {
                            builder.push_formula(&text, None);
                        }
                        continue;
                    }
                    "table-wrap" if in_body => {
                        // Wrapper only; the nested `table` element drives table handling.
                    }
                    "table" => {
                        in_table = true;
                        current_table.clear();
                    }
                    "thead" if in_table => {
                        in_thead = true;
                    }
                    "tbody" if in_table => {
                        in_tbody = true;
                    }
                    "tr" if (in_thead || in_tbody) && in_table => {
                        in_row = true;
                        current_row.clear();
                    }
                    "td" | "th" if in_row => {
                        let text = jats_extract_text(&mut reader)?;
                        current_row.push(text);
                        continue;
                    }
                    "ref-list" => {
                        in_ref_list = true;
                    }
                    "ref" if in_ref_list => {
                        // Use the `id` attribute as citation key; unreadable
                        // attributes are ignored.
                        let mut ref_id = String::new();
                        for attr in e.attributes() {
                            if let Ok(attr) = attr
                                && String::from_utf8_lossy(attr.key.as_ref()) == "id"
                            {
                                ref_id = String::from_utf8_lossy(attr.value.as_ref()).to_string();
                            }
                        }
                        let text = jats_extract_text(&mut reader)?;
                        if !text.is_empty() {
                            let key = if ref_id.is_empty() { "ref" } else { &ref_id };
                            builder.push_citation(key, &text, None);
                        }
                        continue;
                    }
                    _ => {}
                }
            }
            Ok(Event::End(e)) => {
                let name = e.name();
                let tag = crate::utils::xml_tag_name(name.as_ref());
                match tag.as_ref() {
                    "article-meta" => {
                        in_article_meta = false;
                    }
                    "body" => {
                        in_body = false;
                    }
                    "ref-list" => {
                        in_ref_list = false;
                    }
                    "table" if in_table => {
                        // Emit the table only if at least one row was collected.
                        if !current_table.is_empty() {
                            builder.push_table_from_cells(&current_table, None);
                            current_table.clear();
                        }
                        in_table = false;
                    }
                    "thead" if in_thead => {
                        in_thead = false;
                    }
                    "tbody" if in_tbody => {
                        in_tbody = false;
                    }
                    "tr" if in_row => {
                        // Move the finished row into the table, leaving an
                        // empty buffer for the next row.
                        if !current_row.is_empty() {
                            current_table.push(std::mem::take(&mut current_row));
                        }
                        in_row = false;
                    }
                    _ => {}
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(crate::error::KreuzbergError::parsing(format!(
                    "XML parsing error: {}",
                    e
                )));
            }
            _ => {}
        }
    }

    Ok(builder.build())
}
/// Document extractor plugin for JATS (Journal Article Tag Suite) XML.
///
/// The type carries no state, so it is cheap to construct, copy and share.
#[derive(Debug, Clone, Copy, Default)]
pub struct JatsExtractor;

impl JatsExtractor {
    /// Creates a new extractor.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
impl Plugin for JatsExtractor {
    /// Returns the plugin name, `"jats-extractor"`.
    fn name(&self) -> &str {
        "jats-extractor"
    }

    /// Returns the crate version captured at compile time.
    fn version(&self) -> String {
        String::from(env!("CARGO_PKG_VERSION"))
    }

    /// No setup is required; always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// No teardown is required; always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
/// Upper-cases the first character of `s` and leaves the rest untouched.
///
/// Operates on `char` boundaries, so an empty string or a string starting
/// with a multi-byte character is handled without panicking.
fn capitalize_first_char(s: &str) -> String {
    let mut chars = s.chars();
    match chars.next() {
        Some(first) => first.to_uppercase().chain(chars).collect(),
        None => String::new(),
    }
}

#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl DocumentExtractor for JatsExtractor {
    /// Extracts text, tables and metadata from JATS XML bytes.
    ///
    /// `content` is decoded as UTF-8, falling back to lossy decoding when it
    /// is not valid. The parsed JATS metadata is flattened into
    /// `Metadata::subject` as `"Label: value"` parts joined by `" | "`. The
    /// publication date is also stored in `created_at`. History dates,
    /// copyright, license and contributor roles are stored in
    /// `Metadata::additional`. A document structure is built only when
    /// `config.include_document_structure` is set.
    ///
    /// # Errors
    ///
    /// Propagates errors from JATS parsing and from building the document
    /// structure.
    #[cfg_attr(
        feature = "otel",
        tracing::instrument(
            skip(self, content, config),
            fields(
                extractor.name = self.name(),
                content.size_bytes = content.len(),
            )
        )
    )]
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        let jats_content = utf8_validation::from_utf8(content)
            .map(|s| s.to_string())
            .unwrap_or_else(|_| String::from_utf8_lossy(content).into_owned());
        let (jats_metadata, extracted_content, _title, tables) = extract_jats_all_in_one(&jats_content)?;

        let mut metadata = Metadata::default();
        // Human-readable "Label: value" parts; joined into `metadata.subject` below.
        let mut subject_parts = Vec::new();

        if !jats_metadata.title.is_empty() {
            subject_parts.push(format!("Title: {}", jats_metadata.title));
        }
        if let Some(subtitle) = &jats_metadata.subtitle {
            subject_parts.push(format!("Subtitle: {}", subtitle));
        }
        if !jats_metadata.authors.is_empty() {
            subject_parts.push(format!("Authors: {}", jats_metadata.authors.join("; ")));
        }
        if !jats_metadata.affiliations.is_empty() {
            subject_parts.push(format!("Affiliations: {}", jats_metadata.affiliations.join("; ")));
        }
        if let Some(doi) = &jats_metadata.doi {
            subject_parts.push(format!("DOI: {}", doi));
        }
        if let Some(pii) = &jats_metadata.pii {
            subject_parts.push(format!("PII: {}", pii));
        }
        if !jats_metadata.keywords.is_empty() {
            subject_parts.push(format!("Keywords: {}", jats_metadata.keywords.join("; ")));
        }
        if let Some(date) = &jats_metadata.publication_date {
            metadata.created_at = Some(date.clone());
            subject_parts.push(format!("Publication Date: {}", date));
        }
        if let Some(volume) = &jats_metadata.volume {
            subject_parts.push(format!("Volume: {}", volume));
        }
        if let Some(issue) = &jats_metadata.issue {
            subject_parts.push(format!("Issue: {}", issue));
        }
        if let Some(pages) = &jats_metadata.pages {
            subject_parts.push(format!("Pages: {}", pages));
        }
        if let Some(journal_title) = &jats_metadata.journal_title {
            subject_parts.push(format!("Journal: {}", journal_title));
        }
        if let Some(article_type) = &jats_metadata.article_type {
            subject_parts.push(format!("Article Type: {}", article_type));
        }
        if let Some(abstract_text) = &jats_metadata.abstract_text {
            subject_parts.push(format!("Abstract: {}", abstract_text));
        }
        if let Some(corresp_author) = &jats_metadata.corresponding_author {
            subject_parts.push(format!("Corresponding Author: {}", corresp_author));
        }

        for (date_type, date_val) in &jats_metadata.history_dates {
            // `date_type` comes from the document, so it may be empty or start
            // with a multi-byte character; byte slicing here would panic.
            subject_parts.push(format!("{}: {}", capitalize_first_char(date_type), date_val));
            metadata.additional.insert(
                std::borrow::Cow::Owned(format!("date_{}", date_type)),
                serde_json::json!(date_val),
            );
        }

        if let Some(copyright) = &jats_metadata.copyright_statement {
            subject_parts.push(format!("Copyright: {}", copyright));
            metadata
                .additional
                .insert(std::borrow::Cow::Borrowed("copyright"), serde_json::json!(copyright));
        }
        if let Some(license) = &jats_metadata.license {
            metadata
                .additional
                .insert(std::borrow::Cow::Borrowed("license"), serde_json::json!(license));
        }
        if !jats_metadata.contributor_roles.is_empty() {
            let roles: Vec<serde_json::Value> = jats_metadata
                .contributor_roles
                .iter()
                .map(|(name, role)| serde_json::json!({"name": name, "role": role}))
                .collect();
            metadata.additional.insert(
                std::borrow::Cow::Borrowed("contributor_roles"),
                serde_json::json!(roles),
            );
        }

        if !subject_parts.is_empty() {
            metadata.subject = Some(subject_parts.join(" | "));
        }

        let document = if config.include_document_structure {
            Some(build_jats_document_structure(&jats_content)?)
        } else {
            None
        };

        Ok(ExtractionResult {
            content: extracted_content,
            mime_type: mime_type.to_string().into(),
            metadata,
            tables,
            detected_languages: None,
            chunks: None,
            images: None,
            pages: None,
            djot_content: None,
            elements: None,
            ocr_elements: None,
            document,
            #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
            extracted_keywords: None,
            quality_score: None,
            processing_warnings: Vec::new(),
            annotations: None,
            children: None,
        })
    }

    /// Reads the file at `path` and delegates to [`Self::extract_bytes`].
    ///
    /// # Errors
    ///
    /// Returns an error when the file cannot be read or extraction fails.
    #[cfg(feature = "tokio-runtime")]
    #[cfg_attr(
        feature = "otel",
        tracing::instrument(
            skip(self, path, config),
            fields(
                extractor.name = self.name(),
            )
        )
    )]
    async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
        let bytes = tokio::fs::read(path).await?;
        self.extract_bytes(&bytes, mime_type, config).await
    }

    /// MIME types handled by this extractor.
    fn supported_mime_types(&self) -> &[&str] {
        &["application/x-jats+xml", "text/jats"]
    }

    /// Priority value reported for this extractor (`50`).
    fn priority(&self) -> i32 {
        50
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use elements::extract_jats_all_in_one;

    /// The plugin reports its name and has no-op lifecycle hooks.
    #[test]
    fn test_jats_extractor_plugin_interface() {
        let plugin = JatsExtractor::new();
        assert_eq!(plugin.name(), "jats-extractor");
        assert!(plugin.initialize().is_ok());
        assert!(plugin.shutdown().is_ok());
    }

    /// Exactly the two JATS MIME types are advertised.
    #[test]
    fn test_jats_extractor_supported_mime_types() {
        let plugin = JatsExtractor::new();
        let supported = plugin.supported_mime_types();
        assert_eq!(supported.len(), 2);
        assert!(supported.contains(&"application/x-jats+xml"));
        assert!(supported.contains(&"text/jats"));
    }

    /// The extractor priority is fixed at 50.
    #[test]
    fn test_jats_extractor_priority() {
        let plugin = JatsExtractor::new();
        assert_eq!(plugin.priority(), 50);
    }

    /// A minimal article yields its title and body text.
    #[test]
    fn test_parse_simple_jats_article() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Test Article Title</article-title>
</article-meta>
</front>
<body>
<p>Test content paragraph.</p>
</body>
</article>"#;
        let (metadata, content, title, _) = extract_jats_all_in_one(xml).expect("Parse failed");
        assert_eq!(title, "Test Article Title");
        assert_eq!(metadata.title, "Test Article Title");
        assert!(content.contains("Test Article Title"));
        assert!(content.contains("Test content"));
    }

    /// The article title is captured in the metadata.
    #[test]
    fn test_extract_jats_title() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Effects of Caffeine on Human Health</article-title>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert_eq!(metadata.title, "Effects of Caffeine on Human Health");
    }

    /// The subtitle is captured alongside the main title.
    #[test]
    fn test_extract_jats_subtitle() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Main Title</article-title>
<subtitle>A Systematic Review</subtitle>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert_eq!(metadata.title, "Main Title");
        assert_eq!(metadata.subtitle, Some("A Systematic Review".to_string()));
    }

    /// Each author contributor is collected in document order.
    #[test]
    fn test_extract_jats_authors() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Smith</surname>
<given-names>John A.</given-names>
</name>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Johnson</surname>
<given-names>Jane B.</given-names>
</name>
</contrib>
</contrib-group>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert_eq!(metadata.authors.len(), 2);
        assert!(metadata.authors[0].contains("Smith"));
        assert!(metadata.authors[1].contains("Johnson"));
    }

    /// Each `aff` element becomes one affiliation entry.
    #[test]
    fn test_extract_jats_affiliations() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<aff id="aff1">Department of Medicine, Harvard University, Cambridge, MA</aff>
<aff id="aff2">Center for Health Research, Boston Medical Center, Boston, MA</aff>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert_eq!(metadata.affiliations.len(), 2);
        assert!(metadata.affiliations[0].contains("Harvard"));
        assert!(metadata.affiliations[1].contains("Boston"));
    }

    /// DOI and PII identifiers are distinguished by `pub-id-type`.
    #[test]
    fn test_extract_jats_doi_and_pii() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-id pub-id-type="doi">10.1371/journal.pmed.0020124</article-id>
<article-id pub-id-type="pii">05-PLME-RA-0071R2</article-id>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert_eq!(metadata.doi, Some("10.1371/journal.pmed.0020124".to_string()));
        assert_eq!(metadata.pii, Some("05-PLME-RA-0071R2".to_string()));
    }

    /// Every `kwd` in a keyword group is collected.
    #[test]
    fn test_extract_jats_keywords() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<kwd-group>
<kwd>caffeine</kwd>
<kwd>meta-analysis</kwd>
<kwd>systematic review</kwd>
</kwd-group>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert_eq!(metadata.keywords.len(), 3);
        assert!(metadata.keywords.contains(&"caffeine".to_string()));
        assert!(metadata.keywords.contains(&"meta-analysis".to_string()));
    }

    /// Publication date, volume, issue and page range are extracted.
    #[test]
    fn test_extract_jats_publication_info() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<pub-date pub-type="epub">
<day>18</day>
<month>04</month>
<year>2005</year>
</pub-date>
<volume>2</volume>
<issue>4</issue>
<fpage>e124</fpage>
<lpage>e132</lpage>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert!(metadata.publication_date.is_some());
        assert_eq!(metadata.volume, Some("2".to_string()));
        assert_eq!(metadata.issue, Some("4".to_string()));
        assert!(metadata.pages.is_some());
    }

    /// A sectioned abstract is flattened into one text value.
    #[test]
    fn test_extract_jats_abstract() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<abstract>
<sec>
<title>Background</title>
<p>This is the background information of the study.</p>
</sec>
<sec>
<title>Methods</title>
<p>We used quantitative analysis to evaluate the hypothesis.</p>
</sec>
</abstract>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert!(metadata.abstract_text.is_some());
        let summary = metadata.abstract_text.expect("abstract_text should be present");
        assert!(summary.contains("background"));
        assert!(summary.contains("quantitative"));
    }

    /// A table with one header row and two body rows yields three rows of cells.
    #[test]
    fn test_extract_jats_tables_basic() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<body>
<table-wrap id="tbl1">
<table>
<thead>
<tr>
<th>Study</th>
<th>Year</th>
</tr>
</thead>
<tbody>
<tr>
<td>Study A</td>
<td>2003</td>
</tr>
<tr>
<td>Study B</td>
<td>2004</td>
</tr>
</tbody>
</table>
</table-wrap>
</body>
</article>"#;
        let (_, _, _, tables) = extract_jats_all_in_one(xml).expect("Table extraction failed");
        assert_eq!(tables.len(), 1);
        assert_eq!(tables[0].cells.len(), 3);
    }

    /// The `corresp` note is captured as the corresponding author.
    #[test]
    fn test_extract_jats_corresponding_author() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<author-notes>
<corresp id="cor1">To whom correspondence should be addressed. E-mail: rwilliams@yale.edu</corresp>
</author-notes>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert!(metadata.corresponding_author.is_some());
        let contact = metadata
            .corresponding_author
            .expect("corresponding_author should be present");
        assert!(contact.contains("rwilliams"));
    }

    /// Section titles from the body appear in the extracted content.
    #[test]
    fn test_extract_jats_section_hierarchy() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Article Title</article-title>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>Intro content.</p>
</sec>
<sec id="s2">
<title>Methods</title>
<p>Methods content.</p>
</sec>
<sec id="s3">
<title>Results</title>
<p>Results content.</p>
</sec>
</body>
</article>"#;
        let (_, content, ..) = extract_jats_all_in_one(xml).expect("Parse failed");
        assert!(content.contains("Introduction"));
        assert!(content.contains("Methods"));
        assert!(content.contains("Results"));
    }

    /// Title, authors, DOI, keywords and abstract are all extracted together.
    #[test]
    fn test_jats_extractor_full_metadata_extraction() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Sample Article</article-title>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Smith</surname>
<given-names>John</given-names>
</name>
</contrib>
</contrib-group>
<article-id pub-id-type="doi">10.1234/test</article-id>
<kwd-group>
<kwd>test</kwd>
</kwd-group>
<abstract>
<p>Test abstract.</p>
</abstract>
</article-meta>
</front>
<body>
<p>Sample content.</p>
</body>
</article>"#;
        let (extracted, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert_eq!(extracted.title, "Sample Article");
        assert_eq!(extracted.authors.len(), 1);
        assert_eq!(extracted.doi, Some("10.1234/test".to_string()));
        assert_eq!(extracted.keywords.len(), 1);
        assert!(extracted.abstract_text.is_some());
    }

    /// An article with empty metadata and body yields no title and no content.
    #[test]
    fn test_jats_extractor_empty_article() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
</article-meta>
</front>
<body>
</body>
</article>"#;
        let (metadata, content, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert!(metadata.title.is_empty());
        assert!(content.is_empty() || content.trim().is_empty());
    }

    /// The journal title is captured.
    #[test]
    fn test_extract_jats_journal_title() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Test Article</article-title>
<journal-title>Nature Medicine</journal-title>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert_eq!(metadata.journal_title, Some("Nature Medicine".to_string()));
    }

    /// The `article-type` attribute of the root element is captured.
    #[test]
    fn test_extract_jats_article_type() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article article-type="research-article">
<front>
<article-meta>
<article-title>Test Article</article-title>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert_eq!(metadata.article_type, Some("research-article".to_string()));
    }

    /// All thirteen core metadata fields are populated from one document.
    #[test]
    fn test_extract_all_13_metadata_fields() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article article-type="research-article">
<front>
<article-meta>
<article-title>Full Metadata Test</article-title>
<subtitle>A Complete Example</subtitle>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Author</surname>
<given-names>First</given-names>
</name>
</contrib>
</contrib-group>
<aff>Department of Testing, Test University</aff>
<article-id pub-id-type="doi">10.1234/full-test</article-id>
<article-id pub-id-type="pii">TEST-001</article-id>
<kwd-group>
<kwd>testing</kwd>
<kwd>metadata</kwd>
</kwd-group>
<pub-date pub-type="epub">
<year>2024</year>
</pub-date>
<volume>5</volume>
<issue>3</issue>
<fpage>100</fpage>
<lpage>110</lpage>
<journal-title>Test Journal</journal-title>
<abstract>
<p>This is a test abstract for all metadata fields.</p>
</abstract>
<author-notes>
<corresp>Correspondence: test@example.com</corresp>
</author-notes>
</article-meta>
</front>
<body>
<p>Test content.</p>
</body>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Metadata extraction failed");
        assert_eq!(metadata.title, "Full Metadata Test");
        assert_eq!(metadata.subtitle, Some("A Complete Example".to_string()));
        assert_eq!(metadata.authors.len(), 1);
        assert!(metadata.authors[0].contains("Author"));
        assert_eq!(metadata.affiliations.len(), 1);
        assert!(metadata.affiliations[0].contains("Testing"));
        assert_eq!(metadata.doi, Some("10.1234/full-test".to_string()));
        assert_eq!(metadata.pii, Some("TEST-001".to_string()));
        assert_eq!(metadata.keywords.len(), 2);
        assert!(metadata.publication_date.is_some());
        assert_eq!(metadata.volume, Some("5".to_string()));
        assert_eq!(metadata.issue, Some("3".to_string()));
        assert!(metadata.pages.is_some());
        assert_eq!(metadata.journal_title, Some("Test Journal".to_string()));
        assert_eq!(metadata.article_type, Some("research-article".to_string()));
        assert!(metadata.abstract_text.is_some());
        assert!(metadata.corresponding_author.is_some());
    }

    /// History dates keep their `date-type` and document order.
    #[test]
    fn test_extract_jats_history_dates() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Test Article</article-title>
<history>
<date date-type="received">
<day>15</day>
<month>01</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>20</day>
<month>03</month>
<year>2024</year>
</date>
</history>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Parse failed");
        assert_eq!(metadata.history_dates.len(), 2);
        assert_eq!(metadata.history_dates[0].0, "received");
        assert!(metadata.history_dates[0].1.contains("2024"));
        assert_eq!(metadata.history_dates[1].0, "accepted");
    }

    /// Copyright statement and license text are read from `permissions`.
    #[test]
    fn test_extract_jats_permissions() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Test Article</article-title>
<permissions>
<copyright-statement>Copyright 2024 The Authors</copyright-statement>
<license>
<license-p>This is an open-access article distributed under the CC BY 4.0 license.</license-p>
</license>
</permissions>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Parse failed");
        assert_eq!(
            metadata.copyright_statement,
            Some("Copyright 2024 The Authors".to_string())
        );
        assert!(metadata.license.is_some());
        let license_text = metadata.license.expect("license should be present");
        assert!(license_text.contains("CC BY 4.0"));
    }

    /// Inline formula text is kept in the extracted content.
    #[test]
    fn test_extract_jats_inline_formula() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Formula Test</article-title>
</article-meta>
</front>
<body>
<p>The equation <inline-formula>E = mc^2</inline-formula> is famous.</p>
</body>
</article>"#;
        let (_, content, ..) = extract_jats_all_in_one(xml).expect("Parse failed");
        assert!(content.contains("E = mc^2"), "expected inline formula, got: {content}");
    }

    /// Contributors are paired with their `contrib-type` role.
    #[test]
    fn test_extract_jats_contributor_roles() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Test Article</article-title>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Smith</surname>
<given-names>John</given-names>
</name>
</contrib>
<contrib contrib-type="editor">
<name>
<surname>Jones</surname>
<given-names>Mary</given-names>
</name>
</contrib>
</contrib-group>
</article-meta>
</front>
</article>"#;
        let (metadata, ..) = extract_jats_all_in_one(xml).expect("Parse failed");
        assert_eq!(metadata.contributor_roles.len(), 2);

        let author_entry = metadata
            .contributor_roles
            .iter()
            .find(|(_, role)| role == "author")
            .expect("expected author role");
        assert!(author_entry.0.contains("Smith"));

        let editor_entry = metadata
            .contributor_roles
            .iter()
            .find(|(_, role)| role == "editor")
            .expect("expected editor role");
        assert!(editor_entry.0.contains("Jones"));
    }
}