use super::preprocessing::strip_inline_html_tags;
use super::xml_types::PubmedArticleSet;
use crate::error::{PubMedError, Result};
use crate::pubmed::models::PubMedArticle;
use quick_xml::de::from_str;
use tracing::{instrument, warn};
/// Parses a `PubmedArticleSet` XML document into a list of [`PubMedArticle`]s.
///
/// The input is first passed through `strip_inline_html_tags`, then deserialized
/// with `quick_xml`. Each record in the set is converted individually; a record
/// is skipped (with a warning logged) rather than failing the whole batch when:
///
/// * it carries no PMID, or
/// * its conversion via `into_article` returns an error.
///
/// The returned vector therefore may contain fewer entries than the document
/// has records, and is empty for an empty set.
///
/// # Errors
///
/// Returns [`PubMedError::XmlError`] if the cleaned XML cannot be deserialized
/// into a `PubmedArticleSet`.
#[instrument(skip(xml), fields(xml_size = xml.len()))]
pub fn parse_articles_from_xml(xml: &str) -> Result<Vec<PubMedArticle>> {
    let cleaned_xml = strip_inline_html_tags(xml);
    let article_set: PubmedArticleSet = from_str(&cleaned_xml)
        .map_err(|e| PubMedError::XmlError(format!("Failed to deserialize XML: {}", e)))?;

    let articles: Vec<PubMedArticle> = article_set
        .articles
        .into_iter()
        .filter_map(|article_xml| {
            // Clone the PMID so it is available independently of `article_xml`
            // for both the conversion call and the log message below.
            let pmid = match article_xml.medline_citation.pmid.as_ref() {
                Some(p) => p.value.clone(),
                None => {
                    // Previously this case was dropped without any trace; log it
                    // so that missing records can be diagnosed like other skips.
                    warn!("Article has no PMID, skipping");
                    return None;
                }
            };

            match article_xml.into_article(&pmid) {
                Ok(article) => Some(article),
                Err(e) => {
                    warn!(pmid = %pmid, error = %e, "Failed to parse article, skipping");
                    None
                }
            }
        })
        .collect();

    Ok(articles)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A set with two records yields two articles, in document order, with the
    /// expected PMIDs and titles.
    #[test]
    fn test_parse_multiple_articles() {
        let input = r#"<?xml version="1.0" ?>
<PubmedArticleSet>
<PubmedArticle>
<MedlineCitation>
<PMID>12345678</PMID>
<Article>
<ArticleTitle>First Article</ArticleTitle>
<Journal><Title>Journal One</Title></Journal>
</Article>
</MedlineCitation>
</PubmedArticle>
<PubmedArticle>
<MedlineCitation>
<PMID>87654321</PMID>
<Article>
<ArticleTitle>Second Article</ArticleTitle>
<Journal><Title>Journal Two</Title></Journal>
</Article>
</MedlineCitation>
</PubmedArticle>
</PubmedArticleSet>"#;

        let parsed = parse_articles_from_xml(input).unwrap();
        assert_eq!(parsed.len(), 2);

        let first = &parsed[0];
        assert_eq!(first.pmid, "12345678");
        assert_eq!(first.title, "First Article");

        let second = &parsed[1];
        assert_eq!(second.pmid, "87654321");
        assert_eq!(second.title, "Second Article");
    }

    /// An article set with no records parses successfully to an empty list.
    #[test]
    fn test_parse_empty_set() {
        let input = r#"<?xml version="1.0" ?>
<PubmedArticleSet>
</PubmedArticleSet>"#;

        let parsed = parse_articles_from_xml(input).unwrap();
        assert!(parsed.is_empty());
    }

    /// Inline markup inside the title and abstract does not break parsing, and
    /// the text wrapped by the markup is still present in the title.
    #[test]
    fn test_parse_with_inline_html() {
        let input = r#"<?xml version="1.0" ?>
<PubmedArticleSet>
<PubmedArticle>
<MedlineCitation>
<PMID>11111111</PMID>
<Article>
<ArticleTitle>Article with <i>italic</i> text</ArticleTitle>
<Abstract>
<AbstractText>Abstract with H<sub>2</sub>O formula</AbstractText>
</Abstract>
<Journal><Title>Test Journal</Title></Journal>
</Article>
</MedlineCitation>
</PubmedArticle>
</PubmedArticleSet>"#;

        let parsed = parse_articles_from_xml(input).unwrap();
        assert_eq!(parsed.len(), 1);

        let only = &parsed[0];
        assert!(only.title.contains("italic"));
    }
}