use super::xml_utils;
use crate::pmc::models::{FundingInfo, JournalInfo, SupplementaryMaterial};
pub fn extract_journal_info(content: &str) -> JournalInfo {
let mut journal = JournalInfo::new(
xml_utils::extract_text_between(content, "<journal-title>", "</journal-title>")
.unwrap_or_else(|| "Unknown Journal".to_string()),
);
journal.abbreviation = xml_utils::extract_text_between(
content,
"<journal-id journal-id-type=\"iso-abbrev\">",
"</journal-id>",
);
let mut pos = 0;
while let Some(issn_start) = content[pos..].find("<issn") {
let issn_start = pos + issn_start;
if let Some(issn_end) = content[issn_start..].find("</issn>") {
let issn_end = issn_start + issn_end;
let issn_section = &content[issn_start..issn_end];
if let Some(content_start) = issn_section.find(">") {
let issn_value = &issn_section[content_start + 1..];
if issn_section.contains("pub-type=\"epub\"") {
journal.issn_electronic = Some(issn_value.to_string());
} else if issn_section.contains("pub-type=\"ppub\"") {
journal.issn_print = Some(issn_value.to_string());
}
}
pos = issn_end;
} else {
break;
}
}
journal.publisher =
xml_utils::extract_text_between(content, "<publisher-name>", "</publisher-name>");
journal.volume = xml_utils::extract_text_between(content, "<volume>", "</volume>");
journal.issue = xml_utils::extract_text_between(content, "<issue>", "</issue>");
journal
}
pub fn extract_pub_date(content: &str) -> String {
if let Some(year) = xml_utils::extract_text_between(content, "<year>", "</year>") {
if let Some(month) = xml_utils::extract_text_between(content, "<month>", "</month>") {
if let Some(day) = xml_utils::extract_text_between(content, "<day>", "</day>") {
return format!(
"{}-{:02}-{:02}",
year,
month.parse::<u32>().unwrap_or(1),
day.parse::<u32>().unwrap_or(1)
);
}
return format!("{}-{:02}", year, month.parse::<u32>().unwrap_or(1));
}
return year;
}
"Unknown Date".to_string()
}
pub fn extract_doi(content: &str) -> Option<String> {
let mut pos = 0;
while let Some(id_start) = content[pos..].find(r#"<article-id pub-id-type="doi""#) {
let id_start = pos + id_start;
if let Some(content_start) = content[id_start..].find(">") {
let content_start = id_start + content_start + 1;
if let Some(content_end) = content[content_start..].find("</article-id>") {
let content_end = content_start + content_end;
return Some(content[content_start..content_end].trim().to_string());
}
}
pos = id_start + 1;
}
None
}
pub fn extract_pmid(content: &str) -> Option<String> {
let mut pos = 0;
while let Some(id_start) = content[pos..].find(r#"<article-id pub-id-type="pmid""#) {
let id_start = pos + id_start;
if let Some(content_start) = content[id_start..].find(">") {
let content_start = id_start + content_start + 1;
if let Some(content_end) = content[content_start..].find("</article-id>") {
let content_end = content_start + content_end;
return Some(content[content_start..content_end].trim().to_string());
}
}
pos = id_start + 1;
}
None
}
pub fn extract_article_type(content: &str) -> Option<String> {
if let Some(article_start) = content.find("<article") {
if let Some(article_end) = content[article_start..].find(">") {
let article_tag = &content[article_start..article_start + article_end];
if let Some(type_start) = article_tag.find("article-type=\"") {
let type_start = type_start + 14; if let Some(type_end) = article_tag[type_start..].find('"') {
return Some(article_tag[type_start..type_start + type_end].to_string());
}
}
}
}
xml_utils::extract_text_between(content, "<subject>", "</subject>")
}
pub fn extract_keywords(content: &str) -> Vec<String> {
let mut keywords = Vec::new();
if let Some(kwd_start) = content.find("<kwd-group") {
if let Some(kwd_end) = content[kwd_start..].find("</kwd-group>") {
let kwd_section = &content[kwd_start..kwd_start + kwd_end];
let mut pos = 0;
while let Some(kwd_start) = kwd_section[pos..].find("<kwd>") {
let kwd_start = pos + kwd_start + 5; if let Some(kwd_end) = kwd_section[kwd_start..].find("</kwd>") {
let raw_keyword = kwd_section[kwd_start..kwd_start + kwd_end].trim();
let keyword = xml_utils::strip_xml_tags(raw_keyword);
if !keyword.is_empty() {
keywords.push(keyword);
}
pos = kwd_start + kwd_end;
} else {
break;
}
}
}
}
keywords
}
pub fn extract_funding(content: &str) -> Vec<FundingInfo> {
let mut funding = Vec::new();
if let Some(funding_start) = content.find("<funding-group>") {
if let Some(funding_end) = content[funding_start..].find("</funding-group>") {
let funding_section = &content[funding_start..funding_start + funding_end];
let mut pos = 0;
while let Some(award_start) = funding_section[pos..].find("<award-group") {
let award_start = pos + award_start;
if let Some(award_end) = funding_section[award_start..].find("</award-group>") {
let award_end = award_start + award_end;
let award_section = &funding_section[award_start..award_end];
let source = xml_utils::extract_text_between(
award_section,
"<funding-source>",
"</funding-source>",
)
.unwrap_or_else(|| "Unknown Source".to_string());
let mut funding_info = FundingInfo::new(source);
funding_info.award_id =
xml_utils::extract_text_between(award_section, "<award-id>", "</award-id>");
funding.push(funding_info);
pos = award_end;
} else {
break;
}
}
}
}
funding
}
pub fn extract_conflict_of_interest(content: &str) -> Option<String> {
if let Some(fn_start) = content.find("<fn-group") {
if let Some(fn_end) = content[fn_start..].find("</fn-group>") {
let fn_section = &content[fn_start..fn_start + fn_end];
let mut pos = 0;
while let Some(fn_start) = fn_section[pos..].find("<fn") {
let fn_start = pos + fn_start;
if let Some(fn_end) = fn_section[fn_start..].find("</fn>") {
let fn_end = fn_start + fn_end;
let fn_content = &fn_section[fn_start..fn_end];
if fn_content.contains("conflict") || fn_content.contains("competing") {
if let Some(p_start) = fn_content.find("<p>") {
if let Some(p_end) = fn_content[p_start..].find("</p>") {
let coi = &fn_content[p_start + 3..p_start + p_end];
return Some(xml_utils::strip_xml_tags(coi));
}
}
}
pos = fn_end;
} else {
break;
}
}
}
}
if let Some(coi_start) = content.find("<sec") {
if let Some(coi_end) = content[coi_start..].find("</sec>") {
let coi_section = &content[coi_start..coi_start + coi_end];
if coi_section.contains("conflict") || coi_section.contains("competing") {
if let Some(title_start) = coi_section.find("<title>") {
if let Some(title_end) = coi_section[title_start..].find("</title>") {
let title = &coi_section[title_start + 7..title_start + title_end];
if title.to_lowercase().contains("conflict")
|| title.to_lowercase().contains("competing")
{
if let Some(p_start) = coi_section.find("<p>") {
if let Some(p_end) = coi_section[p_start..].find("</p>") {
let coi = &coi_section[p_start + 3..p_start + p_end];
return Some(xml_utils::strip_xml_tags(coi));
}
}
}
}
}
}
}
}
None
}
pub fn extract_acknowledgments(content: &str) -> Option<String> {
xml_utils::extract_text_between(content, "<ack>", "</ack>")
.map(|ack| xml_utils::strip_xml_tags(&ack))
}
pub fn extract_data_availability(content: &str) -> Option<String> {
if let Some(data_start) = content.find("<sec") {
if let Some(data_end) = content[data_start..].find("</sec>") {
let data_section = &content[data_start..data_start + data_end];
if data_section.contains("data") && data_section.contains("availab") {
return Some(xml_utils::strip_xml_tags(data_section));
}
}
}
if let Some(supp_start) = content.find("<supplementary-material") {
if let Some(supp_end) = content[supp_start..].find("</supplementary-material>") {
let supp_section = &content[supp_start..supp_start + supp_end];
if supp_section.contains("data") && supp_section.contains("availab") {
return Some(xml_utils::strip_xml_tags(supp_section));
}
}
}
None
}
pub fn extract_supplementary_materials(content: &str) -> Vec<SupplementaryMaterial> {
let mut materials = Vec::new();
let mut pos = 0;
while let Some(supp_start) = content[pos..].find("<supplementary-material") {
let supp_start = pos + supp_start;
if let Some(supp_end) = content[supp_start..].find("</supplementary-material>") {
let supp_end = supp_start + supp_end;
let supp_content = &content[supp_start..supp_end];
let id = xml_utils::extract_attribute_value(supp_content, "id").unwrap_or_else(|| {
let supp_num = materials.len() + 1;
format!("supp_{supp_num}")
});
let label = xml_utils::extract_text_between(supp_content, "<label>", "</label>");
let caption = xml_utils::extract_text_between(supp_content, "<caption>", "</caption>")
.and_then(|caption_content| {
xml_utils::extract_text_between(&caption_content, "<title>", "</title>")
.or_else(|| {
Some(xml_utils::strip_xml_tags(&caption_content))
})
})
.unwrap_or_else(|| "No caption available".to_string());
let content_type = xml_utils::extract_attribute_value(supp_content, "content-type");
let mime_type = xml_utils::extract_attribute_value(supp_content, "mimetype");
let position = xml_utils::extract_attribute_value(supp_content, "position");
let href = xml_utils::extract_attribute_value(supp_content, "href")
.or_else(|| xml_utils::extract_attribute_value(supp_content, "xlink:href"))
.or_else(|| {
if let Some(media_start) = supp_content.find("<media") {
if let Some(media_end) = supp_content[media_start..].find(">") {
let media_tag = &supp_content[media_start..media_start + media_end + 1];
xml_utils::extract_attribute_value(media_tag, "xlink:href")
.or_else(|| xml_utils::extract_attribute_value(media_tag, "href"))
} else {
None
}
} else {
None
}
});
let mut material = SupplementaryMaterial::new(id);
material.title = Some(caption);
material.description = label;
material.content_type = content_type;
material.file_type = mime_type;
material.position = position;
material.file_url = href;
materials.push(material);
pos = supp_end;
} else {
break;
}
}
materials
}
pub fn extract_title(content: &str) -> String {
xml_utils::extract_text_between(content, "<article-title>", "</article-title>")
.unwrap_or_else(|| "Unknown Title".to_string())
}
pub fn extract_language(content: &str) -> Option<String> {
if let Some(article_start) = content.find("<article") {
if let Some(article_end) = content[article_start..].find(">") {
let article_tag = &content[article_start..article_start + article_end];
if let Some(lang) = xml_utils::extract_attribute_value(article_tag, "xml:lang") {
return Some(lang);
}
}
}
None
}
pub fn extract_article_ids(content: &str) -> Vec<(String, String)> {
let mut ids = Vec::new();
let id_tags = xml_utils::find_all_tags(content, "article-id");
for id_tag in id_tags {
if let Some(id_type) = xml_utils::extract_attribute_value(&id_tag, "pub-id-type") {
if let Some(id_value) = xml_utils::extract_element_content(&id_tag, "article-id") {
ids.push((id_type, id_value.trim().to_string()));
}
}
}
ids
}
pub fn extract_copyright(content: &str) -> Option<String> {
xml_utils::extract_text_between(content, "<copyright-statement>", "</copyright-statement>")
.or_else(|| {
xml_utils::extract_text_between(content, "<copyright-year>", "</copyright-year>")
})
}
pub fn extract_license(content: &str) -> Option<String> {
xml_utils::extract_element_content(content, "license")
.map(|license_content| xml_utils::strip_xml_tags(&license_content))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_extract_title() {
let content = r#"<article-title>Test Article Title</article-title>"#;
let title = extract_title(content);
assert_eq!(title, "Test Article Title");
}
#[test]
fn test_extract_doi() {
let content = r#"<article-id pub-id-type="doi">10.1234/test.doi</article-id>"#;
let doi = extract_doi(content);
assert_eq!(doi, Some("10.1234/test.doi".to_string()));
}
#[test]
fn test_extract_pmid() {
let content = r#"<article-id pub-id-type="pmid">12345678</article-id>"#;
let pmid = extract_pmid(content);
assert_eq!(pmid, Some("12345678".to_string()));
}
#[test]
fn test_extract_keywords() {
let content = r#"
<kwd-group>
<kwd>keyword1</kwd>
<kwd>keyword2</kwd>
<kwd>keyword3</kwd>
</kwd-group>
"#;
let keywords = extract_keywords(content);
assert_eq!(keywords, vec!["keyword1", "keyword2", "keyword3"]);
}
#[test]
fn test_extract_keywords_with_nested_tags() {
let content = r#"
<kwd-group>
<kwd><italic toggle="yes">Prevotella copri</italic></kwd>
<kwd>normal keyword</kwd>
<kwd><bold>important</bold> keyword</kwd>
</kwd-group>
"#;
let keywords = extract_keywords(content);
assert_eq!(
keywords,
vec!["Prevotella copri", "normal keyword", "important keyword"]
);
}
#[test]
fn test_extract_pub_date() {
let content_full = r#"<year>2023</year><month>12</month><day>25</day>"#;
assert_eq!(extract_pub_date(content_full), "2023-12-25");
let content_year_month = r#"<year>2023</year><month>12</month>"#;
assert_eq!(extract_pub_date(content_year_month), "2023-12");
let content_year_only = r#"<year>2023</year>"#;
assert_eq!(extract_pub_date(content_year_only), "2023");
let content_no_date = r#"<title>No date here</title>"#;
assert_eq!(extract_pub_date(content_no_date), "Unknown Date");
}
#[test]
fn test_extract_article_type() {
let content = r#"<article article-type="research-article">Content</article>"#;
let article_type = extract_article_type(content);
assert_eq!(article_type, Some("research-article".to_string()));
}
#[test]
fn test_extract_language() {
let content = r#"<article xml:lang="en">Content</article>"#;
let language = extract_language(content);
assert_eq!(language, Some("en".to_string()));
}
#[test]
fn test_extract_article_ids() {
let content = r#"
<article-id pub-id-type="doi">10.1234/test</article-id>
<article-id pub-id-type="pmid">12345</article-id>
<article-id pub-id-type="pmc">PMC123456</article-id>
"#;
let ids = extract_article_ids(content);
assert_eq!(ids.len(), 3);
assert!(ids.contains(&("doi".to_string(), "10.1234/test".to_string())));
assert!(ids.contains(&("pmid".to_string(), "12345".to_string())));
assert!(ids.contains(&("pmc".to_string(), "PMC123456".to_string())));
}
#[test]
fn test_extract_acknowledgments() {
let content = r#"<ack><p>We thank the contributors for their valuable input.</p></ack>"#;
let ack = extract_acknowledgments(content);
assert_eq!(
ack,
Some("We thank the contributors for their valuable input.".to_string())
);
}
}