use crate::Author;
use crate::error::SourceSpan;
use crate::ris::tags::RisTag;
use std::collections::HashMap;
/// Intermediate, loosely-typed form of a single RIS record.
///
/// Tag values and authors are accumulated here and the record is later
/// converted into a `crate::Citation` through the `TryFrom` implementation
/// in this file.
#[derive(Debug, Clone)]
pub(crate) struct RawRisData {
/// Values collected per tag; each tag keeps its values in insertion order.
pub(crate) data: HashMap<RisTag, Vec<String>>,
/// Authors collected for this record, in insertion order.
pub(crate) authors: Vec<Author>,
/// Lines recorded as ignored, stored as `(line_number, line)` pairs.
pub(crate) ignored_lines: Vec<(usize, String)>,
/// Line number passed to the parse error raised when the title is missing.
pub(crate) start_line: Option<usize>,
/// Span attached to the missing-title parse error when present.
pub(crate) record_span: Option<SourceSpan>,
}
impl RawRisData {
pub(crate) fn new() -> Self {
Self {
data: HashMap::new(),
authors: Vec::new(),
ignored_lines: Vec::new(),
start_line: None,
record_span: None,
}
}
pub(crate) fn add_data(&mut self, tag: RisTag, value: String) {
self.data.entry(tag).or_default().push(value);
}
pub(crate) fn add_author(&mut self, author: Author) {
self.authors.push(author);
}
pub(crate) fn add_ignored_line(&mut self, line_number: usize, line: String) {
self.ignored_lines.push((line_number, line));
}
pub(crate) fn get_first(&self, tag: &RisTag) -> Option<&String> {
self.data.get(tag).and_then(|values| values.first())
}
pub(crate) fn remove(&mut self, tag: &RisTag) -> Option<Vec<String>> {
self.data.remove(tag)
}
pub(crate) fn has_content(&self) -> bool {
!self.data.is_empty() || !self.authors.is_empty()
}
fn get_best_value_by_priority<F>(&self, priority_fn: F) -> Option<String>
where
F: Fn(&RisTag) -> Option<u8>,
{
let mut best_value = None;
let mut best_priority = u8::MAX;
for (tag, values) in &self.data {
if let Some(priority) = priority_fn(tag)
&& priority < best_priority
&& !values.is_empty()
&& let Some(first_value) = values.first()
&& !first_value.trim().is_empty()
{
best_priority = priority;
best_value = Some(first_value.clone());
}
}
best_value
}
pub(crate) fn get_best_journal(&self) -> Option<String> {
self.get_best_value_by_priority(|tag| tag.journal_priority())
}
pub(crate) fn get_best_journal_abbr(&self) -> Option<String> {
self.get_best_value_by_priority(|tag| tag.journal_abbr_priority())
}
}
/// Converts an accumulated RIS record into a `crate::Citation`.
///
/// Each `extract_*` helper consumes the tags it understands from `raw`;
/// whatever is still left afterwards ends up in `extra_fields`.
impl TryFrom<RawRisData> for crate::Citation {
    type Error = crate::error::ParseError;

    /// Builds the citation from `raw`.
    ///
    /// # Errors
    ///
    /// Returns the error produced by `extract_title` when the record has no
    /// usable title.
    fn try_from(mut raw: RawRisData) -> Result<Self, Self::Error> {
        // Every `TY` value is translated to its descriptive name; unknown
        // codes pass through unchanged.
        let citation_type = raw
            .remove(&RisTag::Type)
            .unwrap_or_default()
            .into_iter()
            .map(|t| map_ris_type(&t).to_string())
            .collect();

        let title = Self::extract_title(&mut raw)?;
        let (journal, journal_abbr) = Self::extract_journal_info(&mut raw);
        let date = Self::extract_date(&mut raw);
        let (volume, issue, pages) = Self::extract_publication_details(&mut raw);
        let (doi, urls) = Self::extract_doi_and_urls(&mut raw);
        let (pmid, pmc_id) = Self::extract_identifiers(&mut raw);
        let abstract_text = Self::extract_abstract(&mut raw);
        let keywords = raw.remove(&RisTag::Keywords).unwrap_or_default();
        let issn = raw.remove(&RisTag::SerialNumber).unwrap_or_default();
        let (language, publisher) = Self::extract_metadata(&mut raw);

        // Must run last: it drains every tag the steps above did not consume.
        let extra_fields = Self::extract_extra_fields(&mut raw);

        Ok(crate::Citation {
            citation_type,
            title,
            authors: raw.authors,
            journal,
            journal_abbr,
            date,
            volume,
            issue,
            pages,
            issn,
            doi,
            pmid,
            pmc_id,
            abstract_text,
            keywords,
            urls,
            language,
            mesh_terms: Vec::new(),
            publisher,
            extra_fields,
        })
    }
}
impl crate::Citation {
fn extract_title(raw: &mut RawRisData) -> Result<String, crate::error::ParseError> {
let start_line = raw.start_line;
let record_span = raw.record_span.clone();
let title = raw
.get_first(&RisTag::Title)
.filter(|s| !s.trim().is_empty())
.or_else(|| {
raw.get_first(&RisTag::TitleAlternative)
.filter(|s| !s.trim().is_empty())
})
.cloned()
.ok_or_else(|| {
let err = crate::error::ParseError::new(
start_line,
None,
crate::CitationFormat::Ris,
crate::error::ValueError::MissingValue {
field: crate::error::fields::TITLE,
key: "TI",
},
);
if let Some(span) = record_span {
err.with_span(span)
} else {
err
}
})?;
raw.remove(&RisTag::Title);
raw.remove(&RisTag::TitleAlternative);
Ok(title)
}
fn extract_journal_info(raw: &mut RawRisData) -> (Option<String>, Option<String>) {
let journal = raw.get_best_journal();
let journal_abbr = raw.get_best_journal_abbr();
raw.remove(&RisTag::JournalFull);
raw.remove(&RisTag::JournalFullAlternative);
raw.remove(&RisTag::JournalAbbreviation);
raw.remove(&RisTag::JournalAbbreviationAlternative);
raw.remove(&RisTag::SecondaryTitle);
(journal, journal_abbr)
}
fn extract_date(raw: &mut RawRisData) -> Option<crate::Date> {
let date = raw
.get_first(&RisTag::PublicationYear)
.or_else(|| raw.get_first(&RisTag::DatePrimary))
.and_then(|date_str| {
crate::utils::parse_ris_date(date_str)
});
raw.remove(&RisTag::PublicationYear);
raw.remove(&RisTag::DatePrimary);
raw.remove(&RisTag::DateAccess);
date
}
fn extract_publication_details(
raw: &mut RawRisData,
) -> (Option<String>, Option<String>, Option<String>) {
let volume = raw
.remove(&RisTag::Volume)
.and_then(|v| v.into_iter().next());
let issue = raw
.remove(&RisTag::Issue)
.and_then(|v| v.into_iter().next());
let start_page = raw
.remove(&RisTag::StartPage)
.and_then(|v| v.into_iter().next());
let end_page = raw
.remove(&RisTag::EndPage)
.and_then(|v| v.into_iter().next());
let pages = match (start_page, end_page) {
(Some(start), Some(end)) => Some(crate::utils::format_page_numbers(&format!(
"{}-{}",
start, end
))),
(Some(start), None) => Some(crate::utils::format_page_numbers(&start)),
(None, Some(end)) => Some(end),
(None, None) => None,
};
(volume, issue, pages)
}
fn extract_doi_and_urls(raw: &mut RawRisData) -> (Option<String>, Vec<String>) {
let mut doi = raw
.remove(&RisTag::Doi)
.and_then(|v| v.into_iter().next())
.and_then(|doi_str| crate::utils::format_doi(&doi_str));
let mut urls = Vec::new();
for tag in [
RisTag::LinkPdf,
RisTag::LinkFullText,
RisTag::LinkRelated,
RisTag::LinkImages,
RisTag::Url,
RisTag::Link,
] {
if let Some(mut tag_urls) = raw.remove(&tag) {
if doi.is_none() {
for url in &tag_urls {
if url.contains("doi.org")
&& let Some(extracted_doi) = crate::utils::format_doi(url)
{
doi = Some(extracted_doi);
break;
}
}
}
urls.append(&mut tag_urls);
}
}
(doi, urls)
}
fn extract_identifiers(raw: &mut RawRisData) -> (Option<String>, Option<String>) {
let pmid = raw
.remove(&RisTag::ReferenceId)
.and_then(|v| v.into_iter().next());
let pmc_id = raw
.remove(&RisTag::PmcId)
.and_then(|v| v.into_iter().next())
.filter(|s| s.contains("PMC"));
(pmid, pmc_id)
}
fn extract_abstract(raw: &mut RawRisData) -> Option<String> {
let abstract_text = raw
.get_first(&RisTag::Abstract)
.or_else(|| raw.get_first(&RisTag::AbstractAlternative))
.cloned();
raw.remove(&RisTag::Abstract);
raw.remove(&RisTag::AbstractAlternative);
abstract_text
}
fn extract_metadata(raw: &mut RawRisData) -> (Option<String>, Option<String>) {
let language = raw
.remove(&RisTag::Language)
.and_then(|v| v.into_iter().next());
let publisher = raw
.remove(&RisTag::Publisher)
.and_then(|v| v.into_iter().next());
(language, publisher)
}
fn extract_extra_fields(raw: &mut RawRisData) -> HashMap<String, Vec<String>> {
raw.remove(&RisTag::EndOfReference);
raw.data
.drain()
.map(|(tag, values)| (tag.as_tag().to_string(), values))
.collect()
}
}
/// Translates a RIS reference-type code (the value of a `TY` field) into its
/// descriptive name.
///
/// The lookup is exact and case-sensitive. Codes that are not in the table are
/// returned unchanged.
fn map_ris_type(abbr: &str) -> &str {
    /// Known `(code, descriptive name)` pairs.
    const TYPE_NAMES: &[(&str, &str)] = &[
        ("ABST", "Abstract"),
        ("ADVS", "Audiovisual Material"),
        ("ART", "Art Work"),
        ("BILL", "Bill/Resolution"),
        ("BOOK", "Book"),
        ("CASE", "Case"),
        ("CHAP", "Book Chapter"),
        ("COMP", "Computer Program"),
        ("CONF", "Conference Proceeding"),
        ("CTLG", "Catalog"),
        ("DATA", "Data File"),
        ("ELEC", "Electronic Citation"),
        ("GEN", "Generic"),
        ("HEAR", "Hearing"),
        ("ICOMM", "Internet Communication"),
        ("INPR", "In Press"),
        ("JFULL", "Journal/Periodical (Full)"),
        ("JOUR", "Journal Article"),
        ("MAP", "Map"),
        ("MGZN", "Magazine Article"),
        ("MPCT", "Motion Picture"),
        ("MUSIC", "Music Score"),
        ("NEWS", "Newspaper"),
        ("PAMP", "Pamphlet"),
        ("PAT", "Patent"),
        ("PCOMM", "Personal Communication"),
        ("RPRT", "Report"),
        ("SER", "Serial Publication"),
        ("SLIDE", "Slide"),
        ("SOUND", "Sound Recording"),
        ("STAT", "Statute"),
        ("THES", "Thesis/Dissertation"),
        ("UNBILL", "Unenacted Bill/Resolution"),
        ("UNPB", "Unpublished Work"),
        ("VIDEO", "Video Recording"),
    ];

    TYPE_NAMES
        .iter()
        .find(|entry| entry.0 == abbr)
        .map_or(abbr, |&(_, name)| name)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ris::tags::RisTag;

    /// Builds a record by adding each `(tag, value)` pair in the given order.
    fn record_of<const N: usize>(fields: [(RisTag, &str); N]) -> RawRisData {
        let mut record = RawRisData::new();
        for (tag, value) in fields {
            record.add_data(tag, value.to_string());
        }
        record
    }

    /// Converts a record into a citation, panicking when conversion fails.
    fn citation_from(record: RawRisData) -> crate::Citation {
        crate::Citation::try_from(record).unwrap()
    }

    #[test]
    fn test_raw_ris_data_new() {
        let empty = RawRisData::new();
        assert!(empty.data.is_empty());
        assert!(empty.authors.is_empty());
        assert!(empty.ignored_lines.is_empty());
        assert!(!empty.has_content());
    }

    #[test]
    fn test_add_data() {
        // The first value added for a tag is the one `get_first` reports.
        let record = record_of([
            (RisTag::Title, "Test Title"),
            (RisTag::Title, "Another Title"),
        ]);
        assert_eq!(
            record.get_first(&RisTag::Title).map(String::as_str),
            Some("Test Title")
        );
        assert!(record.has_content());
    }

    #[test]
    fn test_journal_priority() {
        let record = record_of([
            (RisTag::JournalFullAlternative, "Alt Journal"),
            (RisTag::JournalFull, "Main Journal"),
            (RisTag::SecondaryTitle, "Secondary"),
        ]);
        assert_eq!(record.get_best_journal(), Some("Main Journal".to_string()));
    }

    #[test]
    fn test_conversion_to_citation() {
        let mut record = record_of([(RisTag::Type, "JOUR"), (RisTag::Title, "Test Article")]);
        record.add_author(Author {
            name: "Smith".to_string(),
            given_name: Some("John".to_string()),
            middle_name: None,
            affiliations: Vec::new(),
        });

        let citation = citation_from(record);
        assert_eq!(citation.title, "Test Article");
        assert_eq!(citation.citation_type, vec!["Journal Article"]);
        assert_eq!(citation.authors.len(), 1);
    }

    #[test]
    fn test_missing_title_error() {
        let outcome = crate::Citation::try_from(RawRisData::new());
        assert!(outcome.is_err());
    }

    #[test]
    fn test_doi_extraction_from_urls() {
        let citation = citation_from(record_of([
            (RisTag::Type, "JOUR"),
            (RisTag::Title, "Test Article"),
            (RisTag::Url, "https://doi.org/10.1234/example"),
            (RisTag::LinkPdf, "https://example.com/pdf"),
        ]));

        assert_eq!(citation.doi, Some("10.1234/example".to_string()));
        assert_eq!(citation.urls.len(), 2);
        assert!(
            citation
                .urls
                .iter()
                .any(|url| url == "https://doi.org/10.1234/example")
        );
    }

    #[test]
    fn test_doi_extraction_prioritizes_doi_field() {
        let citation = citation_from(record_of([
            (RisTag::Type, "JOUR"),
            (RisTag::Title, "Test Article"),
            (RisTag::Doi, "10.5678/primary"),
            (RisTag::Url, "https://doi.org/10.1234/secondary"),
        ]));
        assert_eq!(citation.doi, Some("10.5678/primary".to_string()));
    }

    #[test]
    fn test_title_extraction_edge_cases() {
        // Empty primary title: the alternative is used.
        let empty_primary = citation_from(record_of([
            (RisTag::Type, "JOUR"),
            (RisTag::Title, ""),
            (RisTag::TitleAlternative, "Fallback Title"),
        ]));
        assert_eq!(empty_primary.title, "Fallback Title");

        // No primary title at all: the alternative is used.
        let missing_primary = citation_from(record_of([
            (RisTag::Type, "JOUR"),
            (RisTag::TitleAlternative, "Fallback Title"),
        ]));
        assert_eq!(missing_primary.title, "Fallback Title");

        // Whitespace-only primary title: the alternative is used.
        let blank_primary = citation_from(record_of([
            (RisTag::Type, "JOUR"),
            (RisTag::Title, " "),
            (RisTag::TitleAlternative, "Fallback Title"),
        ]));
        assert_eq!(blank_primary.title, "Fallback Title");
    }

    #[test]
    fn test_complex_doi_extraction_scenarios() {
        let citation = citation_from(record_of([
            (RisTag::Type, "JOUR"),
            (RisTag::Title, "Test Article"),
            (RisTag::Url, "https://malformed-doi-url"),
            (RisTag::LinkPdf, "https://doi.org/malformed"),
        ]));

        assert_eq!(
            citation.doi, None,
            "Should not extract DOI from malformed URLs"
        );
        assert_eq!(citation.urls.len(), 2, "Should still preserve all URLs");
        for expected in ["https://malformed-doi-url", "https://doi.org/malformed"] {
            assert!(citation.urls.iter().any(|url| url == expected));
        }
    }

    #[test]
    fn test_journal_priority_with_empty_values() {
        // The empty top-priority value is skipped in favour of the next one.
        let record = record_of([
            (RisTag::JournalFull, ""),
            (RisTag::SecondaryTitle, "Secondary Journal"),
            (RisTag::JournalFullAlternative, "Alt Journal"),
        ]);
        assert_eq!(
            record.get_best_journal(),
            Some("Secondary Journal".to_string())
        );
    }
}