use serde::{Deserialize, Serialize};
use std::collections::HashMap;
#[cfg(feature = "csv")]
extern crate csv as csv_crate;
#[cfg(feature = "csv")]
pub mod csv;
#[cfg(feature = "dedupe")]
pub mod dedupe;
#[cfg(feature = "diagnostics")]
pub mod diagnostics;
#[cfg(feature = "xml")]
pub mod endnote_xml;
pub mod error;
#[cfg(feature = "pubmed")]
pub mod pubmed;
#[cfg(feature = "ris")]
pub mod ris;
#[cfg(feature = "csv")]
pub use csv::CsvParser;
#[cfg(feature = "diagnostics")]
pub use diagnostics::parse_with_diagnostics;
#[cfg(feature = "xml")]
pub use endnote_xml::EndNoteXmlParser;
pub use error::{CitationError, ParseError, SourceSpan, ValueError};
#[cfg(feature = "pubmed")]
pub use pubmed::PubMedParser;
#[cfg(feature = "ris")]
pub use ris::RisParser;
mod regex;
mod utils;
/// The citation file formats this crate distinguishes between.
///
/// The enum carries no data, so it is `Copy` and can be used directly as a
/// key in hashed collections (`Eq` + `Hash`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CitationFormat {
    /// RIS tagged format.
    Ris,
    /// PubMed tagged format.
    PubMed,
    /// EndNote XML export.
    EndNoteXml,
    /// Comma-separated values.
    Csv,
    /// The format could not be determined (also reported for blank input by
    /// `detect_and_parse`).
    Unknown,
}
impl CitationFormat {
pub fn as_str(&self) -> &'static str {
match self {
CitationFormat::Ris => "RIS",
CitationFormat::PubMed => "PubMed",
CitationFormat::EndNoteXml => "EndNote XML",
CitationFormat::Csv => "CSV",
CitationFormat::Unknown => "Unknown",
}
}
}
impl std::fmt::Display for CitationFormat {
    /// Writes the label returned by `as_str` verbatim.
    ///
    /// The label is emitted with `write_str`, so width, fill and alignment
    /// flags on the formatter have no effect on the output.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = self.as_str();
        formatter.write_str(label)
    }
}
/// A calendar date with a mandatory year and optional month and day.
///
/// Used as the `date` of a [`Citation`]. Serializable via serde.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Date {
/// Year component.
pub year: i32,
/// Month component, or `None` when it is not available.
// NOTE(review): presumably 1-based (1 = January) — confirm against the parsers.
pub month: Option<u8>,
/// Day component, or `None` when it is not available.
// NOTE(review): presumably day-of-month — confirm against the parsers.
pub day: Option<u8>,
}
/// An author entry of a [`Citation`].
///
/// Two authors compare equal when all four fields are equal.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Author {
/// Primary name of the author.
// NOTE(review): the unit tests in this file store "Smith" here and "John" in
// `given_name`, so this is presumably the family name — confirm against the parsers.
pub name: String,
/// Given name, if available.
pub given_name: Option<String>,
/// Middle name, if available.
pub middle_name: Option<String>,
/// Affiliation strings attached to this author; may be empty.
pub affiliations: Vec<String>,
}
/// A single bibliographic record as produced by the parsers in this crate.
///
/// Derives `Default`, so a default-constructed citation has an empty title,
/// empty collections and `None` for every optional field.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Citation {
/// Type label(s) of the reference; may be empty.
// NOTE(review): presumably the source format's own type codes (the RIS test
// in this file uses `JOUR`) — confirm against the parsers.
pub citation_type: Vec<String>,
/// Title of the work.
pub title: String,
/// Authors of the work.
pub authors: Vec<Author>,
/// Journal name, if present.
pub journal: Option<String>,
/// Abbreviated journal name, if present.
pub journal_abbr: Option<String>,
/// Date associated with the record, if present.
pub date: Option<Date>,
/// Volume, kept as text.
pub volume: Option<String>,
/// Issue, kept as text.
pub issue: Option<String>,
/// Page information, kept as text.
pub pages: Option<String>,
/// ISSN value(s); may be empty.
pub issn: Vec<String>,
/// DOI, if present.
pub doi: Option<String>,
/// PMID, if present.
pub pmid: Option<String>,
/// PMC identifier, if present.
pub pmc_id: Option<String>,
/// Abstract text, if present.
pub abstract_text: Option<String>,
/// Keywords; may be empty.
pub keywords: Vec<String>,
/// URLs; may be empty.
pub urls: Vec<String>,
/// Language, if present.
pub language: Option<String>,
/// MeSH terms; may be empty.
pub mesh_terms: Vec<String>,
/// Publisher, if present.
pub publisher: Option<String>,
/// Additional named fields, each holding one or more values.
// NOTE(review): presumably collects source tags that have no dedicated field
// above — confirm against the parsers.
pub extra_fields: HashMap<String, Vec<String>>,
}
impl Citation {
    /// Creates an empty citation.
    ///
    /// Equivalent to `Citation::default()`: every field takes its `Default`
    /// value.
    pub fn new() -> Self {
        Default::default()
    }
}
/// A citation paired with the other citations considered duplicates of it.
// NOTE(review): presumably produced by the feature-gated `dedupe` module —
// confirm; nothing in this file constructs it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateGroup {
/// The citation representing the group.
pub unique: Citation,
/// The citations grouped with `unique` as its duplicates; may be empty.
pub duplicates: Vec<Citation>,
}
/// Common interface for parsers that turn text in one citation format into
/// [`Citation`] values.
pub trait CitationParser {
/// Parses `input` and returns the citations it contains.
///
/// # Errors
///
/// Implementations report failure as a [`ParseError`](crate::error::ParseError).
fn parse(&self, input: &str) -> std::result::Result<Vec<Citation>, crate::error::ParseError>;
}
/// Guesses the citation format of `content` and parses it with the matching
/// parser.
///
/// Detection inspects the whitespace-trimmed text and tries, in order:
///
/// 1. EndNote XML: the text starts with `<?xml` or `<xml>`.
/// 2. RIS: some line starts with `TY -`.
/// 3. PubMed: some line starts with `PMID-`.
///
/// The parser itself receives the original, untrimmed `content`. CSV is never
/// auto-detected here.
///
/// # Returns
///
/// The parsed citations together with the detected format. Blank input yields
/// an empty list tagged [`CitationFormat::Unknown`].
///
/// # Errors
///
/// * [`CitationError::UnknownFormat`] when no marker matches, or when the
///   matching parser's Cargo feature (`xml`, `ris`, `pubmed`) is disabled.
/// * [`CitationError::Parse`] wrapping the parser's error when parsing fails.
pub fn detect_and_parse(
    content: &str,
) -> std::result::Result<(Vec<Citation>, CitationFormat), CitationError> {
    /// True when `text` itself, or any text following a `\n`, begins with `tag`.
    fn any_line_starts_with(text: &str, tag: &str) -> bool {
        text.split('\n').any(|line| line.starts_with(tag))
    }

    let probe = content.trim();
    if probe.is_empty() {
        return Ok((Vec::new(), CitationFormat::Unknown));
    }

    if probe.starts_with("<?xml") || probe.starts_with("<xml>") {
        #[cfg(feature = "xml")]
        {
            let citations = EndNoteXmlParser::new()
                .parse(content)
                .map_err(CitationError::Parse)?;
            return Ok((citations, CitationFormat::EndNoteXml));
        }
        #[cfg(not(feature = "xml"))]
        return Err(CitationError::UnknownFormat);
    }

    if any_line_starts_with(probe, "TY -") {
        #[cfg(feature = "ris")]
        {
            let citations = RisParser::new()
                .parse(content)
                .map_err(CitationError::Parse)?;
            return Ok((citations, CitationFormat::Ris));
        }
        #[cfg(not(feature = "ris"))]
        return Err(CitationError::UnknownFormat);
    }

    if any_line_starts_with(probe, "PMID-") {
        #[cfg(feature = "pubmed")]
        {
            let citations = PubMedParser::new()
                .parse(content)
                .map_err(CitationError::Parse)?;
            return Ok((citations, CitationFormat::PubMed));
        }
        #[cfg(not(feature = "pubmed"))]
        return Err(CitationError::UnknownFormat);
    }

    Err(CitationError::UnknownFormat)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Authors built from identical field values compare equal.
    #[test]
    fn test_author_equality() {
        let author1 = Author {
            name: "Smith".to_string(),
            given_name: Some("John".to_string()),
            middle_name: None,
            affiliations: Vec::new(),
        };
        let author2 = Author {
            name: "Smith".to_string(),
            given_name: Some("John".to_string()),
            middle_name: None,
            affiliations: Vec::new(),
        };
        assert_eq!(author1, author2);
    }

    /// RIS input is detected and parsed.
    ///
    /// Gated on the `ris` feature: without it `detect_and_parse` returns
    /// `UnknownFormat` for RIS input and the `unwrap` below would panic.
    #[cfg(feature = "ris")]
    #[test]
    fn test_detect_and_parse_ris() {
        let content = r#"TY - JOUR
TI - Test Title
AU - Smith, John
ER -"#;
        let (citations, format) = detect_and_parse(content).unwrap();
        assert_eq!(format, CitationFormat::Ris);
        assert_eq!(citations[0].title, "Test Title");
    }

    /// PubMed input is detected and parsed.
    ///
    /// Gated on the `pubmed` feature for the same reason as the RIS test.
    #[cfg(feature = "pubmed")]
    #[test]
    fn test_detect_and_parse_pubmed() {
        let content = r#"PMID- 12345678
TI - Test Title
FAU - Smith, John"#;
        let (citations, format) = detect_and_parse(content).unwrap();
        assert_eq!(format, CitationFormat::PubMed);
        assert_eq!(citations[0].title, "Test Title");
    }

    /// EndNote XML input is detected and parsed.
    ///
    /// Gated on the `xml` feature for the same reason as the RIS test.
    #[cfg(feature = "xml")]
    #[test]
    fn test_detect_and_parse_endnote() {
        let content = r#"<?xml version="1.0" encoding="UTF-8"?>
<xml><records><record>
<titles><title>Test Title</title></titles>
</record></records></xml>"#;
        let (citations, format) = detect_and_parse(content).unwrap();
        assert_eq!(format, CitationFormat::EndNoteXml);
        assert_eq!(citations[0].title, "Test Title");
    }

    /// Empty input is not an error: it yields no citations and `Unknown`.
    #[test]
    fn test_detect_and_parse_empty() {
        let result = detect_and_parse("");
        assert!(
            matches!(result, Ok((citations, format)) if citations.is_empty() && format == CitationFormat::Unknown)
        );
    }

    /// Text without any recognised marker is rejected as `UnknownFormat`.
    #[test]
    fn test_detect_and_parse_unknown() {
        let content = "Some random content\nthat doesn't match\nany known format";
        let result = detect_and_parse(content);
        assert!(matches!(result, Err(CitationError::UnknownFormat)));
    }
}