Skip to main content

research_master/models/
paper.rs

1//! Paper model representing a research paper from any source.
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// The source/repository where the paper was found
7#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
8#[serde(rename_all = "lowercase")]
9pub enum SourceType {
10    Arxiv,
11    PubMed,
12    BioRxiv,
13    MedRxiv,
14    SemanticScholar,
15    OpenAlex,
16    CrossRef,
17    IACR,
18    PMC,
19    HAL,
20    DBLP,
21    SSRN,
22    GoogleScholar,
23    SciHub,
24    CORE,
25    EuropePMC,
26    Dimensions,
27    IeeeXplore,
28    Zenodo,
29    Unpaywall,
30    MDPI,
31    Jstor,
32    Scispace,
33    Acm,
34    ConnectedPapers,
35    Doaj,
36    WorldWideScience,
37    Osf,
38    Base,
39    Springer,
40    #[serde(untagged)]
41    Other(String),
42}
43
44impl SourceType {
45    /// Returns the display name of the source
46    pub fn name(&self) -> &str {
47        match self {
48            SourceType::Arxiv => "arXiv",
49            SourceType::PubMed => "PubMed",
50            SourceType::BioRxiv => "bioRxiv",
51            SourceType::MedRxiv => "medRxiv",
52            SourceType::SemanticScholar => "Semantic Scholar",
53            SourceType::OpenAlex => "OpenAlex",
54            SourceType::CrossRef => "CrossRef",
55            SourceType::IACR => "IACR ePrint",
56            SourceType::PMC => "PubMed Central",
57            SourceType::HAL => "HAL",
58            SourceType::DBLP => "DBLP",
59            SourceType::SSRN => "SSRN",
60            SourceType::GoogleScholar => "Google Scholar",
61            SourceType::SciHub => "Sci-Hub",
62            SourceType::CORE => "CORE",
63            SourceType::EuropePMC => "Europe PMC",
64            SourceType::Dimensions => "Dimensions",
65            SourceType::IeeeXplore => "IEEE Xplore",
66            SourceType::Zenodo => "Zenodo",
67            SourceType::Unpaywall => "Unpaywall",
68            SourceType::MDPI => "MDPI",
69            SourceType::Jstor => "JSTOR",
70            SourceType::Scispace => "SciSpace",
71            SourceType::Acm => "ACM Digital Library",
72            SourceType::ConnectedPapers => "Connected Papers",
73            SourceType::Doaj => "DOAJ",
74            SourceType::WorldWideScience => "WorldWideScience",
75            SourceType::Osf => "OSF Preprints",
76            SourceType::Base => "BASE",
77            SourceType::Springer => "Springer",
78            SourceType::Other(s) => s,
79        }
80    }
81
82    /// Returns the source identifier (for tool naming)
83    pub fn id(&self) -> &str {
84        match self {
85            SourceType::Arxiv => "arxiv",
86            SourceType::PubMed => "pubmed",
87            SourceType::BioRxiv => "biorxiv",
88            SourceType::MedRxiv => "medrxiv",
89            SourceType::SemanticScholar => "semantic",
90            SourceType::OpenAlex => "openalex",
91            SourceType::CrossRef => "crossref",
92            SourceType::IACR => "iacr",
93            SourceType::PMC => "pmc",
94            SourceType::HAL => "hal",
95            SourceType::DBLP => "dblp",
96            SourceType::SSRN => "ssrn",
97            SourceType::GoogleScholar => "google_scholar",
98            SourceType::SciHub => "sci_hub",
99            SourceType::CORE => "core",
100            SourceType::EuropePMC => "europe_pmc",
101            SourceType::Dimensions => "dimensions",
102            SourceType::IeeeXplore => "ieee_xplore",
103            SourceType::Zenodo => "zenodo",
104            SourceType::Unpaywall => "unpaywall",
105            SourceType::MDPI => "mdpi",
106            SourceType::Jstor => "jstor",
107            SourceType::Scispace => "scispace",
108            SourceType::Acm => "acm",
109            SourceType::ConnectedPapers => "connected_papers",
110            SourceType::Doaj => "doaj",
111            SourceType::WorldWideScience => "worldwidescience",
112            SourceType::Osf => "osf",
113            SourceType::Base => "base",
114            SourceType::Springer => "springer",
115            SourceType::Other(s) => s,
116        }
117    }
118}
119
120impl std::fmt::Display for SourceType {
121    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
122        write!(f, "{}", self.name())
123    }
124}
125
126/// A research paper from any academic source
127///
128/// This struct provides a standardized format for papers across all sources,
129/// making it easy to work with papers from multiple repositories.
130#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct Paper {
132    /// Unique identifier (source-specific: DOI, PMID, arXiv ID, etc.)
133    pub paper_id: String,
134
135    /// Paper title
136    pub title: String,
137
138    /// Authors (semicolon-separated)
139    pub authors: String,
140
141    /// Abstract text
142    pub r#abstract: String,
143
144    /// Digital Object Identifier
145    pub doi: Option<String>,
146
147    /// Publication date (ISO format)
148    pub published_date: Option<String>,
149
150    /// Last updated date (ISO format)
151    pub updated_date: Option<String>,
152
153    /// Direct PDF URL
154    pub pdf_url: Option<String>,
155
156    /// Paper page URL
157    pub url: String,
158
159    /// Source where the paper was found
160    pub source: SourceType,
161
162    /// Categories/tags (semicolon-separated)
163    pub categories: Option<String>,
164
165    /// Keywords (semicolon-separated)
166    pub keywords: Option<String>,
167
168    /// Citation count
169    pub citations: Option<u32>,
170
171    /// Reference IDs (semicolon-separated)
172    pub references: Option<String>,
173
174    /// Source-specific metadata (flexible JSON)
175    pub extra: Option<HashMap<String, serde_json::Value>>,
176}
177
178impl Paper {
179    /// Create a new paper with required fields
180    pub fn new(paper_id: String, title: String, url: String, source: SourceType) -> Self {
181        Self {
182            paper_id,
183            title,
184            authors: String::new(),
185            r#abstract: String::new(),
186            doi: None,
187            published_date: None,
188            updated_date: None,
189            pdf_url: None,
190            url,
191            source,
192            categories: None,
193            keywords: None,
194            citations: None,
195            references: None,
196            extra: None,
197        }
198    }
199
200    /// Returns the primary identifier for this paper (DOI if available, else paper_id)
201    pub fn primary_id(&self) -> &str {
202        self.doi.as_ref().unwrap_or(&self.paper_id)
203    }
204
205    /// Returns the author names as a vector
206    pub fn author_list(&self) -> Vec<&str> {
207        self.authors
208            .split(';')
209            .map(|s| s.trim())
210            .filter(|s| !s.is_empty())
211            .collect()
212    }
213
214    /// Returns the categories as a vector
215    pub fn category_list(&self) -> Vec<&str> {
216        self.categories
217            .as_ref()
218            .map(|c| {
219                c.split(';')
220                    .map(|s| s.trim())
221                    .filter(|s| !s.is_empty())
222                    .collect()
223            })
224            .unwrap_or_default()
225    }
226
227    /// Returns the keywords as a vector
228    pub fn keyword_list(&self) -> Vec<&str> {
229        self.keywords
230            .as_ref()
231            .map(|k| {
232                k.split(';')
233                    .map(|s| s.trim())
234                    .filter(|s| !s.is_empty())
235                    .collect()
236            })
237            .unwrap_or_default()
238    }
239
240    /// Check if paper has a downloadable PDF
241    pub fn has_pdf(&self) -> bool {
242        self.pdf_url.is_some()
243    }
244}
245
246/// Builder for constructing Paper objects
247#[derive(Debug, Clone)]
248pub struct PaperBuilder {
249    paper: Paper,
250}
251
252impl PaperBuilder {
253    /// Create a new builder with required fields
254    pub fn new(
255        paper_id: impl Into<String>,
256        title: impl Into<String>,
257        url: impl Into<String>,
258        source: SourceType,
259    ) -> Self {
260        Self {
261            paper: Paper::new(paper_id.into(), title.into(), url.into(), source),
262        }
263    }
264
265    /// Set authors
266    pub fn authors(mut self, authors: impl Into<String>) -> Self {
267        self.paper.authors = authors.into();
268        self
269    }
270
271    /// Set abstract
272    pub fn abstract_text(mut self, abstract_text: impl Into<String>) -> Self {
273        self.paper.r#abstract = abstract_text.into();
274        self
275    }
276
277    /// Set DOI
278    pub fn doi(mut self, doi: impl Into<String>) -> Self {
279        self.paper.doi = Some(doi.into());
280        self
281    }
282
283    /// Set publication date
284    pub fn published_date(mut self, date: impl Into<String>) -> Self {
285        self.paper.published_date = Some(date.into());
286        self
287    }
288
289    /// Set updated date
290    pub fn updated_date(mut self, date: impl Into<String>) -> Self {
291        self.paper.updated_date = Some(date.into());
292        self
293    }
294
295    /// Set PDF URL
296    pub fn pdf_url(mut self, url: impl Into<String>) -> Self {
297        self.paper.pdf_url = Some(url.into());
298        self
299    }
300
301    /// Set categories
302    pub fn categories(mut self, categories: impl Into<String>) -> Self {
303        self.paper.categories = Some(categories.into());
304        self
305    }
306
307    /// Set keywords
308    pub fn keywords(mut self, keywords: impl Into<String>) -> Self {
309        self.paper.keywords = Some(keywords.into());
310        self
311    }
312
313    /// Set citation count
314    pub fn citations(mut self, count: u32) -> Self {
315        self.paper.citations = Some(count);
316        self
317    }
318
319    /// Set references
320    pub fn references(mut self, references: impl Into<String>) -> Self {
321        self.paper.references = Some(references.into());
322        self
323    }
324
325    /// Add extra metadata
326    pub fn extra(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
327        self.paper
328            .extra
329            .get_or_insert_with(HashMap::new)
330            .insert(key.into(), value);
331        self
332    }
333
334    /// Build the Paper
335    pub fn build(self) -> Paper {
336        self.paper
337    }
338}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Builder preloaded with throwaway required fields, for tests that only
    /// care about one optional field.
    fn arxiv_builder() -> PaperBuilder {
        PaperBuilder::new("1234", "Test", "https://example.com", SourceType::Arxiv)
    }

    #[test]
    fn test_paper_builder() {
        let paper = PaperBuilder::new(
            "1234.5678",
            "Test Paper",
            "https://example.com",
            SourceType::Arxiv,
        )
        .authors("John Doe; Jane Smith")
        .abstract_text("This is a test abstract.")
        .doi("10.1234/test.1234")
        .pdf_url("https://example.com/paper.pdf")
        .citations(42)
        .build();

        assert_eq!(paper.paper_id, "1234.5678");
        assert_eq!(paper.title, "Test Paper");
        assert_eq!(paper.url, "https://example.com");
        assert_eq!(paper.source, SourceType::Arxiv);
        assert_eq!(paper.authors, "John Doe; Jane Smith");
        assert_eq!(paper.r#abstract, "This is a test abstract.");
        assert_eq!(paper.doi, Some("10.1234/test.1234".to_string()));
        assert_eq!(
            paper.pdf_url,
            Some("https://example.com/paper.pdf".to_string())
        );
        assert_eq!(paper.citations, Some(42));
    }

    #[test]
    fn test_author_list() {
        let paper = PaperBuilder::new(
            "1234".to_string(),
            "Test".to_string(),
            "https://example.com".to_string(),
            SourceType::Arxiv,
        )
        .authors("John Doe; Jane Smith; Bob Jones")
        .build();

        let authors = paper.author_list();
        assert_eq!(authors, vec!["John Doe", "Jane Smith", "Bob Jones"]);
    }

    #[test]
    fn test_author_list_skips_blank_entries() {
        // Stray separators and surrounding whitespace must not produce entries.
        let paper = arxiv_builder()
            .authors("  John Doe ;; Jane Smith ; ")
            .build();

        assert_eq!(paper.author_list(), vec!["John Doe", "Jane Smith"]);
    }

    #[test]
    fn test_primary_id() {
        let with_doi = PaperBuilder::new(
            "1234".to_string(),
            "Test".to_string(),
            "https://example.com".to_string(),
            SourceType::Arxiv,
        )
        .doi("10.1234/test")
        .build();

        assert_eq!(with_doi.primary_id(), "10.1234/test");

        let without_doi = Paper::new(
            "1234".to_string(),
            "Test".to_string(),
            "https://example.com".to_string(),
            SourceType::Arxiv,
        );

        assert_eq!(without_doi.primary_id(), "1234");
    }

    #[test]
    fn test_paper_builder_all_fields() {
        let paper = PaperBuilder::new(
            "PMC12345",
            "Medical Research Paper",
            "https://pubmed.ncbi.nlm.nih.gov/12345/",
            SourceType::PubMed,
        )
        .authors("Alice Johnson; Bob Williams")
        .abstract_text("This is a medical abstract.")
        .doi("10.1000/abc123")
        .pdf_url("https://example.com/fulltext.pdf")
        .published_date("2023-05-15")
        .categories("Medicine;Biology")
        .keywords("gene therapy;CRISPR")
        .citations(100)
        .references("ref1;ref2")
        .build();

        assert_eq!(paper.paper_id, "PMC12345");
        assert_eq!(paper.title, "Medical Research Paper");
        assert_eq!(paper.url, "https://pubmed.ncbi.nlm.nih.gov/12345/");
        assert_eq!(paper.source, SourceType::PubMed);
        assert_eq!(paper.authors, "Alice Johnson; Bob Williams");
        assert_eq!(paper.r#abstract, "This is a medical abstract.");
        assert_eq!(paper.doi, Some("10.1000/abc123".to_string()));
        assert_eq!(
            paper.pdf_url,
            Some("https://example.com/fulltext.pdf".to_string())
        );
        assert_eq!(paper.published_date, Some("2023-05-15".to_string()));
        assert_eq!(paper.categories, Some("Medicine;Biology".to_string()));
        assert_eq!(paper.keywords, Some("gene therapy;CRISPR".to_string()));
        assert_eq!(paper.citations, Some(100));
        assert_eq!(paper.references, Some("ref1;ref2".to_string()));
    }

    #[test]
    fn test_paper_builder_updated_date_and_extra() {
        let paper = arxiv_builder()
            .updated_date("2024-01-02")
            .extra("venue", serde_json::Value::String("NeurIPS".to_string()))
            .extra("open_access", serde_json::Value::Bool(true))
            .build();

        assert_eq!(paper.updated_date.as_deref(), Some("2024-01-02"));

        let extra = paper.extra.expect("extra map is created on first insert");
        assert_eq!(extra.len(), 2);
        assert_eq!(
            extra.get("venue"),
            Some(&serde_json::Value::String("NeurIPS".to_string()))
        );
        assert_eq!(
            extra.get("open_access"),
            Some(&serde_json::Value::Bool(true))
        );
    }

    #[test]
    fn test_paper_builder_empty_authors() {
        let paper = PaperBuilder::new(
            "1234",
            "Anonymous Paper",
            "https://example.com",
            SourceType::Arxiv,
        )
        .authors("")
        .build();

        let authors = paper.author_list();
        assert!(authors.is_empty());
    }

    #[test]
    fn test_paper_builder_minimal() {
        let paper = PaperBuilder::new(
            "minimal",
            "Minimal Paper",
            "https://example.com",
            SourceType::SemanticScholar,
        )
        .build();

        assert_eq!(paper.paper_id, "minimal");
        assert_eq!(paper.title, "Minimal Paper");
        assert!(paper.authors.is_empty());
        assert!(paper.doi.is_none());
        assert!(paper.r#abstract.is_empty());
        assert!(paper.published_date.is_none());
        assert!(paper.updated_date.is_none());
        assert!(paper.pdf_url.is_none());
        assert!(paper.categories.is_none());
        assert!(paper.keywords.is_none());
        assert!(paper.citations.is_none());
        assert!(paper.references.is_none());
        assert!(paper.extra.is_none());
    }

    #[test]
    fn test_paper_with_pdf() {
        let paper = PaperBuilder::new(
            "1234",
            "Paper with PDF",
            "https://example.com",
            SourceType::Arxiv,
        )
        .pdf_url("https://arxiv.org/pdf/1234.pdf")
        .build();

        assert!(paper.has_pdf());
        assert!(paper.pdf_url.is_some());
    }

    #[test]
    fn test_paper_without_pdf() {
        let paper = Paper::new(
            "1234".to_string(),
            "Paper without PDF".to_string(),
            "https://example.com".to_string(),
            SourceType::Arxiv,
        );

        assert!(!paper.has_pdf());
    }

    #[test]
    fn test_category_list() {
        let paper = arxiv_builder().categories("cs.AI;cs.LG").build();

        let categories = paper.category_list();
        assert_eq!(categories, vec!["cs.AI", "cs.LG"]);
    }

    #[test]
    fn test_keyword_list() {
        let paper = arxiv_builder()
            .keywords("neural networks;deep learning")
            .build();

        let keywords = paper.keyword_list();
        assert_eq!(keywords, vec!["neural networks", "deep learning"]);
    }

    #[test]
    fn test_lists_empty_when_unset() {
        // Optional list fields left as `None` must yield empty vectors.
        let paper = arxiv_builder().build();

        assert!(paper.author_list().is_empty());
        assert!(paper.category_list().is_empty());
        assert!(paper.keyword_list().is_empty());
    }

    #[test]
    fn test_source_type_name_id_display() {
        assert_eq!(SourceType::Arxiv.name(), "arXiv");
        assert_eq!(SourceType::Arxiv.id(), "arxiv");
        assert_eq!(SourceType::SemanticScholar.name(), "Semantic Scholar");
        assert_eq!(SourceType::SemanticScholar.id(), "semantic");
        assert_eq!(SourceType::EuropePMC.to_string(), "Europe PMC");

        // `Other` passes its string through for name, id and Display alike.
        let custom = SourceType::Other("my_source".to_string());
        assert_eq!(custom.name(), "my_source");
        assert_eq!(custom.id(), "my_source");
        assert_eq!(custom.to_string(), "my_source");
    }
}