arxiv_tools/
lib.rs

1//! # Description
2//! This library provides a simple interface to query the arXiv API.
3//!
4//! # Example
5//! ## Simple Query
6//! ```rust
7//! # use arxiv_tools::{ArXiv, QueryParams, Paper};
8//! # #[tokio::main]
9//! # async fn main() {
10//! // get arxiv object from query parameters
11//! let mut arxiv = ArXiv::from_args(QueryParams::title("attention is all you need"));
12//!
13//! // execute
14//! let response: Vec<Paper> = arxiv.query().await;
15//!
//! // verify
17//! let paper = response.first().unwrap();
18//! assert!(paper.title.to_lowercase().contains("attention is all you need"));
19//! # }
20//! ```
21//!
22//! ## Query by arXiv ID
23//! ```rust
24//! # use arxiv_tools::{ArXiv, Paper};
25//! # #[tokio::main]
26//! # async fn main() {
27//! // fetch specific papers by their arXiv IDs
28//! let mut arxiv = ArXiv::from_id_list(vec!["1706.03762", "1810.04805"]);
29//!
30//! // execute
31//! let response: Vec<Paper> = arxiv.query().await;
32//!
33//! // verify
34//! assert_eq!(response.len(), 2);
35//! # }
36//! ```
37//!
38//! ## Complex Query
39//! ```rust
40//! # use arxiv_tools::{ArXiv, QueryParams, Category, SortBy, SortOrder};
41//! # #[tokio::main]
42//! # async fn main() {
43//! // build query parameters
44//! let args = QueryParams::and(vec![
45//!     QueryParams::or(vec![QueryParams::title("ai"), QueryParams::title("llm")]),
46//!     QueryParams::group(vec![QueryParams::or(vec![
47//!         QueryParams::subject_category(Category::CsAi),
48//!         QueryParams::subject_category(Category::CsLg),
49//!     ])]),
50//!     QueryParams::SubmittedDate(String::from("202412010000"), String::from("202412012359")),
51//! ]);
52//! let mut arxiv = ArXiv::from_args(args);
53//!
54//! // set additional parameters
55//! arxiv.start(10);
56//! arxiv.max_results(100);
57//! arxiv.sort_by(SortBy::SubmittedDate);
58//! arxiv.sort_order(SortOrder::Ascending);
59//!
60//! // execute
61//! let response = arxiv.query().await;
62//!
63//! // verify
64//! assert!(response.len() > 0);
65//! # }
66//! ```
67use chrono::{DateTime, Utc};
68use quick_xml::events::Event;
69use quick_xml::reader::Reader;
70use reqwest as request;
71use serde::{Deserialize, Serialize};
72use urlencoding::encode;
73
/// arXiv computer-science subject categories that can be used in a query.
///
/// Each variant corresponds to one arXiv category code; the code is available
/// through [`Category::as_str`] and through `Display` / `to_string()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Category {
    /// `cs.AI`
    CsAi,
    /// `cs.CL`
    CsCl,
    /// `cs.LG`
    CsLg,
    /// `cs.GT`
    CsGt,
    /// `cs.CV`
    CsCv,
    /// `cs.CR`
    CsCr,
    /// `cs.CC`
    CsCc,
    /// `cs.CE`
    CsCe,
    /// `cs.CY`
    CsCy,
    /// `cs.DS`
    CsDs,
    /// `cs.DM`
    CsDm,
    /// `cs.DC`
    CsDc,
    /// `cs.ET`
    CsEt,
    /// `cs.FL`
    CsFl,
    /// `cs.GL`
    CsGl,
    /// `cs.GR`
    CsGr,
    /// `cs.AR`
    CsAr,
    /// `cs.HC`
    CsHc,
    /// `cs.IR`
    CsIr,
}

impl Category {
    /// Returns the arXiv category code for this variant (for example `"cs.AI"`)
    /// without allocating.
    pub fn as_str(self) -> &'static str {
        match self {
            Category::CsAi => "cs.AI",
            Category::CsCl => "cs.CL",
            Category::CsLg => "cs.LG",
            Category::CsGt => "cs.GT",
            Category::CsCv => "cs.CV",
            Category::CsCr => "cs.CR",
            Category::CsCc => "cs.CC",
            Category::CsCe => "cs.CE",
            Category::CsCy => "cs.CY",
            Category::CsDs => "cs.DS",
            Category::CsDm => "cs.DM",
            Category::CsDc => "cs.DC",
            Category::CsEt => "cs.ET",
            Category::CsFl => "cs.FL",
            Category::CsGl => "cs.GL",
            Category::CsGr => "cs.GR",
            Category::CsAr => "cs.AR",
            Category::CsHc => "cs.HC",
            Category::CsIr => "cs.IR",
        }
    }
}

/// Formats the category as its arXiv code, so `category.to_string()` keeps
/// returning values such as `"cs.AI"`.
impl std::fmt::Display for Category {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
121
122#[derive(Clone, Debug)]
123pub enum QueryParams {
124    Title(String),
125    Author(String),
126    Abstract(String),
127    Comment(String),
128    JournalRef(String),
129    SubjectCategory(String),
130    ReportNumber(String),
131    Id(String),
132    All(String),
133    And(String),
134    Or(String),
135    AndNot(String),
136    Group(String),
137    SubmittedDate(String, String),
138}
139
140impl Default for QueryParams {
141    fn default() -> Self {
142        return QueryParams::title("default");
143    }
144}
145
/// Sort key sent to the arXiv API as the `sortBy` parameter.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum SortBy {
    /// `relevance` (the default of this enum).
    #[default]
    Relevance,
    /// `lastUpdatedDate`
    LastUpdatedDate,
    /// `submittedDate`
    SubmittedDate,
}

impl SortBy {
    /// Returns the exact parameter value for this sort key without allocating.
    pub fn as_str(self) -> &'static str {
        match self {
            SortBy::Relevance => "relevance",
            SortBy::LastUpdatedDate => "lastUpdatedDate",
            SortBy::SubmittedDate => "submittedDate",
        }
    }
}

/// Formats the sort key as its parameter value, so `to_string()` keeps
/// returning values such as `"submittedDate"`.
impl std::fmt::Display for SortBy {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
163
/// Sort direction sent to the arXiv API as the `sortOrder` parameter.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum SortOrder {
    /// `ascending` (the default of this enum).
    #[default]
    Ascending,
    /// `descending`
    Descending,
}

impl SortOrder {
    /// Returns the exact parameter value for this direction without allocating.
    pub fn as_str(self) -> &'static str {
        match self {
            SortOrder::Ascending => "ascending",
            SortOrder::Descending => "descending",
        }
    }
}

/// Formats the direction as its parameter value, so `to_string()` keeps
/// returning `"ascending"` / `"descending"`.
impl std::fmt::Display for SortOrder {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
179
180impl QueryParams {
181    pub fn title(arg: &str) -> Self {
182        return QueryParams::Title(format!("ti:\"{}\"", encode(arg)));
183    }
184    pub fn author(arg: &str) -> Self {
185        return QueryParams::Author(format!("au:\"{}\"", encode(arg)));
186    }
187    pub fn abstract_text(arg: &str) -> Self {
188        return QueryParams::Abstract(format!("abs:\"{}\"", encode(arg)));
189    }
190    pub fn comment(arg: &str) -> Self {
191        return QueryParams::Comment(format!("co:\"{}\"", encode(arg)));
192    }
193    pub fn journal_ref(arg: &str) -> Self {
194        return QueryParams::JournalRef(format!("jr:\"{}\"", encode(arg)));
195    }
196    pub fn subject_category(arg: Category) -> Self {
197        return QueryParams::SubjectCategory(format!("cat:\"{}\"", encode(&arg.to_string())));
198    }
199    pub fn report_number(arg: &str) -> Self {
200        return QueryParams::ReportNumber(format!("rn:\"{}\"", encode(arg)));
201    }
202    pub fn id(id: &str) -> Self {
203        return QueryParams::Id(format!("id:\"{}\"", encode(id)));
204    }
205    pub fn all(arg: &str) -> Self {
206        return QueryParams::All(format!("all:\"{}\"", encode(arg)));
207    }
208    pub fn to_string(&self) -> String {
209        match self {
210            QueryParams::Title(arg) => arg.to_string(),
211            QueryParams::Author(arg) => arg.to_string(),
212            QueryParams::Abstract(arg) => arg.to_string(),
213            QueryParams::Comment(arg) => arg.to_string(),
214            QueryParams::JournalRef(arg) => arg.to_string(),
215            QueryParams::SubjectCategory(arg) => arg.to_string(),
216            QueryParams::ReportNumber(arg) => arg.to_string(),
217            QueryParams::Id(arg) => arg.to_string(),
218            QueryParams::All(arg) => arg.to_string(),
219            QueryParams::And(arg) => arg.to_string(),
220            QueryParams::Or(arg) => arg.to_string(),
221            QueryParams::AndNot(arg) => arg.to_string(),
222            QueryParams::Group(arg) => arg.to_string(),
223            QueryParams::SubmittedDate(from, to) => {
224                format!("submittedDate:[{}+TO+{}]", from, to)
225            }
226        }
227    }
228    pub fn and(args: Vec<QueryParams>) -> Self {
229        let args = args
230            .iter()
231            .map(|arg| arg.to_string())
232            .collect::<Vec<String>>();
233        let query = args.join("+AND+");
234        return QueryParams::And(query);
235    }
236    pub fn or(args: Vec<QueryParams>) -> Self {
237        let args = args
238            .iter()
239            .map(|arg| arg.to_string())
240            .collect::<Vec<String>>();
241        let query = args.join("+OR+");
242        return QueryParams::Or(query);
243    }
244    pub fn and_not(args: Vec<QueryParams>) -> Self {
245        let args = args
246            .iter()
247            .map(|arg| arg.to_string())
248            .collect::<Vec<String>>();
249        let query = args.join("+ANDNOT+");
250        return QueryParams::Or(query);
251    }
252    pub fn group(args: Vec<QueryParams>) -> Self {
253        let mut args = args
254            .iter()
255            .map(|arg| arg.to_string())
256            .collect::<Vec<String>>();
257        args.insert(0, String::from("%28"));
258        args.push(String::from("%29"));
259        let query = args.join("");
260        return QueryParams::Group(query);
261    }
262}
263
264#[derive(Debug, Clone, Serialize, Deserialize)]
265pub struct Paper {
266    pub id: String,
267    pub title: String,
268    pub authors: Vec<String>,
269    #[serde(rename = "abstract")]
270    pub abstract_text: String,
271    pub published: String,
272    pub updated: String,
273    pub doi: String,
274    pub comment: Vec<String>,
275    pub journal_ref: String,
276    pub pdf_url: String,
277    pub primary_category: String,
278    pub categories: Vec<String>,
279}
280
281impl Paper {
282    pub fn default() -> Self {
283        return Paper {
284            id: "".to_string(),
285            title: "".to_string(),
286            authors: Vec::new(),
287            abstract_text: "".to_string(),
288            published: "".to_string(),
289            updated: "".to_string(),
290            doi: "".to_string(),
291            comment: Vec::new(),
292            journal_ref: "".to_string(),
293            pdf_url: "".to_string(),
294            primary_category: "".to_string(),
295            categories: Vec::new(),
296        };
297    }
298
299    pub fn published2utc(&self) -> DateTime<Utc> {
300        return DateTime::parse_from_rfc3339(&self.published)
301            .unwrap()
302            .with_timezone(&Utc);
303    }
304
305    pub fn updated2utc(&self) -> DateTime<Utc> {
306        return DateTime::parse_from_rfc3339(&self.updated)
307            .unwrap()
308            .with_timezone(&Utc);
309    }
310}
311
312#[derive(Clone, Debug, Default)]
313pub struct ArXiv {
314    pub args: QueryParams,
315    pub start: Option<u64>,
316    pub max_resutls: Option<u64>,
317    pub sort_by: Option<SortBy>,
318    pub sort_order: Option<SortOrder>,
319    pub id_list: Option<Vec<String>>,
320}
321
322impl ArXiv {
323    pub fn from_args(args: QueryParams) -> Self {
324        return ArXiv {
325            args: args,
326            max_resutls: None,
327            start: None,
328            sort_by: None,
329            sort_order: None,
330            id_list: None,
331        };
332    }
333
334    /// Create an ArXiv query to fetch papers by their arXiv IDs.
335    ///
336    /// # Example
337    /// ```rust
338    /// # use arxiv_tools::ArXiv;
339    /// # #[tokio::main]
340    /// # async fn main() {
341    /// let mut arxiv = ArXiv::from_id_list(vec!["1706.03762", "1810.04805"]);
342    /// let papers = arxiv.query().await;
343    /// # }
344    /// ```
345    pub fn from_id_list(ids: Vec<&str>) -> Self {
346        return ArXiv {
347            args: QueryParams::default(),
348            max_resutls: None,
349            start: None,
350            sort_by: None,
351            sort_order: None,
352            id_list: Some(ids.iter().map(|s| s.to_string()).collect()),
353        };
354    }
355
356    pub fn start(&mut self, start: u64) -> &mut Self {
357        self.start = Some(start);
358        return self;
359    }
360    pub fn max_results(&mut self, max_results: u64) -> &mut Self {
361        self.max_resutls = Some(max_results);
362        return self;
363    }
364    pub fn sort_by(&mut self, sort_by: SortBy) -> &mut Self {
365        self.sort_by = Some(sort_by);
366        return self;
367    }
368    pub fn sort_order(&mut self, sort_order: SortOrder) -> &mut Self {
369        self.sort_order = Some(sort_order);
370        return self;
371    }
372    /// Set the list of arXiv IDs to query.
373    ///
374    /// This can be combined with search_query to filter results.
375    pub fn id_list(&mut self, ids: Vec<&str>) -> &mut Self {
376        self.id_list = Some(ids.iter().map(|s| s.to_string()).collect());
377        return self;
378    }
379
380    fn parse_xml(&self, xml: String) -> Vec<Paper> {
381        let mut reader = Reader::from_str(&xml);
382        let mut buf = Vec::new();
383        let mut in_entry = false;
384        let mut in_id = false;
385        let mut in_title = false;
386        let mut in_author = false;
387        let mut in_name = false;
388        let mut in_abstract = false;
389        let mut in_published = false;
390        let mut in_updated = false;
391        let mut in_comment = false;
392        let mut in_journal_ref = false;
393
394        let mut responses: Vec<Paper> = Vec::new();
395        let mut res = Paper::default();
396        loop {
397            match reader.read_event_into(&mut buf) {
398                Ok(Event::Start(ref e)) => {
399                    if e.name().as_ref() == b"entry" {
400                        in_entry = true;
401                        res = Paper::default();
402                    } else if e.name().as_ref() == b"id" {
403                        in_id = true;
404                    } else if e.name().as_ref() == b"title" {
405                        in_title = true;
406                    } else if e.name().as_ref() == b"author" {
407                        in_author = true;
408                    } else if e.name().as_ref() == b"name" {
409                        if in_author {
410                            in_name = true;
411                        }
412                    } else if e.name().as_ref() == b"summary" {
413                        in_abstract = true;
414                    } else if e.name().as_ref() == b"published" {
415                        in_published = true;
416                    } else if e.name().as_ref() == b"updated" {
417                        in_updated = true;
418                    } else if e.name().as_ref() == b"arxiv:comment" {
419                        in_comment = true;
420                    } else if e.name().as_ref() == b"arxiv:journal_ref" {
421                        in_journal_ref = true;
422                    } else if e.name().as_ref() == b"link" && in_entry {
423                        let mut is_pdf = false;
424                        let mut is_doi = false;
425                        e.attributes().for_each(|attr| {
426                            if let Ok(attr) = attr {
427                                if attr.key.as_ref() == b"title" && attr.value.as_ref() == b"pdf" {
428                                    is_pdf = true;
429                                } else if attr.key.as_ref() == b"title"
430                                    && attr.value.as_ref() == b"doi"
431                                {
432                                    is_doi = true;
433                                }
434                            }
435                        });
436                        e.attributes().for_each(|attr| {
437                            if let Ok(attr) = attr {
438                                if attr.key.as_ref() == b"href" {
439                                    if is_pdf {
440                                        res.pdf_url = String::from_utf8_lossy(attr.value.as_ref())
441                                            .to_string();
442                                    } else if is_doi {
443                                        res.doi = String::from_utf8_lossy(attr.value.as_ref())
444                                            .to_string();
445                                    }
446                                }
447                            }
448                        });
449                    } else if e.name().as_ref() == b"arxiv:primary_category" {
450                        e.attributes().for_each(|attr| {
451                            if let Ok(attr) = attr {
452                                if attr.key.as_ref() == b"term" {
453                                    res.primary_category =
454                                        String::from_utf8_lossy(attr.value.as_ref()).to_string();
455                                }
456                            }
457                        });
458                    } else if e.name().as_ref() == b"category" {
459                        if let Some(attr) = e
460                            .attributes()
461                            .find(|attr| attr.as_ref().unwrap().key.as_ref() == b"term")
462                        {
463                            res.categories.push(
464                                String::from_utf8_lossy(attr.unwrap().value.as_ref()).to_string(),
465                            );
466                        }
467                    } else if e.name().as_ref() == b"category" {
468                        if let Some(attr) = e
469                            .attributes()
470                            .find(|attr| attr.as_ref().unwrap().key.as_ref() == b"term")
471                        {
472                            res.categories.push(
473                                String::from_utf8_lossy(attr.unwrap().value.as_ref()).to_string(),
474                            );
475                        }
476                    }
477                }
478                Ok(Event::End(ref e)) => {
479                    if e.name().as_ref() == b"entry" {
480                        in_entry = false;
481                        responses.push(res.clone());
482                        res = Paper::default();
483                    } else if e.name().as_ref() == b"id" {
484                        in_id = false;
485                    } else if e.name().as_ref() == b"title" {
486                        in_title = false;
487                    } else if e.name().as_ref() == b"author" {
488                        in_author = false;
489                    } else if e.name().as_ref() == b"name" {
490                        if in_author {
491                            in_name = false;
492                        }
493                    } else if e.name().as_ref() == b"summary" {
494                        in_abstract = false;
495                    } else if e.name().as_ref() == b"published" {
496                        in_published = false;
497                    } else if e.name().as_ref() == b"updated" {
498                        in_updated = false;
499                    } else if e.name().as_ref() == b"arxiv:comment" {
500                        in_comment = false;
501                    } else if e.name().as_ref() == b"arxiv:journal_ref" {
502                        in_journal_ref = true;
503                    }
504                }
505                Ok(Event::Text(e)) => {
506                    if in_entry {
507                        if in_id {
508                            res.id = e.unescape().unwrap().to_string();
509                        } else if in_title {
510                            res.title = e.unescape().unwrap().to_string();
511                        } else if in_author && in_name {
512                            res.authors.push(e.unescape().unwrap().to_string());
513                        } else if in_abstract {
514                            res.abstract_text =
515                                e.unescape().unwrap().to_string().trim().replace("\n", "");
516                        } else if in_published {
517                            res.published = e.unescape().unwrap().to_string();
518                        } else if in_updated {
519                            res.updated = e.unescape().unwrap().to_string();
520                        } else if in_comment {
521                            res.comment.push(e.unescape().unwrap().to_string());
522                        } else if in_journal_ref {
523                            res.journal_ref = e.unescape().unwrap().to_string();
524                        }
525                    }
526                }
527                Ok(Event::Empty(ref e)) => {
528                    if e.name().as_ref() == b"link" && in_entry {
529                        let mut is_pdf = false;
530                        let mut is_doi = false;
531                        e.attributes().for_each(|attr| {
532                            if let Ok(attr) = attr {
533                                if attr.key.as_ref() == b"title" && attr.value.as_ref() == b"pdf" {
534                                    is_pdf = true;
535                                } else if attr.key.as_ref() == b"title"
536                                    && attr.value.as_ref() == b"doi"
537                                {
538                                    is_doi = true;
539                                }
540                            }
541                        });
542                        e.attributes().for_each(|attr| {
543                            if let Ok(attr) = attr {
544                                if attr.key.as_ref() == b"href" {
545                                    if is_pdf {
546                                        res.pdf_url = String::from_utf8_lossy(attr.value.as_ref())
547                                            .to_string();
548                                    } else if is_doi {
549                                        res.doi = String::from_utf8_lossy(attr.value.as_ref())
550                                            .to_string();
551                                    }
552                                }
553                            }
554                        });
555                    } else if e.name().as_ref() == b"arxiv:primary_category" && in_entry {
556                        e.attributes().for_each(|attr| {
557                            if let Ok(attr) = attr {
558                                if attr.key.as_ref() == b"term" {
559                                    res.primary_category =
560                                        String::from_utf8_lossy(attr.value.as_ref()).to_string();
561                                }
562                            }
563                        });
564                    } else if e.name().as_ref() == b"category" && in_entry {
565                        if let Some(attr) = e
566                            .attributes()
567                            .find(|attr| attr.as_ref().unwrap().key.as_ref() == b"term")
568                        {
569                            res.categories.push(
570                                String::from_utf8_lossy(attr.unwrap().value.as_ref()).to_string(),
571                            );
572                        }
573                    }
574                }
575                Ok(Event::Eof) => break,
576                Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
577                _ => (),
578            }
579            buf.clear();
580        }
581        return responses;
582    }
583
584    fn build_query(&self) -> String {
585        let mut params: Vec<String> = Vec::new();
586
587        // Add search_query if id_list is not the only parameter
588        if self.id_list.is_none() {
589            let mut search_query = self.args.to_string();
590            search_query = search_query.replace("%20", "+");
591            params.push(format!("search_query={}", search_query));
592        } else if let Some(ref id_list) = self.id_list {
593            // When id_list is provided, check if args is not default
594            let default_query = QueryParams::default().to_string();
595            let current_query = self.args.to_string();
596            if current_query != default_query {
597                let mut search_query = current_query;
598                search_query = search_query.replace("%20", "+");
599                params.push(format!("search_query={}", search_query));
600            }
601            params.push(format!("id_list={}", id_list.join(",")));
602        }
603
604        if let Some(start) = &self.start {
605            params.push(format!("start={}", start));
606        }
607        if let Some(max_resutls) = &self.max_resutls {
608            params.push(format!("max_results={}", max_resutls));
609        }
610        if let Some(sort_by) = &self.sort_by {
611            params.push(format!("sortBy={}", sort_by.to_string()));
612        }
613        if let Some(sort_order) = &self.sort_order {
614            params.push(format!("sortOrder={}", sort_order.to_string()));
615        }
616
617        return format!("https://export.arxiv.org/api/query?{}", params.join("&"));
618    }
619
620    pub async fn query(&mut self) -> Vec<Paper> {
621        let url = self.build_query();
622        let body = request::get(&url).await.unwrap().text().await.unwrap();
623        let responses = self.parse_xml(body);
624        return responses;
625    }
626}
627
628#[cfg(test)]
629mod tests;