arxiv_tools/
lib.rs

1//! # Description
2//! This library provides a simple interface to query the arXiv API.
3//!
4//! # Example
5//! ## Simple Query
6//! ```rust
7//! # use arxiv_tools::{ArXiv, QueryParams, Paper};
8//! # #[tokio::main]
9//! # async fn main() {
10//! // get arxiv object from query parameters
11//! let mut arxiv = ArXiv::from_args(QueryParams::title("attention is all you need"));
12//!
13//! // execute
14//! let response: Vec<Paper> = arxiv.query().await;
15//!
16//! //verify
17//! let paper = response.first().unwrap();
18//! assert!(paper.title.to_lowercase().contains("attention is all you need"));
19//! # }
20//! ```
21//!
22//! ## Complex Query
23//! ```rust
24//! # use arxiv_tools::{ArXiv, QueryParams, Category, SortBy, SortOrder};
25//! # #[tokio::main]
26//! # async fn main() {
27//! // build query parameters
28//! let args = QueryParams::and(vec![
29//!     QueryParams::or(vec![QueryParams::title("ai"), QueryParams::title("llm")]),
30//!     QueryParams::group(vec![QueryParams::or(vec![
31//!         QueryParams::subject_category(Category::CsAi),
32//!         QueryParams::subject_category(Category::CsLg),
33//!     ])]),
34//!     QueryParams::SubmittedDate(String::from("202412010000"), String::from("202412012359")),
35//! ]);
36//! let mut arxiv = ArXiv::from_args(args);
37//!
38//! // set additional parameters
39//! arxiv.start(10);
40//! arxiv.max_results(100);
41//! arxiv.sort_by(SortBy::SubmittedDate);
42//! arxiv.sort_order(SortOrder::Ascending);
43//!
44//! // execute
45//! let response = arxiv.query().await;
46//!
47//! // verify
48//! assert!(response.len() > 0);
49//! # }
50//! ```
51use chrono::{DateTime, Utc};
52use quick_xml::events::Event;
53use quick_xml::reader::Reader;
54use reqwest as request;
55use serde::{Deserialize, Serialize};
56use urlencoding::encode;
57
/// arXiv subject categories that can be used in a `cat:` search term.
///
/// Each variant maps to the identifier shown in its documentation; use
/// [`Category::as_str`] or the [`Display`](std::fmt::Display) implementation
/// (and therefore `.to_string()`) to obtain it.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Category {
    /// `cs.AI`
    CsAi,
    /// `cs.CL`
    CsCl,
    /// `cs.LG`
    CsLg,
    /// `cs.GT`
    CsGt,
    /// `cs.CV`
    CsCv,
    /// `cs.CR`
    CsCr,
    /// `cs.CC`
    CsCc,
    /// `cs.CE`
    CsCe,
    /// `cs.CY`
    CsCy,
    /// `cs.DS`
    CsDs,
    /// `cs.DM`
    CsDm,
    /// `cs.DC`
    CsDc,
    /// `cs.ET`
    CsEt,
    /// `cs.FL`
    CsFl,
    /// `cs.GL`
    CsGl,
    /// `cs.GR`
    CsGr,
    /// `cs.AR`
    CsAr,
    /// `cs.HC`
    CsHc,
    /// `cs.IR`
    CsIr,
}

impl Category {
    /// Returns the category identifier (for example `"cs.AI"`) without allocating.
    pub fn as_str(self) -> &'static str {
        match self {
            Category::CsAi => "cs.AI",
            Category::CsCl => "cs.CL",
            Category::CsLg => "cs.LG",
            Category::CsGt => "cs.GT",
            Category::CsCv => "cs.CV",
            Category::CsCr => "cs.CR",
            Category::CsCc => "cs.CC",
            Category::CsCe => "cs.CE",
            Category::CsCy => "cs.CY",
            Category::CsDs => "cs.DS",
            Category::CsDm => "cs.DM",
            Category::CsDc => "cs.DC",
            Category::CsEt => "cs.ET",
            Category::CsFl => "cs.FL",
            Category::CsGl => "cs.GL",
            Category::CsGr => "cs.GR",
            Category::CsAr => "cs.AR",
            Category::CsHc => "cs.HC",
            Category::CsIr => "cs.IR",
        }
    }
}

// `Display` replaces the former inherent `to_string`; `.to_string()` keeps
// working through the blanket `ToString` implementation.
impl std::fmt::Display for Category {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
105
/// A fragment of an arXiv `search_query` expression.
///
/// Every variant except [`QueryParams::SubmittedDate`] carries the complete,
/// ready-to-send fragment (field prefix, quotes and percent-encoding
/// included). Prefer the lowercase constructor functions on this type, which
/// build that fragment for you, over filling the variants by hand.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum QueryParams {
    /// Title search term (`ti:` prefix when built by the constructor).
    Title(String),
    /// Author search term (`au:`).
    Author(String),
    /// Abstract search term (`abs:`).
    Abstract(String),
    /// Comment search term (`co:`).
    Comment(String),
    /// Journal reference search term (`jr:`).
    JournalRef(String),
    /// Subject category search term (`cat:`).
    SubjectCategory(String),
    /// Report number search term (`rn:`).
    ReportNumber(String),
    /// Identifier search term (`id:`).
    Id(String),
    /// Search term applied to all fields (`all:`).
    All(String),
    /// Sub-expressions joined with `+AND+`.
    And(String),
    /// Sub-expressions joined with `+OR+`.
    Or(String),
    /// Sub-expressions joined with `+ANDNOT+`.
    AndNot(String),
    /// Sub-expressions wrapped in percent-encoded parentheses (`%28` ... `%29`).
    Group(String),
    /// Submission date range as `(from, to)`; rendered as
    /// `submittedDate:[from+TO+to]` with both values inserted verbatim.
    SubmittedDate(String, String),
}
123
124impl Default for QueryParams {
125    fn default() -> Self {
126        return QueryParams::title("default");
127    }
128}
129
/// Criterion used for the `sortBy` URL parameter. Defaults to
/// [`SortBy::Relevance`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum SortBy {
    /// Rendered as `relevance`.
    #[default]
    Relevance,
    /// Rendered as `lastUpdatedDate`.
    LastUpdatedDate,
    /// Rendered as `submittedDate`.
    SubmittedDate,
}

impl SortBy {
    /// Returns the value placed in the `sortBy` URL parameter, without allocating.
    pub fn as_str(self) -> &'static str {
        match self {
            SortBy::Relevance => "relevance",
            SortBy::LastUpdatedDate => "lastUpdatedDate",
            SortBy::SubmittedDate => "submittedDate",
        }
    }
}

// `Display` replaces the former inherent `to_string`; `.to_string()` keeps
// working through the blanket `ToString` implementation.
impl std::fmt::Display for SortBy {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
147
/// Direction used for the `sortOrder` URL parameter. Defaults to
/// [`SortOrder::Ascending`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum SortOrder {
    /// Rendered as `ascending`.
    #[default]
    Ascending,
    /// Rendered as `descending`.
    Descending,
}

impl SortOrder {
    /// Returns the value placed in the `sortOrder` URL parameter, without allocating.
    pub fn as_str(self) -> &'static str {
        match self {
            SortOrder::Ascending => "ascending",
            SortOrder::Descending => "descending",
        }
    }
}

// `Display` replaces the former inherent `to_string`; `.to_string()` keeps
// working through the blanket `ToString` implementation.
impl std::fmt::Display for SortOrder {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
163
164impl QueryParams {
165    pub fn title(arg: &str) -> Self {
166        return QueryParams::Title(format!("ti:\"{}\"", encode(arg)));
167    }
168    pub fn author(arg: &str) -> Self {
169        return QueryParams::Author(format!("au:\"{}\"", encode(arg)));
170    }
171    pub fn abstract_text(arg: &str) -> Self {
172        return QueryParams::Abstract(format!("abs:\"{}\"", encode(arg)));
173    }
174    pub fn comment(arg: &str) -> Self {
175        return QueryParams::Comment(format!("co:\"{}\"", encode(arg)));
176    }
177    pub fn journal_ref(arg: &str) -> Self {
178        return QueryParams::JournalRef(format!("jr:\"{}\"", encode(arg)));
179    }
180    pub fn subject_category(arg: Category) -> Self {
181        return QueryParams::SubjectCategory(format!("cat:\"{}\"", encode(&arg.to_string())));
182    }
183    pub fn report_number(arg: &str) -> Self {
184        return QueryParams::ReportNumber(format!("rn:\"{}\"", encode(arg)));
185    }
186    pub fn id(id: &str) -> Self {
187        return QueryParams::Id(format!("id:\"{}\"", encode(id)));
188    }
189    pub fn all(arg: &str) -> Self {
190        return QueryParams::All(format!("all:\"{}\"", encode(arg)));
191    }
192    pub fn to_string(&self) -> String {
193        match self {
194            QueryParams::Title(arg) => arg.to_string(),
195            QueryParams::Author(arg) => arg.to_string(),
196            QueryParams::Abstract(arg) => arg.to_string(),
197            QueryParams::Comment(arg) => arg.to_string(),
198            QueryParams::JournalRef(arg) => arg.to_string(),
199            QueryParams::SubjectCategory(arg) => arg.to_string(),
200            QueryParams::ReportNumber(arg) => arg.to_string(),
201            QueryParams::Id(arg) => arg.to_string(),
202            QueryParams::All(arg) => arg.to_string(),
203            QueryParams::And(arg) => arg.to_string(),
204            QueryParams::Or(arg) => arg.to_string(),
205            QueryParams::AndNot(arg) => arg.to_string(),
206            QueryParams::Group(arg) => arg.to_string(),
207            QueryParams::SubmittedDate(from, to) => {
208                format!("submittedDate:[{}+TO+{}]", from, to)
209            }
210        }
211    }
212    pub fn and(args: Vec<QueryParams>) -> Self {
213        let args = args
214            .iter()
215            .map(|arg| arg.to_string())
216            .collect::<Vec<String>>();
217        let query = args.join("+AND+");
218        return QueryParams::And(query);
219    }
220    pub fn or(args: Vec<QueryParams>) -> Self {
221        let args = args
222            .iter()
223            .map(|arg| arg.to_string())
224            .collect::<Vec<String>>();
225        let query = args.join("+OR+");
226        return QueryParams::Or(query);
227    }
228    pub fn and_not(args: Vec<QueryParams>) -> Self {
229        let args = args
230            .iter()
231            .map(|arg| arg.to_string())
232            .collect::<Vec<String>>();
233        let query = args.join("+ANDNOT+");
234        return QueryParams::Or(query);
235    }
236    pub fn group(args: Vec<QueryParams>) -> Self {
237        let mut args = args
238            .iter()
239            .map(|arg| arg.to_string())
240            .collect::<Vec<String>>();
241        args.insert(0, String::from("%28"));
242        args.push(String::from("%29"));
243        let query = args.join("");
244        return QueryParams::Group(query);
245    }
246}
247
248#[derive(Debug, Clone, Serialize, Deserialize)]
249pub struct Paper {
250    pub id: String,
251    pub title: String,
252    pub authors: Vec<String>,
253    #[serde(rename = "abstract")]
254    pub abstract_text: String,
255    pub published: String,
256    pub updated: String,
257    pub doi: String,
258    pub comment: Vec<String>,
259    pub journal_ref: String,
260    pub pdf_url: String,
261    pub primary_category: String,
262    pub categories: Vec<String>,
263}
264
265impl Paper {
266    pub fn default() -> Self {
267        return Paper {
268            id: "".to_string(),
269            title: "".to_string(),
270            authors: Vec::new(),
271            abstract_text: "".to_string(),
272            published: "".to_string(),
273            updated: "".to_string(),
274            doi: "".to_string(),
275            comment: Vec::new(),
276            journal_ref: "".to_string(),
277            pdf_url: "".to_string(),
278            primary_category: "".to_string(),
279            categories: Vec::new(),
280        };
281    }
282
283    pub fn published2utc(&self) -> DateTime<Utc> {
284        return DateTime::parse_from_rfc3339(&self.published)
285            .unwrap()
286            .with_timezone(&Utc);
287    }
288
289    pub fn updated2utc(&self) -> DateTime<Utc> {
290        return DateTime::parse_from_rfc3339(&self.updated)
291            .unwrap()
292            .with_timezone(&Utc);
293    }
294}
295
/// Holds a search expression plus the optional paging and sorting parameters
/// of a request to the arXiv API.
///
/// Options left as `None` are omitted from the request URL.
#[derive(Clone, Debug, Default)]
pub struct ArXiv {
    /// Search expression placed in the `search_query` URL parameter.
    pub args: QueryParams,
    /// Value of the `start` URL parameter, if set.
    pub start: Option<u64>,
    /// Value of the `max_results` URL parameter, if set.
    // NOTE(review): the field name is misspelled ("resutls"); it is public,
    // so renaming it would break callers.
    pub max_resutls: Option<u64>,
    /// Value of the `sortBy` URL parameter, if set.
    pub sort_by: Option<SortBy>,
    /// Value of the `sortOrder` URL parameter, if set.
    pub sort_order: Option<SortOrder>,
}
304
305impl ArXiv {
306    pub fn from_args(args: QueryParams) -> Self {
307        return ArXiv {
308            args: args,
309            max_resutls: None,
310            start: None,
311            sort_by: None,
312            sort_order: None,
313        };
314    }
315
316    pub fn start(&mut self, start: u64) -> &mut Self {
317        self.start = Some(start);
318        return self;
319    }
320    pub fn max_results(&mut self, max_results: u64) -> &mut Self {
321        self.max_resutls = Some(max_results);
322        return self;
323    }
324    pub fn sort_by(&mut self, sort_by: SortBy) -> &mut Self {
325        self.sort_by = Some(sort_by);
326        return self;
327    }
328    pub fn sort_order(&mut self, sort_order: SortOrder) -> &mut Self {
329        self.sort_order = Some(sort_order);
330        return self;
331    }
332
333    fn parse_xml(&self, xml: String) -> Vec<Paper> {
334        let mut reader = Reader::from_str(&xml);
335        let mut buf = Vec::new();
336        let mut in_entry = false;
337        let mut in_id = false;
338        let mut in_title = false;
339        let mut in_author = false;
340        let mut in_name = false;
341        let mut in_abstract = false;
342        let mut in_published = false;
343        let mut in_updated = false;
344        let mut in_comment = false;
345        let mut in_journal_ref = false;
346
347        let mut responses: Vec<Paper> = Vec::new();
348        let mut res = Paper::default();
349        loop {
350            match reader.read_event_into(&mut buf) {
351                Ok(Event::Start(ref e)) => {
352                    if e.name().as_ref() == b"entry" {
353                        in_entry = true;
354                        res = Paper::default();
355                    } else if e.name().as_ref() == b"id" {
356                        in_id = true;
357                    } else if e.name().as_ref() == b"title" {
358                        in_title = true;
359                    } else if e.name().as_ref() == b"author" {
360                        in_author = true;
361                    } else if e.name().as_ref() == b"name" {
362                        if in_author {
363                            in_name = true;
364                        }
365                    } else if e.name().as_ref() == b"summary" {
366                        in_abstract = true;
367                    } else if e.name().as_ref() == b"published" {
368                        in_published = true;
369                    } else if e.name().as_ref() == b"updated" {
370                        in_updated = true;
371                    } else if e.name().as_ref() == b"arxiv:comment" {
372                        in_comment = true;
373                    } else if e.name().as_ref() == b"arxiv:journal_ref" {
374                        in_journal_ref = true;
375                    } else if e.name().as_ref() == b"link" && in_entry {
376                        let mut is_pdf = false;
377                        let mut is_doi = false;
378                        e.attributes().for_each(|attr| {
379                            if let Ok(attr) = attr {
380                                if attr.key.as_ref() == b"title" && attr.value.as_ref() == b"pdf" {
381                                    is_pdf = true;
382                                } else if attr.key.as_ref() == b"title"
383                                    && attr.value.as_ref() == b"doi"
384                                {
385                                    is_doi = true;
386                                }
387                            }
388                        });
389                        e.attributes().for_each(|attr| {
390                            if let Ok(attr) = attr {
391                                if attr.key.as_ref() == b"href" {
392                                    if is_pdf {
393                                        res.pdf_url = String::from_utf8_lossy(attr.value.as_ref())
394                                            .to_string();
395                                    } else if is_doi {
396                                        res.doi = String::from_utf8_lossy(attr.value.as_ref())
397                                            .to_string();
398                                    }
399                                }
400                            }
401                        });
402                    } else if e.name().as_ref() == b"arxiv:primary_category" {
403                        e.attributes().for_each(|attr| {
404                            if let Ok(attr) = attr {
405                                if attr.key.as_ref() == b"term" {
406                                    res.primary_category =
407                                        String::from_utf8_lossy(attr.value.as_ref()).to_string();
408                                }
409                            }
410                        });
411                    } else if e.name().as_ref() == b"category" {
412                        if let Some(attr) = e
413                            .attributes()
414                            .find(|attr| attr.as_ref().unwrap().key.as_ref() == b"term")
415                        {
416                            res.categories.push(
417                                String::from_utf8_lossy(attr.unwrap().value.as_ref()).to_string(),
418                            );
419                        }
420                    } else if e.name().as_ref() == b"category" {
421                        if let Some(attr) = e
422                            .attributes()
423                            .find(|attr| attr.as_ref().unwrap().key.as_ref() == b"term")
424                        {
425                            res.categories.push(
426                                String::from_utf8_lossy(attr.unwrap().value.as_ref()).to_string(),
427                            );
428                        }
429                    }
430                }
431                Ok(Event::End(ref e)) => {
432                    if e.name().as_ref() == b"entry" {
433                        in_entry = false;
434                        responses.push(res.clone());
435                        res = Paper::default();
436                    } else if e.name().as_ref() == b"id" {
437                        in_id = false;
438                    } else if e.name().as_ref() == b"title" {
439                        in_title = false;
440                    } else if e.name().as_ref() == b"author" {
441                        in_author = false;
442                    } else if e.name().as_ref() == b"name" {
443                        if in_author {
444                            in_name = false;
445                        }
446                    } else if e.name().as_ref() == b"summary" {
447                        in_abstract = false;
448                    } else if e.name().as_ref() == b"published" {
449                        in_published = false;
450                    } else if e.name().as_ref() == b"updated" {
451                        in_updated = false;
452                    } else if e.name().as_ref() == b"arxiv:comment" {
453                        in_comment = false;
454                    } else if e.name().as_ref() == b"arxiv:journal_ref" {
455                        in_journal_ref = true;
456                    }
457                }
458                Ok(Event::Text(e)) => {
459                    if in_entry {
460                        if in_id {
461                            res.id = e.unescape().unwrap().to_string();
462                        } else if in_title {
463                            res.title = e.unescape().unwrap().to_string();
464                        } else if in_author && in_name {
465                            res.authors.push(e.unescape().unwrap().to_string());
466                        } else if in_abstract {
467                            res.abstract_text =
468                                e.unescape().unwrap().to_string().trim().replace("\n", "");
469                        } else if in_published {
470                            res.published = e.unescape().unwrap().to_string();
471                        } else if in_updated {
472                            res.updated = e.unescape().unwrap().to_string();
473                        } else if in_comment {
474                            res.comment.push(e.unescape().unwrap().to_string());
475                        } else if in_journal_ref {
476                            res.journal_ref = e.unescape().unwrap().to_string();
477                        }
478                    }
479                }
480                Ok(Event::Empty(ref e)) => {
481                    if e.name().as_ref() == b"link" && in_entry {
482                        let mut is_pdf = false;
483                        let mut is_doi = false;
484                        e.attributes().for_each(|attr| {
485                            if let Ok(attr) = attr {
486                                if attr.key.as_ref() == b"title" && attr.value.as_ref() == b"pdf" {
487                                    is_pdf = true;
488                                } else if attr.key.as_ref() == b"title"
489                                    && attr.value.as_ref() == b"doi"
490                                {
491                                    is_doi = true;
492                                }
493                            }
494                        });
495                        e.attributes().for_each(|attr| {
496                            if let Ok(attr) = attr {
497                                if attr.key.as_ref() == b"href" {
498                                    if is_pdf {
499                                        res.pdf_url = String::from_utf8_lossy(attr.value.as_ref())
500                                            .to_string();
501                                    } else if is_doi {
502                                        res.doi = String::from_utf8_lossy(attr.value.as_ref())
503                                            .to_string();
504                                    }
505                                }
506                            }
507                        });
508                    } else if e.name().as_ref() == b"arxiv:primary_category" && in_entry {
509                        e.attributes().for_each(|attr| {
510                            if let Ok(attr) = attr {
511                                if attr.key.as_ref() == b"term" {
512                                    res.primary_category =
513                                        String::from_utf8_lossy(attr.value.as_ref()).to_string();
514                                }
515                            }
516                        });
517                    } else if e.name().as_ref() == b"category" && in_entry {
518                        if let Some(attr) = e
519                            .attributes()
520                            .find(|attr| attr.as_ref().unwrap().key.as_ref() == b"term")
521                        {
522                            res.categories.push(
523                                String::from_utf8_lossy(attr.unwrap().value.as_ref()).to_string(),
524                            );
525                        }
526                    }
527                }
528                Ok(Event::Eof) => break,
529                Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
530                _ => (),
531            }
532            buf.clear();
533        }
534        return responses;
535    }
536
537    fn build_query(&self) -> String {
538        let mut query = self.args.to_string();
539        query = query.replace("%20", "+");
540        if let Some(start) = &self.start {
541            query.push_str(&format!("&start={}", start));
542        }
543        if let Some(max_resutls) = &self.max_resutls {
544            query.push_str(&format!("&max_results={}", max_resutls));
545        }
546        if let Some(sort_by) = &self.sort_by {
547            query.push_str(&format!("&sortBy={}", sort_by.to_string()));
548        }
549        if let Some(sort_order) = &self.sort_order {
550            query.push_str(&format!("&sortOrder={}", sort_order.to_string()));
551        }
552
553        return format!("http://export.arxiv.org/api/query?search_query={}", query);
554    }
555
556    pub async fn query(&mut self) -> Vec<Paper> {
557        let url = self.build_query();
558        let body = request::get(&url).await.unwrap().text().await.unwrap();
559        let responses = self.parse_xml(body);
560        return responses;
561    }
562}
563
564#[cfg(test)]
565mod tests;