rsxiv 0.4.3

Tools for working with arXiv and the arXiv API
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
//! # ArXiv API response parsing
//!
//! This module provides a deserialization interface for the XML response generated by the [arXiv
//! API][api]. The main entry points are the [`parse`] function and the [`Response`] struct.
//!
//! For the simplest use-cases, parse into a [`Vec`] of [`Entry`]s.
//! ```
//! use rsxiv::{
//!     id::ArticleId,
//!     response::{Entry, Response, parse},
//! };
//!
//! // abridged arXiv response obtained by querying
//! // `export.arxiv.org/api/query?id_list=2206.06921`
//! let xml = // br#"<feed xmlns...
#![doc = include_str!("response/tests/query_doc.txt")]
//! # let xml = xml.as_bytes();
//! let response = parse(xml).unwrap();
//!
//! assert_eq!(response.entries.len(), 1);
//! assert_eq!(response.entries[0].id, ArticleId::parse("2206.06921v3").unwrap());
//! ```
//!
//! ## Deserialization
//! The [`Response`] struct also exposes a [`serde`] deserialization interface. View the
//! [deserialization docs](crate::de) for more detail.
//!
//! [api]: https://info.arxiv.org/help/api/user-manual.html

#[cfg(test)]
mod tests;
mod xml;

use std::{borrow::Cow, fmt::Display};

use chrono::{DateTime, FixedOffset};

pub(crate) use self::xml::ResponseReader;
#[cfg(feature = "serde")]
pub(crate) use self::xml::Term;
use crate::id::ArticleId;

/// A parsed arXiv API response.
///
/// Typically constructed using the [`parse`] function or the
/// [deserialize implementation](crate::de).
///
/// The type parameter `T` is the container holding the entries; [`parse`] produces a
/// `Response<Vec<Entry<'_>>>`.
///
/// ### Customizing deserialization
///
/// This struct is also designed as a deserialization wrapper target. Enable this with the `serde`
/// feature to read an arXiv API response into an arbitrary [`Response<T>`] where `T` is
/// [`Deserialize`](serde::Deserialize). Read the [`de` module](crate::de) for more detail.
///
/// [api]: https://info.arxiv.org/help/api/user-manual.html#332-entry-metadata
#[derive(Debug, Clone, PartialEq)]
pub struct Response<T> {
    /// When this query was last updated.
    pub updated: DateTime<FixedOffset>,
    /// Pagination information.
    pub pagination: Pagination,
    /// A container for the entries in the response.
    pub entries: T,
}

/// Pagination information for paged queries.
///
/// All three values are taken from the response header; [`parse`] uses `items_per_page` and
/// `total_results` to estimate how many entries to expect.
#[derive(Debug, Clone, PartialEq)]
pub struct Pagination {
    /// The total number of results matching the query.
    pub total_results: u64,
    /// The 0-based index of the first result contained in this response.
    pub start_index: u64,
    /// The maximum number of items per page.
    pub items_per_page: u64,
}

/// Parse a [`Response<Vec<Entry<'r>>>`] from the raw XML response returned by the arXiv API.
///
/// This implementation borrows as much as possible from the input data, but sometimes borrowing is
/// impossible due to the presence of XML escape sequences.
///
/// # Errors
/// Returns a [`ResponseError`] if the underlying reader fails while reading the header or any
/// entry field, if an entry identifier is rejected by [`ArticleId::parse_bytes`], or if an entry's
/// `updated` or `published` value is not a valid RFC 3339 datetime.
pub fn parse<'r>(xml: &'r [u8]) -> Result<Response<Vec<Entry<'r>>>, ResponseError> {
    /// The fewest input bytes a single entry could plausibly occupy (an empty `<entry>`
    /// element). Only used to bound the pre-allocation below.
    const MIN_ENTRY_LEN: usize = "<entry></entry>".len();

    let (updated, pagination, mut reader) = ResponseReader::init(xml)?;

    // The pagination header is part of the (untrusted) input, so never let it request more
    // capacity than the input could actually contain: a bogus header advertising billions of
    // results must not trigger a huge allocation or a capacity-overflow panic. This is only a
    // capacity hint; the vector still grows if the estimate turns out to be too small.
    let advertised = pagination.items_per_page.min(pagination.total_results);
    let input_bound = (xml.len() / MIN_ENTRY_LEN) as u64;
    // lossless: the minimum is no larger than `input_bound`, which originated from a `usize`
    let expected_count = advertised.min(input_bound) as usize;
    let mut entries = Vec::with_capacity(expected_count);

    // The reader is advanced field by field; the order of the calls below is the order in which
    // the reader is asked for each field and must not be rearranged.
    while let Some(id) = reader.next_id()? {
        let id = ArticleId::parse_bytes(id)?;

        let title = reader.next_title()?;

        // the entry's own `updated` value; shadows the feed-level one only inside this loop body
        let updated = DateTime::parse_from_rfc3339(&reader.next_updated()?)?;

        let summary = reader.next_summary()?;

        let mut categories = Vec::new();
        while let Some(term) = reader.next_category()? {
            categories.push(term.get()?.into_owned().into());
        }

        let published = DateTime::parse_from_rfc3339(&reader.next_published()?)?;

        let comment = reader.next_comment()?;

        let primary_category = reader.next_primary_category()?.get()?.into_owned().into();

        let journal_ref = reader.next_journal_ref()?;

        let mut authors = Vec::new();
        while reader.next_author()? {
            let name = AuthorName::from_arxiv(&reader.next_author_name()?);
            let affiliation = reader.next_author_affiliation()?;
            authors.push(Author { name, affiliation });
        }
        let doi = reader.next_doi()?;

        entries.push(Entry {
            id,
            updated,
            published,
            title,
            summary,
            authors,
            doi,
            comment,
            journal_ref,
            primary_category,
            categories,
        });
    }

    Ok(Response {
        updated,
        pagination,
        entries,
    })
}

/// A representation of an arXiv author name.
///
/// The [`Display`] implementation renders the non-empty parts in the order
/// `firstnames keyname suffix`, separated by single spaces.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct AuthorName {
    /// Often the last name or surname.
    pub keyname: String,
    /// Other names or name fragments (such as initials)
    pub firstnames: String,
    /// A suffix, such as `Jr.` or `IV`
    pub suffix: String,
}

impl Display for AuthorName {
    /// Write `firstnames`, `keyname` and `suffix` (in that order), skipping empty parts and
    /// putting a single space between the parts that are written.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let parts = [
            self.firstnames.as_str(),
            self.keyname.as_str(),
            self.suffix.as_str(),
        ];
        let mut present = parts.iter().copied().filter(|part| !part.is_empty());

        if let Some(first) = present.next() {
            f.write_str(first)?;
            for part in present {
                f.write_str(" ")?;
                f.write_str(part)?;
            }
        }
        Ok(())
    }
}

impl AuthorName {
    /// Split an arXiv author name into its parts, following the arXiv name parsing rules from the
    /// [arXiv repository][repo].
    ///
    /// # Parsing rules
    /// The name is expected to look like
    /// ```txt
    /// <firstnames> <keyname> <suffix>
    /// ```
    /// with the components separated by ASCII whitespace. Runs of whitespace are collapsed: the
    /// resulting fields always use a single space between components.
    ///
    /// The final component is taken as the suffix if it is exactly one of
    /// ```txt
    /// "I" | "II" | "III" | "IV" | "Jr" | "Jr." | "Sr" | "Sr." | "V"
    /// ```
    /// and at least one more component precedes it. A name consisting only of such a component is
    /// stored as the `keyname` instead.
    ///
    /// The `keyname` is the last component (not counting the suffix), together with up to two
    /// directly preceding components which are exactly one of
    /// ```txt
    /// "da" | "de" | "del" | "della" | "dem" | "der" | "di" | "la" | "mac" | "ter" | "van" | "vaziri" | "von"
    /// ```
    /// Everything before that belongs to `firstnames`. All comparisons are case-sensitive.
    ///
    /// # Examples
    /// For example, `Ursula von der Leyen` has firstnames "Ursula" and keyname "von der Leyen":
    /// ```
    /// use rsxiv::response::AuthorName;
    ///
    /// assert_eq!(
    ///     AuthorName::from_arxiv("Ursula von der Leyen"),
    ///     AuthorName {
    ///         keyname: "von der Leyen".to_owned(),
    ///         firstnames: "Ursula".to_owned(),
    ///         suffix: String::new(),
    ///     },
    /// );
    /// ```
    ///
    /// [repo]: https://github.com/arXiv/arxiv-base/blob/develop/arxiv/authors/__init__.py
    pub fn from_arxiv(name: &str) -> Self {
        /// Concatenate `words`, placing a single space between consecutive words.
        fn join_words<'a>(words: impl Iterator<Item = &'a str>) -> String {
            let mut joined = String::new();
            for word in words {
                // words produced by the whitespace split are never empty, so an empty buffer
                // means that this is the first word
                if !joined.is_empty() {
                    joined.push(' ');
                }
                joined.push_str(word);
            }
            joined
        }

        // The name is consumed from the back: suffix first, then the surname, then prefixes.
        let mut words = name.split_ascii_whitespace();

        let Some(tail) = words.next_back() else {
            // no components at all: every field stays empty
            return Self {
                keyname: String::new(),
                firstnames: String::new(),
                suffix: String::new(),
            };
        };

        let (surname, suffix) = if Self::is_arxiv_suffix(tail) {
            match words.next_back() {
                Some(surname) => (surname, tail),
                // a lone suffix-like component is the whole name
                None => (tail, ""),
            }
        } else {
            (tail, "")
        };

        // Keyname components in reverse order: the surname followed by at most two prefixes.
        let mut key_rev = [surname, "", ""];
        let mut key_len = 1;
        // The first component (scanning backwards) which is not a prefix; it was already taken
        // out of `words`, so it has to be appended to the first names again.
        let mut leftover = None;
        while key_len < key_rev.len() {
            match words.next_back() {
                Some(word) if Self::is_arxiv_prefix(word) => {
                    key_rev[key_len] = word;
                    key_len += 1;
                }
                other => {
                    leftover = other;
                    break;
                }
            }
        }

        Self {
            keyname: join_words(key_rev[..key_len].iter().rev().copied()),
            firstnames: join_words(words.chain(leftover)),
            suffix: suffix.to_owned(),
        }
    }

    /// Check if a name component is an arxiv prefix.
    fn is_arxiv_prefix(s: &str) -> bool {
        const PREFIXES: [&str; 13] = [
            "da", "de", "del", "della", "dem", "der", "di", "la", "mac", "ter", "van", "vaziri",
            "von",
        ];
        PREFIXES.contains(&s)
    }

    /// Check if a name component is an arxiv suffix.
    fn is_arxiv_suffix(s: &str) -> bool {
        const SUFFIXES: [&str; 9] = ["I", "II", "III", "IV", "Jr", "Jr.", "Sr", "Sr.", "V"];
        SUFFIXES.contains(&s)
    }
}

/// Typed representation of a single entry in the arXiv API response.
///
/// The lifetime `'r` is the lifetime of the raw response data: the text fields are [`Cow`]s so
/// that they can borrow from the input where possible.
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(all(test, feature = "serde"), derive(serde::Deserialize))]
pub struct Entry<'r> {
    /// The arXiv identifier of the entry.
    pub id: ArticleId,
    /// The date that the retrieved version of the article was submitted.
    pub updated: DateTime<FixedOffset>,
    /// The date that version 1 was submitted.
    pub published: DateTime<FixedOffset>,
    /// The title of the article.
    pub title: Cow<'r, str>,
    /// The article abstract.
    pub summary: Cow<'r, str>,
    /// The article authors.
    pub authors: Vec<Author<'r>>,
    /// A URL for the resolved DOI to an external resource, if present.
    pub doi: Option<Cow<'r, str>>,
    /// The author comment, if present.
    pub comment: Option<Cow<'r, str>>,
    /// A journal reference, if present.
    pub journal_ref: Option<Cow<'r, str>>,
    /// The primary arXiv or ACM or MSC category for an article.
    pub primary_category: Cow<'r, str>,
    /// The arXiv or ACM or MSC categories for an article.
    pub categories: Vec<Cow<'r, str>>,
}

/// An error which results from response parsing.
#[derive(Debug)]
pub enum ResponseError {
    /// An XML parsing error.
    Parse(quick_xml::errors::Error),
    /// An XML error while reading an attribute of a tag.
    Attribute(quick_xml::events::attributes::AttrError),
    /// A datetime error when parsing a RFC 3339 datetime.
    InvalidDateTime(chrono::ParseError),
    /// The input was an arXiv API error response; contains the error message.
    Arxiv(String),
    /// Contains more entries than expected.
    TrailingEntries,
    /// API response is missing a tag; contains the name of the tag.
    MissingTag(&'static str),
    /// A `category` or `primary_category` tag is missing the `term` attribute.
    MissingTerm,
    /// API response header information is invalid.
    InvalidHeader(String),
    /// API error format is invalid.
    InvalidError(String),
    /// An entry contained an invalid identifier.
    InvalidId(crate::id::IdError),
    /// A custom error which occurs during deserialization.
    Custom(String),
}

/// An article author.
///
/// The lifetime `'r` is the lifetime of the raw response data, which the affiliation may borrow
/// from.
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(all(test, feature = "serde"), derive(serde::Deserialize))]
pub struct Author<'r> {
    /// The name of the author, split into its components.
    pub name: AuthorName,
    /// The affiliation of the author, if one is present.
    pub affiliation: Option<Cow<'r, str>>,
}

/// Various trait implementations for [`ResponseError`]
mod error_impl {
    use super::ResponseError;

    impl From<crate::id::IdError> for ResponseError {
        fn from(value: crate::id::IdError) -> Self {
            Self::InvalidId(value)
        }
    }

    impl From<chrono::ParseError> for ResponseError {
        fn from(value: chrono::ParseError) -> Self {
            Self::InvalidDateTime(value)
        }
    }

    impl From<std::str::Utf8Error> for ResponseError {
        fn from(value: std::str::Utf8Error) -> Self {
            ResponseError::Parse(quick_xml::errors::Error::Encoding(
                quick_xml::encoding::EncodingError::Utf8(value),
            ))
        }
    }

    impl From<quick_xml::events::attributes::AttrError> for ResponseError {
        fn from(value: quick_xml::events::attributes::AttrError) -> Self {
            ResponseError::Attribute(value)
        }
    }

    impl From<quick_xml::errors::Error> for ResponseError {
        fn from(value: quick_xml::errors::Error) -> Self {
            ResponseError::Parse(value)
        }
    }

    impl std::fmt::Display for ResponseError {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            match self {
                ResponseError::Parse(error) => write!(f, "XML parse error: {error}"),
                ResponseError::Attribute(attr_error) => {
                    write!(f, "XML error while reading attribute: {attr_error}")
                }
                ResponseError::Arxiv(error) => {
                    write!(f, "arXiv API error response: {error}")
                }
                ResponseError::MissingTag(tag) => {
                    write!(f, "missing tag `{tag}`")
                }
                ResponseError::Custom(err) => write!(f, "{err}"),
                ResponseError::MissingTerm => f.write_str(
                    "`category` or `primary_category` tag is missing the `term` attribute",
                ),
                ResponseError::InvalidHeader(msg) => write!(f, "Unexpected API header: {msg}"),
                ResponseError::InvalidError(msg) => write!(f, "Unexpected API error format: {msg}"),
                ResponseError::InvalidDateTime(parse_error) => {
                    write!(f, "Error parsing datetime field: {parse_error}")
                }
                ResponseError::InvalidId(id_error) => {
                    write!(f, "Entry contains invalid identifier: {id_error}")
                }
                ResponseError::TrailingEntries => write!(f, "Response contains trailing entries"),
            }
        }
    }

    impl std::error::Error for ResponseError {}
}