Skip to main content

synaptic_loaders/
arxiv.rs

1use async_trait::async_trait;
2use serde_json::Value;
3use std::collections::HashMap;
4use synaptic_core::{Document, Loader, SynapticError};
5
6/// Loader for arXiv papers via the arXiv API (returns abstracts as documents).
7pub struct ArxivLoader {
8    client: reqwest::Client,
9    query: String,
10    max_results: usize,
11}
12
13impl ArxivLoader {
14    pub fn new(query: impl Into<String>) -> Self {
15        Self {
16            client: reqwest::Client::new(),
17            query: query.into(),
18            max_results: 10,
19        }
20    }
21
22    pub fn with_max_results(mut self, n: usize) -> Self {
23        self.max_results = n;
24        self
25    }
26}
27
28#[async_trait]
29impl Loader for ArxivLoader {
30    async fn load(&self) -> Result<Vec<Document>, SynapticError> {
31        let encoded_query = urlencoding::encode(&self.query);
32        let url = format!(
33            "http://export.arxiv.org/api/query?search_query={}&max_results={}&sortBy=submittedDate",
34            encoded_query, self.max_results
35        );
36        let resp = self
37            .client
38            .get(&url)
39            .send()
40            .await
41            .map_err(|e| SynapticError::Loader(format!("arXiv fetch: {e}")))?;
42        let text = resp
43            .text()
44            .await
45            .map_err(|e| SynapticError::Loader(format!("arXiv read: {e}")))?;
46
47        parse_arxiv_xml(&text)
48    }
49}
50
51fn parse_arxiv_xml(xml: &str) -> Result<Vec<Document>, SynapticError> {
52    use quick_xml::events::Event;
53    use quick_xml::Reader;
54
55    let mut reader = Reader::from_str(xml);
56    reader.config_mut().trim_text(true);
57
58    let mut documents = Vec::new();
59    let mut current_entry: Option<HashMap<String, String>> = None;
60    let mut current_field: Option<String> = None;
61    let mut buf = Vec::new();
62
63    loop {
64        match reader.read_event_into(&mut buf) {
65            Ok(Event::Start(e)) => {
66                let name = std::str::from_utf8(e.name().as_ref())
67                    .unwrap_or("")
68                    .to_string();
69                match name.as_str() {
70                    "entry" => {
71                        current_entry = Some(HashMap::new());
72                    }
73                    "id" | "title" | "summary" | "published" => {
74                        if current_entry.is_some() {
75                            current_field = Some(name);
76                        }
77                    }
78                    "author" if current_entry.is_some() => {
79                        current_field = Some("author_container".to_string());
80                    }
81                    "name" if current_field.as_deref() == Some("author_container") => {
82                        current_field = Some("author_name".to_string());
83                    }
84                    _ => {}
85                }
86            }
87            Ok(Event::Text(e)) => {
88                if let (Some(entry), Some(field)) = (current_entry.as_mut(), &current_field) {
89                    let text = e.unescape().unwrap_or_default().trim().to_string();
90                    if !text.is_empty() {
91                        match field.as_str() {
92                            "id" => {
93                                entry.insert(
94                                    "id".into(),
95                                    text.replace("http://arxiv.org/abs/", "")
96                                        .replace("https://arxiv.org/abs/", ""),
97                                );
98                            }
99                            "title" => {
100                                entry.entry("title".into()).or_insert(text);
101                            }
102                            "summary" => {
103                                entry.insert("summary".into(), text);
104                            }
105                            "published" => {
106                                entry.insert("published".into(), text);
107                            }
108                            "author_name" => {
109                                let authors =
110                                    entry.entry("authors".into()).or_insert_with(String::new);
111                                if !authors.is_empty() {
112                                    authors.push_str(", ");
113                                }
114                                authors.push_str(&text);
115                            }
116                            _ => {}
117                        }
118                    }
119                }
120            }
121            Ok(Event::End(e)) => {
122                let name = std::str::from_utf8(e.name().as_ref())
123                    .unwrap_or("")
124                    .to_string();
125                if name == "entry" {
126                    if let Some(entry) = current_entry.take() {
127                        let arxiv_id = entry
128                            .get("id")
129                            .cloned()
130                            .unwrap_or_else(|| format!("arxiv-{}", documents.len()));
131                        let content = entry.get("summary").cloned().unwrap_or_default();
132                        let mut metadata = HashMap::new();
133                        if let Some(title) = entry.get("title") {
134                            metadata.insert("title".to_string(), Value::String(title.clone()));
135                        }
136                        if let Some(authors) = entry.get("authors") {
137                            metadata.insert("authors".to_string(), Value::String(authors.clone()));
138                        }
139                        if let Some(published) = entry.get("published") {
140                            metadata
141                                .insert("published".to_string(), Value::String(published.clone()));
142                        }
143                        metadata.insert(
144                            "source".to_string(),
145                            Value::String(format!("arxiv:{}", arxiv_id)),
146                        );
147                        metadata.insert(
148                            "url".to_string(),
149                            Value::String(format!("https://arxiv.org/abs/{}", arxiv_id)),
150                        );
151                        documents.push(Document {
152                            id: arxiv_id,
153                            content,
154                            metadata,
155                        });
156                    }
157                }
158                if matches!(
159                    name.as_str(),
160                    "id" | "title" | "summary" | "published" | "name" | "author"
161                ) {
162                    current_field = None;
163                }
164            }
165            Ok(Event::Eof) => break,
166            Err(e) => return Err(SynapticError::Loader(format!("XML parse error: {e}"))),
167            _ => {}
168        }
169        buf.clear();
170    }
171    Ok(documents)
172}