// synaptic_loaders/arxiv.rs
use async_trait::async_trait;
use serde_json::Value;
use std::collections::HashMap;
use synaptic_core::{Document, Loader, SynapticError};
5
6pub struct ArxivLoader {
8 client: reqwest::Client,
9 query: String,
10 max_results: usize,
11}
12
13impl ArxivLoader {
14 pub fn new(query: impl Into<String>) -> Self {
15 Self {
16 client: reqwest::Client::new(),
17 query: query.into(),
18 max_results: 10,
19 }
20 }
21
22 pub fn with_max_results(mut self, n: usize) -> Self {
23 self.max_results = n;
24 self
25 }
26}
27
28#[async_trait]
29impl Loader for ArxivLoader {
30 async fn load(&self) -> Result<Vec<Document>, SynapticError> {
31 let encoded_query = urlencoding::encode(&self.query);
32 let url = format!(
33 "http://export.arxiv.org/api/query?search_query={}&max_results={}&sortBy=submittedDate",
34 encoded_query, self.max_results
35 );
36 let resp = self
37 .client
38 .get(&url)
39 .send()
40 .await
41 .map_err(|e| SynapticError::Loader(format!("arXiv fetch: {e}")))?;
42 let text = resp
43 .text()
44 .await
45 .map_err(|e| SynapticError::Loader(format!("arXiv read: {e}")))?;
46
47 parse_arxiv_xml(&text)
48 }
49}
50
51fn parse_arxiv_xml(xml: &str) -> Result<Vec<Document>, SynapticError> {
52 use quick_xml::events::Event;
53 use quick_xml::Reader;
54
55 let mut reader = Reader::from_str(xml);
56 reader.config_mut().trim_text(true);
57
58 let mut documents = Vec::new();
59 let mut current_entry: Option<HashMap<String, String>> = None;
60 let mut current_field: Option<String> = None;
61 let mut buf = Vec::new();
62
63 loop {
64 match reader.read_event_into(&mut buf) {
65 Ok(Event::Start(e)) => {
66 let name = std::str::from_utf8(e.name().as_ref())
67 .unwrap_or("")
68 .to_string();
69 match name.as_str() {
70 "entry" => {
71 current_entry = Some(HashMap::new());
72 }
73 "id" | "title" | "summary" | "published" => {
74 if current_entry.is_some() {
75 current_field = Some(name);
76 }
77 }
78 "author" if current_entry.is_some() => {
79 current_field = Some("author_container".to_string());
80 }
81 "name" if current_field.as_deref() == Some("author_container") => {
82 current_field = Some("author_name".to_string());
83 }
84 _ => {}
85 }
86 }
87 Ok(Event::Text(e)) => {
88 if let (Some(entry), Some(field)) = (current_entry.as_mut(), ¤t_field) {
89 let text = e.unescape().unwrap_or_default().trim().to_string();
90 if !text.is_empty() {
91 match field.as_str() {
92 "id" => {
93 entry.insert(
94 "id".into(),
95 text.replace("http://arxiv.org/abs/", "")
96 .replace("https://arxiv.org/abs/", ""),
97 );
98 }
99 "title" => {
100 entry.entry("title".into()).or_insert(text);
101 }
102 "summary" => {
103 entry.insert("summary".into(), text);
104 }
105 "published" => {
106 entry.insert("published".into(), text);
107 }
108 "author_name" => {
109 let authors =
110 entry.entry("authors".into()).or_insert_with(String::new);
111 if !authors.is_empty() {
112 authors.push_str(", ");
113 }
114 authors.push_str(&text);
115 }
116 _ => {}
117 }
118 }
119 }
120 }
121 Ok(Event::End(e)) => {
122 let name = std::str::from_utf8(e.name().as_ref())
123 .unwrap_or("")
124 .to_string();
125 if name == "entry" {
126 if let Some(entry) = current_entry.take() {
127 let arxiv_id = entry
128 .get("id")
129 .cloned()
130 .unwrap_or_else(|| format!("arxiv-{}", documents.len()));
131 let content = entry.get("summary").cloned().unwrap_or_default();
132 let mut metadata = HashMap::new();
133 if let Some(title) = entry.get("title") {
134 metadata.insert("title".to_string(), Value::String(title.clone()));
135 }
136 if let Some(authors) = entry.get("authors") {
137 metadata.insert("authors".to_string(), Value::String(authors.clone()));
138 }
139 if let Some(published) = entry.get("published") {
140 metadata
141 .insert("published".to_string(), Value::String(published.clone()));
142 }
143 metadata.insert(
144 "source".to_string(),
145 Value::String(format!("arxiv:{}", arxiv_id)),
146 );
147 metadata.insert(
148 "url".to_string(),
149 Value::String(format!("https://arxiv.org/abs/{}", arxiv_id)),
150 );
151 documents.push(Document {
152 id: arxiv_id,
153 content,
154 metadata,
155 });
156 }
157 }
158 if matches!(
159 name.as_str(),
160 "id" | "title" | "summary" | "published" | "name" | "author"
161 ) {
162 current_field = None;
163 }
164 }
165 Ok(Event::Eof) => break,
166 Err(e) => return Err(SynapticError::Loader(format!("XML parse error: {e}"))),
167 _ => {}
168 }
169 buf.clear();
170 }
171 Ok(documents)
172}