use std::collections::HashMap;
use async_trait::async_trait;
use reqwest::Client;
use serde_json::Value;
use crate::error::RetrieverError;
use crate::schemas::{Document, Retriever};
/// Settings that control how the arXiv retriever issues its requests.
#[derive(Clone, Debug)]
pub struct ArxivRetrieverConfig {
    /// Upper bound on the number of results requested from arXiv
    /// (sent as the `max_results` query parameter).
    pub max_docs: usize,
    /// Timeout applied to the HTTP client when it is built; `None` means no
    /// timeout is configured on the client builder.
    pub timeout: Option<std::time::Duration>,
}
impl Default for ArxivRetrieverConfig {
fn default() -> Self {
Self {
max_docs: 3,
timeout: Some(std::time::Duration::from_secs(30)),
}
}
}
/// Retriever that looks up papers through the arXiv query API and returns
/// them as documents.
#[derive(Clone, Debug)]
pub struct ArxivRetriever {
    /// Request settings (result limit and timeout).
    config: ArxivRetrieverConfig,
    /// HTTP client used for every request made by this retriever.
    client: Client,
}
impl ArxivRetriever {
    /// Creates a retriever that uses `ArxivRetrieverConfig::default()`.
    pub fn new() -> Self {
        Self::with_config(ArxivRetrieverConfig::default())
    }

    /// Creates a retriever from an explicit configuration.
    ///
    /// When `config.timeout` is set it is applied to the HTTP client. If the
    /// configured client cannot be built, a plain `Client::new()` is used
    /// instead, so the timeout is not applied in that case.
    pub fn with_config(config: ArxivRetrieverConfig) -> Self {
        let mut client_builder = Client::builder();
        if let Some(timeout) = config.timeout {
            client_builder = client_builder.timeout(timeout);
        }
        // Deliberate best-effort: a builder failure must not make construction
        // fallible, because the constructor's signature returns `Self`.
        let client = client_builder.build().unwrap_or_else(|_| Client::new());
        Self { config, client }
    }

    /// Returns the retriever with its result limit replaced by `max_docs`.
    pub fn with_max_docs(mut self, max_docs: usize) -> Self {
        self.config.max_docs = max_docs;
        self
    }

    /// Queries the arXiv API for `query` and converts each returned entry into
    /// a [`Document`].
    ///
    /// At most `config.max_docs` results are requested, sorted by relevance.
    ///
    /// # Errors
    ///
    /// Returns `RetrieverError::ArxivError` when the request cannot be sent,
    /// when the server answers with a non-success HTTP status, or when the
    /// response body cannot be read as text.
    async fn search_arxiv(&self, query: &str) -> Result<Vec<Document>, RetrieverError> {
        let url = "http://export.arxiv.org/api/query";
        let max_results = self.config.max_docs.to_string();
        let params = [
            ("search_query", query),
            ("start", "0"),
            ("max_results", max_results.as_str()),
            ("sortBy", "relevance"),
            ("sortOrder", "descending"),
        ];
        let response = self
            .client
            .get(url)
            .query(&params)
            .send()
            .await
            .map_err(|e| RetrieverError::ArxivError(e.to_string()))?
            // Surface HTTP failures instead of parsing an error page as a feed.
            .error_for_status()
            .map_err(|e| RetrieverError::ArxivError(e.to_string()))?;
        let xml_content = response
            .text()
            .await
            .map_err(|e| RetrieverError::ArxivError(e.to_string()))?;
        Ok(Self::parse_feed(&xml_content))
    }

    /// Converts every `<entry>` element of an Atom feed into a [`Document`].
    ///
    /// Only text inside `<entry>` elements is inspected, so `<title>` or `<id>`
    /// elements that sit outside an entry are ignored.
    fn parse_feed(xml: &str) -> Vec<Document> {
        Self::element_bodies(xml, "entry")
            .into_iter()
            .map(Self::entry_to_document)
            .collect()
    }

    /// Builds one [`Document`] from the inner XML of a single `<entry>`.
    fn entry_to_document(entry: &str) -> Document {
        // First occurrence of `tag` inside this entry, cleaned up for display.
        let first_text = |tag: &str| {
            Self::element_bodies(entry, tag)
                .into_iter()
                .next()
                .map(Self::clean_text)
        };

        let title = first_text("title").unwrap_or_else(|| "Untitled".to_string());
        let summary = first_text("summary").unwrap_or_default();
        // Only arXiv abstract URLs are accepted as ids; anything else is
        // recorded as an empty id.
        let id = first_text("id")
            .filter(|candidate| candidate.starts_with("http://arxiv.org/abs/"))
            .unwrap_or_default();
        // Every `<name>` in the entry is treated as an author name.
        let authors = Self::element_bodies(entry, "name")
            .into_iter()
            .map(Self::clean_text)
            .collect::<Vec<_>>()
            .join(", ");

        let content = format!(
            "Title: {}\n\nAuthors: {}\n\nAbstract: {}",
            title, authors, summary
        );
        let mut metadata = HashMap::new();
        metadata.insert("source".to_string(), Value::from("arxiv"));
        metadata.insert("title".to_string(), Value::from(title));
        metadata.insert("id".to_string(), Value::from(id));
        if !authors.is_empty() {
            metadata.insert("authors".to_string(), Value::from(authors));
        }
        Document::new(content).with_metadata(metadata)
    }

    /// Returns the inner text of every `<tag ...>...</tag>` element in `xml`,
    /// in document order.
    ///
    /// This is a minimal scanner, not a full XML parser: it does not depend on
    /// how the markup is split across lines, it skips self-closing elements,
    /// and it stops at the first element whose closing tag is missing.
    fn element_bodies<'a>(xml: &'a str, tag: &str) -> Vec<&'a str> {
        let open = format!("<{}", tag);
        let close = format!("</{}>", tag);
        let mut bodies = Vec::new();
        let mut rest = xml;
        while let Some(start) = rest.find(open.as_str()) {
            let after_name = &rest[start + open.len()..];
            // Reject longer names that merely share the prefix
            // (e.g. `<identifier>` when looking for `id`).
            let exact_name = after_name
                .chars()
                .next()
                .map_or(false, |c| c == '>' || c == '/' || c.is_whitespace());
            if !exact_name {
                rest = after_name;
                continue;
            }
            let open_end = match after_name.find('>') {
                Some(index) => index,
                None => break,
            };
            let after_open = &after_name[open_end + 1..];
            if after_name[..open_end].ends_with('/') {
                // Self-closing element: it has no body to report.
                rest = after_open;
                continue;
            }
            match after_open.find(close.as_str()) {
                Some(body_end) => {
                    bodies.push(&after_open[..body_end]);
                    rest = &after_open[body_end + close.len()..];
                }
                None => break,
            }
        }
        bodies
    }

    /// Collapses all whitespace runs (including line breaks) into single
    /// spaces, trims the ends, and decodes the five predefined XML entities.
    fn clean_text(raw: &str) -> String {
        let collapsed = raw.split_whitespace().collect::<Vec<_>>().join(" ");
        if !collapsed.contains('&') {
            return collapsed;
        }
        // `&amp;` is decoded last so that `&amp;lt;` becomes `&lt;`, not `<`.
        collapsed
            .replace("&lt;", "<")
            .replace("&gt;", ">")
            .replace("&quot;", "\"")
            .replace("&apos;", "'")
            .replace("&amp;", "&")
    }
}
impl Default for ArxivRetriever {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Retriever for ArxivRetriever {
    /// Searches arXiv for `query` and returns the resulting documents,
    /// propagating any error reported by the search.
    async fn get_relevant_documents(&self, query: &str) -> Result<Vec<Document>, RetrieverError> {
        let documents = self.search_arxiv(query).await?;
        Ok(documents)
    }
}