// rustant_tools/arxiv_api.rs

1//! ArXiv API client — HTTP client, Atom XML parser, and data models.
2
3use chrono::{DateTime, Utc};
4use serde::{Deserialize, Serialize};
5use std::time::Duration;
6
/// A paper from the ArXiv API.
///
/// Field values come straight from the Atom feed; `title` and `summary`
/// have runs of whitespace collapsed during parsing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArxivPaper {
    /// ArXiv identifier, possibly with a version suffix (e.g. "1706.03762v7").
    pub arxiv_id: String,
    pub title: String,
    /// Author names in feed order.
    pub authors: Vec<String>,
    /// Abstract text.
    pub summary: String,
    /// All category terms (e.g. "cs.CL", "cs.AI").
    pub categories: Vec<String>,
    /// Primary category; the parser falls back to the first listed category
    /// when the feed has no explicit primary_category tag.
    pub primary_category: String,
    /// Publication timestamp as the raw feed string (e.g. "2017-06-12T17:57:34Z").
    pub published: String,
    /// Last-updated timestamp as the raw feed string.
    pub updated: String,
    /// PDF link from the feed, or a derived arxiv.org/pdf URL if absent.
    pub pdf_url: String,
    /// Abstract-page URL (defaults to the entry's <id> URL).
    pub abs_url: String,
    /// DOI, when the feed provides one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub doi: Option<String>,
    /// Author comment (e.g. "15 pages, 5 figures"), when present.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub comment: Option<String>,
    /// Journal reference, when present.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub journal_ref: Option<String>,
}
27
/// Search parameters for the ArXiv API.
#[derive(Debug, Clone)]
pub struct ArxivSearchParams {
    /// Free-text query; embedded as "all:<query>" (empty means match all).
    pub query: String,
    /// Optional category filter, ANDed onto the query as "cat:<category>".
    pub category: Option<String>,
    /// Maximum number of results to request.
    pub max_results: usize,
    pub sort_by: ArxivSortBy,
    pub sort_order: ArxivSortOrder,
    /// Zero-based result offset, for paging.
    pub start: usize,
}
38
39impl Default for ArxivSearchParams {
40    fn default() -> Self {
41        Self {
42            query: String::new(),
43            category: None,
44            max_results: 10,
45            sort_by: ArxivSortBy::Relevance,
46            sort_order: ArxivSortOrder::Descending,
47            start: 0,
48        }
49    }
50}
51
/// Sort criteria for ArXiv search.
///
/// Rendered onto the query string's `sortBy` parameter by `as_api_str`;
/// parsed leniently from user input by `from_str_loose`.
#[derive(Debug, Clone, Copy)]
pub enum ArxivSortBy {
    /// Query relevance — the fallback for unrecognized sort strings.
    Relevance,
    LastUpdatedDate,
    SubmittedDate,
}
59
60impl ArxivSortBy {
61    pub fn as_api_str(&self) -> &str {
62        match self {
63            ArxivSortBy::Relevance => "relevance",
64            ArxivSortBy::LastUpdatedDate => "lastUpdatedDate",
65            ArxivSortBy::SubmittedDate => "submittedDate",
66        }
67    }
68
69    pub fn from_str_loose(s: &str) -> Self {
70        match s.to_lowercase().as_str() {
71            "date" | "submitted" | "submitteddate" => ArxivSortBy::SubmittedDate,
72            "updated" | "lastupdateddate" => ArxivSortBy::LastUpdatedDate,
73            _ => ArxivSortBy::Relevance,
74        }
75    }
76}
77
/// Sort order for ArXiv search.
///
/// Rendered onto the query string's `sortOrder` parameter by `as_api_str`.
#[derive(Debug, Clone, Copy)]
pub enum ArxivSortOrder {
    Ascending,
    /// Default order for `ArxivSearchParams`.
    Descending,
}
84
85impl ArxivSortOrder {
86    pub fn as_api_str(&self) -> &str {
87        match self {
88            ArxivSortOrder::Ascending => "ascending",
89            ArxivSortOrder::Descending => "descending",
90        }
91    }
92}
93
/// Search result containing papers and metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArxivSearchResult {
    /// Successfully parsed entries (malformed entries are skipped by the parser).
    pub papers: Vec<ArxivPaper>,
    /// Total matches reported by opensearch:totalResults (0 if missing).
    pub total_results: usize,
    /// Offset reported by opensearch:startIndex (0 if missing).
    pub start_index: usize,
    /// Page size reported by opensearch:itemsPerPage (0 if missing).
    pub items_per_page: usize,
}
102
/// Depth for paper analysis.
///
/// Parsed leniently from user input by `from_str_loose`.
#[derive(Debug, Clone, Copy)]
pub enum AnalysisDepth {
    /// Matches "quick" / "brief".
    Quick,
    /// Fallback for unrecognized input.
    Standard,
    /// Matches "full" / "detailed" / "deep".
    Full,
}
110
111impl AnalysisDepth {
112    pub fn from_str_loose(s: &str) -> Self {
113        match s.to_lowercase().as_str() {
114            "quick" | "brief" => AnalysisDepth::Quick,
115            "full" | "detailed" | "deep" => AnalysisDepth::Full,
116            _ => AnalysisDepth::Standard,
117        }
118    }
119}
120
/// A saved paper in the user's library.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LibraryEntry {
    /// Full paper metadata as fetched from ArXiv.
    pub paper: ArxivPaper,
    /// User-assigned tags.
    pub tags: Vec<String>,
    /// Optional collection this entry belongs to.
    pub collection: Option<String>,
    /// Free-form user notes.
    pub notes: Option<String>,
    /// When the entry was saved.
    pub saved_at: DateTime<Utc>,
}
130
/// Persistent library state.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ArxivLibraryState {
    /// All saved papers.
    pub entries: Vec<LibraryEntry>,
    /// Known collection names.
    pub collections: Vec<String>,
    /// Daily digest settings; `#[serde(default)]` keeps older serialized
    /// states (without this field) loadable.
    #[serde(default)]
    pub digest_config: Option<DigestConfig>,
    /// Tracked paper-to-code implementations; `#[serde(default)]` keeps older
    /// serialized states loadable.
    #[serde(default)]
    pub implementations: Vec<ImplementationRecord>,
}
141
/// Configuration for daily paper digest.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DigestConfig {
    /// Keywords to match papers against.
    pub keywords: Vec<String>,
    /// ArXiv categories to include.
    pub categories: Vec<String>,
    /// Whether the digest is active.
    pub enabled: bool,
}
149
/// Configuration for a target programming language's project scaffold.
///
/// Instances are produced by [`language_config`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageConfig {
    /// Canonical language name (e.g. "python").
    pub language: String,
    /// Package manager or build tool (e.g. "pip", "cargo").
    pub package_manager: String,
    /// Test framework or test command (e.g. "pytest", "cargo test").
    pub test_framework: String,
    /// Source file extension without the dot (e.g. "py", "rs").
    pub file_extension: String,
    /// ML libraries commonly used in this ecosystem.
    pub common_ml_libraries: Vec<String>,
    /// Command to create an isolated environment (venv, etc.)
    pub env_setup_commands: Vec<String>,
    /// Command to activate the environment
    pub env_activate: String,
}
163
/// A file in a project scaffold.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScaffoldFile {
    /// Destination path — presumably relative to the project root; confirm
    /// against the code that writes scaffolds.
    pub path: String,
    /// Full file contents.
    pub content: String,
    /// True when this file belongs to the test suite.
    pub is_test: bool,
}
171
/// A complete project scaffold generated from a paper.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProjectScaffold {
    /// ArXiv ID of the source paper.
    pub paper_id: String,
    pub project_name: String,
    /// Language/toolchain settings used to generate the scaffold.
    pub language_config: LanguageConfig,
    /// Directories to create.
    pub directory_structure: Vec<String>,
    /// Files to write.
    pub files: Vec<ScaffoldFile>,
    /// Packages the implementation depends on.
    pub dependencies: Vec<String>,
    /// Commands to run once after scaffolding.
    pub setup_commands: Vec<String>,
    /// Commands that run the test suite.
    pub test_commands: Vec<String>,
}
184
/// Implementation mode for paper-to-code.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ImplementationMode {
    /// A full project directory (see `ProjectScaffold`).
    StandaloneProject,
    /// A notebook instead of a project tree.
    Notebook,
}
191
/// Status of a paper implementation.
///
/// Variants are listed in pipeline order; the `Display` impl renders each as
/// a snake_case token (e.g. "deps_installed").
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ImplementationStatus {
    Scaffolded,
    DepsInstalled,
    TestsGenerated,
    Implementing,
    TestsPassing,
    Complete,
    /// Terminal failure, carrying a human-readable reason.
    Failed(String),
}
203
204impl std::fmt::Display for ImplementationStatus {
205    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
206        match self {
207            Self::Scaffolded => write!(f, "scaffolded"),
208            Self::DepsInstalled => write!(f, "deps_installed"),
209            Self::TestsGenerated => write!(f, "tests_generated"),
210            Self::Implementing => write!(f, "implementing"),
211            Self::TestsPassing => write!(f, "tests_passing"),
212            Self::Complete => write!(f, "complete"),
213            Self::Failed(msg) => write!(f, "failed: {}", msg),
214        }
215    }
216}
217
/// Record of a paper implementation tracked in the library.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImplementationRecord {
    /// ArXiv ID of the implemented paper.
    pub paper_id: String,
    /// Filesystem path of the generated project.
    pub project_path: String,
    /// Target language name.
    pub language: String,
    /// Standalone project vs. notebook.
    pub mode: ImplementationMode,
    /// Current pipeline status.
    pub status: ImplementationStatus,
    /// When the implementation was started.
    pub created_at: DateTime<Utc>,
}
228
229/// Get language-specific project configuration.
230///
231/// IMPORTANT: All language configs include environment isolation commands
232/// (venv for Python, cargo for Rust, etc.) to prevent polluting the system.
233pub fn language_config(lang: &str) -> Option<LanguageConfig> {
234    match lang.to_lowercase().as_str() {
235        "python" | "py" => Some(LanguageConfig {
236            language: "python".to_string(),
237            package_manager: "pip".to_string(),
238            test_framework: "pytest".to_string(),
239            file_extension: "py".to_string(),
240            common_ml_libraries: vec![
241                "numpy".into(),
242                "torch".into(),
243                "tensorflow".into(),
244                "scikit-learn".into(),
245                "matplotlib".into(),
246                "pandas".into(),
247            ],
248            env_setup_commands: vec!["python3 -m venv .venv".to_string()],
249            env_activate: "source .venv/bin/activate".to_string(),
250        }),
251        "rust" | "rs" => Some(LanguageConfig {
252            language: "rust".to_string(),
253            package_manager: "cargo".to_string(),
254            test_framework: "cargo test".to_string(),
255            file_extension: "rs".to_string(),
256            common_ml_libraries: vec![
257                "ndarray".into(),
258                "burn".into(),
259                "candle".into(),
260                "linfa".into(),
261                "plotters".into(),
262            ],
263            env_setup_commands: vec![], // Cargo handles isolation via Cargo.toml
264            env_activate: String::new(),
265        }),
266        "typescript" | "ts" | "javascript" | "js" => Some(LanguageConfig {
267            language: "typescript".to_string(),
268            package_manager: "npm".to_string(),
269            test_framework: "jest".to_string(),
270            file_extension: "ts".to_string(),
271            common_ml_libraries: vec![
272                "@tensorflow/tfjs".into(),
273                "onnxruntime-node".into(),
274                "mathjs".into(),
275                "chart.js".into(),
276            ],
277            env_setup_commands: vec!["npm init -y".to_string()],
278            env_activate: String::new(), // node_modules is project-local by default
279        }),
280        "go" | "golang" => Some(LanguageConfig {
281            language: "go".to_string(),
282            package_manager: "go mod".to_string(),
283            test_framework: "go test".to_string(),
284            file_extension: "go".to_string(),
285            common_ml_libraries: vec!["gonum.org/v1/gonum".into(), "gorgonia.org/gorgonia".into()],
286            env_setup_commands: vec!["go mod init paper_impl".to_string()],
287            env_activate: String::new(), // Go modules are project-local
288        }),
289        "cpp" | "c++" => Some(LanguageConfig {
290            language: "cpp".to_string(),
291            package_manager: "cmake".to_string(),
292            test_framework: "ctest".to_string(),
293            file_extension: "cpp".to_string(),
294            common_ml_libraries: vec!["Eigen".into(), "libtorch".into(), "xtensor".into()],
295            env_setup_commands: vec!["mkdir -p build".to_string()],
296            env_activate: String::new(),
297        }),
298        "julia" | "jl" => Some(LanguageConfig {
299            language: "julia".to_string(),
300            package_manager: "Pkg".to_string(),
301            test_framework: "Test".to_string(),
302            file_extension: "jl".to_string(),
303            common_ml_libraries: vec![
304                "Flux".into(),
305                "MLJ".into(),
306                "Plots".into(),
307                "DataFrames".into(),
308            ],
309            env_setup_commands: vec![], // Julia uses project-local Manifest.toml
310            env_activate: String::new(),
311        }),
312        _ => None,
313    }
314}
315
316// ── ArXiv API Client ──────────────────────────────────────────
317
/// Base URL of the ArXiv Atom query endpoint.
const ARXIV_API_BASE: &str = "https://export.arxiv.org/api/query";
/// User-Agent header sent on every request so ArXiv can identify the client.
const USER_AGENT: &str = "Rustant/1.0 (https://github.com/rustant)";
320
/// HTTP client for the ArXiv API.
///
/// Wraps a `reqwest::Client` plus the timestamp of the last request, which
/// `rate_limit` uses to space calls at least 3 seconds apart.
pub struct ArxivClient {
    client: reqwest::Client,
    /// Instant of the most recent request; `None` before the first call.
    last_request: std::sync::Mutex<Option<std::time::Instant>>,
}
326
327impl ArxivClient {
328    pub fn new() -> Result<Self, String> {
329        let client = reqwest::Client::builder()
330            .timeout(Duration::from_secs(30))
331            .connect_timeout(Duration::from_secs(10))
332            .user_agent(USER_AGENT)
333            .build()
334            .map_err(|e| format!("Failed to create HTTP client: {}", e))?;
335        Ok(Self {
336            client,
337            last_request: std::sync::Mutex::new(None),
338        })
339    }
340
341    /// Enforce a minimum 3-second delay between ArXiv API requests.
342    async fn rate_limit(&self) {
343        let wait_duration = {
344            let last = self.last_request.lock().unwrap();
345            if let Some(instant) = *last {
346                let elapsed = instant.elapsed();
347                if elapsed < Duration::from_secs(3) {
348                    Some(Duration::from_secs(3) - elapsed)
349                } else {
350                    None
351                }
352            } else {
353                None
354            }
355        }; // MutexGuard is dropped here before any .await
356
357        if let Some(wait) = wait_duration {
358            tokio::time::sleep(wait).await;
359        }
360
361        let mut last = self.last_request.lock().unwrap();
362        *last = Some(std::time::Instant::now());
363    }
364
365    /// Search ArXiv with the given parameters.
366    pub async fn search(&self, params: &ArxivSearchParams) -> Result<ArxivSearchResult, String> {
367        self.rate_limit().await;
368        let url = build_search_url(params);
369        tracing::debug!("ArXiv search URL: {}", url);
370
371        let response = self
372            .client
373            .get(&url)
374            .send()
375            .await
376            .map_err(|e| format!("ArXiv API request failed: {}", e))?;
377
378        let status = response.status();
379        if !status.is_success() {
380            return Err(format!("ArXiv API returned status {}", status));
381        }
382
383        let body = response
384            .text()
385            .await
386            .map_err(|e| format!("Failed to read ArXiv response: {}", e))?;
387
388        parse_atom_response(&body)
389    }
390
391    /// Fetch a single paper by its ArXiv ID.
392    pub async fn fetch_paper(&self, arxiv_id: &str) -> Result<ArxivPaper, String> {
393        self.rate_limit().await;
394        let clean_id = arxiv_id.trim();
395        validate_arxiv_id(clean_id)?;
396
397        let url = format!(
398            "{}?id_list={}",
399            ARXIV_API_BASE,
400            urlencoding::encode(clean_id)
401        );
402        tracing::debug!("ArXiv fetch URL: {}", url);
403
404        let response = self
405            .client
406            .get(&url)
407            .send()
408            .await
409            .map_err(|e| format!("ArXiv API request failed: {}", e))?;
410
411        let body = response
412            .text()
413            .await
414            .map_err(|e| format!("Failed to read ArXiv response: {}", e))?;
415
416        let result = parse_atom_response(&body)?;
417        result
418            .papers
419            .into_iter()
420            .next()
421            .ok_or_else(|| format!("Paper '{}' not found on ArXiv", clean_id))
422    }
423}
424
425// ── URL Building ──────────────────────────────────────────────
426
427/// Build the ArXiv API search URL from parameters.
428pub fn build_search_url(params: &ArxivSearchParams) -> String {
429    let mut search_query = if params.query.is_empty() {
430        "all:*".to_string()
431    } else {
432        format!("all:{}", params.query)
433    };
434
435    if let Some(cat) = &params.category {
436        search_query = format!("{} AND cat:{}", search_query, cat);
437    }
438
439    format!(
440        "{}?search_query={}&start={}&max_results={}&sortBy={}&sortOrder={}",
441        ARXIV_API_BASE,
442        urlencoding::encode(&search_query),
443        params.start,
444        params.max_results,
445        params.sort_by.as_api_str(),
446        params.sort_order.as_api_str(),
447    )
448}
449
450// ── XML Parsing ───────────────────────────────────────────────
451
452/// Parse the Atom XML response from the ArXiv API.
453pub fn parse_atom_response(xml: &str) -> Result<ArxivSearchResult, String> {
454    let total_results = extract_opensearch_value(xml, "totalResults").unwrap_or(0);
455    let start_index = extract_opensearch_value(xml, "startIndex").unwrap_or(0);
456    let items_per_page = extract_opensearch_value(xml, "itemsPerPage").unwrap_or(0);
457
458    let entries = extract_entries(xml);
459    let mut papers = Vec::new();
460
461    for entry_xml in &entries {
462        if let Some(paper) = parse_entry(entry_xml) {
463            papers.push(paper);
464        }
465    }
466
467    Ok(ArxivSearchResult {
468        papers,
469        total_results,
470        start_index,
471        items_per_page,
472    })
473}
474
/// Extract all <entry>...</entry> blocks from the XML.
///
/// Scans left to right; an opening tag without a matching close terminates
/// the scan (the dangling fragment is dropped).
fn extract_entries(xml: &str) -> Vec<String> {
    const OPEN: &str = "<entry>";
    const CLOSE: &str = "</entry>";

    let mut out = Vec::new();
    let mut cursor = 0;

    while let Some(rel_open) = xml[cursor..].find(OPEN) {
        let begin = cursor + rel_open;
        match xml[begin..].find(CLOSE) {
            Some(rel_close) => {
                let finish = begin + rel_close + CLOSE.len();
                out.push(xml[begin..finish].to_string());
                cursor = finish;
            }
            None => break,
        }
    }

    out
}
500
/// Parse a single <entry> XML block into an ArxivPaper.
///
/// Returns `None` only when the required <id> or <title> tags are missing;
/// every other field falls back to an empty/default value.
fn parse_entry(entry: &str) -> Option<ArxivPaper> {
    let id_url = extract_tag_text(entry, "id")?;
    let arxiv_id = extract_arxiv_id_from_url(&id_url);
    let title = normalize_whitespace(&extract_tag_text(entry, "title")?);

    // Extract authors
    // Walk each <author>...</author> block and collect its <name> text.
    let mut authors = Vec::new();
    let mut author_search = 0;
    while let Some(pos) = entry[author_search..].find("<author>") {
        let author_start = author_search + pos;
        let Some(end_pos) = entry[author_start..].find("</author>") else {
            break; // unterminated block: stop rather than scan garbage
        };
        let author_end = author_start + end_pos + "</author>".len();
        let author_block = &entry[author_start..author_end];
        if let Some(name) = extract_tag_text(author_block, "name") {
            authors.push(name);
        }
        author_search = author_end;
    }

    let summary = normalize_whitespace(&extract_tag_text(entry, "summary").unwrap_or_default());
    let published = extract_tag_text(entry, "published").unwrap_or_default();
    let updated = extract_tag_text(entry, "updated").unwrap_or_default();

    // Extract categories
    // NOTE(review): find("/>") searches the whole remainder of the entry, not
    // just this tag, so a non-self-closing <category> tag could make the scan
    // jump past siblings. ArXiv emits self-closing category tags, so this
    // holds in practice — confirm if the feed format ever changes.
    let mut categories = Vec::new();
    let mut primary_category = String::new();
    let mut cat_search = 0;
    while let Some(pos) = entry[cat_search..].find("<category") {
        let cat_start = cat_search + pos;
        let cat_end = if let Some(end_pos) = entry[cat_start..].find("/>") {
            cat_start + end_pos + 2
        } else if let Some(end_pos) = entry[cat_start..].find('>') {
            cat_start + end_pos + 1
        } else {
            break;
        };
        let cat_tag = &entry[cat_start..cat_end];
        if let Some(term) = extract_attribute(cat_tag, "term") {
            categories.push(term);
        }
        cat_search = cat_end;
    }

    // Primary category from arxiv:primary_category
    // NOTE(review): the "+ 2" below assumes the "/>" arm matched; when only
    // '>' matches, the slice runs one byte past the tag end. Harmless for the
    // ASCII, self-closing tags arXiv emits, but worth confirming.
    if let Some(pc_start) = entry.find("primary_category")
        && let Some(pc_end) = entry[pc_start..]
            .find("/>")
            .or_else(|| entry[pc_start..].find(">"))
    {
        let pc_tag = &entry[pc_start..pc_start + pc_end + 2];
        if let Some(term) = extract_attribute(pc_tag, "term") {
            primary_category = term;
        }
    }
    if primary_category.is_empty() {
        // Fall back to the first listed category when no primary tag exists.
        primary_category = categories.first().cloned().unwrap_or_default();
    }

    // Extract links
    // A link with title="pdf" or MIME type application/pdf becomes pdf_url;
    // a typeless link whose href contains "/abs/" overrides the <id> URL as
    // the abstract page.
    let mut pdf_url = String::new();
    let mut abs_url = id_url.clone();
    let mut link_search = 0;
    while let Some(pos) = entry[link_search..].find("<link") {
        let link_start = link_search + pos;
        let Some(end_pos) = entry[link_start..]
            .find("/>")
            .or_else(|| entry[link_start..].find('>'))
        else {
            break;
        };
        let link_end = link_start + end_pos + 2;
        let link_tag = &entry[link_start..link_end];
        let href = extract_attribute(link_tag, "href").unwrap_or_default();
        let title_attr = extract_attribute(link_tag, "title").unwrap_or_default();
        let link_type = extract_attribute(link_tag, "type").unwrap_or_default();

        if title_attr == "pdf" || link_type == "application/pdf" {
            pdf_url = href;
        } else if link_type.is_empty() && href.contains("/abs/") {
            abs_url = href;
        }
        link_search = link_end;
    }

    if pdf_url.is_empty() {
        // No pdf link in the feed: derive the canonical arxiv.org PDF URL.
        pdf_url = format!("https://arxiv.org/pdf/{}", arxiv_id);
    }

    // Optional namespaced metadata; comment text gets whitespace-collapsed.
    let doi = extract_tag_text_ns(entry, "arxiv:doi");
    let comment = extract_tag_text_ns(entry, "arxiv:comment").map(|c| normalize_whitespace(&c));
    let journal_ref = extract_tag_text_ns(entry, "arxiv:journal_ref");

    Some(ArxivPaper {
        arxiv_id,
        title,
        authors,
        summary,
        categories,
        primary_category,
        published,
        updated,
        pdf_url,
        abs_url,
        doi,
        comment,
        journal_ref,
    })
}
612
/// Extract the text content of the first occurrence of <tag>text</tag>.
///
/// The opening tag may carry attributes; the returned text is trimmed.
/// Returns `None` when either the opening or the closing tag is absent.
fn extract_tag_text(xml: &str, tag: &str) -> Option<String> {
    let open_marker = format!("<{}", tag);
    let close_marker = format!("</{}>", tag);

    let open_at = xml.find(&open_marker)?;
    let after_open = &xml[open_at..];

    // Skip past the '>' that ends the opening tag (attributes included).
    let body_offset = after_open.find('>')? + 1;
    let body = &after_open[body_offset..];

    let body_end = body.find(&close_marker)?;
    Some(body[..body_end].trim().to_string())
}
625
/// Extract text from a namespaced tag like <arxiv:doi>.
///
/// Thin alias over `extract_tag_text`: the full "ns:tag" string is matched
/// literally, so the feed must use this exact namespace prefix.
fn extract_tag_text_ns(xml: &str, tag: &str) -> Option<String> {
    extract_tag_text(xml, tag)
}
630
/// Extract an attribute value from a tag string.
///
/// Matches the literal `attr="` sequence and returns everything up to the
/// next double quote; `None` when the attribute or its closing quote is
/// missing. Single-quoted attributes are not supported.
pub fn extract_attribute(tag: &str, attr: &str) -> Option<String> {
    let needle = format!("{}=\"", attr);
    let value_start = tag.find(&needle)? + needle.len();
    let rest = &tag[value_start..];
    let value_end = rest.find('"')?;
    Some(rest[..value_end].to_string())
}
638
/// Extract the ArXiv ID from a URL like "http://arxiv.org/abs/1706.03762v7".
///
/// Prefers an "/abs/" segment, then "/pdf/" (dropping a trailing ".pdf");
/// anything else is returned unchanged on the assumption it is already an ID.
pub fn extract_arxiv_id_from_url(url: &str) -> String {
    if let Some(idx) = url.rfind("/abs/") {
        return url[idx + "/abs/".len()..].to_string();
    }
    if let Some(idx) = url.rfind("/pdf/") {
        return url[idx + "/pdf/".len()..]
            .trim_end_matches(".pdf")
            .to_string();
    }
    // Already just an ID
    url.to_string()
}
650
/// Normalize whitespace: collapse runs of whitespace into single spaces.
///
/// Leading and trailing whitespace is removed as a side effect of splitting.
pub fn normalize_whitespace(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
655
656/// Extract an OpenSearch value like <opensearch:totalResults>100</opensearch:totalResults>.
657fn extract_opensearch_value(xml: &str, field: &str) -> Option<usize> {
658    let tag = format!("opensearch:{}", field);
659    extract_tag_text(xml, &tag).and_then(|s| s.trim().parse().ok())
660}
661
662// ── ID Validation ─────────────────────────────────────────────
663
664/// Validate that a string looks like an ArXiv ID.
665/// Accepts new format (YYMM.NNNNN) and old format (category/NNNNNNN).
666pub fn validate_arxiv_id(id: &str) -> Result<(), String> {
667    let id = id.trim();
668    if id.is_empty() {
669        return Err("ArXiv ID cannot be empty".to_string());
670    }
671
672    // New format: YYMM.NNNNN (optionally with version vN)
673    let new_format = regex_lite_match_arxiv_new(id);
674    // Old format: category/NNNNNNN (e.g., hep-th/9901001)
675    let old_format = regex_lite_match_arxiv_old(id);
676
677    if new_format || old_format {
678        Ok(())
679    } else {
680        Err(format!(
681            "Invalid ArXiv ID '{}'. Expected format: YYMM.NNNNN (e.g., 2301.12345) or category/NNNNNNN (e.g., hep-th/9901001)",
682            id
683        ))
684    }
685}
686
/// Match new-format ArXiv IDs: YYMM.NNNNN[vN]
///
/// YYMM must be exactly 4 digits; the article number 1-5 digits; an optional
/// "vN" version suffix must be all digits.
fn regex_lite_match_arxiv_new(id: &str) -> bool {
    fn all_digits(s: &str) -> bool {
        !s.is_empty() && s.bytes().all(|b| b.is_ascii_digit())
    }

    // Optional version suffix "vN" must be non-empty and numeric.
    if let Some(v_pos) = id.find('v') {
        if !all_digits(&id[v_pos + 1..]) {
            return false;
        }
    }

    // Everything before the first 'v' is the YYMM.NNNNN base.
    let base = id.split('v').next().unwrap_or(id);
    let Some((yymm, number)) = base.split_once('.') else {
        return false;
    };

    yymm.len() == 4 && all_digits(yymm) && number.len() <= 5 && all_digits(number)
}
714
/// Match old-format ArXiv IDs: category/NNNNNNN
///
/// The category is an archive name of letters/digits and hyphens, optionally
/// followed by a ".subject-class" suffix (e.g. "math.GT/0309136"); the number
/// part must be all digits, with an optional "vN" suffix tolerated.
fn regex_lite_match_arxiv_old(id: &str) -> bool {
    let Some((category, rest)) = id.split_once('/') else {
        return false;
    };
    // Strip an optional version suffix from the number part.
    let number = rest.split('v').next().unwrap_or(rest);

    // BUG FIX: '.' was previously rejected, so valid old-style IDs with a
    // subject class (e.g. "math.GT/0309136") failed validation. The arXiv
    // old identifier scheme is archive[.subject-class]/YYMMNNN.
    let category_ok = !category.is_empty()
        && category
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '.');

    let number_ok = !number.is_empty() && number.chars().all(|c| c.is_ascii_digit());

    category_ok && number_ok
}
739
740// ── BibTeX Generation ─────────────────────────────────────────
741
742/// Generate a BibTeX entry for a paper.
743pub fn generate_bibtex(paper: &ArxivPaper) -> String {
744    let cite_key = generate_cite_key(paper);
745    let authors_bibtex = paper.authors.join(" and ");
746    let title_escaped = escape_bibtex(&paper.title);
747    let year = extract_year(&paper.published);
748
749    let mut entry = format!(
750        "@article{{{},\n  title = {{{}}},\n  author = {{{}}},\n  year = {{{}}},\n  eprint = {{{}}},\n  archivePrefix = {{arXiv}},\n  primaryClass = {{{}}}",
751        cite_key, title_escaped, authors_bibtex, year, paper.arxiv_id, paper.primary_category,
752    );
753
754    if let Some(doi) = &paper.doi {
755        entry.push_str(&format!(",\n  doi = {{{}}}", doi));
756    }
757    if let Some(journal) = &paper.journal_ref {
758        entry.push_str(&format!(",\n  journal = {{{}}}", escape_bibtex(journal)));
759    }
760
761    entry.push_str("\n}");
762    entry
763}
764
765/// Generate a citation key like "vaswani2017attention".
766fn generate_cite_key(paper: &ArxivPaper) -> String {
767    let first_author = paper
768        .authors
769        .first()
770        .map(|a| {
771            a.split_whitespace()
772                .last()
773                .unwrap_or(a)
774                .to_lowercase()
775                .chars()
776                .filter(|c| c.is_ascii_alphanumeric())
777                .collect::<String>()
778        })
779        .unwrap_or_else(|| "unknown".to_string());
780
781    let year = extract_year(&paper.published);
782
783    let title_word = paper
784        .title
785        .split_whitespace()
786        .find(|w| w.len() > 3 && w.chars().next().is_some_and(|c| c.is_alphabetic()))
787        .unwrap_or("paper")
788        .to_lowercase()
789        .chars()
790        .filter(|c| c.is_ascii_alphanumeric())
791        .collect::<String>();
792
793    format!("{}{}{}", first_author, year, title_word)
794}
795
/// Escape special LaTeX characters in BibTeX fields.
///
/// Single pass over the input; replacement text (including the braces in the
/// tilde/circumflex macros) is emitted verbatim, never re-escaped. Backslashes
/// in the input are passed through unchanged.
fn escape_bibtex(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '&' => out.push_str("\\&"),
            '%' => out.push_str("\\%"),
            '$' => out.push_str("\\$"),
            '#' => out.push_str("\\#"),
            '_' => out.push_str("\\_"),
            '{' => out.push_str("\\{"),
            '}' => out.push_str("\\}"),
            '~' => out.push_str("\\textasciitilde{}"),
            '^' => out.push_str("\\textasciicircum{}"),
            other => out.push(other),
        }
    }
    out
}
808
/// Extract year from a date string like "2017-06-12T17:57:34Z".
///
/// Returns everything before the first '-', or the whole string when there
/// is no '-' (so an empty input yields an empty string).
fn extract_year(date_str: &str) -> String {
    match date_str.split_once('-') {
        Some((year, _rest)) => year.to_string(),
        None => date_str.to_string(),
    }
}
813
814// ── Tests ─────────────────────────────────────────────────────
815
816#[cfg(test)]
817mod tests {
818    use super::*;
819
820    const SAMPLE_ENTRY: &str = r#"<entry>
821    <id>http://arxiv.org/abs/1706.03762v7</id>
822    <updated>2023-08-02T01:09:28Z</updated>
823    <published>2017-06-12T17:57:34Z</published>
824    <title>Attention Is All You Need</title>
825    <summary>  The dominant sequence transduction models are based on complex recurrent or
826convolutional neural networks that include an encoder and a decoder.  </summary>
827    <author><name>Ashish Vaswani</name></author>
828    <author><name>Noam Shazeer</name></author>
829    <author><name>Niki Parmar</name></author>
830    <arxiv:doi xmlns:arxiv="http://arxiv.org/schemas/atom">10.1234/nips.2017</arxiv:doi>
831    <arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">15 pages, 5 figures</arxiv:comment>
832    <arxiv:journal_ref xmlns:arxiv="http://arxiv.org/schemas/atom">NeurIPS 2017</arxiv:journal_ref>
833    <link href="http://arxiv.org/abs/1706.03762v7" rel="alternate" type="text/html"/>
834    <link href="http://arxiv.org/pdf/1706.03762v7" title="pdf" type="application/pdf"/>
835    <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="cs.CL" scheme="http://arxiv.org/schemas/atom"/>
836    <category term="cs.CL" scheme="http://arxiv.org/schemas/atom"/>
837    <category term="cs.AI" scheme="http://arxiv.org/schemas/atom"/>
838</entry>"#;
839
840    const SAMPLE_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
841<feed xmlns="http://www.w3.org/2005/Atom"
842      xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/"
843      xmlns:arxiv="http://arxiv.org/schemas/atom">
844  <title>ArXiv Query</title>
845  <opensearch:totalResults>100</opensearch:totalResults>
846  <opensearch:startIndex>0</opensearch:startIndex>
847  <opensearch:itemsPerPage>3</opensearch:itemsPerPage>
848  <entry>
849    <id>http://arxiv.org/abs/1706.03762v7</id>
850    <updated>2023-08-02T01:09:28Z</updated>
851    <published>2017-06-12T17:57:34Z</published>
852    <title>Attention Is All You Need</title>
853    <summary>The dominant sequence transduction models.</summary>
854    <author><name>Ashish Vaswani</name></author>
855    <link href="http://arxiv.org/abs/1706.03762v7" rel="alternate" type="text/html"/>
856    <link href="http://arxiv.org/pdf/1706.03762v7" title="pdf" type="application/pdf"/>
857    <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="cs.CL"/>
858    <category term="cs.CL"/>
859  </entry>
860  <entry>
861    <id>http://arxiv.org/abs/1810.04805v2</id>
862    <updated>2019-05-24T12:00:00Z</updated>
863    <published>2018-10-11T00:00:00Z</published>
864    <title>BERT: Pre-training of Deep Bidirectional Transformers</title>
865    <summary>We introduce a new language representation model.</summary>
866    <author><name>Jacob Devlin</name></author>
867    <link href="http://arxiv.org/pdf/1810.04805v2" title="pdf" type="application/pdf"/>
868    <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="cs.CL"/>
869    <category term="cs.CL"/>
870  </entry>
871  <entry>
872    <id>http://arxiv.org/abs/2005.14165v4</id>
873    <updated>2020-07-22T00:00:00Z</updated>
874    <published>2020-05-28T00:00:00Z</published>
875    <title>Language Models are Few-Shot Learners</title>
876    <summary>Recent work demonstrates substantial gains.</summary>
877    <author><name>Tom Brown</name></author>
878    <link href="http://arxiv.org/pdf/2005.14165v4" title="pdf" type="application/pdf"/>
879    <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="cs.CL"/>
880    <category term="cs.CL"/>
881  </entry>
882</feed>"#;
883
884    #[test]
885    fn test_parse_single_entry() {
886        let feed = format!(
887            r#"<feed><opensearch:totalResults>1</opensearch:totalResults>
888            <opensearch:startIndex>0</opensearch:startIndex>
889            <opensearch:itemsPerPage>1</opensearch:itemsPerPage>{}</feed>"#,
890            SAMPLE_ENTRY
891        );
892        let result = parse_atom_response(&feed).unwrap();
893        assert_eq!(result.papers.len(), 1);
894        let paper = &result.papers[0];
895        assert_eq!(paper.arxiv_id, "1706.03762v7");
896        assert_eq!(paper.title, "Attention Is All You Need");
897        assert_eq!(paper.authors.len(), 3);
898        assert_eq!(paper.authors[0], "Ashish Vaswani");
899    }
900
901    #[test]
902    fn test_parse_multiple_entries() {
903        let result = parse_atom_response(SAMPLE_FEED).unwrap();
904        assert_eq!(result.papers.len(), 3);
905        assert_eq!(result.total_results, 100);
906        assert_eq!(result.start_index, 0);
907        assert_eq!(result.items_per_page, 3);
908    }
909
910    #[test]
911    fn test_parse_empty_results() {
912        let feed = r#"<feed>
913            <opensearch:totalResults>0</opensearch:totalResults>
914            <opensearch:startIndex>0</opensearch:startIndex>
915            <opensearch:itemsPerPage>10</opensearch:itemsPerPage>
916        </feed>"#;
917        let result = parse_atom_response(feed).unwrap();
918        assert_eq!(result.papers.len(), 0);
919        assert_eq!(result.total_results, 0);
920    }
921
922    #[test]
923    fn test_parse_entry_all_fields() {
924        let feed = format!(
925            r#"<feed><opensearch:totalResults>1</opensearch:totalResults>
926            <opensearch:startIndex>0</opensearch:startIndex>
927            <opensearch:itemsPerPage>1</opensearch:itemsPerPage>{}</feed>"#,
928            SAMPLE_ENTRY
929        );
930        let result = parse_atom_response(&feed).unwrap();
931        let paper = &result.papers[0];
932        assert_eq!(paper.doi.as_deref(), Some("10.1234/nips.2017"));
933        assert_eq!(paper.comment.as_deref(), Some("15 pages, 5 figures"));
934        assert_eq!(paper.journal_ref.as_deref(), Some("NeurIPS 2017"));
935        assert_eq!(paper.primary_category, "cs.CL");
936        assert!(paper.categories.contains(&"cs.CL".to_string()));
937        assert!(paper.categories.contains(&"cs.AI".to_string()));
938        assert!(paper.pdf_url.contains("1706.03762"));
939    }
940
941    #[test]
942    fn test_parse_entry_missing_optionals() {
943        let entry = r#"<feed>
944            <opensearch:totalResults>1</opensearch:totalResults>
945            <opensearch:startIndex>0</opensearch:startIndex>
946            <opensearch:itemsPerPage>1</opensearch:itemsPerPage>
947            <entry>
948                <id>http://arxiv.org/abs/2301.12345v1</id>
949                <published>2023-01-15T00:00:00Z</published>
950                <updated>2023-01-15T00:00:00Z</updated>
951                <title>A Simple Paper</title>
952                <summary>A summary.</summary>
953                <author><name>John Doe</name></author>
954                <category term="cs.AI"/>
955            </entry>
956        </feed>"#;
957        let result = parse_atom_response(entry).unwrap();
958        let paper = &result.papers[0];
959        assert!(paper.doi.is_none());
960        assert!(paper.comment.is_none());
961        assert!(paper.journal_ref.is_none());
962    }
963
964    #[test]
965    fn test_extract_arxiv_id_from_url() {
966        assert_eq!(
967            extract_arxiv_id_from_url("http://arxiv.org/abs/1706.03762v7"),
968            "1706.03762v7"
969        );
970        assert_eq!(
971            extract_arxiv_id_from_url("http://arxiv.org/pdf/2301.12345"),
972            "2301.12345"
973        );
974        assert_eq!(extract_arxiv_id_from_url("2301.12345"), "2301.12345");
975    }
976
977    #[test]
978    fn test_normalize_whitespace() {
979        assert_eq!(
980            normalize_whitespace("  Hello   World\n  Test  "),
981            "Hello World Test"
982        );
983        assert_eq!(normalize_whitespace("single"), "single");
984    }
985
986    #[test]
987    fn test_build_search_url_basic() {
988        let params = ArxivSearchParams {
989            query: "transformer attention".to_string(),
990            ..Default::default()
991        };
992        let url = build_search_url(&params);
993        assert!(url.starts_with(ARXIV_API_BASE));
994        assert!(url.contains("transformer"));
995        assert!(url.contains("attention"));
996        assert!(url.contains("max_results=10"));
997    }
998
999    #[test]
1000    fn test_build_search_url_with_category() {
1001        let params = ArxivSearchParams {
1002            query: "attention".to_string(),
1003            category: Some("cs.AI".to_string()),
1004            ..Default::default()
1005        };
1006        let url = build_search_url(&params);
1007        assert!(url.contains("cat%3Acs.AI") || url.contains("cat:cs.AI"));
1008    }
1009
1010    #[test]
1011    fn test_build_search_url_with_sort() {
1012        let params = ArxivSearchParams {
1013            query: "test".to_string(),
1014            sort_by: ArxivSortBy::SubmittedDate,
1015            sort_order: ArxivSortOrder::Descending,
1016            ..Default::default()
1017        };
1018        let url = build_search_url(&params);
1019        assert!(url.contains("sortBy=submittedDate"));
1020        assert!(url.contains("sortOrder=descending"));
1021    }
1022
1023    #[test]
1024    fn test_generate_bibtex() {
1025        let paper = ArxivPaper {
1026            arxiv_id: "1706.03762v7".to_string(),
1027            title: "Attention Is All You Need".to_string(),
1028            authors: vec!["Ashish Vaswani".to_string(), "Noam Shazeer".to_string()],
1029            summary: "A summary.".to_string(),
1030            categories: vec!["cs.CL".to_string()],
1031            primary_category: "cs.CL".to_string(),
1032            published: "2017-06-12T17:57:34Z".to_string(),
1033            updated: "2023-08-02T01:09:28Z".to_string(),
1034            pdf_url: "http://arxiv.org/pdf/1706.03762v7".to_string(),
1035            abs_url: "http://arxiv.org/abs/1706.03762v7".to_string(),
1036            doi: None,
1037            comment: None,
1038            journal_ref: None,
1039        };
1040        let bib = generate_bibtex(&paper);
1041        assert!(bib.starts_with("@article{"));
1042        assert!(bib.contains("Attention Is All You Need"));
1043        assert!(bib.contains("Ashish Vaswani and Noam Shazeer"));
1044        assert!(bib.contains("2017"));
1045        assert!(bib.contains("1706.03762v7"));
1046        assert!(bib.contains("cs.CL"));
1047        assert!(bib.ends_with('}'));
1048    }
1049
1050    #[test]
1051    fn test_generate_bibtex_special_chars() {
1052        let paper = ArxivPaper {
1053            arxiv_id: "2301.00001".to_string(),
1054            title: "A & B: 50% Better $Models$ with #Tags".to_string(),
1055            authors: vec!["Jane Smith".to_string()],
1056            summary: String::new(),
1057            categories: vec!["cs.AI".to_string()],
1058            primary_category: "cs.AI".to_string(),
1059            published: "2023-01-01T00:00:00Z".to_string(),
1060            updated: "2023-01-01T00:00:00Z".to_string(),
1061            pdf_url: String::new(),
1062            abs_url: String::new(),
1063            doi: None,
1064            comment: None,
1065            journal_ref: None,
1066        };
1067        let bib = generate_bibtex(&paper);
1068        assert!(bib.contains("\\&"));
1069        assert!(bib.contains("\\%"));
1070        assert!(bib.contains("\\$"));
1071        assert!(bib.contains("\\#"));
1072    }
1073
1074    #[test]
1075    fn test_library_state_roundtrip() {
1076        let state = ArxivLibraryState {
1077            entries: vec![LibraryEntry {
1078                paper: ArxivPaper {
1079                    arxiv_id: "2301.12345".to_string(),
1080                    title: "Test Paper".to_string(),
1081                    authors: vec!["Author One".to_string()],
1082                    summary: "A test.".to_string(),
1083                    categories: vec!["cs.AI".to_string()],
1084                    primary_category: "cs.AI".to_string(),
1085                    published: "2023-01-15T00:00:00Z".to_string(),
1086                    updated: "2023-01-15T00:00:00Z".to_string(),
1087                    pdf_url: "https://arxiv.org/pdf/2301.12345".to_string(),
1088                    abs_url: "https://arxiv.org/abs/2301.12345".to_string(),
1089                    doi: None,
1090                    comment: None,
1091                    journal_ref: None,
1092                },
1093                tags: vec!["ml".to_string(), "test".to_string()],
1094                collection: Some("Favorites".to_string()),
1095                notes: Some("Great paper".to_string()),
1096                saved_at: Utc::now(),
1097            }],
1098            collections: vec!["Favorites".to_string()],
1099            digest_config: Some(DigestConfig {
1100                keywords: vec!["transformer".to_string()],
1101                categories: vec!["cs.AI".to_string()],
1102                enabled: true,
1103            }),
1104            implementations: Vec::new(),
1105        };
1106
1107        let json = serde_json::to_string_pretty(&state).unwrap();
1108        let restored: ArxivLibraryState = serde_json::from_str(&json).unwrap();
1109        assert_eq!(restored.entries.len(), 1);
1110        assert_eq!(restored.entries[0].paper.arxiv_id, "2301.12345");
1111        assert_eq!(restored.collections, vec!["Favorites"]);
1112        assert!(restored.digest_config.unwrap().enabled);
1113    }
1114
1115    #[test]
1116    fn test_validate_arxiv_id_new_format() {
1117        assert!(validate_arxiv_id("2301.12345").is_ok());
1118        assert!(validate_arxiv_id("2301.12345v2").is_ok());
1119        assert!(validate_arxiv_id("1706.03762").is_ok());
1120        assert!(validate_arxiv_id("1706.03762v7").is_ok());
1121    }
1122
1123    #[test]
1124    fn test_validate_arxiv_id_old_format() {
1125        assert!(validate_arxiv_id("hep-th/9901001").is_ok());
1126        assert!(validate_arxiv_id("math/0211159").is_ok());
1127        assert!(validate_arxiv_id("cs/0112017").is_ok());
1128    }
1129
1130    #[test]
1131    fn test_validate_arxiv_id_invalid() {
1132        assert!(validate_arxiv_id("not-an-id").is_err());
1133        assert!(validate_arxiv_id("").is_err());
1134        assert!(validate_arxiv_id("abc").is_err());
1135        assert!(validate_arxiv_id("12345").is_err());
1136    }
1137
1138    // Integration tests — require network access
1139    #[tokio::test]
1140    #[ignore]
1141    async fn test_real_search() {
1142        let client = ArxivClient::new().unwrap();
1143        let params = ArxivSearchParams {
1144            query: "attention is all you need".to_string(),
1145            max_results: 3,
1146            ..Default::default()
1147        };
1148        let result = client.search(&params).await.unwrap();
1149        assert!(!result.papers.is_empty());
1150    }
1151
1152    #[tokio::test]
1153    #[ignore]
1154    async fn test_real_fetch_attention_paper() {
1155        let client = ArxivClient::new().unwrap();
1156        let paper = client.fetch_paper("1706.03762").await.unwrap();
1157        assert!(paper.title.contains("Attention"));
1158        assert!(!paper.authors.is_empty());
1159    }
1160}