1use chrono::{DateTime, Utc};
4use serde::{Deserialize, Serialize};
5use std::time::Duration;
6
/// A single paper entry as returned by the ArXiv Atom API.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArxivPaper {
    /// ArXiv identifier, possibly with a version suffix (e.g. "1706.03762v7").
    pub arxiv_id: String,
    /// Paper title, whitespace-normalized.
    pub title: String,
    /// Author names in feed order.
    pub authors: Vec<String>,
    /// Abstract text, whitespace-normalized.
    pub summary: String,
    /// All `<category term="…">` values from the entry.
    pub categories: Vec<String>,
    /// `arxiv:primary_category`, falling back to the first category when absent.
    pub primary_category: String,
    /// Raw timestamp strings exactly as supplied by the feed (not parsed).
    pub published: String,
    pub updated: String,
    /// Direct PDF link; synthesized from the id when the feed omits it.
    pub pdf_url: String,
    /// Abstract-page link; defaults to the entry `<id>` URL.
    pub abs_url: String,
    // Optional arxiv:* extension fields; omitted from JSON when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub doi: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub comment: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub journal_ref: Option<String>,
}
27
/// Parameters for an ArXiv search request (see `build_search_url`).
#[derive(Debug, Clone)]
pub struct ArxivSearchParams {
    /// Free-text query; empty means "match everything" (`all:*`).
    pub query: String,
    /// Optional category filter ANDed onto the query (e.g. "cs.AI").
    pub category: Option<String>,
    /// Page size requested from the API.
    pub max_results: usize,
    pub sort_by: ArxivSortBy,
    pub sort_order: ArxivSortOrder,
    /// Zero-based offset for pagination.
    pub start: usize,
}
38
39impl Default for ArxivSearchParams {
40 fn default() -> Self {
41 Self {
42 query: String::new(),
43 category: None,
44 max_results: 10,
45 sort_by: ArxivSortBy::Relevance,
46 sort_order: ArxivSortOrder::Descending,
47 start: 0,
48 }
49 }
50}
51
/// Sort key accepted by the ArXiv API `sortBy` parameter.
#[derive(Debug, Clone, Copy)]
pub enum ArxivSortBy {
    Relevance,
    LastUpdatedDate,
    SubmittedDate,
}
59
60impl ArxivSortBy {
61 pub fn as_api_str(&self) -> &str {
62 match self {
63 ArxivSortBy::Relevance => "relevance",
64 ArxivSortBy::LastUpdatedDate => "lastUpdatedDate",
65 ArxivSortBy::SubmittedDate => "submittedDate",
66 }
67 }
68
69 pub fn from_str_loose(s: &str) -> Self {
70 match s.to_lowercase().as_str() {
71 "date" | "submitted" | "submitteddate" => ArxivSortBy::SubmittedDate,
72 "updated" | "lastupdateddate" => ArxivSortBy::LastUpdatedDate,
73 _ => ArxivSortBy::Relevance,
74 }
75 }
76}
77
/// Sort direction accepted by the ArXiv API `sortOrder` parameter.
#[derive(Debug, Clone, Copy)]
pub enum ArxivSortOrder {
    Ascending,
    Descending,
}
84
85impl ArxivSortOrder {
86 pub fn as_api_str(&self) -> &str {
87 match self {
88 ArxivSortOrder::Ascending => "ascending",
89 ArxivSortOrder::Descending => "descending",
90 }
91 }
92}
93
/// One page of search results plus the feed's opensearch pagination counters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArxivSearchResult {
    pub papers: Vec<ArxivPaper>,
    /// Total matches reported by the server (may exceed `papers.len()`).
    pub total_results: usize,
    pub start_index: usize,
    pub items_per_page: usize,
}
102
/// How thoroughly a paper should be analyzed.
#[derive(Debug, Clone, Copy)]
pub enum AnalysisDepth {
    Quick,
    Standard,
    Full,
}
110
111impl AnalysisDepth {
112 pub fn from_str_loose(s: &str) -> Self {
113 match s.to_lowercase().as_str() {
114 "quick" | "brief" => AnalysisDepth::Quick,
115 "full" | "detailed" | "deep" => AnalysisDepth::Full,
116 _ => AnalysisDepth::Standard,
117 }
118 }
119}
120
/// A paper saved to the local library, with user-supplied metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LibraryEntry {
    pub paper: ArxivPaper,
    pub tags: Vec<String>,
    /// Optional collection name; should also appear in `ArxivLibraryState::collections`.
    pub collection: Option<String>,
    pub notes: Option<String>,
    /// Timestamp of when the paper was saved.
    pub saved_at: DateTime<Utc>,
}
130
/// Persisted library state (serialized to JSON — see the roundtrip test).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ArxivLibraryState {
    pub entries: Vec<LibraryEntry>,
    pub collections: Vec<String>,
    // `#[serde(default)]` keeps older serialized states (without these
    // fields) loadable.
    #[serde(default)]
    pub digest_config: Option<DigestConfig>,
    #[serde(default)]
    pub implementations: Vec<ImplementationRecord>,
}
141
/// Configuration for the periodic paper digest feature.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DigestConfig {
    /// Keywords to match against new papers.
    pub keywords: Vec<String>,
    /// ArXiv categories to watch (e.g. "cs.AI").
    pub categories: Vec<String>,
    pub enabled: bool,
}
149
/// Per-language toolchain description used when scaffolding a paper
/// implementation project (see `language_config`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageConfig {
    /// Canonical language name (e.g. "python", "rust").
    pub language: String,
    pub package_manager: String,
    pub test_framework: String,
    /// Source file extension without the leading dot.
    pub file_extension: String,
    /// Commonly-used ML libraries to suggest as dependencies.
    pub common_ml_libraries: Vec<String>,
    /// Shell commands that set up the environment; may be empty.
    pub env_setup_commands: Vec<String>,
    /// Shell command to activate the environment; empty if not applicable.
    pub env_activate: String,
}
163
/// A single file to be written as part of a project scaffold.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScaffoldFile {
    /// Path relative to the project root.
    pub path: String,
    pub content: String,
    /// True when the file belongs to the test suite.
    pub is_test: bool,
}
171
/// Complete description of a generated implementation project.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProjectScaffold {
    /// ArXiv ID of the paper being implemented.
    pub paper_id: String,
    pub project_name: String,
    pub language_config: LanguageConfig,
    /// Directories to create, relative to the project root.
    pub directory_structure: Vec<String>,
    pub files: Vec<ScaffoldFile>,
    pub dependencies: Vec<String>,
    /// Shell commands to run after files are written.
    pub setup_commands: Vec<String>,
    pub test_commands: Vec<String>,
}
184
/// Shape of a generated implementation: a full project or a single notebook.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ImplementationMode {
    StandaloneProject,
    Notebook,
}
191
/// Lifecycle stage of a paper re-implementation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ImplementationStatus {
    Scaffolded,
    DepsInstalled,
    TestsGenerated,
    Implementing,
    TestsPassing,
    Complete,
    /// Terminal failure; carries a human-readable reason.
    Failed(String),
}
203
204impl std::fmt::Display for ImplementationStatus {
205 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
206 match self {
207 Self::Scaffolded => write!(f, "scaffolded"),
208 Self::DepsInstalled => write!(f, "deps_installed"),
209 Self::TestsGenerated => write!(f, "tests_generated"),
210 Self::Implementing => write!(f, "implementing"),
211 Self::TestsPassing => write!(f, "tests_passing"),
212 Self::Complete => write!(f, "complete"),
213 Self::Failed(msg) => write!(f, "failed: {}", msg),
214 }
215 }
216}
217
/// Persistent record of one implementation attempt for a paper.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImplementationRecord {
    pub paper_id: String,
    /// Filesystem path of the generated project.
    pub project_path: String,
    pub language: String,
    pub mode: ImplementationMode,
    pub status: ImplementationStatus,
    pub created_at: DateTime<Utc>,
}
228
229pub fn language_config(lang: &str) -> Option<LanguageConfig> {
234 match lang.to_lowercase().as_str() {
235 "python" | "py" => Some(LanguageConfig {
236 language: "python".to_string(),
237 package_manager: "pip".to_string(),
238 test_framework: "pytest".to_string(),
239 file_extension: "py".to_string(),
240 common_ml_libraries: vec![
241 "numpy".into(),
242 "torch".into(),
243 "tensorflow".into(),
244 "scikit-learn".into(),
245 "matplotlib".into(),
246 "pandas".into(),
247 ],
248 env_setup_commands: vec!["python3 -m venv .venv".to_string()],
249 env_activate: "source .venv/bin/activate".to_string(),
250 }),
251 "rust" | "rs" => Some(LanguageConfig {
252 language: "rust".to_string(),
253 package_manager: "cargo".to_string(),
254 test_framework: "cargo test".to_string(),
255 file_extension: "rs".to_string(),
256 common_ml_libraries: vec![
257 "ndarray".into(),
258 "burn".into(),
259 "candle".into(),
260 "linfa".into(),
261 "plotters".into(),
262 ],
263 env_setup_commands: vec![], env_activate: String::new(),
265 }),
266 "typescript" | "ts" | "javascript" | "js" => Some(LanguageConfig {
267 language: "typescript".to_string(),
268 package_manager: "npm".to_string(),
269 test_framework: "jest".to_string(),
270 file_extension: "ts".to_string(),
271 common_ml_libraries: vec![
272 "@tensorflow/tfjs".into(),
273 "onnxruntime-node".into(),
274 "mathjs".into(),
275 "chart.js".into(),
276 ],
277 env_setup_commands: vec!["npm init -y".to_string()],
278 env_activate: String::new(), }),
280 "go" | "golang" => Some(LanguageConfig {
281 language: "go".to_string(),
282 package_manager: "go mod".to_string(),
283 test_framework: "go test".to_string(),
284 file_extension: "go".to_string(),
285 common_ml_libraries: vec!["gonum.org/v1/gonum".into(), "gorgonia.org/gorgonia".into()],
286 env_setup_commands: vec!["go mod init paper_impl".to_string()],
287 env_activate: String::new(), }),
289 "cpp" | "c++" => Some(LanguageConfig {
290 language: "cpp".to_string(),
291 package_manager: "cmake".to_string(),
292 test_framework: "ctest".to_string(),
293 file_extension: "cpp".to_string(),
294 common_ml_libraries: vec!["Eigen".into(), "libtorch".into(), "xtensor".into()],
295 env_setup_commands: vec!["mkdir -p build".to_string()],
296 env_activate: String::new(),
297 }),
298 "julia" | "jl" => Some(LanguageConfig {
299 language: "julia".to_string(),
300 package_manager: "Pkg".to_string(),
301 test_framework: "Test".to_string(),
302 file_extension: "jl".to_string(),
303 common_ml_libraries: vec![
304 "Flux".into(),
305 "MLJ".into(),
306 "Plots".into(),
307 "DataFrames".into(),
308 ],
309 env_setup_commands: vec![], env_activate: String::new(),
311 }),
312 _ => None,
313 }
314}
315
/// Base endpoint of the ArXiv Atom query API.
const ARXIV_API_BASE: &str = "https://export.arxiv.org/api/query";
/// User-Agent sent with every request so arXiv can identify this client.
const USER_AGENT: &str = "Rustant/1.0 (https://github.com/rustant)";
320
/// HTTP client for the ArXiv API with built-in request spacing.
pub struct ArxivClient {
    client: reqwest::Client,
    // Instant of the most recent request; `rate_limit` uses it to keep
    // consecutive calls at least 3 seconds apart.
    last_request: std::sync::Mutex<Option<std::time::Instant>>,
}
326
327impl ArxivClient {
328 pub fn new() -> Result<Self, String> {
329 let client = reqwest::Client::builder()
330 .timeout(Duration::from_secs(30))
331 .connect_timeout(Duration::from_secs(10))
332 .user_agent(USER_AGENT)
333 .build()
334 .map_err(|e| format!("Failed to create HTTP client: {}", e))?;
335 Ok(Self {
336 client,
337 last_request: std::sync::Mutex::new(None),
338 })
339 }
340
341 async fn rate_limit(&self) {
343 let wait_duration = {
344 let last = self.last_request.lock().unwrap();
345 if let Some(instant) = *last {
346 let elapsed = instant.elapsed();
347 if elapsed < Duration::from_secs(3) {
348 Some(Duration::from_secs(3) - elapsed)
349 } else {
350 None
351 }
352 } else {
353 None
354 }
355 }; if let Some(wait) = wait_duration {
358 tokio::time::sleep(wait).await;
359 }
360
361 let mut last = self.last_request.lock().unwrap();
362 *last = Some(std::time::Instant::now());
363 }
364
365 pub async fn search(&self, params: &ArxivSearchParams) -> Result<ArxivSearchResult, String> {
367 self.rate_limit().await;
368 let url = build_search_url(params);
369 tracing::debug!("ArXiv search URL: {}", url);
370
371 let response = self
372 .client
373 .get(&url)
374 .send()
375 .await
376 .map_err(|e| format!("ArXiv API request failed: {}", e))?;
377
378 let status = response.status();
379 if !status.is_success() {
380 return Err(format!("ArXiv API returned status {}", status));
381 }
382
383 let body = response
384 .text()
385 .await
386 .map_err(|e| format!("Failed to read ArXiv response: {}", e))?;
387
388 parse_atom_response(&body)
389 }
390
391 pub async fn fetch_paper(&self, arxiv_id: &str) -> Result<ArxivPaper, String> {
393 self.rate_limit().await;
394 let clean_id = arxiv_id.trim();
395 validate_arxiv_id(clean_id)?;
396
397 let url = format!(
398 "{}?id_list={}",
399 ARXIV_API_BASE,
400 urlencoding::encode(clean_id)
401 );
402 tracing::debug!("ArXiv fetch URL: {}", url);
403
404 let response = self
405 .client
406 .get(&url)
407 .send()
408 .await
409 .map_err(|e| format!("ArXiv API request failed: {}", e))?;
410
411 let body = response
412 .text()
413 .await
414 .map_err(|e| format!("Failed to read ArXiv response: {}", e))?;
415
416 let result = parse_atom_response(&body)?;
417 result
418 .papers
419 .into_iter()
420 .next()
421 .ok_or_else(|| format!("Paper '{}' not found on ArXiv", clean_id))
422 }
423}
424
425pub fn build_search_url(params: &ArxivSearchParams) -> String {
429 let mut search_query = if params.query.is_empty() {
430 "all:*".to_string()
431 } else {
432 format!("all:{}", params.query)
433 };
434
435 if let Some(cat) = ¶ms.category {
436 search_query = format!("{} AND cat:{}", search_query, cat);
437 }
438
439 format!(
440 "{}?search_query={}&start={}&max_results={}&sortBy={}&sortOrder={}",
441 ARXIV_API_BASE,
442 urlencoding::encode(&search_query),
443 params.start,
444 params.max_results,
445 params.sort_by.as_api_str(),
446 params.sort_order.as_api_str(),
447 )
448}
449
450pub fn parse_atom_response(xml: &str) -> Result<ArxivSearchResult, String> {
454 let total_results = extract_opensearch_value(xml, "totalResults").unwrap_or(0);
455 let start_index = extract_opensearch_value(xml, "startIndex").unwrap_or(0);
456 let items_per_page = extract_opensearch_value(xml, "itemsPerPage").unwrap_or(0);
457
458 let entries = extract_entries(xml);
459 let mut papers = Vec::new();
460
461 for entry_xml in &entries {
462 if let Some(paper) = parse_entry(entry_xml) {
463 papers.push(paper);
464 }
465 }
466
467 Ok(ArxivSearchResult {
468 papers,
469 total_results,
470 start_index,
471 items_per_page,
472 })
473}
474
/// Splits a feed into its raw `<entry>…</entry>` substrings.
/// An unterminated final entry is dropped.
fn extract_entries(xml: &str) -> Vec<String> {
    const OPEN: &str = "<entry>";
    const CLOSE: &str = "</entry>";

    let mut out = Vec::new();
    let mut cursor = 0usize;
    while let Some(rel_open) = xml[cursor..].find(OPEN) {
        let begin = cursor + rel_open;
        let Some(rel_close) = xml[begin..].find(CLOSE) else {
            break;
        };
        let finish = begin + rel_close + CLOSE.len();
        out.push(xml[begin..finish].to_string());
        cursor = finish;
    }
    out
}
500
501fn parse_entry(entry: &str) -> Option<ArxivPaper> {
503 let id_url = extract_tag_text(entry, "id")?;
504 let arxiv_id = extract_arxiv_id_from_url(&id_url);
505 let title = normalize_whitespace(&extract_tag_text(entry, "title")?);
506
507 let mut authors = Vec::new();
509 let mut author_search = 0;
510 while let Some(pos) = entry[author_search..].find("<author>") {
511 let author_start = author_search + pos;
512 let Some(end_pos) = entry[author_start..].find("</author>") else {
513 break;
514 };
515 let author_end = author_start + end_pos + "</author>".len();
516 let author_block = &entry[author_start..author_end];
517 if let Some(name) = extract_tag_text(author_block, "name") {
518 authors.push(name);
519 }
520 author_search = author_end;
521 }
522
523 let summary = normalize_whitespace(&extract_tag_text(entry, "summary").unwrap_or_default());
524 let published = extract_tag_text(entry, "published").unwrap_or_default();
525 let updated = extract_tag_text(entry, "updated").unwrap_or_default();
526
527 let mut categories = Vec::new();
529 let mut primary_category = String::new();
530 let mut cat_search = 0;
531 while let Some(pos) = entry[cat_search..].find("<category") {
532 let cat_start = cat_search + pos;
533 let cat_end = if let Some(end_pos) = entry[cat_start..].find("/>") {
534 cat_start + end_pos + 2
535 } else if let Some(end_pos) = entry[cat_start..].find('>') {
536 cat_start + end_pos + 1
537 } else {
538 break;
539 };
540 let cat_tag = &entry[cat_start..cat_end];
541 if let Some(term) = extract_attribute(cat_tag, "term") {
542 categories.push(term);
543 }
544 cat_search = cat_end;
545 }
546
547 if let Some(pc_start) = entry.find("primary_category")
549 && let Some(pc_end) = entry[pc_start..]
550 .find("/>")
551 .or_else(|| entry[pc_start..].find(">"))
552 {
553 let pc_tag = &entry[pc_start..pc_start + pc_end + 2];
554 if let Some(term) = extract_attribute(pc_tag, "term") {
555 primary_category = term;
556 }
557 }
558 if primary_category.is_empty() {
559 primary_category = categories.first().cloned().unwrap_or_default();
560 }
561
562 let mut pdf_url = String::new();
564 let mut abs_url = id_url.clone();
565 let mut link_search = 0;
566 while let Some(pos) = entry[link_search..].find("<link") {
567 let link_start = link_search + pos;
568 let Some(end_pos) = entry[link_start..]
569 .find("/>")
570 .or_else(|| entry[link_start..].find('>'))
571 else {
572 break;
573 };
574 let link_end = link_start + end_pos + 2;
575 let link_tag = &entry[link_start..link_end];
576 let href = extract_attribute(link_tag, "href").unwrap_or_default();
577 let title_attr = extract_attribute(link_tag, "title").unwrap_or_default();
578 let link_type = extract_attribute(link_tag, "type").unwrap_or_default();
579
580 if title_attr == "pdf" || link_type == "application/pdf" {
581 pdf_url = href;
582 } else if link_type.is_empty() && href.contains("/abs/") {
583 abs_url = href;
584 }
585 link_search = link_end;
586 }
587
588 if pdf_url.is_empty() {
589 pdf_url = format!("https://arxiv.org/pdf/{}", arxiv_id);
590 }
591
592 let doi = extract_tag_text_ns(entry, "arxiv:doi");
593 let comment = extract_tag_text_ns(entry, "arxiv:comment").map(|c| normalize_whitespace(&c));
594 let journal_ref = extract_tag_text_ns(entry, "arxiv:journal_ref");
595
596 Some(ArxivPaper {
597 arxiv_id,
598 title,
599 authors,
600 summary,
601 categories,
602 primary_category,
603 published,
604 updated,
605 pdf_url,
606 abs_url,
607 doi,
608 comment,
609 journal_ref,
610 })
611}
612
/// Returns the trimmed text between the first `<tag …>` and the following
/// `</tag>`, or `None` if either is missing. Attributes on the opening tag
/// are skipped; this is a best-effort scanner, not a full XML parser.
fn extract_tag_text(xml: &str, tag: &str) -> Option<String> {
    let open_prefix = format!("<{}", tag);
    let close_tag = format!("</{}>", tag);

    let rest = &xml[xml.find(&open_prefix)?..];
    let body = &rest[rest.find('>')? + 1..];
    let end = body.find(&close_tag)?;
    Some(body[..end].trim().to_string())
}
625
/// Extracts text from a namespaced tag (e.g. "arxiv:doi"). The colon is just
/// part of the tag string, so plain `extract_tag_text` handles it.
fn extract_tag_text_ns(xml: &str, tag: &str) -> Option<String> {
    extract_tag_text(xml, tag)
}
630
/// Returns the value of `attr="…"` inside a single tag string, or `None`
/// when the attribute (or its closing quote) is absent.
pub fn extract_attribute(tag: &str, attr: &str) -> Option<String> {
    let needle = format!("{}=\"", attr);
    let value_start = tag.find(&needle)? + needle.len();
    let rest = &tag[value_start..];
    rest.find('"').map(|end| rest[..end].to_string())
}
638
/// Pulls the ArXiv ID out of an abs/pdf URL; anything unrecognized is
/// returned unchanged (so a bare ID passes through).
pub fn extract_arxiv_id_from_url(url: &str) -> String {
    if let Some(idx) = url.rfind("/abs/") {
        return url[idx + "/abs/".len()..].to_string();
    }
    match url.rfind("/pdf/") {
        Some(idx) => url[idx + "/pdf/".len()..].trim_end_matches(".pdf").to_string(),
        None => url.to_string(),
    }
}
650
/// Collapses all runs of whitespace (including newlines) into single spaces
/// and trims the ends.
pub fn normalize_whitespace(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for (i, word) in s.split_whitespace().enumerate() {
        if i > 0 {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
655
656fn extract_opensearch_value(xml: &str, field: &str) -> Option<usize> {
658 let tag = format!("opensearch:{}", field);
659 extract_tag_text(xml, &tag).and_then(|s| s.trim().parse().ok())
660}
661
662pub fn validate_arxiv_id(id: &str) -> Result<(), String> {
667 let id = id.trim();
668 if id.is_empty() {
669 return Err("ArXiv ID cannot be empty".to_string());
670 }
671
672 let new_format = regex_lite_match_arxiv_new(id);
674 let old_format = regex_lite_match_arxiv_old(id);
676
677 if new_format || old_format {
678 Ok(())
679 } else {
680 Err(format!(
681 "Invalid ArXiv ID '{}'. Expected format: YYMM.NNNNN (e.g., 2301.12345) or category/NNNNNNN (e.g., hep-th/9901001)",
682 id
683 ))
684 }
685}
686
/// Hand-rolled check for a new-style ID: 4-digit YYMM, a dot, 1–5 digits,
/// and an optional all-digit `vN` version suffix.
fn regex_lite_match_arxiv_new(id: &str) -> bool {
    // Everything before the first 'v' is the base identifier.
    let base = id.split('v').next().unwrap_or(id);
    let Some((yymm, number)) = base.split_once('.') else {
        return false;
    };

    let all_digits = |part: &str| part.bytes().all(|b| b.is_ascii_digit());
    if yymm.len() != 4 || !all_digits(yymm) {
        return false;
    }
    if number.is_empty() || number.len() > 5 || !all_digits(number) {
        return false;
    }

    // A version suffix, if present, must be one or more digits.
    match id.find('v') {
        Some(v_pos) => {
            let version = &id[v_pos + 1..];
            !version.is_empty() && all_digits(version)
        }
        None => true,
    }
}
714
/// Hand-rolled check for an old-style ID: `archive[.subject]/NNNNNNN[vN]`.
///
/// Fix: old-style identifiers may carry a dotted subject class in the archive
/// part (e.g. "math.GT/0309136"); the previous check rejected '.' and thus
/// rejected those valid IDs.
fn regex_lite_match_arxiv_old(id: &str) -> bool {
    let parts: Vec<&str> = id.splitn(2, '/').collect();
    if parts.len() != 2 {
        return false;
    }
    let category = parts[0];
    // Strip an optional version suffix from the numeric part.
    let number = parts[1].split('v').next().unwrap_or(parts[1]);

    if category.is_empty()
        || !category
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '.')
    {
        return false;
    }
    if number.is_empty() || !number.chars().all(|c| c.is_ascii_digit()) {
        return false;
    }

    true
}
739
740pub fn generate_bibtex(paper: &ArxivPaper) -> String {
744 let cite_key = generate_cite_key(paper);
745 let authors_bibtex = paper.authors.join(" and ");
746 let title_escaped = escape_bibtex(&paper.title);
747 let year = extract_year(&paper.published);
748
749 let mut entry = format!(
750 "@article{{{},\n title = {{{}}},\n author = {{{}}},\n year = {{{}}},\n eprint = {{{}}},\n archivePrefix = {{arXiv}},\n primaryClass = {{{}}}",
751 cite_key, title_escaped, authors_bibtex, year, paper.arxiv_id, paper.primary_category,
752 );
753
754 if let Some(doi) = &paper.doi {
755 entry.push_str(&format!(",\n doi = {{{}}}", doi));
756 }
757 if let Some(journal) = &paper.journal_ref {
758 entry.push_str(&format!(",\n journal = {{{}}}", escape_bibtex(journal)));
759 }
760
761 entry.push_str("\n}");
762 entry
763}
764
765fn generate_cite_key(paper: &ArxivPaper) -> String {
767 let first_author = paper
768 .authors
769 .first()
770 .map(|a| {
771 a.split_whitespace()
772 .last()
773 .unwrap_or(a)
774 .to_lowercase()
775 .chars()
776 .filter(|c| c.is_ascii_alphanumeric())
777 .collect::<String>()
778 })
779 .unwrap_or_else(|| "unknown".to_string());
780
781 let year = extract_year(&paper.published);
782
783 let title_word = paper
784 .title
785 .split_whitespace()
786 .find(|w| w.len() > 3 && w.chars().next().is_some_and(|c| c.is_alphabetic()))
787 .unwrap_or("paper")
788 .to_lowercase()
789 .chars()
790 .filter(|c| c.is_ascii_alphanumeric())
791 .collect::<String>();
792
793 format!("{}{}{}", first_author, year, title_word)
794}
795
/// Escapes LaTeX-special characters for use inside a BibTeX field.
/// Single pass over the input; each source character maps independently, so
/// replacement text is never re-escaped.
fn escape_bibtex(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => out.push_str("\\&"),
            '%' => out.push_str("\\%"),
            '$' => out.push_str("\\$"),
            '#' => out.push_str("\\#"),
            '_' => out.push_str("\\_"),
            '{' => out.push_str("\\{"),
            '}' => out.push_str("\\}"),
            '~' => out.push_str("\\textasciitilde{}"),
            '^' => out.push_str("\\textasciicircum{}"),
            other => out.push(other),
        }
    }
    out
}
808
/// Extracts the leading year from an ISO-8601 date string ("2017-06-12…" →
/// "2017").
///
/// Fix: `split('-').next()` on an empty string yields `Some("")`, so the
/// "0000" fallback was unreachable and an empty date produced an empty year;
/// filter out the empty segment so the fallback actually applies.
fn extract_year(date_str: &str) -> String {
    date_str
        .split('-')
        .next()
        .filter(|y| !y.is_empty())
        .unwrap_or("0000")
        .to_string()
}
813
#[cfg(test)]
mod tests {
    use super::*;

    // Realistic Atom <entry> fixture ("Attention Is All You Need") including
    // the optional arxiv:* extension elements and both link kinds.
    const SAMPLE_ENTRY: &str = r#"<entry>
    <id>http://arxiv.org/abs/1706.03762v7</id>
    <updated>2023-08-02T01:09:28Z</updated>
    <published>2017-06-12T17:57:34Z</published>
    <title>Attention Is All You Need</title>
    <summary> The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. </summary>
    <author><name>Ashish Vaswani</name></author>
    <author><name>Noam Shazeer</name></author>
    <author><name>Niki Parmar</name></author>
    <arxiv:doi xmlns:arxiv="http://arxiv.org/schemas/atom">10.1234/nips.2017</arxiv:doi>
    <arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">15 pages, 5 figures</arxiv:comment>
    <arxiv:journal_ref xmlns:arxiv="http://arxiv.org/schemas/atom">NeurIPS 2017</arxiv:journal_ref>
    <link href="http://arxiv.org/abs/1706.03762v7" rel="alternate" type="text/html"/>
    <link href="http://arxiv.org/pdf/1706.03762v7" title="pdf" type="application/pdf"/>
    <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="cs.CL" scheme="http://arxiv.org/schemas/atom"/>
    <category term="cs.CL" scheme="http://arxiv.org/schemas/atom"/>
    <category term="cs.AI" scheme="http://arxiv.org/schemas/atom"/>
</entry>"#;

    // A three-entry feed with opensearch pagination counters.
    const SAMPLE_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom"
      xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/"
      xmlns:arxiv="http://arxiv.org/schemas/atom">
  <title>ArXiv Query</title>
  <opensearch:totalResults>100</opensearch:totalResults>
  <opensearch:startIndex>0</opensearch:startIndex>
  <opensearch:itemsPerPage>3</opensearch:itemsPerPage>
  <entry>
    <id>http://arxiv.org/abs/1706.03762v7</id>
    <updated>2023-08-02T01:09:28Z</updated>
    <published>2017-06-12T17:57:34Z</published>
    <title>Attention Is All You Need</title>
    <summary>The dominant sequence transduction models.</summary>
    <author><name>Ashish Vaswani</name></author>
    <link href="http://arxiv.org/abs/1706.03762v7" rel="alternate" type="text/html"/>
    <link href="http://arxiv.org/pdf/1706.03762v7" title="pdf" type="application/pdf"/>
    <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="cs.CL"/>
    <category term="cs.CL"/>
  </entry>
  <entry>
    <id>http://arxiv.org/abs/1810.04805v2</id>
    <updated>2019-05-24T12:00:00Z</updated>
    <published>2018-10-11T00:00:00Z</published>
    <title>BERT: Pre-training of Deep Bidirectional Transformers</title>
    <summary>We introduce a new language representation model.</summary>
    <author><name>Jacob Devlin</name></author>
    <link href="http://arxiv.org/pdf/1810.04805v2" title="pdf" type="application/pdf"/>
    <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="cs.CL"/>
    <category term="cs.CL"/>
  </entry>
  <entry>
    <id>http://arxiv.org/abs/2005.14165v4</id>
    <updated>2020-07-22T00:00:00Z</updated>
    <published>2020-05-28T00:00:00Z</published>
    <title>Language Models are Few-Shot Learners</title>
    <summary>Recent work demonstrates substantial gains.</summary>
    <author><name>Tom Brown</name></author>
    <link href="http://arxiv.org/pdf/2005.14165v4" title="pdf" type="application/pdf"/>
    <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="cs.CL"/>
    <category term="cs.CL"/>
  </entry>
</feed>"#;

    // --- Atom parsing ---

    #[test]
    fn test_parse_single_entry() {
        let feed = format!(
            r#"<feed><opensearch:totalResults>1</opensearch:totalResults>
<opensearch:startIndex>0</opensearch:startIndex>
<opensearch:itemsPerPage>1</opensearch:itemsPerPage>{}</feed>"#,
            SAMPLE_ENTRY
        );
        let result = parse_atom_response(&feed).unwrap();
        assert_eq!(result.papers.len(), 1);
        let paper = &result.papers[0];
        assert_eq!(paper.arxiv_id, "1706.03762v7");
        assert_eq!(paper.title, "Attention Is All You Need");
        assert_eq!(paper.authors.len(), 3);
        assert_eq!(paper.authors[0], "Ashish Vaswani");
    }

    #[test]
    fn test_parse_multiple_entries() {
        let result = parse_atom_response(SAMPLE_FEED).unwrap();
        assert_eq!(result.papers.len(), 3);
        assert_eq!(result.total_results, 100);
        assert_eq!(result.start_index, 0);
        assert_eq!(result.items_per_page, 3);
    }

    #[test]
    fn test_parse_empty_results() {
        let feed = r#"<feed>
<opensearch:totalResults>0</opensearch:totalResults>
<opensearch:startIndex>0</opensearch:startIndex>
<opensearch:itemsPerPage>10</opensearch:itemsPerPage>
</feed>"#;
        let result = parse_atom_response(feed).unwrap();
        assert_eq!(result.papers.len(), 0);
        assert_eq!(result.total_results, 0);
    }

    #[test]
    fn test_parse_entry_all_fields() {
        let feed = format!(
            r#"<feed><opensearch:totalResults>1</opensearch:totalResults>
<opensearch:startIndex>0</opensearch:startIndex>
<opensearch:itemsPerPage>1</opensearch:itemsPerPage>{}</feed>"#,
            SAMPLE_ENTRY
        );
        let result = parse_atom_response(&feed).unwrap();
        let paper = &result.papers[0];
        assert_eq!(paper.doi.as_deref(), Some("10.1234/nips.2017"));
        assert_eq!(paper.comment.as_deref(), Some("15 pages, 5 figures"));
        assert_eq!(paper.journal_ref.as_deref(), Some("NeurIPS 2017"));
        assert_eq!(paper.primary_category, "cs.CL");
        assert!(paper.categories.contains(&"cs.CL".to_string()));
        assert!(paper.categories.contains(&"cs.AI".to_string()));
        assert!(paper.pdf_url.contains("1706.03762"));
    }

    #[test]
    fn test_parse_entry_missing_optionals() {
        // Entries without the arxiv:* extension tags must still parse.
        let entry = r#"<feed>
<opensearch:totalResults>1</opensearch:totalResults>
<opensearch:startIndex>0</opensearch:startIndex>
<opensearch:itemsPerPage>1</opensearch:itemsPerPage>
<entry>
    <id>http://arxiv.org/abs/2301.12345v1</id>
    <published>2023-01-15T00:00:00Z</published>
    <updated>2023-01-15T00:00:00Z</updated>
    <title>A Simple Paper</title>
    <summary>A summary.</summary>
    <author><name>John Doe</name></author>
    <category term="cs.AI"/>
</entry>
</feed>"#;
        let result = parse_atom_response(entry).unwrap();
        let paper = &result.papers[0];
        assert!(paper.doi.is_none());
        assert!(paper.comment.is_none());
        assert!(paper.journal_ref.is_none());
    }

    // --- Small string helpers ---

    #[test]
    fn test_extract_arxiv_id_from_url() {
        assert_eq!(
            extract_arxiv_id_from_url("http://arxiv.org/abs/1706.03762v7"),
            "1706.03762v7"
        );
        assert_eq!(
            extract_arxiv_id_from_url("http://arxiv.org/pdf/2301.12345"),
            "2301.12345"
        );
        assert_eq!(extract_arxiv_id_from_url("2301.12345"), "2301.12345");
    }

    #[test]
    fn test_normalize_whitespace() {
        assert_eq!(
            normalize_whitespace("  Hello   World\n  Test  "),
            "Hello World Test"
        );
        assert_eq!(normalize_whitespace("single"), "single");
    }

    // --- URL construction ---

    #[test]
    fn test_build_search_url_basic() {
        let params = ArxivSearchParams {
            query: "transformer attention".to_string(),
            ..Default::default()
        };
        let url = build_search_url(&params);
        assert!(url.starts_with(ARXIV_API_BASE));
        assert!(url.contains("transformer"));
        assert!(url.contains("attention"));
        assert!(url.contains("max_results=10"));
    }

    #[test]
    fn test_build_search_url_with_category() {
        let params = ArxivSearchParams {
            query: "attention".to_string(),
            category: Some("cs.AI".to_string()),
            ..Default::default()
        };
        let url = build_search_url(&params);
        // The category may appear URL-encoded or raw.
        assert!(url.contains("cat%3Acs.AI") || url.contains("cat:cs.AI"));
    }

    #[test]
    fn test_build_search_url_with_sort() {
        let params = ArxivSearchParams {
            query: "test".to_string(),
            sort_by: ArxivSortBy::SubmittedDate,
            sort_order: ArxivSortOrder::Descending,
            ..Default::default()
        };
        let url = build_search_url(&params);
        assert!(url.contains("sortBy=submittedDate"));
        assert!(url.contains("sortOrder=descending"));
    }

    // --- BibTeX generation ---

    #[test]
    fn test_generate_bibtex() {
        let paper = ArxivPaper {
            arxiv_id: "1706.03762v7".to_string(),
            title: "Attention Is All You Need".to_string(),
            authors: vec!["Ashish Vaswani".to_string(), "Noam Shazeer".to_string()],
            summary: "A summary.".to_string(),
            categories: vec!["cs.CL".to_string()],
            primary_category: "cs.CL".to_string(),
            published: "2017-06-12T17:57:34Z".to_string(),
            updated: "2023-08-02T01:09:28Z".to_string(),
            pdf_url: "http://arxiv.org/pdf/1706.03762v7".to_string(),
            abs_url: "http://arxiv.org/abs/1706.03762v7".to_string(),
            doi: None,
            comment: None,
            journal_ref: None,
        };
        let bib = generate_bibtex(&paper);
        assert!(bib.starts_with("@article{"));
        assert!(bib.contains("Attention Is All You Need"));
        assert!(bib.contains("Ashish Vaswani and Noam Shazeer"));
        assert!(bib.contains("2017"));
        assert!(bib.contains("1706.03762v7"));
        assert!(bib.contains("cs.CL"));
        assert!(bib.ends_with('}'));
    }

    #[test]
    fn test_generate_bibtex_special_chars() {
        let paper = ArxivPaper {
            arxiv_id: "2301.00001".to_string(),
            title: "A & B: 50% Better $Models$ with #Tags".to_string(),
            authors: vec!["Jane Smith".to_string()],
            summary: String::new(),
            categories: vec!["cs.AI".to_string()],
            primary_category: "cs.AI".to_string(),
            published: "2023-01-01T00:00:00Z".to_string(),
            updated: "2023-01-01T00:00:00Z".to_string(),
            pdf_url: String::new(),
            abs_url: String::new(),
            doi: None,
            comment: None,
            journal_ref: None,
        };
        let bib = generate_bibtex(&paper);
        assert!(bib.contains("\\&"));
        assert!(bib.contains("\\%"));
        assert!(bib.contains("\\$"));
        assert!(bib.contains("\\#"));
    }

    // --- Library state serialization ---

    #[test]
    fn test_library_state_roundtrip() {
        let state = ArxivLibraryState {
            entries: vec![LibraryEntry {
                paper: ArxivPaper {
                    arxiv_id: "2301.12345".to_string(),
                    title: "Test Paper".to_string(),
                    authors: vec!["Author One".to_string()],
                    summary: "A test.".to_string(),
                    categories: vec!["cs.AI".to_string()],
                    primary_category: "cs.AI".to_string(),
                    published: "2023-01-15T00:00:00Z".to_string(),
                    updated: "2023-01-15T00:00:00Z".to_string(),
                    pdf_url: "https://arxiv.org/pdf/2301.12345".to_string(),
                    abs_url: "https://arxiv.org/abs/2301.12345".to_string(),
                    doi: None,
                    comment: None,
                    journal_ref: None,
                },
                tags: vec!["ml".to_string(), "test".to_string()],
                collection: Some("Favorites".to_string()),
                notes: Some("Great paper".to_string()),
                saved_at: Utc::now(),
            }],
            collections: vec!["Favorites".to_string()],
            digest_config: Some(DigestConfig {
                keywords: vec!["transformer".to_string()],
                categories: vec!["cs.AI".to_string()],
                enabled: true,
            }),
            implementations: Vec::new(),
        };

        let json = serde_json::to_string_pretty(&state).unwrap();
        let restored: ArxivLibraryState = serde_json::from_str(&json).unwrap();
        assert_eq!(restored.entries.len(), 1);
        assert_eq!(restored.entries[0].paper.arxiv_id, "2301.12345");
        assert_eq!(restored.collections, vec!["Favorites"]);
        assert!(restored.digest_config.unwrap().enabled);
    }

    // --- ID validation ---

    #[test]
    fn test_validate_arxiv_id_new_format() {
        assert!(validate_arxiv_id("2301.12345").is_ok());
        assert!(validate_arxiv_id("2301.12345v2").is_ok());
        assert!(validate_arxiv_id("1706.03762").is_ok());
        assert!(validate_arxiv_id("1706.03762v7").is_ok());
    }

    #[test]
    fn test_validate_arxiv_id_old_format() {
        assert!(validate_arxiv_id("hep-th/9901001").is_ok());
        assert!(validate_arxiv_id("math/0211159").is_ok());
        assert!(validate_arxiv_id("cs/0112017").is_ok());
    }

    #[test]
    fn test_validate_arxiv_id_invalid() {
        assert!(validate_arxiv_id("not-an-id").is_err());
        assert!(validate_arxiv_id("").is_err());
        assert!(validate_arxiv_id("abc").is_err());
        assert!(validate_arxiv_id("12345").is_err());
    }

    // --- Live-network tests (opt-in via `cargo test -- --ignored`) ---

    #[tokio::test]
    #[ignore]
    async fn test_real_search() {
        let client = ArxivClient::new().unwrap();
        let params = ArxivSearchParams {
            query: "attention is all you need".to_string(),
            max_results: 3,
            ..Default::default()
        };
        let result = client.search(&params).await.unwrap();
        assert!(!result.papers.is_empty());
    }

    #[tokio::test]
    #[ignore]
    async fn test_real_fetch_attention_paper() {
        let client = ArxivClient::new().unwrap();
        let paper = client.fetch_paper("1706.03762").await.unwrap();
        assert!(paper.title.contains("Attention"));
        assert!(!paper.authors.is_empty());
    }
}