Skip to main content

batuta/oracle/coursera/
arxiv_db.rs

1//! Curated arXiv database for citation matching
2//!
3//! Contains ~120 curated entries across ML, NLP, systems, DevOps, and cloud topics.
4//! URLs are deterministic `https://arxiv.org/abs/{id}` — stable, no 404 risk.
5
6use super::types::ArxivCitation;
7
8/// Curated arXiv database compiled into the binary.
9pub struct ArxivDatabase {
10    entries: Vec<ArxivCitation>,
11}
12
13impl ArxivDatabase {
14    /// Load the built-in curated database.
15    pub fn builtin() -> Self {
16        Self { entries: builtin_entries() }
17    }
18
19    /// Find citations by topic keyword (single topic, case-insensitive).
20    pub fn find_by_topic(&self, topic: &str, limit: usize) -> Vec<ArxivCitation> {
21        let topic_lower = topic.to_lowercase();
22        let mut results: Vec<_> = self
23            .entries
24            .iter()
25            .filter(|e| {
26                e.topics.iter().any(|t| t.to_lowercase().contains(&topic_lower))
27                    || e.title.to_lowercase().contains(&topic_lower)
28            })
29            .cloned()
30            .collect();
31        results.truncate(limit);
32        results
33    }
34
35    /// Find citations by multiple keywords using Jaccard scoring.
36    pub fn find_by_keywords(&self, keywords: &[&str], limit: usize) -> Vec<ArxivCitation> {
37        let kw_lower: Vec<String> = keywords.iter().map(|k| k.to_lowercase()).collect();
38        let kw_count = kw_lower.len() as f64;
39        if kw_count == 0.0 {
40            return Vec::new();
41        }
42
43        let mut scored: Vec<(f64, &ArxivCitation)> = self
44            .entries
45            .iter()
46            .map(|entry| {
47                let entry_topics: Vec<String> =
48                    entry.topics.iter().map(|t| t.to_lowercase()).collect();
49                let title_lower = entry.title.to_lowercase();
50
51                let matches = kw_lower
52                    .iter()
53                    .filter(|kw| {
54                        entry_topics.iter().any(|t| t.contains(kw.as_str()))
55                            || title_lower.contains(kw.as_str())
56                    })
57                    .count() as f64;
58
59                let union = kw_count + entry.topics.len() as f64 - matches;
60                let score = if union > 0.0 { matches / union } else { 0.0 };
61                (score, entry)
62            })
63            .filter(|(score, _)| *score > 0.0)
64            .collect();
65
66        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
67        scored.into_iter().take(limit).map(|(_, e)| e.clone()).collect()
68    }
69
70    /// Get total number of entries.
71    pub fn len(&self) -> usize {
72        self.entries.len()
73    }
74
75    /// Check if database is empty.
76    pub fn is_empty(&self) -> bool {
77        self.entries.is_empty()
78    }
79}
80
81fn cite(
82    id: &str,
83    title: &str,
84    authors: &str,
85    year: u16,
86    snippet: &str,
87    topics: &[&str],
88) -> ArxivCitation {
89    ArxivCitation {
90        arxiv_id: id.to_string(),
91        title: title.to_string(),
92        authors: authors.to_string(),
93        year,
94        url: format!("https://arxiv.org/abs/{id}"),
95        abstract_snippet: snippet.to_string(),
96        topics: topics.iter().map(|s| (*s).to_string()).collect(),
97    }
98}
99
100fn builtin_entries() -> Vec<ArxivCitation> {
101    include!("arxiv_entries.rs")
102}
103
#[cfg(test)]
mod tests {
    use super::*;

    /// Collect the arXiv ids of a result list for order-sensitive comparison.
    fn ids(results: &[ArxivCitation]) -> Vec<&str> {
        results.iter().map(|r| r.arxiv_id.as_str()).collect()
    }

    #[test]
    fn test_builtin_database_size() {
        let db = ArxivDatabase::builtin();
        assert!(db.len() >= 100, "Expected at least 100 entries, got {}", db.len());
        assert!(!db.is_empty());
    }

    #[test]
    fn test_find_by_topic() {
        let db = ArxivDatabase::builtin();
        let results = db.find_by_topic("transformer", 5);
        assert!(!results.is_empty());
        assert!(results.len() <= 5);
        // Should find "Attention Is All You Need"
        assert!(results.iter().any(|r| r.arxiv_id == "1706.03762"));
    }

    #[test]
    fn test_find_by_topic_case_insensitive() {
        let db = ArxivDatabase::builtin();
        let lower = db.find_by_topic("rag", 10);
        let upper = db.find_by_topic("RAG", 10);
        // Compare the actual entries, not just the counts: two different
        // result sets of equal size must not pass.
        assert_eq!(ids(&lower), ids(&upper));
    }

    #[test]
    fn test_find_by_keywords_jaccard() {
        let db = ArxivDatabase::builtin();
        let keywords = ["mlops", "pipeline", "ci/cd"];
        let results = db.find_by_keywords(&keywords, 5);
        assert!(!results.is_empty());
        assert!(results.len() <= 5);
        // Every returned entry must match at least one keyword in its topics or title.
        for r in &results {
            let title = r.title.to_lowercase();
            let hit = keywords.iter().any(|kw| {
                title.contains(kw) || r.topics.iter().any(|t| t.to_lowercase().contains(kw))
            });
            assert!(hit, "Entry {} matches none of the keywords", r.arxiv_id);
        }
    }

    #[test]
    fn test_find_by_keywords_empty() {
        let db = ArxivDatabase::builtin();
        let results = db.find_by_keywords(&[], 5);
        assert!(results.is_empty());
    }

    #[test]
    fn test_find_by_topic_no_results() {
        let db = ArxivDatabase::builtin();
        let results = db.find_by_topic("xyznonexistent", 5);
        assert!(results.is_empty());
    }

    #[test]
    fn test_arxiv_url_format() {
        let db = ArxivDatabase::builtin();
        for entry in &db.entries {
            assert!(entry.url.starts_with("https://arxiv.org/abs/"), "Bad URL: {}", entry.url);
            assert!(entry.url.ends_with(&entry.arxiv_id));
        }
    }

    #[test]
    fn test_all_entries_have_topics() {
        let db = ArxivDatabase::builtin();
        for entry in &db.entries {
            assert!(!entry.topics.is_empty(), "Entry {} has no topics", entry.arxiv_id);
        }
    }

    #[test]
    fn test_find_by_topic_limit() {
        let db = ArxivDatabase::builtin();
        let results = db.find_by_topic("deep learning", 2);
        assert!(results.len() <= 2);
        // A zero limit must yield nothing even for a topic that has matches.
        assert!(db.find_by_topic("transformer", 0).is_empty());
        assert!(db.find_by_keywords(&["transformer"], 0).is_empty());
    }
}