batuta/oracle/coursera/
arxiv_db.rs1use super::types::ArxivCitation;
7
8pub struct ArxivDatabase {
10 entries: Vec<ArxivCitation>,
11}
12
13impl ArxivDatabase {
14 pub fn builtin() -> Self {
16 Self { entries: builtin_entries() }
17 }
18
19 pub fn find_by_topic(&self, topic: &str, limit: usize) -> Vec<ArxivCitation> {
21 let topic_lower = topic.to_lowercase();
22 let mut results: Vec<_> = self
23 .entries
24 .iter()
25 .filter(|e| {
26 e.topics.iter().any(|t| t.to_lowercase().contains(&topic_lower))
27 || e.title.to_lowercase().contains(&topic_lower)
28 })
29 .cloned()
30 .collect();
31 results.truncate(limit);
32 results
33 }
34
35 pub fn find_by_keywords(&self, keywords: &[&str], limit: usize) -> Vec<ArxivCitation> {
37 let kw_lower: Vec<String> = keywords.iter().map(|k| k.to_lowercase()).collect();
38 let kw_count = kw_lower.len() as f64;
39 if kw_count == 0.0 {
40 return Vec::new();
41 }
42
43 let mut scored: Vec<(f64, &ArxivCitation)> = self
44 .entries
45 .iter()
46 .map(|entry| {
47 let entry_topics: Vec<String> =
48 entry.topics.iter().map(|t| t.to_lowercase()).collect();
49 let title_lower = entry.title.to_lowercase();
50
51 let matches = kw_lower
52 .iter()
53 .filter(|kw| {
54 entry_topics.iter().any(|t| t.contains(kw.as_str()))
55 || title_lower.contains(kw.as_str())
56 })
57 .count() as f64;
58
59 let union = kw_count + entry.topics.len() as f64 - matches;
60 let score = if union > 0.0 { matches / union } else { 0.0 };
61 (score, entry)
62 })
63 .filter(|(score, _)| *score > 0.0)
64 .collect();
65
66 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
67 scored.into_iter().take(limit).map(|(_, e)| e.clone()).collect()
68 }
69
70 pub fn len(&self) -> usize {
72 self.entries.len()
73 }
74
75 pub fn is_empty(&self) -> bool {
77 self.entries.is_empty()
78 }
79}
80
81fn cite(
82 id: &str,
83 title: &str,
84 authors: &str,
85 year: u16,
86 snippet: &str,
87 topics: &[&str],
88) -> ArxivCitation {
89 ArxivCitation {
90 arxiv_id: id.to_string(),
91 title: title.to_string(),
92 authors: authors.to_string(),
93 year,
94 url: format!("https://arxiv.org/abs/{id}"),
95 abstract_snippet: snippet.to_string(),
96 topics: topics.iter().map(|s| (*s).to_string()).collect(),
97 }
98}
99
100fn builtin_entries() -> Vec<ArxivCitation> {
101 include!("arxiv_entries.rs")
102}
103
#[cfg(test)]
mod tests {
    use super::*;

    /// Fresh built-in database used as the fixture for every test in this module.
    fn fixture() -> ArxivDatabase {
        ArxivDatabase::builtin()
    }

    /// The built-in data set must be non-empty and hold at least 100 citations.
    #[test]
    fn test_builtin_database_size() {
        let database = fixture();
        let count = database.len();
        assert!(count >= 100, "Expected at least 100 entries, got {}", count);
        assert!(!database.is_empty());
    }

    /// A topic query honours the limit and surfaces the expected arXiv id.
    #[test]
    fn test_find_by_topic() {
        let hits = fixture().find_by_topic("transformer", 5);
        assert!(!hits.is_empty());
        assert!(hits.len() <= 5);
        let found_expected_id = hits.iter().any(|citation| citation.arxiv_id == "1706.03762");
        assert!(found_expected_id);
    }

    /// Query casing must not change how many citations are returned.
    #[test]
    fn test_find_by_topic_case_insensitive() {
        let database = fixture();
        let lowercase_hits = database.find_by_topic("rag", 10);
        let uppercase_hits = database.find_by_topic("RAG", 10);
        assert_eq!(lowercase_hits.len(), uppercase_hits.len());
    }

    /// A multi-keyword query finds at least one citation.
    #[test]
    fn test_find_by_keywords_jaccard() {
        let hits = fixture().find_by_keywords(&["mlops", "pipeline", "ci/cd"], 5);
        assert!(!hits.is_empty());
    }

    /// An empty keyword list produces no results.
    #[test]
    fn test_find_by_keywords_empty() {
        let hits = fixture().find_by_keywords(&[], 5);
        assert!(hits.is_empty());
    }

    /// A topic that matches nothing produces no results.
    #[test]
    fn test_find_by_topic_no_results() {
        let hits = fixture().find_by_topic("xyznonexistent", 5);
        assert!(hits.is_empty());
    }

    /// Every stored URL uses the arXiv abstract prefix and ends with the entry's id.
    #[test]
    fn test_arxiv_url_format() {
        let database = fixture();
        for citation in &database.entries {
            assert!(
                citation.url.starts_with("https://arxiv.org/abs/"),
                "Bad URL: {}",
                citation.url
            );
            assert!(citation.url.ends_with(&citation.arxiv_id));
        }
    }

    /// Every stored citation carries at least one topic.
    #[test]
    fn test_all_entries_have_topics() {
        let database = fixture();
        for citation in &database.entries {
            assert!(!citation.topics.is_empty(), "Entry {} has no topics", citation.arxiv_id);
        }
    }

    /// The result count never exceeds the requested limit.
    #[test]
    fn test_find_by_topic_limit() {
        let hits = fixture().find_by_topic("deep learning", 2);
        assert!(hits.len() <= 2);
    }
}