Skip to main content

research_master/sources/
registry.rs

1//! Registry for managing research source plugins.
2
3use std::collections::{HashMap, HashSet};
4use std::sync::Arc;
5
6use super::{Source, SourceError};
7use crate::config::SourceConfig;
8
9// Conditionally import source types based on feature flags
10#[cfg(feature = "source-acm")]
11use super::acm::AcmSource;
12#[cfg(feature = "source-arxiv")]
13use super::arxiv::ArxivSource;
14#[cfg(feature = "source-base")]
15use super::base::BaseSource;
16#[cfg(feature = "source-biorxiv")]
17use super::biorxiv::BiorxivSource;
18#[cfg(feature = "source-connected_papers")]
19use super::connected_papers::ConnectedPapersSource;
20#[cfg(feature = "source-core-repo")]
21use super::core::CoreSource;
22#[cfg(feature = "source-crossref")]
23use super::crossref::CrossRefSource;
24#[cfg(feature = "source-dblp")]
25use super::dblp::DblpSource;
26#[cfg(feature = "source-dimensions")]
27use super::dimensions::DimensionsSource;
28#[cfg(feature = "source-doaj")]
29use super::doaj::DoajSource;
30#[cfg(feature = "source-europe_pmc")]
31use super::europe_pmc::EuropePmcSource;
32#[cfg(feature = "source-google_scholar")]
33use super::google_scholar::GoogleScholarSource;
34#[cfg(feature = "source-hal")]
35use super::hal::HalSource;
36#[cfg(feature = "source-iacr")]
37use super::iacr::IacrSource;
38#[cfg(feature = "source-ieee_xplore")]
39use super::ieee_xplore::IeeeXploreSource;
40#[cfg(feature = "source-jstor")]
41use super::jstor::JstorSource;
42#[cfg(feature = "source-mdpi")]
43use super::mdpi::MdpiSource;
44#[cfg(feature = "source-openalex")]
45use super::openalex::OpenAlexSource;
46#[cfg(feature = "source-osf")]
47use super::osf::OsfSource;
48#[cfg(feature = "source-pmc")]
49use super::pmc::PmcSource;
50#[cfg(feature = "source-pubmed")]
51use super::pubmed::PubMedSource;
52#[cfg(feature = "source-scispace")]
53use super::scispace::ScispaceSource;
54#[cfg(feature = "source-semantic")]
55use super::semantic::SemanticScholarSource;
56#[cfg(feature = "source-springer")]
57use super::springer::SpringerSource;
58#[cfg(feature = "source-ssrn")]
59use super::ssrn::SsrnSource;
60#[cfg(feature = "source-unpaywall")]
61use super::unpaywall::UnpaywallSource;
62#[cfg(feature = "source-worldwidescience")]
63use super::worldwidescience::WorldWideScienceSource;
64#[cfg(feature = "source-zenodo")]
65use super::zenodo::ZenodoSource;
66
/// Allow/deny lists that decide which sources get registered.
///
/// The sets hold the lowercase source IDs produced by
/// `SourceFilter::from_config`; lookups through `SourceFilter::is_enabled`
/// lowercase the queried ID before comparing.
#[derive(Debug, Clone, Default)]
struct SourceFilter {
    /// Allow-list of source IDs; `None` places no restriction (every source is allowed).
    enabled: Option<HashSet<String>>,
    /// Deny-list of source IDs; `None` means no source is denied.
    disabled: Option<HashSet<String>>,
}
75
76impl SourceFilter {
77    /// Create a new filter from config (which may include env vars)
78    fn from_config(config: &SourceConfig) -> Self {
79        let enabled = config
80            .enabled_sources
81            .as_ref()
82            .filter(|s| !s.is_empty())
83            .map(|value| {
84                value
85                    .split(',')
86                    .map(|s| s.trim().to_lowercase())
87                    .filter(|s| !s.is_empty())
88                    .collect::<HashSet<_>>()
89            })
90            .filter(|set| !set.is_empty());
91
92        let disabled = config
93            .disabled_sources
94            .as_ref()
95            .filter(|s| !s.is_empty())
96            .map(|value| {
97                value
98                    .split(',')
99                    .map(|s| s.trim().to_lowercase())
100                    .filter(|s| !s.is_empty())
101                    .collect::<HashSet<_>>()
102            })
103            .filter(|set| !set.is_empty());
104
105        Self { enabled, disabled }
106    }
107
108    /// Check if a source should be enabled based on the filter
109    ///
110    /// Logic:
111    /// - If ENABLE is set and DISABLE is not: only enabled sources
112    /// - If DISABLE is set and ENABLE is not: all except disabled sources
113    /// - If both are set: enabled sources minus disabled sources
114    /// - If neither is set: all sources enabled
115    fn is_enabled(&self, source_id: &str) -> bool {
116        let id_lower = source_id.to_lowercase();
117
118        match (&self.enabled, &self.disabled) {
119            // Both specified: enabled minus disabled
120            (Some(enabled), Some(disabled)) => {
121                enabled.contains(&id_lower) && !disabled.contains(&id_lower)
122            }
123            // Only enabled specified: must be in enabled set
124            (Some(enabled), None) => enabled.contains(&id_lower),
125            // Only disabled specified: must not be in disabled set
126            (None, Some(disabled)) => !disabled.contains(&id_lower),
127            // Neither specified: all enabled
128            (None, None) => true,
129        }
130    }
131}
132
133bitflags::bitflags! {
134    /// Capabilities that a source can support
135    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
136    pub struct SourceCapabilities: u32 {
137        const SEARCH = 1 << 0;
138        const DOWNLOAD = 1 << 1;
139        const READ = 1 << 2;
140        const CITATIONS = 1 << 3;
141        const DOI_LOOKUP = 1 << 4;
142        const AUTHOR_SEARCH = 1 << 5;
143    }
144}
145
146/// Registry for all available research sources
147///
148/// The SourceRegistry manages all available source plugins and provides
149/// methods to query and use them.
150#[derive(Debug, Clone)]
151pub struct SourceRegistry {
152    sources: HashMap<String, Arc<dyn Source>>,
153}
154
155impl SourceRegistry {
156    /// Create a new registry with all available sources
157    pub fn new() -> Self {
158        Self::try_new().expect("Failed to initialize any sources")
159    }
160
161    /// Try to create a new registry with all available sources
162    ///
163    /// This will:
164    /// 1. Filter sources based on config file or environment variables
165    /// 2. Skip sources that fail to initialize (e.g., missing API keys)
166    /// 3. Return an error only if no sources could be initialized
167    pub fn try_new() -> Result<Self, SourceError> {
168        let source_config = crate::config::get_config().sources;
169        let filter = SourceFilter::from_config(&source_config);
170        let mut registry = Self {
171            sources: HashMap::new(),
172        };
173
174        // Helper macro to register a source with error handling
175        macro_rules! try_register {
176            ($source:expr) => {
177                if let Ok(source) = $source {
178                    let source_id = source.id().to_string();
179                    if filter.is_enabled(&source_id) {
180                        registry.register(Arc::new(source));
181                        tracing::info!("Registered source: {}", source_id);
182                    } else {
183                        tracing::debug!("Source '{}' filtered out by source filter", source_id);
184                    }
185                } else {
186                    tracing::warn!("Skipping source: initialization failed");
187                }
188            };
189        }
190
191        // Register all available sources (will skip any that fail to initialize)
192        // Each source is conditionally compiled based on feature flags
193        #[cfg(feature = "source-arxiv")]
194        try_register!(ArxivSource::new());
195
196        #[cfg(feature = "source-pubmed")]
197        try_register!(PubMedSource::new());
198
199        #[cfg(feature = "source-biorxiv")]
200        try_register!(BiorxivSource::new());
201
202        #[cfg(feature = "source-semantic")]
203        try_register!(SemanticScholarSource::new());
204
205        #[cfg(feature = "source-openalex")]
206        try_register!(OpenAlexSource::new());
207
208        #[cfg(feature = "source-crossref")]
209        try_register!(CrossRefSource::new());
210
211        #[cfg(feature = "source-iacr")]
212        try_register!(IacrSource::new());
213
214        #[cfg(feature = "source-pmc")]
215        try_register!(PmcSource::new());
216
217        #[cfg(feature = "source-hal")]
218        try_register!(HalSource::new());
219
220        #[cfg(feature = "source-dblp")]
221        try_register!(DblpSource::new());
222
223        #[cfg(feature = "source-dimensions")]
224        try_register!(DimensionsSource::new());
225
226        #[cfg(feature = "source-ieee_xplore")]
227        try_register!(IeeeXploreSource::new());
228
229        #[cfg(feature = "source-core-repo")]
230        try_register!(CoreSource::new());
231
232        #[cfg(feature = "source-zenodo")]
233        try_register!(ZenodoSource::new());
234
235        #[cfg(feature = "source-unpaywall")]
236        try_register!(UnpaywallSource::new());
237
238        #[cfg(feature = "source-mdpi")]
239        try_register!(MdpiSource::new());
240
241        #[cfg(feature = "source-ssrn")]
242        try_register!(SsrnSource::new());
243
244        #[cfg(feature = "source-jstor")]
245        try_register!(JstorSource::new());
246
247        #[cfg(feature = "source-scispace")]
248        try_register!(ScispaceSource::new());
249
250        #[cfg(feature = "source-acm")]
251        try_register!(AcmSource::new());
252
253        #[cfg(feature = "source-connected_papers")]
254        try_register!(ConnectedPapersSource::new());
255
256        #[cfg(feature = "source-doaj")]
257        try_register!(DoajSource::new());
258
259        #[cfg(feature = "source-europe_pmc")]
260        try_register!(EuropePmcSource::new());
261
262        #[cfg(feature = "source-worldwidescience")]
263        try_register!(WorldWideScienceSource::new());
264
265        #[cfg(feature = "source-osf")]
266        try_register!(OsfSource::new());
267
268        #[cfg(feature = "source-base")]
269        try_register!(BaseSource::new());
270
271        #[cfg(feature = "source-springer")]
272        try_register!(SpringerSource::new());
273
274        #[cfg(feature = "source-google_scholar")]
275        try_register!(GoogleScholarSource::new());
276
277        if registry.is_empty() {
278            return Err(SourceError::Other(
279                "No sources could be initialized. Check configuration and API keys.".to_string(),
280            ));
281        }
282
283        tracing::info!("Initialized {} sources", registry.len());
284
285        Ok(registry)
286    }
287
288    /// Register a new source
289    pub fn register(&mut self, source: Arc<dyn Source>) {
290        self.sources.insert(source.id().to_string(), source);
291    }
292
293    /// Get a source by ID
294    pub fn get(&self, id: &str) -> Option<&Arc<dyn Source>> {
295        self.sources.get(id)
296    }
297
298    /// Get a source by ID, returning an error if not found
299    pub fn get_required(&self, id: &str) -> Result<&Arc<dyn Source>, SourceError> {
300        self.get(id)
301            .ok_or_else(|| SourceError::NotFound(format!("Source '{}' not found", id)))
302    }
303
304    /// Get all registered sources
305    pub fn all(&self) -> impl Iterator<Item = &Arc<dyn Source>> {
306        self.sources.values()
307    }
308
309    /// Get all source IDs
310    pub fn ids(&self) -> impl Iterator<Item = &str> {
311        self.sources.keys().map(|s| s.as_str())
312    }
313
314    /// Get sources that support a specific capability
315    pub fn with_capability(&self, capability: SourceCapabilities) -> Vec<&Arc<dyn Source>> {
316        self.all()
317            .filter(|s| s.capabilities().contains(capability))
318            .collect()
319    }
320
321    /// Get sources that support search
322    pub fn searchable(&self) -> Vec<&Arc<dyn Source>> {
323        self.with_capability(SourceCapabilities::SEARCH)
324    }
325
326    /// Get sources that support download
327    pub fn downloadable(&self) -> Vec<&Arc<dyn Source>> {
328        self.with_capability(SourceCapabilities::DOWNLOAD)
329    }
330
331    /// Get sources that support citations
332    pub fn with_citations(&self) -> Vec<&Arc<dyn Source>> {
333        self.with_capability(SourceCapabilities::CITATIONS)
334    }
335
336    /// Check if a source exists
337    pub fn has(&self, id: &str) -> bool {
338        self.sources.contains_key(id)
339    }
340
341    /// Get the number of registered sources
342    pub fn len(&self) -> usize {
343        self.sources.len()
344    }
345
346    /// Check if the registry is empty
347    pub fn is_empty(&self) -> bool {
348        self.sources.is_empty()
349    }
350}
351
352impl Default for SourceRegistry {
353    fn default() -> Self {
354        Self::new()
355    }
356}
357
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::OsString;
    use std::sync::{Mutex, MutexGuard, OnceLock};

    /// Environment variable holding the comma-separated enable list.
    const ENABLED_VAR: &str = "RESEARCH_MASTER_ENABLED_SOURCES";
    /// Environment variable holding the comma-separated disable list.
    const DISABLED_VAR: &str = "RESEARCH_MASTER_DISABLED_SOURCES";

    /// Process-wide lock serializing tests that read or mutate the
    /// source-filter environment variables.
    fn env_lock() -> &'static Mutex<()> {
        static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
        LOCK.get_or_init(|| Mutex::new(()))
    }

    /// Acquire the environment lock, recovering from poisoning.
    ///
    /// The mutex guards no data of its own, so a panic in another test while
    /// it held the lock leaves nothing inconsistent; recovering keeps one
    /// failing test from cascading into every other environment test.
    fn lock_env() -> MutexGuard<'static, ()> {
        env_lock()
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Set `key` to `value`, or remove it when `value` is `None`.
    fn set_or_remove(key: &str, value: Option<&str>) {
        match value {
            Some(v) => std::env::set_var(key, v),
            None => std::env::remove_var(key),
        }
    }

    /// Snapshot of the source-filter variables that is written back on drop,
    /// so the original environment is restored even if the test body panics.
    struct EnvRestore {
        saved: [(&'static str, Option<OsString>); 2],
    }

    impl EnvRestore {
        /// Record the current values of both source-filter variables.
        fn capture() -> Self {
            Self {
                saved: [
                    (ENABLED_VAR, std::env::var_os(ENABLED_VAR)),
                    (DISABLED_VAR, std::env::var_os(DISABLED_VAR)),
                ],
            }
        }
    }

    impl Drop for EnvRestore {
        fn drop(&mut self) {
            for (key, value) in &self.saved {
                match value {
                    Some(v) => std::env::set_var(key, v),
                    None => std::env::remove_var(key),
                }
            }
        }
    }

    #[test]
    fn test_registry_basic() {
        // Hold the env lock so a concurrently running filter test cannot
        // change the variables while the registry is being built.
        let _guard = lock_env();
        let registry = SourceRegistry::new();

        // Should have some sources (at least one)
        assert!(!registry.is_empty());
    }

    #[test]
    fn test_get_source() {
        let _guard = lock_env();
        let registry = SourceRegistry::new();

        let arxiv = registry.get("arxiv");
        // arxiv should be available if not filtered
        if let Some(arxiv) = arxiv {
            assert_eq!(arxiv.id(), "arxiv");
        }

        let missing = registry.get("nonexistent");
        assert!(missing.is_none());
    }

    /// Helper to run `test` with the source-filter env vars set to the given
    /// values (`None` removes the variable), with proper isolation.
    ///
    /// The env lock is held for the duration of the call and the previous
    /// values are restored afterwards, including when `test` panics.
    fn with_source_env_vars<F>(enabled: Option<&str>, disabled: Option<&str>, test: F)
    where
        F: FnOnce(),
    {
        // Declaration order matters: `_restore` is dropped before `_guard`,
        // so the environment is restored while the lock is still held.
        let _guard = lock_env();
        let _restore = EnvRestore::capture();

        set_or_remove(ENABLED_VAR, enabled);
        set_or_remove(DISABLED_VAR, disabled);

        test();
    }

    #[test]
    fn test_source_filter_only_enabled() {
        // Test: ENABLE only - only enabled sources
        with_source_env_vars(Some("arxiv,pubmed"), None, || {
            let config = crate::config::get_config().sources;
            let filter = SourceFilter::from_config(&config);
            assert!(filter.is_enabled("arxiv"));
            assert!(filter.is_enabled("pubmed"));
            assert!(!filter.is_enabled("semantic"));
            assert!(filter.is_enabled("ARXIV")); // Case insensitive - ARXIV should be enabled
        });
    }

    #[test]
    fn test_source_filter_only_disabled() {
        // Test: DISABLE only - all except disabled
        with_source_env_vars(None, Some("dblp,jstor"), || {
            let config = crate::config::get_config().sources;
            let filter = SourceFilter::from_config(&config);
            assert!(filter.is_enabled("arxiv"));
            assert!(filter.is_enabled("pubmed"));
            assert!(!filter.is_enabled("dblp"));
            assert!(!filter.is_enabled("jstor"));
            assert!(!filter.is_enabled("DBLP")); // Case insensitive
        });
    }

    #[test]
    fn test_source_filter_both_enabled_and_disabled() {
        // Test: Both ENABLE and DISABLE - enabled minus disabled
        with_source_env_vars(Some("arxiv,pubmed,semantic,dblp"), Some("dblp"), || {
            let config = crate::config::get_config().sources;
            let filter = SourceFilter::from_config(&config);
            assert!(filter.is_enabled("arxiv"));
            assert!(filter.is_enabled("pubmed"));
            assert!(filter.is_enabled("semantic"));
            assert!(!filter.is_enabled("dblp")); // In enabled but also in disabled
        });
    }

    #[test]
    fn test_source_filter_neither() {
        // Test: Neither set - all enabled
        with_source_env_vars(None, None, || {
            let config = crate::config::get_config().sources;
            let filter = SourceFilter::from_config(&config);
            assert!(filter.is_enabled("arxiv"));
            assert!(filter.is_enabled("pubmed"));
            assert!(filter.is_enabled("semantic"));
            assert!(filter.is_enabled("dblp"));
        });
    }

    #[test]
    fn test_source_filter_empty_values() {
        // Test: Empty values should be treated as not set
        with_source_env_vars(Some(""), Some(""), || {
            let config = crate::config::get_config().sources;
            let filter = SourceFilter::from_config(&config);
            // Empty strings should result in all sources enabled
            assert!(filter.is_enabled("arxiv"));
            assert!(filter.is_enabled("pubmed"));
        });
    }

    #[test]
    fn test_searchable_sources() {
        let _guard = lock_env();
        let registry = SourceRegistry::new();

        let searchable = registry.searchable();
        // Should have at least some searchable sources
        assert!(!searchable.is_empty());
    }

    #[test]
    fn test_capabilities() {
        let _guard = lock_env();
        let registry = SourceRegistry::new();

        // arXiv should support search, download, read (if available)
        if let Some(arxiv) = registry.get("arxiv") {
            assert!(arxiv.capabilities().contains(SourceCapabilities::SEARCH));
            assert!(arxiv.capabilities().contains(SourceCapabilities::DOWNLOAD));
            assert!(arxiv.capabilities().contains(SourceCapabilities::READ));
        }

        // Semantic Scholar should support citations (if available)
        if let Some(semantic) = registry.get("semantic") {
            assert!(semantic
                .capabilities()
                .contains(SourceCapabilities::CITATIONS));
            assert!(semantic
                .capabilities()
                .contains(SourceCapabilities::AUTHOR_SEARCH));
        }

        // DBLP should only support search (if available)
        if let Some(dblp) = registry.get("dblp") {
            assert!(dblp.capabilities().contains(SourceCapabilities::SEARCH));
            assert!(!dblp.capabilities().contains(SourceCapabilities::DOWNLOAD));
        }
    }
}