ruvector_data_framework/
lib.rs

//! # RuVector Data Discovery Framework
//!
//! Core traits and types for building dataset integrations with RuVector's
//! vector memory, graph structures, and dynamic minimum cut algorithms.
//!
//! ## Architecture
//!
//! The framework provides three core abstractions:
//!
//! 1. **DataIngester**: Streaming data ingestion with batched graph/vector updates
//! 2. **CoherenceEngine**: Real-time coherence signal computation using min-cut
//! 3. **DiscoveryEngine**: Pattern detection for emerging structures and anomalies
//!
//! ## Quick Start
//!
//! ```rust,ignore
//! use ruvector_data_framework::{
//!     DataIngester, CoherenceEngine, DiscoveryEngine,
//!     IngestionConfig, CoherenceConfig, DiscoveryConfig,
//! };
//!
//! // Configure the discovery pipeline with default settings
//! let ingester = DataIngester::new(IngestionConfig::default());
//! let coherence = CoherenceEngine::new(CoherenceConfig::default());
//! let discovery = DiscoveryEngine::new(DiscoveryConfig::default());
//!
//! // Stream data and detect patterns (`source` implements `DataSource`)
//! let stream = ingester.stream_from_source(source).await?;
//! let signals = coherence.compute_signals(stream).await?;
//! let patterns = discovery.detect_patterns(signals).await?;
//! ```

#![warn(missing_docs)]
#![warn(clippy::all)]

pub mod academic_clients;
pub mod api_clients;
pub mod arxiv_client;
pub mod biorxiv_client;
pub mod coherence;
pub mod crossref_client;
pub mod discovery;
pub mod dynamic_mincut;
pub mod economic_clients;
pub mod export;
pub mod finance_clients;
pub mod forecasting;
pub mod genomics_clients;
pub mod geospatial_clients;
pub mod government_clients;
pub mod hnsw;
pub mod cut_aware_hnsw;
pub mod ingester;
pub mod mcp_server;
pub mod medical_clients;
pub mod ml_clients;
pub mod news_clients;
pub mod optimized;
pub mod patent_clients;
pub mod persistence;
pub mod physics_clients;
pub mod realtime;
pub mod ruvector_native;
pub mod semantic_scholar;
pub mod space_clients;
pub mod streaming;
pub mod transportation_clients;
pub mod visualization;
pub mod wiki_clients;

use std::collections::HashMap;
use std::sync::Arc;

use async_trait::async_trait;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use thiserror::Error;

// Re-exports
pub use academic_clients::{CoreClient, EricClient, UnpaywallClient};
pub use api_clients::{EdgarClient, Embedder, NoaaClient, OpenAlexClient, SimpleEmbedder};
#[cfg(feature = "onnx-embeddings")]
pub use api_clients::OnnxEmbedder;
#[cfg(feature = "onnx-embeddings")]
pub use ruvector_onnx_embeddings::{PretrainedModel, EmbedderConfig, PoolingStrategy};
pub use arxiv_client::ArxivClient;
pub use biorxiv_client::{BiorxivClient, MedrxivClient};
pub use crossref_client::CrossRefClient;
pub use economic_clients::{AlphaVantageClient, FredClient, WorldBankClient};
pub use finance_clients::{BlsClient, CoinGeckoClient, EcbClient, FinnhubClient, TwelveDataClient};
pub use genomics_clients::{EnsemblClient, GwasClient, NcbiClient, UniProtClient};
pub use geospatial_clients::{GeonamesClient, NominatimClient, OpenElevationClient, OverpassClient};
pub use government_clients::{
    CensusClient, DataGovClient, EuOpenDataClient, UkGovClient, UNDataClient,
    WorldBankClient as WorldBankGovClient,
};
pub use medical_clients::{ClinicalTrialsClient, FdaClient, PubMedClient};
pub use ml_clients::{
    HuggingFaceClient, HuggingFaceDataset, HuggingFaceModel, OllamaClient, OllamaModel,
    PapersWithCodeClient, PaperWithCodeDataset, PaperWithCodePaper, ReplicateClient,
    ReplicateModel, TogetherAiClient, TogetherModel,
};
pub use news_clients::{GuardianClient, HackerNewsClient, NewsDataClient, RedditClient};
pub use patent_clients::{EpoClient, UsptoPatentClient};
pub use physics_clients::{
    ArgoClient, CernOpenDataClient, GeoUtils, MaterialsProjectClient, UsgsEarthquakeClient,
};
pub use semantic_scholar::SemanticScholarClient;
pub use space_clients::{AstronomyClient, ExoplanetClient, NasaClient, SpaceXClient};
pub use transportation_clients::{
    GtfsClient, MobilityDatabaseClient, OpenChargeMapClient, OpenRouteServiceClient,
};
pub use wiki_clients::{WikidataClient, WikidataEntity, WikipediaClient};
pub use coherence::{
    CoherenceBoundary, CoherenceConfig, CoherenceEngine, CoherenceEvent, CoherenceSignal,
};
pub use cut_aware_hnsw::{
    CutAwareHNSW, CutAwareConfig, CutAwareMetrics, CoherenceZone,
    SearchResult as CutAwareSearchResult, EdgeUpdate as CutAwareEdgeUpdate, UpdateKind, LayerCutStats,
};
pub use discovery::{
    DiscoveryConfig, DiscoveryEngine, DiscoveryPattern, PatternCategory, PatternStrength,
};
pub use dynamic_mincut::{
    CutGatedSearch, CutWatcherConfig, DynamicCutWatcher, DynamicMinCutError,
    EdgeUpdate as DynamicEdgeUpdate, EdgeUpdateType, EulerTourTree, HNSWGraph,
    LocalCut, LocalMinCutProcedure, WatcherStats,
};
pub use export::{
    export_all, export_coherence_csv, export_dot, export_graphml, export_patterns_csv,
    export_patterns_with_evidence_csv, ExportFilter,
};
pub use forecasting::{CoherenceForecaster, CrossDomainForecaster, Forecast, Trend};
pub use ingester::{DataIngester, IngestionConfig, IngestionStats, SourceConfig};
pub use realtime::{FeedItem, FeedSource, NewsAggregator, NewsSource, RealTimeEngine};
pub use ruvector_native::{
    CoherenceHistoryEntry, CoherenceSnapshot, Domain, DiscoveredPattern,
    GraphExport, NativeDiscoveryEngine, NativeEngineConfig, SemanticVector,
};
pub use streaming::{StreamingConfig, StreamingEngine, StreamingEngineBuilder, StreamingMetrics};

/// Framework error types
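///
/// # Examples
///
/// The `#[from]` conversions below let `?` lift `reqwest` and `serde_json`
/// errors automatically; a minimal sketch:
///
/// ```rust,ignore
/// fn parse_payload(raw: &str) -> Result<serde_json::Value> {
///     // serde_json::Error becomes FrameworkError::Serialization via #[from]
///     Ok(serde_json::from_str(raw)?)
/// }
/// ```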
#[derive(Error, Debug)]
pub enum FrameworkError {
    /// Data ingestion failed
    #[error("Ingestion error: {0}")]
    Ingestion(String),

    /// Coherence computation failed
    #[error("Coherence error: {0}")]
    Coherence(String),

    /// Discovery algorithm failed
    #[error("Discovery error: {0}")]
    Discovery(String),

    /// Network/API error
    #[error("Network error: {0}")]
    Network(#[from] reqwest::Error),

    /// Serialization error
    #[error("Serialization error: {0}")]
    Serialization(#[from] serde_json::Error),

    /// Graph operation failed
    #[error("Graph error: {0}")]
    Graph(String),

    /// Configuration error
    #[error("Config error: {0}")]
    Config(String),
}

/// Result type for framework operations
pub type Result<T> = std::result::Result<T, FrameworkError>;

/// A timestamped data record from any source
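///
/// # Examples
///
/// A sketch of a record that cites another record (field values are
/// illustrative only):
///
/// ```rust,ignore
/// let record = DataRecord {
///     id: "paper-42".into(),
///     source: "openalex".into(),
///     record_type: "work".into(),
///     timestamp: Utc::now(),
///     data: serde_json::json!({"title": "Example"}),
///     embedding: None,
///     relationships: vec![Relationship {
///         target_id: "paper-7".into(),
///         rel_type: "cites".into(),
///         weight: 1.0,
///         properties: HashMap::new(),
///     }],
/// };
/// ```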
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataRecord {
    /// Unique identifier
    pub id: String,

    /// Source dataset (e.g., "openalex", "noaa", "edgar")
    pub source: String,

    /// Record type within source (e.g., "work", "author", "filing")
    pub record_type: String,

    /// Timestamp when data was observed/published
    pub timestamp: DateTime<Utc>,

    /// Raw data payload
    pub data: serde_json::Value,

    /// Pre-computed embedding vector (optional)
    pub embedding: Option<Vec<f32>>,

    /// Relationships to other records
    pub relationships: Vec<Relationship>,
}

/// A relationship between two records
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Relationship {
    /// Target record ID
    pub target_id: String,

    /// Relationship type (e.g., "cites", "authored_by", "filed_by")
    pub rel_type: String,

    /// Relationship weight/strength
    pub weight: f64,

    /// Additional properties
    pub properties: HashMap<String, serde_json::Value>,
}

/// Trait for data sources that can be ingested
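///
/// # Examples
///
/// A minimal in-memory source, shown as a sketch (the cursor encodes an
/// offset into a fixed record list; a real source would page a remote API):
///
/// ```rust,ignore
/// struct VecSource {
///     records: Vec<DataRecord>,
/// }
///
/// #[async_trait]
/// impl DataSource for VecSource {
///     fn source_id(&self) -> &str {
///         "vec"
///     }
///
///     async fn fetch_batch(
///         &self,
///         cursor: Option<String>,
///         batch_size: usize,
///     ) -> Result<(Vec<DataRecord>, Option<String>)> {
///         let start = cursor.and_then(|c| c.parse::<usize>().ok()).unwrap_or(0);
///         let end = (start + batch_size).min(self.records.len());
///         // Return a next cursor only while records remain
///         let next = (end < self.records.len()).then(|| end.to_string());
///         Ok((self.records[start..end].to_vec(), next))
///     }
///
///     async fn total_count(&self) -> Result<Option<u64>> {
///         Ok(Some(self.records.len() as u64))
///     }
///
///     async fn health_check(&self) -> Result<bool> {
///         Ok(true)
///     }
/// }
/// ```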
#[async_trait]
pub trait DataSource: Send + Sync {
    /// Source identifier
    fn source_id(&self) -> &str;

    /// Fetch a batch of records starting from cursor
    async fn fetch_batch(
        &self,
        cursor: Option<String>,
        batch_size: usize,
    ) -> Result<(Vec<DataRecord>, Option<String>)>;

    /// Get total record count (if known)
    async fn total_count(&self) -> Result<Option<u64>>;

    /// Check if source is available
    async fn health_check(&self) -> Result<bool>;
}

/// Trait for computing embeddings from records
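///
/// # Examples
///
/// A toy provider that folds the bytes of a record's ID into a fixed-size
/// vector; a deterministic placeholder sketch, not a real embedding model:
///
/// ```rust,ignore
/// struct HashEmbedder {
///     dim: usize,
/// }
///
/// #[async_trait]
/// impl EmbeddingProvider for HashEmbedder {
///     async fn embed_record(&self, record: &DataRecord) -> Result<Vec<f32>> {
///         // Spread the ID's bytes across the vector dimensions
///         let mut v = vec![0.0f32; self.dim];
///         for (i, b) in record.id.bytes().enumerate() {
///             v[i % self.dim] += b as f32 / 255.0;
///         }
///         Ok(v)
///     }
///
///     async fn embed_batch(&self, records: &[DataRecord]) -> Result<Vec<Vec<f32>>> {
///         let mut out = Vec::with_capacity(records.len());
///         for r in records {
///             out.push(self.embed_record(r).await?);
///         }
///         Ok(out)
///     }
///
///     fn dimension(&self) -> usize {
///         self.dim
///     }
/// }
/// ```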
#[async_trait]
pub trait EmbeddingProvider: Send + Sync {
    /// Compute embedding for a single record
    async fn embed_record(&self, record: &DataRecord) -> Result<Vec<f32>>;

    /// Compute embeddings for a batch of records
    async fn embed_batch(&self, records: &[DataRecord]) -> Result<Vec<Vec<f32>>>;

    /// Embedding dimension
    fn dimension(&self) -> usize;
}

/// Trait for graph building from records
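///
/// # Examples
///
/// An adjacency-list sketch that interns record IDs as dense node indices:
///
/// ```rust,ignore
/// #[derive(Default)]
/// struct SimpleGraph {
///     ids: HashMap<String, u64>,
///     edges: Vec<(u64, u64, f64, String)>,
/// }
///
/// impl GraphBuilder for SimpleGraph {
///     fn add_node(&mut self, record: &DataRecord) -> Result<u64> {
///         // Reuse the existing index if this record was already added
///         let next = self.ids.len() as u64;
///         Ok(*self.ids.entry(record.id.clone()).or_insert(next))
///     }
///
///     fn add_edge(&mut self, source: u64, target: u64, weight: f64, rel_type: &str) -> Result<()> {
///         self.edges.push((source, target, weight, rel_type.to_string()));
///         Ok(())
///     }
///
///     fn node_count(&self) -> usize {
///         self.ids.len()
///     }
///
///     fn edge_count(&self) -> usize {
///         self.edges.len()
///     }
/// }
/// ```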
pub trait GraphBuilder: Send + Sync {
    /// Add a node from a data record
    fn add_node(&mut self, record: &DataRecord) -> Result<u64>;

    /// Add an edge between nodes
    fn add_edge(&mut self, source: u64, target: u64, weight: f64, rel_type: &str) -> Result<()>;

    /// Get node count
    fn node_count(&self) -> usize;

    /// Get edge count
    fn edge_count(&self) -> usize;
}

/// Temporal window for time-series analysis
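///
/// # Examples
///
/// Building a day of consecutive one-hour windows, sketched:
///
/// ```rust,ignore
/// let t0 = Utc::now();
/// let step = chrono::Duration::hours(1);
/// let windows: Vec<TemporalWindow> = (0..24)
///     .map(|i| TemporalWindow::new(t0 + step * i, t0 + step * (i + 1), i as u64))
///     .collect();
/// assert!(windows[0].contains(t0));
/// ```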
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct TemporalWindow {
    /// Window start
    pub start: DateTime<Utc>,

    /// Window end
    pub end: DateTime<Utc>,

    /// Window identifier (for sliding windows)
    pub window_id: u64,
}

impl TemporalWindow {
    /// Create a new temporal window
    pub fn new(start: DateTime<Utc>, end: DateTime<Utc>, window_id: u64) -> Self {
        Self {
            start,
            end,
            window_id,
        }
    }

    /// Duration in seconds
    pub fn duration_secs(&self) -> i64 {
        (self.end - self.start).num_seconds()
    }

    /// Check if timestamp falls within window (inclusive start, exclusive end)
    pub fn contains(&self, timestamp: DateTime<Utc>) -> bool {
        timestamp >= self.start && timestamp < self.end
    }
}

/// Statistics for a discovery session
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DiscoveryStats {
    /// Records processed
    pub records_processed: u64,

    /// Nodes in graph
    pub nodes_created: u64,

    /// Edges in graph
    pub edges_created: u64,

    /// Coherence signals computed
    pub signals_computed: u64,

    /// Patterns discovered
    pub patterns_discovered: u64,

    /// Processing duration in milliseconds
    pub duration_ms: u64,

    /// Peak memory usage in bytes
    pub peak_memory_bytes: u64,
}

/// Configuration for the entire discovery pipeline
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineConfig {
    /// Ingestion configuration
    pub ingestion: IngestionConfig,

    /// Coherence engine configuration
    pub coherence: CoherenceConfig,

    /// Discovery engine configuration
    pub discovery: DiscoveryConfig,

    /// Enable parallel processing
    pub parallel: bool,

    /// Checkpoint interval (records)
    pub checkpoint_interval: u64,

    /// Output directory for results
    pub output_dir: String,
}

impl Default for PipelineConfig {
    fn default() -> Self {
        Self {
            ingestion: IngestionConfig::default(),
            coherence: CoherenceConfig::default(),
            discovery: DiscoveryConfig::default(),
            parallel: true,
            checkpoint_interval: 10_000,
            output_dir: "./discovery_output".to_string(),
        }
    }
}

/// Main discovery pipeline orchestrator
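///
/// # Examples
///
/// End-to-end run with default configuration, sketched assuming `source`
/// implements [`DataSource`]:
///
/// ```rust,ignore
/// let mut pipeline = DiscoveryPipeline::new(PipelineConfig::default());
/// let patterns = pipeline.run(source).await?;
/// println!("discovered {} patterns", patterns.len());
/// println!("stats: {:?}", pipeline.stats());
/// ```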
pub struct DiscoveryPipeline {
    config: PipelineConfig,
    ingester: DataIngester,
    coherence: CoherenceEngine,
    discovery: DiscoveryEngine,
    stats: Arc<std::sync::RwLock<DiscoveryStats>>,
}

impl DiscoveryPipeline {
    /// Create a new discovery pipeline
    pub fn new(config: PipelineConfig) -> Self {
        let ingester = DataIngester::new(config.ingestion.clone());
        let coherence = CoherenceEngine::new(config.coherence.clone());
        let discovery = DiscoveryEngine::new(config.discovery.clone());

        Self {
            config,
            ingester,
            coherence,
            discovery,
            stats: Arc::new(std::sync::RwLock::new(DiscoveryStats::default())),
        }
    }

    /// Run the discovery pipeline on a data source
    pub async fn run<S: DataSource>(&mut self, source: S) -> Result<Vec<DiscoveryPattern>> {
        let start_time = std::time::Instant::now();

        // Phase 1: Ingest data
        tracing::info!("Starting ingestion from source: {}", source.source_id());
        let records = self.ingester.ingest_all(&source).await?;

        {
            let mut stats = self.stats.write().unwrap();
            stats.records_processed = records.len() as u64;
        }

        // Phase 2: Build graph and compute coherence
        tracing::info!("Computing coherence signals over {} records", records.len());
        let signals = self.coherence.compute_from_records(&records)?;

        {
            let mut stats = self.stats.write().unwrap();
            stats.signals_computed = signals.len() as u64;
            stats.nodes_created = self.coherence.node_count() as u64;
            stats.edges_created = self.coherence.edge_count() as u64;
        }

        // Phase 3: Detect patterns
        tracing::info!("Detecting discovery patterns");
        let patterns = self.discovery.detect(&signals)?;

        {
            let mut stats = self.stats.write().unwrap();
            stats.patterns_discovered = patterns.len() as u64;
            stats.duration_ms = start_time.elapsed().as_millis() as u64;
        }

        tracing::info!(
            "Discovery complete: {} patterns found in {}ms",
            patterns.len(),
            start_time.elapsed().as_millis()
        );

        Ok(patterns)
    }

    /// Get current statistics
    pub fn stats(&self) -> DiscoveryStats {
        self.stats.read().unwrap().clone()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_temporal_window() {
        let start = Utc::now();
        let end = start + chrono::Duration::hours(1);
        let window = TemporalWindow::new(start, end, 1);

        assert_eq!(window.duration_secs(), 3600);
        assert!(window.contains(start + chrono::Duration::minutes(30)));
        assert!(!window.contains(start - chrono::Duration::minutes(1)));
        assert!(!window.contains(end + chrono::Duration::minutes(1)));
    }

    #[test]
    fn test_default_pipeline_config() {
        let config = PipelineConfig::default();
        assert!(config.parallel);
        assert_eq!(config.checkpoint_interval, 10_000);
    }

    #[test]
    fn test_data_record_serialization() {
        let record = DataRecord {
            id: "test-1".to_string(),
            source: "test".to_string(),
            record_type: "document".to_string(),
            timestamp: Utc::now(),
            data: serde_json::json!({"title": "Test"}),
            embedding: Some(vec![0.1, 0.2, 0.3]),
            relationships: vec![],
        };

        let json = serde_json::to_string(&record).unwrap();
        let parsed: DataRecord = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.id, record.id);
    }
}