ruvector_data_framework/lib.rs

//! # RuVector Data Discovery Framework
//!
//! Core traits and types for building dataset integrations with RuVector's
//! vector memory, graph structures, and dynamic minimum-cut algorithms.
//!
//! ## Architecture
//!
//! The framework provides three core abstractions:
//!
//! 1. **DataIngester**: Streaming data ingestion with batched graph/vector updates
//! 2. **CoherenceEngine**: Real-time coherence signal computation using min-cut
//! 3. **DiscoveryEngine**: Pattern detection for emerging structures and anomalies
//!
//! ## Quick Start
//!
//! ```rust,ignore
//! use ruvector_data_framework::{
//!     DataIngester, CoherenceEngine, DiscoveryEngine,
//!     IngestionConfig, CoherenceConfig, DiscoveryConfig,
//! };
//!
//! // Configure the discovery pipeline
//! let ingester = DataIngester::new(IngestionConfig::default());
//! let coherence = CoherenceEngine::new(CoherenceConfig::default());
//! let discovery = DiscoveryEngine::new(DiscoveryConfig::default());
//!
//! // Stream data from a `DataSource` implementation and detect patterns
//! let stream = ingester.stream_from_source(source).await?;
//! let signals = coherence.compute_signals(stream).await?;
//! let patterns = discovery.detect_patterns(signals).await?;
//! ```

#![warn(missing_docs)]
#![warn(clippy::all)]

pub mod academic_clients;
pub mod api_clients;
pub mod arxiv_client;
pub mod biorxiv_client;
pub mod coherence;
pub mod crossref_client;
pub mod cut_aware_hnsw;
pub mod discovery;
pub mod dynamic_mincut;
pub mod economic_clients;
pub mod export;
pub mod finance_clients;
pub mod forecasting;
pub mod genomics_clients;
pub mod geospatial_clients;
pub mod government_clients;
pub mod hnsw;
pub mod ingester;
pub mod mcp_server;
pub mod medical_clients;
pub mod ml_clients;
pub mod news_clients;
pub mod optimized;
pub mod patent_clients;
pub mod persistence;
pub mod physics_clients;
pub mod realtime;
pub mod ruvector_native;
pub mod semantic_scholar;
pub mod space_clients;
pub mod streaming;
pub mod transportation_clients;
pub mod utils;
pub mod visualization;
pub mod wiki_clients;

use std::collections::HashMap;
use std::sync::Arc;

use async_trait::async_trait;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use thiserror::Error;

// Re-exports
pub use academic_clients::{CoreClient, EricClient, UnpaywallClient};
pub use api_clients::{EdgarClient, Embedder, NoaaClient, OpenAlexClient, SimpleEmbedder};
#[cfg(feature = "onnx-embeddings")]
pub use api_clients::OnnxEmbedder;
#[cfg(feature = "onnx-embeddings")]
pub use ruvector_onnx_embeddings::{PretrainedModel, EmbedderConfig, PoolingStrategy};
pub use arxiv_client::ArxivClient;
pub use biorxiv_client::{BiorxivClient, MedrxivClient};
pub use crossref_client::CrossRefClient;
pub use economic_clients::{AlphaVantageClient, FredClient, WorldBankClient};
pub use finance_clients::{BlsClient, CoinGeckoClient, EcbClient, FinnhubClient, TwelveDataClient};
pub use genomics_clients::{EnsemblClient, GwasClient, NcbiClient, UniProtClient};
pub use geospatial_clients::{GeonamesClient, NominatimClient, OpenElevationClient, OverpassClient};
pub use government_clients::{
    CensusClient, DataGovClient, EuOpenDataClient, UkGovClient, UNDataClient,
    WorldBankClient as WorldBankGovClient,
};
pub use medical_clients::{ClinicalTrialsClient, FdaClient, PubMedClient};
pub use ml_clients::{
    HuggingFaceClient, HuggingFaceDataset, HuggingFaceModel, OllamaClient, OllamaModel,
    PapersWithCodeClient, PaperWithCodeDataset, PaperWithCodePaper, ReplicateClient,
    ReplicateModel, TogetherAiClient, TogetherModel,
};
pub use news_clients::{GuardianClient, HackerNewsClient, NewsDataClient, RedditClient};
pub use patent_clients::{EpoClient, UsptoPatentClient};
pub use physics_clients::{
    ArgoClient, CernOpenDataClient, GeoUtils, MaterialsProjectClient, UsgsEarthquakeClient,
};
pub use semantic_scholar::SemanticScholarClient;
pub use space_clients::{AstronomyClient, ExoplanetClient, NasaClient, SpaceXClient};
pub use transportation_clients::{
    GtfsClient, MobilityDatabaseClient, OpenChargeMapClient, OpenRouteServiceClient,
};
pub use wiki_clients::{WikidataClient, WikidataEntity, WikipediaClient};
pub use coherence::{
    CoherenceBoundary, CoherenceConfig, CoherenceEngine, CoherenceEvent, CoherenceSignal,
};
pub use cut_aware_hnsw::{
    CutAwareHNSW, CutAwareConfig, CutAwareMetrics, CoherenceZone,
    SearchResult as CutAwareSearchResult, EdgeUpdate as CutAwareEdgeUpdate, UpdateKind,
    LayerCutStats,
};
pub use discovery::{
    DiscoveryConfig, DiscoveryEngine, DiscoveryPattern, PatternCategory, PatternStrength,
};
pub use dynamic_mincut::{
    CutGatedSearch, CutWatcherConfig, DynamicCutWatcher, DynamicMinCutError,
    EdgeUpdate as DynamicEdgeUpdate, EdgeUpdateType, EulerTourTree, HNSWGraph,
    LocalCut, LocalMinCutProcedure, WatcherStats,
};
pub use export::{
    export_all, export_coherence_csv, export_dot, export_graphml, export_patterns_csv,
    export_patterns_with_evidence_csv, ExportFilter,
};
pub use forecasting::{CoherenceForecaster, CrossDomainForecaster, Forecast, Trend};
pub use ingester::{DataIngester, IngestionConfig, IngestionStats, SourceConfig};
pub use realtime::{FeedItem, FeedSource, NewsAggregator, NewsSource, RealTimeEngine};
pub use ruvector_native::{
    CoherenceHistoryEntry, CoherenceSnapshot, Domain, DiscoveredPattern,
    GraphExport, NativeDiscoveryEngine, NativeEngineConfig, SemanticVector,
};
pub use streaming::{StreamingConfig, StreamingEngine, StreamingEngineBuilder, StreamingMetrics};

/// Framework error types
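///
/// # Example
///
/// `reqwest` and `serde_json` errors convert automatically via `#[from]`, so
/// `?` works directly on network and serialization results; a minimal sketch
/// (the `fetch_json` function is hypothetical):
///
/// ```rust,ignore
/// async fn fetch_json(url: &str) -> Result<serde_json::Value> {
///     let body = reqwest::get(url).await?.text().await?; // reqwest::Error -> Network
///     Ok(serde_json::from_str(&body)?) // serde_json::Error -> Serialization
/// }
/// ```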
#[derive(Error, Debug)]
pub enum FrameworkError {
    /// Data ingestion failed
    #[error("Ingestion error: {0}")]
    Ingestion(String),

    /// Coherence computation failed
    #[error("Coherence error: {0}")]
    Coherence(String),

    /// Discovery algorithm failed
    #[error("Discovery error: {0}")]
    Discovery(String),

    /// Network/API error
    #[error("Network error: {0}")]
    Network(#[from] reqwest::Error),

    /// Serialization error
    #[error("Serialization error: {0}")]
    Serialization(#[from] serde_json::Error),

    /// Graph operation failed
    #[error("Graph error: {0}")]
    Graph(String),

    /// Configuration error
    #[error("Config error: {0}")]
    Config(String),
}

/// Result type for framework operations
pub type Result<T> = std::result::Result<T, FrameworkError>;

/// A timestamped data record from any source
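///
/// # Example
///
/// A minimal record with no embedding or relationships (the id shown is
/// illustrative):
///
/// ```rust,ignore
/// let record = DataRecord {
///     id: "openalex:W0000000000".to_string(),
///     source: "openalex".to_string(),
///     record_type: "work".to_string(),
///     timestamp: chrono::Utc::now(),
///     data: serde_json::json!({ "title": "An example work" }),
///     embedding: None,
///     relationships: vec![],
/// };
/// ```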
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataRecord {
    /// Unique identifier
    pub id: String,

    /// Source dataset (e.g., "openalex", "noaa", "edgar")
    pub source: String,

    /// Record type within source (e.g., "work", "author", "filing")
    pub record_type: String,

    /// Timestamp when data was observed/published
    pub timestamp: DateTime<Utc>,

    /// Raw data payload
    pub data: serde_json::Value,

    /// Pre-computed embedding vector (optional)
    pub embedding: Option<Vec<f32>>,

    /// Relationships to other records
    pub relationships: Vec<Relationship>,
}

/// A relationship between two records
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Relationship {
    /// Target record ID
    pub target_id: String,

    /// Relationship type (e.g., "cites", "authored_by", "filed_by")
    pub rel_type: String,

    /// Relationship weight/strength
    pub weight: f64,

    /// Additional properties
    pub properties: HashMap<String, serde_json::Value>,
}

/// Trait for data sources that can be ingested
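///
/// # Example
///
/// A minimal in-memory source, as a sketch; the `MemorySource` type is
/// hypothetical and exists only for illustration:
///
/// ```rust,ignore
/// struct MemorySource {
///     records: Vec<DataRecord>,
/// }
///
/// #[async_trait]
/// impl DataSource for MemorySource {
///     fn source_id(&self) -> &str {
///         "memory"
///     }
///
///     async fn fetch_batch(
///         &self,
///         cursor: Option<String>,
///         batch_size: usize,
///     ) -> Result<(Vec<DataRecord>, Option<String>)> {
///         // Interpret the cursor as a numeric offset into the record list.
///         let offset: usize = cursor.as_deref().map_or(0, |c| c.parse().unwrap_or(0));
///         let batch: Vec<DataRecord> =
///             self.records.iter().skip(offset).take(batch_size).cloned().collect();
///         // Return the next cursor only while records remain.
///         let next = (offset + batch.len() < self.records.len())
///             .then(|| (offset + batch.len()).to_string());
///         Ok((batch, next))
///     }
///
///     async fn total_count(&self) -> Result<Option<u64>> {
///         Ok(Some(self.records.len() as u64))
///     }
///
///     async fn health_check(&self) -> Result<bool> {
///         Ok(true)
///     }
/// }
/// ```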
#[async_trait]
pub trait DataSource: Send + Sync {
    /// Source identifier
    fn source_id(&self) -> &str;

    /// Fetch a batch of records starting from cursor
    async fn fetch_batch(
        &self,
        cursor: Option<String>,
        batch_size: usize,
    ) -> Result<(Vec<DataRecord>, Option<String>)>;

    /// Get total record count (if known)
    async fn total_count(&self) -> Result<Option<u64>>;

    /// Check if source is available
    async fn health_check(&self) -> Result<bool>;
}

/// Trait for computing embeddings from records
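///
/// # Example
///
/// A sketch of a provider that defers batch work to `embed_record`; the
/// `ZeroEmbedder` type is hypothetical and returns zero vectors:
///
/// ```rust,ignore
/// struct ZeroEmbedder {
///     dim: usize,
/// }
///
/// #[async_trait]
/// impl EmbeddingProvider for ZeroEmbedder {
///     async fn embed_record(&self, _record: &DataRecord) -> Result<Vec<f32>> {
///         // A real provider would encode the record's content.
///         Ok(vec![0.0; self.dim])
///     }
///
///     async fn embed_batch(&self, records: &[DataRecord]) -> Result<Vec<Vec<f32>>> {
///         let mut out = Vec::with_capacity(records.len());
///         for record in records {
///             out.push(self.embed_record(record).await?);
///         }
///         Ok(out)
///     }
///
///     fn dimension(&self) -> usize {
///         self.dim
///     }
/// }
/// ```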
#[async_trait]
pub trait EmbeddingProvider: Send + Sync {
    /// Compute embedding for a single record
    async fn embed_record(&self, record: &DataRecord) -> Result<Vec<f32>>;

    /// Compute embeddings for a batch of records
    async fn embed_batch(&self, records: &[DataRecord]) -> Result<Vec<Vec<f32>>>;

    /// Embedding dimension
    fn dimension(&self) -> usize;
}

/// Trait for graph building from records
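///
/// # Example
///
/// A sketch of driving any `GraphBuilder` from a slice of records: nodes
/// first, then one edge per relationship whose target is present (the
/// `build_graph` helper is hypothetical):
///
/// ```rust,ignore
/// use std::collections::HashMap;
///
/// fn build_graph<B: GraphBuilder>(builder: &mut B, records: &[DataRecord]) -> Result<()> {
///     // First pass: add nodes and remember each record's assigned node id.
///     let mut node_ids: HashMap<String, u64> = HashMap::new();
///     for record in records {
///         let node = builder.add_node(record)?;
///         node_ids.insert(record.id.clone(), node);
///     }
///     // Second pass: add edges for relationships between known records.
///     for record in records {
///         let source = node_ids[&record.id];
///         for rel in &record.relationships {
///             if let Some(&target) = node_ids.get(&rel.target_id) {
///                 builder.add_edge(source, target, rel.weight, &rel.rel_type)?;
///             }
///         }
///     }
///     Ok(())
/// }
/// ```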
pub trait GraphBuilder: Send + Sync {
    /// Add a node from a data record
    fn add_node(&mut self, record: &DataRecord) -> Result<u64>;

    /// Add an edge between nodes
    fn add_edge(&mut self, source: u64, target: u64, weight: f64, rel_type: &str) -> Result<()>;

    /// Get node count
    fn node_count(&self) -> usize;

    /// Get edge count
    fn edge_count(&self) -> usize;
}

/// Temporal window for time-series analysis
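///
/// # Example
///
/// Building consecutive hourly windows, as a sketch:
///
/// ```rust,ignore
/// use chrono::{Duration, Utc};
///
/// let start = Utc::now();
/// let windows: Vec<TemporalWindow> = (0..24)
///     .map(|i| {
///         let ws = start + Duration::hours(i);
///         TemporalWindow::new(ws, ws + Duration::hours(1), i as u64)
///     })
///     .collect();
///
/// assert!(windows[0].contains(start));
/// assert_eq!(windows[0].duration_secs(), 3600);
/// ```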
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct TemporalWindow {
    /// Window start
    pub start: DateTime<Utc>,

    /// Window end
    pub end: DateTime<Utc>,

    /// Window identifier (for sliding windows)
    pub window_id: u64,
}

impl TemporalWindow {
    /// Create a new temporal window
    pub fn new(start: DateTime<Utc>, end: DateTime<Utc>, window_id: u64) -> Self {
        Self {
            start,
            end,
            window_id,
        }
    }

    /// Duration in seconds
    pub fn duration_secs(&self) -> i64 {
        (self.end - self.start).num_seconds()
    }

    /// Check if a timestamp falls within the window (half-open: `start` is
    /// included, `end` is excluded)
    pub fn contains(&self, timestamp: DateTime<Utc>) -> bool {
        timestamp >= self.start && timestamp < self.end
    }
}

/// Statistics for a discovery session
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DiscoveryStats {
    /// Records processed
    pub records_processed: u64,

    /// Nodes created in the graph
    pub nodes_created: u64,

    /// Edges created in the graph
    pub edges_created: u64,

    /// Coherence signals computed
    pub signals_computed: u64,

    /// Patterns discovered
    pub patterns_discovered: u64,

    /// Processing duration in milliseconds
    pub duration_ms: u64,

    /// Peak memory usage in bytes
    pub peak_memory_bytes: u64,
}

/// Configuration for the entire discovery pipeline
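///
/// # Example
///
/// Overriding individual fields on top of the defaults:
///
/// ```rust,ignore
/// let config = PipelineConfig {
///     parallel: false,
///     checkpoint_interval: 1_000,
///     output_dir: "./results".to_string(),
///     ..PipelineConfig::default()
/// };
/// ```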
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineConfig {
    /// Ingestion configuration
    pub ingestion: IngestionConfig,

    /// Coherence engine configuration
    pub coherence: CoherenceConfig,

    /// Discovery engine configuration
    pub discovery: DiscoveryConfig,

    /// Enable parallel processing
    pub parallel: bool,

    /// Checkpoint interval (records)
    pub checkpoint_interval: u64,

    /// Output directory for results
    pub output_dir: String,
}

impl Default for PipelineConfig {
    fn default() -> Self {
        Self {
            ingestion: IngestionConfig::default(),
            coherence: CoherenceConfig::default(),
            discovery: DiscoveryConfig::default(),
            parallel: true,
            checkpoint_interval: 10_000,
            output_dir: "./discovery_output".to_string(),
        }
    }
}

/// Main discovery pipeline orchestrator
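///
/// # Example
///
/// End-to-end usage against any `DataSource` implementation, as a sketch
/// (`source` stands in for a concrete source):
///
/// ```rust,ignore
/// let mut pipeline = DiscoveryPipeline::new(PipelineConfig::default());
/// let patterns = pipeline.run(source).await?;
///
/// let stats = pipeline.stats();
/// println!(
///     "{} patterns from {} records in {}ms",
///     patterns.len(),
///     stats.records_processed,
///     stats.duration_ms,
/// );
/// ```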
pub struct DiscoveryPipeline {
    config: PipelineConfig,
    ingester: DataIngester,
    coherence: CoherenceEngine,
    discovery: DiscoveryEngine,
    stats: Arc<std::sync::RwLock<DiscoveryStats>>,
}

impl DiscoveryPipeline {
    /// Create a new discovery pipeline
    pub fn new(config: PipelineConfig) -> Self {
        let ingester = DataIngester::new(config.ingestion.clone());
        let coherence = CoherenceEngine::new(config.coherence.clone());
        let discovery = DiscoveryEngine::new(config.discovery.clone());

        Self {
            config,
            ingester,
            coherence,
            discovery,
            stats: Arc::new(std::sync::RwLock::new(DiscoveryStats::default())),
        }
    }

    /// Run the discovery pipeline on a data source
    pub async fn run<S: DataSource>(&mut self, source: S) -> Result<Vec<DiscoveryPattern>> {
        let start_time = std::time::Instant::now();

        // Phase 1: Ingest data
        tracing::info!("Starting ingestion from source: {}", source.source_id());
        let records = self.ingester.ingest_all(&source).await?;

        {
            let mut stats = self.stats.write().unwrap();
            stats.records_processed = records.len() as u64;
        }

        // Phase 2: Build graph and compute coherence
        tracing::info!("Computing coherence signals over {} records", records.len());
        let signals = self.coherence.compute_from_records(&records)?;

        {
            let mut stats = self.stats.write().unwrap();
            stats.signals_computed = signals.len() as u64;
            stats.nodes_created = self.coherence.node_count() as u64;
            stats.edges_created = self.coherence.edge_count() as u64;
        }

        // Phase 3: Detect patterns
        tracing::info!("Detecting discovery patterns");
        let patterns = self.discovery.detect(&signals)?;

        {
            let mut stats = self.stats.write().unwrap();
            stats.patterns_discovered = patterns.len() as u64;
            stats.duration_ms = start_time.elapsed().as_millis() as u64;
        }

        tracing::info!(
            "Discovery complete: {} patterns found in {}ms",
            patterns.len(),
            start_time.elapsed().as_millis()
        );

        Ok(patterns)
    }

    /// Get current statistics
    pub fn stats(&self) -> DiscoveryStats {
        self.stats.read().unwrap().clone()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_temporal_window() {
        let start = Utc::now();
        let end = start + chrono::Duration::hours(1);
        let window = TemporalWindow::new(start, end, 1);

        assert_eq!(window.duration_secs(), 3600);
        assert!(window.contains(start + chrono::Duration::minutes(30)));
        assert!(!window.contains(start - chrono::Duration::minutes(1)));
        assert!(!window.contains(end + chrono::Duration::minutes(1)));
    }

    #[test]
    fn test_default_pipeline_config() {
        let config = PipelineConfig::default();
        assert!(config.parallel);
        assert_eq!(config.checkpoint_interval, 10_000);
    }

    #[test]
    fn test_data_record_serialization() {
        let record = DataRecord {
            id: "test-1".to_string(),
            source: "test".to_string(),
            record_type: "document".to_string(),
            timestamp: Utc::now(),
            data: serde_json::json!({"title": "Test"}),
            embedding: Some(vec![0.1, 0.2, 0.3]),
            relationships: vec![],
        };

        let json = serde_json::to_string(&record).unwrap();
        let parsed: DataRecord = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.id, record.id);
    }
}