1#![warn(missing_docs)]
34#![warn(clippy::all)]
35
36pub mod academic_clients;
37pub mod api_clients;
38pub mod arxiv_client;
39pub mod biorxiv_client;
40pub mod coherence;
41pub mod crossref_client;
42pub mod discovery;
43pub mod dynamic_mincut;
44pub mod economic_clients;
45pub mod export;
46pub mod finance_clients;
47pub mod forecasting;
48pub mod genomics_clients;
49pub mod geospatial_clients;
50pub mod government_clients;
51pub mod hnsw;
52pub mod cut_aware_hnsw;
53pub mod ingester;
54pub mod mcp_server;
55pub mod medical_clients;
56pub mod ml_clients;
57pub mod news_clients;
58pub mod optimized;
59pub mod patent_clients;
60pub mod persistence;
61pub mod physics_clients;
62pub mod realtime;
63pub mod ruvector_native;
64pub mod semantic_scholar;
65pub mod space_clients;
66pub mod streaming;
67pub mod transportation_clients;
68pub mod visualization;
69pub mod wiki_clients;
70
71use std::collections::HashMap;
72use std::sync::Arc;
73
74use async_trait::async_trait;
75use chrono::{DateTime, Utc};
76use serde::{Deserialize, Serialize};
77use thiserror::Error;
78
79pub use academic_clients::{CoreClient, EricClient, UnpaywallClient};
81pub use api_clients::{EdgarClient, NoaaClient, OpenAlexClient, SimpleEmbedder};
82pub use arxiv_client::ArxivClient;
83pub use biorxiv_client::{BiorxivClient, MedrxivClient};
84pub use crossref_client::CrossRefClient;
85pub use economic_clients::{AlphaVantageClient, FredClient, WorldBankClient};
86pub use finance_clients::{BlsClient, CoinGeckoClient, EcbClient, FinnhubClient, TwelveDataClient};
87pub use genomics_clients::{EnsemblClient, GwasClient, NcbiClient, UniProtClient};
88pub use geospatial_clients::{GeonamesClient, NominatimClient, OpenElevationClient, OverpassClient};
89pub use government_clients::{
90 CensusClient, DataGovClient, EuOpenDataClient, UkGovClient, UNDataClient,
91 WorldBankClient as WorldBankGovClient,
92};
93pub use medical_clients::{ClinicalTrialsClient, FdaClient, PubMedClient};
94pub use ml_clients::{
95 HuggingFaceClient, HuggingFaceDataset, HuggingFaceModel, OllamaClient, OllamaModel,
96 PapersWithCodeClient, PaperWithCodeDataset, PaperWithCodePaper, ReplicateClient,
97 ReplicateModel, TogetherAiClient, TogetherModel,
98};
99pub use news_clients::{GuardianClient, HackerNewsClient, NewsDataClient, RedditClient};
100pub use patent_clients::{EpoClient, UsptoPatentClient};
101pub use physics_clients::{ArgoClient, CernOpenDataClient, GeoUtils, MaterialsProjectClient, UsgsEarthquakeClient};
102pub use semantic_scholar::SemanticScholarClient;
103pub use space_clients::{AstronomyClient, ExoplanetClient, NasaClient, SpaceXClient};
104pub use transportation_clients::{GtfsClient, MobilityDatabaseClient, OpenChargeMapClient, OpenRouteServiceClient};
105pub use wiki_clients::{WikidataClient, WikidataEntity, WikipediaClient};
106pub use coherence::{
107 CoherenceBoundary, CoherenceConfig, CoherenceEngine, CoherenceEvent, CoherenceSignal,
108};
109pub use cut_aware_hnsw::{
110 CutAwareHNSW, CutAwareConfig, CutAwareMetrics, CoherenceZone,
111 SearchResult as CutAwareSearchResult, EdgeUpdate as CutAwareEdgeUpdate, UpdateKind, LayerCutStats,
112};
113pub use discovery::{
114 DiscoveryConfig, DiscoveryEngine, DiscoveryPattern, PatternCategory, PatternStrength,
115};
116pub use dynamic_mincut::{
117 CutGatedSearch, CutWatcherConfig, DynamicCutWatcher, DynamicMinCutError,
118 EdgeUpdate as DynamicEdgeUpdate, EdgeUpdateType, EulerTourTree, HNSWGraph,
119 LocalCut, LocalMinCutProcedure, WatcherStats,
120};
121pub use export::{
122 export_all, export_coherence_csv, export_dot, export_graphml, export_patterns_csv,
123 export_patterns_with_evidence_csv, ExportFilter,
124};
125pub use forecasting::{CoherenceForecaster, CrossDomainForecaster, Forecast, Trend};
126pub use ingester::{DataIngester, IngestionConfig, IngestionStats, SourceConfig};
127pub use realtime::{FeedItem, FeedSource, NewsAggregator, NewsSource, RealTimeEngine};
128pub use ruvector_native::{
129 CoherenceHistoryEntry, CoherenceSnapshot, Domain, DiscoveredPattern,
130 GraphExport, NativeDiscoveryEngine, NativeEngineConfig, SemanticVector,
131};
132pub use streaming::{StreamingConfig, StreamingEngine, StreamingEngineBuilder, StreamingMetrics};
133
/// Error type shared by the framework; it is the error half of the crate-level
/// `Result` alias.
///
/// The `String`-carrying variants hold a human-readable message, while the
/// `Network` and `Serialization` variants wrap the underlying library error and
/// can be produced with `?` thanks to their `#[from]` conversions.
#[derive(Error, Debug)]
pub enum FrameworkError {
    /// Failure reported during data ingestion, with a descriptive message.
    #[error("Ingestion error: {0}")]
    Ingestion(String),

    /// Failure reported during coherence computation, with a descriptive message.
    #[error("Coherence error: {0}")]
    Coherence(String),

    /// Failure reported during pattern discovery, with a descriptive message.
    #[error("Discovery error: {0}")]
    Discovery(String),

    /// Network failure, converted automatically from `reqwest::Error`.
    #[error("Network error: {0}")]
    Network(#[from] reqwest::Error),

    /// JSON (de)serialization failure, converted automatically from
    /// `serde_json::Error`.
    #[error("Serialization error: {0}")]
    Serialization(#[from] serde_json::Error),

    /// Failure reported by a graph operation, with a descriptive message.
    #[error("Graph error: {0}")]
    Graph(String),

    /// Invalid or inconsistent configuration, with a descriptive message.
    #[error("Config error: {0}")]
    Config(String),
}
165
/// Convenience alias for results whose error type is [`FrameworkError`].
///
/// Note that inside this crate the alias shadows the prelude's two-parameter
/// `Result`; spell out `std::result::Result` when a different error type is needed.
pub type Result<T> = std::result::Result<T, FrameworkError>;
168
/// A single record flowing through the framework, serializable with serde.
///
/// Records are what a [`DataSource`] yields and what an [`EmbeddingProvider`]
/// and a [`GraphBuilder`] consume.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataRecord {
    /// Identifier of the record (presumably unique within its source; confirm
    /// against the producing client).
    pub id: String,

    /// Name of the source this record came from (presumably the value of
    /// `DataSource::source_id`; confirm against the ingester).
    pub source: String,

    /// Free-form label for the kind of record; the unit test in this file uses
    /// `"document"`.
    pub record_type: String,

    /// UTC timestamp attached to the record.
    pub timestamp: DateTime<Utc>,

    /// Arbitrary JSON payload of the record.
    pub data: serde_json::Value,

    /// Optional embedding vector; `None` presumably means the record has not
    /// been embedded yet.
    pub embedding: Option<Vec<f32>>,

    /// Outgoing relationships from this record to other records.
    pub relationships: Vec<Relationship>,
}
193
/// A typed, weighted link from a [`DataRecord`] to another record.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Relationship {
    /// Identifier of the record this relationship points to (presumably a
    /// `DataRecord::id`; confirm against the producers).
    pub target_id: String,

    /// Free-form label for the kind of relationship.
    pub rel_type: String,

    /// Weight of the relationship. The valid range is not fixed by this type.
    pub weight: f64,

    /// Additional JSON-valued properties, keyed by property name.
    pub properties: HashMap<String, serde_json::Value>,
}
209
/// An asynchronous producer of [`DataRecord`]s that can be read in batches.
///
/// Implementors must be `Send + Sync`. `DiscoveryPipeline::run` accepts any
/// implementor and hands it to the ingester.
#[async_trait]
pub trait DataSource: Send + Sync {
    /// Returns the identifier of this source; the pipeline logs it when
    /// ingestion starts.
    fn source_id(&self) -> &str;

    /// Fetches one batch of records.
    ///
    /// `cursor` is an optional position token and `batch_size` the requested
    /// number of records. The returned tuple holds the fetched records and an
    /// optional string, presumably the cursor for the following batch with
    /// `None` signalling exhaustion (TODO confirm against implementors).
    ///
    /// # Errors
    ///
    /// Returns a [`FrameworkError`] when the batch cannot be fetched.
    async fn fetch_batch(
        &self,
        cursor: Option<String>,
        batch_size: usize,
    ) -> Result<(Vec<DataRecord>, Option<String>)>;

    /// Returns the total number of records, or `None` (presumably when the
    /// source cannot tell in advance).
    ///
    /// # Errors
    ///
    /// Returns a [`FrameworkError`] when the count cannot be obtained.
    async fn total_count(&self) -> Result<Option<u64>>;

    /// Reports whether the source is healthy.
    ///
    /// # Errors
    ///
    /// Returns a [`FrameworkError`] when the check itself fails.
    async fn health_check(&self) -> Result<bool>;
}
229
/// Turns [`DataRecord`]s into `f32` embedding vectors.
///
/// Implementors must be `Send + Sync`.
#[async_trait]
pub trait EmbeddingProvider: Send + Sync {
    /// Computes the embedding of a single record.
    ///
    /// # Errors
    ///
    /// Returns a [`FrameworkError`] when the record cannot be embedded.
    async fn embed_record(&self, record: &DataRecord) -> Result<Vec<f32>>;

    /// Computes embeddings for a slice of records.
    ///
    /// The result presumably holds one vector per input record, in input order
    /// (TODO confirm against implementors).
    ///
    /// # Errors
    ///
    /// Returns a [`FrameworkError`] when the batch cannot be embedded.
    async fn embed_batch(&self, records: &[DataRecord]) -> Result<Vec<Vec<f32>>>;

    /// Returns the embedding dimension (presumably the length of every vector
    /// this provider produces).
    fn dimension(&self) -> usize;
}
242
/// Incrementally builds a graph out of [`DataRecord`]s.
///
/// Implementors must be `Send + Sync`.
pub trait GraphBuilder: Send + Sync {
    /// Adds a node for `record` and returns a `u64` for it, presumably the
    /// node id that `add_edge` expects (TODO confirm against implementors).
    ///
    /// # Errors
    ///
    /// Returns a [`FrameworkError`] when the node cannot be added.
    fn add_node(&mut self, record: &DataRecord) -> Result<u64>;

    /// Adds an edge from `source` to `target` with the given `weight` and
    /// relationship label `rel_type`.
    ///
    /// # Errors
    ///
    /// Returns a [`FrameworkError`] when the edge cannot be added.
    fn add_edge(&mut self, source: u64, target: u64, weight: f64, rel_type: &str) -> Result<()>;

    /// Returns the number of nodes in the graph.
    fn node_count(&self) -> usize;

    /// Returns the number of edges in the graph.
    fn edge_count(&self) -> usize;
}
257
/// A half-open UTC time interval `[start, end)` tagged with a numeric id.
///
/// The type is `Copy` and serializable with serde.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct TemporalWindow {
    /// Start of the window; `contains` treats it as inclusive.
    pub start: DateTime<Utc>,

    /// End of the window; `contains` treats it as exclusive.
    pub end: DateTime<Utc>,

    /// Caller-supplied identifier of the window.
    pub window_id: u64,
}
270
271impl TemporalWindow {
272 pub fn new(start: DateTime<Utc>, end: DateTime<Utc>, window_id: u64) -> Self {
274 Self {
275 start,
276 end,
277 window_id,
278 }
279 }
280
281 pub fn duration_secs(&self) -> i64 {
283 (self.end - self.start).num_seconds()
284 }
285
286 pub fn contains(&self, timestamp: DateTime<Utc>) -> bool {
288 timestamp >= self.start && timestamp < self.end
289 }
290}
291
/// Counters describing a pipeline run, as reported by `DiscoveryPipeline::stats`.
///
/// All counters start at zero (`Default`).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DiscoveryStats {
    /// Number of records returned by ingestion.
    pub records_processed: u64,

    /// Node count reported by the coherence engine after signal computation.
    pub nodes_created: u64,

    /// Edge count reported by the coherence engine after signal computation.
    pub edges_created: u64,

    /// Number of coherence signals computed.
    pub signals_computed: u64,

    /// Number of patterns returned by pattern detection.
    pub patterns_discovered: u64,

    /// Wall-clock duration of the run in milliseconds.
    pub duration_ms: u64,

    /// Peak memory usage in bytes. `DiscoveryPipeline::run` in this file never
    /// writes this field, so it stays at its default unless set elsewhere.
    pub peak_memory_bytes: u64,
}
316
/// Configuration of a `DiscoveryPipeline`, bundling the per-stage settings
/// with a few pipeline-wide options.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineConfig {
    /// Settings handed to the `DataIngester`.
    pub ingestion: IngestionConfig,

    /// Settings handed to the `CoherenceEngine`.
    pub coherence: CoherenceConfig,

    /// Settings handed to the `DiscoveryEngine`.
    pub discovery: DiscoveryConfig,

    /// Parallel-processing switch; defaults to `true`. It is not read anywhere
    /// in this file.
    pub parallel: bool,

    /// Checkpoint interval; defaults to `10_000`. The unit is not fixed in this
    /// file (presumably records; TODO confirm).
    pub checkpoint_interval: u64,

    /// Output directory path; defaults to `"./discovery_output"`.
    pub output_dir: String,
}
338
339impl Default for PipelineConfig {
340 fn default() -> Self {
341 Self {
342 ingestion: IngestionConfig::default(),
343 coherence: CoherenceConfig::default(),
344 discovery: DiscoveryConfig::default(),
345 parallel: true,
346 checkpoint_interval: 10_000,
347 output_dir: "./discovery_output".to_string(),
348 }
349 }
350}
351
/// End-to-end discovery pipeline: ingestion, then coherence computation, then
/// pattern detection.
///
/// Build one with `DiscoveryPipeline::new`, execute it with `run`, and read
/// the collected counters with `stats`.
pub struct DiscoveryPipeline {
    /// Configuration the pipeline was built from.
    config: PipelineConfig,
    /// Ingestion stage, built from `config.ingestion`.
    ingester: DataIngester,
    /// Coherence stage, built from `config.coherence`.
    coherence: CoherenceEngine,
    /// Pattern-detection stage, built from `config.discovery`.
    discovery: DiscoveryEngine,
    /// Run counters behind a read/write lock; `stats` hands out clones.
    stats: Arc<std::sync::RwLock<DiscoveryStats>>,
}
360
361impl DiscoveryPipeline {
362 pub fn new(config: PipelineConfig) -> Self {
364 let ingester = DataIngester::new(config.ingestion.clone());
365 let coherence = CoherenceEngine::new(config.coherence.clone());
366 let discovery = DiscoveryEngine::new(config.discovery.clone());
367
368 Self {
369 config,
370 ingester,
371 coherence,
372 discovery,
373 stats: Arc::new(std::sync::RwLock::new(DiscoveryStats::default())),
374 }
375 }
376
377 pub async fn run<S: DataSource>(&mut self, source: S) -> Result<Vec<DiscoveryPattern>> {
379 let start_time = std::time::Instant::now();
380
381 tracing::info!("Starting ingestion from source: {}", source.source_id());
383 let records = self.ingester.ingest_all(&source).await?;
384
385 {
386 let mut stats = self.stats.write().unwrap();
387 stats.records_processed = records.len() as u64;
388 }
389
390 tracing::info!("Computing coherence signals over {} records", records.len());
392 let signals = self.coherence.compute_from_records(&records)?;
393
394 {
395 let mut stats = self.stats.write().unwrap();
396 stats.signals_computed = signals.len() as u64;
397 stats.nodes_created = self.coherence.node_count() as u64;
398 stats.edges_created = self.coherence.edge_count() as u64;
399 }
400
401 tracing::info!("Detecting discovery patterns");
403 let patterns = self.discovery.detect(&signals)?;
404
405 {
406 let mut stats = self.stats.write().unwrap();
407 stats.patterns_discovered = patterns.len() as u64;
408 stats.duration_ms = start_time.elapsed().as_millis() as u64;
409 }
410
411 tracing::info!(
412 "Discovery complete: {} patterns found in {}ms",
413 patterns.len(),
414 start_time.elapsed().as_millis()
415 );
416
417 Ok(patterns)
418 }
419
420 pub fn stats(&self) -> DiscoveryStats {
422 self.stats.read().unwrap().clone()
423 }
424}
425
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the duration, the interior/exterior membership and the half-open
    /// boundary behaviour of `TemporalWindow`.
    #[test]
    fn test_temporal_window() {
        let start = Utc::now();
        let end = start + chrono::Duration::hours(1);
        let window = TemporalWindow::new(start, end, 1);

        assert_eq!(window.window_id, 1);
        assert_eq!(window.duration_secs(), 3600);
        assert!(window.contains(start + chrono::Duration::minutes(30)));
        assert!(!window.contains(start - chrono::Duration::minutes(1)));
        assert!(!window.contains(end + chrono::Duration::minutes(1)));

        // The interval is half-open: start is inside, end is outside.
        assert!(window.contains(start));
        assert!(!window.contains(end));

        // An empty window has zero length and contains nothing, not even its bound.
        let empty = TemporalWindow::new(start, start, 2);
        assert_eq!(empty.duration_secs(), 0);
        assert!(!empty.contains(start));
    }

    /// Pins the pipeline-wide defaults of `PipelineConfig`.
    #[test]
    fn test_default_pipeline_config() {
        let config = PipelineConfig::default();
        assert!(config.parallel);
        assert_eq!(config.checkpoint_interval, 10_000);
        assert_eq!(config.output_dir, "./discovery_output");
    }

    /// Round-trips a `DataRecord` through JSON and compares the fields that
    /// have an exact textual representation.
    #[test]
    fn test_data_record_serialization() {
        let record = DataRecord {
            id: "test-1".to_string(),
            source: "test".to_string(),
            record_type: "document".to_string(),
            timestamp: Utc::now(),
            data: serde_json::json!({"title": "Test"}),
            embedding: Some(vec![0.1, 0.2, 0.3]),
            relationships: vec![],
        };

        let json = serde_json::to_string(&record).unwrap();
        let parsed: DataRecord = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.id, record.id);
        assert_eq!(parsed.source, record.source);
        assert_eq!(parsed.record_type, record.record_type);
        assert_eq!(parsed.data, record.data);
        assert!(parsed.relationships.is_empty());
    }
}