//! Background learning daemon: receives action records, stores them as
//! episodes, and runs training/apply cycles when a trigger fires.

mod applier;
mod processor;
mod sink;
mod subscriber;

pub use applier::{Applier, ApplierConfig, ApplierError, ApplyMode, ApplyResult};
pub use processor::{ProcessResult, Processor, ProcessorConfig, ProcessorError, ProcessorMode};
pub use sink::{DataSink, DataSinkError, DataSinkStats};
pub use subscriber::{ActionEventSubscriber, EventSubscriberConfig, LearningEventSubscriber};

use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;

use tokio::sync::mpsc;
use tokio::time::interval;

use crate::learn::learn_model::{LearnModel, WorkerDecisionSequenceLearn};
use crate::learn::lora::{
    LoraTrainer, LoraTrainerConfig, ModelApplicator, NoOpApplicator, TrainedModel,
};
use crate::learn::record::{LearnStatsRecord, Record};
use crate::learn::snapshot::LearningStore;
use crate::learn::store::{
    EpisodeStore, FileEpisodeStore, FileRecordStore, InMemoryEpisodeStore, InMemoryRecordStore,
    RecordStore, RecordStoreError, StoreError,
};
use crate::learn::trigger::{TrainTrigger, TriggerBuilder, TriggerContext};
use crate::learn::LearnStats;
use crate::util::epoch_millis;
/// Errors that can occur while running the learning daemon.
#[derive(Debug)]
pub enum DaemonError {
    /// Error from the data sink.
    Sink(DataSinkError),
    /// Error from the processor.
    Processor(ProcessorError),
    /// Error from the applier.
    Applier(ApplierError),
    /// I/O error.
    Io(std::io::Error),
    /// Configuration error.
    Config(String),
    /// The daemon was shut down.
    Shutdown,
}

impl std::fmt::Display for DaemonError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Sink(e) => write!(f, "Sink error: {}", e),
            Self::Processor(e) => write!(f, "Processor error: {}", e),
            Self::Applier(e) => write!(f, "Applier error: {}", e),
            Self::Io(e) => write!(f, "IO error: {}", e),
            Self::Config(msg) => write!(f, "Config error: {}", msg),
            Self::Shutdown => write!(f, "Daemon shutdown"),
        }
    }
}

impl std::error::Error for DaemonError {}

impl From<DataSinkError> for DaemonError {
    fn from(e: DataSinkError) -> Self {
        Self::Sink(e)
    }
}

impl From<ProcessorError> for DaemonError {
    fn from(e: ProcessorError) -> Self {
        Self::Processor(e)
    }
}

impl From<ApplierError> for DaemonError {
    fn from(e: ApplierError) -> Self {
        Self::Applier(e)
    }
}

impl From<std::io::Error> for DaemonError {
    fn from(e: std::io::Error) -> Self {
        Self::Io(e)
    }
}

impl From<RecordStoreError> for DaemonError {
    fn from(e: RecordStoreError) -> Self {
        Self::Sink(DataSinkError::RecordStore(e))
    }
}

impl From<StoreError> for DaemonError {
    fn from(e: StoreError) -> Self {
        Self::Sink(DataSinkError::EpisodeStore(e))
    }
}

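/// Configuration for the learning daemon.
///
/// A minimal usage sketch (illustrative values, not defaults; see the
/// builder methods below):
///
/// ```ignore
/// let config = DaemonConfig::new("my-scenario")
///     .data_dir("/tmp/learning")
///     .check_interval(Duration::from_secs(30))
///     .max_sessions(50)
///     .auto_apply(true);
/// ```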
#[derive(Debug, Clone)]
pub struct DaemonConfig {
    /// Scenario name used to label training data and sessions.
    pub scenario: String,
    /// Directory where records, episodes, and snapshots are stored.
    pub data_dir: PathBuf,
    /// How often the daemon checks whether the trigger should fire.
    pub check_interval: Duration,
    /// Processing mode for training runs.
    pub processor_mode: ProcessorMode,
    /// Maximum number of sessions the processor considers per run.
    pub max_sessions: usize,
    /// Automatically apply trained models when training completes.
    pub auto_apply: bool,
    /// Optional LoRA training configuration; enables LoRA when set.
    pub lora_config: Option<LoraTrainerConfig>,
}

impl DaemonConfig {
    /// Create a config with defaults for the given scenario.
    pub fn new(scenario: impl Into<String>) -> Self {
        Self {
            scenario: scenario.into(),
            data_dir: default_data_dir(),
            check_interval: Duration::from_secs(10),
            processor_mode: ProcessorMode::OfflineOnly,
            max_sessions: 20,
            auto_apply: false,
            lora_config: None,
        }
    }

    /// Set the data directory.
    pub fn data_dir(mut self, path: impl Into<PathBuf>) -> Self {
        self.data_dir = path.into();
        self
    }

    /// Set the trigger check interval.
    pub fn check_interval(mut self, interval: Duration) -> Self {
        self.check_interval = interval;
        self
    }

    /// Set the processor mode.
    pub fn processor_mode(mut self, mode: ProcessorMode) -> Self {
        self.processor_mode = mode;
        self
    }

    /// Set the maximum number of sessions per training run.
    pub fn max_sessions(mut self, n: usize) -> Self {
        self.max_sessions = n;
        self
    }

    /// Enable or disable automatic model application.
    pub fn auto_apply(mut self, enabled: bool) -> Self {
        self.auto_apply = enabled;
        self
    }

    /// Enable LoRA training with the given configuration.
    pub fn with_lora(mut self, config: LoraTrainerConfig) -> Self {
        self.lora_config = Some(config);
        self
    }
}

/// Platform data directory plus `swarm-engine/learning`, falling back to
/// the current directory.
fn default_data_dir() -> PathBuf {
    dirs::data_dir()
        .unwrap_or_else(|| PathBuf::from("."))
        .join("swarm-engine")
        .join("learning")
}

/// Runtime statistics for the learning daemon.
#[derive(Debug, Clone, Default)]
pub struct DaemonStats {
    /// Total records received.
    pub records_received: usize,
    /// Total episodes created from those records.
    pub episodes_created: usize,
    /// Number of completed training runs.
    pub trainings_completed: usize,
    /// Number of trained models applied.
    pub models_applied: usize,
    /// Timestamp (epoch millis) of the last training run, if any.
    pub last_train_at: Option<u64>,
    /// Timestamp (epoch millis) when the daemon started.
    pub started_at: u64,
}

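/// Long-running daemon that ingests records, checks a training trigger on
/// an interval, and optionally applies trained models.
///
/// A construction sketch using the in-memory constructor (as exercised by
/// the tests below):
///
/// ```ignore
/// let config = DaemonConfig::new("my-scenario");
/// let mut daemon = LearningDaemon::new(config, TriggerBuilder::never())?;
/// let sender = daemon.record_sender();
/// let shutdown = daemon.shutdown_sender();
/// tokio::spawn(async move { daemon.run().await });
/// ```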
pub struct LearningDaemon {
    config: DaemonConfig,
    sink: DataSink,
    trigger: Arc<dyn TrainTrigger>,
    processor: Processor,
    applier: Option<Applier>,
    learning_store: LearningStore,
    stats: DaemonStats,
    last_train_count: usize,
    record_rx: mpsc::Receiver<Vec<Record>>,
    record_tx: mpsc::Sender<Vec<Record>>,
    shutdown_rx: mpsc::Receiver<()>,
    shutdown_tx: mpsc::Sender<()>,
}

impl LearningDaemon {
    /// Create a daemon with in-memory stores (useful for tests and demos).
    pub fn new(config: DaemonConfig, trigger: Arc<dyn TrainTrigger>) -> Result<Self, DaemonError> {
        let record_store: Arc<dyn RecordStore> = Arc::new(InMemoryRecordStore::new());
        let episode_store: Arc<dyn EpisodeStore> = Arc::new(InMemoryEpisodeStore::new());
        let learn_model: Arc<dyn LearnModel> = Arc::new(WorkerDecisionSequenceLearn::new());

        Self::with_stores(config, trigger, record_store, episode_store, learn_model)
    }

    /// Create a daemon with file-backed stores under `config.data_dir`.
    pub fn with_file_stores(
        config: DaemonConfig,
        trigger: Arc<dyn TrainTrigger>,
    ) -> Result<Self, DaemonError> {
        std::fs::create_dir_all(&config.data_dir)?;

        let record_store: Arc<dyn RecordStore> =
            Arc::new(FileRecordStore::new(config.data_dir.join("records"))?);
        let episode_store: Arc<dyn EpisodeStore> =
            Arc::new(FileEpisodeStore::new(config.data_dir.join("episodes"))?);
        let learn_model: Arc<dyn LearnModel> = Arc::new(WorkerDecisionSequenceLearn::new());

        Self::with_stores(config, trigger, record_store, episode_store, learn_model)
    }

    /// Create a daemon with caller-supplied stores and learn model.
    pub fn with_stores(
        config: DaemonConfig,
        trigger: Arc<dyn TrainTrigger>,
        record_store: Arc<dyn RecordStore>,
        episode_store: Arc<dyn EpisodeStore>,
        learn_model: Arc<dyn LearnModel>,
    ) -> Result<Self, DaemonError> {
        let sink = DataSink::new(
            record_store,
            Arc::clone(&episode_store),
            Arc::clone(&learn_model),
        );

        let processor_config = ProcessorConfig::new(&config.scenario)
            .mode(config.processor_mode)
            .max_sessions(config.max_sessions);

        let mut processor = Processor::new(processor_config);

        // The daemon and the processor each hold their own store handle.
        let learning_store = LearningStore::new(&config.data_dir)?;
        let learning_store_for_processor = LearningStore::new(&config.data_dir)?;
        processor = processor.with_learning_store(learning_store_for_processor);

        if let Some(lora_config) = &config.lora_config {
            let trainer = LoraTrainer::new(lora_config.clone(), episode_store);
            processor = processor
                .with_lora_trainer(trainer)
                .with_learn_model(learn_model);
        }

        let applier = if config.auto_apply {
            let applier_config = ApplierConfig::default().auto_apply();
            let applicator: Arc<dyn ModelApplicator> = Arc::new(NoOpApplicator::new());
            Some(Applier::new(applier_config, applicator))
        } else {
            None
        };

        let (record_tx, record_rx) = mpsc::channel(1000);
        let (shutdown_tx, shutdown_rx) = mpsc::channel(1);

        Ok(Self {
            config,
            sink,
            trigger,
            processor,
            applier,
            learning_store,
            stats: DaemonStats {
                started_at: epoch_millis(),
                ..Default::default()
            },
            last_train_count: 0,
            record_rx,
            record_tx,
            shutdown_rx,
            shutdown_tx,
        })
    }

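    /// Get a sender for feeding record batches into the daemon's run loop.
    ///
    /// Sketch (the daemon must be running for the batch to be processed):
    ///
    /// ```ignore
    /// let sender = daemon.record_sender();
    /// sender.send(vec![record]).await?;
    /// ```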
    pub fn record_sender(&self) -> mpsc::Sender<Vec<Record>> {
        self.record_tx.clone()
    }

    /// Get a sender that can signal the daemon to shut down.
    pub fn shutdown_sender(&self) -> mpsc::Sender<()> {
        self.shutdown_tx.clone()
    }

    /// The daemon's configuration.
    pub fn config(&self) -> &DaemonConfig {
        &self.config
    }

    /// Current runtime statistics.
    pub fn stats(&self) -> &DaemonStats {
        &self.stats
    }

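    /// Run the daemon until a shutdown signal is received.
    ///
    /// The loop selects over three events: shutdown (drain and exit),
    /// incoming record batches, and the periodic trigger check.
    ///
    /// Shutdown sketch:
    ///
    /// ```ignore
    /// let shutdown = daemon.shutdown_sender();
    /// let handle = tokio::spawn(async move { daemon.run().await });
    /// shutdown.send(()).await?;
    /// handle.await??;
    /// ```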
    pub async fn run(&mut self) -> Result<(), DaemonError> {
        tracing::info!(
            scenario = %self.config.scenario,
            data_dir = %self.config.data_dir.display(),
            trigger = self.trigger.name(),
            "Learning daemon started"
        );

        let mut check_interval = interval(self.config.check_interval);

        loop {
            tokio::select! {
                _ = self.shutdown_rx.recv() => {
                    tracing::info!("Shutdown signal received, draining remaining records...");

                    // Brief grace period so in-flight sends land in the channel.
                    tokio::time::sleep(std::time::Duration::from_millis(100)).await;

                    while let Ok(records) = self.record_rx.try_recv() {
                        if let Err(e) = self.handle_records(records).await {
                            tracing::warn!("Error processing records during shutdown: {}", e);
                        }
                    }

                    tracing::info!(
                        records_received = self.stats.records_received,
                        episodes_created = self.stats.episodes_created,
                        "Shutdown complete"
                    );
                    return Ok(());
                }

                Some(records) = self.record_rx.recv() => {
                    self.handle_records(records).await?;
                }

                _ = check_interval.tick() => {
                    self.check_and_train().await?;
                }
            }
        }
    }

    async fn handle_records(&mut self, records: Vec<Record>) -> Result<(), DaemonError> {
        if records.is_empty() {
            return Ok(());
        }

        let count = records.len();

        // LearnStats records are additionally persisted as session snapshots.
        for record in &records {
            if let Record::LearnStats(stats_record) = record {
                self.save_stats_to_learning_store(stats_record);
            }
        }

        let episode_ids = self.sink.ingest(records)?;

        self.stats.records_received += count;
        self.stats.episodes_created += episode_ids.len();

        tracing::debug!(
            records = count,
            episodes = episode_ids.len(),
            "Processed records"
        );

        Ok(())
    }

    fn save_stats_to_learning_store(&self, stats_record: &LearnStatsRecord) {
        use crate::learn::snapshot::{LearningSnapshot, SnapshotMetadata, SNAPSHOT_VERSION};
        use crate::learn::{EpisodeTransitions, NgramStats, SelectionPerformance};
        use crate::online_stats::ActionStats;
        use std::collections::HashMap;

        let learn_stats: Option<LearnStats> = serde_json::from_str(&stats_record.stats_json).ok();

        let metadata = SnapshotMetadata {
            scenario_name: Some(stats_record.scenario.clone()),
            task_description: None,
            created_at: stats_record.timestamp_ms / 1000,
            session_count: 1,
            total_episodes: 1,
            total_actions: stats_record.total_actions as u32,
            phase: None,
            group_id: None,
        };

        let (
            episode_transitions,
            action_stats,
            ngram_stats,
            selection_performance,
            contextual_stats,
        ) = if let Some(ref stats) = learn_stats {
            let transitions = stats.episode_transitions.clone();
            let ngram = stats.ngram_stats.clone();
            let selection = stats.selection_performance.clone();

            let mut ctx_stats: HashMap<(String, String), ActionStats> = HashMap::new();
            for ((prev, action), ctx) in &stats.contextual_stats {
                ctx_stats.insert(
                    (prev.clone(), action.clone()),
                    ActionStats {
                        visits: ctx.visits,
                        successes: ctx.successes,
                        failures: ctx.failures,
                        ..Default::default()
                    },
                );
            }

            // Per-action aggregates are left empty here; only the contextual
            // stats are carried over from the record.
            let action_stats: HashMap<String, ActionStats> = HashMap::new();

            (transitions, action_stats, ngram, selection, ctx_stats)
        } else {
            (
                EpisodeTransitions::default(),
                HashMap::new(),
                NgramStats::default(),
                SelectionPerformance::default(),
                HashMap::new(),
            )
        };

        let snapshot = LearningSnapshot {
            version: SNAPSHOT_VERSION,
            metadata,
            episode_transitions,
            ngram_stats,
            selection_performance,
            contextual_stats,
            action_stats,
        };

        match self
            .learning_store
            .save_session(&stats_record.scenario, &snapshot)
        {
            Ok(session_id) => {
                tracing::info!(
                    scenario = %stats_record.scenario,
                    session_id = %session_id.0,
                    success = stats_record.is_success(),
                    "Saved session to LearningStore"
                );
            }
            Err(e) => {
                tracing::warn!(
                    scenario = %stats_record.scenario,
                    error = %e,
                    "Failed to save session to LearningStore"
                );
            }
        }
    }

    async fn check_and_train(&mut self) -> Result<(), DaemonError> {
        let current_count = self.sink.episode_count();
        let ctx = TriggerContext::with_count(current_count)
            .last_train_at(self.stats.last_train_at.unwrap_or(0))
            .last_train_count(self.last_train_count);

        if !self.trigger.should_train(&ctx).unwrap_or(false) {
            return Ok(());
        }

        tracing::info!(
            episode_count = current_count,
            trigger = self.trigger.name(),
            "Trigger fired, starting learning"
        );

        let result = self
            .processor
            .run(self.sink.episode_store().as_ref())
            .await?;

        self.stats.trainings_completed += 1;
        self.stats.last_train_at = Some(epoch_millis());
        self.last_train_count = current_count;

        // If auto-apply is enabled, hand any trained LoRA model to the applier.
        if let Some(applier) = &mut self.applier {
            if let Some(model) = result.lora_model() {
                let apply_result = applier.apply(model).await?;
                if apply_result.is_applied() {
                    self.stats.models_applied += 1;
                }
            }
        }

        tracing::info!(
            trainings = self.stats.trainings_completed,
            models_applied = self.stats.models_applied,
            "Learning cycle completed"
        );

        Ok(())
    }

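    /// Run a training cycle immediately, bypassing the trigger.
    ///
    /// Sketch:
    ///
    /// ```ignore
    /// let result = daemon.train_now().await?;
    /// ```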
    pub async fn train_now(&mut self) -> Result<ProcessResult, DaemonError> {
        tracing::info!("Manual training triggered");

        let result = self
            .processor
            .run(self.sink.episode_store().as_ref())
            .await?;

        self.stats.trainings_completed += 1;
        self.stats.last_train_at = Some(epoch_millis());
        self.last_train_count = self.sink.episode_count();

        Ok(result)
    }

    /// Apply a trained model via the configured applier.
    ///
    /// Returns a `Config` error if no applier was configured.
    pub async fn apply_model(&mut self, model: &TrainedModel) -> Result<ApplyResult, DaemonError> {
        let applier = self
            .applier
            .as_mut()
            .ok_or_else(|| DaemonError::Config("Applier not configured".into()))?;

        let result = applier.apply_now(model).await?;
        if result.is_applied() {
            self.stats.models_applied += 1;
        }

        Ok(result)
    }
}

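/// Fluent builder for [`LearningDaemon`].
///
/// A sketch mirroring the builder test below:
///
/// ```ignore
/// let daemon = DaemonBuilder::new("my-scenario")
///     .data_dir("/tmp/learning")
///     .trigger(Arc::new(AlwaysTrigger))
///     .processor_mode(ProcessorMode::OfflineOnly)
///     .build()?;
/// ```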
pub struct DaemonBuilder {
    config: DaemonConfig,
    trigger: Option<Arc<dyn TrainTrigger>>,
    record_store: Option<Arc<dyn RecordStore>>,
    episode_store: Option<Arc<dyn EpisodeStore>>,
    learn_model: Option<Arc<dyn LearnModel>>,
    applicator: Option<Arc<dyn ModelApplicator>>,
}

impl DaemonBuilder {
    /// Start building a daemon for the given scenario.
    pub fn new(scenario: impl Into<String>) -> Self {
        Self {
            config: DaemonConfig::new(scenario),
            trigger: None,
            record_store: None,
            episode_store: None,
            learn_model: None,
            applicator: None,
        }
    }

    /// Set the data directory.
    pub fn data_dir(mut self, path: impl Into<PathBuf>) -> Self {
        self.config.data_dir = path.into();
        self
    }

    /// Set the training trigger.
    pub fn trigger(mut self, trigger: Arc<dyn TrainTrigger>) -> Self {
        self.trigger = Some(trigger);
        self
    }

    /// Set the processor mode.
    pub fn processor_mode(mut self, mode: ProcessorMode) -> Self {
        self.config.processor_mode = mode;
        self
    }

    /// Enable automatic model application.
    pub fn auto_apply(mut self) -> Self {
        self.config.auto_apply = true;
        self
    }

    /// Use a custom record store.
    pub fn record_store(mut self, store: Arc<dyn RecordStore>) -> Self {
        self.record_store = Some(store);
        self
    }

    /// Use a custom episode store.
    pub fn episode_store(mut self, store: Arc<dyn EpisodeStore>) -> Self {
        self.episode_store = Some(store);
        self
    }

    /// Use a custom learn model.
    pub fn learn_model(mut self, model: Arc<dyn LearnModel>) -> Self {
        self.learn_model = Some(model);
        self
    }

    /// Use a custom model applicator.
    pub fn applicator(mut self, applicator: Arc<dyn ModelApplicator>) -> Self {
        self.applicator = Some(applicator);
        self
    }

    /// Enable LoRA training with the given configuration.
    pub fn with_lora(mut self, config: LoraTrainerConfig) -> Self {
        self.config.lora_config = Some(config);
        self
    }

    /// Build the daemon, filling unset components with in-memory defaults.
    pub fn build(self) -> Result<LearningDaemon, DaemonError> {
        let trigger = self
            .trigger
            .unwrap_or_else(|| TriggerBuilder::default_watch());

        let record_store = self
            .record_store
            .unwrap_or_else(|| Arc::new(InMemoryRecordStore::new()));

        let episode_store = self
            .episode_store
            .unwrap_or_else(|| Arc::new(InMemoryEpisodeStore::new()));

        let learn_model = self
            .learn_model
            .unwrap_or_else(|| Arc::new(WorkerDecisionSequenceLearn::new()));

        let mut daemon = LearningDaemon::with_stores(
            self.config,
            trigger,
            record_store,
            episode_store,
            learn_model,
        )?;

        // Honor a custom applicator. `with_stores` only ever installs a
        // NoOpApplicator, so without this override the builder's
        // `applicator()` setting would be silently dropped.
        if let Some(applicator) = self.applicator {
            let applier_config = if daemon.config.auto_apply {
                ApplierConfig::default().auto_apply()
            } else {
                ApplierConfig::default()
            };
            daemon.applier = Some(Applier::new(applier_config, applicator));
        }

        Ok(daemon)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::events::{ActionContext, ActionEventBuilder, ActionEventResult};
    use crate::learn::trigger::AlwaysTrigger;
    use crate::types::WorkerId;

    fn make_test_records(count: usize) -> Vec<Record> {
        (0..count)
            .map(|i| {
                let event = ActionEventBuilder::new(i as u64, WorkerId(0), &format!("Action{}", i))
                    .result(ActionEventResult::success())
                    .duration(std::time::Duration::from_millis(10))
                    .context(ActionContext::new())
                    .build();
                Record::from(&event)
            })
            .collect()
    }

    #[test]
    fn test_daemon_config_builder() {
        let config = DaemonConfig::new("test")
            .data_dir("/tmp/test")
            .check_interval(Duration::from_secs(30))
            .processor_mode(ProcessorMode::Full)
            .auto_apply(true);

        assert_eq!(config.scenario, "test");
        assert_eq!(config.data_dir, PathBuf::from("/tmp/test"));
        assert_eq!(config.check_interval, Duration::from_secs(30));
        assert_eq!(config.processor_mode, ProcessorMode::Full);
        assert!(config.auto_apply);
    }

    #[tokio::test]
    async fn test_daemon_creation() {
        let config = DaemonConfig::new("test");
        let trigger = TriggerBuilder::never();

        let daemon = LearningDaemon::new(config, trigger).unwrap();
        assert_eq!(daemon.config().scenario, "test");
        assert_eq!(daemon.stats().records_received, 0);
    }

    #[tokio::test]
    async fn test_daemon_record_ingestion() {
        let config = DaemonConfig::new("test");
        let trigger = TriggerBuilder::never();

        let mut daemon = LearningDaemon::new(config, trigger).unwrap();
        let sender = daemon.record_sender();

        // Queued records are only counted once the run loop picks them up;
        // this send just verifies the channel accepts a batch.
        let records = make_test_records(5);
        sender.send(records).await.unwrap();

        // Calling handle_records directly updates stats immediately.
        daemon.handle_records(make_test_records(3)).await.unwrap();

        assert_eq!(daemon.stats().records_received, 3);
    }

    #[tokio::test]
    async fn test_daemon_builder() {
        let daemon = DaemonBuilder::new("test-scenario")
            .data_dir("/tmp/test")
            .trigger(Arc::new(AlwaysTrigger))
            .processor_mode(ProcessorMode::OfflineOnly)
            .build()
            .unwrap();

        assert_eq!(daemon.config().scenario, "test-scenario");
    }
}