// lucid_core/visual.rs
1//! Visual Memory
2//!
3//! A visual memory system that models how humans remember images and video.
4//!
5//! ## Biological Basis
6//!
7//! Human visual memory operates differently than verbal memory:
8//!
9//! **Dual-Coding Theory** (Paivio, 1971)
10//! - Images and words are stored in separate but connected systems
11//! - Visual memories are often more durable than verbal ones
12//! - Emotional content enhances visual memory consolidation
13//!
14//! **Gist vs. Detail** (Brainerd & Reyna, 2005)
15//! - We remember the essence (gist) of images longer than details
16//! - Details fade faster but can be reinstated with cues
17//! - Emotional arousal preferentially preserves gist
18//!
19//! **Scene Gist** (Oliva, 2005)
20//! - We extract the "gist" of a scene in ~100ms
21//! - This informs what details we attend to and encode
22//! - Scene categories (indoor/outdoor, natural/urban) are processed first
23//!
24//! ## Key Concepts
25//!
26//! - **Significance**: How memorable/important the image is (0-1)
27//! - **Emotional Context**: Valence (-1 to 1) and arousal (0-1)
28//! - **Consolidation**: Visual memories strengthen over time
29//! - **Tagging**: Automatic categorization and importance scoring
30
31use serde::{Deserialize, Serialize};
32use smallvec::SmallVec;
33
34use crate::activation::{
35	combine_activations, compute_base_level, cosine_similarity_batch, nonlinear_activation_batch,
36	retrieval_probability,
37};
38use crate::spreading::{spread_activation, Association, SpreadingConfig, SpreadingResult};
39
40// ============================================================================
41// Source Types
42// ============================================================================
43
/// Where a visual memory originated.
///
/// Unit-only enum; serde's derive (de)serializes it by variant name.
/// [`VisualSource::Other`] is the [`Default`] and serves as the
/// catch-all for unknown provenance.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum VisualSource {
	/// Received via Discord (shared by someone)
	Discord,
	/// Received via SMS/iMessage
	Sms,
	/// Direct upload or screenshot
	Direct,
	/// Extracted frame from video
	VideoFrame,
	/// Other/unknown source (the default)
	#[default]
	Other,
}
59
60// ============================================================================
61// Emotional Context
62// ============================================================================
63
/// Emotional context of a visual memory.
///
/// Based on the circumplex model of affect (Russell, 1980):
/// - **Valence**: Pleasant (+1) to unpleasant (-1)
/// - **Arousal**: High activation (+1) to low activation (0)
///
/// Emotional arousal enhances memory consolidation.
///
/// [`EmotionalContext::new`] clamps both fields into their documented
/// ranges; the fields are public, so values constructed directly may
/// fall outside them.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct EmotionalContext {
	/// Pleasant (+1) to unpleasant (-1)
	pub valence: f64,
	/// High activation (1) to low activation (0)
	pub arousal: f64,
}
78
79impl Default for EmotionalContext {
80	fn default() -> Self {
81		Self {
82			valence: 0.0,
83			arousal: 0.5,
84		}
85	}
86}
87
88impl EmotionalContext {
89	/// Create a new emotional context.
90	#[must_use]
91	pub fn new(valence: f64, arousal: f64) -> Self {
92		Self {
93			valence: valence.clamp(-1.0, 1.0),
94			arousal: arousal.clamp(0.0, 1.0),
95		}
96	}
97
98	/// Compute emotional weight (0.5-1.5 multiplier for activation).
99	///
100	/// Higher arousal = stronger memory encoding.
101	#[inline]
102	#[must_use]
103	pub fn emotional_weight(&self) -> f64 {
104		// Base weight of 0.5, arousal adds up to 1.0
105		0.5 + self.arousal
106	}
107
108	/// Check if this represents a strong emotional moment.
109	#[inline]
110	#[must_use]
111	pub fn is_significant(&self) -> bool {
112		self.arousal > 0.7 || self.valence.abs() > 0.7
113	}
114}
115
116// ============================================================================
117// Visual Memory
118// ============================================================================
119
/// A visual memory with full metadata.
///
/// This represents a stored image or video frame with its associated
/// context, embeddings, and retrieval metadata. All `*_ms` timestamps
/// are milliseconds (presumably since the Unix epoch — TODO confirm
/// against the writer of these records).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VisualMemory {
	/// Unique identifier
	pub id: u32,

	/// Short description (the "gist" of what's in the image)
	pub description: String,

	/// Detailed description (specific elements, decays faster)
	pub detailed_description: Option<String>,

	/// Visual embedding vector (from vision model)
	pub embedding: Vec<f64>,

	/// When the image was captured/received (ms)
	pub captured_at_ms: f64,

	/// Most recent access timestamp (ms); pruning staleness is measured
	/// against this field, not `captured_at_ms`
	pub last_accessed_ms: f64,

	/// Access count (for familiarity computation)
	pub access_count: u32,

	/// Emotional context at capture time
	pub emotional_context: EmotionalContext,

	/// Significance score (0-1, how memorable/important)
	pub significance: f64,

	/// Where this image came from
	pub source: VisualSource,

	/// Who shared this (if applicable)
	pub shared_by: Option<String>,

	/// Video ID if this is a frame; `Some` makes `is_video_frame()` true
	pub video_id: Option<String>,

	/// Frame number within video; `Some(0)` marks the keyframe that
	/// pruning can be configured to preserve
	pub frame_number: Option<u32>,

	/// Detected objects/entities
	pub objects: Vec<String>,

	/// Semantic tags (auto-generated or manual)
	pub tags: Vec<String>,

	/// Whether this memory is pinned (protected from decay/pruning)
	pub is_pinned: bool,
}
174
175impl VisualMemory {
176	/// Check if this is a video frame.
177	#[inline]
178	#[must_use]
179	pub const fn is_video_frame(&self) -> bool {
180		self.video_id.is_some()
181	}
182
183	/// Compute emotional weight for retrieval.
184	#[inline]
185	#[must_use]
186	pub fn emotional_weight(&self) -> f64 {
187		self.emotional_context.emotional_weight()
188	}
189}
190
191// ============================================================================
192// Configuration
193// ============================================================================
194
/// Configuration for visual memory operations (decay, tagging, pruning).
///
/// See [`VisualConfig::default`] for the baseline values. All thresholds
/// and rates are unitless fractions unless the field says otherwise.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VisualConfig {
	/// Significance threshold for automatic tagging (0-1)
	pub tagging_significance_threshold: f64,

	/// Minimum emotional arousal to boost retention (0-1)
	pub emotional_retention_threshold: f64,

	/// How much emotional arousal reduces decay rate (0-1)
	pub emotional_decay_reduction: f64,

	/// Decay rate for visual memories (per day after threshold)
	pub base_decay_rate: f64,

	/// Days before decay begins; also scales the low-significance
	/// pruning score
	pub stale_threshold_days: u32,

	/// Minimum significance floor (never drops below this)
	pub significance_floor: f64,

	/// Pruning threshold - memories below this may be pruned
	pub pruning_threshold: f64,

	/// Maximum days since access before considering for pruning
	pub pruning_stale_days: u32,

	/// Whether to preserve video keyframes (frame 0) from pruning
	pub preserve_keyframes: bool,
}
225
226impl Default for VisualConfig {
227	fn default() -> Self {
228		Self {
229			tagging_significance_threshold: 0.6,
230			emotional_retention_threshold: 0.7,
231			emotional_decay_reduction: 0.5,
232			base_decay_rate: 0.05,
233			stale_threshold_days: 14,
234			significance_floor: 0.1,
235			pruning_threshold: 0.2,
236			pruning_stale_days: 90,
237			preserve_keyframes: true,
238		}
239	}
240}
241
242// ============================================================================
243// Retrieval
244// ============================================================================
245
/// Configuration for visual memory retrieval.
///
/// Mirrors the ACT-R style parameters used by [`retrieve_visual`]:
/// base-level decay, a retrieval threshold with logistic noise, and
/// spreading-activation controls, plus visual-specific boosts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VisualRetrievalConfig {
	/// Decay rate for base-level activation
	pub decay_rate: f64,
	/// Retrieval threshold (tau)
	pub activation_threshold: f64,
	/// Noise parameter (s)
	pub noise_parameter: f64,
	/// Spreading activation depth (0 disables spreading)
	pub spreading_depth: usize,
	/// Spreading decay per hop
	pub spreading_decay: f64,
	/// Minimum probability to include in results
	pub min_probability: f64,
	/// Maximum results to return
	pub max_results: usize,
	/// Whether to spread bidirectionally along associations
	pub bidirectional: bool,
	/// Boost factor for emotionally significant memories (weight > 0.7)
	pub emotional_boost: f64,
	/// Boost factor for high-significance memories
	pub significance_boost: f64,
}
270
271impl Default for VisualRetrievalConfig {
272	fn default() -> Self {
273		Self {
274			decay_rate: 0.5,
275			activation_threshold: 0.3,
276			noise_parameter: 0.1,
277			spreading_depth: 3,
278			spreading_decay: 0.7,
279			min_probability: 0.1,
280			max_results: 10,
281			bidirectional: true,
282			emotional_boost: 0.3,
283			significance_boost: 0.2,
284		}
285	}
286}
287
/// A candidate from visual memory retrieval.
///
/// Carries the full activation breakdown so callers can inspect why a
/// memory was retrieved, not just its final score.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VisualRetrievalCandidate {
	/// Visual memory index (position in the input slices)
	pub index: usize,
	/// Base-level activation from access history
	pub base_level: f64,
	/// Probe-trace activation (cubed similarity, MINERVA 2 style)
	pub probe_activation: f64,
	/// Spreading activation from associated memories
	pub spreading: f64,
	/// Emotional weight factor
	pub emotional_weight: f64,
	/// Combined significance + emotional boost applied on top of the
	/// base combination
	pub significance_boost: f64,
	/// Combined total activation (including boosts)
	pub total_activation: f64,
	/// Retrieval probability (0-1)
	pub probability: f64,
}
308
/// Input data for visual retrieval.
///
/// The per-memory slices are indexed in parallel with
/// `memory_embeddings`. `access_histories_ms` must have one entry per
/// memory (it is indexed directly and a shorter slice panics in
/// [`retrieve_visual`]); `emotional_weights` and `significance_scores`
/// are read with `.get()` and fall back to a neutral 0.5 when shorter.
pub struct VisualRetrievalInput<'a> {
	/// Probe embedding vector
	pub probe_embedding: &'a [f64],
	/// All visual memory embeddings
	pub memory_embeddings: &'a [Vec<f64>],
	/// Access timestamps (ms) for each memory
	pub access_histories_ms: &'a [Vec<f64>],
	/// Emotional weights for each memory
	pub emotional_weights: &'a [f64],
	/// Significance scores for each memory
	pub significance_scores: &'a [f64],
	/// Association graph edges
	pub associations: &'a [Association],
	/// Current time (ms)
	pub current_time_ms: f64,
}
326
327/// Retrieve visual memories based on probe embedding.
328///
329/// This uses the same ACT-R spreading activation model as text retrieval,
330/// but adds boosts for emotional significance and memory importance.
331#[must_use]
332pub fn retrieve_visual(
333	input: &VisualRetrievalInput<'_>,
334	config: &VisualRetrievalConfig,
335) -> Vec<VisualRetrievalCandidate> {
336	let n = input.memory_embeddings.len();
337	if n == 0 {
338		return Vec::new();
339	}
340
341	// 1. Compute probe-trace similarities (batch)
342	let similarities = cosine_similarity_batch(input.probe_embedding, input.memory_embeddings);
343
344	// 2. Apply nonlinear activation (MINERVA 2)
345	let probe_activations = nonlinear_activation_batch(&similarities);
346
347	// 3. Compute base-level activation (batch)
348	let base_levels: Vec<f64> = input
349		.access_histories_ms
350		.iter()
351		.map(|history| compute_base_level(history, input.current_time_ms, config.decay_rate))
352		.collect();
353
354	// 4. Initial activation (before spreading)
355	let initial_activations: Vec<f64> = (0..n)
356		.map(|i| {
357			let base = if base_levels[i].is_finite() {
358				base_levels[i]
359			} else {
360				-10.0
361			};
362			let emotional = input.emotional_weights.get(i).copied().unwrap_or(0.5);
363			let emotional_multiplier = 1.0 + (emotional - 0.5);
364			(base + probe_activations[i]) * emotional_multiplier
365		})
366		.collect();
367
368	// 5. Find seeds for spreading (top activated)
369	let mut seeds: Vec<(usize, f64)> = initial_activations
370		.iter()
371		.enumerate()
372		.filter(|(_, &a)| a > 0.0)
373		.map(|(i, &a)| (i, a))
374		.collect();
375	seeds.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
376	seeds.truncate(5);
377
378	// 6. Spread activation
379	let spreading_result = if !seeds.is_empty() && config.spreading_depth > 0 {
380		let seed_indices: Vec<usize> = seeds.iter().map(|(i, _)| *i).collect();
381		let seed_activations: Vec<f64> = seeds.iter().map(|(_, a)| *a).collect();
382
383		let spreading_config = SpreadingConfig {
384			decay_per_hop: config.spreading_decay,
385			minimum_activation: 0.01,
386			max_nodes: 1000,
387			bidirectional: config.bidirectional,
388		};
389
390		spread_activation(
391			n,
392			input.associations,
393			&seed_indices,
394			&seed_activations,
395			&spreading_config,
396			config.spreading_depth,
397		)
398	} else {
399		SpreadingResult {
400			activations: vec![0.0; n],
401			visited_by_depth: Vec::new(),
402		}
403	};
404
405	// 7. Combine all activations and build candidates
406	let mut candidates: Vec<VisualRetrievalCandidate> = (0..n)
407		.filter_map(|i| {
408			let base_level = if base_levels[i].is_finite() {
409				base_levels[i]
410			} else {
411				-10.0
412			};
413			let probe_activation = probe_activations[i];
414			let spreading = spreading_result.activations[i];
415			let emotional_weight = input.emotional_weights.get(i).copied().unwrap_or(0.5);
416			let significance = input.significance_scores.get(i).copied().unwrap_or(0.5);
417
418			// Add significance boost
419			let significance_boost = significance * config.significance_boost;
420			let emotional_boost = if emotional_weight > 0.7 {
421				(emotional_weight - 0.7) * config.emotional_boost
422			} else {
423				0.0
424			};
425
426			let breakdown =
427				combine_activations(base_level, probe_activation, spreading, emotional_weight);
428
429			let boosted_total = breakdown.total + significance_boost + emotional_boost;
430
431			let probability = retrieval_probability(
432				boosted_total,
433				config.activation_threshold,
434				config.noise_parameter,
435			);
436
437			// Filter by minimum probability
438			if probability < config.min_probability {
439				return None;
440			}
441
442			Some(VisualRetrievalCandidate {
443				index: i,
444				base_level: breakdown.base_level,
445				probe_activation: breakdown.probe_activation,
446				spreading: breakdown.spreading,
447				emotional_weight: breakdown.emotional_weight,
448				significance_boost: significance_boost + emotional_boost,
449				total_activation: boosted_total,
450				probability,
451			})
452		})
453		.collect();
454
455	// 8. Sort by total activation and limit
456	candidates.sort_by(|a, b| {
457		b.total_activation
458			.partial_cmp(&a.total_activation)
459			.unwrap_or(std::cmp::Ordering::Equal)
460	});
461	candidates.truncate(config.max_results);
462
463	candidates
464}
465
466// ============================================================================
467// Consolidation
468// ============================================================================
469
/// State of memory consolidation.
///
/// `Fresh` is the [`Default`]. Labile (modifiable) states are
/// `Consolidating` and `Reconsolidating`; see
/// [`VisualConsolidationState::is_labile`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum ConsolidationState {
	/// Fresh memory, not yet consolidated (the default)
	#[default]
	Fresh,
	/// Currently being consolidated (labile)
	Consolidating,
	/// Fully consolidated (stable)
	Consolidated,
	/// Undergoing reconsolidation after reactivation (labile)
	Reconsolidating,
}
483
/// A time window during which consolidation occurs.
///
/// Timestamps are in milliseconds, matching the rest of this module.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConsolidationWindow {
	/// When the window opened (ms)
	pub started_at_ms: f64,
	/// When the window closes and the memory becomes stable (ms)
	pub ends_at_ms: f64,
	/// Current state
	pub state: ConsolidationState,
}
494
495impl ConsolidationWindow {
496	/// Create a new consolidation window starting now.
497	#[must_use]
498	pub fn new(current_time_ms: f64, duration_ms: f64) -> Self {
499		Self {
500			started_at_ms: current_time_ms,
501			ends_at_ms: current_time_ms + duration_ms,
502			state: ConsolidationState::Consolidating,
503		}
504	}
505
506	/// Check if the window is still open.
507	#[inline]
508	#[must_use]
509	pub fn is_open(&self, current_time_ms: f64) -> bool {
510		current_time_ms < self.ends_at_ms
511	}
512
513	/// Progress through the window (0-1).
514	#[must_use]
515	pub fn progress(&self, current_time_ms: f64) -> f64 {
516		if current_time_ms >= self.ends_at_ms {
517			return 1.0;
518		}
519		let elapsed = current_time_ms - self.started_at_ms;
520		let duration = self.ends_at_ms - self.started_at_ms;
521		(elapsed / duration).clamp(0.0, 1.0)
522	}
523}
524
/// Full consolidation state for a visual memory.
///
/// Tracks the coarse [`ConsolidationState`], the active window while
/// consolidating, a 0-1 strength, and how often the memory has been
/// reactivated.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VisualConsolidationState {
	/// Current consolidation state
	pub state: ConsolidationState,

	/// Active consolidation window (if consolidating); cleared by
	/// `update` once the window closes
	pub window: Option<ConsolidationWindow>,

	/// Consolidation strength (0-1, how well consolidated)
	pub strength: f64,

	/// Number of times reactivated/reconsolidated
	pub reactivation_count: u32,
}
540
541impl Default for VisualConsolidationState {
542	fn default() -> Self {
543		Self {
544			state: ConsolidationState::Fresh,
545			window: None,
546			strength: 0.0,
547			reactivation_count: 0,
548		}
549	}
550}
551
552impl VisualConsolidationState {
553	/// Check if the memory is currently labile (modifiable).
554	#[inline]
555	#[must_use]
556	pub const fn is_labile(&self) -> bool {
557		matches!(
558			self.state,
559			ConsolidationState::Consolidating | ConsolidationState::Reconsolidating
560		)
561	}
562
563	/// Start consolidation.
564	pub fn start_consolidation(&mut self, current_time_ms: f64, duration_ms: f64) {
565		self.state = ConsolidationState::Consolidating;
566		self.window = Some(ConsolidationWindow::new(current_time_ms, duration_ms));
567	}
568
569	/// Update consolidation state based on current time.
570	pub fn update(&mut self, current_time_ms: f64) {
571		if let Some(ref window) = self.window {
572			if window.is_open(current_time_ms) {
573				self.strength = window.progress(current_time_ms);
574			} else {
575				self.state = ConsolidationState::Consolidated;
576				self.strength = 1.0;
577				self.window = None;
578			}
579		}
580	}
581}
582
583// ============================================================================
584// Tagging
585// ============================================================================
586
/// Why a tag was assigned to a visual memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TagReason {
	/// Detected automatically by vision model
	AutoDetected,
	/// Inferred from context (e.g., who shared it, when)
	ContextInferred,
	/// Added by user manually
	UserAdded,
	/// Inherited from associated memory
	Inherited,
}
599
/// A tag together with how and how confidently it was assigned.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VisualTag {
	/// The tag value
	pub tag: String,
	/// Why this tag was assigned
	pub reason: TagReason,
	/// Confidence in this tag (0-1)
	pub confidence: f64,
}
610
611/// Compute tag strength based on various factors.
612///
613/// Higher strength = more confident the tag applies.
614///
615/// # Arguments
616///
617/// * `base_confidence` - Initial confidence from detection (0-1)
618/// * `access_count` - How many times the memory was accessed
619/// * `significance` - Memory significance (0-1)
620/// * `config` - Visual config
621///
622/// # Returns
623///
624/// Tag strength (0-1).
625#[must_use]
626pub fn compute_tag_strength(
627	base_confidence: f64,
628	access_count: u32,
629	significance: f64,
630	config: &VisualConfig,
631) -> f64 {
632	// Access count boost (using same asymptotic curve as familiarity)
633	let access_boost = 1.0 - 1.0 / 0.1_f64.mul_add(f64::from(access_count), 1.0);
634
635	// Significance boost
636	let significance_boost = if significance > config.tagging_significance_threshold {
637		(significance - config.tagging_significance_threshold) * 0.5
638	} else {
639		0.0
640	};
641
642	// Combine with diminishing returns
643	let combined = base_confidence + (access_boost * 0.3) + significance_boost;
644	combined.min(1.0)
645}
646
/// Decide whether a tag's computed strength clears the acceptance threshold.
///
/// Returns false when `strength` is NaN (no comparison succeeds).
#[inline]
#[must_use]
pub fn should_tag(strength: f64, threshold: f64) -> bool {
	threshold <= strength
}
653
654// ============================================================================
655// Pruning
656// ============================================================================
657
/// A candidate for memory pruning.
///
/// Produced by [`compute_pruning_candidates`]; higher `score` means
/// more prunable.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PruningCandidate {
	/// Memory index in the input slice
	pub index: usize,
	/// Current significance (0-1)
	pub significance: f64,
	/// Days since last access
	pub days_since_access: f64,
	/// Why this is a pruning candidate
	pub reason: PruningReason,
	/// Pruning score (higher = more likely to prune)
	pub score: f64,
}
672
/// Why a memory is a pruning candidate.
///
/// Note: [`compute_pruning_candidates`] currently emits only
/// `LowSignificance` and `Stale`; the other variants are available for
/// other callers — TODO confirm intended producers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum PruningReason {
	/// Low significance and not accessed recently
	LowSignificance,
	/// Very stale (not accessed in a long time)
	Stale,
	/// Duplicate or near-duplicate of another memory
	Duplicate,
	/// Low-quality video frame (blurry, etc.)
	LowQuality,
}
685
686/// Compute pruning candidates from a set of visual memories.
687///
688/// Returns memories that may be candidates for pruning, sorted by score.
689#[must_use]
690pub fn compute_pruning_candidates(
691	memories: &[VisualMemory],
692	current_time_ms: f64,
693	config: &VisualConfig,
694) -> SmallVec<[PruningCandidate; 32]> {
695	let ms_per_day = 24.0 * 60.0 * 60.0 * 1000.0;
696
697	let mut candidates: SmallVec<[PruningCandidate; 32]> = memories
698		.iter()
699		.enumerate()
700		.filter_map(|(i, mem)| {
701			// Never prune pinned memories
702			if mem.is_pinned {
703				return None;
704			}
705
706			// Preserve keyframes if configured
707			if config.preserve_keyframes && mem.frame_number == Some(0) {
708				return None;
709			}
710
711			let days_since_access = (current_time_ms - mem.last_accessed_ms) / ms_per_day;
712
713			// Check for stale memories
714			if days_since_access > f64::from(config.pruning_stale_days) {
715				let score = (days_since_access / f64::from(config.pruning_stale_days))
716					* (1.0 - mem.significance);
717				return Some(PruningCandidate {
718					index: i,
719					significance: mem.significance,
720					days_since_access,
721					reason: PruningReason::Stale,
722					score,
723				});
724			}
725
726			// Check for low significance
727			if mem.significance < config.pruning_threshold {
728				let score = (config.pruning_threshold - mem.significance)
729					* (days_since_access / f64::from(config.stale_threshold_days)).min(1.0);
730				return Some(PruningCandidate {
731					index: i,
732					significance: mem.significance,
733					days_since_access,
734					reason: PruningReason::LowSignificance,
735					score,
736				});
737			}
738
739			None
740		})
741		.collect();
742
743	// Sort by score (highest first = most prunable)
744	candidates.sort_by(|a, b| {
745		b.score
746			.partial_cmp(&a.score)
747			.unwrap_or(std::cmp::Ordering::Equal)
748	});
749
750	candidates
751}
752
753/// Check if a specific memory should be pruned.
754#[must_use]
755pub fn should_prune(
756	significance: f64,
757	days_since_access: f64,
758	is_pinned: bool,
759	is_keyframe: bool,
760	config: &VisualConfig,
761) -> bool {
762	if is_pinned {
763		return false;
764	}
765
766	if config.preserve_keyframes && is_keyframe {
767		return false;
768	}
769
770	// Stale and low significance
771	if days_since_access > f64::from(config.pruning_stale_days)
772		&& significance < config.pruning_threshold
773	{
774		return true;
775	}
776
777	// Very stale regardless of significance (except high significance)
778	if days_since_access > f64::from(config.pruning_stale_days) * 2.0 && significance < 0.5 {
779		return true;
780	}
781
782	false
783}
784
785// ============================================================================
786// Video Frame Selection
787// ============================================================================
788
/// A candidate frame for description.
///
/// Scored by [`select_frames_for_description`]: quality is the
/// baseline, with bonuses for keyframes, scene changes, and overlap
/// with transcript speech.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrameCandidate {
	/// Frame index in the video
	pub index: usize,
	/// Timestamp in seconds
	pub timestamp_seconds: f64,
	/// Whether this is a keyframe (I-frame)
	pub is_keyframe: bool,
	/// Whether this is a scene change
	pub is_scene_change: bool,
	/// Quality score (0-1, based on blur/noise detection)
	pub quality_score: f64,
}
803
/// A transcript segment for context.
///
/// The inclusive `[start_seconds, end_seconds]` range is used to detect
/// frames that coincide with speech.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptSegment {
	/// Start timestamp in seconds
	pub start_seconds: f64,
	/// End timestamp in seconds
	pub end_seconds: f64,
	/// The transcribed text
	pub text: String,
}
814
815/// Select frames for description, respecting rate limits.
816///
817/// Prioritizes: keyframes, scene changes, even distribution, transcript moments.
818///
819/// # Arguments
820///
821/// * `frames` - All available frame candidates
822/// * `max_frames` - Maximum frames to select (respecting API rate limits)
823/// * `transcript_segments` - Optional transcript for prioritizing frames with speech
824///
825/// # Returns
826///
827/// Indices of selected frames, in chronological order.
828#[must_use]
829pub fn select_frames_for_description(
830	frames: &[FrameCandidate],
831	max_frames: usize,
832	transcript_segments: Option<&[TranscriptSegment]>,
833) -> SmallVec<[usize; 32]> {
834	if frames.is_empty() || max_frames == 0 {
835		return SmallVec::new();
836	}
837
838	// Score each frame
839	let mut scored: Vec<(usize, f64)> = frames
840		.iter()
841		.enumerate()
842		.map(|(i, frame)| {
843			let mut score = frame.quality_score;
844
845			// Keyframes get priority
846			if frame.is_keyframe {
847				score += 0.3;
848			}
849
850			// Scene changes are important
851			if frame.is_scene_change {
852				score += 0.5;
853			}
854
855			// Boost frames near transcript segments (speech = important)
856			if let Some(segments) = transcript_segments {
857				for seg in segments {
858					if frame.timestamp_seconds >= seg.start_seconds
859						&& frame.timestamp_seconds <= seg.end_seconds
860					{
861						score += 0.2;
862						break;
863					}
864				}
865			}
866
867			(i, score)
868		})
869		.collect();
870
871	// Sort by score (highest first)
872	scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
873
874	// Take top candidates, but ensure temporal distribution
875	let mut selected: SmallVec<[usize; 32]> = SmallVec::new();
876
877	// Always include first and last frame if we have room
878	if max_frames >= 2 && !frames.is_empty() {
879		selected.push(0);
880		if frames.len() > 1 {
881			selected.push(frames.len() - 1);
882		}
883	}
884
885	// Add remaining by score, avoiding clustering
886	let min_gap = if frames.len() > max_frames * 2 {
887		frames.len() / (max_frames * 2)
888	} else {
889		1
890	};
891
892	for (idx, _score) in scored {
893		if selected.len() >= max_frames {
894			break;
895		}
896
897		// Check minimum gap from already selected frames
898		let too_close = selected.iter().any(|&s| idx.abs_diff(s) < min_gap);
899
900		if !too_close {
901			selected.push(idx);
902		}
903	}
904
905	// Sort by frame index for chronological output
906	selected.sort_unstable();
907
908	selected
909}
910
/// Configuration for frame description prompts.
///
/// Consumed by [`prepare_frame_description_prompt`] to toggle optional
/// sections of the generated JSON schema.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrameDescriptionConfig {
	/// Whether to include emotional assessment (valence/arousal fields)
	pub include_emotion: bool,
	/// Whether to detect and list objects
	pub detect_objects: bool,
	/// Maximum description length guidance (characters)
	pub max_description_length: usize,
}
921
922impl Default for FrameDescriptionConfig {
923	fn default() -> Self {
924		Self {
925			include_emotion: true,
926			detect_objects: true,
927			max_description_length: 200,
928		}
929	}
930}
931
932/// Generate a prompt for Claude Haiku to describe a video frame.
933///
934/// The prompt is optimized for concise, structured output that includes:
935/// - Scene description
936/// - Detected objects
937/// - Emotional context (if applicable)
938/// - Temporal context
939///
940/// # Arguments
941///
942/// * `frame_path` - Path to the frame image file
943/// * `timestamp_seconds` - When in the video this frame appears
944/// * `video_duration_seconds` - Total video duration for context
945/// * `transcript_near_frame` - Optional transcript text near this frame
946/// * `is_scene_change` - Whether this frame starts a new scene
947/// * `shared_by` - Who shared the video (for context)
948/// * `config` - Prompt configuration
949///
950/// # Returns
951///
952/// A prompt string to send to Claude Haiku along with the image.
953#[must_use]
954pub fn prepare_frame_description_prompt(
955	timestamp_seconds: f64,
956	video_duration_seconds: f64,
957	transcript_near_frame: Option<&str>,
958	is_scene_change: bool,
959	shared_by: Option<&str>,
960	config: &FrameDescriptionConfig,
961) -> String {
962	let position = if video_duration_seconds > 0.0 {
963		format!(
964			"{:.0}s/{:.0}s ({:.0}% through)",
965			timestamp_seconds,
966			video_duration_seconds,
967			(timestamp_seconds / video_duration_seconds) * 100.0
968		)
969	} else {
970		format!("{timestamp_seconds:.0}s")
971	};
972
973	let scene_note = if is_scene_change {
974		" This is a scene change."
975	} else {
976		""
977	};
978
979	let transcript_context = transcript_near_frame.map_or_else(String::new, |t| {
980		format!("\n\nAudio at this moment: \"{t}\"")
981	});
982
983	let shared_context =
984		shared_by.map_or_else(String::new, |s| format!(" This was shared by {s}."));
985
986	let object_instruction = if config.detect_objects {
987		"\n- objects: [list of key objects/people visible]"
988	} else {
989		""
990	};
991
992	let emotion_instruction = if config.include_emotion {
993		"\n- valence: [-1 to 1, pleasant to unpleasant]\n- arousal: [0 to 1, calm to exciting]"
994	} else {
995		""
996	};
997
998	format!(
999		"Describe this video frame concisely. Position: {position}.{scene_note}{shared_context}{transcript_context}
1000
1001Respond with JSON:
1002{{
1003  \"description\": \"[{} chars max, what's happening in this frame]\"{object_instruction}{emotion_instruction},
1004  \"significance\": [0 to 1, how memorable/important is this moment]
1005}}",
1006		config.max_description_length
1007	)
1008}
1009
/// Result of frame description from Haiku.
///
/// Mirrors the JSON schema requested by
/// [`prepare_frame_description_prompt`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrameDescriptionResult {
	/// The frame description
	pub description: String,
	/// Detected objects (if requested; empty otherwise)
	pub objects: Vec<String>,
	/// Emotional valence (-1 to 1)
	pub valence: f64,
	/// Emotional arousal (0 to 1)
	pub arousal: f64,
	/// Significance score (0 to 1)
	pub significance: f64,
}
1024
1025/// Synthesize multiple frame descriptions into a holistic video description.
1026///
1027/// # Arguments
1028///
1029/// * `frame_descriptions` - Descriptions of individual frames
1030/// * `frame_timestamps` - Timestamp for each frame
1031/// * `transcript` - Optional full transcript
1032/// * `video_duration_seconds` - Total video duration
1033///
1034/// # Returns
1035///
1036/// A prompt for synthesizing into a final description.
1037#[must_use]
1038pub fn prepare_synthesis_prompt(
1039	frame_descriptions: &[FrameDescriptionResult],
1040	frame_timestamps: &[f64],
1041	transcript: Option<&str>,
1042	video_duration_seconds: f64,
1043) -> String {
1044	use std::fmt::Write;
1045
1046	let mut frame_summary = String::new();
1047	for (i, (desc, ts)) in frame_descriptions.iter().zip(frame_timestamps).enumerate() {
1048		let _ = write!(
1049			frame_summary,
1050			"\nFrame {} ({ts:.0}s): {}",
1051			i + 1,
1052			desc.description
1053		);
1054	}
1055
1056	let transcript_section =
1057		transcript.map_or_else(String::new, |t| format!("\n\nTranscript:\n\"{t}\""));
1058
1059	format!(
1060		"Synthesize these frame descriptions into a cohesive 2-3 sentence summary of what this {video_duration_seconds:.0}s video shows.
1061{frame_summary}{transcript_section}
1062
1063Write a natural description that captures the essence of the video, not just a list of frames."
1064	)
1065}
1066
1067// ============================================================================
1068// Tests
1069// ============================================================================
1070
#[cfg(test)]
#[allow(clippy::float_cmp)]
mod tests {
	use super::*;

	const MS_PER_DAY: f64 = 1000.0 * 60.0 * 60.0 * 24.0;

	#[test]
	fn test_emotional_context_weight() {
		let calm = EmotionalContext::new(0.0, 0.0);
		let aroused = EmotionalContext::new(0.0, 1.0);

		// Zero arousal sits at the 0.5 baseline; full arousal adds a full point.
		assert!((calm.emotional_weight() - 0.5).abs() < 0.001);
		assert!((aroused.emotional_weight() - 1.5).abs() < 0.001);
	}

	#[test]
	fn test_emotional_context_significance() {
		// Neutral valence at mid arousal is below the significance threshold.
		assert!(!EmotionalContext::new(0.0, 0.5).is_significant());
		// High arousal alone is enough to be significant.
		assert!(EmotionalContext::new(0.0, 0.8).is_significant());
		// Strongly negative valence is significant even at mid arousal.
		assert!(EmotionalContext::new(-0.9, 0.5).is_significant());
	}

	#[test]
	fn test_consolidation_window() {
		let opened_at = 1000.0;
		let window = ConsolidationWindow::new(opened_at, 1000.0);

		// Open halfway through, closed after the duration has elapsed.
		assert!(window.is_open(opened_at + 500.0));
		assert!(!window.is_open(opened_at + 1500.0));
		// Halfway through the window reports 50% progress.
		assert!((window.progress(opened_at + 500.0) - 0.5).abs() < 0.001);
	}

	#[test]
	fn test_tag_strength() {
		let config = VisualConfig::default();

		// Rarely accessed + low significance vs. heavily accessed + high
		// significance: the latter must score higher, capped at 1.0.
		let weak = compute_tag_strength(0.5, 1, 0.3, &config);
		let strong = compute_tag_strength(0.8, 20, 0.9, &config);

		assert!(weak < strong);
		assert!(strong <= 1.0);
	}

	#[test]
	fn test_should_prune() {
		let config = VisualConfig::default();

		// Pinned memories are always retained.
		assert!(!should_prune(0.1, 100.0, true, false, &config));

		// Keyframes are retained by default.
		assert!(!should_prune(0.1, 100.0, false, true, &config));

		// Stale, low-significance memories are pruning candidates.
		assert!(should_prune(0.1, 100.0, false, false, &config));

		// High significance protects a memory from pruning.
		assert!(!should_prune(0.8, 100.0, false, false, &config));
	}

	#[test]
	fn test_pruning_candidates() {
		let config = VisualConfig::default();
		let now = MS_PER_DAY * 100.0; // Day 100
		let ancient = 0.0; // Day 0 (100 days before `now`)

		let pool = vec![
			// Stale, rarely accessed, low significance: a pruning candidate.
			VisualMemory {
				id: 0,
				description: "Test 1".to_string(),
				detailed_description: None,
				embedding: vec![],
				captured_at_ms: ancient,
				last_accessed_ms: ancient,
				access_count: 1,
				emotional_context: EmotionalContext::default(),
				significance: 0.1,
				source: VisualSource::Direct,
				shared_by: None,
				video_id: None,
				frame_number: None,
				objects: vec![],
				tags: vec![],
				is_pinned: false,
			},
			// Fresh, frequently accessed, emotionally charged, highly
			// significant: must be retained.
			VisualMemory {
				id: 1,
				description: "Test 2".to_string(),
				detailed_description: None,
				embedding: vec![],
				captured_at_ms: now,
				last_accessed_ms: now,
				access_count: 10,
				emotional_context: EmotionalContext::new(0.5, 0.8),
				significance: 0.9,
				source: VisualSource::Direct,
				shared_by: None,
				video_id: None,
				frame_number: None,
				objects: vec![],
				tags: vec![],
				is_pinned: false,
			},
		];

		let prunable = compute_pruning_candidates(&pool, now, &config);

		// Only the stale, low-significance memory qualifies.
		assert_eq!(prunable.len(), 1);
		assert_eq!(prunable[0].index, 0);
	}

	#[test]
	fn test_retrieve_visual_empty() {
		// No stored memories: retrieval yields nothing.
		let result = retrieve_visual(
			&VisualRetrievalInput {
				probe_embedding: &[1.0, 0.0, 0.0],
				memory_embeddings: &[],
				access_histories_ms: &[],
				emotional_weights: &[],
				significance_scores: &[],
				associations: &[],
				current_time_ms: 1_000_000.0,
			},
			&VisualRetrievalConfig::default(),
		);
		assert!(result.is_empty());
	}

	#[test]
	fn test_retrieve_visual_similarity_ordering() {
		let now = 1_000_000.0;
		let probe = [1.0, 0.0, 0.0];
		let stored = [
			vec![1.0, 0.0, 0.0], // Identical to the probe
			vec![0.5, 0.5, 0.0], // Partially aligned
			vec![0.0, 1.0, 0.0], // Orthogonal
		];
		let histories = [vec![now], vec![now], vec![now]];

		let input = VisualRetrievalInput {
			probe_embedding: &probe,
			memory_embeddings: &stored,
			access_histories_ms: &histories,
			emotional_weights: &[0.5, 0.5, 0.5],
			significance_scores: &[0.5, 0.5, 0.5],
			associations: &[],
			current_time_ms: now,
		};

		// Disable spreading and thresholding so ordering reflects pure
		// similarity.
		let config = VisualRetrievalConfig {
			spreading_depth: 0,
			min_probability: 0.0,
			..Default::default()
		};

		let ranked = retrieve_visual(&input, &config);

		// The memory identical to the probe must rank first.
		assert!(!ranked.is_empty());
		assert_eq!(ranked[0].index, 0);
	}
}