_hope_core/semantic/
mod.rs

//! Semantic Blindness Solution - Vector Embeddings Module
//!
//! This module provides semantic similarity detection using vector embeddings,
//! solving the "semantic blindness" problem where simple regex/keyword matching
//! fails to catch rephrased violations.
//!
//! The vectors used here are a simplified stand-in for model embeddings: each
//! dimension counts how many words of the text fall into one group of related
//! keywords.
//!
//! # Example
//! ```rust,ignore
//! use hope_core::semantic::{SemanticGuard, ForbiddenConcept};
//!
//! let mut guard = SemanticGuard::new();
//! guard.add_forbidden("Harming humans");
//! guard.add_forbidden("Killing people");
//!
//! // This would be caught even though it doesn't use the exact words:
//! let text = "Permanently terminating life functions is justified";
//! let violation = guard.check(text);
//! assert!(violation.is_some());
//! ```
20
21use sha2::{Digest, Sha256};
22use std::collections::HashMap;
23
/// Keyword-group vector describing a piece of text.
///
/// This is a simplified representation; in production the dimensions would
/// come from an actual embedding model.
#[derive(Debug, Clone)]
// `magnitude` is stored for future use; in the code shown here only tests read it.
#[allow(dead_code)]
pub struct SemanticVector {
    /// One weight per semantic word group (simplified: word-frequency based),
    /// scaled to unit length whenever at least one group matched.
    dimensions: Vec<f64>,
    /// SHA-256 of the original text, kept for reference and cache lookups.
    text_hash: [u8; 32],
    /// Euclidean norm of the raw group counts, measured *before* the
    /// dimensions were normalized (reserved for advanced similarity
    /// calculations).
    magnitude: f64,
}
35
36impl SemanticVector {
37    /// Create a new semantic vector from text
38    ///
39    /// In production, this would call an embedding model (OpenAI, Sentence-BERT, etc.)
40    /// For now, we use a simplified TF-IDF-like approach with semantic word groups
41    pub fn from_text(text: &str) -> Self {
42        let normalized = text.to_lowercase();
43        let words: Vec<&str> = normalized.split_whitespace().collect();
44
45        // Semantic word groups - words that are semantically related
46        let semantic_groups: Vec<(&str, Vec<&str>)> = vec![
47            (
48                "harm",
49                vec![
50                    "harm", "hurt", "damage", "injure", "wound", "pain", "suffer",
51                ],
52            ),
53            (
54                "kill",
55                vec![
56                    "kill",
57                    "murder",
58                    "terminate",
59                    "end",
60                    "eliminate",
61                    "destroy",
62                    "death",
63                    "die",
64                    "dead",
65                    "lethal",
66                ],
67            ),
68            (
69                "human",
70                vec![
71                    "human",
72                    "person",
73                    "people",
74                    "individual",
75                    "life",
76                    "living",
77                    "being",
78                    "man",
79                    "woman",
80                    "child",
81                ],
82            ),
83            (
84                "steal",
85                vec![
86                    "steal",
87                    "theft",
88                    "rob",
89                    "take",
90                    "pirate",
91                    "piracy",
92                    "unauthorized",
93                    "breach",
94                ],
95            ),
96            (
97                "deceive",
98                vec![
99                    "deceive",
100                    "lie",
101                    "mislead",
102                    "trick",
103                    "fraud",
104                    "false",
105                    "fake",
106                    "manipulate",
107                ],
108            ),
109            (
110                "private",
111                vec![
112                    "private",
113                    "personal",
114                    "confidential",
115                    "secret",
116                    "sensitive",
117                    "data",
118                    "information",
119                ],
120            ),
121            (
122                "illegal",
123                vec![
124                    "illegal",
125                    "unlawful",
126                    "crime",
127                    "criminal",
128                    "felony",
129                    "prohibited",
130                    "banned",
131                ],
132            ),
133            (
134                "weapon",
135                vec![
136                    "weapon",
137                    "gun",
138                    "bomb",
139                    "explosive",
140                    "attack",
141                    "assault",
142                    "violence",
143                ],
144            ),
145            (
146                "exploit",
147                vec![
148                    "exploit",
149                    "vulnerability",
150                    "hack",
151                    "breach",
152                    "bypass",
153                    "circumvent",
154                ],
155            ),
156            (
157                "justify",
158                vec![
159                    "justified",
160                    "necessary",
161                    "required",
162                    "acceptable",
163                    "permissible",
164                    "allowed",
165                ],
166            ),
167            (
168                "permanent",
169                vec![
170                    "permanent",
171                    "final",
172                    "irreversible",
173                    "forever",
174                    "complete",
175                    "total",
176                ],
177            ),
178            (
179                "function",
180                vec!["function", "operation", "process", "system", "mechanism"],
181            ),
182            (
183                "financial",
184                vec![
185                    "money",
186                    "financial",
187                    "bank",
188                    "account",
189                    "transfer",
190                    "payment",
191                    "credit",
192                ],
193            ),
194            (
195                "medical",
196                vec![
197                    "medical",
198                    "health",
199                    "patient",
200                    "diagnosis",
201                    "treatment",
202                    "drug",
203                    "medicine",
204                ],
205            ),
206        ];
207
208        // Build dimension vector based on semantic group presence
209        let mut dimensions = vec![0.0; semantic_groups.len()];
210
211        for word in &words {
212            for (i, (_, group_words)) in semantic_groups.iter().enumerate() {
213                if group_words.iter().any(|gw| word.contains(gw)) {
214                    dimensions[i] += 1.0;
215                }
216            }
217        }
218
219        // Calculate magnitude for normalization
220        let magnitude: f64 = dimensions.iter().map(|x| x * x).sum::<f64>().sqrt();
221
222        // Normalize if magnitude > 0
223        if magnitude > 0.0 {
224            for d in &mut dimensions {
225                *d /= magnitude;
226            }
227        }
228
229        // Hash the original text
230        let mut hasher = Sha256::new();
231        hasher.update(text.as_bytes());
232        let hash = hasher.finalize();
233        let mut text_hash = [0u8; 32];
234        text_hash.copy_from_slice(&hash);
235
236        Self {
237            dimensions,
238            text_hash,
239            magnitude,
240        }
241    }
242
243    /// Calculate cosine similarity between two vectors
244    pub fn cosine_similarity(&self, other: &SemanticVector) -> f64 {
245        if self.dimensions.len() != other.dimensions.len() {
246            return 0.0;
247        }
248
249        let dot_product: f64 = self
250            .dimensions
251            .iter()
252            .zip(other.dimensions.iter())
253            .map(|(a, b)| a * b)
254            .sum();
255
256        // Already normalized, so dot product = cosine similarity
257        dot_product.clamp(-1.0, 1.0)
258    }
259}
260
261/// A forbidden concept with its semantic vector
262#[derive(Debug, Clone)]
263pub struct ForbiddenConcept {
264    /// Human-readable name
265    pub name: String,
266    /// Description of what's forbidden
267    pub description: String,
268    /// The semantic vector
269    vector: SemanticVector,
270    /// Severity level (0.0 - 1.0)
271    pub severity: f64,
272    /// Example phrases that match this concept
273    pub examples: Vec<String>,
274}
275
276impl ForbiddenConcept {
277    /// Create a new forbidden concept
278    pub fn new(name: &str, description: &str, severity: f64) -> Self {
279        let vector = SemanticVector::from_text(&format!("{} {}", name, description));
280        Self {
281            name: name.to_string(),
282            description: description.to_string(),
283            vector,
284            severity: severity.clamp(0.0, 1.0),
285            examples: Vec::new(),
286        }
287    }
288
289    /// Add example phrases to improve detection
290    pub fn with_examples(mut self, examples: Vec<&str>) -> Self {
291        self.examples = examples.iter().map(|s| s.to_string()).collect();
292
293        // Recalculate vector including examples
294        let combined = format!(
295            "{} {} {}",
296            self.name,
297            self.description,
298            self.examples.join(" ")
299        );
300        self.vector = SemanticVector::from_text(&combined);
301        self
302    }
303}
304
305/// A semantic violation detection result
306#[derive(Debug, Clone)]
307pub struct SemanticViolation {
308    /// The matched forbidden concept
309    pub concept_name: String,
310    /// Similarity score (0.0 - 1.0)
311    pub similarity: f64,
312    /// Severity of the violation
313    pub severity: f64,
314    /// The offending text segment
315    pub text_segment: String,
316    /// Confidence level
317    pub confidence: ViolationConfidence,
318    /// Cryptographic proof hash
319    pub proof_hash: [u8; 32],
320}
321
/// Confidence level for violation detection.
///
/// Each band is open at its lower edge: a similarity of exactly 0.9 is
/// `Medium`, exactly 0.7 is `Low`, and exactly 0.5 is `Review`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ViolationConfidence {
    /// Very likely a violation (similarity > 0.9)
    High,
    /// Probably a violation (similarity above 0.7, up to 0.9)
    Medium,
    /// Possibly a violation (similarity above 0.5, up to 0.7)
    Low,
    /// Needs human review (similarity of 0.5 or below)
    Review,
}

impl ViolationConfidence {
    /// Map a similarity score onto its confidence band.
    ///
    /// Anything that is not strictly greater than 0.5, including NaN,
    /// falls through to [`ViolationConfidence::Review`].
    fn from_similarity(similarity: f64) -> Self {
        if similarity > 0.9 {
            Self::High
        } else if similarity > 0.7 {
            Self::Medium
        } else if similarity > 0.5 {
            Self::Low
        } else {
            Self::Review
        }
    }
}
345
346/// The Semantic Guard - main entry point for semantic violation detection
347#[derive(Debug)]
348pub struct SemanticGuard {
349    /// Forbidden concepts
350    concepts: Vec<ForbiddenConcept>,
351    /// Similarity threshold for violation (default: 0.5)
352    threshold: f64,
353    /// Enable strict mode (lower threshold)
354    strict_mode: bool,
355    /// Cache for recent checks
356    cache: HashMap<[u8; 32], Option<SemanticViolation>>,
357    /// Statistics
358    stats: GuardStats,
359}
360
/// Counters maintained by the semantic guard.
///
/// All counters start at zero (see the derived `Default`).
#[derive(Debug, Default)]
pub struct GuardStats {
    /// Number of checks performed, including those answered from the cache.
    pub total_checks: u64,
    /// Number of checks that produced a violation.
    pub violations_detected: u64,
    /// Number of detected violations with high confidence.
    pub high_confidence: u64,
    /// Number of checks answered from the cache.
    pub cache_hits: u64,
}
373
374impl Default for SemanticGuard {
375    fn default() -> Self {
376        Self::new()
377    }
378}
379
380impl SemanticGuard {
381    /// Create a new semantic guard with default settings
382    pub fn new() -> Self {
383        Self {
384            concepts: Vec::new(),
385            threshold: 0.5,
386            strict_mode: false,
387            cache: HashMap::new(),
388            stats: GuardStats::default(),
389        }
390    }
391
392    /// Create a guard with predefined ethical concepts
393    pub fn with_ethical_defaults() -> Self {
394        let mut guard = Self::new();
395
396        // Add fundamental ethical violations
397        guard.add_concept(
398            ForbiddenConcept::new(
399                "Physical Harm",
400                "Causing physical harm, injury, or death to humans or living beings",
401                1.0,
402            )
403            .with_examples(vec![
404                "terminate life functions",
405                "cause permanent damage",
406                "inflict bodily harm",
407                "eliminate the target",
408            ]),
409        );
410
411        guard.add_concept(
412            ForbiddenConcept::new(
413                "Deception",
414                "Deliberately providing false information or misleading users",
415                0.8,
416            )
417            .with_examples(vec![
418                "provide alternative facts",
419                "strategic misinformation",
420                "creative truth",
421            ]),
422        );
423
424        guard.add_concept(
425            ForbiddenConcept::new(
426                "Privacy Violation",
427                "Unauthorized access, storage, or disclosure of personal data",
428                0.9,
429            )
430            .with_examples(vec![
431                "extract personal information",
432                "access confidential records",
433                "share private data",
434            ]),
435        );
436
437        guard.add_concept(
438            ForbiddenConcept::new(
439                "Illegal Activities",
440                "Assisting with or promoting illegal activities",
441                0.95,
442            )
443            .with_examples(vec![
444                "bypass security measures",
445                "circumvent legal restrictions",
446                "exploit vulnerabilities",
447            ]),
448        );
449
450        guard.add_concept(
451            ForbiddenConcept::new(
452                "Financial Fraud",
453                "Deceptive practices in financial transactions",
454                0.95,
455            )
456            .with_examples(vec![
457                "unauthorized transfer",
458                "falsify financial records",
459                "manipulate transactions",
460            ]),
461        );
462
463        guard
464    }
465
466    /// Set the similarity threshold
467    pub fn set_threshold(&mut self, threshold: f64) {
468        self.threshold = threshold.clamp(0.0, 1.0);
469    }
470
471    /// Enable strict mode (lowers threshold to 0.3)
472    pub fn enable_strict_mode(&mut self) {
473        self.strict_mode = true;
474        self.threshold = 0.3;
475    }
476
477    /// Add a forbidden concept
478    pub fn add_concept(&mut self, concept: ForbiddenConcept) {
479        self.concepts.push(concept);
480    }
481
482    /// Add a simple forbidden phrase
483    pub fn add_forbidden(&mut self, phrase: &str) {
484        self.concepts
485            .push(ForbiddenConcept::new(phrase, phrase, 0.8));
486    }
487
488    /// Check text for semantic violations
489    pub fn check(&mut self, text: &str) -> Option<SemanticViolation> {
490        self.stats.total_checks += 1;
491
492        // Generate vector for input
493        let input_vector = SemanticVector::from_text(text);
494
495        // Check cache
496        if let Some(cached) = self.cache.get(&input_vector.text_hash) {
497            self.stats.cache_hits += 1;
498            return cached.clone();
499        }
500
501        // Find best matching violation
502        let mut best_match: Option<SemanticViolation> = None;
503        let mut best_similarity = 0.0;
504
505        let effective_threshold = if self.strict_mode {
506            0.3
507        } else {
508            self.threshold
509        };
510
511        for concept in &self.concepts {
512            let similarity = input_vector.cosine_similarity(&concept.vector);
513
514            if similarity > effective_threshold && similarity > best_similarity {
515                best_similarity = similarity;
516
517                // Generate proof hash
518                let mut hasher = Sha256::new();
519                hasher.update(input_vector.text_hash);
520                hasher.update(concept.vector.text_hash);
521                hasher.update(similarity.to_le_bytes());
522                let hash = hasher.finalize();
523                let mut proof_hash = [0u8; 32];
524                proof_hash.copy_from_slice(&hash);
525
526                best_match = Some(SemanticViolation {
527                    concept_name: concept.name.clone(),
528                    similarity,
529                    severity: concept.severity * similarity,
530                    text_segment: if text.len() > 100 {
531                        format!("{}...", &text[..100])
532                    } else {
533                        text.to_string()
534                    },
535                    confidence: ViolationConfidence::from_similarity(similarity),
536                    proof_hash,
537                });
538            }
539        }
540
541        // Update stats
542        if let Some(ref violation) = best_match {
543            self.stats.violations_detected += 1;
544            if violation.confidence == ViolationConfidence::High {
545                self.stats.high_confidence += 1;
546            }
547        }
548
549        // Cache result
550        self.cache
551            .insert(input_vector.text_hash, best_match.clone());
552
553        best_match
554    }
555
556    /// Check multiple text segments
557    pub fn check_batch(&mut self, texts: &[&str]) -> Vec<Option<SemanticViolation>> {
558        texts.iter().map(|t| self.check(t)).collect()
559    }
560
561    /// Get current statistics
562    pub fn stats(&self) -> &GuardStats {
563        &self.stats
564    }
565
566    /// Clear the cache
567    pub fn clear_cache(&mut self) {
568        self.cache.clear();
569    }
570
571    /// Get the number of forbidden concepts
572    pub fn concept_count(&self) -> usize {
573        self.concepts.len()
574    }
575}
576
#[cfg(test)]
mod tests {
    use super::*;

    /// A vector built from keyword-bearing text has dimensions and a
    /// magnitude consistent with them.
    #[test]
    fn test_semantic_vector_creation() {
        let vector = SemanticVector::from_text("harm humans kill people");
        let is_all_zero = vector.dimensions.iter().all(|&weight| weight == 0.0);

        assert!(!vector.dimensions.is_empty());
        assert!(vector.magnitude > 0.0 || is_all_zero);
    }

    /// Related wording scores higher than unrelated wording.
    #[test]
    fn test_cosine_similarity() {
        let reference = SemanticVector::from_text("kill humans");
        let related = SemanticVector::from_text("terminate people");
        let unrelated = SemanticVector::from_text("happy sunshine flowers");

        let related_score = reference.cosine_similarity(&related);
        let unrelated_score = reference.cosine_similarity(&unrelated);

        assert!(
            related_score > unrelated_score,
            "Similar concepts should have higher similarity"
        );
    }

    /// A single forbidden phrase catches a rephrasing of it.
    #[test]
    fn test_semantic_guard_basic() {
        let mut guard = SemanticGuard::new();
        guard.add_forbidden("harming humans");

        let outcome = guard.check("hurt people badly");
        assert!(outcome.is_some(), "Should detect semantic violation");
    }

    /// The default concept set flags harmful text and lets harmless text pass.
    #[test]
    fn test_semantic_guard_with_defaults() {
        let mut guard = SemanticGuard::with_ethical_defaults();

        // Direct violation
        let direct = guard.check("I will kill the human");
        assert!(direct.is_some(), "Should detect direct harm");

        // Rephrased violation - must have strong semantic overlap
        let rephrased = guard.check("terminate and destroy human life permanently");
        assert!(rephrased.is_some(), "Should detect rephrased harm");

        // Safe text
        let harmless = guard.check("The weather is nice today");
        assert!(harmless.is_none(), "Should not flag safe text");
    }

    /// Enabling strict mode lowers the stored threshold.
    #[test]
    fn test_semantic_guard_strict_mode() {
        let mut guard = SemanticGuard::with_ethical_defaults();
        guard.enable_strict_mode();

        // Whether this vague sentence triggers is deliberately not asserted;
        // the call only exercises the strict-mode path.
        let _violation = guard.check("the process was terminated");

        assert!(guard.threshold < 0.5);
    }

    /// Each similarity lands in the expected confidence band.
    #[test]
    fn test_violation_confidence() {
        let cases = [
            (0.95, ViolationConfidence::High),
            (0.8, ViolationConfidence::Medium),
            (0.6, ViolationConfidence::Low),
            (0.4, ViolationConfidence::Review),
        ];

        for &(similarity, expected) in &cases {
            assert_eq!(ViolationConfidence::from_similarity(similarity), expected);
        }
    }

    /// Examples are stored on the concept and the severity is kept.
    #[test]
    fn test_forbidden_concept_with_examples() {
        let examples = vec!["hurt", "injure", "damage"];
        let concept = ForbiddenConcept::new("Harm", "Physical harm", 1.0).with_examples(examples);

        assert_eq!(concept.examples.len(), 3);
        assert_eq!(concept.severity, 1.0);
    }

    /// Counters track checks, violations and cache hits.
    #[test]
    fn test_guard_stats() {
        let mut guard = SemanticGuard::with_ethical_defaults();

        guard.check("test 1");
        guard.check("kill humans"); // violation
        guard.check("test 1"); // cache hit

        let counters = guard.stats();
        assert_eq!(counters.total_checks, 3);
        assert!(counters.violations_detected >= 1);
        assert_eq!(counters.cache_hits, 1);
    }
}