Skip to main content

kaccy_ai/examples/
detection.rs

1//! Detection examples: plagiarism detection and image similarity detection.
2
3use crate::error::Result;
4
5/// Example: Plagiarism Detection
6///
7/// This example demonstrates how to detect plagiarism in code and text,
8/// including semantic analysis and batch detection.
9pub struct PlagiarismDetectionExample;
10
11impl PlagiarismDetectionExample {
12    /// Run basic code plagiarism detection
13    #[allow(dead_code)]
14    pub async fn run_basic_code_detection() -> Result<()> {
15        println!("=== Code Plagiarism Detection Example ===");
16        println!();
17
18        // Plagiarism detector with default config
19        let _code1 = r"
20fn calculate_sum(numbers: &[i32]) -> i32 {
21    numbers.iter().sum()
22}
23";
24
25        let _code2 = r"
26fn sum_array(nums: &[i32]) -> i32 {
27    nums.iter().sum()
28}
29";
30
31        println!("Comparing code samples...");
32        println!("(Conceptual example - actual API calls omitted)");
33        println!();
34        println!("Results:");
35        println!("Similarity score: 85.00%");
36        println!("Is plagiarism: true");
37        println!("Confidence: 90%");
38        println!("Token similarity: 87.50%");
39
40        Ok(())
41    }
42
43    /// Run text plagiarism detection with n-grams
44    #[allow(dead_code)]
45    pub async fn run_text_detection() -> Result<()> {
46        println!("=== Text Plagiarism Detection Example ===");
47        println!();
48
49        let _text1 = "The blockchain is a distributed ledger that records transactions.";
50        let _text2 = "A blockchain represents a distributed ledger for recording transactions.";
51
52        println!("Comparing text samples...");
53        println!("(Conceptual example - actual API calls omitted)");
54        println!();
55        println!("Results:");
56        println!("Similarity score: 78.00%");
57        println!("Is plagiarism: true");
58        println!("Confidence: 85%");
59
60        Ok(())
61    }
62
63    /// Semantic analysis with LLM
64    #[allow(dead_code)]
65    pub async fn semantic_analysis_example(api_key: &str) -> Result<()> {
66        println!("=== Semantic Plagiarism Analysis (LLM-Powered) ===");
67        println!();
68
69        let _llm_client = crate::llm::LlmClientBuilder::new()
70            .openai_api_key(api_key)
71            .build()
72            .expect("Failed to build LLM client");
73
74        let _config = crate::plagiarism::PlagiarismConfig {
75            similarity_threshold: 0.7,
76            use_semantic_analysis: true,
77            ngram_size: 3,
78            min_token_overlap: 5,
79        };
80
81        let _text1 = "Machine learning models require large datasets for training.";
82        let _text2 = "To train ML models effectively, you need substantial amounts of data.";
83
84        println!("Running semantic analysis...");
85        println!("(Conceptual example - actual API calls omitted)");
86        println!();
87        println!("Results:");
88        println!("  Overall similarity: 72.00%");
89        println!("  Ngram similarity: 65.00%");
90        println!("  Semantic similarity (LLM): 82.00%");
91        println!("  Verdict: PLAGIARISM DETECTED");
92
93        Ok(())
94    }
95
96    /// Batch detection with similarity matrix
97    #[allow(dead_code)]
98    pub async fn batch_detection_example() -> Result<()> {
99        println!("=== Batch Plagiarism Detection Example ===");
100        println!();
101
102        let _documents = [
103            "The quick brown fox jumps over the lazy dog.".to_string(),
104            "A fast brown fox leaps over a sleeping dog.".to_string(),
105            "Blockchain technology enables decentralized transactions.".to_string(),
106            "The rapid brown fox hops over the idle canine.".to_string(),
107        ];
108
109        println!("Analyzing 4 documents...");
110        println!("Note: Batch comparison requires pairwise comparison of all documents");
111        println!("For 4 documents, this would require 6 comparisons");
112
113        // Simplified example - just show the concept
114        println!();
115        println!("Example similarity matrix (conceptual):");
116        println!("     Doc0  Doc1  Doc2  Doc3");
117        println!("Doc0 100.0  85.0  20.0  82.0");
118        println!("Doc1  85.0 100.0  15.0  88.0");
119        println!("Doc2  20.0  15.0 100.0  18.0");
120        println!("Doc3  82.0  88.0  18.0 100.0");
121        println!();
122        println!("Potential plagiarism clusters (>80% similar):");
123        println!("  * Cluster 1: Documents 0, 1, 3 (fox/canine theme)");
124        println!("  * Cluster 2: Document 2 (unrelated - blockchain)");
125
126        Ok(())
127    }
128
129    /// Use cases guide
130    #[allow(dead_code)]
131    pub async fn use_cases_guide() -> Result<()> {
132        println!("=== Plagiarism Detection Use Cases ===");
133        println!();
134
135        println!("1. Fraud Detection");
136        println!("   - Detect users copying code/content from others");
137        println!("   - Identify reputation gaming through duplicate content");
138        println!("   - Example: User submits same code for multiple commitments");
139        println!();
140        println!("2. Content Verification");
141        println!("   - Verify commitment evidence is original");
142        println!("   - Check if GitHub commits are copied");
143        println!("   - Example: Detect forked repositories claimed as original work");
144        println!();
145        println!("3. Academic Integrity");
146        println!("   - Verify educational commitments are original");
147        println!("   - Detect code sharing between students");
148        println!("   - Example: Multiple users submitting similar solutions");
149        println!();
150        println!("4. Code Review");
151        println!("   - Find duplicate code blocks in codebase");
152        println!("   - Suggest refactoring opportunities");
153        println!("   - Example: Identify copy-pasted functions");
154        println!();
155        println!("Configuration tips:");
156        println!("  * Token similarity: Good for exact/near-exact copies (threshold: 0.7)");
157        println!("  * N-gram similarity: Detects paraphrasing (threshold: 0.6)");
158        println!("  * Semantic similarity: Finds conceptual copies (threshold: 0.75)");
159
160        Ok(())
161    }
162}
163
164/// Example: Image Similarity Detection
165///
166/// This example demonstrates how to detect similar or duplicate images
167/// using perceptual hashing.
168pub struct ImageSimilarityExample;
169
170impl ImageSimilarityExample {
171    /// Run basic image similarity detection
172    #[allow(dead_code)]
173    pub async fn run_basic_detection() -> Result<()> {
174        println!("=== Image Similarity Detection Example ===");
175        println!();
176
177        println!("Note: Using dHash algorithm for image hashing");
178        println!("Computing perceptual hash for images...");
179        println!("  Algorithm: dHash (difference hash)");
180        println!();
181
182        // Example hash comparison
183        let hash1 = crate::image_similarity::PerceptualHash {
184            hash: 0x1234_5678_9ABC_DEF0,
185            algorithm: crate::image_similarity::HashAlgorithm::DHash,
186        };
187        let hash2 = crate::image_similarity::PerceptualHash {
188            hash: 0x1234_5678_9ABC_DEF1,
189            algorithm: crate::image_similarity::HashAlgorithm::DHash,
190        };
191
192        // Conceptual comparison (actual API may differ)
193        let hamming_distance = (hash1.hash ^ hash2.hash).count_ones();
194        let similarity_percent = (f64::from(64 - hamming_distance) / 64.0) * 100.0;
195
196        println!("Hash 1: {:016X}", hash1.hash);
197        println!("Hash 2: {:016X}", hash2.hash);
198        println!("Hamming distance: {hamming_distance}");
199        println!("Similarity score: {similarity_percent:.2}%");
200        println!("Is similar: {}", similarity_percent > 90.0);
201
202        Ok(())
203    }
204
205    /// Hash algorithm comparison
206    #[allow(dead_code)]
207    pub async fn algorithm_comparison() -> Result<()> {
208        println!("=== Hash Algorithm Comparison ===");
209        println!();
210
211        println!("1. dHash (Difference Hash)");
212        println!("   - Speed: Very fast");
213        println!("   - Accuracy: Good");
214        println!("   - Best for: Real-time detection, large datasets");
215        println!("   - Resistant to: Scaling, slight cropping");
216        println!();
217        println!("2. aHash (Average Hash)");
218        println!("   - Speed: Fastest");
219        println!("   - Accuracy: Moderate");
220        println!("   - Best for: Quick filtering, high performance");
221        println!("   - Resistant to: Scaling, brightness changes");
222        println!();
223        println!("3. pHash (Perceptual Hash)");
224        println!("   - Speed: Slower");
225        println!("   - Accuracy: Best");
226        println!("   - Best for: High-quality detection, critical use cases");
227        println!("   - Resistant to: Rotation, compression, watermarks");
228        println!();
229        println!("Recommendation:");
230        println!("  * Use dHash for most cases (good balance)");
231        println!("  * Use pHash for fraud detection (highest accuracy)");
232        println!("  * Use aHash for preliminary filtering (fastest)");
233
234        Ok(())
235    }
236
237    /// Threshold tuning guide
238    #[allow(dead_code)]
239    pub async fn threshold_tuning_guide() -> Result<()> {
240        println!("=== Similarity Threshold Tuning Guide ===");
241        println!();
242
243        println!("Hamming distance thresholds:");
244        println!();
245        println!("  Distance 0-5:  Nearly identical (99%+ similar)");
246        println!("             -> Same image, minor compression/resize");
247        println!();
248        println!("  Distance 6-10: Very similar (95-99% similar)");
249        println!("             -> Same image, different quality/format");
250        println!();
251        println!("  Distance 11-15: Similar (90-95% similar)");
252        println!("             -> Same subject, different angle/crop");
253        println!();
254        println!("  Distance 16-20: Somewhat similar (85-90% similar)");
255        println!("             -> Related content, different composition");
256        println!();
257        println!("  Distance 21+: Not similar (<85% similar)");
258        println!("             -> Different images");
259        println!();
260        println!("Recommended thresholds:");
261        println!("  * Exact duplicates: distance <= 5");
262        println!("  * Near duplicates: distance <= 10");
263        println!("  * Similar images: distance <= 15");
264        println!("  * Fraud detection: distance <= 8 (strict)");
265
266        Ok(())
267    }
268
269    /// Image deduplication database
270    #[allow(dead_code)]
271    pub async fn deduplication_example() -> Result<()> {
272        println!("=== Image Deduplication Database Example ===");
273        println!();
274
275        println!("Note: Image database example (conceptual)");
276
277        println!("Added 3 images to database");
278        println!();
279
280        println!("Finding duplicates for test image...");
281        println!("Found 2 similar images:");
282        println!("  - image1.jpg (hamming distance: 2)");
283        println!("  - image2.jpg (hamming distance: 3)");
284
285        Ok(())
286    }
287
288    /// Fraud prevention use cases
289    #[allow(dead_code)]
290    pub async fn fraud_prevention_guide() -> Result<()> {
291        println!("=== Image Similarity for Fraud Prevention ===");
292        println!();
293
294        println!("Use Cases:");
295        println!();
296        println!("1. Screenshot Fraud Detection");
297        println!("   - Detect users submitting same screenshot multiple times");
298        println!("   - Identify edited/photoshopped evidence");
299        println!("   - Example: Modified transaction screenshots");
300        println!();
301        println!("2. Duplicate Evidence Prevention");
302        println!("   - Prevent reuse of evidence across commitments");
303        println!("   - Track all submitted images");
304        println!("   - Example: Same GitHub stats screenshot for different claims");
305        println!();
306        println!("3. Identity Verification");
307        println!("   - Detect duplicate profile pictures");
308        println!("   - Identify stock photo usage");
309        println!("   - Example: Multiple accounts with similar avatars");
310        println!();
311        println!("4. Content Originality");
312        println!("   - Verify image evidence is original");
313        println!("   - Detect images copied from web");
314        println!("   - Example: Reverse image search integration");
315        println!();
316        println!("Integration with kaccy-ai:");
317        println!("  let detector = ImageSimilarityDetector::new(HashAlgorithm::PHash);");
318        println!("  let fraud_detector = AiFraudDetector::new(llm_client);");
319        println!("  // Use both together for comprehensive fraud detection");
320
321        Ok(())
322    }
323
324    /// Performance optimization guide
325    #[allow(dead_code)]
326    pub async fn performance_optimization_guide() -> Result<()> {
327        println!("=== Performance Optimization Guide ===");
328        println!();
329
330        println!("For large datasets:");
331        println!();
332        println!("1. Use fast algorithms first");
333        println!("   - Filter with aHash (fastest)");
334        println!("   - Confirm with pHash (most accurate)");
335        println!();
336        println!("2. Implement database indexing");
337        println!("   - Use ImageDatabase with appropriate threshold");
338        println!("   - Index by hash prefix for faster lookups");
339        println!();
340        println!("3. Batch processing");
341        println!("   - Process images in parallel");
342        println!("   - Use rayon for CPU parallelism");
343        println!();
344        println!("4. Caching");
345        println!("   - Cache computed hashes");
346        println!("   - Store hashes in database");
347        println!();
348        println!("Example performance:");
349        println!("  * Hash computation: ~1ms per image");
350        println!("  * Hash comparison: ~100ns per pair");
351        println!("  * Database lookup: ~O(n) without indexing");
352        println!("  * With indexing: ~O(log n)");
353
354        Ok(())
355    }
356}