use crate::error::StorageError;
use crate::storage::DbPool;
use std::collections::HashSet;
/// Detects duplicate or near-duplicate replies by consulting the reply
/// history stored behind the shared database pool.
pub struct DedupChecker {
    /// Connection pool used for all reply-history lookups.
    pool: DbPool,
}
impl DedupChecker {
    /// Creates a checker backed by the given database pool.
    pub fn new(pool: DbPool) -> Self {
        Self { pool }
    }

    /// Returns `true` if a reply to `tweet_id` has already been recorded.
    ///
    /// # Errors
    /// Propagates any [`StorageError`] from the underlying lookup.
    pub async fn has_replied_to(&self, tweet_id: &str) -> Result<bool, StorageError> {
        crate::storage::replies::has_replied_to(&self.pool, tweet_id).await
    }

    /// Returns `true` if `new_reply` is an exact duplicate of — or
    /// near-identical (Jaccard token similarity >= 0.8) to — any of the
    /// `limit` most recent stored replies.
    ///
    /// Replies with fewer than 5 tokens are only compared for exact
    /// equality, since Jaccard similarity is unreliable on tiny token sets.
    /// An empty `new_reply` is never considered similar.
    ///
    /// # Errors
    /// Propagates any [`StorageError`] from fetching the recent replies.
    pub async fn is_phrasing_similar(
        &self,
        new_reply: &str,
        limit: i64,
    ) -> Result<bool, StorageError> {
        if new_reply.is_empty() {
            return Ok(false);
        }
        let recent = crate::storage::replies::get_recent_reply_contents(&self.pool, limit).await?;
        let new_tokens = tokenize(new_reply);
        // Hoisted loop-invariant: short replies skip the fuzzy comparison
        // entirely, so decide that once instead of on every iteration.
        let use_similarity = new_tokens.len() >= 5;
        for recent_reply in &recent {
            if new_reply == recent_reply {
                return Ok(true);
            }
            if !use_similarity {
                continue;
            }
            let recent_tokens = tokenize(recent_reply);
            if jaccard_similarity(&new_tokens, &recent_tokens) >= 0.8 {
                return Ok(true);
            }
        }
        Ok(false)
    }

    /// Returns the contents of the `limit` most recent replies, newest first
    /// per the underlying storage query.
    ///
    /// # Errors
    /// Propagates any [`StorageError`] from the underlying lookup.
    pub async fn get_recent_reply_phrases(&self, limit: i64) -> Result<Vec<String>, StorageError> {
        crate::storage::replies::get_recent_reply_contents(&self.pool, limit).await
    }
}
/// Splits `text` on whitespace into a set of lowercase tokens, trimming any
/// leading/trailing non-alphanumeric characters from each word. Words that
/// are pure punctuation are dropped; duplicates collapse via the set.
fn tokenize(text: &str) -> HashSet<String> {
    let lowered = text.to_lowercase();
    let mut tokens = HashSet::new();
    for word in lowered.split_whitespace() {
        // Strip punctuation from the edges only; interior characters
        // (apostrophes, hyphens) are preserved.
        let trimmed = word.trim_matches(|c: char| !c.is_alphanumeric());
        if !trimmed.is_empty() {
            tokens.insert(trimmed.to_string());
        }
    }
    tokens
}
/// Computes the Jaccard similarity |a ∩ b| / |a ∪ b| of two token sets.
/// Two empty sets are defined as identical (similarity 1.0), which also
/// avoids a 0/0 division.
fn jaccard_similarity(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
    let union = a.union(b).count();
    // The union is empty exactly when both inputs are empty.
    if union == 0 {
        return 1.0;
    }
    let intersection = a.intersection(b).count();
    intersection as f64 / union as f64
}
#[cfg(test)]
mod tests {
    //! Unit tests for tokenization, Jaccard similarity, and the
    //! database-backed dedup checks. DB-backed tests run against a fresh
    //! in-memory/test database from `init_test_db`.
    use super::*;
    use crate::storage::init_test_db;
    use crate::storage::replies::{insert_reply, ReplySent};

    /// Builds a minimal "sent" reply row for insertion in tests.
    fn sample_reply(target_id: &str, content: &str) -> ReplySent {
        ReplySent {
            id: 0,
            target_tweet_id: target_id.to_string(),
            reply_tweet_id: Some("r_123".to_string()),
            reply_content: content.to_string(),
            llm_provider: None,
            llm_model: None,
            created_at: chrono::Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string(),
            status: "sent".to_string(),
            error_message: None,
        }
    }

    // --- tokenize ---

    #[test]
    fn tokenize_basic() {
        let tokens = tokenize("Hello, World! This is a test.");
        assert!(tokens.contains("hello"));
        assert!(tokens.contains("world"));
        assert!(tokens.contains("test"));
        assert!(!tokens.contains(""));
    }

    #[test]
    fn tokenize_strips_punctuation() {
        // Brackets/braces/parens are non-alphanumeric, so they are trimmed.
        let tokens = tokenize("(great) [tool] {for} developers!");
        assert!(tokens.contains("great"));
        assert!(tokens.contains("tool"));
        assert!(tokens.contains("for"));
        assert!(tokens.contains("developers"));
    }

    #[test]
    fn tokenize_empty_string() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    // --- jaccard_similarity ---

    #[test]
    fn jaccard_identical_sets() {
        let a: HashSet<String> = ["hello", "world"].iter().map(|s| s.to_string()).collect();
        let b = a.clone();
        assert!((jaccard_similarity(&a, &b) - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_sets() {
        let a: HashSet<String> = ["hello", "world"].iter().map(|s| s.to_string()).collect();
        let b: HashSet<String> = ["foo", "bar"].iter().map(|s| s.to_string()).collect();
        assert!((jaccard_similarity(&a, &b)).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_partial_overlap() {
        // 2 shared of 4 total distinct tokens -> 0.5.
        let a: HashSet<String> = ["hello", "world", "foo"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let b: HashSet<String> = ["hello", "world", "bar"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let sim = jaccard_similarity(&a, &b);
        assert!((sim - 0.5).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_empty_sets() {
        // Two empty sets are defined as fully similar (avoids 0/0).
        let a: HashSet<String> = HashSet::new();
        let b: HashSet<String> = HashSet::new();
        assert!((jaccard_similarity(&a, &b) - 1.0).abs() < f64::EPSILON);
    }

    // --- DedupChecker (DB-backed) ---

    #[tokio::test]
    async fn has_replied_to_works() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        assert!(!checker.has_replied_to("tweet_123").await.expect("check"));
        let reply = sample_reply("tweet_123", "Some reply");
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(checker.has_replied_to("tweet_123").await.expect("check"));
        assert!(!checker.has_replied_to("tweet_456").await.expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_exact_match() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let reply = sample_reply("t1", "This is a great tool for developers");
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(checker
            .is_phrasing_similar("This is a great tool for developers", 20)
            .await
            .expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_high_overlap() {
        // 8 of 9 tokens shared -> Jaccard 8/10 = 0.8, at the threshold.
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let reply = sample_reply("t1", "This is a great tool for developers and engineers");
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(checker
            .is_phrasing_similar("This is a great tool for developers and designers", 20)
            .await
            .expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_no_overlap() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let reply = sample_reply("t1", "This is a great tool for developers and engineers");
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(!checker
            .is_phrasing_similar("I love cooking pasta with fresh basil and tomatoes", 20)
            .await
            .expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_empty_string() {
        // Empty input short-circuits to "not similar" before any DB work.
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let reply = sample_reply("t1", "Some reply");
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(!checker.is_phrasing_similar("", 20).await.expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_short_reply_skips_similarity() {
        // Under 5 tokens: exact match still detected, fuzzy match skipped.
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let reply = sample_reply("t1", "Great point!");
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(checker
            .is_phrasing_similar("Great point!", 20)
            .await
            .expect("check"));
        assert!(!checker
            .is_phrasing_similar("Good point!", 20)
            .await
            .expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_no_recent_replies() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        assert!(!checker
            .is_phrasing_similar("Any reply text here that is long enough to test", 20)
            .await
            .expect("check"));
    }

    #[tokio::test]
    async fn get_recent_reply_phrases_works() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let r1 = sample_reply("t1", "Reply one");
        let r2 = sample_reply("t2", "Reply two");
        insert_reply(&pool, &r1).await.expect("ins1");
        insert_reply(&pool, &r2).await.expect("ins2");
        let phrases = checker.get_recent_reply_phrases(5).await.expect("get");
        assert_eq!(phrases.len(), 2);
    }

    // --- additional tokenize edge cases ---

    #[test]
    fn tokenize_mixed_punctuation() {
        let tokens = tokenize("Hello! World? This... is (a) test.");
        assert!(tokens.contains("hello"));
        assert!(tokens.contains("world"));
        assert!(tokens.contains("this"));
        assert!(tokens.contains("a"));
        assert!(tokens.contains("test"));
    }

    #[test]
    fn tokenize_duplicate_words() {
        // Set semantics: repeated words collapse to one entry.
        let tokens = tokenize("hello hello hello world");
        assert_eq!(tokens.len(), 2);
    }

    #[test]
    fn tokenize_only_punctuation() {
        let tokens = tokenize("!!! ... ???");
        assert!(tokens.is_empty());
    }

    #[test]
    fn tokenize_single_word() {
        let tokens = tokenize("hello");
        assert_eq!(tokens.len(), 1);
        assert!(tokens.contains("hello"));
    }

    #[test]
    fn tokenize_case_normalization() {
        // Lowercasing happens before collection, so case variants merge.
        let tokens = tokenize("Hello HELLO hello");
        assert_eq!(tokens.len(), 1);
        assert!(tokens.contains("hello"));
    }

    // --- additional jaccard edge cases ---

    #[test]
    fn jaccard_one_empty_one_not() {
        let a: HashSet<String> = HashSet::new();
        let b: HashSet<String> = ["hello"].iter().map(|s| s.to_string()).collect();
        assert!((jaccard_similarity(&a, &b) - 0.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_single_element_match() {
        let a: HashSet<String> = ["hello"].iter().map(|s| s.to_string()).collect();
        let b: HashSet<String> = ["hello"].iter().map(|s| s.to_string()).collect();
        assert!((jaccard_similarity(&a, &b) - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_superset() {
        // 2 shared of 3 distinct -> 2/3.
        let a: HashSet<String> = ["hello", "world", "foo"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let b: HashSet<String> = ["hello", "world"].iter().map(|s| s.to_string()).collect();
        let sim = jaccard_similarity(&a, &b);
        assert!((sim - 2.0 / 3.0).abs() < 0.001);
    }

    #[test]
    fn jaccard_high_overlap_threshold() {
        // 8 shared of 10 distinct -> exactly 0.8, the similarity cutoff.
        let common: Vec<String> = (0..8).map(|i| format!("word{i}")).collect();
        let mut a: HashSet<String> = common.iter().cloned().collect();
        a.insert("unique_a".to_string());
        let mut b: HashSet<String> = common.iter().cloned().collect();
        b.insert("unique_b".to_string());
        let sim = jaccard_similarity(&a, &b);
        assert!(sim >= 0.8);
    }

    // --- more DB-backed scenarios ---

    #[tokio::test]
    async fn has_replied_to_multiple_tweets() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let r1 = sample_reply("tweet_100", "Reply 1");
        let r2 = sample_reply("tweet_200", "Reply 2");
        insert_reply(&pool, &r1).await.expect("ins1");
        insert_reply(&pool, &r2).await.expect("ins2");
        assert!(checker.has_replied_to("tweet_100").await.expect("check"));
        assert!(checker.has_replied_to("tweet_200").await.expect("check"));
        assert!(!checker.has_replied_to("tweet_300").await.expect("check"));
    }

    #[tokio::test]
    async fn get_recent_reply_phrases_respects_limit() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        for i in 0..5 {
            let reply = sample_reply(&format!("t{i}"), &format!("Reply content {i}"));
            insert_reply(&pool, &reply).await.expect("insert");
        }
        let phrases = checker.get_recent_reply_phrases(3).await.expect("get");
        assert_eq!(phrases.len(), 3);
    }

    #[tokio::test]
    async fn is_phrasing_similar_moderate_overlap_not_triggered() {
        // Some shared tokens, but well below the 0.8 threshold.
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let reply = sample_reply(
            "t1",
            "Rust has an amazing type system with lifetimes and borrowing rules",
        );
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(!checker
            .is_phrasing_similar(
                "Python has dynamic typing with duck typing and runtime checks",
                20,
            )
            .await
            .expect("check"));
    }
}