gdelt 0.1.0

CLI for GDELT Project - optimized for agentic usage with local data caching
//! Entity extraction and analysis from GDELT data.

#![allow(dead_code)]

use crate::db::AnalyticsDb;
use crate::error::Result;
use serde::{Deserialize, Serialize};

/// Types of entities that can be extracted.
///
/// Serde (de)serializes each variant as its lowercase name
/// (`rename_all = "lowercase"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum EntityType {
    /// Values unnested from the `persons` column of the `gkg` table.
    Person,
    /// Values unnested from the `organizations` column of the `gkg` table.
    Organization,
    /// Action locations (`action_geo_fullname`) from the `events` table.
    Location,
    /// Actor 1 / actor 2 codes and names from the `events` table.
    Actor,
    /// Values unnested from the `themes` column of the `gkg` table.
    Theme,
    /// Actors, locations, persons and organizations combined.
    /// Themes are not part of this set in `extract_entities`.
    All,
}

impl EntityType {
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Person => "person",
            Self::Organization => "organization",
            Self::Location => "location",
            Self::Actor => "actor",
            Self::Theme => "theme",
            Self::All => "all",
        }
    }
}

/// A single aggregated entity row produced by the extractors in this module.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityResult {
    /// Entity name/value. Actors are rendered as `name (code)` (just the code
    /// when no name is available) and locations as `name [country]` when a
    /// country code is present; GKG entities use the raw value.
    pub name: String,
    /// Entity type
    pub entity_type: EntityType,
    /// Number of occurrences (`COUNT(*)` of the grouped rows)
    pub count: i64,
    /// Average tone when this entity appears; `None` when the query returned
    /// no floating-point value for the group
    pub avg_tone: Option<f64>,
    /// First seen date (YYYYMMDD): the minimum date of the group, stringified
    pub first_seen: Option<String>,
    /// Last seen date (YYYYMMDD): the maximum date of the group, stringified
    pub last_seen: Option<String>,
}

/// Configuration for entity extraction
#[derive(Debug, Clone)]
pub struct EntitiesConfig {
    /// Which kind of entity to extract; selects the extractor(s) that run.
    pub entity_type: EntityType,
    /// Minimum number of occurrences a group needs to be returned
    /// (applied as `HAVING COUNT(*) >= min_count`).
    pub min_count: u32,
    /// Maximum number of rows per query and of the final result list.
    pub limit: u32,
    /// Inclusive lower date bound; dashes are stripped before it is compared
    /// with the date column, so both `YYYY-MM-DD` and `YYYYMMDD` work.
    pub start_date: Option<String>,
    /// Inclusive upper date bound, in the same format as `start_date`.
    pub end_date: Option<String>,
}

impl Default for EntitiesConfig {
    fn default() -> Self {
        Self {
            entity_type: EntityType::All,
            min_count: 5,
            limit: 50,
            start_date: None,
            end_date: None,
        }
    }
}

/// Runs the extractor(s) selected by `config.entity_type` and merges their rows.
///
/// The merged list is ordered by descending `count` (ties keep the order in
/// which the extractors produced them) and cut to at most `config.limit` rows.
///
/// # Errors
/// Returns the first error reported by any of the underlying queries.
pub fn extract_entities(db: &AnalyticsDb, config: &EntitiesConfig) -> Result<Vec<EntityResult>> {
    let mut entities = match config.entity_type {
        EntityType::Actor => extract_actors(db, config)?,
        EntityType::Location => extract_locations(db, config)?,
        EntityType::Person => extract_persons_from_gkg(db, config)?,
        EntityType::Organization => extract_organizations_from_gkg(db, config)?,
        EntityType::Theme => extract_themes_from_gkg(db, config)?,
        EntityType::All => {
            // NOTE(review): themes are not included in `All` — confirm this is intentional.
            let mut merged = extract_actors(db, config)?;
            merged.extend(extract_locations(db, config)?);
            merged.extend(extract_persons_from_gkg(db, config)?);
            merged.extend(extract_organizations_from_gkg(db, config)?);
            merged
        }
    };

    // Most frequent entities first; the sort is stable.
    entities.sort_by_key(|entity| std::cmp::Reverse(entity.count));

    // Each extractor already limits its own query, but a merged list can
    // still exceed the requested size.
    entities.truncate(config.limit as usize);

    Ok(entities)
}

fn build_date_filter(config: &EntitiesConfig, date_column: &str) -> String {
    let mut conditions = Vec::new();

    if let Some(ref start) = config.start_date {
        conditions.push(format!("{} >= {}", date_column, start.replace('-', "")));
    }
    if let Some(ref end) = config.end_date {
        conditions.push(format!("{} <= {}", date_column, end.replace('-', "")));
    }

    if conditions.is_empty() {
        String::new()
    } else {
        format!("WHERE {}", conditions.join(" AND "))
    }
}

/// Aggregates actor occurrences from the `events` table.
///
/// Actor 1 and actor 2 of every event are stacked with `UNION ALL`, grouped by
/// code and name, and reduced to a count, average tone and first/last
/// `sql_date`. Groups with fewer than `config.min_count` rows are dropped and
/// at most `config.limit` groups are returned, most frequent first.
///
/// # Errors
/// Returns any error reported by `db.query`.
fn extract_actors(db: &AnalyticsDb, config: &EntitiesConfig) -> Result<Vec<EntityResult>> {
    let date_filter = build_date_filter(config, "sql_date");
    let having_clause = format!("HAVING COUNT(*) >= {}", config.min_count);

    // `build_date_filter` yields either "" or a complete "WHERE ..." clause, so
    // the NOT NULL predicate has to be joined with AND when a date range is
    // present; emitting a second WHERE would produce invalid SQL.
    let actor_filter = |code_column: &str| -> String {
        if date_filter.is_empty() {
            format!("WHERE {} IS NOT NULL", code_column)
        } else {
            format!("{} AND {} IS NOT NULL", date_filter, code_column)
        }
    };

    let sql = format!(
        r#"
        SELECT
            actor_code,
            actor_name,
            COUNT(*) as cnt,
            AVG(avg_tone) as tone,
            MIN(sql_date) as first_seen,
            MAX(sql_date) as last_seen
        FROM (
            SELECT actor1_code as actor_code, actor1_name as actor_name, avg_tone, sql_date
            FROM events
            {}
            UNION ALL
            SELECT actor2_code as actor_code, actor2_name as actor_name, avg_tone, sql_date
            FROM events
            {}
        )
        GROUP BY actor_code, actor_name
        {}
        ORDER BY cnt DESC
        LIMIT {}
        "#,
        actor_filter("actor1_code"),
        actor_filter("actor2_code"),
        having_clause,
        config.limit
    );

    let query_result = db.query(&sql)?;

    let mut results = Vec::new();
    for row in &query_result.rows {
        // Missing or non-string cells fall back to the empty string.
        let code = row.get(0).and_then(|v| v.as_str()).unwrap_or("");
        let name = row.get(1).and_then(|v| v.as_str()).unwrap_or("");
        // Show "name (code)" when a name exists, otherwise just the code.
        let display_name = if name.is_empty() {
            code.to_string()
        } else {
            format!("{} ({})", name, code)
        };

        results.push(EntityResult {
            name: display_name,
            entity_type: EntityType::Actor,
            count: row.get(2).and_then(|v| v.as_i64()).unwrap_or(0),
            avg_tone: row.get(3).and_then(|v| v.as_f64()),
            first_seen: row.get(4).and_then(|v| v.as_i64()).map(|d| d.to_string()),
            last_seen: row.get(5).and_then(|v| v.as_i64()).map(|d| d.to_string()),
        });
    }

    Ok(results)
}

/// Aggregates action locations from the `events` table.
///
/// Rows are grouped by `action_geo_fullname` and `action_geo_country_code`;
/// each group yields its count, average tone and first/last `sql_date`.
/// Groups below `config.min_count` are dropped and at most `config.limit`
/// groups are returned, most frequent first.
///
/// # Errors
/// Returns any error reported by `db.query`.
fn extract_locations(db: &AnalyticsDb, config: &EntitiesConfig) -> Result<Vec<EntityResult>> {
    let having_clause = format!("HAVING COUNT(*) >= {}", config.min_count);
    let date_filter = build_date_filter(config, "sql_date");

    // Without a date range the NOT NULL predicate opens the WHERE clause itself;
    // with one it is appended to the existing clause via AND.
    let (where_prefix, connector) = if date_filter.is_empty() {
        ("WHERE", "")
    } else {
        (date_filter.as_str(), "AND")
    };

    let sql = format!(
        r#"
        SELECT
            action_geo_fullname,
            action_geo_country_code,
            COUNT(*) as cnt,
            AVG(avg_tone) as tone,
            MIN(sql_date) as first_seen,
            MAX(sql_date) as last_seen
        FROM events
        {}
        {} action_geo_fullname IS NOT NULL
        GROUP BY action_geo_fullname, action_geo_country_code
        {}
        ORDER BY cnt DESC
        LIMIT {}
        "#,
        where_prefix, connector, having_clause, config.limit
    );

    let query_result = db.query(&sql)?;

    let mut locations = Vec::new();
    for row in &query_result.rows {
        // Missing or non-string cells fall back to the empty string.
        let place = row.get(0).and_then(|v| v.as_str()).unwrap_or("");
        let country = row.get(1).and_then(|v| v.as_str()).unwrap_or("");

        // Append the country code in brackets when one is available.
        let label = if country.is_empty() {
            place.to_string()
        } else {
            format!("{} [{}]", place, country)
        };

        let count = row.get(2).and_then(|v| v.as_i64()).unwrap_or(0);
        let avg_tone = row.get(3).and_then(|v| v.as_f64());
        let first_seen = row.get(4).and_then(|v| v.as_i64()).map(|d| d.to_string());
        let last_seen = row.get(5).and_then(|v| v.as_i64()).map(|d| d.to_string());

        locations.push(EntityResult {
            name: label,
            entity_type: EntityType::Location,
            count,
            avg_tone,
            first_seen,
            last_seen,
        });
    }

    Ok(locations)
}

/// Aggregates person mentions from the `gkg` table.
///
/// The `persons` array of every row is unnested, then grouped by person to
/// obtain a count, average tone and first/last date. Groups below
/// `config.min_count` are dropped and at most `config.limit` are returned.
///
/// # Errors
/// Returns any error reported by `db.query`.
fn extract_persons_from_gkg(db: &AnalyticsDb, config: &EntitiesConfig) -> Result<Vec<EntityResult>> {
    // NOTE(review): `config.start_date` / `config.end_date` are not applied to
    // this query — confirm whether GKG results should honor the date range.
    let min_count_clause = format!("HAVING COUNT(*) >= {}", config.min_count);

    // GKG stores persons as an array - unnest it
    let sql = format!(
        r#"
        SELECT
            person,
            COUNT(*) as cnt,
            AVG(tone) as tone,
            MIN(date) as first_seen,
            MAX(date) as last_seen
        FROM (
            SELECT UNNEST(persons) as person, tone, date
            FROM gkg
            WHERE persons IS NOT NULL
        )
        GROUP BY person
        {}
        ORDER BY cnt DESC
        LIMIT {}
        "#,
        min_count_clause, config.limit
    );

    let query_result = db.query(&sql)?;

    let mut persons = Vec::new();
    for row in &query_result.rows {
        // A missing or non-string name becomes the empty string.
        let name = row.get(0).and_then(|v| v.as_str()).unwrap_or("").to_string();
        let count = row.get(1).and_then(|v| v.as_i64()).unwrap_or(0);
        let avg_tone = row.get(2).and_then(|v| v.as_f64());
        let first_seen = row.get(3).and_then(|v| v.as_i64()).map(|d| d.to_string());
        let last_seen = row.get(4).and_then(|v| v.as_i64()).map(|d| d.to_string());

        persons.push(EntityResult {
            name,
            entity_type: EntityType::Person,
            count,
            avg_tone,
            first_seen,
            last_seen,
        });
    }

    Ok(persons)
}

/// Aggregates organization mentions from the `gkg` table.
///
/// The `organizations` array of every row is unnested, then grouped by
/// organization to obtain a count, average tone and first/last date. Groups
/// below `config.min_count` are dropped and at most `config.limit` are returned.
///
/// # Errors
/// Returns any error reported by `db.query`.
fn extract_organizations_from_gkg(db: &AnalyticsDb, config: &EntitiesConfig) -> Result<Vec<EntityResult>> {
    // NOTE(review): `config.start_date` / `config.end_date` are not applied to
    // this query — confirm whether GKG results should honor the date range.
    let min_count_clause = format!("HAVING COUNT(*) >= {}", config.min_count);

    let sql = format!(
        r#"
        SELECT
            org,
            COUNT(*) as cnt,
            AVG(tone) as tone,
            MIN(date) as first_seen,
            MAX(date) as last_seen
        FROM (
            SELECT UNNEST(organizations) as org, tone, date
            FROM gkg
            WHERE organizations IS NOT NULL
        )
        GROUP BY org
        {}
        ORDER BY cnt DESC
        LIMIT {}
        "#,
        min_count_clause, config.limit
    );

    let query_result = db.query(&sql)?;

    let mut organizations = Vec::new();
    for row in &query_result.rows {
        // A missing or non-string name becomes the empty string.
        let name = row.get(0).and_then(|v| v.as_str()).unwrap_or("").to_string();
        let count = row.get(1).and_then(|v| v.as_i64()).unwrap_or(0);
        let avg_tone = row.get(2).and_then(|v| v.as_f64());
        let first_seen = row.get(3).and_then(|v| v.as_i64()).map(|d| d.to_string());
        let last_seen = row.get(4).and_then(|v| v.as_i64()).map(|d| d.to_string());

        organizations.push(EntityResult {
            name,
            entity_type: EntityType::Organization,
            count,
            avg_tone,
            first_seen,
            last_seen,
        });
    }

    Ok(organizations)
}

/// Aggregates theme mentions from the `gkg` table.
///
/// The `themes` array of every row is unnested, then grouped by theme to
/// obtain a count, average tone and first/last date. Groups below
/// `config.min_count` are dropped and at most `config.limit` are returned.
///
/// # Errors
/// Returns any error reported by `db.query`.
fn extract_themes_from_gkg(db: &AnalyticsDb, config: &EntitiesConfig) -> Result<Vec<EntityResult>> {
    // NOTE(review): `config.start_date` / `config.end_date` are not applied to
    // this query — confirm whether GKG results should honor the date range.
    let min_count_clause = format!("HAVING COUNT(*) >= {}", config.min_count);

    let sql = format!(
        r#"
        SELECT
            theme,
            COUNT(*) as cnt,
            AVG(tone) as tone,
            MIN(date) as first_seen,
            MAX(date) as last_seen
        FROM (
            SELECT UNNEST(themes) as theme, tone, date
            FROM gkg
            WHERE themes IS NOT NULL
        )
        GROUP BY theme
        {}
        ORDER BY cnt DESC
        LIMIT {}
        "#,
        min_count_clause, config.limit
    );

    let query_result = db.query(&sql)?;

    let mut themes = Vec::new();
    for row in &query_result.rows {
        // A missing or non-string name becomes the empty string.
        let name = row.get(0).and_then(|v| v.as_str()).unwrap_or("").to_string();
        let count = row.get(1).and_then(|v| v.as_i64()).unwrap_or(0);
        let avg_tone = row.get(2).and_then(|v| v.as_f64());
        let first_seen = row.get(3).and_then(|v| v.as_i64()).map(|d| d.to_string());
        let last_seen = row.get(4).and_then(|v| v.as_i64()).map(|d| d.to_string());

        themes.push(EntityResult {
            name,
            entity_type: EntityType::Theme,
            count,
            avg_tone,
            first_seen,
            last_seen,
        });
    }

    Ok(themes)
}