// gdelt 0.1.0
//
// CLI for the GDELT Project, optimized for agentic usage with local data caching.
//! GDELT Global Knowledge Graph (GKG) model.
//!
//! The GKG provides rich metadata about news articles including
//! themes, entities, locations, tone, and more.

use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};

/// A GKG record
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GkgRecord {
    /// GKG Record ID
    pub gkg_record_id: String,
    /// Date (V2.1: YYYYMMDDHHMMSS)
    pub date: Option<NaiveDateTime>,
    /// Source identifier
    pub source_collection_identifier: Option<i32>,
    /// Source common name
    pub source_common_name: Option<String>,
    /// Document identifier
    pub document_identifier: String,

    // Counts
    pub counts: Option<String>,
    pub v2_counts: Option<String>,

    // Themes
    pub themes: Vec<String>,
    pub v2_themes: Option<String>,
    pub v2_enhanced_themes: Vec<EnhancedTheme>,

    // Locations
    pub locations: Vec<String>,
    pub v2_locations: Vec<GkgLocation>,
    pub v2_enhanced_locations: Vec<EnhancedLocation>,

    // Persons
    pub persons: Vec<String>,
    pub v2_persons: Vec<String>,
    pub v2_enhanced_persons: Vec<EnhancedEntity>,

    // Organizations
    pub organizations: Vec<String>,
    pub v2_organizations: Vec<String>,
    pub v2_enhanced_organizations: Vec<EnhancedEntity>,

    // Tone
    pub tone: Option<GkgTone>,
    pub v2_tone: Option<GkgTone>,

    // Dates mentioned in article
    pub v2_enhanced_dates: Vec<String>,

    // GCAM (Global Content Analysis Measures)
    pub v2_gcam: Vec<GcamScore>,

    // URLs
    pub v2_sharing_image: Option<String>,
    pub v2_related_images: Vec<String>,
    pub v2_social_image_embeds: Vec<String>,
    pub v2_social_video_embeds: Vec<String>,

    // Quotations
    pub v2_quotations: Vec<Quotation>,

    // Names
    pub v2_all_names: Vec<String>,
    pub v2_amounts: Vec<String>,

    // Translation info
    pub v2_translation_info: Option<String>,

    // Extras
    pub v2_extras_xml: Option<String>,
}

/// Enhanced theme with character offset
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnhancedTheme {
    pub theme: String,
    pub char_offset: Option<i32>,
}

/// GKG location
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GkgLocation {
    pub location_type: i32,
    pub full_name: String,
    pub country_code: Option<String>,
    pub adm1_code: Option<String>,
    pub adm2_code: Option<String>,
    pub lat: Option<f64>,
    pub long: Option<f64>,
    pub feature_id: Option<String>,
}

/// Enhanced location with character offset
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnhancedLocation {
    pub location: GkgLocation,
    pub char_offset: Option<i32>,
}

/// Enhanced entity (person or organization)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnhancedEntity {
    pub name: String,
    pub char_offset: Option<i32>,
}

/// GKG tone analysis
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GkgTone {
    pub tone: f64,
    pub positive_score: f64,
    pub negative_score: f64,
    pub polarity: f64,
    pub activity_reference_density: f64,
    pub self_group_reference_density: f64,
    pub word_count: Option<i32>,
}

/// GCAM (Global Content Analysis Measures) score
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GcamScore {
    pub dimension: String,
    pub score: f64,
}

/// Quotation from an article
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Quotation {
    pub char_offset: i32,
    pub length: i32,
    pub verb: Option<String>,
    pub quote: String,
}

impl GkgRecord {
    /// Parse a GKG record from a TSV line (GKG 2.1 format)
    pub fn from_tsv(line: &str) -> Result<Self, String> {
        let fields: Vec<&str> = line.split('\t').collect();
        if fields.len() < 15 {
            return Err(format!("Expected at least 15 fields, got {}", fields.len()));
        }

        Ok(GkgRecord {
            gkg_record_id: fields[0].to_string(),
            date: parse_gkg_date(fields[1]),
            source_collection_identifier: fields[2].parse().ok(),
            source_common_name: non_empty(fields[3]),
            document_identifier: fields[4].to_string(),
            counts: non_empty(fields[5]),
            v2_counts: non_empty(fields[6]),
            themes: parse_semicolon_list(fields[7]),
            v2_themes: non_empty(fields[7]),
            v2_enhanced_themes: parse_enhanced_themes(fields.get(8).unwrap_or(&"")),
            locations: parse_semicolon_list(fields[9]),
            v2_locations: vec![], // Complex parsing needed
            v2_enhanced_locations: vec![],
            persons: parse_semicolon_list(fields[10]),
            v2_persons: parse_semicolon_list(fields[10]),
            v2_enhanced_persons: vec![],
            organizations: parse_semicolon_list(fields[11]),
            v2_organizations: parse_semicolon_list(fields[11]),
            v2_enhanced_organizations: vec![],
            tone: parse_tone(fields.get(12).unwrap_or(&"")),
            v2_tone: parse_tone(fields.get(12).unwrap_or(&"")),
            v2_enhanced_dates: vec![],
            v2_gcam: vec![],
            v2_sharing_image: fields.get(13).and_then(|s| non_empty(s)),
            v2_related_images: vec![],
            v2_social_image_embeds: vec![],
            v2_social_video_embeds: vec![],
            v2_quotations: vec![],
            v2_all_names: vec![],
            v2_amounts: vec![],
            v2_translation_info: None,
            v2_extras_xml: fields.get(14).and_then(|s| non_empty(s)),
        })
    }

    /// Get all unique themes
    pub fn all_themes(&self) -> Vec<&str> {
        self.themes.iter().map(|s| s.as_str()).collect()
    }

    /// Get the overall tone score
    pub fn tone_score(&self) -> Option<f64> {
        self.v2_tone.as_ref().or(self.tone.as_ref()).map(|t| t.tone)
    }

    /// Check if record mentions a specific theme
    pub fn has_theme(&self, theme: &str) -> bool {
        let theme_upper = theme.to_uppercase();
        self.themes.iter().any(|t| t.to_uppercase().contains(&theme_upper))
    }

    /// Check if record mentions a specific person
    pub fn has_person(&self, person: &str) -> bool {
        let person_lower = person.to_lowercase();
        self.persons.iter().any(|p| p.to_lowercase().contains(&person_lower))
    }

    /// Check if record mentions a specific organization
    pub fn has_organization(&self, org: &str) -> bool {
        let org_lower = org.to_lowercase();
        self.organizations.iter().any(|o| o.to_lowercase().contains(&org_lower))
    }
}

fn parse_gkg_date(s: &str) -> Option<NaiveDateTime> {
    if s.len() < 14 {
        return None;
    }
    chrono::NaiveDateTime::parse_from_str(s, "%Y%m%d%H%M%S").ok()
}

fn parse_semicolon_list(s: &str) -> Vec<String> {
    if s.trim().is_empty() {
        return vec![];
    }
    s.split(';')
        .filter(|s| !s.is_empty())
        .map(|s| s.to_string())
        .collect()
}

fn parse_enhanced_themes(s: &str) -> Vec<EnhancedTheme> {
    if s.trim().is_empty() {
        return vec![];
    }
    s.split(';')
        .filter(|s| !s.is_empty())
        .map(|entry| {
            let parts: Vec<&str> = entry.split(',').collect();
            EnhancedTheme {
                theme: parts.first().unwrap_or(&"").to_string(),
                char_offset: parts.get(1).and_then(|s| s.parse().ok()),
            }
        })
        .collect()
}

fn parse_tone(s: &str) -> Option<GkgTone> {
    let parts: Vec<&str> = s.split(',').collect();
    if parts.len() < 6 {
        return None;
    }
    Some(GkgTone {
        tone: parts[0].parse().ok()?,
        positive_score: parts[1].parse().ok()?,
        negative_score: parts[2].parse().ok()?,
        polarity: parts[3].parse().ok()?,
        activity_reference_density: parts[4].parse().ok()?,
        self_group_reference_density: parts[5].parse().ok()?,
        word_count: parts.get(6).and_then(|s| s.parse().ok()),
    })
}

fn non_empty(s: &str) -> Option<String> {
    let trimmed = s.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}

/// Column names for GKG export
pub const GKG_COLUMNS: &[&str] = &[
    "GKGRECORDID", "DATE", "SourceCollectionIdentifier", "SourceCommonName",
    "DocumentIdentifier", "Counts", "V2Counts", "Themes", "V2EnhancedThemes",
    "Locations", "V2EnhancedLocations", "Persons", "V2EnhancedPersons",
    "Organizations", "V2EnhancedOrganizations", "V2Tone", "V2EnhancedDates",
    "V2GCAM", "V2SharingImage", "V2RelatedImages", "V2SocialImageEmbeds",
    "V2SocialVideoEmbeds", "V2Quotations", "V2AllNames", "V2Amounts",
    "V2TranslationInfo", "V2ExtrasXML"
];