use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
/// One row of a tab-separated GKG export, as built by `GkgRecord::from_tsv`.
///
/// NOTE(review): the field set appears to follow the GDELT Global Knowledge
/// Graph layout named in `GKG_COLUMNS` — confirm against the upstream codebook.
/// Several `v2_*` fields are declared here but are not filled in by the parser
/// in this file; they stay empty after `from_tsv`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GkgRecord {
/// Record identifier, copied verbatim from the first column.
pub gkg_record_id: String,
/// Record timestamp parsed from a `YYYYMMDDHHMMSS` string; `None` when it
/// cannot be parsed.
pub date: Option<NaiveDateTime>,
/// Integer source-collection code; `None` when the column is not an integer.
pub source_collection_identifier: Option<i32>,
/// Source name; `None` when the column is blank.
pub source_common_name: Option<String>,
/// Document identifier column, copied verbatim.
pub document_identifier: String,
/// Raw counts column; `None` when blank.
pub counts: Option<String>,
/// Raw V2 counts column; `None` when blank.
pub v2_counts: Option<String>,
/// Theme names, split on `;` with empty entries dropped.
pub themes: Vec<String>,
/// Raw, unsplit themes string; `None` when blank.
pub v2_themes: Option<String>,
/// Theme mentions paired with their optional character offsets.
pub v2_enhanced_themes: Vec<EnhancedTheme>,
/// Location entries, split on `;` and kept as raw strings.
pub locations: Vec<String>,
/// Structured locations. Not populated by `from_tsv` in this file.
pub v2_locations: Vec<GkgLocation>,
/// Structured locations with offsets. Not populated by `from_tsv` in this file.
pub v2_enhanced_locations: Vec<EnhancedLocation>,
/// Person names, split on `;`.
pub persons: Vec<String>,
/// Person names; `from_tsv` fills this from the same column as `persons`.
pub v2_persons: Vec<String>,
/// Person mentions with offsets. Not populated by `from_tsv` in this file.
pub v2_enhanced_persons: Vec<EnhancedEntity>,
/// Organization names, split on `;`.
pub organizations: Vec<String>,
/// Organization names; `from_tsv` fills this from the same column as
/// `organizations`.
pub v2_organizations: Vec<String>,
/// Organization mentions with offsets. Not populated by `from_tsv` in this file.
pub v2_enhanced_organizations: Vec<EnhancedEntity>,
/// Tone metrics; `None` when the tone column cannot be parsed.
pub tone: Option<GkgTone>,
/// Tone metrics; `from_tsv` parses this from the same column as `tone`.
pub v2_tone: Option<GkgTone>,
/// Date references. Not populated by `from_tsv` in this file.
pub v2_enhanced_dates: Vec<String>,
/// GCAM dimension scores. Not populated by `from_tsv` in this file.
pub v2_gcam: Vec<GcamScore>,
/// Sharing image; `None` when the column is blank or absent.
pub v2_sharing_image: Option<String>,
/// Related-image entries.
pub v2_related_images: Vec<String>,
/// Social image embed entries.
pub v2_social_image_embeds: Vec<String>,
/// Social video embed entries.
pub v2_social_video_embeds: Vec<String>,
/// Quotations. Not populated by `from_tsv` in this file.
pub v2_quotations: Vec<Quotation>,
/// Name entries. Not populated by `from_tsv` in this file.
pub v2_all_names: Vec<String>,
/// Amount entries. Not populated by `from_tsv` in this file.
pub v2_amounts: Vec<String>,
/// Translation info, kept as a raw string when present.
pub v2_translation_info: Option<String>,
/// Extras XML payload; `None` when the column is blank or absent.
pub v2_extras_xml: Option<String>,
}
/// A theme mention paired with an optional character offset.
///
/// Produced by `parse_enhanced_themes` from `theme,offset` entries.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnhancedTheme {
/// Theme name: the text before the first comma of the entry.
pub theme: String,
/// Second comma-separated component parsed as an integer; `None` when it is
/// missing or not a valid `i32`.
pub char_offset: Option<i32>,
}
/// A structured geographic location.
///
/// NOTE(review): nothing in this file constructs this type, so the field
/// notes below are inferred from names and types only — confirm against the
/// code that fills it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GkgLocation {
/// Numeric location-type code; the meaning of each code is not defined here.
pub location_type: i32,
/// Full display name of the location.
pub full_name: String,
/// Country code, when known.
pub country_code: Option<String>,
/// First-level administrative division code, when known (presumably ADM1).
pub adm1_code: Option<String>,
/// Second-level administrative division code, when known (presumably ADM2).
pub adm2_code: Option<String>,
/// Latitude, when known (presumably decimal degrees — TODO confirm).
pub lat: Option<f64>,
/// Longitude, when known (presumably decimal degrees — TODO confirm).
pub long: Option<f64>,
/// Feature identifier, when known.
pub feature_id: Option<String>,
}
/// A [`GkgLocation`] paired with an optional character offset.
///
/// NOTE(review): not constructed anywhere in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnhancedLocation {
/// The location being referenced.
pub location: GkgLocation,
/// Character offset of the mention, when available (presumably into the
/// source document — TODO confirm).
pub char_offset: Option<i32>,
}
/// A named entity paired with an optional character offset.
///
/// Used as the element type of `GkgRecord::v2_enhanced_persons` and
/// `GkgRecord::v2_enhanced_organizations`. NOTE(review): not constructed
/// anywhere in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnhancedEntity {
/// Entity name.
pub name: String,
/// Character offset of the mention, when available.
pub char_offset: Option<i32>,
}
/// Tone metrics parsed by `parse_tone` from a comma-separated list.
///
/// The first six components are required and map, in order, to the first six
/// fields below; the seventh component is optional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GkgTone {
/// First component: the headline tone value (returned by
/// `GkgRecord::tone_score`).
pub tone: f64,
/// Second component.
pub positive_score: f64,
/// Third component.
pub negative_score: f64,
/// Fourth component.
pub polarity: f64,
/// Fifth component.
pub activity_reference_density: f64,
/// Sixth component.
pub self_group_reference_density: f64,
/// Seventh component parsed as an integer; `None` when it is missing or not
/// a valid `i32`.
pub word_count: Option<i32>,
}
/// A single dimension/score pair.
///
/// Element type of `GkgRecord::v2_gcam`. NOTE(review): not constructed
/// anywhere in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GcamScore {
/// Dimension key.
pub dimension: String,
/// Score recorded for that dimension.
pub score: f64,
}
/// A quotation entry.
///
/// Element type of `GkgRecord::v2_quotations`. NOTE(review): not constructed
/// anywhere in this file, so units and origin of the offsets are unconfirmed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Quotation {
/// Offset of the quotation (presumably characters into the source document).
pub char_offset: i32,
/// Length of the quotation.
pub length: i32,
/// Verb introducing the quotation, when one was recorded.
pub verb: Option<String>,
/// The quoted text.
pub quote: String,
}
impl GkgRecord {
    /// Parses one tab-separated GKG line into a [`GkgRecord`].
    ///
    /// Columns are read at the zero-based positions listed in `GKG_COLUMNS`:
    /// persons at 11, organizations at 13, tone at 15, sharing image at 18,
    /// related images and social embeds at 19–21, translation info at 25 and
    /// extras XML at 26. Earlier revisions read several of these one or more
    /// columns too early (for example persons from the enhanced-locations
    /// column), which silently produced wrong data.
    ///
    /// Fields that need a dedicated structured parser (`v2_locations`,
    /// `v2_enhanced_locations`, `v2_enhanced_persons`,
    /// `v2_enhanced_organizations`, `v2_enhanced_dates`, `v2_gcam`,
    /// `v2_quotations`, `v2_all_names`, `v2_amounts`) are still left empty.
    ///
    /// # Errors
    ///
    /// Returns a message when `line` has fewer than 15 tab-separated fields.
    /// Columns past the 15th are optional and are treated as empty when absent.
    pub fn from_tsv(line: &str) -> Result<Self, String> {
        let fields: Vec<&str> = line.split('\t').collect();
        if fields.len() < 15 {
            return Err(format!("Expected at least 15 fields, got {}", fields.len()));
        }

        // Only the first 15 columns are guaranteed by the check above; anything
        // later may be missing on a truncated row and is read as "".
        let column = |index: usize| fields.get(index).copied().unwrap_or("");

        // The V1 and V2 views of these values are filled from the same column,
        // so parse once and duplicate.
        let persons = parse_semicolon_list(fields[11]);
        let organizations = parse_semicolon_list(fields[13]);
        let tone = parse_tone(column(15));

        Ok(GkgRecord {
            gkg_record_id: fields[0].to_string(),
            date: parse_gkg_date(fields[1]),
            source_collection_identifier: fields[2].parse().ok(),
            source_common_name: non_empty(fields[3]),
            document_identifier: fields[4].to_string(),
            counts: non_empty(fields[5]),
            v2_counts: non_empty(fields[6]),
            themes: parse_semicolon_list(fields[7]),
            // Kept as the raw, unsplit `Themes` column (index 7).
            v2_themes: non_empty(fields[7]),
            v2_enhanced_themes: parse_enhanced_themes(fields[8]),
            locations: parse_semicolon_list(fields[9]),
            v2_locations: vec![],
            v2_enhanced_locations: vec![],
            persons: persons.clone(),
            v2_persons: persons,
            v2_enhanced_persons: vec![],
            organizations: organizations.clone(),
            v2_organizations: organizations,
            v2_enhanced_organizations: vec![],
            tone: tone.clone(),
            v2_tone: tone,
            v2_enhanced_dates: vec![],
            v2_gcam: vec![],
            v2_sharing_image: non_empty(column(18)),
            v2_related_images: parse_semicolon_list(column(19)),
            v2_social_image_embeds: parse_semicolon_list(column(20)),
            v2_social_video_embeds: parse_semicolon_list(column(21)),
            v2_quotations: vec![],
            v2_all_names: vec![],
            v2_amounts: vec![],
            v2_translation_info: non_empty(column(25)),
            v2_extras_xml: non_empty(column(26)),
        })
    }

    /// Returns every entry of `themes` as a borrowed string slice.
    pub fn all_themes(&self) -> Vec<&str> {
        self.themes.iter().map(|s| s.as_str()).collect()
    }

    /// Returns the headline tone value, preferring `v2_tone` and falling back
    /// to `tone`; `None` when neither is set.
    pub fn tone_score(&self) -> Option<f64> {
        self.v2_tone.as_ref().or(self.tone.as_ref()).map(|t| t.tone)
    }

    /// Reports whether any theme contains `theme`, ignoring case.
    ///
    /// This is a substring match, so an empty `theme` matches any record that
    /// has at least one theme.
    pub fn has_theme(&self, theme: &str) -> bool {
        let theme_upper = theme.to_uppercase();
        self.themes.iter().any(|t| t.to_uppercase().contains(&theme_upper))
    }

    /// Reports whether any person name contains `person`, ignoring case
    /// (substring match).
    pub fn has_person(&self, person: &str) -> bool {
        let person_lower = person.to_lowercase();
        self.persons.iter().any(|p| p.to_lowercase().contains(&person_lower))
    }

    /// Reports whether any organization name contains `org`, ignoring case
    /// (substring match).
    pub fn has_organization(&self, org: &str) -> bool {
        let org_lower = org.to_lowercase();
        self.organizations.iter().any(|o| o.to_lowercase().contains(&org_lower))
    }
}
/// Parses a compact `YYYYMMDDHHMMSS` timestamp.
///
/// Returns `None` when `s` is shorter than 14 bytes or does not match the
/// `%Y%m%d%H%M%S` format.
fn parse_gkg_date(s: &str) -> Option<NaiveDateTime> {
    const TIMESTAMP_FORMAT: &str = "%Y%m%d%H%M%S";

    if s.len() >= 14 {
        chrono::NaiveDateTime::parse_from_str(s, TIMESTAMP_FORMAT).ok()
    } else {
        None
    }
}
/// Splits a `;`-delimited string into owned entries.
///
/// A blank (empty or whitespace-only) input yields an empty vector. Otherwise
/// zero-length entries are dropped; entries are not trimmed, so an entry made
/// only of whitespace is kept as-is.
fn parse_semicolon_list(s: &str) -> Vec<String> {
    let mut items = Vec::new();
    if !s.trim().is_empty() {
        for piece in s.split(';') {
            if !piece.is_empty() {
                items.push(piece.to_owned());
            }
        }
    }
    items
}
/// Parses `theme,offset` entries separated by `;` into [`EnhancedTheme`]s.
///
/// A blank input yields an empty vector and zero-length entries are skipped.
/// Within an entry, the text before the first comma is the theme and the next
/// comma-separated component is the offset; an offset that is missing or not a
/// valid integer becomes `None`. Any further components are ignored.
fn parse_enhanced_themes(s: &str) -> Vec<EnhancedTheme> {
    let mut themes = Vec::new();
    if s.trim().is_empty() {
        return themes;
    }

    for entry in s.split(';') {
        if entry.is_empty() {
            continue;
        }
        let mut components = entry.split(',');
        let theme = components.next().unwrap_or("").to_string();
        let char_offset = components.next().and_then(|raw| raw.parse().ok());
        themes.push(EnhancedTheme { theme, char_offset });
    }
    themes
}
/// Parses a comma-separated tone column into a [`GkgTone`].
///
/// The first six components must all parse as `f64`; if there are fewer than
/// six, or any of them is invalid, the result is `None`. A seventh component,
/// when present and a valid integer, becomes `word_count`.
fn parse_tone(s: &str) -> Option<GkgTone> {
    let values: Vec<&str> = s.split(',').collect();
    match values.as_slice() {
        [tone, positive, negative, polarity, activity, self_group, rest @ ..] => Some(GkgTone {
            tone: tone.parse().ok()?,
            positive_score: positive.parse().ok()?,
            negative_score: negative.parse().ok()?,
            polarity: polarity.parse().ok()?,
            activity_reference_density: activity.parse().ok()?,
            self_group_reference_density: self_group.parse().ok()?,
            // The word count is optional, so a bad value degrades to `None`
            // instead of rejecting the whole tone.
            word_count: rest.first().and_then(|count| count.parse().ok()),
        }),
        _ => None,
    }
}
/// Trims `s` and returns it as an owned string, or `None` when nothing is left
/// after trimming.
fn non_empty(s: &str) -> Option<String> {
    Some(s.trim())
        .filter(|text| !text.is_empty())
        .map(String::from)
}
/// Names of the GKG columns, in the order they are listed here.
///
/// NOTE(review): the position of each name presumably equals the zero-based
/// index of that column in a tab-separated row — confirm against the upstream
/// format description.
pub const GKG_COLUMNS: &[&str] = &[
    "GKGRECORDID",                // 0
    "DATE",                       // 1
    "SourceCollectionIdentifier", // 2
    "SourceCommonName",           // 3
    "DocumentIdentifier",         // 4
    "Counts",                     // 5
    "V2Counts",                   // 6
    "Themes",                     // 7
    "V2EnhancedThemes",           // 8
    "Locations",                  // 9
    "V2EnhancedLocations",        // 10
    "Persons",                    // 11
    "V2EnhancedPersons",          // 12
    "Organizations",              // 13
    "V2EnhancedOrganizations",    // 14
    "V2Tone",                     // 15
    "V2EnhancedDates",            // 16
    "V2GCAM",                     // 17
    "V2SharingImage",             // 18
    "V2RelatedImages",            // 19
    "V2SocialImageEmbeds",        // 20
    "V2SocialVideoEmbeds",        // 21
    "V2Quotations",               // 22
    "V2AllNames",                 // 23
    "V2Amounts",                  // 24
    "V2TranslationInfo",          // 25
    "V2ExtrasXML",                // 26
];