#![allow(dead_code)]
use crate::api::client::{endpoints, GdeltClient};
use crate::error::Result;
use chrono::{NaiveDate, NaiveDateTime};
use serde::{Deserialize, Serialize};
use tracing::{debug, info, instrument};
/// One entry from a GDELT file list, parsed from a `<size> <hash> <url>` line.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileEntry {
    /// Reported file size in bytes (first column of the list line).
    pub size: u64,
    /// Checksum string exactly as published in the list (second column).
    pub hash: String,
    /// Full download URL (third column).
    pub url: String,
    /// Dataset kind detected from the URL's file-name segment.
    pub file_type: FileType,
    /// Timestamp parsed from the leading `YYYYMMDDHHMMSS` of the file name,
    /// when present and parsable.
    pub datetime: Option<NaiveDateTime>,
    /// True when this entry came from the translation file list.
    pub is_translation: bool,
}
/// The GDELT dataset a file belongs to, detected from its file name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FileType {
    /// Event table files (`.export.` marker in the name).
    Events,
    /// Global Knowledge Graph files (`.gkg.` marker).
    Gkg,
    /// Mentions table files (`.mentions.` marker).
    Mentions,
    /// File name matched none of the known markers.
    Unknown,
}
impl FileType {
pub fn as_str(&self) -> &'static str {
match self {
Self::Events => "events",
Self::Gkg => "gkg",
Self::Mentions => "mentions",
Self::Unknown => "unknown",
}
}
}
impl std::fmt::Display for FileType {
    /// Formats the file type as its canonical lowercase name.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Delegate to `as_str`; `write_str` avoids the format machinery.
        f.write_str(self.as_str())
    }
}
/// Parsed contents of a GDELT file list (master, translation, or last-update).
#[derive(Debug, Clone)]
pub struct MasterFileList {
    /// All parsed entries, in the order they appeared in the list file(s).
    pub entries: Vec<FileEntry>,
}
impl MasterFileList {
    /// Downloads and parses the full GDELT master file list. When
    /// `include_translations` is set, the translation list is fetched and its
    /// entries (flagged `is_translation`) are appended.
    ///
    /// # Errors
    /// Propagates any error from the underlying HTTP client.
    #[instrument(skip(client))]
    pub async fn fetch(client: &GdeltClient, include_translations: bool) -> Result<Self> {
        info!("Fetching GDELT master file list");
        let mut entries = Vec::new();
        let main_list = client.get_text(endpoints::MASTER_FILE_LIST).await?;
        entries.extend(Self::parse_list(&main_list, false));
        if include_translations {
            let trans_list = client.get_text(endpoints::MASTER_FILE_LIST_TRANSLATION).await?;
            entries.extend(Self::parse_list(&trans_list, true));
        }
        debug!("Parsed {} file entries", entries.len());
        Ok(Self { entries })
    }

    /// Downloads and parses the "last update" list, which names only the most
    /// recently published files.
    ///
    /// # Errors
    /// Propagates any error from the underlying HTTP client.
    #[instrument(skip(client))]
    pub async fn fetch_latest(client: &GdeltClient) -> Result<Self> {
        info!("Fetching GDELT latest update");
        let list = client.get_text(endpoints::LAST_UPDATE).await?;
        let entries = Self::parse_list(&list, false);
        Ok(Self { entries })
    }

    /// Parses every non-empty line of `content`; lines that do not match the
    /// expected format are silently skipped.
    fn parse_list(content: &str, is_translation: bool) -> Vec<FileEntry> {
        content
            .lines()
            .filter(|line| !line.is_empty())
            .filter_map(|line| Self::parse_line(line, is_translation))
            .collect()
    }

    /// Parses one `<size> <hash> <url>` line. Returns `None` unless the line
    /// has exactly three whitespace-separated fields and the first parses as
    /// an unsigned integer.
    fn parse_line(line: &str, is_translation: bool) -> Option<FileEntry> {
        let parts: Vec<&str> = line.split_whitespace().collect();
        if parts.len() != 3 {
            return None;
        }
        let size = parts[0].parse().ok()?;
        let hash = parts[1].to_string();
        let url = parts[2].to_string();
        let file_type = Self::detect_file_type(&url);
        let datetime = Self::parse_datetime_from_url(&url);
        Some(FileEntry {
            size,
            hash,
            url,
            file_type,
            datetime,
            is_translation,
        })
    }

    /// Classifies a URL by the dataset marker in its file-name segment
    /// (case-insensitive).
    fn detect_file_type(url: &str) -> FileType {
        // Lowercase once instead of testing each marker in two hard-coded
        // cases; this also tolerates mixed-case names (e.g. `.Export.`) that
        // the previous lower-or-upper check missed.
        let filename = url.rsplit('/').next().unwrap_or("").to_ascii_lowercase();
        if filename.contains(".export.") {
            FileType::Events
        } else if filename.contains(".gkg.") {
            FileType::Gkg
        } else if filename.contains(".mentions.") {
            FileType::Mentions
        } else {
            FileType::Unknown
        }
    }

    /// Extracts the leading `YYYYMMDDHHMMSS` timestamp from the URL's
    /// file-name segment, if present and parsable.
    fn parse_datetime_from_url(url: &str) -> Option<NaiveDateTime> {
        let filename = url.rsplit('/').next()?;
        // Use `get` rather than `&filename[0..14]`: the direct byte slice
        // panics when a multi-byte UTF-8 character straddles byte 14, whereas
        // `get` returns `None` both for that case and for names shorter than
        // 14 bytes (subsuming the old explicit length check).
        let date_part = filename.get(0..14)?;
        NaiveDateTime::parse_from_str(date_part, "%Y%m%d%H%M%S").ok()
    }

    /// Returns references to all entries of the given type.
    pub fn filter_by_type(&self, file_type: FileType) -> Vec<&FileEntry> {
        self.entries
            .iter()
            .filter(|e| e.file_type == file_type)
            .collect()
    }

    /// Returns entries whose date falls within `[start, end]`; either bound
    /// may be `None` (open). Entries with no parsed datetime are excluded.
    pub fn filter_by_date_range(
        &self,
        start: Option<NaiveDate>,
        end: Option<NaiveDate>,
    ) -> Vec<&FileEntry> {
        self.entries
            .iter()
            .filter(|entry| {
                let Some(dt) = entry.datetime else { return false };
                let date = dt.date();
                // Closure bindings renamed so the bound captured from `end`
                // no longer reuses `e`, which previously collided with the
                // entry binding of the same name.
                start.map_or(true, |first| date >= first)
                    && end.map_or(true, |last| date <= last)
            })
            .collect()
    }

    /// Returns entries whose parsed datetime falls on exactly `date`.
    pub fn entries_for_date(&self, date: NaiveDate) -> Vec<&FileEntry> {
        self.entries
            .iter()
            .filter(|e| e.datetime.map(|dt| dt.date()) == Some(date))
            .collect()
    }

    /// Returns every entry sharing the most recent datetime in the list;
    /// empty when no entry has a parsed datetime.
    pub fn latest_entries(&self) -> Vec<&FileEntry> {
        let max_datetime = self.entries.iter().filter_map(|e| e.datetime).max();
        if let Some(max_dt) = max_datetime {
            self.entries
                .iter()
                .filter(|e| e.datetime == Some(max_dt))
                .collect()
        } else {
            vec![]
        }
    }

    /// Sum of the reported sizes of all entries, in bytes.
    pub fn total_size(&self) -> u64 {
        self.entries.iter().map(|e| e.size).sum()
    }

    /// Sum of the reported sizes of all entries of one type, in bytes.
    pub fn size_by_type(&self, file_type: FileType) -> u64 {
        self.entries
            .iter()
            .filter(|e| e.file_type == file_type)
            .map(|e| e.size)
            .sum()
    }

    /// Returns the `(earliest, latest)` dates present, or `None` when no
    /// entry has a parsed datetime.
    pub fn date_range(&self) -> Option<(NaiveDate, NaiveDate)> {
        let mut dates = self
            .entries
            .iter()
            .filter_map(|e| e.datetime.map(|dt| dt.date()));
        let first = dates.next()?;
        // Single pass instead of collecting into a Vec and scanning it twice
        // with separate min/max calls.
        Some(dates.fold((first, first), |(lo, hi), d| (lo.min(d), hi.max(d))))
    }

    /// Counts entries per file type.
    pub fn count_by_type(&self) -> std::collections::HashMap<FileType, usize> {
        let mut counts = std::collections::HashMap::new();
        for entry in &self.entries {
            *counts.entry(entry.file_type).or_default() += 1;
        }
        counts
    }
}
impl FileEntry {
    /// Returns the file-name segment of the URL (everything after the last
    /// `/`), or the whole URL if it has no `/`.
    pub fn filename(&self) -> &str {
        match self.url.rsplit('/').next() {
            Some(name) => name,
            None => &self.url,
        }
    }

    /// Builds an identifier of the form `<YYYYMMDDHHMMSS>_<type>`; the
    /// timestamp portion is empty when no datetime was parsed from the URL.
    pub fn file_id(&self) -> String {
        let stamp = self
            .datetime
            .map(|dt| dt.format("%Y%m%d%H%M%S").to_string())
            .unwrap_or_default();
        format!("{}_{}", stamp, self.file_type.as_str())
    }

    /// Returns the entry's date encoded as a `YYYYMMDD` integer, if known.
    pub fn date_int(&self) -> Option<i64> {
        let d = self.datetime?.date();
        Some(i64::from(d.year()) * 10000 + i64::from(d.month()) * 100 + i64::from(d.day()))
    }
}
use chrono::Datelike;