use anyhow::Result;
use cached::proc_macro::cached;
use chrono::{NaiveDate, NaiveDateTime};
use once_cell::sync::Lazy;
use polars::prelude::*;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use reqwest::{Client, StatusCode};
use select::document::Document;
use select::predicate::Name;
use std::error::Error;
use std::time::Duration;
use tokio::task::spawn_blocking;
use vader_sentiment::SentimentIntensityAnalyzer;
/// Process-wide HTTP client, built on first access and reused afterwards.
///
/// Every request sent through it carries a desktop-browser `User-Agent` and an
/// `Accept` header by default, and the client is built with its cookie store
/// enabled.
///
/// # Panics
///
/// Panics on first access if the underlying `reqwest` client cannot be built.
///
/// NOTE(review): no timeout is configured on this client — confirm callers are
/// fine with requests that are not bounded in time.
pub static REQUEST_CLIENT: Lazy<Client> = Lazy::new(|| {
    const BROWSER_USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
    const ACCEPTED_CONTENT: &str = "application/json, text/plain, */*";

    let mut default_headers = HeaderMap::with_capacity(2);
    default_headers.insert(USER_AGENT, HeaderValue::from_static(BROWSER_USER_AGENT));
    default_headers.insert("Accept", HeaderValue::from_static(ACCEPTED_CONTENT));

    let builder = Client::builder()
        .cookie_store(true)
        .default_headers(default_headers);
    builder.build().unwrap()
});
pub async fn fetch_news(
token: &str,
start_date: NaiveDate,
end_date: NaiveDate,
compute_sentiment: bool,
) -> Result<DataFrame, Box<dyn Error + Send + Sync>> {
let url = format!(
"https://news.google.com/rss/search?q=allintext:{}+after:{}+before:{}",
token,
start_date.format("%Y-%m-%d"),
end_date.format("%Y-%m-%d")
);
let body = fetch_html(url).await?;
let df = spawn_blocking(move || extract_news_details(body, compute_sentiment)).await?;
Ok(df)
}
/// Downloads `url` through the shared `REQUEST_CLIENT` and returns the
/// response body as text.
///
/// The function is wrapped by `#[cached]` with `result = true` and a
/// 3600-second lifetime, keyed on `url`.
///
/// # Errors
///
/// Returns an error if the request cannot be sent, if the body cannot be read,
/// or if the response status is anything other than `200 OK`; in the last case
/// the message includes the status, the URL and the response body.
#[cached(
    result = true,
    time = 3600 // keep Google News results for one hour
)]
async fn fetch_html(url: String) -> Result<String, Box<dyn Error + Send + Sync>> {
    let reply = REQUEST_CLIENT.get(&url).send().await?;

    // Capture the status before `text()` consumes the response.
    let status = reply.status();
    let body = reply.text().await?;

    if status == StatusCode::OK {
        return Ok(body);
    }

    let message =
        format!("Google News request failed with status {status} for URL {url}: {body}");
    Err(message.into())
}
/// Escapes the characters that are significant in HTML text and in
/// double-quoted attribute values (`&`, `<`, `>`, `"`).
fn escape_html_text(raw: &str) -> String {
    let mut escaped = String::with_capacity(raw.len());
    for ch in raw.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}

/// Parses a Google News RSS response body into a `DataFrame`.
///
/// The resulting frame has the columns `Published Date`, `Source`, `Title`
/// and `Link`, plus `Sentiment Score` (the VADER `compound` score of the
/// title) when `compute_sentiment` is `true`.
///
/// `Link` holds an HTML anchor whose text is the title with the
/// `- <source>` marker removed. Because both the link and the title come from
/// the remote feed, they are HTML-escaped before being placed in the markup.
///
/// Items with an empty title, link or publication date, or with a date that
/// does not match `"%a, %d %b %Y %H:%M:%S GMT"`, are skipped.
///
/// Failures are reported on stderr rather than returned: an unparseable body
/// or a failed frame construction yields `DataFrame::default()`, and a failure
/// to attach the sentiment column yields the frame without that column.
fn extract_news_details(body: String, compute_sentiment: bool) -> DataFrame {
    let document = match Document::from_read(body.as_bytes()) {
        Ok(doc) => doc,
        Err(e) => {
            eprintln!("Failed to parse news RSS body: {e}");
            return DataFrame::default();
        }
    };

    // Build the analyzer once (and only when needed) instead of per item.
    let analyzer = compute_sentiment.then(SentimentIntensityAnalyzer::new);

    let mut titles = Vec::new();
    let mut sources = Vec::new();
    let mut links = Vec::new();
    let mut pub_dates = Vec::new();
    let mut sentiment_scores = Vec::new();

    for item in document.find(Name("item")) {
        // Fields are read by child position: first child is the title, third
        // the link, fifth the publication date and the last child the source.
        // NOTE(review): presumably this layout comes from how the HTML parser
        // handles the RSS markup — verify against a live response.
        let title = item.children().next().map(|n| n.text()).unwrap_or_default();
        let source = item.last_child().map(|n| n.text()).unwrap_or_default();
        let link = item.children().nth(2).map(|n| n.text()).unwrap_or_default();
        let pub_date = item.children().nth(4).map(|n| n.text()).unwrap_or_default();

        if title.is_empty() || link.is_empty() || pub_date.is_empty() {
            continue;
        }

        let pub_date = match NaiveDateTime::parse_from_str(&pub_date, "%a, %d %b %Y %H:%M:%S GMT") {
            Ok(dt) => dt,
            Err(_) => {
                eprintln!("Skipping news item with unparseable date: {pub_date}");
                continue;
            }
        };

        // Drop the "- <source>" marker from the anchor text only; the `Title`
        // column keeps the full headline.
        let link_text = title.replace(format!("- {source}").as_str(), "");
        // Escape feed-supplied text so it cannot break out of the attribute
        // or inject markup into the anchor.
        links.push(format!(
            r#"<a href="{}">{}</a>"#,
            escape_html_text(&link),
            escape_html_text(&link_text)
        ));

        if let Some(analyzer) = &analyzer {
            let sentiment = analyzer.polarity_scores(&title);
            sentiment_scores.push(sentiment["compound"]);
        }

        // `title` is moved last so no clone is needed.
        titles.push(title);
        sources.push(source);
        pub_dates.push(pub_date);
    }

    let df = match DataFrame::new(vec![
        Column::new("Published Date".into(), pub_dates),
        Column::new("Source".into(), sources),
        Column::new("Title".into(), titles),
        Column::new("Link".into(), links),
    ]) {
        Ok(df) => df,
        Err(e) => {
            eprintln!("Failed to build news DataFrame: {e}");
            return DataFrame::default();
        }
    };

    if !compute_sentiment {
        return df;
    }

    // Work on a copy so the untouched frame can be returned if adding the
    // column fails.
    let mut scored = df.clone();
    if let Err(e) = scored.with_column(Series::new("Sentiment Score".into(), sentiment_scores)) {
        eprintln!("Error Computing Sentiment Scores: {e}");
        return df;
    }
    scored
}