use anyhow::{Result, anyhow};
use reqwest::{Client, header::{HeaderMap, HeaderValue}};
use serde::Deserialize;
use crate::config::{Config, Channel};
use crate::storage::Message;
/// The subset of a message object, as decoded from the channel-messages
/// JSON response, that the scraper actually reads.
///
/// Fields present in the response but not declared here are not captured.
#[derive(Deserialize)]
struct DiscordMessage {
    /// Message identifier; also used as the `before` pagination cursor.
    id: String,
    /// Text body of the message. Messages whose body is empty are skipped
    /// by the scraper.
    content: String,
    /// Creation timestamp, kept as the raw string from the response.
    timestamp: String,
    /// Account that posted the message.
    author: DiscordAuthor,
    /// The message this one refers to, when the response carries one. Only
    /// its `id` is consumed (as the stored `reply_to` value). Boxed because
    /// the type is recursive.
    referenced_message: Option<Box<DiscordMessage>>,
}
/// Author information embedded in a [`DiscordMessage`].
#[derive(Deserialize)]
struct DiscordAuthor {
    /// Identifier of the authoring account.
    id: String,
    /// Username of the authoring account as reported in the response.
    username: String,
}
/// Downloads message history from Discord channels over HTTP.
///
/// Build one with [`Scraper::new`], then call [`Scraper::scrape_channel`] or
/// [`Scraper::scrape_all`].
pub struct Scraper {
    /// HTTP client carrying the default headers installed by `new`.
    client: Client,
    /// Value sent verbatim in the `Authorization` header of every request.
    token: String,
}
impl Scraper {
pub fn new(config: &Config) -> Result<Self> {
let mut headers = HeaderMap::new();
headers.insert("User-Agent", HeaderValue::from_static(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0"
));
headers.insert("Accept", HeaderValue::from_static("*/*"));
headers.insert("Accept-Language", HeaderValue::from_static("en-US,en;q=0.5"));
headers.insert("X-Discord-Locale", HeaderValue::from_static("en-US"));
headers.insert("X-Debug-Options", HeaderValue::from_static("bugReporterEnabled"));
headers.insert("Sec-Fetch-Dest", HeaderValue::from_static("empty"));
headers.insert("Sec-Fetch-Mode", HeaderValue::from_static("cors"));
headers.insert("Sec-Fetch-Site", HeaderValue::from_static("same-origin"));
let client = Client::builder()
.default_headers(headers)
.build()?;
Ok(Self {
client,
token: config.discord.token.clone(),
})
}
pub async fn scrape_channel(&self, channel: &Channel) -> Result<Vec<Message>> {
let mut messages = Vec::new();
let mut before: Option<String> = None;
let mut fetched = 0;
loop {
let remaining = channel.scrape_limit - fetched;
if remaining == 0 { break; }
let limit = remaining.min(100);
let mut url = format!(
"https://discord.com/api/v9/channels/{}/messages?limit={}",
channel.id, limit
);
if let Some(ref b) = before {
url.push_str(&format!("&before={}", b));
}
let resp = self.client
.get(&url)
.header("Authorization", &self.token)
.send()
.await?;
if !resp.status().is_success() {
return Err(anyhow!("discord api error: {}", resp.status()));
}
let batch: Vec<DiscordMessage> = resp.json().await?;
if batch.is_empty() { break; }
before = Some(batch.last().unwrap().id.clone());
fetched += batch.len();
for dm in batch {
if dm.content.is_empty() { continue; }
messages.push(Message {
id: dm.id,
channel_id: channel.id.clone(),
channel_name: channel.name.clone(),
author_id: dm.author.id,
author_name: dm.author.username,
content: dm.content,
timestamp: dm.timestamp,
reply_to: dm.referenced_message.map(|r| r.id).unwrap_or_default(),
});
}
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
Ok(messages)
}
pub async fn scrape_all(&self, config: &Config) -> Result<Vec<Message>> {
let mut all = Vec::new();
for channel in &config.channels {
eprintln!("scraping #{} ...", channel.name);
let msgs = self.scrape_channel(channel).await?;
eprintln!(" got {} msgs", msgs.len());
all.extend(msgs);
}
Ok(all)
}
}