use anyhow::{Context, Result, bail};
use std::time::Duration;
use crate::impersonate_client::{ImpersonatedMethod, ImpersonatedResponse, request_impersonated};
use crate::site::linkedin::helpers::extract_csrf_token;
/// LinkedIn member-data settings page URL.
/// NOTE(review): presumably the default `form_url` for `poll_archive_status` — confirm at the call site.
pub const DEFAULT_FORM_URL: &str = "https://www.linkedin.com/psettings/member-data";
/// LinkedIn settings-API data-export endpoint URL.
/// NOTE(review): presumably the default `request_url` for `request_archive` — confirm at the call site.
pub const DEFAULT_REQUEST_URL: &str =
"https://www.linkedin.com/mysettings-api/settingsApiDataExport/";
/// Which flavour of data export to request.
///
/// The variant is turned into a wire string by [`ArchiveKind::as_form_value`].
#[derive(Debug, Clone, Copy)]
pub enum ArchiveKind {
    /// Serialized as `FAST_FILE_ONLY`.
    Fast,
    /// Serialized as `ARCHIVE`.
    Full,
}

impl ArchiveKind {
    /// Returns the string that represents this kind in the export request.
    ///
    /// The mapping is exhaustive on purpose, so adding a variant forces a
    /// decision about its wire value.
    pub fn as_form_value(self) -> &'static str {
        match self {
            Self::Fast => "FAST_FILE_ONLY",
            Self::Full => "ARCHIVE",
        }
    }
}
/// Export state derived from the status page by `parse_status_page`.
#[derive(Debug, Clone)]
pub enum ArchiveStatus {
/// No download link was found in the page.
Pending {
/// The known "pending" marker phrase that was found in the page, or `None`
/// when none of the markers matched.
message: Option<String>,
},
/// A download link was found in the page.
Ready { download_url: String },
}
/// Value used for the `referer` header on every request built by this module.
fn referer() -> &'static str {
    const MEMBER_DATA_PAGE: &str = "https://www.linkedin.com/psettings/member-data";
    MEMBER_DATA_PAGE
}
/// Builds the header list shared by the export request and the status poll.
///
/// Always emits `csrf-token`, `referer` and `accept`, in that order. The
/// `accept` value is `application/json` when `content_type` mentions `json`,
/// and an HTML-oriented list otherwise. A `content-type` header is appended
/// last, and only when `content_type` is provided.
fn form_headers(csrf: &str, content_type: Option<&str>) -> Vec<(String, String)> {
    let wants_json = content_type.map_or(false, |ct| ct.contains("json"));
    let accept = if wants_json {
        "application/json"
    } else {
        "text/html,application/xhtml+xml,application/xml;q=0.9"
    };

    let mut headers: Vec<(String, String)> = [
        ("csrf-token", csrf),
        ("referer", referer()),
        ("accept", accept),
    ]
    .iter()
    .map(|&(name, value)| (name.to_string(), value.to_string()))
    .collect();

    // `Option` iterates zero or one times, so this pushes at most one header.
    headers.extend(content_type.map(|ct| ("content-type".to_string(), ct.to_string())));
    headers
}
/// Submits the data-export request to `request_url` as a JSON POST.
///
/// The body is `body_override` verbatim when given; otherwise it is
/// `{"archiveType":"<value>"}` where `<value>` comes from
/// [`ArchiveKind::as_form_value`]. Headers are produced by `form_headers`
/// with an `application/json` content type.
///
/// # Errors
/// Fails when the POST itself fails, or when the response status is neither
/// a success nor a redirection. In the latter case the error carries the
/// numeric status and the first 400 characters of the response body.
pub async fn request_archive(
    cookies: &str,
    csrf: &str,
    kind: ArchiveKind,
    request_url: &str,
    body_override: Option<&str>,
) -> Result<()> {
    let payload = match body_override {
        Some(raw) => raw.to_string(),
        None => format!(r#"{{"archiveType":"{}"}}"#, kind.as_form_value()),
    };
    let request_headers = form_headers(csrf, Some("application/json"));

    let response = request_impersonated(
        ImpersonatedMethod::Post,
        request_url,
        Some(cookies),
        Some(&request_headers),
        Some(payload.into_bytes()),
    )
    .await
    .context("data-export request POST failed")?;

    let accepted = response.status.is_success() || response.status.is_redirection();
    if !accepted {
        // Keep the error readable: only a short prefix of the body is included.
        let preview: String = response.body.chars().take(400).collect();
        bail!(
            "data-export request returned HTTP {} (body preview: {}). \
             Body shape may have rotated. Capture via Chrome DevTools and pass \
             via --body-override.",
            response.status.as_u16(),
            preview
        );
    }
    Ok(())
}
/// Classifies the status page HTML as ready or pending.
///
/// If `extract_download_url` finds a link the result is
/// [`ArchiveStatus::Ready`]. Otherwise the result is
/// [`ArchiveStatus::Pending`], carrying the first known marker phrase that
/// occurs in the page (compared case-insensitively), or `None` when no marker
/// is present.
pub fn parse_status_page(html: &str) -> ArchiveStatus {
    const PENDING_MARKERS: [&str; 4] = [
        "Your archive is being prepared",
        "We're preparing your download",
        "Request a copy of your data",
        "preparing your archive",
    ];

    match extract_download_url(html) {
        Some(download_url) => ArchiveStatus::Ready { download_url },
        None => {
            let haystack = html.to_lowercase();
            // The reported message keeps the marker's original casing.
            let message = PENDING_MARKERS
                .iter()
                .copied()
                .find(|marker| haystack.contains(marker.to_lowercase().as_str()))
                .map(String::from);
            ArchiveStatus::Pending { message }
        }
    }
}
/// Replaces the HTML entity `&amp;` with a literal `&`.
///
/// URLs embedded in HTML carry their query-string separators in escaped
/// form; the escaped text is not a usable URL.
fn decode_html_ampersands(raw: &str) -> String {
    raw.replace("&amp;", "&")
}

/// Locates the archive download link in the status page HTML, if any.
///
/// Two strategies are tried in order:
///
/// 1. The `href="…"` attribute closest before the literal text
///    `Download archive`, accepted only when it is an `https://` URL.
/// 2. Any URL starting with one of the known download host prefixes whose
///    text contains `archive`, `data-export` or `ambry`. Every occurrence of
///    each prefix is examined, so an unrelated asset on the same host that
///    appears earlier in the page does not hide the real link.
///
/// In the fallback, a URL ends at the first quote, angle bracket or
/// whitespace character (including newlines and tabs). Returned URLs have
/// `&amp;` decoded to `&`.
///
/// Returns `None` when neither strategy yields a link.
fn extract_download_url(html: &str) -> Option<String> {
    const HREF_OPEN: &str = "href=\"";
    const DOWNLOAD_HOSTS: [&str; 3] = [
        "https://download.linkedin.com/",
        "https://media.licdn.com/",
        "https://www.linkedin.com/ambry/",
    ];
    const EXPORT_KEYWORDS: [&str; 3] = ["archive", "data-export", "ambry"];

    /// True for characters that cannot be part of a URL embedded in markup.
    fn ends_url(c: char) -> bool {
        c.is_whitespace() || matches!(c, '"' | '\'' | '<' | '>')
    }

    // Strategy 1: the anchor labelled "Download archive".
    let labelled = html.find("Download archive").and_then(|label_at| {
        let href_at = html[..label_at].rfind(HREF_OPEN)?;
        let value = &html[href_at + HREF_OPEN.len()..];
        let close = value.find('"')?;
        Some(&value[..close])
    });
    if let Some(url) = labelled.filter(|u| u.starts_with("https://")) {
        return Some(decode_html_ampersands(url));
    }

    // Strategy 2: scan every URL on a known download host.
    for &host in DOWNLOAD_HOSTS.iter() {
        for (start, _) in html.match_indices(host) {
            let tail = &html[start..];
            let end = tail.find(ends_url).unwrap_or(tail.len());
            let candidate = &tail[..end];
            if EXPORT_KEYWORDS.iter().any(|&kw| candidate.contains(kw)) {
                return Some(decode_html_ampersands(candidate));
            }
        }
    }
    None
}
/// Fetches `form_url` and classifies the returned page.
///
/// Sends a GET with the headers from `form_headers` (no content type) and,
/// when the response status is a success, returns the parsed
/// [`ArchiveStatus`] together with the raw response.
///
/// # Errors
/// Fails when the GET itself fails, or when the response status is not a
/// success; the latter error carries the numeric status and the first 400
/// characters of the response body.
pub async fn poll_archive_status(
    cookies: &str,
    csrf: &str,
    form_url: &str,
) -> Result<(ArchiveStatus, ImpersonatedResponse)> {
    let request_headers = form_headers(csrf, None);

    let response = request_impersonated(
        ImpersonatedMethod::Get,
        form_url,
        Some(cookies),
        Some(&request_headers),
        None,
    )
    .await
    .context("data-export status GET failed")?;

    if response.status.is_success() {
        let status = parse_status_page(&response.body);
        return Ok((status, response));
    }

    // Keep the error readable: only a short prefix of the body is included.
    let preview: String = response.body.chars().take(400).collect();
    bail!(
        "data-export status returned HTTP {} (body preview: {})",
        response.status.as_u16(),
        preview
    )
}
/// Derives the csrf-token value from a cookie string.
///
/// Delegates to `extract_csrf_token` and attaches a user-facing hint to the
/// failure case.
///
/// # Errors
/// Fails when `extract_csrf_token` yields no token; per the attached message
/// this is expected when the cookies contain no `JSESSIONID` entry.
pub fn csrf_from_cookies(cookies: &str) -> Result<String> {
extract_csrf_token(cookies)
.context("no JSESSIONID cookie — cannot derive csrf-token. Use --cookies brave (or chrome) and ensure you are logged into LinkedIn.")
}
/// Computes the wait before the next status poll.
///
/// The delay is `base_secs * 2^attempt`, with the exponent clamped to 6 and
/// the final value clamped to `max_secs`. The multiplication saturates
/// instead of overflowing.
pub fn next_poll_delay(attempt: u32, base_secs: u64, max_secs: u64) -> Duration {
    let exponent = attempt.min(6);
    // The exponent is at most 6, so the shift is exactly 2^exponent.
    let multiplier = 1u64 << exponent;
    let uncapped = base_secs.saturating_mul(multiplier);
    let capped = std::cmp::min(uncapped, max_secs);
    Duration::from_secs(capped)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Each export kind maps to its fixed wire string.
    #[test]
    fn archive_kind_form_values() {
        let expected = [
            (ArchiveKind::Fast, "FAST_FILE_ONLY"),
            (ArchiveKind::Full, "ARCHIVE"),
        ];
        for &(kind, value) in expected.iter() {
            assert_eq!(kind.as_form_value(), value);
        }
    }

    /// A page with a known marker is pending and reports that marker.
    #[test]
    fn parse_pending_page() {
        let html = "<html>Your archive is being prepared. Check back soon.</html>";
        if let ArchiveStatus::Pending { message } = parse_status_page(html) {
            assert_eq!(message.as_deref(), Some("Your archive is being prepared"));
        } else {
            panic!("should be pending");
        }
    }

    /// The href of the "Download archive" anchor is returned verbatim.
    #[test]
    fn parse_ready_page_explicit_anchor() {
        let html = r#"<html><a href="https://download.linkedin.com/exports/abc.zip">Download archive</a></html>"#;
        if let ArchiveStatus::Ready { download_url } = parse_status_page(html) {
            assert_eq!(
                download_url,
                "https://download.linkedin.com/exports/abc.zip"
            );
        } else {
            panic!("should be ready");
        }
    }

    /// Without an anchor, a URL on a known download host still counts as ready.
    #[test]
    fn parse_ready_page_fallback_hostname() {
        let html = r#"<html>Your archive is ready: https://www.linkedin.com/ambry/data-export/abc123.zip</html>"#;
        if let ArchiveStatus::Ready { download_url } = parse_status_page(html) {
            assert!(download_url.contains("ambry"));
        } else {
            panic!("should be ready");
        }
    }

    /// A page with neither link nor marker is pending with no message.
    #[test]
    fn parse_neutral_page_is_pending_without_message() {
        let html = "<html>some unrelated content</html>";
        if let ArchiveStatus::Pending { message } = parse_status_page(html) {
            assert!(message.is_none());
        } else {
            panic!("should be pending");
        }
    }

    /// The delay doubles per attempt until it reaches the cap.
    #[test]
    fn next_poll_delay_grows_then_caps() {
        let expected: [(u32, u64); 6] = [
            (0, 60),
            (1, 120),
            (2, 240),
            (3, 480),
            (4, 600),
            (10, 600),
        ];
        for &(attempt, secs) in expected.iter() {
            assert_eq!(next_poll_delay(attempt, 60, 600).as_secs(), secs);
        }
    }

    /// A csrf token can only be derived when JSESSIONID is present.
    #[test]
    fn csrf_extraction_requires_jsessionid() {
        let without_session = csrf_from_cookies("li_at=foo");
        assert!(without_session.is_err());

        let with_session = csrf_from_cookies("JSESSIONID=\"ajax:1234567890\"; li_at=foo");
        assert!(with_session.is_ok());
    }
}