use reqwest::Client;
use scraper::{ElementRef, Html, Selector};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::env;
use std::error::Error as StdError;
use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use cloudllm::client_wrapper::Role;
use cloudllm::clients::openai::{Model, OpenAIClient};
use cloudllm::LLMSession;
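/// Desktop Chrome user-agent string sent with scraping requests so servers
/// respond as they would to a regular browser.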
const BROWSER_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36";
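/// Public bearer token embedded in X's own web client; combined with a guest
/// token it authorizes unauthenticated calls to the web GraphQL API.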
const X_WEB_BEARER_TOKEN: &str =
"AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA";
const X_WEB_TWEET_RESULT_BY_REST_ID_QUERY_ID: &str = "zy39CwTyYhU-_0LP7dljjg";
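/// A scraped article or post. On failure, `error` is non-empty and the other
/// fields hold whatever could still be recovered.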
#[derive(Debug, Serialize, Clone)]
pub struct Post {
pub title: String,
pub content: String,
pub featured_image_url: String,
pub publication_date: Option<String>,
pub author: Option<String>,
pub error: String,
}
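/// Recursively re-serializes `element` as minimal HTML, dropping subtrees
/// whose tag is in `skip_tags` and collapsing whitespace. Returns an empty
/// string when nothing meaningful remains.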
#[must_use]
fn clean_element(element: ElementRef, skip_tags: &HashSet<&str>) -> String {
let tag_name = element.value().name();
if skip_tags.contains(tag_name) {
return String::new();
}
let mut children_cleaned = String::new();
for child in element.children() {
if let Some(child_elem) = ElementRef::wrap(child) {
let cleaned = clean_element(child_elem, skip_tags);
if !cleaned.trim().is_empty() {
children_cleaned.push_str(&cleaned);
children_cleaned.push(' ');
}
} else if let Some(text) = child.value().as_text() {
let text_trimmed = text.trim();
if !text_trimmed.is_empty() {
children_cleaned.push_str(text_trimmed);
children_cleaned.push(' ');
}
}
}
if children_cleaned.trim().is_empty() {
return String::new();
}
format!(
"<{tag}>{content}</{tag}>",
tag = tag_name,
content = children_cleaned.trim()
)
}
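/// Extracts readable content from the document, preferring the first
/// `<article>` element and falling back to `<body>`.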
#[must_use]
fn extract_clean_content(document: &Html, skip_tags: &HashSet<&str>) -> String {
if let Ok(article_sel) = Selector::parse("article") {
if let Some(article) = document.select(&article_sel).next() {
let cleaned = clean_element(article, skip_tags);
if !cleaned.trim().is_empty() {
return cleaned;
}
}
}
if let Ok(body_sel) = Selector::parse("body") {
if let Some(body) = document.select(&body_sel).next() {
return clean_element(body, skip_tags);
}
}
String::new()
}
#[derive(Deserialize, Debug)]
struct XTweet {
id: String,
text: String,
created_at: Option<String>,
author_id: Option<String>,
conversation_id: Option<String>,
article: Option<XArticleMeta>,
entities: Option<XEntities>,
}
#[derive(Deserialize, Debug)]
struct XArticleMeta {
title: Option<String>,
plain_text: Option<String>,
preview_text: Option<String>,
}
#[derive(Deserialize, Debug, Clone)]
struct XUrlEntity {
url: Option<String>,
expanded_url: Option<String>,
unwound_url: Option<String>,
}
#[derive(Deserialize, Debug, Clone)]
struct XEntities {
urls: Option<Vec<XUrlEntity>>,
}
#[derive(Deserialize, Debug)]
struct XUser {
name: String,
username: String,
profile_image_url: Option<String>,
}
#[derive(Deserialize, Debug)]
struct XIncludes {
users: Option<Vec<XUser>>,
}
#[derive(Deserialize, Debug)]
struct XTweetResponse {
data: Option<XTweet>,
includes: Option<XIncludes>,
errors: Option<Vec<serde_json::Value>>,
}
#[derive(Deserialize, Debug)]
struct XSearchResponse {
data: Option<Vec<XTweet>>,
}
#[derive(Deserialize, Debug)]
struct XBearerTokenResponse {
token_type: String,
access_token: String,
}
#[derive(Deserialize, Debug)]
struct XGuestActivateResponse {
guest_token: String,
}
#[derive(Deserialize, Debug)]
struct XWebTweetResultResponse {
data: Option<XWebTweetResultData>,
errors: Option<Vec<serde_json::Value>>,
}
#[derive(Deserialize, Debug)]
struct XWebTweetResultData {
#[serde(rename = "tweetResult")]
tweet_result: Option<XWebTweetResultEnvelope>,
}
#[derive(Deserialize, Debug)]
struct XWebTweetResultEnvelope {
result: Option<XWebTweetResult>,
}
#[derive(Deserialize, Debug)]
struct XWebTweetResult {
article: Option<XWebArticleEnvelope>,
}
#[derive(Deserialize, Debug)]
struct XWebArticleEnvelope {
#[serde(rename = "article_results")]
article_results: Option<XWebArticleResults>,
}
#[derive(Deserialize, Debug)]
struct XWebArticleResults {
result: Option<XWebArticle>,
}
#[derive(Deserialize, Debug)]
struct XWebArticle {
title: Option<String>,
plain_text: Option<String>,
content_state: Option<XWebArticleContentState>,
cover_media: Option<XWebArticleCoverMedia>,
}
#[derive(Deserialize, Debug)]
struct XWebArticleContentState {
blocks: Option<Vec<XWebArticleBlock>>,
}
#[derive(Deserialize, Debug)]
struct XWebArticleBlock {
text: Option<String>,
}
#[derive(Deserialize, Debug)]
struct XWebArticleCoverMedia {
media_info: Option<XWebArticleMediaInfo>,
}
#[derive(Deserialize, Debug)]
struct XWebArticleMediaInfo {
original_img_url: Option<String>,
}
fn is_x_url(url: &str) -> bool {
url.starts_with("https://x.com/") || url.starts_with("https://twitter.com/")
}
fn is_x_article_url(url: &str) -> bool {
url.contains("x.com/i/article/") || url.contains("twitter.com/i/article/")
}
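/// Extracts the numeric tweet ID from a `/status/<id>` URL, ignoring query
/// strings and fragments.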
fn extract_tweet_id(url: &str) -> Option<String> {
let clean = url.split('?').next().unwrap_or(url);
let clean = clean.split('#').next().unwrap_or(clean);
const STATUS: &str = "/status/";
if let Some(pos) = clean.find(STATUS) {
let after = &clean[pos + STATUS.len()..];
let id: String = after.chars().take_while(|c| c.is_ascii_digit()).collect();
if !id.is_empty() {
return Some(id);
}
}
None
}
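/// Returns the value of the first environment variable in `keys` that is set
/// to a non-blank value.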
fn first_non_empty_env_var(keys: &[&str]) -> Option<String> {
keys.iter().find_map(|key| match env::var(key) {
Ok(value) if !value.trim().is_empty() => Some(value),
_ => None,
})
}
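/// Best-effort extraction of a human-readable message from an X API error
/// body: `error`, then `detail`, then the first entry of `errors`.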
fn x_api_error_message(body: &str) -> Option<String> {
let parsed: serde_json::Value = serde_json::from_str(body).ok()?;
if let Some(message) = parsed.get("error").and_then(|value| value.as_str()) {
return Some(message.to_string());
}
if let Some(message) = parsed.get("detail").and_then(|value| value.as_str()) {
return Some(message.to_string());
}
parsed
.get("errors")
.and_then(|value| value.as_array())
.and_then(|errors| errors.first())
.and_then(|error| {
error
.get("detail")
.or_else(|| error.get("message"))
.and_then(|value| value.as_str())
})
.map(ToString::to_string)
}
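/// Truncates `body` to at most `max_len` bytes on a char boundary, appending
/// `...` when shortened.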
fn summarize_body(body: &str, max_len: usize) -> String {
let trimmed = body.trim();
if trimmed.len() <= max_len {
return trimmed.to_string();
}
let mut end = max_len;
while !trimmed.is_char_boundary(end) {
end -= 1;
}
format!("{}...", &trimmed[..end])
}
fn normalized_output_language(language: &str) -> &str {
if language.trim().is_empty() {
"english"
} else {
language
}
}
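/// System prompt that pins the LLM to near-verbatim Markdown conversion (and
/// faithful translation) instead of summarization.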
fn markdown_system_prompt(language: &str) -> String {
format!(
"You are an expert markdown formatter and translator for scraped news articles. \
The provided JSON already contains the extracted article body in the `content` field. \
Convert that content into clean Markdown in {} while preserving the source text and structure as fully as possible. \
Do not summarize, paraphrase, compress, or omit substantive details. \
Preserve paragraph order, list items, quotes, headings, names, dates, numbers, and factual claims. \
Only remove obvious HTML tags, duplicated boilerplate, or navigation noise that slipped through the scraper. \
If translation is requested, translate faithfully without shortening the article. \
Output only the final Markdown body text. If {} is not supported, default to english.",
language, language
)
}
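/// User prompt carrying the serialized `Post` together with the conversion
/// instructions.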
fn markdown_user_prompt(language: &str, post_json: &str) -> String {
format!(
"Convert the following Post JSON into Markdown formatted text in {}. \
Treat `content` as the canonical article body and keep it nearly verbatim except for Markdown formatting, minimal cleanup, and faithful translation if needed. \
Do not add commentary and do not return JSON.\n\n{}",
language, post_json
)
}
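/// Debug dumps of raw X payloads are opt-in via `UNINEWS_DEBUG_X_JSON`.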
fn x_debug_enabled() -> bool {
matches!(
env::var("UNINEWS_DEBUG_X_JSON").ok().as_deref(),
Some("1") | Some("true") | Some("TRUE") | Some("yes") | Some("YES")
)
}
fn x_debug_dump(label: &str, body: &str) {
if x_debug_enabled() {
eprintln!("--- {} ---\n{}\n--- end {} ---", label, body, label);
}
}
fn x_debug_dump_http_response(
label: &str,
url: &str,
status: reqwest::StatusCode,
headers: &reqwest::header::HeaderMap,
body: &str,
) {
if !x_debug_enabled() {
return;
}
eprintln!("--- {} ---", label);
eprintln!("url: {}", url);
eprintln!("status: {}", status);
for (name, value) in headers {
eprintln!(
"header {}: {}",
name.as_str(),
value.to_str().unwrap_or("<non-utf8>")
);
}
eprintln!();
eprintln!("{}", body);
eprintln!("--- end {} ---", label);
}
fn x_url_is_status_link(url: &str) -> bool {
url.contains("/status/")
}
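/// Strips surrounding punctuation from a whitespace-delimited token and
/// returns it when it looks like an http(s) URL.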
fn normalize_text_url_token(token: &str) -> Option<String> {
let trimmed = token.trim_matches(|c: char| {
matches!(
c,
'"' | '\'' | '(' | ')' | '[' | ']' | '{' | '}' | '<' | '>' | ',' | '.' | ';' | ':'
)
});
if trimmed.starts_with("https://") || trimmed.starts_with("http://") {
return Some(trimmed.to_string());
}
None
}
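/// Collects the unique URLs a tweet references, first from entity metadata
/// (`url`, `expanded_url`, `unwound_url`) and then from URL-like tokens in
/// the tweet text.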
fn x_text_urls(tweet: &XTweet) -> Vec<String> {
let mut urls = Vec::new();
if let Some(entity_urls) = tweet
.entities
.as_ref()
.and_then(|entities| entities.urls.as_ref())
{
for url in entity_urls {
for candidate in [&url.url, &url.expanded_url, &url.unwound_url]
.into_iter()
.flatten()
{
let candidate = candidate.trim();
                if !candidate.is_empty() && !urls.iter().any(|existing| existing == candidate) {
urls.push(candidate.to_string());
}
}
}
}
for token in tweet.text.split_whitespace() {
if let Some(candidate) = normalize_text_url_token(token) {
            if !urls.iter().any(|existing| existing == &candidate) {
urls.push(candidate);
}
}
}
urls
}
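/// First URL in the tweet that is neither a `t.co` shortener link nor another
/// status link, i.e. the external article the tweet points at.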
fn x_linked_article_url(tweet: &XTweet) -> Option<String> {
x_text_urls(tweet).into_iter().find(|candidate| {
!candidate.is_empty()
&& !candidate.starts_with("https://t.co/")
&& !candidate.starts_with("http://t.co/")
&& !x_url_is_status_link(candidate)
})
}
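/// Follows redirects for `url` and returns the final destination unless it is
/// still a `t.co` link or an X status link.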
async fn resolve_url_redirect(client: &Client, url: &str) -> Option<String> {
let response = client.get(url).send().await.ok()?;
let final_url = response.url().as_str().trim().to_string();
if final_url.is_empty()
|| final_url.starts_with("https://t.co/")
|| final_url.starts_with("http://t.co/")
|| x_url_is_status_link(&final_url)
{
return None;
}
Some(final_url)
}
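/// Resolves the article URL a tweet links to, following `t.co` redirects when
/// the entity metadata alone does not reveal the destination.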
async fn resolve_x_linked_article_url(client: &Client, tweet: &XTweet) -> Option<String> {
if let Some(article_url) = x_linked_article_url(tweet) {
return Some(article_url);
}
for candidate in x_text_urls(tweet) {
if let Some(resolved_url) = resolve_url_redirect(client, &candidate).await {
return Some(resolved_url);
}
}
None
}
fn x_text_without_urls(tweet: &XTweet) -> String {
let mut text = tweet.text.clone();
for candidate in x_text_urls(tweet) {
text = text.replace(&candidate, " ");
}
text.split_whitespace().collect::<Vec<_>>().join(" ")
}
fn x_post_is_link_only(tweet: &XTweet) -> bool {
x_text_without_urls(tweet).trim().is_empty()
}
fn x_article_plain_text(article: &XArticleMeta) -> Option<String> {
article
.plain_text
.as_deref()
.or(article.preview_text.as_deref())
.map(str::trim)
.filter(|text| !text.is_empty())
.map(ToString::to_string)
}
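/// Detects the placeholder page X serves to guest sessions in place of an
/// article body.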
fn x_article_body_unavailable(body: &str) -> bool {
let lower = body.to_ascii_lowercase();
lower.contains("this page is not supported")
|| lower.contains("please visit the author's profile")
|| lower.contains("javascript is not available")
}
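/// Article body from the web GraphQL payload: `plain_text` when present,
/// otherwise the non-empty `content_state` blocks joined with blank lines.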
fn x_web_article_body(article: &XWebArticle) -> Option<String> {
if let Some(plain_text) = article.plain_text.as_ref() {
let plain_text = plain_text.trim();
if !plain_text.is_empty() {
return Some(plain_text.to_string());
}
}
let blocks = article.content_state.as_ref()?.blocks.as_ref()?;
let block_text = blocks
.iter()
.filter_map(|block| block.text.as_deref())
.map(str::trim)
.filter(|text| !text.is_empty())
.map(ToString::to_string)
.collect::<Vec<_>>()
.join("\n\n");
if block_text.trim().is_empty() {
None
} else {
Some(block_text)
}
}
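/// Parses a `TweetResultByRestId` response into a `Post`, surfacing GraphQL
/// errors and missing article payloads as `Err`.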
fn parse_x_web_article_post(
body: &str,
title_override: Option<&str>,
publication_date: Option<String>,
author: Option<String>,
) -> Result<Post, String> {
let response: XWebTweetResultResponse = serde_json::from_str(body).map_err(|error| {
format!(
"Failed to parse X web GraphQL article response: {} ({})",
error,
summarize_body(body, 400)
)
})?;
if let Some(errors) = response.errors.as_ref() {
if !errors.is_empty() {
let message = errors
.first()
.and_then(|error| error.get("message").or_else(|| error.get("detail")))
.and_then(|value| value.as_str())
.unwrap_or("Unknown X web GraphQL error");
return Err(format!("X web GraphQL error: {}", message));
}
}
let article = response
.data
.and_then(|data| data.tweet_result)
.and_then(|tweet_result| tweet_result.result)
.and_then(|tweet_result| tweet_result.article)
.and_then(|article| article.article_results)
.and_then(|results| results.result)
.ok_or_else(|| "X web GraphQL response did not include an article payload.".to_string())?;
let content = x_web_article_body(&article)
.ok_or_else(|| "X web GraphQL response did not include article body text.".to_string())?;
let title = article
.title
.as_deref()
.map(str::trim)
.filter(|title| !title.is_empty())
.map(ToString::to_string)
.or_else(|| {
title_override
.map(str::trim)
.filter(|title| !title.is_empty())
.map(ToString::to_string)
})
.unwrap_or_else(|| "X article".to_string());
let featured_image_url = article
.cover_media
.and_then(|media| media.media_info)
.and_then(|media_info| media_info.original_img_url)
.unwrap_or_default();
Ok(Post {
title,
content,
featured_image_url,
publication_date,
author,
error: String::new(),
})
}
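/// Chrome binary used for the headless fallback: `UNINEWS_CHROME_BINARY` when
/// set, then the standard macOS install paths, then `google-chrome` on PATH.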
fn chrome_binary() -> String {
if let Some(binary) = first_non_empty_env_var(&["UNINEWS_CHROME_BINARY"]) {
return binary;
}
for candidate in [
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
"/Applications/Chromium.app/Contents/MacOS/Chromium",
] {
if Path::new(candidate).exists() {
return candidate.to_string();
}
}
"google-chrome".to_string()
}
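/// Chrome singleton-lock and crash-handler entries must not be copied into a
/// cloned profile.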
fn should_skip_chrome_profile_entry(name: &str) -> bool {
matches!(
name,
"SingletonCookie" | "SingletonLock" | "SingletonSocket" | "Crashpad"
)
}
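/// Recursively copies a profile directory, skipping Chrome lock files.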
fn copy_dir_recursively(source: &Path, destination: &Path) -> io::Result<()> {
fs::create_dir_all(destination)?;
for entry in fs::read_dir(source)? {
let entry = entry?;
let file_type = entry.file_type()?;
let entry_name = entry.file_name();
let entry_name = entry_name.to_string_lossy();
if should_skip_chrome_profile_entry(&entry_name) {
continue;
}
let source_path = entry.path();
let destination_path = destination.join(entry.file_name());
if file_type.is_dir() {
copy_dir_recursively(&source_path, &destination_path)?;
} else if file_type.is_file() {
if let Some(parent) = destination_path.parent() {
fs::create_dir_all(parent)?;
}
fs::copy(&source_path, &destination_path)?;
}
}
Ok(())
}
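/// Clones a Chrome profile into a temporary user-data directory so headless
/// Chrome can reuse its cookies without contending with a running browser for
/// the profile lock. Returns the staged user-data root and the profile name.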
fn clone_chrome_profile(
source_user_data_dir: &Path,
profile_name: &str,
) -> Result<(PathBuf, String), String> {
let profile_source = source_user_data_dir.join(profile_name);
if !profile_source.is_dir() {
return Err(format!(
"Chrome profile directory not found: {}",
profile_source.display()
));
}
let nonce = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|duration| duration.as_millis())
.unwrap_or_default();
let temp_root = env::temp_dir().join(format!(
"uninews-chrome-profile-{}-{}",
std::process::id(),
nonce
));
fs::create_dir_all(&temp_root).map_err(|err| {
format!(
"Failed to create temporary Chrome profile directory {}: {}",
temp_root.display(),
err
)
})?;
for root_file in ["Local State", "First Run"] {
let source_file = source_user_data_dir.join(root_file);
if source_file.is_file() {
let destination_file = temp_root.join(root_file);
fs::copy(&source_file, &destination_file).map_err(|err| {
format!(
"Failed to copy {} into temporary Chrome profile: {}",
source_file.display(),
err
)
})?;
}
}
let staged_profile = temp_root.join(profile_name);
copy_dir_recursively(&profile_source, &staged_profile).map_err(|err| {
format!(
"Failed to clone Chrome profile {} into {}: {}",
profile_source.display(),
staged_profile.display(),
err
)
})?;
Ok((temp_root, profile_name.to_string()))
}
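/// Renders `url` with headless Chrome (`--headless=new --dump-dom`) and
/// returns the serialized DOM. When `UNINEWS_CHROME_USER_DATA_DIR` is set,
/// the selected profile is cloned into a temp directory first and cleaned up
/// afterwards.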
async fn fetch_rendered_dom_with_chrome(url: &str) -> Result<String, String> {
let browser_binary = chrome_binary();
let user_data_dir = first_non_empty_env_var(&["UNINEWS_CHROME_USER_DATA_DIR"]);
let profile_dir = first_non_empty_env_var(&["UNINEWS_CHROME_PROFILE_DIR"]);
let url = url.to_string();
let browser_binary_for_error = browser_binary.clone();
let url_for_error = url.clone();
let output = tokio::task::spawn_blocking(move || {
let staged_profile = if let Some(user_data_dir) = user_data_dir.as_ref() {
let profile_name = profile_dir.as_deref().unwrap_or("Default");
Some(clone_chrome_profile(Path::new(user_data_dir), profile_name))
} else {
None
};
let (effective_user_data_dir, effective_profile_dir, staged_root) = match staged_profile {
Some(Ok((temp_root, profile_name))) => {
(Some(temp_root.clone()), Some(profile_name), Some(temp_root))
}
Some(Err(err)) => return Err(io::Error::other(err)),
None => (None, profile_dir, None),
};
let mut command = Command::new(&browser_binary);
command
.arg("--headless=new")
.arg("--disable-gpu")
.arg("--virtual-time-budget=15000")
.arg("--dump-dom");
if let Some(user_data_dir) = effective_user_data_dir.as_ref() {
command.arg(format!("--user-data-dir={}", user_data_dir.display()));
}
if let Some(profile_dir) = effective_profile_dir.as_ref() {
command.arg(format!("--profile-directory={}", profile_dir));
}
command.arg(&url);
let result = command.output();
if let Some(staged_root) = staged_root {
let _ = fs::remove_dir_all(staged_root);
}
result
})
.await
.map_err(|err| format!("Chrome browser fallback task failed: {}", err))?
.map_err(|err| format!("Failed to launch Chrome browser fallback: {}", err))?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
let stdout = String::from_utf8_lossy(&output.stdout);
let detail = if !stderr.trim().is_empty() {
stderr.trim().to_string()
} else if !stdout.trim().is_empty() {
summarize_body(stdout.as_ref(), 400)
} else {
"unknown error".to_string()
};
return Err(format!(
"failed to render {} with {}: {}",
url_for_error, browser_binary_for_error, detail
));
}
Ok(String::from_utf8_lossy(&output.stdout).into_owned())
}
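/// Activates a guest token via `guest/activate.json` so the web GraphQL
/// endpoints can be called without a logged-in session.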
async fn resolve_x_guest_token(client: &Client) -> Result<String, String> {
let response = client
.post("https://api.x.com/1.1/guest/activate.json")
.header("Authorization", format!("Bearer {}", X_WEB_BEARER_TOKEN))
.header("x-twitter-active-user", "yes")
.header("x-twitter-client-language", "en")
.send()
.await
.map_err(|error| format!("Failed to activate X guest token: {}", error))?;
let status = response.status();
let body = response
.text()
.await
.map_err(|error| format!("Failed to read X guest token response: {}", error))?;
x_debug_dump("X guest token JSON", &body);
if !status.is_success() {
let message = x_api_error_message(&body).unwrap_or_else(|| summarize_body(&body, 400));
return Err(format!(
"Failed to activate X guest token (status {}): {}",
status, message
));
}
let response: XGuestActivateResponse = serde_json::from_str(&body).map_err(|error| {
format!(
"Failed to parse X guest token response: {} ({})",
error,
summarize_body(&body, 400)
)
})?;
Ok(response.guest_token)
}
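/// Fetches a long-form X article through the web `TweetResultByRestId`
/// GraphQL endpoint, authenticating with the public web bearer token plus a
/// freshly activated guest token.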
async fn scrape_x_article_from_web_graphql(
client: &Client,
tweet_id: &str,
title_override: Option<&str>,
publication_date: Option<String>,
author: Option<String>,
) -> Result<Post, String> {
let guest_token = resolve_x_guest_token(client).await?;
let endpoint = format!(
"https://x.com/i/api/graphql/{}/TweetResultByRestId",
X_WEB_TWEET_RESULT_BY_REST_ID_QUERY_ID
);
let variables = serde_json::json!({
"tweetId": tweet_id,
"withCommunity": false,
"includePromotedContent": false,
"withVoice": false
})
.to_string();
let features = serde_json::json!({
"withArticleRichContentState": true,
"withArticlePlainText": true,
"withArticleSummaryText": false,
"withArticleVoiceOver": false
})
.to_string();
let field_toggles = serde_json::json!({
"withArticleRichContentState": true,
"withArticlePlainText": true,
"withArticleSummaryText": false,
"withArticleVoiceOver": false
})
.to_string();
let endpoint = reqwest::Url::parse_with_params(
&endpoint,
&[
("variables", variables),
("features", features),
("fieldToggles", field_toggles),
],
)
.map_err(|error| format!("Failed to build X web GraphQL URL: {}", error))?;
let response = client
.get(endpoint)
.header("Authorization", format!("Bearer {}", X_WEB_BEARER_TOKEN))
.header("x-guest-token", guest_token)
.header("x-twitter-active-user", "yes")
.header("x-twitter-client-language", "en")
.send()
.await
.map_err(|error| format!("Failed to fetch X article via web GraphQL: {}", error))?;
let status = response.status();
let body = response
.text()
.await
.map_err(|error| format!("Failed to read X web GraphQL response body: {}", error))?;
x_debug_dump("X web GraphQL JSON", &body);
if !status.is_success() {
let message = x_api_error_message(&body).unwrap_or_else(|| summarize_body(&body, 400));
return Err(format!(
"X web GraphQL returned HTTP {}: {}",
status, message
));
}
parse_x_web_article_post(&body, title_override, publication_date, author)
}
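/// Builds a `Post` from raw HTML: the title comes from `title_override` or
/// the `<title>` tag, the body from `extract_clean_content`, and the image,
/// date, and author from `og:image`, `article:published_time`, and
/// `meta[name="author"]`. X article pages that only contain the guest
/// interstitial are reported as errors.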
fn parse_scraped_post_from_html(
source_url: &str,
body_text: &str,
title_override: Option<&str>,
) -> Post {
if is_x_article_url(source_url) && x_article_body_unavailable(body_text) {
return Post {
title: title_override.unwrap_or_default().trim().to_string(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: "X article body is not available in the guest HTML response.".to_string(),
};
}
let document = Html::parse_document(body_text);
let skip_tags: HashSet<&str> = [
"script", "style", "noscript", "iframe", "header", "footer", "nav", "aside", "form",
"input", "button", "svg", "picture", "source",
]
.iter()
.cloned()
.collect();
let title_selector = Selector::parse("title").unwrap();
let extracted_title = document
.select(&title_selector)
.next()
.map(|elem| elem.text().collect::<Vec<_>>().join(" ").trim().to_string())
.unwrap_or_default();
let title = title_override
.filter(|title| !title.trim().is_empty())
.map(|title| title.trim().to_string())
.unwrap_or(extracted_title);
let content = extract_clean_content(&document, &skip_tags);
let meta_selector = Selector::parse(r#"meta[property="og:image"]"#).unwrap();
let featured_image_url = document
.select(&meta_selector)
.next()
.and_then(|meta| meta.value().attr("content"))
.unwrap_or("")
.to_string();
let date_selector = Selector::parse(r#"meta[property="article:published_time"]"#).unwrap();
let publication_date = document
.select(&date_selector)
.next()
.and_then(|meta| meta.value().attr("content"))
.map(String::from);
let author_selector = Selector::parse(r#"meta[name="author"]"#).unwrap();
let author = document
.select(&author_selector)
.next()
.and_then(|meta| meta.value().attr("content"))
.map(String::from);
if content.trim().is_empty() {
return Post {
title,
content: String::new(),
featured_image_url,
publication_date,
author,
error: "Could not extract meaningful content from the page.".into(),
};
}
Post {
title,
content,
featured_image_url,
publication_date,
author,
error: String::new(),
}
}
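/// Exchanges `X_API_KEY`/`X_API_SECRET` (or the `DBTC_TWITTER_*` fallbacks)
/// for an app-only OAuth2 bearer token.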
async fn resolve_x_bearer_token(client: &Client) -> Result<String, String> {
let api_key = first_non_empty_env_var(&["X_API_KEY", "DBTC_TWITTER_API_KEY"]);
let api_secret = first_non_empty_env_var(&["X_API_SECRET", "DBTC_TWITTER_API_SECRET"]);
let (api_key, api_secret) = match (api_key, api_secret) {
(Some(api_key), Some(api_secret)) => (api_key, api_secret),
_ => {
return Err(
"Please provide both X_API_KEY and X_API_SECRET (or DBTC_TWITTER_API_KEY and DBTC_TWITTER_API_SECRET).".into(),
);
}
};
let token_resp = client
.post("https://api.x.com/oauth2/token")
.basic_auth(api_key, Some(api_secret))
.header(
"Content-Type",
"application/x-www-form-urlencoded;charset=UTF-8",
)
.body("grant_type=client_credentials")
.send()
.await
.map_err(|e| format!("Failed to obtain X bearer token: {}", e))?;
let status = token_resp.status();
let body = token_resp
.text()
.await
.map_err(|e| format!("Failed to read X bearer token response: {}", e))?;
    if !status.is_success() {
        let message = x_api_error_message(&body).unwrap_or_else(|| summarize_body(&body, 400));
        return Err(format!(
            "Failed to obtain X bearer token (status {}): {}",
            status, message
        ));
    }
    let token_data: XBearerTokenResponse = serde_json::from_str(&body).map_err(|e| {
        format!(
            "Failed to parse X bearer token response: {} ({})",
            e,
            summarize_body(&body, 400)
        )
    })?;
if !token_data.token_type.eq_ignore_ascii_case("bearer") {
return Err(format!(
"X token exchange returned unsupported token type: {}",
token_data.token_type
));
}
Ok(token_data.access_token)
}
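/// Fetches and parses `url` without LLM post-processing. When an X article
/// page hides its body from guest sessions, retries by rendering the page
/// with headless Chrome.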
async fn scrape_web_url_raw_with_title_override(url: &str, title_override: Option<&str>) -> Post {
let client = Client::builder()
.user_agent(BROWSER_USER_AGENT)
.http1_only()
.build()
.unwrap_or_default();
let response = client.get(url).send().await;
if let Err(err) = response {
let mut msg = format!("Failed to fetch URL: {}", err);
let mut src: Option<&dyn StdError> = err.source();
while let Some(cause) = src {
msg.push_str(&format!(" => {}", cause));
src = cause.source();
}
return Post {
title: "".into(),
content: "".into(),
featured_image_url: "".into(),
publication_date: None,
author: None,
error: msg,
};
}
let response = response.unwrap();
let response_url = response.url().to_string();
let is_x_article = is_x_article_url(&response_url) || is_x_article_url(url);
let response_status = response.status();
let response_headers = response.headers().clone();
let body_text = match response.text().await {
Ok(text) => text,
Err(err) => {
return Post {
title: "".into(),
content: "".into(),
featured_image_url: "".into(),
publication_date: None,
author: None,
error: format!("Failed to read response body: {}", err),
}
}
};
if is_x_article {
x_debug_dump_http_response(
"X article page response",
&response_url,
response_status,
&response_headers,
&body_text,
);
}
let scraped_post = parse_scraped_post_from_html(&response_url, &body_text, title_override);
if scraped_post.error.is_empty() || !is_x_article {
return scraped_post;
}
let rendered_dom = match fetch_rendered_dom_with_chrome(&response_url).await {
Ok(rendered_dom) => rendered_dom,
Err(browser_error) => {
if x_article_body_unavailable(&body_text) {
return Post {
error: format!(
"X article body is not available to guest sessions. Set UNINEWS_CHROME_USER_DATA_DIR and optionally UNINEWS_CHROME_PROFILE_DIR to a logged-in Chrome profile. Browser fallback failed: {}",
browser_error
),
..scraped_post
};
}
return Post {
error: format!(
"{} Chrome browser fallback failed: {}",
scraped_post.error, browser_error
),
..scraped_post
};
}
};
x_debug_dump("X article rendered DOM", &rendered_dom);
let rendered_post = parse_scraped_post_from_html(&response_url, &rendered_dom, title_override);
if rendered_post.error.is_empty() {
return rendered_post;
}
if x_article_body_unavailable(&rendered_dom) {
return Post {
error: "X article body is not available to guest sessions. Set UNINEWS_CHROME_USER_DATA_DIR and optionally UNINEWS_CHROME_PROFILE_DIR to a logged-in Chrome profile.".to_string(),
..rendered_post
};
}
Post {
error: format!(
"{} Browser-rendered fallback also failed: {}",
scraped_post.error, rendered_post.error
),
..rendered_post
}
}
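/// Scrapes `url` and, on success, converts the extracted content to Markdown
/// via the LLM, preferring `title_override` over the page `<title>`.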
async fn scrape_web_url_with_title_override(
url: &str,
language: &str,
openai_model: Option<Model>,
title_override: Option<&str>,
) -> Post {
let scraped_post = scrape_web_url_raw_with_title_override(url, title_override).await;
if !scraped_post.error.is_empty() {
return scraped_post;
}
match convert_content_to_markdown(scraped_post.clone(), language, openai_model).await {
Ok(markdown_post) => markdown_post,
Err(err) => Post {
error: err,
..scraped_post
},
}
}
async fn scrape_web_url(url: &str, language: &str, openai_model: Option<Model>) -> Post {
scrape_web_url_with_title_override(url, language, openai_model, None).await
}
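/// Scrapes an X status URL through the v2 API. Link-only tweets are followed
/// to the referenced article (embedded article text, then the web GraphQL
/// article endpoint, then plain HTML scraping); anything else is reassembled
/// into a thread from the author's recent tweets and converted to Markdown.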
async fn scrape_x_url(url: &str, language: &str, openai_model: Option<Model>) -> Post {
let tweet_id = match extract_tweet_id(url) {
Some(id) => id,
None => {
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: format!("Could not extract a tweet ID from the URL: {}", url),
};
}
};
let client = Client::builder()
.user_agent(BROWSER_USER_AGENT)
.build()
.unwrap_or_default();
let bearer_token = match resolve_x_bearer_token(&client).await {
Ok(token) => token,
Err(error) => {
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error,
};
}
};
let auth_header = format!("Bearer {}", bearer_token);
let root_tweet_url = format!(
"https://api.x.com/2/tweets/{}?tweet.fields=created_at,author_id,conversation_id,text,entities,article&expansions=author_id&user.fields=name,username,profile_image_url",
tweet_id
);
let root_resp = match client
.get(&root_tweet_url)
.header("Authorization", &auth_header)
.send()
.await
{
Ok(r) => r,
Err(e) => {
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: format!("Failed to call X API: {}", e),
};
}
};
let root_status = root_resp.status();
let root_body = match root_resp.text().await {
Ok(body) => body,
Err(e) => {
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: format!("Failed to read X API response body: {}", e),
};
}
};
x_debug_dump("X root tweet JSON", &root_body);
if !root_status.is_success() {
let message =
x_api_error_message(&root_body).unwrap_or_else(|| summarize_body(&root_body, 400));
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: format!("X API returned HTTP {}: {}", root_status, message),
};
}
let root_data: XTweetResponse = match serde_json::from_str(&root_body) {
Ok(d) => d,
Err(e) => {
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: format!(
"Failed to parse X API response: {} ({})",
e,
summarize_body(&root_body, 400)
),
};
}
};
if let Some(errors) = &root_data.errors {
if !errors.is_empty() {
let msg = errors
.first()
.and_then(|e| e.get("detail").or_else(|| e.get("message")))
.and_then(|v| v.as_str())
.unwrap_or("Unknown X API error");
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: format!("X API error: {}", msg),
};
}
}
let root_tweet = match root_data.data {
Some(t) => t,
None => {
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: format!(
"X API returned no tweet data. Response body: {}",
summarize_body(&root_body, 400)
),
};
}
};
let author_info = root_data
.includes
.as_ref()
.and_then(|inc| inc.users.as_ref())
.and_then(|users| users.first());
let author_display = author_info.map(|u| format!("@{} ({})", u.username, u.name));
let profile_image = author_info
.and_then(|u| u.profile_image_url.clone())
.unwrap_or_default();
let author_id = root_tweet.author_id.clone().unwrap_or_default();
let conversation_id = root_tweet
.conversation_id
.clone()
.unwrap_or_else(|| root_tweet.id.clone());
if x_post_is_link_only(&root_tweet) {
let article_title_override = root_tweet
.article
.as_ref()
.and_then(|article| article.title.as_deref());
let embedded_article_body = root_tweet.article.as_ref().and_then(x_article_plain_text);
if let Some(content) = embedded_article_body {
let scraped_article_post = Post {
title: article_title_override
.map(str::trim)
.filter(|title| !title.is_empty())
.unwrap_or("X article")
.to_string(),
content,
featured_image_url: profile_image.clone(),
publication_date: root_tweet.created_at.clone(),
author: author_display.clone(),
error: String::new(),
};
return match convert_content_to_markdown(
scraped_article_post.clone(),
language,
openai_model,
)
.await
{
Ok(markdown_post) => markdown_post,
Err(err) => Post {
error: err,
..scraped_article_post
},
};
}
if let Some(article_url) = resolve_x_linked_article_url(&client, &root_tweet).await {
if is_x_article_url(&article_url) {
match scrape_x_article_from_web_graphql(
&client,
&root_tweet.id,
article_title_override,
root_tweet.created_at.clone(),
author_display.clone(),
)
.await
{
Ok(scraped_article_post) => {
return match convert_content_to_markdown(
scraped_article_post.clone(),
language,
openai_model,
)
.await
{
Ok(markdown_post) => markdown_post,
Err(err) => Post {
error: err,
..scraped_article_post
},
};
}
Err(graphql_error) => {
let article_post = scrape_web_url_with_title_override(
&article_url,
language,
openai_model,
article_title_override,
)
.await;
if article_post.error.is_empty() {
return article_post;
}
return Post {
title: article_post.title,
content: article_post.content,
featured_image_url: article_post.featured_image_url,
publication_date: article_post.publication_date,
author: article_post.author,
error: format!(
"Failed to scrape linked X article {} via X web GraphQL: {}. HTML fallback failed: {}",
article_url, graphql_error, article_post.error
),
};
}
}
}
let article_post = scrape_web_url_with_title_override(
&article_url,
language,
openai_model,
article_title_override,
)
.await;
if article_post.error.is_empty() {
return article_post;
}
return Post {
title: article_post.title,
content: article_post.content,
featured_image_url: article_post.featured_image_url,
publication_date: article_post.publication_date,
author: article_post.author,
error: format!(
"Failed to scrape linked article {}: {}",
article_url, article_post.error
),
};
}
}
let mut thread_tweets: Vec<(String, String)> = vec![(
root_tweet.created_at.clone().unwrap_or_default(),
root_tweet.text.clone(),
)];
let search_url = format!(
"https://api.x.com/2/tweets/search/recent?query=conversation_id%3A{}&tweet.fields=created_at,author_id,text,entities&max_results=100",
conversation_id
);
if let Ok(search_resp) = client
.get(&search_url)
.header("Authorization", &auth_header)
.send()
.await
{
if let Ok(search_body) = search_resp.text().await {
x_debug_dump("X recent search JSON", &search_body);
if let Ok(search_data) = serde_json::from_str::<XSearchResponse>(&search_body) {
if let Some(tweets) = search_data.data {
for t in tweets {
let same_author = !author_id.is_empty()
&& t.author_id.as_deref() == Some(author_id.as_str());
if same_author && t.id != root_tweet.id {
thread_tweets.push((t.created_at.unwrap_or_default(), t.text));
}
}
}
}
}
}
thread_tweets.sort_by(|a, b| a.0.cmp(&b.0));
let title = format!(
"{}: {}",
author_display.as_deref().unwrap_or("X post"),
root_tweet.text.chars().take(80).collect::<String>()
);
let content = thread_tweets
.iter()
.map(|(ts, text)| {
if ts.is_empty() {
text.clone()
} else {
format!("[{}] {}", ts, text)
}
})
.collect::<Vec<_>>()
.join("\n\n");
let scraped_post = Post {
title,
content,
featured_image_url: profile_image,
publication_date: root_tweet.created_at,
author: author_display,
error: String::new(),
};
match convert_content_to_markdown(scraped_post.clone(), language, openai_model).await {
Ok(markdown_post) => markdown_post,
Err(err) => Post {
error: err,
..scraped_post
},
}
}
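/// Converts `post.content` into Markdown in `language` using an OpenAI model
/// via cloudllm. Reads the API key from `OPEN_AI_SECRET`; `openai_model`
/// overrides the default model. Returns the input `Post` with `content`
/// replaced by the model's Markdown output.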
pub async fn convert_content_to_markdown(
mut post: Post,
language: &str,
openai_model: Option<Model>,
) -> Result<Post, String> {
let secret_key = env::var("OPEN_AI_SECRET")
.map_err(|_| "Please set the OPEN_AI_SECRET environment variable.".to_string())?;
let model = openai_model.unwrap_or(Model::GPT54);
let client = Arc::new(OpenAIClient::new_with_model_enum(&secret_key, model));
let lang = normalized_output_language(language);
let system_prompt = markdown_system_prompt(lang);
    let mut session = LLMSession::new(client, system_prompt, 1_000_000);
let post_json = serde_json::to_string(&post)
.map_err(|e| format!("Failed to serialize Post to JSON: {}", e))?;
let user_prompt = markdown_user_prompt(lang, &post_json);
match session.send_message(Role::User, user_prompt, None).await {
Ok(response) => {
post.content = response.content.to_string();
Ok(post)
}
Err(err) => Err(format!("LLM Error: {}", err)),
}
}
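/// Scrapes any supported URL: X status links go through the X API pipeline,
/// everything else through the generic HTML scraper, and the result is
/// converted to Markdown in `language` (defaulting to english when blank).
///
/// A minimal usage sketch (it assumes this crate is named `uninews` and that
/// the required environment variables, such as `OPEN_AI_SECRET`, are set):
///
/// ```no_run
/// # async fn demo() {
/// let post = uninews::universal_scrape("https://example.com/article", "english", None).await;
/// if post.error.is_empty() {
///     println!("# {}\n\n{}", post.title, post.content);
/// } else {
///     eprintln!("scrape failed: {}", post.error);
/// }
/// # }
/// ```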
pub async fn universal_scrape(url: &str, language: &str, openai_model: Option<Model>) -> Post {
if is_x_url(url) {
return scrape_x_url(url, language, openai_model).await;
}
scrape_web_url(url, language, openai_model).await
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_is_x_url_x_com() {
assert!(is_x_url("https://x.com/user/status/123"));
}
#[test]
fn test_is_x_url_twitter_com() {
assert!(is_x_url("https://twitter.com/user/status/123"));
}
#[test]
fn test_is_x_url_non_x() {
assert!(!is_x_url("https://example.com/article"));
assert!(!is_x_url("https://bbc.com/news/world"));
assert!(!is_x_url("http://x.com/user/status/123")); assert!(!is_x_url("https://notx.com/user/status/123"));
assert!(!is_x_url("https://x.com")); assert!(!is_x_url("https://twitter.com")); }
#[test]
fn test_extract_tweet_id_x_com() {
assert_eq!(
extract_tweet_id("https://x.com/user/status/1234567890"),
Some("1234567890".to_string())
);
}
#[test]
fn test_extract_tweet_id_twitter_com() {
assert_eq!(
extract_tweet_id("https://twitter.com/user/status/9876543210"),
Some("9876543210".to_string())
);
}
#[test]
fn test_extract_tweet_id_with_query_params() {
assert_eq!(
extract_tweet_id("https://x.com/user/status/111222333?s=20&t=abc"),
Some("111222333".to_string())
);
}
#[test]
fn test_extract_tweet_id_with_fragment() {
assert_eq!(
extract_tweet_id("https://x.com/user/status/555666777#anchor"),
Some("555666777".to_string())
);
}
#[test]
fn test_extract_tweet_id_no_status() {
assert_eq!(extract_tweet_id("https://x.com/user"), None);
assert_eq!(extract_tweet_id("https://example.com/article"), None);
}
#[test]
fn test_extract_tweet_id_empty_status() {
assert_eq!(extract_tweet_id("https://x.com/user/status/"), None);
}
#[test]
fn test_x_linked_article_url_prefers_unwound_url() {
let tweet = XTweet {
id: "1".to_string(),
text: "https://t.co/abc".to_string(),
created_at: None,
author_id: None,
conversation_id: None,
article: None,
entities: Some(XEntities {
urls: Some(vec![XUrlEntity {
url: Some("https://t.co/abc".to_string()),
expanded_url: Some("https://x.com/DiarioBitcoin/status/123".to_string()),
unwound_url: Some("https://www.diariobitcoin.com/test-article".to_string()),
}]),
}),
};
assert_eq!(
x_linked_article_url(&tweet),
Some("https://www.diariobitcoin.com/test-article".to_string())
);
}
#[test]
fn test_x_linked_article_url_ignores_status_links() {
let tweet = XTweet {
id: "1".to_string(),
text: "https://t.co/abc".to_string(),
created_at: None,
author_id: None,
conversation_id: None,
article: None,
entities: Some(XEntities {
urls: Some(vec![XUrlEntity {
url: Some("https://t.co/abc".to_string()),
expanded_url: Some("https://x.com/DiarioBitcoin/status/123".to_string()),
unwound_url: None,
}]),
}),
};
assert_eq!(x_linked_article_url(&tweet), None);
}
#[test]
fn test_x_post_is_link_only() {
let tweet = XTweet {
id: "1".to_string(),
text: "https://t.co/abc".to_string(),
created_at: None,
author_id: None,
conversation_id: None,
article: None,
entities: Some(XEntities {
urls: Some(vec![XUrlEntity {
url: Some("https://t.co/abc".to_string()),
expanded_url: Some("https://www.diariobitcoin.com/test-article".to_string()),
unwound_url: None,
}]),
}),
};
assert!(x_post_is_link_only(&tweet));
}
#[test]
fn test_x_post_is_not_link_only_when_text_remains() {
let tweet = XTweet {
id: "1".to_string(),
text: "Analisis completo https://t.co/abc".to_string(),
created_at: None,
author_id: None,
conversation_id: None,
article: None,
entities: Some(XEntities {
urls: Some(vec![XUrlEntity {
url: Some("https://t.co/abc".to_string()),
expanded_url: Some("https://www.diariobitcoin.com/test-article".to_string()),
unwound_url: None,
}]),
}),
};
assert!(!x_post_is_link_only(&tweet));
}
#[test]
fn test_x_article_plain_text_prefers_plain_text() {
let article = XArticleMeta {
title: Some("Bitcoin bajo presión".to_string()),
plain_text: Some(" Cuerpo completo del articulo ".to_string()),
preview_text: Some("Preview".to_string()),
};
assert_eq!(
x_article_plain_text(&article),
Some("Cuerpo completo del articulo".to_string())
);
}
#[test]
fn test_x_article_plain_text_falls_back_to_preview_text() {
let article = XArticleMeta {
title: Some("Bitcoin bajo presión".to_string()),
plain_text: None,
preview_text: Some(" Preview del articulo ".to_string()),
};
assert_eq!(
x_article_plain_text(&article),
Some("Preview del articulo".to_string())
);
}
#[test]
fn test_is_x_article_url() {
assert!(is_x_article_url(
"https://x.com/i/article/2034262647731101696"
));
assert!(!is_x_article_url(
"https://x.com/DiarioBitcoin/status/2034263054754726116"
));
}
#[test]
fn test_x_article_body_unavailable_detects_guest_page() {
let body = "<html><body><h1>This page is not supported.</h1><p>Please visit the author's profile on the latest version of X to view this content.</p></body></html>";
assert!(x_article_body_unavailable(body));
}
#[test]
fn test_parse_scraped_post_from_html_blocks_guest_x_article_page() {
let post = parse_scraped_post_from_html(
"https://x.com/i/article/2034262647731101696",
"<html><body><h1>This page is not supported.</h1></body></html>",
Some("Expected X Article Title"),
);
assert_eq!(post.title, "Expected X Article Title");
assert!(post
.error
.contains("X article body is not available in the guest HTML response"));
}
#[test]
fn test_parse_x_web_article_post_prefers_graphql_article_payload() {
let body = r#"{
"data": {
"tweetResult": {
"result": {
"article": {
"article_results": {
"result": {
"title": "Bitcoin bajo presión",
"plain_text": "Primer parrafo.\n\nSegundo parrafo.",
"cover_media": {
"media_info": {
"original_img_url": "https://pbs.twimg.com/media/example.jpg"
}
}
}
}
}
}
}
}
}"#;
let post = parse_x_web_article_post(
body,
Some("Fallback title"),
Some("2026-03-18T13:38:01.000Z".to_string()),
Some("@DiarioBitcoin (Diario฿itcoin)".to_string()),
)
.unwrap();
assert_eq!(post.title, "Bitcoin bajo presión");
assert_eq!(post.content, "Primer parrafo.\n\nSegundo parrafo.");
assert_eq!(
post.featured_image_url,
"https://pbs.twimg.com/media/example.jpg"
);
assert_eq!(
post.publication_date,
Some("2026-03-18T13:38:01.000Z".to_string())
);
assert_eq!(
post.author,
Some("@DiarioBitcoin (Diario฿itcoin)".to_string())
);
}
#[test]
fn test_normalized_output_language_defaults_to_english() {
assert_eq!(normalized_output_language(""), "english");
assert_eq!(normalized_output_language(" "), "english");
assert_eq!(normalized_output_language("spanish"), "spanish");
}
#[test]
fn test_markdown_prompts_require_near_lossless_preservation() {
let system_prompt = markdown_system_prompt("english");
let user_prompt = markdown_user_prompt("english", r#"{"content":"<p>Hello</p>"}"#);
assert!(
system_prompt.contains("preserving the source text and structure as fully as possible")
);
assert!(system_prompt
.contains("Do not summarize, paraphrase, compress, or omit substantive details"));
assert!(user_prompt.contains("Treat `content` as the canonical article body"));
assert!(user_prompt.contains("keep it nearly verbatim"));
}
}