use reqwest::Client;
use scraper::{ElementRef, Html, Selector};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::env;
use std::error::Error as StdError;
use std::sync::Arc;
use cloudllm::client_wrapper::Role;
use cloudllm::clients::openai::{Model, OpenAIClient};
use cloudllm::LLMSession;
/// A scraped post normalized from any supported source (generic web page or
/// X.com thread).
///
/// On failure the content fields are left empty/`None` and `error` carries a
/// human-readable description; on success `error` is the empty string.
/// `Default` is derived so callers can build empty shells with struct-update
/// syntax (`Post { error, ..Post::default() }`).
#[derive(Debug, Serialize, Clone, Default)]
pub struct Post {
    /// Page `<title>`, or `"@user (Name): <first 80 chars>"` for X posts.
    pub title: String,
    /// Extracted main text (converted to Markdown by the LLM step).
    pub content: String,
    /// `og:image` URL, or the author's profile image for X posts; empty if none.
    pub featured_image_url: String,
    /// `article:published_time` meta tag or tweet `created_at`, if available.
    pub publication_date: Option<String>,
    /// `meta[name="author"]` content, or `"@user (Name)"` for X, if available.
    pub author: Option<String>,
    /// Empty on success; otherwise a description of what went wrong.
    pub error: String,
}
/// Recursively reduces `element` to a minimal `<tag>text</tag>` string,
/// dropping any subtree whose tag appears in `skip_tags` and any
/// whitespace-only content. Returns the empty string when nothing useful
/// remains. Attributes are intentionally discarded.
#[must_use]
fn clean_element(element: ElementRef, skip_tags: &HashSet<&str>) -> String {
    let tag = element.value().name();
    if skip_tags.contains(tag) {
        return String::new();
    }
    // Collect the significant pieces of every child, separated by single spaces.
    let mut inner = String::new();
    for node in element.children() {
        match ElementRef::wrap(node) {
            Some(child) => {
                let piece = clean_element(child, skip_tags);
                if !piece.trim().is_empty() {
                    inner.push_str(&piece);
                    inner.push(' ');
                }
            }
            None => {
                if let Some(text) = node.value().as_text() {
                    let trimmed = text.trim();
                    if !trimmed.is_empty() {
                        inner.push_str(trimmed);
                        inner.push(' ');
                    }
                }
            }
        }
    }
    let inner = inner.trim();
    if inner.is_empty() {
        String::new()
    } else {
        format!("<{0}>{1}</{0}>", tag, inner)
    }
}
/// Extracts the page's main text: prefers the first non-empty `<article>`
/// element, falling back to the whole `<body>`. Returns the empty string if
/// neither yields content.
#[must_use]
fn extract_clean_content(document: &Html, skip_tags: &HashSet<&str>) -> String {
    // Prefer <article>: it usually wraps just the story, not the chrome.
    let from_article = Selector::parse("article").ok().and_then(|sel| {
        document
            .select(&sel)
            .next()
            .map(|node| clean_element(node, skip_tags))
            .filter(|text| !text.trim().is_empty())
    });
    if let Some(text) = from_article {
        return text;
    }
    // Fall back to cleaning the whole <body>, even if the result is empty.
    Selector::parse("body")
        .ok()
        .and_then(|sel| document.select(&sel).next())
        .map(|body| clean_element(body, skip_tags))
        .unwrap_or_default()
}
/// Minimal deserialization target for a tweet object from the X API v2.
#[derive(Deserialize, Debug)]
struct XTweet {
    // Tweet ID as a string (the API returns IDs as strings).
    id: String,
    // Tweet body text.
    text: String,
    // Creation timestamp, present only when `tweet.fields=created_at` is requested.
    created_at: Option<String>,
    // Author's user ID, present only when `tweet.fields=author_id` is requested.
    author_id: Option<String>,
    // Conversation (thread root) ID, used here to find same-thread replies.
    conversation_id: Option<String>,
}
/// Author details delivered via the `expansions=author_id` include block.
#[derive(Deserialize, Debug)]
struct XUser {
    name: String,
    username: String,
    profile_image_url: Option<String>,
}
/// The `includes` section of an X API response (only users are needed here).
#[derive(Deserialize, Debug)]
struct XIncludes {
    users: Option<Vec<XUser>>,
}
/// Response shape for a single-tweet lookup. Note the API can return HTTP 200
/// with `data` absent and `errors` populated (e.g. deleted/protected tweet).
#[derive(Deserialize, Debug)]
struct XTweetResponse {
    data: Option<XTweet>,
    includes: Option<XIncludes>,
    // Kept as raw JSON values; only `detail`/`message` fields are read.
    errors: Option<Vec<serde_json::Value>>,
}
/// Response shape for the recent-search endpoint (thread replies).
#[derive(Deserialize, Debug)]
struct XSearchResponse {
    data: Option<Vec<XTweet>>,
}
/// Returns `true` only for HTTPS links hosted directly on x.com or
/// twitter.com (a path after the host is required).
fn is_x_url(url: &str) -> bool {
    ["https://x.com/", "https://twitter.com/"]
        .iter()
        .any(|prefix| url.starts_with(prefix))
}
/// Extracts the numeric status ID from an x.com/twitter.com permalink.
///
/// Query strings and fragments are ignored. Returns `None` when the URL has
/// no `/status/` segment or the segment is not followed by digits.
fn extract_tweet_id(url: &str) -> Option<String> {
    const STATUS: &str = "/status/";
    // Truncate at the first '?' or '#', whichever comes first. (The original
    // used `split('?').next().unwrap_or(..)`, but `split` on a &str always
    // yields at least one item, so the fallback arm was dead code.)
    let end = url.find(|c| c == '?' || c == '#').unwrap_or(url.len());
    let clean = &url[..end];
    let pos = clean.find(STATUS)?;
    let id: String = clean[pos + STATUS.len()..]
        .chars()
        .take_while(char::is_ascii_digit)
        .collect();
    // "/status/" present but not followed by digits means no valid ID.
    (!id.is_empty()).then_some(id)
}
async fn scrape_x_url(url: &str, language: &str, openai_model: Option<Model>) -> Post {
let bearer_token = match env::var("X_BEARER_TOKEN") {
Ok(t) if !t.trim().is_empty() => t,
_ => {
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error:
"Please set the X_BEARER_TOKEN environment variable to access X.com content."
.into(),
};
}
};
let tweet_id = match extract_tweet_id(url) {
Some(id) => id,
None => {
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: format!("Could not extract a tweet ID from the URL: {}", url),
};
}
};
let client = Client::builder()
.user_agent("Mozilla/5.0 (compatible; uninews/1.0)")
.build()
.unwrap_or_default();
let auth_header = format!("Bearer {}", bearer_token);
let root_tweet_url = format!(
"https://api.twitter.com/2/tweets/{}?tweet.fields=created_at,author_id,conversation_id,text&expansions=author_id&user.fields=name,username,profile_image_url",
tweet_id
);
let root_resp = match client
.get(&root_tweet_url)
.header("Authorization", &auth_header)
.send()
.await
{
Ok(r) => r,
Err(e) => {
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: format!("Failed to call X API: {}", e),
};
}
};
let root_data: XTweetResponse = match root_resp.json().await {
Ok(d) => d,
Err(e) => {
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: format!("Failed to parse X API response: {}", e),
};
}
};
if let Some(errors) = &root_data.errors {
if !errors.is_empty() {
let msg = errors
.first()
.and_then(|e| e.get("detail").or_else(|| e.get("message")))
.and_then(|v| v.as_str())
.unwrap_or("Unknown X API error");
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: format!("X API error: {}", msg),
};
}
}
let root_tweet = match root_data.data {
Some(t) => t,
None => {
return Post {
title: String::new(),
content: String::new(),
featured_image_url: String::new(),
publication_date: None,
author: None,
error: "X API returned no tweet data.".into(),
};
}
};
let author_info = root_data
.includes
.as_ref()
.and_then(|inc| inc.users.as_ref())
.and_then(|users| users.first());
let author_display = author_info.map(|u| format!("@{} ({})", u.username, u.name));
let profile_image = author_info
.and_then(|u| u.profile_image_url.clone())
.unwrap_or_default();
let author_id = root_tweet.author_id.clone().unwrap_or_default();
let conversation_id = root_tweet
.conversation_id
.clone()
.unwrap_or_else(|| root_tweet.id.clone());
let mut thread_tweets: Vec<(String, String)> = vec![(
root_tweet.created_at.clone().unwrap_or_default(),
root_tweet.text.clone(),
)];
let search_url = format!(
"https://api.twitter.com/2/tweets/search/recent?query=conversation_id%3A{}&tweet.fields=created_at,author_id,text&max_results=100",
conversation_id
);
if let Ok(search_resp) = client
.get(&search_url)
.header("Authorization", &auth_header)
.send()
.await
{
if let Ok(search_data) = search_resp.json::<XSearchResponse>().await {
if let Some(tweets) = search_data.data {
for t in tweets {
let same_author =
!author_id.is_empty() && t.author_id.as_deref() == Some(author_id.as_str());
if same_author && t.id != root_tweet.id {
thread_tweets.push((t.created_at.unwrap_or_default(), t.text));
}
}
}
}
}
thread_tweets.sort_by(|a, b| a.0.cmp(&b.0));
let title = format!(
"{}: {}",
author_display.as_deref().unwrap_or("X post"),
root_tweet.text.chars().take(80).collect::<String>()
);
let content = thread_tweets
.iter()
.map(|(ts, text)| {
if ts.is_empty() {
text.clone()
} else {
format!("[{}] {}", ts, text)
}
})
.collect::<Vec<_>>()
.join("\n\n");
let scraped_post = Post {
title,
content,
featured_image_url: profile_image,
publication_date: root_tweet.created_at,
author: author_display,
error: String::new(),
};
match convert_content_to_markdown(scraped_post.clone(), language, openai_model).await {
Ok(markdown_post) => markdown_post,
Err(err) => Post {
error: err,
..scraped_post
},
}
}
/// Sends a `Post` through an OpenAI model to produce Markdown content in the
/// requested language.
///
/// Requires the `OPEN_AI_SECRET` environment variable. On success the
/// returned `Post` is the input with `content` replaced by the model's
/// Markdown output; all other fields are untouched. Errors are reported as
/// `String` messages.
pub async fn convert_content_to_markdown(
    mut post: Post,
    language: &str,
    openai_model: Option<Model>,
) -> Result<Post, String> {
    let secret_key = env::var("OPEN_AI_SECRET")
        .map_err(|_| "Please set the OPEN_AI_SECRET environment variable.".to_string())?;
    let model = openai_model.unwrap_or(Model::GPT4o);
    let client = Arc::new(OpenAIClient::new_with_model_enum(&secret_key, model));
    // Default to English when no language was supplied.
    let lang = if language.trim().is_empty() { "english" } else { language };
    let system_prompt = format!(
        "You are an expert markdown formatter and translator. Given a JSON object representing a news post, \
        extract and output only the text content in Markdown format in {}. Remove all HTML tags and extra markup. \
        Do not include any JSON keys or metadata—only the formatted content. If {} is not supported, default to english.",
        lang, lang
    );
    let mut session = LLMSession::new(client, system_prompt, 128000);
    let post_json = serde_json::to_string(&post)
        .map_err(|e| format!("Failed to serialize Post to JSON: {}", e))?;
    let user_prompt = format!(
        "Convert the following Post JSON into Markdown formatted text in {} language, nothing else:\n\n{}",
        lang, post_json
    );
    // Swap the scraped content for the model's Markdown rendition.
    let response = session
        .send_message(Role::User, user_prompt, None)
        .await
        .map_err(|err| format!("LLM Error: {}", err))?;
    post.content = response.content.to_string();
    Ok(post)
}
pub async fn universal_scrape(url: &str, language: &str, openai_model: Option<Model>) -> Post {
if is_x_url(url) {
return scrape_x_url(url, language, openai_model).await;
}
let client = Client::builder()
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
.http1_only()
.build()
.unwrap_or_default();
let response = client.get(url).send().await;
if let Err(err) = response {
let mut msg = format!("Failed to fetch URL: {}", err);
let mut src: Option<&dyn StdError> = err.source();
while let Some(cause) = src {
msg.push_str(&format!(" => {}", cause));
src = cause.source();
}
return Post {
title: "".into(),
content: "".into(),
featured_image_url: "".into(),
publication_date: None,
author: None,
error: msg,
};
}
let response = response.unwrap();
let body_text = match response.text().await {
Ok(text) => text,
Err(err) => {
return Post {
title: "".into(),
content: "".into(),
featured_image_url: "".into(),
publication_date: None,
author: None,
error: format!("Failed to read response body: {}", err),
}
}
};
let document = Html::parse_document(&body_text);
let skip_tags: HashSet<&str> = [
"script", "style", "noscript", "iframe", "header", "footer", "nav", "aside", "form",
"input", "button", "svg", "picture", "source",
]
.iter()
.cloned()
.collect();
let title_selector = Selector::parse("title").unwrap();
let title = document
.select(&title_selector)
.next()
.map(|elem| elem.text().collect::<Vec<_>>().join(" ").trim().to_string())
.unwrap_or_default();
let content = extract_clean_content(&document, &skip_tags);
let meta_selector = Selector::parse(r#"meta[property="og:image"]"#).unwrap();
let featured_image_url = document
.select(&meta_selector)
.next()
.and_then(|meta| meta.value().attr("content"))
.unwrap_or("")
.to_string();
let date_selector = Selector::parse(r#"meta[property="article:published_time"]"#).unwrap();
let publication_date = document
.select(&date_selector)
.next()
.and_then(|meta| meta.value().attr("content"))
.map(String::from);
let author_selector = Selector::parse(r#"meta[name="author"]"#).unwrap();
let author = document
.select(&author_selector)
.next()
.and_then(|meta| meta.value().attr("content"))
.map(String::from);
if content.trim().is_empty() {
return Post {
title: "".into(),
content: "".into(),
featured_image_url: "".into(),
publication_date: None,
author: None,
error: "Could not extract meaningful content from the page.".into(),
};
}
let scraped_post = Post {
title,
content,
featured_image_url,
publication_date,
author,
error: "".into(),
};
match convert_content_to_markdown(scraped_post.clone(), language, openai_model).await {
Ok(markdown_post) => markdown_post,
Err(err) => Post {
error: err,
..scraped_post
},
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_x_url_x_com() {
        assert!(is_x_url("https://x.com/user/status/123"));
    }

    #[test]
    fn test_is_x_url_twitter_com() {
        assert!(is_x_url("https://twitter.com/user/status/123"));
    }

    #[test]
    fn test_is_x_url_non_x() {
        // Other hosts, plain HTTP, lookalike domains, and bare hosts
        // (no trailing slash) must all be rejected.
        assert!(!is_x_url("https://example.com/article"));
        assert!(!is_x_url("https://bbc.com/news/world"));
        assert!(!is_x_url("http://x.com/user/status/123"));
        assert!(!is_x_url("https://notx.com/user/status/123"));
        assert!(!is_x_url("https://x.com"));
        assert!(!is_x_url("https://twitter.com"));
    }

    #[test]
    fn test_extract_tweet_id_x_com() {
        assert_eq!(
            extract_tweet_id("https://x.com/user/status/1234567890"),
            Some("1234567890".to_string())
        );
    }

    #[test]
    fn test_extract_tweet_id_twitter_com() {
        assert_eq!(
            extract_tweet_id("https://twitter.com/user/status/9876543210"),
            Some("9876543210".to_string())
        );
    }

    #[test]
    fn test_extract_tweet_id_with_query_params() {
        assert_eq!(
            extract_tweet_id("https://x.com/user/status/111222333?s=20&t=abc"),
            Some("111222333".to_string())
        );
    }

    #[test]
    fn test_extract_tweet_id_with_fragment() {
        assert_eq!(
            extract_tweet_id("https://x.com/user/status/555666777#anchor"),
            Some("555666777".to_string())
        );
    }

    #[test]
    fn test_extract_tweet_id_no_status() {
        assert_eq!(extract_tweet_id("https://x.com/user"), None);
        assert_eq!(extract_tweet_id("https://example.com/article"), None);
    }

    #[test]
    fn test_extract_tweet_id_empty_status() {
        assert_eq!(extract_tweet_id("https://x.com/user/status/"), None);
    }
}