use dotenv::dotenv;
use reqwest::Client as HttpClient;
use scraper::{Html, Selector};
use serde_json::{json, Value};
use std::env;
use std::error::Error;
use std::fmt;
use std::fs;
use std::io::{self, BufRead, Write};
use std::path::Path;
use tokio::runtime::Runtime;
const GREEN: &str = "\x1b[0;32m";
const YELLOW: &str = "\x1b[1;33m";
const BLUE: &str = "\x1b[0;34m";
const RED: &str = "\x1b[0;31m";
const CYAN: &str = "\x1b[0;36m";
const BOLD: &str = "\x1b[1m";
const NC: &str = "\x1b[0m";
const ANTHROPIC_API_URL: &str = "https://api.anthropic.com/v1/messages";
const ANTHROPIC_MODEL: &str = "claude-3-opus-20240229";
const MAX_TOKENS: u32 = 4096;
#[derive(Debug)]
struct WebScraperError(String);
impl fmt::Display for WebScraperError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.0)
}
}
impl Error for WebScraperError {}
impl From<String> for WebScraperError {
fn from(error: String) -> Self {
WebScraperError(error)
}
}
impl From<&str> for WebScraperError {
fn from(error: &str) -> Self {
WebScraperError(error.to_string())
}
}
impl From<reqwest::Error> for WebScraperError {
fn from(error: reqwest::Error) -> Self {
WebScraperError(error.to_string())
}
}
impl From<io::Error> for WebScraperError {
fn from(error: io::Error) -> Self {
WebScraperError(error.to_string())
}
}
impl From<env::VarError> for WebScraperError {
fn from(error: env::VarError) -> Self {
WebScraperError(error.to_string())
}
}
struct WebScraper {
http_client: HttpClient,
anthropic_key: String,
}
impl WebScraper {
fn new(anthropic_key: String) -> Self {
Self {
http_client: HttpClient::new(),
anthropic_key,
}
}
async fn scrape_url(&self, url: &str) -> Result<String, WebScraperError> {
println!("{}Scraping URL: {}{}", YELLOW, url, NC);
let response = self.http_client.get(url)
.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
.send()
.await?;
if !response.status().is_success() {
return Err(format!("Failed to fetch URL: HTTP {}", response.status()).into());
}
let html = response.text().await?;
println!(
"{}Successfully fetched HTML content ({} bytes){}",
GREEN,
html.len(),
NC
);
let html_dir = env::var("HTML_DIR").unwrap_or_else(|_| "/tmp".to_string());
let file_name = format!("scraped_{}.html", chrono::Utc::now().timestamp());
let file_path = Path::new(&html_dir).join(&file_name);
fs::write(&file_path, &html)?;
println!("{}HTML saved to {}{}", GREEN, file_path.display(), NC);
Ok(html)
}
async fn extract_content(
&self,
html: &str,
query: &str,
css_selector: Option<&str>,
) -> Result<String, WebScraperError> {
println!(
"{}Parsing HTML and extracting content for query: {}{}",
YELLOW, query, NC
);
let document = Html::parse_document(html);
let title_selector = Selector::parse("title").unwrap();
let title = document
.select(&title_selector)
.next()
.map(|element| element.inner_html())
.unwrap_or_else(|| "No title found".to_string());
let specific_content = if let Some(selector_str) = css_selector {
match Selector::parse(selector_str) {
Ok(selector) => {
println!("{}Using CSS selector: {}{}", GREEN, selector_str, NC);
let selected_content: Vec<String> = document
.select(&selector)
.map(|element| element.html())
.collect();
if selected_content.is_empty() {
println!(
"{}No elements found with selector: {}{}",
YELLOW, selector_str, NC
);
None
} else {
println!(
"{}Found {} elements with selector{}",
GREEN,
selected_content.len(),
NC
);
Some(selected_content.join("\n"))
}
}
Err(e) => {
println!(
"{}Invalid CSS selector: {} - {}{}",
RED, selector_str, e, NC
);
None
}
}
} else {
None
};
let body_text = if specific_content.is_none() {
let body_selector = Selector::parse("body").unwrap();
document
.select(&body_selector)
.next()
.map(|element| element.text().collect::<Vec<_>>().join(" "))
.unwrap_or_else(|| "No body content found".to_string())
} else {
"".to_string()
};
let content_to_analyze = if let Some(content) = specific_content {
println!("{}Using selected content for analysis{}", GREEN, NC);
content
} else {
println!("{}Using full body content for analysis{}", GREEN, NC);
body_text
};
let truncated_content = if content_to_analyze.len() > 100000 {
format!("{}... (truncated)", &content_to_analyze[0..100000])
} else {
content_to_analyze
};
let extracted_content = self
.query_anthropic(&title, &truncated_content, query)
.await?;
Ok(extracted_content)
}
async fn query_anthropic(
&self,
title: &str,
content: &str,
query: &str,
) -> Result<String, WebScraperError> {
println!(
"{}Querying Anthropic API to extract relevant information{}",
YELLOW, NC
);
let prompt = format!(
"I need you to extract specific information from this web page based on the user's query.\n\n\
Page Title: {}\n\n\
User Query: {}\n\n\
Web Page Content:\n{}\n\n\
Please extract and summarize only the information that is relevant to the user's query. \
Focus on providing accurate, concise, and helpful information. \
If the requested information is not found in the content, please state that clearly.",
title, query, content
);
let request_body = json!({
"model": ANTHROPIC_MODEL,
"max_tokens": MAX_TOKENS,
"messages": [
{
"role": "user",
"content": prompt
}
]
});
let response = self
.http_client
.post(ANTHROPIC_API_URL)
.header("x-api-key", &self.anthropic_key)
.header("anthropic-version", "2023-06-01")
.header("Content-Type", "application/json")
.json(&request_body)
.send()
.await?;
let status = response.status();
if !status.is_success() {
let error_text = response.text().await?;
return Err(format!("Anthropic API error: {} - {}", status, error_text).into());
}
let response_json: Value = response.json().await?;
let extracted_content = response_json["content"][0]["text"]
.as_str()
.ok_or_else(|| WebScraperError("Failed to parse Anthropic API response".to_string()))?
.to_string();
println!(
"{}Successfully extracted relevant content using Anthropic API{}",
GREEN, NC
);
Ok(extracted_content)
}
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
dotenv().ok();
println!("{}{}Direct Web Scraper{}{}", BOLD, BLUE, NC, NC);
println!(
"{}This tool allows you to scrape web content and extract information using Claude.{}",
CYAN, NC
);
let anthropic_key = match env::var("ANTHROPIC_API_KEY") {
Ok(key) => key,
Err(_) => {
eprintln!(
"{}Error: ANTHROPIC_API_KEY environment variable not set{}",
RED, NC
);
eprintln!(
"{}Please set it using: export ANTHROPIC_API_KEY=your_api_key{}",
YELLOW, NC
);
return Err(Box::<dyn Error>::from(WebScraperError(
"ANTHROPIC_API_KEY environment variable not set".into(),
)));
}
};
let scraper = WebScraper::new(anthropic_key);
let args: Vec<String> = env::args().collect();
let mut url = String::new();
let mut query = String::new();
let mut css_selector = String::new();
let mut i = 1;
while i < args.len() {
match args[i].as_str() {
"--url" => {
if i + 1 < args.len() {
url = args[i + 1].clone();
i += 2;
} else {
eprintln!("{}Error: Missing value for --url{}", RED, NC);
return Err(Box::<dyn Error>::from(WebScraperError(
"Missing value for --url".into(),
)));
}
}
"--query" => {
if i + 1 < args.len() {
query = args[i + 1].clone();
i += 2;
} else {
eprintln!("{}Error: Missing value for --query{}", RED, NC);
return Err(Box::<dyn Error>::from(WebScraperError(
"Missing value for --query".into(),
)));
}
}
"--css-selector" => {
if i + 1 < args.len() {
css_selector = args[i + 1].clone();
i += 2;
} else {
eprintln!("{}Error: Missing value for --css-selector{}", RED, NC);
return Err(Box::<dyn Error>::from(WebScraperError(
"Missing value for --css-selector".into(),
)));
}
}
_ => {
i += 1;
}
}
}
if url.is_empty() || query.is_empty() {
let stdin = io::stdin();
let mut stdin_lock = stdin.lock();
let mut input = String::new();
if url.is_empty() {
print!("{}Enter the URL to scrape: {}", YELLOW, NC);
io::stdout().flush().unwrap();
input.clear();
stdin_lock.read_line(&mut input).unwrap();
url = input.trim().to_string();
if url.is_empty() {
println!("{}URL cannot be empty. Exiting.{}", RED, NC);
return Err(Box::<dyn Error>::from(WebScraperError(
"URL cannot be empty".into(),
)));
}
}
if css_selector.is_empty() {
print!(
"{}Enter CSS selector (optional, e.g., 'div.content', 'article', '#main'): {}",
YELLOW, NC
);
io::stdout().flush().unwrap();
input.clear();
stdin_lock.read_line(&mut input).unwrap();
css_selector = input.trim().to_string();
}
if query.is_empty() {
print!(
"{}Enter your query (what information are you looking for?): {}",
YELLOW, NC
);
io::stdout().flush().unwrap();
input.clear();
stdin_lock.read_line(&mut input).unwrap();
query = input.trim().to_string();
if query.is_empty() {
println!("{}Query cannot be empty. Exiting.{}", RED, NC);
return Err(Box::<dyn Error>::from(WebScraperError(
"Query cannot be empty".into(),
)));
}
}
}
let html = match scraper.scrape_url(&url).await {
Ok(html) => html,
Err(e) => {
eprintln!("{}Error scraping URL: {}{}", RED, e, NC);
return Err(Box::<dyn Error>::from(e));
}
};
let css_selector_option = if css_selector.is_empty() {
None
} else {
Some(css_selector.as_str())
};
let extracted_content = match scraper
.extract_content(&html, &query, css_selector_option)
.await
{
Ok(content) => content,
Err(e) => {
eprintln!("{}Error extracting content: {}{}", RED, e, NC);
return Err(Box::<dyn Error>::from(e));
}
};
println!("\n{}{}Extracted Content:{}{}", BOLD, GREEN, NC, NC);
println!("{}", extracted_content);
Ok(())
}