use std::cmp::Ordering;
use std::collections::HashMap;
use sha1::{Digest, Sha1};
use scrapling_fetch::Response;
/// Boxed closure that receives a [`Response`] by value and returns the
/// [`SpiderOutput`] values produced from it. The `Send + Sync` bounds are part
/// of the alias, so every stored callback must satisfy them.
pub type Callback = Box<dyn Fn(Response) -> Vec<SpiderOutput> + Send + Sync>;
/// A single value returned by a [`Callback`]: either a JSON item or another
/// [`Request`].
#[derive(Debug)]
pub enum SpiderOutput {
/// A JSON value emitted by the callback.
Item(serde_json::Value),
/// A further request emitted by the callback.
/// NOTE(review): presumably queued for crawling by the consumer — the
/// consumer is not part of this file; confirm there.
FollowRequest(Request),
}
/// A request description: target URL, optional callback, scheduling priority
/// and free-form JSON settings, plus a lazily computed SHA-1 fingerprint.
pub struct Request {
/// Target URL. Parsed by `domain()` and hashed into the fingerprint.
pub url: String,
/// Identifier hashed into the fingerprint under the key `"sid"`.
/// NOTE(review): presumably a session id — confirm against callers.
pub sid: String,
/// Callback attached via `with_callback`; `None` until one is set and always
/// `None` in copies made by `copy_without_callback`.
pub callback: Option<Callback>,
/// Name stored alongside the callback; shown as `callback` in the `Debug` output.
pub callback_name: Option<String>,
/// The only field consulted by the `Ord` implementation.
pub priority: i32,
/// Not read anywhere in this file.
/// NOTE(review): presumably lets the request bypass duplicate filtering — confirm.
pub dont_filter: bool,
/// Arbitrary JSON values keyed by name; carried along but not read in this file.
pub meta: HashMap<String, serde_json::Value>,
/// Starts at `0`; not modified anywhere in this file.
pub retry_count: u32,
/// Free-form JSON settings. Fingerprinting reads the keys `"method"`, `"data"`,
/// `"json"` and `"headers"` from this map.
pub session_kwargs: HashMap<String, serde_json::Value>,
// Cached SHA-1 digest written by `update_fingerprint`; `None` until computed.
fingerprint: Option<Vec<u8>>,
}
impl Request {
    /// Creates a request for `url`.
    ///
    /// Every other field starts at its neutral value: empty `sid`, no callback
    /// or callback name, priority `0`, `dont_filter` off, empty `meta` and
    /// `session_kwargs`, `retry_count` of `0` and no cached fingerprint.
    pub fn new(url: impl Into<String>) -> Self {
        Self {
            url: url.into(),
            sid: String::new(),
            callback: None,
            callback_name: None,
            priority: 0,
            dont_filter: false,
            meta: HashMap::new(),
            retry_count: 0,
            session_kwargs: HashMap::new(),
            fingerprint: None,
        }
    }

    /// Replaces `sid` and returns the request for further chaining.
    #[must_use]
    pub fn with_sid(self, sid: impl Into<String>) -> Self {
        Self {
            sid: sid.into(),
            ..self
        }
    }

    /// Replaces `priority` and returns the request for further chaining.
    #[must_use]
    pub fn with_priority(self, priority: i32) -> Self {
        Self { priority, ..self }
    }

    /// Replaces `dont_filter` and returns the request for further chaining.
    #[must_use]
    pub fn with_dont_filter(self, dont_filter: bool) -> Self {
        Self {
            dont_filter,
            ..self
        }
    }

    /// Replaces the whole `meta` map and returns the request for further chaining.
    #[must_use]
    pub fn with_meta(self, meta: HashMap<String, serde_json::Value>) -> Self {
        Self { meta, ..self }
    }

    /// Attaches `callback` and records `name` as `callback_name`, replacing any
    /// previous pair, then returns the request for further chaining.
    #[must_use]
    pub fn with_callback(self, name: &str, callback: Callback) -> Self {
        Self {
            callback_name: Some(name.to_owned()),
            callback: Some(callback),
            ..self
        }
    }

    /// Returns the host component of `url`.
    ///
    /// Yields an empty string when the URL fails to parse or has no host.
    pub fn domain(&self) -> String {
        match url::Url::parse(&self.url) {
            Ok(parsed) => parsed.host_str().map(str::to_owned).unwrap_or_default(),
            Err(_) => String::new(),
        }
    }

    /// Computes, caches and returns the SHA-1 fingerprint of this request.
    ///
    /// The digest covers a JSON object holding `sid`, the upper-cased
    /// `session_kwargs["method"]` (default `"GET"`), the URL and the hex-encoded
    /// body (see `extract_body_hex`).
    ///
    /// * `include_kwargs` — also hash the sorted *key names* of `session_kwargs`
    ///   (values are not included).
    /// * `include_headers` — also hash the JSON text of `session_kwargs["headers"]`
    ///   when that key exists.
    /// * `keep_fragments` — when `false`, everything from the first `#` in the URL
    ///   onward is dropped before hashing.
    ///
    /// Once a fingerprint is cached it is returned as-is: the flags and any later
    /// field changes are ignored on subsequent calls.
    pub fn update_fingerprint(
        &mut self,
        include_kwargs: bool,
        include_headers: bool,
        keep_fragments: bool,
    ) -> &[u8] {
        // Binding with `ref` inside the pattern confines the borrow to this
        // early-return arm, so the cache can still be written further down.
        if let Some(ref cached) = self.fingerprint {
            return cached;
        }

        // Borrow the relevant part of the URL instead of cloning and truncating.
        let url = match self.url.find('#') {
            Some(hash_at) if !keep_fragments => &self.url[..hash_at],
            _ => self.url.as_str(),
        };
        let method = self
            .session_kwargs
            .get("method")
            .and_then(|value| value.as_str())
            .unwrap_or("GET")
            .to_uppercase();
        let body = self.extract_body_hex();

        // Insertion order is kept identical to the historical order
        // (sid, method, url, body, kwargs, headers) in case the map type in use
        // preserves it when serializing.
        let mut parts = serde_json::Map::new();
        parts.insert("sid".to_owned(), serde_json::Value::from(self.sid.as_str()));
        parts.insert("method".to_owned(), serde_json::Value::from(method));
        parts.insert("url".to_owned(), serde_json::Value::from(url));
        parts.insert("body".to_owned(), serde_json::Value::from(body));

        if include_kwargs {
            // Map keys are unique, so an unstable sort yields the same order.
            let mut keys: Vec<&String> = self.session_kwargs.keys().collect();
            keys.sort_unstable();
            let encoded = hex::encode(format!("{keys:?}"));
            parts.insert("kwargs".to_owned(), serde_json::Value::from(encoded));
        }

        if include_headers {
            if let Some(headers) = self.session_kwargs.get("headers") {
                let encoded = serde_json::to_string(headers).unwrap_or_default();
                parts.insert("headers".to_owned(), serde_json::Value::from(encoded));
            }
        }

        let serialized = serde_json::to_vec(&parts).unwrap_or_default();
        let digest = Sha1::digest(&serialized).to_vec();

        // `Option::insert` stores the digest and hands back a reference to it,
        // so no `unwrap()` is needed to read the value just written.
        self.fingerprint.insert(digest).as_slice()
    }

    /// Hex-encodes the request body taken from `session_kwargs`.
    ///
    /// `"data"` wins over `"json"`. A string `"data"` value is encoded from its
    /// raw bytes; any other value is encoded from its JSON serialization. With
    /// neither key present the result is an empty string.
    fn extract_body_hex(&self) -> String {
        let kwargs = &self.session_kwargs;
        match (kwargs.get("data"), kwargs.get("json")) {
            (Some(serde_json::Value::String(text)), _) => hex::encode(text.as_bytes()),
            (Some(value), _) | (None, Some(value)) => {
                hex::encode(serde_json::to_vec(value).unwrap_or_default())
            }
            (None, None) => String::new(),
        }
    }

    /// Returns the cached fingerprint, or `None` if `update_fingerprint` has not
    /// been called yet.
    pub fn fingerprint(&self) -> Option<&[u8]> {
        self.fingerprint.as_deref()
    }

    /// Clones every field except the callback closure, which is set to `None`.
    ///
    /// `callback_name` and the cached fingerprint are carried over unchanged.
    pub fn copy_without_callback(&self) -> Self {
        Self {
            callback: None,
            callback_name: self.callback_name.clone(),
            url: self.url.clone(),
            sid: self.sid.clone(),
            priority: self.priority,
            dont_filter: self.dont_filter,
            retry_count: self.retry_count,
            meta: self.meta.clone(),
            session_kwargs: self.session_kwargs.clone(),
            fingerprint: self.fingerprint.clone(),
        }
    }
}
/// Compact diagnostic view: only `url`, `priority` and the callback *name*
/// (labelled `callback`) are printed; all other fields are omitted.
impl std::fmt::Debug for Request {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut view = f.debug_struct("Request");
        view.field("url", &self.url);
        view.field("priority", &self.priority);
        view.field("callback", &self.callback_name);
        view.finish()
    }
}
/// Displays the request as its bare URL. Width and alignment flags on the
/// outer format spec are not applied.
impl std::fmt::Display for Request {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.url)
    }
}
/// Two requests are equal when their cached fingerprints match. If either
/// side has no fingerprint yet, the raw URLs are compared instead.
impl PartialEq for Request {
    fn eq(&self, other: &Self) -> bool {
        if let (Some(mine), Some(theirs)) = (&self.fingerprint, &other.fingerprint) {
            return mine == theirs;
        }
        // At least one fingerprint is missing: fall back to the URL text.
        self.url == other.url
    }
}
// Marker impl: declares the `PartialEq` relation on `Request` to be a full equivalence.
impl Eq for Request {}
/// Always comparable: delegates to the total order defined by `Ord`.
impl PartialOrd for Request {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}
/// Orders requests by `priority` alone (a larger value compares as greater).
///
/// NOTE: equality on `Request` is based on fingerprint/URL rather than
/// priority, so two requests can compare as `Ordering::Equal` here without
/// being `==`.
impl Ord for Request {
    fn cmp(&self, other: &Self) -> Ordering {
        let (mine, theirs) = (self.priority, other.priority);
        mine.cmp(&theirs)
    }
}