use std::collections::VecDeque;
use std::time::Duration;
use clap::Subcommand;
use futures_util::Stream;
use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
use reqwest::header::COOKIE;
use reqwest::{Client, StatusCode};
use typed_builder::TypedBuilder;
use crate::error::NitterError;
use crate::parse::{parse_nitter_html, parse_nitter_single};
use crate::tweet::Tweet;
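/// Scrapes tweets from a Nitter instance.
///
/// Construct one with the `TypedBuilder`-generated `builder()`, then call
/// `search()` to obtain a stream of results.
///
/// A minimal usage sketch (the instance URL and username are illustrative):
///
/// ```ignore
/// let client = reqwest::Client::new();
/// let mut scraper = NitterScraper::builder()
///     .client(&client)
///     .instance("https://nitter.net")
///     .query(NitterQuery::User { user: "example".into() })
///     .limit(Some(50))
///     .build();
/// ```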
#[derive(TypedBuilder)]
pub struct NitterScraper<'a> {
client: &'a Client,
#[builder(setter(into))]
instance: String,
query: NitterQuery,
#[builder(default)]
limit: Option<usize>,
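    /// Move a pinned tweet into its chronological position in the stream
    /// instead of leaving it wherever the page put it.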
#[builder(default)]
reorder_pinned: bool,
#[builder(default)]
skip_retweets: bool,
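    /// Only yield tweets with an ID of at least this value; pagination stops
    /// once older tweets are reached.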
#[builder(default)]
min_id: Option<u128>,
#[builder(setter(skip), default)]
state: NitterSearchState,
}
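/// Internal pagination state for an in-progress search.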
#[derive(Debug, Default)]
struct NitterSearchState {
tweets: VecDeque<Tweet>,
cursor: NitterCursor,
count: usize,
errored: bool,
pinned: Option<Tweet>,
}
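/// Pagination position: not yet started, the query string for the next page,
/// or exhausted.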
#[derive(Debug)]
pub enum NitterCursor {
Initial,
More(String),
End,
}
impl Default for NitterCursor {
fn default() -> Self {
Self::Initial
}
}
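/// The kind of Nitter page to scrape, also exposed as `clap` subcommands.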
#[derive(Subcommand)]
pub enum NitterQuery {
Search { query: String },
User { user: String },
UserWithReplies { user: String },
UserMedia { user: String },
UserSearch { user: String, query: String },
Status { id: u64 },
}
impl NitterQuery {
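    /// GET parameters for the first page; only the search variants need any.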
    fn encode_get_params(&self) -> String {
        match self {
            Self::Search { query } | Self::UserSearch { query, .. } => {
                let encoded = utf8_percent_encode(query, NON_ALPHANUMERIC);
                format!("?f=tweets&q={}", encoded)
            }
            Self::User { .. }
            | Self::UserWithReplies { .. }
            | Self::UserMedia { .. }
            | Self::Status { .. } => String::new(),
        }
    }
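    /// URL path of this query on a Nitter instance.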
fn url_path(&self) -> String {
match self {
Self::Search { .. } => "/search".into(),
Self::User { user } => format!("/{}", user),
Self::UserWithReplies { user } => format!("/{}/with_replies", user),
Self::UserMedia { user } => format!("/{}/media", user),
Self::UserSearch { user, .. } => format!("/{}/search", user),
Self::Status { id } => format!("/i/status/{}", id),
}
}
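    /// Whether this query resolves to a single status rather than a timeline.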
fn is_single(&self) -> bool {
matches!(self, Self::Status { .. })
}
}
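/// Decision made by `should_return_tweet` for the tweet at the front of the buffer.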
enum ReturnedTweet {
Pinned,
Normal,
None,
}
impl<'a> NitterScraper<'a> {
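    /// Returns a stream of tweets matching the configured query.
    ///
    /// Pages are fetched lazily as the stream is polled; the stream ends once
    /// the optional `limit` is reached, pagination is exhausted, or the first
    /// error has been yielded.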
pub async fn search(&'a mut self) -> impl Stream<Item = Result<Tweet, NitterError>> + '_ {
self.state = Default::default();
        // The unfold state is the scraper itself: each poll either yields a
        // buffered tweet or fetches the next page from the instance.
        futures_util::stream::unfold(self, |scraper| async {
            // Stop permanently after the first error.
            if scraper.state.errored {
                return None;
            }
            // Respect the optional result limit.
            if let Some(limit) = scraper.limit {
                if scraper.state.count >= limit {
                    return None;
                }
            }
            loop {
                // Drain the buffer first, letting the pinned tweet jump ahead
                // of any buffered tweet it is newer than.
                if let Some(tweet) = scraper.state.tweets.front() {
                    match Self::should_return_tweet(
                        tweet,
                        &scraper.state.pinned,
                        scraper.min_id,
                        scraper.reorder_pinned,
                    ) {
                        ReturnedTweet::Normal => {
                            scraper.state.count += 1;
                            return Some((Ok(scraper.state.tweets.pop_front().unwrap()), scraper));
                        }
                        ReturnedTweet::Pinned => {
                            scraper.state.count += 1;
                            return Some((Ok(scraper.state.pinned.take().unwrap()), scraper));
                        }
                        // The next tweet is older than min_id: stop paginating.
                        ReturnedTweet::None => break,
                    }
                }
                // Buffer is empty; fetch another page unless pagination is done.
                if let NitterCursor::End = scraper.state.cursor {
                    break;
                }
                match scraper.scrape_page().await {
                    Ok(tweets) => {
                        scraper.state.tweets.extend(tweets);
                    }
                    Err(e) => {
                        scraper.state.errored = true;
                        return Some((Err(e), scraper));
                    }
                }
            }
            // A pinned tweet older than everything else is emitted last.
            if let Some(t) = scraper.state.pinned.take() {
                return Some((Ok(t), scraper));
            }
            None
        })
}
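    /// Decides what to emit next: the stashed pinned tweet (when
    /// `reorder_pinned` is set and it is newer than `tweet`), `tweet` itself,
    /// or nothing once `tweet` falls below `min_id`.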
fn should_return_tweet(
tweet: &Tweet,
pinned: &Option<Tweet>,
min_id: Option<u128>,
reorder_pinned: bool,
) -> ReturnedTweet {
if reorder_pinned {
if let Some(p) = pinned {
if p.created_at_ts > tweet.created_at_ts {
return ReturnedTweet::Pinned;
}
}
}
if let Some(min_id) = min_id {
if tweet.id < min_id {
return ReturnedTweet::None;
}
}
ReturnedTweet::Normal
}
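    /// Fetches and parses one page of results, updating the cursor and the
    /// stashed pinned tweet as a side effect.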
async fn scrape_page(&mut self) -> Result<Vec<Tweet>, NitterError> {
let get_params = match self.state.cursor {
NitterCursor::Initial => self.query.encode_get_params(),
NitterCursor::More(ref c) => c.clone(),
NitterCursor::End => return Ok(vec![]),
};
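        // Nitter occasionally serves an empty timeline page; retry a few times
        // before accepting an empty result.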
let mut nitter_retry = 0;
let tweets = loop {
let url = format!("{}{}{}", self.instance, self.query.url_path(), get_params);
let mut i = 0;
let response = loop {
let response = self
.client
.get(&url)
.header(COOKIE, "replaceTwitter=; replaceYouTube=; replaceReddit=")
.send()
.await
.map_err(|e| NitterError::Network(e.to_string()))?;
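                // On HTTP 429, back off exponentially (capped at 256 s) for up
                // to 25 attempts before giving up.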
if response.status() == StatusCode::TOO_MANY_REQUESTS {
if i < 25 {
i += 1;
let sleep_s = 1 << std::cmp::min(i, 8);
eprintln!(
"Received status code {}, sleeping for {} seconds",
response.status().as_u16(),
sleep_s
);
tokio::time::sleep(Duration::from_secs(sleep_s)).await;
continue;
} else {
return Err(NitterError::Network(format!(
"received status code {}",
response.status().as_u16()
)));
}
} else if response.status() == StatusCode::NOT_FOUND {
return Err(NitterError::NotFound);
} else if !response.status().is_success() {
return Err(NitterError::Network(format!(
"received status code {}",
response.status().as_u16()
)));
}
break response;
};
            let text = response
                .text()
                .await
                .map_err(|e| NitterError::Network(e.to_string()))?;
let (tweets, cursor) = if self.query.is_single() {
let (tweet, cursor) = parse_nitter_single(text)?;
(vec![tweet], cursor)
} else {
parse_nitter_html(text)?
};
            let tweets = if self.reorder_pinned {
                // Pull the pinned tweet out of the page so the stream can
                // re-insert it in chronological order later.
                let (mut pinned, unpinned): (Vec<_>, Vec<_>) =
                    tweets.into_iter().partition(|t| t.pinned);
                if let Some(t) = pinned.pop() {
                    // Keep it only if it passes the min_id filter.
                    if self.min_id.map_or(true, |min_id| t.id >= min_id) {
                        self.state.pinned = Some(t);
                    }
                }
                unpinned
            } else {
                tweets
            };
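            // Accept the page once it contains tweets, or give up retrying
            // empty pages and return whatever came back.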
if !tweets.is_empty() || nitter_retry > 10 {
self.state.cursor = cursor;
break tweets;
}
tokio::time::sleep(Duration::from_secs(1)).await;
nitter_retry += 1;
};
let tweets = if self.skip_retweets {
tweets.into_iter().filter(|t| !t.retweet).collect()
} else {
tweets
};
Ok(tweets)
}
}