1use std::collections::VecDeque;
2use std::time::Duration;
3
4use clap::Subcommand;
5use futures_util::Stream;
6use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
7use reqwest::header::COOKIE;
8use reqwest::{Client, StatusCode};
9use typed_builder::TypedBuilder;
10
11use crate::error::NitterError;
12use crate::parse::{parse_nitter_html, parse_nitter_single};
13use crate::tweet::Tweet;
14
15#[derive(TypedBuilder)]
16pub struct NitterScraper<'a> {
17 client: &'a Client,
18
19 #[builder(setter(into))]
20 instance: String,
21
22 query: NitterQuery,
23
24 #[builder(default)]
25 limit: Option<usize>,
26
27 #[builder(default)]
28 reorder_pinned: bool,
29
30 #[builder(default)]
31 skip_retweets: bool,
32
33 #[builder(default)]
34 min_id: Option<u128>,
35
36 #[builder(setter(skip), default)]
37 state: NitterSearchState,
38}
39
40#[derive(Debug, Default)]
41struct NitterSearchState {
42 tweets: VecDeque<Tweet>,
43 cursor: NitterCursor,
44 count: usize,
45 errored: bool,
46 pinned: Option<Tweet>,
47}
48
/// Position within a paginated Nitter result set.
///
/// Derives `Clone`, `PartialEq` and `Eq` in addition to `Debug` so callers can
/// copy and compare cursors.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NitterCursor {
    /// No page has been requested yet.
    Initial,
    /// More results are available; the payload is the GET-parameter string
    /// appended to the request URL to fetch the next page.
    More(String),
    /// The last page has been reached.
    End,
}
55
56impl Default for NitterCursor {
57 fn default() -> Self {
58 Self::Initial
59 }
60}
61
62#[derive(Subcommand)]
63pub enum NitterQuery {
64 Search { query: String },
65 User { user: String },
66 UserWithReplies { user: String },
67 UserMedia { user: String },
68 UserSearch { user: String, query: String },
69 Status { id: u64 },
70}
71
72impl NitterQuery {
73 fn encode_get_params(&self) -> String {
74 match self {
75 Self::Search { query } => {
76 let encoded = utf8_percent_encode(query, NON_ALPHANUMERIC);
77 format!("?f=tweets&q={}", encoded)
78 }
79 Self::User { .. } => "".into(),
80 Self::UserWithReplies { .. } => "".into(),
81 Self::UserMedia { .. } => "".into(),
82 Self::UserSearch { query, .. } => {
83 let encoded = utf8_percent_encode(query, NON_ALPHANUMERIC);
84 format!("?f=tweets&q={}", encoded)
85 }
86 Self::Status { .. } => "".into(),
87 }
88 }
89
90 fn url_path(&self) -> String {
91 match self {
92 Self::Search { .. } => "/search".into(),
93 Self::User { user } => format!("/{}", user),
94 Self::UserWithReplies { user } => format!("/{}/with_replies", user),
95 Self::UserMedia { user } => format!("/{}/media", user),
96 Self::UserSearch { user, .. } => format!("/{}/search", user),
97 Self::Status { id } => format!("/i/status/{}", id),
98 }
99 }
100
101 fn is_single(&self) -> bool {
102 matches!(self, Self::Status { .. })
103 }
104}
105
/// Decision on what the stream should emit next.
enum ReturnedTweet {
    /// Emit the held-back pinned tweet.
    Pinned,
    /// Emit the tweet at the front of the buffer.
    Normal,
    /// Emit nothing more from the buffer.
    None,
}
111
112impl<'a> NitterScraper<'a> {
113 pub async fn search(&'a mut self) -> impl Stream<Item = Result<Tweet, NitterError>> + '_ {
114 self.state = Default::default();
116
117 futures_util::stream::unfold(self, |state| async {
118 if state.state.errored {
120 return None;
121 }
122
123 if let Some(limit) = state.limit {
125 if state.state.count >= limit {
126 return None;
127 }
128 }
129
130 loop {
133 if let Some(tweet) = state.state.tweets.iter().next() {
135 match Self::should_return_tweet(
136 tweet,
137 &state.state.pinned,
138 state.min_id,
139 state.reorder_pinned,
140 ) {
141 ReturnedTweet::Normal => {
142 state.state.count += 1;
143 return Some((Ok(state.state.tweets.pop_front().unwrap()), state));
144 }
145 ReturnedTweet::Pinned => {
146 state.state.count += 1;
147 return Some((Ok(state.state.pinned.take().unwrap()), state));
148 }
149 ReturnedTweet::None => break,
150 }
151 }
152
153 if let NitterCursor::End = state.state.cursor {
154 break;
155 }
156
157 match state.scrape_page().await {
159 Ok(tweets) => {
160 state.state.tweets.extend(tweets.into_iter());
161 }
162 Err(e) => {
163 state.state.errored = true;
164 return Some((Err(e), state));
165 }
166 }
167 }
168
169 if let Some(t) = state.state.pinned.take() {
171 return Some((Ok(t), state));
172 }
173
174 None
175 })
176 }
177
178 fn should_return_tweet(
179 tweet: &Tweet,
180 pinned: &Option<Tweet>,
181 min_id: Option<u128>,
182 reorder_pinned: bool,
183 ) -> ReturnedTweet {
184 if reorder_pinned {
185 if let Some(p) = pinned {
186 if p.created_at_ts > tweet.created_at_ts {
188 return ReturnedTweet::Pinned;
189 }
190 }
191 }
192
193 if let Some(min_id) = min_id {
195 if tweet.id < min_id {
196 return ReturnedTweet::None;
197 }
198 }
199
200 ReturnedTweet::Normal
202 }
203
204 async fn scrape_page(&mut self) -> Result<Vec<Tweet>, NitterError> {
205 let get_params = match self.state.cursor {
207 NitterCursor::Initial => self.query.encode_get_params(),
208 NitterCursor::More(ref c) => c.clone(),
209 NitterCursor::End => return Ok(vec![]),
210 };
211
212 let mut nitter_retry = 0;
213 let tweets = loop {
214 let url = format!("{}{}{}", self.instance, self.query.url_path(), get_params);
216 let mut i = 0;
217 let response = loop {
218 let response = self
219 .client
220 .get(&url)
221 .header(COOKIE, "proxyVideos=; replaceTwitter=; replaceYouTube=; replaceReddit=")
222 .send()
223 .await
224 .map_err(|e| NitterError::Network(e.to_string()))?;
225
226 if response.status() == StatusCode::TOO_MANY_REQUESTS {
227 if i < 25 {
229 i += 1;
230 let sleep_s = 1 << std::cmp::min(i, 8);
231 eprintln!(
232 "Received status code {}, sleeping for {} seconds",
233 response.status().as_u16(),
234 sleep_s
235 );
236 tokio::time::sleep(Duration::from_secs(sleep_s)).await;
237 continue;
238 } else {
239 return Err(NitterError::Network(format!(
240 "received status code {}",
241 response.status().as_u16()
242 )));
243 }
244 } else if response.status() == StatusCode::NOT_FOUND {
245 return Err(NitterError::NotFound);
247 } else if !response.status().is_success() {
248 return Err(NitterError::Network(format!(
250 "received status code {}",
251 response.status().as_u16()
252 )));
253 }
254
255 break response;
256 };
257
258 let text = response.text().await.unwrap();
259
260 let (tweets, cursor) = if self.query.is_single() {
262 let (tweet, cursor) = parse_nitter_single(text)?;
263 (vec![tweet], cursor)
264 } else {
265 parse_nitter_html(text)?
266 };
267
268 let tweets = if self.reorder_pinned {
269 let (mut pinned, unpinned): (Vec<_>, Vec<_>) =
271 tweets.into_iter().partition(|t| t.pinned);
272 if let Some(t) = pinned.pop() {
273 if let Some(min_id) = self.min_id {
274 if t.id >= min_id {
275 self.state.pinned = Some(t);
276 }
277 } else {
278 self.state.pinned = Some(t);
279 }
280 }
281 unpinned
282 } else {
283 tweets
284 };
285
286 if !tweets.is_empty() || nitter_retry > 10 {
288 self.state.cursor = cursor;
289 break tweets;
290 }
291
292 tokio::time::sleep(Duration::from_secs(1)).await;
293 nitter_retry += 1;
294 };
295
296 let tweets = if self.skip_retweets {
297 tweets.into_iter().filter(|t| !t.retweet).collect()
299 } else {
300 tweets
301 };
302
303 Ok(tweets)
304 }
305}