//! nitter_scraper/nitter_scraper.rs — streaming tweet scraper for nitter instances.

1use std::collections::VecDeque;
2use std::time::Duration;
3
4use clap::Subcommand;
5use futures_util::Stream;
6use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
7use reqwest::header::COOKIE;
8use reqwest::{Client, StatusCode};
9use typed_builder::TypedBuilder;
10
11use crate::error::NitterError;
12use crate::parse::{parse_nitter_html, parse_nitter_single};
13use crate::tweet::Tweet;
14
/// Scrapes tweets from a nitter instance as an async stream.
///
/// Construct via the generated builder (`NitterScraper::builder()`); the
/// internal `state` field is builder-skipped and always starts at its
/// default.
#[derive(TypedBuilder)]
pub struct NitterScraper<'a> {
    /// HTTP client used for all page requests.
    client: &'a Client,

    /// Base URL of the nitter instance; the query's URL path is appended to it.
    #[builder(setter(into))]
    instance: String,

    /// Which nitter endpoint/query to scrape.
    query: NitterQuery,

    /// Maximum number of tweets to yield; `None` means no limit.
    #[builder(default)]
    limit: Option<usize>,

    /// When set, the pinned tweet is held back and re-inserted into the
    /// stream in `created_at_ts` order instead of appearing first.
    #[builder(default)]
    reorder_pinned: bool,

    /// When set, retweets are filtered out of every scraped page.
    #[builder(default)]
    skip_retweets: bool,

    /// Stop yielding once a tweet with id below this value is seen.
    #[builder(default)]
    min_id: Option<u128>,

    /// Internal pagination/iteration state; reset at the start of `search`.
    #[builder(setter(skip), default)]
    state: NitterSearchState,
}
39
/// Mutable state threaded through the paging stream while scraping.
#[derive(Debug, Default)]
struct NitterSearchState {
    /// Tweets scraped but not yet yielded to the stream.
    tweets: VecDeque<Tweet>,
    /// Cursor for the next page request.
    cursor: NitterCursor,
    /// Number of tweets yielded so far (checked against `limit`).
    count: usize,
    /// Set after the first error so the stream terminates on the next poll.
    errored: bool,
    /// Pinned tweet held back for reordering (only used with `reorder_pinned`).
    pinned: Option<Tweet>,
}
48
/// Pagination cursor state for nitter result pages.
///
/// `Initial` is the default (no page fetched yet). Deriving `Default` with
/// the `#[default]` variant attribute replaces the previous hand-written
/// `impl Default`, which duplicated the same information.
#[derive(Debug, Default)]
pub enum NitterCursor {
    /// No page fetched yet; the first request uses the query's own params.
    #[default]
    Initial,
    /// More pages available; holds the raw GET-param string for the next page.
    More(String),
    /// No further pages available.
    End,
}
61
/// The kind of nitter page to scrape, exposed as CLI subcommands via clap.
#[derive(Subcommand)]
pub enum NitterQuery {
    /// Full-text tweet search (`/search`).
    Search { query: String },
    /// A user's timeline (`/{user}`).
    User { user: String },
    /// A user's timeline including replies (`/{user}/with_replies`).
    UserWithReplies { user: String },
    /// A user's media posts (`/{user}/media`).
    UserMedia { user: String },
    /// Search within a single user's tweets (`/{user}/search`).
    UserSearch { user: String, query: String },
    /// A single tweet by status id (`/i/status/{id}`).
    Status { id: u64 },
}
71
72impl NitterQuery {
73    fn encode_get_params(&self) -> String {
74        match self {
75            Self::Search { query } => {
76                let encoded = utf8_percent_encode(query, NON_ALPHANUMERIC);
77                format!("?f=tweets&q={}", encoded)
78            }
79            Self::User { .. } => "".into(),
80            Self::UserWithReplies { .. } => "".into(),
81            Self::UserMedia { .. } => "".into(),
82            Self::UserSearch { query, .. } => {
83                let encoded = utf8_percent_encode(query, NON_ALPHANUMERIC);
84                format!("?f=tweets&q={}", encoded)
85            }
86            Self::Status { .. } => "".into(),
87        }
88    }
89
90    fn url_path(&self) -> String {
91        match self {
92            Self::Search { .. } => "/search".into(),
93            Self::User { user } => format!("/{}", user),
94            Self::UserWithReplies { user } => format!("/{}/with_replies", user),
95            Self::UserMedia { user } => format!("/{}/media", user),
96            Self::UserSearch { user, .. } => format!("/{}/search", user),
97            Self::Status { id } => format!("/i/status/{}", id),
98        }
99    }
100
101    fn is_single(&self) -> bool {
102        matches!(self, Self::Status { .. })
103    }
104}
105
/// Decision made by `should_return_tweet` for the next queued tweet.
enum ReturnedTweet {
    /// Yield the held-back pinned tweet before the queued one.
    Pinned,
    /// Yield the next queued tweet.
    Normal,
    /// Stop yielding: the queued tweet's id is below `min_id`.
    None,
}
111
112impl<'a> NitterScraper<'a> {
113    pub async fn search(&'a mut self) -> impl Stream<Item = Result<Tweet, NitterError>> + '_ {
114        // Reset internal state
115        self.state = Default::default();
116
117        futures_util::stream::unfold(self, |state| async {
118            // Stop if previously errored
119            if state.state.errored {
120                return None;
121            }
122
123            // Stop if limit reached
124            if let Some(limit) = state.limit {
125                if state.state.count >= limit {
126                    return None;
127                }
128            }
129
130            // Since skip-retweets may cause entire page to be empty, loop until cursor doesn't
131            // exist anymore
132            loop {
133                // Return tweet if available
134                if let Some(tweet) = state.state.tweets.iter().next() {
135                    match Self::should_return_tweet(
136                        tweet,
137                        &state.state.pinned,
138                        state.min_id,
139                        state.reorder_pinned,
140                    ) {
141                        ReturnedTweet::Normal => {
142                            state.state.count += 1;
143                            return Some((Ok(state.state.tweets.pop_front().unwrap()), state));
144                        }
145                        ReturnedTweet::Pinned => {
146                            state.state.count += 1;
147                            return Some((Ok(state.state.pinned.take().unwrap()), state));
148                        }
149                        ReturnedTweet::None => break,
150                    }
151                }
152
153                if let NitterCursor::End = state.state.cursor {
154                    break;
155                }
156
157                // Scrape nitter
158                match state.scrape_page().await {
159                    Ok(tweets) => {
160                        state.state.tweets.extend(tweets.into_iter());
161                    }
162                    Err(e) => {
163                        state.state.errored = true;
164                        return Some((Err(e), state));
165                    }
166                }
167            }
168
169            // Return pinned tweet if needed
170            if let Some(t) = state.state.pinned.take() {
171                return Some((Ok(t), state));
172            }
173
174            None
175        })
176    }
177
178    fn should_return_tweet(
179        tweet: &Tweet,
180        pinned: &Option<Tweet>,
181        min_id: Option<u128>,
182        reorder_pinned: bool,
183    ) -> ReturnedTweet {
184        if reorder_pinned {
185            if let Some(p) = pinned {
186                // Should use tweet id here but nitter doesn't expose it for retweets
187                if p.created_at_ts > tweet.created_at_ts {
188                    return ReturnedTweet::Pinned;
189                }
190            }
191        }
192
193        // Stop if minimum tweet id reached
194        if let Some(min_id) = min_id {
195            if tweet.id < min_id {
196                return ReturnedTweet::None;
197            }
198        }
199
200        // Return next tweet
201        ReturnedTweet::Normal
202    }
203
204    async fn scrape_page(&mut self) -> Result<Vec<Tweet>, NitterError> {
205        // Use cursor if it exists
206        let get_params = match self.state.cursor {
207            NitterCursor::Initial => self.query.encode_get_params(),
208            NitterCursor::More(ref c) => c.clone(),
209            NitterCursor::End => return Ok(vec![]),
210        };
211
212        let mut nitter_retry = 0;
213        let tweets = loop {
214            // Send request
215            let url = format!("{}{}{}", self.instance, self.query.url_path(), get_params);
216            let mut i = 0;
217            let response = loop {
218                let response = self
219                    .client
220                    .get(&url)
221                    .header(COOKIE, "proxyVideos=; replaceTwitter=; replaceYouTube=; replaceReddit=")
222                    .send()
223                    .await
224                    .map_err(|e| NitterError::Network(e.to_string()))?;
225
226                if response.status() == StatusCode::TOO_MANY_REQUESTS {
227                    // Retry if 429
228                    if i < 25 {
229                        i += 1;
230                        let sleep_s = 1 << std::cmp::min(i, 8);
231                        eprintln!(
232                            "Received status code {}, sleeping for {} seconds",
233                            response.status().as_u16(),
234                            sleep_s
235                        );
236                        tokio::time::sleep(Duration::from_secs(sleep_s)).await;
237                        continue;
238                    } else {
239                        return Err(NitterError::Network(format!(
240                            "received status code {}",
241                            response.status().as_u16()
242                        )));
243                    }
244                } else if response.status() == StatusCode::NOT_FOUND {
245                    // Return nothing on 404
246                    return Err(NitterError::NotFound);
247                } else if !response.status().is_success() {
248                    // Error if bad status code
249                    return Err(NitterError::Network(format!(
250                        "received status code {}",
251                        response.status().as_u16()
252                    )));
253                }
254
255                break response;
256            };
257
258            let text = response.text().await.unwrap();
259
260            // Parse html and update cursor
261            let (tweets, cursor) = if self.query.is_single() {
262                let (tweet, cursor) = parse_nitter_single(text)?;
263                (vec![tweet], cursor)
264            } else {
265                parse_nitter_html(text)?
266            };
267
268            let tweets = if self.reorder_pinned {
269                // Extract pinned tweet
270                let (mut pinned, unpinned): (Vec<_>, Vec<_>) =
271                    tweets.into_iter().partition(|t| t.pinned);
272                if let Some(t) = pinned.pop() {
273                    if let Some(min_id) = self.min_id {
274                        if t.id >= min_id {
275                            self.state.pinned = Some(t);
276                        }
277                    } else {
278                        self.state.pinned = Some(t);
279                    }
280                }
281                unpinned
282            } else {
283                tweets
284            };
285
286            // Sometimes nitter will return nothing, retry a few times to make sure it's correct
287            if !tweets.is_empty() || nitter_retry > 10 {
288                self.state.cursor = cursor;
289                break tweets;
290            }
291
292            tokio::time::sleep(Duration::from_secs(1)).await;
293            nitter_retry += 1;
294        };
295
296        let tweets = if self.skip_retweets {
297            // Filter out retweets
298            tweets.into_iter().filter(|t| !t.retweet).collect()
299        } else {
300            tweets
301        };
302
303        Ok(tweets)
304    }
305}