rustube/
fetcher.rs

1use once_cell::sync::Lazy;
2
3use regex::Regex;
4use reqwest::Client;
5use serde::Deserialize;
6use url::Url;
7
8use crate::{Error, Id, IdBuf, PlayerResponse, VideoDescrambler, VideoInfo};
9use crate::video_info::player_response::playability_status::PlayabilityStatus;
10
11/// A fetcher used to download all necessary data from YouTube, which then could be used
12/// to extract video-URLs.
13///
14/// You will probably rarely use this type directly, and use [`Video`] instead.
15///
16/// # Example
17///```no_run
18///# use rustube::{Id, VideoFetcher};
19///# use url::Url;
20/// const URL: &str = "https://youtube.com/watch?iv=5jlI4uzZGjU";
21/// let url = Url::parse(URL).unwrap();
22///
23/// let fetcher: VideoFetcher =  VideoFetcher::from_url(&url).unwrap();
24/// ```
25/// # How it works
26/// So you want to download a YouTube video? You probably already noticed, that YouTube makes
27/// this quite hard, and does not just provide static URLs for their videos. In fact, there's
28/// not the one URL for each video. When currently nobody is watching a video, there's actually
29/// no URL for this video at all!
30///
31/// So we need to somehow show YouTube that we want to watch the video, so the YouTube server
32/// generates a URL for us. To do this, we do what every 'normal' human being would do: we
/// request the webpage of the video. To do so, we need nothing more than the video's id (if you
34/// want to learn more about the id, you can have a look at [`Id`]. But you don't need to know
35/// anything about it for now). Let's, for example, take the id '5jlI4uzZGjU'. With this id, we
36/// can then visit <https://youtube.com/watch?v=5jlI4uzZGjU>, the site, you as a human would visit
37/// when just watching the video.
38///
39/// The next step is to extract as much information from <https://youtube.com/watch?v=5jlI4uzZGjU>
/// as possible. This is, e.g., information like "is the video age-restricted?", or "can we watch
41/// the video without being a member of that channel?".
42///
/// But there's information which is a lot more important than knowing if we are old enough to
/// watch the video: The [`VideoInfo`], the [`PlayerResponse`] and the JavaScript of the
44/// page. [`VideoInfo`] and [`PlayerResponse`] are JSON objects, which contain the most
45/// important information about the video. If you are feeling brave, feel free to have a look
46/// at the definitions of those two types, their subtypes, and all the information they contain
47/// (It's huge!). The JavaScript is not processed by `fetch`, but is used later by
48/// [`VideoDescrambler::descramble`] to generate the `transform_plan` and the `transform_map`
49/// (you will learn about both when it comes to descrambling).
50///
51/// To get the videos [`VideoInfo`], we actually need to request one more page. One you probably
52/// don't frequently visit as a 'normal' human being. Because we, programmers, are really
53/// creative when it comes to naming stuff, a video's [`VideoInfo`] can be requested at
54/// <https://youtube.com/get_video_info>. Btw.: If you want to see how the computer feels, when
55/// we ask him to deserialize the response into the [`VideoInfo`] struct, you can have a look
56/// at <https://www.youtube.com/get_video_info?video_id=5jlI4uzZGjU&eurl=https%3A%2F%2Fyoutube.com%2Fwatch%3Fiv%3D5jlI4uzZGjU&sts=>
57/// (most browsers, will download a text file!). This is the actual [`VideoInfo`] for the
58/// video with the id '5jlI4uzZGjU'.
59///
60/// That's it! Of course, we cannot download the video yet. But that's not the task of `fetch`.
61/// `fetch` is just responsible for requesting all the essential information. To learn how the
62/// journey continues, have a look at [`VideoDescrambler`].
63///
64/// [`Video`]: crate::video::Video
65#[derive(Clone, derive_more::Display, derivative::Derivative)]
66#[display(fmt = "VideoFetcher({})", video_id)]
67#[derivative(Debug, PartialEq, Eq)]
68pub struct VideoFetcher {
69    video_id: IdBuf,
70    watch_url: Url,
71    #[derivative(PartialEq = "ignore")]
72    client: Client,
73}
74
75impl VideoFetcher {
76    /// Constructs a [`VideoFetcher`] from an `Url`.
77    /// ### Errors
78    /// - When [`Id::from_raw`] fails to extracted the videos id from the url.
79    /// - When [`reqwest`] fails to initialize an new [`Client`].
80    #[inline]
81    #[cfg(feature = "regex")]
82    pub fn from_url(url: &Url) -> crate::Result<Self> {
83        let id = Id::from_raw(url.as_str())?
84            .into_owned();
85        Self::from_id(id)
86    }
87
88    /// Constructs a [`VideoFetcher`] from an `Id`.
89    /// ### Errors
90    /// When [`reqwest`] fails to initialize an new [`Client`].
91    #[inline]
92    pub fn from_id(video_id: IdBuf) -> crate::Result<Self> {
93        // maybe make these feature gated, to prevent overhead for users that
94        //  don't have problems with youtube consent
95        let cookie_jar = recommended_cookies();
96        let headers = recommended_headers();
97
98        let client = Client::builder()
99            .default_headers(headers)
100            .cookie_provider(std::sync::Arc::new(cookie_jar))
101            .build()?;
102
103        Ok(Self::from_id_with_client(video_id, client))
104    }
105
106    /// Constructs a [`VideoFetcher`] from an [`Id`] and an existing [`Client`].
107    /// There are no special constrains, what the [`Client`] has to look like.
108    /// It's recommended to use the cookie jar returned from [`recommended_cookies`].
109    /// It's recommended to use the headers returned from [`recommended_headers`].
110    #[inline]
111    pub fn from_id_with_client(video_id: IdBuf, client: Client) -> Self {
112        Self {
113            watch_url: video_id.watch_url(),
114            video_id,
115            client,
116        }
117    }
118
119    /// Fetches all available video data and deserializes it into [`VideoInfo`].
120    ///
121    /// ### Errors
122    /// - When the video is private, only for members, or otherwise not accessible.
123    /// - When requests to some video resources fail.
124    /// - When deserializing the raw response fails.
125    ///
126    /// When having a good internet connection, only errors due to inaccessible videos should occur.
127    /// Other errors usually mean, that YouTube changed their API, and `rustube` did not adapt to
128    /// this change yet. Please feel free to open a GitHub issue if this is the case.
129    #[cfg(feature = "fetch")]
130    #[log_derive::logfn(ok = "Trace", err = "Error")]
131    #[log_derive::logfn_inputs(Trace)]
132    pub async fn fetch(self) -> crate::Result<VideoDescrambler> {
133        // fixme:
134        //  It seems like watch_html also contains a PlayerResponse in all cases. VideoInfo
135        //  only contains the  extra field `adaptive_fmts_raw`. It may be possible to just use the
136        //  watch_html PlayerResponse. This would eliminate one request and therefore improve
137        //  performance.
138        //  To do so, two things must happen:
139        //       1. I need a video, which has `adaptive_fmts_raw` set, so I can examine
140        //          both the watch_html as well as the video_info. (adaptive_fmts_raw even may be
141        //          a legacy thing, which isn't used by YouTube anymore).
142        //       2. I need to have some kind of evidence, that watch_html comes with the
143        //          PlayerResponse in most cases. (It would also be possible to just check, whether
144        //          or not watch_html contains PlayerResponse, and otherwise request video_info).
145
146        let watch_html = self.get_html(&self.watch_url).await?;
147        let is_age_restricted = is_age_restricted(&watch_html);
148        Self::check_downloadability(&watch_html, is_age_restricted)?;
149
150        let (video_info, js) = self.get_video_info_and_js(&watch_html, is_age_restricted).await?;
151
152        Ok(VideoDescrambler {
153            video_info,
154            client: self.client,
155            js,
156        })
157    }
158
159    /// Fetches all available video data, and deserializes it into [`VideoInfo`].
160    ///
161    /// This method will only return the [`VideoInfo`]. You won't have the ability to download
162    /// the video afterwards. If you want to download videos, have a look at [`VideoFetcher::fetch`].
163    ///
164    /// This method is useful if you want to find out something about a video that is not available
165    /// for download, like live streams that are offline.
166    ///
167    /// ### Errors
168    /// - When requests to some video resources fail.
169    /// - When deserializing the raw response fails.
170    ///
171    /// When having a good internet connection, this method should not fail. Errors usually mean,
172    /// that YouTube changed their API, and `rustube` did not adapt to this change yet. Please feel
173    /// free to open a GitHub issue if this is the case.
174    #[cfg(feature = "fetch")]
175    pub async fn fetch_info(self) -> crate::Result<VideoInfo> {
176        let watch_html = self.get_html(&self.watch_url).await?;
177        let is_age_restricted = is_age_restricted(&watch_html);
178        Self::check_fetchability(&watch_html, is_age_restricted)?;
179        let (video_info, _js) = self.get_video_info_and_js(&watch_html, is_age_restricted).await?;
180
181        Ok(video_info)
182    }
183
184    /// The id of the video.
185    #[inline]
186    pub fn video_id(&self) -> Id<'_> {
187        self.video_id.as_borrowed()
188    }
189
190    /// The url, under witch the video can be watched.
191    #[inline]
192    pub fn watch_url(&self) -> &Url {
193        &self.watch_url
194    }
195
196    fn check_downloadability(watch_html: &str, is_age_restricted: bool) -> crate::Result<PlayabilityStatus> {
197        let playability_status = Self::extract_playability_status(watch_html)?;
198
199        match playability_status {
200            PlayabilityStatus::Ok { .. } => Ok(playability_status),
201            PlayabilityStatus::LoginRequired { .. } if is_age_restricted => Ok(playability_status),
202            ps => Err(Error::VideoUnavailable(Box::new(ps)))
203        }
204    }
205
206    fn check_fetchability(watch_html: &str, is_age_restricted: bool) -> crate::Result<()> {
207        let playability_status = Self::extract_playability_status(watch_html)?;
208
209        match playability_status {
210            PlayabilityStatus::Ok { .. } => Ok(()),
211            PlayabilityStatus::Unplayable { .. } => Ok(()),
212            PlayabilityStatus::LiveStreamOffline { .. } => Ok(()),
213            PlayabilityStatus::LoginRequired { .. } if is_age_restricted => Ok(()),
214            ps => Err(Error::VideoUnavailable(Box::new(ps)))
215        }
216    }
217
218    /// Checks, whether or not the video is accessible for normal users.
219    fn extract_playability_status(watch_html: &str) -> crate::Result<PlayabilityStatus> {
220        static PLAYABILITY_STATUS: Lazy<Regex> = Lazy::new(||
221            Regex::new(r#"["']?playabilityStatus["']?\s*[:=]\s*"#).unwrap()
222        );
223
224        PLAYABILITY_STATUS
225            .find_iter(watch_html)
226            .map(|m| json_object(
227                watch_html
228                    .get(m.end()..)
229                    .ok_or(Error::Internal("The regex does not match meaningful"))?
230            ))
231            .filter_map(Result::ok)
232            .map(serde_json::from_str::<PlayabilityStatus>)
233            .filter_map(Result::ok)
234            .next()
235            .ok_or_else(|| Error::UnexpectedResponse(
236                "watch html did not contain a PlayabilityStatus".into()
237            ))
238    }
239
240    #[inline]
241    async fn get_video_info_and_js(
242        &self,
243        watch_html: &str,
244        is_age_restricted: bool,
245    ) -> crate::Result<(VideoInfo, String)> {
246        let (js, player_response) = self.get_js(is_age_restricted, watch_html).await?;
247
248        let player_response = player_response.ok_or_else(|| Error::UnexpectedResponse(
249            "Could not acquire the player response from the watch html!\n\
250            It looks like YouTube changed it's API again :-/\n\
251            If this not yet reported, it would be great if you could file an issue:
252            (https://github.com/DzenanJupic/rustube/issues/new?assignees=&labels=youtube-api-changed&template=youtube_api_changed.yml).".into()
253        ))?;
254
255        let video_info = VideoInfo {
256            player_response,
257            adaptive_fmts_raw: None,
258            is_age_restricted,
259        };
260
261        Ok((video_info, js))
262    }
263
264    /// Extracts or requests the JavaScript used to descramble the video signature.
265    #[inline]
266    async fn get_js(
267        &self,
268        is_age_restricted: bool,
269        watch_html: &str,
270    ) -> crate::Result<(String, Option<PlayerResponse>)> {
271        let (js_url, player_response) = match is_age_restricted {
272            true => {
273                let embed_url = self.video_id.embed_url();
274                let embed_html = self.get_html(&embed_url).await?;
275                js_url(&embed_html)?
276            }
277            false => js_url(watch_html)?
278        };
279
280        self
281            .get_html(&js_url)
282            .await
283            .map(|html| (html, player_response))
284    }
285
286    /// Requests the [`VideoInfo`] of a video
287    #[inline]
288    #[allow(unused)]
289    async fn get_video_info(&self, is_age_restricted: bool) -> crate::Result<VideoInfo> {
290        // FIXME: Currently no in use + broken due to #38
291        let video_info_url = self.get_video_info_url(is_age_restricted);
292        let video_info_raw = self.get_html(&video_info_url).await?;
293
294        let mut video_info = serde_qs::from_str::<VideoInfo>(video_info_raw.as_str())?;
295        video_info.is_age_restricted = is_age_restricted;
296
297        Ok(video_info)
298    }
299
300    /// Generates the url under which the [`VideoInfo`] can be requested.
301    #[inline]
302    #[log_derive::logfn_inputs(Debug)]
303    #[log_derive::logfn(Trace, fmt = "get_video_info_url() => {}")]
304    fn get_video_info_url(&self, is_age_restricted: bool) -> Url {
305        if is_age_restricted {
306            video_info_url_age_restricted(
307                self.video_id.as_borrowed(),
308                &self.watch_url,
309            )
310        } else {
311            video_info_url(
312                self.video_id.as_borrowed(),
313                &self.watch_url,
314            )
315        }
316    }
317
318    /// Requests a website.
319    #[inline]
320    #[log_derive::logfn_inputs(Debug)]
321    #[log_derive::logfn(ok = "Trace", err = "Error", fmt = "get_html() => `{}`")]
322    async fn get_html(&self, url: &Url) -> crate::Result<String> {
323        Ok(
324            self.client
325                .get(url.as_str())
326                .send()
327                .await?
328                .error_for_status()?
329                .text()
330                .await?
331        )
332    }
333
334    /*#[inline]
335    #[log_derive::logfn_inputs(Debug)]
336    #[log_derive::logfn(ok = "Trace", err = "Error", fmt = "call_api() => `{:?}`")]
337    async fn call_api<T: serde::de::DeserializeOwned + std::fmt::Debug>(
338        &self,
339        endpoint: &str,
340        video_id: Id<'_>,
341    ) -> crate::Result<T> {
342        // FIXME: get rid of all the allocations here
343        let url = Url::parse(&format!(
344            "https://www.youtube.com/youtubei/v1/{}?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8",
345            endpoint
346        )).unwrap();
347        let body = serde_json::json!({
348            "context": {
349                "client": {
350                    "clientName": "WEB",
351                    "clientVersion": "2.20201021.03.00",
352                },
353            },
354            "videoId": video_id,
355        });
356
357        Ok(
358            self.client
359                .get(url)
360                .json(&body)
361                .send()
362                .await?
363                .error_for_status()?
364                .json::<T>()
365                .await?
366        )
367    }*/
368}
369
/// Reports whether the watch html marks the video as age restricted.
///
/// A page counts as age restricted as soon as the text `og:restrictions:age` appears
/// anywhere in it. The marker is a fixed string without any pattern syntax, so a plain
/// substring search is enough; no regex has to be compiled or cached for it.
#[inline]
fn is_age_restricted(watch_html: &str) -> bool {
    const AGE_RESTRICTION_MARKER: &str = "og:restrictions:age";

    watch_html.contains(AGE_RESTRICTION_MARKER)
}
376
377/// Generates the url under which the [`VideoInfo`] of a video can be requested.
378#[inline]
379fn video_info_url(video_id: Id<'_>, watch_url: &Url) -> Url {
380    let params: &[(&str, &str)] = &[
381        ("video_id", video_id.as_str()),
382        ("ps", "default"),
383        ("eurl", watch_url.as_str()),
384        ("hl", "en_US"),
385        ("html5", "1"),
386        ("c", "TVHTML5"),
387        ("cver", "7.20211231"),
388    ];
389    _video_info_url(params)
390}
391
392/// Generates the url under which the [`VideoInfo`] of an age restricted video can be requested.
393#[inline]
394fn video_info_url_age_restricted(video_id: Id<'_>, watch_url: &Url) -> Url {
395    static PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r#""sts"\s*:\s*(\d+)"#).unwrap());
396
397    let sts = match PATTERN.captures(watch_url.as_str()) {
398        Some(c) => c.get(1).unwrap().as_str(),
399        None => ""
400    };
401
402    let eurl = format!("https://youtube.googleapis.com/v/{}", video_id.as_str());
403    let params: &[(&str, &str)] = &[
404        ("video_id", video_id.as_str()),
405        ("eurl", eurl.as_str()),
406        ("sts", sts),
407        ("html5", "1"),
408        ("c", "TVHTML5"),
409        ("cver", "7.20211231"),
410    ];
411    _video_info_url(params)
412}
413
414/// Helper for assembling th video info url.
415#[inline]
416fn _video_info_url(params: &[(&str, &str)]) -> Url {
417    Url::parse_with_params(
418        "https://www.youtube.com/get_video_info?",
419        params,
420    ).unwrap()
421}
422
423/// Generates the url under which the JavaScript used for descrambling can be requested.
424#[inline]
425fn js_url(html: &str) -> crate::Result<(Url, Option<PlayerResponse>)> {
426    let player_response = get_ytplayer_config(html);
427    let base_js = match player_response {
428        Ok(PlayerResponse { assets: Some(ref assets), .. }) => assets.js.as_str(),
429        _ => get_ytplayer_js(html)?
430    };
431
432    Ok((Url::parse(&format!("https://youtube.com{}", base_js))?, player_response.ok()))
433}
434
435/// Extracts the [`PlayerResponse`] from the watch html.
436#[inline]
437fn get_ytplayer_config(html: &str) -> crate::Result<PlayerResponse> {
438    static CONFIG_PATTERNS: Lazy<[Regex; 3]> = Lazy::new(|| [
439        Regex::new(r"ytplayer\.config\s*=\s*").unwrap(),
440        Regex::new(r"ytInitialPlayerResponse\s*=\s*").unwrap(),
441        // fixme
442        // pytube handles `setConfig` little bit differently. It parses the entire argument
443        // to `setConfig()` and then uses load json to find `PlayerResponse` inside of it.
444        // We currently handle both the same way, and just deserialize into the `PlayerConfig` enum.
445        // This *should* have the same effect.
446        //
447        // In the future, it may be a good idea, to also handle both cases differently, so we don't
448        // loose performance on deserializing into an enum, but deserialize `CONFIG_PATTERNS` directly
449        // into `PlayerResponse`, and `SET_CONFIG_PATTERNS` into `Args`. The problem currently is, that
450        // I don't know, if CONFIG_PATTERNS can also contain `Args`.
451        Regex::new(r#"yt\.setConfig\(.*['"]PLAYER_CONFIG['"]:\s*"#).unwrap()
452    ]);
453
454    CONFIG_PATTERNS
455        .iter()
456        .find_map(|pattern| {
457            let json = parse_for_object(html, pattern).ok()?;
458            deserialize_ytplayer_config(json).ok()
459        })
460        .ok_or_else(|| Error::UnexpectedResponse(
461            "Could not find ytplayer_config in the watch html.".into()
462        ))
463}
464
465/// Extracts a json object from a string starting after a pattern.
466#[inline]
467fn parse_for_object<'a>(html: &'a str, regex: &Regex) -> crate::Result<&'a str> {
468    let json_obj_start = regex
469        .find(html)
470        .ok_or(Error::Internal("The regex does not match"))?
471        .end();
472
473    json_object(
474        html
475            .get(json_obj_start..)
476            .ok_or(Error::Internal("The regex does not match meaningful"))?
477    )
478}
479
480/// Deserializes the [`PalyerResponse`] which can be found in the watch html.
481#[inline]
482#[log_derive::logfn(Debug, fmt = "player response: {:?}")]
483#[log_derive::logfn_inputs(Trace, fmt = "player response json: {:?}")]
484fn deserialize_ytplayer_config(json: &str) -> crate::Result<PlayerResponse> {
485    #[derive(Deserialize)]
486    struct Args {
487        player_response: PlayerResponse,
488    }
489
490    // There are multiple possible formats the PlayerResponse could be in. So we basically
491    // have an untagged enum here.
492    // ```rust
493    // #[derive(Deserialize)]
494    // #[serde(untagged)]
495    // enum PlayerConfig {
496    //     Args { args: Args },
497    //     Response(PlayerResponse)
498    // }
499    // ```
500    // The only problem with deserializing this enum is, that we don't get any information about
501    // the cause in case of a failed deserialization. That's why we do this manually here, so that
502    // the log contains information about the error cause.
503
504    let args_err = match serde_json::from_str::<PlayerResponse>(json) {
505        Ok(pr) => return Ok(pr),
506        Err(err) => err,
507    };
508
509    let pr_err = match serde_json::from_str::<Args>(json) {
510        Ok(args) => return Ok(args.player_response),
511        Err(err) => err,
512    };
513
514    Err(crate::Error::JsonDeserialization(serde::de::Error::custom(format_args!(
515        "data did not match any variant of untagged enum PlayerConfig:\n\tArgs:{}\n\tPlayerResponse:{}",
516        args_err, pr_err,
517    ))))
518}
519
520/// Extracts the JavaScript used for descrambling from the watch html.
521#[inline]
522fn get_ytplayer_js(html: &str) -> crate::Result<&str> {
523    static JS_URL_PATTERNS: Lazy<Regex> = Lazy::new(||
524        Regex::new(r"(/s/player/[\w\d]+/[\w\d_/.]+/base\.js)").unwrap()
525    );
526
527    match JS_URL_PATTERNS.captures(html) {
528        Some(function_match) => Ok(function_match.get(1).unwrap().as_str()),
529        None => Err(Error::UnexpectedResponse(
530            "could not extract the ytplayer-javascript url from the watch html".into()
531        ))
532    }
533}
534
535/// Extracts a complete json object from a string.
536#[inline]
537fn json_object(mut html: &str) -> crate::Result<&str> {
538    html = html.trim_start_matches(|c| c != '{');
539    if html.is_empty() {
540        return Err(Error::Internal("cannot parse a json object from an empty string"));
541    }
542
543    let mut stack = vec![b'{'];
544    let mut skip = false;
545
546    let (i, _c) = html
547        .as_bytes()
548        .iter()
549        .enumerate()
550        .skip(1)
551        .find(
552            |(_i, &curr_char)| is_json_object_end(curr_char, &mut skip, &mut stack)
553        )
554        .ok_or(Error::Internal("could not find a closing delimiter"))?;
555
556    let full_obj = html
557        .get(..=i)
558        .expect("i must always mark the position of a valid '}' char");
559
560    Ok(full_obj)
561}
562
563/// Checks if a char represents the end of a json object.
564#[inline]
565fn is_json_object_end(curr_char: u8, skip: &mut bool, stack: &mut Vec<u8>) -> bool {
566    if *skip {
567        *skip = false;
568        return false;
569    }
570
571    let context = *stack
572        .last()
573        .expect("stack must start with len == 1, and search must end, when len == 0");
574
575    match curr_char {
576        b'}' if context == b'{' => { stack.pop(); }
577        b']' if context == b'[' => { stack.pop(); }
578        b'"' if context == b'"' => { stack.pop(); }
579
580        b'\\' if context == b'"' => { *skip = true; }
581
582        b'{' if context != b'"' => stack.push(b'{'),
583        b'[' if context != b'"' => stack.push(b'['),
584        b'"' if context != b'"' => stack.push(b'"'),
585
586        _ => {}
587    }
588
589    stack.is_empty()
590}
591
592pub fn recommended_cookies() -> reqwest::cookie::Jar {
593    let cookie = "CONSENT=YES+; Path=/; Domain=youtube.com; Secure; Expires=Fri, 01 Jan 2038 00:00:00 GMT;";
594    let url = "https://youtube.com".parse().unwrap();
595
596    let jar = reqwest::cookie::Jar::default();
597    jar.add_cookie_str(cookie, &url);
598    jar
599}
600
601pub fn recommended_headers() -> reqwest::header::HeaderMap {
602    let mut headers = reqwest::header::HeaderMap::new();
603
604    headers.insert(reqwest::header::ACCEPT_LANGUAGE, "en-US,en".parse().unwrap());
605    headers.insert(reqwest::header::USER_AGENT, "Mozilla/5.0".parse().unwrap());
606
607    headers
608}