rustube/fetcher.rs
1use once_cell::sync::Lazy;
2
3use regex::Regex;
4use reqwest::Client;
5use serde::Deserialize;
6use url::Url;
7
8use crate::{Error, Id, IdBuf, PlayerResponse, VideoDescrambler, VideoInfo};
9use crate::video_info::player_response::playability_status::PlayabilityStatus;
10
11/// A fetcher used to download all necessary data from YouTube, which then could be used
12/// to extract video-URLs.
13///
14/// You will probably rarely use this type directly, and use [`Video`] instead.
15///
16/// # Example
17///```no_run
18///# use rustube::{Id, VideoFetcher};
19///# use url::Url;
20/// const URL: &str = "https://youtube.com/watch?iv=5jlI4uzZGjU";
21/// let url = Url::parse(URL).unwrap();
22///
23/// let fetcher: VideoFetcher = VideoFetcher::from_url(&url).unwrap();
24/// ```
25/// # How it works
26/// So you want to download a YouTube video? You probably already noticed, that YouTube makes
27/// this quite hard, and does not just provide static URLs for their videos. In fact, there's
28/// not the one URL for each video. When currently nobody is watching a video, there's actually
29/// no URL for this video at all!
30///
31/// So we need to somehow show YouTube that we want to watch the video, so the YouTube server
32/// generates a URL for us. To do this, we do what every 'normal' human being would do: we
33/// request the webpage of the video. To do so, we need nothing more, then the video's id (If you
34/// want to learn more about the id, you can have a look at [`Id`]. But you don't need to know
35/// anything about it for now). Let's, for example, take the id '5jlI4uzZGjU'. With this id, we
36/// can then visit <https://youtube.com/watch?v=5jlI4uzZGjU>, the site, you as a human would visit
37/// when just watching the video.
38///
39/// The next step is to extract as much information from <https://youtube.com/watch?v=5jlI4uzZGjU>
40/// as possible. This is, i.e., information like "is the video age-restricted?", or "can we watch
41/// the video without being a member of that channel?".
42///
43/// But there's information, which is a lot more important then knowing if we are old enough to watch the video: The [`VideoInfo`], the [`PlayerResponse`] and the JavaScript of the
44/// page. [`VideoInfo`] and [`PlayerResponse`] are JSON objects, which contain the most
45/// important information about the video. If you are feeling brave, feel free to have a look
46/// at the definitions of those two types, their subtypes, and all the information they contain
47/// (It's huge!). The JavaScript is not processed by `fetch`, but is used later by
48/// [`VideoDescrambler::descramble`] to generate the `transform_plan` and the `transform_map`
49/// (you will learn about both when it comes to descrambling).
50///
51/// To get the videos [`VideoInfo`], we actually need to request one more page. One you probably
52/// don't frequently visit as a 'normal' human being. Because we, programmers, are really
53/// creative when it comes to naming stuff, a video's [`VideoInfo`] can be requested at
54/// <https://youtube.com/get_video_info>. Btw.: If you want to see how the computer feels, when
55/// we ask him to deserialize the response into the [`VideoInfo`] struct, you can have a look
56/// at <https://www.youtube.com/get_video_info?video_id=5jlI4uzZGjU&eurl=https%3A%2F%2Fyoutube.com%2Fwatch%3Fiv%3D5jlI4uzZGjU&sts=>
57/// (most browsers, will download a text file!). This is the actual [`VideoInfo`] for the
58/// video with the id '5jlI4uzZGjU'.
59///
60/// That's it! Of course, we cannot download the video yet. But that's not the task of `fetch`.
61/// `fetch` is just responsible for requesting all the essential information. To learn how the
62/// journey continues, have a look at [`VideoDescrambler`].
63///
64/// [`Video`]: crate::video::Video
65#[derive(Clone, derive_more::Display, derivative::Derivative)]
66#[display(fmt = "VideoFetcher({})", video_id)]
67#[derivative(Debug, PartialEq, Eq)]
68pub struct VideoFetcher {
69 video_id: IdBuf,
70 watch_url: Url,
71 #[derivative(PartialEq = "ignore")]
72 client: Client,
73}
74
75impl VideoFetcher {
/// Builds a [`VideoFetcher`] for the video referenced by `url`.
///
/// The video id is extracted from the textual form of the url, and the fetcher is then
/// created through [`VideoFetcher::from_id`].
///
/// ### Errors
/// - When [`Id::from_raw`] fails to extract the video's id from the url.
/// - When [`reqwest`] fails to initialize a new [`Client`].
#[inline]
#[cfg(feature = "regex")]
pub fn from_url(url: &Url) -> crate::Result<Self> {
    let raw_url = url.as_str();
    let video_id = Id::from_raw(raw_url)?.into_owned();

    Self::from_id(video_id)
}
87
88 /// Constructs a [`VideoFetcher`] from an `Id`.
89 /// ### Errors
90 /// When [`reqwest`] fails to initialize an new [`Client`].
91 #[inline]
92 pub fn from_id(video_id: IdBuf) -> crate::Result<Self> {
93 // maybe make these feature gated, to prevent overhead for users that
94 // don't have problems with youtube consent
95 let cookie_jar = recommended_cookies();
96 let headers = recommended_headers();
97
98 let client = Client::builder()
99 .default_headers(headers)
100 .cookie_provider(std::sync::Arc::new(cookie_jar))
101 .build()?;
102
103 Ok(Self::from_id_with_client(video_id, client))
104 }
105
106 /// Constructs a [`VideoFetcher`] from an [`Id`] and an existing [`Client`].
107 /// There are no special constrains, what the [`Client`] has to look like.
108 /// It's recommended to use the cookie jar returned from [`recommended_cookies`].
109 /// It's recommended to use the headers returned from [`recommended_headers`].
110 #[inline]
111 pub fn from_id_with_client(video_id: IdBuf, client: Client) -> Self {
112 Self {
113 watch_url: video_id.watch_url(),
114 video_id,
115 client,
116 }
117 }
118
/// Fetches everything that is required to descramble and download the video.
///
/// The watch html is requested, checked for downloadability, and then used to obtain the
/// [`VideoInfo`] and the player JavaScript, which are handed over to a [`VideoDescrambler`]
/// together with this fetcher's [`Client`].
///
/// ### Errors
/// - When the video is private, only for members, or otherwise not accessible.
/// - When requests to some video resources fail.
/// - When deserializing the raw response fails.
///
/// When having a good internet connection, only errors due to inaccessible videos should occur.
/// Other errors usually mean, that YouTube changed their API, and `rustube` did not adapt to
/// this change yet. Please feel free to open a GitHub issue if this is the case.
#[cfg(feature = "fetch")]
#[log_derive::logfn(ok = "Trace", err = "Error")]
#[log_derive::logfn_inputs(Trace)]
pub async fn fetch(self) -> crate::Result<VideoDescrambler> {
    // fixme:
    //  It seems like watch_html also contains a PlayerResponse in all cases. VideoInfo
    //  only contains the extra field `adaptive_fmts_raw`. It may be possible to just use the
    //  watch_html PlayerResponse. This would eliminate one request and therefore improve
    //  performance.
    //  To do so, two things must happen:
    //       1. I need a video, which has `adaptive_fmts_raw` set, so I can examine
    //          both the watch_html as well as the video_info. (adaptive_fmts_raw even may be
    //          a legacy thing, which isn't used by YouTube anymore).
    //       2. I need to have some kind of evidence, that watch_html comes with the
    //          PlayerResponse in most cases. (It would also be possible to just check, whether
    //          or not watch_html contains PlayerResponse, and otherwise request video_info).

    let watch_html = self.get_html(&self.watch_url).await?;
    let age_restricted = is_age_restricted(&watch_html);

    // Bail out early for videos that cannot be downloaded anyway.
    Self::check_downloadability(&watch_html, age_restricted)?;

    let (video_info, js) = self
        .get_video_info_and_js(&watch_html, age_restricted)
        .await?;

    // The client is handed over, so the descrambler keeps using the same configuration.
    let Self { client, .. } = self;

    Ok(VideoDescrambler { video_info, client, js })
}
158
/// Fetches all available video data, and returns only the resulting [`VideoInfo`].
///
/// You won't have the ability to download the video afterwards. If you want to download
/// videos, have a look at [`VideoFetcher::fetch`].
///
/// This method is useful if you want to find out something about a video that is not available
/// for download, like live streams that are offline.
///
/// ### Errors
/// - When the playability status of the video is not accepted for fetching.
/// - When requests to some video resources fail.
/// - When deserializing the raw response fails.
///
/// When having a good internet connection, this method should not fail. Errors usually mean,
/// that YouTube changed their API, and `rustube` did not adapt to this change yet. Please feel
/// free to open a GitHub issue if this is the case.
#[cfg(feature = "fetch")]
pub async fn fetch_info(self) -> crate::Result<VideoInfo> {
    let watch_html = self.get_html(&self.watch_url).await?;
    let age_restricted = is_age_restricted(&watch_html);

    // Less strict than the downloadability check performed by `fetch`.
    Self::check_fetchability(&watch_html, age_restricted)?;

    // The JavaScript is only needed for descrambling, so it is dropped here.
    self.get_video_info_and_js(&watch_html, age_restricted)
        .await
        .map(|(video_info, _js)| video_info)
}
183
184 /// The id of the video.
185 #[inline]
186 pub fn video_id(&self) -> Id<'_> {
187 self.video_id.as_borrowed()
188 }
189
190 /// The url, under witch the video can be watched.
191 #[inline]
192 pub fn watch_url(&self) -> &Url {
193 &self.watch_url
194 }
195
196 fn check_downloadability(watch_html: &str, is_age_restricted: bool) -> crate::Result<PlayabilityStatus> {
197 let playability_status = Self::extract_playability_status(watch_html)?;
198
199 match playability_status {
200 PlayabilityStatus::Ok { .. } => Ok(playability_status),
201 PlayabilityStatus::LoginRequired { .. } if is_age_restricted => Ok(playability_status),
202 ps => Err(Error::VideoUnavailable(Box::new(ps)))
203 }
204 }
205
206 fn check_fetchability(watch_html: &str, is_age_restricted: bool) -> crate::Result<()> {
207 let playability_status = Self::extract_playability_status(watch_html)?;
208
209 match playability_status {
210 PlayabilityStatus::Ok { .. } => Ok(()),
211 PlayabilityStatus::Unplayable { .. } => Ok(()),
212 PlayabilityStatus::LiveStreamOffline { .. } => Ok(()),
213 PlayabilityStatus::LoginRequired { .. } if is_age_restricted => Ok(()),
214 ps => Err(Error::VideoUnavailable(Box::new(ps)))
215 }
216 }
217
218 /// Checks, whether or not the video is accessible for normal users.
219 fn extract_playability_status(watch_html: &str) -> crate::Result<PlayabilityStatus> {
220 static PLAYABILITY_STATUS: Lazy<Regex> = Lazy::new(||
221 Regex::new(r#"["']?playabilityStatus["']?\s*[:=]\s*"#).unwrap()
222 );
223
224 PLAYABILITY_STATUS
225 .find_iter(watch_html)
226 .map(|m| json_object(
227 watch_html
228 .get(m.end()..)
229 .ok_or(Error::Internal("The regex does not match meaningful"))?
230 ))
231 .filter_map(Result::ok)
232 .map(serde_json::from_str::<PlayabilityStatus>)
233 .filter_map(Result::ok)
234 .next()
235 .ok_or_else(|| Error::UnexpectedResponse(
236 "watch html did not contain a PlayabilityStatus".into()
237 ))
238 }
239
240 #[inline]
241 async fn get_video_info_and_js(
242 &self,
243 watch_html: &str,
244 is_age_restricted: bool,
245 ) -> crate::Result<(VideoInfo, String)> {
246 let (js, player_response) = self.get_js(is_age_restricted, watch_html).await?;
247
248 let player_response = player_response.ok_or_else(|| Error::UnexpectedResponse(
249 "Could not acquire the player response from the watch html!\n\
250 It looks like YouTube changed it's API again :-/\n\
251 If this not yet reported, it would be great if you could file an issue:
252 (https://github.com/DzenanJupic/rustube/issues/new?assignees=&labels=youtube-api-changed&template=youtube_api_changed.yml).".into()
253 ))?;
254
255 let video_info = VideoInfo {
256 player_response,
257 adaptive_fmts_raw: None,
258 is_age_restricted,
259 };
260
261 Ok((video_info, js))
262 }
263
264 /// Extracts or requests the JavaScript used to descramble the video signature.
265 #[inline]
266 async fn get_js(
267 &self,
268 is_age_restricted: bool,
269 watch_html: &str,
270 ) -> crate::Result<(String, Option<PlayerResponse>)> {
271 let (js_url, player_response) = match is_age_restricted {
272 true => {
273 let embed_url = self.video_id.embed_url();
274 let embed_html = self.get_html(&embed_url).await?;
275 js_url(&embed_html)?
276 }
277 false => js_url(watch_html)?
278 };
279
280 self
281 .get_html(&js_url)
282 .await
283 .map(|html| (html, player_response))
284 }
285
286 /// Requests the [`VideoInfo`] of a video
287 #[inline]
288 #[allow(unused)]
289 async fn get_video_info(&self, is_age_restricted: bool) -> crate::Result<VideoInfo> {
290 // FIXME: Currently no in use + broken due to #38
291 let video_info_url = self.get_video_info_url(is_age_restricted);
292 let video_info_raw = self.get_html(&video_info_url).await?;
293
294 let mut video_info = serde_qs::from_str::<VideoInfo>(video_info_raw.as_str())?;
295 video_info.is_age_restricted = is_age_restricted;
296
297 Ok(video_info)
298 }
299
300 /// Generates the url under which the [`VideoInfo`] can be requested.
301 #[inline]
302 #[log_derive::logfn_inputs(Debug)]
303 #[log_derive::logfn(Trace, fmt = "get_video_info_url() => {}")]
304 fn get_video_info_url(&self, is_age_restricted: bool) -> Url {
305 if is_age_restricted {
306 video_info_url_age_restricted(
307 self.video_id.as_borrowed(),
308 &self.watch_url,
309 )
310 } else {
311 video_info_url(
312 self.video_id.as_borrowed(),
313 &self.watch_url,
314 )
315 }
316 }
317
318 /// Requests a website.
319 #[inline]
320 #[log_derive::logfn_inputs(Debug)]
321 #[log_derive::logfn(ok = "Trace", err = "Error", fmt = "get_html() => `{}`")]
322 async fn get_html(&self, url: &Url) -> crate::Result<String> {
323 Ok(
324 self.client
325 .get(url.as_str())
326 .send()
327 .await?
328 .error_for_status()?
329 .text()
330 .await?
331 )
332 }
333
334 /*#[inline]
335 #[log_derive::logfn_inputs(Debug)]
336 #[log_derive::logfn(ok = "Trace", err = "Error", fmt = "call_api() => `{:?}`")]
337 async fn call_api<T: serde::de::DeserializeOwned + std::fmt::Debug>(
338 &self,
339 endpoint: &str,
340 video_id: Id<'_>,
341 ) -> crate::Result<T> {
342 // FIXME: get rid of all the allocations here
343 let url = Url::parse(&format!(
344 "https://www.youtube.com/youtubei/v1/{}?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8",
345 endpoint
346 )).unwrap();
347 let body = serde_json::json!({
348 "context": {
349 "client": {
350 "clientName": "WEB",
351 "clientVersion": "2.20201021.03.00",
352 },
353 },
354 "videoId": video_id,
355 });
356
357 Ok(
358 self.client
359 .get(url)
360 .json(&body)
361 .send()
362 .await?
363 .error_for_status()?
364 .json::<T>()
365 .await?
366 )
367 }*/
368}
369
/// Extracts whether or not a particular video is age restricted.
///
/// A video is considered age restricted, when its watch html contains the marker
/// `og:restrictions:age`. The marker is a plain literal, so a substring search is
/// sufficient and no regular expression has to be compiled.
#[inline]
fn is_age_restricted(watch_html: &str) -> bool {
    const AGE_RESTRICTION_MARKER: &str = "og:restrictions:age";

    watch_html.contains(AGE_RESTRICTION_MARKER)
}
376
377/// Generates the url under which the [`VideoInfo`] of a video can be requested.
378#[inline]
379fn video_info_url(video_id: Id<'_>, watch_url: &Url) -> Url {
380 let params: &[(&str, &str)] = &[
381 ("video_id", video_id.as_str()),
382 ("ps", "default"),
383 ("eurl", watch_url.as_str()),
384 ("hl", "en_US"),
385 ("html5", "1"),
386 ("c", "TVHTML5"),
387 ("cver", "7.20211231"),
388 ];
389 _video_info_url(params)
390}
391
392/// Generates the url under which the [`VideoInfo`] of an age restricted video can be requested.
393#[inline]
394fn video_info_url_age_restricted(video_id: Id<'_>, watch_url: &Url) -> Url {
395 static PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r#""sts"\s*:\s*(\d+)"#).unwrap());
396
397 let sts = match PATTERN.captures(watch_url.as_str()) {
398 Some(c) => c.get(1).unwrap().as_str(),
399 None => ""
400 };
401
402 let eurl = format!("https://youtube.googleapis.com/v/{}", video_id.as_str());
403 let params: &[(&str, &str)] = &[
404 ("video_id", video_id.as_str()),
405 ("eurl", eurl.as_str()),
406 ("sts", sts),
407 ("html5", "1"),
408 ("c", "TVHTML5"),
409 ("cver", "7.20211231"),
410 ];
411 _video_info_url(params)
412}
413
414/// Helper for assembling th video info url.
415#[inline]
416fn _video_info_url(params: &[(&str, &str)]) -> Url {
417 Url::parse_with_params(
418 "https://www.youtube.com/get_video_info?",
419 params,
420 ).unwrap()
421}
422
423/// Generates the url under which the JavaScript used for descrambling can be requested.
424#[inline]
425fn js_url(html: &str) -> crate::Result<(Url, Option<PlayerResponse>)> {
426 let player_response = get_ytplayer_config(html);
427 let base_js = match player_response {
428 Ok(PlayerResponse { assets: Some(ref assets), .. }) => assets.js.as_str(),
429 _ => get_ytplayer_js(html)?
430 };
431
432 Ok((Url::parse(&format!("https://youtube.com{}", base_js))?, player_response.ok()))
433}
434
435/// Extracts the [`PlayerResponse`] from the watch html.
436#[inline]
437fn get_ytplayer_config(html: &str) -> crate::Result<PlayerResponse> {
438 static CONFIG_PATTERNS: Lazy<[Regex; 3]> = Lazy::new(|| [
439 Regex::new(r"ytplayer\.config\s*=\s*").unwrap(),
440 Regex::new(r"ytInitialPlayerResponse\s*=\s*").unwrap(),
441 // fixme
442 // pytube handles `setConfig` little bit differently. It parses the entire argument
443 // to `setConfig()` and then uses load json to find `PlayerResponse` inside of it.
444 // We currently handle both the same way, and just deserialize into the `PlayerConfig` enum.
445 // This *should* have the same effect.
446 //
447 // In the future, it may be a good idea, to also handle both cases differently, so we don't
448 // loose performance on deserializing into an enum, but deserialize `CONFIG_PATTERNS` directly
449 // into `PlayerResponse`, and `SET_CONFIG_PATTERNS` into `Args`. The problem currently is, that
450 // I don't know, if CONFIG_PATTERNS can also contain `Args`.
451 Regex::new(r#"yt\.setConfig\(.*['"]PLAYER_CONFIG['"]:\s*"#).unwrap()
452 ]);
453
454 CONFIG_PATTERNS
455 .iter()
456 .find_map(|pattern| {
457 let json = parse_for_object(html, pattern).ok()?;
458 deserialize_ytplayer_config(json).ok()
459 })
460 .ok_or_else(|| Error::UnexpectedResponse(
461 "Could not find ytplayer_config in the watch html.".into()
462 ))
463}
464
465/// Extracts a json object from a string starting after a pattern.
466#[inline]
467fn parse_for_object<'a>(html: &'a str, regex: &Regex) -> crate::Result<&'a str> {
468 let json_obj_start = regex
469 .find(html)
470 .ok_or(Error::Internal("The regex does not match"))?
471 .end();
472
473 json_object(
474 html
475 .get(json_obj_start..)
476 .ok_or(Error::Internal("The regex does not match meaningful"))?
477 )
478}
479
480/// Deserializes the [`PalyerResponse`] which can be found in the watch html.
481#[inline]
482#[log_derive::logfn(Debug, fmt = "player response: {:?}")]
483#[log_derive::logfn_inputs(Trace, fmt = "player response json: {:?}")]
484fn deserialize_ytplayer_config(json: &str) -> crate::Result<PlayerResponse> {
485 #[derive(Deserialize)]
486 struct Args {
487 player_response: PlayerResponse,
488 }
489
490 // There are multiple possible formats the PlayerResponse could be in. So we basically
491 // have an untagged enum here.
492 // ```rust
493 // #[derive(Deserialize)]
494 // #[serde(untagged)]
495 // enum PlayerConfig {
496 // Args { args: Args },
497 // Response(PlayerResponse)
498 // }
499 // ```
500 // The only problem with deserializing this enum is, that we don't get any information about
501 // the cause in case of a failed deserialization. That's why we do this manually here, so that
502 // the log contains information about the error cause.
503
504 let args_err = match serde_json::from_str::<PlayerResponse>(json) {
505 Ok(pr) => return Ok(pr),
506 Err(err) => err,
507 };
508
509 let pr_err = match serde_json::from_str::<Args>(json) {
510 Ok(args) => return Ok(args.player_response),
511 Err(err) => err,
512 };
513
514 Err(crate::Error::JsonDeserialization(serde::de::Error::custom(format_args!(
515 "data did not match any variant of untagged enum PlayerConfig:\n\tArgs:{}\n\tPlayerResponse:{}",
516 args_err, pr_err,
517 ))))
518}
519
520/// Extracts the JavaScript used for descrambling from the watch html.
521#[inline]
522fn get_ytplayer_js(html: &str) -> crate::Result<&str> {
523 static JS_URL_PATTERNS: Lazy<Regex> = Lazy::new(||
524 Regex::new(r"(/s/player/[\w\d]+/[\w\d_/.]+/base\.js)").unwrap()
525 );
526
527 match JS_URL_PATTERNS.captures(html) {
528 Some(function_match) => Ok(function_match.get(1).unwrap().as_str()),
529 None => Err(Error::UnexpectedResponse(
530 "could not extract the ytplayer-javascript url from the watch html".into()
531 ))
532 }
533}
534
535/// Extracts a complete json object from a string.
536#[inline]
537fn json_object(mut html: &str) -> crate::Result<&str> {
538 html = html.trim_start_matches(|c| c != '{');
539 if html.is_empty() {
540 return Err(Error::Internal("cannot parse a json object from an empty string"));
541 }
542
543 let mut stack = vec![b'{'];
544 let mut skip = false;
545
546 let (i, _c) = html
547 .as_bytes()
548 .iter()
549 .enumerate()
550 .skip(1)
551 .find(
552 |(_i, &curr_char)| is_json_object_end(curr_char, &mut skip, &mut stack)
553 )
554 .ok_or(Error::Internal("could not find a closing delimiter"))?;
555
556 let full_obj = html
557 .get(..=i)
558 .expect("i must always mark the position of a valid '}' char");
559
560 Ok(full_obj)
561}
562
/// Feeds one byte into the json object scanner and reports whether the object ended.
///
/// - `curr_char`: the byte to process.
/// - `skip`: set when the previous byte was a backslash inside of a string. The current
///   byte is then ignored, and the flag is reset.
/// - `stack`: the currently open delimiters (`{`, `[` or `"`), innermost last.
///
/// Returns `true` as soon as `stack` is empty, which means that the outermost object
/// has been closed.
///
/// ### Panics
/// When `stack` is empty on entry while `skip` is not set.
#[inline]
fn is_json_object_end(curr_char: u8, skip: &mut bool, stack: &mut Vec<u8>) -> bool {
    // An escaped byte never changes the nesting. Taking the flag also resets it.
    if std::mem::take(skip) {
        return false;
    }

    let innermost = *stack
        .last()
        .expect("stack must start with len == 1, and search must end, when len == 0");

    if innermost == b'"' {
        // Inside of a string only the closing quote and escape sequences matter.
        match curr_char {
            b'"' => { stack.pop(); }
            b'\\' => { *skip = true; }
            _ => {}
        }
    } else {
        match curr_char {
            // A closing delimiter only counts, when it matches the innermost open one.
            b'}' if innermost == b'{' => { stack.pop(); }
            b']' if innermost == b'[' => { stack.pop(); }

            b'{' | b'[' | b'"' => stack.push(curr_char),

            _ => {}
        }
    }

    stack.is_empty()
}
591
592pub fn recommended_cookies() -> reqwest::cookie::Jar {
593 let cookie = "CONSENT=YES+; Path=/; Domain=youtube.com; Secure; Expires=Fri, 01 Jan 2038 00:00:00 GMT;";
594 let url = "https://youtube.com".parse().unwrap();
595
596 let jar = reqwest::cookie::Jar::default();
597 jar.add_cookie_str(cookie, &url);
598 jar
599}
600
601pub fn recommended_headers() -> reqwest::header::HeaderMap {
602 let mut headers = reqwest::header::HeaderMap::new();
603
604 headers.insert(reqwest::header::ACCEPT_LANGUAGE, "en-US,en".parse().unwrap());
605 headers.insert(reqwest::header::USER_AGENT, "Mozilla/5.0".parse().unwrap());
606
607 headers
608}