use once_cell::sync::Lazy;
use regex::Regex;
use reqwest::Client;
use serde::Deserialize;
use url::Url;
use crate::{Error, Id, IdBuf, PlayerResponse, VideoDescrambler, VideoInfo};
use crate::video_info::player_response::playability_status::PlayabilityStatus;
/// Entry point for fetching a single video.
///
/// Bundles the video id, the watch URL belonging to that id, and the HTTP client
/// that is used for every request made on behalf of this fetcher.
///
/// The `Display` implementation renders as `VideoFetcher(<video_id>)`. Equality
/// compares `video_id` and `watch_url` only; the client is ignored.
#[derive(Clone, derive_more::Display, derivative::Derivative)]
#[display(fmt = "VideoFetcher({video_id})")]
#[derivative(Debug, PartialEq, Eq)]
pub struct VideoFetcher {
/// Id of the video this fetcher operates on.
video_id: IdBuf,
/// Watch URL of the video, see [`VideoFetcher::from_id_with_client`].
watch_url: Url,
/// HTTP client used for all requests; excluded from equality comparisons.
#[derivative(PartialEq = "ignore")]
client: Client,
}
impl VideoFetcher {
/// Creates a [`VideoFetcher`] from a URL that contains a video id.
///
/// The id is extracted from the textual form of `url` via [`Id::from_raw`],
/// converted into an owned id and handed to [`VideoFetcher::from_id`].
///
/// # Errors
/// Fails when no id can be extracted from `url`, or when
/// [`VideoFetcher::from_id`] fails.
#[inline]
#[cfg(feature = "regex")]
pub fn from_url(url: &Url) -> crate::Result<Self> {
    let borrowed_id = Id::from_raw(url.as_str())?;
    Self::from_id(borrowed_id.into_owned())
}
/// Creates a [`VideoFetcher`] for `video_id` using a freshly built [`Client`].
///
/// The client is configured with [`recommended_headers`] and
/// [`recommended_cookies`]. No proxy is configured explicitly, so the client
/// keeps its default proxy behavior. Callers that need a specific proxy (or any
/// other client customisation) should build their own [`Client`] and use
/// [`VideoFetcher::from_id_with_client`].
///
/// # Errors
/// Returns an error when the HTTP client cannot be built.
#[inline]
pub fn from_id(video_id: IdBuf) -> crate::Result<Self> {
    // A library constructor must not force a machine-specific proxy on its
    // callers, nor panic when that proxy URL is rejected.
    let client = Client::builder()
        .default_headers(recommended_headers())
        .cookie_provider(std::sync::Arc::new(recommended_cookies()))
        .build()?;

    Ok(Self::from_id_with_client(video_id, client))
}
/// Creates a [`VideoFetcher`] for `video_id` that sends its requests through `client`.
///
/// The watch URL is taken from `video_id.watch_url()`.
#[inline]
pub fn from_id_with_client(video_id: IdBuf, client: Client) -> Self {
    // Compute the URL first; `video_id` is moved into the struct afterwards.
    let watch_url = video_id.watch_url();
    Self { video_id, watch_url, client }
}
/// Fetches everything required to descramble and download the video.
///
/// Steps:
/// 1. download the watch page,
/// 2. detect whether the video is age restricted,
/// 3. verify that the playability status permits downloading,
/// 4. obtain the video info and the player JavaScript.
///
/// The fetcher's client is moved into the returned [`VideoDescrambler`].
///
/// # Errors
/// Propagates request failures, an unsuitable playability status, and
/// failures to extract the player response or JavaScript.
#[cfg(feature = "fetch")]
#[log_derive::logfn(ok = "Trace", err = "Error")]
#[log_derive::logfn_inputs(Trace)]
pub async fn fetch(self) -> crate::Result<VideoDescrambler> {
    let html = self.get_html(&self.watch_url).await?;
    let age_restricted = is_age_restricted(&html);
    Self::check_downloadability(&html, age_restricted)?;

    let (video_info, js) = self.get_video_info_and_js(&html, age_restricted).await?;
    let Self { client, .. } = self;

    Ok(VideoDescrambler { video_info, client, js })
}
/// Fetches only the [`VideoInfo`] of the video.
///
/// Works like [`VideoFetcher::fetch`], but uses the more permissive
/// fetchability check and discards the player JavaScript.
///
/// # Errors
/// Propagates request failures, an unsuitable playability status, and
/// failures to extract the player response or JavaScript.
#[cfg(feature = "fetch")]
pub async fn fetch_info(self) -> crate::Result<VideoInfo> {
    let html = self.get_html(&self.watch_url).await?;
    let age_restricted = is_age_restricted(&html);
    Self::check_fetchability(&html, age_restricted)?;

    self.get_video_info_and_js(&html, age_restricted)
        .await
        .map(|(video_info, _js)| video_info)
}
#[inline]
pub fn video_id(&self) -> Id<'_> {
self.video_id.as_borrowed()
}
#[inline]
pub fn watch_url(&self) -> &Url {
&self.watch_url
}
fn check_downloadability(watch_html: &str, is_age_restricted: bool) -> crate::Result<PlayabilityStatus> {
let playability_status = Self::extract_playability_status(watch_html)?;
match playability_status {
PlayabilityStatus::Ok { .. } => Ok(playability_status),
PlayabilityStatus::LoginRequired { .. } if is_age_restricted => Ok(playability_status),
ps => Err(Error::VideoUnavailable(Box::new(ps)))
}
}
fn check_fetchability(watch_html: &str, is_age_restricted: bool) -> crate::Result<()> {
let playability_status = Self::extract_playability_status(watch_html)?;
match playability_status {
PlayabilityStatus::Ok { .. } => Ok(()),
PlayabilityStatus::Unplayable { .. } => Ok(()),
PlayabilityStatus::LiveStreamOffline { .. } => Ok(()),
PlayabilityStatus::LoginRequired { .. } if is_age_restricted => Ok(()),
ps => Err(Error::VideoUnavailable(Box::new(ps)))
}
}
/// Extracts the first parsable [`PlayabilityStatus`] from the watch page HTML.
///
/// Every occurrence of a `playabilityStatus` key (optionally quoted, followed by
/// `:` or `=`) is considered a candidate. For each candidate the JSON object
/// following the key is cut out and deserialized; candidates that cannot be cut
/// out or deserialized are skipped.
///
/// # Errors
/// Returns [`Error::UnexpectedResponse`] when no candidate yields a valid
/// [`PlayabilityStatus`].
fn extract_playability_status(watch_html: &str) -> crate::Result<PlayabilityStatus> {
    static PLAYABILITY_STATUS: Lazy<Regex> = Lazy::new(||
        Regex::new(r#"["']?playabilityStatus["']?\s*[:=]\s*"#).unwrap()
    );

    PLAYABILITY_STATUS
        .find_iter(watch_html)
        .find_map(|key| {
            // Failures of individual candidates are intentionally ignored, so
            // there is no point in building error values for them.
            let rest = watch_html.get(key.end()..)?;
            let json = json_object(rest).ok()?;
            serde_json::from_str::<PlayabilityStatus>(json).ok()
        })
        .ok_or_else(|| Error::UnexpectedResponse(
            "watch html did not contain a PlayabilityStatus".into()
        ))
}
#[inline]
async fn get_video_info_and_js(
&self,
watch_html: &str,
is_age_restricted: bool,
) -> crate::Result<(VideoInfo, String)> {
let (js, player_response) = self.get_js(is_age_restricted, watch_html).await?;
let player_response = player_response.ok_or_else(|| Error::UnexpectedResponse(
"Could not acquire the player response from the watch html!\n\
It looks like YouTube changed it's API again :-/\n\
If this not yet reported, it would be great if you could file an issue:
(https://github.com/DzenanJupic/rustube/issues/new?assignees=&labels=youtube-api-changed&template=youtube_api_changed.yml).".into()
))?;
let video_info = VideoInfo {
player_response,
adaptive_fmts_raw: None,
is_age_restricted,
};
Ok((video_info, js))
}
/// Downloads the player JavaScript and returns it together with the player
/// response, if one was found.
///
/// For age restricted videos the embed page is downloaded and searched for the
/// JavaScript URL; otherwise `watch_html` is searched directly.
///
/// # Errors
/// Propagates request failures and failures to locate the JavaScript URL.
#[inline]
async fn get_js(
    &self,
    is_age_restricted: bool,
    watch_html: &str,
) -> crate::Result<(String, Option<PlayerResponse>)> {
    let (player_js_url, player_response) = if is_age_restricted {
        let embed_html = self.get_html(&self.video_id.embed_url()).await?;
        js_url(&embed_html)?
    } else {
        js_url(watch_html)?
    };

    let js = self.get_html(&player_js_url).await?;
    Ok((js, player_response))
}
#[inline]
#[allow(unused)]
async fn get_video_info(&self, is_age_restricted: bool) -> crate::Result<VideoInfo> {
let video_info_url = self.get_video_info_url(is_age_restricted);
let video_info_raw = self.get_html(&video_info_url).await?;
let mut video_info = serde_qs::from_str::<VideoInfo>(video_info_raw.as_str())?;
video_info.is_age_restricted = is_age_restricted;
Ok(video_info)
}
/// Builds the `get_video_info` URL for this video, choosing the age restricted
/// variant when `is_age_restricted` is set.
#[inline]
#[log_derive::logfn_inputs(Debug)]
#[log_derive::logfn(Trace, fmt = "get_video_info_url() => {}")]
fn get_video_info_url(&self, is_age_restricted: bool) -> Url {
    let id = self.video_id.as_borrowed();

    match is_age_restricted {
        true => video_info_url_age_restricted(id, &self.watch_url),
        false => video_info_url(id, &self.watch_url),
    }
}
/// Sends a GET request to `url` and returns the response body as text.
///
/// # Errors
/// Fails when the request cannot be sent, when the response carries an error
/// status, or when the body cannot be read as text.
#[inline]
#[log_derive::logfn_inputs(Debug)]
#[log_derive::logfn(ok = "Trace", err = "Error", fmt = "get_html() => `{}`")]
async fn get_html(&self, url: &Url) -> crate::Result<String> {
    let response = self.client.get(url.as_str()).send().await?;
    let body = response.error_for_status()?.text().await?;
    Ok(body)
}
}
/// Returns `true` when the watch page HTML contains the `og:restrictions:age`
/// marker.
#[inline]
fn is_age_restricted(watch_html: &str) -> bool {
    // The marker contains no regex meta characters, so a plain substring search
    // gives the same answer without compiling and caching a regex.
    watch_html.contains("og:restrictions:age")
}
/// Builds the `get_video_info` URL for a video that is not age restricted.
///
/// `watch_url` is passed along as the `eurl` query parameter.
#[inline]
fn video_info_url(video_id: Id<'_>, watch_url: &Url) -> Url {
    _video_info_url(&[
        ("video_id", video_id.as_str()),
        ("ps", "default"),
        ("eurl", watch_url.as_str()),
        ("hl", "en_US"),
        ("html5", "1"),
        ("c", "TVHTML5"),
        ("cver", "7.20211231"),
    ])
}
/// Builds the `get_video_info` URL for an age restricted video.
///
/// The `sts` parameter is taken from the first `"sts": <digits>` occurrence in
/// the textual form of `watch_url`, and is empty when there is none. The `eurl`
/// parameter points to `https://youtube.googleapis.com/v/<video_id>`.
///
/// NOTE(review): the `sts` pattern looks like it targets page source rather
/// than a URL, so it presumably never matches here — confirm whether the page
/// HTML was meant to be searched instead.
#[inline]
fn video_info_url_age_restricted(video_id: Id<'_>, watch_url: &Url) -> Url {
    static PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r#""sts"\s*:\s*(\d+)"#).unwrap());

    let sts = PATTERN
        .captures(watch_url.as_str())
        .and_then(|caps| caps.get(1))
        .map_or("", |digits| digits.as_str());
    let eurl = format!("https://youtube.googleapis.com/v/{}", video_id.as_str());

    _video_info_url(&[
        ("video_id", video_id.as_str()),
        ("eurl", eurl.as_str()),
        ("sts", sts),
        ("html5", "1"),
        ("c", "TVHTML5"),
        ("cver", "7.20211231"),
    ])
}
/// Appends `params` as query parameters to the `get_video_info` endpoint.
///
/// # Panics
/// Panics when the resulting URL cannot be parsed.
#[inline]
fn _video_info_url(params: &[(&str, &str)]) -> Url {
    const ENDPOINT: &str = "https://www.youtube.com/get_video_info?";

    let url = Url::parse_with_params(ENDPOINT, params);
    url.unwrap()
}
/// Determines the URL of the player JavaScript referenced by `html`.
///
/// The path is taken from the `assets.js` entry of the embedded player
/// response when available; otherwise `html` is scanned for a `base.js` path.
/// The player response, if it could be parsed, is returned alongside the URL.
///
/// # Errors
/// Fails when no JavaScript path can be found or the resulting URL is invalid.
#[inline]
fn js_url(html: &str) -> crate::Result<(Url, Option<PlayerResponse>)> {
    let config = get_ytplayer_config(html);

    let path_from_config = config
        .as_ref()
        .ok()
        .and_then(|player_response| player_response.assets.as_ref())
        .map(|assets| assets.js.as_str());
    let path = match path_from_config {
        Some(path) => path,
        None => get_ytplayer_js(html)?,
    };

    let url = Url::parse(&format!("https://youtube.com{path}"))?;
    Ok((url, config.ok()))
}
#[inline]
fn get_ytplayer_config(html: &str) -> crate::Result<PlayerResponse> {
static CONFIG_PATTERNS: Lazy<[Regex; 3]> = Lazy::new(|| [
Regex::new(r"ytplayer\.config\s*=\s*").unwrap(),
Regex::new(r"ytInitialPlayerResponse\s*=\s*").unwrap(),
Regex::new(r#"yt\.setConfig\(.*['"]PLAYER_CONFIG['"]:\s*"#).unwrap()
]);
CONFIG_PATTERNS
.iter()
.find_map(|pattern| {
let json = parse_for_object(html, pattern).ok()?;
deserialize_ytplayer_config(json).ok()
})
.ok_or_else(|| Error::UnexpectedResponse(
"Could not find ytplayer_config in the watch html.".into()
))
}
#[inline]
fn parse_for_object<'a>(html: &'a str, regex: &Regex) -> crate::Result<&'a str> {
let json_obj_start = regex
.find(html)
.ok_or(Error::Internal("The regex does not match"))?
.end();
json_object(
html
.get(json_obj_start..)
.ok_or(Error::Internal("The regex does not match meaningful"))?
)
}
/// Deserializes a player config JSON object into a [`PlayerResponse`].
///
/// Two layouts are accepted, tried in this order:
/// 1. the JSON is a [`PlayerResponse`] itself,
/// 2. the JSON is an object with the [`PlayerResponse`] nested under the key
///    `player_response`.
///
/// # Errors
/// Returns [`Error::JsonDeserialization`] when neither layout matches. The
/// message lists the error of each attempt under the label of the layout it
/// belongs to.
#[inline]
#[log_derive::logfn(Debug, fmt = "player response: {:?}")]
#[log_derive::logfn_inputs(Trace, fmt = "player response json: {:?}")]
fn deserialize_ytplayer_config(json: &str) -> crate::Result<PlayerResponse> {
    /// Layout in which the player response is nested under `player_response`.
    #[derive(Deserialize)]
    struct Args {
        player_response: PlayerResponse,
    }

    // Keep each error paired with the layout that produced it, so the combined
    // message below attributes them correctly.
    let pr_err = match serde_json::from_str::<PlayerResponse>(json) {
        Ok(player_response) => return Ok(player_response),
        Err(err) => err,
    };
    let args_err = match serde_json::from_str::<Args>(json) {
        Ok(args) => return Ok(args.player_response),
        Err(err) => err,
    };

    Err(Error::JsonDeserialization(serde::de::Error::custom(format_args!(
        "data did not match any variant of untagged enum PlayerConfig:\n\
        \tArgs:{args_err}\n\
        \tPlayerResponse:{pr_err}",
    ))))
}
#[inline]
fn get_ytplayer_js(html: &str) -> crate::Result<&str> {
static JS_URL_PATTERNS: Lazy<Regex> = Lazy::new(||
Regex::new(r"(/s/player/[\w\d]+/[\w\d_/.]+/base\.js)").unwrap()
);
match JS_URL_PATTERNS.captures(html) {
Some(function_match) => Ok(function_match.get(1).unwrap().as_str()),
None => Err(Error::UnexpectedResponse(
"could not extract the ytplayer-javascript url from the watch html".into()
))
}
}
#[inline]
fn json_object(mut html: &str) -> crate::Result<&str> {
html = html.trim_start_matches(|c| c != '{');
if html.is_empty() {
return Err(Error::Internal("cannot parse a json object from an empty string"));
}
let mut stack = vec![b'{'];
let mut skip = false;
let (i, _c) = html
.as_bytes()
.iter()
.enumerate()
.skip(1)
.find(
|(_i, &curr_char)| is_json_object_end(curr_char, &mut skip, &mut stack)
)
.ok_or(Error::Internal("could not find a closing delimiter"))?;
let full_obj = html
.get(..=i)
.expect("i must always mark the position of a valid '}' char");
Ok(full_obj)
}
/// Feeds one byte into the delimiter tracker and reports whether the outermost
/// object has just been closed.
///
/// `stack` holds the currently open delimiters (`{`, `[` or `"`), innermost
/// last. `skip` is set after a backslash inside a string so that the following
/// byte is ignored. Returns `true` once `stack` is empty.
///
/// # Panics
/// Panics when called with an empty `stack`.
#[inline]
fn is_json_object_end(curr_char: u8, skip: &mut bool, stack: &mut Vec<u8>) -> bool {
    if *skip {
        // This byte is escaped by a preceding backslash.
        *skip = false;
        return false;
    }

    let context = *stack
        .last()
        .expect("stack must start with len == 1, and search must end, when len == 0");
    let in_string = context == b'"';

    match (in_string, curr_char) {
        // Inside a string only the closing quote and escapes matter.
        (true, b'"') => { stack.pop(); }
        (true, b'\\') => { *skip = true; }
        (true, _) => {}
        // Outside of strings every opening delimiter starts a new context ...
        (false, b'{' | b'[' | b'"') => stack.push(curr_char),
        // ... and a closing delimiter only ends the context it belongs to.
        (false, b'}') if context == b'{' => { stack.pop(); }
        (false, b']') if context == b'[' => { stack.pop(); }
        (false, _) => {}
    }

    stack.is_empty()
}
/// Returns a cookie jar holding the `CONSENT` cookie for `https://youtube.com`.
///
/// # Panics
/// Panics when the hard-coded URL cannot be parsed.
pub fn recommended_cookies() -> reqwest::cookie::Jar {
    const CONSENT_COOKIE: &str =
        "CONSENT=YES+; Path=/; Domain=youtube.com; Secure; Expires=Fri, 01 Jan 2038 00:00:00 GMT;";
    const COOKIE_ORIGIN: &str = "https://youtube.com";

    let jar = reqwest::cookie::Jar::default();
    let origin = COOKIE_ORIGIN.parse().unwrap();
    jar.add_cookie_str(CONSENT_COOKIE, &origin);
    jar
}
/// Returns the default request headers: `Accept-Language: en-US,en` and
/// `User-Agent: Mozilla/5.0`.
///
/// # Panics
/// Panics when one of the hard-coded values is not a valid header value.
pub fn recommended_headers() -> reqwest::header::HeaderMap {
    use reqwest::header::{HeaderMap, ACCEPT_LANGUAGE, USER_AGENT};

    let defaults = [
        (ACCEPT_LANGUAGE, "en-US,en"),
        (USER_AGENT, "Mozilla/5.0"),
    ];

    let mut headers = HeaderMap::new();
    for (name, value) in defaults {
        headers.insert(name, value.parse().unwrap());
    }
    headers
}