use crate::prelude::*;
use std::borrow::Borrow;
use std::borrow::Cow;
use std::collections::HashMap;
use std::default::Default;
use std::io::Read;
use html5ever::tendril::*;
use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
use html5ever::{expanded_name, local_name, namespace_url, ns, parse_document};
use html5ever::{Attribute, ExpandedName, LocalNameStaticSet, QualName};
use string_cache::Atom;
use super::project_info::{ArtifactInfo, DistInfoMetadata, ProjectInfo, Yanked};
// Expanded names of the HTML elements that `Sink::create_element` reacts to:
// `<meta>`, `<base>` and `<a>`.
const META_TAG: ExpandedName = expanded_name!(html "meta");
const BASE_TAG: ExpandedName = expanded_name!(html "base");
const A_TAG: ExpandedName = expanded_name!(html "a");
// Local names of the attributes read from those elements, built with the
// `local_name!` macro.
const HREF_ATTR: Atom<LocalNameStaticSet> = html5ever::local_name!("href");
const NAME_ATTR: Atom<LocalNameStaticSet> = html5ever::local_name!("name");
const CONTENT_ATTR: Atom<LocalNameStaticSet> = html5ever::local_name!("content");
// Local names of the `data-*` attributes read from `<a>` elements. These are
// created on first use via `Atom::from` instead of `local_name!` — presumably
// because they are not among html5ever's predefined names (TODO confirm).
static REQUIRES_PYTHON_ATTR: Lazy<Atom<LocalNameStaticSet>> =
Lazy::new(|| Atom::from("data-requires-python"));
static YANKED_ATTR: Lazy<Atom<LocalNameStaticSet>> =
Lazy::new(|| Atom::from("data-yanked"));
static DATA_DIST_INFO_METADATA: Lazy<Atom<LocalNameStaticSet>> =
Lazy::new(|| Atom::from("data-dist-info-metadata"));
/// Parser state used as the html5ever `TreeSink`: it does not build a DOM, it
/// only accumulates the `ProjectInfo` extracted from elements as they are created.
struct Sink {
/// Next node handle that `get_id` will hand out.
next_id: usize,
/// Qualified name of every element created so far, keyed by its handle;
/// consulted by `elem_name`.
names: HashMap<usize, QualName>,
/// URL that link `href`s are joined onto. Starts as the URL given to
/// `parse_html` and may be replaced when a `<base>` element is handled.
base: Url,
/// Set by `create_element` while handling a `<base>` element; once set,
/// further `<base>` elements are ignored.
changed_base: bool,
/// Result being accumulated: repository version plus the artifacts found.
project_info: ProjectInfo,
}
impl Sink {
    /// Hands out the current node handle and advances the counter.
    ///
    /// The counter moves in steps of two, so the value directly after each
    /// issued handle is never issued here; `get_template_contents` returns
    /// `target + 1`, which falls into exactly that gap.
    fn get_id(&mut self) -> usize {
        let following = self.next_id + 2;
        std::mem::replace(&mut self.next_id, following)
    }
}
/// Looks up an attribute by local name and returns its value.
///
/// * `name` – local name to search for; namespace and prefix of the attribute
///   are not compared.
/// * `attrs` – attributes of a single element. Taken as a slice so that any
///   contiguous attribute storage works; existing callers passing
///   `&Vec<Attribute>` keep compiling through deref coercion.
///
/// Returns the value of the first attribute whose local name equals `name`,
/// borrowed from `attrs`, or `None` if there is no such attribute.
fn get_attr<'a>(
    name: &Atom<LocalNameStaticSet>,
    attrs: &'a [Attribute],
) -> Option<&'a str> {
    attrs
        .iter()
        .find(|attr| attr.name.local == *name)
        .map(|attr| attr.value.as_ref())
}
/// Parses a `<mode>=<hex>` string into an `ArtifactHash`.
///
/// The input is split at the first `=`; the left part is passed to
/// `ArtifactHash::from_hex` as the mode and the right part as the hex digest.
/// Returns `None` when there is no `=` or when `from_hex` reports an error.
fn parse_hash(s: &str) -> Option<ArtifactHash> {
    let (mode, hex) = s.split_once('=')?;
    ArtifactHash::from_hex(mode, hex).ok()
}
impl Sink {
    /// Turns one `<a href=…>` element into the artifact entries it describes.
    ///
    /// * `url_str` – raw `href` value; it is joined onto the current `base`.
    /// * `attrs` – attributes of the same element, searched for
    ///   `data-requires-python`, `data-dist-info-metadata` and `data-yanked`.
    ///
    /// Returns `None` when the href cannot be joined onto the base, when the
    /// resulting URL has no path segments, or when its last path segment does
    /// not convert into an `ArtifactName`. Otherwise returns one `ArtifactInfo`
    /// per name produced by `split_multiplatform_pybis`; all of them share the
    /// same URL, hash and attribute-derived metadata.
    fn try_parse_link(
        &self,
        url_str: &str,
        attrs: &Vec<Attribute>,
    ) -> Option<Vec<ArtifactInfo>> {
        let url = self.base.join(url_str).ok()?;

        // NOTE(review): `Url` hands back path segments in percent-encoded form;
        // confirm that `ArtifactName` parsing copes with escapes such as `%2B`.
        let filename = url.path_segments()?.next_back()?;
        let name: ArtifactName = filename.try_into().ok()?;

        // The URL fragment, when it has the `<mode>=<hex>` shape, is the artifact hash.
        let hash = url.fragment().and_then(parse_hash);

        let requires_python =
            get_attr(REQUIRES_PYTHON_ATTR.borrow(), attrs).map(String::from);

        // Presence of the attribute means metadata is available. The literal
        // value "true" carries no hash; any other value is parsed as one.
        let metadata_attr = get_attr(DATA_DIST_INFO_METADATA.borrow(), attrs);
        let dist_info_metadata = DistInfoMetadata {
            available: metadata_attr.is_some(),
            hash: metadata_attr
                .filter(|value| *value != "true")
                .and_then(parse_hash),
        };

        // Presence of the attribute marks the file as yanked; its value is the reason.
        let yanked_reason = get_attr(YANKED_ATTR.borrow(), attrs);
        let yanked = Yanked {
            yanked: yanked_reason.is_some(),
            reason: yanked_reason.map(|reason| reason.into()),
        };

        let artifacts: Vec<ArtifactInfo> = name
            .split_multiplatform_pybis()
            .into_iter()
            .map(|name| ArtifactInfo {
                name,
                url: url.clone(),
                hash: hash.clone(),
                requires_python: requires_python.clone(),
                dist_info_metadata: dist_info_metadata.clone(),
                yanked: yanked.clone(),
            })
            .collect();
        Some(artifacts)
    }
}
/// `TreeSink` implementation that extracts project data instead of building a DOM.
///
/// Node handles are plain `usize` ids. All extraction happens in
/// `create_element`; the tree-mutation callbacks are intentionally empty
/// because no tree is kept.
impl TreeSink for Sink {
    type Handle = usize;
    type Output = Self;

    /// Inspects each element as the parser creates it and records what matters:
    ///
    /// * `<meta name="pypi:repository-version" content=…>` sets the repository version.
    /// * The first `<base>` element that carries an `href` replaces the base URL
    ///   used for later links. A `<base>` without `href` is skipped and does not
    ///   stop a later one from taking effect. This matches the HTML standard's
    ///   definition of the document base URL.
    /// * `<a href=…>` is handed to `try_parse_link`; any artifacts it yields are
    ///   appended to the result.
    ///
    /// Every element, relevant or not, gets a fresh handle and has its name
    /// remembered for `elem_name`.
    fn create_element(
        &mut self,
        name: QualName,
        attrs: Vec<Attribute>,
        _: ElementFlags,
    ) -> usize {
        let tag = name.expanded();
        if tag == META_TAG {
            if let Some("pypi:repository-version") = get_attr(&NAME_ATTR, &attrs) {
                if let Some(version) = get_attr(&CONTENT_ATTR, &attrs) {
                    self.project_info.meta.version = version.into();
                }
            }
        } else if tag == BASE_TAG {
            if !self.changed_base {
                if let Some(new_base_str) = get_attr(&HREF_ATTR, &attrs) {
                    // Latch only once an href has been seen. If that href does
                    // not parse, the original base stays in effect, but later
                    // <base> elements are still ignored.
                    self.changed_base = true;
                    if let Ok(new_base) = self.base.join(new_base_str) {
                        self.base = new_base;
                    }
                }
            }
        } else if tag == A_TAG {
            if let Some(url_str) = get_attr(&HREF_ATTR, &attrs) {
                if let Some(artifact_infos) = self.try_parse_link(url_str, &attrs) {
                    self.project_info.artifacts.extend(artifact_infos);
                }
            }
        }
        let id = self.get_id();
        self.names.insert(id, name);
        id
    }

    /// The sink itself is the parse output; the caller reads `project_info` from it.
    fn finish(self) -> Self {
        self
    }

    /// The document node always has handle `0`.
    fn get_document(&mut self) -> usize {
        0
    }

    /// The contents of a template use the handle directly after `target`,
    /// which `get_id` never issues because it advances in steps of two.
    fn get_template_contents(&mut self, target: &usize) -> usize {
        target + 1
    }

    /// Two handles denote the same node exactly when they are equal.
    fn same_node(&self, x: &usize, y: &usize) -> bool {
        x == y
    }

    /// Returns the name recorded for `target` by `create_element`.
    ///
    /// # Panics
    /// Panics if `target` is not the handle of an element created by this sink.
    fn elem_name(&self, target: &usize) -> ExpandedName {
        self.names.get(target).expect("not an element").expanded()
    }

    /// Comments carry no information needed here; they only consume a handle.
    fn create_comment(&mut self, _text: StrTendril) -> usize {
        self.get_id()
    }

    /// NOTE(review): treated as unreachable — presumably the HTML parser never
    /// creates processing instructions; confirm against html5ever.
    fn create_pi(&mut self, _target: StrTendril, _value: StrTendril) -> usize {
        unreachable!()
    }

    // Everything below is a tree-mutation or bookkeeping callback. No tree is
    // stored, so all of them deliberately do nothing.

    fn append_before_sibling(
        &mut self,
        _sibling: &usize,
        _new_node: NodeOrText<usize>,
    ) {
    }

    fn append_based_on_parent_node(
        &mut self,
        _element: &usize,
        _prev_element: &usize,
        _new_node: NodeOrText<usize>,
    ) {
    }

    fn parse_error(&mut self, _msg: Cow<'static, str>) {}

    fn set_quirks_mode(&mut self, _mode: QuirksMode) {}

    fn append(&mut self, _parent: &usize, _child: NodeOrText<usize>) {}

    fn append_doctype_to_document(
        &mut self,
        _: StrTendril,
        _: StrTendril,
        _: StrTendril,
    ) {
    }

    fn add_attrs_if_missing(&mut self, _target: &usize, _attrs: Vec<Attribute>) {}

    fn remove_from_parent(&mut self, _target: &usize) {}

    fn reparent_children(&mut self, _node: &usize, _new_parent: &usize) {}

    fn mark_script_already_started(&mut self, _node: &usize) {}
}
/// Parses a simple-API HTML page into a `ProjectInfo`.
///
/// * `url` – URL of the page; used as the initial base for resolving links.
/// * `content_type` – value of the response's `Content-Type`; its type and
///   subtype must be `text` and `html`. Parameters are not inspected.
/// * `body` – reader yielding the page bytes, fed to the parser through its
///   `from_utf8` adapter.
///
/// # Errors
/// Returns an error if `content_type` cannot be parsed as a MIME type, if it
/// is not `text/html`, or if reading from `body` fails.
pub fn parse_html<T>(url: &Url, content_type: &str, mut body: T) -> Result<ProjectInfo>
where
    T: Read,
{
    let mime_type: mime::Mime = content_type.parse()?;
    let is_html =
        mime_type.type_().as_str() == "text" && mime_type.subtype().as_str() == "html";
    if !is_html {
        bail!(
            "simple API page expected Content-Type: text/html, but got {}",
            mime_type,
        );
    }

    // Handle 0 is reserved for the document node, so element ids start at 1.
    let sink = Sink {
        next_id: 1,
        names: HashMap::new(),
        base: url.clone(),
        changed_base: false,
        project_info: Default::default(),
    };

    let finished = parse_document(sink, Default::default())
        .from_utf8()
        .read_from(&mut body)?;
    Ok(finished.project_info)
}
#[cfg(test)]
mod test {
use super::*;
// End-to-end check of `parse_html` on a small page. It exercises:
//  * the `pypi:repository-version` <meta> tag,
//  * a <base href> that replaces the URL passed in ("old-base" -> "new-base"),
//  * a relative link whose fragment carries a sha256 hash,
//  * an absolute-path link with `data-yanked`,
//  * a relative link with `data-requires-python`.
// The expected result is an inline insta snapshot in RON form.
#[test]
fn test_sink_simple() {
let parsed = parse_html(
&Url::parse("https://example.com/old-base/").unwrap(),
"text/html",
br#"<html>
<head>
<meta name="pypi:repository-version" content="1.0">
<base href="https://example.com/new-base/">
</head>
<body>
<a href="link1-1.0.tar.gz#sha256=0000000000000000000000000000000000000000000000000000000000000000">link1</a>
<a href="/elsewhere/link2-2.0.zip" data-yanked="some reason">link2</a>
<a href="link3-3.0.tar.gz" data-requires-python=">= 3.17">link3</a>
</body>
</html>
"# as &[u8],
).unwrap();
insta::assert_ron_snapshot!(parsed, @r###"
ProjectInfo(
meta: Meta(
version: "1.0",
),
artifacts: [
ArtifactInfo(
name: "link1-1.0.tar.gz",
url: "https://example.com/new-base/link1-1.0.tar.gz#sha256=0000000000000000000000000000000000000000000000000000000000000000",
hash: Some("sha256=0000000000000000000000000000000000000000000000000000000000000000"),
requires_python: None,
dist_info_metadata: DistInfoMetadata(
available: false,
hash: None,
),
yanked: Yanked(
yanked: false,
reason: None,
),
),
ArtifactInfo(
name: "link2-2.0.zip",
url: "https://example.com/elsewhere/link2-2.0.zip",
hash: None,
requires_python: None,
dist_info_metadata: DistInfoMetadata(
available: false,
hash: None,
),
yanked: Yanked(
yanked: true,
reason: Some("some reason"),
),
),
ArtifactInfo(
name: "link3-3.0.tar.gz",
url: "https://example.com/new-base/link3-3.0.tar.gz",
hash: None,
requires_python: Some(">= 3.17"),
dist_info_metadata: DistInfoMetadata(
available: false,
hash: None,
),
yanked: Yanked(
yanked: false,
reason: None,
),
),
],
)
"###);
}
}