use super::dom;
use super::error::Error;
use super::scorer;
use crate::rcdom::{RcDom, SerializableHandle};
use html5ever::tendril::stream::TendrilSink;
use html5ever::{parse_document, serialize};
use std::cell::Cell;
use std::collections::BTreeMap;
use std::default::Default;
use std::io::Read;
use std::path::Path;
use url::Url;
#[cfg(feature = "tokio")]
use html5ever::tendril::stream::Utf8LossyDecoder;
#[cfg(feature = "tokio")]
use html5ever::tendril::{fmt as tendril_fmt, Tendril};
#[cfg(feature = "tokio")]
use html5ever::Parser;
#[cfg(feature = "tokio")]
use tokio::io::{AsyncRead, AsyncReadExt};
/// Byte count that paces cooperative yielding in the async entry points.
///
/// `extract_async` parses inputs shorter than this in a single call without
/// yielding, and `extract_async_reader` yields to the scheduler once at least
/// this many bytes have been fed to the parser since its previous yield.
#[cfg(feature = "tokio")]
pub const ASYNC_BYTE_THRESHOLD: usize = 128 * 1024;
/// Size in bytes of each slice handed to the parser by `extract_async` on its
/// chunked path, and of the read buffer allocated by `extract_async_reader`.
#[cfg(feature = "tokio")]
const STREAM_READ_CHUNK: usize = 32 * 1024;
/// Output of an extraction run, as assembled by `process_dom`.
#[derive(Debug)]
pub struct Product {
/// Complete HTML page: a generated `<html>`/`<head>` preamble followed by the
/// serialized markup of the selected node and the closing `</body></html>`.
pub content: String,
/// Text collected from the selected node via `dom::extract_text`.
pub text: String,
}
/// Parses an HTML document from a blocking reader and extracts its main content.
///
/// The bytes read from `input` are decoded as UTF-8 while being fed to the
/// html5ever document parser; the resulting DOM is then handed to
/// [`process_dom`] together with `url`.
///
/// # Errors
///
/// Returns an error if reading from `input` fails, or if [`process_dom`] fails.
pub fn extract<R>(input: &mut R, url: &Url) -> Result<Product, Error>
where
    R: Read,
{
    let parser = parse_document(RcDom::default(), Default::default());
    let parsed = parser.from_utf8().read_from(input)?;
    process_dom(parsed, url)
}
/// Escapes the characters that are significant in HTML text and attribute values.
fn escape_html(raw: &str) -> String {
    let mut escaped = String::with_capacity(raw.len());
    for ch in raw.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}

/// Selects the best-scoring node of an already parsed document and renders it
/// as a standalone reader page.
///
/// Steps, in order:
/// 1. `scorer::preprocess` runs over the document and fills in the title and
///    language strings.
/// 2. `scorer::find_candidates` collects candidates starting from the path `/`.
/// 3. Every candidate's score is multiplied by `1 - link_density` and stored
///    back; the candidate with the greatest adjusted score wins. The document
///    node (score `0.0`) is the fallback when no candidate beats it.
/// 4. The winner is cleaned with `scorer::clean`, serialized, and wrapped in a
///    generated `<html>`/`<head>` preamble.
///
/// The title, language and base URL are HTML-escaped before being written into
/// the preamble, because they originate from the (untrusted) input document
/// and caller-supplied URL.
///
/// # Errors
///
/// Returns an error if serializing the selected node fails.
pub fn process_dom(mut dom: RcDom, url: &Url) -> Result<Product, Error> {
    let mut candidates = BTreeMap::new();
    let mut nodes = BTreeMap::new();
    let mut id: &str = "/";
    let mut title = String::new();
    let mut lang = String::new();

    let handle = dom.document.clone();
    scorer::preprocess(&mut dom, &handle, &mut title, &mut lang);
    scorer::find_candidates(Path::new(id), &handle, &mut candidates, &mut nodes);

    // Fallback winner: the document itself, with a neutral score.
    let mut top_candidate: &scorer::Candidate = &scorer::Candidate {
        node: handle,
        score: Cell::new(0.0),
    };
    for (key, candidate) in candidates.iter() {
        let score = candidate.score.get() * (1.0 - scorer::get_link_density(&candidate.node));
        candidate.score.set(score);
        // Written as `<=` + `continue` on purpose: this keeps the original
        // comparison semantics (ties keep the earlier winner).
        if score <= top_candidate.score.get() {
            continue;
        }
        id = key;
        top_candidate = candidate;
    }

    let node = &top_candidate.node;
    scorer::clean(&mut dom, Path::new(id), node, url, &candidates);

    let mut bytes: Vec<u8> = Vec::new();
    // Surface serialization failures instead of returning a truncated page.
    serialize(
        &mut bytes,
        &SerializableHandle::from(node.clone()),
        Default::default(),
    )?;
    let content = auto_encoder::auto_encode_bytes(bytes.as_slice());

    let mut text = String::new();
    dom::extract_text(node, &mut text, true);

    let lang_attr = if lang.is_empty() {
        String::new()
    } else {
        format!(r#" lang="{}""#, escape_html(&lang))
    };
    let title_tag = if title.is_empty() {
        String::new()
    } else {
        format!("<title>{}</title>", escape_html(&title))
    };
    let base_href = escape_html(url.as_str());

    // The raw string's lines start at column 0 so the emitted bytes stay
    // exactly as before (apart from the escaped values).
    let html_content = format!(
        r#"<html class="paper"{lang_attr}><head>
<meta name="disabled-adaptations" content="watch">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="viewport" content="initial-scale=1">
<base href="{base_href}">
{title_tag}
<script>window.isReaderPage = true;</script>
</head><body>
"#
    );
    let formatted_content = format!("{}{}</body></html>", html_content, content);

    Ok(Product {
        content: formatted_content,
        text,
    })
}
/// Extracts the main content from an in-memory HTML document, yielding to the
/// scheduler while parsing large inputs.
///
/// Inputs shorter than [`ASYNC_BYTE_THRESHOLD`] are fed to the parser in one
/// call. Larger inputs are fed in slices of at most `STREAM_READ_CHUNK` bytes,
/// with a `tokio::task::yield_now` after every slice. In both cases the
/// finished DOM is passed to [`process_dom`].
///
/// # Errors
///
/// Returns whatever error [`process_dom`] reports.
#[cfg(feature = "tokio")]
pub async fn extract_async(bytes: Vec<u8>, url: Url) -> Result<Product, Error> {
    let mut parser: Utf8LossyDecoder<Parser<RcDom>> =
        parse_document(RcDom::default(), Default::default()).from_utf8();

    if bytes.len() >= ASYNC_BYTE_THRESHOLD {
        for piece in bytes.chunks(STREAM_READ_CHUNK) {
            parser.process(Tendril::<tendril_fmt::Bytes>::from_slice(piece));
            tokio::task::yield_now().await;
        }
    } else {
        parser.process(Tendril::<tendril_fmt::Bytes>::from_slice(&bytes));
    }

    process_dom(parser.finish(), &url)
}
/// Extracts the main content from HTML read incrementally from an async reader.
///
/// Data is read into a `STREAM_READ_CHUNK`-sized buffer and each filled portion
/// is fed to the parser until the reader reports zero bytes. Once at least
/// [`ASYNC_BYTE_THRESHOLD`] bytes have been parsed since the previous yield,
/// the task yields via `tokio::task::yield_now` and the counter restarts. The
/// finished DOM is then passed to [`process_dom`].
///
/// # Errors
///
/// Returns an error if a read fails, or if [`process_dom`] fails.
#[cfg(feature = "tokio")]
pub async fn extract_async_reader<R>(mut reader: R, url: Url) -> Result<Product, Error>
where
    R: AsyncRead + Unpin,
{
    let mut parser: Utf8LossyDecoder<Parser<RcDom>> =
        parse_document(RcDom::default(), Default::default()).from_utf8();
    let mut chunk = vec![0u8; STREAM_READ_CHUNK];
    let mut pending = 0usize;

    loop {
        let filled = match reader.read(&mut chunk).await? {
            0 => break,
            count => count,
        };
        parser.process(Tendril::<tendril_fmt::Bytes>::from_slice(&chunk[..filled]));

        pending += filled;
        if pending < ASYNC_BYTE_THRESHOLD {
            continue;
        }
        tokio::task::yield_now().await;
        pending = 0;
    }

    process_dom(parser.finish(), &url)
}