use kalosm::language::*;
use std::future::Future;
use std::io::Write;
use std::pin::Pin;
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::sync::Arc;
#[tokio::main]
async fn main() {
let real_visited = Arc::new(AtomicUsize::new(0));
Page::crawl(
Url::parse("https://dioxuslabs.com/learn/0.5/").unwrap(),
BrowserMode::Static,
move |page: Page| {
let real_visited = real_visited.clone();
Box::pin(async move {
let visited = real_visited.fetch_add(1, Ordering::SeqCst);
if page.url().domain() != Some("dioxuslabs.com") {
return CrawlFeedback::follow_none();
}
let path_prefix = "/learn/0.5/";
if !page.url().path().starts_with(path_prefix) {
return CrawlFeedback::follow_none();
}
let Ok(mut document) = page.html().await else {
return CrawlFeedback::follow_none();
};
let original_length = document.html().len();
let mut simplifier = HtmlSimplifier::default();
simplifier.simplify(&mut document);
let simplified = document.html();
let simplified_length = simplified.len();
let percentage_decrease =
(original_length - simplified_length) as f32 / original_length as f32;
println!(
"simplifing {} -{:.3}% from {:?} to {:?}",
page.url(),
percentage_decrease,
original_length,
simplified_length
);
let _ = std::fs::create_dir_all("scraped");
if let Ok(mut file) = std::fs::File::create(format!("scraped/{visited}.html")) {
_ = file.write_all(simplified.as_bytes());
}
CrawlFeedback::follow_all()
}) as Pin<Box<dyn Future<Output = CrawlFeedback>>>
},
)
.await;
}