use anyhow::{Context, Result};
use headless_chrome::LaunchOptions;
use std::{
collections::HashSet,
fs::File,
io::{copy, Cursor},
sync::{Arc, Mutex},
};
use scraper::Html;
use url::Url;
use super::{extract, Browser, ScrapyError};
use crate::cli::Commands;
use crate::topology;
pub async fn download_img(url: &Url) -> Result<()> {
println!("Download image");
let (_, file_name) = url.path().rsplit_once('/').context("error on split")?;
let response = reqwest::get(url.as_str()).await?;
let mut file = File::create(file_name)?;
let mut content = Cursor::new(response.bytes().await?);
copy(&mut content, &mut file)?;
Ok(())
}
impl Browser {
pub fn new_navigate(url: &Url) -> Result<Self, ScrapyError> {
let browser = headless_chrome::Browser::new(
LaunchOptions::default_builder()
.devtools(false)
.build()
.map_err(|e| ScrapyError::Browser(e.to_string()))?,
)?;
let tab = browser.new_tab()?;
tab.navigate_to(url.as_str())?;
tab.wait_until_navigated()?;
Ok(Self { browser, tab })
}
pub async fn parse_document(
self,
cmd: Commands,
parent: &Arc<Mutex<topology::Node>>,
) -> HashSet<Url> {
let response = self.tab.get_content().unwrap();
let document = Html::parse_document(&response);
let links = extract::extract_links(&parent.lock().unwrap().url, &document);
match cmd {
Commands::Texts => {
extract::extract_texts(parent, &document);
parent.lock().unwrap().handle_texts();
}
Commands::Comments => {
extract::extract_comments(parent, &document);
parent.lock().unwrap().handle_comments();
}
Commands::Images => {
extract::extract_images(parent, &document);
parent.lock().unwrap().handle_images().await;
}
Commands::Links => {}
Commands::Graph => {
extract::extract_texts(parent, &document);
extract::extract_comments(parent, &document);
extract::extract_images(parent, &document);
}
};
links
}
}