1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
// The prelude is glob-imported, so not every name it brings in is used here.
#[allow(unused_imports)]
use crate::internal_prelude::*;
use crate::types as rt;

/// Return type of [`Filter::accept`]; an alias for `rt::ExtResult<()>`.
pub type Result = rt::ExtResult<()>;

/// A hook that is handed a task together with its HTTP status and a reader.
///
/// `JS` and `TS` are the job-level and task-level state types carried by the
/// job context.
pub trait Filter<JS: rt::JobStateValues, TS: rt::TaskStateValues> {
    /// Human-readable name of the filter. Defaults to `"no name"`.
    fn name(&self) -> String {
        String::from("no name")
    }

    /// Inspects one task.
    ///
    /// * `ctx` - job context the filter may read from or write to.
    /// * `task` - the task being inspected.
    /// * `status` - HTTP status information, including headers.
    /// * `reader` - byte source associated with the task
    ///   (presumably the response body; confirm against the caller).
    fn accept(
        &self,
        ctx: &rt::JobCtx<JS, TS>,
        task: &rt::Task,
        status: &rt::HttpStatus,
        reader: Box<dyn io::Read + Sync + Send>,
    ) -> Result;
}

/// Filter that captures the text of `robots.txt` into the shared job storage.
#[derive(Default)]
pub struct RobotsTxt {}

impl<JS: rt::JobStateValues, TS: rt::TaskStateValues> Filter<JS, TS> for RobotsTxt {
    /// Stores the content of a plain-text `robots.txt` under the `"robots"`
    /// key of `ctx.shared`.
    ///
    /// Tasks whose URL does not end with `robots.txt`, or whose `Content-Type`
    /// media type is not `text/plain`, are ignored and yield `Ok(())`.
    ///
    /// # Errors
    ///
    /// Fails with the context `"cannot read robots.txt"` when the reader
    /// cannot be read into a UTF-8 string.
    ///
    /// # Panics
    ///
    /// Panics if locking `ctx.shared` fails.
    fn accept(
        &self,
        ctx: &rt::JobCtx<JS, TS>,
        task: &rt::Task,
        status: &rt::HttpStatus,
        mut reader: Box<dyn io::Read + Sync + Send>,
    ) -> Result {
        // NOTE(review): a suffix match on the whole URL also accepts paths such
        // as `/not-robots.txt` and rejects `/robots.txt?x=1` - confirm intent.
        if !task.link.url.as_str().ends_with("robots.txt") {
            return Ok(());
        }

        // A missing header or one that is not valid text is treated as empty.
        let content_type = status
            .headers
            .get(http::header::CONTENT_TYPE)
            .and_then(|value| value.to_str().ok())
            .unwrap_or("");
        if !Self::is_plain_text(content_type) {
            return Ok(());
        }

        let mut content = String::new();
        reader
            .read_to_string(&mut content)
            .context("cannot read robots.txt")?;

        ctx.shared
            .lock()
            .unwrap()
            .insert(String::from("robots"), Box::new(content));
        Ok(())
    }
}

impl RobotsTxt {
    /// Creates a new `RobotsTxt` filter.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns `true` when the media type of a `Content-Type` header value is
    /// `text/plain`.
    ///
    /// Only the part before the first `;` is compared, so parameters such as
    /// `; charset=utf-8` do not prevent a match. Surrounding whitespace and
    /// ASCII case are ignored.
    fn is_plain_text(content_type: &str) -> bool {
        content_type
            .split(';')
            .next()
            .map_or(false, |media_type| {
                media_type.trim().eq_ignore_ascii_case("text/plain")
            })
    }
}