1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#[allow(unused_imports)]
use crate::internal_prelude::*;
use crate::types as rt;

/// Outcome of running a filter: `Ok(())` on success, otherwise the error carried by
/// `rt::ExtResult`.
pub type Result = rt::ExtResult<()>;

/// A hook that is handed a task together with its HTTP status and a reader, and reports a
/// [`Result`].
///
/// `JS` and `TS` are the job-level and task-level state types carried by the job context.
pub trait Filter<JS, TS>
where
	JS: rt::JobStateValues,
	TS: rt::TaskStateValues,
{
	/// Name of this filter; `"no name"` unless the implementor overrides it.
	fn name(&self) -> String {
		"no name".to_owned()
	}

	/// Runs the filter for a single task.
	///
	/// * `ctx` - job context shared with the filter.
	/// * `task` - the task being processed.
	/// * `status` - HTTP status information for the task (including headers).
	/// * `reader` - owned reader handed to the filter; presumably yields the response body.
	///
	/// NOTE(review): the meaning of an `Err` return (reject the task vs. abort the job) is
	/// decided by the caller, which is not visible here.
	fn accept(
		&self,
		ctx: &rt::JobCtx<JS, TS>,
		task: &rt::Task,
		status: &rt::HttpStatus,
		reader: Box<dyn io::Read + Sync + Send>,
	) -> Result;
}

/// Filter that captures the body of a plain-text `robots.txt` response and stores it in the
/// job's shared state under the `"robots"` key.
///
/// The filter itself holds no state.
#[derive(Default)]
pub struct RobotsTxt {}

/// Captures the body of a `robots.txt` response and publishes it through `ctx.shared`.
impl<JS: rt::JobStateValues, TS: rt::TaskStateValues> Filter<JS, TS> for RobotsTxt {
	/// Stores the response body under the `"robots"` key of `ctx.shared` when the task URL
	/// ends with `robots.txt` and the response media type is `text/plain`.
	///
	/// Any other response is accepted untouched and `reader` is left unread.
	///
	/// # Errors
	///
	/// Returns an error with the context `cannot read robots.txt` when the body cannot be
	/// read into a `String`.
	///
	/// # Panics
	///
	/// Panics if `ctx.shared.lock()` fails (for example, a poisoned lock).
	fn accept(
		&self,
		ctx: &rt::JobCtx<JS, TS>,
		task: &rt::Task,
		status: &rt::HttpStatus,
		mut reader: Box<dyn io::Read + Sync + Send>,
	) -> Result {
		// NOTE(review): a plain suffix match also accepts URLs such as `/norobots.txt`;
		// left unchanged so the set of inspected tasks stays the same.
		if !task.link.url.as_str().ends_with("robots.txt") {
			return Ok(());
		}

		// A missing header, or one that cannot be viewed as a string, counts as "no content type".
		let content_type = status
			.headers
			.get(http::header::CONTENT_TYPE)
			.and_then(|value| value.to_str().ok())
			.unwrap_or("");

		// Compare only the media type: servers routinely append parameters
		// (`text/plain; charset=utf-8`), and those must not cause the file to be skipped.
		let media_type = content_type.split(';').next().unwrap_or("").trim();
		if !media_type.eq_ignore_ascii_case("text/plain") {
			return Ok(());
		}

		let mut content = String::new();
		reader.read_to_string(&mut content).context("cannot read robots.txt")?;

		ctx.shared.lock().unwrap().insert(String::from("robots"), Box::new(content));

		Ok(())
	}
}

impl RobotsTxt {
	/// Creates a new `RobotsTxt` filter; equivalent to `RobotsTxt::default()`.
	pub fn new() -> Self {
		Self {}
	}
}