use std::io::Read;
use crate::{
http::{field::MediaType, MessageReader},
io::ComboReader,
};
use super::{HeaderMapExt, HeaderMetadata};
/// Decides, from a record's header metadata, whether that record can be handled.
///
/// [`ExtractorDispatcher`] stores each classifier next to an extractor factory
/// and uses the factory of the first classifier that accepts a record.
pub trait Classifier {
/// Returns `true` if the record described by `metadata` is accepted.
fn can_accept(&self, metadata: &HeaderMetadata) -> bool;
}
/// A [`Read`] adapter that wraps a source `S` and can hand that source back.
///
/// The `*_box` methods mirror their by-value counterparts but take
/// `Box<Self>`, which lets them be called on a `Box<dyn Extractor<S>>` (this is
/// how [`ExtractorDispatcher`] uses them).
pub trait Extractor<S: Read>: Read {
/// Returns a shared reference to the wrapped source.
fn get_ref(&self) -> &S;
/// Returns a mutable reference to the wrapped source.
fn get_mut(&mut self) -> &mut S;
/// Consumes the extractor and returns the wrapped source.
fn into_inner(self) -> S;
/// Boxed-receiver form of [`Extractor::into_inner`].
fn into_inner_box(self: Box<Self>) -> S;
/// Consumes the extractor and returns the wrapped source, reporting any
/// error from implementation-specific finalization (for example,
/// `HTTPExtractor` calls `end_message` on its reader first).
fn finish(self) -> Result<S, crate::error::Error>;
/// Boxed-receiver form of [`Extractor::finish`].
fn finish_box(self: Box<Self>) -> Result<S, crate::error::Error>;
}
/// Boxed closure that takes ownership of a source `S` and either wraps it in a
/// boxed [`Extractor`] or fails with an error.
///
/// The source is passed by value, so it is not returned when the closure fails.
pub type ExtractorFactory<'a, S> =
Box<dyn 'a + Fn(S) -> Result<Box<dyn 'a + Extractor<S>>, crate::error::Error>>;
/// Owns a source and, once `begin` has selected one, the extractor wrapping it.
///
/// Extractors are registered as `(classifier, factory)` pairs; `begin` uses the
/// first pair whose classifier accepts the record's metadata.
pub struct ExtractorDispatcher<'a, S: Read> {
/// The raw source; `Some` until `begin` moves it into an extractor factory.
source: Option<S>,
/// The active extractor; set by a successful `begin`.
extractor: Option<Box<dyn 'a + Extractor<S>>>,
/// Registered `(classifier, factory)` pairs, in registration order.
extractors: Vec<(Box<dyn 'a + Classifier>, ExtractorFactory<'a, S>)>,
}
impl<'a, S: 'a + Read> ExtractorDispatcher<'a, S> {
pub fn new(source: S) -> Self {
Self {
source: Some(source),
extractor: None,
extractors: Vec::new(),
}
}
pub fn get_ref(&self) -> &S {
match &self.source {
Some(source) => source,
None => self.extractor.as_ref().unwrap().get_ref(),
}
}
pub fn get_mut(&mut self) -> &mut S {
match &mut self.source {
Some(source) => source,
None => self.extractor.as_mut().unwrap().get_mut(),
}
}
pub fn into_inner(self) -> S {
match self.source {
Some(source) => source,
None => self.extractor.unwrap().into_inner_box(),
}
}
pub fn add_extractor(
&mut self,
classifier: Box<dyn Classifier>,
extractor_factory: ExtractorFactory<'a, S>,
) {
self.extractors.push((classifier, extractor_factory));
}
pub fn add_default_extractors(&mut self) {
self.add_extractor(
Box::new(ResourceClassifier),
Box::new(|source: S| Ok(Box::new(ResourceExtractor::new(source)?))),
);
self.add_extractor(
Box::new(HTTPClassifier),
Box::new(|source: S| Ok(Box::new(HTTPExtractor::new(source)?))),
);
}
pub fn can_accept_any(&self, metadata: &HeaderMetadata) -> bool {
self.extractors
.iter()
.any(|(classifier, _)| classifier.can_accept(metadata))
}
pub fn begin(&mut self, metadata: &HeaderMetadata) -> Result<(), crate::error::Error> {
for (classifier, factory) in &self.extractors {
if classifier.can_accept(metadata) {
let extractor = factory(self.source.take().unwrap())?;
self.extractor = Some(extractor);
return Ok(());
}
}
Err(std::io::Error::new(std::io::ErrorKind::InvalidInput, "no extractor").into())
}
pub fn finish(mut self) -> Result<S, crate::error::Error> {
match self.extractor {
Some(extractor) => extractor.finish_box(),
None => Ok(self.source.take().unwrap()),
}
}
}
impl<'a, S: Read> Read for ExtractorDispatcher<'a, S> {
    /// Reads from the active extractor.
    ///
    /// # Errors
    ///
    /// Returns an I/O error if no extractor is active (i.e. `begin` has not
    /// completed successfully), in addition to any error from the extractor.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        match self.extractor.as_mut() {
            Some(extractor) => extractor.read(buf),
            // Misuse is reported through the `Read` error channel rather than
            // by panicking in the middle of an I/O path.
            None => Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                "no active extractor; call begin() before reading",
            )),
        }
    }
}
/// Classifier that accepts records whose `WARC-Type` field is `"resource"`.
pub struct ResourceClassifier;
impl Classifier for ResourceClassifier {
    /// Accepts a record exactly when its required `WARC-Type` field equals
    /// `"resource"`; a failed field lookup counts as a non-match.
    fn can_accept(&self, metadata: &HeaderMetadata) -> bool {
        match metadata.fields().get_required("WARC-Type") {
            Ok(record_type) => record_type == "resource",
            Err(_) => false,
        }
    }
}
/// Pass-through extractor: reads are served directly from the wrapped source.
pub struct ResourceExtractor<S: Read> {
/// The wrapped source.
source: S,
}
impl<S: Read> ResourceExtractor<S> {
pub fn new(source: S) -> Result<Self, crate::error::Error>
where
Self: std::marker::Sized,
{
Ok(Self { source })
}
}
impl<S: Read> Read for ResourceExtractor<S> {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
self.source.read(buf)
}
}
impl<S: Read> Extractor<S> for ResourceExtractor<S> {
fn get_ref(&self) -> &S {
&self.source
}
fn get_mut(&mut self) -> &mut S {
&mut self.source
}
fn into_inner(self) -> S {
self.source
}
fn into_inner_box(self: Box<Self>) -> S {
self.source
}
fn finish(self) -> Result<S, crate::error::Error> {
Ok(self.source)
}
fn finish_box(self: Box<Self>) -> Result<S, crate::error::Error> {
Ok(self.source)
}
}
/// Classifier that accepts records whose `WARC-Type` is `"response"` and whose
/// `Content-Type` parses to the media type `application/http`.
pub struct HTTPClassifier;
impl Classifier for HTTPClassifier {
    /// Accepts a record when `WARC-Type` is `"response"` and the parsed
    /// `Content-Type` has type `"application"` and subtype `"http"`.
    ///
    /// A missing field or an unparsable `Content-Type` is a non-match.
    fn can_accept(&self, metadata: &HeaderMetadata) -> bool {
        // Both fields are looked up (in this order) before any comparison.
        if let Ok(record_type) = metadata.fields().get_required("WARC-Type") {
            if let Ok(media_type) = metadata
                .fields()
                .get_parsed_required::<MediaType>("Content-Type")
            {
                return record_type == "response"
                    && media_type.type_ == "application"
                    && media_type.subtype == "http";
            }
        }
        false
    }
}
/// Extractor that reads a source through a `MessageReader` over a `ComboReader`.
///
/// Construction calls `begin_response` on the reader, and reads are served
/// from `read_body()`.
pub struct HTTPExtractor<'a, S: Read> {
/// Message reader layered over the combo-wrapped source.
reader: MessageReader<'a, ComboReader<S>>,
}
impl<'a, S: Read> HTTPExtractor<'a, S> {
pub fn new(source: S) -> Result<Self, crate::error::Error> {
let mut reader = MessageReader::new(ComboReader::new(source));
reader.begin_response(None)?;
Ok(Self { reader })
}
}
impl<'a, S: Read> Extractor<S> for HTTPExtractor<'a, S> {
fn get_ref(&self) -> &S {
self.reader.get_ref().get_ref()
}
fn get_mut(&mut self) -> &mut S {
self.reader.get_mut().get_mut()
}
fn into_inner(self) -> S {
self.reader.into_inner().into_inner()
}
fn into_inner_box(self: Box<Self>) -> S {
self.reader.into_inner().into_inner()
}
fn finish(mut self) -> Result<S, crate::error::Error> {
self.reader.end_message()?;
Ok(self.reader.into_inner().into_inner())
}
fn finish_box(mut self: Box<Self>) -> Result<S, crate::error::Error> {
self.reader.end_message()?;
Ok(self.reader.into_inner().into_inner())
}
}
impl<'a, S: Read> Read for HTTPExtractor<'a, S> {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
self.reader.read_body().read(buf)
}
}