1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
//! `stork` is a simple futures-based library to recursively crawl //! sources in a search engine-like fashion. stork was designed from the //! ground to have a simple API that is easy to use and can be reused //! across multiple protocols, yielding each result giving end users the //! freedom to do BFS, DFS or any type of search they may so wish. //! //! Your entry point into stork is the [Storkable::new] function. Have //! a look through the [Storkable] struct's documentation for your //! entry into the world of storking. //! //! *Note: you're probably not looking for this library on its own but //! a protocol implementation of it. See below for some first-party //! implementations:* //! - [stork_http](../../../stork_http/) #![recursion_limit = "256"] #[macro_use] extern crate failure_derive; pub mod errors; pub mod filters; pub use errors::StorkError; pub use filters::FilterSet; use async_stream::try_stream; use futures::prelude::*; use std::pin::Pin; use std::sync::Arc; use failure::Error; use failure::ResultExt; /// A [Storkable] represents a "thing" which is traversable ("storkable"). /// /// To start "storking" an initial [Storkable] can be constructed with /// with [Storkable::new], once initialised filters can be added using /// [Storkable::with_filters]. /// /// After a [Storkable] has been initialised, the storking can begin /// with a call to [Storkable::exec] which will return a /// stream of more [Storkable]s (with the filters from the parent /// [Storkable] copied) which in turn can also be storked if necessary. /// /// A Storkable derives its functionality from its two generics, /// `T` and `C: StorkClient<T>`. The `StorkClient` implementation will /// be called with a value of `T`, and is expected to return all the /// values of `T` that can be found on the given `T`. #[derive(Debug, Clone)] pub struct Storkable<T: Unpin, C: StorkClient<T>> { value: T, filters: FilterSet<T>, client: Arc<C>, parent: Option<Arc<Storkable<T, C>>>, } impl<'a, T: Unpin + 'a, C: StorkClient<T> + 'a> Storkable<T, C> { /// Instantiates a new [Storkable] from a T, storking can then /// begin on the given entrypoint using the [Storkable::exec] method. pub fn new(val: T) -> Self { Self { value: val, filters: FilterSet::default(), client: Arc::new(C::default()), parent: None, } } /// Attaches a [FilterSet] to this [Storkable] and any children /// found after executing this one. pub fn with_filters(mut self, filters: FilterSet<T>) -> Self { self.filters = filters; self } /// Replaces the default [StorkClient] with a new one accepting /// and returning the same type for this [Storkable]. pub fn with_client(mut self, client: C) -> Self { self.client = Arc::new(client); self } // Grab a reference to the filters set on this [Storkable]. pub fn filters(&self) -> &FilterSet<T> { &self.filters } /// Get the value of this [Storkable]. pub fn val(&self) -> &T { &self.value } /// Get the [Storkable] from which this [Storkable] was found on. pub fn parent(&self) -> Option<&Storkable<T, C>> { // map to Arc::as_ref to hide the underlying Arc implementation self.parent.as_ref().map(Arc::as_ref) } /// Start storking this [Storkable]. /// /// Finds all the followable links on this [Storkable] and returns /// a stream of more [Storkable]s with the same filters and the /// `parent` set to a reference of the current [Storkable]. pub fn exec<'b>(self) -> impl futures::Stream<Item = Result<Storkable<T, C>, Error>> + 'a { let this = Arc::new(self); try_stream! { let mut children = this.client.run(this.val()); while let Some(child) = children.next().await { let child = child.context(StorkError::ClientError)?; if !this.filters.matches(&child) { continue; } yield Storkable { value: child, client: Arc::clone(&this.client), filters: this.filters.clone(), parent: Some(Arc::clone(&this)), }; } } } } /// A [StorkClient] is an underlying implementation of a storker. When a /// [Storkable] is initialised a [StorkClient] will be created using /// [Default::default] and the instance will be shared between all child /// [Storkable]s. /// /// The default [StorkClient] initialised by the [Storkable] can be /// replaced using [Storkable::with_client]. /// /// [StorkClient]s may be used across threads and *must* be thread-safe. pub trait StorkClient<T>: Default { /// Makes a call to `T` and returns the child `T`s it can find on the /// page. fn run(&self, src: &T) -> Pin<Box<dyn futures::Stream<Item = Result<T, Error>>>>; }