crabler_tokio_derive/
lib.rs

1extern crate proc_macro;
2
3use proc_macro::TokenStream;
4use proc_macro2;
5use proc_macro_error::*;
6use quote::quote;
7use syn::{parse_macro_input, DeriveInput};
8
9#[proc_macro_derive(WebScraper, attributes(on_html, on_page, on_response))]
10#[proc_macro_error]
11/// Macro to derive WebScraper trait on to a given struct.
12/// Supported options:
13/// * `#[on_page(method_name)]` - will bind given method to a successful page load with the
14///   contents of the page
15/// * `#[on_html("css selector", method_name)]` - will bind given css selector to a method. When page
16///   is loaded this method will be invoked for all elements that match given selector.
17/// * `#[on_response(method_name)]` - will bind given method to an HTTP response
18pub fn web_scraper_derive(input: TokenStream) -> TokenStream {
19    let ast: syn::DeriveInput = parse_macro_input!(input as DeriveInput);
20
21    match ast.data {
22        syn::Data::Struct(syn::DataStruct { .. }) => impl_web_scraper(&ast),
23        _ => abort_call_site!("#[WebScraper] only supports structs"),
24    }
25}
26
27fn impl_web_scraper(ast: &syn::DeriveInput) -> TokenStream {
28    use syn::*;
29
30    let name = &ast.ident;
31
32    let mut pages = vec![];
33    let mut selectors = vec![];
34    let mut matches = vec![];
35    let mut responses = vec![];
36
37    for attr in &ast.attrs {
38        let meta = attr.parse_meta();
39
40        match meta {
41            Ok(Meta::List(MetaList { path, nested, .. }))
42                if path.segments[0].ident == "on_page" =>
43            {
44                let page = handle_on_page_attr(nested);
45                pages.push(page);
46            }
47            Ok(Meta::List(MetaList { path, nested, .. }))
48                if path.segments[0].ident == "on_html" =>
49            {
50                let (selector, match_clause) = handle_on_html_attr(nested);
51                selectors.push(selector);
52                matches.push(match_clause);
53            }
54            Ok(Meta::List(MetaList { path, nested, .. }))
55                if path.segments[0].ident == "on_response" =>
56            {
57                let response = handle_on_response_attr(nested);
58                responses.push(response);
59            }
60            Err(err) => {
61                abort_call_site!("Failed to parse attribute: {}", err);
62            }
63            _ => {
64                abort_call_site!("Unsupported arguments on attribute");
65            }
66        }
67    }
68
69    let gen = quote! {
70        #[async_trait(?Send)]
71        impl WebScraper for #name {
72            async fn dispatch_on_page(
73                &mut self,
74                page: String,
75            ) -> std::result::Result<(), CrablerError> {
76                #( #pages; )*
77
78                Ok(())
79            }
80
81            async fn dispatch_on_html(
82                &mut self,
83                selector: &str,
84                request: Response,
85                element: Element,
86            ) -> std::result::Result<(), CrablerError> {
87
88                match selector {
89                    #( #matches, )*
90                    _ => panic!("Failed to dispatch {}", selector),
91                };
92
93                Ok(())
94            }
95
96            fn all_html_selectors(&self) -> Vec<&str> {
97                vec![#( #selectors ),*]
98            }
99
100            async fn dispatch_on_response(
101                &mut self,
102                request: Response,
103            ) -> std::result::Result<(), CrablerError> {
104                #( #responses; )*
105
106                Ok(())
107            }
108
109            async fn run(
110                self,
111                opts: Opts,
112            ) -> std::result::Result<(), CrablerError> {
113                use crabler_tokio::Crabler;
114
115                let mut crabler = Crabler::new(self, &opts);
116
117                for url in &opts.urls {
118                    crabler.navigate(url).await?;
119                }
120
121                for _ in 0..opts.threads {
122                    crabler.start_worker();
123                }
124
125                crabler.run().await
126            }
127        }
128    };
129
130    gen.into()
131}
132
133fn handle_on_page_attr(
134    nested: syn::punctuated::Punctuated<syn::NestedMeta, syn::token::Comma>,
135) -> proc_macro2::TokenStream {
136    use syn::*;
137
138    let l = nested.len();
139    if l < 1 {
140        abort_call_site!("Not enough arguments provided to on_page attribute: {}", l);
141    }
142
143    let f = match &nested[0] {
144        NestedMeta::Meta(Meta::Path(Path { segments, .. })) => &segments[0].ident,
145        _ => abort_call_site!("Can't find on_page method"),
146    };
147
148    quote! { self.#f(page).await? }
149}
150
151fn handle_on_html_attr(
152    nested: syn::punctuated::Punctuated<syn::NestedMeta, syn::token::Comma>,
153) -> (proc_macro2::TokenStream, proc_macro2::TokenStream) {
154    use syn::*;
155
156    let l = nested.len();
157    if l < 2 {
158        abort_call_site!("Not enough arguments provided to on_html attribute: {}", l);
159    }
160
161    let token = match &nested[0] {
162        NestedMeta::Lit(Lit::Str(lit_str)) => lit_str,
163        _ => abort_call_site!("Can't find on_html selector"),
164    };
165
166    let f = match &nested[1] {
167        NestedMeta::Meta(Meta::Path(Path { segments, .. })) => &segments[0].ident,
168        _ => abort_call_site!("Can't find on_html method"),
169    };
170
171    let selector = quote! { #token };
172    let match_clause = quote! { #token => self.#f(request, element).await? };
173
174    (selector, match_clause)
175}
176
177fn handle_on_response_attr(
178    nested: syn::punctuated::Punctuated<syn::NestedMeta, syn::token::Comma>,
179) -> proc_macro2::TokenStream {
180    use syn::*;
181
182    let l = nested.len();
183    if l < 1 {
184        abort_call_site!(
185            "Not enough arguments provided to on_response attribute: {}",
186            l
187        );
188    }
189
190    let f = match &nested[0] {
191        NestedMeta::Meta(Meta::Path(Path { segments, .. })) => &segments[0].ident,
192        _ => abort_call_site!("Can't find on_response method"),
193    };
194
195    quote! { self.#f(request).await? }
196}