html_extractor_macros/
lib.rs

1use proc_macro::TokenStream as TokenStream1;
2use proc_macro2::{Delimiter, TokenStream, TokenTree, TokenTree::*};
3use proc_macro_error::*;
4use quote::{quote, ToTokens};
5
6#[proc_macro_error]
7#[proc_macro]
8pub fn html_extractor(input: TokenStream1) -> TokenStream1 {
9    let mut input_iter: TokenStreamIter = TokenStream::from(input).into_iter().peekable();
10
11    let mut structs = Vec::new();
12    while !input_iter.is_finished() {
13        structs.push(Struct::parse(&mut input_iter));
14    }
15
16    quote!(#(#structs)*).into()
17}
18
19lazy_static::lazy_static! {
20    static ref CRATE: String = proc_macro_crate::crate_name("html-extractor").unwrap();
21}
22
/// Peekable iterator over the token trees of a `proc_macro2::TokenStream`.
type TokenStreamIter = std::iter::Peekable<<TokenStream as IntoIterator>::IntoIter>;
/// Parsing helpers layered on top of [`TokenStreamIter`].
///
/// The `*_ex` methods and `expect` abort macro expansion with an
/// "expected ..." diagnostic instead of returning `None` when the stream ends.
trait TokenStreamIterExt {
    /// Returns `true` when no token is left in the stream.
    fn is_finished(&mut self) -> bool;
    /// Peeks at the next token; aborts with "expected {expected}" if there is none.
    fn peek_ex(&mut self, expected: &str) -> &TokenTree;
    /// Same as `peek_ex`, but returns the token rendered as a string.
    fn peek_ex_str(&mut self, expected: &str) -> String;
    /// Consumes the next token; aborts with "expected {expected}" if there is none.
    fn next_ex(&mut self, expected: &str) -> TokenTree;
    /// Same as `next_ex`, but returns the token rendered as a string.
    fn next_ex_str(&mut self, expected: &str) -> String;
    /// Consumes the next token and aborts unless its string form equals `expect`.
    fn expect(&mut self, expect: &str);
    /// Like `expect`, but an exhausted stream is accepted silently.
    fn expect_or_none(&mut self, expect: &str);
    /// Consumes (and discards) up to `advance` tokens.
    fn advance(&mut self, advance: usize);
}
34impl TokenStreamIterExt for TokenStreamIter {
35    fn is_finished(&mut self) -> bool {
36        self.peek().is_none()
37    }
38    fn peek_ex(&mut self, expected: &str) -> &TokenTree {
39        self.peek()
40            .unwrap_or_else(|| abort_call_site!("expected {}", expected))
41    }
42    fn peek_ex_str(&mut self, expected: &str) -> String {
43        self.peek()
44            .unwrap_or_else(|| abort_call_site!("expected {}", expected))
45            .to_string()
46    }
47    fn next_ex(&mut self, expected: &str) -> TokenTree {
48        self.next()
49            .unwrap_or_else(|| abort_call_site!("expected {}", expected))
50    }
51    fn next_ex_str(&mut self, expected: &str) -> String {
52        self.next()
53            .unwrap_or_else(|| abort_call_site!("expected {}", expected))
54            .to_string()
55    }
56    fn expect(&mut self, expect: &str) {
57        let next = self
58            .next()
59            .unwrap_or_else(|| abort_call_site!("expected `{}`", expect));
60        if next.to_string() != expect {
61            abort!(next, "expected `{}`, found `{}`", expect, next);
62        }
63    }
64    fn expect_or_none(&mut self, expect: &str) {
65        let next = match self.next() {
66            Some(n) => n,
67            None => return,
68        };
69        if next.to_string() != expect {
70            abort!(next, "expected `{}`, found `{}`", expect, next);
71        }
72    }
73    fn advance(&mut self, advance: usize) {
74        for _ in 0..advance {
75            self.next();
76        }
77    }
78}
79
80enum Visibility {
81    Private,
82    Public,
83    PublicIn(TokenStream),
84}
85impl Visibility {
86    fn parse(ts: &mut TokenStreamIter) -> Visibility {
87        let iter_advance;
88        let vis = match &*ts.peek_ex_str("`pub` or identifier") {
89            "pub" => {
90                ts.next();
91                match ts.peek_ex("`(crate)`, `(super)`, `(in SimplePath)` or identifier") {
92                    Group(g) if g.delimiter() == Delimiter::Parenthesis => {
93                        iter_advance = 1;
94                        Visibility::PublicIn(g.stream())
95                    }
96                    _ => {
97                        iter_advance = 0;
98                        Visibility::Public
99                    }
100                }
101            }
102            _ => {
103                iter_advance = 0;
104                Visibility::Private
105            }
106        };
107        ts.advance(iter_advance);
108        vis
109    }
110}
111impl ToTokens for Visibility {
112    fn to_tokens(&self, tokens: &mut TokenStream) {
113        tokens.extend(match self {
114            Visibility::Private => quote!(),
115            Visibility::Public => quote!(pub),
116            Visibility::PublicIn(s) => quote!(pub (#s)),
117        });
118    }
119}
120
121struct Attributes {
122    tokens: Vec<TokenTree>,
123}
124impl Attributes {
125    fn parse(ts: &mut TokenStreamIter) -> Attributes {
126        let mut tokens = Vec::new();
127        while ts.peek_ex_str("attribute, visibility or identifier") == "#" {
128            tokens.push(ts.next_ex("`#`"));
129            tokens.push(ts.next_ex("`[..]`"));
130        }
131        Attributes { tokens }
132    }
133}
134impl ToTokens for Attributes {
135    fn to_tokens(&self, tokens: &mut TokenStream) {
136        tokens.extend(self.tokens.clone());
137    }
138}
139
140struct Struct {
141    attr: Attributes,
142    vis: Visibility,
143    name: TokenTree,
144    fields: Vec<Field>,
145}
146impl Struct {
147    fn parse(ts: &mut TokenStreamIter) -> Struct {
148        let attr = Attributes::parse(ts);
149        let vis = Visibility::parse(ts);
150        let name = ts.next_ex("identifier");
151
152        let mut fields = Vec::new();
153        match ts.next_ex("{{..}}") {
154            Group(g) if g.delimiter() == Delimiter::Brace => {
155                let mut body_ts = g.stream().into_iter().peekable();
156                while !body_ts.is_finished() {
157                    fields.push(Field::parse(&mut body_ts));
158                    body_ts.expect_or_none(",");
159                }
160            }
161            tt => abort!(tt, "expected {{..}}, found `{}`", tt),
162        }
163
164        Struct {
165            attr,
166            vis,
167            name,
168            fields,
169        }
170    }
171}
172impl ToTokens for Struct {
173    fn to_tokens(&self, tokens: &mut TokenStream) {
174        let attr = &self.attr;
175        let vis = &self.vis;
176        let name = &self.name;
177
178        let field_def = self.fields.iter().map(|f| f.def_tokens());
179        let field_extract = self.fields.iter().map(|f| f.extract_tokens(&self.name));
180        let field_init = self.fields.iter().map(|f| f.init_tokens());
181
182        let _crate = CRATE.parse::<TokenStream>().unwrap();
183
184        tokens.extend(quote!(
185            #attr
186            #vis struct #name {
187                #(#field_def)*
188            }
189            impl #_crate::HtmlExtractor for #name {
190                fn extract(__elem: &#_crate::scraper::ElementRef) -> ::std::result::Result<Self, #_crate::Error> {
191                    #(#field_extract)*
192                    ::std::result::Result::Ok(Self {
193                        #(#field_init)*
194                    })
195                }
196            }
197        ));
198    }
199}
200
201enum Field {
202    Single {
203        field: SingleField,
204        extractor: Extractor,
205    },
206    Tuple {
207        fields: Vec<SingleField>,
208        extractor: Extractor,
209    },
210}
211impl Field {
212    fn parse(ts: &mut TokenStreamIter) -> Field {
213        match ts.peek_ex("(..), visibility or identifier") {
214            Group(g) if g.delimiter() == Delimiter::Parenthesis => {
215                //Tuple
216                let mut fields_ts = g.stream().into_iter().peekable();
217
218                let mut fields = Vec::new();
219                while !fields_ts.is_finished() {
220                    fields.push(SingleField::parse(&mut fields_ts));
221                    fields_ts.expect_or_none(",");
222                }
223                ts.next();
224
225                ts.expect("=");
226
227                let extractor = Extractor::parse(ts);
228
229                if extractor.capture.is_none() {
230                    abort!(
231                        fields[0].name,
232                        "parsing to tuple fields requires capturing with regex"
233                    );
234                }
235
236                Field::Tuple { fields, extractor }
237            }
238            _ => {
239                //Single
240                let field = SingleField::parse(ts);
241
242                ts.expect("=");
243
244                let extractor = Extractor::parse(ts);
245
246                Field::Single { field, extractor }
247            }
248        }
249    }
250
251    fn def_tokens(&self) -> TokenStream {
252        let mut ts = TokenStream::new();
253        match self {
254            Field::Single { field, .. } => {
255                let attr = &field.attr;
256                let vis = &field.vis;
257                let name = &field.name;
258                let ty = &field.ty;
259                ts.extend(quote!(
260                    #attr
261                    #vis #name: #(#ty)*,
262                ));
263            }
264            Field::Tuple { fields, .. } => {
265                for field in fields {
266                    let attr = &field.attr;
267                    let vis = &field.vis;
268                    let name = &field.name;
269                    let ty = &field.ty;
270                    ts.extend(quote!(
271                        #attr
272                        #vis #name: #(#ty)*,
273                    ));
274                }
275            }
276        }
277        ts
278    }
279    fn extract_tokens(&self, struct_name: &TokenTree) -> TokenStream {
280        match self {
281            Field::Single { field, extractor } => {
282                let name = &field.name;
283                let extractor_ts = extractor.to_tokens(struct_name, &field.name);
284                quote!(
285                    let #name = #extractor_ts;
286                )
287            }
288            Field::Tuple { fields, extractor } => {
289                let names = fields.iter().map(|f| &f.name);
290                let extractor_ts = extractor.to_tokens(struct_name, &fields[0].name);
291                quote!(
292                    let (#(#names,)*) = #extractor_ts;
293                )
294            }
295        }
296    }
297    fn init_tokens(&self) -> TokenStream {
298        match self {
299            Field::Single { field, .. } => {
300                let name = &field.name;
301                quote!(
302                    #name,
303                )
304            }
305            Field::Tuple { fields, .. } => {
306                let names = fields.iter().map(|f| &f.name);
307                quote!(
308                    #(#names,)*
309                )
310            }
311        }
312    }
313}
314struct SingleField {
315    attr: Attributes,
316    vis: Visibility,
317    name: TokenTree,
318    ty: Vec<TokenTree>,
319}
320impl SingleField {
321    fn parse(ts: &mut TokenStreamIter) -> Self {
322        let attr = Attributes::parse(ts);
323        let vis = Visibility::parse(ts);
324        let name = ts.next_ex("identifier");
325
326        ts.expect(":");
327
328        let mut ty = Vec::<TokenTree>::new();
329        while !ts.is_finished() && {
330            let peek = ts.peek_ex_str("`,` or `=`");
331            peek != "," && peek != "="
332        } {
333            ty.push(ts.next_ex(","));
334        }
335
336        Self {
337            attr,
338            vis,
339            name,
340            ty,
341        }
342    }
343}
344
/// Parsed form of the `(..)` extractor on the right-hand side of a field.
struct Extractor {
    /// What is read from the matched element (element, attribute, text, ...).
    target: ExtractTarget,
    /// Regex literal given with `capture with ..`, if any.
    capture: Option<TokenTree>,
    /// How matches are gathered (first match, all matches, or optional).
    collector: ExtractCollector,
    /// Tokens of the parse function; `::std::str::FromStr::from_str` when no
    /// `parse with ..` specifier was given.
    parser: Vec<TokenTree>,
}
351impl Extractor {
352    fn parse(ts: &mut TokenStreamIter) -> Self {
353        let extractor_tt = ts.next_ex("`(..)`");
354        let mut extractor_ts: TokenStreamIter = match &extractor_tt {
355            Group(g) if g.delimiter() == Delimiter::Parenthesis => {
356                g.stream().into_iter().peekable()
357            }
358            tt => abort!(tt, "expect `(..)`, found `{}`", tt),
359        };
360
361        let mut target = None;
362        let mut capture = None;
363        let mut collector = ExtractCollector::First;
364        let mut parser = None;
365
366        while !extractor_ts.is_finished() {
367            match &*extractor_ts.next_ex_str(
368                "`elem`, `attr`, `text`, `inner_html`, `presence`, `capture`, `collect`, `optional` or `parse`",
369            ) {
370                "elem" => {
371                    extractor_ts.expect("of");
372                    let selector = extractor_ts.next_ex("literal string").clone();
373                    target = Some(ExtractTarget::Element { selector });
374                }
375                "attr" => {
376                    let attribute = match extractor_ts.next_ex("`[..]`") {
377                        Group(g) if g.delimiter() == Delimiter::Bracket => {
378                            g.stream().into_iter().peekable().next_ex("literal string")
379                        }
380                        tt => abort!(tt, "expected `[..]`, found {}", tt),
381                    };
382                    extractor_ts.expect("of");
383                    let selector = extractor_ts.next_ex("literal string").clone();
384                    target = Some(ExtractTarget::Attribute {
385                        attribute,
386                        selector,
387                    });
388                }
389                "text" => {
390                    let nth = match extractor_ts.next_ex("`[..]` or `of`") {
391                        Group(g) if g.delimiter() == Delimiter::Bracket => {
392                            extractor_ts.expect("of");
393                            g.stream()
394                        }
395                        tt if tt.to_string() == "of" => "0".parse().unwrap(),
396                        tt => abort!(tt, "expected `[..]` or `of`, found {}", tt),
397                    };
398
399                    let selector = extractor_ts.next_ex("literal string").clone();
400                    target = Some(ExtractTarget::TextNode { nth, selector });
401                }
402                "inner_html" => {
403                    extractor_ts.expect("of");
404                    let selector = extractor_ts.next_ex("literal string").clone();
405                    target = Some(ExtractTarget::InnerHTML { selector });
406                }
407                "presence" => {
408                    extractor_ts.expect("of");
409                    let selector = extractor_ts.next_ex("literal string").clone();
410                    target = Some(ExtractTarget::PresenceOf { selector });
411                }
412                "capture" => {
413                    extractor_ts.expect("with");
414                    let regex = extractor_ts.next_ex("literal string").clone();
415                    capture = Some(regex);
416                }
417                "collect" => {
418                    collector = ExtractCollector::IntoIterator;
419                }
420                "optional" => {
421                    collector = ExtractCollector::Option;
422                }
423                "parse" => {
424                    extractor_ts.expect("with");
425                    let mut parser_vec = Vec::new();
426                    while !extractor_ts.is_finished() && extractor_ts.peek_ex_str(",") != "," {
427                        parser_vec.push(extractor_ts.next_ex(","));
428                    }
429                    parser = Some(parser_vec)
430                }
431                tt => abort!(
432                    tt,
433                    "expected `elem`, `attr`, `text`, `capture` or `collect`, found `{}`",
434                    tt
435                ),
436            }
437            extractor_ts.expect_or_none(",");
438        }
439
440        let target = match target {
441            Some(t) => t,
442            None => abort!(extractor_tt, "target is not specified"),
443        };
444
445        if let ExtractTarget::Element { .. } = &target {
446            if capture.is_some() {
447                abort!(
448                    extractor_tt,
449                    "`elem of ..` and `capture with ..` cannot be used for the same field"
450                );
451            }
452        } else if let ExtractTarget::PresenceOf { .. } = &target {
453            if capture.is_some() || collector != ExtractCollector::First || parser.is_some() {
454                abort!(
455                    extractor_tt,
456                    "`presence of ..` cannot be used with any other specifier"
457                );
458            }
459        }
460
461        Extractor {
462            target,
463            capture,
464            collector,
465            parser: parser
466                .unwrap_or_else(|| quote!(::std::str::FromStr::from_str).into_iter().collect()),
467        }
468    }
469    fn to_tokens(&self, struct_name: &TokenTree, field_name: &TokenTree) -> TokenStream {
470        let _crate = CRATE.parse::<TokenStream>().unwrap();
471
472        let selector = self.target.selector();
473        if let Err(err) = scraper::Selector::parse(&get_literal_str_value(selector)) {
474            abort!(selector, "cannot parse the selector: {:?}", err);
475        }
476
477        let mut regex_captures_len = None;
478
479        let lazy_static_ts = match &self.capture {
480            Some(regex) => {
481                match regex::Regex::new(&get_literal_str_value(regex)) {
482                    Ok(regex) => regex_captures_len = Some(regex.captures_len()),
483                    Err(err) => abort!(regex, "cannot parse the regex: {:?}", err),
484                };
485                quote! {
486                    #_crate::lazy_static::lazy_static! {
487                        static ref SELECTOR: #_crate::scraper::Selector = #_crate::scraper::Selector::parse(#selector).unwrap();
488                        static ref REGEX: #_crate::regex::Regex = #_crate::regex::Regex::new(#regex).unwrap();
489                    }
490                }
491            }
492            None => quote! {
493                #_crate::lazy_static::lazy_static! {
494                    static ref SELECTOR: #_crate::scraper::Selector = #_crate::scraper::Selector::parse(#selector).unwrap();
495                }
496            },
497        };
498
499        let extract_data_from_elem_ts = match &self.target {
500            ExtractTarget::Element { .. } => quote! {
501                let data = target_elem;
502            },
503            ExtractTarget::Attribute { attribute, .. } => quote! {
504                let data = target_elem.value().attr(#attribute).ok_or(
505                    #_crate::error::Error::InvalidInput(
506                        ::std::borrow::Cow::Borrowed(::std::concat!(
507                            "extracting the data of field `",
508                            ::std::stringify!(#field_name),
509                            "` in struct `",
510                            ::std::stringify!(#struct_name),
511                            "`, attribute `",
512                            #attribute,
513                            "` is not found"
514                        ))
515                    )
516                )?;
517            },
518            ExtractTarget::TextNode { nth, .. } => quote! {
519                let data_whitespace = target_elem.text().nth(#nth).ok_or(
520                    #_crate::error::Error::InvalidInput(
521                        ::std::borrow::Cow::Borrowed(::std::concat!(
522                            "extracting the data of field `",
523                            ::std::stringify!(#field_name),
524                            "` in struct `",
525                            ::std::stringify!(#struct_name),
526                            "`, ",
527                            ::std::stringify!(#nth),
528                            "th text node is not found"
529                        ))
530                    )
531                )?;
532                let data = data_whitespace.trim();
533            },
534            ExtractTarget::InnerHTML { .. } => quote! {
535                let data_whitespace = target_elem.inner_html();
536                let data = data_whitespace.trim();
537            },
538            ExtractTarget::PresenceOf { .. } => quote! {
539                let data = presence;
540            },
541        };
542
543        let parser = &self.parser;
544        let parse_data_ts = match &self.capture {
545            Some(_) => {
546                let mut captures = Vec::new();
547                for i in 1..regex_captures_len.unwrap() {
548                    captures.push(quote! {
549                        (#(#parser)*)(caps.get(#i).unwrap().as_str()).or_else(|e| ::std::result::Result::Err(
550                            #_crate::error::Error::InvalidInput(
551                                ::std::borrow::Cow::Owned(::std::format!(::std::concat!(
552                                    "extracting the data of field `",
553                                    ::std::stringify!(#field_name),
554                                    "` in struct `",
555                                    ::std::stringify!(#struct_name),
556                                    "`, cannot parse for the ",
557                                    ::std::stringify!(#i),
558                                    "th field: {:#?}"
559                                ), e))
560                            )
561                        ))?
562                    });
563                }
564                quote! {
565                    let caps = REGEX.captures(data).ok_or(
566                        #_crate::error::Error::InvalidInput(
567                            ::std::borrow::Cow::Borrowed(::std::concat!(
568                                "extracting the data of field `",
569                                ::std::stringify!(#field_name),
570                                "` in struct `",
571                                ::std::stringify!(#struct_name),
572                                "`, nothing is captured with regex"
573                            ))
574                        )
575                    )?;
576                    (
577                        #(#captures,)*
578                    )
579                }
580            }
581            None => match &self.target {
582                ExtractTarget::Element { .. } => quote! {
583                    #_crate::HtmlExtractor::extract(&data)?
584                },
585                _ => quote! {
586                    (#(#parser)*)(data).or_else(|e| ::std::result::Result::Err(#_crate::error::Error::InvalidInput(
587                            ::std::borrow::Cow::Owned(::std::format!(::std::concat!(
588                                "extracting the data of field `",
589                                ::std::stringify!(#field_name),
590                                "` in struct `",
591                                ::std::stringify!(#struct_name),
592                                "`, cannot parse `{}`: {:#?}",
593                            ), data, e))
594                        )
595                    ))?
596                },
597            },
598        };
599
600        let collector_ts = match &self.collector {
601            ExtractCollector::First => {
602                if let ExtractTarget::PresenceOf { .. } = &self.target {
603                    quote! {
604                        __elem.select(&*SELECTOR).next().is_some()
605                    }
606                } else {
607                    quote! {
608                        let target_elem = __elem.select(&*SELECTOR).next().ok_or(
609                            #_crate::error::Error::InvalidInput(
610                                ::std::borrow::Cow::Borrowed(::std::concat!(
611                                    "extracting the data of field `",
612                                    ::std::stringify!(#field_name),
613                                    "` in struct `",
614                                    ::std::stringify!(#struct_name),
615                                    "`, no element matched the selector"
616                                ))
617                            )
618                        )?;
619                        #extract_data_from_elem_ts
620                        #parse_data_ts
621                    }
622                }
623            }
624            ExtractCollector::IntoIterator => {
625                quote! {
626                    let mut items = ::std::vec::Vec::new();
627                    for target_elem in __elem.select(&*SELECTOR) {
628                        let item = {
629                            #extract_data_from_elem_ts
630                            #parse_data_ts
631                        };
632                        items.push(item);
633                    }
634                    items.into_iter().collect()
635                }
636            }
637            ExtractCollector::Option => {
638                quote! {
639                    match __elem.select(&*SELECTOR).next() {
640                        Some(target_elem) => Some({
641                            #extract_data_from_elem_ts
642                            #parse_data_ts
643                        }),
644                        None => None,
645                    }
646                }
647            }
648        };
649
650        quote! {{
651            #lazy_static_ts
652            #collector_ts
653        }}
654    }
655}
656enum ExtractTarget {
657    Element {
658        selector: TokenTree,
659    },
660    Attribute {
661        attribute: TokenTree,
662        selector: TokenTree,
663    },
664    TextNode {
665        nth: TokenStream,
666        selector: TokenTree,
667    },
668    InnerHTML {
669        selector: TokenTree,
670    },
671    PresenceOf {
672        selector: TokenTree,
673    },
674}
675impl ExtractTarget {
676    fn selector(&self) -> &TokenTree {
677        match self {
678            ExtractTarget::Element { selector } => selector,
679            ExtractTarget::Attribute { selector, .. } => selector,
680            ExtractTarget::TextNode { selector, .. } => selector,
681            ExtractTarget::InnerHTML { selector } => selector,
682            ExtractTarget::PresenceOf { selector } => selector,
683        }
684    }
685}
686
/// How the elements matched by a selector are turned into the field value.
#[derive(PartialEq)]
enum ExtractCollector {
    /// Extracts only the first match (the default).
    First,
    /// Extracts every match and collects the results with `collect()`
    /// (selected by the `collect` specifier).
    IntoIterator,
    /// Emits `Some(..)` if a match exists, `None` if not
    /// (selected by the `optional` specifier).
    Option,
}
696
697fn get_literal_str_value(tt: &TokenTree) -> String {
698    let ts = quote!(#tt);
699    let lit_str: syn::LitStr =
700        syn::parse2(ts).unwrap_or_else(|_| abort!(tt, "expected literal string, found `{}`", tt));
701    lit_str.value()
702}