1use proc_macro::TokenStream as TokenStream1;
2use proc_macro2::{Delimiter, TokenStream, TokenTree, TokenTree::*};
3use proc_macro_error::*;
4use quote::{quote, ToTokens};
5
6#[proc_macro_error]
7#[proc_macro]
8pub fn html_extractor(input: TokenStream1) -> TokenStream1 {
9 let mut input_iter: TokenStreamIter = TokenStream::from(input).into_iter().peekable();
10
11 let mut structs = Vec::new();
12 while !input_iter.is_finished() {
13 structs.push(Struct::parse(&mut input_iter));
14 }
15
16 quote!(#(#structs)*).into()
17}
18
19lazy_static::lazy_static! {
20 static ref CRATE: String = proc_macro_crate::crate_name("html-extractor").unwrap();
21}
22
/// Peekable iterator over the token trees of a `proc_macro2::TokenStream`; every parser in
/// this file consumes its input through this type.
type TokenStreamIter = std::iter::Peekable<<TokenStream as IntoIterator>::IntoIter>;
/// Parsing helpers on top of [`TokenStreamIter`].
///
/// `expected` / `expect` arguments are human-readable descriptions that end up in the
/// compile error emitted when the stream does not contain what the parser needs.
trait TokenStreamIterExt {
    /// Returns `true` when no token is left.
    fn is_finished(&mut self) -> bool;
    /// Peeks at the next token without consuming it; aborts with "expected .." when the
    /// stream is exhausted.
    fn peek_ex(&mut self, expected: &str) -> &TokenTree;
    /// Like `peek_ex`, but returns the token rendered as a string.
    fn peek_ex_str(&mut self, expected: &str) -> String;
    /// Consumes and returns the next token; aborts with "expected .." when the stream is
    /// exhausted.
    fn next_ex(&mut self, expected: &str) -> TokenTree;
    /// Like `next_ex`, but returns the token rendered as a string.
    fn next_ex_str(&mut self, expected: &str) -> String;
    /// Consumes the next token and aborts unless its string form equals `expect`.
    fn expect(&mut self, expect: &str);
    /// Like `expect`, but an exhausted stream is accepted silently.
    fn expect_or_none(&mut self, expect: &str);
    /// Consumes up to `advance` tokens, discarding them.
    fn advance(&mut self, advance: usize);
}
34impl TokenStreamIterExt for TokenStreamIter {
35 fn is_finished(&mut self) -> bool {
36 self.peek().is_none()
37 }
38 fn peek_ex(&mut self, expected: &str) -> &TokenTree {
39 self.peek()
40 .unwrap_or_else(|| abort_call_site!("expected {}", expected))
41 }
42 fn peek_ex_str(&mut self, expected: &str) -> String {
43 self.peek()
44 .unwrap_or_else(|| abort_call_site!("expected {}", expected))
45 .to_string()
46 }
47 fn next_ex(&mut self, expected: &str) -> TokenTree {
48 self.next()
49 .unwrap_or_else(|| abort_call_site!("expected {}", expected))
50 }
51 fn next_ex_str(&mut self, expected: &str) -> String {
52 self.next()
53 .unwrap_or_else(|| abort_call_site!("expected {}", expected))
54 .to_string()
55 }
56 fn expect(&mut self, expect: &str) {
57 let next = self
58 .next()
59 .unwrap_or_else(|| abort_call_site!("expected `{}`", expect));
60 if next.to_string() != expect {
61 abort!(next, "expected `{}`, found `{}`", expect, next);
62 }
63 }
64 fn expect_or_none(&mut self, expect: &str) {
65 let next = match self.next() {
66 Some(n) => n,
67 None => return,
68 };
69 if next.to_string() != expect {
70 abort!(next, "expected `{}`, found `{}`", expect, next);
71 }
72 }
73 fn advance(&mut self, advance: usize) {
74 for _ in 0..advance {
75 self.next();
76 }
77 }
78}
79
/// Visibility written in front of a struct or field in the macro input.
enum Visibility {
    /// No `pub` keyword was present.
    Private,
    /// A bare `pub`.
    Public,
    /// `pub (..)`; holds the tokens found inside the parentheses.
    PublicIn(TokenStream),
}
85impl Visibility {
86 fn parse(ts: &mut TokenStreamIter) -> Visibility {
87 let iter_advance;
88 let vis = match &*ts.peek_ex_str("`pub` or identifier") {
89 "pub" => {
90 ts.next();
91 match ts.peek_ex("`(crate)`, `(super)`, `(in SimplePath)` or identifier") {
92 Group(g) if g.delimiter() == Delimiter::Parenthesis => {
93 iter_advance = 1;
94 Visibility::PublicIn(g.stream())
95 }
96 _ => {
97 iter_advance = 0;
98 Visibility::Public
99 }
100 }
101 }
102 _ => {
103 iter_advance = 0;
104 Visibility::Private
105 }
106 };
107 ts.advance(iter_advance);
108 vis
109 }
110}
111impl ToTokens for Visibility {
112 fn to_tokens(&self, tokens: &mut TokenStream) {
113 tokens.extend(match self {
114 Visibility::Private => quote!(),
115 Visibility::Public => quote!(pub),
116 Visibility::PublicIn(s) => quote!(pub (#s)),
117 });
118 }
119}
120
/// Attributes (`#` followed by one token, normally `[..]`) collected in front of a struct or
/// field; they are re-emitted verbatim.
struct Attributes {
    /// Flat list of the collected tokens: each `#` followed by the token that came after it.
    tokens: Vec<TokenTree>,
}
124impl Attributes {
125 fn parse(ts: &mut TokenStreamIter) -> Attributes {
126 let mut tokens = Vec::new();
127 while ts.peek_ex_str("attribute, visibility or identifier") == "#" {
128 tokens.push(ts.next_ex("`#`"));
129 tokens.push(ts.next_ex("`[..]`"));
130 }
131 Attributes { tokens }
132 }
133}
134impl ToTokens for Attributes {
135 fn to_tokens(&self, tokens: &mut TokenStream) {
136 tokens.extend(self.tokens.clone());
137 }
138}
139
/// One struct description from the macro input: `attrs vis Name { fields.. }`.
struct Struct {
    /// Attributes placed in front of the struct; re-emitted on the generated struct.
    attr: Attributes,
    /// Visibility of the generated struct.
    vis: Visibility,
    /// Token used as the struct name.
    name: TokenTree,
    /// Field descriptions found inside the braces, in source order.
    fields: Vec<Field>,
}
146impl Struct {
147 fn parse(ts: &mut TokenStreamIter) -> Struct {
148 let attr = Attributes::parse(ts);
149 let vis = Visibility::parse(ts);
150 let name = ts.next_ex("identifier");
151
152 let mut fields = Vec::new();
153 match ts.next_ex("{{..}}") {
154 Group(g) if g.delimiter() == Delimiter::Brace => {
155 let mut body_ts = g.stream().into_iter().peekable();
156 while !body_ts.is_finished() {
157 fields.push(Field::parse(&mut body_ts));
158 body_ts.expect_or_none(",");
159 }
160 }
161 tt => abort!(tt, "expected {{..}}, found `{}`", tt),
162 }
163
164 Struct {
165 attr,
166 vis,
167 name,
168 fields,
169 }
170 }
171}
172impl ToTokens for Struct {
173 fn to_tokens(&self, tokens: &mut TokenStream) {
174 let attr = &self.attr;
175 let vis = &self.vis;
176 let name = &self.name;
177
178 let field_def = self.fields.iter().map(|f| f.def_tokens());
179 let field_extract = self.fields.iter().map(|f| f.extract_tokens(&self.name));
180 let field_init = self.fields.iter().map(|f| f.init_tokens());
181
182 let _crate = CRATE.parse::<TokenStream>().unwrap();
183
184 tokens.extend(quote!(
185 #attr
186 #vis struct #name {
187 #(#field_def)*
188 }
189 impl #_crate::HtmlExtractor for #name {
190 fn extract(__elem: &#_crate::scraper::ElementRef) -> ::std::result::Result<Self, #_crate::Error> {
191 #(#field_extract)*
192 ::std::result::Result::Ok(Self {
193 #(#field_init)*
194 })
195 }
196 }
197 ));
198 }
199}
200
/// One entry of a struct body: either a single field or a parenthesized tuple of fields that
/// share one extractor.
enum Field {
    /// `attrs vis name: Type = (extractor)`.
    Single {
        field: SingleField,
        extractor: Extractor,
    },
    /// `(name1: Type1, name2: Type2, ..) = (extractor)`; the extractor must capture with a
    /// regex.
    Tuple {
        fields: Vec<SingleField>,
        extractor: Extractor,
    },
}
211impl Field {
212 fn parse(ts: &mut TokenStreamIter) -> Field {
213 match ts.peek_ex("(..), visibility or identifier") {
214 Group(g) if g.delimiter() == Delimiter::Parenthesis => {
215 let mut fields_ts = g.stream().into_iter().peekable();
217
218 let mut fields = Vec::new();
219 while !fields_ts.is_finished() {
220 fields.push(SingleField::parse(&mut fields_ts));
221 fields_ts.expect_or_none(",");
222 }
223 ts.next();
224
225 ts.expect("=");
226
227 let extractor = Extractor::parse(ts);
228
229 if extractor.capture.is_none() {
230 abort!(
231 fields[0].name,
232 "parsing to tuple fields requires capturing with regex"
233 );
234 }
235
236 Field::Tuple { fields, extractor }
237 }
238 _ => {
239 let field = SingleField::parse(ts);
241
242 ts.expect("=");
243
244 let extractor = Extractor::parse(ts);
245
246 Field::Single { field, extractor }
247 }
248 }
249 }
250
251 fn def_tokens(&self) -> TokenStream {
252 let mut ts = TokenStream::new();
253 match self {
254 Field::Single { field, .. } => {
255 let attr = &field.attr;
256 let vis = &field.vis;
257 let name = &field.name;
258 let ty = &field.ty;
259 ts.extend(quote!(
260 #attr
261 #vis #name: #(#ty)*,
262 ));
263 }
264 Field::Tuple { fields, .. } => {
265 for field in fields {
266 let attr = &field.attr;
267 let vis = &field.vis;
268 let name = &field.name;
269 let ty = &field.ty;
270 ts.extend(quote!(
271 #attr
272 #vis #name: #(#ty)*,
273 ));
274 }
275 }
276 }
277 ts
278 }
279 fn extract_tokens(&self, struct_name: &TokenTree) -> TokenStream {
280 match self {
281 Field::Single { field, extractor } => {
282 let name = &field.name;
283 let extractor_ts = extractor.to_tokens(struct_name, &field.name);
284 quote!(
285 let #name = #extractor_ts;
286 )
287 }
288 Field::Tuple { fields, extractor } => {
289 let names = fields.iter().map(|f| &f.name);
290 let extractor_ts = extractor.to_tokens(struct_name, &fields[0].name);
291 quote!(
292 let (#(#names,)*) = #extractor_ts;
293 )
294 }
295 }
296 }
297 fn init_tokens(&self) -> TokenStream {
298 match self {
299 Field::Single { field, .. } => {
300 let name = &field.name;
301 quote!(
302 #name,
303 )
304 }
305 Field::Tuple { fields, .. } => {
306 let names = fields.iter().map(|f| &f.name);
307 quote!(
308 #(#names,)*
309 )
310 }
311 }
312 }
313}
/// One `attrs vis name: Type` declaration, as it appears on its own or inside a tuple.
struct SingleField {
    /// Attributes placed in front of the field.
    attr: Attributes,
    /// Visibility of the generated field.
    vis: Visibility,
    /// Token used as the field name.
    name: TokenTree,
    /// Tokens of the field type, re-emitted verbatim after the `:`.
    ty: Vec<TokenTree>,
}
320impl SingleField {
321 fn parse(ts: &mut TokenStreamIter) -> Self {
322 let attr = Attributes::parse(ts);
323 let vis = Visibility::parse(ts);
324 let name = ts.next_ex("identifier");
325
326 ts.expect(":");
327
328 let mut ty = Vec::<TokenTree>::new();
329 while !ts.is_finished() && {
330 let peek = ts.peek_ex_str("`,` or `=`");
331 peek != "," && peek != "="
332 } {
333 ty.push(ts.next_ex(","));
334 }
335
336 Self {
337 attr,
338 vis,
339 name,
340 ty,
341 }
342 }
343}
344
/// Parsed form of the parenthesized extractor specification on the right of a field's `=`.
struct Extractor {
    /// What is extracted from the matched element (element, attribute, text node, ...).
    target: ExtractTarget,
    /// Regex literal given with `capture with ..`, if any.
    capture: Option<TokenTree>,
    /// How matched elements are gathered; `First` unless `collect` or `optional` is given.
    collector: ExtractCollector,
    /// Tokens of the parser callable given with `parse with ..`; defaults to
    /// `::std::str::FromStr::from_str`.
    parser: Vec<TokenTree>,
}
351impl Extractor {
352 fn parse(ts: &mut TokenStreamIter) -> Self {
353 let extractor_tt = ts.next_ex("`(..)`");
354 let mut extractor_ts: TokenStreamIter = match &extractor_tt {
355 Group(g) if g.delimiter() == Delimiter::Parenthesis => {
356 g.stream().into_iter().peekable()
357 }
358 tt => abort!(tt, "expect `(..)`, found `{}`", tt),
359 };
360
361 let mut target = None;
362 let mut capture = None;
363 let mut collector = ExtractCollector::First;
364 let mut parser = None;
365
366 while !extractor_ts.is_finished() {
367 match &*extractor_ts.next_ex_str(
368 "`elem`, `attr`, `text`, `inner_html`, `presence`, `capture`, `collect`, `optional` or `parse`",
369 ) {
370 "elem" => {
371 extractor_ts.expect("of");
372 let selector = extractor_ts.next_ex("literal string").clone();
373 target = Some(ExtractTarget::Element { selector });
374 }
375 "attr" => {
376 let attribute = match extractor_ts.next_ex("`[..]`") {
377 Group(g) if g.delimiter() == Delimiter::Bracket => {
378 g.stream().into_iter().peekable().next_ex("literal string")
379 }
380 tt => abort!(tt, "expected `[..]`, found {}", tt),
381 };
382 extractor_ts.expect("of");
383 let selector = extractor_ts.next_ex("literal string").clone();
384 target = Some(ExtractTarget::Attribute {
385 attribute,
386 selector,
387 });
388 }
389 "text" => {
390 let nth = match extractor_ts.next_ex("`[..]` or `of`") {
391 Group(g) if g.delimiter() == Delimiter::Bracket => {
392 extractor_ts.expect("of");
393 g.stream()
394 }
395 tt if tt.to_string() == "of" => "0".parse().unwrap(),
396 tt => abort!(tt, "expected `[..]` or `of`, found {}", tt),
397 };
398
399 let selector = extractor_ts.next_ex("literal string").clone();
400 target = Some(ExtractTarget::TextNode { nth, selector });
401 }
402 "inner_html" => {
403 extractor_ts.expect("of");
404 let selector = extractor_ts.next_ex("literal string").clone();
405 target = Some(ExtractTarget::InnerHTML { selector });
406 }
407 "presence" => {
408 extractor_ts.expect("of");
409 let selector = extractor_ts.next_ex("literal string").clone();
410 target = Some(ExtractTarget::PresenceOf { selector });
411 }
412 "capture" => {
413 extractor_ts.expect("with");
414 let regex = extractor_ts.next_ex("literal string").clone();
415 capture = Some(regex);
416 }
417 "collect" => {
418 collector = ExtractCollector::IntoIterator;
419 }
420 "optional" => {
421 collector = ExtractCollector::Option;
422 }
423 "parse" => {
424 extractor_ts.expect("with");
425 let mut parser_vec = Vec::new();
426 while !extractor_ts.is_finished() && extractor_ts.peek_ex_str(",") != "," {
427 parser_vec.push(extractor_ts.next_ex(","));
428 }
429 parser = Some(parser_vec)
430 }
431 tt => abort!(
432 tt,
433 "expected `elem`, `attr`, `text`, `capture` or `collect`, found `{}`",
434 tt
435 ),
436 }
437 extractor_ts.expect_or_none(",");
438 }
439
440 let target = match target {
441 Some(t) => t,
442 None => abort!(extractor_tt, "target is not specified"),
443 };
444
445 if let ExtractTarget::Element { .. } = &target {
446 if capture.is_some() {
447 abort!(
448 extractor_tt,
449 "`elem of ..` and `capture with ..` cannot be used for the same field"
450 );
451 }
452 } else if let ExtractTarget::PresenceOf { .. } = &target {
453 if capture.is_some() || collector != ExtractCollector::First || parser.is_some() {
454 abort!(
455 extractor_tt,
456 "`presence of ..` cannot be used with any other specifier"
457 );
458 }
459 }
460
461 Extractor {
462 target,
463 capture,
464 collector,
465 parser: parser
466 .unwrap_or_else(|| quote!(::std::str::FromStr::from_str).into_iter().collect()),
467 }
468 }
469 fn to_tokens(&self, struct_name: &TokenTree, field_name: &TokenTree) -> TokenStream {
470 let _crate = CRATE.parse::<TokenStream>().unwrap();
471
472 let selector = self.target.selector();
473 if let Err(err) = scraper::Selector::parse(&get_literal_str_value(selector)) {
474 abort!(selector, "cannot parse the selector: {:?}", err);
475 }
476
477 let mut regex_captures_len = None;
478
479 let lazy_static_ts = match &self.capture {
480 Some(regex) => {
481 match regex::Regex::new(&get_literal_str_value(regex)) {
482 Ok(regex) => regex_captures_len = Some(regex.captures_len()),
483 Err(err) => abort!(regex, "cannot parse the regex: {:?}", err),
484 };
485 quote! {
486 #_crate::lazy_static::lazy_static! {
487 static ref SELECTOR: #_crate::scraper::Selector = #_crate::scraper::Selector::parse(#selector).unwrap();
488 static ref REGEX: #_crate::regex::Regex = #_crate::regex::Regex::new(#regex).unwrap();
489 }
490 }
491 }
492 None => quote! {
493 #_crate::lazy_static::lazy_static! {
494 static ref SELECTOR: #_crate::scraper::Selector = #_crate::scraper::Selector::parse(#selector).unwrap();
495 }
496 },
497 };
498
499 let extract_data_from_elem_ts = match &self.target {
500 ExtractTarget::Element { .. } => quote! {
501 let data = target_elem;
502 },
503 ExtractTarget::Attribute { attribute, .. } => quote! {
504 let data = target_elem.value().attr(#attribute).ok_or(
505 #_crate::error::Error::InvalidInput(
506 ::std::borrow::Cow::Borrowed(::std::concat!(
507 "extracting the data of field `",
508 ::std::stringify!(#field_name),
509 "` in struct `",
510 ::std::stringify!(#struct_name),
511 "`, attribute `",
512 #attribute,
513 "` is not found"
514 ))
515 )
516 )?;
517 },
518 ExtractTarget::TextNode { nth, .. } => quote! {
519 let data_whitespace = target_elem.text().nth(#nth).ok_or(
520 #_crate::error::Error::InvalidInput(
521 ::std::borrow::Cow::Borrowed(::std::concat!(
522 "extracting the data of field `",
523 ::std::stringify!(#field_name),
524 "` in struct `",
525 ::std::stringify!(#struct_name),
526 "`, ",
527 ::std::stringify!(#nth),
528 "th text node is not found"
529 ))
530 )
531 )?;
532 let data = data_whitespace.trim();
533 },
534 ExtractTarget::InnerHTML { .. } => quote! {
535 let data_whitespace = target_elem.inner_html();
536 let data = data_whitespace.trim();
537 },
538 ExtractTarget::PresenceOf { .. } => quote! {
539 let data = presence;
540 },
541 };
542
543 let parser = &self.parser;
544 let parse_data_ts = match &self.capture {
545 Some(_) => {
546 let mut captures = Vec::new();
547 for i in 1..regex_captures_len.unwrap() {
548 captures.push(quote! {
549 (#(#parser)*)(caps.get(#i).unwrap().as_str()).or_else(|e| ::std::result::Result::Err(
550 #_crate::error::Error::InvalidInput(
551 ::std::borrow::Cow::Owned(::std::format!(::std::concat!(
552 "extracting the data of field `",
553 ::std::stringify!(#field_name),
554 "` in struct `",
555 ::std::stringify!(#struct_name),
556 "`, cannot parse for the ",
557 ::std::stringify!(#i),
558 "th field: {:#?}"
559 ), e))
560 )
561 ))?
562 });
563 }
564 quote! {
565 let caps = REGEX.captures(data).ok_or(
566 #_crate::error::Error::InvalidInput(
567 ::std::borrow::Cow::Borrowed(::std::concat!(
568 "extracting the data of field `",
569 ::std::stringify!(#field_name),
570 "` in struct `",
571 ::std::stringify!(#struct_name),
572 "`, nothing is captured with regex"
573 ))
574 )
575 )?;
576 (
577 #(#captures,)*
578 )
579 }
580 }
581 None => match &self.target {
582 ExtractTarget::Element { .. } => quote! {
583 #_crate::HtmlExtractor::extract(&data)?
584 },
585 _ => quote! {
586 (#(#parser)*)(data).or_else(|e| ::std::result::Result::Err(#_crate::error::Error::InvalidInput(
587 ::std::borrow::Cow::Owned(::std::format!(::std::concat!(
588 "extracting the data of field `",
589 ::std::stringify!(#field_name),
590 "` in struct `",
591 ::std::stringify!(#struct_name),
592 "`, cannot parse `{}`: {:#?}",
593 ), data, e))
594 )
595 ))?
596 },
597 },
598 };
599
600 let collector_ts = match &self.collector {
601 ExtractCollector::First => {
602 if let ExtractTarget::PresenceOf { .. } = &self.target {
603 quote! {
604 __elem.select(&*SELECTOR).next().is_some()
605 }
606 } else {
607 quote! {
608 let target_elem = __elem.select(&*SELECTOR).next().ok_or(
609 #_crate::error::Error::InvalidInput(
610 ::std::borrow::Cow::Borrowed(::std::concat!(
611 "extracting the data of field `",
612 ::std::stringify!(#field_name),
613 "` in struct `",
614 ::std::stringify!(#struct_name),
615 "`, no element matched the selector"
616 ))
617 )
618 )?;
619 #extract_data_from_elem_ts
620 #parse_data_ts
621 }
622 }
623 }
624 ExtractCollector::IntoIterator => {
625 quote! {
626 let mut items = ::std::vec::Vec::new();
627 for target_elem in __elem.select(&*SELECTOR) {
628 let item = {
629 #extract_data_from_elem_ts
630 #parse_data_ts
631 };
632 items.push(item);
633 }
634 items.into_iter().collect()
635 }
636 }
637 ExtractCollector::Option => {
638 quote! {
639 match __elem.select(&*SELECTOR).next() {
640 Some(target_elem) => Some({
641 #extract_data_from_elem_ts
642 #parse_data_ts
643 }),
644 None => None,
645 }
646 }
647 }
648 };
649
650 quote! {{
651 #lazy_static_ts
652 #collector_ts
653 }}
654 }
655}
/// What an extractor reads from an element matched by `selector`.
///
/// Every variant carries the selector token given after `of`.
enum ExtractTarget {
    /// `elem of ".."`: the element itself, handed to `HtmlExtractor::extract`.
    Element {
        selector: TokenTree,
    },
    /// `attr[".."] of ".."`: the value of the named attribute.
    Attribute {
        attribute: TokenTree,
        selector: TokenTree,
    },
    /// `text of ".."` / `text[n] of ".."`: the `nth` text node, trimmed.
    TextNode {
        nth: TokenStream,
        selector: TokenTree,
    },
    /// `inner_html of ".."`: the element's inner HTML, trimmed.
    InnerHTML {
        selector: TokenTree,
    },
    /// `presence of ".."`: whether any element matches the selector.
    PresenceOf {
        selector: TokenTree,
    },
}
675impl ExtractTarget {
676 fn selector(&self) -> &TokenTree {
677 match self {
678 ExtractTarget::Element { selector } => selector,
679 ExtractTarget::Attribute { selector, .. } => selector,
680 ExtractTarget::TextNode { selector, .. } => selector,
681 ExtractTarget::InnerHTML { selector } => selector,
682 ExtractTarget::PresenceOf { selector } => selector,
683 }
684 }
685}
686
/// How the elements matched by the selector are turned into the field value.
#[derive(PartialEq)]
enum ExtractCollector {
    /// Default: use the first matched element; the generated code returns an error when
    /// nothing matches.
    First,
    /// `collect`: process every matched element and `collect()` the results.
    IntoIterator,
    /// `optional`: `Some(..)` from the first matched element, `None` when nothing matches.
    Option,
}
696
697fn get_literal_str_value(tt: &TokenTree) -> String {
698 let ts = quote!(#tt);
699 let lit_str: syn::LitStr =
700 syn::parse2(ts).unwrap_or_else(|_| abort!(tt, "expected literal string, found `{}`", tt));
701 lit_str.value()
702}