// web_scraper/lib.rs

pub mod site;
use regex::Regex;

use std::fmt::{self, Display, Formatter};

// region:  ---TagNames

// Bare, lowercase HTML tag names (no angle brackets). Each constant below is
// wrapped by the `HtmlTag` associated constant of the same name.

const A: &str = "a";
const ABBR: &str = "abbr";
const ACRONYM: &str = "acronym";
const ADDRESS: &str = "address";
const APPLET: &str = "applet";
const AREA: &str = "area";
const ARTICLE: &str = "article";
const ASIDE: &str = "aside";
const AUDIO: &str = "audio";
const B: &str = "b";
const BASE: &str = "base";
const BASEFONT: &str = "basefont";
const BDI: &str = "bdi";
const BDO: &str = "bdo";
const BIG: &str = "big";
const BLOCKQUOTE: &str = "blockquote";
const BODY: &str = "body";
const BR: &str = "br";
const BUTTON: &str = "button";
const CANVAS: &str = "canvas";
const CAPTION: &str = "caption";
const CENTER: &str = "center";
const CITE: &str = "cite";
const CODE: &str = "code";
const COL: &str = "col";
const COLGROUP: &str = "colgroup";
const DATALIST: &str = "datalist";
const DD: &str = "dd";
const DEL: &str = "del";
const DETAILS: &str = "details";
const DFN: &str = "dfn";
const DIR: &str = "dir";
const DIV: &str = "div";
const DL: &str = "dl";
const DT: &str = "dt";
const EM: &str = "em";
const EMBED: &str = "embed";
const FIELDSET: &str = "fieldset";
const FIGCAPTION: &str = "figcaption";
const FIGURE: &str = "figure";
const FONT: &str = "font";
const FOOTER: &str = "footer";
const FORM: &str = "form";
const FRAME: &str = "frame";
const FRAMESET: &str = "frameset";
const H1: &str = "h1";
const H2: &str = "h2";
const H3: &str = "h3";
const H4: &str = "h4";
const H5: &str = "h5";
const H6: &str = "h6";
const HEAD: &str = "head";
const HEADER: &str = "header";
const HR: &str = "hr";
const HTML: &str = "html";
const I: &str = "i";
const IFRAME: &str = "iframe";
const IMG: &str = "img";
const INPUT: &str = "input";
const INS: &str = "ins";
const KBD: &str = "kbd";
const KEYGEN: &str = "keygen";
const LABEL: &str = "label";
const LEGEND: &str = "legend";
const LI: &str = "li";
const LINK: &str = "link";
const MAIN: &str = "main";
const MAP: &str = "map";
const MARK: &str = "mark";
const MENU: &str = "menu";
const MENUITEM: &str = "menuitem";
const META: &str = "meta";
const METER: &str = "meter";
const NAV: &str = "nav";
const NOFRAMES: &str = "noframes";
const NOSCRIPT: &str = "noscript";
const OBJECT: &str = "object";
const OL: &str = "ol";
const OPTGROUP: &str = "optgroup";
const OPTION: &str = "option";
const OUTPUT: &str = "output";
const P: &str = "p";
const PARAM: &str = "param";
const PRE: &str = "pre";
const PROGRESS: &str = "progress";
const Q: &str = "q";
const S: &str = "s";
const SAMP: &str = "samp";
const SCRIPT: &str = "script";
const SECTION: &str = "section";
const SELECT: &str = "select";
const SMALL: &str = "small";
const SOURCE: &str = "source";
const SPAN: &str = "span";
const STRIKE: &str = "strike";
const STRONG: &str = "strong";
const STYLE: &str = "style";
const SUB: &str = "sub";
const SUMMARY: &str = "summary";
const SUP: &str = "sup";
const TABLE: &str = "table";
const TBODY: &str = "tbody";
const TD: &str = "td";
const TEXTAREA: &str = "textarea";
const TFOOT: &str = "tfoot";
const TH: &str = "th";
const THEAD: &str = "thead";
const TIME: &str = "time";
const TITLE: &str = "title";
const TR: &str = "tr";
const TRACK: &str = "track";
const TT: &str = "tt";
const U: &str = "u";
const UL: &str = "ul";
const VAR: &str = "var";
const VIDEO: &str = "video";

// endregion:  ---TagNames

// region:  ---Make html tag struct

/// A handle for one HTML tag name, used to get the content of the elements
/// that carry that tag.
///
/// The wrapped string is the bare tag name without angle brackets
/// (for example `"div"`); [`Display`] prints exactly that string.
///
/// # Example
/// ```ignore
/// let tag = HtmlTag::HTML;
/// // Parse the <html> tags and collect the results into a vector of strings
/// let new_vector = tag.parse_tags(&buf);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct HtmlTag(pub &'static str);

impl Display for HtmlTag {
    /// Writes the bare tag name, with no angle brackets or other decoration.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        f.write_str(self.0)
    }
}
142impl HtmlTag {
143    ///   This represents the html tag: a
144    pub const A: HtmlTag = HtmlTag(A);
145    ///   This represents the html tag:  abbr  
146    pub const ABBR: HtmlTag = HtmlTag(ABBR);
147    ///   This represents the html tag:  acronym  
148    pub const ACRONYM: HtmlTag = HtmlTag(ACRONYM);
149    ///   This represents the html tag:  address  
150    pub const ADDRESS: HtmlTag = HtmlTag(ADDRESS);
151    ///   This represents the html tag:  applet  
152    pub const APPLET: HtmlTag = HtmlTag(APPLET);
153    ///   This represents the html tag:  area  
154    pub const AREA: HtmlTag = HtmlTag(AREA);
155    ///   This represents the html tag:  article  
156    pub const ARTICLE: HtmlTag = HtmlTag(ARTICLE);
157    ///   This represents the html tag:  aside  
158    pub const ASIDE: HtmlTag = HtmlTag(ASIDE);
159    ///   This represents the html tag:  audio  
160    pub const AUDIO: HtmlTag = HtmlTag(AUDIO);
161    ///   This represents the html tag:  b  
162    pub const B: HtmlTag = HtmlTag(B);
163    ///   This represents the html tag:  base  
164    pub const BASE: HtmlTag = HtmlTag(BASE);
165    ///   This represents the html tag:  basefont  
166    pub const BASEFONT: HtmlTag = HtmlTag(BASEFONT);
167    ///   This represents the html tag:  bdi  
168    pub const BDI: HtmlTag = HtmlTag(BDI);
169    ///   This represents the html tag:  bdo  
170    pub const BDO: HtmlTag = HtmlTag(BDO);
171    ///   This represents the html tag:  big  
172    pub const BIG: HtmlTag = HtmlTag(BIG);
173    ///   This represents the html tag:  blockquote  
174    pub const BLOCKQUOTE: HtmlTag = HtmlTag(BLOCKQUOTE);
175    ///   This represents the html tag:  body  
176    pub const BODY: HtmlTag = HtmlTag(BODY);
177    ///   This represents the html tag:  br  
178    pub const BR: HtmlTag = HtmlTag(BR);
179    ///   This represents the html tag:  button  
180    pub const BUTTON: HtmlTag = HtmlTag(BUTTON);
181    ///   This represents the html tag:  canvas  
182    pub const CANVAS: HtmlTag = HtmlTag(CANVAS);
183    ///   This represents the html tag:  caption  
184    pub const CAPTION: HtmlTag = HtmlTag(CAPTION);
185    ///   This represents the html tag:  center  
186    pub const CENTER: HtmlTag = HtmlTag(CENTER);
187    ///   This represents the html tag:  cite  
188    pub const CITE: HtmlTag = HtmlTag(CITE);
189    ///   This represents the html tag:  code  
190    pub const CODE: HtmlTag = HtmlTag(CODE);
191    ///   This represents the html tag:  col  
192    pub const COL: HtmlTag = HtmlTag(COL);
193    ///   This represents the html tag:  colgroup  
194    pub const COLGROUP: HtmlTag = HtmlTag(COLGROUP);
195    ///   This represents the html tag:  datalist  
196    pub const DATALIST: HtmlTag = HtmlTag(DATALIST);
197    ///   This represents the html tag:  dd  
198    pub const DD: HtmlTag = HtmlTag(DD);
199    ///   This represents the html tag:  del  
200    pub const DEL: HtmlTag = HtmlTag(DEL);
201    ///   This represents the html tag:  details  
202    pub const DETAILS: HtmlTag = HtmlTag(DETAILS);
203    ///   This represents the html tag:  dfn  
204    pub const DFN: HtmlTag = HtmlTag(DFN);
205    ///   This represents the html tag:  dir  
206    pub const DIR: HtmlTag = HtmlTag(DIR);
207    ///   This represents the html tag:  div  
208    pub const DIV: HtmlTag = HtmlTag(DIV);
209    ///   This represents the html tag:  dl  
210    pub const DL: HtmlTag = HtmlTag(DL);
211    ///   This represents the html tag:  dt  
212    pub const DT: HtmlTag = HtmlTag(DT);
213    ///   This represents the html tag:  em  
214    pub const EM: HtmlTag = HtmlTag(EM);
215    ///   This represents the html tag:  embed  
216    pub const EMBED: HtmlTag = HtmlTag(EMBED);
217    ///   This represents the html tag:  fieldset  
218    pub const FIELDSET: HtmlTag = HtmlTag(FIELDSET);
219    ///   This represents the html tag:  figcaption
220    pub const FIGCAPTION: HtmlTag = HtmlTag(FIGCAPTION);
221    ///   This represents the html tag:  figure
222    pub const FIGURE: HtmlTag = HtmlTag(FIGURE);
223    ///   This represents the html tag:  font
224    pub const FONT: HtmlTag = HtmlTag(FONT);
225    ///   This represents the html tag:  footer
226    pub const FOOTER: HtmlTag = HtmlTag(FOOTER);
227    ///   This represents the html tag:  form
228    pub const FORM: HtmlTag = HtmlTag(FORM);
229    ///   This represents the html tag:  frame
230    pub const FRAME: HtmlTag = HtmlTag(FRAME);
231    ///   This represents the html tag:  frameset
232    pub const FRAMESET: HtmlTag = HtmlTag(FRAMESET);
233    ///   This represents the html tag:  h1
234    pub const H1: HtmlTag = HtmlTag(H1);
235    ///   This represents the html tag:  h2
236    pub const H2: HtmlTag = HtmlTag(H2);
237    ///   This represents the html tag:  h3
238    pub const H3: HtmlTag = HtmlTag(H3);
239    ///   This represents the html tag:  h4
240    pub const H4: HtmlTag = HtmlTag(H4);
241    ///   This represents the html tag:  h5
242    pub const H5: HtmlTag = HtmlTag(H5);
243    ///   This represents the html tag:  h6
244    pub const H6: HtmlTag = HtmlTag(H6);
245    ///   This represents the html tag:  head
246    pub const HEAD: HtmlTag = HtmlTag(HEAD);
247    ///   This represents the html tag:  header
248    pub const HEADER: HtmlTag = HtmlTag(HEADER);
249    ///   This represents the html tag:   hr
250    pub const HR: HtmlTag = HtmlTag(HR);
251    ///   This represents the html tag:  html
252    pub const HTML: HtmlTag = HtmlTag(HTML);
253    ///   This represents the html tag:  i
254    pub const I: HtmlTag = HtmlTag(I);
255    ///   This represents the html tag:  iframe
256    pub const IFRAME: HtmlTag = HtmlTag(IFRAME);
257    ///   This represents the html tag:  img
258    pub const IMG: HtmlTag = HtmlTag(IMG);
259    ///   This represents the html tag:  input
260    pub const INPUT: HtmlTag = HtmlTag(INPUT);
261    ///   This represents the html tag:  ins
262    pub const INS: HtmlTag = HtmlTag(INS);
263    ///   This represents the html tag:  kbd
264    pub const KBD: HtmlTag = HtmlTag(KBD);
265    ///   This represents the html tag:  keygen
266    pub const KEYGEN: HtmlTag = HtmlTag(KEYGEN);
267    ///   This represents the html tag:  label
268    pub const LABEL: HtmlTag = HtmlTag(LABEL);
269    ///   This represents the html tag:  legend
270    pub const LEGEND: HtmlTag = HtmlTag(LEGEND);
271    ///   This represents the html tag:  li
272    pub const LI: HtmlTag = HtmlTag(LI);
273    ///   This represents the html tag:  link
274    pub const LINK: HtmlTag = HtmlTag(LINK);
275    ///   This represents the html tag:  main
276    pub const MAIN: HtmlTag = HtmlTag(MAIN);
277    ///   This represents the html tag:  map
278    pub const MAP: HtmlTag = HtmlTag(MAP);
279    ///   This represents the html tag:  mark
280    pub const MARK: HtmlTag = HtmlTag(MARK);
281    ///   This represents the html tag:  menu
282    pub const MENU: HtmlTag = HtmlTag(MENU);
283    ///   This represents the html tag:  menuitem
284    pub const MENUITEM: HtmlTag = HtmlTag(MENUITEM);
285    ///   This represents the html tag:  meta
286    pub const META: HtmlTag = HtmlTag(META);
287    ///   This represents the html tag:  meter
288    pub const METER: HtmlTag = HtmlTag(METER);
289    ///   This represents the html tag:  nav
290    pub const NAV: HtmlTag = HtmlTag(NAV);
291    ///   This represents the html tag:  noframes
292    pub const NOFRAMES: HtmlTag = HtmlTag(NOFRAMES);
293    ///   This represents the html tag:  noscript
294    pub const NOSCRIPT: HtmlTag = HtmlTag(NOSCRIPT);
295    ///   This represents the html tag:  object
296    pub const OBJECT: HtmlTag = HtmlTag(OBJECT);
297    ///   This represents the html tag:  ol
298    pub const OL: HtmlTag = HtmlTag(OL);
299    ///   This represents the html tag:  optgroup
300    pub const OPTGROUP: HtmlTag = HtmlTag(OPTGROUP);
301    ///   This represents the html tag:  option
302    pub const OPTION: HtmlTag = HtmlTag(OPTION);
303    ///   This represents the html tag:  output
304    pub const OUTPUT: HtmlTag = HtmlTag(OUTPUT);
305    ///   This represents the html tag:  p
306    pub const P: HtmlTag = HtmlTag(P);
307    ///   This represents the html tag:  param
308    pub const PARAM: HtmlTag = HtmlTag(PARAM);
309    ///   This represents the html tag:  pre
310    pub const PRE: HtmlTag = HtmlTag(PRE);
311    ///   This represents the html tag:  progress
312    pub const PROGRESS: HtmlTag = HtmlTag(PROGRESS);
313    ///   This represents the html tag:  q
314    pub const Q: HtmlTag = HtmlTag(Q);
315    ///   This represents the html tag:  s
316    pub const S: HtmlTag = HtmlTag(S);
317    ///   This represents the html tag:  samp
318    pub const SAMP: HtmlTag = HtmlTag(SAMP);
319    ///   This represents the html tag:  script
320    pub const SCRIPT: HtmlTag = HtmlTag(SCRIPT);
321    ///   This represents the html tag:  section
322    pub const SECTION: HtmlTag = HtmlTag(SECTION);
323    ///   This represents the html tag:  select
324    pub const SELECT: HtmlTag = HtmlTag(SELECT);
325    ///   This represents the html tag:  small
326    pub const SMALL: HtmlTag = HtmlTag(SMALL);
327    ///   This represents the html tag:  source
328    pub const SOURCE: HtmlTag = HtmlTag(SOURCE);
329    ///   This represents the html tag:  span
330    pub const SPAN: HtmlTag = HtmlTag(SPAN);
331    ///   This represents the html tag:  strike
332    pub const STRIKE: HtmlTag = HtmlTag(STRIKE);
333    ///   This represents the html tag:  strong
334    pub const STRONG: HtmlTag = HtmlTag(STRONG);
335    ///   This represents the html tag:  style
336    pub const STYLE: HtmlTag = HtmlTag(STYLE);
337    ///   This represents the html tag:  sub
338    pub const SUB: HtmlTag = HtmlTag(SUB);
339    ///   This represents the html tag:  summary
340    pub const SUMMARY: HtmlTag = HtmlTag(SUMMARY);
341    ///   This represents the html tag:  sup
342    pub const SUP: HtmlTag = HtmlTag(SUP);
343    ///   This represents the html tag:  table
344    pub const TABLE: HtmlTag = HtmlTag(TABLE);
345    ///   This represents the html tag:  tbody
346    pub const TBODY: HtmlTag = HtmlTag(TBODY);
347    ///   This represents the html tag:  td
348    pub const TD: HtmlTag = HtmlTag(TD);
349    ///   This represents the html tag:  textarea
350    pub const TEXTAREA: HtmlTag = HtmlTag(TEXTAREA);
351    ///   This represents the html tag:  tfoot
352    pub const TFOOT: HtmlTag = HtmlTag(TFOOT);
353    ///   This represents the html tag:  th
354    pub const TH: HtmlTag = HtmlTag(TH);
355    ///   This represents the html tag:  thead
356    pub const THEAD: HtmlTag = HtmlTag(THEAD);
357    ///   This represents the html tag:  time
358    pub const TIME: HtmlTag = HtmlTag(TIME);
359    ///   This represents the html tag:  title
360    pub const TITLE: HtmlTag = HtmlTag(TITLE);
361    ///   This represents the html tag:  tr
362    pub const TR: HtmlTag = HtmlTag(TR);
363    ///   This represents the html tag:  track
364    pub const TRACK: HtmlTag = HtmlTag(TRACK);
365    ///   This represents the html tag:  tt
366    pub const TT: HtmlTag = HtmlTag(TT);
367    ///   This represents the html tag:  u
368    pub const U: HtmlTag = HtmlTag(U);
369    ///   This represents the html tag:  ul
370    pub const UL: HtmlTag = HtmlTag(UL);
371    ///   This represents the html tag:  var
372    pub const VAR: HtmlTag = HtmlTag(VAR);
373    ///   This represents the html tag:  video
374    pub const VIDEO: HtmlTag = HtmlTag(VIDEO);
375    ///  This gets the content inside the specified tag
376    pub fn parse_tags(&self, input: &str) -> Vec<String> {
377        let re = Regex::new(&format!(r#"(?s)<{}[^>]*>(.*?)</{}>"#, &self, &self)).unwrap();
378
379        let mut result = Vec::new();
380
381        for capture in re.captures_iter(input) {
382            if let Some(text) = capture.get(1) {
383                result.push(text.as_str().trim().to_string());
384            }
385        }
386
387        result
388    }
389}
390
391// endregion:  ---Make html tag struct