1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
//! # Parses and searches tag documents (e.g. HTML, XML)
//!
//! `parsercher` parses tag-based documents such as HTML and XML.
//! * Create a tree of Dom structures from the tag document.
//! * Search for tags and text in the tree of Dom structures.
//!
//! # Usage
//! Add this to your `Cargo.toml`:
//! ```toml
//! [dependencies]
//! parsercher = "1.0.0"
//! ```
//!
//! # Examples
//! **Example of getting text from HTML.**  
//! Create a tree of Dom structures from HTML and get the text of every `li` tag whose `class` attribute value is `target`.
//! ```rust
//! use std::collections::HashMap;
//! use parsercher;
//! use parsercher::dom::Tag;
//!
//! let html = r#"
//! <!DOCTYPE html>
//! <html>
//!   <head>
//!     <meta charset="UTF-8">
//!     <title>sample html</title>
//!   </head>
//!   <body>
//!     <ol>
//!       <li class="target">first</li>
//!       <li>second</li>
//!       <li class="target">therd</li>
//!     </ol>
//!   </body>
//! </html>
//! "#;
//!
//! if let Ok(root_dom) = parsercher::parse(&html) {
//!     let mut needle = Tag::new("li".to_string());
//!     let mut attr = HashMap::new();
//!     attr.insert("class".to_string(), "target".to_string());
//!     needle.set_attr(attr);
//!
//!     if let Some(texts) = parsercher::search_text_from_tag_children(&root_dom, &needle) {
//!         assert_eq!(texts.len(), 2);
//!         assert_eq!(texts[0], "first".to_string());
//!         assert_eq!(texts[1], "therd".to_string());
//!     }
//! }
//! ```
//!
//! **A more complex example of a Dom structure tree**
//! ```rust
//! use parsercher;
//!
//! let html = r#"
//! <!DOCTYPE html>
//! <html>
//!   <head>
//!     <meta charset="UTF-8">
//!     <title>sample html</title>
//!   </head>
//!   <body>
//!     <h1>Hello, world!</h1>
//!
//!     <div id="content"></div>
//!
//!     <ol>
//!       <li>first</li>
//!       <li>second</li>
//!       <li>therd</li>
//!     </ol>
//!     <!-- All script code becomes one text -->
//! <script>
//!   let content = document.getElementById('content');
//!   content.textContent = 'content';
//! </script>
//!   </body>
//! </html>
//! "#;
//!
//! if let Ok(dom) = parsercher::parse(&html) {
//!     println!("{:#?}", dom);
//! }
//! ```
//!
//! output:
//! ```text
//! Dom {
//!     dom_type: Tag,
//!     tag: Some(
//!         Tag {
//!             name: "root",
//!             attr: None,
//!             terminated: false,
//!             terminator: false,
//!         },
//!     ),
//!     text: None,
//!     comment: None,
//!     children: Some(
//!         [
//!             Dom {
//!                 dom_type: Tag,
//!                 tag: Some(
//!                     Tag {
//!                         name: "!DOCTYPE",
//!                         attr: Some(
//!                             {
//!                                 "html": "",
//!                             },
//!                         ),
//!                         terminated: false,
//!                         terminator: false,
//!                     },
//!                 ),
//!                 text: None,
//!                 comment: None,
//!                 children: None,
//!             },
//!             Dom {
//!                 dom_type: Tag,
//!                 tag: Some(
//!                     Tag {
//!                         name: "html",
//!                         attr: None,
//!                         terminated: false,
//!                         terminator: false,
//!                     },
//!                 ),
//!                 text: None,
//!                 comment: None,
//!                 children: Some(
//!                     [
//!                         Dom {
//!                             dom_type: Tag,
//!                             tag: Some(
//!                                 Tag {
//!                                     name: "head",
//!                                     attr: None,
//!                                     terminated: false,
//!                                     terminator: false,
//!                                 },
//!                             ),
//!                             text: None,
//!                             comment: None,
//!                             children: Some(
//!                                 [
//!                                     Dom {
//!                                         dom_type: Tag,
//!                                         tag: Some(
//!                                             Tag {
//!                                                 name: "meta",
//!                                                 attr: Some(
//!                                                     {
//!                                                         "charset": "UTF-8",
//!                                                     },
//!                                                 ),
//!                                                 terminated: false,
//!                                                 terminator: false,
//!                                             },
//!                                         ),
//!                                         text: None,
//!                                         comment: None,
//!                                         children: None,
//!                                     },
//!                                     Dom {
//!                                         dom_type: Tag,
//!                                         tag: Some(
//!                                             Tag {
//!                                                 name: "title",
//!                                                 attr: None,
//!                                                 terminated: false,
//!                                                 terminator: false,
//!                                             },
//!                                         ),
//!                                         text: None,
//!                                         comment: None,
//!                                         children: Some(
//!                                             [
//!                                                 Dom {
//!                                                     dom_type: Text,
//!                                                     tag: None,
//!                                                     text: Some(
//!                                                         Text {
//!                                                             text: "sample html",
//!                                                         },
//!                                                     ),
//!                                                     comment: None,
//!                                                     children: None,
//!                                                 },
//!                                             ],
//!                                         ),
//!                                     },
//!                                 ],
//!                             ),
//!                         },
//!                         Dom {
//!                             dom_type: Tag,
//!                             tag: Some(
//!                                 Tag {
//!                                     name: "body",
//!                                     attr: None,
//!                                     terminated: false,
//!                                     terminator: false,
//!                                 },
//!                             ),
//!                             text: None,
//!                             comment: None,
//!                             children: Some(
//!                                 [
//!                                     Dom {
//!                                         dom_type: Tag,
//!                                         tag: Some(
//!                                             Tag {
//!                                                 name: "h1",
//!                                                 attr: None,
//!                                                 terminated: false,
//!                                                 terminator: false,
//!                                             },
//!                                         ),
//!                                         text: None,
//!                                         comment: None,
//!                                         children: Some(
//!                                             [
//!                                                 Dom {
//!                                                     dom_type: Text,
//!                                                     tag: None,
//!                                                     text: Some(
//!                                                         Text {
//!                                                             text: "Hello, world!",
//!                                                         },
//!                                                     ),
//!                                                     comment: None,
//!                                                     children: None,
//!                                                 },
//!                                             ],
//!                                         ),
//!                                     },
//!                                     Dom {
//!                                         dom_type: Tag,
//!                                         tag: Some(
//!                                             Tag {
//!                                                 name: "div",
//!                                                 attr: Some(
//!                                                     {
//!                                                         "id": "content",
//!                                                     },
//!                                                 ),
//!                                                 terminated: false,
//!                                                 terminator: false,
//!                                             },
//!                                         ),
//!                                         text: None,
//!                                         comment: None,
//!                                         children: None,
//!                                     },
//!                                     Dom {
//!                                         dom_type: Tag,
//!                                         tag: Some(
//!                                             Tag {
//!                                                 name: "ol",
//!                                                 attr: None,
//!                                                 terminated: false,
//!                                                 terminator: false,
//!                                             },
//!                                         ),
//!                                         text: None,
//!                                         comment: None,
//!                                         children: Some(
//!                                             [
//!                                                 Dom {
//!                                                     dom_type: Tag,
//!                                                     tag: Some(
//!                                                         Tag {
//!                                                             name: "li",
//!                                                             attr: None,
//!                                                             terminated: false,
//!                                                             terminator: false,
//!                                                         },
//!                                                     ),
//!                                                     text: None,
//!                                                     comment: None,
//!                                                     children: Some(
//!                                                         [
//!                                                             Dom {
//!                                                                 dom_type: Text,
//!                                                                 tag: None,
//!                                                                 text: Some(
//!                                                                     Text {
//!                                                                         text: "first",
//!                                                                     },
//!                                                                 ),
//!                                                                 comment: None,
//!                                                                 children: None,
//!                                                             },
//!                                                         ],
//!                                                     ),
//!                                                 },
//!                                                 Dom {
//!                                                     dom_type: Tag,
//!                                                     tag: Some(
//!                                                         Tag {
//!                                                             name: "li",
//!                                                             attr: None,
//!                                                             terminated: false,
//!                                                             terminator: false,
//!                                                         },
//!                                                     ),
//!                                                     text: None,
//!                                                     comment: None,
//!                                                     children: Some(
//!                                                         [
//!                                                             Dom {
//!                                                                 dom_type: Text,
//!                                                                 tag: None,
//!                                                                 text: Some(
//!                                                                     Text {
//!                                                                         text: "second",
//!                                                                     },
//!                                                                 ),
//!                                                                 comment: None,
//!                                                                 children: None,
//!                                                             },
//!                                                         ],
//!                                                     ),
//!                                                 },
//!                                                 Dom {
//!                                                     dom_type: Tag,
//!                                                     tag: Some(
//!                                                         Tag {
//!                                                             name: "li",
//!                                                             attr: None,
//!                                                             terminated: false,
//!                                                             terminator: false,
//!                                                         },
//!                                                     ),
//!                                                     text: None,
//!                                                     comment: None,
//!                                                     children: Some(
//!                                                         [
//!                                                             Dom {
//!                                                                 dom_type: Text,
//!                                                                 tag: None,
//!                                                                 text: Some(
//!                                                                     Text {
//!                                                                         text: "therd",
//!                                                                     },
//!                                                                 ),
//!                                                                 comment: None,
//!                                                                 children: None,
//!                                                             },
//!                                                         ],
//!                                                     ),
//!                                                 },
//!                                             ],
//!                                         ),
//!                                     },
//!                                     Dom {
//!                                         dom_type: Comment,
//!                                         tag: None,
//!                                         text: None,
//!                                         comment: Some(
//!                                             Comment {
//!                                                 comment: " All script code becomes one text ",
//!                                             },
//!                                         ),
//!                                         children: None,
//!                                     },
//!                                     Dom {
//!                                         dom_type: Tag,
//!                                         tag: Some(
//!                                             Tag {
//!                                                 name: "script",
//!                                                 attr: None,
//!                                                 terminated: false,
//!                                                 terminator: false,
//!                                             },
//!                                         ),
//!                                         text: None,
//!                                         comment: None,
//!                                         children: Some(
//!                                             [
//!                                                 Dom {
//!                                                     dom_type: Text,
//!                                                     tag: None,
//!                                                     text: Some(
//!                                                         Text {
//!                                                             text: "\n  let content = document.getElementById(\'content\');\n  content.textContent = \'content\';\n",
//!                                                         },
//!                                                     ),
//!                                                     comment: None,
//!                                                     children: None,
//!                                                 },
//!                                             ],
//!                                         ),
//!                                     },
//!                                 ],
//!                             ),
//!                         },
//!                     ],
//!                 ),
//!             },
//!         ],
//!     ),
//! }
//! ```
//!

// Public module; the crate-level example imports `Tag` from it
// (`parsercher::dom::Tag`). Presumably it also holds the other Dom node
// types shown in the example output -- confirm against `dom`.
pub mod dom;
// Private implementation modules. Only the items re-exported below are
// reachable from outside the crate.
mod parser;
mod searcher;

// Re-exported from `parser`, so callers can write `parsercher::parse(..)`
// as the crate-level examples do.
pub use parser::parse;
pub use parser::print_dom_tree;

// Re-exported from `searcher`, so callers can write e.g.
// `parsercher::search_text_from_tag_children(..)` as in the first example.
pub use searcher::satisfy_sufficient_condition;
pub use searcher::search_tag;
pub use searcher::search_tag_from_name;
pub use searcher::search_text_from_tag_children;