1use scraper::{Html, ElementRef, Selector};
12use serde::{Deserialize, Serialize};
13
14use crate::types::ParserResult;
15
16#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
22pub struct Form {
23 pub id: Option<String>,
25 pub name: Option<String>,
27 pub action: Option<String>,
29 pub method: FormMethod,
31 pub enctype: Option<String>,
33 pub fields: Vec<FormField>,
35 pub form_type: FormType,
37 pub has_csrf: bool,
39 pub has_captcha: bool,
41 pub submit_text: Option<String>,
43}
44
45impl Form {
46 pub fn new() -> Self {
48 Self {
49 id: None,
50 name: None,
51 action: None,
52 method: FormMethod::Get,
53 enctype: None,
54 fields: Vec::new(),
55 form_type: FormType::Unknown,
56 has_csrf: false,
57 has_captcha: false,
58 submit_text: None,
59 }
60 }
61
62 pub fn is_login(&self) -> bool {
64 matches!(self.form_type, FormType::Login)
65 }
66
67 pub fn is_search(&self) -> bool {
69 matches!(self.form_type, FormType::Search)
70 }
71
72 pub fn has_file_upload(&self) -> bool {
74 self.fields.iter().any(|f| f.field_type == FieldType::File)
75 }
76
77 pub fn required_fields(&self) -> Vec<&FormField> {
79 self.fields.iter().filter(|f| f.required).collect()
80 }
81
82 pub fn get_field(&self, name: &str) -> Option<&FormField> {
84 self.fields.iter().find(|f| f.name.as_deref() == Some(name))
85 }
86}
87
88impl Default for Form {
89 fn default() -> Self {
90 Self::new()
91 }
92}
93
94#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
96#[serde(rename_all = "lowercase")]
97pub enum FormMethod {
98 #[default]
99 Get,
100 Post,
101 Dialog,
102}
103
104impl From<&str> for FormMethod {
105 fn from(s: &str) -> Self {
106 match s.to_lowercase().as_str() {
107 "post" => FormMethod::Post,
108 "dialog" => FormMethod::Dialog,
109 _ => FormMethod::Get,
110 }
111 }
112}
113
114#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
116pub enum FormType {
117 Login,
119 Registration,
121 Search,
123 Contact,
125 Newsletter,
127 PasswordReset,
129 Checkout,
131 Comment,
133 Upload,
135 #[default]
137 Unknown,
138}
139
140#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
142pub struct FormField {
143 pub name: Option<String>,
145 pub id: Option<String>,
147 pub field_type: FieldType,
149 pub label: Option<String>,
151 pub placeholder: Option<String>,
153 pub value: Option<String>,
155 pub required: bool,
157 pub disabled: bool,
159 pub readonly: bool,
161 pub autocomplete: Option<String>,
163 pub pattern: Option<String>,
165 pub min_length: Option<u32>,
167 pub max_length: Option<u32>,
169 pub options: Vec<SelectOption>,
171}
172
173impl FormField {
174 pub fn new(field_type: FieldType) -> Self {
175 Self {
176 name: None,
177 id: None,
178 field_type,
179 label: None,
180 placeholder: None,
181 value: None,
182 required: false,
183 disabled: false,
184 readonly: false,
185 autocomplete: None,
186 pattern: None,
187 min_length: None,
188 max_length: None,
189 options: Vec::new(),
190 }
191 }
192
193 pub fn is_password(&self) -> bool {
195 matches!(self.field_type, FieldType::Password)
196 }
197
198 pub fn is_email(&self) -> bool {
200 matches!(self.field_type, FieldType::Email) ||
201 self.name.as_ref().map(|n| n.to_lowercase().contains("email")).unwrap_or(false) ||
202 self.autocomplete.as_ref().map(|a| a.contains("email")).unwrap_or(false)
203 }
204
205 pub fn is_username(&self) -> bool {
207 let name_lower = self.name.as_ref().map(|n| n.to_lowercase()).unwrap_or_default();
208 let id_lower = self.id.as_ref().map(|n| n.to_lowercase()).unwrap_or_default();
209
210 name_lower.contains("user") || name_lower.contains("login") ||
211 id_lower.contains("user") || id_lower.contains("login") ||
212 self.autocomplete.as_ref().map(|a| a.contains("username")).unwrap_or(false)
213 }
214}
215
216#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
218#[serde(rename_all = "lowercase")]
219pub enum FieldType {
220 #[default]
221 Text,
222 Password,
223 Email,
224 Tel,
225 Url,
226 Number,
227 Search,
228 Date,
229 DateTime,
230 Time,
231 Month,
232 Week,
233 Color,
234 Range,
235 File,
236 Hidden,
237 Checkbox,
238 Radio,
239 Select,
240 Textarea,
241 Submit,
242 Button,
243 Reset,
244 Image,
245}
246
247impl From<&str> for FieldType {
248 fn from(s: &str) -> Self {
249 match s.to_lowercase().as_str() {
250 "password" => FieldType::Password,
251 "email" => FieldType::Email,
252 "tel" | "telephone" | "phone" => FieldType::Tel,
253 "url" => FieldType::Url,
254 "number" => FieldType::Number,
255 "search" => FieldType::Search,
256 "date" => FieldType::Date,
257 "datetime" | "datetime-local" => FieldType::DateTime,
258 "time" => FieldType::Time,
259 "month" => FieldType::Month,
260 "week" => FieldType::Week,
261 "color" => FieldType::Color,
262 "range" => FieldType::Range,
263 "file" => FieldType::File,
264 "hidden" => FieldType::Hidden,
265 "checkbox" => FieldType::Checkbox,
266 "radio" => FieldType::Radio,
267 "select" | "select-one" | "select-multiple" => FieldType::Select,
268 "textarea" => FieldType::Textarea,
269 "submit" => FieldType::Submit,
270 "button" => FieldType::Button,
271 "reset" => FieldType::Reset,
272 "image" => FieldType::Image,
273 _ => FieldType::Text,
274 }
275 }
276}
277
278#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
280pub struct SelectOption {
281 pub value: String,
282 pub text: String,
283 pub selected: bool,
284 pub disabled: bool,
285}
286
287pub fn extract_forms(document: &Html) -> ParserResult<Vec<Form>> {
293 let form_selector = Selector::parse("form").unwrap();
294 let mut forms = Vec::new();
295
296 for form_el in document.select(&form_selector) {
297 if let Some(form) = extract_form(&form_el) {
298 forms.push(form);
299 }
300 }
301
302 Ok(forms)
303}
304
305fn extract_form(element: &ElementRef) -> Option<Form> {
307 let mut form = Form::new();
308
309 form.id = element.value().attr("id").map(|s| s.to_string());
311 form.name = element.value().attr("name").map(|s| s.to_string());
312 form.action = element.value().attr("action").map(|s| s.to_string());
313 form.method = element.value().attr("method")
314 .map(FormMethod::from)
315 .unwrap_or_default();
316 form.enctype = element.value().attr("enctype").map(|s| s.to_string());
317
318 form.fields = extract_form_fields(element);
320
321 form.form_type = detect_form_type(&form);
323
324 form.has_csrf = detect_csrf_token(&form);
326
327 form.has_captcha = detect_captcha(element);
329
330 form.submit_text = extract_submit_text(element);
332
333 Some(form)
334}
335
336fn extract_form_fields(form: &ElementRef) -> Vec<FormField> {
338 let mut fields = Vec::new();
339
340 let input_sel = Selector::parse("input").unwrap();
342 for input in form.select(&input_sel) {
343 if let Some(field) = extract_input_field(&input) {
344 fields.push(field);
345 }
346 }
347
348 let select_sel = Selector::parse("select").unwrap();
350 for select in form.select(&select_sel) {
351 if let Some(field) = extract_select_field(&select) {
352 fields.push(field);
353 }
354 }
355
356 let textarea_sel = Selector::parse("textarea").unwrap();
358 for textarea in form.select(&textarea_sel) {
359 if let Some(field) = extract_textarea_field(&textarea) {
360 fields.push(field);
361 }
362 }
363
364 associate_labels(form, &mut fields);
366
367 fields
368}
369
370fn extract_input_field(element: &ElementRef) -> Option<FormField> {
372 let input_type = element.value().attr("type").unwrap_or("text");
373 let mut field = FormField::new(FieldType::from(input_type));
374
375 field.name = element.value().attr("name").map(|s| s.to_string());
376 field.id = element.value().attr("id").map(|s| s.to_string());
377 field.placeholder = element.value().attr("placeholder").map(|s| s.to_string());
378 field.value = element.value().attr("value").map(|s| s.to_string());
379 field.required = element.value().attr("required").is_some();
380 field.disabled = element.value().attr("disabled").is_some();
381 field.readonly = element.value().attr("readonly").is_some();
382 field.autocomplete = element.value().attr("autocomplete").map(|s| s.to_string());
383 field.pattern = element.value().attr("pattern").map(|s| s.to_string());
384 field.min_length = element.value().attr("minlength").and_then(|s| s.parse().ok());
385 field.max_length = element.value().attr("maxlength").and_then(|s| s.parse().ok());
386
387 Some(field)
388}
389
390fn extract_select_field(element: &ElementRef) -> Option<FormField> {
392 let mut field = FormField::new(FieldType::Select);
393
394 field.name = element.value().attr("name").map(|s| s.to_string());
395 field.id = element.value().attr("id").map(|s| s.to_string());
396 field.required = element.value().attr("required").is_some();
397 field.disabled = element.value().attr("disabled").is_some();
398
399 let option_sel = Selector::parse("option").unwrap();
401 for option in element.select(&option_sel) {
402 let opt = SelectOption {
403 value: option.value().attr("value")
404 .unwrap_or("")
405 .to_string(),
406 text: option.text().collect::<String>().trim().to_string(),
407 selected: option.value().attr("selected").is_some(),
408 disabled: option.value().attr("disabled").is_some(),
409 };
410 field.options.push(opt);
411 }
412
413 Some(field)
414}
415
416fn extract_textarea_field(element: &ElementRef) -> Option<FormField> {
418 let mut field = FormField::new(FieldType::Textarea);
419
420 field.name = element.value().attr("name").map(|s| s.to_string());
421 field.id = element.value().attr("id").map(|s| s.to_string());
422 field.placeholder = element.value().attr("placeholder").map(|s| s.to_string());
423 field.value = Some(element.text().collect::<String>());
424 field.required = element.value().attr("required").is_some();
425 field.disabled = element.value().attr("disabled").is_some();
426 field.readonly = element.value().attr("readonly").is_some();
427 field.min_length = element.value().attr("minlength").and_then(|s| s.parse().ok());
428 field.max_length = element.value().attr("maxlength").and_then(|s| s.parse().ok());
429
430 Some(field)
431}
432
433fn associate_labels(form: &ElementRef, fields: &mut [FormField]) {
435 let label_sel = Selector::parse("label").unwrap();
436
437 for label in form.select(&label_sel) {
438 let label_text = label.text().collect::<String>().trim().to_string();
439
440 if let Some(for_id) = label.value().attr("for") {
442 for field in fields.iter_mut() {
443 if field.id.as_deref() == Some(for_id) {
444 field.label = Some(label_text.clone());
445 break;
446 }
447 }
448 }
449 }
450}
451
452fn detect_form_type(form: &Form) -> FormType {
454 let has_password = form.fields.iter().any(|f| f.field_type == FieldType::Password);
455 let has_email = form.fields.iter().any(|f| f.is_email());
456 let has_username = form.fields.iter().any(|f| f.is_username());
457 let has_search = form.fields.iter().any(|f| f.field_type == FieldType::Search);
458 let has_file = form.fields.iter().any(|f| f.field_type == FieldType::File);
459 let has_textarea = form.fields.iter().any(|f| f.field_type == FieldType::Textarea);
460 let password_count = form.fields.iter().filter(|f| f.field_type == FieldType::Password).count();
461
462 let action_lower = form.action.as_ref().map(|a| a.to_lowercase()).unwrap_or_default();
464 let name_lower = form.name.as_ref().map(|n| n.to_lowercase()).unwrap_or_default();
465 let id_lower = form.id.as_ref().map(|i| i.to_lowercase()).unwrap_or_default();
466
467 if has_search ||
469 action_lower.contains("search") ||
470 name_lower.contains("search") ||
471 id_lower.contains("search") {
472 return FormType::Search;
473 }
474
475 if has_password && password_count == 1 && (has_email || has_username) {
477 return FormType::Login;
478 }
479
480 if password_count >= 2 && has_email {
482 return FormType::Registration;
483 }
484
485 if has_password && !has_email && !has_username
487 && (action_lower.contains("reset") || action_lower.contains("password") ||
488 name_lower.contains("reset") || id_lower.contains("reset")) {
489 return FormType::PasswordReset;
490 }
491
492 if has_email && !has_password && form.fields.len() <= 3
494 && (action_lower.contains("subscribe") || action_lower.contains("newsletter") ||
495 name_lower.contains("subscribe") || name_lower.contains("newsletter")) {
496 return FormType::Newsletter;
497 }
498
499 if has_email && has_textarea && !has_password {
501 return FormType::Contact;
502 }
503
504 if has_textarea && !has_password &&
506 (action_lower.contains("comment") || name_lower.contains("comment") || id_lower.contains("comment")) {
507 return FormType::Comment;
508 }
509
510 if has_file {
512 return FormType::Upload;
513 }
514
515 if action_lower.contains("checkout") || action_lower.contains("payment") ||
517 action_lower.contains("order") || action_lower.contains("cart") {
518 return FormType::Checkout;
519 }
520
521 FormType::Unknown
522}
523
524fn detect_csrf_token(form: &Form) -> bool {
526 let csrf_patterns = [
527 "csrf", "token", "_token", "authenticity_token",
528 "xsrf", "__requestverificationtoken", "anti-forgery",
529 ];
530
531 form.fields.iter().any(|f| {
532 if f.field_type != FieldType::Hidden {
533 return false;
534 }
535
536 let name_lower = f.name.as_ref().map(|n| n.to_lowercase()).unwrap_or_default();
537 csrf_patterns.iter().any(|p| name_lower.contains(p))
538 })
539}
540
541fn detect_captcha(form: &ElementRef) -> bool {
543 let html = form.html().to_lowercase();
544
545 html.contains("recaptcha") ||
547 html.contains("hcaptcha") ||
548 html.contains("captcha") ||
549 html.contains("g-recaptcha") ||
550 html.contains("cf-turnstile") ||
551 html.contains("data-sitekey")
552}
553
554fn extract_submit_text(form: &ElementRef) -> Option<String> {
556 let submit_sel = Selector::parse("input[type='submit'], button[type='submit'], button:not([type])").unwrap();
558
559 if let Some(submit) = form.select(&submit_sel).next() {
560 if let Some(value) = submit.value().attr("value") {
562 return Some(value.to_string());
563 }
564 let text = submit.text().collect::<String>().trim().to_string();
566 if !text.is_empty() {
567 return Some(text);
568 }
569 }
570
571 None
572}
573
574pub fn get_login_forms(document: &Html) -> ParserResult<Vec<Form>> {
580 let forms = extract_forms(document)?;
581 Ok(forms.into_iter().filter(|f| f.is_login()).collect())
582}
583
584pub fn get_search_forms(document: &Html) -> ParserResult<Vec<Form>> {
586 let forms = extract_forms(document)?;
587 Ok(forms.into_iter().filter(|f| f.is_search()).collect())
588}
589
590pub fn get_contact_forms(document: &Html) -> ParserResult<Vec<Form>> {
592 let forms = extract_forms(document)?;
593 Ok(forms.into_iter()
594 .filter(|f| matches!(f.form_type, FormType::Contact))
595 .collect())
596}
597
598pub fn has_forms(document: &Html) -> bool {
600 let form_selector = Selector::parse("form").unwrap();
601 document.select(&form_selector).next().is_some()
602}
603
604pub fn has_login_form(document: &Html) -> bool {
606 get_login_forms(document).map(|f| !f.is_empty()).unwrap_or(false)
607}
608
609pub fn has_search_form(document: &Html) -> bool {
611 get_search_forms(document).map(|f| !f.is_empty()).unwrap_or(false)
612}
613
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `html` as a complete HTML document.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    /// Parses `html` and returns every form extracted from it.
    fn forms_in(html: &str) -> Vec<Form> {
        extract_forms(&parse_html(html)).unwrap()
    }

    #[test]
    fn test_extract_login_form() {
        let forms = forms_in(
            r#"
            <form action="/login" method="post">
                <input type="email" name="email" required>
                <input type="password" name="password" required>
                <input type="submit" value="Sign In">
            </form>
            "#,
        );

        assert_eq!(forms.len(), 1);

        let form = &forms[0];
        assert_eq!(form.form_type, FormType::Login);
        assert_eq!(form.method, FormMethod::Post);
        assert_eq!(form.submit_text, Some("Sign In".to_string()));
    }

    #[test]
    fn test_extract_search_form() {
        let forms = forms_in(
            r#"
            <form action="/search" method="get">
                <input type="search" name="q" placeholder="Search...">
                <button type="submit">Search</button>
            </form>
            "#,
        );

        assert_eq!(forms.len(), 1);

        let form = &forms[0];
        assert_eq!(form.form_type, FormType::Search);
        assert!(form.is_search());
    }

    #[test]
    fn test_extract_contact_form() {
        let forms = forms_in(
            r#"
            <form action="/contact" method="post">
                <input type="text" name="name" required>
                <input type="email" name="email" required>
                <textarea name="message" required></textarea>
                <button type="submit">Send</button>
            </form>
            "#,
        );

        assert_eq!(forms.len(), 1);
        assert_eq!(forms[0].form_type, FormType::Contact);
    }

    #[test]
    fn test_extract_registration_form() {
        let forms = forms_in(
            r#"
            <form action="/register" method="post">
                <input type="email" name="email" required>
                <input type="password" name="password" required>
                <input type="password" name="password_confirm" required>
                <button type="submit">Register</button>
            </form>
            "#,
        );

        assert_eq!(forms.len(), 1);
        assert_eq!(forms[0].form_type, FormType::Registration);
    }

    #[test]
    fn test_detect_csrf_token() {
        let forms = forms_in(
            r#"
            <form action="/login" method="post">
                <input type="hidden" name="csrf_token" value="abc123">
                <input type="email" name="email">
                <input type="password" name="password">
            </form>
            "#,
        );

        assert!(forms[0].has_csrf);
    }

    #[test]
    fn test_detect_captcha() {
        let forms = forms_in(
            r#"
            <form action="/login" method="post">
                <input type="email" name="email">
                <input type="password" name="password">
                <div class="g-recaptcha" data-sitekey="xxx"></div>
            </form>
            "#,
        );

        assert!(forms[0].has_captcha);
    }

    #[test]
    fn test_extract_select_field() {
        let forms = forms_in(
            r#"
            <form>
                <select name="country" required>
                    <option value="">Select...</option>
                    <option value="us" selected>United States</option>
                    <option value="ca">Canada</option>
                </select>
            </form>
            "#,
        );
        let field = forms[0].get_field("country").unwrap();

        assert_eq!(field.field_type, FieldType::Select);
        assert_eq!(field.options.len(), 3);
        assert!(field.options[1].selected);
    }

    #[test]
    fn test_form_with_labels() {
        let forms = forms_in(
            r#"
            <form>
                <label for="email">Email Address</label>
                <input type="email" id="email" name="email">
            </form>
            "#,
        );
        let field = forms[0].get_field("email").unwrap();

        assert_eq!(field.label, Some("Email Address".to_string()));
    }

    #[test]
    fn test_newsletter_form() {
        let forms = forms_in(
            r#"
            <form action="/subscribe" method="post" id="newsletter">
                <input type="email" name="email" placeholder="Enter your email">
                <button type="submit">Subscribe</button>
            </form>
            "#,
        );

        assert_eq!(forms[0].form_type, FormType::Newsletter);
    }

    #[test]
    fn test_upload_form() {
        let forms = forms_in(
            r#"
            <form action="/upload" method="post" enctype="multipart/form-data">
                <input type="file" name="document" accept=".pdf,.doc">
                <button type="submit">Upload</button>
            </form>
            "#,
        );

        let form = &forms[0];
        assert_eq!(form.form_type, FormType::Upload);
        assert!(form.has_file_upload());
    }

    #[test]
    fn test_has_forms() {
        let with_form = parse_html("<html><body><form></form></body></html>");
        assert!(has_forms(&with_form));

        let without_form = parse_html("<html><body><p>No forms here</p></body></html>");
        assert!(!has_forms(&without_form));
    }

    #[test]
    fn test_required_fields() {
        let forms = forms_in(
            r#"
            <form>
                <input type="email" name="email" required>
                <input type="text" name="name">
                <input type="password" name="password" required>
            </form>
            "#,
        );
        let required = forms[0].required_fields();

        assert_eq!(required.len(), 2);
    }

    #[test]
    fn test_form_method_parsing() {
        let cases = [
            ("POST", FormMethod::Post),
            ("get", FormMethod::Get),
            ("dialog", FormMethod::Dialog),
            ("unknown", FormMethod::Get),
        ];

        for (raw, expected) in cases {
            assert_eq!(FormMethod::from(raw), expected);
        }
    }

    #[test]
    fn test_field_type_parsing() {
        let cases = [
            ("password", FieldType::Password),
            ("EMAIL", FieldType::Email),
            ("tel", FieldType::Tel),
            ("unknown", FieldType::Text),
        ];

        for (raw, expected) in cases {
            assert_eq!(FieldType::from(raw), expected);
        }
    }
}