node_html_parser/css_select/
legacy.rs

1use std::collections::{HashSet};
2
3use crate::dom::element::HTMLElement;
4
5#[derive(Debug, Clone, Copy, PartialEq, Eq)]
6pub enum Combinator {
7	Descendant,
8	Child,
9	Adjacent,
10	Sibling,
11}
12
13#[derive(Debug, Clone, PartialEq, Eq)]
14pub enum AttrOp {
15	Exists,
16	Eq,
17	Prefix,
18	Suffix,
19	Substr,
20	Includes,
21	Dash,
22}
23
24#[derive(Debug, Clone, Copy, PartialEq, Eq)]
25pub enum CaseMode {
26	Sensitive,
27	Insensitive,
28}
29
30#[derive(Debug, Clone)]
31pub struct AttrMatcher {
32	pub name: String,
33	pub op: AttrOp,
34	pub value: String,
35	pub case: CaseMode,
36}
37
38#[derive(Debug, Clone)]
39pub enum NthExpr {
40	Number(i32),
41	Odd,
42	Even,
43	Pattern { a: i32, b: i32 },
44}
45
46#[derive(Debug, Clone)]
47pub enum Pseudo {
48	FirstChild,
49	LastChild,
50	OnlyChild,
51	FirstOfType,
52	LastOfType,
53	OnlyOfType,
54	NthChild(NthExpr),
55	NthLastChild(NthExpr),
56	NthOfType(NthExpr),
57	NthLastOfType(NthExpr),
58	Not(Vec<Selector>),
59	Scope,
60	Is(Vec<Selector>),
61	Where(Vec<Selector>),
62	Has(Vec<Selector>),
63	Empty,
64	Root,
65}
66
67#[derive(Debug, Clone, Default)]
68pub struct CompoundSelector {
69	pub tag: Option<String>,
70	pub id: Option<String>,
71	pub classes: Vec<String>,
72	pub attrs: Vec<AttrMatcher>,
73	pub pseudos: Vec<Pseudo>,
74}
75
76#[derive(Debug, Clone)]
77pub struct Selector(pub Vec<(Option<Combinator>, CompoundSelector)>);
78
79pub fn query_selector_all<'a>(root: &'a HTMLElement, selector_str: &str) -> Vec<&'a HTMLElement> {
80	let selectors = parse_selector_list(selector_str);
81	if selectors.is_empty() {
82		return vec![];
83	}
84	let mut ptr_set: HashSet<*const HTMLElement> = HashSet::new();
85	for sel in selectors {
86		for el in apply_selector(root, &sel) {
87			ptr_set.insert(el as *const HTMLElement);
88		}
89	}
90	let mut ordered = Vec::new();
91	collect_in_order(root, &ptr_set, &mut ordered);
92	ordered
93}
94
95fn collect_in_order<'a>(
96	el: &'a HTMLElement,
97	set: &HashSet<*const HTMLElement>,
98	out: &mut Vec<&'a HTMLElement>,
99) {
100	if !el.is_root() {
101		let p = el as *const HTMLElement;
102		if set.contains(&p) {
103			out.push(el);
104		}
105	}
106	for c in el.iter_elements() {
107		collect_in_order(c, set, out);
108	}
109}
110
111fn apply_selector<'a>(root: &'a HTMLElement, selector: &Selector) -> Vec<&'a HTMLElement> {
112	let mut current: Vec<&HTMLElement> = vec![root];
113	for (idx, (comb, comp)) in selector.0.iter().enumerate() {
114		let mut next_vec: Vec<&HTMLElement> = Vec::new();
115		for base in &current {
116			let effective = comb.unwrap_or(Combinator::Descendant);
117			match effective {
118				Combinator::Descendant => {
119					if idx == 0 {
120						if match_compound(root, base, comp) {
121							next_vec.push(base);
122						}
123					}
124					collect_descendants(root, base, comp, &mut next_vec);
125				}
126				Combinator::Child => {
127					for child in base.iter_elements() {
128						if match_compound(root, child, comp) {
129							next_vec.push(child);
130						}
131					}
132				}
133				Combinator::Adjacent => {
134					if let Some(parent) = find_parent(root, base) {
135						let sibs: Vec<&HTMLElement> = parent.iter_elements().collect();
136						if let Some(pos) = sibs.iter().position(|e| std::ptr::eq(*e, *base)) {
137							if pos + 1 < sibs.len() {
138								let n = sibs[pos + 1];
139								if match_compound(root, n, comp) {
140									next_vec.push(n);
141								}
142							}
143						}
144					}
145				}
146				Combinator::Sibling => {
147					if let Some(parent) = find_parent(root, base) {
148						let sibs: Vec<&HTMLElement> = parent.iter_elements().collect();
149						if let Some(pos) = sibs.iter().position(|e| std::ptr::eq(*e, *base)) {
150							for n in sibs.iter().skip(pos + 1) {
151								if match_compound(root, *n, comp) {
152									next_vec.push(*n);
153								}
154							}
155						}
156					}
157				}
158			}
159		}
160		let mut seen: HashSet<*const HTMLElement> = HashSet::new();
161		let mut dedup = Vec::new();
162		for e in next_vec {
163			let p = e as *const HTMLElement;
164			if seen.insert(p) {
165				dedup.push(e);
166			}
167		}
168		current = dedup;
169	}
170	current.into_iter().filter(|e| !e.is_root()).collect()
171}
172
173fn collect_descendants<'a>(
174	root: &'a HTMLElement,
175	el: &'a HTMLElement,
176	comp: &CompoundSelector,
177	out: &mut Vec<&'a HTMLElement>,
178) {
179	for child in el.iter_elements() {
180		if match_compound(root, child, comp) {
181			out.push(child);
182		}
183		collect_descendants(root, child, comp, out);
184	}
185}
186
187fn match_compound<'a>(root: &'a HTMLElement, el: &'a HTMLElement, comp: &CompoundSelector) -> bool {
188	if let Some(tag) = &comp.tag {
189		if !el.name().eq_ignore_ascii_case(tag) {
190			return false;
191		}
192	}
193	if let Some(id) = &comp.id {
194		match el.get_attr("id") {
195			Some(v) if v == id => {}
196			_ => return false,
197		}
198	}
199	for class in &comp.classes {
200		if !el.class_list_view().iter().any(|c| c == class) {
201			return false;
202		}
203	}
204	if !comp.attrs.is_empty() {
205		for matcher in &comp.attrs {
206			let raw_opt = el.get_attr(&matcher.name); // get_attr 已大小写无关 + 解码
207			match matcher.op {
208				AttrOp::Exists => {
209					if raw_opt.is_none() {
210						return false;
211					}
212				}
213				_ => {
214					if raw_opt.is_none() {
215						return false;
216					}
217					let val = raw_opt.unwrap();
218					let (left, right) = match matcher.case {
219						CaseMode::Insensitive => {
220							(val.to_ascii_lowercase(), matcher.value.to_ascii_lowercase())
221						}
222						CaseMode::Sensitive => (val.to_string(), matcher.value.clone()),
223					};
224					let ok = match matcher.op {
225						AttrOp::Exists => true,
226						AttrOp::Eq => left == right,
227						AttrOp::Prefix => left.starts_with(&right),
228						AttrOp::Suffix => left.ends_with(&right),
229						AttrOp::Substr => left.contains(&right),
230						AttrOp::Includes => left.split_whitespace().any(|t| t == right),
231						AttrOp::Dash => left == right || left.starts_with(&(right + "-")),
232					};
233					if !ok {
234						return false;
235					}
236				}
237			}
238		}
239	}
240	if !comp.pseudos.is_empty() {
241		for p in &comp.pseudos {
242			if !match_pseudo(root, el, p) {
243				return false;
244			}
245		}
246	}
247	true
248}
249
250fn match_pseudo<'a>(root: &'a HTMLElement, el: &'a HTMLElement, pseudo: &Pseudo) -> bool {
251	match pseudo {
252		Pseudo::FirstChild => position_in_parent(root, el)
253			.map(|(i, _, _, _)| i == 0)
254			.unwrap_or(false),
255		Pseudo::LastChild => position_in_parent(root, el)
256			.map(|(i, len, _, _)| i + 1 == len)
257			.unwrap_or(false),
258		Pseudo::OnlyChild => position_in_parent(root, el)
259			.map(|(_, len, _, _)| len == 1)
260			.unwrap_or(false),
261		Pseudo::FirstOfType => position_in_parent(root, el)
262			.map(|(_, _, ti, _)| ti == 0)
263			.unwrap_or(false),
264		Pseudo::LastOfType => position_in_parent(root, el)
265			.map(|(_, _, ti, tlen)| ti + 1 == tlen)
266			.unwrap_or(false),
267		Pseudo::OnlyOfType => position_in_parent(root, el)
268			.map(|(_, _, _, tlen)| tlen == 1)
269			.unwrap_or(false),
270		Pseudo::NthChild(expr) => position_in_parent(root, el)
271			.map(|(i, _, _, _)| match_nth(expr, i as i32 + 1))
272			.unwrap_or(false),
273		Pseudo::NthLastChild(expr) => position_in_parent(root, el)
274			.map(|(i, len, _, _)| {
275				let rev = (len - i - 1) as i32 + 1;
276				match_nth(expr, rev)
277			})
278			.unwrap_or(false),
279		Pseudo::NthOfType(expr) => position_in_parent(root, el)
280			.map(|(_, _, ti, _)| match_nth(expr, ti as i32 + 1))
281			.unwrap_or(false),
282		Pseudo::NthLastOfType(expr) => position_in_parent(root, el)
283			.map(|(_, _, ti, tlen)| {
284				let rev = (tlen - ti - 1) as i32 + 1;
285				match_nth(expr, rev)
286			})
287			.unwrap_or(false),
288		Pseudo::Not(list) => !list.iter().any(|sel| apply_selector_from_el(root, el, sel)),
289		Pseudo::Empty => {
290			el.iter_elements().next().is_none()
291				&& el
292					.children
293					.iter()
294					.all(|n| n.as_element().is_some() || n.text().trim().is_empty())
295		}
296		Pseudo::Root => find_parent(root, el).map(|p| p.is_root()).unwrap_or(false),
297		Pseudo::Is(list) => list.iter().any(|sel| apply_selector_from_el(root, el, sel)),
298		Pseudo::Where(list) => list.iter().any(|sel| apply_selector_from_el(root, el, sel)),
299		Pseudo::Has(list) => {
300			// :has(A) 若当前元素存在后代匹配 A;使用 DFS
301			fn any_desc<'a>(
302				cur: &'a HTMLElement,
303				root: &'a HTMLElement,
304				list: &Vec<Selector>,
305			) -> bool {
306				for child in cur.iter_elements() {
307					for sel in list {
308						if apply_selector_from_el(root, child, sel) {
309							return true;
310						}
311					}
312					if any_desc(child, root, list) {
313						return true;
314					}
315				}
316				false
317			}
318			any_desc(el, root, list)
319		}
320		Pseudo::Scope => true,
321	}
322}
323
324fn match_nth(expr: &NthExpr, index_one: i32) -> bool {
325	match expr {
326		NthExpr::Number(n) => index_one == *n,
327		NthExpr::Odd => (index_one % 2) == 1,
328		NthExpr::Even => (index_one % 2) == 0,
329		NthExpr::Pattern { a, b } => {
330			let a = *a;
331			let b = *b;
332			if a == 0 {
333				return index_one == b;
334			}
335			if a > 0 {
336				if index_one < b {
337					return false;
338				}
339				(index_one - b) % a == 0
340			} else {
341				let mut k = 0;
342				loop {
343					let val = a * k + b;
344					if val == index_one {
345						return true;
346					}
347					if val < 1 {
348						return false;
349					}
350					if val < index_one {
351						k += 1;
352						continue;
353					}
354					return false;
355				}
356			}
357		}
358	}
359}
360
361fn position_in_parent<'a>(
362	root: &'a HTMLElement,
363	el: &'a HTMLElement,
364) -> Option<(usize, usize, usize, usize)> {
365	let parent = find_parent(root, el)?;
366	let list: Vec<&HTMLElement> = parent.iter_elements().collect();
367	if list.is_empty() {
368		return None;
369	}
370	let same: Vec<&HTMLElement> = list
371		.iter()
372		.copied()
373		.filter(|c| c.name() == el.name())
374		.collect();
375	let idx = list.iter().position(|e| std::ptr::eq(*e, el))?;
376	let tidx = same.iter().position(|e| std::ptr::eq(*e, el))?;
377	Some((idx, list.len(), tidx, same.len()))
378}
379
380fn find_parent<'a>(root: &'a HTMLElement, target: &'a HTMLElement) -> Option<&'a HTMLElement> {
381	if std::ptr::eq(root, target) {
382		return None;
383	}
384	fn dfs<'a>(cur: &'a HTMLElement, target: &'a HTMLElement) -> Option<&'a HTMLElement> {
385		for child in cur.iter_elements() {
386			if std::ptr::eq(child, target) {
387				return Some(cur);
388			}
389			if let Some(p) = dfs(child, target) {
390				return Some(p);
391			}
392		}
393		None
394	}
395	dfs(root, target)
396}
397
398fn parse_selector_list(input: &str) -> Vec<Selector> {
399	let mut parts = Vec::new();
400	let mut buf = String::new();
401	let mut depth = 0; // parentheses depth
402	for c in input.chars() {
403		match c {
404			'(' => {
405				depth += 1;
406				buf.push(c);
407			}
408			')' => {
409				if depth > 0 {
410					depth -= 1;
411				}
412				buf.push(c);
413			}
414			',' if depth == 0 => {
415				let t = buf.trim();
416				if !t.is_empty() {
417					parts.push(t.to_string());
418				}
419				buf.clear();
420			}
421			_ => buf.push(c),
422		}
423	}
424	let t = buf.trim();
425	if !t.is_empty() {
426		parts.push(t.to_string());
427	}
428	parts
429		.into_iter()
430		.filter_map(|p| {
431			let tt = p.trim();
432			if tt.is_empty() {
433				None
434			} else {
435				Some(parse_single_selector(tt))
436			}
437		})
438		.collect()
439}
440
441fn parse_single_selector(input: &str) -> Selector {
442	let mut chars = input.chars().peekable();
443	let mut parts: Vec<(Option<Combinator>, CompoundSelector)> = Vec::new();
444	let mut current = CompoundSelector::default();
445	let mut pending: Option<Combinator> = None;
446	while let Some(&ch) = chars.peek() {
447		match ch {
448			' ' | '\t' | '\n' | '\r' => {
449				chars.next();
450				if !current_is_empty(&current) {
451					parts.push((pending.take(), current));
452					current = CompoundSelector::default();
453				}
454				pending.get_or_insert(Combinator::Descendant);
455				while let Some(&c2) = chars.peek() {
456					if c2.is_whitespace() {
457						chars.next();
458					} else {
459						break;
460					}
461				}
462			}
463			'>' => {
464				chars.next();
465				if !current_is_empty(&current) {
466					parts.push((pending.take(), current));
467					current = CompoundSelector::default();
468				}
469				pending = Some(Combinator::Child);
470				skip_ws(&mut chars);
471			}
472			'+' => {
473				chars.next();
474				if !current_is_empty(&current) {
475					parts.push((pending.take(), current));
476					current = CompoundSelector::default();
477				}
478				pending = Some(Combinator::Adjacent);
479				skip_ws(&mut chars);
480			}
481			'~' => {
482				chars.next();
483				if !current_is_empty(&current) {
484					parts.push((pending.take(), current));
485					current = CompoundSelector::default();
486				}
487				pending = Some(Combinator::Sibling);
488				skip_ws(&mut chars);
489			}
490			'#' => {
491				chars.next();
492				let ident = read_ident(&mut chars);
493				current.id = Some(ident);
494			}
495			'.' => {
496				chars.next();
497				let ident = read_ident(&mut chars);
498				current.classes.push(ident);
499			}
500			'[' => {
501				chars.next();
502				let name = read_ident(&mut chars);
503				skip_ws(&mut chars);
504				let mut op = AttrOp::Exists;
505				let mut value = String::new();
506				let mut case = CaseMode::Sensitive;
507				if let Some(&c2) = chars.peek() {
508					match c2 {
509						'=' => {
510							op = AttrOp::Eq;
511							chars.next();
512							skip_ws(&mut chars);
513							value = read_attr_value(&mut chars);
514						}
515						'^' | '$' | '*' | '~' | '|' => {
516							let first = c2;
517							chars.next();
518							if matches!(chars.peek(), Some('=')) {
519								chars.next();
520								op = match first {
521									'^' => AttrOp::Prefix,
522									'$' => AttrOp::Suffix,
523									'*' => AttrOp::Substr,
524									'~' => AttrOp::Includes,
525									'|' => AttrOp::Dash,
526									_ => AttrOp::Eq,
527								};
528								skip_ws(&mut chars);
529								value = read_attr_value(&mut chars);
530							}
531						}
532						_ => {}
533					}
534				}
535				skip_ws(&mut chars);
536				if let Some(&flag) = chars.peek() {
537					if flag == 'i' {
538						case = CaseMode::Insensitive;
539						chars.next();
540					} else if flag == 's' {
541						case = CaseMode::Sensitive;
542						chars.next();
543					}
544				}
545				while let Some(c) = chars.next() {
546					if c == ']' {
547						break;
548					}
549				}
550				current.attrs.push(AttrMatcher {
551					name: name.to_lowercase(),
552					op,
553					value,
554					case,
555				});
556			}
557			'*' => {
558				chars.next();
559			}
560			':' => {
561				chars.next();
562				let pseudo_name = read_ident(&mut chars).to_ascii_lowercase();
563				let pseudo = if matches!(chars.peek(), Some('(')) {
564					chars.next();
565					let arg = read_until_paren(&mut chars);
566					parse_pseudo_with_arg(&pseudo_name, &arg)
567				} else {
568					parse_pseudo_no_arg(&pseudo_name)
569				};
570				if let Some(p) = pseudo {
571					current.pseudos.push(p);
572				}
573			}
574			_ => {
575				if current.tag.is_none() {
576					let ident = read_ident_starting(ch, &mut chars);
577					current.tag = Some(ident.to_lowercase());
578				} else {
579					chars.next();
580				}
581			}
582		}
583	}
584	if !current_is_empty(&current) || parts.is_empty() {
585		parts.push((pending.take(), current));
586	}
587	Selector(parts)
588}
589
590fn current_is_empty(c: &CompoundSelector) -> bool {
591	c.tag.is_none()
592		&& c.id.is_none()
593		&& c.classes.is_empty()
594		&& c.attrs.is_empty()
595		&& c.pseudos.is_empty()
596}
597fn skip_ws<I: Iterator<Item = char>>(it: &mut std::iter::Peekable<I>) {
598	while matches!(it.peek(), Some(c) if c.is_whitespace()) {
599		it.next();
600	}
601}
602fn read_ident<I: Iterator<Item = char>>(it: &mut std::iter::Peekable<I>) -> String {
603	let mut s = String::new();
604	while let Some(&c) = it.peek() {
605		if c.is_alphanumeric() || c == '-' || c == '_' {
606			s.push(c);
607			it.next();
608		} else {
609			break;
610		}
611	}
612	s
613}
614fn read_ident_starting(first: char, it: &mut std::iter::Peekable<std::str::Chars<'_>>) -> String {
615	let mut s = String::new();
616	s.push(first);
617	it.next();
618	while let Some(&c) = it.peek() {
619		if c.is_alphanumeric() || c == '-' || c == '_' {
620			s.push(c);
621			it.next();
622		} else {
623			break;
624		}
625	}
626	s
627}
628fn read_attr_value<I: Iterator<Item = char>>(it: &mut std::iter::Peekable<I>) -> String {
629	if let Some(&q) = it.peek() {
630		if q == '"' || q == '\'' {
631			it.next();
632			let mut val = String::new();
633			while let Some(&c) = it.peek() {
634				it.next();
635				if c == q {
636					break;
637				}
638				val.push(c);
639			}
640			return val;
641		}
642	}
643	read_ident(it)
644}
645fn read_until_paren<I: Iterator<Item = char>>(it: &mut std::iter::Peekable<I>) -> String {
646	let mut depth = 1;
647	let mut s = String::new();
648	while let Some(&c) = it.peek() {
649		it.next();
650		if c == '(' {
651			depth += 1;
652		} else if c == ')' {
653			depth -= 1;
654			if depth == 0 {
655				break;
656			}
657		}
658		s.push(c);
659	}
660	s.trim().to_string()
661}
662fn parse_pseudo_no_arg(name: &str) -> Option<Pseudo> {
663	Some(match name {
664		"first-child" => Pseudo::FirstChild,
665		"last-child" => Pseudo::LastChild,
666		"only-child" => Pseudo::OnlyChild,
667		"first-of-type" => Pseudo::FirstOfType,
668		"last-of-type" => Pseudo::LastOfType,
669		"only-of-type" => Pseudo::OnlyOfType,
670		"empty" => Pseudo::Empty,
671		"root" => Pseudo::Root,
672		"scope" => Pseudo::Scope,
673		_ => return None,
674	})
675}
676fn parse_pseudo_with_arg(name: &str, arg: &str) -> Option<Pseudo> {
677	match name {
678		"nth-child" => Some(Pseudo::NthChild(parse_nth_expr(arg)?)),
679		"nth-last-child" => Some(Pseudo::NthLastChild(parse_nth_expr(arg)?)),
680		"nth-of-type" => Some(Pseudo::NthOfType(parse_nth_expr(arg)?)),
681		"nth-last-of-type" => Some(Pseudo::NthLastOfType(parse_nth_expr(arg)?)),
682		"not" => {
683			let sels = parse_selector_list(arg);
684			if sels.is_empty() {
685				None
686			} else {
687				Some(Pseudo::Not(sels))
688			}
689		}
690		"is" => {
691			let sels = parse_selector_list(arg);
692			if sels.is_empty() {
693				None
694			} else {
695				Some(Pseudo::Is(sels))
696			}
697		}
698		"where" => {
699			let sels = parse_selector_list(arg);
700			if sels.is_empty() {
701				None
702			} else {
703				Some(Pseudo::Where(sels))
704			}
705		}
706		"has" => {
707			let sels = parse_selector_list(arg);
708			if sels.is_empty() {
709				None
710			} else {
711				Some(Pseudo::Has(sels))
712			}
713		}
714		_ => None,
715	}
716}
717fn parse_nth_expr(s: &str) -> Option<NthExpr> {
718	let t = s.trim().to_ascii_lowercase();
719	if t == "odd" {
720		return Some(NthExpr::Odd);
721	}
722	if t == "even" {
723		return Some(NthExpr::Even);
724	}
725	if let Ok(n) = t.parse::<i32>() {
726		if n > 0 {
727			return Some(NthExpr::Number(n));
728		}
729	}
730	if let Some(pos) = t.find('n') {
731		let (a_part, rest) = t.split_at(pos);
732		let rest = &rest[1..];
733		let a = if a_part.is_empty() || a_part == "+" {
734			1
735		} else if a_part == "-" {
736			-1
737		} else {
738			a_part.parse().ok()?
739		};
740		let b = if rest.is_empty() {
741			0
742		} else {
743			let r = rest.trim();
744			if r.starts_with('+') {
745				r[1..].parse().ok()?
746			} else {
747				r.parse().ok()?
748			}
749		};
750		return Some(NthExpr::Pattern { a, b });
751	}
752	None
753}
754
755pub fn apply_selector_from_el<'a>(
756	root: &'a HTMLElement,
757	el: &'a HTMLElement,
758	selector: &Selector,
759) -> bool {
760	let matches = apply_selector(root, selector);
761	matches.into_iter().any(|e| std::ptr::eq(e, el))
762}
763pub fn parse_selector_list_public(input: &str) -> Vec<Selector> {
764	parse_selector_list(input)
765}
766
767// Expose helper for conversion: return reference parts
768pub fn selector_parts(sel: &Selector) -> &Vec<(Option<Combinator>, CompoundSelector)> {
769	&sel.0
770}