1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
use regex::Regex;
/// Holds an HTML document as text so it can be queried for tags,
/// attribute values and tag contents.
#[derive(Debug, Clone)]
pub struct Parser {
    /// The raw HTML text that every query searches.
    html: String,
}
impl Parser {
pub fn new(html: String) -> Self {
Parser { html }
}
/// Parses HTML content and extracts all tags of the specified type
///
/// # Arguments
///
/// * `tag` - The HTML tag name to search for (e.g., "a", "p", "div")
///
/// # Returns
///
/// A vector of strings containing all matching HTML tags
///
/// # Example
///
/// ```
/// # // This example is for internal documentation only and not run as a test
/// # // To use this in your code, you would need to import Parser from the parser module
/// # use tagparser::parser::Parser;
/// #
/// let html = "<a href='https://example.com'>Link</a><p>Paragraph</p>";
/// let mut parser = Parser::new(html.to_string());
///
/// // Get all links
/// let links = parser.parse_tags("a".to_string());
/// assert_eq!(links, vec!["<a href='https://example.com'>Link</a>"]);
///
/// // Get all paragraphs
/// let paragraphs = parser.parse_tags("p".to_string());
/// assert_eq!(paragraphs, vec!["<p>Paragraph</p>"]);
/// ```
pub fn parse_tags(&mut self, tag: String) -> Vec<String> {
// Create two patterns:
// 1. For regular tags with closing tag: <tag>...</tag>
// 2. For self-closing tags with closing slash: <tag/>
let pattern_regular = format!(r"<{}[^>]*>.*?</{}[^>]*>", tag, tag);
let pattern_self_closing = format!(r"<{}[^>]*/>", tag);
// Get all regular tags
let mut results = Regex::new(&pattern_regular)
.unwrap()
.find_iter(&self.html)
.map(|x| x.as_str().to_string())
.collect::<Vec<String>>();
// Add all self-closing tags
let self_closing_tags = Regex::new(&pattern_self_closing)
.unwrap()
.find_iter(&self.html)
.map(|x| x.as_str().to_string())
.collect::<Vec<String>>();
results.extend(self_closing_tags);
// For self-closing tags without closing slash (HTML5) use a different approach
// Find all opening tags that don't have a corresponding closing tag
let pattern_opening = format!(r"<{}[^>]*>", tag);
let opening_tags = Regex::new(&pattern_opening)
.unwrap()
.find_iter(&self.html)
.map(|x| x.as_str().to_string())
.collect::<Vec<String>>();
// Check each opening tag
for opening_tag in opening_tags {
// If the tag is already in the results (as part of a regular tag or self-closing tag), skip it
let is_part_of_existing_tag = results.iter().any(|existing_tag| existing_tag.contains(&opening_tag));
// For tests with malformed HTML, we should not add opening tags without closing tags
// Check if there is a closing tag for this opening tag
let closing_tag = format!("</{}", tag);
let has_closing_tag = self.html.contains(&closing_tag);
if !is_part_of_existing_tag && has_closing_tag {
// If the tag is not part of an existing tag and has a closing tag, add it as a self-closing tag
results.push(opening_tag);
}
}
results
}
/// Filters HTML tags by attribute name and optionally by attribute value
///
/// # Arguments
///
/// * `tag` - The HTML tag name to search for (e.g., "a", "div", "img")
/// * `attr_name` - The attribute name to filter by (e.g., "href", "class", "id")
/// * `attr_value` - Optional attribute value to filter by
/// - If `None`, returns all tags with the specified attribute regardless of value
/// - If `Some(value)`, returns only tags where the attribute exactly matches the value
///
/// # Returns
///
/// A vector of strings containing the matching HTML tags
///
/// # Examples
///
/// ```
/// # // This example is for internal documentation only and not run as a test
/// # // To use this in your code, you would need to import Parser from the parser module
/// # use tagparser::parser::Parser;
/// #
/// let html = r#"
/// <a href="https://github.com">GitHub</a>
/// <a href="https://rust-lang.org" class="official">Rust</a>
/// <a class="social" href="https://twitter.com">Twitter</a>
/// "#;
///
/// let mut parser = Parser::new(html.to_string());
///
/// // Example 1: Find all links with href attribute (any value)
/// let links_with_href = parser.parse_tags_with_attr("a".to_string(), "href", None);
/// // Returns all three links
///
/// // Example 2: Find links with class="social"
/// let social_links = parser.parse_tags_with_attr("a".to_string(), "class", Some("social"));
/// // Returns only: <a class="social" href="https://twitter.com">Twitter</a>
///
/// // Example 3: Find links to a specific URL
/// let github_links = parser.parse_tags_with_attr("a".to_string(), "href", Some("https://github.com"));
/// // Returns only: <a href="https://github.com">GitHub</a>
/// ```
///
/// # Command Line Usage
///
/// When using the CLI tool, you can filter by attributes like this:
///
/// ```bash
/// # Find all links with href attribute
/// tagparser "<html>...</html>" "a" "href"
///
/// # Find all links with href pointing to github.com
/// tagparser "<html>...</html>" "a" "href" "https://github.com"
/// ```
pub fn parse_tags_with_attr(&mut self, tag: String, attr_name: &str, attr_value: Option<&str>) -> Vec<String> {
let all_tags = self.parse_tags(tag);
all_tags.into_iter().filter(|tag_str| {
// Check if the tag contains the attribute
let attr_pattern = match attr_value {
Some(value) => format!(r#"{}=["']{}["']"#, attr_name, value),
None => format!(r#"{}=["'][^"']*["']"#, attr_name),
};
Regex::new(&attr_pattern).unwrap().is_match(tag_str)
}).collect()
}
/// Extracts the content (text) from inside HTML tags of the specified type
///
/// This method returns only the text content between the opening and closing tags,
/// without the tags themselves or any HTML attributes.
///
/// # Arguments
///
/// * `tag` - The HTML tag name to search for (e.g., "a", "p", "div")
///
/// # Returns
///
/// A vector of strings containing the text content of all matching tags
///
/// # Examples
///
/// ```
/// # // This example is for internal documentation only and not run as a test
/// # // To use this in your code, you would need to import Parser from the parser module
/// # use tagparser::parser::Parser;
/// #
/// let html = r#"
/// <a href="https://github.com">GitHub</a>
/// <p>This is a <strong>paragraph</strong> with some text.</p>
/// <div class="container">Some content</div>
/// "#;
///
/// let mut parser = Parser::new(html.to_string());
///
/// // Extract content from links
/// let link_texts = parser.extract_tag_content("a".to_string());
/// assert_eq!(link_texts, vec!["GitHub"]);
///
/// // Extract content from paragraphs (includes nested HTML)
/// let paragraph_texts = parser.extract_tag_content("p".to_string());
/// assert_eq!(paragraph_texts, vec!["This is a <strong>paragraph</strong> with some text."]);
///
/// // Extract content from divs
/// let div_texts = parser.extract_tag_content("div".to_string());
/// assert_eq!(div_texts, vec!["Some content"]);
/// ```
pub fn extract_tag_content(&mut self, tag: String) -> Vec<String> {
// Create a regex pattern that captures the content between tags
let pattern = format!(r"<{}.*?>(.*?)</{}.*?>", tag, tag);
// Find all matches and extract the captured group (content)
Regex::new(&pattern)
.unwrap()
.captures_iter(&self.html)
.map(|cap| cap[1].to_string())
.collect()
}
/// Extracts attribute values from HTML tags of the specified type
///
/// This method returns the values of the specified attribute from all matching tags.
///
/// # Arguments
///
/// * `tag` - The HTML tag name to search for (e.g., "a", "img", "div")
/// * `attr_name` - The attribute name to extract values from (e.g., "href", "src", "class")
///
/// # Returns
///
/// A vector of strings containing the attribute values from all matching tags.
/// Returns an empty vector if no matching tags or attributes are found.
///
/// # Examples
///
/// ```
/// # // This example is for internal documentation only and not run as a test
/// # // To use this in your code, you would need to import Parser from the parser module
/// # use tagparser::parser::Parser;
/// #
/// let html = r#"
/// <a href="https://github.com">GitHub</a>
/// <a href="https://rust-lang.org" class="official">Rust</a>
/// <a class="social" href="https://twitter.com">Twitter</a>
/// "#;
///
/// let mut parser = Parser::new(html.to_string());
///
/// // Extract all href values from links
/// let hrefs = parser.extract_attribute_values("a".to_string(), "href");
/// assert_eq!(
/// vec!["https://github.com", "https://rust-lang.org", "https://twitter.com"],
/// hrefs
/// );
///
/// // Extract all class values from links
/// let classes = parser.extract_attribute_values("a".to_string(), "class");
/// assert_eq!(
/// vec!["official", "social"],
/// classes
/// );
/// ```
pub fn extract_attribute_values(&mut self, tag: String, attr_name: &str) -> Vec<String> {
// First get all tags of the specified type
let all_tags = self.parse_tags(tag);
// Create a regex pattern to extract the attribute value
let attr_pattern = format!(r#"{}=["']([^"']*)["']"#, attr_name);
let re = Regex::new(&attr_pattern).unwrap();
// Extract attribute values from all matching tags
all_tags.iter()
.filter_map(|tag_str| {
re.captures(tag_str).map(|cap| cap[1].to_string())
})
.collect()
}
}