1use anyhow::{Context, Result};
2use quick_xml::events::Event;
3use quick_xml::reader::Reader;
4use quick_xml::writer::Writer;
5use rayon::prelude::*;
6use reqwest::blocking::Client;
7use scraper::{Html, Selector};
8use std::collections::HashSet;
9use std::fs;
10use std::io::Cursor;
11use std::path::Path;
12use url::Url;
13
/// A discovered syndication feed together with the page it was found on.
#[derive(Debug, Clone)]
pub struct RssFeed {
    /// Human-readable feed title (taken from the `<link>` `title` attribute
    /// when present, otherwise a fallback such as the host name).
    pub title: String,
    /// URL of the feed document itself (the XML).
    pub url: String,
    /// URL of the HTML page the feed was discovered from.
    pub html_url: String,
    /// Which syndication format the feed body parsed as.
    pub feed_type: FeedType,
}
21
/// Syndication format of a discovered feed.
///
/// `Eq` and `Hash` are derived alongside `PartialEq` so the type can be used
/// as a key in hash-based collections; both are structurally valid for this
/// field-less enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FeedType {
    /// RSS 2.x.
    Rss,
    /// Atom.
    Atom,
}
27
28pub fn read_urls_from_file(path: &Path) -> Result<Vec<String>> {
29 let content =
30 fs::read_to_string(path).context(format!("Failed to read file: {}", path.display()))?;
31
32 let mut urls = Vec::new();
33 let mut seen = HashSet::new();
34
35 for line in content.lines() {
36 let trimmed = line.trim();
37 if trimmed.is_empty() || trimmed.starts_with('#') {
38 continue;
39 }
40
41 let normalized = normalize_to_domain_url(trimmed);
42 if seen.insert(normalized.clone()) {
43 urls.push(normalized);
44 }
45 }
46
47 Ok(urls)
48}
49
50pub fn find_rss_feeds(url: &str, client: &Client) -> Result<Vec<RssFeed>> {
51 let response = client.get(url).send()?;
53 let html_content = response.text()?;
54 let document = if let Some(head_html) = extract_head_html(&html_content) {
55 Html::parse_fragment(head_html)
56 } else {
57 Html::parse_document(&html_content)
58 };
59
60 let mut feeds = Vec::new();
61
62 let link_selector =
64 Selector::parse("link[type='application/rss+xml'], link[type='application/atom+xml']")
65 .expect("Failed to parse CSS selector");
66
67 for element in document.select(&link_selector) {
68 if let Some(href) = element.value().attr("href") {
69 let feed_url = resolve_url(url, href)?;
70
71 if let Some(feed_type) = validate_rss_feed(&feed_url, client) {
73 let title = element
74 .value()
75 .attr("title")
76 .unwrap_or("Untitled Feed")
77 .to_string();
78
79 feeds.push(RssFeed {
80 title,
81 url: feed_url,
82 html_url: url.to_string(),
83 feed_type,
84 });
85 }
86 }
87 }
88
89 if feeds.is_empty() {
91 let common_paths = vec![
92 "/feed",
93 "/rss",
94 "/feed.xml",
95 "/rss.xml",
96 "/atom.xml",
97 "/index.xml",
98 ];
99
100 for path in common_paths {
101 if let Ok(feed_url) = resolve_url(url, path) {
102 if let Some(feed_type) = validate_rss_feed(&feed_url, client) {
103 feeds.push(RssFeed {
104 title: extract_title_from_url(url),
105 url: feed_url,
106 html_url: url.to_string(),
107 feed_type,
108 });
109 break; }
111 }
112 }
113 }
114
115 Ok(feeds)
116}
117
118pub fn find_rss_feeds_parallel(urls: &[String], client: &Client, verbose: bool) -> Vec<RssFeed> {
119 urls.par_iter()
120 .filter_map(|url| {
121 if verbose {
122 println!("Processing: {}", url);
123 }
124 match find_rss_feeds(url, client) {
125 Ok(feeds) => {
126 if !feeds.is_empty() {
127 if verbose {
128 println!(" Found {} feed(s) for {}", feeds.len(), url);
129 }
130 Some(feeds)
131 } else {
132 if verbose {
133 println!(" No feeds found for {}", url);
134 }
135 None
136 }
137 }
138 Err(e) => {
139 if verbose {
140 eprintln!(" Error processing {}: {}", url, e);
141 }
142 None
143 }
144 }
145 })
146 .flatten()
147 .collect()
148}
149
150fn resolve_url(base: &str, href: &str) -> Result<String> {
151 let base_url = Url::parse(base)?;
152 let resolved = base_url.join(href)?;
153 Ok(resolved.to_string())
154}
155
156fn validate_rss_feed(feed_url: &str, client: &Client) -> Option<FeedType> {
157 match client.get(feed_url).send() {
159 Ok(response) => {
160 if !response.status().is_success() {
161 return None;
162 }
163
164 match response.text() {
165 Ok(content) => {
166 if rss::Channel::read_from(content.as_bytes()).is_ok() {
168 return Some(FeedType::Rss);
169 }
170
171 if atom_syndication::Feed::read_from(content.as_bytes()).is_ok() {
173 return Some(FeedType::Atom);
174 }
175
176 None
177 }
178 Err(_) => None,
179 }
180 }
181 Err(_) => None,
182 }
183}
184
/// Extracts the inner HTML of the `<head>` element, if present.
///
/// Matching is ASCII-case-insensitive; the returned slice preserves the
/// original casing. Returns `None` when no `<head>` element is found.
///
/// Bug fix: the previous substring search matched `<head` inside `<header>`,
/// so pages using a `<header>` element but no `<head>` extracted the wrong
/// span. Tag matches now require a word boundary after the tag name.
fn extract_head_html(html: &str) -> Option<&str> {
    let lower = html.to_ascii_lowercase();
    // `to_ascii_lowercase` is length-preserving, so byte indices found in
    // `lower` are valid for slicing `html`.
    let head_start = find_tag(&lower, "<head", 0)?;
    let tag_end = lower[head_start..].find('>')? + head_start + 1;
    let head_end = find_tag(&lower, "</head", tag_end)?;
    Some(&html[tag_end..head_end])
}

/// Finds `needle` in `haystack` at or after `from`, requiring the match to be
/// a complete tag name: the next byte must be `>`, `/`, or ASCII whitespace.
/// This prevents `<head` from matching `<header>` (and `</head` from
/// matching `</header>`).
fn find_tag(haystack: &str, needle: &str, from: usize) -> Option<usize> {
    let mut search_from = from;
    loop {
        let idx = haystack[search_from..].find(needle)? + search_from;
        match haystack.as_bytes().get(idx + needle.len()) {
            Some(b'>') | Some(b'/') | Some(b' ') | Some(b'\t') | Some(b'\n') | Some(b'\r') => {
                return Some(idx);
            }
            // Longer tag name such as <header>; keep searching past it.
            Some(_) => search_from = idx + needle.len(),
            // Input ends mid-tag; no complete <head> exists.
            None => return None,
        }
    }
}
192
193fn extract_title_from_url(url: &str) -> String {
194 Url::parse(url)
195 .ok()
196 .and_then(|u| u.host_str().map(String::from))
197 .unwrap_or_else(|| "Unknown".to_string())
198}
199
200fn normalize_to_domain_url(input: &str) -> String {
201 if let Ok(url) = Url::parse(input) {
202 if let Some(host) = url.host_str() {
203 let mut base = format!("{}://{}", url.scheme(), host);
204 if let Some(port) = url.port() {
205 base.push(':');
206 base.push_str(&port.to_string());
207 }
208 return base;
209 }
210 }
211
212 input.to_string()
213}
214
/// Re-serializes `xml` with two-space indentation.
///
/// # Errors
/// Returns an error if the input cannot be parsed as XML, an event cannot be
/// written, or the output is not valid UTF-8.
fn pretty_print_xml(xml: &str) -> Result<String> {
    const INDENT_CHAR: u8 = b' ';
    const INDENT_SIZE: usize = 2;

    let mut reader = Reader::from_str(xml);
    // Drop insignificant whitespace so the writer fully controls indentation.
    reader.config_mut().trim_text(true);

    let mut writer = Writer::new_with_indent(Cursor::new(Vec::new()), INDENT_CHAR, INDENT_SIZE);

    // Stream events straight from reader to writer until end of input.
    loop {
        match reader.read_event() {
            Ok(Event::Eof) => break,
            Ok(event) => {
                writer.write_event(event).context("Failed to write XML event")?;
            }
            Err(e) => return Err(anyhow::anyhow!("Error parsing XML: {}", e)),
        }
    }

    // Unwrap the Cursor<Vec<u8>> back to the raw bytes.
    let result = writer.into_inner().into_inner();
    String::from_utf8(result).context("Failed to convert XML to UTF-8")
}
256
/// Writes all `feeds` to `output_path` as OPML, with no feed-type filtering.
///
/// # Errors
/// See [`create_opml_file_filtered`].
pub fn create_opml_file(feeds: &[RssFeed], output_path: &Path) -> Result<()> {
    create_opml_file_filtered(feeds, output_path, None)
}
260
261pub fn create_opml_file_filtered(
262 feeds: &[RssFeed],
263 output_path: &Path,
264 feed_type_filter: Option<FeedType>,
265) -> Result<()> {
266 let mut opml = opml::OPML::default();
267
268 let title = match feed_type_filter {
269 Some(FeedType::Rss) => "RSS Feeds",
270 Some(FeedType::Atom) => "Atom Feeds",
271 None => "RSS and Atom Feeds",
272 };
273
274 opml.head = Some(opml::Head {
275 title: Some(title.to_string()),
276 ..Default::default()
277 });
278
279 let mut outlines = Vec::new();
280 let mut seen_urls = HashSet::with_capacity(feeds.len());
281
282 for feed in feeds {
283 if let Some(filter_type) = feed_type_filter {
285 if filter_type != feed.feed_type {
286 continue;
287 }
288 }
289
290 if seen_urls.contains(&feed.url) {
292 continue;
293 }
294 seen_urls.insert(feed.url.clone());
295
296 let feed_type_str = match feed.feed_type {
297 FeedType::Rss => "rss",
298 FeedType::Atom => "atom",
299 };
300
301 let outline = opml::Outline {
302 text: feed.title.clone(),
303 r#type: Some(feed_type_str.to_string()),
304 xml_url: Some(feed.url.clone()),
305 html_url: Some(feed.html_url.clone()),
306 ..Default::default()
307 };
308 outlines.push(outline);
309 }
310
311 opml.body = opml::Body { outlines };
312
313 let opml_string = opml.to_string()?;
314 let pretty_xml = pretty_print_xml(&opml_string)?;
315 fs::write(output_path, pretty_xml).context(format!(
316 "Failed to write OPML file: {}",
317 output_path.display()
318 ))?;
319
320 Ok(())
321}
322
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    // URLs are normalized to scheme://host and de-duplicated in input order;
    // comments, blank lines, and surrounding whitespace are ignored.
    #[test]
    fn test_read_urls_from_file() {
        let mut temp_file = NamedTempFile::new().unwrap();
        writeln!(temp_file, "# Comment line").unwrap();
        writeln!(temp_file, "https://example.com/path?query=1").unwrap();
        writeln!(temp_file).unwrap();
        writeln!(temp_file, "https://test.com/another/path").unwrap();
        writeln!(temp_file, "https://example.com/dup/path").unwrap();
        writeln!(temp_file, "  https://trimmed.com/page  ").unwrap();

        let urls = read_urls_from_file(temp_file.path()).unwrap();
        assert_eq!(urls.len(), 3);
        assert_eq!(urls[0], "https://example.com");
        assert_eq!(urls[1], "https://test.com");
        assert_eq!(urls[2], "https://trimmed.com");
    }

    // An absolute href overrides the base entirely.
    #[test]
    fn test_resolve_url_absolute() {
        let result = resolve_url("https://example.com", "https://feed.example.com/rss").unwrap();
        assert_eq!(result, "https://feed.example.com/rss");
    }

    // A root-relative href is joined onto the base origin.
    #[test]
    fn test_resolve_url_relative() {
        let result = resolve_url("https://example.com", "/feed.xml").unwrap();
        assert_eq!(result, "https://example.com/feed.xml");
    }

    #[test]
    fn test_extract_title_from_url() {
        let title = extract_title_from_url("https://example.com/path");
        assert_eq!(title, "example.com");
    }

    // Unparseable input falls back to the "Unknown" sentinel title.
    #[test]
    fn test_extract_title_from_invalid_url() {
        let title = extract_title_from_url("not-a-url");
        assert_eq!(title, "Unknown");
    }

    #[test]
    fn test_extract_head_html() {
        let html = "<html><head><link rel=\"alternate\" type=\"application/rss+xml\" href=\"/rss.xml\"></head><body>Content</body></html>";
        let head = extract_head_html(html).unwrap();
        assert!(head.contains("application/rss+xml"));
        assert!(!head.contains("Content"));
    }

    #[test]
    fn test_extract_head_html_missing() {
        let html = "<html><body>No head</body></html>";
        assert!(extract_head_html(html).is_none());
    }

    #[test]
    fn test_create_opml_file() {
        let feeds = vec![
            RssFeed {
                title: "Test Feed 1".to_string(),
                url: "https://example.com/feed1.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Rss,
            },
            RssFeed {
                title: "Test Feed 2".to_string(),
                url: "https://example.com/feed2.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Atom,
            },
        ];

        let temp_file = NamedTempFile::new().unwrap();
        let output_path = temp_file.path();

        create_opml_file(&feeds, output_path).unwrap();

        let content = fs::read_to_string(output_path).unwrap();
        assert!(content.contains("Test Feed 1"));
        assert!(content.contains("Test Feed 2"));
        assert!(content.contains("https://example.com/feed1.xml"));
        assert!(content.contains("https://example.com/feed2.xml"));
        assert!(content.contains("<opml"));
        assert!(content.contains("RSS and Atom Feeds"));
    }

    // Duplicate feed URLs must be dropped, keeping the first occurrence.
    #[test]
    fn test_create_opml_file_with_duplicates() {
        let feeds = vec![
            RssFeed {
                title: "Test Feed 1".to_string(),
                url: "https://example.com/feed1.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Rss,
            },
            RssFeed {
                title: "Test Feed 2".to_string(),
                url: "https://example.com/feed2.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Atom,
            },
            RssFeed {
                title: "Test Feed 1 Duplicate".to_string(),
                url: "https://example.com/feed1.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Rss,
            },
            RssFeed {
                title: "Test Feed 3".to_string(),
                url: "https://example.com/feed3.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Rss,
            },
            RssFeed {
                title: "Test Feed 2 Duplicate".to_string(),
                url: "https://example.com/feed2.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Atom,
            },
        ];

        let temp_file = NamedTempFile::new().unwrap();
        let output_path = temp_file.path();

        create_opml_file(&feeds, output_path).unwrap();

        let content = fs::read_to_string(output_path).unwrap();

        assert!(content.contains("Test Feed 1"));
        assert!(content.contains("Test Feed 2"));
        assert!(content.contains("Test Feed 3"));

        assert!(!content.contains("Test Feed 1 Duplicate"));
        assert!(!content.contains("Test Feed 2 Duplicate"));

        assert_eq!(content.matches("https://example.com/feed1.xml").count(), 1);
        assert_eq!(content.matches("https://example.com/feed2.xml").count(), 1);
        assert_eq!(content.matches("https://example.com/feed3.xml").count(), 1);
    }

    // With an Rss filter, Atom feeds are excluded and the title reflects it.
    #[test]
    fn test_create_opml_file_rss_only() {
        let feeds = vec![
            RssFeed {
                title: "RSS Feed 1".to_string(),
                url: "https://example.com/rss1.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Rss,
            },
            RssFeed {
                title: "Atom Feed 1".to_string(),
                url: "https://example.com/atom1.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Atom,
            },
            RssFeed {
                title: "RSS Feed 2".to_string(),
                url: "https://example.com/rss2.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Rss,
            },
        ];

        let temp_file = NamedTempFile::new().unwrap();
        let output_path = temp_file.path();

        create_opml_file_filtered(&feeds, output_path, Some(FeedType::Rss)).unwrap();

        let content = fs::read_to_string(output_path).unwrap();

        assert!(content.contains("RSS Feed 1"));
        assert!(content.contains("RSS Feed 2"));
        assert!(content.contains("https://example.com/rss1.xml"));
        assert!(content.contains("https://example.com/rss2.xml"));

        assert!(!content.contains("Atom Feed 1"));
        assert!(!content.contains("https://example.com/atom1.xml"));

        assert!(content.contains("RSS Feeds"));
    }

    // With an Atom filter, RSS feeds are excluded and the title reflects it.
    #[test]
    fn test_create_opml_file_atom_only() {
        let feeds = vec![
            RssFeed {
                title: "RSS Feed 1".to_string(),
                url: "https://example.com/rss1.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Rss,
            },
            RssFeed {
                title: "Atom Feed 1".to_string(),
                url: "https://example.com/atom1.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Atom,
            },
            RssFeed {
                title: "Atom Feed 2".to_string(),
                url: "https://example.com/atom2.xml".to_string(),
                html_url: "https://example.com".to_string(),
                feed_type: FeedType::Atom,
            },
        ];

        let temp_file = NamedTempFile::new().unwrap();
        let output_path = temp_file.path();

        create_opml_file_filtered(&feeds, output_path, Some(FeedType::Atom)).unwrap();

        let content = fs::read_to_string(output_path).unwrap();

        assert!(content.contains("Atom Feed 1"));
        assert!(content.contains("Atom Feed 2"));
        assert!(content.contains("https://example.com/atom1.xml"));
        assert!(content.contains("https://example.com/atom2.xml"));

        assert!(!content.contains("RSS Feed 1"));
        assert!(!content.contains("https://example.com/rss1.xml"));

        assert!(content.contains("Atom Feeds"));
    }
}
560
#[cfg(feature = "python")]
pub mod python {
    //! PyO3 bindings exposing the feed-discovery and OPML functions to
    //! Python, compiled only when the `python` feature is enabled.
    use super::*;
    use pyo3::prelude::*;
    use std::collections::HashMap;
    use std::time::Duration;

    /// Python-visible mirror of [`RssFeed`]; `feed_type` is flattened to the
    /// string "rss" or "atom".
    #[pyclass]
    #[derive(Clone)]
    pub struct PyRssFeed {
        #[pyo3(get)]
        pub title: String,
        #[pyo3(get)]
        pub url: String,
        #[pyo3(get)]
        pub html_url: String,
        #[pyo3(get)]
        pub feed_type: String,
    }

    impl From<RssFeed> for PyRssFeed {
        fn from(feed: RssFeed) -> Self {
            PyRssFeed {
                title: feed.title,
                url: feed.url,
                html_url: feed.html_url,
                feed_type: match feed.feed_type {
                    FeedType::Rss => "rss".to_string(),
                    FeedType::Atom => "atom".to_string(),
                },
            }
        }
    }

    #[pymethods]
    impl PyRssFeed {
        /// Python `repr()` showing all four fields.
        fn __repr__(&self) -> String {
            format!(
                "RssFeed(title='{}', url='{}', html_url='{}', feed_type='{}')",
                self.title, self.url, self.html_url, self.feed_type
            )
        }

        /// Returns the feed as a plain `dict[str, str]` for Python callers.
        fn to_dict(&self) -> HashMap<String, String> {
            let mut map = HashMap::new();
            map.insert("title".to_string(), self.title.clone());
            map.insert("url".to_string(), self.url.clone());
            map.insert("html_url".to_string(), self.html_url.clone());
            map.insert("feed_type".to_string(), self.feed_type.clone());
            map
        }
    }

    /// Builds the blocking HTTP client used by the bindings (10 s timeout),
    /// converting builder errors into a Python RuntimeError.
    fn build_client() -> PyResult<Client> {
        Client::builder()
            .timeout(Duration::from_secs(10))
            .build()
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(format!("{}", e)))
    }

    /// Discovers feeds for a single URL. Raises RuntimeError on failure.
    #[pyfunction]
    fn find_feeds(url: String) -> PyResult<Vec<PyRssFeed>> {
        let client = build_client()?;
        let feeds = find_rss_feeds(&url, &client)
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(format!("{}", e)))?;
        Ok(feeds.into_iter().map(PyRssFeed::from).collect())
    }

    /// Discovers feeds for many URLs in parallel. Returns the flattened feed
    /// list plus a per-URL (url, "success"/"failed") status list; individual
    /// failures do not abort the batch.
    #[pyfunction]
    #[pyo3(signature = (urls, verbose=false))]
    fn find_feeds_parallel(
        urls: Vec<String>,
        verbose: bool,
    ) -> PyResult<(Vec<PyRssFeed>, Vec<(String, String)>)> {
        let client = build_client()?;
        // Per-URL tuples of (url, feeds found, whether the fetch succeeded).
        let results: Vec<(String, Vec<RssFeed>, bool)> = urls
            .par_iter()
            .map(|url| {
                if verbose {
                    println!("Processing: {}", url);
                }
                match find_rss_feeds(url, &client) {
                    Ok(feeds) => {
                        if verbose {
                            if !feeds.is_empty() {
                                println!(" Found {} feed(s) for {}", feeds.len(), url);
                            } else {
                                println!(" No feeds found for {}", url);
                            }
                        }
                        (url.clone(), feeds, true)
                    }
                    Err(e) => {
                        if verbose {
                            eprintln!(" Error processing {}: {}", url, e);
                        }
                        (url.clone(), Vec::new(), false)
                    }
                }
            })
            .collect();

        let statuses = results
            .iter()
            .map(|(url, _feeds, ok)| {
                (
                    url.clone(),
                    if *ok { "success" } else { "failed" }.to_string(),
                )
            })
            .collect();

        let feeds = results
            .into_iter()
            .flat_map(|(_url, feeds, _ok)| feeds)
            .map(PyRssFeed::from)
            .collect();

        Ok((feeds, statuses))
    }

    /// Reads URLs from a file (see [`read_urls_from_file`]); raises IOError
    /// on failure.
    #[pyfunction]
    fn read_urls(file_path: String) -> PyResult<Vec<String>> {
        let path = Path::new(&file_path);
        read_urls_from_file(path)
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))
    }

    /// Converts Python-side feeds back into Rust [`RssFeed`]s.
    /// NOTE(review): any `feed_type` string other than "rss" is mapped to
    /// Atom — invalid strings are not rejected; confirm this is intended.
    fn convert_py_feeds_to_rust(feeds: Vec<PyRssFeed>) -> Vec<RssFeed> {
        feeds
            .into_iter()
            .map(|py_feed| RssFeed {
                title: py_feed.title,
                url: py_feed.url,
                html_url: py_feed.html_url,
                feed_type: if py_feed.feed_type == "rss" {
                    FeedType::Rss
                } else {
                    FeedType::Atom
                },
            })
            .collect()
    }

    /// Writes all feeds to an OPML file; raises IOError on failure.
    #[pyfunction]
    fn create_opml(feeds: Vec<PyRssFeed>, output_path: String) -> PyResult<()> {
        let rust_feeds = convert_py_feeds_to_rust(feeds);
        let path = Path::new(&output_path);
        create_opml_file(&rust_feeds, path)
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))
    }

    /// Writes only RSS feeds to an OPML file; raises IOError on failure.
    #[pyfunction]
    fn create_opml_rss_only(feeds: Vec<PyRssFeed>, output_path: String) -> PyResult<()> {
        let rust_feeds = convert_py_feeds_to_rust(feeds);
        let path = Path::new(&output_path);
        create_opml_file_filtered(&rust_feeds, path, Some(FeedType::Rss))
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))
    }

    /// Writes only Atom feeds to an OPML file; raises IOError on failure.
    #[pyfunction]
    fn create_opml_atom_only(feeds: Vec<PyRssFeed>, output_path: String) -> PyResult<()> {
        let rust_feeds = convert_py_feeds_to_rust(feeds);
        let path = Path::new(&output_path);
        create_opml_file_filtered(&rust_feeds, path, Some(FeedType::Atom))
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))
    }

    /// Module initializer: registers the class and all functions under the
    /// `rss_miner` Python module name.
    #[pymodule]
    fn rss_miner(m: &Bound<'_, PyModule>) -> PyResult<()> {
        m.add_class::<PyRssFeed>()?;
        m.add_function(wrap_pyfunction!(find_feeds, m)?)?;
        m.add_function(wrap_pyfunction!(find_feeds_parallel, m)?)?;
        m.add_function(wrap_pyfunction!(read_urls, m)?)?;
        m.add_function(wrap_pyfunction!(create_opml, m)?)?;
        m.add_function(wrap_pyfunction!(create_opml_rss_only, m)?)?;
        m.add_function(wrap_pyfunction!(create_opml_atom_only, m)?)?;
        Ok(())
    }
}