nyaa_si/
extractor.rs

1use scraper::{ElementRef, Html, Selector};
2use std::sync::OnceLock;
3
4use crate::error::{Error, Result};
5use crate::model::{Size, Torrent};
6
// Cached CSS selectors, one per piece of data read from the results table.
// Each cell starts empty and is filled by `get_or_init` the first time the
// corresponding extractor runs, so every selector string is parsed at most
// once and then reused by later calls.
7static ITEM_SELECTOR: OnceLock<Selector> = OnceLock::new();
8static TITLE_SELECTOR: OnceLock<Selector> = OnceLock::new();
9static TORRENT_LINK_SELECTOR: OnceLock<Selector> = OnceLock::new();
10static MAGNET_SELECTOR: OnceLock<Selector> = OnceLock::new();
11static SEEDERS_SELECTOR: OnceLock<Selector> = OnceLock::new();
12static LEECHERS_SELECTOR: OnceLock<Selector> = OnceLock::new();
13static DATE_SELECTOR: OnceLock<Selector> = OnceLock::new();
14static DOWNLOADS_SELECTOR: OnceLock<Selector> = OnceLock::new();
15static SIZE_SELECTOR: OnceLock<Selector> = OnceLock::new();
16
17pub fn extract(html: &str, base_url: &str) -> Result<Vec<Torrent>> {
18    let document = Html::parse_document(html);
19    let selector = ITEM_SELECTOR
20        .get_or_init(|| Selector::parse("table>tbody>tr").unwrap());
21    let items = document.select(selector);
22    let mut res_vec: Vec<Torrent> = Vec::with_capacity(75);
23
24    for item in items {
25        let title = extract_title(item)?;
26        let torrent_link = extract_torrent_link(item, base_url)?;
27        let magnet = extract_magnet_url(item)?;
28        let seeders = extract_seeders(item)?;
29        let leechers = extract_leechers(item)?;
30        let downloads = extract_downloads(item)?;
31        let size = extract_size(item)?;
32        let date = extract_date(item)?;
33        let torrent = Torrent {
34            title,
35            link: torrent_link,
36            magnet_url: magnet,
37            date,
38            seeders,
39            leechers,
40            downloads,
41            size,
42        };
43        res_vec.push(torrent);
44    }
45    Ok(res_vec)
46}
47
48fn extract_title(item: ElementRef<'_>) -> Result<String> {
49    let selector = TITLE_SELECTOR.get_or_init(|| {
50        Selector::parse("td:nth-of-type(2)>a:last-child").unwrap()
51    });
52    let title = item
53        .select(selector)
54        .next()
55        .ok_or(Error::SelectorError("Title not found".into()))?;
56    Ok(title.text().collect())
57}
58
59fn extract_torrent_link(
60    item: ElementRef<'_>,
61    base_url: &str,
62) -> Result<String> {
63    let selector = TORRENT_LINK_SELECTOR.get_or_init(|| {
64        Selector::parse("td:nth-of-type(3)>a:first-child").unwrap()
65    });
66    let link = item
67        .select(selector)
68        .next()
69        .ok_or(Error::SelectorError("Link not found".into()))?;
70    link.value()
71        .attr("href")
72        .ok_or(Error::SelectorError("Link not found".into()))
73        .map(|s| format!("{}{}", base_url, s))
74}
75
76fn extract_magnet_url(item: ElementRef<'_>) -> Result<String> {
77    let selector = MAGNET_SELECTOR.get_or_init(|| {
78        Selector::parse("td:nth-of-type(3)>a:last-child").unwrap()
79    });
80    let link = item
81        .select(selector)
82        .next()
83        .ok_or(Error::SelectorError("magnet not found".into()))?;
84    link.value()
85        .attr("href")
86        .ok_or(Error::SelectorError("magnet not found".into()))
87        .map(|s| s.to_string())
88}
89
90fn extract_seeders(item: ElementRef<'_>) -> Result<u32> {
91    let selector = SEEDERS_SELECTOR
92        .get_or_init(|| Selector::parse("td:nth-of-type(6)").unwrap());
93    let seeders = item
94        .select(selector)
95        .next()
96        .ok_or(Error::SelectorError("Seeders not found".into()))?;
97    let seeders_str: String = seeders.text().collect();
98    seeders_str
99        .parse::<u32>()
100        .map_err(|_| Error::SelectorError("Seeders not found".into()))
101}
102
103fn extract_leechers(item: ElementRef<'_>) -> Result<u32> {
104    let selector = LEECHERS_SELECTOR
105        .get_or_init(|| Selector::parse("td:nth-of-type(7)").unwrap());
106    let leechers = item
107        .select(selector)
108        .next()
109        .ok_or(Error::SelectorError("Leechers not found".into()))?;
110    let leechers_str: String = leechers.text().collect();
111    leechers_str
112        .parse::<u32>()
113        .map_err(|_| Error::SelectorError("Leechers not found".into()))
114}
115
116fn extract_downloads(item: ElementRef<'_>) -> Result<u32> {
117    let selector = DOWNLOADS_SELECTOR
118        .get_or_init(|| Selector::parse("td:nth-of-type(8)").unwrap());
119    let downloads = item
120        .select(selector)
121        .next()
122        .ok_or(Error::SelectorError("Downloads not found".into()))?;
123    let downloads_str: String = downloads.text().collect();
124    downloads_str
125        .parse::<u32>()
126        .map_err(|_| Error::SelectorError("Downloads not found".into()))
127}
128
129fn extract_size(item: ElementRef<'_>) -> Result<Size> {
130    let selector = SIZE_SELECTOR
131        .get_or_init(|| Selector::parse("td:nth-of-type(4)").unwrap());
132    let size = item
133        .select(selector)
134        .next()
135        .ok_or(Error::SelectorError("Size not found".into()))?;
136    size.text().collect::<String>().parse()
137}
138
139fn extract_date(item: ElementRef<'_>) -> Result<chrono::DateTime<chrono::Utc>> {
140    let selector = DATE_SELECTOR
141        .get_or_init(|| Selector::parse("td:nth-of-type(5)").unwrap());
142    let date = item
143        .select(selector)
144        .next()
145        .ok_or(Error::SelectorError("Date not found".into()))?;
146    date.attr("data-timestamp")
147        .ok_or(Error::SelectorError("Date not found".into()))
148        .map(|s| {
149            let time_stamp = s.parse::<i64>().unwrap();
150            let ts_in_millis = time_stamp * 1000;
151            chrono::DateTime::from_timestamp_millis(ts_in_millis)
152                .unwrap_or_default()
153        })
154}