wiki_info/lib.rs

// Library functions are not used outside of tests
#![allow(unused)]

pub mod wiki_info;

// Tests
#[cfg(test)]
mod tests {
    use scraper::Html;

    use crate::wiki_info::{
        clean_document, cosine_sim, get_page_similarity, page_from_url,
        url_utils::{self, resolve_wiki_url},
    };

    use super::wiki_info::{get_most_similar_page, page_from_title, Page};

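    // Note: most of these tests fetch live pages from en.wikipedia.org, so they
    // require network access and may be sensitive to changes in article content.
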
    #[test]
    fn test_page_from_title() {
        let page = page_from_title("Paris").unwrap();
        assert_eq!(page.title, "Paris");
    }

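    // get_most_similar_page returns the index of the closest match within the
    // candidate slice; "France" should be the best match for the "Paris" page.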
    #[test]
    fn test_get_most_sim_pages() {
        let main: Page = page_from_title("Paris").unwrap();

        let pages_to_check: Vec<Page> = vec![
            "France",
            "European Union",
            "World War I",
            "Prime Minister of France",
        ]
        .iter()
        .map(|title| page_from_title(title).unwrap())
        .collect();

        let most_similar_page = get_most_similar_page(&main, &pages_to_check);

        assert_eq!(pages_to_check[most_similar_page].title, "France");
    }

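    // A document compared against itself should score ~1.0; the 0.98 threshold
    // leaves a little room for floating-point rounding.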
    #[test]
    fn test_same_documents_eq() {
        let page = page_from_title("Prime Minister of France").unwrap();
        let same_page = page_from_title("Prime Minister of France").unwrap();

        let sim_score = get_page_similarity(&page, &same_page);

        assert!(
            sim_score > 0.98,
            "Same page should be practically identical"
        );
    }

    #[test]
    fn test_url_resolve() {
        assert_eq!(
            resolve_wiki_url("Prime Minister of France").unwrap(),
            "https://en.wikipedia.org/wiki/Prime_Minister_of_France"
        );
    }

    #[test]
    fn test_doc_difference() {
        let page1 =
            page_from_url("https://en.wikipedia.org/wiki/Prime_Minister_of_France").unwrap();

        let page2 = page_from_url("https://en.wikipedia.org/wiki/The_Dark_Tower_(series)").unwrap();

        let sim = get_page_similarity(&page1, &page2);
        assert!(sim < 0.2, "Pages should differ significantly");
    }

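    // Follow the outgoing link at index 19 of the starting page and confirm
    // that the linked page can be fetched and has a title.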
    #[test]
    fn test_traversal() {
        let starting_url = "https://en.wikipedia.org/wiki/The_Dark_Tower_(series)";
        let starting_page = page_from_url(starting_url).unwrap();

        let link_num = 19;
        let target_link = &starting_page.links[link_num].outlink;

        let target_page = page_from_url(target_link).unwrap();

        println!("Title of link at index {}: {}", link_num, target_page.title);

        assert!(
            !target_page.title.is_empty(),
            "The target page should have a title."
        );
    }

    #[test]
    fn test_clean_doc() {
        let starting_url = "https://en.wikipedia.org/wiki/The_Dark_Tower_(series)";
        let starting_page = page_from_url(starting_url).unwrap();

        let cleaned = clean_document(&starting_page);

        println!("Content: {:?}", cleaned);

        let words: Vec<&str> = cleaned.content.split_whitespace().collect();

        assert!(
            !words.contains(&"the"),
            "Should not contain common stop word 'the'"
        );
        assert!(
            !words.contains(&"and"),
            "Should not contain common stop word 'and'"
        );
        assert!(
            !words.contains(&"a"),
            "Should not contain common stop word 'a'"
        );
    }

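    // Underscores in the requested title should resolve to the page whose
    // display title uses spaces, e.g. "Rust (programming language)".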
    #[test]
    fn test_page_from_title_spec_chars() {
        let title = "Rust_(programming_language)";
        let page = page_from_title(title).unwrap();
        assert_eq!(page.title, "Rust (programming language)");
        assert!(!page.content.is_empty());
        assert!(!page.links.is_empty());
    }

    #[test]
    fn test_page_from_url() {
        let url = "https://en.wikipedia.org/wiki/Rust_(programming_language)";
        let page = page_from_url(url).unwrap();
        assert_eq!(page.title, "Rust (programming language)");
        assert!(!page.content.is_empty());
        assert!(!page.links.is_empty());
    }

    #[test]
    fn test_clean_meta_content() {
        let raw_content = "   Some content \nwith \n\nlots of  whitespace. ";
        let cleaned = crate::wiki_info::clean_meta_content(raw_content);
        assert_eq!(cleaned, "Some content with lots of whitespace.");
    }

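    // process_content is exercised against a minimal HTML snippet that mimics
    // Wikipedia's `div.mw-content-container` / `main#content` layout; relative
    // `/wiki/...` hrefs are expected to be resolved to absolute URLs.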
    #[test]
    fn test_process_content() {
        let html = r#"
        <html>
            <body>
                <div class="mw-content-container">
                    <main id="content">
                        <p>This is a test paragraph with <a href="/wiki/Test_Link">a link</a>.</p>
                    </main>
                </div>
            </body>
        </html>
        "#;
        let document = Html::parse_document(html);
        let selector = scraper::Selector::parse("div.mw-content-container main#content").unwrap();
        let element = document.select(&selector).next().unwrap();

        let page = crate::wiki_info::process_content(element, "Test Page");
        assert_eq!(page.title, "Test Page");
        assert!(page.content.contains("This is a test paragraph"));
        assert_eq!(page.links.len(), 1);
        assert_eq!(page.links[0].title, "a link");
        assert_eq!(
            page.links[0].outlink,
            "https://en.wikipedia.org/wiki/Test_Link"
        );
    }

    #[test]
    fn test_clean_document() {
        let page = Page {
            title: "Test Page".to_string(),
            content: "The quick brown fox jumps over the lazy dog.".to_string(),
            links: vec![],
        };
        let cleaned = clean_document(&page);
        assert!(cleaned.content.contains("quick"));
        assert!(!cleaned.content.contains("the")); // Assuming "the" is a stop word
    }

    // #[test]
    // fn test_page_to_vec() {
    //     let page = Page {
    //         title: "Test Page".to_string(),
    //         content: "the quick brown fox jumps over the lazy dog".to_string(),
    //         links: vec![],
    //     };
    //     let vec = crate::wiki_info::page_to_vec(&page);
    //     assert_eq!(vec.len(), 6); // stopwords: the, over, the
    //     assert!(vec.iter().all(|&x| x > 0.0));
    // }

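    // Standard cosine similarity is dot(a, b) / (|a| * |b|); for the vectors
    // below, dot = 1.0 and both norms are sqrt(1.25), giving 1.0 / 1.25 = 0.8.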
    #[test]
    fn test_cosine_similarity() {
        let vec1 = vec![1.0, 0.5, 0.0];
        let vec2 = vec![0.5, 1.0, 0.0];
        let sim = cosine_sim(&vec1, &vec2);
        assert!(sim > 0.7);
    }

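    // The two sentences below share most of their vocabulary, so their
    // similarity score should land comfortably above 0.5.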
    #[test]
    fn test_get_page_similarity() {
        let page1 = Page {
            title: "Page 1".to_string(),
            content: "The quick brown fox jumps over the lazy dog.".to_string(),
            links: vec![],
        };
        let page2 = Page {
            title: "Page 2".to_string(),
            content: "The quick brown cat sleeps under the lazy dog.".to_string(),
            links: vec![],
        };
        let similarity = get_page_similarity(&page1, &page2);
        assert!(similarity > 0.5);
    }

    #[test]
    fn test_url_utils_title_from_url() {
        let url = "https://en.wikipedia.org/wiki/Rust_(programming_language)";
        let title = url_utils::title_from_url(url);
        assert_eq!(title, "Rust (programming language) ");
    }

    #[test]
    fn test_url_utils_resolve_wiki_url() {
        let title = "Rust_(programming_language)";
        let resolved_url = url_utils::resolve_wiki_url(title).unwrap();

        println!("RESOLVED URL: {:?}", resolved_url);

        assert!(resolved_url.contains("https://en.wikipedia.org/wiki/"));
    }

    #[test]
    fn test_invalid_url() {
        let invalid_url = "not-a-url";
        let result = url_utils::resolve_wiki_url(invalid_url);
        assert!(result.is_err(), "Expected error for invalid URL");
    }

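    // A /wiki/ URL for a page that does not exist should come back with a
    // non-200 status, which resolve_wiki_url is expected to surface as an error.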
    #[test]
    fn test_non_200_status() {
        let mock_url = "https://en.wikipedia.org/wiki/NonexistentPage404"; // hope they never add this page lol
        let result = url_utils::resolve_wiki_url(mock_url);
        assert!(result.is_err(), "Expected error for non-200 HTTP status");
        if let Err(e) = result {
            assert!(
                e.to_string().contains("URL returned status"),
                "Unexpected error message: {}",
                e
            );
        }
    }
}