1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
//! Load (a subset of) Reddit hyperlinks dataset into a graph.
//! The dataset is available at http://snap.stanford.edu/data/soc-redditHyperlinks-title.tsv
//! The hyperlink network represents the directed connections between two subreddits (a subreddit
//! is a community on Reddit). We also provide subreddit embeddings. The network is extracted
//! from publicly available Reddit data of 2.5 years from Jan 2014 to April 2017.
//! *NOTE: It may take a while to download the dataset
//!
//! ## Dataset statistics
//! * Number of nodes (subreddits) 35,776
//! * Number of edges (hyperlink between subreddits) 137,821
//! * Timespan Jan 2014 - April 2017
//!
//! ## Source
//! S. Kumar, W.L. Hamilton, J. Leskovec, D. Jurafsky. Community Interaction and Conflict
//! on the Web. World Wide Web Conference, 2018.
//!
//! ## Properties
//!
//!  * SOURCE_SUBREDDIT: the subreddit where the link originates
//!  * TARGET_SUBREDDIT: the subreddit where the link ends
//!  * POST_ID: the post in the source subreddit that starts the link
//!  * TIMESTAMP: the time of the post
//!  * POST_LABEL: label indicating if the source post is explicitly negative towards the target
//! post. The value is -1 if the source is negative towards the target, and 1 if it is neutral or
//! positive. The label is created using crowd-sourcing and training a text based classifier, and
//! is better than simple sentiment analysis of the posts. Please see the reference paper for details.
//!  * POST_PROPERTIES: a vector representing the text properties of the source post, listed as a
//! list of comma separated numbers. This can be found on the source website
//!
//! Example:
//! ```no_run
//! use raphtory_io::graph_loader::example::reddit_hyperlinks::reddit_graph;
//! use raphtory::db::graph::Graph;
//! use raphtory::db::view_api::*;
//!
//! let graph = reddit_graph(1, 120, false);
//!
//! println!("The graph has {:?} vertices", graph.num_vertices());
//! println!("The graph has {:?} edges", graph.num_edges());
//! ```

use crate::graph_loader::fetch_file;
use chrono::*;
use itertools::Itertools;
use raphtory::core::Prop;
use raphtory::db::graph::Graph;
use std::fs::File;
use std::io::{self, BufRead};
use std::path::Path;
use std::path::PathBuf;

/// Download the dataset and return the path to the file
/// # Arguments
/// * `timeout` - The timeout in seconds for downloading the dataset
/// # Returns
/// * `PathBuf` - The path to the file
pub fn reddit_file(
    timeout: u64,
    test_file: Option<bool>,
) -> Result<PathBuf, Box<dyn std::error::Error>> {
    match test_file {
        Some(true) => fetch_file(
            "reddit-title-test.tsv",
            true,
            "https://raw.githubusercontent.com/Raphtory/Data/main/reddit-title-test.tsv",
            timeout,
        ),
        _ => fetch_file(
            "reddit-title.tsv",
            true,
            "http://snap.stanford.edu/data/soc-redditHyperlinks-title.tsv",
            timeout,
        ),
    }
}

/// Read the file line by line
/// Open `filename` and return a buffered line iterator over its contents.
///
/// Each item yielded is an `io::Result<String>`; opening the file itself
/// may also fail, hence the outer `io::Result`.
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
where
    P: AsRef<Path>,
{
    // Map the opened handle straight into a buffered lines iterator.
    File::open(filename).map(|file| io::BufReader::new(file).lines())
}

/// Load the Reddit hyperlinks dataset into a graph and return it
///
/// # Arguments
///
/// * `shards` - The number of shards to use for the graph
/// * `timeout` - The timeout in seconds for downloading the dataset
///
/// # Returns
///
/// * `Graph` - The graph containing the Reddit hyperlinks dataset
/// Load the Reddit hyperlinks dataset into a graph and return it
///
/// Malformed rows (too few tab-separated fields, an unparseable timestamp,
/// label, or property vector, or a property vector shorter than the 21
/// entries indexed below) are reported on stdout and skipped instead of
/// panicking, so one bad line cannot abort the whole load.
///
/// # Arguments
///
/// * `shards` - The number of shards to use for the graph
/// * `timeout` - The timeout in seconds for downloading the dataset
/// * `test_file` - If true, load the small test fixture instead of the full dataset
///
/// # Returns
///
/// * `Graph` - The graph containing the Reddit hyperlinks dataset
pub fn reddit_graph(shards: usize, timeout: u64, test_file: bool) -> Graph {
    let g = Graph::new(shards);

    if let Ok(path) = reddit_file(timeout, Some(test_file)) {
        if let Ok(lines) = read_lines(path.as_path()) {
            // Drop the TSV header row, then process each data row.
            for reddit in lines.dropping(1).flatten() {
                let fields: Vec<&str> = reddit.split('\t').collect();
                // Expected columns: SOURCE_SUBREDDIT, TARGET_SUBREDDIT,
                // POST_ID, TIMESTAMP, POST_LABEL, POST_PROPERTIES.
                if fields.len() < 6 {
                    println!("Skipping malformed line: {:?}", reddit);
                    continue;
                }
                let src_id = fields[0];
                let dst_id = fields[1];
                let post_id = fields[2].to_string();

                // Graph timestamps are milliseconds since the epoch.
                let time = match NaiveDateTime::parse_from_str(fields[3], "%Y-%m-%d %H:%M:%S") {
                    Ok(t) => t.timestamp() * 1000,
                    Err(e) => {
                        println!("{}", e);
                        continue;
                    }
                };
                let post_label: i32 = match fields[4].parse() {
                    Ok(label) => label,
                    Err(e) => {
                        println!("Bad POST_LABEL {:?}: {}", fields[4], e);
                        continue;
                    }
                };
                // POST_PROPERTIES is a comma-separated float vector; the
                // highest index read below is 20, so require at least 21.
                let post_properties: Vec<f64> = match fields[5]
                    .split(',')
                    .map(|s| s.parse::<f64>())
                    .collect::<Result<Vec<f64>, _>>()
                {
                    Ok(props) => props,
                    Err(e) => {
                        println!("Bad POST_PROPERTIES: {}", e);
                        continue;
                    }
                };
                if post_properties.len() < 21 {
                    println!(
                        "Skipping line with only {} post properties",
                        post_properties.len()
                    );
                    continue;
                }

                let edge_properties = &vec![
                    ("post_label".to_string(), Prop::I32(post_label)),
                    ("post_id".to_string(), Prop::Str(post_id)),
                    ("word_count".to_string(), Prop::F64(post_properties[7])),
                    ("long_words".to_string(), Prop::F64(post_properties[9])),
                    ("sentences".to_string(), Prop::F64(post_properties[13])),
                    ("readability".to_string(), Prop::F64(post_properties[17])),
                    (
                        "positive_sentiment".to_string(),
                        Prop::F64(post_properties[18]),
                    ),
                    (
                        "negative_sentiment".to_string(),
                        Prop::F64(post_properties[19]),
                    ),
                    (
                        "compound_sentiment".to_string(),
                        Prop::F64(post_properties[20]),
                    ),
                ];
                // Log-and-continue on failure, consistently for vertices
                // and edges (the original panicked only on add_edge).
                g.add_vertex(time, src_id, &vec![])
                    .map_err(|err| println!("{:?}", err))
                    .ok();
                g.add_vertex(time, dst_id, &vec![])
                    .map_err(|err| println!("{:?}", err))
                    .ok();
                g.add_edge(time, src_id, dst_id, edge_properties, None)
                    .map_err(|err| println!("{:?}", err))
                    .ok();
            }
        }
    }

    g
}

#[cfg(test)]
mod reddit_test {
    use crate::graph_loader::example::reddit_hyperlinks::{reddit_file, reddit_graph};
    use raphtory::db::view_api::GraphViewOps;

    // NOTE(review): both tests download the small fixture over the network
    // (see reddit_file); they will fail without connectivity.

    /// The test fixture downloads successfully within a 100s timeout.
    #[test]
    fn check_data() {
        let file = reddit_file(100, Some(true));
        assert!(file.is_ok());
    }

    /// Loading the fixture yields the expected vertex and edge counts.
    #[test]
    fn check_graph() {
        let graph = reddit_graph(1, 100, true);
        assert_eq!(graph.num_vertices(), 16);
        assert_eq!(graph.num_edges(), 9);
    }
}