In [145]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.utils import shuffle
from tqdm import tqdm
import pickle as pkl
import pandas as pd
import random
import sys
import os
from sklearn.metrics import f1_score

In [146]:
# Fix both global random number generators so that the random steps below
# (the train/test splits, which are called without an explicit random_state)
# are repeatable from a fresh kernel.
np.random.seed(0)
random.seed(0)

In [147]:
# Every tunable of the notebook lives in this one dictionary.
config = {
    # --- embedding computation ---
    'cleora_n_iter': 5,        # forwarded to the Cleora binary as `-n`
    'cleora_dim': 1024,        # forwarded to the Cleora binary as `--dimension`

    # --- dataset preparation ---
    'train_test_split': 0.2,   # used as `test_size` for both the edge and the node split

    # --- training classification ---
    # Embedding files to evaluate, one classifier is trained per file.
    'input_embeddings': [
        'output/emb__cluster_id__StarNode.out',
        'output/emb__CliqueNode__CliqueNode.out',
    ],
    'batch_size': 256,         # rows per partial_fit call
    'test_batch_size': 1000,   # rows per prediction call
    # Training runs for max(epochs) passes; metrics are printed after each listed epoch.
    'epochs': [20],
}

# Dataset preparation

1. Download the Facebook dataset from SNAP: https://snap.stanford.edu/data/facebook-large-page-page-network.html
2. Extract the dataset to ./facebook_large/
3. Compute Cleora embeddings as shown in "Cleora training" section in `example_link_prediction.ipynb`

In [148]:
# Load the page-page edge list; later cells read its `id_1` and `id_2` columns.
df_cleora = pd.read_csv("./facebook_large/musae_facebook_edges.csv")

In [149]:
# Preview the first few edges as a sanity check of the load above.
df_cleora.head()

Unnamed: 0,id_1,id_2
0,0,18427
1,1,21708
2,1,22208
3,1,22171
4,1,6829


In [150]:
# Hold out config['train_test_split'] of the edges; only the training edges are
# written to the Cleora input files. No random_state is passed, so the split
# depends on the global NumPy seed set at the top of the notebook.
# NOTE: the name `train_cleora` is rebound to a function by the later
# `def train_cleora(...)` cell, so the cell that exports the Cleora input files
# must run before that definition (re-run this cell first if it did not).
train_cleora, test_cleora = train_test_split(df_cleora, test_size=config['train_test_split'])

In [151]:
# Cleora input files written by the next cell (one per expansion scheme) ...
fb_cleora_input_clique_filename = "fb_cleora_input_clique.txt"
fb_cleora_input_star_filename = "fb_cleora_input_star.txt"
# ... and the directory handed to Cleora as its output location.
output_dir = 'output'

In [152]:
# Export the training edges in the two layouts consumed by Cleora:
#   * clique file: one line per source node, "<source> <neighbour> <neighbour> ..."
#   * star file:   tab-separated "<cluster id>\t<node>" pairs, where the cluster id is
#                  the running number of the source-node group and the listed nodes
#                  are the source node followed by each of its neighbours.
with open(fb_cleora_input_clique_filename, "w") as clique_out, \
        open(fb_cleora_input_star_filename, "w") as star_out:
    for cluster_id, (source, edges) in enumerate(train_cleora.groupby('id_1')):
        neighbours = [str(target) for target in edges['id_2'].tolist()]
        clique_out.write("{} {}\n".format(source, ' '.join(neighbours)))
        star_out.writelines(
            "{}\t{}\n".format(cluster_id, member) for member in [source, *neighbours]
        )

In [153]:
# Load the node table; later cells use its `id` and `page_type` columns.
df = pd.read_csv("facebook_large/musae_facebook_target.csv")

In [154]:
# Replace every distinct `page_type` value by an integer id, numbered in order of
# first appearance in the column.
# NOTE: the column is overwritten in place, so running this cell a second time
# re-encodes the already encoded ids; reload `df` before re-running.
classes = df['page_type'].unique()
class_ids = list(range(len(classes)))
class_dict = {label: class_id for class_id, label in enumerate(classes)}
df['page_type'] = [class_dict[label] for label in df['page_type']]

In [155]:
# Text files holding the "<node id> <class id>" pairs of each split; they are
# written by the following cells and read back by read_train_test().
train_filename = "fb_classification_train.txt"
test_filename = "fb_classification_test.txt"

In [156]:
# Split the labelled nodes, holding out config['train_test_split'] of them for testing.
# No random_state is passed, so the split depends on the global NumPy seed.
train, test = train_test_split(df, test_size=config['train_test_split'])

In [157]:
# Persist the training split as one "<node id> <class id>" line per node.
with open(train_filename, "w") as f_train:
    # Walk only the two columns that are written. DataFrame.iterrows() would build a
    # Series for every row, which is slow and does not preserve the columns' dtypes.
    for node_id, page_type in zip(train['id'], train['page_type']):
        f_train.write("{} {}\n".format(node_id, page_type))

In [158]:
# Persist the test split as one "<node id> <class id>" line per node.
with open(test_filename, "w") as f_test:
    # Walk only the two columns that are written. DataFrame.iterrows() would build a
    # Series for every row, which is slow and does not preserve the columns' dtypes.
    for node_id, page_type in zip(test['id'], test['page_type']):
        f_test.write("{} {}\n".format(node_id, page_type))

# Cleora training

Download an appropriate binary Cleora release from: https://github.com/Synerise/cleora/releases . 

A Linux GNU version is assumed in this example, but any other will do.

In [159]:
import subprocess


def columns2output_filename(output_dir, columns):
    """Build the path of the embedding file expected for a Cleora column spec.

    Each whitespace-separated column keeps only the part after its last ``::``
    modifier. A single column carrying the ``reflexive`` modifier yields
    ``emb__<name>__<name>.out``; any other spec yields the column names joined
    by ``__`` between ``emb__`` and ``.out``.

    Args:
        output_dir: directory the file name is joined onto.
        columns: the column specification string, e.g. ``"transient::cluster_id StarNode"``.

    Returns:
        ``output_dir`` joined with the derived file name.
    """
    specs = columns.split()

    if len(specs) == 1 and 'reflexive' in columns:
        # A reflexive column is paired with itself, so its name appears twice.
        column_name = columns.split('::')[-1]
        filename = f'emb__{column_name}__{column_name}.out'
    else:
        bare_names = [spec.split('::')[-1] for spec in specs]
        filename = 'emb__' + '__'.join(bare_names) + '.out'

    return os.path.join(output_dir, filename)


def train_cleora(dim, n_iter, columns, input_filename, output_dir,
                 binary='./cleora-v1.0.1-x86_64-unknown-linux-gnu'):
    """Run the Cleora executable and return the path of the expected embedding file.

    Args:
        dim: embedding dimension, passed as ``--dimension``.
        n_iter: number of iterations, passed as ``-n``.
        columns: column specification, passed as ``--columns``.
        input_filename: input file, passed as ``--input``.
        output_dir: output directory, passed as ``-o``.
        binary: path of the Cleora executable. Defaults to the Linux GNU release
            used in this example; pass the path of another release to run on a
            different platform.

    Returns:
        The file name derived from ``output_dir`` and ``columns`` by
        ``columns2output_filename``.

    Raises:
        subprocess.CalledProcessError: if the executable exits with a non-zero status.
    """
    command = [
        binary,
        '--columns', columns,
        '--dimension', str(dim),
        '-n', str(n_iter),
        '--input', input_filename,
        '-o', output_dir,
    ]
    # Arguments are passed as a list (no shell), and check=True turns a failed
    # run into an exception instead of silently returning a stale file name.
    subprocess.run(command, check=True)
    return columns2output_filename(output_dir, columns)

## Star expansion

In the `fb_cleora_input_star.txt` file the first column is a virtual node. The parameter `-c "transient::cluster_id StarNode"` means that embeddings will not be created for nodes from this column. This translates to the star expansion scheme.

In [160]:
%%time
# Star expansion: the first column (`cluster_id`) is marked transient, the second
# column (`StarNode`) holds the nodes. The returned value is the expected output path.
cleora_output_star_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], "transient::cluster_id StarNode", fb_cleora_input_star_filename, output_dir)

CPU times: user 1.37 ms, sys: 8.1 ms, total: 9.47 ms
Wall time: 8.59 s


## Clique expansion

The `fb_cleora_input_clique.txt` file has the structure of an adjacency list. The parameter `-c "complex::reflexive::CliqueNode"` means that edges will be created for all combinations of nodes from each line. This translates to the clique expansion scheme.

In [161]:
%%time
# Clique expansion: a single complex, reflexive column (`CliqueNode`) read from the
# adjacency-list file. The returned value is the expected output path.
cleora_output_clique_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], "complex::reflexive::CliqueNode", fb_cleora_input_clique_filename, output_dir)

CPU times: user 4.42 ms, sys: 8.34 ms, total: 12.8 ms
Wall time: 13.7 s


## No expansion

You can also compute Cleora without any expansion scheme by providing an input file in the edgelist format (single pair of nodes per line). Run with a simple parameter: `-c "node1 node2"`.

# Classification

We train a simple multiclass Logistic Regression classifier to predict the class of a node based on its embedding. We assess the quality of the classifier with two metrics: micro-F1 and macro-F1.

In [162]:
def read_embeddings(input_file):
    """Load a space-separated embedding file into a DataFrame indexed by its first field.

    The first line of the file is skipped, the first field of every remaining
    line becomes the index and the second field is discarded; the remaining
    fields are returned as the columns.

    Args:
        input_file: path of the embedding file.

    Returns:
        DataFrame whose index is the first field and whose columns are fields 2..end.
    """
    raw = pd.read_csv(
        input_file,
        sep=" ",
        header=None,
        skiprows=[0],
        index_col=0,
    )
    # Column 1 is not part of the vector, so it is removed before returning.
    return raw.drop(columns=[1])

In [163]:
def read_train_test(embeddings, train_path=None, test_path=None):
    """Load the train/test "<node id> <class id>" files, keeping only embedded nodes.

    Args:
        embeddings: DataFrame whose index holds the node ids that have an embedding.
        train_path: file with the training pairs; defaults to the notebook-level
            ``train_filename``.
        test_path: file with the test pairs; defaults to the notebook-level
            ``test_filename``.

    Returns:
        ``(train, test)``: two integer arrays of shape ``(n, 2)`` with the node id in
        column 0 and the class id in column 1. Rows whose node id is missing from
        ``embeddings.index`` are dropped.
    """
    if train_path is None:
        train_path = train_filename
    if test_path is None:
        test_path = test_filename

    valid_idx = embeddings.index.to_numpy()

    def _load_pairs(path):
        # The builtin `int` replaces the `np.int` alias, which NumPy 1.24 removed.
        # reshape keeps the result 2-D even for a single-line or empty file.
        pairs = np.loadtxt(path, delimiter=" ", dtype=int).reshape(-1, 2)
        # Only column 0 is a node id. Column 1 is the class label and must not be
        # matched against the node-id index.
        return pairs[np.isin(pairs[:, 0], valid_idx)]

    return _load_pairs(train_path), _load_pairs(test_path)

In [164]:
# Mini-batch sizes used by the training and evaluation loops below.
batch_size = config['batch_size']
test_batch_size = config['test_batch_size']

In [165]:
# Train and evaluate one classifier per embedding file.
class_labels = [0, 1, 2, 3]  # every label partial_fit may encounter

for algo in config['input_embeddings']:
    embeddings = read_embeddings(algo)
    # Distinct names keep the notebook-level `train` / `test` DataFrames intact, so the
    # dataset-preparation cells can still be re-run after this one.
    train_pairs, test_pairs = read_train_test(embeddings)

    y_train = train_pairs[:, 1]
    y_test = test_pairs[:, 1]

    # NOTE: scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3 removed the 'log'
    # alias; change the string below when running on such a release.
    clf = SGDClassifier(random_state=0, loss='log', alpha=0.0001)
    for e in tqdm(range(0, max(config['epochs']))):
        # One pass over the training pairs in mini-batches (slicing clamps at the end).
        for idx in range(0, train_pairs.shape[0], batch_size):
            ex = train_pairs[idx:idx + batch_size, :]
            ex_emb_in = embeddings.loc[ex[:, 0]].to_numpy()
            ex_y = y_train[idx:idx + batch_size]

            clf.partial_fit(ex_emb_in, ex_y, classes=class_labels)

        # Evaluate only after the epochs listed in the config.
        if e + 1 in config['epochs']:
            y_pred = []
            for idx in range(0, test_pairs.shape[0], test_batch_size):
                # The slice is bounded by the test set itself (slicing clamps at its end).
                ex = test_pairs[idx:idx + test_batch_size, :]
                ex_emb_in = embeddings.loc[ex[:, 0]].to_numpy()
                pred = clf.predict_proba(ex_emb_in)

                # Translate the most probable column into its class label.
                y_pred.extend(clf.classes_[np.argmax(pred, axis=1)])

            f1_micro = f1_score(y_test, y_pred, average='micro')
            f1_macro = f1_score(y_test, y_pred, average='macro')
            print('algo: {} epochs: {}, micro f1: {}, macro f1:{}'.format(algo, e+1, f1_micro, f1_macro))


100%|██████████| 20/20 [00:15<00:00, 1.29it/s]

algo: output/emb__cluster_id__StarNode.out epochs: 20, micro f1: 0.9093110871905274, macro f1:0.9094875754311472



100%|██████████| 20/20 [00:15<00:00, 1.33it/s]

algo: output/emb__CliqueNode__CliqueNode.out epochs: 20, micro f1: 0.9171151776103337, macro f1:0.9169262311726959



