{
"cells": [
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.utils import shuffle\n",
"from tqdm import tqdm\n",
"import pickle as pkl\n",
"import pandas as pd\n",
"import random\n",
"import sys\n",
"import os\n",
"from sklearn.metrics import f1_score"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
"random.seed(0)\n",
"np.random.seed(0)"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
"config = {\n",
" #embedding computation\n",
" 'cleora_n_iter': 5,\n",
" 'cleora_dim': 1024,\n",
" \n",
" #dataset preparation\n",
" 'train_test_split': 0.2,\n",
" \n",
" #training classification\n",
" 'input_embeddings': [\n",
" 'output/emb__cluster_id__StarNode.out',\n",
" 'output/emb__CliqueNode__CliqueNode.out',\n",
" ],\n",
" 'batch_size': 256,\n",
" 'test_batch_size': 1000,\n",
" 'epochs': [20],\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset preparation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Download the Facebook dataset from SNAP: https://snap.stanford.edu/data/facebook-large-page-page-network.html\n",
"2. Extract the dataset to ./facebook_large/\n",
"3. Compute Cleora embeddings as shown in \"Cleora training\" section in `example_link_prediction.ipynb`"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [],
"source": [
"df_cleora = pd.read_csv(\"./facebook_large/musae_facebook_edges.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_1</th>\n",
" <th>id_2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>18427</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>21708</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>22208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>22171</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>6829</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id_1 id_2\n",
"0 0 18427\n",
"1 1 21708\n",
"2 1 22208\n",
"3 1 22171\n",
"4 1 6829"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_cleora.head()"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [],
"source": [
"train_cleora, test_cleora = train_test_split(df_cleora, test_size=config['train_test_split'])"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {},
"outputs": [],
"source": [
"fb_cleora_input_clique_filename = \"fb_cleora_input_clique.txt\"\n",
"fb_cleora_input_star_filename = \"fb_cleora_input_star.txt\"\n",
"output_dir = 'output'"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {},
"outputs": [],
"source": [
"with open(fb_cleora_input_clique_filename, \"w\") as f_cleora_clique, open(fb_cleora_input_star_filename, \"w\") as f_cleora_star:\n",
" grouped_train = train_cleora.groupby('id_1')\n",
" for n, (name, group) in enumerate(grouped_train):\n",
" group_list = group['id_2'].tolist()\n",
" group_elems = list(map(str, group_list))\n",
" f_cleora_clique.write(\"{} {}\\n\".format(name, ' '.join(group_elems)))\n",
" f_cleora_star.write(\"{}\\t{}\\n\".format(n, name))\n",
" for elem in group_elems:\n",
" f_cleora_star.write(\"{}\\t{}\\n\".format(n, elem))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"facebook_large/musae_facebook_target.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [],
"source": [
"classes = df['page_type'].unique()\n",
"class_ids = list(range(0, len(classes)))\n",
"class_dict = {k:v for k,v in zip(classes, class_ids)}\n",
"df['page_type'] = [class_dict[item] for item in df['page_type']] "
]
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [],
"source": [
"train_filename = \"fb_classification_train.txt\"\n",
"test_filename = \"fb_classification_test.txt\""
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {},
"outputs": [],
"source": [
"train, test = train_test_split(df, test_size=config['train_test_split'])"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [],
"source": [
"with open(train_filename, \"w\") as f_train:\n",
" for index, row in train.iterrows():\n",
" f_train.write(\"{} {}\\n\".format(row['id'], row['page_type']))"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"with open(test_filename, \"w\") as f_test:\n",
" for index, row in test.iterrows():\n",
" f_test.write(\"{} {}\\n\".format(row['id'], row['page_type']))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cleora training"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download an appropriate binary Cleora release from: https://github.com/Synerise/cleora/releases . \n",
"\n",
"A Linux GNU version is assumed in this example, but any other will do."
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [],
"source": [
"import subprocess\n",
"\n",
"\n",
"def columns2output_filename(output_dir, columns):\n",
" columns_split = columns.split()\n",
" if len(columns_split) == 1 and 'reflexive' in columns:\n",
" column_name = columns.split('::')[-1]\n",
" return os.path.join(output_dir, f'emb__{column_name}__{column_name}.out')\n",
"\n",
" column_names = [i.split('::')[-1] for i in columns_split]\n",
" return os.path.join(output_dir, 'emb__' + '__'.join(column_names) + '.out')\n",
"\n",
"\n",
"def train_cleora(dim, n_iter, columns, input_filename, output_dir):\n",
" command = ['./cleora-v1.0.1-x86_64-unknown-linux-gnu',\n",
" '--columns', columns,\n",
" '--dimension', str(dim), \n",
" '-n', str(n_iter), \n",
" '--input', input_filename, \n",
" '-o', output_dir]\n",
" subprocess.run(command, check=True)\n",
" return columns2output_filename(output_dir, columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Star expansion\n",
"\n",
"In the `fb_cleora_input_star.txt` file the first column is a virtual node. The parameter `-c \"transient::cluster_id node\"` means that embeddings will not be created for nodes from this column. This translates to star expansion scheme."
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.37 ms, sys: 8.1 ms, total: 9.47 ms\n",
"Wall time: 8.59 s\n"
]
}
],
"source": [
"%%time\n",
"cleora_output_star_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], \"transient::cluster_id StarNode\", fb_cleora_input_star_filename, output_dir)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clique expansion\n",
"\n",
"The `fb_cleora_input_clique.txt` file has the structure of adjacency list. The parameter `-c \"complex::reflexive::node\"` means that edges will be created for all cominations of nodes from each line. This translates to clique expansion scheme."
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.42 ms, sys: 8.34 ms, total: 12.8 ms\n",
"Wall time: 13.7 s\n"
]
}
],
"source": [
"%%time\n",
"cleora_output_clique_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], \"complex::reflexive::CliqueNode\", fb_cleora_input_clique_filename, output_dir)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## No expansion\n",
"\n",
"You can also compute Cleora without any expansion scheme by providing an input file in the edgelist format (single pair of nodes per line). Run with a simple parameter: `-c \"node1 node2\"`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Classification\n",
"\n",
"We train a simple multiclass Logistic Regression classifier to predict the class of node based on its embedding. We assess the quality of the classifier with of 2 metrics: micro-F1 and macro-F1."
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
"def read_embeddings(input_file):\n",
" df_full = pd.read_csv(input_file, delimiter = \" \", skiprows=[0], header=None, \n",
" index_col=0)\n",
" df_full = df_full.drop([1], axis=1)\n",
"\n",
" return df_full"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [],
"source": [
"def read_train_test(embeddings):\n",
" valid_idx = embeddings.index.to_numpy()\n",
" \n",
" train = np.loadtxt(train_filename, delimiter=\" \", dtype=np.int) \n",
" test = np.loadtxt(test_filename, delimiter=\" \", dtype=np.int)\n",
" \n",
" train = train[np.isin(train[:,0], valid_idx) & np.isin(train[:,1], valid_idx)]\n",
" test = [t for t in test if (t[0] in valid_idx) and (t[1] in valid_idx)] \n",
" \n",
" train = np.array(train)\n",
" test = np.array(test)\n",
" \n",
" return train,test"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [],
"source": [
"batch_size = config['batch_size']\n",
"test_batch_size = config['test_batch_size']"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [00:15<00:00, 1.29it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"algo: output/emb__cluster_id__StarNode.out epochs: 20, micro f1: 0.9093110871905274, macro f1:0.9094875754311472\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 20/20 [00:15<00:00, 1.33it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"algo: output/emb__CliqueNode__CliqueNode.out epochs: 20, micro f1: 0.9171151776103337, macro f1:0.9169262311726959\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"for algo in config['input_embeddings']:\n",
" embeddings = read_embeddings(algo)\n",
" train,test = read_train_test(embeddings)\n",
" \n",
" y_train = train[:, 1]\n",
" y_test = test[:, 1]\n",
"\n",
" clf = SGDClassifier(random_state=0, loss='log', alpha=0.0001)\n",
" for e in tqdm(range(0, max(config['epochs']))):\n",
" for idx in range(0,train.shape[0],batch_size):\n",
" ex=train[idx:min(idx+batch_size,train.shape[0]),:]\n",
"\n",
" ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()\n",
" ex_y = y_train[idx:min(idx+batch_size,train.shape[0])]\n",
" \n",
" clf.partial_fit(ex_emb_in, ex_y, classes=[0,1,2,3])\n",
" \n",
" if e+1 in config['epochs']:\n",
" acc = 0.0\n",
" y_pred = []\n",
" for n, idx in enumerate(range(0,test.shape[0],test_batch_size)):\n",
" ex=test[idx:min(idx+test_batch_size,train.shape[0]),:]\n",
" ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()\n",
" pred = clf.predict_proba(ex_emb_in)\n",
" \n",
" classes = np.argmax(pred, axis=1)\n",
" y_pred.extend(classes)\n",
"\n",
" f1_micro = f1_score(y_test, y_pred, average='micro')\n",
" f1_macro = f1_score(y_test, y_pred, average='macro')\n",
" print('algo: {} epochs: {}, micro f1: {}, macro f1:{}'.format(algo, e+1, f1_micro, f1_macro))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}