cleora 1.2.3 - Docs.rs

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.utils import shuffle\n",
    "from tqdm import tqdm\n",
    "import pickle as pkl\n",
    "import pandas as pd\n",
    "import random\n",
    "import sys\n",
    "import os\n",
    "from sklearn.metrics import f1_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [],
   "source": [
    "random.seed(0)\n",
    "np.random.seed(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = {\n",
    "    #embedding computation\n",
    "    'cleora_n_iter': 5,\n",
    "    'cleora_dim': 1024,\n",
    "    \n",
    "    #dataset preparation\n",
    "    'train_test_split': 0.2,\n",
    "    \n",
    "    #training classification\n",
    "    'input_embeddings': [\n",
    "                    'output/emb__cluster_id__StarNode.out',\n",
    "                    'output/emb__CliqueNode__CliqueNode.out',\n",
    "                   ],\n",
    "    'batch_size': 256,\n",
    "    'test_batch_size': 1000,\n",
    "    'epochs': [20],\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Download the Facebook dataset from SNAP: https://snap.stanford.edu/data/facebook-large-page-page-network.html\n",
    "2. Extract the dataset to ./facebook_large/\n",
    "3. Compute Cleora embeddings as shown in \"Cleora training\" section in `example_link_prediction.ipynb`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cleora = pd.read_csv(\"./facebook_large/musae_facebook_edges.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id_1</th>\n",
       "      <th>id_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>18427</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>21708</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>22208</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>22171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>6829</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id_1   id_2\n",
       "0     0  18427\n",
       "1     1  21708\n",
       "2     1  22208\n",
       "3     1  22171\n",
       "4     1   6829"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_cleora.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_cleora, test_cleora = train_test_split(df_cleora, test_size=config['train_test_split'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [],
   "source": [
    "fb_cleora_input_clique_filename = \"fb_cleora_input_clique.txt\"\n",
    "fb_cleora_input_star_filename = \"fb_cleora_input_star.txt\"\n",
    "output_dir = 'output'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(fb_cleora_input_clique_filename, \"w\") as f_cleora_clique, open(fb_cleora_input_star_filename, \"w\") as f_cleora_star:\n",
    "    grouped_train = train_cleora.groupby('id_1')\n",
    "    for n, (name, group) in enumerate(grouped_train):\n",
    "        group_list = group['id_2'].tolist()\n",
    "        group_elems = list(map(str, group_list))\n",
    "        f_cleora_clique.write(\"{} {}\\n\".format(name, ' '.join(group_elems)))\n",
    "        f_cleora_star.write(\"{}\\t{}\\n\".format(n, name))\n",
    "        for elem in group_elems:\n",
    "            f_cleora_star.write(\"{}\\t{}\\n\".format(n, elem))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"facebook_large/musae_facebook_target.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [],
   "source": [
    "classes = df['page_type'].unique()\n",
    "class_ids = list(range(0, len(classes)))\n",
    "class_dict = {k:v for k,v in zip(classes, class_ids)}\n",
    "df['page_type'] = [class_dict[item] for item in df['page_type']] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_filename = \"fb_classification_train.txt\"\n",
    "test_filename = \"fb_classification_test.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [],
   "source": [
    "train, test = train_test_split(df, test_size=config['train_test_split'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(train_filename, \"w\") as f_train:\n",
    "    for index, row in train.iterrows():\n",
    "        f_train.write(\"{} {}\\n\".format(row['id'], row['page_type']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(test_filename, \"w\") as f_test:\n",
    "    for index, row in test.iterrows():\n",
    "        f_test.write(\"{} {}\\n\".format(row['id'], row['page_type']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Cleora training"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download an appropriate binary Cleora release from: https://github.com/Synerise/cleora/releases . \n",
    "\n",
    "A Linux GNU version is assumed in this example, but any other will do."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "import subprocess\n",
    "\n",
    "\n",
    "def columns2output_filename(output_dir, columns):\n",
    "    columns_split = columns.split()\n",
    "    if len(columns_split) == 1 and 'reflexive' in columns:\n",
    "        column_name = columns.split('::')[-1]\n",
    "        return os.path.join(output_dir, f'emb__{column_name}__{column_name}.out')\n",
    "\n",
    "    column_names = [i.split('::')[-1] for i in columns_split]\n",
    "    return os.path.join(output_dir, 'emb__' + '__'.join(column_names) + '.out')\n",
    "\n",
    "\n",
    "def train_cleora(dim, n_iter, columns, input_filename, output_dir):\n",
    "    command = ['./cleora-v1.0.1-x86_64-unknown-linux-gnu',\n",
    "                '--columns', columns,\n",
    "                '--dimension', str(dim), \n",
    "                '-n', str(n_iter), \n",
    "                '--input', input_filename, \n",
    "                '-o', output_dir]\n",
    "    subprocess.run(command, check=True)\n",
    "    return columns2output_filename(output_dir, columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Star expansion\n",
    "\n",
    "In the `fb_cleora_input_star.txt` file the first column is a virtual node. The parameter `-c \"transient::cluster_id node\"` means that embeddings will not be created for nodes from this column. This translates to star expansion scheme."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.37 ms, sys: 8.1 ms, total: 9.47 ms\n",
      "Wall time: 8.59 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "cleora_output_star_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], \"transient::cluster_id StarNode\", fb_cleora_input_star_filename, output_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clique expansion\n",
    "\n",
    "The `fb_cleora_input_clique.txt` file has the structure of adjacency list. The parameter `-c \"complex::reflexive::node\"` means that edges will be created for all cominations of nodes from each line. This translates to clique expansion scheme."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.42 ms, sys: 8.34 ms, total: 12.8 ms\n",
      "Wall time: 13.7 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "cleora_output_clique_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], \"complex::reflexive::CliqueNode\", fb_cleora_input_clique_filename, output_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## No expansion\n",
    "\n",
    "You can also compute Cleora without any expansion scheme by providing an input file in the edgelist format (single pair of nodes per line). Run with a simple parameter: `-c \"node1 node2\"`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classification\n",
    "\n",
    "We train a simple multiclass Logistic Regression classifier to predict the class of node based on its embedding. We assess the quality of the classifier with of 2 metrics: micro-F1 and macro-F1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_embeddings(input_file):\n",
    "    df_full = pd.read_csv(input_file, delimiter = \" \", skiprows=[0], header=None, \n",
    "                     index_col=0)\n",
    "    df_full = df_full.drop([1], axis=1)\n",
    "\n",
    "    return df_full"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_train_test(embeddings):\n",
    "    valid_idx = embeddings.index.to_numpy()\n",
    "    \n",
    "    train = np.loadtxt(train_filename, delimiter=\" \", dtype=np.int) \n",
    "    test = np.loadtxt(test_filename, delimiter=\" \", dtype=np.int)\n",
    "    \n",
    "    train = train[np.isin(train[:,0], valid_idx) & np.isin(train[:,1], valid_idx)]\n",
    "    test = [t for t in test if (t[0] in valid_idx) and (t[1] in valid_idx)] \n",
    "     \n",
    "    train = np.array(train)\n",
    "    test = np.array(test)\n",
    "    \n",
    "    return train,test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = config['batch_size']\n",
    "test_batch_size = config['test_batch_size']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 20/20 [00:15<00:00,  1.29it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "algo: output/emb__cluster_id__StarNode.out epochs: 20, micro f1: 0.9093110871905274, macro f1:0.9094875754311472\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "100%|██████████| 20/20 [00:15<00:00,  1.33it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "algo: output/emb__CliqueNode__CliqueNode.out epochs: 20, micro f1: 0.9171151776103337, macro f1:0.9169262311726959\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "for algo in config['input_embeddings']:\n",
    "    embeddings = read_embeddings(algo)\n",
    "    train,test = read_train_test(embeddings)\n",
    "                                 \n",
    "    y_train = train[:, 1]\n",
    "    y_test = test[:, 1]\n",
    "\n",
    "    clf = SGDClassifier(random_state=0, loss='log', alpha=0.0001)\n",
    "    for e in tqdm(range(0, max(config['epochs']))):\n",
    "        for idx in range(0,train.shape[0],batch_size):\n",
    "            ex=train[idx:min(idx+batch_size,train.shape[0]),:]\n",
    "\n",
    "            ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()\n",
    "            ex_y = y_train[idx:min(idx+batch_size,train.shape[0])]\n",
    "    \n",
    "            clf.partial_fit(ex_emb_in, ex_y, classes=[0,1,2,3])\n",
    "        \n",
    "        if e+1 in config['epochs']:\n",
    "            acc = 0.0\n",
    "            y_pred = []\n",
    "            for n, idx in enumerate(range(0,test.shape[0],test_batch_size)):\n",
    "                ex=test[idx:min(idx+test_batch_size,train.shape[0]),:]\n",
    "                ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()\n",
    "                pred = clf.predict_proba(ex_emb_in)\n",
    "    \n",
    "                classes = np.argmax(pred, axis=1)\n",
    "                y_pred.extend(classes)\n",
    "\n",
    "            f1_micro = f1_score(y_test, y_pred, average='micro')\n",
    "            f1_macro = f1_score(y_test, y_pred, average='macro')\n",
    "            print('algo: {} epochs: {}, micro f1: {}, macro f1:{}'.format(algo, e+1, f1_micro, f1_macro))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}