{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Data generation\n",
    "---------------------------\n",
    "\n",
    "First, we generate a dataset with the `peerannot simulate` command.\n",
    "This dataset has 30 workers, 200 tasks for 5 classes. Each task receives 10 votes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "path = (Path() / \"..\" / \"_build\" / \"notebooks\")\n",
    "path.mkdir(exist_ok=True, parents=True)\n",
    "\n",
    "! peerannot simulate --n-worker=30 --n-task=200  --n-classes=5 \\\n",
    "                     --strategy independent-confusion \\\n",
    "                     --feedback=10 --seed 0 \\\n",
    "                     --folder ../_build/notebooks/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can visualize the generated votes and the true labels of the tasks.\n",
    "For example let us consider task 5:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "with open(path / \"answers.json\") as f:\n",
    "    answers = json.load(f)\n",
    "gt = np.load(path / \"ground_truth.npy\")\n",
    "\n",
    "print(\"Task 5:\", answers[\"5\"])\n",
    "print(\"Number of votes:\", len(answers[\"5\"]))\n",
    "print(\"Ground truth:\", gt[5])\n",
    "fig, ax = plt.subplots()\n",
    "\n",
    "counts = np.bincount(list(answers[\"5\"].values()), minlength=5)\n",
    "classes = [f\"class {str(i)}\" for i in [0, 1, 2, 3, 4]]\n",
    "\n",
    "ax.bar(classes, counts)\n",
    "plt.yticks(range(0, max(counts)+1))\n",
    "ax.set_ylabel(\"Number of votes\")\n",
    "ax.set_title(\"Number of votes for each class for task 5\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Command Line Aggregation\n",
    "------------------------\n",
    "\n",
    "Let us run some aggregation methods on the dataset we just generated using the command line interface."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for strat in [\"MV\", \"NaiveSoft\", \"DS\", \"GLAD\", \"DSWC[L=5]\", \"Wawa\"]:\n",
    "    ! peerannot aggregate ../_build/notebooks/ -s {strat}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, as we know the ground truth we can evaluate the performance of the aggregation methods.\n",
    "In this example we consider the accuracy. Other metrics such as F1-scores, precision, recall, etc. can be used."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "def accuracy(labels, gt):\n",
    "    return np.mean(labels == gt) if labels.ndim == 1 else np.mean(np.argmax(labels, axis=1) == gt)\n",
    "\n",
    "results = {  # initialize results dictionary\n",
    "    \"mv\": [],\n",
    "    \"naivesoft\": [],\n",
    "    \"glad\": [],\n",
    "    \"ds\": [],\n",
    "    \"wawa\": [],\n",
    "    \"dswc[l=5]\": [],\n",
    "}\n",
    "for strategy in results.keys():\n",
    "    path_labels = path / \"labels\" / f\"labels_independent-confusion_{strategy}.npy\"\n",
    "    labels = np.load(path_labels)  # load aggregated labels\n",
    "    results[strategy].append(accuracy(labels, gt))  # compute accuracy\n",
    "results[\"NS\"] = results[\"naivesoft\"]  # rename naivesoft to NS\n",
    "results.pop(\"naivesoft\")\n",
    "\n",
    "# Styling the results\n",
    "results = pd.DataFrame(results, index=[\"AccTrain\"])\n",
    "results.columns = map(str.upper, results.columns)\n",
    "results = results.style.set_table_styles(\n",
    "    [dict(selector=\"th\", props=[(\"text-align\", \"center\")])]\n",
    ")\n",
    "results.set_properties(**{\"text-align\": \"center\"})\n",
    "results = results.format(precision=3)\n",
    "results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "API Aggregation\n",
    "------------------------\n",
    "\n",
    "We showed how to use the command line interface, but what about the API?\n",
    "It's just as simple!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from peerannot.models import agg_strategies\n",
    "\n",
    "strategies = [\"MV\", \"GLAD\", \"DS\", \"NaiveSoft\", \"DSWC\", \"Wawa\"]\n",
    "yhats = []\n",
    "for strat in strategies:\n",
    "    agg = agg_strategies[strat]\n",
    "    if strat != \"DSWC\":\n",
    "        agg = agg(answers, n_classes=5, n_workers=30, n_tasks=200, dataset=path)\n",
    "    else:\n",
    "        agg = agg(answers, L=5, n_classes=5, n_workers=30, n_tasks=200)\n",
    "    if hasattr(agg, \"run\"):\n",
    "        agg.run(maxiter=20)\n",
    "    yhats.append(agg.get_answers())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = {  # initialize results dictionary\n",
    "    \"mv\": [],\n",
    "    \"glad\": [],\n",
    "    \"ds\": [],\n",
    "    \"naivesoft\": [],\n",
    "    \"dswc[l=5]\": [],\n",
    "    \"wawa\": [],\n",
    "}\n",
    "for i, strategy in enumerate(results.keys()):\n",
    "    labels = yhats[i] # load aggregated labels\n",
    "    results[strategy].append(accuracy(labels, gt))  # compute accuracy\n",
    "results[\"NS\"] = results[\"naivesoft\"]  # rename naivesoft to NS\n",
    "results.pop(\"naivesoft\")\n",
    "\n",
    "# Styling the results\n",
    "results = pd.DataFrame(results, index=[\"AccTrain\"])\n",
    "results.columns = map(str.upper, results.columns)\n",
    "results = results.style.set_table_styles(\n",
    "    [dict(selector=\"th\", props=[(\"text-align\", \"center\")])]\n",
    ")\n",
    "results.set_properties(**{\"text-align\": \"center\"})\n",
    "results = results.format(precision=3)\n",
    "results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The difference in performance shown result from the random tie-breaks generated."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "peerannot",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}