{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU",
    "gpuClass": "standard",
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "7c21106e243948c1b19e00ba2212f842": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_9b91e48b1a7d4236a9b4edb41dbe3242",
              "IPY_MODEL_ab7891e3c1024943975123d6d8de0665",
              "IPY_MODEL_192a3dc75e774942a07bf29a6e3a65e8"
            ],
            "layout": "IPY_MODEL_927b3fa567dd41f1b61f9bc05eeb6a82"
          }
        },
        "9b91e48b1a7d4236a9b4edb41dbe3242": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_f1f21b9aeceb4110aa4988c4beacc129",
            "placeholder": "​",
            "style": "IPY_MODEL_59777f6d004b46499e87f0d8367361e6",
            "value": "Epoch: 100%"
          }
        },
        "ab7891e3c1024943975123d6d8de0665": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_9420b3d8d3d64e0492741f049f9b40fd",
            "max": 100,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_7cf1feb600694602a9bbc516d2ef8b07",
            "value": 100
          }
        },
        "192a3dc75e774942a07bf29a6e3a65e8": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_60596865e1934f7c83aab800e4625019",
            "placeholder": "​",
            "style": "IPY_MODEL_8139be1d51e14586902b4f9e89c1c706",
            "value": " 100/100 [02:24&lt;00:00,  1.40s/it]"
          }
        },
        "927b3fa567dd41f1b61f9bc05eeb6a82": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "f1f21b9aeceb4110aa4988c4beacc129": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "59777f6d004b46499e87f0d8367361e6": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "9420b3d8d3d64e0492741f049f9b40fd": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "7cf1feb600694602a9bbc516d2ef8b07": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "60596865e1934f7c83aab800e4625019": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "8139be1d51e14586902b4f9e89c1c706": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "1696f5d7748c4b20bb3368d2c331b77c": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_f2cef426f4cc433cbc4d9b72978f0b89",
              "IPY_MODEL_88e441b133ac48cc838c4f2559f7ecbc",
              "IPY_MODEL_3a0d32ee08854722b9e4b5acb43e3827"
            ],
            "layout": "IPY_MODEL_247c56dafc904b29b92918337053aad2"
          }
        },
        "f2cef426f4cc433cbc4d9b72978f0b89": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_5e5a80f1531541e49e28386c149f4dbd",
            "placeholder": "​",
            "style": "IPY_MODEL_64b0c057adb743a4b5a7c6d9ecd275ed",
            "value": "Evaluating: 100%"
          }
        },
        "88e441b133ac48cc838c4f2559f7ecbc": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_26f015d1c760403d8d5008eafbb2e734",
            "max": 25,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_b0bf1fe05a2046988f9ea6900addfa56",
            "value": 25
          }
        },
        "3a0d32ee08854722b9e4b5acb43e3827": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_ee794964c2094bcfab6b1beb12a3dc22",
            "placeholder": "​",
            "style": "IPY_MODEL_a8680e763adf40f6bbe61d0df15ba8da",
            "value": " 25/25 [00:00&lt;00:00, 108.52it/s]"
          }
        },
        "247c56dafc904b29b92918337053aad2": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "5e5a80f1531541e49e28386c149f4dbd": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "64b0c057adb743a4b5a7c6d9ecd275ed": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "26f015d1c760403d8d5008eafbb2e734": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "b0bf1fe05a2046988f9ea6900addfa56": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "ee794964c2094bcfab6b1beb12a3dc22": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "a8680e763adf40f6bbe61d0df15ba8da": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        }
      }
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# RNN for POS Tagging\n",
        "\n",
        "What is included in the notebook:\n",
        "\n",
        "- Implementation of RNN model for POS Tagging"
      ],
      "metadata": {
        "id": "jM909rAz86iC"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.nn.functional as F\n",
        "import torch.optim as optim\n",
        "\n",
        "torch.manual_seed(1)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "T2BHCHqO4hEC",
        "outputId": "6c40e7a8-2ae1-4100-d470-0b71a14c3d35"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<torch._C.Generator at 0x7fe108bc4110>"
            ]
          },
          "metadata": {},
          "execution_count": 1
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Dataset\n",
        "\n",
        "We will use the Treebank data obtained from NLTK.\n"
      ],
      "metadata": {
        "id": "0D3at1Q5ACM3"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "%%capture\n",
        "\n",
        "import nltk\n",
        "from nltk.corpus import treebank\n",
        "\n",
        "nltk.download('universal_tagset')\n",
        "nltk.download('treebank')"
      ],
      "metadata": {
        "id": "AvmzTdKNAGca"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Load data set"
      ],
      "metadata": {
        "id": "yexlfZRQAJh9"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Load the tagged Treebank sentences, requesting the 'universal' tagset,\n",
        "# then display the first sentence as a quick sanity check.\n",
        "tagged_sentences = treebank.tagged_sents(tagset='universal')\n",
        "tagged_sentences[0]"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ejPjn4-pAQvg",
        "outputId": "72420d46-d459-4b78-dace-bba3d5a9987c"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[('Pierre', 'NOUN'),\n",
              " ('Vinken', 'NOUN'),\n",
              " (',', '.'),\n",
              " ('61', 'NUM'),\n",
              " ('years', 'NOUN'),\n",
              " ('old', 'ADJ'),\n",
              " (',', '.'),\n",
              " ('will', 'VERB'),\n",
              " ('join', 'VERB'),\n",
              " ('the', 'DET'),\n",
              " ('board', 'NOUN'),\n",
              " ('as', 'ADP'),\n",
              " ('a', 'DET'),\n",
              " ('nonexecutive', 'ADJ'),\n",
              " ('director', 'NOUN'),\n",
              " ('Nov.', 'NOUN'),\n",
              " ('29', 'NUM'),\n",
              " ('.', '.')]"
            ]
          },
          "metadata": {},
          "execution_count": 3
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Create train/test split"
      ],
      "metadata": {
        "id": "RC--NOf7Az-2"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "# Hold out 20% of the sentences for testing; the fixed random_state keeps\n",
        "# the split reproducible across runs.\n",
        "train_tagged_sentences, test_tagged_sentences = train_test_split(tagged_sentences, test_size=0.2, random_state=42)"
      ],
      "metadata": {
        "id": "SiqKtYo5EFd3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Separate sentences and tag sequences"
      ],
      "metadata": {
        "id": "Z23hZnfXEIGm"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def make_x_y(tagged_sentences):\n",
        "    \"\"\"Separate sentences and tag sequences from tagged sentences.\n",
        "\n",
        "    Arguments\n",
        "    ----------\n",
        "        tagged_sentences (iterable): tagged sentences. Each tagged sentence\n",
        "            is a sequence of (word, tag) pairs and may be empty\n",
        "\n",
        "    Returns\n",
        "    ----------\n",
        "        sentences (list): list of sentences. Each sentence is a list of words\n",
        "        tag_sequences (list): list of tag sequences aligned with `sentences`.\n",
        "            Each tag sequence is a list of tags, one per word\n",
        "    \"\"\"\n",
        "    sentences = []\n",
        "    tag_sequences = []\n",
        "    for tagged_sentence in tagged_sentences:\n",
        "        # Materialize once so one-shot iterables can be traversed twice.\n",
        "        pairs = list(tagged_sentence)\n",
        "        # Comprehensions instead of `words, tags = zip(*pairs)`: unpacking\n",
        "        # the result of zip() raises ValueError when a sentence is empty.\n",
        "        sentences.append([word for word, _ in pairs])\n",
        "        tag_sequences.append([tag for _, tag in pairs])\n",
        "    return sentences, tag_sequences"
      ],
      "metadata": {
        "id": "mW6DzBhEAjIS"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Split each partition into parallel lists: word sequences and tag sequences.\n",
        "train_sentences, train_tag_sequences = make_x_y(train_tagged_sentences)\n",
        "test_sentences, test_tag_sequences = make_x_y(test_tagged_sentences)"
      ],
      "metadata": {
        "id": "xGhHIkYJ6vOG"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Steps in building RNN model for POS Tagging\n",
        "\n",
        "- Create Vocabulary, Vectorizer, Dataset\n",
        "- Implement model class\n",
        "- Training loop\n",
        "- Evaluation on the test data"
      ],
      "metadata": {
        "id": "SnIOM22QAX3H"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Create Vocabulary\n",
        "\n",
        "We modified the Vocabulary class in the previous lecture.\n",
        "\n",
        "We need to convert tags into integer indices, so we will create two vocabularies, one for words and one for tags."
      ],
      "metadata": {
        "id": "YKC5n_msAxLL"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from collections import defaultdict\n",
        "\n",
        "class Vocabulary:\n",
        "    def __init__(self, token_to_idx=None, use_unk=True):\n",
        "        \"\"\"Set up the token/index maps and the special indices.\n",
        "\n",
        "        Args:\n",
        "            token_to_idx (dict): a pre-existing map of tokens to indices;\n",
        "                an empty map is created when omitted. A supplied dict is\n",
        "                stored as-is (not copied)\n",
        "            use_unk (bool): when true `unk_index` is 1, otherwise -1\n",
        "        \"\"\"\n",
        "        self._token_to_idx = {} if token_to_idx is None else token_to_idx\n",
        "\n",
        "        # Reverse view of the same mapping: index -> token\n",
        "        self._idx_to_token = dict(\n",
        "            (position, symbol)\n",
        "            for symbol, position in self._token_to_idx.items()\n",
        "        )\n",
        "\n",
        "        self.pad_index = 0\n",
        "        # A negative `unk_index` marks the UNK fallback as disabled\n",
        "        self.unk_index = 1 if use_unk else -1\n",
        "\n",
        "    def lookup_token(self, token):\n",
        "        \"\"\"Map a token to its index, falling back to UNK when enabled.\n",
        "\n",
        "        Args:\n",
        "            token (str): the token to look up\n",
        "        Returns:\n",
        "            index (int): the index stored for the token, or `unk_index`\n",
        "              when the token is absent and `unk_index` is >= 0\n",
        "        Raises:\n",
        "            KeyError: if the token is absent and `unk_index` is negative\n",
        "        \"\"\"\n",
        "        known = self._token_to_idx\n",
        "        if self.unk_index >= 0:\n",
        "            # UNK fallback is active: absent tokens resolve to `unk_index`\n",
        "            return known.get(token, self.unk_index)\n",
        "        # No UNK fallback: plain indexing, so absent tokens raise KeyError\n",
        "        return known[token]\n",
        "\n",
        "    def lookup_index(self, index):\n",
        "        \"\"\"Return the token associated with the index\n",
        "\n",
        "        Args:\n",
        "            index (int): the index to look up\n",
        "        Returns:\n",
        "            token (str): the token corresponding to the index\n",
        "        Raises:\n",
        "            KeyError: if the index is not in the Vocabulary\n",
        "        \"\"\"\n",
        "        if index not in self._idx_to_token:\n",
        "            raise KeyError(\"the index (%d) is not in the Vocabulary\" % index)\n",
        "        return self._idx_to_token[index]\n",
        "\n",
        "    def add_token(self, token):\n",
        "        \"\"\"Update mapping dicts based on the token.\n",
        "\n",
        "        Args:\n",
        "            token (str): the item to add into the Vocabulary\n",
        "        Returns:\n",
        "            index (int): the integer corresponding to the token\n",
        "        \"\"\"\n",
        "        if token in self._token_to_idx:\n",
        "            index = self._token_to_idx[token]\n",
        "        else:\n",
        "            index = len(self._token_to_idx)\n",
        "            self._token_to_idx[token] = index\n",
        "            self._idx_to_token[index] = token\n",
        "        return index\n",
        "\n",
        "    @classmethod\n",
        "    def build_vocab(cls, sequences, use_unk=True):\n",
        "        \"\"\"Build vocabulary from a list of sequences\n",
        "        A sequence may be a sequence of words or a sequence of tags.\n",
        "\n",
        "        Arguments:\n",
        "        ----------\n",
        "            sequences (list): list of sequences, each sentence list of words\n",
        "            or list of tags\n",
        "\n",
        "        Return:\n",
        "        ----------\n",
        "            vocab (Vocabulary): a Vocabulary object\n",
        "        \"\"\"\n",
        "        if use_unk:\n",
        "            token_to_idx = {\"<PAD>\": 0, \"<UNK>\": 1}\n",
        "        else:\n",
        "            token_to_idx = {\"<PAD>\": 0}\n",
        "\n",
        "        vocab = cls(token_to_idx, use_unk=use_unk)\n",
        "        for s in sequences:\n",
        "            for word in s:\n",
        "                vocab.add_token(word)\n",
        "        return vocab\n",
        "\n",
        "    def __str__(self):\n",
        "        return \"<Vocabulary(size=%d)>\" % len(self)\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self._token_to_idx)"
      ],
      "metadata": {
        "id": "11ojPoT_8he1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Word vocabulary\n",
        "word_vocab = Vocabulary.build_vocab(train_sentences)\n",
        "print(word_vocab)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YO2HGYCKANK9",
        "outputId": "e8622409-c749-43a0-f317-4596827c7e11"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "<Vocabulary(size=11051)>\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Tag vocabulary\n",
        "tag_vocab = Vocabulary.build_vocab(train_tag_sequences, use_unk=False)\n",
        "print(tag_vocab._token_to_idx)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "HaNIuMBwW0dh",
        "outputId": "50d517b5-c8aa-472d-d653-8e391af991c7"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'<PAD>': 0, 'NOUN': 1, '.': 2, 'NUM': 3, 'ADJ': 4, 'VERB': 5, 'DET': 6, 'ADP': 7, 'CONJ': 8, 'PRON': 9, 'X': 10, 'ADV': 11, 'PRT': 12}\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Data Vectorizer"
      ],
      "metadata": {
        "id": "63wTCWWtaTHb"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "import numpy as np\n",
        "\n",
        "def vectorize(vocab, sequence):\n",
        "    \"\"\"\n",
        "    Args:\n",
        "        vocab (Vocabulary)\n",
        "        sequence (list): list of words or tags\n",
        "    \"\"\"\n",
        "    indices = [vocab.lookup_token(token) for token in sequence]\n",
        "\n",
        "    return torch.tensor(indices, dtype=torch.long)"
      ],
      "metadata": {
        "id": "fqrUB46vaVGF"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(train_sentences[0])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "vQMTRqC_gCAZ",
        "outputId": "3b52fe2e-4db4-40f2-c5b1-77bb0667cc66"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "vectorize(word_vocab, train_sentences[0])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "C6DN2qF3gE5i",
        "outputId": "528c8ef9-d798-4c69-cad8-830e869af1cd"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "tensor([ 2,  3,  4,  5,  6,  7,  4,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18])"
            ]
          },
          "metadata": {},
          "execution_count": 12
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(train_tag_sequences[0])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "75muyN_vgQ0E",
        "outputId": "4911dcca-c2a4-4c1e-93ba-86a6ec933860"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['NOUN', 'NOUN', '.', 'NUM', 'NOUN', 'ADJ', '.', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'NUM', '.']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "vectorize(tag_vocab, train_tag_sequences[0])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "cgl3-wergTsz",
        "outputId": "c148281d-8e19-4eae-af5a-166445ec0a2e"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "tensor([1, 1, 2, 3, 1, 4, 2, 5, 5, 6, 1, 7, 6, 4, 1, 1, 3, 2])"
            ]
          },
          "metadata": {},
          "execution_count": 14
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Vectorize train/test data"
      ],
      "metadata": {
        "id": "fTRwKyaNYv46"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "train_data = [vectorize(word_vocab, t) for t in train_sentences]\n",
        "test_data = [vectorize(word_vocab, t) for t in test_sentences]\n",
        "\n",
        "train_y = [vectorize(tag_vocab, t) for t in train_tag_sequences]\n",
        "test_y = [vectorize(tag_vocab, t) for t in test_tag_sequences]"
      ],
      "metadata": {
        "id": "ey8sO7ADYxmj"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Dataset class"
      ],
      "metadata": {
        "id": "QXU1ufU3JMxJ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from torch.utils.data import Dataset, DataLoader\n",
        "\n",
        "class TextDataset(Dataset):\n",
        "\n",
        "    def __init__(self, sequences, tag_sequences):\n",
        "        \"\"\"\n",
        "        Args:\n",
        "            sequences (list): list of sentences. Each sentence is a list of words\n",
        "            tag_sequences (list): list of tag sequences, each for one sentence\n",
        "        \"\"\"\n",
        "        self.sequences = sequences\n",
        "        self.tag_sequences = tag_sequences\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.sequences)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        x = self.sequences[index]\n",
        "        y = self.tag_sequences[index]\n",
        "\n",
        "        return x, y"
      ],
      "metadata": {
        "id": "qfQ-PxQyJOau"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Create train_dataset and test_dataset"
      ],
      "metadata": {
        "id": "Z4Z2ksjoLLaH"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "train_dataset = TextDataset(train_data, train_y)\n",
        "test_dataset = TextDataset(test_data, test_y)"
      ],
      "metadata": {
        "id": "14lrsCGpYNwW"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print( train_dataset[1] )"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "IaVUUrZXYfVA",
        "outputId": "5a360411-f943-45af-9fca-1185dc9f9132"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "(tensor([19, 20, 21, 22, 18]), tensor([8, 9, 5, 4, 2]))\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Create DataLoader\n",
        "\n",
        "We need to define function for processing batches generated by DataLoader"
      ],
      "metadata": {
        "id": "5J4SlQHDoWiu"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from torch.nn.utils.rnn import pad_sequence\n",
        "\n",
        "def collate_batch(batch):\n",
        "    \"\"\"Processing a batch generated by DataLoader\n",
        "\n",
        "    Arguments:\n",
        "    -----\n",
        "        batch (torch.tensor): a tensor generated by DataLoader\n",
        "    \"\"\"\n",
        "    (x, y) = zip(*batch)\n",
        "    x_lens = torch.tensor([len(x) for x in x])\n",
        "    y_lens = torch.tensor([len(y) for y in y])\n",
        "\n",
        "    x_pad = pad_sequence(x, batch_first=True, padding_value=0)\n",
        "    y_pad = pad_sequence(y, batch_first=True, padding_value=0)\n",
        "\n",
        "    return x_pad, y_pad, x_lens, y_lens"
      ],
      "metadata": {
        "id": "SCrjP0hroXlV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Creating Embedding Matrix from Pre-trained Glove"
      ],
      "metadata": {
        "id": "5uTgmdTWjfnr"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "import torch.nn as nn\n",
        "from torchtext.vocab import GloVe\n",
        "\n",
        "glove = GloVe(name='6B', dim=300)\n",
        "\n",
        "def create_embedding_matrix(glove, word_to_idx, emb_dim=300):\n",
        "    vocab_size = len(word_to_idx)\n",
        "    embedding_matrix = torch.zeros(vocab_size, emb_dim)\n",
        "\n",
        "    for word, idx in word_to_idx.items():\n",
        "        try:\n",
        "            embedding_matrix[idx] = glove[word]\n",
        "        except KeyError:\n",
        "            # Initialize out-of-vocabulary words with random embeddings\n",
        "            embedding_matrix[idx] = torch.randn(emb_dim)\n",
        "\n",
        "    return embedding_matrix\n",
        "\n",
        "# Assuming you have a word_to_idx dictionary\n",
        "word_to_idx = word_vocab._token_to_idx\n",
        "\n",
        "embedding_matrix = create_embedding_matrix(glove, word_to_idx)"
      ],
      "metadata": {
        "id": "q9fVlsaRjhxW"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## RNN Tagging Model"
      ],
      "metadata": {
        "id": "fkmP6Xhp9jiD"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import torch.nn as nn\n",
        "from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence\n",
        "\n",
        "\n",
        "class LSTMTagger(nn.Module):\n",
        "\n",
        "    def __init__(self, embedding_matrix, hidden_dim, tagset_size,\n",
        "                 num_layers=1, batch_first=True, padding_idx=0):\n",
        "\n",
        "        super(LSTMTagger, self).__init__()\n",
        "        vocab_size, embedding_size = embedding_matrix.shape\n",
        "\n",
        "        self.hidden_dim = hidden_dim\n",
        "\n",
        "        self.emb = nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=0)\n",
        "\n",
        "        # The LSTM takes word embeddings as inputs, and outputs hidden states\n",
        "        # with dimensionality hidden_dim.\n",
        "        self.lstm = nn.LSTM(embedding_dim, hidden_dim,\n",
        "                            num_layers=num_layers, bidirectional=True, batch_first=batch_first)\n",
        "        self.fc = nn.Linear(in_features=2*hidden_dim, out_features=tagset_size)\n",
        "\n",
        "        ## Comment out to disable weight initialization\n",
        "        torch.nn.init.xavier_uniform_(self.emb.weight)\n",
        "        torch.nn.init.xavier_uniform_(self.fc.weight)\n",
        "\n",
        "    def forward(self, x_in, x_lens):\n",
        "        x_embed = self.emb(x_in)\n",
        "        x_packed = pack_padded_sequence(x_embed, x_lens, batch_first=True, enforce_sorted=False)\n",
        "        output_packed, _ = self.lstm(x_packed)\n",
        "        output_padded, output_lengths = pad_packed_sequence(output_packed, batch_first=True)\n",
        "        tag_space = self.fc(output_padded)\n",
        "        tag_scores = F.log_softmax(tag_space, dim=1)\n",
        "        return tag_scores"
      ],
      "metadata": {
        "id": "FGt-X6ei9nJq"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Create an LSTM Tagger Model"
      ],
      "metadata": {
        "id": "d5y8qIUEAxrK"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "hidden_dim = 128\n",
        "num_layers = 2\n",
        "tagset_size = len(tag_vocab)\n",
        "batch_first = True\n",
        "\n",
        "model = LSTMTagger(embedding_matrix,\n",
        "                   hidden_dim=hidden_dim,\n",
        "                   num_layers=num_layers,\n",
        "                   tagset_size=tagset_size,\n",
        "                   batch_first=batch_first)"
      ],
      "metadata": {
        "id": "ZjYlVuHgA1dR"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(model)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "JwHuxQoXB0Aw",
        "outputId": "edf508ab-e764-428b-a214-8ca8544eaba8"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "LSTMTagger(\n",
            "  (emb): Embedding(11051, 300, padding_idx=0)\n",
            "  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, bidirectional=True)\n",
            "  (fc): Linear(in_features=256, out_features=13, bias=True)\n",
            ")\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Training Loop"
      ],
      "metadata": {
        "id": "BRtuvVjrB41j"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from tqdm.notebook import trange, tqdm\n",
        "\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "\n",
        "learning_rate = 1e-3\n",
        "batch_size = 32\n",
        "epochs = 100\n",
        "\n",
        "criterion = torch.nn.CrossEntropyLoss(ignore_index=0)\n",
        "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n",
        "model.to(device)\n",
        "\n",
        "def train():\n",
        "    train_dataloader = DataLoader(\n",
        "        train_dataset,\n",
        "        collate_fn=collate_batch,\n",
        "        batch_size=batch_size,\n",
        "    )\n",
        "    model.train()\n",
        "    train_iterator = trange(int(epochs), desc=\"Epoch\")\n",
        "\n",
        "    for _ in train_iterator:\n",
        "        for x_pad, y_pad, x_lens, y_lens in train_dataloader:\n",
        "            x_pad = x_pad.to(device)\n",
        "            y_pad = y_pad.to(device)\n",
        "\n",
        "            optimizer.zero_grad()\n",
        "            pred = model(x_pad, x_lens)\n",
        "\n",
        "            pred = pred.view(-1, pred.shape[-1])\n",
        "            y_pad = y_pad.view(-1)\n",
        "\n",
        "            loss = criterion(pred, y_pad)\n",
        "            loss.backward()\n",
        "            optimizer.step()\n",
        "\n",
        "train()\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 49,
          "referenced_widgets": [
            "7c21106e243948c1b19e00ba2212f842",
            "9b91e48b1a7d4236a9b4edb41dbe3242",
            "ab7891e3c1024943975123d6d8de0665",
            "192a3dc75e774942a07bf29a6e3a65e8",
            "927b3fa567dd41f1b61f9bc05eeb6a82",
            "f1f21b9aeceb4110aa4988c4beacc129",
            "59777f6d004b46499e87f0d8367361e6",
            "9420b3d8d3d64e0492741f049f9b40fd",
            "7cf1feb600694602a9bbc516d2ef8b07",
            "60596865e1934f7c83aab800e4625019",
            "8139be1d51e14586902b4f9e89c1c706"
          ]
        },
        "id": "z8WZa3b0B72x",
        "outputId": "702c2b35-659d-4835-ed3a-5b63d8f0f1a4"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "Epoch:   0%|          | 0/100 [00:00<?, ?it/s]"
            ],
            "application/vnd.jupyter.widget-view+json": {
              "version_major": 2,
              "version_minor": 0,
              "model_id": "7c21106e243948c1b19e00ba2212f842"
            }
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Evaluation"
      ],
      "metadata": {
        "id": "ntT6PIxnKHZ9"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn import metrics\n",
        "\n",
        "def evaluate():\n",
        "    model.eval()\n",
        "    test_dataloader = DataLoader(\n",
        "        test_dataset,\n",
        "        collate_fn=collate_batch,\n",
        "        shuffle=False,\n",
        "        batch_size=batch_size,\n",
        "    )\n",
        "\n",
        "    y_true = []\n",
        "    y_pred = []\n",
        "    with torch.no_grad():\n",
        "        for x_pad, y_pad, x_lens, y_lens in tqdm(test_dataloader, desc=\"Evaluating\"):\n",
        "            x_pad = x_pad.to(device)\n",
        "            y_pad = y_pad.to(device)\n",
        "\n",
        "            logits = model(x_pad, x_lens)\n",
        "            predictions = logits.argmax(-1)\n",
        "\n",
        "            predictions = predictions.detach().cpu().numpy()\n",
        "            y_pad = y_pad.detach().cpu().numpy()\n",
        "\n",
        "            y_lens = y_lens.numpy()\n",
        "\n",
        "            for i in range(y_pad.shape[0]):\n",
        "                len_y = y_lens[i]\n",
        "                for true_tag, predicted_tag in zip(y_pad[i][:len_y], predictions[i][:len_y]):\n",
        "                    if predicted_tag != tag_vocab.pad_index:\n",
        "                        true_tag = tag_vocab.lookup_index(true_tag)\n",
        "                        predicted_tag = tag_vocab.lookup_index(predicted_tag)\n",
        "                        y_true.append(true_tag)\n",
        "                        y_pred.append(predicted_tag)\n",
        "\n",
        "    print(\"Accuracy: %.4f\" % metrics.accuracy_score(y_true, y_pred))\n",
        "    print(metrics.classification_report(y_true, y_pred))\n",
        "\n",
        "evaluate()"
      ],
      "metadata": {
        "id": "BwKHamnVKKDY",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 416,
          "referenced_widgets": [
            "1696f5d7748c4b20bb3368d2c331b77c",
            "f2cef426f4cc433cbc4d9b72978f0b89",
            "88e441b133ac48cc838c4f2559f7ecbc",
            "3a0d32ee08854722b9e4b5acb43e3827",
            "247c56dafc904b29b92918337053aad2",
            "5e5a80f1531541e49e28386c149f4dbd",
            "64b0c057adb743a4b5a7c6d9ecd275ed",
            "26f015d1c760403d8d5008eafbb2e734",
            "b0bf1fe05a2046988f9ea6900addfa56",
            "ee794964c2094bcfab6b1beb12a3dc22",
            "a8680e763adf40f6bbe61d0df15ba8da"
          ]
        },
        "outputId": "988fea32-7602-46c8-8f9e-047e76c1d59a"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]"
            ],
            "application/vnd.jupyter.widget-view+json": {
              "version_major": 2,
              "version_minor": 0,
              "model_id": "1696f5d7748c4b20bb3368d2c331b77c"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Accuracy: 0.9559\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "           .       1.00      1.00      1.00      2354\n",
            "         ADJ       0.88      0.80      0.84      1316\n",
            "         ADP       0.97      0.98      0.98      2028\n",
            "         ADV       0.90      0.87      0.88       631\n",
            "        CONJ       0.98      0.98      0.98       465\n",
            "         DET       0.99      0.99      0.99      1795\n",
            "        NOUN       0.94      0.97      0.95      5943\n",
            "         NUM       0.96      0.92      0.94       726\n",
            "        PRON       0.98      0.99      0.99       522\n",
            "         PRT       0.96      0.98      0.97       656\n",
            "        VERB       0.94      0.95      0.94      2740\n",
            "           X       1.00      0.95      0.97      1360\n",
            "\n",
            "    accuracy                           0.96     20536\n",
            "   macro avg       0.96      0.95      0.95     20536\n",
            "weighted avg       0.96      0.96      0.96     20536\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Further Improvements\n",
        "\n",
        "- Initialize weights of the neural networks\n",
        "- Use pre-trained word embeddings"
      ],
      "metadata": {
        "id": "vDOkei_ni2Ft"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## References\n",
        "\n",
        "- [Sequence Models and Long Short-Term Memory Networks](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html), on official Pytorch tutorial.\n",
        "- [LSTM (character + word) POS-tag model PyTorch](https://www.kaggle.com/code/krishanudb/lstm-character-word-pos-tag-model-pytorch)\n",
        "- [1 - BiLSTM for PoS Tagging](https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1_bilstm.ipynb), Notebook\n",
        "- [bentrevett/pytorch-pos-tagging](https://github.com/bentrevett/pytorch-pos-tagging)\n",
        "- [Pad pack sequences for Pytorch batch processing with DataLoader](https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html)\n"
      ],
      "metadata": {
        "id": "ueACNUfN3Pry"
      }
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "L8jxQ1CdIzT4"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}