{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "b4d2971da985472ba0ddc86f90b4f527": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_7e9066a0c9a34ceaa9fd6ad2d1518986", "IPY_MODEL_010fcb1915d54dd198fc5877da5bae63", "IPY_MODEL_3623abfddc294a9c8758f967aaa19937" ], "layout": "IPY_MODEL_9bf11ad3f63a45acb55353502022dd93" } }, "7e9066a0c9a34ceaa9fd6ad2d1518986": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a115e3a2d5e64a84b1df0a1cd3e8cd8f", "placeholder": "​", "style": "IPY_MODEL_950216a2fd18445b97d0ee3470b9a1b8", "value": "tokenizer_config.json: 100%" } }, "010fcb1915d54dd198fc5877da5bae63": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": 
"success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_fdae5a5f56734d3c8579a4fa6a7752fc", "max": 48, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_507ebe2b3ba04f9b90d0406b9cf340b5", "value": 48 } }, "3623abfddc294a9c8758f967aaa19937": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7c745de05b604198adbd18521e132a6b", "placeholder": "​", "style": "IPY_MODEL_61f34f9fee2049b8bfdb600fc1ac7479", "value": " 48.0/48.0 [00:00<00:00, 5.08kB/s]" } }, "9bf11ad3f63a45acb55353502022dd93": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, 
"width": null } }, "a115e3a2d5e64a84b1df0a1cd3e8cd8f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "950216a2fd18445b97d0ee3470b9a1b8": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "fdae5a5f56734d3c8579a4fa6a7752fc": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, 
"align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "507ebe2b3ba04f9b90d0406b9cf340b5": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "7c745de05b604198adbd18521e132a6b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, 
"justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "61f34f9fee2049b8bfdb600fc1ac7479": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "1fdf91c546104a579347eaa24a60da5e": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_1dec5d7d70ef4577a1d60dd5bd1cdc38", "IPY_MODEL_ef34deb6195b45fcbb8f52d26694a0d1", "IPY_MODEL_f236b722c48d4b1f9e2520cda6e0aad9" ], "layout": "IPY_MODEL_eec610a52be146b88e32db5eedbb3b09" } }, "1dec5d7d70ef4577a1d60dd5bd1cdc38": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_334f8dd0d7274cb3a072c5a8230c336c", "placeholder": "​", "style": "IPY_MODEL_279269ca89934061a8bfcd38ffd8467e", 
"value": "vocab.txt: 100%" } }, "ef34deb6195b45fcbb8f52d26694a0d1": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5687152a773547838c24d73f38d38ffc", "max": 231508, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_a45d81ab597249ecadefafa3950b4a66", "value": 231508 } }, "f236b722c48d4b1f9e2520cda6e0aad9": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_fc09d0acc63d4d6e8db4524d531ff8b3", "placeholder": "​", "style": "IPY_MODEL_31d0b3a321d145979f01b7aaf32952a7", "value": " 232k/232k [00:00<00:00, 1.08MB/s]" } }, "eec610a52be146b88e32db5eedbb3b09": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, 
"grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "334f8dd0d7274cb3a072c5a8230c336c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "279269ca89934061a8bfcd38ffd8467e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "5687152a773547838c24d73f38d38ffc": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a45d81ab597249ecadefafa3950b4a66": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "fc09d0acc63d4d6e8db4524d531ff8b3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "31d0b3a321d145979f01b7aaf32952a7": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "7c3375fd2fb74cd2a2a3cf59d2539d89": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_324feb694a064859a533a760d1de44b0", "IPY_MODEL_af8b8e6559104aa7a1112c38ea4a843a", "IPY_MODEL_a5c1af764c12459aa99e16a0358e5971" ], "layout": "IPY_MODEL_f92f2ffeb9dc43c79ab30c2e052dcdba" } }, "324feb694a064859a533a760d1de44b0": { "model_module": "@jupyter-widgets/controls", "model_name": 
"HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f094b86ef6b943738669d2f15fb51d3c", "placeholder": "​", "style": "IPY_MODEL_4ead4e4549da43ec856399b0ad0100b9", "value": "tokenizer.json: 100%" } }, "af8b8e6559104aa7a1112c38ea4a843a": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e327f72bab4143c5907f97f7b319d06a", "max": 466062, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_10ab21b53c274925b76564ee4f991527", "value": 466062 } }, "a5c1af764c12459aa99e16a0358e5971": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_231bc555458a4a629ef2cdb8338fce63", "placeholder": "​", "style": "IPY_MODEL_f526c3aab39f4e93a0de8bc807713a3d", "value": " 466k/466k [00:00<00:00, 2.00MB/s]" } }, "f92f2ffeb9dc43c79ab30c2e052dcdba": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": 
{ "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f094b86ef6b943738669d2f15fb51d3c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": 
null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4ead4e4549da43ec856399b0ad0100b9": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "e327f72bab4143c5907f97f7b319d06a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "10ab21b53c274925b76564ee4f991527": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "231bc555458a4a629ef2cdb8338fce63": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f526c3aab39f4e93a0de8bc807713a3d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "96b306e4c92b436f92de9e41699977ed": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": 
"1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_7295cba870444b158104a8d84970d50f", "IPY_MODEL_5301fc5aa6f44fad81ba7983d9505017", "IPY_MODEL_07736ea6737d437bb35b4c8211ff0df0" ], "layout": "IPY_MODEL_18061307d2214690b8d765a87a16b064" } }, "7295cba870444b158104a8d84970d50f": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f1dd1935354548048e16d08037eb1a45", "placeholder": "​", "style": "IPY_MODEL_21777bb011eb40fcb93b2b24b68a4854", "value": "config.json: 100%" } }, "5301fc5aa6f44fad81ba7983d9505017": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_aaada3d216ff408c8630921e514db928", "max": 570, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_904ebf9b43b84efcaa376eb922d110bc", "value": 570 } }, "07736ea6737d437bb35b4c8211ff0df0": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, 
"_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_644c595a19e64b4a8617ef049248e43e", "placeholder": "​", "style": "IPY_MODEL_dddf6cc4a7c84e1e896e577b76ad70bf", "value": " 570/570 [00:00<00:00, 44.2kB/s]" } }, "18061307d2214690b8d765a87a16b064": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f1dd1935354548048e16d08037eb1a45": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, 
"grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "21777bb011eb40fcb93b2b24b68a4854": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "aaada3d216ff408c8630921e514db928": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "904ebf9b43b84efcaa376eb922d110bc": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "644c595a19e64b4a8617ef049248e43e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "dddf6cc4a7c84e1e896e577b76ad70bf": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "0c2556096909469c992868dfdaf3065d": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_d5730e4c876943e4b039218cad89964f", "IPY_MODEL_24279c3b9fe94912a2445ed04683abad", "IPY_MODEL_2549bc13587040a0898fddf875712eec" ], "layout": "IPY_MODEL_79be610cddd647899f1c742b79d5b774" } }, "d5730e4c876943e4b039218cad89964f": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b800a7e468e940feb3f43c1809b7c3dc", "placeholder": "​", "style": "IPY_MODEL_7bcd16a6155b486b8089c6310fad869f", "value": "model.safetensors: 100%" } }, "24279c3b9fe94912a2445ed04683abad": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", 
"description_tooltip": null, "layout": "IPY_MODEL_141eba54141f43cba85f3cf0641c8ca4", "max": 440449768, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_38ae217388684aef860bd37545851614", "value": 440449768 } }, "2549bc13587040a0898fddf875712eec": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f094117ddbf547e3b9c8641f663ef6e4", "placeholder": "​", "style": "IPY_MODEL_284cc72895714a8c94c518db4669a653", "value": " 440M/440M [00:01<00:00, 203MB/s]" } }, "79be610cddd647899f1c742b79d5b774": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, 
"b800a7e468e940feb3f43c1809b7c3dc": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7bcd16a6155b486b8089c6310fad869f": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "141eba54141f43cba85f3cf0641c8ca4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": 
null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "38ae217388684aef860bd37545851614": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "f094117ddbf547e3b9c8641f663ef6e4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": 
null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "284cc72895714a8c94c518db4669a653": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "markdown", "source": [ "# Fine-tuning BERT for Sentiment Classification\n", "\n", "In this tutorial, we are going to fine-tune a pre-trained BERT model for sentiment classification. If you are not familiar with BERT, refer to the lecture slides on the [Masked language model](https://drive.google.com/file/d/1NOyi0k0EclW4X51iAXuvAET-UjBs8J_S/view?usp=drive_link)." ], "metadata": { "id": "S4Nh9huMezZd" } }, { "cell_type": "markdown", "source": [ "## Dataset\n", "\n", "We will use the data set [sentiment.txt](https://drive.google.com/file/d/1JdJ9oMZRbXMZ4b124uQ9BZ2K9GyuwI6f/view?usp=sharing). The data contains tokenized English movie reviews and their corresponding labels. There are two categories:\n", "- +1 label for positive reviews\n", "- -1 label for negative reviews" ], "metadata": { "id": "c6MYbbdQhTca" } }, { "cell_type": "markdown", "source": [ "## Loading data\n", "\n", "We are going to load the data file into a DataFrame. We will change the labels to (1, 0) instead of (+1, -1) because in many machine learning frameworks, labels are indexed from 0." 
], "metadata": { "id": "yW-cLwWLjojE" } }, { "cell_type": "code", "source": [ "# prompt: Download datafile sentiment.txt from Google Drive https://drive.google.com/file/d/1JdJ9oMZRbXMZ4b124uQ9BZ2K9GyuwI6f/view using gdown.\n", "!rm -f sentiment.txt\n", "!gdown 1JdJ9oMZRbXMZ4b124uQ9BZ2K9GyuwI6f\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QRHkIhl9lCjR", "outputId": "51203e47-3421-4cc2-f22d-8bb391488bfd" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Downloading...\n", "From: https://drive.google.com/uc?id=1JdJ9oMZRbXMZ4b124uQ9BZ2K9GyuwI6f\n", "To: /content/sentiment.txt\n", "\r 0% 0.00/1.27M [00:00 text\n", "\n", "import pandas as pd\n", "\n", "# Load the data into a pandas DataFrame\n", "data = []\n", "with open('sentiment.txt', 'r', encoding='ISO-8859-1') as file:\n", " for line in file:\n", " label, text = line.strip().split(' ', 1)\n", " data.append({'label': int(label), 'text': text})\n", "\n", "df = pd.DataFrame(data)\n", "\n", "# Replace labels +1 and -1 with 1 and 0 respectively\n", "df['label'] = df['label'].replace({1: 1, -1: 0})\n", "\n", "print(df.head())\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bmyHBTbYmF_V", "outputId": "d2a007b7-d1e2-4b2f-f57d-4208b6d07ad0" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " label text\n", "0 1 the rock is destined to be the 21st century's ...\n", "1 1 the gorgeously elaborate continuation of \" the...\n", "2 1 effective but too-tepid biopic\n", "3 1 if you sometimes like to go to the movies to h...\n", "4 1 emerges as something rare , an issue movie tha...\n" ] } ] }, { "cell_type": "markdown", "source": [ "We would like to know data statistics such as the number of positive/negative reviews, max, min and mean of reviews' lengths (the number of words of reviews) in the data." 
], "metadata": { "id": "r0iZwG31mhdS" } }, { "cell_type": "code", "source": [ "# prompt: Calculate the number of positive/negative reviews, max, min and mean of reviews' lengths (the number of words of reviews) in the data. Do not add a new column for review length into the dataframe. Print the information in bullet items.\n", "\n", "def count_words(review):\n", "    \"\"\"Return the number of whitespace-separated words in one review.\"\"\"\n", "    return len(review.split())\n", "\n", "\n", "# Class sizes: label 1 marks a positive review, label 0 a negative one\n", "positive_reviews = len(df[df['label'] == 1])\n", "negative_reviews = len(df[df['label'] == 0])\n", "\n", "# Per-review word counts, held in a standalone Series so df gets no extra column\n", "review_lengths = df['text'].apply(count_words)\n", "\n", "# Longest, shortest and average review length\n", "max_length, min_length, mean_length = (\n", "    review_lengths.max(),\n", "    review_lengths.min(),\n", "    review_lengths.mean(),\n", ")\n", "\n", "# Emit the summary as a bulleted list, one bullet per line\n", "summary_lines = [\n", "    \"Data Statistics:\",\n", "    f\"- Number of positive reviews: {positive_reviews}\",\n", "    f\"- Number of negative reviews: {negative_reviews}\",\n", "    f\"- Maximum review length: {max_length}\",\n", "    f\"- Minimum review length: {min_length}\",\n", "    f\"- Mean review length: {mean_length}\",\n", "]\n", "for summary_line in summary_lines:\n", "    print(summary_line)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MWbzjWP3m133", "outputId": "38a5ec79-d910-4cc2-c0e0-5093b5dfe475" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Data Statistics:\n", "- Number of positive reviews: 5331\n", "- Number of negative reviews: 5331\n", "- Maximum review length: 59\n", "- Minimum review length: 1\n", "- Mean review length: 21.0160382667417\n" ] } ] }, { "cell_type": "markdown", "source": [ "### Data split\n", "\n", "We would like to split data into three sets: train/validation/test set with ratio 70% for training, 10% for validation and 20% for test. We will use `train_test_split` function in scikit-learn package. 
Save data into three Data Frames" ], "metadata": { "id": "BXt1q0lKnYSu" } }, { "cell_type": "code", "source": [ "# prompt: We would like to split data into three sets: train/validation/test set with ratio 70% for training, 10% for validation and 20% for test. We will use train_test_split function in scikit-learn package. Make distribution of labels in three sets equally. Save data into three Data Frames\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "# Split data into train (70%) and temp (30%)\n", "df_train, df_temp = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])\n", "\n", "# Split temp data into validation (10% of total) and test (20% of total)\n", "df_val, df_test = train_test_split(df_temp, test_size=2/3, random_state=42, stratify=df_temp['label'])\n", "\n", "# Print the shapes of the resulting DataFrames\n", "print(\"Shape of train set:\", df_train.shape)\n", "print(\"Shape of validation set:\", df_val.shape)\n", "print(\"Shape of test set:\", df_test.shape)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8O8C1ioGoGLl", "outputId": "f0c58ae5-13ce-4db7-df69-80aefbfe6c1e" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Shape of train set: (7463, 2)\n", "Shape of validation set: (1066, 2)\n", "Shape of test set: (2133, 2)\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Naive Bayes Baseline\n", "\n", "Before implementing a complicated model like BERT, you may want to try a\n", "simple baseline model such as Naive Bayes to gauge the difficulty of the problem and understand the data better." 
], "metadata": { "id": "IYp9TfSGiB_L" } }, { "cell_type": "code", "source": [ "# prompt: Train Naive Bayes model on the train dataset and evaluate the model on validation and test set.\n", "# - Use Scikit-learn for implementation.\n", "# - Use CountTokenizer for features\n", "# - Print classification report in evaluation\n", "# - Use default parameters\n", "\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import classification_report\n", "\n", "# Bag-of-words features: the vocabulary is fitted on the training split only\n", "# and then reused unchanged for the validation and test splits.\n", "vectorizer = CountVectorizer()\n", "X_train = vectorizer.fit_transform(df_train['text'])\n", "X_val, X_test = (vectorizer.transform(split['text']) for split in (df_val, df_test))\n", "\n", "# Gold labels of each split\n", "y_train, y_val, y_test = (split['label'] for split in (df_train, df_val, df_test))\n", "\n", "# Multinomial Naive Bayes with default parameters, trained on the training split\n", "nb_model = MultinomialNB()\n", "nb_model.fit(X_train, y_train)\n", "\n", "# Predicted labels for the two held-out splits\n", "y_val_pred, y_test_pred = (nb_model.predict(features) for features in (X_val, X_test))\n", "\n", "# One classification report per held-out split\n", "for header, gold, predicted in (\n", "    (\"Validation Set:\", y_val, y_val_pred),\n", "    (\"\\nTest Set:\", y_test, y_test_pred),\n", "):\n", "    print(header)\n", "    print(classification_report(gold, predicted))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2Yme3ZcNouLy", "outputId": "e9c9602f-5cbd-421c-d0c6-617758712a11" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Validation Set:\n", " precision recall f1-score support\n", "\n", " 0 0.75 0.76 0.76 533\n", " 1 0.76 0.75 0.75 533\n", "\n", " accuracy 0.76 1066\n", " macro avg 0.76 0.76 0.76 1066\n", "weighted avg 0.76 0.76 0.76 1066\n", "\n", "\n", "Test Set:\n", " precision 
recall f1-score support\n", "\n", " 0 0.78 0.79 0.78 1067\n", " 1 0.79 0.78 0.78 1066\n", "\n", " accuracy 0.78 2133\n", " macro avg 0.78 0.78 0.78 2133\n", "weighted avg 0.78 0.78 0.78 2133\n", "\n" ] } ] }, { "cell_type": "markdown", "source": [ "We do not get high accuracy on the validation and test data set with Naive Bayes model. Let's see the performance of BERT model on the dataset" ], "metadata": { "id": "3TyWtz3LpwIe" } }, { "cell_type": "markdown", "source": [ "## Fine-tuning BERT Model\n", "\n", "In this section, we will fine-tune BERT model for text classification. There are several steps:\n", "- Text Encoding with tokenizer\n", "- Fine-tuning BERT model\n", "- Evaluation on test dataset" ], "metadata": { "id": "M4JkOXqdqIz6" } }, { "cell_type": "code", "source": [ "%%capture\n", "# prompt: Install transformers package\n", "\n", "!pip install -q transformers\n" ], "metadata": { "id": "Yb8ryM6qAAMw" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Encoding Texts\n", "\n", "We will use tokenizer to encode train data (df_train), validation data (df_val) and test data (df_test) in batch encode mode. Since reviews' lengths are different, apply truncating method." ], "metadata": { "id": "-iuRzdfQ_nIC" } }, { "cell_type": "code", "source": [ "# prompt: We will use tokenizer to encode train data (df_train), validation data (df_val) and test data (df_test) in batch encode mode. Just use data in text column. 
Since reviews' lengths are different, apply truncating method.\n", "\n", "from transformers import BertTokenizerFast\n", "\n", "# Load the pre-trained BERT tokenizer\n", "tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')\n", "\n", "# Encode the train, validation, and test data\n", "train_encodings = tokenizer(list(df_train['text']), truncation=True, padding=True)\n", "val_encodings = tokenizer(list(df_val['text']), truncation=True, padding=True)\n", "test_encodings = tokenizer(list(df_test['text']), truncation=True, padding=True)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 276, "referenced_widgets": [ "b4d2971da985472ba0ddc86f90b4f527", "7e9066a0c9a34ceaa9fd6ad2d1518986", "010fcb1915d54dd198fc5877da5bae63", "3623abfddc294a9c8758f967aaa19937", "9bf11ad3f63a45acb55353502022dd93", "a115e3a2d5e64a84b1df0a1cd3e8cd8f", "950216a2fd18445b97d0ee3470b9a1b8", "fdae5a5f56734d3c8579a4fa6a7752fc", "507ebe2b3ba04f9b90d0406b9cf340b5", "7c745de05b604198adbd18521e132a6b", "61f34f9fee2049b8bfdb600fc1ac7479", "1fdf91c546104a579347eaa24a60da5e", "1dec5d7d70ef4577a1d60dd5bd1cdc38", "ef34deb6195b45fcbb8f52d26694a0d1", "f236b722c48d4b1f9e2520cda6e0aad9", "eec610a52be146b88e32db5eedbb3b09", "334f8dd0d7274cb3a072c5a8230c336c", "279269ca89934061a8bfcd38ffd8467e", "5687152a773547838c24d73f38d38ffc", "a45d81ab597249ecadefafa3950b4a66", "fc09d0acc63d4d6e8db4524d531ff8b3", "31d0b3a321d145979f01b7aaf32952a7", "7c3375fd2fb74cd2a2a3cf59d2539d89", "324feb694a064859a533a760d1de44b0", "af8b8e6559104aa7a1112c38ea4a843a", "a5c1af764c12459aa99e16a0358e5971", "f92f2ffeb9dc43c79ab30c2e052dcdba", "f094b86ef6b943738669d2f15fb51d3c", "4ead4e4549da43ec856399b0ad0100b9", "e327f72bab4143c5907f97f7b319d06a", "10ab21b53c274925b76564ee4f991527", "231bc555458a4a629ef2cdb8338fce63", "f526c3aab39f4e93a0de8bc807713a3d", "96b306e4c92b436f92de9e41699977ed", "7295cba870444b158104a8d84970d50f", "5301fc5aa6f44fad81ba7983d9505017", "07736ea6737d437bb35b4c8211ff0df0", 
"18061307d2214690b8d765a87a16b064", "f1dd1935354548048e16d08037eb1a45", "21777bb011eb40fcb93b2b24b68a4854", "aaada3d216ff408c8630921e514db928", "904ebf9b43b84efcaa376eb922d110bc", "644c595a19e64b4a8617ef049248e43e", "dddf6cc4a7c84e1e896e577b76ad70bf" ] }, "id": "D0ojekUn_o70", "outputId": "c4017cf6-9599-47f9-ca08-18788f339b1e" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "tokenizer_config.json: 0%| | 0.00/48.0 [00:00 best_accuracy:\n", " best_accuracy = accuracy\n", " torch.save(model.state_dict(), best_model_path)\n", " print(f\"Best model saved at epoch {epoch+1} with accuracy: {best_accuracy}\")\n", "\n", "print(f\"Training finished. 
Best model saved to {best_model_path} with accuracy {best_accuracy}\")\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zc3NFBWvlVBJ", "outputId": "5c839469-96da-4277-c043-1631314abc6f" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Epoch 1: 100%|██████████| 467/467 [01:35<00:00, 4.90it/s, loss=0.0576]\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Validation Accuracy after epoch 1: 0.8545966228893058\n", "Best model saved at epoch 1 with accuracy: 0.8545966228893058\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "Epoch 2: 100%|██████████| 467/467 [01:39<00:00, 4.68it/s, loss=0.746]\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Validation Accuracy after epoch 2: 0.8574108818011257\n", "Best model saved at epoch 2 with accuracy: 0.8574108818011257\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "Epoch 3: 100%|██████████| 467/467 [01:39<00:00, 4.69it/s, loss=0.0124]\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Validation Accuracy after epoch 3: 0.848968105065666\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "Epoch 4: 100%|██████████| 467/467 [01:39<00:00, 4.70it/s, loss=0.022]\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Validation Accuracy after epoch 4: 0.8348968105065666\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "Epoch 5: 100%|██████████| 467/467 [01:39<00:00, 4.70it/s, loss=0.0252]\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Validation Accuracy after epoch 5: 0.8564727954971857\n", "Training finished. 
Best model saved to best_model.bin with accuracy 0.8574108818011257\n" ] } ] }, { "cell_type": "markdown", "source": [ "Load the best model checkpoint to the model" ], "metadata": { "id": "L749I2VGmJhr" } }, { "cell_type": "code", "source": [ "# prompt: Load the best model checkpoint to the model\n", "\n", "# The checkpoint is a plain state_dict, so weights_only=True is sufficient: it keeps\n", "# torch.load from unpickling arbitrary objects and silences the FutureWarning that the\n", "# implicit weights_only=False default triggers. map_location places the tensors on the\n", "# device used in this session, so the file also loads on a runtime without the GPU it was saved from.\n", "best_state_dict = torch.load(best_model_path, map_location=device, weights_only=True)\n", "model.load_state_dict(best_state_dict)\n", "print(\"Best model checkpoint loaded successfully.\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "r4i9NBhdmN4L", "outputId": "d5d59364-8729-45de-9b3c-580001fa8f12" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Best model checkpoint loaded successfully.\n" ] } ] }, { "cell_type": "markdown", "source": [ "Now using the trained model to evaluate on test data." 
], "metadata": { "id": "h2Xv7gREdcUa" } }, { "cell_type": "code", "source": [ "# prompt: Evaluate on the test data using the fine-tuned model to evaluate on test data.\n", "\n", "# Score the fine-tuned model on the held-out test set.\n", "model.eval()  # put the module in evaluation mode before scoring\n", "correct_predictions = 0\n", "total_predictions = 0\n", "with torch.no_grad():  # gradients are not needed for evaluation\n", "    for batch in test_loader:\n", "        input_ids = batch['input_ids'].to(device)\n", "        attention_mask = batch['attention_mask'].to(device)\n", "        labels = batch['labels'].to(device)\n", "        outputs = model(input_ids, attention_mask=attention_mask)\n", "        logits = outputs.logits\n", "        # Predicted class = index of the largest logit along dim 1\n", "        predicted_labels = torch.argmax(logits, dim=1)\n", "        correct_predictions += (predicted_labels == labels).sum().item()\n", "        total_predictions += labels.size(0)\n", "\n", "# Fraction of test examples whose predicted label matches the gold label\n", "accuracy = correct_predictions / total_predictions\n", "print(f\"Test Accuracy: {accuracy}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9PgsNqrodgUs", "outputId": "ba980766-2ac1-4f81-9f99-cd9ff3c4f0a7" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Test Accuracy: 0.8649789029535865\n" ] } ] }, { "cell_type": "markdown", "source": [ "The results show that the BERT model reached 86.5% accuracy on the test data. That number is significantly higher than the Naive Bayes baseline (78%)." ], "metadata": { "id": "g4_kBwHXpTl4" } } ] }