{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"gpuType":"T4","authorship_tag":"ABX9TyPuoZcvBsqDC3TI4gcx/OCP"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"fc888af936e44ca298403cfecf8de3d4":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_bd1669facac6454b965313d39d6d475a","IPY_MODEL_7418d9b3786c470198924664e8cf9c9f","IPY_MODEL_7cf1f8c40ec3455daabf97d3cd4bcb59"],"layout":"IPY_MODEL_0e98aa8a1efa4b27a0d67efa732a8360"}},"bd1669facac6454b965313d39d6d475a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_240fad6e4ee14d149ba16a18b431f108","placeholder":"","style":"IPY_MODEL_911e4bd3834b4e3eb768b0c92f66789d","value":"Evaluating: 
100%"}},"7418d9b3786c470198924664e8cf9c9f":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_13ee6bd48d04423299c78db79ef3b5fa","max":89,"min":0,"orientation":"horizontal","style":"IPY_MODEL_7a9e739583e54ea9be06cc459ca49f26","value":89}},"7cf1f8c40ec3455daabf97d3cd4bcb59":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_796eaf8e07d14f8597ef7c9cb9190422","placeholder":"","style":"IPY_MODEL_d84f28cc297c4e639574899d155cf0a5","value":" 89/89 [00:00<00:00, 
913.43it/s]"}},"0e98aa8a1efa4b27a0d67efa732a8360":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"240fad6e4ee14d149ba16a18b431f108":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"911e4bd3834b4e3eb768b0c92f66789d":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"13ee6bd48d04423299c78db79ef3b5fa":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7a9e739583e54ea9be06cc459ca49f26":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"796eaf8e07d14f8597ef7c9cb9190422":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d84f28cc297c4e639574899d155cf0a5":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"06a8857078214fdd9bd78e74a6f46a07":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_f6d02dc9e74541b4a439df43108b7dee","IPY_MODEL_61cddb5837dc433484855e3efe9b5483","IPY_MODEL_8d16f7837adb42bfb1103b10e018c71a"],"layout":"IPY_MODEL_4682950316044b3283a6d6d517506a0
b"}},"f6d02dc9e74541b4a439df43108b7dee":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_5da7498f5d7f4250a9dd789d661257d9","placeholder":"","style":"IPY_MODEL_a084e21833f94da3bc049da43adea989","value":"Epoch: 100%"}},"61cddb5837dc433484855e3efe9b5483":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_ad0c47916d734480b1d7d07ba84589ee","max":200,"min":0,"orientation":"horizontal","style":"IPY_MODEL_c53c7d7cc39542f1b1439e6d971e3276","value":200}},"8d16f7837adb42bfb1103b10e018c71a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_862eb90a08db452ca267e0fba658e0fe","placeholder":"","style":"IPY_MODEL_416caafbd3674e14869f326a5b3a3740","value":" 200/200 [01:35<00:00, 
2.13it/s]"}},"4682950316044b3283a6d6d517506a0b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5da7498f5d7f4250a9dd789d661257d9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a084e21833f94da3bc049da43adea989":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ad0c47916d734480b1d7d07ba84589ee":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c53c7d7cc39542f1b1439e6d971e3276":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"862eb90a08db452ca267e0fba658e0fe":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"416caafbd3674e14869f326a5b3a3740":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"cells":[{"cell_type":"markdown","source":["# Implementation of Logistic Regression in Pytorch\n","\n","In this notebook, we will implement Logistic Regression for Text Classification. 
We are going to use the PyTorch framework to do the job."],"metadata":{"id":"SoQL9w5BueDE"}},{"cell_type":"markdown","source":["## Download the data"],"metadata":{"id":"V_aNeh4Uu5b7"}},{"cell_type":"code","source":["%%capture\n","!rm -f titles-en-train.labeled\n","!rm -f titles-en-test.labeled\n","\n","!wget https://raw.githubusercontent.com/neubig/nlptutorial/master/data/titles-en-train.labeled\n","!wget https://raw.githubusercontent.com/neubig/nlptutorial/master/data/titles-en-test.labeled"],"metadata":{"id":"3QRHGXrivLzl","executionInfo":{"status":"ok","timestamp":1706932460346,"user_tz":-420,"elapsed":5,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":1,"outputs":[]},{"cell_type":"markdown","source":["Each sample is written on its own line, with the label and the text separated by a tab. There are two labels, {1, -1}, in the data.\n","\n","```\n","1\tFUJIWARA no Chikamori ( year of birth and death unknown ) was a samurai and poet who lived at the end of the Heian period .\n","-1\tYomi is the world of the dead .\n","```"],"metadata":{"id":"ccmaZsowvMKw"}},{"cell_type":"markdown","source":["### Load data\n","\n","We will load the data into a list of sentences paired with their labels."],"metadata":{"id":"cq9NZAvivRCy"}},{"cell_type":"code","source":["def load_data(file_path):\n"," data = []\n"," with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:\n"," for line in f:\n"," line = line.strip()\n"," if line == '':\n"," continue\n"," lb, text = line.split('\\t')\n"," data.append((text,int(lb)))\n","\n"," return data"],"metadata":{"id":"4hgyjPOovTN2","executionInfo":{"status":"ok","timestamp":1706932467292,"user_tz":-420,"elapsed":3,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":2,"outputs":[]},{"cell_type":"markdown","source":["Load the training and test data from the downloaded files."],"metadata":{"id":"qpdOjiuIvUb4"}},{"cell_type":"code","source":["train_data = load_data('./titles-en-train.labeled')\n","test_data = load_data('./titles-en-test.labeled')\n","\n","train_docs, 
train_labels = zip(*train_data)\n","test_docs, test_labels = zip(*test_data)"],"metadata":{"id":"OvXx9SPYvWDD","executionInfo":{"status":"ok","timestamp":1706932509038,"user_tz":-420,"elapsed":2,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":3,"outputs":[]},{"cell_type":"markdown","source":["## Data Processing\n","\n","We need to convert the textual data into tensors before feeding it into the training steps. We use bag-of-words (BoW) features (but note that, in deep learning, we often use different ways of representing inputs, such as word embeddings)."],"metadata":{"id":"yGEF8RMmvygQ"}},{"cell_type":"code","source":["from sklearn.feature_extraction.text import CountVectorizer\n","\n","vectorizer = CountVectorizer(\n"," max_features=10000\n"," )\n","vectorizer"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":75},"id":"O5Bz9vG-wqoi","executionInfo":{"status":"ok","timestamp":1706932512223,"user_tz":-420,"elapsed":820,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}},"outputId":"039bae64-8718-4908-898b-93a041d3f7e3"},"execution_count":4,"outputs":[{"output_type":"execute_result","data":{"text/plain":["CountVectorizer(max_features=10000)"],"text/html":["
CountVectorizer(max_features=10000)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
CountVectorizer(max_features=10000)