{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"gpuType":"T4","authorship_tag":"ABX9TyPuoZcvBsqDC3TI4gcx/OCP"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"fc888af936e44ca298403cfecf8de3d4":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_bd1669facac6454b965313d39d6d475a","IPY_MODEL_7418d9b3786c470198924664e8cf9c9f","IPY_MODEL_7cf1f8c40ec3455daabf97d3cd4bcb59"],"layout":"IPY_MODEL_0e98aa8a1efa4b27a0d67efa732a8360"}},"bd1669facac6454b965313d39d6d475a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_240fad6e4ee14d149ba16a18b431f108","placeholder":"​","style":"IPY_MODEL_911e4bd3834b4e3eb768b0c92f66789d","value":"Evaluating: 
100%"}},"7418d9b3786c470198924664e8cf9c9f":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_13ee6bd48d04423299c78db79ef3b5fa","max":89,"min":0,"orientation":"horizontal","style":"IPY_MODEL_7a9e739583e54ea9be06cc459ca49f26","value":89}},"7cf1f8c40ec3455daabf97d3cd4bcb59":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_796eaf8e07d14f8597ef7c9cb9190422","placeholder":"​","style":"IPY_MODEL_d84f28cc297c4e639574899d155cf0a5","value":" 89/89 [00:00<00:00, 
913.43it/s]"}},"0e98aa8a1efa4b27a0d67efa732a8360":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"240fad6e4ee14d149ba16a18b431f108":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"911e4bd3834b4e3eb768b0c92f66789d":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"13ee6bd48d04423299c78db79ef3b5fa":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7a9e739583e54ea9be06cc459ca49f26":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"796eaf8e07d14f8597ef7c9cb9190422":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d84f28cc297c4e639574899d155cf0a5":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"06a8857078214fdd9bd78e74a6f46a07":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_f6d02dc9e74541b4a439df43108b7dee","IPY_MODEL_61cddb5837dc433484855e3efe9b5483","IPY_MODEL_8d16f7837adb42bfb1103b10e018c71a"],"layout":"IPY_MODEL_4682950316044b3283a6d6d517506a0
b"}},"f6d02dc9e74541b4a439df43108b7dee":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_5da7498f5d7f4250a9dd789d661257d9","placeholder":"​","style":"IPY_MODEL_a084e21833f94da3bc049da43adea989","value":"Epoch: 100%"}},"61cddb5837dc433484855e3efe9b5483":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_ad0c47916d734480b1d7d07ba84589ee","max":200,"min":0,"orientation":"horizontal","style":"IPY_MODEL_c53c7d7cc39542f1b1439e6d971e3276","value":200}},"8d16f7837adb42bfb1103b10e018c71a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_862eb90a08db452ca267e0fba658e0fe","placeholder":"​","style":"IPY_MODEL_416caafbd3674e14869f326a5b3a3740","value":" 200/200 [01:35<00:00, 
2.13it/s]"}},"4682950316044b3283a6d6d517506a0b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5da7498f5d7f4250a9dd789d661257d9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a084e21833f94da3bc049da43adea989":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ad0c47916d734480b1d7d07ba84589ee":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c53c7d7cc39542f1b1439e6d971e3276":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"862eb90a08db452ca267e0fba658e0fe":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"416caafbd3674e14869f326a5b3a3740":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"cells":[{"cell_type":"markdown","source":["# Implementation of Logistic Regression in Pytorch\n","\n","In this notebook, we will implement Logistic Regression for Text Classification. 
We are going to use the Pytorch framework to do the job."],"metadata":{"id":"SoQL9w5BueDE"}},{"cell_type":"markdown","source":["## Download the data"],"metadata":{"id":"V_aNeh4Uu5b7"}},{"cell_type":"code","source":["%%capture\n","!rm -f titles-en-train.labeled\n","!rm -f titles-en-test.labeled\n","\n","!wget https://raw.githubusercontent.com/neubig/nlptutorial/master/data/titles-en-train.labeled\n","!wget https://raw.githubusercontent.com/neubig/nlptutorial/master/data/titles-en-test.labeled"],"metadata":{"id":"3QRHGXrivLzl","executionInfo":{"status":"ok","timestamp":1706932460346,"user_tz":-420,"elapsed":5,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":1,"outputs":[]},{"cell_type":"markdown","source":["Each sample is written on its own line, as a label and a sentence separated by a tab. There are two labels, {1, -1}, in the data.\n","\n","```\n","1\tFUJIWARA no Chikamori ( year of birth and death unknown ) was a samurai and poet who lived at the end of the Heian period .\n","-1\tYomi is the world of the dead .\n","```"],"metadata":{"id":"ccmaZsowvMKw"}},{"cell_type":"markdown","source":["### Load data\n","\n","We will load the data into a list of (sentence, label) pairs."],"metadata":{"id":"cq9NZAvivRCy"}},{"cell_type":"code","source":["def load_data(file_path):\n"," data = []\n"," with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:\n"," for line in f:\n"," line = line.strip()\n"," if line == '':\n"," continue\n"," lb, text = line.split('\\t')\n"," data.append((text,int(lb)))\n","\n"," return data"],"metadata":{"id":"4hgyjPOovTN2","executionInfo":{"status":"ok","timestamp":1706932467292,"user_tz":-420,"elapsed":3,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":2,"outputs":[]},{"cell_type":"markdown","source":["Loading data from files"],"metadata":{"id":"qpdOjiuIvUb4"}},{"cell_type":"code","source":["train_data = load_data('./titles-en-train.labeled')\n","test_data = load_data('./titles-en-test.labeled')\n","\n","train_docs, 
train_labels = zip(*train_data)\n","test_docs, test_labels = zip(*test_data)"],"metadata":{"id":"OvXx9SPYvWDD","executionInfo":{"status":"ok","timestamp":1706932509038,"user_tz":-420,"elapsed":2,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":3,"outputs":[]},{"cell_type":"markdown","source":["## Data Processing\n","\n","We need to convert the textual data into Tensors before feeding it into the training steps. We use BoW features (but note that, in deep learning, we often use different ways of representing inputs, such as word embeddings)."],"metadata":{"id":"yGEF8RMmvygQ"}},{"cell_type":"code","source":["from sklearn.feature_extraction.text import CountVectorizer\n","\n","vectorizer = CountVectorizer(\n"," max_features=10000\n"," )\n","vectorizer"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":75},"id":"O5Bz9vG-wqoi","executionInfo":{"status":"ok","timestamp":1706932512223,"user_tz":-420,"elapsed":820,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}},"outputId":"039bae64-8718-4908-898b-93a041d3f7e3"},"execution_count":4,"outputs":[{"output_type":"execute_result","data":{"text/plain":["CountVectorizer(max_features=10000)"],"text/html":["
CountVectorizer(max_features=10000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"]},"metadata":{},"execution_count":4}]},{"cell_type":"code","source":["X_train = vectorizer.fit_transform(train_docs)\n","X_train.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"SJKG0aikxHgV","executionInfo":{"status":"ok","timestamp":1706932518254,"user_tz":-420,"elapsed":404,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}},"outputId":"865800b6-8eec-44e0-88a6-9eaae4cf8f05"},"execution_count":5,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(11288, 10000)"]},"metadata":{},"execution_count":5}]},{"cell_type":"code","source":["X_test = vectorizer.transform(test_docs)"],"metadata":{"id":"iPv4CTAAxKgs","executionInfo":{"status":"ok","timestamp":1706932521091,"user_tz":-420,"elapsed":436,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":6,"outputs":[]},{"cell_type":"markdown","source":["We cannot use `X_train` and `X_test` for training with Pytorch. We need to convert them into dense matrices."],"metadata":{"id":"Tbfr-RtYxOiJ"}},{"cell_type":"code","source":["X_train = X_train.toarray()\n","X_test = X_test.toarray()"],"metadata":{"id":"8l_p-8wpxk2f","executionInfo":{"status":"ok","timestamp":1706932524436,"user_tz":-420,"elapsed":832,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":7,"outputs":[]},{"cell_type":"code","source":["X_train.shape"],"metadata":{"id":"YMiYkif7x48M","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1706932526125,"user_tz":-420,"elapsed":2,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}},"outputId":"c3acd8c0-7271-4f1a-963d-87207d42a96b"},"execution_count":8,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(11288, 10000)"]},"metadata":{},"execution_count":8}]},{"cell_type":"markdown","source":["Converting labels"],"metadata":{"id":"2OPnHGQ_x6rn"}},{"cell_type":"code","source":["train_labels = [0 if lb == -1 else lb for lb in 
train_labels]"],"metadata":{"id":"n_F683aEyBOv","executionInfo":{"status":"ok","timestamp":1706932528920,"user_tz":-420,"elapsed":326,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":9,"outputs":[]},{"cell_type":"code","source":["test_labels = [0 if lb == -1 else lb for lb in test_labels]"],"metadata":{"id":"8x3V852EyYRc","executionInfo":{"status":"ok","timestamp":1706932530060,"user_tz":-420,"elapsed":0,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":10,"outputs":[]},{"cell_type":"markdown","source":["### Converting data into Pytorch Tensors"],"metadata":{"id":"XsKj1GlAgkuc"}},{"cell_type":"code","source":["import torch\n","from torch.utils.data import TensorDataset, DataLoader\n","\n","device = 'cuda' if torch.cuda.is_available() else 'cpu'\n","\n","X_train_t = torch.from_numpy(X_train).to(torch.float32).to(device)\n","y_train_t = torch.tensor(train_labels, dtype=torch.float32).to(device)\n","\n","X_test_t = torch.from_numpy(X_test).to(torch.float32).to(device)\n","y_test_t = torch.tensor(test_labels, dtype=torch.float32).to(device)"],"metadata":{"id":"XKYobTY4gwbI","executionInfo":{"status":"ok","timestamp":1706932539452,"user_tz":-420,"elapsed":5397,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":11,"outputs":[]},{"cell_type":"code","source":["print(\"X_train_t.size()=\", X_train_t.size())\n","print(\"y_train_t.size()=\", y_train_t.size())"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"XLcu13F1hqtu","executionInfo":{"status":"ok","timestamp":1706932543051,"user_tz":-420,"elapsed":303,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}},"outputId":"73c2667e-28a6-4031-ce6c-e965465ce655"},"execution_count":12,"outputs":[{"output_type":"stream","name":"stdout","text":["X_train_t.size()= torch.Size([11288, 10000])\n","y_train_t.size()= torch.Size([11288])\n"]}]},{"cell_type":"markdown","source":["### Creating 
datasets"],"metadata":{"id":"K6tLlTtSmzKq"}},{"cell_type":"code","source":["train_dataset = TensorDataset(X_train_t, y_train_t)\n","val_dataset = TensorDataset(X_test_t, y_test_t)"],"metadata":{"id":"mKAxx3asm1dA","executionInfo":{"status":"ok","timestamp":1706932554761,"user_tz":-420,"elapsed":304,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":13,"outputs":[]},{"cell_type":"markdown","source":["## Logistic Regression Model"],"metadata":{"id":"EIgDUFBbvXfX"}},{"cell_type":"code","source":["import torch\n","import torch.nn as nn\n","import torch.nn.functional as F\n","\n","class LogisticRegression(nn.Module):\n"," def __init__(self, input_dim, output_dim):\n"," super().__init__()\n"," self.linear = nn.Linear(input_dim, output_dim)\n","\n"," def forward(self, x):\n"," # Binary case: sigmoid yields P(y=1); a multinomial model would use softmax\n"," return torch.sigmoid(self.linear(x))"],"metadata":{"id":"DkjqoK4ZvjwR","executionInfo":{"status":"ok","timestamp":1706932560138,"user_tz":-420,"elapsed":383,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":14,"outputs":[]},{"cell_type":"markdown","source":["## Training Logistic Regression\n"],"metadata":{"id":"YOV8qZP_ydw6"}},{"cell_type":"markdown","source":["### Building the model"],"metadata":{"id":"ZUzaTSg2jOt-"}},{"cell_type":"code","source":["import time\n","\n","input_dim = 10000\n","output_dim = 1\n","epochs = 200 # number of training epochs\n","learning_rate = 1e-3 # learning rate\n","batch_size = 32 # batch size for training\n","\n","model = LogisticRegression(input_dim, output_dim)\n","model = model.to(device)\n","\n","criterion = torch.nn.BCELoss()\n","optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)"],"metadata":{"id":"rT6UWPsbytOV","executionInfo":{"status":"ok","timestamp":1706932581583,"user_tz":-420,"elapsed":1646,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}}},"execution_count":15,"outputs":[]},{"cell_type":"markdown","source":["### Training 
Loop"],"metadata":{"id":"8F0Io6PCjRXt"}},{"cell_type":"code","source":["from tqdm.notebook import trange, tqdm\n","from torch.utils.data import RandomSampler, SequentialSampler\n","from sklearn import metrics\n","\n","def train():\n"," train_sampler = RandomSampler(train_dataset)\n"," train_dataloader = DataLoader(\n"," train_dataset,\n"," sampler=train_sampler,\n"," batch_size=batch_size,\n"," )\n"," model.train()\n"," total_acc, total_count = 0, 0\n","\n"," train_iterator = trange(int(epochs), desc=\"Epoch\")\n","\n"," for _ in train_iterator:\n"," for batch in train_dataloader:\n"," optimizer.zero_grad()\n"," pred = model(batch[0]).squeeze(1) # Compute the model output\n"," loss = criterion(pred, batch[1]) # Compute the loss\n"," loss.backward() # Compute gradients\n"," optimizer.step() # Update weights\n","\n","def evaluate():\n"," model.eval()\n"," test_sampler = SequentialSampler(val_dataset)\n"," test_dataloader = DataLoader(\n"," val_dataset,\n"," sampler=test_sampler,\n"," batch_size=batch_size,\n"," )\n","\n"," preds = []\n"," true_labels = []\n"," with torch.no_grad():\n"," for batch in tqdm(test_dataloader, desc=\"Evaluating\"):\n"," logits = model(batch[0])\n"," _preds = (logits>0.5).type(torch.long).squeeze(1)\n"," preds += _preds.detach().cpu().numpy().tolist()\n"," true_labels += batch[1].detach().cpu().numpy().tolist()\n","\n"," print(metrics.classification_report(true_labels, preds))"],"metadata":{"id":"c8tzpt180o0r","executionInfo":{"status":"ok","timestamp":1706932616131,"user_tz":-420,"elapsed":317,"user":{"displayName":"Minh 
Pham","userId":"01293297774691882951"}}},"execution_count":16,"outputs":[]},{"cell_type":"code","source":["train()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":49,"referenced_widgets":["06a8857078214fdd9bd78e74a6f46a07","f6d02dc9e74541b4a439df43108b7dee","61cddb5837dc433484855e3efe9b5483","8d16f7837adb42bfb1103b10e018c71a","4682950316044b3283a6d6d517506a0b","5da7498f5d7f4250a9dd789d661257d9","a084e21833f94da3bc049da43adea989","ad0c47916d734480b1d7d07ba84589ee","c53c7d7cc39542f1b1439e6d971e3276","862eb90a08db452ca267e0fba658e0fe","416caafbd3674e14869f326a5b3a3740"]},"id":"xo29OxA6S56K","executionInfo":{"status":"ok","timestamp":1706932715044,"user_tz":-420,"elapsed":95791,"user":{"displayName":"Minh Pham","userId":"01293297774691882951"}},"outputId":"ec31c8fb-f7bd-4139-91e7-09a00c6d0db7"},"execution_count":17,"outputs":[{"output_type":"display_data","data":{"text/plain":["Epoch: 0%| | 0/200 [00:00