{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyMZvigX0h8yCq90ueauAQdK"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"DG5gGFyhgE_K"},"outputs":[],"source":["import numpy as np\n","import pandas as pd\n","from sklearn.datasets import load_breast_cancer, fetch_california_housing\n","from sklearn.feature_selection import SelectFromModel\n","from sklearn.linear_model import Lasso, LogisticRegression\n","from sklearn.model_selection import train_test_split\n","from sklearn.preprocessing import StandardScaler"]},{"cell_type":"code","source":["# Load the scikit-learn breast cancer dataset and wrap the feature matrix in a\n","# DataFrame so every column keeps its feature name.\n","breast_cancer = load_breast_cancer()\n","X = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)\n","y = breast_cancer.target\n","\n","# A fixed random_state keeps the train/test partition identical across runs.\n","X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n","\n","X.head()"],"metadata":{"id":"S9s_HLXVgTvu","executionInfo":{"status":"ok","timestamp":1711273065702,"user_tz":-420,"elapsed":15,"user":{"displayName":"Nhật Quang Đoàn","userId":"10175964550021301622"}},"colab":{"base_uri":"https://localhost:8080/","height":290},"outputId":"fc6a6447-fff1-4074-c403-b56dfc50f79e"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" mean radius mean texture mean perimeter mean area mean smoothness \\\n","0 17.99 10.38 122.80 1001.0 0.11840 \n","1 20.57 17.77 132.90 1326.0 0.08474 \n","2 19.69 21.25 130.00 1203.0 0.10960 \n","3 11.42 20.38 77.58 386.1 0.14250 \n","4 20.29 14.34 135.10 1297.0 0.10030 \n","\n"," mean compactness mean concavity mean concave points mean symmetry \\\n","0 0.27760 0.3001 0.14710 0.2419 \n","1 0.07864 0.0869 0.07017 0.1812 \n","2 0.15990 0.1974 0.12790 0.2069 \n","3 0.28390 0.2414 0.10520 0.2597 \n","4 0.13280 0.1980 0.10430 0.1809 \n","\n"," mean fractal dimension ... worst radius worst texture worst perimeter \\\n","0 0.07871 ... 
25.38 17.33 184.60 \n","1 0.05667 ... 24.99 23.41 158.80 \n","2 0.05999 ... 23.57 25.53 152.50 \n","3 0.09744 ... 14.91 26.50 98.87 \n","4 0.05883 ... 22.54 16.67 152.20 \n","\n"," worst area worst smoothness worst compactness worst concavity \\\n","0 2019.0 0.1622 0.6656 0.7119 \n","1 1956.0 0.1238 0.1866 0.2416 \n","2 1709.0 0.1444 0.4245 0.4504 \n","3 567.7 0.2098 0.8663 0.6869 \n","4 1575.0 0.1374 0.2050 0.4000 \n","\n"," worst concave points worst symmetry worst fractal dimension \n","0 0.2654 0.4601 0.11890 \n","1 0.1860 0.2750 0.08902 \n","2 0.2430 0.3613 0.08758 \n","3 0.2575 0.6638 0.17300 \n","4 0.1625 0.2364 0.07678 \n","\n","[5 rows x 30 columns]"],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst radiusworst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimension
017.9910.38122.801001.00.118400.277600.30010.147100.24190.07871...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
120.5717.77132.901326.00.084740.078640.08690.070170.18120.05667...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
219.6921.25130.001203.00.109600.159900.19740.127900.20690.05999...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
311.4220.3877.58386.10.142500.283900.24140.105200.25970.09744...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
420.2914.34135.101297.00.100300.132800.19800.104300.18090.05883...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n","

5 rows × 30 columns

\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"X"}},"metadata":{},"execution_count":3}]},{"cell_type":"code","source":["# Learn the per-feature mean and standard deviation from the training split only,\n","# so that no test-set statistics leak into the scaling step.\n","scaler = StandardScaler().fit(X_train)\n","scaler"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":74},"id":"TKmrxvrogXmH","executionInfo":{"status":"ok","timestamp":1710599949063,"user_tz":-420,"elapsed":397,"user":{"displayName":"Nhật Quang Đoàn","userId":"10175964550021301622"}},"outputId":"a163f11f-4b3d-4cac-a622-559b65b4f4f0"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["StandardScaler()"],"text/html":["
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"]},"metadata":{},"execution_count":18}]},{"cell_type":"code","source":["# Embedded feature selection: an L1-penalised logistic regression is fitted on the\n","# standardised training data and SelectFromModel keeps the features it gives weight to.\n","sel_ = SelectFromModel(\n","    estimator=LogisticRegression(\n","        penalty='l1',\n","        C=0.5,\n","        solver='liblinear',\n","        random_state=10,\n","    )\n",")\n","sel_.fit(scaler.transform(X_train), y_train)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":117},"id":"nb_jL139gZKE","executionInfo":{"status":"ok","timestamp":1710599962515,"user_tz":-420,"elapsed":328,"user":{"displayName":"Nhật Quang Đoàn","userId":"10175964550021301622"}},"outputId":"1bc8144a-bce1-49d2-f825-31daf37aed86"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["SelectFromModel(estimator=LogisticRegression(C=0.5, penalty='l1',\n"," random_state=10,\n"," solver='liblinear'))"],"text/html":["
SelectFromModel(estimator=LogisticRegression(C=0.5, penalty='l1',\n","                                             random_state=10,\n","                                             solver='liblinear'))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"]},"metadata":{},"execution_count":19}]},{"cell_type":"code","source":["# Boolean mask over the input features, in column order:\n","#   True  -> the feature is kept by the selector\n","#   False -> the feature is dropped (the L1 penalty shrank its coefficient to ~0)\n","sel_.get_support()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"B32FGTcggdDL","executionInfo":{"status":"ok","timestamp":1710600270396,"user_tz":-420,"elapsed":406,"user":{"displayName":"Nhật Quang Đoàn","userId":"10175964550021301622"}},"outputId":"ac504646-6016-4e53-b63f-e18d84b76fba"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([False, True, False, False, False, False, False, True, True,\n"," False, True, False, False, False, False, True, False, False,\n"," False, True, True, True, True, True, True, False, True,\n"," True, True, False])"]},"metadata":{},"execution_count":23}]},{"cell_type":"code","source":["# TODO: fit k-NN on the full feature set\n","# TODO: fit k-NN on the selected (True) features only\n","# TODO: compare performance before/after feature selection"],"metadata":{"id":"aWwbnFPNJIKn"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Pairwise correlation between FEATURES (columns of X).\n","# np.corrcoef(X) uses rowvar=True, i.e. it treats every row as a variable and\n","# returns a sample-by-sample matrix, which says nothing about redundant features.\n","X.corr()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"gclRh90euoG1","executionInfo":{"status":"ok","timestamp":1710601180513,"user_tz":-420,"elapsed":324,"user":{"displayName":"Nhật Quang Đoàn","userId":"10175964550021301622"}},"outputId":"5904cd20-7780-4665-a57b-fdc60dc10bf7"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Names of the features the selector discarded. Taking them from the selector's own\n","# mask keeps this list consistent with what sel_.transform() actually drops\n","# (SelectFromModel applies a small threshold rather than an exact == 0 test).\n","removed_feats = X_train.columns[~sel_.get_support()]\n","\n","# Apply the scaler fitted on the training split to both splits, then keep only\n","# the selected columns.\n","X_train_selected = sel_.transform(scaler.transform(X_train))\n","X_test_selected = sel_.transform(scaler.transform(X_test))"],"metadata":{"id":"kQXvJVKWuJ_D"},"execution_count":null,"outputs":[]}]}