{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[{"file_id":"1IdsdHZ9Q8pAhmPfX4hHed150IG4YtnaX","timestamp":1709829281835}]},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"widgets":{"application/vnd.jupyter.widget-state+json":{"28e11fb654d24ef68a72d29ab062b926":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_78f924df828046f5b093cf927260960a","IPY_MODEL_e402dfe3038444718c40ba487ee5a2fe","IPY_MODEL_7907716bab2e4ea3a86f0b80303c089a"],"layout":"IPY_MODEL_9921542e63864e0cab7b3a8780ec88a9"}},"78f924df828046f5b093cf927260960a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_45439de478d741698e095508be038474","placeholder":"​","style":"IPY_MODEL_59a7711f82804a78b11aefc765e81362","value":"100%"}},"e402dfe3038444718c40ba487ee5a2fe":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_c9aa7491673046b1a6cab9f4ef1e1594","max":10000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_edeeb0c041e74af48eceec05203cdc57","value":10000}},"7907716bab2e4ea3a86f0b80303c089a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_362065e4131142c4872b59dab7430775","placeholder":"​","style":"IPY_MODEL_b02a41ef6745420f989aba9c357a983f","value":" 10000/10000 [00:03<00:00, 4132.18it/s]"}},"9921542e63864e0cab7b3a8780ec88a9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"45439de478d741698e095508be038474":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"59a7711f82804a78b11aefc765e81362":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c9aa7491673046b1a6cab9f4ef1e1594":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"edeeb0c041e74af48eceec05203cdc57":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"362065e4131142c4872b59dab7430775":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b02a41ef6745420f989aba9c357a983f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"cells":[{"cell_type":"markdown","source":["**The code is inspired by Thomas Simonini's Q-Learning [Tutorial](https://huggingface.co/blog/deep-rl-q-part2)**"],"metadata":{"id":"WfElKFV4zYu_"}},{"cell_type":"markdown","source":["## Setup a Virtual Display\n","To generate a replay video of agent and environment."],"metadata":{"id":"1JZkkkRezF2h"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"4jLiaLlkqeK7"},"outputs":[],"source":["%%capture\n","!pip install pyglet==1.5.1\n","!apt install python-opengl\n","!apt install ffmpeg\n","!apt install xvfb\n","!pip3 install pyvirtualdisplay\n","\n","# Virtual display\n","from pyvirtualdisplay import Display\n","\n","virtual_display = Display(visible=0, size=(1400, 900))\n","virtual_display.start()"]},{"cell_type":"markdown","source":["## Install dependencies"],"metadata":{"id":"YA8eccSjzTkm"}},{"cell_type":"code","source":["%%capture\n","!pip install gym==0.24\n","!pip install pygame\n","!pip install numpy\n","\n","!pip install imageio imageio_ffmpeg"],"metadata":{"id":"qy__wzWWq1Ip"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Import the packages"],"metadata":{"id":"_PapFtBBzW3G"}},{"cell_type":"code","source":["import numpy as np\n","import gym\n","import random\n","import imageio\n","from tqdm.notebook import trange"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"PWkA0eXLq52V","executionInfo":{"status":"ok","timestamp":1666697846684,"user_tz":-300,"elapsed":1287,"user":{"displayName":"Abid Ali Awan","userId":"05541326649808384534"}},"outputId":"a1a9e7d0-ad83-4fcf-9b2f-f72d7a168ec0"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["Warning: Gym version v0.24.0 has a number of critical issues with `gym.make` such that the `reset` and `step` functions are called before returning the environment. It is recommend to downgrading to v0.23.1 or upgrading to v0.25.1\n"]}]},{"cell_type":"markdown","source":["## Frozen Lake"],"metadata":{"id":"868LWxur0OGI"}},{"cell_type":"code","source":["# Create the FrozenLake-v1 environment using 4x4 map and non-slippery version\n","env = gym.make(\"FrozenLake-v1\",map_name=\"4x4\",is_slippery=False)"],"metadata":{"id":"Ho0clcOHrBlm"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["### Understanding the FrozenLake environment"],"metadata":{"id":"jFwhmxg80WkH"}},{"cell_type":"code","source":["print(\"_____OBSERVATION SPACE_____ \\n\")\n","print(\"Observation Space\", env.observation_space)\n","print(\"Sample observation\", env.observation_space.sample()) # Get a random observation"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"rQtUaOEJrDVf","executionInfo":{"status":"ok","timestamp":1666697846685,"user_tz":-300,"elapsed":28,"user":{"displayName":"Abid Ali Awan","userId":"05541326649808384534"}},"outputId":"525f9f4e-ddb8-4e95-e059-2241fcee4c4a"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["_____OBSERVATION SPACE_____ \n","\n","Observation Space Discrete(16)\n","Sample observation 4\n"]}]},{"cell_type":"code","source":["print(\"\\n _____ACTION SPACE_____ \\n\")\n","print(\"Action Space Shape\", env.action_space.n)\n","print(\"Action Space Sample\", env.action_space.sample()) # Take a random action"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"7fE_JTjGrF-B","executionInfo":{"status":"ok","timestamp":1666697846686,"user_tz":-300,"elapsed":22,"user":{"displayName":"Abid Ali Awan","userId":"05541326649808384534"}},"outputId":"184e69e2-67e9-4ba5-b906-49b1bef3fd3c"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\n"," _____ACTION SPACE_____ \n","\n","Action Space Shape 4\n","Action Space Sample 1\n"]}]},{"cell_type":"markdown","source":["## Create and Initialize the Q-table"],"metadata":{"id":"badxmLJg0i6Z"}},{"cell_type":"code","source":["state_space = env.observation_space.n\n","print(\"There are \", state_space, \" possible states\")\n","\n","action_space = env.action_space.n\n","print(\"There are \", action_space, \" possible actions\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2m0z54AfrJVj","executionInfo":{"status":"ok","timestamp":1666697846687,"user_tz":-300,"elapsed":19,"user":{"displayName":"Abid Ali Awan","userId":"05541326649808384534"}},"outputId":"6a885a98-0498-4c45-8e85-957886ed312c"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["There are 16 possible states\n","There are 4 possible actions\n"]}]},{"cell_type":"code","source":["# Let's create our Qtable of size (state_space, action_space) and initialized each values at 0 using np.zeros\n","def initialize_q_table(state_space, action_space):\n"," Qtable = np.zeros((state_space, action_space))\n"," return Qtable"],"metadata":{"id":"rCddoOXM3UQH"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["Qtable_frozenlake = initialize_q_table(state_space, action_space)"],"metadata":{"id":"9YfvrqRt3jdR"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Define the epsilon-greedy policy"],"metadata":{"id":"lFeeieaA0mQ_"}},{"cell_type":"code","source":["def epsilon_greedy_policy(Qtable, state, epsilon):\n"," # Randomly generate a number between 0 and 1\n"," random_int = random.uniform(0,1)\n"," # if random_int > greater than epsilon --> exploitation\n"," if random_int > epsilon:\n"," # Take the action with the highest value given a state\n"," # np.argmax can be useful here\n"," action = np.argmax(Qtable[state])\n"," # else --> exploration\n"," else:\n"," action = env.action_space.sample()\n","\n"," return action"],"metadata":{"id":"Pe3nPqOQrYXi"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Define the greedy policy"],"metadata":{"id":"TMc7hclu0uI1"}},{"cell_type":"code","source":["def greedy_policy(Qtable, state):\n"," # Exploitation: take the action with the highest state, action value\n"," action = np.argmax(Qtable[state])\n","\n"," return action"],"metadata":{"id":"Xvh3Qj0rrbPu"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Define the hyperparameters"],"metadata":{"id":"AStP0Cwf0vjF"}},{"cell_type":"code","source":["# Training parameters\n","n_training_episodes = 10000 # Total training episodes\n","learning_rate = 0.7 # Learning rate\n","\n","# Evaluation parameters\n","n_eval_episodes = 100 # Total number of test episodes\n","\n","# Environment parameters\n","env_id = \"FrozenLake-v1\" # Name of the environment\n","max_steps = 99 # Max steps per episode\n","gamma = 0.95 # Discounting rate\n","eval_seed = [] # The evaluation seed of the environment\n","\n","# Exploration parameters\n","max_epsilon = 1.0 # Exploration probability at start\n","min_epsilon = 0.05 # Minimum exploration probability\n","decay_rate = 0.0005 # Exponential decay rate for exploration prob"],"metadata":{"id":"Y1tWn0tycWZ1"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Training the model"],"metadata":{"id":"bL4oWIJ800M6"}},{"cell_type":"code","source":["def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):\n"," for episode in trange(n_training_episodes):\n"," # Reduce epsilon (because we need less and less exploration)\n"," epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)\n"," # Reset the environment\n"," state = env.reset()\n"," step = 0\n"," done = False\n","\n"," # repeat\n"," for step in range(max_steps):\n"," # Choose the action At using epsilon greedy policy\n"," action = epsilon_greedy_policy(Qtable, state, epsilon)\n","\n"," # Take action At and observe Rt+1 and St+1\n"," # Take the action (a) and observe the outcome state(s') and reward (r)\n"," new_state, reward, done, info = env.step(action)\n","\n"," # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]\n"," Qtable[state][action] = Qtable[state][action] + learning_rate * (reward + gamma * np.max(Qtable[new_state]) - Qtable[state][action])\n","\n"," # If done, finish the episode\n"," if done:\n"," break\n","\n"," # Our state is the new state\n"," state = new_state\n"," return Qtable"],"metadata":{"id":"paOynXy3aoJW"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["Qtable_frozenlake = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable_frozenlake)"],"metadata":{"id":"DPBxfjJdTCOH","colab":{"base_uri":"https://localhost:8080/","height":49,"referenced_widgets":["28e11fb654d24ef68a72d29ab062b926","78f924df828046f5b093cf927260960a","e402dfe3038444718c40ba487ee5a2fe","7907716bab2e4ea3a86f0b80303c089a","9921542e63864e0cab7b3a8780ec88a9","45439de478d741698e095508be038474","59a7711f82804a78b11aefc765e81362","c9aa7491673046b1a6cab9f4ef1e1594","edeeb0c041e74af48eceec05203cdc57","362065e4131142c4872b59dab7430775","b02a41ef6745420f989aba9c357a983f"]},"executionInfo":{"status":"ok","timestamp":1666697849653,"user_tz":-300,"elapsed":2979,"user":{"displayName":"Abid Ali Awan","userId":"05541326649808384534"}},"outputId":"1211b0d1-e57b-478c-f9c2-a6d85d9ed290"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":[" 0%| | 0/10000 [00:00"]},"metadata":{},"execution_count":21}]}]}