{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "nJfog0veD-Yb",
        "outputId": "676502df-6ae3-4289-d745-c92aa72bac06"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--2025-12-27 03:38:08--  https://raw.githubusercontent.com/duyet/truyenkieu-word2vec/master/truyen_kieu_data.txt\n",
            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 140399 (137K) [text/plain]\n",
            "Saving to: ‘truyen_kieu_data.txt’\n",
            "\n",
            "truyen_kieu_data.tx 100%[===================>] 137.11K  --.-KB/s    in 0.03s   \n",
            "\n",
            "2025-12-27 03:38:08 (5.28 MB/s) - ‘truyen_kieu_data.txt’ saved [140399/140399]\n",
            "\n"
          ]
        }
      ],
      "source": [
        "!wget https://raw.githubusercontent.com/duyet/truyenkieu-word2vec/master/truyen_kieu_data.txt\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import re"
      ],
      "metadata": {
        "id": "yQpB1vS6K2j4"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "with open('truyen_kieu_data.txt', 'r', encoding='utf-8') as f:\n",
        "  lines = [line.strip() for line in f]\n",
        "  clean_lines = [re.sub(r\"^[\\d\\.,\\s]+|,$|[,.!?:'\\\"”]|\\n$\", \"\", line) for line in lines]\n"
      ],
      "metadata": {
        "id": "Xs-A0y_kK35Q"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def write_to_file(file_name, data):\n",
        "  with open(file_name, 'w', encoding='utf-8') as f:\n",
        "    i = 0\n",
        "    for line in data:\n",
        "      i += 1\n",
        "      if i != len(data):\n",
        "        f.write(line + \"\\n\")\n",
        "      else:\n",
        "        f.write(line)"
      ],
      "metadata": {
        "id": "RzC1MifVj4PX"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "write_to_file('truyen_kieu_data_clean.txt', clean_lines)"
      ],
      "metadata": {
        "id": "SzTDPiwEkNC3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "test_size = 0.2\n",
        "split_idx = int(test_size * len(lines))\n",
        "test_data = clean_lines[:split_idx]\n",
        "train_data = clean_lines[split_idx:]"
      ],
      "metadata": {
        "id": "5Ac0s_qteSSc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "write_to_file('truyen_kieu_train.txt', train_data)\n",
        "write_to_file('truyen_kieu_test.txt', test_data)"
      ],
      "metadata": {
        "id": "Z4DY1H4vfS7V"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from collections import defaultdict\n",
        "\n",
        "\n",
        "def train_bigram(train_file, model_file):\n",
        "    \"\"\"Train bigram language model and save to model file\n",
        "    \"\"\"\n",
        "    counts = defaultdict(int)  # count the n-gram\n",
        "    context_counts = defaultdict(int)   # count the context\n",
        "    with open(train_file) as f:\n",
        "        for line in f:\n",
        "            line = line.strip()\n",
        "            if line == '':\n",
        "                continue\n",
        "            words = line.split()\n",
        "            words.append('</s>')\n",
        "            words.insert(0, '<s>')\n",
        "\n",
        "            for i in range(1, len(words)):  # Note: starting at 1, after <s>\n",
        "                counts[words[i-1] + ' ' +  words[i]] += 1 # Add bigram and bigram context\n",
        "                context_counts[words[i-1]] += 1\n",
        "                counts[words[i]] += 1    # Add unigram and unigram context\n",
        "                context_counts[\"\"] += 1\n",
        "\n",
        "                pass\n",
        "\n",
        "    # Save probabilities to the model file\n",
        "    with open(model_file, 'w') as fo:\n",
        "        for ngram, count in counts.items():\n",
        "            words = ngram.split(' ')\n",
        "            words.pop()\n",
        "            context = ' '.join(words)\n",
        "            probability = counts[ngram]/context_counts[context]\n",
        "            fo.write('%s\\t%f\\n' % (ngram, probability))\n",
        "            pass"
      ],
      "metadata": {
        "id": "2BdI8YmhlRLC"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "train_bigram('truyen_kieu_train.txt', 'bigram_model.txt')"
      ],
      "metadata": {
        "id": "_KXI2vNLlaaL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def load_bigram_model(model_file):\n",
        "    \"\"\"Load the model file\n",
        "\n",
        "    Args:\n",
        "        model_file (str): Path to the model file\n",
        "\n",
        "    Returns:\n",
        "        probs (dict): Dictionary object that map from ngrams to their probabilities\n",
        "    \"\"\"\n",
        "    probs = {}\n",
        "    with open(model_file, 'r') as f:\n",
        "        for line in f:\n",
        "            line = line.strip()\n",
        "            if line == '':\n",
        "                continue\n",
        "            ngram, p = line.split('\\t')\n",
        "            probs[ngram] = float(p)\n",
        "            pass\n",
        "    return probs"
      ],
      "metadata": {
        "id": "v9hkxeSRl_4S"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import random\n",
        "def random_choice(distribution):\n",
        "    r = random.random()      # số ngẫu nhiên trong [0, 1)\n",
        "    cumulative = 0.0\n",
        "    sentence = []\n",
        "\n",
        "    for item, prob in distribution.items():\n",
        "      if len(item) == 2 and item[0] == \"<s>\":\n",
        "        if r <= cumulative:\n",
        "          cumulative += prob\n",
        "          sentence.append(item)\n",
        "          break\n",
        "\n"
      ],
      "metadata": {
        "id": "k5uqRr4dl3Y_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "model = load_bigram_model(\"bigram_model.txt\")\n",
        "\n",
        "print(sampling_bigram_model(model, 10))\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1T0K_manmBiT",
        "outputId": "b0faa2cd-1177-4f4a-d931-13ffa1945909"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Đánh liều một lời\n"
          ]
        }
      ]
    }
  ]
}