{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nJfog0veD-Yb", "outputId": "676502df-6ae3-4289-d745-c92aa72bac06" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2025-12-27 03:38:08-- https://raw.githubusercontent.com/duyet/truyenkieu-word2vec/master/truyen_kieu_data.txt\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 140399 (137K) [text/plain]\n", "Saving to: ‘truyen_kieu_data.txt’\n", "\n", "truyen_kieu_data.tx 100%[===================>] 137.11K --.-KB/s in 0.03s \n", "\n", "2025-12-27 03:38:08 (5.28 MB/s) - ‘truyen_kieu_data.txt’ saved [140399/140399]\n", "\n" ] } ], "source": [ "!wget https://raw.githubusercontent.com/duyet/truyenkieu-word2vec/master/truyen_kieu_data.txt\n" ] }, { "cell_type": "code", "source": [ "import re" ], "metadata": { "id": "yQpB1vS6K2j4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "with open('truyen_kieu_data.txt', 'r', encoding='utf-8') as f:\n", " lines = [line.strip() for line in f]\n", " clean_lines = [re.sub(r\"^[\\d\\.,\\s]+|,$|[,.!?:'\\\"”]|\\n$\", \"\", line) for line in lines]\n" ], "metadata": { "id": "Xs-A0y_kK35Q" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def write_to_file(file_name, data):\n", " with open(file_name, 'w', encoding='utf-8') as f:\n", " i = 0\n", " for line in data:\n", " i += 1\n", " if i != len(data):\n", " f.write(line + \"\\n\")\n", " else:\n", " f.write(line)" ], "metadata": { "id": "RzC1MifVj4PX" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "write_to_file('truyen_kieu_data_clean.txt', clean_lines)" ], "metadata": { "id": "SzTDPiwEkNC3" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "test_size = 0.2\n", "split_idx = int(test_size * len(lines))\n", "test_data = clean_lines[:split_idx]\n", "train_data = clean_lines[split_idx:]" ], "metadata": { "id": "5Ac0s_qteSSc" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "write_to_file('truyen_kieu_train.txt', train_data)\n", "write_to_file('truyen_kieu_test.txt', test_data)" ], "metadata": { "id": "Z4DY1H4vfS7V" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from collections import defaultdict\n", "\n", "\n", "def train_bigram(train_file, model_file):\n", " \"\"\"Train bigram language model and save to model file\n", " \"\"\"\n", " counts = defaultdict(int) # count the n-gram\n", " context_counts = defaultdict(int) # count the context\n", " with open(train_file) as f:\n", " for line in f:\n", " line = line.strip()\n", " if line == '':\n", " continue\n", " words = line.split()\n", " words.append('')\n", " words.insert(0, '')\n", "\n", " for i in range(1, len(words)): # Note: starting at 1, after \n", " counts[words[i-1] + ' ' + words[i]] += 1 # Add bigram and bigram context\n", " context_counts[words[i-1]] += 1\n", " counts[words[i]] += 1 # Add unigram and unigram context\n", " context_counts[\"\"] += 1\n", "\n", " pass\n", "\n", " # Save probabilities to the model file\n", " with open(model_file, 'w') as fo:\n", " for ngram, count in counts.items():\n", " words = ngram.split(' ')\n", " words.pop()\n", " context = ' '.join(words)\n", " probability = counts[ngram]/context_counts[context]\n", " fo.write('%s\\t%f\\n' % (ngram, probability))\n", " pass" ], "metadata": { "id": "2BdI8YmhlRLC" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "train_bigram('truyen_kieu_train.txt', 'bigram_model.txt')" ], "metadata": { "id": "_KXI2vNLlaaL" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def load_bigram_model(model_file):\n", " \"\"\"Load the model file\n", "\n", " Args:\n", " model_file (str): Path to the model file\n", "\n", " Returns:\n", " probs (dict): Dictionary object that map from ngrams to their probabilities\n", " \"\"\"\n", " probs = {}\n", " with open(model_file, 'r') as f:\n", " for line in f:\n", " line = line.strip()\n", " if line == '':\n", " continue\n", " ngram, p = line.split('\\t')\n", " probs[ngram] = float(p)\n", " pass\n", " return probs" ], "metadata": { "id": "v9hkxeSRl_4S" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import random\n", "def random_choice(distribution):\n", " r = random.random() # số ngẫu nhiên trong [0, 1)\n", " cumulative = 0.0\n", " sentence = []\n", "\n", " for item, prob in distribution.items():\n", " if len(item) == 2 and item[0] == \"\":\n", " if r <= cumulative:\n", " cumulative += prob\n", " sentence.append(item)\n", " break\n", "\n" ], "metadata": { "id": "k5uqRr4dl3Y_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model = load_bigram_model(\"bigram_model.txt\")\n", "\n", "print(sampling_bigram_model(model, 10))\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1T0K_manmBiT", "outputId": "b0faa2cd-1177-4f4a-d931-13ffa1945909" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Đánh liều một lời\n" ] } ] } ] }