{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "private_outputs": true, "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "gpuClass": "standard" }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "hgxOJ0dfx9KR" }, "outputs": [], "source": [ "import pandas as pd\n", "from google.colab import drive\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "source": [ "df = pd.read_csv(\"/content/drive/MyDrive/train_data_csfd.csv\")\n", "df.head()" ], "metadata": { "id": "Q9XWgGBmzuNS" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "reviews.head()" ], "metadata": { "id": "h6JpULOD6u-e" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import nltk\n", "nltk.download('all')\n" ], "metadata": { "id": "2y9NuKBX9xdu" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "\n", "!pip install torch\n", "!pip install nltk\n", "!pip install transformers\n", "! pip install seaborn\n", "import pandas as pd\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "import numpy as np\n", "import nltk\n", "import torch\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "\n", "\n", "\n", "nltk.download('stopwords')\n", "\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "import string\n", "import torch\n", "import re\n", "import seaborn as sns\n", "\n", "from transformers import BertTokenizer, BertModel\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import matplotlib.pyplot as plt\n", "\n", "# Load text and label data\n", "reviews = pd.read_csv(\"/content/drive/MyDrive/train_data_csfd.csv\")\n", "\n", "# Define text and label variables\n", "text = reviews[\"text\"]\n", "label = reviews[\"label\"]\n", "\n", "# Create DataFrame from provided text and label data\n", "data = {\n", " \"review\": text,\n", " \"label\": label\n", "}\n", "df = pd.DataFrame(data)\n", "\n", "# Define list of stopwords\n", "import nltk\n", "nltk.download('stopwords')\n", "nltk.download('punkt')\n", "\n", "from nltk.corpus import stopwords\n", "stopwords = set(stopwords.words('english'))\n", "stopwords.update([\"br\",\"a\",\t\n", "\"v\",\t\n", "\"se\",\t\n", "\"na\",\t\n", "\"je\",\t\n", "\"že\",\t\n", "\"o\",\n", "\"s\",\n", "\"z\",\n", "\"do\",\t\n", "\"i\",\t\n", "\"to\",\t\n", "\"k\",\t\n", "\"ve\",\t\n", "\"pro\",\t\n", "\"za\",\t\n", "\"by\",\t\n", "\"ale\",\t\n", "\"si\",\t\n", "\"po\",\t\n", "\"jako\",\t\n", "\"podle\",\t\n", "\"od\",\t\n", "\"jsem\",\t\n", "\"tak\",\t\n", "\"jsou\",\t\n", "\"které\",\n", "\"který\",\t\n", "\"jeho\",\t\n", "\"však\",\t\n", "\"bude\",\t\n", "\"nebo\",\t\n", "\"už\",\t\n", "\"jen\",\t\n", "\"byl\",\t\n", "\"jak\",\t\n", "\"u\",\t\n", "\"co\",\t\n", "\"při\",\t\n", "\"až\",\t\n", "\"aby\",\t\n", "\"má\",\t\n", "\"když\",\t\n", "\"než\",\t\n", "\"ze\",\t\n", "\"která\",\t\n", "\"před\",\t\n", "\"být\",\t\n", "\"také\",\t\n", "\"bylo\",\t\n", "\"jsme\",\t\n", "\"není\",\t\n", "\"jejich\",\t\n", "\"ještě\",\t\n", "\"ani\",\t\n", "\"mezi\",\t\n", "\"byla\",\t\n", "\"své\",\n", "\"roku\",\t\n", "\"již\",\t\n", "\"pak\",\t\n", "\"první\",\t\n", "\"roce\",\t\n", "\"kteří\",\t\n", "\"další\",\t\n", "\"proti\",\t\n", "\"let\",\t\n", "\"tím\",\t\n", "\"může\",\t\n", "\"korun\",\n", "\"řekl\",\n", "\"tom\",\n", "\"kde\",\n", "\"či\",\n", "\"tedy\",\n", "\"pouze\"])\n", "\n", "\n", "\n", "\n", "\n", "# Function to clean text\n", "def clean_text(text):\n", " # Remove non-letters and digits\n", " text = re.sub(\"[^a-zA-Z]\", \" \", text)\n", " # Tokenize text\n", " tokens = nltk.word_tokenize(text)\n", " # Remove stopwords\n", " tokens = [token for token in tokens if token not in stopwords]\n", " # Lowercase text\n", " tokens = [token.lower() for token in tokens]\n", " # Join tokens back into text\n", " text = \" \".join(tokens)\n", " return text\n", "\n", "# Apply clean_text function to reviews DataFrame\n", "df[\"clean_review\"] = df[\"review\"].apply(lambda x: clean_text(x))\n", "\n", "# Tokenize text\n", "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\")\n", "\n", "def tokenize_text(text):\n", " return tokenizer.encode_plus(text, add_special_tokens=True, return_tensors=\"pt\")\n", "\n", "# Tokenize clean reviews\n", "tokenized_reviews = df[\"clean_review\"].apply(tokenize_text)\n", "\n", "# Load pre-trained BERT model\n", "model = BertModel.from_pretrained(\"bert-base-multilingual-cased\")\n", "# Get the embeddings weights from the pre-trained BERT model\n", "embedding_weight = model.embeddings.word_embeddings.weight\n", "\n", "\n", "# Apply BERT model to tokenized reviews\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model.to(device)\n", "\n", "reviews_encoded = []\n", "for review in tokenized_reviews:\n", " with torch.no_grad():\n", " input_ids = review['input_ids'].to(device)\n", " attention_mask = review['attention_mask'].to(device)\n", " review_encoded = model(input_ids=input_ids, attention_mask=attention_mask)\n", " reviews_encoded.append(review_encoded)\n", "# define dimensions of the 3D array\n", "num_reviews = 10\n", "num_sentences = 5\n", "embedding_size = 50\n", "\n", "# create a 3D array of random values\n", "features = np.random.rand(num_reviews, num_sentences, embedding_size)\n", "\n", "# reshape features to a 2D array with dimensions (num_reviews * num_sentences, embedding_size)\n", "features = features.reshape(num_reviews * num_sentences, embedding_size)\n", "\n", "\n", "# Compute cosine similarity matrix between all reviews\n", "similarity_matrix = cosine_similarity(features)\n", "\n", "# Visualize similarity matrix as heatmap\n", "sns.heatmap(similarity_matrix, annot=True, cmap='Blues')\n", "plt.show()\n", "\n", "\n", " \n", "\n", "\n" ], "metadata": { "id": "pvy6YbxSNgwl" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from transformers import BertTokenizer, BertForSequenceClassification\n", "model_name = \"bert-base-multilingual-cased\"\n", "tokenizer = BertTokenizer.from_pretrained(model_name)\n", "model = BertForSequenceClassification.from_pretrained(model_name)\n" ], "metadata": { "id": "2e0Kt-j8GNB8" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def clean_text(text):\n", " # Remove non-letters and digits\n", " text = re.sub(\"[^a-zA-Z0-9]\", \" \", text)\n", " # Tokenize text\n", " tokens = nltk.word_tokenize(text)\n", " # Remove stopwords\n", " tokens = [token for token in tokens if token not in stopwords]\n", " # Lowercase text\n", " tokens = [token.lower() for token in tokens]\n", " # Join tokens back into text\n", " text = \" \".join(tokens)\n", " return text\n", "\n", "new_review = \"Tento film je úžasný!\"\n", "cleaned_review = clean_text(new_review)\n", "tokens = tokenizer.encode_plus(cleaned_review, add_special_tokens=True, return_tensors=\"pt\")\n" ], "metadata": { "id": "oS4MvOt3GQW_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model.eval()\n", "with torch.no_grad():\n", " outputs = model(tokens['input_ids'], tokens['attention_mask'])\n", " logits = outputs[0]\n", " predictions = torch.argmax(logits, dim=1).flatten()\n", "sentiment = \"positive\" if predictions == 1 else \"negative\"\n", "print(sentiment)\n" ], "metadata": { "id": "9hARNr9AGVil" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import nltk\n", "nltk.download('stopwords')\n", "from nltk.corpus import stopwords\n", "stopwords = stopwords.words('english')\n", "\n", "def clean_text(text):\n", " # Remove non-letters and digits\n", " text = re.sub(\"[^a-zA-Z0-9]\", \" \", text)\n", " # Tokenize text\n", " tokens = nltk.word_tokenize(text)\n", " # Remove stopwords\n", " tokens = [token for token in tokens if token.lower() not in stopwords]\n", " # Lowercase text\n", " tokens = [token.lower() for token in tokens]\n", " # Join tokens back into text\n", " text = \" \".join(tokens)\n", " return text\n", "\n", "\n", "new_review = \"tento film nesplnil má očekávání\"\n", "cleaned_review = clean_text(new_review)\n", "tokens = tokenizer.encode_plus(cleaned_review, add_special_tokens=True, return_tensors=\"pt\")\n", "model.eval()\n", "with torch.no_grad():\n", " outputs = model(tokens['input_ids'], tokens['attention_mask'])\n", " logits = outputs[0]\n", " predictions = torch.argmax(logits, dim=1).flatten()\n", " confidence = torch.softmax(logits, dim=1)[0][predictions].item()\n", " sentiment = \"positive\" if predictions == 1 else \"negative\"\n", "print(f\"The sentiment of the review is {sentiment} with a confidence score of {confidence:.2f}\")\n" ], "metadata": { "id": "aQXwwHU6dm3k" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from transformers import BertTokenizer, BertForSequenceClassification\n", "import torch\n", "import pandas as pd\n", "import re\n", "import nltk\n", "nltk.download('stopwords')\n", "from nltk.corpus import stopwords\n", "\n", "model_name = \"bert-base-multilingual-cased\"\n", "tokenizer = BertTokenizer.from_pretrained(model_name)\n", "model = BertForSequenceClassification.from_pretrained(model_name)\n", "\n", "def clean_text(text):\n", " # Remove non-letters and digits\n", " text = re.sub(\"[^a-zA-Z0-9]\", \" \", text)\n", " # Tokenize text\n", " tokens = nltk.word_tokenize(text)\n", " # Remove stopwords\n", " tokens = [token for token in tokens if token not in stopwords.words('english')]\n", " # Lowercase text\n", " tokens = [token.lower() for token in tokens]\n", " # Join tokens back into text\n", " text = \" \".join(tokens)\n", " return text\n", "\n", "# Load a sample dataset of movie reviews with known sentiments\n", "reviews = pd.read_csv(\"/content/drive/MyDrive/train_data_csfd.csv\")\n", "sample = reviews.sample(n=500) # Choose a random sample of 100 reviews\n", "\n", "# Evaluate the model on the sample dataset\n", "correct = 0\n", "total = 0\n", "for _, row in sample.iterrows():\n", " text = row[\"text\"]\n", " label = row[\"label\"]\n", " cleaned_review = clean_text(text)\n", " tokens = tokenizer.encode_plus(cleaned_review, add_special_tokens=True, return_tensors=\"pt\")\n", " with torch.no_grad():\n", " outputs = model(tokens['input_ids'], tokens['attention_mask'])\n", " logits = outputs[0]\n", " predicted_sentiment = torch.argmax(logits, dim=1).flatten().item()\n", " if predicted_sentiment == 1 and label == \"positive\":\n", " correct += 1\n", " elif predicted_sentiment == 0 and label == \"negative\":\n", " correct += 1\n", " total += 1\n", "\n", "accuracy = correct / total\n", "print(\"Accuracy on the sample dataset: {:.2f}%\".format(accuracy * 500))\n" ], "metadata": { "id": "xx66HF2jLaYj" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "BDUiisuAGbgi" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# define dimensions of the 3D array\n", "num_reviews = 10\n", "num_sentences = 5\n", "embedding_size = 50\n", "\n", "# create a 3D array of random values\n", "features = np.random.rand(num_reviews, num_sentences, embedding_size)\n", "\n", "# reshape features to a 2D array with dimensions (num_reviews * num_sentences, embedding_size)\n", "features = features.reshape(num_reviews * num_sentences, embedding_size)\n", "\n", "# Compute cosine similarity matrix between all reviews\n", "similarity_matrix = cosine_similarity(features)\n", "\n", "# Use agglomerative clustering to create a dendrogram\n", "from scipy.cluster.hierarchy import linkage, dendrogram\n", "\n", "Z = linkage(similarity_matrix, 'ward')\n", "\n", "plt.figure(figsize=(10, 5))\n", "dendrogram(Z)\n", "plt.show()\n", "\n", "\n", "\n" ], "metadata": { "id": "PoJ6lBrA_p4F" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!pip install scikit-learn\n", "from sklearn.decomposition import PCA\n", "# Define dimensions of the 3D array\n", "num_reviews = 10\n", "num_sentences = 5\n", "embedding_size = 50\n", "\n", "# Create a 3D array of random values\n", "features = np.random.rand(num_reviews, num_sentences, embedding_size)\n", "\n", "# Reshape features to a 2D array with dimensions (num_reviews * num_sentences, embedding_size)\n", "features = features.reshape(num_reviews * num_sentences, embedding_size)\n", "\n", "# Compute cosine similarity matrix between all reviews\n", "similarity_matrix = cosine_similarity(features)\n", "\n", "# Perform PCA to reduce dimensionality to 2\n", "pca = PCA(n_components=2)\n", "features_2d = pca.fit_transform(features)\n", "\n", "# Visualize scatter plot\n", "plt.scatter(features_2d[:,0], features_2d[:,1])\n", "plt.show()" ], "metadata": { "id": "B1BOXjn0B1TS" }, "execution_count": null, "outputs": [] } ] }