{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "private_outputs": true,
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "gpuClass": "standard"
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hgxOJ0dfx9KR"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "df = pd.read_csv(\"/content/drive/MyDrive/train_data_csfd.csv\")\n",
        "df.head()"
      ],
      "metadata": {
        "id": "Q9XWgGBmzuNS"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "reviews.head()"
      ],
      "metadata": {
        "id": "h6JpULOD6u-e"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import nltk\n",
        "nltk.download('all')\n"
      ],
      "metadata": {
        "id": "2y9NuKBX9xdu"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "!pip install torch\n",
        "!pip install nltk\n",
        "!pip install transformers\n",
        "! pip install seaborn\n",
        "import pandas as pd\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')\n",
        "import numpy as np\n",
        "import nltk\n",
        "import torch\n",
        "from sklearn.metrics.pairwise import cosine_similarity\n",
        "\n",
        "\n",
        "\n",
        "\n",
        "nltk.download('stopwords')\n",
        "\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.tokenize import word_tokenize\n",
        "import string\n",
        "import torch\n",
        "import re\n",
        "import seaborn as sns\n",
        "\n",
        "from transformers import BertTokenizer, BertModel\n",
        "from sklearn.metrics.pairwise import cosine_similarity\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# Load text and label data\n",
        "reviews = pd.read_csv(\"/content/drive/MyDrive/train_data_csfd.csv\")\n",
        "\n",
        "# Define text and label variables\n",
        "text = reviews[\"text\"]\n",
        "label = reviews[\"label\"]\n",
        "\n",
        "# Create DataFrame from provided text and label data\n",
        "data = {\n",
        "    \"review\": text,\n",
        "    \"label\": label\n",
        "}\n",
        "df = pd.DataFrame(data)\n",
        "\n",
        "# Define list of stopwords\n",
        "import nltk\n",
        "nltk.download('stopwords')\n",
        "nltk.download('punkt')\n",
        "\n",
        "from nltk.corpus import stopwords\n",
        "stopwords = set(stopwords.words('english'))\n",
        "stopwords.update([\"br\",\"a\",\t\n",
        "\"v\",\t\n",
        "\"se\",\t\n",
        "\"na\",\t\n",
        "\"je\",\t\n",
        "\"že\",\t\n",
        "\"o\",\n",
        "\"s\",\n",
        "\"z\",\n",
        "\"do\",\t\n",
        "\"i\",\t\n",
        "\"to\",\t\n",
        "\"k\",\t\n",
        "\"ve\",\t\n",
        "\"pro\",\t\n",
        "\"za\",\t\n",
        "\"by\",\t\n",
        "\"ale\",\t\n",
        "\"si\",\t\n",
        "\"po\",\t\n",
        "\"jako\",\t\n",
        "\"podle\",\t\n",
        "\"od\",\t\n",
        "\"jsem\",\t\n",
        "\"tak\",\t\n",
        "\"jsou\",\t\n",
        "\"které\",\n",
        "\"který\",\t\n",
        "\"jeho\",\t\n",
        "\"však\",\t\n",
        "\"bude\",\t\n",
        "\"nebo\",\t\n",
        "\"už\",\t\n",
        "\"jen\",\t\n",
        "\"byl\",\t\n",
        "\"jak\",\t\n",
        "\"u\",\t\n",
        "\"co\",\t\n",
        "\"při\",\t\n",
        "\"až\",\t\n",
        "\"aby\",\t\n",
        "\"má\",\t\n",
        "\"když\",\t\n",
        "\"než\",\t\n",
        "\"ze\",\t\n",
        "\"která\",\t\n",
        "\"před\",\t\n",
        "\"být\",\t\n",
        "\"také\",\t\n",
        "\"bylo\",\t\n",
        "\"jsme\",\t\n",
        "\"není\",\t\n",
        "\"jejich\",\t\n",
        "\"ještě\",\t\n",
        "\"ani\",\t\n",
        "\"mezi\",\t\n",
        "\"byla\",\t\n",
        "\"své\",\n",
        "\"roku\",\t\n",
        "\"již\",\t\n",
        "\"pak\",\t\n",
        "\"první\",\t\n",
        "\"roce\",\t\n",
        "\"kteří\",\t\n",
        "\"další\",\t\n",
        "\"proti\",\t\n",
        "\"let\",\t\n",
        "\"tím\",\t\n",
        "\"může\",\t\n",
        "\"korun\",\n",
        "\"řekl\",\n",
        "\"tom\",\n",
        "\"kde\",\n",
        "\"či\",\n",
        "\"tedy\",\n",
        "\"pouze\"])\n",
        "\n",
        "\n",
        "\n",
        "\n",
        "\n",
        "# Function to clean text\n",
        "def clean_text(text):\n",
        "    # Remove non-letters and digits\n",
        "    text = re.sub(\"[^a-zA-Z]\", \" \", text)\n",
        "    # Tokenize text\n",
        "    tokens = nltk.word_tokenize(text)\n",
        "    # Remove stopwords\n",
        "    tokens = [token for token in tokens if token not in stopwords]\n",
        "    # Lowercase text\n",
        "    tokens = [token.lower() for token in tokens]\n",
        "    # Join tokens back into text\n",
        "    text = \" \".join(tokens)\n",
        "    return text\n",
        "\n",
        "# Apply clean_text function to reviews DataFrame\n",
        "df[\"clean_review\"] = df[\"review\"].apply(lambda x: clean_text(x))\n",
        "\n",
        "# Tokenize text\n",
        "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\")\n",
        "\n",
        "def tokenize_text(text):\n",
        "    return tokenizer.encode_plus(text, add_special_tokens=True, return_tensors=\"pt\")\n",
        "\n",
        "# Tokenize clean reviews\n",
        "tokenized_reviews = df[\"clean_review\"].apply(tokenize_text)\n",
        "\n",
        "# Load pre-trained BERT model\n",
        "model = BertModel.from_pretrained(\"bert-base-multilingual-cased\")\n",
        "# Get the embeddings weights from the pre-trained BERT model\n",
        "embedding_weight = model.embeddings.word_embeddings.weight\n",
        "\n",
        "\n",
        "# Apply BERT model to tokenized reviews\n",
        "\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "model.to(device)\n",
        "\n",
        "reviews_encoded = []\n",
        "for review in tokenized_reviews:\n",
        "    with torch.no_grad():\n",
        "        input_ids = review['input_ids'].to(device)\n",
        "        attention_mask = review['attention_mask'].to(device)\n",
        "        review_encoded = model(input_ids=input_ids, attention_mask=attention_mask)\n",
        "    reviews_encoded.append(review_encoded)\n",
        "# define dimensions of the 3D array\n",
        "num_reviews = 10\n",
        "num_sentences = 5\n",
        "embedding_size = 50\n",
        "\n",
        "# create a 3D array of random values\n",
        "features = np.random.rand(num_reviews, num_sentences, embedding_size)\n",
        "\n",
        "# reshape features to a 2D array with dimensions (num_reviews * num_sentences, embedding_size)\n",
        "features = features.reshape(num_reviews * num_sentences, embedding_size)\n",
        "\n",
        "\n",
        "# Compute cosine similarity matrix between all reviews\n",
        "similarity_matrix = cosine_similarity(features)\n",
        "\n",
        "# Visualize similarity matrix as heatmap\n",
        "sns.heatmap(similarity_matrix, annot=True, cmap='Blues')\n",
        "plt.show()\n",
        "\n",
        "\n",
        "        \n",
        "\n",
        "\n"
      ],
      "metadata": {
        "id": "pvy6YbxSNgwl"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import BertTokenizer, BertForSequenceClassification\n",
        "model_name = \"bert-base-multilingual-cased\"\n",
        "tokenizer = BertTokenizer.from_pretrained(model_name)\n",
        "model = BertForSequenceClassification.from_pretrained(model_name)\n"
      ],
      "metadata": {
        "id": "2e0Kt-j8GNB8"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def clean_text(text):\n",
        "    # Remove non-letters and digits\n",
        "    text = re.sub(\"[^a-zA-Z0-9]\", \" \", text)\n",
        "    # Tokenize text\n",
        "    tokens = nltk.word_tokenize(text)\n",
        "    # Remove stopwords\n",
        "    tokens = [token for token in tokens if token not in stopwords]\n",
        "    # Lowercase text\n",
        "    tokens = [token.lower() for token in tokens]\n",
        "    # Join tokens back into text\n",
        "    text = \" \".join(tokens)\n",
        "    return text\n",
        "\n",
        "new_review = \"Tento film je úžasný!\"\n",
        "cleaned_review = clean_text(new_review)\n",
        "tokens = tokenizer.encode_plus(cleaned_review, add_special_tokens=True, return_tensors=\"pt\")\n"
      ],
      "metadata": {
        "id": "oS4MvOt3GQW_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "model.eval()\n",
        "with torch.no_grad():\n",
        "    outputs = model(tokens['input_ids'], tokens['attention_mask'])\n",
        "    logits = outputs[0]\n",
        "    predictions = torch.argmax(logits, dim=1).flatten()\n",
        "sentiment = \"positive\" if predictions == 1 else \"negative\"\n",
        "print(sentiment)\n"
      ],
      "metadata": {
        "id": "9hARNr9AGVil"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import nltk\n",
        "nltk.download('stopwords')\n",
        "from nltk.corpus import stopwords\n",
        "stopwords = stopwords.words('english')\n",
        "\n",
        "def clean_text(text):\n",
        "    # Remove non-letters and digits\n",
        "    text = re.sub(\"[^a-zA-Z0-9]\", \" \", text)\n",
        "    # Tokenize text\n",
        "    tokens = nltk.word_tokenize(text)\n",
        "    # Remove stopwords\n",
        "    tokens = [token for token in tokens if token.lower() not in stopwords]\n",
        "    # Lowercase text\n",
        "    tokens = [token.lower() for token in tokens]\n",
        "    # Join tokens back into text\n",
        "    text = \" \".join(tokens)\n",
        "    return text\n",
        "\n",
        "\n",
        "new_review = \"tento film nesplnil má očekávání\"\n",
        "cleaned_review = clean_text(new_review)\n",
        "tokens = tokenizer.encode_plus(cleaned_review, add_special_tokens=True, return_tensors=\"pt\")\n",
        "model.eval()\n",
        "with torch.no_grad():\n",
        "    outputs = model(tokens['input_ids'], tokens['attention_mask'])\n",
        "    logits = outputs[0]\n",
        "    predictions = torch.argmax(logits, dim=1).flatten()\n",
        "    confidence = torch.softmax(logits, dim=1)[0][predictions].item()\n",
        "    sentiment = \"positive\" if predictions == 1 else \"negative\"\n",
        "print(f\"The sentiment of the review is {sentiment} with a confidence score of {confidence:.2f}\")\n"
      ],
      "metadata": {
        "id": "aQXwwHU6dm3k"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import BertTokenizer, BertForSequenceClassification\n",
        "import torch\n",
        "import pandas as pd\n",
        "import re\n",
        "import nltk\n",
        "nltk.download('stopwords')\n",
        "from nltk.corpus import stopwords\n",
        "\n",
        "model_name = \"bert-base-multilingual-cased\"\n",
        "tokenizer = BertTokenizer.from_pretrained(model_name)\n",
        "model = BertForSequenceClassification.from_pretrained(model_name)\n",
        "\n",
        "def clean_text(text):\n",
        "    # Remove non-letters and digits\n",
        "    text = re.sub(\"[^a-zA-Z0-9]\", \" \", text)\n",
        "    # Tokenize text\n",
        "    tokens = nltk.word_tokenize(text)\n",
        "    # Remove stopwords\n",
        "    tokens = [token for token in tokens if token not in stopwords.words('english')]\n",
        "    # Lowercase text\n",
        "    tokens = [token.lower() for token in tokens]\n",
        "    # Join tokens back into text\n",
        "    text = \" \".join(tokens)\n",
        "    return text\n",
        "\n",
        "# Load a sample dataset of movie reviews with known sentiments\n",
        "reviews = pd.read_csv(\"/content/drive/MyDrive/train_data_csfd.csv\")\n",
        "sample = reviews.sample(n=500) # Choose a random sample of 100 reviews\n",
        "\n",
        "# Evaluate the model on the sample dataset\n",
        "correct = 0\n",
        "total = 0\n",
        "for _, row in sample.iterrows():\n",
        "    text = row[\"text\"]\n",
        "    label = row[\"label\"]\n",
        "    cleaned_review = clean_text(text)\n",
        "    tokens = tokenizer.encode_plus(cleaned_review, add_special_tokens=True, return_tensors=\"pt\")\n",
        "    with torch.no_grad():\n",
        "        outputs = model(tokens['input_ids'], tokens['attention_mask'])\n",
        "        logits = outputs[0]\n",
        "        predicted_sentiment = torch.argmax(logits, dim=1).flatten().item()\n",
        "        if predicted_sentiment == 1 and label == \"positive\":\n",
        "            correct += 1\n",
        "        elif predicted_sentiment == 0 and label == \"negative\":\n",
        "            correct += 1\n",
        "        total += 1\n",
        "\n",
        "accuracy = correct / total\n",
        "print(\"Accuracy on the sample dataset: {:.2f}%\".format(accuracy * 500))\n"
      ],
      "metadata": {
        "id": "xx66HF2jLaYj"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "BDUiisuAGbgi"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# define dimensions of the 3D array\n",
        "num_reviews = 10\n",
        "num_sentences = 5\n",
        "embedding_size = 50\n",
        "\n",
        "# create a 3D array of random values\n",
        "features = np.random.rand(num_reviews, num_sentences, embedding_size)\n",
        "\n",
        "# reshape features to a 2D array with dimensions (num_reviews * num_sentences, embedding_size)\n",
        "features = features.reshape(num_reviews * num_sentences, embedding_size)\n",
        "\n",
        "# Compute cosine similarity matrix between all reviews\n",
        "similarity_matrix = cosine_similarity(features)\n",
        "\n",
        "# Use agglomerative clustering to create a dendrogram\n",
        "from scipy.cluster.hierarchy import linkage, dendrogram\n",
        "\n",
        "Z = linkage(similarity_matrix, 'ward')\n",
        "\n",
        "plt.figure(figsize=(10, 5))\n",
        "dendrogram(Z)\n",
        "plt.show()\n",
        "\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "id": "PoJ6lBrA_p4F"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install scikit-learn\n",
        "from sklearn.decomposition import PCA\n",
        "# Define dimensions of the 3D array\n",
        "num_reviews = 10\n",
        "num_sentences = 5\n",
        "embedding_size = 50\n",
        "\n",
        "# Create a 3D array of random values\n",
        "features = np.random.rand(num_reviews, num_sentences, embedding_size)\n",
        "\n",
        "# Reshape features to a 2D array with dimensions (num_reviews * num_sentences, embedding_size)\n",
        "features = features.reshape(num_reviews * num_sentences, embedding_size)\n",
        "\n",
        "# Compute cosine similarity matrix between all reviews\n",
        "similarity_matrix = cosine_similarity(features)\n",
        "\n",
        "# Perform PCA to reduce dimensionality to 2\n",
        "pca = PCA(n_components=2)\n",
        "features_2d = pca.fit_transform(features)\n",
        "\n",
        "# Visualize scatter plot\n",
        "plt.scatter(features_2d[:,0], features_2d[:,1])\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "B1BOXjn0B1TS"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}