Code by TomMakesThings
This code was written by me, but forms part of a group coursework project. The aim of the project is to create a multi-label classifier that can predict the genres of films from their IMDb descriptions. To find out more, visit https://tommakesthings.github.io/Movie-Genre-Predictor/.
In this notebook, a pipeline is created that, when given training and testing data, will train and test a new classifier. This means that if the dataset is changed, the text processing and label conversion steps, as well as the model architecture, remain the same unless explicitly changed. After running the pipeline, the model state and the text processor from the pipeline are saved to file so that they can be hosted on the group's web application.
The notebook has been developed using Python 3.8.5 and Anaconda3 with conda 4.10.1. If you would like to recreate the environment, the YAML file environment.yml can be found on GitHub and can be used with the command conda env create -f environment.yml
through the Anaconda terminal. For more detail, refer to the conda docs.
# !pip install dill
# !pip install pycontractions
# nltk.download('stopwords')
import pandas as pd
import numpy as np
import spacy
import nltk
import unicodedata
import matplotlib.pyplot as plt
import string
import plotly.express as px
import plotly.graph_objects as go
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import random
import seaborn as sns
import math
import re
import pickle
import dill
import os
from sklearn.pipeline import Pipeline
from termcolor import colored
from sklearn.preprocessing import MultiLabelBinarizer
from IPython.display import IFrame
from IPython.core.display import display, HTML
from nltk import FreqDist
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from matplotlib.pyplot import figure
from pycontractions import Contractions
from sklearn.metrics import f1_score, accuracy_score, multilabel_confusion_matrix
from torchtext.legacy import data
from torch.utils.data import Dataset
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from shutil import copyfile
%matplotlib inline
display(HTML("<style>.container { width:98% !important; }</style>"))
Open the dataset, drop irrelevant columns and remove samples with missing information.
# Read IMDb dataset
path = "Dataset/IMDbMovies.csv.zip"
dataframe = pd.read_csv(path, low_memory=False)
# Read specific columns and drop any rows with NaN values
movies_data = dataframe[['title', 'description', 'genre']].dropna()
movies_data
title | description | genre | |
---|---|---|---|
0 | Miss Jerry | The adventures of a female reporter in the 1890s. | Romance |
1 | The Story of the Kelly Gang | True story of notorious Australian outlaw Ned ... | Biography, Crime, Drama |
2 | Den sorte drøm | Two men of high rank are both wooing the beaut... | Drama |
3 | Cleopatra | The fabled queen of Egypt's affair with Roman ... | Drama, History |
4 | L'Inferno | Loosely adapted from Dante's Divine Comedy and... | Adventure, Drama, Fantasy |
... | ... | ... | ... |
85848 | Pengalila | An unusual bond between a sixty year old Dalit... | Drama |
85849 | Manoharam | Manoharan is a poster artist struggling to fin... | Comedy, Drama |
85850 | Le lion | A psychiatric hospital patient pretends to be ... | Comedy |
85851 | De Beentjes van Sint-Hildegard | A middle-aged veterinary surgeon believes his ... | Comedy, Drama |
85854 | La vida sense la Sara Amat | Pep, a 13-year-old boy, is in love with a girl... | Drama |
83740 rows × 3 columns
Each film has between 1 and 3 genres stored in the genre column. However, as they share a single column, some processing is required to separate them before counting the true number of unique genres. For example, the genre strings "Drama", "Romance" and "Drama, Romance" would otherwise be counted as three different genres, even though only two unique genres are present.
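A quick toy illustration of the counting problem (the genre strings below are made up for the example):
# Counting raw genre strings vs unique genres after splitting on ", "
raw_labels = ["Drama", "Romance", "Drama, Romance"]
print(len(set(raw_labels)))  # 3 distinct strings
print(len({genre for label in raw_labels for genre in label.split(", ")}))  # 2 unique genres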
def find_unique_genres(data):
# Search all samples to find the genre types
all_gens = []
for genre_list in data['genre'].apply(lambda genre_list: genre_list.split(", ")):
# Iterate through genre(s) for each film
for genre in genre_list:
# Record each genre
all_gens.append(genre)
# Create list of all unique genres
unique_gens = list(set(all_gens))
unique_gens.sort()
return unique_gens, all_gens
unique_genres, all_genres = find_unique_genres(movies_data)
Print information about the dataset:
tokenizer = RegexpTokenizer(r"\w+")
print(colored("Total number of films in IMDb dataset: ", color="blue", attrs=['bold']) + str(len(dataframe.index)))
print(colored("Number of suitable films (samples): ", color="blue", attrs=['bold']) + str(len(movies_data.index)))
print(colored("Number of films dropped: ", color="blue", attrs=['bold']) + str(len(dataframe.index) - len(movies_data.index)))
print(colored("Total number of genres (labels): ", color="green", attrs=['bold']) + str(len(all_genres)))
print(colored("Average number of genres per film: ", color="green", attrs=['bold']) + str(len(all_genres) / len(movies_data.index)))
print(colored("Unique genres: ", color="green", attrs=['bold']) + str(unique_genres))
print(colored("Number of unique genres: ", color="green", attrs=['bold']) + str(len(unique_genres)))
print(colored("Average description length: ", color="magenta", attrs=['bold']) + str(np.mean([len(desc) for desc in movies_data.description])))
print(colored("Average number of words in description: ", color="magenta", attrs=['bold']) + str(np.mean([len(tokenizer.tokenize(desc)) for desc in movies_data.description])))
print(colored("Average number of unique words in description: ", color="magenta", attrs=['bold']) + str(np.mean([len(set(tokenizer.tokenize(desc))) for desc in movies_data.description])))
print(colored("Shortest description length: ", color="magenta", attrs=['bold']) + str(min([len(tokenizer.tokenize(desc)) for desc in movies_data.description])))
print(colored("Longest description length: ", color="magenta", attrs=['bold']) + str(max([len(tokenizer.tokenize(desc)) for desc in movies_data.description])))
Total number of films in IMDb dataset: 85855 Number of suitable films (samples): 83740 Number of films dropped: 2115 Total number of genres (labels): 172461 Average number of genres per film: 2.059481729161691 Unique genres: ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western'] Number of unique genres: 25 Average description length: 160.06323143061857 Average number of words in description: 28.19075710532601 Average number of unique words in description: 25.13014091234774 Shortest description length: 1 Longest description length: 79
View the number of films belonging to each genre. Across all samples, the most common genres are Drama (26.7%), Comedy (16.4%) and Romance (8.0%). The least common is News, with only one sample across the whole dataset. Other rare genres include Adult and Documentary, which have two samples each, and Reality-TV, which has three. The next rarest after these is Film-Noir with 663 samples. The rarest genres are removed later in the notebook as they do not provide enough samples to accurately train a classifier.
# Count the number of genres
genre_counts = np.zeros(len(unique_genres))
for genre_list in movies_data['genre'].apply(lambda genre_list: genre_list.split(", ")):
for i in range(len(unique_genres)):
if unique_genres[i] in genre_list:
genre_counts[i] += 1
# Print counts for each genre
for gen, con in zip(unique_genres, genre_counts):
print(str(gen) + ": " + str(int(con)))
# Plot the interactive pie chart
fig = go.Figure()
fig.add_trace(go.Pie(labels=unique_genres, values=genre_counts))
fig.show()
Action: 12801 Adult: 2 Adventure: 7478 Animation: 2095 Biography: 2341 Comedy: 28263 Crime: 10923 Documentary: 2 Drama: 46127 Family: 3862 Fantasy: 3741 Film-Noir: 663 History: 2244 Horror: 9504 Music: 1664 Musical: 1997 Mystery: 5196 News: 1 Reality-TV: 3 Romance: 13838 Sci-Fi: 3590 Sport: 1046 Thriller: 11304 War: 2198 Western: 1578
As the dataset is large, samples are dropped so that the notebook can run in a reasonable time.
- n_samples specifies the maximum number of samples to use. Setting n_samples = 0 will cause all suitable samples to be used, though the notebook will be slow to run.
- min_length specifies the minimum number of words in a description; samples with fewer words than this are removed.
- Setting remove_rare_genres = True will remove genres with fewer than rare_count instances from the movies' labels. If a sample does not have a label for any remaining genre, it will be dropped. For example, if rare_count = 10, the genres News, Adult, Documentary and Reality-TV are removed.
- max_genre_samples is the maximum number of samples allowed for each genre. For example, if max_genre_samples = 100, at most 100 samples are kept for Comedy, another 100 for Drama, etc. This can be turned off by setting max_genre_samples = 0.
n_samples = 0
min_length = 10
remove_rare_genres = True
rare_count = 3590
max_genre_samples = 3590
# Create a copy of movies_data
sampled_movies_data = movies_data.copy()
# Remove samples where description size is less than min_length
for i, desc in zip(sampled_movies_data.index, sampled_movies_data.description):
if(len(tokenizer.tokenize(desc))) < min_length:
sampled_movies_data = sampled_movies_data.drop(i)
if remove_rare_genres:
# Find rare genres
rare_genres = []
for gen, count in zip(unique_genres, genre_counts):
if count < rare_count:
rare_genres.append(gen)
print(colored("Genres removed: ", color="blue", attrs=['bold']) + str(rare_genres))
for i, genre_list in zip(sampled_movies_data.index, sampled_movies_data['genre'].apply(lambda genre_list: genre_list.split(", "))):
# Remove rare genres from list of film's genres
new_genre_list = [gen for gen in genre_list if gen not in rare_genres]
# Check if any genres remain
if new_genre_list != []:
# Update film's genre column
sampled_movies_data.at[i,'genre'] = ", ".join(new_genre_list)
else:
# Else drop samples if all genres removed
sampled_movies_data = sampled_movies_data.drop(i)
if max_genre_samples > 0:
# Find the genres that have not been removed
if remove_rare_genres:
remaining_genres = list(set(unique_genres) - set(rare_genres))
else:
remaining_genres = unique_genres
# Create an empty dataframe to record samples to keep
updated_sampled_movies = pd.DataFrame()
for gen in remaining_genres:
# Find all samples belonging to the genre
genre_samples = sampled_movies_data.loc[sampled_movies_data["genre"].str.contains(gen)]
# Get the first n samples for that genre
try:
genre_samples = genre_samples.sample(n=max_genre_samples, random_state=1)
except ValueError:
# If max_genre_samples exceeds the number of samples for this genre, use them all
pass
# Record the samples to keep
updated_sampled_movies = pd.concat([updated_sampled_movies, genre_samples])
# Remove any duplicates as some movies belong to multiple genres
sampled_movies_data = updated_sampled_movies.drop_duplicates()
# Shuffle order by sampling with the same size
sampled_movies_data = sampled_movies_data.sample(len(sampled_movies_data), random_state=1)
if n_samples > 0:
# Cap the number of samples to a maximum n_samples
sampled_movies_data = sampled_movies_data.sample(n_samples, random_state=1)
# View selected samples
display(sampled_movies_data)
Genres removed: ['Adult', 'Animation', 'Biography', 'Documentary', 'Film-Noir', 'History', 'Music', 'Musical', 'News', 'Reality-TV', 'Sport', 'War', 'Western']
title | description | genre | |
---|---|---|---|
33250 | Spanish Fly | A woman reporter in over her head in trying to... | Comedy, Romance |
62381 | Hellacious Acres: The Case of John Glass | John Glass, wakes up in a desolated barn from ... | Comedy, Sci-Fi |
47492 | Sivakaasi | Muthappa runs away from his hometown because o... | Action, Drama |
34364 | Barierata | In this picture, the barrier epitomizes the li... | Drama, Sci-Fi |
2022 | I distruttori | A mysterious ray that immobilizes all motors a... | Action, Drama, Mystery |
... | ... | ... | ... |
82660 | Mr. Chandramouli | The happy lives of a boxer and his father goes... | Comedy, Romance |
84054 | Santa Jaws | Trying to survive the family Christmas, Cody m... | Action, Adventure, Comedy |
42180 | Cani dell'altro mondo! | An intergalactic dog pilot from Sirius (the do... | Comedy, Drama, Family |
41021 | Konets vechnosti | Based on the novel by Isaac Asimov.The End of ... | Sci-Fi |
8725 | L'alba del gran giorno | A Confederate drifter wins a hotel-saloon at p... | Action, Adventure, Drama |
34286 rows × 3 columns
Split the selected samples into 80% training and 20% testing sets. The split is seeded with random_state=0 so that it is reproducible.
# Split the data for training and testing
train_samples, test_samples = train_test_split(sampled_movies_data, test_size=0.2, random_state=0)
Seed PyTorch to make the results semi-reproducible.
# Seed torch to make results semi-reproducible
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
Define a class for the LSTM classifier.
class FilmClassifierLSTM(nn.Module):
"""
Long-short term memory (LSTM) classifier
Layers: Embedding -> LSTM -> fully connected
Parameters:
n_vocab: Number of words TEXT Field was trained on
n_classes: Number of genres
pad_index: Index of <pad> token
unk_index: Index of <unk> token
n_embedding: Size of the trained vectors, e.g if using 'glove.6B.100d', set to 100
pretrained_embeddings: Vectors from pre-trained word embedding such as GloVe
n_hidden: Size of the LSTM hidden state
dropout: Dropout rate, e.g 0.2 = 20% dropout
activation: Set as "softmax" or "sigmoid"
bidirectional: Whether to use bidirectional LSTM
batch_norm: Whether to apply a batch normalization layer
Return on forward pass:
output: Predicted probabilities for each class
"""
def __init__(self, n_vocab, n_classes, pad_index, unk_index, n_embedding, pretrained_embeddings=None,
n_hidden=256, dropout=0.2, activation="sigmoid", bidirectional=True, batch_norm=True):
super().__init__()
self.bidirectional = bidirectional
self.batch_norm = batch_norm
if bidirectional:
# Use two layers for bidirectionality
n_layers = 2
# Double size of linear output
linear_hidden = n_hidden * 2
else:
n_layers = 1
linear_hidden = n_hidden
# Create model layers
self.dropout = nn.Dropout(dropout)
self.embedding = nn.Embedding(n_vocab, n_embedding, padding_idx=pad_index) # Tell embedding not to learn <pad> embeddings
self.lstm = nn.LSTM(n_embedding, n_hidden, num_layers=n_layers, dropout=dropout, bidirectional=bidirectional)
self.batchnorm = nn.BatchNorm1d(linear_hidden)
self.linear = nn.Linear(linear_hidden, n_classes)
# Set output activation function
if activation == "softmax":
self.activation = nn.Softmax(dim=1)
else:
# Sigmoid recommended for multi-label
self.activation = nn.Sigmoid()
if pretrained_embeddings is not None:
# Replace weights of embedding layer
self.embedding.weight.data.copy_(pretrained_embeddings)
# Set padding and unknown tokens to zero
self.embedding.weight.data[pad_index] = torch.zeros(n_embedding)
self.embedding.weight.data[unk_index] = torch.zeros(n_embedding)
def forward(self, text, text_lengths):
# Create word embedding, then apply drop out
embedded = self.embedding(text)
dropped_embedded = self.dropout(embedded)
# Pack the embedding so that the LSTM only processes the non-padded parts of each sequence
packed_embedded = nn.utils.rnn.pack_padded_sequence(dropped_embedded, text_lengths.to('cpu'))
# Return output of all hidden states in the sequence, hidden state of the last LSTM unit and cell state
packed_output, (hidden, cell) = self.lstm(packed_embedded)
# Unpack packed_output
unpacked_output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
if self.bidirectional:
# Find the final two hidden states and join them together
top_two_hidden = torch.cat((hidden[-1], hidden[-2]), dim=1)
if self.batch_norm:
# Apply batch normalization
top_two_hidden = self.batchnorm(top_two_hidden)
# Apply dropout, pass through fully connected layer, then apply activation function
output = self.activation(self.linear(self.dropout(top_two_hidden)))
else:
# Apply dropout to final hidden state, pass through fully connected layer, then apply activation function
output = self.activation(self.linear(self.dropout(hidden[-1])))
return output
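As an optional sanity check, the classifier can be instantiated on its own with small, made-up dimensions and random token indices (no pre-trained embeddings); all of the values below are illustrative only.
# Hypothetical smoke test of the classifier with toy dimensions (not part of the pipeline)
toy_model = FilmClassifierLSTM(n_vocab=1000, n_classes=12, pad_index=1, unk_index=0, n_embedding=100)
toy_model.eval()  # disable dropout; batch norm falls back to its initial running statistics
# A batch of 4 token-index sequences, shape (sequence length, batch size), with descending lengths
toy_text = torch.randint(0, 1000, (20, 4))
toy_lengths = torch.tensor([20, 18, 15, 10])
with torch.no_grad():
    toy_probs = toy_model(toy_text, toy_lengths)
print(toy_probs.shape)  # torch.Size([4, 12]) - one probability per genre for each film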
The pipeline is constructed from four custom transformer classes that process the training and testing data and train a new classifier.
Transformer class that will process the film description column of the training and testing data. This includes removing accented characters, optionally expanding contractions, removing punctuation and stop words, and applying either lemmatization or stemming to each description.
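Before the full transformer below, here is a minimal sketch of the kind of per-description cleaning it performs. The stop word list is a tiny made-up one rather than the NLTK list, and the example sentence is arbitrary, so the output is illustrative only.
# Minimal sketch of the cleaning steps: lowercase lemmas, punctuation and stop words removed
nlp = spacy.load("en_core_web_sm")
toy_stop_words = {"the", "of", "a", "in"}  # illustrative only; the pipeline uses NLTK's stop word list
doc = nlp("The adventures of a female reporter in the 1890s.")
cleaned = [token.lemma_.lower() for token in doc
           if not token.is_punct and token.text.lower() not in toy_stop_words]
print(" ".join(cleaned))  # e.g. "adventure female reporter 1890s"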
class DescriptionTransformer(BaseEstimator, TransformerMixin):
"""
Process the movie descriptions before classification
Parameters:
stop_words: The stop word list
transformation: Lemmatization or stemming
contractions: Set as True to expand contractions
stemmer_algorithm: Algorithm to use when applying stemming, defaults to Porter if not given
verbose: set as 0 to print nothing, 1 to print progress and 2 to print progress and data
"""
def __init__(self, stop_words, transformation="lemmatize", contractions=False,
stemmer_algorithm=None, verbose=0):
# Settable parameters
self.stop_words = stop_words
self.transformation = transformation
self.contractions = contractions
self.stemmer_algorithm = stemmer_algorithm if stemmer_algorithm else PorterStemmer()
self.verbose = verbose
# Unset parameters
self.data = None
self.column_name = None
def fit(self, x):
if self.verbose > 0:
print(colored("Called Description Transformer Fit", color="blue", attrs=['bold', 'underline']))
if not self.stop_words:
print("No stop word list set")
if self.transformation[0].lower() == "l":
print("Set to use lemmatization")
elif self.transformation[0].lower() == "s":
print("Set to use stemming")
return self
def transform(self, x):
if self.verbose > 0:
print(colored("Called Description Transformer Transform", color="blue", attrs=['bold', 'underline']))
print("Processing description text")
# Copy the data and find the name of the description column
self.data = x.copy()
self.column_name = self.data.columns.values[0]
# Load spaCy language processor
nlp = spacy.load("en_core_web_sm")
# Load pre-trained word embedding if using contractions
contraction = Contractions(api_key="glove-twitter-25") if self.contractions else None
# Process text by iterating over each sample's index and description
for idx, sample in zip(self.data.index.values, self.data.values):
# Change accented characters, e.g. à -> a
sample = self.remove_accents(str(sample))
if contraction:
# Expand contractions, e.g. "hasn't" -> "has not"
sample = list(contraction.expand_texts([sample], precise=True))
sample = ''.join(sample)
# Input sample text into spaCy language processor
doc = nlp(sample)
# Split sample text into sentences
sentences = list(doc.sents)
for sent_idx in range(len(sentences)):
# Remove punctuation tokens, e.g. ! , .
sentences[sent_idx] = [token for token in sentences[sent_idx] if not token.is_punct]
# Remove stop words
if self.stop_words:
sentences[sent_idx] = [token for token in sentences[sent_idx] if token.text.lower() not in self.stop_words]
# Apply lemmatization
if self.transformation[0].lower() == "l":
# Resolve words to their dictionary form using PoS tags
sentences[sent_idx] = [token.lemma_.lower() for token in sentences[sent_idx]]
# Apply stemming (only if lemmatization not applied)
elif self.transformation[0].lower() == "s":
# Stem tokens
for word_idx in range(len(sentences[sent_idx])):
# Apply stemmer to each word
stemmed = self.stemmer_algorithm.stem(sentences[sent_idx][word_idx].text)
# Convert back to type Token and update word in sentence
sentences[sent_idx][word_idx] = nlp(stemmed)[0]
# Remove remaining punctuation within tokens, e.g. "(years)" -> "years", not including -
sentences[sent_idx] = [token.translate(str.maketrans('', '', '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~')) for token in sentences[sent_idx]]
# Split words containing dash or spaces caused by lemmatization, e.g. "16-year" -> "16" + "year"
for k in range(len(sentences)):
new_sentence = []
for token in sentences[k]:
split_token = re.split(' |-', token)
for word in split_token:
# Check word not empty
if word:
new_sentence.append(word)
# Replace words in sentence
sentences[k] = new_sentence
# Remove empty lists from list of sentences
sentences = [sent for sent in sentences if sent != []]
# Then join the sentences and update the descriptions dataframe
word_list = [word for sent in sentences for word in sent]
self.data.loc[idx, self.column_name] = ' '.join([str(elem) for elem in word_list])
if self.verbose > 1:
display(self.data)
if self.verbose > 0:
print(colored("Finshed processing all descriptions\n", color="blue", attrs=['bold', 'underline']))
return self.data
def remove_accents(self, text):
# Remove accent or unknown characters from text
text = unicodedata.normalize('NFD', text)\
.encode('ascii', 'ignore')\
.decode("utf-8")
return str(text)
Transformer class that will process the film genre column of the training and testing data. Each film contains between 1 and 3 genres, which will be converted into a multi-hot representation to use as labels for the classifier. The multi-hot encoder will be saved to the file binary_encoder_file so that it can be loaded again to convert the model's predictions back to genres. For example, if the unique genres were Comedy, Drama and Romance:
- Comedy, Romance → [1, 0, 1]
- Comedy, Drama → [1, 1, 0]
- Romance → [0, 0, 1]
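The encoding above can be reproduced in isolation with scikit-learn's MultiLabelBinarizer, the same class used by the transformer below; the three genres are just the ones from the example.
# Standalone illustration of the multi-hot encoding described above
example_encoder = MultiLabelBinarizer(classes=['Comedy', 'Drama', 'Romance'])
example_labels = example_encoder.fit_transform([['Comedy', 'Romance'], ['Comedy', 'Drama'], ['Romance']])
print(example_labels)  # [[1 0 1], [1 1 0], [0 0 1]]
print(example_encoder.inverse_transform(example_labels[:1]))  # [('Comedy', 'Romance')]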
class GenreTransformer(BaseEstimator, TransformerMixin):
"""
Convert the genre(s) of each movie to multi-hot labels
Parameters:
binary_encoder_file: Name to save the MultiLabelBinarizer to a pickle file
verbose: set as 0 to print nothing, 1 to print progress and 2 to print progress and data
"""
def __init__(self, binary_encoder_file='New_Model/binary_encoder.pickle', verbose=0):
# Settable parameters
self.binary_encoder_file = binary_encoder_file
if binary_encoder_file and not binary_encoder_file.endswith('.pickle'):
# Add pickle file extension if not set
self.binary_encoder_file = binary_encoder_file + ".pickle"
self.verbose = verbose
# Unset parameters
self.binary_encoder = None
self.column_name = None
self.unique_genres = None
def fit(self, x):
if self.verbose > 0:
print(colored("Called Genre Transformer Fit", color="cyan", attrs=['bold', 'underline']))
self.data = x.copy()
self.column_name = self.data.columns.values[0]
# Create a list of all unique genres
self.unique_genres = self.find_unique()
# Create a multi-hot label encoder for the genres
self.binary_encoder = MultiLabelBinarizer(classes=self.unique_genres)
if self.binary_encoder_file:
# Save the binary encoder so that labels can be reversed
pickle.dump(self.binary_encoder, open(self.binary_encoder_file, 'wb'))
if self.verbose > 0:
print("Saved label binary encoder to " + str(self.binary_encoder_file))
if self.verbose > 1:
# Print a pie chart of label distribution
self.view_distribution()
return self
def transform(self, x):
if self.verbose > 0:
print(colored("Called Genre Transformer Tranform", color="cyan", attrs=['bold', 'underline']))
print("Converting genres to multi-hot labels")
self.data = x.copy()
self.column_name = self.data.columns.values[0]
for idx, genre_list in zip(self.data.index.values, self.data[self.column_name].apply(lambda genre_list: genre_list.split(", "))):
# Create a binary encoding for each film
film_binary = self.binary_encoder.fit_transform([genre_list])
self.data.loc[idx, self.column_name] = film_binary[0]
if self.verbose > 1:
display(self.data)
if self.verbose > 0:
print(colored("Finished processing all labels\n", color="cyan", attrs=['bold', 'underline']))
return self.data
def view_distribution(self):
# Count the number of samples belonging to each genre
genre_counts = np.zeros(len(self.unique_genres))
for genre_list in self.data[self.column_name].apply(lambda genre_list: genre_list.split(", ")):
for i in range(len(self.unique_genres)):
if self.unique_genres[i] in genre_list:
genre_counts[i] += 1
print(colored("Label distribution", attrs=['underline']))
# Print counts for each genre
for gen, con in zip(self.unique_genres, genre_counts):
print(str(gen) + ": " + str(int(con)))
# Plot an interactive pie chart showing the percentages
fig = go.Figure()
fig.add_trace(go.Pie(labels=self.unique_genres, values=genre_counts))
fig.show()
def find_unique(self):
all_gens = []
# Find the unique genres
for genre_list in self.data[self.column_name].apply(lambda genre_list: genre_list.split(", ")):
# Iterate through genre(s) for each film
for genre in genre_list:
# Record each genre
all_gens.append(genre)
# Create list of all unique genres
unique_gens = list(set(all_gens))
unique_gens.sort()
return unique_gens
Transformer class that will convert the training / testing data to a suitable form for model training or testing. This includes:
- Removing samples whose processed description contains fewer than min_words words
- Creating the TorchText fields RAW for the film ID, TEXT for the description and LABEL for the binary encoded genres
- Saving the TEXT field to the file TEXT_field_file, as it is built with pre-trained vectors using GloVe word embeddings
class PostprocessorTransformer(BaseEstimator, TransformerMixin):
"""
Remove samples where the description does not contain enough words
Create a TorchText dataset
Parameters:
min_words: The shortest word length allowed for a description
TEXT_field_file: Name of the file to save the TorchText TEXT field
verbose: set as 0 to print nothing, 1 to print progress and 2 to print progress and data
"""
def __init__(self, min_words=1, TEXT_field_file="New_Model/TEXT.Field", verbose=0):
# Settable parameters
self.min_words = min_words
self.TEXT_field_file = TEXT_field_file
self.verbose = verbose
# Other
self.data = None
def fit(self, x, y=None):
if self.verbose > 0:
print(colored("Called Postprocessor Transformer Fit", color="green", attrs=['bold', 'underline']))
self.data = pd.DataFrame(x.copy())
return self
def transform(self, x, y=None):
processed_samples = self.data.copy()
if self.verbose > 0:
print(colored("Called Postprocessor Transformer Tranform", color="green", attrs=['bold', 'underline']))
print("Removing samples with short descriptions")
for idx, text in zip(self.data.index.values, self.data[0]):
if len(text.split()) < self.min_words:
# Remove samples with short descriptions
processed_samples = processed_samples.drop([idx])
processed_samples.columns = ["Description", "Label"]
processed_samples.index.name = "ID"
if self.verbose > 0:
print("Creating TorchText dataset")
# Create fields
RAW = data.RawField()
TEXT = data.Field(tokenize="spacy", include_lengths=True) # Use packed padded sequences so only non-padded elements are processed
LABEL = data.LabelField(dtype=torch.float, use_vocab=False, sequential=False, is_target=True, unk_token=None)
# Map rows in processed_samples to the matching fields
fields = [('id', RAW), ('text', TEXT), ('label', LABEL)]
examples = [data.Example.fromlist(row, fields) for row in zip(processed_samples.index.values,
processed_samples['Description'].to_list(),
processed_samples['Label'].to_list())]
# Construct a torchtext dataset
torchtext_data = data.Dataset(examples, fields)
if self.verbose > 0:
print("Setting trained word embeddings")
trained_vector = "glove.6B.100d"
max_vocab = 30000
all_words = [desc.split() for desc in processed_samples['Description'].values]
# Build text using pre-trained vectors
TEXT.build_vocab(all_words, max_size=max_vocab, vectors=trained_vector, unk_init=torch.Tensor.normal_)
# Save the TEXT field so that it can be used later
dill.dump(TEXT, open(self.TEXT_field_file, "wb"))
if self.verbose > 0:
print("Saved the TorchText TEXT field to file " + str(self.TEXT_field_file))
if self.verbose > 0:
print(colored("Unique tokens in TEXT vocabulary:", attrs=['underline']) + " " + str(len(TEXT.vocab)))
print(colored("Extract of TEXT vocab:", attrs=['underline']) + " " + str(TEXT.vocab.itos[:100]))
print(colored("Most common tokens:", attrs=['underline']) + " " + str(TEXT.vocab.freqs.most_common(20)))
# Get vocab size of trained text field
vocab_size = len(TEXT.vocab)
# Padding and unknown indexes allow these tokens to be ignored
padding_index = TEXT.vocab.stoi[TEXT.pad_token]
unknown_index = TEXT.vocab.stoi[TEXT.unk_token]
# Embedding size must be the same as pre-trained vectors
embedded_dim = int(trained_vector[len(trained_vector)-4:len(trained_vector)-1])
# Get the pre-trained embedding
pretrained_vectors = TEXT.vocab.vectors
# Automatically determined model hyperparameters
model_kwargs = {'n_vocab': vocab_size, 'n_classes': len(processed_samples['Label'].values[0]),
'pad_index': padding_index, 'unk_index': unknown_index,
'n_embedding': embedded_dim, 'pretrained_embeddings': pretrained_vectors}
if self.verbose > 1:
display(processed_samples)
if self.verbose > 0:
print(colored("Completed post-processing\n", color="green", attrs=['bold', 'underline']))
return torchtext_data, model_kwargs
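To make the TorchText plumbing easier to follow, here is a toy, self-contained version of the field and dataset construction used above. It uses the default whitespace-style tokenizer and skips the GloVe download, so the numbers printed are illustrative only.
# Toy version of the TorchText legacy field/dataset construction used in the transformer above
toy_RAW = data.RawField()
toy_TEXT = data.Field(include_lengths=True)  # default tokenizer, to keep the sketch download-free
toy_LABEL = data.LabelField(dtype=torch.float, use_vocab=False, sequential=False, is_target=True, unk_token=None)
toy_fields = [('id', toy_RAW), ('text', toy_TEXT), ('label', toy_LABEL)]
toy_rows = [(0, "young reporter uncover secret", [1, 0, 0]),
            (1, "two friend plan heist", [0, 1, 1])]
toy_examples = [data.Example.fromlist(row, toy_fields) for row in toy_rows]
toy_dataset = data.Dataset(toy_examples, toy_fields)
toy_TEXT.build_vocab(toy_dataset)  # no pre-trained vectors here, unlike the real transformer
print(len(toy_dataset), len(toy_TEXT.vocab))  # 2 examples; vocabulary also contains <unk> and <pad>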
Transformer class that will create, train and test a new model.
For training, this includes splitting the data into training and validation sets (optionally using k-fold cross validation to find the best split), batching the data with bucket iterators, creating a new classifier and training it while saving the best model state to file.
For testing, this includes loading the best saved model weights, batching the test data, and reporting the loss, F1 score, accuracy and per-genre confusion matrices.
class ModelTransformer(BaseEstimator, TransformerMixin):
"""
Split data into training and validation
Optionally perform k-fold cross validation to find the best split
Batch the data
Create a new classifier
Train the classifier
Model parameters:
dropout: Model dropout rate
activation: Model activation function
n_hidden: Size of the model's LSTM hidden state
bidirectional: Whether to create bidirectional LSTM
batch_norm: Whether to apply batch normalization
weight_decay: amount of L2 regularization, set as 0 to use none
amsgrad: Whether to use the AMSGrad variant of Adam optimizer
model_kwargs_file: Name of pickle file to store dictionary of model arguments
K-fold parameters:
use_k_folds: Whether to use k-fold cross validation
n_folds: Number of splits for cross validation
k_fold_epochs: Number of epochs for cross validation
Training parameters:
batch_size: Number of samples in a training / testing / validation batch
label_threshold: Minimum probability the model must output for a prediction to be converted to 1
best_train_measure: Set as either "loss" or "accuracy", determines what measure to use to save the best model state
training_epochs: Number of epochs / model training iterations
calculate_f1: Whether to calculate the F1 score when training the model
model_weights_file: Name of the PyTorch file to save the best model weights
final_model_weights_file: Name of the PyTorch file to save the weights of the final model's state
TEXT_field_file: Name of the TorchText TEXT field file
Other parameters:
binary_encoder_file: File location of the saved MultiLabelBinarizer
verbose: Set as 0 to print nothing, 1 to print progress and 2 to print progress and data
"""
def __init__(self, dropout=0.2, activation="sigmoid", n_hidden=256, bidirectional=True,
batch_norm=True, weight_decay=0, amsgrad=False, model_kwargs_file='New_Model/model_kwargs.pickle',
use_k_folds=True, n_folds=5, k_fold_epochs=3, batch_size=32, label_threshold=0.5,
best_train_measure="loss", training_epochs=5, calculate_f1=False,
model_weights_file='New_Model/trained_model.pt', final_model_weights_file='New_Model/final_model.pt',
binary_encoder_file='New_Model/binary_encoder.pickle', TEXT_field_file="New_Model/TEXT.Field", verbose=0):
self.dropout = dropout
self.activation = activation
self.n_hidden = n_hidden
self.bidirectional = bidirectional
self.batch_norm = batch_norm
self.weight_decay = weight_decay
self.amsgrad = amsgrad
# Set the customisable model hyperparameters
self.customisable_model_kwargs = {'n_hidden': n_hidden, 'dropout': dropout, 'activation': activation,
'bidirectional': bidirectional, 'batch_norm': batch_norm}
# Optimiser hyperparameters
self.optimizer_kwargs = {'weight_decay': weight_decay, 'amsgrad': amsgrad}
# File to save model hyperparameters
self.model_kwargs_file = model_kwargs_file
if model_kwargs_file and not model_kwargs_file.endswith('.pickle'):
# Add pickle file extension if not set
self.model_kwargs_file = model_kwargs_file + ".pickle"
# K-fold parameters
self.use_k_folds = use_k_folds
self.n_folds = n_folds
self.k_fold_epochs = k_fold_epochs
# Training parameters
self.batch_size = batch_size
self.label_threshold = label_threshold
self.best_train_measure = best_train_measure
self.training_epochs = training_epochs
self.calculate_f1 = calculate_f1
self.model_weights_file = model_weights_file
self.final_model_weights_file = final_model_weights_file
self.TEXT_field_file = TEXT_field_file
self.verbose = verbose
# Testing parameter
self.binary_encoder_file = binary_encoder_file
# Other
self.data = None
self.model_kwargs = {}
# Set device to use GPU if available
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.train_folds = []
self.val_folds = []
self.model = None
self.binary_encoder = None
self.TEXT = None
def fit(self, x, y=None):
if self.verbose > 0:
print(colored("Called Model Transformer Fit", color="yellow", attrs=['bold', 'underline']))
# Set the training data, model arguments and TorchText TEXT field
self.data, preset_model_kwargs = x
self.TEXT = dill.load(open(self.TEXT_field_file, "rb"))
# Construct the model arguments from pre-determined and user defined
self.model_kwargs = {**preset_model_kwargs, **self.customisable_model_kwargs}
if self.model_kwargs_file:
# Save the hyperparameters
pickle.dump(self.model_kwargs, open(self.model_kwargs_file, 'wb'))
if self.verbose > 0:
print("Saved model parameters to " + str(self.model_kwargs_file))
if self.binary_encoder_file:
self.binary_encoder = pickle.load(open(self.binary_encoder_file, 'rb'))
# Fit the encoder so it can be used
self.binary_encoder.fit(self.binary_encoder.classes)
if self.verbose > 0:
print("Loaded binary encoder from " + str(self.binary_encoder_file))
return self
def transform(self, x, y=None):
file_name = 'film_classifier.pt'
if self.verbose > 0:
print(colored("Called Model Transformer Transform", color="yellow", attrs=['bold', 'underline']))
print("Splitting data for training and validation")
if self.use_k_folds:
# Perform k-fold cross validation to find the best train-validation split
train_data, val_data = self.cross_validation()
else:
# Otherwise randomly split train dataset 70% train : 30% validation
train_data, val_data = self.data.split(split_ratio=0.7, random_state = random.seed(0))
# Create bucket iterators for mini-batching
train_iterator, val_iterator, = data.BucketIterator.splits((train_data, val_data),
sort_key=lambda x: len(x.text),
batch_size=self.batch_size,
sort_within_batch=True,
shuffle=True,
device=self.device)
if self.verbose > 0:
print("Samples in train data: " + str(len(train_data)))
print("Samples in validation data: " + str(len(val_data)))
# Create a new classifier
self.model = FilmClassifierLSTM(**self.model_kwargs)
# Use GPU if present
self.model = self.model.to(self.device)
# Create the optimizer
self.optimizer = optim.Adam(self.model.parameters(), **self.optimizer_kwargs)
if self.verbose > 0:
print(colored("Creating a new classifier", attrs=['underline']))
print(self.model)
print(colored("Initialising optimizer", attrs=['underline']))
print(self.optimizer)
# Train the model
training_results = self.train(self.model, self.optimizer, self.training_epochs, train_iterator, val_iterator,
self.model_weights_file, self.final_model_weights_file, self.calculate_f1, print_progress=True)
if self.verbose > 1:
# Plot a graph of training loss over epochs
fig = px.line(pd.DataFrame({'Train loss': training_results.get('train_loss'),
'Validation loss': training_results.get('validation_loss')}), title="Training Loss")
fig.update_xaxes(title="Epoch")
fig.update_yaxes(title="Loss")
fig.show()
# Plot a graph of training accuracy
fig = px.line(pd.DataFrame({'Train accuracy': training_results.get('train_accuracy'),
'Validation accuracy': training_results.get('validation_accuracy')}), title="Training Accuracy")
fig.update_xaxes(title="Epoch")
fig.update_yaxes(title="Accuracy")
fig.show()
# Plot F1 if present
if 'train_f1' in training_results:
fig = px.line(pd.DataFrame({'Train F1 score': training_results.get('train_f1'),
'Validation F1 score': training_results.get('validation_f1')}), title="Training F1 Score")
fig.update_xaxes(title="Epoch")
fig.update_yaxes(title="F1 Score")
fig.show()
if self.verbose > 0:
print(colored("Finished Model Training\n", color="yellow", attrs=['bold', 'underline']))
return self.model
def cross_validation(self):
if self.verbose > 0:
print(colored("Running K-Fold Cross Validation", color="yellow", attrs=['bold', 'underline']))
best_train_data = None
best_val_data = None
# Record the best validation loss or accuracy
if self.best_train_measure == "loss":
best_score = float('inf')
else:
best_score = float(-1)
for i in range(self.n_folds):
# Split train dataset 70% train : 30% validation
train_data, val_data = self.data.split(split_ratio=0.7, random_state = random.seed(i))
# Create bucket iterators for mini-batching
train_iterator, val_iterator, = data.BucketIterator.splits((train_data, val_data),
sort_key=lambda x: len(x.text),
batch_size=self.batch_size,
sort_within_batch=True,
shuffle=True,
device=self.device)
# Create a new classifier
model = FilmClassifierLSTM(**self.model_kwargs)
# Use GPU if present
model = model.to(self.device)
# Create optimiser
optimizer = optim.Adam(model.parameters(), **self.optimizer_kwargs)
# Train the model for the fold
training_results = self.train(model, optimizer, self.k_fold_epochs, train_iterator, val_iterator,
None, self.calculate_f1)
# Find the best results
best_accuracy_index = np.argmax(training_results.get('validation_accuracy'))
train_loss = training_results.get('train_loss')[best_accuracy_index]
val_loss = training_results.get('validation_loss')[best_accuracy_index]
val_accuracy = training_results.get('validation_accuracy')[best_accuracy_index]
# Record the best validation loss or accuracy
if self.best_train_measure == "loss" and val_loss < best_score:
best_score = val_loss
best_train_data = train_data
best_val_data = val_data
elif self.best_train_measure != "loss" and val_accuracy > best_score:
best_score = val_accuracy
best_train_data = train_data
best_val_data = val_data
if self.verbose > 0:
if self.best_train_measure == "loss":
print("Completed k-fold cross validation, lowest loss was "
+ str(round(best_score, 3)) + " after " + str(self.k_fold_epochs) + " epochs")
else:
print("Completed k-fold cross validation, highest accuracy was "
+ str(round(best_score * 100, 3)) + "% after " + str(self.k_fold_epochs) + " epochs")
return best_train_data, best_val_data
def train(self, model, optimizer, epochs, train_data, val_data, best_model_file='model.pt',
final_model_file='final_model_state.pt', calculate_f1=False, print_progress=False):
if print_progress and self.verbose > 0:
print(colored("Training Model", color="yellow", attrs=['bold', 'underline']))
# Record metrics loss, f1 and accuracy
train_loss_over_epochs, val_loss_over_epochs = [], []
train_f1_over_epochs, val_f1_over_epochs = [], []
train_accuracy_over_epochs, val_accuracy_over_epochs = [], []
# Record the best validation loss or accuracy
if self.best_train_measure == "loss":
best_score = float('inf')
else:
best_score = float(-1)
for epoch in range(epochs):
# Record epoch metrics
train_epoch_loss, val_epoch_loss = 0, 0
train_epoch_f1, val_epoch_f1 = 0, 0
train_epoch_accuracy, val_epoch_accuracy = 0, 0
# Set the classifier to train mode
model.train()
batch_count = 0
for batch in train_data:
batch_count += 1
# Clear gradients of optimizer
optimizer.zero_grad()
# Make prediction and get actual labels
text, text_lengths = batch.text
prediction = model(text, text_lengths)
true_labels = batch.label
# Convert probabilities to binary labels, e.g. [0.6, 0.1] -> [1, 0]
predicted_labels = torch.tensor([[1 if value > self.label_threshold else 0 for value in sample] for sample in prediction])
# Calculate and record loss
loss = F.binary_cross_entropy(prediction, true_labels)
loss.backward()
optimizer.step()
train_epoch_loss += loss.item()
# Calculate and record f1 score and accuracy
if calculate_f1:
f1 = f1_score(predicted_labels, true_labels.cpu(), average='micro', zero_division=1)
train_epoch_f1 += f1
accuracy = accuracy_score(predicted_labels, true_labels.cpu())
train_epoch_accuracy += accuracy
# Record epoch's average training loss, f1 and accuracy
train_loss_over_epochs.append(train_epoch_loss/batch_count)
if calculate_f1:
train_f1_over_epochs.append(train_epoch_f1/batch_count)
train_accuracy_over_epochs.append(train_epoch_accuracy/batch_count)
# Put model into evaluation mode
model.eval()
batch_count = 0
# Evaluate on validation data
with torch.no_grad():
for batch in val_data:
batch_count += 1
text, text_lengths = batch.text
prediction = model(text, text_lengths)
true_labels = batch.label
predicted_labels = torch.tensor([[1 if value > self.label_threshold else 0 for value in sample] for sample in prediction])
loss = F.binary_cross_entropy(prediction, true_labels)
val_epoch_loss += loss.item()
# Calculate and record f1 score and accuracy
if calculate_f1:
f1 = f1_score(predicted_labels, true_labels.cpu(), average='micro', zero_division=1)
val_epoch_f1 += f1
accuracy = accuracy_score(predicted_labels, true_labels.cpu())
val_epoch_accuracy += accuracy
# Record epoch's average training loss, f1 and accuracy
val_loss_over_epochs.append(val_epoch_loss/batch_count)
if calculate_f1:
val_f1_over_epochs.append(val_epoch_f1/batch_count)
val_accuracy_over_epochs.append(val_epoch_accuracy/batch_count)
if print_progress and self.verbose > 0:
# Print every n epochs
if epoch % 1 == 0:
print("Epoch " + str(epoch) + ") ")
print(colored("Train loss:", attrs=['underline']) + " "
+ str(round(train_loss_over_epochs[-1], 3))
+ " ----- "
+ colored("Validation loss:", attrs=['underline']) + " "
+ str(round(val_loss_over_epochs[-1], 3)))
print(colored("Train accuracy:", attrs=['underline']) + " "
+ str(round(train_accuracy_over_epochs[-1] * 100, 3)) + "%"
+ " ----- "
+ colored("Validation accuracy:", attrs=['underline']) + " "
+ str(round(val_accuracy_over_epochs[-1] * 100, 3)) + "%")
if calculate_f1:
print(colored("Train F1 score:", attrs=['underline']) + " "
+ str(round(train_f1_over_epochs[-1], 3))
+ " ----- "
+ colored("Validation F1 score:", attrs=['underline']) + " "
+ str(round(val_f1_over_epochs[-1], 3)))
# Save the best model state to file
if best_model_file:
if self.best_train_measure == "loss":
if val_epoch_loss < best_score:
best_score = val_epoch_loss
torch.save(model.state_dict(), best_model_file)
if self.verbose > 0:
print("Reached best validation loss at epoch " + str(epoch) + ", saved model to " + str(best_model_file))
else:
if val_epoch_accuracy > best_score:
best_score = val_epoch_accuracy
torch.save(model.state_dict(), best_model_file)
if self.verbose > 0:
print("Reached best validation accuracy at epoch " + str(epoch) + ", saved model to " + str(best_model_file))
if final_model_file:
torch.save(model.state_dict(), final_model_file)
if self.verbose > 0:
print("Saved final model state to " + str(final_model_file) + " after " + str(epoch) + " epochs")
# Record scores in a dictionary to be returned
metrics = {'train_loss': train_loss_over_epochs, 'validation_loss': val_loss_over_epochs,
'train_f1': train_f1_over_epochs, 'validation_f1': val_f1_over_epochs,
'train_accuracy': train_accuracy_over_epochs, 'validation_accuracy': val_accuracy_over_epochs}
# Remove empty lists
metrics = {key:val for key,val in metrics.items() if val}
return metrics
def predict(self, x, y=None):
if self.verbose > 0:
print(colored("Called Model Transformer Predict", color="yellow", attrs=['bold', 'underline']))
if self.model_weights_file:
# Load the best model weights
self.model.load_state_dict(torch.load(self.model_weights_file))
if self.verbose > 0:
print("State of model loaded from " + str(self.model_weights_file))
# Set the test data, model arguments and TorchText TEXT field
test_data, preset_model_kwargs = x
# Create bucket iterator for mini-batching
test_iterator = data.BucketIterator(test_data,
sort_key=lambda x: len(x.text),
batch_size=self.batch_size,
sort_within_batch=True,
shuffle=True,
device=self.device)
# Record loss
test_loss, test_f1, test_accuracy = 0, 0, 0
# Record all predictions
all_predicted = []
all_true = []
# Put model into evaluation mode
self.model.eval()
batch_count = 0
# Disable gradient calculation
with torch.no_grad():
for batch in test_iterator:
batch_count += 1
text, text_lengths = batch.text
# Make prediction for label probabilities
prediction = self.model(text, text_lengths)
true_labels = batch.label
# Find the top predicted labels from probabilities
predicted_labels = torch.tensor([[1 if value > self.label_threshold else 0 for value in sample] for sample in prediction])
# Calculate loss
loss = F.binary_cross_entropy(prediction, true_labels)
test_loss += loss.item()
if not 1 in predicted_labels:
# Prevent no labels being predicted
best_label = prediction.argmax(1)[0].item()
predicted_labels[0][best_label] = 1
for pred, true in zip(predicted_labels, true_labels.cpu()):
all_predicted.append(np.array(pred))
all_true.append(np.array(true))
f1 = f1_score(predicted_labels, true_labels.cpu(), average='micro', zero_division=1)
accuracy = accuracy_score(predicted_labels, true_labels.cpu())
test_f1 += f1
test_accuracy += accuracy
# Confusion matrix
conf_matrix = multilabel_confusion_matrix(all_predicted, all_true)
# Calculate average loss, f1 and accuracy across all batches
test_loss = test_loss/batch_count
test_f1 = test_f1/batch_count
test_accuracy = test_accuracy/batch_count
if self.verbose > 0:
print(colored("Test Loss:", attrs=['underline']) + " " + str(round(test_loss, 3)))
print(colored("Test F1:", attrs=['underline']) + " " + str(round(test_f1, 3)))
print(colored("Test Accuracy:", attrs=['underline']) + " " + str(round(test_accuracy, 3) * 100) + "%")
if self.verbose > 1:
self.plot_confusion_matrix(conf_matrix)
metrics = {"loss": test_loss, "f1": test_f1, "accuracy": test_accuracy, "confusion_matrix": conf_matrix}
return metrics
def plot_confusion_matrix(self, confusion_matrix):
# Get the labels
unique_genres = self.binary_encoder.classes
# Create subplots
fig, ax = plt.subplots(math.ceil(len(unique_genres) / 4), 4, figsize=(12, math.ceil(len(unique_genres))), constrained_layout=True)
# Plot each confusion matrix
for matrix, axis, gen in zip(confusion_matrix, ax.flatten(), unique_genres):
# Create a plot for each genre
matrix_df = pd.DataFrame(matrix, index=["No", "Yes"], columns=["No", "Yes"])
heatmap = sns.heatmap(matrix_df, annot=True, fmt="d", cbar=False, ax=axis)
axis.set_title("Confusion Matrix for " + gen, fontsize=12)
axis.set_ylabel('True label')
axis.set_xlabel('Predicted label')
# Hide unused axes
for i in range(1, (math.ceil(len(unique_genres) / 4) * 4) - len(unique_genres) + 1):
ax[-1, -1 * i].axis('off')
# Display the graph
plt.show()
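For reference, the probability-to-label conversion used in train() and predict() works like this on a toy prediction tensor (the probabilities below are made up):
# Toy illustration of converting predicted probabilities to multi-hot labels with a 0.5 threshold
toy_threshold = 0.5
toy_prediction = torch.tensor([[0.71, 0.08, 0.62], [0.30, 0.21, 0.45]])
toy_labels = torch.tensor([[1 if value > toy_threshold else 0 for value in sample] for sample in toy_prediction])
print(toy_labels)  # tensor([[1, 0, 1], [0, 0, 0]]) - the second film gets no genres at this threshold
# predict() guards against an all-zero batch by forcing the single highest-probability genre to 1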
Set the model and optimiser parameters:
- classifier_kwargs holds the model arguments
- adam_kwargs holds the optimiser arguments
- k_fold_kwargs holds the arguments for k-fold cross validation
- training_kwargs holds the arguments for training the model
- TEXT_file is the file name to save the TEXT field
classifier_kwargs = {'dropout': 0.7, 'activation': "sigmoid", 'n_hidden': 120,
'bidirectional': True, 'batch_norm': True,
'model_kwargs_file': "New_Model/model_kwargs.pickle"}
adam_kwargs = {'weight_decay': 0, 'amsgrad': False}
k_fold_kwargs = {'use_k_folds': True, 'n_folds': 5, 'k_fold_epochs': 5}
training_kwargs = {'batch_size': 32, 'label_threshold': 0.5,
'best_train_measure': "accuracy", 'training_epochs': 50,
'calculate_f1': False, 'model_weights_file': "New_Model/trained_model.pt",
'binary_encoder_file': "New_Model/binary_encoder.pickle"}
TEXT_file = "New_Model/TEXT.Field"
Construct the pipeline.
Setting verbose determines how much of the process to display:
- verbose=0 prints nothing
- verbose=1 prints the steps of the process in the pipeline
- verbose=2 prints the steps as well as dataframes and graphs
# Process the descriptions and genres, remove all other columns
preprocessor = ColumnTransformer(
transformers=[('description_column', DescriptionTransformer(stop_words=nltk.corpus.stopwords.words('english'), verbose=2), ['description']),
('genre_column', GenreTransformer(verbose=2), ['genre'])],
remainder='drop')
# Create the pipeline
pipeline = Pipeline(steps=[('preprocess', preprocessor),
('postprocess', PostprocessorTransformer(min_words=10, TEXT_field_file=TEXT_file, verbose=2)),
('model', ModelTransformer(**classifier_kwargs, **adam_kwargs, **k_fold_kwargs, **training_kwargs, verbose=2))])
Run the pipeline to process the descriptions and genres of the training data, then create and train a new model.
text_preprocessor_file = "New_Model/text_preprocessor.pickle"
# Run the pipeline
print(colored("Training Pipeline", attrs=['bold', 'underline']))
pipeline.fit_transform(train_samples)
# Save the preprocessor transformer to file so that it can be reused
pickle.dump(preprocessor.transformers[0][1], open(text_preprocessor_file, 'wb'))
text_preprocessor = pickle.load(open(text_preprocessor_file, 'rb'))
Training Pipeline Called Description Transformer Fit Set to use lemmatization Called Description Transformer Transform Processing description text
description | |
---|---|
61933 | berlin wall crumble katrine daughter norwegian... |
60228 | two teenage assassin accept think quick easy j... |
31964 | young peter difficult time adjust school confi... |
84905 | real life account deadly nipah virus outbreak ... |
39080 | stressed father bride secret smitten event pla... |
... | ... |
44324 | swindler wong consider king swindler hundred v... |
70003 | nick escape criminal past 2 partner get steady... |
28633 | jeff fbi agent send pick ray manta member whit... |
39315 | disillusioned filmmaker encounter young girl r... |
62886 | tory hedderman self center apathetic brood 16 ... |
27428 rows × 1 columns
Finshed processing all descriptions Called Genre Transformer Fit Saved label binary encoder to New_Model/binary_encoder.pickle Label distribution Action: 6031 Adventure: 4286 Comedy: 8462 Crime: 4922 Drama: 12456 Family: 2948 Fantasy: 2898 Horror: 4851 Mystery: 3433 Romance: 4682 Sci-Fi: 2829 Thriller: 5281
Called Genre Transformer Transform
Converting genres to multi-hot labels
genre | |
---|---|
61933 | [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1] |
60228 | [1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0] |
31964 | [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] |
84905 | [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1] |
39080 | [0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0] |
... | ... |
44324 | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |
70003 | [0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0] |
28633 | [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0] |
39315 | [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |
62886 | [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0] |
27428 rows × 1 columns
Finished processing all labels Called Postprocessor Transformer Fit Called Postprocessor Transformer Tranform Removing samples with short descriptions Creating TorchText dataset Setting trained word embeddings Saved the TorchText TEXT field to file New_Model/TEXT.Field Unique tokens in TEXT vocabulary: 30002 Extract of TEXT vocab: ['<unk>', '<pad>', 's', 'young', 'man', 'find', 'life', 'one', 'two', 'woman', 'love', 'get', 'year', 'friend', 'take', 'family', 'new', 'go', 'girl', 'old', 'murder', 'story', 'become', 'world', 'try', 'be', 'live', 'father', 'time', 'make', 'group', 'wife', 'must', 'meet', 'help', 'discover', 'town', 'come', 'fall', 'day', 'boy', 'work', 'kill', 'three', 'turn', 'force', 'film', 'set', 'home', 'death', 'mother', 'school', 'son', 'daughter', 'small', 'city', 'war', 'back', 'brother', 'police', 'leave', 'decide', 'people', 'mysterious', 'killer', 'child', 'return', 'way', 'lead', 'begin', 'secret', 'want', 'good', 'save', 'house', 'start', 'not', 'student', 'run', 'fight', 'know', 'night', 'order', 'plan', 'name', 'use', 'american', 'gang', 'high', 'couple', 'end', 'lose', 'send', 'husband', 'together', 'first', 'escape', 'evil', 'team', 'die'] Most common tokens: [('s', 4952), ('young', 3375), ('man', 2955), ('find', 2884), ('life', 2777), ('one', 2300), ('two', 2206), ('woman', 2192), ('love', 2057), ('get', 2052), ('year', 2017), ('friend', 1910), ('take', 1884), ('family', 1809), ('new', 1763), ('go', 1704), ('girl', 1676), ('old', 1606), ('murder', 1546), ('story', 1539)]
Description | Label | |
---|---|---|
ID | ||
0 | berlin wall crumble katrine daughter norwegian... | [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1] |
1 | two teenage assassin accept think quick easy j... | [1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0] |
2 | young peter difficult time adjust school confi... | [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] |
3 | real life account deadly nipah virus outbreak ... | [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1] |
4 | stressed father bride secret smitten event pla... | [0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0] |
... | ... | ... |
27423 | swindler wong consider king swindler hundred v... | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |
27424 | nick escape criminal past 2 partner get steady... | [0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0] |
27425 | jeff fbi agent send pick ray manta member whit... | [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0] |
27426 | disillusioned filmmaker encounter young girl r... | [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |
27427 | tory hedderman self center apathetic brood 16 ... | [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0] |
24914 rows × 2 columns
Completed post-processing
Called Model Transformer Fit
Saved model parameters to New_Model/model_kwargs.pickle
Loaded binary encoder from New_Model/binary_encoder.pickle
Called Model Transformer Transform
Splitting data for training and validation
Running K-Fold Cross Validation
Completed k-fold cross validation, highest accuracy was 9.409% after 5 epochs
Samples in train data: 17440
Samples in validation data: 7474
Creating a new classifier
FilmClassifierLSTM(
  (dropout): Dropout(p=0.7, inplace=False)
  (embedding): Embedding(30002, 100, padding_idx=1)
  (lstm): LSTM(100, 120, num_layers=2, dropout=0.7, bidirectional=True)
  (batchnorm): BatchNorm1d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear): Linear(in_features=240, out_features=12, bias=True)
  (activation): Sigmoid()
)
Initialising optimizer
Adam (
  Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)
Training Model
Epoch 0) Train loss: 0.531 ----- Validation loss: 0.409    Train accuracy: 2.099% ----- Validation accuracy: 5.035%
Reached best validation accuracy at epoch 0, saved model to New_Model/trained_model.pt
Epoch 1) Train loss: 0.426 ----- Validation loss: 0.393    Train accuracy: 3.83% ----- Validation accuracy: 6.501%
Reached best validation accuracy at epoch 1, saved model to New_Model/trained_model.pt
Epoch 2) Train loss: 0.409 ----- Validation loss: 0.38    Train accuracy: 4.925% ----- Validation accuracy: 7.022%
Reached best validation accuracy at epoch 2, saved model to New_Model/trained_model.pt
Epoch 3) Train loss: 0.396 ----- Validation loss: 0.376    Train accuracy: 6.279% ----- Validation accuracy: 7.783%
Reached best validation accuracy at epoch 3, saved model to New_Model/trained_model.pt
Epoch 4) Train loss: 0.389 ----- Validation loss: 0.376    Train accuracy: 7.047% ----- Validation accuracy: 7.876%
Reached best validation accuracy at epoch 4, saved model to New_Model/trained_model.pt
Epoch 5) Train loss: 0.381 ----- Validation loss: 0.368    Train accuracy: 7.58% ----- Validation accuracy: 8.851%
Reached best validation accuracy at epoch 5, saved model to New_Model/trained_model.pt
Epoch 6) Train loss: 0.374 ----- Validation loss: 0.368    Train accuracy: 8.257% ----- Validation accuracy: 9.679%
Reached best validation accuracy at epoch 6, saved model to New_Model/trained_model.pt
Epoch 7) Train loss: 0.368 ----- Validation loss: 0.367    Train accuracy: 8.997% ----- Validation accuracy: 9.599%
Epoch 8) Train loss: 0.362 ----- Validation loss: 0.362    Train accuracy: 9.616% ----- Validation accuracy: 9.095%
Epoch 9) Train loss: 0.356 ----- Validation loss: 0.366    Train accuracy: 10.361% ----- Validation accuracy: 10.35%
Reached best validation accuracy at epoch 9, saved model to New_Model/trained_model.pt
Epoch 10) Train loss: 0.352 ----- Validation loss: 0.365    Train accuracy: 10.9% ----- Validation accuracy: 9.909%
Epoch 11) Train loss: 0.346 ----- Validation loss: 0.363    Train accuracy: 11.365% ----- Validation accuracy: 10.638%
Reached best validation accuracy at epoch 11, saved model to New_Model/trained_model.pt
Epoch 12) Train loss: 0.342 ----- Validation loss: 0.363    Train accuracy: 11.743% ----- Validation accuracy: 10.531%
Epoch 13) Train loss: 0.338 ----- Validation loss: 0.364    Train accuracy: 12.729% ----- Validation accuracy: 11.132%
Reached best validation accuracy at epoch 13, saved model to New_Model/trained_model.pt
Epoch 14) Train loss: 0.332 ----- Validation loss: 0.366    Train accuracy: 13.205% ----- Validation accuracy: 10.894%
Epoch 15) Train loss: 0.329 ----- Validation loss: 0.365    Train accuracy: 13.779% ----- Validation accuracy: 11.255%
Reached best validation accuracy at epoch 15, saved model to New_Model/trained_model.pt
Epoch 16) Train loss: 0.325 ----- Validation loss: 0.365    Train accuracy: 14.547% ----- Validation accuracy: 11.282%
Reached best validation accuracy at epoch 16, saved model to New_Model/trained_model.pt
Epoch 17) Train loss: 0.32 ----- Validation loss: 0.369    Train accuracy: 15.109% ----- Validation accuracy: 10.924%
Epoch 18) Train loss: 0.315 ----- Validation loss: 0.369    Train accuracy: 15.734% ----- Validation accuracy: 11.255%
Epoch 19) Train loss: 0.312 ----- Validation loss: 0.371    Train accuracy: 16.302% ----- Validation accuracy: 11.602%
Reached best validation accuracy at epoch 19, saved model to New_Model/trained_model.pt
Epoch 20) Train loss: 0.307 ----- Validation loss: 0.37    Train accuracy: 16.927% ----- Validation accuracy: 11.332%
Epoch 21) Train loss: 0.304 ----- Validation loss: 0.376    Train accuracy: 17.609% ----- Validation accuracy: 11.76%
Reached best validation accuracy at epoch 21, saved model to New_Model/trained_model.pt
Epoch 22) Train loss: 0.302 ----- Validation loss: 0.379    Train accuracy: 18.079% ----- Validation accuracy: 11.797%
Reached best validation accuracy at epoch 22, saved model to New_Model/trained_model.pt
Epoch 23) Train loss: 0.298 ----- Validation loss: 0.378    Train accuracy: 18.658% ----- Validation accuracy: 11.647%
Epoch 24) Train loss: 0.294 ----- Validation loss: 0.383    Train accuracy: 19.472% ----- Validation accuracy: 11.722%
Epoch 25) Train loss: 0.292 ----- Validation loss: 0.382    Train accuracy: 19.524% ----- Validation accuracy: 11.041%
Epoch 26) Train loss: 0.289 ----- Validation loss: 0.39    Train accuracy: 19.954% ----- Validation accuracy: 11.786%
Epoch 27) Train loss: 0.286 ----- Validation loss: 0.387    Train accuracy: 20.556% ----- Validation accuracy: 11.656%
Epoch 28) Train loss: 0.283 ----- Validation loss: 0.39    Train accuracy: 21.319% ----- Validation accuracy: 12.067%
Reached best validation accuracy at epoch 28, saved model to New_Model/trained_model.pt
Epoch 29) Train loss: 0.281 ----- Validation loss: 0.392    Train accuracy: 21.864% ----- Validation accuracy: 11.506%
Epoch 30) Train loss: 0.277 ----- Validation loss: 0.392    Train accuracy: 22.276% ----- Validation accuracy: 12.013%
Epoch 31) Train loss: 0.273 ----- Validation loss: 0.397    Train accuracy: 23.383% ----- Validation accuracy: 12.064%
Epoch 32) Train loss: 0.27 ----- Validation loss: 0.402    Train accuracy: 23.635% ----- Validation accuracy: 11.682%
Epoch 33) Train loss: 0.269 ----- Validation loss: 0.401    Train accuracy: 23.916% ----- Validation accuracy: 11.89%
Epoch 34) Train loss: 0.267 ----- Validation loss: 0.404    Train accuracy: 24.467% ----- Validation accuracy: 11.786%
Epoch 35) Train loss: 0.263 ----- Validation loss: 0.406    Train accuracy: 24.925% ----- Validation accuracy: 11.947%
Epoch 36) Train loss: 0.261 ----- Validation loss: 0.409    Train accuracy: 25.654% ----- Validation accuracy: 11.7%
Epoch 37) Train loss: 0.26 ----- Validation loss: 0.411    Train accuracy: 25.952% ----- Validation accuracy: 11.5%
Epoch 38) Train loss: 0.258 ----- Validation loss: 0.417    Train accuracy: 26.067% ----- Validation accuracy: 11.92%
Epoch 39) Train loss: 0.255 ----- Validation loss: 0.416    Train accuracy: 26.583% ----- Validation accuracy: 11.61%
Epoch 40) Train loss: 0.251 ----- Validation loss: 0.424    Train accuracy: 27.569% ----- Validation accuracy: 11.783%
Epoch 41) Train loss: 0.249 ----- Validation loss: 0.423    Train accuracy: 28.435% ----- Validation accuracy: 11.38%
Epoch 42) Train loss: 0.248 ----- Validation loss: 0.423    Train accuracy: 28.974% ----- Validation accuracy: 11.874%
Epoch 43) Train loss: 0.246 ----- Validation loss: 0.434    Train accuracy: 28.601% ----- Validation accuracy: 11.693%
Epoch 44) Train loss: 0.243 ----- Validation loss: 0.427    Train accuracy: 29.484% ----- Validation accuracy: 11.596%
Epoch 45) Train loss: 0.241 ----- Validation loss: 0.424    Train accuracy: 29.553% ----- Validation accuracy: 11.807%
Epoch 46) Train loss: 0.24 ----- Validation loss: 0.436    Train accuracy: 30.178% ----- Validation accuracy: 11.556%
Epoch 47) Train loss: 0.238 ----- Validation loss: 0.441    Train accuracy: 30.808% ----- Validation accuracy: 11.553%
Epoch 48) Train loss: 0.234 ----- Validation loss: 0.444    Train accuracy: 31.56% ----- Validation accuracy: 11.706%
Epoch 49) Train loss: 0.234 ----- Validation loss: 0.447    Train accuracy: 32.104% ----- Validation accuracy: 11.363%
Saved final model state to New_Model/final_model.pt after 49 epochs
Finished Model Training
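The log above follows a checkpoint-on-best-validation pattern: the model is evaluated after every epoch, its state is saved to trained_model.pt whenever validation accuracy improves, and the state after the last epoch is written separately to final_model.pt. Below is a minimal, self-contained sketch of that pattern only; the Sketch_Model paths, the nn.Linear stand-in and the run_epoch placeholder are illustrative assumptions, not the pipeline's actual training code.
import os
import random
import torch
import torch.nn as nn

os.makedirs("Sketch_Model", exist_ok=True)
model = nn.Linear(4, 2)  # stand-in for FilmClassifierLSTM

def run_epoch(train):
    # Placeholder that returns made-up (loss, accuracy) values
    return random.random(), random.random()

best_val_acc = 0.0
for epoch in range(3):
    train_loss, train_acc = run_epoch(train=True)
    val_loss, val_acc = run_epoch(train=False)
    print(f"Epoch {epoch}) Train loss: {train_loss:.3f} ----- Validation loss: {val_loss:.3f}")
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # Keep the weights from the epoch with the highest validation accuracy so far
        torch.save(model.state_dict(), "Sketch_Model/trained_model.pt")
# Separately keep whatever state the model has after the final epoch
torch.save(model.state_dict(), "Sketch_Model/final_model.pt")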
If a new model has just been trained, the model state to test can be either trained_model.pt (the epoch with the best validation accuracy) or final_model.pt (the state after the final epoch).
test_model_file = 'New_Model/final_model.pt'
pipeline['model'].model_weights_file = test_model_file
Test the model on the testing data.
print(colored("Testing Pipeline", attrs=['bold', 'underline']))
test_scores = pipeline.predict(test_samples)
Testing Pipeline
Called Description Transformer Transform
Processing description text
description | |
---|---|
84163 | romantic getaway indian wilderness couple los ... |
80142 | tamara miller plan weekend lake getaway lifelo... |
2440 | runaway bride undercover reporter get catch po... |
17549 | satanic cult kidnap 3 young people priest dona... |
58459 | challenge create song music video three musica... |
... | ... |
22916 | two con artist brother attempt swindle soon we... |
49056 | russian english subtitle stunningly beautiful ... |
63082 | continuation k anime girl band school tea time... |
16041 | attractive young hitchhiker name ginger meet t... |
78855 | max assign white house charlie secret service ... |
6858 rows × 1 columns
Finished processing all descriptions
Called Genre Transformer Transform
Converting genres to multi-hot labels
genre | |
---|---|
84163 | [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1] |
80142 | [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1] |
2440 | [0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0] |
17549 | [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] |
58459 | [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0] |
... | ... |
22916 | [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0] |
49056 | [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0] |
63082 | [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |
16041 | [0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0] |
78855 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] |
6858 rows × 1 columns
Finished processing all labels
Called Postprocessor Transformer Transform
Removing samples with short descriptions
Creating TorchText dataset
Setting trained word embeddings
Saved the TorchText TEXT field to file New_Model/TEXT.Field
Unique tokens in TEXT vocabulary: 30002
Extract of TEXT vocab: ['<unk>', '<pad>', 's', 'young', 'man', 'find', 'life', 'one', 'two', 'woman', 'love', 'get', 'year', 'friend', 'take', 'family', 'new', 'go', 'girl', 'old', 'murder', 'story', 'become', 'world', 'try', 'be', 'live', 'father', 'time', 'make', 'group', 'wife', 'must', 'meet', 'help', 'discover', 'town', 'come', 'fall', 'day', 'boy', 'work', 'kill', 'three', 'turn', 'force', 'film', 'set', 'home', 'death', 'mother', 'school', 'son', 'daughter', 'small', 'city', 'war', 'back', 'brother', 'police', 'leave', 'decide', 'people', 'mysterious', 'killer', 'child', 'return', 'way', 'lead', 'begin', 'secret', 'want', 'good', 'save', 'house', 'start', 'not', 'student', 'run', 'fight', 'know', 'night', 'order', 'plan', 'name', 'use', 'american', 'gang', 'high', 'couple', 'end', 'lose', 'send', 'husband', 'together', 'first', 'escape', 'evil', 'team', 'die']
Most common tokens: [('s', 4952), ('young', 3375), ('man', 2955), ('find', 2884), ('life', 2777), ('one', 2300), ('two', 2206), ('woman', 2192), ('love', 2057), ('get', 2052), ('year', 2017), ('friend', 1910), ('take', 1884), ('family', 1809), ('new', 1763), ('go', 1704), ('girl', 1676), ('old', 1606), ('murder', 1546), ('story', 1539)]
Description | Label | |
---|---|---|
ID | ||
0 | berlin wall crumble katrine daughter norwegian... | [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1] |
1 | two teenage assassin accept think quick easy j... | [1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0] |
2 | young peter difficult time adjust school confi... | [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] |
3 | real life account deadly nipah virus outbreak ... | [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1] |
4 | stressed father bride secret smitten event pla... | [0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0] |
... | ... | ... |
27423 | swindler wong consider king swindler hundred v... | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |
27424 | nick escape criminal past 2 partner get steady... | [0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0] |
27425 | jeff fbi agent send pick ray manta member whit... | [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0] |
27426 | disillusioned filmmaker encounter young girl r... | [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |
27427 | tory hedderman self center apathetic brood 16 ... | [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0] |
24914 rows × 2 columns
Completed post-processing
Called Model Transformer Predict
State of model loaded from New_Model/final_model.pt
Test Loss: 0.235
Test F1: 0.752
Test Accuracy: 42.0%
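For reference, F1 and accuracy over multi-hot labels like these can be computed with the scikit-learn metrics imported at the top of the notebook. The snippet below is a small sketch with made-up labels; the micro averaging and the use of subset accuracy are assumptions, and the pipeline's own scoring may be computed differently.
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# Made-up multi-hot ground truth and thresholded predictions (3 samples, 12 genres)
y_true = np.zeros((3, 12), dtype=int)
y_pred = np.zeros((3, 12), dtype=int)
y_true[0, [2, 4]] = 1; y_pred[0, [2, 4]] = 1   # exact match
y_true[1, [7]] = 1;    y_pred[1, [7, 11]] = 1  # one extra predicted genre
y_true[2, [0, 3]] = 1; y_pred[2, [0]] = 1      # one missed genre

print("Micro F1:", f1_score(y_true, y_pred, average='micro'))  # 0.8 for this toy example
# With multi-label input, accuracy_score is subset accuracy: all 12 labels of a sample must match
print("Subset accuracy:", accuracy_score(y_true, y_pred))      # 1/3 for this toy example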
def text_to_genres(text, label_threshold=0.5, model_kwargs_file='New_Model/model_kwargs.pickle',
                   model_weights_file='New_Model/trained_model.pt', binary_encoder_file='New_Model/binary_encoder.pickle',
                   TEXT_field_file="New_Model/TEXT.Field", text_preprocessor_file="New_Model/text_preprocessor.pickle"):
    # Load the text preprocessor transformer
    text_preprocessor = pickle.load(open(text_preprocessor_file, 'rb'))
    # Load the multi-hot binary encoder
    binary_encoder = pickle.load(open(binary_encoder_file, 'rb'))
    # Load TorchText TEXT field
    TEXT = dill.load(open(TEXT_field_file, "rb"))
    # Load the model parameters
    model_kwargs = pickle.load(open(model_kwargs_file, 'rb'))
    # Determine device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Convert text into a dataframe to be compatible with the preprocessor
    text_df = pd.DataFrame(data=[text], columns=["description"])
    # Process the text
    text_preprocessor.verbose = 0
    processed_text = text_preprocessor.transform(text_df)
    # Convert back to string
    processed_text = str(processed_text.values[0][0])
    # Get indexes of tokens
    token_indexes = [TEXT.vocab.stoi[token] for token in processed_text.split()]
    # Convert indexes to tensor
    token_tensor = torch.LongTensor(token_indexes).to(device)
    # Add extra dimension to shape to replicate batch
    token_tensor = token_tensor.unsqueeze(1)
    # Get the length of the text
    length_tensor = torch.LongTensor([len(token_indexes)])
    # Create the model
    model = FilmClassifierLSTM(**model_kwargs)
    # Set device
    model = model.to(device)
    # Load the model weights from file, mapping to the current device in case they were saved on GPU
    model.load_state_dict(torch.load(model_weights_file, map_location=device))
    # Set model to evaluation mode
    model.eval()
    # Make a prediction without tracking gradients
    with torch.no_grad():
        prediction = model(token_tensor, length_tensor)
    # Convert model outputs to binary labels, then to genres
    predicted_labels = torch.tensor([[1 if value > label_threshold else 0 for value in sample] for sample in prediction])
    if 1 not in predicted_labels:
        # Prevent no labels being predicted
        best_label = prediction.argmax(1)[0].item()
        predicted_labels[0][best_label] = 1
    # Record the prediction score for each predicted genre
    predicted_categories_scores = []
    for idx in range(len(predicted_labels[0])):
        if predicted_labels[0][idx].item() == 1:
            predicted_categories_scores.append(prediction[0][idx].item())
    # Fit the encoder so it can be used
    binary_encoder.fit(binary_encoder.classes)
    # Convert the labels from binary to genres
    predicted_categories = binary_encoder.inverse_transform(predicted_labels.cpu())
    predicted_categories = list(predicted_categories[0])
    return predicted_categories, predicted_categories_scores
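The thresholding step in text_to_genres turns each sigmoid output into a genre flag and, if nothing clears the threshold, falls back to the single highest-scoring genre. With made-up scores (a hypothetical example, not actual model output) the behaviour looks like this:
import torch

label_threshold = 0.5
# Made-up sigmoid outputs for one description (12 genres, none above the threshold)
prediction = torch.tensor([[0.21, 0.08, 0.47, 0.35, 0.12, 0.30, 0.10, 0.05, 0.18, 0.26, 0.09, 0.11]])
predicted_labels = (prediction > label_threshold).long()
if 1 not in predicted_labels:
    # No genre cleared the threshold, so keep the single best-scoring one
    predicted_labels[0][prediction.argmax(1)[0].item()] = 1
print(predicted_labels)  # only index 2 (score 0.47, the maximum) is set to 1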
Enter an IMDb film / series description and see the predicted genre(s) from the newly trained model. For example:
weights_file = 'New_Model/final_model.pt'
descriptions = ["The Avengers and their allies must be willing to sacrifice all in an attempt to defeat the powerful Thanos before his blitz of devastation and ruin puts an end to the universe.",
"A group of young adults visit a boarded up campsite named Crystal Lake where they soon encounter the mysterious Jason Voorhees and his deadly intentions.",
"Comedy following the exploits of Det. Jake Peralta and his diverse, lovable colleagues as they police the NYPD's 99th Precinct.",
"While navigating their careers in Los Angeles, a pianist and an actress fall in love while attempting to reconcile their aspirations for the future.",
"Early in his crime-solving career, Sherlock Holmes attempts to prevent Moriarty from cornering the heroin market.",
"Sheriff Deputy Rick Grimes wakes up from a coma to learn the world is in ruins and must lead a group of survivors to stay alive.",
"As a new threat to the galaxy rises, Rey, a desert scavenger, and Finn, an ex-stormtrooper, must join Han Solo and Chewbacca to search for the one hope of restoring peace.",
"A young woman, traumatized by a tragic event in her past, seeks out vengeance against those who crossed her path.",
"Pack up for a howling fun movie adventure filled with action, laughs, and tender moments as Kate and Humphrey take their pups on their first family vacation!",
"Stuck in a time loop, two wedding guests develop a budding romance while living the same day over and over again.",
"The brash James T. Kirk tries to live up to his father's legacy with Mr. Spock keeping him in check as a vengeful Romulan from the future creates black holes to destroy the Federation one planet at a time."]
for desc in descriptions:
    print(desc)
    pred_genres, pred_scores = text_to_genres(desc)
    print(pred_genres, pred_scores)
    print("\n")
The Avengers and their allies must be willing to sacrifice all in an attempt to defeat the powerful Thanos before his blitz of devastation and ruin puts an end to the universe.
['Action', 'Adventure', 'Sci-Fi'] [0.8746257424354553, 0.6552037596702576, 0.7095993161201477]

A group of young adults visit a boarded up campsite named Crystal Lake where they soon encounter the mysterious Jason Voorhees and his deadly intentions.
['Horror', 'Thriller'] [0.9746958613395691, 0.5291595458984375]

Comedy following the exploits of Det. Jake Peralta and his diverse, lovable colleagues as they police the NYPD's 99th Precinct.
['Comedy', 'Crime'] [0.9565695524215698, 0.7569046020507812]

While navigating their careers in Los Angeles, a pianist and an actress fall in love while attempting to reconcile their aspirations for the future.
['Drama', 'Romance'] [0.9068241715431213, 0.8177892565727234]

Early in his crime-solving career, Sherlock Holmes attempts to prevent Moriarty from cornering the heroin market.
['Crime', 'Mystery'] [0.8605350852012634, 0.5884705781936646]

Sheriff Deputy Rick Grimes wakes up from a coma to learn the world is in ruins and must lead a group of survivors to stay alive.
['Drama'] [0.6493645906448364]

As a new threat to the galaxy rises, Rey, a desert scavenger, and Finn, an ex-stormtrooper, must join Han Solo and Chewbacca to search for the one hope of restoring peace.
['Action', 'Sci-Fi'] [0.907088577747345, 0.923986554145813]

A young woman, traumatized by a tragic event in her past, seeks out vengeance against those who crossed her path.
['Drama'] [0.9254385232925415]

Pack up for a howling fun movie adventure filled with action, laughs, and tender moments as Kate and Humphrey take their pups on their first family vacation!
['Comedy', 'Family'] [0.9213259220123291, 0.8006343245506287]

Stuck in a time loop, two wedding guests develop a budding romance while living the same day over and over again.
['Comedy', 'Romance'] [0.901840090751648, 0.9206058979034424]

The brash James T. Kirk tries to live up to his father's legacy with Mr. Spock keeping him in check as a vengeful Romulan from the future creates black holes to destroy the Federation one planet at a time.
['Action', 'Adventure', 'Sci-Fi'] [0.8612030148506165, 0.6572191119194031, 0.9176497459411621]
Set which model state the web application should use. This will overwrite the saved model state and text processor in the web application's directory so that the app is updated automatically.
If use_trained = True, then the newly trained model will be loaded. If use_trained = False, then the state of the best trained model will be loaded.
use_trained = True
if use_trained:
    # Set the text preprocessor transformer
    copyfile(text_preprocessor_file, '../Web_App/flaskr/static/text_preprocessor.pickle')
    # Set the multi-hot binary encoder
    copyfile(training_kwargs.get("binary_encoder_file"), '../Web_App/flaskr/static/binary_encoder.pickle')
    # Set the TorchText TEXT field
    copyfile(TEXT_file, '../Web_App/flaskr/static/TEXT.Field')
    # Set the model parameters
    copyfile(classifier_kwargs.get("model_kwargs_file"), '../Web_App/flaskr/static/model_kwargs.pickle')
    # Set the model weights
    copyfile(training_kwargs.get("model_weights_file"), '../Web_App/flaskr/static/trained_model.pt')
    print("Updated web server files to use new trained model")
else:
    # Set the text preprocessor transformer
    copyfile('Best_Model/text_preprocessor.pickle', '../Web_App/flaskr/static/text_preprocessor.pickle')
    # Set the multi-hot binary encoder
    copyfile('Best_Model/binary_encoder.pickle', '../Web_App/flaskr/static/binary_encoder.pickle')
    # Set the TorchText TEXT field
    copyfile('Best_Model/TEXT.Field', '../Web_App/flaskr/static/TEXT.Field')
    # Set the model parameters
    copyfile('Best_Model/model_kwargs.pickle', '../Web_App/flaskr/static/model_kwargs.pickle')
    # Set the model weights
    copyfile('Best_Model/trained_model.pt', '../Web_App/flaskr/static/trained_model.pt')
    print("Updated web server files to use best model")
Updated web server files to use new trained model
To run the web server, open the command line and type python followed by the full path to Web_App/flaskr/main.py, for example python user/NLP_Project/Web_App/flaskr/main.py. Then visit http://127.0.0.1:5000/. Alternatively, if this does not work, the application can be run from a Python IDE such as PyCharm using the conda environment as the interpreter.
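Once the server is running, a quick way to confirm it is reachable is to request the local address from Python. This is a hypothetical smoke test; it assumes the requests package is installed and the server has already been started.
import requests

# Hypothetical smoke test against the locally running Flask app
response = requests.get("http://127.0.0.1:5000/")
print(response.status_code)  # 200 means the front page is being served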