Code by TomMakesThings

About

In this notebook, Latent Dirichlet allocation (LDiA) and Latent semantic analysis (LSA) topic modelling are performed on film descriptions from IMDb. Although these topic modelling algorithms do not classify descriptions into genres in the same way as supervised algorithms, such as an LSTM classifier, they provide a method of dimensionality reduction and a new way of finding the most representative tokens for each film description. The topics created from LDiA and LSA could potentially be used as labels to train a supervised model through a semi-supervised approach.

Imports

IMDb Dataset

Open the dataset, drop irrelevant columns and remove samples with missing information.

This dataset can be downloaded from Kaggle: https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset
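As a rough sketch of this step (the exact filename and column names are assumptions about the Kaggle file), loading and cleaning might look like this:

```python
import pandas as pd

# Load the Kaggle CSV (assumed filename)
movies = pd.read_csv("IMDb movies.csv", low_memory=False)

# Keep only the columns needed for topic modelling (assumed column names)
movies = movies[["title", "genre", "description"]]

# Remove samples with missing genre or description
movies = movies.dropna(subset=["genre", "description"]).reset_index(drop=True)
print(movies.shape)
```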

Analysing the Dataset

Metrics

Each film contains between 1 and 3 genres stored in genre. However, they are stored in the same column, meaning some processing is required to separate them and count the true number. For example, "Drama", "Romance" and "Drama, Romance" would be counted as three different genre values, even though only two unique genres are present.
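A minimal sketch of this counting step, assuming the genre column holds comma-separated strings:

```python
from collections import Counter

# Count genres by splitting the comma-separated strings, so that "Drama",
# "Romance" and "Drama, Romance" are not treated as three distinct values
genre_counts = Counter()
for genre_string in movies["genre"]:
    for genre in genre_string.split(","):
        genre_counts[genre.strip()] += 1

print(f"{len(genre_counts)} unique genres")
print(genre_counts.most_common(10))
```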

Print information about the dataset:

Graphs

Word frequency distribution of a given genre

Plot the most common words for a given genre. The first graph includes all words, though it is dominated by stop words such as 'a', 'the', 'to'. The second graph has stop words removed. These words are more characteristic of the genre, e.g. "house" and "killer" for horror. The frequency distributions provide an interesting comparison against topics assigned through unsupervised topic modelling algorithms.
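A sketch of how such frequency plots could be produced with NLTK; the genre name and the top-30 cut-off are illustrative:

```python
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)

genre = "Horror"  # illustrative choice
descriptions = movies[movies["genre"].str.contains(genre)]["description"]

# Lower-case and tokenise every description of the chosen genre
tokens = [token.lower() for text in descriptions
          for token in word_tokenize(text) if token.isalpha()]

# All words: dominated by stop words such as "a", "the", "to"
FreqDist(tokens).plot(30)

# Stop words removed: words more characteristic of the genre
stop_words = set(stopwords.words("english"))
FreqDist([t for t in tokens if t not in stop_words]).plot(30)
```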

Label distribution

View the number of films belonging to each genre. Across all samples, the most common genres are Drama (26.7%), Comedy (16.4%) and Romance (8.0%). The least common is News, with only one sample across the whole dataset. Other rare genres include Adult and Documentary, which have two samples each, and Reality-TV, which has three. After these comes Film-Noir with 663. The rarest genres are removed later in the notebook as they do not provide enough samples to be representative.

Data Processing

Drop samples

As the dataset is large, samples are dropped so that the notebook can run in a reasonable time. n_samples specifies the maximum number of samples to use. Setting n_samples = 0 will cause all suitable samples to be used, though the notebook will be slow to run. For these experiments I have used 5000 samples, as this is enough data to perform topic modelling in a reasonable amount of time.

min_length specifies the minimum number of words in a description; samples with fewer words than this are removed.

Setting remove_rare_genres = True will remove genres with fewer than rare_count instances from the movies' labels. If a sample does not have a label for any remaining genre, it will be dropped. For example, if rare_count = 10, the genres [News, Adult, Documentary, Reality-TV] are removed.

max_genre_samples is the maximum number of samples of each genre allowed. For example if max_genre_samples = 100, a maximum of 100 samples are allowed for Comedy, another 100 for Drama etc. This can be turned off by setting max_genre_samples = 0.
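A minimal sketch of how these filters could be applied with pandas; the parameter names follow the notebook, while the implementation (and the reuse of the genre_counts computed earlier) is an assumption:

```python
n_samples = 5000          # 0 means use all suitable samples
min_length = 10           # illustrative minimum number of words in a description
remove_rare_genres = True
rare_count = 10           # genres with fewer than this many instances are dropped
max_genre_samples = 0     # 0 disables the per-genre cap

# Remove descriptions that are too short
movies = movies[movies["description"].str.split().str.len() >= min_length]

if remove_rare_genres:
    rare = {g for g, c in genre_counts.items() if c < rare_count}
    # Strip rare genres from each label list and drop samples left with no genre
    movies["genre"] = movies["genre"].apply(
        lambda s: ", ".join(g.strip() for g in s.split(",") if g.strip() not in rare))
    movies = movies[movies["genre"] != ""]

if max_genre_samples > 0:
    # Cap the number of samples per (first listed) genre
    movies = movies.groupby(movies["genre"].str.split(",").str[0]).head(max_genre_samples)

if n_samples > 0:
    movies = movies.sample(min(n_samples, len(movies)), random_state=42)
```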

Samples

Stop words

Create a custom stop word list movie_stop_words of the most common words across all movie descriptions.

Select which stop word list to use and print it. Set stop = 0 to use no pre-defined stop word list, stop = 1 to use NLTK, or any other value, e.g. stop = -1, for spaCy. Set add_custom_words = True to add the movie_stop_words to the stop words.

Here I have used NLTK to filter out common words that do not contribute to the meaning of the descriptions. This prevents unrepresentative words being picked up across all topics by the topic modelling algorithms.
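A sketch of how the custom list and the chosen pre-defined list could be combined; the top-50 cut-off and the spaCy model name are assumptions:

```python
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Most frequent words across all descriptions form the custom list
word_counts = Counter(token.lower()
                      for text in movies["description"]
                      for token in word_tokenize(text) if token.isalpha())
movie_stop_words = {word for word, _ in word_counts.most_common(50)}

stop = 1                 # 0: no pre-defined list, 1: NLTK, anything else: spaCy
add_custom_words = True

if stop == 0:
    stop_words = set()
elif stop == 1:
    stop_words = set(stopwords.words("english"))
else:
    import spacy
    stop_words = set(spacy.load("en_core_web_sm").Defaults.stop_words)

if add_custom_words:
    stop_words |= movie_stop_words
print(sorted(stop_words)[:20])
```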

Normalisation

Create a function to convert accented characters. For example, è becomes e in "Arsène Baudu and Hyacinthe, a pair of small-time crooks".
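A small sketch of such a helper using Python's built-in Unicode tools:

```python
import unicodedata

def remove_accents(text):
    # Decompose accented characters and drop the combining marks,
    # so "Arsène" becomes "Arsene"
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(c for c in decomposed if not unicodedata.combining(c))

print(remove_accents("Arsène Baudu and Hyacinthe, a pair of small-time crooks"))
```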

Load a model to expand contractions, e.g. can't becomes cannot in "They fall in love, but can't quite seem to get the timing right."
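The notebook loads a model for this step; as a stand-in, a small dictionary-based sketch (covering only a few illustrative contractions) behaves similarly:

```python
import re

# Hypothetical mapping; a real model or library would cover far more cases
CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'ll": " will",
    "'ve": " have",
}

def expand_contractions(text):
    # Replace longer patterns first so "can't" is not caught by the "n't" rule
    for pattern, replacement in CONTRACTIONS.items():
        text = re.sub(re.escape(pattern), replacement, text, flags=re.IGNORECASE)
    return text

print(expand_contractions("They fall in love, but can't quite seem to get the timing right."))
```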

Processing samples

Process the film descriptions and store them as description_docs:

  1. Normalisation by removing accents and contractions
  2. Split descriptions into sentences and perform tokenisation
  3. Remove punctuation
  4. Remove stop words (optional)
  5. Correct spelling (optional)
  6. Apply lemmatisation or stemming (optional)

Again, a filter removes samples containing fewer words than min_words. This is because removing stop words in the code above can leave some descriptions too short to be suitable for model training.
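A condensed sketch of this pipeline with NLTK, reusing the helpers above; the spelling-correction step is omitted and lemmatisation is chosen over stemming:

```python
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download("wordnet", quiet=True)
lemmatizer = WordNetLemmatizer()
min_words = 5  # illustrative threshold

description_docs = []
for text in movies["description"]:
    # 1-2. Normalise, then split into sentences and tokenise
    text = expand_contractions(remove_accents(text))
    tokens = [token.lower()
              for sentence in sent_tokenize(text)
              for token in word_tokenize(sentence)]
    # 3-4. Remove punctuation and stop words
    tokens = [t for t in tokens if t not in string.punctuation and t not in stop_words]
    # 6. Lemmatise each remaining token
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    # Drop descriptions that have become too short
    if len(tokens) >= min_words:
        description_docs.append(tokens)
```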

Unsupervised Topic Modelling

Convert the list of films into a form suitable for LDiA and LSA. A dictionary is constructed mapping each word to its integer id. This dictionary is used to construct a document-term matrix.
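A sketch of this step with gensim; the filter_extremes thresholds are illustrative:

```python
from gensim.corpora import Dictionary

# Map each word to an integer id
dictionary = Dictionary(description_docs)
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Bag-of-words document-term matrix used by both LDiA and LSA
corpus = [dictionary.doc2bow(doc) for doc in description_docs]
```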

Create a function to plot an interactive t-SNE graph.
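The notebook's plot is interactive; a non-interactive sketch of the same idea with scikit-learn and matplotlib:

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def topic_weight_matrix(model, corpus, n_topics):
    # Dense document x topic matrix; model[doc] works for both LdaModel and LsiModel
    weights = np.zeros((len(corpus), n_topics))
    for i, doc in enumerate(corpus):
        for topic_id, weight in model[doc]:
            weights[i, topic_id] = weight
    return weights

def plot_tsne(topic_weights, labels=None):
    # Project the per-document topic weights down to two dimensions
    embedding = TSNE(n_components=2, random_state=42).fit_transform(topic_weights)
    plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, s=5, cmap="tab10")
    plt.xlabel("t-SNE 1")
    plt.ylabel("t-SNE 2")
    plt.show()
```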

Define a function to find the best number of topics, k, for either LDiA or LSA.
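A sketch of such a search, assuming gensim models and c_v coherence; the function name and signature are not necessarily those used in the notebook:

```python
from gensim.models import CoherenceModel, LdaModel, LsiModel

def find_best_k(corpus, dictionary, texts, k_values, model_class=LdaModel):
    scores = {}
    for k in k_values:
        # Train a model with k topics and score it by coherence
        model = model_class(corpus=corpus, id2word=dictionary, num_topics=k)
        scores[k] = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                                   coherence="c_v").get_coherence()
    best_k = max(scores, key=scores.get)
    return best_k, scores

# Example: best_k, scores = find_best_k(corpus, dictionary, description_docs, range(2, 21))
```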

Latent Dirichlet Allocation (LDiA)

LDiA is a commonly used unsupervised machine learning algorithm for topic modelling. For this project, the aim is to find hidden clusters of similarity between some descriptions, and to find distinguishing features of others.

LDiA on expected number of topics

As I selected the top 7 most common genres, it makes sense to see how the unsupervised algorithms categorise films into this many topics. However, as this technique is unsupervised, topics do not always correspond to a genre. This can be observed by comparing the common words of each topic in the graph below to the frequency distribution of genres in an earlier section. LDiA is therefore picking up on hidden patterns that may not be apparent to a human.

  1. Perform LDiA on all samples, setting the number of topics as the number of unique genres
  2. Calculate the coherence score
  3. Plot the results
  4. Optimise the model parameters
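A sketch of steps 1-3 with gensim and pyLDAvis (step 4 is covered in the next subsection); the training parameters are illustrative:

```python
import pyLDAvis
import pyLDAvis.gensim_models
from gensim.models import CoherenceModel, LdaModel

n_topics = 7  # the number of genres kept after dropping rare ones

ldia_model = LdaModel(corpus=corpus, id2word=dictionary,
                      num_topics=n_topics, passes=10, random_state=42)

coherence = CoherenceModel(model=ldia_model, texts=description_docs,
                           dictionary=dictionary, coherence="c_v").get_coherence()
print(f"Coherence: {coherence:.3f}")

# Interactive topic visualisation with the relevance slider (λ) discussed below
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(ldia_model, corpus, dictionary)
```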

The graph above shows the similarities and differences between topics, along with their most relevant terms. While some tokens do overlap, adjusting the λ parameter shows that the topics all have distinct words.

The t-SNE graph below allows each film to be plotted in a lower-dimensional space using its most representative topic weights. The shape of the clusters reflects how the algorithm tries to keep similar films close together in the vector space.

Before optimisation, the majority of films share the same most relevant topic. After optimisation, however, the dominant topics are more varied. This difference can be observed by changing the parameter use_optimised_params from True to False.

Optimise LDiA's alpha and eta hyperparameters

The parameters optimal_alpha and optimal_eta used for the model above were found using the function optimise_LDiA below. When LDiA was run with the default parameters, it had a coherence score of 0.141, while with the best alpha-eta combination it achieved a higher score of 0.192.
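A sketch of what optimise_LDiA might do: a grid search over candidate alpha and eta values, keeping the pair with the best coherence. The candidate values here are illustrative, not those used in the notebook:

```python
from gensim.models import CoherenceModel, LdaModel

def optimise_LDiA(corpus, dictionary, texts, n_topics=7):
    candidates = [0.01, 0.1, 0.5, 1.0, "symmetric", "asymmetric"]
    best_alpha, best_eta, best_score = None, None, -1.0
    for alpha in candidates:
        for eta in candidates:
            if eta == "asymmetric":
                continue  # gensim does not accept an asymmetric eta prior
            model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics,
                             alpha=alpha, eta=eta, random_state=42)
            score = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                                   coherence="c_v").get_coherence()
            if score > best_score:
                best_alpha, best_eta, best_score = alpha, eta, score
    return best_alpha, best_eta, best_score
```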

LDiA experiment with different number of topics

The number of topics can vary, as LDiA does not specifically learn to associate a topic with a genre. The quality of topic modelling can be assessed by the coherence score, with higher being better. In this experiment, different numbers of topics are tried to find the best score.

Although there are only 7 genres, a larger number of topics (k > n) generally seems to yield a better score for LDiA.

When comparing the t-SNE graph below to the previous one with fewer topics, films now appear to be more split up and separated. This could be more useful for identifying characteristic features for categorising descriptions.

Latent Semantic Analysis (LSA)

LSA is a dimensionality reduction technique that uses singular value decomposition (SVD). Like LDiA, it can also be used for topic modelling.
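A sketch of fitting LSA with gensim's LsiModel on the same corpus and scoring it in the same way as LDiA:

```python
from gensim.models import CoherenceModel, LsiModel

# Truncated SVD over the bag-of-words corpus
lsa_model = LsiModel(corpus=corpus, id2word=dictionary, num_topics=7)

lsa_coherence = CoherenceModel(model=lsa_model, texts=description_docs,
                               dictionary=dictionary, coherence="c_v").get_coherence()
print(f"LSA coherence: {lsa_coherence:.3f}")
```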

LSA on expected number of topics

When using the desired number of topics, LSA achieves a far higher coherence score, suggesting that it has produced topics that are less semantically similar to one another than those produced by LDiA. This makes sense, as the LSA algorithm tries to separate samples, while LDiA keeps similar samples close together. This is also reflected by the shape of the clusters in the t-SNE plot.

Experiment with different number of topics

The number of topics can vary, as LSA does not specifically learn to associate a topic with a genre. The quality of topic modelling can be assessed by the coherence score, with higher being better. In this experiment, different numbers of topics are tried to find the best score.

The optimal number of topics for LSA is 2, as this achieved the highest coherence score.