Code by TomMakesThings
In this notebook, Latent Dirichlet allocation (LDiA) and Latent semantic analysis (LSA) topic modelling are performed on film descriptions from IMDb. Although these topic modelling algorithms do not classify descriptions into genres in the way supervised algorithms, such as an LSTM classifier, do, they provide a method of dimensionality reduction and a new way of finding the most representative tokens for each film description. The topics created from LDiA and LSA could potentially be used as labels to train a supervised model through a semi-supervised approach.
# !pip install scattertext
# !pip install pycontractions
# !pip install pyspellchecker
# nltk.download('stopwords')
import pandas as pd
import numpy as np
import spacy
import nltk
import scattertext
import unicodedata
import matplotlib.pyplot as plt
import string
import gensim
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensim_models
import plotly.express as px
import plotly.graph_objects as go
import re
from termcolor import colored
from sklearn.preprocessing import MultiLabelBinarizer
from scattertext import CorpusFromPandas, produce_scattertext_explorer
from IPython.display import IFrame
from IPython.core.display import display, HTML
from spacy import displacy
from nltk import FreqDist
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from matplotlib.pyplot import figure
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import CoherenceModel, LsiModel
from pycontractions import Contractions
from sklearn.manifold import TSNE
from spellchecker import SpellChecker
%matplotlib inline
display(HTML("<style>.container { width:98% !important; }</style>"))
# Ignore deprecation warning
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
Open the dataset, drop irrelevant columns and remove samples with missing information.
This dataset can be downloaded from Kaggle: https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset
# Read IMDb dataset
path = "IMDbMovies.csv.zip"
dataframe = pd.read_csv(path, low_memory=False)
# Read specific columns and drop any rows with NaN values
movies_data = dataframe[['title', 'description', 'genre']].dropna()
movies_data
title | description | genre | |
---|---|---|---|
0 | Miss Jerry | The adventures of a female reporter in the 1890s. | Romance |
1 | The Story of the Kelly Gang | True story of notorious Australian outlaw Ned ... | Biography, Crime, Drama |
2 | Den sorte drøm | Two men of high rank are both wooing the beaut... | Drama |
3 | Cleopatra | The fabled queen of Egypt's affair with Roman ... | Drama, History |
4 | L'Inferno | Loosely adapted from Dante's Divine Comedy and... | Adventure, Drama, Fantasy |
... | ... | ... | ... |
85848 | Pengalila | An unusual bond between a sixty year old Dalit... | Drama |
85849 | Manoharam | Manoharan is a poster artist struggling to fin... | Comedy, Drama |
85850 | Le lion | A psychiatric hospital patient pretends to be ... | Comedy |
85851 | De Beentjes van Sint-Hildegard | A middle-aged veterinary surgeon believes his ... | Comedy, Drama |
85854 | La vida sense la Sara Amat | Pep, a 13-year-old boy, is in love with a girl... | Drama |
83740 rows × 3 columns
Each film has between 1 and 3 genres stored in the genre column. Because they are stored in the same column, some processing is required to separate them and count the true number of unique genres. For example, "Drama", "Romance" and "Drama, Romance" would otherwise be counted as three different genres, even though only two unique genres are present.
def find_unique_genres(data):
    # Search all samples to find the genre types
all_gens = []
for genre_list in data['genre'].apply(lambda genre_list: genre_list.split(", ")):
# Iterate through genre(s) for each film
for genre in genre_list:
            # Record the genre
all_gens.append(genre)
# Create list of all unique genres
unique_gens = list(set(all_gens))
unique_gens.sort()
return unique_gens, all_gens
unique_genres, all_genres = find_unique_genres(movies_data)
Print information about the dataset:
tokenizer = RegexpTokenizer(r"\w+")
print(colored("Total number of films in IMDb dataset: ", color="blue", attrs=['bold']) + str(len(dataframe.index)))
print(colored("Number of suitable films (samples): ", color="blue", attrs=['bold']) + str(len(movies_data.index)))
print(colored("Number of films dropped: ", color="blue", attrs=['bold']) + str(len(dataframe.index) - len(movies_data.index)))
print(colored("Total number of genres (labels): ", color="green", attrs=['bold']) + str(len(all_genres)))
print(colored("Average number of genres per film: ", color="green", attrs=['bold']) + str(len(all_genres) / len(movies_data.index)))
print(colored("Unique genres: ", color="green", attrs=['bold']) + str(unique_genres))
print(colored("Number of unique genres: ", color="green", attrs=['bold']) + str(len(unique_genres)))
print(colored("Average description length: ", color="magenta", attrs=['bold']) + str(np.mean([len(desc) for desc in movies_data.description])))
print(colored("Average number of words in description: ", color="magenta", attrs=['bold']) + str(np.mean([len(tokenizer.tokenize(desc)) for desc in movies_data.description])))
print(colored("Average number of unique words in description: ", color="magenta", attrs=['bold']) + str(np.mean([len(set(tokenizer.tokenize(desc))) for desc in movies_data.description])))
print(colored("Shortest description length: ", color="magenta", attrs=['bold']) + str(min([len(tokenizer.tokenize(desc)) for desc in movies_data.description])))
print(colored("Longest description length: ", color="magenta", attrs=['bold']) + str(max([len(tokenizer.tokenize(desc)) for desc in movies_data.description])))
Total number of films in IMDb dataset: 85855
Number of suitable films (samples): 83740
Number of films dropped: 2115
Total number of genres (labels): 172461
Average number of genres per film: 2.059481729161691
Unique genres: ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']
Number of unique genres: 25
Average description length: 160.06323143061857
Average number of words in description: 28.19075710532601
Average number of unique words in description: 25.13014091234774
Shortest description length: 1
Longest description length: 79
Plot the most common words for a given genre. The first graph includes all words, though it is dominated by stop words such as 'a', 'the' and 'to'. The second graph has stop words removed; these words are more characteristic of the genre, e.g. "house" and "killer" for horror. The frequency distributions provide an interesting comparison against the topics assigned by the unsupervised topic modelling algorithms.
genre = "horror" # Set as category to investigate
film_descs = [] # Descriptions of all films in genre set above
# Iterate through the genres and description of each film
for gen, desc in zip(movies_data.genre, movies_data.description):
# Check if film has desired genre
if genre in gen.lower():
        # Remove punctuation from description
for char in string.punctuation:
desc = desc.replace(char,"")
# Add tokenized description to list
film_descs.append(desc.split())
# Plot word frequency distribution
fd = FreqDist(word.lower() for sent in film_descs for word in sent)
figure(figsize=(10, 5), dpi=80)
plt.title("Frequency distribution of " + str(genre) + " films")
fd.plot(50)
# Plot word frequency distribution after removing stop words
figure(figsize=(10, 5), dpi=80)
stop_words = nltk.corpus.stopwords.words('english')
fd_no_stop = FreqDist(word.lower() for sent in film_descs for word in sent if word.lower() not in stop_words)
plt.title("Frequency distribution of " + str(genre) + " films - No stop words")
fd_no_stop.plot(50)
plt.show()
View the number of films belonging to each genre. Across all samples, the most common genres are Drama (26.7%), Comedy (16.4%) and Romance (8.0%). The least common is News with only one sample across the whole dataset. Other rare genres include Adult and Documentary which have two each, and Reality-TV which has three. After this is Film-Noir with 663. The rarest genres are removed later in the notebook as they do not provide enough samples to be representative.
# Count the number of genres
genre_counts = np.zeros(len(unique_genres))
for genre_list in movies_data['genre'].apply(lambda genre_list: genre_list.split(", ")):
for i in range(len(unique_genres)):
if unique_genres[i] in genre_list:
genre_counts[i] += 1
# Print counts for each genre
# for gen, con in zip(unique_genres, genre_counts):
# print(str(gen) + ": " + str(int(con)))
# Plot the interactive pie chart
fig = go.Figure()
fig.add_trace(go.Pie(labels=unique_genres, values=genre_counts))
fig.show()
As the dataset is large, samples are dropped so that the notebook can run in a reasonable time. n_samples specifies the maximum number of samples to use. Setting n_samples = 0 will cause all suitable samples to be used, though the notebook will be slow to run. For these experiments I have used 5000 samples, as this is enough to perform topic modelling in a reasonable amount of time.
min_length specifies the minimum number of words in a description; samples with fewer words than this are removed.
Setting remove_rare_genres = True removes genres with fewer than rare_count instances from the movies' labels. If a sample is left with no label for any remaining genre, it is dropped. For example, if rare_count = 10, the genres [News, Adult, Documentary, Reality-TV] are removed.
max_genre_samples is the maximum number of samples allowed per genre. For example, if max_genre_samples = 100, a maximum of 100 samples are allowed for Comedy, another 100 for Drama and so on. This can be turned off by setting max_genre_samples = 0.
n_samples = 5000
min_length = 10
remove_rare_genres = True
rare_count = 7500
max_genre_samples = 2000
# Create copy of movies_data
sampled_movies_data = movies_data
# Remove samples where description size is less than min_length
for i, desc in zip(sampled_movies_data.index, sampled_movies_data.description):
    if len(tokenizer.tokenize(desc)) < min_length:
sampled_movies_data = sampled_movies_data.drop(i)
if remove_rare_genres:
# Find rare genres
rare_genres = []
for gen, count in zip(unique_genres, genre_counts):
if count < rare_count:
rare_genres.append(gen)
print(colored("Genres removed: ", color="blue", attrs=['bold']) + str(rare_genres))
for i, genre_list in zip(sampled_movies_data.index, sampled_movies_data['genre'].apply(lambda genre_list: genre_list.split(", "))):
# Remove rare genres from list of film's genres
new_genre_list = [gen for gen in genre_list if gen not in rare_genres]
# Check if any genres remain
if new_genre_list != []:
# Update film's genre column
sampled_movies_data.at[i,'genre'] = ", ".join(new_genre_list)
else:
# Else drop samples if all genres removed
sampled_movies_data = sampled_movies_data.drop(i)
if max_genre_samples > 0:
# Find the genres that have not been removed
if remove_rare_genres:
remaining_genres = list(set(unique_genres) - set(rare_genres))
else:
remaining_genres = unique_genres
# Create an empty dataframe to record samples to keep
updated_sampled_movies = pd.DataFrame()
for gen in remaining_genres:
# Find all samples belonging to the genre
genre_samples = sampled_movies_data.loc[sampled_movies_data["genre"].str.contains(gen)]
        # Randomly sample up to max_genre_samples films for that genre
        try:
            genre_samples = genre_samples.sample(n=max_genre_samples, random_state=1)
        except ValueError:
            # If max_genre_samples exceeds the number of films in that genre, keep them all
            pass
# Record the samples to keep
updated_sampled_movies = pd.concat([updated_sampled_movies, genre_samples])
# Remove any duplicates as some movies belong to multiple genres
sampled_movies_data = updated_sampled_movies.drop_duplicates()
# Shuffle order by sampling with the same size
sampled_movies_data = sampled_movies_data.sample(len(sampled_movies_data), random_state=1)
if n_samples > 0:
# Cap the number of samples to a maximum n_samples
sampled_movies_data = sampled_movies_data.sample(n_samples, random_state=1)
# View selected samples
display(sampled_movies_data)
Genres removed: ['Adult', 'Adventure', 'Animation', 'Biography', 'Documentary', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Sci-Fi', 'Sport', 'War', 'Western']
title | description | genre | |
---|---|---|---|
12759 | Obiettivo 500 milioni | Reichau, a former army captain, is back in Fra... | Crime, Drama, Thriller |
60347 | I'm Not Jesus Mommy | Kimberly would stop at nothing to have a child... | Drama, Thriller |
49601 | L'uomo d'acciaio | An alien child is evacuated from his dying wor... | Action |
32657 | Os Vagabundos Trapalhões | Bonga and three other good-hearted bums give s... | Comedy |
49606 | Broken English | Since college (art), Nora hasn't had luck with... | Comedy, Drama, Romance |
... | ... | ... | ... |
72925 | Stegman Is Dead | Meet Stegman. He's dead. In this crime comedy,... | Action, Comedy, Crime |
81813 | Gemini Ganeshanum Suruli Raajanum | A playboy falls truely for a woman and decides... | Comedy, Crime, Thriller |
43169 | Gudumba Shankar | Shankar aka Gudumba Shankar, a petty-thief, fa... | Action, Comedy, Drama |
62236 | Pak Panter | Russia is a country where our energy minister ... | Comedy |
71654 | All About E | A beautiful sexy DJ is forced to run when she ... | Comedy, Drama, Romance |
5000 rows × 3 columns
Create a custom stop word list movie_stop_words of the most common words across all movie descriptions.
# String of all words in every description
all_desc_words = ''
# Add all descriptions to all_desc_words
for desc in movies_data.description:
all_desc_words += desc
# Split list of all descriptions into tokens
all_tokens = tokenizer.tokenize(all_desc_words)
# Calculate frequency distribution to find most common words
movie_fd = FreqDist(token.lower() for token in all_tokens)
# Create custom stop word list
movie_stop_words = [word[0] for word in movie_fd.most_common(50)]
print(colored("Movie description stop words: ", color="blue", attrs=['bold']) + str(movie_stop_words) + "\n")
Movie description stop words: ['a', 'the', 'to', 'of', 'and', 'in', 'his', 'is', 'with', 'her', 'an', 's', 'he', 'for', 'who', 'on', 'by', 'their', 'from', 'that', 'when', 'as', 'young', 'she', 'they', 'but', 'are', 'after', 'has', 'life', 'man', 'at', 'two', 'him', 'one', 'love', 'up', 'woman', 'it', 'into', 'family', 'out', 'story', 'new', 'about', 'old', 'be', 'girl', 'this', 'father']
Select which stop word list to use and print it. Set stop = 0 to use no pre-defined stop word list, stop = 1 to use NLTK, or any other value, e.g. stop = -1, for spaCy. Set add_custom_words = True to add the movie_stop_words to the chosen list.
Here I have used NLTK to filter out common words that do not contribute to the meaning of the descriptions. This prevents unrepresentative words from being picked up by all the topics in the topic modelling algorithms.
stop = 1
add_custom_words = True
if stop == 0:
# No stop words
stop_words = []
elif stop == 1:
# NLTK stop words
stop_words = nltk.corpus.stopwords.words('english')
else:
# spaCy stop words
stop_words = list(spacy.lang.en.stop_words.STOP_WORDS)
if add_custom_words:
# Add custom stop words to list
stop_words = list(set(stop_words + movie_stop_words))
if stop_words:
# Print stop word list if not empty
print(colored("Stop words: ", color="blue", attrs=['bold']) + str(stop_words) + "\n")
print(colored("Number of stop words: ", color="blue", attrs=['bold']) + str(len(stop_words)))
Stop words: ['too', 'man', "haven't", 'over', 'once', 'very', 'own', "shan't", 'doesn', 'yours', 'theirs', 'hers', "that'll", 'other', 'don', 'through', 'while', 't', 'won', 'again', 'an', "mustn't", 'here', 'does', 'ma', 'its', 'd', "doesn't", "hasn't", "wasn't", 'in', "you've", 'him', 'that', 'was', 'yourself', 'what', 'whom', 'by', 'family', 'so', 'at', 'out', 'then', 'couldn', 'she', 'isn', 'himself', 'me', 'not', 'i', 'herself', 'or', 'two', 'her', 'with', 'off', 'as', 'nor', 'wasn', 'we', 'one', 'his', 'girl', "mightn't", 'will', 'your', 'a', 'be', 'o', 'most', 'old', 'during', 'love', 'about', 'needn', 'mustn', 'under', 'now', 'shouldn', 'doing', 'few', "wouldn't", 'only', 'both', 'didn', 'if', 'am', 'from', 'it', 'same', 'they', 'because', 'some', 'were', 'to', 'each', 'where', 'm', 'll', "didn't", 'story', 'this', 'father', 'hadn', 'more', 'can', "isn't", 'he', "you'd", "you're", 'against', 'should', 'mightn', 'ours', 'wouldn', 'haven', 'shan', 'yourselves', 'been', 'above', 'y', 're', 'all', 'of', 'for', 'their', "don't", 'no', 'below', 'our', 'have', 've', 'but', 'you', "hadn't", 'do', "shouldn't", 'did', 'my', 'and', 'up', 'having', 'being', 'myself', 'between', 'aren', 'which', "she's", 'the', "it's", 'weren', 'young', 'down', "couldn't", "you'll", 'on', 'life', 'had', 'when', 'there', 'why', "weren't", 'until', 'who', 'these', 'are', 'into', 'any', 'than', 'ain', 'itself', 'after', 'ourselves', 'just', 'is', 'them', 'such', 'those', "aren't", 'how', 'has', 'before', 'hasn', "needn't", 'woman', 'further', 's', 'new', "won't", "should've", 'themselves'] Number of stop words: 192
Create a function to convert accented characters. For example, è becomes e in "Arsène Baudu and Hyacinthe, a pair of small-time crooks".
def remove_accents(text):
text = unicodedata.normalize('NFD', text)\
.encode('ascii', 'ignore')\
.decode("utf-8")
return str(text)
print(colored("Before: ", color="blue", attrs=['bold']) + "Arsène Baudu and Hyacinthe, a pair of small-time crooks")
print(colored("After: ", color="blue", attrs=['bold']) + str(remove_accents("Arsène Baudu and Hyacinthe, a pair of small-time crooks")))
Before: Arsène Baudu and Hyacinthe, a pair of small-time crooks After: Arsene Baudu and Hyacinthe, a pair of small-time crooks
Load a model to expand contractions, e.g. can't becomes cannot in "They fall in love, but can't quite seem to get the timing right."
# Load pre-trained word embedding
contraction = Contractions(api_key="glove-twitter-25")
print(colored("Before: ", color="blue", attrs=['bold']) + "They fall in love, but can't quite seem to get the timing right.")
print(colored("After: ", color="blue", attrs=['bold']) + list(contraction.expand_texts(["They fall in love, but can't quite seem to get the timing right."], precise=True))[0])
Before: They fall in love, but can't quite seem to get the timing right. After: They fall in love, but cannot quite seem to get the timing right.
Process film descriptions and store as description_docs
def process_text_samples(data, stop_list=[], contractions=False, spell_correct=False, lemmatize=False, stem=False, stemmer_algorithm="porter"):
# List of all samples
samples = []
# Create spell checker
spell_check = SpellChecker()
for sample in data:
# Change accented characters, e.g à -> a
sample = remove_accents(sample)
        # Expand contractions, e.g. "hasn't" -> "has not"
        if contractions:
            sample = ''.join(contraction.expand_texts([sample], precise=True))
# Input sample text into spaCy language processor
doc = nlp(sample)
# Split sample text into sentences
sentences = list(doc.sents)
for sent_idx in range(len(sentences)):
# Remove punctuation tokens, e.g. ! , .
sentences[sent_idx] = [token for token in sentences[sent_idx] if not token.is_punct]
# Remove stop words
if stop_list:
sentences[sent_idx] = [token for token in sentences[sent_idx] if token.text.lower() not in stop_list]
# Correct spelling mistakes, e.g. "athiest" -> "atheist"
if spell_correct:
sentences[sent_idx] = [nlp(spell_check.correction(token.text))[0] for token in sentences[sent_idx]]
# Apply lemmatization
if lemmatize:
# Resolve words to their dictionary form using PoS tags
sentences[sent_idx] = [token.lemma_.lower() for token in sentences[sent_idx]]
# Apply stemming (only if lemmatization not applied)
elif stem:
# Set stemmer type
if stemmer_algorithm[0].lower() == "s":
                    # The Snowball stemmer is an improvement on the Porter stemmer
stemmer = SnowballStemmer(language='english')
else:
# Default to Porter stemmer if not specified
stemmer = PorterStemmer()
# Stem tokens
for word_idx in range(len(sentences[sent_idx])):
# Apply stemmer to each word
stemmed = stemmer.stem(sentences[sent_idx][word_idx].text)
# Convert back to type Token and update word in sentence
sentences[sent_idx][word_idx] = nlp(stemmed)[0]
# Remove remaining punctuation within tokens, e.g. "(years)" -> "years", not including -
sentences[sent_idx] = [token.translate(str.maketrans('', '', '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~')) for token in sentences[sent_idx]]
# Split words containing dash or spaces caused by lemmatization, e.g. "16-year" -> "16" + "year"
for k in range(len(sentences)):
new_sentence = []
for token in sentences[k]:
split_token = re.split(' |-', token)
for word in split_token:
# Check word not empty
if word:
new_sentence.append(word)
# Replace words in sentence
sentences[k] = new_sentence
# Remove empty lists from list of sentences
sentences = [sent for sent in sentences if sent != []]
samples.append(sentences)
return samples
# List of description 'documents' separated into sentences
description_docs = process_text_samples(data=sampled_movies_data.description,
spell_correct=False,
stop_list=stop_words,
lemmatize=True)
# View extract
print(colored("First 5 descriptions:", color="blue", attrs=['bold']))
for i in range(5):
print(colored(str(i), color="blue", attrs=['bold']) + " " + str(description_docs[i]))
First 5 descriptions:
0 [['reichau', 'former', 'army', 'captain', 'back', 'france', 'serve', 'three', 'year', 'prison', 'belong', 'oassecret', 'armed', 'organization', 'dissident', 'paramilitary', 'group', 'algerian']]
1 [['kimberly', 'would', 'stop', 'nothing', 'child'], ['recover', 'cancer', 'possibility', 'seem', 'slim'], ['however', 'world', 's', 'first', 'successful', 'human', 'cloning', 'project', 'bring']]
2 [['alien', 'child', 'evacuate', 'die', 'world', 'send', 'earth', 'live', 'among', 'human'], ['peace', 'threaten', 'survivor', 'home', 'planet', 'invade', 'earth']]
3 [['bonga', 'three', 'good', 'hearted', 'bum', 'give', 'shelter', 'homeless', 'child', 'find', 'adoptive', 'parent'], ['day', 'rich', 'boy', 'tired', 'parent', 'indifference', 'ask', 'bonga', 'shelter']]
4 [['since', 'college', 'art', 'nora', 'not', 'luck', 'man'], ['30', 'work', 'nyc', 'hotel', 'go', 'nowhere'], ['meet', 'frenchman']]
Again, samples are filtered out if they contain fewer words than min_words. This is because removing stop words in the code above sometimes makes descriptions too short to be suitable for model training.
min_words = 5
# Convert docs in description_docs from list of sentences to list of words
word_docs = []
for doc in description_docs:
# Make a list of all words
words = [word for sent in doc for word in sent]
if len(words) >= min_words:
word_docs.append(words)
Convert the list of films into a form suitable for LDiA and LSA. A dictionary is constructed mapping each word to its integer id. This dictionary is used to construct a document-term matrix.
# Construct a dictionary mapping words to their integer ids
word_id_dict = corpora.Dictionary(word_docs)
# Build a document-term matrix from the list of documents
corpus = [word_id_dict.doc2bow(doc) for doc in word_docs]
# View corpus
pd.DataFrame(corpus)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | (0, 1) | (1, 1) | (2, 1) | (3, 1) | (4, 1) | (5, 1) | (6, 1) | (7, 1) | (8, 1) | (9, 1) | ... | None | None | None | None | None | None | None | None | None | None |
1 | (18, 1) | (19, 1) | (20, 1) | (21, 1) | (22, 1) | (23, 1) | (24, 1) | (25, 1) | (26, 1) | (27, 1) | ... | None | None | None | None | None | None | None | None | None | None |
2 | (20, 1) | (24, 1) | (35, 1) | (37, 1) | (38, 1) | (39, 1) | (40, 2) | (41, 1) | (42, 1) | (43, 1) | ... | None | None | None | None | None | None | None | None | None | None |
3 | (16, 1) | (20, 1) | (50, 1) | (51, 1) | (52, 2) | (53, 1) | (54, 1) | (55, 1) | (56, 1) | (57, 1) | ... | None | None | None | None | None | None | None | None | None | None |
4 | (66, 1) | (67, 1) | (68, 1) | (69, 1) | (70, 1) | (71, 1) | (72, 1) | (73, 1) | (74, 1) | (75, 1) | ... | None | None | None | None | None | None | None | None | None | None |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4982 | (30, 1) | (74, 1) | (93, 1) | (117, 1) | (160, 2) | (191, 1) | (294, 2) | (340, 1) | (359, 1) | (564, 1) | ... | None | None | None | None | None | None | None | None | None | None |
4983 | (101, 1) | (285, 1) | (379, 1) | (627, 1) | (639, 1) | (925, 1) | (968, 1) | (1120, 1) | (1155, 1) | (1483, 1) | ... | None | None | None | None | None | None | None | None | None | None |
4984 | (259, 1) | (346, 1) | (639, 1) | (873, 1) | (1087, 1) | (1218, 1) | (2041, 1) | (2181, 1) | (2607, 1) | (3488, 1) | ... | None | None | None | None | None | None | None | None | None | None |
4985 | (128, 1) | (179, 1) | (600, 1) | (627, 2) | (1030, 2) | (1064, 1) | (2168, 1) | (2530, 1) | (2670, 1) | (3909, 2) | ... | None | None | None | None | None | None | None | None | None | None |
4986 | (101, 1) | (171, 1) | (344, 1) | (381, 1) | (405, 1) | (455, 1) | (683, 1) | (740, 1) | (1332, 1) | (3940, 1) | ... | None | None | None | None | None | None | None | None | None | None |
4987 rows × 40 columns
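Each cell in the corpus above is a (word id, count) pair for one description. As a minimal illustration of the bag-of-words encoding (a made-up toy document, assuming its tokens appear in word_id_dict; out-of-vocabulary tokens are simply dropped by doc2bow):
# Hypothetical toy document to show what doc2bow produces
toy_doc = ["alien", "child", "child", "earth"]
print(word_id_dict.doc2bow(toy_doc))  # e.g. [(id_of_alien, 1), (id_of_child, 2), (id_of_earth, 1)]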
Create a function to plot an interactive t-SNE graph.
def plot_tSNE(model, title="t-SNE", save_to_file=False, file_name="tsne.html"):
# Create list of topic weights from corpus
topic_weights = []
for i, row in enumerate(model[corpus]):
topic_weights.append([weight for i, weight in row])
# Pad weights to same length by filling with 0s
padded_weights = pd.DataFrame(topic_weights).fillna(0).values
# Find the dominant topic number in each document
dominant_topics = np.argmax(padded_weights, axis=1)
# Perform t-SNE dimensionality reduction
tSNE_model = TSNE(n_components=2, verbose=0, random_state=0, angle=.99, init='pca')
tsne_features = tSNE_model.fit_transform(padded_weights)
# Plot the graph against first two components
fig = px.scatter(x=tsne_features[:,0], y=tsne_features[:,1], color=dominant_topics, width=1200)
fig.update_layout(title=title)
if save_to_file:
# Save the graph to a HTML file
fig.write_html(file_name)
# Display the graph
fig.show()
Define a function to find the best k number of topics for either LDiA or LSA.
def find_best_k(corp, id2word, model_type="LDiA"):
# Record coherence score of each k value
k_scores = []
# Test different k values
for k in range(2, 48, 5):
# Create model
if model_type.lower() == "lsa":
model = LsiModel(corp, id2word=id2word, num_topics=k)
else:
model = LdaMulticore(corp, id2word=id2word, num_topics=k)
# Calculate coherence score
coherence_model = CoherenceModel(model=model, texts=word_docs, dictionary=word_id_dict, coherence='c_v')
# Record score for k
k_scores.append([k, coherence_model.get_coherence()])
k_scores_df = pd.DataFrame(k_scores, columns=['Number of Topics', 'Coherence Score'])
k_scores_df = k_scores_df.set_index('Number of Topics')
return k_scores_df
LDiA is a commonly used unsupervised machine learning algorithm for topic modelling. For this project, the aim is to find hidden clusters of similarity between some descriptions, and distinguishing features of others.
As I selected the top 7 most common genres, it makes sense to see how the unsupervised algorithms categorise films into this many topics. However, as this technique is unsupervised, topics do not always correspond to a genre. This can be observed by comparing the common words of each topic in the graph below to the frequency distribution of genres in an earlier section. LDiA is therefore picking up on hidden patterns that may not be apparent to a human.
use_optimised_params = True
save_graphs = True
# Set number of topics
sampled_unique_genres, sampled_all_genres = find_unique_genres(sampled_movies_data)
k_topics = len(sampled_unique_genres)
# Optimal parameters found in section below
optimal_alpha = 0.3
optimal_eta = 0.3
if use_optimised_params:
# Create model with best alpha and eta
LDiA_model = LdaMulticore(corpus, id2word=word_id_dict, num_topics=k_topics, alpha=optimal_alpha, eta=optimal_eta)
else:
# Create model with default alpha and eta
LDiA_model = LdaMulticore(corpus, id2word=word_id_dict, num_topics=k_topics)
# Calculate coherence score
LDiA_coherence_model = CoherenceModel(model=LDiA_model, texts=word_docs, dictionary=word_id_dict, coherence='c_v')
print(colored("Coherence score: ", color="blue", attrs=['bold']) + str(LDiA_coherence_model.get_coherence()) + "\n")
# Print topic keywords
topic_scores = LDiA_model.print_topics(num_topics=k_topics)
print(colored("Topic keywords:", color="blue", attrs=['bold']))
for i in range(len(topic_scores)):
print(colored(str(i) + " ", color="blue", attrs=['bold']) + str(topic_scores[i][1:]))
print("")
# Display graph of topic distribution
vis_data = gensim_models.prepare(LDiA_model, corpus, word_id_dict)
if save_graphs:
pyLDAvis.save_html(vis_data, 'Gensim_LDiA_' + str(k_topics) + '_Topics.html')
pyLDAvis.display(vis_data)
Coherence score: 0.19187554536294868 Topic keywords: 0 ('0.011*"s" + 0.010*"find" + 0.004*"get" + 0.004*"murder" + 0.004*"live" + 0.004*"crime" + 0.003*"year" + 0.003*"become" + 0.003*"wife" + 0.003*"cop"',) 1 ('0.013*"s" + 0.007*"go" + 0.006*"take" + 0.004*"be" + 0.003*"find" + 0.003*"get" + 0.003*"boy" + 0.003*"kill" + 0.003*"try" + 0.003*"year"',) 2 ('0.010*"s" + 0.005*"find" + 0.004*"get" + 0.004*"friend" + 0.003*"take" + 0.003*"killer" + 0.003*"follow" + 0.003*"be" + 0.003*"world" + 0.003*"three"',) 3 ('0.014*"s" + 0.008*"friend" + 0.006*"get" + 0.004*"find" + 0.004*"fall" + 0.003*"become" + 0.003*"discover" + 0.003*"house" + 0.003*"take" + 0.003*"want"',) 4 ('0.009*"year" + 0.005*"s" + 0.005*"take" + 0.005*"try" + 0.005*"become" + 0.004*"murder" + 0.004*"wife" + 0.004*"find" + 0.004*"get" + 0.004*"police"',) 5 ('0.005*"s" + 0.005*"be" + 0.004*"find" + 0.004*"night" + 0.004*"decide" + 0.004*"go" + 0.003*"take" + 0.003*"discover" + 0.003*"murder" + 0.003*"city"',) 6 ('0.004*"s" + 0.004*"group" + 0.004*"try" + 0.004*"find" + 0.004*"friend" + 0.003*"become" + 0.003*"get" + 0.003*"small" + 0.003*"live" + 0.003*"back"',)
The graph above shows the similarities and differences between topics, along with their most relevant terms. While some tokens overlap, adjusting the λ parameter shows that the topics all have distinct words.
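The λ slider re-ranks each topic's terms by relevance, defined by Sievert and Shirley (2014) as λ·log p(w|t) + (1 − λ)·log(p(w|t)/p(w)). Below is a rough sketch of that ranking outside the interactive plot; topic_relevance is my own helper and only approximates what pyLDAvis computes internally:
def topic_relevance(model, topic_id, lam=0.6, top_n=10):
    # p(w | topic) over the whole vocabulary
    topic_probs = model.get_topics()[topic_id]
    # Empirical p(w) estimated from the dictionary's corpus term frequencies
    total_count = sum(word_id_dict.cfs.values())
    word_probs = np.array([word_id_dict.cfs.get(i, 1) / total_count for i in range(len(word_id_dict))])
    # Relevance: lam * log p(w|t) + (1 - lam) * log(p(w|t) / p(w))
    relevance = lam * np.log(topic_probs) + (1 - lam) * np.log(topic_probs / word_probs)
    top_ids = np.argsort(relevance)[::-1][:top_n]
    return [word_id_dict[int(i)] for i in top_ids]
# A lower lambda favours words that are distinctive to the topic
print(topic_relevance(LDiA_model, topic_id=0, lam=0.2))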
The t-SNE graph below plots each film in a lower-dimensional space based on its most representative topic weights. The shape of the clusters reflects how the algorithm tries to keep similar films close together in the vector space.
Before optimisation, the majority of films share the same most relevant topic; after optimisation, the dominant topics are more varied. This difference can be observed by changing the parameter use_optimised_params from True to False.
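The dominant topic of each film can also be inspected directly. A minimal sketch using gensim's get_document_topics, shown here for the first film only:
# Topic mixture of the first processed description as (topic id, weight) pairs
doc_topics = LDiA_model.get_document_topics(corpus[0])
print(doc_topics)
# The dominant topic used to colour the t-SNE plot is the topic with the largest weight
dominant_topic = max(doc_topics, key=lambda pair: pair[1])[0]
print(colored("Dominant topic of first film: ", color="blue", attrs=['bold']) + str(dominant_topic))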
# Plot each film and colour by the best matching topic
plot_tSNE(LDiA_model, title="LDiA t-SNE showing all films categorised into " + str(k_topics) + " topics",
save_to_file=save_graphs, file_name="tSNE_LDiA_" + str(k_topics) + "_Topics.html")
The parameters optimal_alpha and optimal_eta used for the model above were found using the function optimise_LDiA below. When LDiA was run with the default parameters, it had a coherence score of 0.141, while the best alpha-eta combination achieved a higher score of 0.192.
run_optimisation = False
def optimise_LDiA(corp, id2word, k=10):
# Record coherence score
alpha_eta_scores = []
# Test different alpha
for a in range(1, 20, 2):
# Test different eta
for n in range(1, 20, 2):
# Create model
model = LdaMulticore(corp, id2word=id2word, num_topics=k, alpha=a/10, eta=n/10)
# Calculate coherence score
coherence_model = CoherenceModel(model=model, texts=word_docs, dictionary=word_id_dict, coherence='c_v')
# Record score
alpha_eta_scores.append([a/10, n/10, coherence_model.get_coherence()])
alpha_eta_scores_df = pd.DataFrame(alpha_eta_scores, columns=['Alpha', 'Eta', 'Coherence Score'])
return alpha_eta_scores_df
if run_optimisation:
# Get dataframe of best alpha and eta
optimal_scores_df = optimise_LDiA(corpus, word_id_dict, k=k_topics)
# Sort by best parameter combinations
optimal_scores_df.sort_values(by=['Coherence Score'], ascending=False)
Alpha | Eta | Coherence Score | |
---|---|---|---|
14 | 0.3 | 0.9 | 0.183692 |
2 | 0.1 | 0.5 | 0.183407 |
10 | 0.3 | 0.1 | 0.180763 |
11 | 0.3 | 0.3 | 0.178898 |
1 | 0.1 | 0.3 | 0.175868 |
... | ... | ... | ... |
78 | 1.5 | 1.7 | 0.064671 |
47 | 0.9 | 1.5 | 0.064235 |
89 | 1.7 | 1.9 | 0.063932 |
65 | 1.3 | 1.1 | 0.063840 |
96 | 1.9 | 1.3 | 0.063565 |
100 rows × 3 columns
The number of topics can vary as LDiA does not specifically learn to associate a topic with a genre. The quality of the topic modelling can be assessed by the coherence score, with a higher score being better. In this experiment, different numbers of topics are tried to find the best score.
k_scores_df = find_best_k(corpus, word_id_dict, model_type="ldia")
# Plot graph of coherence scores for different k values
fig = px.line(k_scores_df, title="Coherence for k Topics")
fig.show()
Although there are only 7 genres, a larger number of topics (k greater than the number of genres) generally seems to yield a better coherence score for LDiA.
# Find the best k value from the highest coherence score
best_coherence = k_scores_df.nlargest(1,'Coherence Score')
best_k = best_coherence.idxmax()[0]
print(colored("Best score: ", color="blue", attrs=['bold']))
print(best_coherence)
print("\n")
# Print topic keywords
topic_scores = LDiA_model.print_topics(num_topics=k_topics)
print(colored("Topic keywords:", color="blue", attrs=['bold']))
for i in range(len(topic_scores)):
print(colored(str(i) + " ", color="blue", attrs=['bold']) + str(topic_scores[i][1:]))
# View graph of best k
print(colored("\nLDiA graph with " + str(best_k) + " topics: ", color="blue", attrs=['bold']))
LDiA_model = LdaMulticore(corpus, id2word=word_id_dict, num_topics=best_k, alpha=optimal_alpha, eta=optimal_eta)
vis_data = gensim_models.prepare(LDiA_model, corpus, word_id_dict)
if save_graphs:
pyLDAvis.save_html(vis_data, 'Gensim_LDiA_' + str(best_k) + '_Topics.html')
pyLDAvis.display(vis_data)
Best score: Coherence Score Number of Topics 47 0.230579 Topic keywords: 0 ('0.011*"s" + 0.010*"find" + 0.004*"get" + 0.004*"murder" + 0.004*"live" + 0.004*"crime" + 0.003*"year" + 0.003*"become" + 0.003*"wife" + 0.003*"cop"',) 1 ('0.013*"s" + 0.007*"go" + 0.006*"take" + 0.004*"be" + 0.003*"find" + 0.003*"get" + 0.003*"boy" + 0.003*"kill" + 0.003*"try" + 0.003*"year"',) 2 ('0.010*"s" + 0.005*"find" + 0.004*"get" + 0.004*"friend" + 0.003*"take" + 0.003*"killer" + 0.003*"follow" + 0.003*"be" + 0.003*"world" + 0.003*"three"',) 3 ('0.014*"s" + 0.008*"friend" + 0.006*"get" + 0.004*"find" + 0.004*"fall" + 0.003*"become" + 0.003*"discover" + 0.003*"house" + 0.003*"take" + 0.003*"want"',) 4 ('0.009*"year" + 0.005*"s" + 0.005*"take" + 0.005*"try" + 0.005*"become" + 0.004*"murder" + 0.004*"wife" + 0.004*"find" + 0.004*"get" + 0.004*"police"',) 5 ('0.005*"s" + 0.005*"be" + 0.004*"find" + 0.004*"night" + 0.004*"decide" + 0.004*"go" + 0.003*"take" + 0.003*"discover" + 0.003*"murder" + 0.003*"city"',) 6 ('0.004*"s" + 0.004*"group" + 0.004*"try" + 0.004*"find" + 0.004*"friend" + 0.003*"become" + 0.003*"get" + 0.003*"small" + 0.003*"live" + 0.003*"back"',) LDiA graph with 47 topics:
When comparing the t-SNE graph below to the previous one with fewer topics, films now appear more split up and separated. This could be more useful for identifying characteristic features for categorising descriptions.
plot_tSNE(LDiA_model, title="LDiA t-SNE showing all films categorised into " + str(best_k) + " topics",
save_to_file=save_graphs, file_name="tSNE_LDiA_" + str(best_k) + "_Topics.html")
LSA is a dimensionality reduction technique that uses singular value decomposition (SVD). Like LDiA, it can also be used for topic modelling.
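To make the link to SVD concrete, below is a toy sketch with NumPy (made-up numbers, purely illustrative; gensim's LsiModel performs a comparable truncated decomposition on the sparse corpus):
# Toy term-document count matrix: rows are terms, columns are documents
X = np.array([[2., 0., 1.],
              [1., 1., 0.],
              [0., 3., 1.]])
# Singular value decomposition: X = U * diag(S) * Vt
U, S, Vt = np.linalg.svd(X, full_matrices=False)
# Keeping only the k largest singular values gives a k-topic approximation of X
k = 2
X_k = U[:, :k] @ np.diag(S[:k]) @ Vt[:k, :]
print(np.round(X_k, 2))
# Columns of U act as term-topic directions and rows of Vt as document loadings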
When using the desired number of topics, LSA achieves a higher coherence score, suggesting that it has produced topics that are less semantically similar to one another than those produced by LDiA. This makes sense as the LSA algorithm tries to separate samples, while LDiA keeps similar samples close together. This is also reflected by the shape of the clusters in the t-SNE plot.
# Create model
LSA_model = LsiModel(corpus, num_topics=k_topics, id2word=word_id_dict)
# Calculate coherence score
LSA_coherence_model = CoherenceModel(model=LSA_model, texts=word_docs, dictionary=word_id_dict, coherence='c_v')
print(colored("Coherence score: ", color="blue", attrs=['bold']) + str(LSA_coherence_model.get_coherence()) + "\n")
# Print topic keywords
topic_scores = LSA_model.print_topics(num_topics=k_topics)
print(colored("Topic keywords:", color="blue", attrs=['bold']))
for i in range(len(topic_scores)):
print(colored(str(i) + " ", color="blue", attrs=['bold']) + str(topic_scores[i][1:]))
print("")
Coherence score: 0.22073122242540325 Topic keywords: 0 ('0.683*"s" + 0.201*"find" + 0.163*"get" + 0.161*"friend" + 0.153*"year" + 0.145*"take" + 0.120*"go" + 0.115*"be" + 0.106*"murder" + 0.106*"become"',) 1 ('-0.695*"s" + 0.342*"find" + 0.206*"get" + 0.134*"take" + 0.133*"be" + 0.131*"friend" + 0.123*"year" + 0.112*"go" + 0.096*"murder" + 0.093*"try"',) 2 ('-0.792*"find" + 0.462*"get" + 0.179*"be" + 0.127*"take" + 0.093*"year" + 0.091*"friend" + -0.077*"s" + 0.062*"become" + -0.057*"murder" + 0.054*"job"',) 3 ('-0.610*"get" + 0.532*"year" + 0.373*"take" + -0.228*"find" + -0.101*"friend" + 0.097*"live" + 0.080*"become" + 0.072*"force" + 0.068*"day" + -0.064*"s"',) 4 ('0.747*"friend" + -0.303*"take" + 0.236*"year" + -0.225*"murder" + 0.193*"good" + -0.136*"police" + -0.104*"kill" + -0.102*"get" + -0.094*"killer" + 0.076*"meet"',) 5 ('0.630*"year" + -0.385*"take" + -0.373*"friend" + 0.281*"get" + -0.203*"become" + 0.164*"be" + 0.140*"find" + -0.107*"group" + -0.105*"go" + -0.075*"world"',) 6 ('0.595*"take" + -0.431*"murder" + -0.360*"become" + 0.245*"get" + 0.236*"find" + -0.134*"police" + -0.107*"try" + -0.106*"killer" + -0.093*"town" + -0.089*"investigate"',)
plot_tSNE(LSA_model, title="LSA t-SNE showing all films categorised into " + str(k_topics) + " topics",
save_to_file=save_graphs, file_name="tSNE_LSA_" + str(k_topics) + "_Topics.html")
The number of topics can vary as LSA does not specifically learn to associate a topic with a genre. The quality of the topic modelling can be assessed by the coherence score, with a higher score being better. In this experiment, different numbers of topics are tried to find the best score.
k_scores_df = find_best_k(corpus, word_id_dict, model_type="lsa")
fig = px.line(k_scores_df, title="Coherence for k Topics")
fig.show()
The optimal number of topics for LSA is 2, as this achieved the highest coherence score.
# Find the best k value from the highest coherence score
best_coherence = k_scores_df.nlargest(1,'Coherence Score')
best_k = best_coherence.idxmax()[0]
print(colored("Best score: ", color="blue", attrs=['bold']))
print(best_coherence)
print("\n")
# Print topic keywords
LSA_model = LsiModel(corpus, num_topics=best_k, id2word=word_id_dict)
topic_scores = LSA_model.print_topics(num_topics=best_k)
print(colored("Topic keywords:", color="blue", attrs=['bold']))
for i in range(len(topic_scores)):
print(colored(str(i) + " ", color="blue", attrs=['bold']) + str(topic_scores[i][1:]))
Best score: Coherence Score Number of Topics 2 0.309552 Topic keywords: 0 ('0.683*"s" + 0.201*"find" + 0.163*"get" + 0.161*"friend" + 0.153*"year" + 0.145*"take" + 0.120*"go" + 0.115*"be" + 0.106*"murder" + 0.106*"become"',) 1 ('-0.695*"s" + 0.343*"find" + 0.207*"get" + 0.134*"take" + 0.132*"be" + 0.131*"friend" + 0.123*"year" + 0.111*"go" + 0.096*"murder" + 0.093*"try"',)
plot_tSNE(LSA_model, title="LSA t-SNE showing all films categorised into " + str(best_k) + " topics",
save_to_file=save_graphs, file_name="tSNE_LSA_" + str(best_k) + "_Topics.html")