assignment1
In [1]:
#You are welcome to use this code for your submission if you would like to; otherwise, you are welcome to use your own code that you have developed so far.
# The main aim is to help you folks to move forward to interesting parts of the Question, and to not get stuck in some implementation details.
# Carefully note the comments throughout the code, particularly theo nes relating to handle small numbers in the log space and how to prevent NaN
# when normalising a non-negative vector to get probability distribution.
#========
# You may need to install some packages:
#install.packages(‘tm’)
#install.packages(‘SnowballC’)
eps=1e-10
# reading the data
read.data <- function(file.name='./assessments_datasets/TaskA.txt', sample.size=1000, seed=100, pre.proc=TRUE, spr.ratio= 0.90) {
# INPUTS:
## file.name: name of the input .txt file
## sample.size: if == 0 reads all docs, otherwise only reads a subset of the corpus
## seed: random seed for sampling (read above)
## pre.proc: if TRUE performs the preprocessing (recommended)
## spr.ratio: is used to reduce the sparcity of data by removing very infrequent words
# OUTPUTS:
## docs: the unlabled corpus (each row is a document)
## word.doc.mat: the count matrix (each rows and columns corresponds to words and documents, respectively)
## label: the real cluster labels (will be used in visualization/validation and not for clustering)
# Read the data
text <- readLines(file.name)
# select a subset of data if sample.size > 0
if (sample.size>0){
set.seed(seed)
text <- text[sample(length(text), sample.size)]
}
## the terms before the first '\t' are the lables (the newsgroup names) and all the remaining text after '\t' are the actual documents
docs <- strsplit(text, '\t')
# store the labels for evaluation
labels <- unlist(lapply(docs, function(x) x[1]))
# store the unlabeled texts
docs <- data.frame(unlist(lapply(docs, function(x) x[2])))
library(tm)
# create a corpus
docs <- DataframeSource(docs)
corp <- Corpus(docs)
# Preprocessing:
if (pre.proc){
corp <- tm_map(corp, removeWords, stopwords("english")) # remove stop words (the most common word in a language that can be find in any document)
corp <- tm_map(corp, removePunctuation) # remove pnctuation
corp <- tm_map(corp, stemDocument) # perform stemming (reducing inflected and derived words to their root form)
corp <- tm_map(corp, removeNumbers) # remove all numbers
corp <- tm_map(corp, stripWhitespace) # remove redundant spaces
}
# Create a matrix which its rows are the documents and colomns are the words.
dtm <- DocumentTermMatrix(corp)
## reduce the sparcity of out dtm
dtm <- removeSparseTerms(dtm, spr.ratio)
## convert dtm to a matrix
word.doc.mat <- t(as.matrix(dtm))
# Return the result
return (list("docs" = docs, "word.doc.mat"= word.doc.mat, "labels" = labels))
}
## --- helper function ------------------------------------------------------------------
# Input: logA1, logA2 ... logAn
# Output: log(A1+A2+...+An)
#
# This function is needed to prevent numerical overflow/underflow when working with small numbers,
# because we can easily get small numbers by multiplying p1 * p2 * ... * pn (where 0 <= pi <= 1 are probabilities).
#
# Example: Suppose we are interested in p1*p2*p3 + q1*q2+q3 where all numbers are probabilities \in [0,1]
# To prevent numerical errors, we do the computation in the log space and convert the result back using the exp function
# Hence our approach is to form the vector v = [log(p1)+log(p2)+log(p3) , log(q1)+log(q2)+log(q3)]
# Then get the results by: exp(logSum(v))
logSum <- function(v) {
m = max(v)
return ( m + log(sum(exp(v-m))))
}
##--- Initialize model parameters randomly --------------------------------------------
initial.param <- function(vocab_size, K=4, seed=123456){
rho <- matrix(1/K,nrow = K, ncol=1) # assume all clusters have the same size (we will update this later on)
mu <- matrix(runif(K*vocab_size),nrow = K, ncol = vocab_size) # initiate Mu
mu <- prop.table(mu, margin = 1) # normalization to ensure that sum of each row is 1
return (list("rho" = rho, "mu"= mu))
}
##--- E Step for Document Clustering --------------------------------------------
# this function currently implements the E-step of the soft-EM
# Student needs to modify this function if wants to make it Hard-EM
#
E.step <- function(gamma, model, counts, hard){
# Model Parameter Setting
N <- dim(counts)[2] # number of documents
K <- dim(model$mu)[1]
# E step:
for (n in 1:N){
for (k in 1:K){
## calculate the posterior based on the estimated mu and rho in the "log space"
gamma[n,k] <- log(model$rho[k,1]) + sum(counts[,n] * log(model$mu[k,]))
}
if(hard)
{
max.index = which.max(gamma[n, ])
gamma[n, ] = 0.0
gamma[n, max.index] = 1.0
}else {
# normalisation to sum to 1 in the log space
logZ = logSum(gamma[n,])
gamma[n,] = gamma[n,] - logZ
}
}
# converting back from the log space
gamma <- exp(gamma)
return (gamma)
}
##--- M Step for Document Clustering --------------------------------------------
M.step <- function(gamma, model, counts){
# Model Parameter Setting
N <- dim(counts)[2] # number of documents
W <- dim(counts)[1] # number of words i.e. vocabulary size
K <- dim(model$mu)[1] # number of clusters
# M step: Student needs to write this part for soft/hard EM
#......
#
# hint: before you normalise a vector so that it sums to 1, first add a small number (eps) to all elements of the vector.
# for example, suppose you have a vector [n1,n2,n3] and you want to normalise it to make it a probability distribution.
# you first need to add eps to elements [n1+eps,n2+eps,n3+eps], then divide the elements by (n1+n2+n3+ 3*eps) so that the vecotr sums to 1.
# this prevents NaN for vectors where all elements aer zero such as [0,0,0] because after adding eps you have [eps,eps,eps] which
# results in the uniform distribution after normalisation.
for (k in 1:K){
model$rho[k, 1] = sum(gamma[, k]) / N
for(w in 1:W)
{
model$mu[k, w] = eps
for (n in 1:N) {
model$mu[k, w] = model$mu[k, w] + gamma[n, k] * counts[w,n]
}
}
model$mu[k, ] = model$mu[k, ] / sum(model$mu[k, ])
}
# Return the result
return (model)
}
##--- EM for Document Clustering --------------------------------------------
EM <- function(counts, K=4, max.epoch=10, seed=123456, hard = TRUE){
#INPUTS:
## counts: word count matrix
## K: the number of clusters
#OUTPUTS:
## model: a list of model parameters
# Model Parameter Setting
N <- dim(counts)[2] # number of documents
W <- dim(counts)[1] # number of unique words (in all documents)
# Initialization
model <- initial.param(W, K=K, seed=seed)
gamma <- matrix(0, nrow = N, ncol = K)
print(train_obj(model,counts))
# Build the model
for(epoch in 1:max.epoch){
# E Step
gamma <- E.step(gamma, model, counts, hard)
# M Step
model <- M.step(gamma, model, counts)
print(train_obj(model,counts))
}
# Return Model
return(list("model"=model,"gamma"=gamma))
}
##--- the training objective function --------------------------------------------
# Input:
# model: the model object containing the mu and rho
# counts: the word-document frequency matrix
# Output:
# nloglike: the negative log-likelihood i.e. log P(counts|model)
#
train_obj <- function(model, counts) {
N <- dim(counts)[2] # number of documents
K <- dim(model$mu)[1]
nloglike = 0
for (n in 1:N){
lprob <- matrix(0,ncol = 1, nrow=K)
for (k in 1:K){
lprob[k,1] = sum(counts[,n] * log(model$mu[k,]))
}
nloglike <- nloglike - logSum(lprob + log(model$rho))
}
return (nloglike)
}
##--- Cluster Visualization -------------------------------------------------
cluster.viz <- function(doc.word.mat, color.vector, title=' '){
p.comp <- prcomp(doc.word.mat, scale. = TRUE, center = TRUE)
plot(p.comp$x, col=color.vector, pch=1, main=title)
}
### main body ##################################################################
# Reading documents
## Note: sample.size=0 means all read all documents!
##(for develiopment and debugging use a smaller subset e.g., sample.size = 40)
data <- read.data(file.name='./assessments_datasets/TaskA.txt', sample.size=0, seed=100, pre.proc=TRUE, spr.ratio= .99)
# word-document frequency matrix
counts <- data$word.doc.mat
# below is toy data if you want to work with
# counts <- matrix(c(1,1,0,1,0,0,0,1,1,1,0,1,0,0,1,1,0,0),nrow=3,ncol=6)
run <- function(counts, K, max.epoch, hard)
{
res <- EM(counts, K, max.epoch, hard)
# visualization
## find the culster with the maximum probability (since we have soft assignment here)
label.hat <- apply(res$gamma, 1, which.max)
## visualize the stimated clusters
title = 'Estimated Clusters (Soft EM)'
if(hard)
{
title = 'Estimated Clusters (Hard EM)'
}
cluster.viz(t(scale(counts)), label.hat, title)
}
# # calling the EM algorithm on the data
# res <- EM(counts, K=4, max.epoch=5, hard = TRUE)
# # visualization
# ## find the culster with the maximum probability (since we have soft assignment here)
# label.hat <- apply(res$gamma, 1, which.max)
# ## normalize the count matrix for better visualization
# counts<-scale(counts) # only use when the dimensionality of the data (number of words) is large enough
# ## visualize the stimated clusters
# cluster.viz(t(counts), label.hat, 'Estimated Clusters (Soft EM)')
Loading required package: NLP
In [2]:
## visualize the real clusters
cluster.viz(t(scale(counts)), factor(data$label), 'Real Clusters')
In [3]:
run(counts, K=4, max.epoch=5, hard = TRUE)
[1] 2174891
[1] 1955927
[1] 1945223
[1] 1942292
[1] 1940321
[1] 1938639
In [5]:
run(counts, K=4, max.epoch=5, hard = FALSE)
[1] 2178752
[1] 1959468
[1] 1950340
[1] 1945006
[1] 1941495
[1] 1938991
In [ ]: