You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
library(tm)
library(ggplot2)
# tm is R's text-mining package.
# ggplot2 is used for visualization.
# Each type of mail comes in two sets of files: one is used for training and the other for testing.
# Directories holding the raw email files. Every path ends in exactly one "/"
# because file names are appended to it directly.
spam.path <- "data/spam/"
spam2.path <- "data/spam_2/"
easyham.path <- "data/easy_ham/"
easyham2.path <- "data/easy_ham_2/"
# Fixed: this path previously ended in a doubled slash ("hard_ham//").
hardham.path <- "data/hard_ham/"
hardham2.path <- "data/hard_ham_2/"
# Read one raw email file and return its body as a single string.
#
# path: path to the message file.
# Returns: a length-one character vector holding every line after the first
#   empty line, joined with "\n". Returns "" when the file has no empty line
#   or when nothing follows the first empty line.
get.msg <- function(path) {
  connection <- file(path, open = "rt", encoding = "latin1")
  # Release the connection even if readLines() fails part-way through.
  on.exit(close(connection), add = TRUE)
  text <- readLines(connection)
  # The message begins after the first fully empty line.
  first.blank <- which(text == "")[1]
  # No empty line gives NA; an empty line in last position would make
  # seq() count backwards and index past the end of the file.
  if (is.na(first.blank) || first.blank >= length(text)) {
    return("")
  }
  paste(text[seq(first.blank + 1, length(text))], collapse = "\n")
}
# Build a term-document matrix (TDM) from a character vector of documents.
#
# doc.vec: character vector, one element per document.
# Returns: the object produced by tm's TermDocumentMatrix() for a corpus made
#   from doc.vec, using the control list below.
# NOTE(review): minDocFreq may not be a recognised control option in recent tm
#   releases (which document `bounds` instead) - confirm against the installed
#   version.
get.tdm <- function(doc.vec) {
  tdm.options <- list(
    stopwords = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    minDocFreq = 2
  )
  TermDocumentMatrix(Corpus(VectorSource(doc.vec)), tdm.options)
}
# Build the spam training table: one row per term with its total count
# (frequency), its share of all term occurrences (density) and the fraction of
# spam messages that contain it (occurence).

# dir() returns the file names found in the spam directory.
spam.docs <- dir(spam.path)
# "cmds" is a UNIX helper file, not an email. Drop it by name instead of
# assuming it is the last entry of the listing.
spam.docs <- spam.docs[spam.docs != "cmds"]
# One message body per file; vapply() guarantees a character vector.
all.spam <- vapply(
  spam.docs,
  function(p) get.msg(paste0(spam.path, p)),
  character(1)
)
spam.tdm <- get.tdm(all.spam)
# Use head(all.spam) to inspect the raw message bodies.
spam.matrix <- as.matrix(spam.tdm)
spam.counts <- rowSums(spam.matrix)
# Build the columns directly. The earlier cbind() + misspelt
# `stringAsFactors` argument produced a junk third column and, on R < 4.0,
# factor columns whose as.numeric() values are level codes rather than counts.
spam.df <- data.frame(
  term = names(spam.counts),
  frequency = as.numeric(spam.counts),
  stringsAsFactors = FALSE
)
# Fraction of documents (matrix columns) in which each term appears.
# unname() keeps the term labels from becoming row names of spam.df.
spam.occurence <- unname(rowSums(spam.matrix > 0)) / ncol(spam.matrix)
spam.density <- spam.df$frequency / sum(spam.df$frequency)
spam.df <- transform(spam.df, density = spam.density, occurence = spam.occurence)
head(spam.df[order(-spam.df$occurence), ])
# Construction of the ham training table: one row per term with its total
# count (frequency), its share of all term occurrences (density) and the
# fraction of easy-ham messages that contain it (occurence).

# dir() returns the file names found in the easy-ham directory.
easy_ham.docs <- dir(easyham.path)
# "cmds" is a UNIX helper file, not an email.
easy_ham.docs <- easy_ham.docs[easy_ham.docs != "cmds"]
# Train on at most the first 500 messages. head() does not pad with NA when
# the directory holds fewer files, unlike indexing with seq(1, 500).
easy_ham.docs <- head(easy_ham.docs, 500)
# One message body per file; vapply() guarantees a character vector.
all.easy_ham <- vapply(
  easy_ham.docs,
  function(p) get.msg(paste0(easyham.path, p)),
  character(1)
)
easy_ham.tdm <- get.tdm(all.easy_ham)
# Use head(all.easy_ham) to inspect the raw message bodies.
easy_ham.matrix <- as.matrix(easy_ham.tdm)
easy_ham.counts <- rowSums(easy_ham.matrix)
# Build the columns directly. The earlier cbind() + misspelt
# `stringAsFactors` argument produced a junk "NA." column (which then had to
# be deleted) and, on R < 4.0, factor columns whose as.numeric() values are
# level codes rather than counts.
easy_ham.df <- data.frame(
  term = names(easy_ham.counts),
  frequency = as.numeric(easy_ham.counts),
  stringsAsFactors = FALSE
)
# Fraction of documents (matrix columns) in which each term appears.
# unname() keeps the term labels from becoming row names of easy_ham.df.
easy_ham.occurence <- unname(rowSums(easy_ham.matrix > 0)) / ncol(easy_ham.matrix)
easy_ham.density <- easy_ham.df$frequency / sum(easy_ham.df$frequency)
easy_ham.df <- transform(easy_ham.df, density = easy_ham.density, occurence = easy_ham.occurence)
head(easy_ham.df[order(-easy_ham.df$occurence), ])
# Score one email against a training table.
#
# path:        path to the message file (read with get.msg()).
# training.df: data frame with a `term` column and an `occurence` column.
# prior:       prior weight multiplied into the score.
# c:           factor applied once for every message term that is absent from
#              training.df$term.
# Returns: prior * product of the `occurence` values of the shared terms
#   * c ^ (number of message terms not found in training.df).
# NOTE(review): with the default c = 1e-6 the c^k factor underflows to 0 in
#   double precision once k reaches a few dozen, so long messages can score
#   exactly 0 - confirm whether log-space scoring is wanted.
classify.email <- function(path, training.df, prior = 0.5, c = 1e-6) {
  term.counts <- rowSums(as.matrix(get.tdm(get.msg(path))))
  n.terms <- length(term.counts)
  # Terms that occur both in this message and in the training table.
  shared <- intersect(names(term.counts), training.df$term)
  if (length(shared) < 1) {
    # Nothing in common: every term is charged the factor c.
    return(prior * c^n.terms)
  }
  shared.probs <- training.df$occurence[match(shared, training.df$term)]
  prior * prod(shared.probs) * c^(n.terms - length(shared))
}
# Score every hard-ham message against both training tables and flag the ones
# whose spam score exceeds their ham score.
hardham.docs <- dir(hardham.path)
# "cmds" is a UNIX helper file, not an email.
hardham.docs <- hardham.docs[hardham.docs != "cmds"]
# Spam score: must be computed against the SPAM training table. It was
# previously computed against easy_ham.df, which made it identical to the ham
# score, so the comparison below was FALSE for every message.
hardham.spamtest <- vapply(
  hardham.docs,
  function(p) classify.email(paste0(hardham.path, p), training.df = spam.df),
  numeric(1)
)
# Ham score: computed against the easy-ham training table.
hardham.hamtest <- vapply(
  hardham.docs,
  function(p) classify.email(paste0(hardham.path, p), training.df = easy_ham.df),
  numeric(1)
)
# TRUE where a message looks more like spam than ham.
hardham.res <- hardham.spamtest > hardham.hamtest
summary(hardham.res)
Reported problem: this code returns FALSE for every message.