Chapter 3 Only classifying data as ham and not spam #45

mbk0073 · 2017-02-07T05:51:01Z


library(tm)
library(ggplot2)

#tm is the text mining package of R
#ggplot is for visualization
#there are 2 sets of files for each type of mail and one will be used for training while other will be for testing

spam.path<-"data/spam/"
spam2.path<-"data/spam_2/"
easyham.path<-"data/easy_ham/"
easyham2.path<-"data/easy_ham_2/"
hardham.path<-"data/hard_ham//"
hardham2.path<-"data/hard_ham_2/"

get.msg<-function(path){
  print(path)
  connection<-file(path,open="rt", encoding="Latin1")
  
  text<-readLines(connection)
  #the message begins after a full line break
   
  t<-which(text=="")[1]+1
  print(length(text))
  print(t)
  msg<-text[seq(t, length(text))]
 #print(msg) 
 
  close(connection)
  return (paste(msg, collapse="\n"))
  
}

#tdm=term document matrix

get.tdm<-function(doc.vec){
  doc.corpus<-Corpus(VectorSource(doc.vec))
  control<-list(stopwords=TRUE, removePunctuation=TRUE, removeNumbers=TRUE, minDocFreq=2)
  doc.dtm<-TermDocumentMatrix(doc.corpus, control)
  return (doc.dtm)
  
}



# create a vector of emails
#use apply function

spam.docs<-dir(spam.path)
#this returns a list of file names in the directory
spam.docs<-spam.docs[seq(1,length(spam.docs)-1)]
#spam.docs<-spam.docs[which(spam.docs!="")]
#cmds file is a UNIX file which we dont need
#spam.docs<-spam.docs[!startsWith(spam.docs, "cmds")]

all.spam<-sapply(spam.docs, function(p) get.msg(paste(spam.path,p, sep="")))

spam.tdm<-get.tdm(all.spam)

#use the command below for inspection
#head(all.spam)
#z<-TermDocumentMatrix(Corpus(VectorSource(all.spam)), list(stopwords=TRUE, removeNumbers=TRUE, removePunctuation=TRUE, minDocFreq=2))

spam.matrix<- as.matrix(spam.tdm)
spam.counts<-rowSums(spam.matrix)
spam.df<-data.frame(cbind(names(spam.counts), as.numeric(spam.counts)), stringAsFactors=FALSE)
names(spam.df)<-c("term", "frequency")
spam.df$frequency<-as.numeric(spam.df$frequency)
spam.occurence<-sapply(1:nrow(spam.matrix)
                       , function(i){
                          length(which(spam.matrix[i,]>0))/ncol(spam.matrix)
                       })
spam.density<-spam.df$frequency/sum(spam.df$frequency)
spam.df<-transform(spam.df, density=spam.density, occurence=spam.occurence)

head(spam.df[with(spam.df,order(-occurence)), ])
#constructuon of Ham dataset













easy_ham.docs<-dir(easyham.path)
#this returns a list of file names in the directory
easy_ham.docs<-easy_ham.docs[seq(1,500)]
#spam.docs<-spam.docs[which(spam.docs!="")]
#cmds file is a UNIX file which we dont need
#spam.docs<-spam.docs[!startsWith(spam.docs, "cmds")]

all.easy_ham<-sapply(easy_ham.docs, function(p) get.msg(paste(easyham.path,p, sep="")))

easy_ham.tdm<-get.tdm(all.easy_ham)


#use the command below for inspection
#head(all.spam)
#z<-TermDocumentMatrix(Corpus(VectorSource(all.spam)), list(stopwords=TRUE, removeNumbers=TRUE, removePunctuation=TRUE, minDocFreq=2))

easy_ham.matrix<- as.matrix(easy_ham.tdm)
easy_ham.counts<-rowSums(easy_ham.matrix)
easy_ham.df<-data.frame(cbind(names(easy_ham.counts), as.numeric(easy_ham.counts)), stringAsFactors=FALSE)
names(easy_ham.df)<-c("term", "frequency")
easy_ham.df$frequency<-as.numeric(easy_ham.df$frequency)
easy_ham.occurence<-sapply(1:nrow(easy_ham.matrix)
                       , function(i){
                         length(which(easy_ham.matrix[i,]>0))/ncol(easy_ham.matrix)
                       })
easy_ham.density<-easy_ham.df$frequency/sum(easy_ham.df$frequency)
easy_ham.df<-transform(easy_ham.df, density=easy_ham.density, occurence=easy_ham.occurence)
easy_ham.df$NA.<-NULL
head(easy_ham.df[with(easy_ham.df,order(-occurence)), ])


#Classification function

classify.email<-function(path, training.df, prior=0.5, c=1e-6){
  msg<-get.msg(path)
  msg.tdm<-get.tdm(msg)
  msg.freq<-rowSums(as.matrix(msg.tdm))
  #Find intersection of words
  msg.match<-intersect(names(msg.freq), training.df$term)
  if(length(msg.match)<1){
    return (prior*c^(length(msg.freq)))
    
  }
  else{
    match.probs<-training.df$occurence[match(msg.match, training.df$term)]
    return (prior*prod(match.probs) * c^(length(msg.freq)-length(msg.match)))
  }
}





hardham.docs<-dir(hardham.path)
hardham.docs<-hardham.docs[seq(1:length(hardham.docs))]

hardham.spamtest<-sapply(hardham.docs, function(p) classify.email(paste(hardham.path,p, sep=""), 
                                                                  training.df = easy_ham.df))

hardham.hamtest<-sapply(hardham.docs, function(p) classify.email(paste(hardham.path, p, sep=""), training.df = easy_ham.df))

hardham.res<-ifelse(hardham.spamtest>hardham.hamtest, TRUE, FALSE)
summary(hardham.res)

This code only returns false for all values

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Chapter 3 Only classifying data as ham and not spam #45

Chapter 3 Only classifying data as ham and not spam #45

mbk0073 commented Feb 7, 2017

Chapter 3 Only classifying data as ham and not spam #45

Chapter 3 Only classifying data as ham and not spam #45

Comments

mbk0073 commented Feb 7, 2017