Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chapter 3 Only classifying data as ham and not spam #45

Open
mbk0073 opened this issue Feb 7, 2017 · 0 comments
Open

Chapter 3 Only classifying data as ham and not spam #45

mbk0073 opened this issue Feb 7, 2017 · 0 comments

Comments

@mbk0073
Copy link

mbk0073 commented Feb 7, 2017


library(tm)
library(ggplot2)

#tm is the text mining package of R
#ggplot is for visualization
#there are 2 sets of files for each type of mail and one will be used for training while other will be for testing

spam.path<-"data/spam/"
spam2.path<-"data/spam_2/"
easyham.path<-"data/easy_ham/"
easyham2.path<-"data/easy_ham_2/"
hardham.path<-"data/hard_ham//"
hardham2.path<-"data/hard_ham_2/"

get.msg<-function(path){
  print(path)
  connection<-file(path,open="rt", encoding="Latin1")
  
  text<-readLines(connection)
  #the message begins after a full line break
   
  t<-which(text=="")[1]+1
  print(length(text))
  print(t)
  msg<-text[seq(t, length(text))]
 #print(msg) 
 
  close(connection)
  return (paste(msg, collapse="\n"))
  
}

#tdm=term document matrix

get.tdm<-function(doc.vec){
  doc.corpus<-Corpus(VectorSource(doc.vec))
  control<-list(stopwords=TRUE, removePunctuation=TRUE, removeNumbers=TRUE, minDocFreq=2)
  doc.dtm<-TermDocumentMatrix(doc.corpus, control)
  return (doc.dtm)
  
}



# create a vector of emails
#use apply function

spam.docs<-dir(spam.path)
#this returns a list of file names in the directory
spam.docs<-spam.docs[seq(1,length(spam.docs)-1)]
#spam.docs<-spam.docs[which(spam.docs!="")]
#cmds file is a UNIX file which we dont need
#spam.docs<-spam.docs[!startsWith(spam.docs, "cmds")]

all.spam<-sapply(spam.docs, function(p) get.msg(paste(spam.path,p, sep="")))

spam.tdm<-get.tdm(all.spam)

#use the command below for inspection
#head(all.spam)
#z<-TermDocumentMatrix(Corpus(VectorSource(all.spam)), list(stopwords=TRUE, removeNumbers=TRUE, removePunctuation=TRUE, minDocFreq=2))

spam.matrix<- as.matrix(spam.tdm)
spam.counts<-rowSums(spam.matrix)
spam.df<-data.frame(cbind(names(spam.counts), as.numeric(spam.counts)), stringAsFactors=FALSE)
names(spam.df)<-c("term", "frequency")
spam.df$frequency<-as.numeric(spam.df$frequency)
spam.occurence<-sapply(1:nrow(spam.matrix)
                       , function(i){
                          length(which(spam.matrix[i,]>0))/ncol(spam.matrix)
                       })
spam.density<-spam.df$frequency/sum(spam.df$frequency)
spam.df<-transform(spam.df, density=spam.density, occurence=spam.occurence)

head(spam.df[with(spam.df,order(-occurence)), ])
#constructuon of Ham dataset













easy_ham.docs<-dir(easyham.path)
#this returns a list of file names in the directory
easy_ham.docs<-easy_ham.docs[seq(1,500)]
#spam.docs<-spam.docs[which(spam.docs!="")]
#cmds file is a UNIX file which we dont need
#spam.docs<-spam.docs[!startsWith(spam.docs, "cmds")]

all.easy_ham<-sapply(easy_ham.docs, function(p) get.msg(paste(easyham.path,p, sep="")))

easy_ham.tdm<-get.tdm(all.easy_ham)


#use the command below for inspection
#head(all.spam)
#z<-TermDocumentMatrix(Corpus(VectorSource(all.spam)), list(stopwords=TRUE, removeNumbers=TRUE, removePunctuation=TRUE, minDocFreq=2))

easy_ham.matrix<- as.matrix(easy_ham.tdm)
easy_ham.counts<-rowSums(easy_ham.matrix)
easy_ham.df<-data.frame(cbind(names(easy_ham.counts), as.numeric(easy_ham.counts)), stringAsFactors=FALSE)
names(easy_ham.df)<-c("term", "frequency")
easy_ham.df$frequency<-as.numeric(easy_ham.df$frequency)
easy_ham.occurence<-sapply(1:nrow(easy_ham.matrix)
                       , function(i){
                         length(which(easy_ham.matrix[i,]>0))/ncol(easy_ham.matrix)
                       })
easy_ham.density<-easy_ham.df$frequency/sum(easy_ham.df$frequency)
easy_ham.df<-transform(easy_ham.df, density=easy_ham.density, occurence=easy_ham.occurence)
easy_ham.df$NA.<-NULL
head(easy_ham.df[with(easy_ham.df,order(-occurence)), ])


#Classification function

classify.email<-function(path, training.df, prior=0.5, c=1e-6){
  msg<-get.msg(path)
  msg.tdm<-get.tdm(msg)
  msg.freq<-rowSums(as.matrix(msg.tdm))
  #Find intersection of words
  msg.match<-intersect(names(msg.freq), training.df$term)
  if(length(msg.match)<1){
    return (prior*c^(length(msg.freq)))
    
  }
  else{
    match.probs<-training.df$occurence[match(msg.match, training.df$term)]
    return (prior*prod(match.probs) * c^(length(msg.freq)-length(msg.match)))
  }
}





hardham.docs<-dir(hardham.path)
hardham.docs<-hardham.docs[seq(1:length(hardham.docs))]

hardham.spamtest<-sapply(hardham.docs, function(p) classify.email(paste(hardham.path,p, sep=""), 
                                                                  training.df = easy_ham.df))

hardham.hamtest<-sapply(hardham.docs, function(p) classify.email(paste(hardham.path, p, sep=""), training.df = easy_ham.df))

hardham.res<-ifelse(hardham.spamtest>hardham.hamtest, TRUE, FALSE)
summary(hardham.res)

This code only returns false for all values

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant