-
Notifications
You must be signed in to change notification settings - Fork 0
/
SecondStepProcedure.R
119 lines (95 loc) · 4.67 KB
/
SecondStepProcedure.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Topic Modeling Solution ----
# Reads the sampled program documents, builds the quanteda corpus and attaches
# the sample metadata (program name, academic level, institution, sector,
# accreditation) to both the corpus and the `textos` data frame.
library(readtext)
library(tm)
library(quanteda)

# The documents are read from this directory: dir() lists its contents and
# readtext() resolves those names relative to the working directory.
setwd("/home/jc/Documents/Paper Soft Skills Sampled Programs")
listado <- data.frame(dir())

# Path of the example data shipped with readtext. Nothing in this section
# reads from it; the object is kept so it stays defined for any code that
# expects it.
DATA_DIR <- system.file("extdata/", package = "readtext")

# `dir..` is the column name data.frame() derives from the unnamed dir() call.
textos <- readtext(listado$dir..)
# Keep only digits and hyphens in each document id.
textos$doc_id <- gsub("[^0-9-]", "", textos$doc_id)
AllPrograms <- corpus(textos)

# The sourced script is expected to define `Muestra`, the sample metadata
# table used below.
source("~/Documents/GitHub/SoftSkillsUniversityPrograms/SampleAnalysis.R")

# Metadata is matched to documents purely by position, and `$<-` on a data
# frame silently recycles a shorter vector when the lengths divide evenly.
# Fail fast instead of attaching metadata to the wrong documents.
if (length(Muestra$NOMBRE_DEL_PROGRAMA) != nrow(textos)) {
  stop(
    "Muestra has ", length(Muestra$NOMBRE_DEL_PROGRAMA),
    " rows but ", nrow(textos),
    " documents were read; metadata cannot be matched by position.",
    call. = FALSE
  )
}

# Document-level variables on the corpus.
docvars(AllPrograms, "Programa") <- Muestra$NOMBRE_DEL_PROGRAMA
docvars(AllPrograms, "Program.Level") <- Muestra$`Academic Level`
docvars(AllPrograms, "Institution") <- Muestra$NOMBRE_INSTITUCIÓN

# One sub-corpus per academic level.
SPEC <- corpus_subset(AllPrograms, Program.Level == "Specialization")
MS <- corpus_subset(AllPrograms, Program.Level == "Masters")
PhD <- corpus_subset(AllPrograms, Program.Level == "Doctorate")

# Corpus-specific terms plus the standard Spanish stopword list.
spanishstopwords <- c(
  "egresado", "programa", "programas", "crédito", stopwords("spanish")
)

# The same metadata as plain columns on the document data frame.
textos$Sector <- Muestra$SECTOR
textos$Program.Level <- Muestra$`Academic Level`
textos$Accreditation <- Muestra$Accreditation
# Pairwise-similarity classification ----
# The same procedure is applied to three sets of program texts: all programs,
# official-sector programs and private-sector programs.
library(quanteda)
library(quanteda.textstats)
library(mclust)
# dplyr supplies filter(); it is attached here so it is available before its
# first use.
library(dplyr)

# Terms excluded from every analysis: corpus-specific vocabulary plus the
# standard Spanish stopword list.
classification_stopwords <- c(
  "campo", "través", "maestría", "país", "áreas", "nivel", "calidad",
  "estudios", "universidad", "profesionales", "perfil", "profesional",
  "especialización", "nacional", "formación", "egresado", "programa",
  "programas", "crédito", stopwords("spanish")
)

# Run the classification procedure on a character vector of documents.
#
# Steps:
#   1. Tokenize into words, dropping punctuation, numbers and URLs; lowercase;
#      remove `classification_stopwords`.
#   2. Build a document-feature matrix and keep features whose document
#      frequency, as a proportion of documents, lies between 0.005 and 0.99.
#   3. Drop a few remaining filler terms.
#   4. Compute the Jaccard similarity between every pair of documents and keep
#      the strict lower triangle (each pair once, no self-similarities).
#   5. Fit a Gaussian finite mixture model by the EM algorithm (Mclust) to
#      those similarity values.
#
# Note that the mixture model is fitted to pairwise similarities, so the
# resulting classification labels document pairs, not individual documents.
#
# Args:
#   texts: character vector with one document per element.
#
# Returns:
#   A list with elements `tokens`, `top_features` (the 40 features with the
#   highest document frequency after trimming, before step 3), `dfm`,
#   `similarity`, `jaccard` (one-column data frame), `fit` and
#   `classification` (one-column data frame).
classify_program_texts <- function(texts) {
  toks <- tokens(
    texts,
    what = "word",
    remove_punct = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE
  )
  toks <- tokens_tolower(toks)
  toks <- tokens_remove(toks, classification_stopwords)

  trimmed <- dfm_trim(
    dfm(toks),
    min_docfreq = 0.005, max_docfreq = 0.99,
    docfreq_type = "prop", verbose = TRUE
  )
  top_features <- topfeatures(trimmed, n = 40, scheme = "docfreq")
  cleaned <- dfm_remove(trimmed, c("así", "estudiantes", "área"))

  similarity <- textstat_simil(
    cleaned,
    margin = "documents", method = "jaccard"
  )
  similarity_matrix <- as.matrix(similarity)
  jaccard_pairs <- data.frame(
    jaccard = similarity_matrix[lower.tri(similarity_matrix, diag = FALSE)]
  )

  mixture_fit <- Mclust(jaccard_pairs)

  list(
    tokens = toks,
    top_features = top_features,
    dfm = cleaned,
    similarity = similarity,
    jaccard = jaccard_pairs,
    fit = mixture_fit,
    classification = data.frame(classification = mixture_fit$classification)
  )
}

# General Classification (All programs)
GeneralResults <- classify_program_texts(textos$text)
GeneralResults$top_features
summary(GeneralResults$fit)

# Public-Private Classification
OfficialP <- filter(textos, Sector == "Official")
OfficialResults <- classify_program_texts(OfficialP$text)
OfficialResults$top_features
summary(OfficialResults$fit)

PrivateP <- filter(textos, Sector == "Private")
PrivateResults <- classify_program_texts(PrivateP$text)
PrivateResults$top_features
summary(PrivateResults$fit)

# Objects this script has always left in the workspace, with the same final
# values as before: the generic names hold the results of the last (private)
# analysis. Per-analysis results are available in GeneralResults,
# OfficialResults and PrivateResults.
tokens <- GeneralResults$tokens
tokens.O <- OfficialResults$tokens
tokens.P <- PrivateResults$tokens
dfm <- PrivateResults$dfm
Programs <- PrivateResults$similarity
ProgramsDF <- PrivateResults$jaccard
fit <- PrivateResults$fit
Classification <- PrivateResults$classification