°øÀ¯ÀÚ·á HOME > ÀÚ·á½Ç > °øÀ¯ÀÚ·á
[Á¤º¸±â¼úÈ°¿ë¿¬±¸] LDA in R Code #2
°ü¸®ÀÚ 16-06-06 16:28 2,350
   LDA¿¬½À.zip (149.8K) [62] DATE : 2016-06-06 16:28:56
# ---- Load files into a corpus ----
# Corpus(), VectorSource() and tm_map() used in this script come from the
# tm package, so attach it before first use.
library(tm)

# List the .txt files in the working directory. `pattern` is a regular
# expression, not a shell glob, so anchor on a literal ".txt" suffix; a
# glob-style "*.txt" can also match names that merely contain "txt".
filenames <- list.files(getwd(), pattern = "\\.txt$")
# Fail early with a clear message instead of erroring later on an empty
# corpus.
if (length(filenames) == 0L) {
  stop("No .txt files found in ", getwd(), call. = FALSE)
}
# Read each file into a character vector of its lines.
files <- lapply(filenames, readLines)
# Create the corpus from the list of per-file character vectors.
docs <- Corpus(VectorSource(files))
# Inspect a particular document in the corpus here if desired.
# ---- Preprocessing ----
# Transform to lower case.
docs <- tm_map(docs, content_transformer(tolower))

# Replace potentially problematic symbols with a space. The typographic
# characters are written as Unicode escapes so that they survive being
# saved or copied under a different file encoding.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "-")
docs <- tm_map(docs, toSpace, "\u2019") # right single quotation mark
docs <- tm_map(docs, toSpace, "\u2018") # left single quotation mark
docs <- tm_map(docs, toSpace, "\u2022") # bullet

# Remove punctuation.
docs <- tm_map(docs, removePunctuation)
# Strip digits.
docs <- tm_map(docs, removeNumbers)
# Remove standard English stopwords.
docs <- tm_map(docs, removeWords, stopwords("english"))
# Collapse the runs of whitespace left behind by the removals above.
docs <- tm_map(docs, stripWhitespace)
# Good practice: spot-check a document every now and then.
# ---- Stemming ----
# Reduce every word in the corpus to its stem.
docs <- tm_map(docs, stemDocument)

# Post-stemming fix-ups: 1) merge US and Australian spellings onto one
# stem, 2) repair general errors such as words that were run together.
# The table is applied in order, one gsub() pass over the corpus per
# entry (name = pattern, value = replacement).
# NOTE(review): "team-" looks unreachable, since hyphens were replaced
# by spaces and punctuation was removed earlier in the script; confirm.
docs <- local({
  stem_fixes <- c(
    "organiz" = "organ",
    "organis" = "organ",
    "andgovern" = "govern",
    "inenterpris" = "enterpris",
    "team-" = "team"
  )
  Reduce(
    function(corpus, stem) {
      tm_map(corpus, content_transformer(gsub),
             pattern = stem, replacement = stem_fixes[[stem]])
    },
    names(stem_fixes),
    docs
  )
})
# ---- Custom stopwords ----
# Define and eliminate all custom stopwords (given in stemmed form, since
# the corpus has already been stemmed at this point).
# NOTE(review): the list ended with a dangling comma and no closing
# parenthesis, so further entries may have been lost; extend as needed.
# NOTE(review): the two contraction entries begin with a left single
# quotation mark (U+2018) and end with a space; because quotes are
# replaced by spaces during preprocessing they probably never match.
# Consider plain "ve" / "re" instead; confirm the intent.
myStopwords <- c(
  "can", "say", "one", "way", "use",
  "post", "look", "right", "now", "think", "\u2018ve ",
  "\u2018re ", "anoth", "put", "set", "new", "good"
)
docs <- tm_map(docs, removeWords, myStopwords)
# Inspect a document as a check.
# ---- Document-term matrix ----
# Build the document-term matrix from the preprocessed corpus.
dtm <- DocumentTermMatrix(docs)
# Label the rows with the file names the documents were read from.
rownames(dtm) <- filenames
# Total count of every term across all documents (column sums of the
# dense matrix); length(freq) should equal the total number of terms.
freq <- colSums(as.matrix(dtm))
# Index vector that sorts the terms by descending frequency; freq[ord]
# lists all terms in decreasing order, e.g. for writing to disk.
ord <- order(freq, decreasing = TRUE)
# ---- Topic model ----
# Load the topic models library; LDA(), topics() and terms() used in this
# script come from it.
library(topicmodels)

# Parameters for Gibbs sampling.
burnin <- 4000
iter <- 2000
thin <- 500
# One integer seed per random start.
seed <- c(2003L, 5L, 63L, 100001L, 765L)
nstart <- 5
# With best = TRUE only the best of the nstart runs is kept.
best <- TRUE
# Number of topics.
k <- 5

# The sampler needs exactly one seed per start; check it here so that a
# mismatch is reported before the long-running fit begins.
stopifnot(length(seed) == nstart)

# Run LDA using Gibbs sampling (this step takes a long time).
ldaOut <- LDA(
  dtm, k,
  method = "Gibbs",
  control = list(
    nstart = nstart, seed = seed, best = best,
    burnin = burnin, iter = iter, thin = thin
  )
)
# ---- Results ----
# Documents to topics.
ldaOut.topics <- as.matrix(topics(ldaOut))
# Top 6 terms in each topic.
ldaOut.terms <- as.matrix(terms(ldaOut, 6))
# Probabilities associated with each topic assignment (one row per
# document, one column per topic).
topicProbabilities <- as.data.frame(ldaOut@gamma)

# NOTE(review): the bodies of the two anonymous functions below were
# missing from the source; they are reconstructed from the accompanying
# comments as ratios of ranked per-document topic probabilities. Confirm
# against the original script.

# Relative importance of the top 2 topics: for each document, the highest
# topic probability divided by the second highest.
topic1ToTopic2 <- lapply(seq_len(nrow(dtm)), function(x) {
  ranked <- sort(unlist(topicProbabilities[x, ]), decreasing = TRUE)
  ranked[[1]] / ranked[[2]]
})

# Relative importance of the second and third most important topics
# (requires at least three topics).
topic2ToTopic3 <- lapply(seq_len(nrow(dtm)), function(x) {
  ranked <- sort(unlist(topicProbabilities[x, ]), decreasing = TRUE)
  ranked[[2]] / ranked[[3]]
})

# Write to file.