공유자료 HOME > 자료실 > 공유자료
 
[Á¤º¸±â¼úÈ°¿ë¿¬±¸] LDA¿¡ ÀÇÇÑ Topic Modeling R ÄÚµå
관리자 16-04-14 14:32 2,543
   Samsung.csv (2.8K) [104] DATE : 2016-04-14 14:32:56
# Package setup.
# Install only the packages that are not already available, instead of
# re-installing everything on every run. "grid" ships with R itself, so it is
# always found locally and is never requested from CRAN.
required_packages <- c(
  "ggplot2", "grid", "plyr", "tm", "lda", "reshape", "ScottKnott"
)
package_is_installed <- vapply(
  required_packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
missing_packages <- required_packages[!package_is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Attach the packages used by the script.
library(reshape)
library(ScottKnott)
library(lda)
library(tm)
 
# NOTE(review): the working directory is hard-coded to a Windows drive root.
# It is kept so that relative paths keep resolving exactly as before; consider
# passing the data location in instead of calling setwd().
setwd("F:/")

# Load the input data. stringsAsFactors = FALSE keeps string columns as
# character vectors on R < 4.0, where read.csv() would otherwise convert them
# to factors.
dataValues <- read.csv("Samsung.csv", sep = ",", stringsAsFactors = FALSE)
dim(dataValues)
##dataValues=dataValues[sample(nrow(dataValues),replace=FALSE),] ## sample takes a sample of the specified size from the elements of x using either with or without replacement.

## Text pre-processing ----
## Build a tm corpus in which each element of the `text` column is one
## document. as.character() guards against the column having been read as a
## factor.
stopifnot("text" %in% names(dataValues))
CorpusObj <- VCorpus(VectorSource(as.character(dataValues$text)))

## tolower() is a base R function, not a tm transformation, so it is wrapped
## in content_transformer() to keep every element a text document.
CorpusObj <- tm_map(CorpusObj, content_transformer(tolower))
CorpusObj <- tm_map(CorpusObj, removePunctuation)
CorpusObj <- tm_map(CorpusObj, removeNumbers)
CorpusObj <- tm_map(CorpusObj, removeWords, stopwords("english"))
## Optional stemming step (disabled):
## CorpusObj <- tm_map(CorpusObj, stemDocument, language = "english")
## The removals above leave runs of blanks behind; collapse them so that
## splitting on a single space does not produce empty tokens.
CorpusObj <- tm_map(CorpusObj, stripWhitespace)

## LDA via the lda package ----
## lexicalize() takes a character vector with one string per document, so the
## cleaned text is pulled back out of the corpus first. trimws() removes the
## leading blank left when the first word of a document was deleted.
docText <- vapply(
  content(CorpusObj),
  function(doc) paste(as.character(doc), collapse = " "),
  character(1),
  USE.NAMES = FALSE
)
docText <- trimws(docText)
corpusLDA <- lexicalize(docText)

## Fix the RNG seed so the sampled topics are reproducible between runs.
set.seed(20160414)

## K is the number of topics; vocab is the word list built by lexicalize().
## NOTE(review): per the lda documentation, burn-in sweeps are run in addition
## to num.iterations, so this performs 9999 + 1000 sweeps -- confirm intended.
ldaModel <- lda.collapsed.gibbs.sampler(
  corpusLDA$documents,
  K = 10,
  vocab = corpusLDA$vocab,
  num.iterations = 1000,
  alpha = 1,
  eta = 0.1,
  burnin = 9999
)

## Show the five highest-scoring words of each topic.
top.words <- top.topic.words(ldaModel$topics, num.words = 5, by.score = TRUE)
print(top.words)