공유자료 HOME > 자료실 > 공유자료
 
[정보기술활용연구] LDA에 의한 Topic Modeling R 코드
관리자 16-04-14 14:32 1,026
   Samsung.csv (2.8K) [60] DATE : 2016-04-14 14:32:56
## Dependencies -------------------------------------------------------------
## Install only the packages that are missing, then load the ones used below.
## (The original called install.packages() unconditionally on every run, which
## is slow, needs network access, and fails for "grid": grid ships with base R
## and is not on CRAN. Checking installed.packages() first avoids both issues.)
pkgs <- c("ggplot2", "grid", "plyr", "tm", "lda", "reshape", "ScottKnott")
missing_pkgs <- pkgs[!pkgs %in% rownames(installed.packages())]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(reshape)
library(ScottKnott)
library(lda)
library(tm)
 
## Data loading -------------------------------------------------------------
## Working directory must contain Samsung.csv; adjust the path as needed.
## NOTE(review): setwd() in scripts is fragile — prefer full paths in read.csv.
setwd("F:/")

## read.csv() already defaults to sep = ",", so the explicit sep is dropped.
dataValues <- read.csv("Samsung.csv")
dim(dataValues)
## Optional: shuffle rows (sample without replacement keeps all rows).
## dataValues <- dataValues[sample(nrow(dataValues), replace = FALSE), ]

## Text pre-processing ------------------------------------------------------
## VectorSource interprets each element of the 'text' column as one document.
## (The original also assigned CorpusObj <- dataValues$text first; that value
## was dead — immediately overwritten — so it is removed here.)
CorpusObj <- VectorSource(dataValues$text)
## Build a tm Corpus from the text vector.
CorpusObj <- Corpus(CorpusObj)
## BUG FIX: with tm >= 0.6, plain base functions such as tolower must be
## wrapped in content_transformer(); passing them bare makes tm_map return
## raw character vectors and corrupts the corpus. The old
## tm_map(CorpusObj, PlainTextDocument) workaround is obsolete and removed.
CorpusObj <- tm_map(CorpusObj, content_transformer(tolower)) # lower-case all text
CorpusObj <- tm_map(CorpusObj, removePunctuation)
CorpusObj <- tm_map(CorpusObj, removeNumbers)

## Remove English stopwords before building the LDA vocabulary.
CorpusObj <- tm_map(CorpusObj, removeWords, stopwords("english"))
## CorpusObj <- tm_map(CorpusObj, stemDocument, language = "english") # optional stemming
## CorpusObj <- tm_map(CorpusObj, stripWhitespace)                    # optional whitespace cleanup

### Fit LDA directly via the collapsed Gibbs sampler (faster than a DTM route).
## lexicalize() turns the documents into the index/vocab form the lda package needs.
corpusLDA <- lexicalize(CorpusObj)
## K = number of topics; vocab = corpusLDA$vocab is the word list.
## BUG FIX: burnin must be strictly smaller than num.iterations — the original
## used burnin = 9999 with only 1000 iterations, leaving zero post-burnin
## samples for the sampler's burnin-dependent outputs (document_expects).
ldaModel <- lda.collapsed.gibbs.sampler(
  corpusLDA$documents,
  K = 10,
  vocab = corpusLDA$vocab,
  burnin = 500,
  num.iterations = 1000,
  alpha = 1,
  eta = 0.1
)
## Top 5 words per topic, scored by a lift-style score rather than raw counts.
top.words <- top.topic.words(ldaModel$topics, 5, by.score = TRUE)
print(top.words)