#setwd( // path to your own working folder // )
# The order of the columns in the CSV matters for the smooth-spline method to work properly.
# Base location that later code prepends to the lexicon file names.
# Both values are placeholders to be replaced with a real folder; the text
# appears to be mis-encoded Korean for "location of the sentiment lexicon and
# csv file + '/'", i.e. the real value should end with a "/".
# The first placeholder is used when the current user is 'obkwon', the second
# one for everybody else.
path <- if (Sys.info()["user"] == "obkwon") {
  "// °¨¼º¾îÈÖ»çÀü°ú csv ÆÄÀÏÀÌ ÀÖ´Â À§Ä¡ + '/'"
} else {
  " // °¨¼º¾îÈÖ»çÀü°ú csv ÆÄÀÏÀÌ ÀÖ´Â À§Ä¡ + '/'"
}
# function score.sentiment
#
# Lexicon-based sentiment score for every text in `sentences`.
#
# Each text is cleaned (punctuation, digits and "http..." link remnants are
# dropped, control characters and repeated blanks become a single space),
# lower-cased and split into words on whitespace. The score of a text is the
# number of its words found in `pos.words` minus the number found in
# `neg.words`; a word occurring several times is counted every time.
#
# Parameters
#   sentences: vector of text to score (converted with as.character())
#   pos.words: vector of words of positive sentiment
#   neg.words: vector of words of negative sentiment
#   .progress: accepted so existing calls keep working; it is not used,
#              because scoring is done with base vapply()
#
# Returns
#   data frame with one row per element of `sentences` and the columns
#   `text` (the input as given) and `score` (integer).
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  # Lower-case `x`; if tolower() signals an error the text becomes NA, so it
  # simply matches no lexicon word instead of aborting the whole run.
  try_tolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }

  # Score one text.
  score_one <- function(sentence) {
    # remove punctuation (done before link removal, so that a URL collapses
    # into one "http..." token the link pattern below can match)
    sentence <- gsub("[[:punct:]]", "", sentence)
    # control characters include tab and newline, which separate words:
    # replace them with a space rather than gluing the neighbours together
    sentence <- gsub("[[:cntrl:]]", " ", sentence)
    # remove digits
    sentence <- gsub("\\d+", "", sentence)
    # remove html links
    sentence <- gsub("http\\w+", "", sentence)
    # collapse runs of blanks to ONE space (an empty replacement would merge
    # adjacent words), then trim both ends
    sentence <- gsub("[ \t]{2,}", " ", sentence)
    sentence <- gsub("^\\s+|\\s+$", "", sentence)
    sentence <- try_tolower(sentence)

    # split into words on any whitespace
    words <- strsplit(sentence, "\\s+")[[1]]

    # final score: positive hits minus negative hits
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  # vapply() guarantees exactly one integer per text, so `score` is a single
  # column even for empty input
  scores <- vapply(as.character(sentences), score_one, integer(1),
                   USE.NAMES = FALSE)

  # data frame with scores for each sentence
  data.frame(text = sentences, score = scores)
}
# Locations of the two lexicon files; `path` is prepended as-is, so it has to
# end with a separator already.
posText <- paste0(path, "positive-words.txt")
negText <- paste0(path, "negative-words.txt")
# Load the positive and negative word lists, one element per line of the file.
pos.words <- readLines(posText)
neg.words <- readLines(negText)
# Read the movie data from the current working directory.
movies <- read.csv("movies.csv", header = TRUE)
# For a quick check without a CSV file, this sample can be used instead:
## movies <- data.frame(name=c("movie1", "movie2"), review=c("You're awesome and I love you.", "I hate and hate and hate, So angry!"), score=c(0,0))
# Score every review, show the result, then store the scores in `movies`
# and show the updated table.
results <- score.sentiment(movies$review, pos.words, neg.words)
results
movies$score <- results$score
movies