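# 공유자료 HOME > 자료실 > 공유자료
# (Shared materials: HOME > Resource room > Shared materials)
#
# [정보기술활용연구] TF-IDF 및 코사인 유사문서 찾기 R 코드
# ([Information Technology Utilization Research] R code for TF-IDF and
#  cosine-similarity based similar-document search)
# 관리자 (administrator) 16-04-09 14:58 4,308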
# Build term-frequency (TF) and TF-IDF term-document matrices for three toy
# documents with the tm package, then print both as plain matrices.

# Install tm only when it is missing, so re-running the script does not
# reinstall the package on every execution.
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}
library(tm)

# The three example documents.
doc1 <- "The fox chases the rabbit with cabbage which men can eat, too"
doc2 <- "The rabbit ate the cabbage"
doc3 <- "The fox caught the rabbit"
doc.list <- list(doc1, doc2, doc3)
n.docs <- length(doc.list)
names(doc.list) <- paste0("doc", seq_len(n.docs))

# VectorSource is given a named character vector (one element per document)
# rather than a list.
my.corpus <- Corpus(VectorSource(unlist(doc.list)))
# Lower-case the text. Plain functions such as tolower must be wrapped in
# content_transformer() before being passed to tm_map(); the original script
# mapped PlainTextDocument over the corpus instead and listed this call as
# the alternative.
my.corpus <- tm_map(my.corpus, content_transformer(tolower))

# Preprocessing shared by both matrices. The option name is
# removePunctuation; the original spelled it "removePuctuation", which tm
# does not recognise, so punctuation (e.g. the comma in "eat,") was kept.
tdm_control <- list(
  removePunctuation = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE
)
TDM_Tf <- TermDocumentMatrix(
  my.corpus,
  control = c(tdm_control, list(weighting = weightTf))
)
TDM_TfIdf <- TermDocumentMatrix(
  my.corpus,
  control = c(tdm_control, list(weighting = weightTfIdf))
)

# Raw term frequencies (terms in rows, documents in columns).
m1 <- as.matrix(TDM_Tf)
m1
# TF-IDF weights.
m2 <- as.matrix(TDM_TfIdf)
m2
## m2 holds the TF-IDF values: for each term it shows how strongly that term
## acts as a keyword of each document.
 
# Compare toy document vectors with Euclidean distance and cosine similarity,
# first pairwise and then against a query vector.

# Euclidean distance between two numeric vectors of equal length.
# Smaller values mean the vectors are more similar.
euclidean_distance <- function(a, b) {
  # Reject mismatched lengths instead of letting R recycle silently.
  stopifnot(is.numeric(a), is.numeric(b), length(a) == length(b))
  sqrt(sum((a - b)^2))
}

# Cosine similarity between two numeric vectors of equal length.
# Larger values mean the vectors are more similar. An all-zero vector has
# norm 0, so the result is NaN in that case.
cosine_similarity <- function(a, b) {
  stopifnot(is.numeric(a), is.numeric(b), length(a) == length(b))
  sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
}

# Let the numbers in each doc be the normalized occurrence counts of
# particular words. These assignments reuse the names doc1..doc3.
doc1 <- c(0.1, 0.2, 0.2)
doc2 <- c(0.14, 0.0, 0.0)
doc3 <- c(0.0, 0.14, 0.11)

# Euclidean distances between each pair (smaller = more similar).
d_12 <- euclidean_distance(doc1, doc2)
d_23 <- euclidean_distance(doc2, doc3)
d_31 <- euclidean_distance(doc3, doc1)
d_12
d_23
d_31

# Cosine similarities between each pair (larger = more similar).
cos_12 <- cosine_similarity(doc1, doc2)
cos_23 <- cosine_similarity(doc2, doc3)
cos_31 <- cosine_similarity(doc3, doc1)
cos_12
cos_23
cos_31

# Pick the target document (the query) and score every document against it.
# Wrapping an assignment in parentheses also prints the assigned value.
query <- c(0.1, 0.2, 0.2)
(d_query1 <- euclidean_distance(query, doc1))
(d_query2 <- euclidean_distance(query, doc2))
(d_query3 <- euclidean_distance(query, doc3))
(cos_q1 <- cosine_similarity(query, doc1))
(cos_q2 <- cosine_similarity(query, doc2))
(cos_q3 <- cosine_similarity(query, doc3))
# In the end: the smaller d_query and the larger cos, the more similar the
# document is to the query.