[PYTHON] IMDB in Python
Administrator 19-10-23 15:33 2,918

import os


imdb_dir = 'D:/src/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')  # point at the training data inside the aclImdb folder


labels = []    # create two empty lists, labels and texts
texts = []


for label_type in ['neg', 'pos']:  # read the 12,500 pos + 12,500 neg reviews in the train folder
    dir_name = os.path.join(train_dir, label_type)  # enter the neg and pos folders in turn
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':  # keep only files whose last 4 characters are '.txt'
            f = open(os.path.join(dir_name, fname), encoding='utf8')
            texts.append(f.read())  # read the review text and append it to the texts list
            f.close()
            if label_type == 'neg':  # if the current folder is the neg folder,
                labels.append(0)     # store 0 in labels at the same position as the text in texts
            else:
                labels.append(1)     # if it is the pos folder, store 1 at the same position


# Check the data
print('texts 0:', texts[0])
print('texts len:', len(texts))


print('labels 0:', labels[0])
print('labels len:', len(labels))


# Sort the vocabulary used in the texts by frequency
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import math


validation_ratio = math.floor(len(texts) * 0.3)   # number of validation samples: 30% of the total (a count, despite the name)
max_words = 10000    # use only the 10,000 most frequent words in the dataset
maxlen = 200         # fix every review to a length of 200 words


tokenizer = Tokenizer(num_words=max_words)  # a Tokenizer that keeps only the 10,000 most frequent words
tokenizer.fit_on_texts(texts)               # build the word index
word_index = tokenizer.word_index           # grab just the word index


# Check the tokenizing results
print('Found %s unique tokens in total.' % len(word_index))
print('word_index type: ', type(word_index))
print('word_index: ', word_index)
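
# Note: word_index always contains every word seen during fitting; the
# num_words=10,000 limit is only applied later, by texts_to_sequences and
# texts_to_matrix. A quick peek at the highest-frequency entries (rank 1 = most frequent):
print(sorted(word_index.items(), key=lambda kv: kv[1])[:5])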


# Convert the words to numbers
# Only the top 10,000 (max_words) most frequent words are kept and mapped to their word_index numbers.
data = tokenizer.texts_to_sequences(texts)  # this is where the Tokenizer's settings take effect


print('data 0:', data[0])
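
# To sanity-check the mapping, the index can be inverted and a sequence decoded
# back into words (reverse_word_index is a helper introduced for this sketch):
reverse_word_index = dict((value, key) for (key, value) in word_index.items())
print(' '.join(reverse_word_index.get(i, '?') for i in data[0]))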


# Padding fixes the length of the data
# Sequences shorter than the given length are filled with 0s; longer ones are truncated
# This is useful whenever tensor sizes have to match
# It can be skipped if the length is fixed some other way, e.g. by one-hot encoding
# By default words are kept from the end of each sequence, and the nested list becomes a 2D tensor (a 2D NumPy array)
from keras.preprocessing.sequence import pad_sequences


sequences = [[1, 2, 3, 4, 5], [1, 2, 3, 4], [1]]    # nested list
padded = pad_sequences(sequences, maxlen=3)    # 2D tensor
print(padded)
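
# With the defaults (padding='pre', truncating='pre') the demo above prints:
# [[3 4 5]
#  [2 3 4]
#  [0 0 1]]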


data = pad_sequences(data, maxlen=maxlen)


print('data:', data)
print('data 0:', data[0])


# One-hot encoding turns every entry into just 0s and 1s
# One-hot encoding function
def to_one_hot(sequences, dimension):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results
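
# Note: Keras' Tokenizer has a built-in equivalent for this binary bag-of-words
# encoding (a sketch of an alternative, not what this post uses):
#   data = tokenizer.texts_to_matrix(texts, mode='binary')  # shape (num_texts, max_words)
# One subtle difference: the manual path runs on padded sequences, so the
# padding zeros also switch on column 0 of each padded row.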


# Convert data into vectors of 0s and 1s via one-hot encoding
# The labels are already tagged as 0 and 1, so just convert the list to a NumPy array; without float32 they would be stored as an integer type
data = to_one_hot(data, dimension=max_words)
labels = np.asarray(labels).astype('float32')


print('data:', data)


print('len(data[0]):', len(data[0]))     # dimension=10000, so every row has 10,000 entries
print('data [0][0:100]:', data[0][0:100])


## Prepare the train data and validation data
print('shape of the data tensor:', data.shape)     # (25000, 10000)
print('shape of the label tensor:', labels.shape)  # (25000,) -- data is now a 2D tensor and labels a 1D tensor


indices = np.arange(data.shape[0])   # generate the numbers 0 through 24,999
np.random.shuffle(indices)           # shuffle those numbers randomly
data = data[indices]                 # use them as indices to shuffle the 2D data tensor
labels = labels[indices]             # shuffle the labels into the same order


x_train = data[validation_ratio:]     # the last 70% of the data becomes the training set
y_train = labels[validation_ratio:]   # training labels (data and labels are in the same order)
x_val = data[:validation_ratio]       # the first 30% becomes the validation set
y_val = labels[:validation_ratio]     # validation labels
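
# Quick shape check (with 25,000 samples, floor(25000 * 0.3) = 7,500 go to validation):
print('x_train shape:', x_train.shape)  # expected: (17500, 10000)
print('x_val shape:', x_val.shape)      # expected: (7500, 10000)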


## Define the model
from keras.models import Sequential
from keras.layers import Dense


model = Sequential()    # define a new model


model.add(Dense(64, activation='relu', input_shape=(max_words,)))  # first hidden layer
model.add(Dense(32, activation='relu'))                            # second hidden layer
model.add(Dense(1, activation='sigmoid'))                          # output layer


model.summary()
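
# For reference, the parameter counts model.summary() reports:
#   Dense(64): 10,000 * 64 + 64 = 640,064
#   Dense(32):     64 * 32 + 32 =   2,080
#   Dense(1):       32 * 1 +  1 =      33   -> 642,177 trainable parameters in total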


# Compile the model
# RMSprop is used to update the weights; it adapts each step using a moving average
# Since the network outputs a probability, a crossentropy loss is the best fit
# Crossentropy measures the gap between the true probability distribution and the predicted one
# Because this is binary classification, binary_crossentropy is used
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
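
# A minimal NumPy illustration of what binary_crossentropy computes for one
# sample (y_true_demo and y_pred_demo are made-up values for this sketch):
#   BCE = -(y * log(p) + (1 - y) * log(1 - p))
y_true_demo, y_pred_demo = 1.0, 0.9
bce_demo = -(y_true_demo * np.log(y_pred_demo) + (1 - y_true_demo) * np.log(1 - y_pred_demo))
print('BCE demo:', bce_demo)  # about 0.105; shrinks as the prediction nears the true label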


# Train the model
# Train for 10 epochs with mini-batches of 32 samples; batch sizes are usually searched between 32 and 512
# Train on the training data and validate on the validation data
# The returned history holds a dictionary with everything recorded during training
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))
history_dict = history.history
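
# Which keys are available depends on the Keras version ('acc' in older
# releases, 'accuracy' in newer ones), so it is worth printing them:
print(history_dict.keys())  # e.g. dict_keys(['val_loss', 'val_acc', 'loss', 'acc'])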


# Save in the HDF5 (.h5) format, which can store multidimensional NumPy arrays
model.save('text_binary_model.h5')
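
# The saved model can be restored later as-is (a minimal sketch):
#   from keras.models import load_model
#   model = load_model('text_binary_model.h5')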


# Save the Tokenizer built on the top 10,000 words of the training data
# so that the same words are extracted from newly entered sentences as well
import pickle


with open('text_binary_tokenizer', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
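
# A minimal inference sketch that reuses the exact training preprocessing
# (new_review is a made-up example; in a fresh session, unpickle the tokenizer
# and load the saved model first):
new_review = ['this movie was a wonderful surprise']
seq = tokenizer.texts_to_sequences(new_review)  # same top-10,000 vocabulary
seq = pad_sequences(seq, maxlen=maxlen)         # same fixed length of 200
x_new = to_one_hot(seq, dimension=max_words)    # same 0/1 bag-of-words encoding
print('positive probability:', model.predict(x_new)[0][0])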


# Pull the accuracy and loss values out of the history dictionary
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']


print('Accuracy of each epoch:', acc)  # [0.79, 0.90, 0.93, 0.94, 0.96, 0.97, 0.98, 0.98, 0.98, 0.99]
epochs = range(1, len(acc) + 1)   # range(1, 11)


import matplotlib.pyplot as plt


# Validation accuracy comes out lower than training accuracy
# As the number of epochs grows, the model becomes overly tuned to the training data and actually gets worse on new data
plt.plot(epochs, acc, 'bo', label='Training Acc')
plt.plot(epochs, val_acc, 'b', label='Validation Acc')
plt.title('Training and validation accuracy')
plt.legend()


plt.figure()    # start a new figure

# The training loss keeps falling while the validation loss rises
# Loss means the error value: a distance-style measure of the gap between predictions and the answers
plt.plot(epochs, loss, 'bo', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()


plt.show()