LSTM을 이용한 게임평가점수 예측

2022. 9. 21. 20:28

728x90

프로젝트 개요

각종 미디어에 대한 평점을 매기는 사이트인 메타크리틱에서 게임관련 데이터를 사용
게임에 대한 평가(문자열)를 입력으로 하고 그에 대한 점수(숫자) 예측하는 모델
이 때, 점수는 일의 자리를 버리고 사용(67점과 63점을 동일한 target으로 사용)

integrated_txt.txt

0.05MB

데이터는 하나의 행에 2개의 정보가 들어간다. [평가, 점수]

평가와 점수는 Tab으로 분리되어 있다

각각의 행들은 Enter로 분리되어 있다.

In [ ]:

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# CUDA 기기가 존재한다면, 아래 코드가 CUDA 장치를 출력합니다:
print(device)

# cuda:0

In [ ]:

from google.colab import drive
drive.mount('/content/drive')

file_path = open('/content/integrated_txt.txt', 'r')
data_lst = []
for file in file_path:
    txt = file.split('\n')[0]
    data_lst.append(txt.split('\t'))

data_arr = np.array(data_lst)

X_data = data_arr[:, 0]
t_data = data_arr[:, 1]

print(f'데이터 개수 : {X_data.shape}')

import collections
target_count = collections.Counter(t_data)
target_count = [target_count[str(idx)] for idx in range(10)]
print('점수 분리 기준 : [ 0-9 | 10-19 | 20-29 | 30-39 | 40-49 | 50-59 | 60-69 | 70-79 | 80-89 | 90-100 ]')
print(f'레이블 별 데이터 개수 : {target_count}')

# Mounted at /content/drive
# 데이터 개수 : (263,)
# 점수 분리 기준 : [ 0-9 | 10-19 | 20-29 | 30-39 | 40-49 | 50-59 | 60-69 | 70-79 | 80-89 | 90-100 ]
# 레이블 별 데이터 개수 : [6, 22, 14, 33, 29, 29, 31, 31, 35, 33]

In [ ]:

from sklearn.model_selection import train_test_split

X_train, X_test, t_train, t_test = train_test_split(X_data, t_data, test_size=0.2)

sklearn 모듈을 이용하여 140개의 데이터를 마구잡이로 섞어준 뒤 비율에 맞게 Train data와 test 데이터로 분리
train_test_split은 numpy array가 입력되었을 때, 해당 array내의 순서를 섞어준 뒤 비율에 맞게 데이터를 분리하여줌(분리 비율 설정 가능)

In [ ]:

print(f'X train 데이터 길이 : {X_train.shape}, X test 데이터 길이 : {X_test.shape}')

# X train 데이터 길이 : (210,), X test 데이터 길이 : (53,)

In [ ]:

word_set = set([])
for idx in range(len(X_train)):
    tmp = set(X_train[idx].split())
    word_set = word_set | tmp

word2index = {tkn: i for i, tkn in enumerate(word_set, 1)}
word2index['<unk>'] = 0

index2word = {v: k for k, v in word2index.items()}

#target 설정용
one_hot_encoding = [0]*10

i = 5   #테스트용 변경가능 

print(index2word[i])
print(word2index[index2word[i]])
print(f'word set의 길이 : {len(word_set)} => X train({len(X_train)}개의 문장)속에 {len(word_set)}개의 단어가 있음')

# 3DS
# 5
# word set의 길이 : 2507 => X train(210개의 문장)속에 2507개의 단어가 있음

테스트 할 때, 테스트 데이터 속 단어가 훈련 데이터 내에 있다는 보장이 없기 때문에 unknown데이터를 추가해준다(7번 줄)

In [ ]:

def build_data(sentence, word2index):
    encoded = [word2index[token] if token in word2index else word2index['<unk>'] for token in sentence] # 각 문자를 정수로 변환하여 순서대로 저장, 만약에 단어가 word set에 없다면 ['unk']인 0을 저장
    input_seq = torch.LongTensor(encoded).unsqueeze(0)
    return input_seq

In [ ]:

i = 1 #테스트용 변경가능(제한 범위 : 0 ~ 데이터 개수)
sentence = X_data[i].split()
#10칸 짜리 리스트를 복사하여 target값에 해당하는 index값만 1로 변경(one hot encoding)
Y = one_hot_encoding.copy()
Y[int(t_data[i])] = 1
X, Y = build_data(sentence, word2index), torch.FloatTensor(Y).unsqueeze(0)
print(i, '번째 문장의 input sequence')
print(X, '\n')
print(i, '번째 평가의 target값')
print(Y)

# 1 번째 문장의 input sequence
# tensor([[ 812,  578, 2341, 1876,  643, 2357,  903, 1662, 1197, 2314, 1509,  604,
#          2078,  713, 2496,  405,  890, 1509,  311,  565,  890, 1509,  243, 1203,
#           485]]) 
# 
# 1 번째 평가의 target값
# tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [ ]:

decoded = ' '.join([index2word[token] for token in X.data.numpy()[0]])
print(decoded)

# Grand Theft Auto V for new-gen is without a doubt the best way to experience one of the biggest releases of the last half decade.

문장에 대한 인코딩이 정상적으로 이루어졌음을 확인할 수 있다.

In [ ]:

class LSTM(nn.Module):
    def __init__(self, n_layers, hidden_size, n_vocab, embed_dim, n_classes):
        super(LSTM, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.lstm = nn.LSTM(input_size=embed_dim, 
                            hidden_size=self.hidden_size, 
                            num_layers=self.n_layers, 
                            batch_first=True)
        self.fc = nn.Linear(self.hidden_size, n_classes)

    def forward(self, x):
        x = self.embed(x)
        _, (output, _) = self.lstm(x)
        output = output.view(-1, self.hidden_size)
        output = self.fc(output)  
        return output

In [ ]:

model = LSTM(1, 512, len(word2index), int(len(word2index)/2), 10)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [ ]:

for epoch in range(1, 300 + 1):
    loss_tot = 0
    #훈련 순서 무작위로 섞어주기
    idx = np.random.permutation(len(X_train))
    for i in idx:
        sentence = X_train[i].split()

        Y = one_hot_encoding.copy()
        Y[int(t_train[i])] = 1
        X, Y = build_data(sentence, word2index), torch.FloatTensor(Y).unsqueeze(0)

        X = X.to(device)
        Y = Y.to(device)

        optimizer.zero_grad
        output = model(X)
        loss = nn.functional.cross_entropy(output, Y)
        loss_tot += loss.item()
        loss.backward()
        optimizer.step()
    if epoch % 50 == 0:
        print(f'epcoh: {epoch}, loss mean: {loss_tot/len(X_train)}')

# epcoh: 50, loss mean: 6.463479482905851
# epcoh: 100, loss mean: 8.96197790957631
# epcoh: 150, loss mean: 5.028339070365543
# epcoh: 200, loss mean: 2.059221541313898
# epcoh: 250, loss mean: 15.931298896244593
# epcoh: 300, loss mean: 0.5032242803345287

In [ ]:

correct = 0
plus_minus_1 = 0
with torch.no_grad():
    for i, sentence in enumerate(X_test):

        Y = one_hot_encoding.copy()
        Y[int(t_test[i])] = 1
        X, Y = build_data(sentence.split(), word2index), torch.FloatTensor(Y).unsqueeze(0)

        X = X.to(device)
        output = model(X).cpu()
        loss = nn.functional.cross_entropy(output, Y)

        if int(t_test[i]) == np.argmax(output.data.numpy()): correct += 1
        elif int(t_test[i]) == np.argmax(output.data.numpy()) + 1 or int(t_test[i]) == np.argmax(output.data.numpy()) - 1: plus_minus_1 += 1
print(f'테스트 데이터 개수 : {X_test.shape} \n정확히 예측한 경우 : {correct}\n한 칸 벗어난 예측을 한 경우(ex, 예측=6, 타겟=7 또는 5) : {plus_minus_1}')

# 테스트 데이터 개수 : (53,) 
# 정확히 예측한 경우 : 9
# 한 칸 벗어난 예측을 한 경우(ex, 예측=6, 타겟=7 또는 5) : 12

저작자표시

'활동' 카테고리의 다른 글

디지털스마트부산아카데미 수료 및 수상 (0)	2022.11.26
x-corps 경진대회 수상 (0)	2022.05.21
교내 프로그래밍 경진대회 수상 (0)	2022.05.18

RUNnRUN 잡담

LSTM을 이용한 게임평가점수 예측

'활동' 카테고리의 다른 글

+ Recent posts

티스토리툴바