FastText Language Model Source Code
Date: 23.05.18
Writer: 9tailwolf (doryeon514@gm.gist.ac.kr)
Example: FastText using PyTorch
Library
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
nltk.download('book', quiet=True)
from nltk.book import *
import torch.nn.functional as F

# select GPU when available; used by the .to(device) calls below
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
FastText Model
class FastText(nn.Module):
    def __init__(self, input_layer, hidden_layer1, output_layer):
        super().__init__()
        self.layer = hidden_layer1
        self.mode = True                                    # True: skip-gram input, False: CBOW input
        self.E = nn.Embedding(input_layer, hidden_layer1)   # subword embedding table
        self.W = nn.Linear(hidden_layer1, output_layer)     # projection back to the vocabulary
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        if self.mode:
            # skip-gram: x holds one subword index per example
            x = self.E(x)
            res = self.W(self.softmax(x))
            return res
        else:
            # CBOW: x holds a padded list of subword indices per word, summed into one vector
            x = self.E(x)
            x = x.sum(1)
            res = self.W(self.softmax(x))
            return res

    def mode_change(self):
        # toggle between skip-gram (True) and CBOW (False) input handling
        self.mode ^= True
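As a quick shape check (an illustrative sketch, not part of the original pipeline), the two modes expect differently shaped index tensors:

# Toy instance: vocabulary of 100 subwords, 30-dimensional embeddings.
toy = FastText(100, 30, 100)
skip_x = torch.randint(0, 100, (8,))      # skip-gram mode: one subword index per example
print(toy(skip_x).shape)                  # torch.Size([8, 100])
toy.mode_change()                         # switch to CBOW mode
cbow_x = torch.randint(0, 100, (8, 15))   # CBOW mode: 15 padded subword indices per word
print(toy(cbow_x).shape)                  # torch.Size([8, 100])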
Tokenizer Function
def tokenizer(sentence, num, lim_word=15):
    # build index dictionaries for full words and their character n-grams
    dict_number = {'.': 0}
    dict_word = {0: '.'}
    for s in sentence:
        for word in s.split():
            # skip words that are too long to fit in the CBOW padding below
            if word not in dict_number.keys() and len(word) < lim_word-1:
                dict_number[word] = len(dict_number)
                dict_word[len(dict_number)-1] = word
                # FastText subwords: n-grams of the word wrapped in '<' and '>'
                fastword = '<' + word + '>'
                for i in range(num, len(fastword)+1):
                    spl = fastword[i-num:i]
                    if spl not in dict_number.keys():
                        dict_number[spl] = len(dict_number)
                        dict_word[len(dict_number)-1] = spl
    return dict_number, dict_word
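For example (a toy run, not part of the original pipeline), with 3-grams the tokenizer indexes each word and its bracketed subwords; a subword that coincides with an already-indexed word is not added twice:

toy_numbers, toy_words = tokenizer(['her cat'], 3)
print(toy_numbers)
# {'.': 0, 'her': 1, '<he': 2, 'er>': 3, 'cat': 4, '<ca': 5, 'at>': 6}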
Making Data Function
def make_data_for_skip_gram(sentence, dict_number, num, n, lim_word=15):  # subwords of size num, window of n
    X = []
    Y = []
    for sen in sentence:
        s = list(sen.split())  # words
        for word in s:
            # skip words the tokenizer did not index (too long)
            if len(word) > lim_word-2:
                continue
            w = '<' + word + '>'
            datas = [w[i-num:i] for i in range(num, len(w)+1)]
            # every pair of subwords within distance n, in both directions
            for i in range(1, n+1):
                for wi in range(len(datas)-i):
                    X.append(dict_number[datas[wi]])
                    Y.append(dict_number[datas[wi+i]])
                    X.append(dict_number[datas[wi+i]])
                    Y.append(dict_number[datas[wi]])
    return X, Y
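Continuing with a toy corpus (illustration only), a single word produces symmetric subword pairs within the window:

toy_numbers, _ = tokenizer(['her'], 3)
toy_X, toy_Y = make_data_for_skip_gram(['her'], toy_numbers, 3, 3)
# '<her>' -> trigrams '<he' (2), 'her' (1), 'er>' (3); each pair within distance 3
# is emitted in both directions:
# toy_X = [2, 1, 1, 3, 2, 3]
# toy_Y = [1, 2, 3, 1, 3, 2]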
def make_data_for_CBOW(sentence, dict_number, num, lim_word=15):
    X = []
    Y = []
    for sen in sentence:
        s = list(sen.split())  # words
        for word in s:
            # skip words the tokenizer did not index (too long)
            if len(word) > lim_word-2:
                continue
            w = '<' + word + '>'
            # subword indices of the word, padded with 0 ('.') up to 15 slots
            x = [dict_number[w[i-num:i]] for i in range(num, len(w)+1)]
            for _ in range(len(x), 15):
                x.append(0)
            X.append(x)
            Y.append(dict_number[word])
    return X, Y
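Again on a toy corpus (illustration only), CBOW data pairs the padded subword list of a word with the index of the word itself:

toy_numbers, _ = tokenizer(['her'], 3)
toy_X, toy_Y = make_data_for_CBOW(['her'], toy_numbers, 3)
# toy_X = [[2, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]   # trigram ids padded with 0 ('.')
# toy_Y = [1]                                                # id of the full word 'her'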
Main Function : Data
emma_raw = nltk.corpus.gutenberg.raw('austen-emma.txt')

# strip punctuation and digits character by character, then lowercase
new_s = ""
for s in emma_raw:
    if s not in '".,_;?!1234567890[]-\'':
        new_s += s
new_s = new_s.lower()

# use the first 500 lines of the cleaned text as the corpus
sentence = list(new_s.split('\n'))[:500]
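The character-by-character filter simply drops punctuation and digits; for instance (toy string, illustration only):

cleaned = ""
for ch in 'Mr. Woodhouse, "poor Isabella!"':
    if ch not in '".,_;?!1234567890[]-\'':
        cleaned += ch
print(cleaned.lower())   # mr woodhouse poor isabella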
Main Function
dict_number, dict_word = tokenizer(sentence, 3)
X, Y = make_data_for_skip_gram(sentence, dict_number, 3, 3)

model = FastText(len(dict_number), 30, len(dict_number))
model = model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.08)
epochs = 5000

X, Y = torch.LongTensor(X), torch.LongTensor(Y)
X = X.to(device)
Y = Y.to(device)

model.train()
for epoch in range(epochs):
    Y_pred = model(X)
    loss = loss_fn(Y_pred, Y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch+1) % 100 == 0:
        print(epoch+1, loss.item())
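The loop above trains the default skip-gram mode. A minimal sketch of driving the CBOW path defined earlier instead, assuming the same model, dictionary, and hyperparameters (the original post does not run this step):

Xc, Yc = make_data_for_CBOW(sentence, dict_number, 3)
Xc, Yc = torch.LongTensor(Xc).to(device), torch.LongTensor(Yc).to(device)
model.mode_change()                      # forward() now sums the 15 padded subword slots
for epoch in range(epochs):
    pred = model(Xc)                     # (N, vocab) scores for the centre word
    loss = loss_fn(pred, Yc)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()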
Test
def make_tensor(model, dict_number, word, num):
    # build the FastText representation of a word: the sum of its subword embeddings
    word = '<' + word + '>'
    w = [dict_number[word[i-num:i]] for i in range(num, len(word)+1)]
    w += [0 for _ in range(len(w), 15)]          # pad with the '.' index up to 15 slots
    w = torch.LongTensor(w).to(device)
    x = model.E(w)                               # (15, hidden)
    res = x.sum(0)                               # sum over the subword slots -> (hidden,)
    return res

def test_model(model, dict_number, word, word_c1, word_c2, num):
    # squared Euclidean distance from `word` to each of the two candidate words
    w = make_tensor(model, dict_number, word, num)
    w1 = make_tensor(model, dict_number, word_c1, num)
    w2 = make_tensor(model, dict_number, word_c2, num)
    w1 = (w - w1).tolist()
    w2 = (w - w2).tolist()
    res1 = 0
    res2 = 0
    for i in w1:
        res1 += i**2
    for i in w2:
        res2 += i**2
    return res1, res2
test_model(model,dict_number,'father','mother','friend',3)
# >>> (4637.682454764163, 6111.9375018630235)
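Since the first value is smaller, the trained subword embeddings place 'father' closer to 'mother' than to 'friend', which is the expected relationship.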