FastText Language Model Source Code

Date: 23.05.18

Writer: 9tailwolf (doryeon514@gm.gist.ac.kr)


Example: FastText using PyTorch


Library

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import nltk
nltk.download('book', quiet=True)  # includes the Gutenberg corpus used below

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

FastText Model

class FastText(nn.Module):
    def __init__(self, input_layer, hidden_layer1, output_layer):
        super().__init__()
        self.layer = hidden_layer1
        self.mode = True  # True: skip-gram input (1-D IDs), False: CBOW input (padded 2-D bags)
        self.E = nn.Embedding(input_layer, hidden_layer1)
        self.W = nn.Linear(hidden_layer1, output_layer)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        if self.mode:
            # skip-gram: one embedding per n-gram ID
            x = self.E(x)
            return self.W(self.softmax(x))
        else:
            # CBOW: sum the embeddings of all n-grams in each padded bag
            x = self.E(x)
            x = x.sum(1)
            return self.W(self.softmax(x))

    def mode_change(self):
        self.mode ^= True  # toggle between the skip-gram and CBOW forward passes
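
A quick shape check for the two branches, using the imports above (the sizes here are illustrative only, not the ones trained later):

model_demo = FastText(100, 30, 100)
skip_x = torch.LongTensor([1, 2, 3])                # skip-gram: a 1-D batch of n-gram IDs
print(model_demo(skip_x).shape)                     # torch.Size([3, 100])
model_demo.mode_change()                            # switch to the CBOW branch
cbow_x = torch.LongTensor([[1, 2, 3] + [0] * 12])   # CBOW: one padded 15-slot bag
print(model_demo(cbow_x).shape)                     # torch.Size([1, 100])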

Tokenizer Function

def tokenizer(sentence, num, lim_word=15):
    # Build ID <-> token dictionaries over whole words and character n-grams.
    # Index 0 is reserved for '.', which also serves as CBOW padding.
    dict_number = {'.': 0}
    dict_word = {0: '.'}
    for s in sentence:
        for word in s.split():
            # index the whole word (skipped if too long to fit lim_word slots)
            if word not in dict_number and len(word) < lim_word - 1:
                dict_number[word] = len(dict_number)
                dict_word[len(dict_number) - 1] = word

            # index every character n-gram of '<word>'
            fastword = '<' + word + '>'
            for i in range(num, len(fastword) + 1):
                spl = fastword[i - num:i]
                if spl not in dict_number:
                    dict_number[spl] = len(dict_number)
                    dict_word[len(dict_number) - 1] = spl

    return dict_number, dict_word
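
For instance, with num=3 the word 'where' is indexed both as a whole word and as its character trigrams, matching the running example from the FastText paper:

d_num, d_word = tokenizer(['where'], 3)
print(d_num)
# {'.': 0, 'where': 1, '<wh': 2, 'whe': 3, 'her': 4, 'ere': 5, 're>': 6}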

Data Generation Functions

def make_data_for_skip_gram(sentence, dict_number, num, n):
    # For each word, pair every character num-gram with the num-grams up to
    # n positions away, skip-gram style, in both directions.
    X = []
    Y = []
    for sen in sentence:
        for word in sen.split():
            w = '<' + word + '>'
            datas = [w[i - num:i] for i in range(num, len(w) + 1)]
            for i in range(1, n + 1):
                for wi in range(len(datas) - i):
                    # each (center, context) pair is added in both directions
                    X.append(dict_number[datas[wi]])
                    Y.append(dict_number[datas[wi + i]])
                    X.append(dict_number[datas[wi + i]])
                    Y.append(dict_number[datas[wi]])

    return X, Y
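
A small illustration of the pairs this produces: for the single word 'cat' with num=3 and n=3, the trigrams '<ca', 'cat', 'at>' are paired in both directions at offsets 1 and 2:

d_num, d_word = tokenizer(['cat'], 3)
X, Y = make_data_for_skip_gram(['cat'], d_num, 3, 3)
print([(d_word[x], d_word[y]) for x, y in zip(X, Y)])
# [('<ca', 'cat'), ('cat', '<ca'), ('cat', 'at>'), ('at>', 'cat'), ('<ca', 'at>'), ('at>', '<ca')]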

def make_data_for_CBOW(sentence, dict_number, num, lim_word=15):
    # For each word, the input is its character num-gram IDs padded to
    # lim_word slots, and the target is the whole-word ID.
    X = []
    Y = []
    for sen in sentence:
        for word in sen.split():
            if len(word) > lim_word - 2:
                continue  # too many n-grams to fit in lim_word slots
            w = '<' + word + '>'
            x = [dict_number[w[i - num:i]] for i in range(num, len(w) + 1)]
            for _ in range(len(x), lim_word):
                x.append(0)  # pad with the reserved '.' index
            X.append(x)
            Y.append(dict_number[word])

    return X, Y
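
One CBOW sample, for illustration: the input row holds the n-gram IDs of '<cat>' padded with the reserved index 0 up to lim_word slots, and the target is the whole-word ID:

d_num, d_word = tokenizer(['cat'], 3)
X, Y = make_data_for_CBOW(['cat'], d_num, 3)
print(X, Y)
# [[2, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] [1]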

Main Function: Data

# Load Emma, strip punctuation and digits, lowercase, and keep the first 500 lines.
emma_raw = nltk.corpus.gutenberg.raw('austen-emma.txt')
new_s = ""
for ch in emma_raw:
    if ch not in '".,_;?!1234567890[]-\'':
        new_s += ch
new_s = new_s.lower()
sentence = new_s.split('\n')[:500]
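
A peek at the cleaned data (the exact text depends on the NLTK corpus version):

print(sentence[0])
# e.g. 'emma by jane austen '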

Main Function

dict_number, dict_word = tokenizer(sentence, 3)
X, Y = make_data_for_skip_gram(sentence, dict_number, 3, 3)
model = FastText(len(dict_number), 30, len(dict_number))
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.08)

epochs = 5000
X, Y = torch.LongTensor(X), torch.LongTensor(Y)
X = X.to(device)
Y = Y.to(device)

model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    Y_pred = model(X)
    loss = loss_fn(Y_pred, Y)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 100 == 0:
        print(epoch + 1, loss.item())
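
The loop above trains only the skip-gram branch. A sketch of how the CBOW branch could be trained with the helpers defined earlier (same hyperparameters, purely illustrative):

model.mode_change()  # switch the forward pass to the CBOW branch
Xc, Yc = make_data_for_CBOW(sentence, dict_number, 3)
Xc, Yc = torch.LongTensor(Xc).to(device), torch.LongTensor(Yc).to(device)
for epoch in range(epochs):
    optimizer.zero_grad()
    loss = loss_fn(model(Xc), Yc)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 100 == 0:
        print(epoch + 1, loss.item())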

Test

def make_tensor(model, dict_number, word, num):
    # Build the FastText word vector: the sum of the embeddings of the word's
    # character num-grams (zero-padded to 15 slots, as in CBOW).
    word = '<' + word + '>'
    w = [dict_number[word[i - num:i]] for i in range(num, len(word) + 1)]
    w += [0 for _ in range(len(w), 15)]
    w = torch.LongTensor(w).to(device)
    with torch.no_grad():
        x = model.E(w)     # (15, hidden): one row per padded n-gram slot
        res = x.sum(0)     # sum over the n-gram axis -> word vector of size (hidden,)
    return res

def test_model(model, dict_number, word, word_c1, word_c2, num):
    # Compare word against two candidates by squared Euclidean distance
    # between their summed n-gram embeddings (smaller = more similar).
    w = make_tensor(model, dict_number, word, num)
    w1 = make_tensor(model, dict_number, word_c1, num)
    w2 = make_tensor(model, dict_number, word_c2, num)
    res1 = sum(i ** 2 for i in (w - w1).tolist())
    res2 = sum(i ** 2 for i in (w - w2).tolist())
    return res1, res2
    
test_model(model, dict_number, 'father', 'mother', 'friend', 3)
# Smaller squared distance means more similar, so 'father' should come out
# closer to 'mother' than to 'friend'. Exact values vary by run, e.g.:
# >>> (4637.682454764163, 6111.9375018630235)