Word2Vec Embedding Model Source Code
Date: 23.05.16
Writer: 9tailwolf : doryeon514@gm.gist.ac.kr
CBOW with PyTorch
Initial Skip-gram with PyTorch
Subsampling
Word2Vec with PyTorch
CBOW with PyTorch
Library
import torch
import torch.nn as nn
import torch.optim as optim

Continuous Bag-of-Words Model
class CBOW(nn.Module):
    def __init__(self, input_layer, hidden_layer1, hidden_layer2, output_layer):  # hidden_layer2 is unused
        super().__init__()
        self.E = nn.Embedding(input_layer, hidden_layer1)  # word embedding table
        self.W = nn.Linear(hidden_layer1, output_layer)    # projection to vocabulary scores
        self.b = nn.Parameter(torch.ones(output_layer))
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.E(x)    # (batch, context, hidden_layer1)
        x = x.sum(1)     # sum the context-word embeddings -> (batch, hidden_layer1)
        res = self.W(self.softmax(x)) + self.b
        return res
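A quick shape check (not part of the original write-up; sizes chosen arbitrarily) shows what the forward pass returns:

# Illustrative shape check for the CBOW module above.
dummy_model = CBOW(input_layer=10, hidden_layer1=3, hidden_layer2=3, output_layer=10)
dummy_batch = torch.LongTensor([[1, 2], [3, 4]])   # 2 samples, 2 context words each
print(dummy_model(dummy_batch).shape)              # torch.Size([2, 10]) -- one score per vocabulary word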
Tokenizer Function

def tokenizer(sentence):
    dict_number = {}   # word -> index
    dict_word = {}     # index -> word
    for s in sentence:
        for word in s.split():
            if word not in dict_number.keys():
                dict_number[word] = len(dict_number)
                dict_word[len(dict_number) - 1] = word
    return dict_number, dict_word
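For illustration (not in the original), running the tokenizer on a two-sentence toy list shows the two dictionaries it builds:

# Illustrative run of tokenizer() on a toy input.
toy_nums, toy_words = tokenizer(['i love you', 'you love me'])
print(toy_nums)   # {'i': 0, 'love': 1, 'you': 2, 'me': 3}
print(toy_words)  # {0: 'i', 1: 'love', 2: 'you', 3: 'me'}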
Making Data Function

def make_data(sentence, dict_number):
    X = []
    Y = []
    for sen in sentence:
        s = sen.split()
        X.append([dict_number[w] for w in s[:-1]])  # all words except the last form the context
        Y.append(dict_number[s[-1]])                # the last word is the target
    return X, Y
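Using the same toy input as above (again, only an illustrative sketch), make_data turns each sentence into one context/target pair:

# Illustrative run of make_data() on the toy input.
toy_nums, _ = tokenizer(['i love you', 'you love me'])
toy_X, toy_Y = make_data(['i love you', 'you love me'], toy_nums)
print(toy_X)  # [[0, 1], [2, 1]] -- context word indices
print(toy_Y)  # [2, 3]           -- target word indices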
Main Function : Data

sentences = [
'i love you',
'i hate them',
'i hit ball',
'i kill mouse',
'you love me',
'you hate them',
'you kill mouse',
'they like us',
'they are bad'
]

Main Function
dict_number, dict_word = tokenizer(sentences)
X,Y = make_data(sentences, dict_number)
model = CBOW(len(dict_number),3,3,len(dict_number))
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)
epochs = 10000
X,Y = torch.LongTensor(X),torch.LongTensor(Y)
for epoch in range(epochs):
    Y_pred = model(X)
    loss = loss_fn(Y_pred, Y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 1000 == 0:
        print(epoch + 1, loss.item())

Test
def test_model(model, dict_number, sentence):
    x = torch.LongTensor([[dict_number[i] for i in sentence.split()]])
    predict = model(x).data.max(1, keepdim=True)[1]   # index of the highest-scoring word
    print(sentence, dict_word[predict.squeeze().item()])

test_model(model, dict_number, 'you love')
# >>> you love me

Initial Skip-gram with PyTorch
Library
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
nltk.download('book', quiet=True)
from nltk.book import *

Skip-gram Model
class Skip_Gram(nn.Module):
    def __init__(self, input_layer, hidden_layer1, output_layer):
        super().__init__()
        self.E = nn.Embedding(input_layer, hidden_layer1)
        self.W = nn.Linear(hidden_layer1, output_layer)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.E(x)                  # (batch, hidden_layer1)
        res = self.W(self.softmax(x))  # scores over the vocabulary
        return res

Tokenizer Function
def tokenizer(sentence):
    dict_number = {}
    dict_word = {}
    for s in sentence:
        for word in s.split():
            if word not in dict_number.keys():
                dict_number[word] = len(dict_number)
                dict_word[len(dict_number) - 1] = word
    return dict_number, dict_word

Making Data Function
def make_data(sentence, dict_number, num):  # num is the context window size (maximum center-context distance)
    X = []
    Y = []
    for sen in sentence:
        s = list(sen.split())
        for i in range(1, num + 1):
            for wi in range(len(s) - i):
                # add each pair in both directions: center -> context and context -> center
                X.append(dict_number[s[wi]])
                Y.append(dict_number[s[wi + i]])
                X.append(dict_number[s[wi + i]])
                Y.append(dict_number[s[wi]])
    return X, Y
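As a sanity check (added for illustration), the pairs generated for a single three-word sentence with num=2 look like this:

# Illustrative pair generation for one sentence with a window of 2.
toy_nums, _ = tokenizer(['i love you'])              # {'i': 0, 'love': 1, 'you': 2}
toy_X, toy_Y = make_data(['i love you'], toy_nums, 2)
print(list(zip(toy_X, toy_Y)))
# [(0, 1), (1, 0), (1, 2), (2, 1), (0, 2), (2, 0)]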
Main Function : Data

emma_raw = nltk.corpus.gutenberg.raw('austen-emma.txt')
new_s = ""
for s in emma_raw:                         # strip punctuation and digits character by character
    if s not in '".,_;?!1234567890[]-':
        new_s += s
sentence = list(new_s.split('\n'))[:1000]  # keep the first 1000 lines

Main Function
dict_number, dict_word = tokenizer(sentence)
X,Y = make_data(sentence, dict_number,3)
model = Skip_Gram(len(dict_number),10,len(dict_number))
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.03)
epochs = 1000
X,Y = torch.LongTensor(X),torch.LongTensor(Y)
for epoch in range(epochs):
    Y_pred = model(X)
    loss = loss_fn(Y_pred, Y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(epoch + 1, loss.item())

Test
def test_model(model, dict_number, word):
    x = torch.LongTensor([dict_number[word]])
    predict = model(x).data.max(1, keepdim=True)[1]
    print(word, dict_word[predict.squeeze().item()])

test_model(model, dict_number, 'enough')
# >>> enough and

Subsampling
Library
!pip install d2l==1.0.0-alpha1.post0
import collections
import math
import os
import random
import torch
import matplotlib.pyplot as plt
from d2l import torch as d2l

Read Data
d2l.DATA_HUB['ptb'] = (d2l.DATA_URL + 'ptb.zip',
'319d85e578af0cdc590547f26231e4e31cdf1e42')
def read_ptb():
    """Load the PTB dataset into a list of text lines."""
    data_dir = d2l.download_extract('ptb')
    # Read the training set
    with open(os.path.join(data_dir, 'ptb.train.txt')) as f:
        raw_text = f.read()
    return [line.split() for line in raw_text.split('\n')]

sentences = read_ptb()
sentences = [[word for word in sentence if word != '<unk>'] for sentence in sentences]

Subsampling
$$P(w_i) = \max\left(1 - \sqrt{\frac{t}{f(w_i)}}, 0\right)$$
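Here $f(w_i)$ is the relative frequency of $w_i$ in the corpus, $t$ is a threshold ($10^{-4}$ in the code below), and $P(w_i)$ is the probability of discarding an occurrence of $w_i$. As a quick worked example (not in the original): for a word making up 1% of all tokens, $P(w_i) = 1 - \sqrt{10^{-4}/10^{-2}} = 0.9$, so roughly nine out of ten occurrences are dropped, while any word with $f(w_i) \le t$ is always kept.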
def subsampling(sentences):
    counter = collections.Counter([word for sentence in sentences for word in sentence])
    words_number = sum(counter.values())

    def probability(word, t=1e-4):
        # keep the word with probability sqrt(t / f(word)); frequent words are dropped more often
        return random.random() > 1 - math.sqrt(t / counter[word] * words_number)

    subsampled_sentences = [[word for word in sentence if probability(word)] for sentence in sentences]
    subsampled_counter = collections.Counter([word for sentence in subsampled_sentences for word in sentence])
    return subsampled_sentences, subsampled_counter, counter

subsampled_sentences, subsampled_counter, counter = subsampling(sentences)

Data Visualization
data = [(counter[word], subsampled_counter[word]) for word in counter]
data.sort(key=lambda x: x[0])
X, Y = [i[0] for i in data], [i[1] for i in data]
plt.scatter(X, Y, c='blue')
plt.xlabel('Vanilla Word Frequency')
plt.ylabel('Subsampled Word Frequency')
plt.title('Subsampled Word Frequency by Vanilla Word Frequency')
plt.show()

Word2Vec with PyTorch
Library and Data
!pip install d2l==1.0.0-alpha1.post0
from d2l import torch as d2l
import os
d2l.DATA_HUB['ptb'] = (d2l.DATA_URL + 'ptb.zip',
'319d85e578af0cdc590547f26231e4e31cdf1e42')
def read_ptb():
    """Load the PTB dataset into a list of text lines."""
    data_dir = d2l.download_extract('ptb')
    # Read the training set
    with open(os.path.join(data_dir, 'ptb.train.txt')) as f:
        raw_text = f.read()
    return [line.split() for line in raw_text.split('\n')]

sentences = read_ptb()
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import math
import collections
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")Skip-gram Model
class Skip_Gram(nn.Module):
    def __init__(self, input_layer, hidden_layer1, hidden_layer2):
        super().__init__()
        self.hidden_layer2 = hidden_layer2
        self.E = nn.Embedding(input_layer, hidden_layer1)
        self.W = nn.Linear(hidden_layer1, hidden_layer2)
        self.Q = nn.Linear(hidden_layer2, 1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.E(x)           # x holds (center, context) index pairs -> (batch, 2, hidden_layer1)
        x = x.sum(1)            # sum the two embeddings -> (batch, hidden_layer1)
        x = self.relu(self.W(x).view(-1, self.hidden_layer2))
        x = self.Q(x).view(-1)  # one logit per pair
        return self.sigmoid(x)  # probability that the pair co-occurs
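Unlike the earlier softmax model, this network scores a (center, context) index pair and is trained as a binary classifier against the 0/1 labels produced below. A quick shape check (illustrative only; sizes chosen arbitrarily):

# Illustrative shape check for the pair-scoring Skip_Gram module.
demo = Skip_Gram(input_layer=10, hidden_layer1=4, hidden_layer2=4)
pairs = torch.LongTensor([[1, 2], [3, 4]])   # two (center, context) pairs
print(demo(pairs).shape)                     # torch.Size([2]) -- one co-occurrence probability per pair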
Tokenizer Function

def tokenizer(sentence):
    dict_number = {}
    dict_word = {}
    for s in sentence:
        for word in s:               # sentences are already lists of words here
            if word not in dict_number.keys():
                dict_number[word] = len(dict_number)
                dict_word[len(dict_number) - 1] = word
    return dict_number, dict_word

Making Data Function
def make_data(sentence, dict_number, num):  # num is the context window size (maximum center-context distance)
    data_dict = {i: set() for i in dict_number.values()}
    X = []
    Y = []
    # positive pairs: words that co-occur within the window, labeled 1
    for s in sentence:
        for i in range(1, num + 1):
            for wi in range(len(s) - i):
                if dict_number[s[wi]] not in data_dict[dict_number[s[wi + i]]]:
                    X.append([dict_number[s[wi]], dict_number[s[wi + i]]])
                    Y.append(1)
                    data_dict[dict_number[s[wi]]].add(dict_number[s[wi + i]])
                    data_dict[dict_number[s[wi + i]]].add(dict_number[s[wi]])
    # negative pairs: random word pairs that never co-occurred, labeled 0, until the data doubles
    d_size = len(X) * 2
    while len(X) < d_size:
        w1, w2 = random.randrange(0, len(dict_number)), random.randrange(0, len(dict_number))
        if w1 not in data_dict[w2]:
            X.append([w1, w2])
            Y.append(0)
    return X, Y, data_dict
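For illustration (not in the original), on a single tiny sentence the function emits each unique in-window pair once with label 1, then pads the set with an equal number of random non-co-occurring pairs labeled 0:

# Illustrative run on one toy sentence; the 0-labeled pairs are random and vary between runs.
toy_nums, _ = tokenizer([['a', 'b', 'c', 'd']])
toy_X, toy_Y, _ = make_data([['a', 'b', 'c', 'd']], toy_nums, 2)
print(sum(toy_Y), len(toy_Y) - sum(toy_Y))   # 5 5 -- one random negative pair per positive pair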
Subsampling Function

def subsampling(sentences):
    counter = collections.Counter([word for sentence in sentences for word in sentence])
    words_number = sum(counter.values())

    def probability(word, t=1e-4):
        return random.random() > 1 - math.sqrt(t / counter[word] * words_number)

    subsampled_sentences = [[word for word in sentence if probability(word)] for sentence in sentences]
    subsampled_counter = collections.Counter([word for sentence in subsampled_sentences for word in sentence])
    return subsampled_sentences, subsampled_counter, counter

sentence, subsampled_counter, counter = subsampling(sentences)

Main Function
dict_number, dict_word = tokenizer(sentence)
X,Y,dict_check = make_data(sentence, dict_number, 3)
model = Skip_Gram(len(dict_number),100,100)
model = model.to(device)
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.03)
X = torch.LongTensor(X).to(device)
Y = torch.LongTensor(Y).to(device)
epochs = 1000
for epoch in range(epochs):
    Y_pred = model(X)
    loss = loss_fn(Y_pred.to(torch.float32), Y.to(torch.float32))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(epoch + 1, loss.item())

Test
def kNN(model, word, k):
    # find the k nearest words to `word` by Euclidean distance in the embedding space
    words = [(math.inf, None) for _ in range(k)]
    t = torch.LongTensor([dict_number[word]]).to(device)
    t = model.E(t)
    for w in dict_number.keys():
        if w != word:
            T = torch.LongTensor([dict_number[w]]).to(device)
            T = model.E(T)
            T = (T - t).tolist()
            res = 0
            for i in T[0]:
                res += i ** 2
            res **= 0.5
            if words[-1][0] > res:
                words.append((res, w))
                words.sort(key=lambda x: x[0])
                if len(words) > k:
                    words.pop()
    return words
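The original stops at the function definition. A call such as the one below (illustrative only; the query word must be in dict_number, and the neighbours depend on the trained weights) prints the k closest words with their distances:

# Illustrative usage (assumes 'company' survived subsampling and is in dict_number).
for dist, neighbour in kNN(model, 'company', 5):
    print(f'{neighbour}: {dist:.3f}')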