I'm trying to build a network that models a Continuous Bag of Words model in NumPy, and I'm trying to figure out where I made a mistake here. My network doesn't seem to be learning at all and predicts that every sentence contains the same word. Of course there is something that I did wrong, but I cannot say exactly where.
So my approach is as follows:

Tokenize the corpus into two vectors (`contexts` and `targets`). Each set of elements in a context window of the `contexts` vector is assigned to one entry in `targets`. These are my `X_train` and `Y_train`. For example, with a context size of 2, in the corpus ("Hello", "how", "are", "you", "do"), the words ("Hello", "how", "you", "do") are assigned to the target word "are", because it is the middle word. The middle word is the target, and the words within ±CONTEXT_SIZE of it belong to `contexts`.

Get the indices of the tokens from the corpus and pass them to the embedding layer of shape `(vocab_size, embedding_size)`. The embedding layer is a lookup table mapping each of the `vocab_size` tokens to an `embedding_size`-dimensional vector, so I look up the embedding vectors of the context words and average them, giving `word_vec`. This vector has shape `(1, embedding_size)` (a row vector).

Now I pass `word_vec` (the averaged embedding vector) into a hidden layer of shape `(embedding_size, vocab_size)`. It has a `tanh` activation function with a bias, so I normalize my vector after multiplying it by `W`. The output vector of this layer has shape `(1, vocab_size)`, and I simply pass it on to my softmax function.

After getting the softmax output, I find the index with the maximum probability and predict that word as the output. Should be okay, right?
My gradients seem to tend to explode, but the clipping doesn't seem to fix anything. I will post the full code below. I would be grateful if someone could point me to the right steps. Was it an initialization error or did I do something wrong elsewhere?
import numpy as np
import re
from nltk.corpus import brown
import matplotlib.pyplot as plt
class CBOW:
    """Continuous Bag of Words model implemented with plain NumPy.

    The model averages the embeddings of the words in a context window,
    passes the average through one dense layer with a tanh activation,
    and predicts the center word with a softmax over the vocabulary.
    """

    def __init__(self, contexts, targets, vocab_size, embedding_size,
                 learning_rate, context_size=3):
        # Small random initialization keeps the tanh pre-activations near zero.
        self.embedding_matrix = np.random.randn(vocab_size, embedding_size) * 0.01
        self.num_classes = vocab_size
        self.W = np.random.randn(embedding_size, self.num_classes) * 0.01
        self.bias = np.zeros((1, self.W.shape[1]))
        self.word_vec = None
        self.gradients = dict()
        self.cache_contexts = None  # context indices cached for the update step
        self.cache_Z = None  # dense-layer output cached for backprop
        self.learning_rate = learning_rate
        self.contexts = contexts
        self.targets = targets
        self.context_width = 2 * context_size

    def Embedding(self, contexts, train):
        # contexts: (1, 2*context_size) array of token indices.
        # Average the context-word embeddings -> (1, embedding_size).
        word_vec = np.mean(self.embedding_matrix[contexts, :], axis=1)
        assert word_vec.shape == (1, self.embedding_matrix.shape[1])
        self.word_vec = word_vec

    def tanh(self, inp):
        return np.tanh(inp)

    def tanh_delta(self, inp):
        # Derivative of tanh expressed in terms of its OUTPUT: 1 - tanh^2.
        return 1 - inp * inp

    def Dense(self, word_vec):
        # Affine transform: (1, embedding_size) @ (embedding_size, vocab) + bias.
        op = np.dot(word_vec, self.W) + self.bias
        # NOTE(review): this "normalization" divides by a scalar built from the
        # max activation; it is unusual and worth revisiting — confirm intent.
        op = op / (np.sum(1e-2 + np.max(op), keepdims=True))
        op = self.tanh(op)
        assert op.shape == (1, self.W.shape[1])
        return op

    def softmax(self, inp):
        assert inp.shape == (1, self.num_classes)
        # Subtract the max for numerical stability before exponentiating.
        max_val = np.max(inp, axis=1)
        softmax_out = np.exp(inp - max_val) / (
            np.sum(np.exp(inp - max_val), keepdims=True) + 1e-3)
        return softmax_out

    def forward(self, contexts, train):
        """Run one forward pass; returns softmax probabilities (1, vocab)."""
        self.Embedding(contexts, train)
        Z = self.Dense(self.word_vec)
        self.cache_Z = Z
        # Fixed: was a bare `softmax(Z)`, which raises NameError — the
        # function is a method on this class.
        softmax_out = self.softmax(Z)
        self.cache_contexts = contexts
        return softmax_out

    def cross_entropy(self, softmax_out, Y):
        """Cross-entropy loss against the one-hot target index Y."""
        target = np.zeros(softmax_out.shape)
        target[:, np.squeeze(Y)] = 1
        loss = -(target * np.log(softmax_out + 1e-3))
        # Fixed: sum over the vocabulary; taking np.max of the per-class
        # terms under-reports the loss.
        return np.sum(loss)

    def backward(self, Y, softmax_out):
        """Backprop through softmax, tanh, and the dense layer.

        Z = tanh(O), O = word_vec @ W + bias, softmax_out = softmax(Z).
        """
        Z = self.cache_Z
        target = np.zeros(softmax_out.shape)
        target[:, np.squeeze(Y)] = 1
        # dL/dZ for softmax + cross-entropy collapses to (probs - one_hot).
        dL_dZ = softmax_out - target
        assert dL_dZ.shape == (1, self.num_classes)
        self.gradients['dL_dZ'] = dL_dZ
        # dZ/dO = 1 - tanh^2(O) = 1 - Z*Z
        dZ_dO = self.tanh_delta(Z)
        assert dZ_dO.shape == (1, self.W.shape[1])  # (1, num_classes)
        dL_dO = dL_dZ * dZ_dO
        assert dL_dO.shape == (1, self.W.shape[1])
        self.gradients['dL_dO'] = dL_dO
        # dL/dW = word_vec^T @ dL/dO -> (embedding_size, num_classes)
        dO_dW = self.word_vec
        dL_dW = np.dot(dO_dW.T, dL_dO)
        assert dL_dW.shape == self.W.shape
        self.gradients['dL_dW'] = dL_dW
        # dL/d(word_vec) = dL/dO @ W^T -> (1, embedding_size)
        dL_dword_vec = np.dot(dL_dO, self.W.T)
        assert dL_dword_vec.shape == self.word_vec.shape
        self.gradients['dL_dword_vec'] = dL_dword_vec
        # dO/d(bias) is the identity, so dL/d(bias) = dL/dO.
        dL_dbias = dL_dO
        assert dL_dbias.shape == self.bias.shape
        self.gradients['dL_dbias'] = dL_dbias
        # Clip all gradients to a symmetric range to fight explosion.
        for grad in self.gradients:
            self.gradients[grad] = np.clip(self.gradients[grad], -500, 500)

    def update(self):
        """Gradient-descent step. Fixed: use `-=`, not `=` — plain
        assignment overwrote the parameters with the (scaled) gradients,
        which is why the network never learned."""
        contexts = self.cache_contexts
        dL_dword_vec = self.gradients['dL_dword_vec']
        # NOTE(review): fancy-indexed in-place update applies only once per
        # duplicated index; np.subtract.at would accumulate — confirm intent.
        self.embedding_matrix[contexts] -= self.learning_rate * dL_dword_vec
        self.W -= self.learning_rate * self.gradients['dL_dW']
        self.bias -= self.learning_rate * self.gradients['dL_dbias']

    def train(self, epochs):
        """Train with SGD over shuffled samples; plots the loss curve."""
        losses = []
        X = self.contexts
        Y = self.targets
        vocab_size = self.num_classes
        context_width = Y.shape[1]
        for epoch in range(epochs):
            epoch_loss = 0
            # NOTE(review): uses the global CONTEXT_SIZE rather than
            # self.context_width — confirm they always agree.
            factor = 2 * CONTEXT_SIZE
            inds = list(range(0, context_width))
            np.random.shuffle(inds)
            print(f"Item #{inds[0]}")
            for i in inds:
                # Each sample: `factor` context indices and one target index.
                X_item = X[:, i * factor:(i + 1) * factor]
                Y_item = Y[:, i:i + 1]
                softmax_out = self.forward(X_item, train=True)
                self.backward(Y_item, softmax_out)
                self.update()
                loss = self.cross_entropy(softmax_out, Y_item)
                epoch_loss += np.squeeze(loss)
            losses.append(epoch_loss)
            if epoch:
                print(f"Loss after epoch #{epoch}: {epoch_loss}")
        plt.plot(np.arange(epochs), losses)
        plt.xlabel('# of epochs')
        plt.ylabel('cost')
Initialize and train the network with:
# Build the CBOW model on the prepared corpus and run 100 training epochs.
model = CBOW(
    contexts,
    targets,
    vocab_size,
    embedding_size=50,
    learning_rate=0.05,
)
model.train(100)
Here is the code for preprocessing the corpus:
def tokenize(text):
    """Lowercase `text` and return its word tokens.

    A token is a run of letters optionally extended by word characters,
    carets, or apostrophes (so contractions like "don't" stay intact).
    """
    pattern = re.compile(r"[A-Za-z]+[\w^']*|[\w^']*[A-Za-z]+[\w^']*")
    try:
        return pattern.findall(text.lower())
    except AttributeError:
        # `text` has no .lower() (e.g. not a str); match it as-is instead
        # of swallowing every exception with a bare `except`.
        return pattern.findall(text)
def generate_mapping(tokens):
    """Generate the word_to_idx and idx_to_word mappings.

    NOTE(review): iterating a set gives a hash-dependent order, so the
    indices are not reproducible across interpreter runs.
    """
    word_to_idx, idx_to_word = {}, {}
    for idx, token in enumerate(set(tokens)):
        word_to_idx[token] = idx
        idx_to_word[idx] = token
    return word_to_idx, idx_to_word
def get_training_data(tokens, word_to_ix, CONTEXT_SIZE):
    """Build (center-word, context-word) index arrays from `tokens`.

    Returns X of shape (1, n) with one center-word index per position,
    and Y of shape (1, 2*CONTEXT_SIZE*n) with the surrounding context
    indices, grouped window-by-window.
    """
    X, Y = [], []
    for i in range(CONTEXT_SIZE, len(tokens) - CONTEXT_SIZE):
        for j in range(i - CONTEXT_SIZE, i + CONTEXT_SIZE + 1):
            if j == i:
                continue  # skip the center word itself
            Y.append(word_to_ix[tokens[j]])
        X.append(word_to_ix[tokens[i]])
    X, Y = np.array(X), np.array(Y)
    # Promote to row vectors: X -> (1, n), Y -> (1, 2*CONTEXT_SIZE*n).
    X, Y = X[np.newaxis, :], Y[np.newaxis, :]
    assert 2 * CONTEXT_SIZE * X.shape[1] == Y.shape[1]
    return X, Y
# Number of words taken on each side of the center word.
WINDOW_SIZE = 3
CONTEXT_SIZE = 3
# Taken from the PyTorch tutorial section on Word Embeddings
corpus = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells."""
# Tokenize the corpus and build the word <-> index mappings.
corpus_tokens = tokenize(corpus)
word_to_idx, idx_to_word = generate_mapping(corpus_tokens)
# get_training_data returns (center-word indices, context-word indices),
# so the center words are the targets and the windows are the contexts.
targets, contexts = get_training_data(corpus_tokens, word_to_idx, WINDOW_SIZE)
# Vocabulary size = number of distinct tokens.
vocab_size = len(set(corpus_tokens))
Here is the code for testing on the same corpus by splitting it into tokens:
def get_max_idx(inp):
    """Return the row-argmax value that occurs most often in `inp`."""
    # BUG FIX: the original read the global `probs` instead of the
    # parameter `inp`, so every call looked at the same array.
    c = np.array(np.unique(np.argmax(inp, axis=1), return_counts=True))
    # c[0] holds the distinct argmax indices, c[1] their counts.
    idx = np.argmax(c[1, :])
    return c[0, idx]
def get_index_of_max(inp):
    """Index of the largest entry along each row of `inp`."""
    return inp.argmax(axis=1)
def get_max_prob_result(inp, ix_to_word):
    """Map the most frequent predicted index in `inp` back to its word."""
    return ix_to_word[get_max_idx(inp)]
def make_context_vector(context, word_to_idx):
    """Row vector (1, n) of token indices for the given context string."""
    return np.array([[word_to_idx[word] for word in tokenize(context)]])
def generate_model_probs(model, context_vector, train):
    """Run a forward pass of `model` and return its softmax output."""
    return model.forward(context_vector, train)
def test(model, corpus, word_to_idx, idx_to_word):
    """Predict the center word for every context window in `corpus`.

    Uses the global CONTEXT_SIZE; prints the context, the predicted
    word, and the raw probabilities for each window.
    """
    corpus_tokens = tokenize(corpus)
    corpus_size = len(corpus_tokens)
    for i in range(0 + CONTEXT_SIZE, corpus_size - CONTEXT_SIZE):
        # The context is the window around position i, excluding i itself.
        word = " ".join(corpus_tokens[j]
                        for j in range(i - CONTEXT_SIZE, i + CONTEXT_SIZE + 1)
                        if j != i)
        print("Word:" + word)
        context_vector = make_context_vector(word, word_to_idx)
        probs = generate_model_probs(model, context_vector, train=False)
        print(f"Prediction: {get_max_prob_result(probs, idx_to_word)}")
        print(probs)