Word embedding

Create word embeddings simply from scratch

September 19, 2021

We will use the following method to build simple word embeddings.

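We create a matrix that holds the co-occurrence statistics of every pair of words (here, the positive pointwise mutual information computed from windowed co-occurrence counts).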

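We factorize that matrix with a singular value decomposition (SVD) and use the left singular vectors, scaled by the singular values, as the word embeddings.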

import numpy as np
import scipy as sp
import pandas as pd
# Toy corpus: one whitespace-tokenised sentence per entry.
sentences = [
    "a dog is a sweet animal",
    "a cat is a mean beast",
    "a human is a different creature",
    "a cat is a nice pet",
    "a dog is a nice pet also",
]
from collections import defaultdict
from itertools import product, combinations

# (word_a, word_b) -> number of times word_b was seen inside word_a's window.
Nij_counts = defaultdict(int)

# Normaliser used to turn the counts into probabilities.
# NOTE(review): N starts at 1, so it ends one above the number of counted
# pairs. The tables printed further down rely on this value, so it is kept.
N = 1
# Only referenced by the commented-out `- np.log(k)` term of the PMI step.
# NOTE(review): the original comment called this "the window size", but the
# window is fixed by `start`/`stop` in the loop below.
k = 5
window_size = 2  # NOTE(review): not read by the loop below.

vocab = set()
for sentence in sentences:
    words = sentence.split(" ")
    # Every token belongs to the vocabulary, including words that never end
    # up in a counted pair.
    vocab.update(words)
    for idx_a, word_a in enumerate(words):
        # Window = previous word, the word itself, next word.
        # NOTE(review): for idx_a == 0, start is -1, which a slice reads as
        # "last element"; the slice is therefore empty for any sentence longer
        # than two words and the first word of a sentence records no
        # neighbours. The printed results were produced with this behaviour,
        # so it is kept; use max(idx_a - 1, 0) to count symmetrically.
        start = idx_a - 1
        stop = idx_a + 2
        for word_b in words[start:stop]:
            if word_a == word_b:
                # Skip the centre word itself (and identical neighbours).
                continue
            Nij_counts[(word_a, word_b)] += 1
            N += 1

# Marginal counts: how often each word shows up as the first (Ni) or the
# second (Nj) member of a counted pair.
Ni_counts = defaultdict(int)
Nj_counts = defaultdict(int)
for pair, pair_count in Nij_counts.items():
    first_word, second_word = pair
    Ni_counts[first_word] += pair_count
    Nj_counts[second_word] += pair_count

# Divide every count by N to obtain the marginal and joint probabilities.
Pi = dict((word, count / N) for word, count in Ni_counts.items())
Pj = dict((word, count / N) for word, count in Nj_counts.items())
Pij = dict((pair, count / N) for pair, count in Nij_counts.items())
# Positive PMI matrix: entry [i, j] is max(log(Pij / (Pi * Pj)), 0) for the
# i-th and j-th vocabulary words, and 0 for pairs that were never observed.
# Freeze the vocabulary order once so the matrix axes and the DataFrame labels
# line up; recent pandas versions also refuse a set as index/columns.
vocab_words = list(vocab)
word_index = {word: idx for idx, word in enumerate(vocab_words)}

pmi_matrix = np.zeros((len(vocab_words), len(vocab_words)))
# Only observed pairs can have a positive PMI. Visiting just those avoids
# evaluating log(0) (the "divide by zero" RuntimeWarning) for every other cell
# and gives the same matrix as clipping the -inf entries to 0 afterwards.
for (word_i, word_j), p_ij in Pij.items():
    if p_ij <= 0 or word_i not in word_index or word_j not in word_index:
        continue
    pmi = np.log(p_ij / (Pi[word_i] * Pj[word_j]))  #- np.log(k)
    pmi_matrix[word_index[word_i], word_index[word_j]] = max(pmi, 0.0)

pd.DataFrame(pmi_matrix, columns=vocab_words, index=vocab_words)
RuntimeWarning: divide by zero encountered in log
  pmi_matrix[i][j] = np.log( Pij.get((word_i, word_j), 0) / (Pi[word_i] * Pj[word_j] ))  #- np.log(k)
cat different animal dog sweet nice is beast pet creature also a mean human
cat 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.875469 0.000000 0.000000 0.000000 0.000000 0.470004 0.000000 0.000000
different 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.178054 0.000000 0.470004 0.000000 0.000000
animal 0.000000 0.000000 0.000000 0.000000 3.178054 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
dog 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.875469 0.000000 0.000000 0.000000 0.000000 0.470004 0.000000 0.000000
sweet 0.000000 0.000000 3.178054 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.470004 0.000000 0.000000
nice 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.079442 0.000000 0.000000 0.470004 0.000000 0.000000
is 1.568616 0.000000 0.000000 1.568616 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.470004 0.000000 1.568616
beast 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.178054 0.000000
pet 0.000000 0.000000 0.000000 0.000000 0.000000 2.079442 0.000000 0.000000 0.000000 0.000000 2.772589 0.000000 0.000000 0.000000
creature 0.000000 3.178054 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
also 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.772589 0.000000 0.000000 0.000000 0.000000 0.000000
a 0.000000 0.875469 0.000000 0.000000 0.875469 0.875469 0.875469 0.000000 0.000000 0.000000 0.000000 0.000000 0.875469 0.000000
mean 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.178054 0.000000 0.000000 0.000000 0.470004 0.000000 0.000000
human 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.875469 0.000000 0.000000 0.000000 0.000000 0.470004 0.000000 0.000000
# Factorise the PMI matrix: pmi_matrix == U @ diag(sigma) @ Vt.
U, sigma, Vt = np.linalg.svd(pmi_matrix)
# Broadcasting multiplies column c of U by sigma[c], i.e. U @ diag(sigma);
# row r is the embedding of the r-th word in vocab's iteration order.
word_embeddings = U * sigma
# Label the rows with a list: recent pandas versions refuse a set as index.
pd.DataFrame(word_embeddings, index=list(vocab))
0 1 2 3 4 5 6 7 8 9 10 11 12 13
cat -0.125528 -0.082352 -0.056639 -0.154977 1.770129e-16 -3.472768e-16 2.250913e-16 -5.898841e-17 0.103016 0.950212 -0.152147 0.030330 -5.896389e-17 -8.886491e-18
different -0.036187 -0.397648 -0.036706 -1.837233 -8.467333e-01 -5.539689e-01 -2.376266e+00 -2.507431e-01 -0.210186 -0.083805 0.012984 0.002184 1.223874e-32 6.412951e-34
animal -1.399474 0.009853 -1.144664 0.061518 -2.343343e+00 9.292952e-01 6.095900e-01 8.310343e-02 -0.019071 -0.265032 -0.152087 0.016544 -4.343979e-33 -1.361987e-33
dog -0.125528 -0.082352 -0.056639 -0.154977 -1.407845e-15 1.465704e-16 3.570005e-16 5.723916e-16 0.103016 0.950212 -0.152147 0.030330 -6.870092e-17 8.445844e-18
sweet -0.036187 -0.397648 -0.036706 -1.837233 7.382504e-02 -8.159933e-01 1.762162e+00 -1.719547e+00 -0.210186 -0.083805 0.012984 0.002184 1.219035e-32 1.377710e-33
nice -0.036590 -2.088265 0.044529 0.326219 1.017894e-15 -1.242717e-15 -1.230756e-15 1.167800e-15 0.001746 0.082020 -0.166661 -0.199703 4.333732e-31 -4.584136e-33
is -0.019859 -0.171200 -0.013028 -0.387229 -1.320726e-15 0.000000e+00 0.000000e+00 0.000000e+00 2.720756 -0.141082 0.018090 0.002996 -4.170220e-31 0.000000e+00
beast -1.399474 0.009853 -1.144664 0.061518 6.504474e-01 -2.190208e+00 1.499843e-01 1.220968e+00 -0.019071 -0.265032 -0.152087 0.016544 2.060855e-32 3.835476e-33
pet -2.170444 0.092111 2.695585 -0.024651 -8.828112e-16 -1.487032e-15 -1.830279e-15 2.331132e-15 -0.007100 -0.134905 -0.083034 0.009092 7.846944e-33 2.243644e-34
creature -1.399474 0.009853 -1.144664 0.061518 1.692895e+00 1.260913e+00 -7.595744e-01 -1.304071e+00 -0.019071 -0.265032 -0.152087 0.016544 -1.541832e-32 1.128522e-33
also -0.036888 -2.694710 0.065685 0.599222 1.951203e-15 -1.874111e-15 -2.216379e-15 1.708128e-15 -0.091009 -0.110411 0.133121 0.151153 -3.268027e-31 1.746002e-33
a -1.662965 0.007278 -0.615354 0.016051 1.023148e-15 1.197922e-15 -3.519035e-16 -9.126765e-16 0.017296 0.637811 0.526446 -0.059639 -8.140555e-34 9.449184e-34
mean -0.036187 -0.397648 -0.036706 -1.837233 7.729083e-01 1.369962e+00 6.141042e-01 1.970290e+00 -0.210186 -0.083805 0.012984 0.002184 -1.920898e-32 -7.791129e-35
human -0.125528 -0.082352 -0.056639 -0.154977 -8.785929e-16 8.522401e-16 4.452092e-16 -3.537999e-16 0.103016 0.950212 -0.152147 0.030330 1.276648e-16 4.406470e-19
# Per-word lookup tables for the two sides of the factorisation.
# Scale U once instead of recomputing U * sigma for every word.
scaled_U = U * sigma
# Row `index` of U * sigma is the embedding of the word on row `index` of the
# PMI matrix.
U_embeddings = {word: scaled_U[index, :] for index, word in enumerate(vocab)}
# The rows of Vt are the right singular vectors; the vector that belongs to
# the word on column `index` of the PMI matrix is column `index` of Vt
# (row `index` of V = Vt.T), not row `index` of Vt.
V_embeddings = {word: Vt[:, index] for index, word in enumerate(vocab)}
# Dot products between embeddings as a quick similarity check.
(U_embeddings['cat'] @ U_embeddings['dog'],
U_embeddings['cat'] @ U_embeddings['is'],
U_embeddings['cat'] @ U_embeddings['human'])
(0.987348921588194, 0.22090341150415482, 0.987348921588194)