Rcambier’s Blog - Word embedding

We will use the following method to build simple word embeddings.

We create a matrix where we put the co-occurences of all the words.

We factorize that matrix.

import numpy as np
import scipy as sp
import pandas as pd

sentences = [
    "a dog is a sweet animal",
    "a cat is a mean beast", 
    "a human is a different creature",
    "a cat is a nice pet",
    "a dog is a nice pet also"
]

from collections import defaultdict
from itertools import product, combinations

Nij_counts = defaultdict(int)

N = 1
k = 5 # The window size
window_size = 2

vocab = set()
for sentence in sentences: 
    for idx_a, word_a in enumerate(sentence.split(" ")): 
        start = idx_a - 1 
        stop = idx_a + 2
        for word_b in sentence.split(" ")[start:stop]:
            if word_a == word_b:
              continue
            Nij_counts[(word_a, word_b)] += 1
            N += 1
            vocab.add(word_a)
            vocab.add(word_b)

Ni_counts = defaultdict(int)
Nj_counts = defaultdict(int)
for (i,j), N_ij in Nij_counts.items():
  Ni_counts[ i ] += N_ij
  Nj_counts[ j ] += N_ij


Pi = {k:v/N for k,v in Ni_counts.items()}
Pj = {k:v/N for k,v in Nj_counts.items()}
Pij = {k:v/N for k,v in Nij_counts.items()}

pmi_matrix = np.zeros((len(vocab), len(vocab)))
for i, word_i in enumerate(vocab): 
    for j, word_j in enumerate(vocab):
        pmi_matrix[i][j] = np.log( Pij.get((word_i, word_j), 0) / (Pi[word_i] * Pj[word_j] ))  #- np.log(k)

pmi_matrix[ pmi_matrix < 0] = 0
pd.DataFrame(pmi_matrix, columns=vocab, index=vocab)

RuntimeWarning: divide by zero encountered in log
  pmi_matrix[i][j] = np.log( Pij.get((word_i, word_j), 0) / (Pi[word_i] * Pj[word_j] ))  #- np.log(k)

	cat	different	animal	dog	sweet	nice	is	beast	pet	creature	also	a	mean	human
cat	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.875469	0.000000	0.000000	0.000000	0.000000	0.470004	0.000000	0.000000
different	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	3.178054	0.000000	0.470004	0.000000	0.000000
animal	0.000000	0.000000	0.000000	0.000000	3.178054	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
dog	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.875469	0.000000	0.000000	0.000000	0.000000	0.470004	0.000000	0.000000
sweet	0.000000	0.000000	3.178054	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.470004	0.000000	0.000000
nice	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.079442	0.000000	0.000000	0.470004	0.000000	0.000000
is	1.568616	0.000000	0.000000	1.568616	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.470004	0.000000	1.568616
beast	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	3.178054	0.000000
pet	0.000000	0.000000	0.000000	0.000000	0.000000	2.079442	0.000000	0.000000	0.000000	0.000000	2.772589	0.000000	0.000000	0.000000
creature	0.000000	3.178054	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
also	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.772589	0.000000	0.000000	0.000000	0.000000	0.000000
a	0.000000	0.875469	0.000000	0.000000	0.875469	0.875469	0.875469	0.000000	0.000000	0.000000	0.000000	0.000000	0.875469	0.000000
mean	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	3.178054	0.000000	0.000000	0.000000	0.470004	0.000000	0.000000
human	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.875469	0.000000	0.000000	0.000000	0.000000	0.470004	0.000000	0.000000

U, sigma, Vt = np.linalg.svd(pmi_matrix)
word_embeddings = U * sigma

pd.DataFrame(U * sigma, index=vocab)

	0	1	2	3	4	5	6	7	8	9	10	11	12	13
cat	-0.125528	-0.082352	-0.056639	-0.154977	1.770129e-16	-3.472768e-16	2.250913e-16	-5.898841e-17	0.103016	0.950212	-0.152147	0.030330	-5.896389e-17	-8.886491e-18
different	-0.036187	-0.397648	-0.036706	-1.837233	-8.467333e-01	-5.539689e-01	-2.376266e+00	-2.507431e-01	-0.210186	-0.083805	0.012984	0.002184	1.223874e-32	6.412951e-34
animal	-1.399474	0.009853	-1.144664	0.061518	-2.343343e+00	9.292952e-01	6.095900e-01	8.310343e-02	-0.019071	-0.265032	-0.152087	0.016544	-4.343979e-33	-1.361987e-33
dog	-0.125528	-0.082352	-0.056639	-0.154977	-1.407845e-15	1.465704e-16	3.570005e-16	5.723916e-16	0.103016	0.950212	-0.152147	0.030330	-6.870092e-17	8.445844e-18
sweet	-0.036187	-0.397648	-0.036706	-1.837233	7.382504e-02	-8.159933e-01	1.762162e+00	-1.719547e+00	-0.210186	-0.083805	0.012984	0.002184	1.219035e-32	1.377710e-33
nice	-0.036590	-2.088265	0.044529	0.326219	1.017894e-15	-1.242717e-15	-1.230756e-15	1.167800e-15	0.001746	0.082020	-0.166661	-0.199703	4.333732e-31	-4.584136e-33
is	-0.019859	-0.171200	-0.013028	-0.387229	-1.320726e-15	0.000000e+00	0.000000e+00	0.000000e+00	2.720756	-0.141082	0.018090	0.002996	-4.170220e-31	0.000000e+00
beast	-1.399474	0.009853	-1.144664	0.061518	6.504474e-01	-2.190208e+00	1.499843e-01	1.220968e+00	-0.019071	-0.265032	-0.152087	0.016544	2.060855e-32	3.835476e-33
pet	-2.170444	0.092111	2.695585	-0.024651	-8.828112e-16	-1.487032e-15	-1.830279e-15	2.331132e-15	-0.007100	-0.134905	-0.083034	0.009092	7.846944e-33	2.243644e-34
creature	-1.399474	0.009853	-1.144664	0.061518	1.692895e+00	1.260913e+00	-7.595744e-01	-1.304071e+00	-0.019071	-0.265032	-0.152087	0.016544	-1.541832e-32	1.128522e-33
also	-0.036888	-2.694710	0.065685	0.599222	1.951203e-15	-1.874111e-15	-2.216379e-15	1.708128e-15	-0.091009	-0.110411	0.133121	0.151153	-3.268027e-31	1.746002e-33
a	-1.662965	0.007278	-0.615354	0.016051	1.023148e-15	1.197922e-15	-3.519035e-16	-9.126765e-16	0.017296	0.637811	0.526446	-0.059639	-8.140555e-34	9.449184e-34
mean	-0.036187	-0.397648	-0.036706	-1.837233	7.729083e-01	1.369962e+00	6.141042e-01	1.970290e+00	-0.210186	-0.083805	0.012984	0.002184	-1.920898e-32	-7.791129e-35
human	-0.125528	-0.082352	-0.056639	-0.154977	-8.785929e-16	8.522401e-16	4.452092e-16	-3.537999e-16	0.103016	0.950212	-0.152147	0.030330	1.276648e-16	4.406470e-19

U_embeddings = {word: (U * sigma)[index, :] for index, word in enumerate(vocab)}
V_embeddings = {word: Vt[index, :] for index, word in enumerate(vocab)}

(U_embeddings['cat'] @ U_embeddings['dog'],
U_embeddings['cat'] @ U_embeddings['is'],
U_embeddings['cat'] @ U_embeddings['human'])

(0.987348921588194, 0.22090341150415482, 0.987348921588194)