```python
import numpy as np
import scipy as sp
import pandas as pd
```
We will use the following method to build simple word embeddings. We create a matrix holding the co-occurrence counts of all the word pairs, turn it into a pointwise mutual information (PMI) matrix, and then factorize that matrix.
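In formulas, the code below computes, for every pair of words $(i, j)$,

$$\mathrm{PMI}(i, j) = \log \frac{P(i, j)}{P(i)\,P(j)},$$

clamps negative entries to zero (so the matrix holds *positive* PMI values), and then factorizes the resulting matrix $M$ with an SVD, $M = U \Sigma V^\top$, taking the rows of $U \Sigma$ as the word vectors. (The $-\log k$ term that appears commented out in the code corresponds to the shifted-PPMI variant.)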
```python
sentences = [
    "a dog is a sweet animal",
    "a cat is a mean beast",
    "a human is a different creature",
    "a cat is a nice pet",
    "a dog is a nice pet also"
]
```
```python
from collections import defaultdict
from itertools import product, combinations

Nij_counts = defaultdict(int)  # co-occurrence counts N(i, j)
N = 1  # total pair count (initialized to 1, so it ends up one above the number of counted pairs)
k = 5  # only used in the commented-out "- np.log(k)" shift further below
window_size = 2  # note: unused; the loop below hardcodes a window of one word on each side

vocab = set()
for sentence in sentences:
    words = sentence.split(" ")
    for idx_a, word_a in enumerate(words):
        # take the words directly before and after word_a; when idx_a == 0
        # the start index is -1, so the slice comes back empty and the first
        # word of each sentence contributes no pairs (the tables below reflect this)
        start = idx_a - 1
        stop = idx_a + 2
        for word_b in words[start:stop]:
            if word_a == word_b:
                continue
            Nij_counts[(word_a, word_b)] += 1
            N += 1
            vocab.add(word_a)
            vocab.add(word_b)
```
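To see what that slice does (a quick illustration, not part of the pipeline): for a word in the middle of a sentence it grabs one neighbor on each side, while for the first word `start` is `-1`, which Python treats as counting from the end, so the slice is empty:

```python
words = "a dog is a sweet animal".split(" ")
print(words[1:4])   # window around "is" (idx_a = 2): ['dog', 'is', 'a']
print(words[-1:2])  # "window" around the first word (idx_a = 0): [] (empty slice)
```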
```python
Ni_counts = defaultdict(int)  # marginal counts N(i): pairs in which word i is the focus word
Nj_counts = defaultdict(int)  # marginal counts N(j): pairs in which word j is the context word
for (i, j), N_ij in Nij_counts.items():
    Ni_counts[i] += N_ij
    Nj_counts[j] += N_ij

Pi = {k: v / N for k, v in Ni_counts.items()}
Pj = {k: v / N for k, v in Nj_counts.items()}
Pij = {k: v / N for k, v in Nij_counts.items()}
```
```python
vocab = list(vocab)  # fix a stable word order; a plain set cannot be used to index the matrix and label the DataFrame

pmi_matrix = np.zeros((len(vocab), len(vocab)))
for i, word_i in enumerate(vocab):
    for j, word_j in enumerate(vocab):
        # pairs that never co-occur have P(i, j) = 0; np.log then warns and returns -inf
        pmi_matrix[i][j] = np.log( Pij.get((word_i, word_j), 0) / (Pi[word_i] * Pj[word_j] )) #- np.log(k)

pmi_matrix[pmi_matrix < 0] = 0  # positive PMI: clamp all negative entries (including -inf) to zero
pd.DataFrame(pmi_matrix, columns=vocab, index=vocab)
```
```
RuntimeWarning: divide by zero encountered in log
  pmi_matrix[i][j] = np.log( Pij.get((word_i, word_j), 0) / (Pi[word_i] * Pj[word_j] )) #- np.log(k)
```

The warning is expected: `np.log(0)` is evaluated for every pair that never co-occurs, yielding `-inf` entries that the clamping line then sets to zero.
|  | cat | different | animal | dog | sweet | nice | is | beast | pet | creature | also | a | mean | human |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| cat | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.875469 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.470004 | 0.000000 | 0.000000 |
| different | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.178054 | 0.000000 | 0.470004 | 0.000000 | 0.000000 |
| animal | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.178054 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| dog | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.875469 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.470004 | 0.000000 | 0.000000 |
| sweet | 0.000000 | 0.000000 | 3.178054 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.470004 | 0.000000 | 0.000000 |
| nice | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.079442 | 0.000000 | 0.000000 | 0.470004 | 0.000000 | 0.000000 |
| is | 1.568616 | 0.000000 | 0.000000 | 1.568616 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.470004 | 0.000000 | 1.568616 |
| beast | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.178054 | 0.000000 |
| pet | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.079442 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.772589 | 0.000000 | 0.000000 | 0.000000 |
| creature | 0.000000 | 3.178054 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| also | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.772589 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| a | 0.000000 | 0.875469 | 0.000000 | 0.000000 | 0.875469 | 0.875469 | 0.875469 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.875469 | 0.000000 |
| mean | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.178054 | 0.000000 | 0.000000 | 0.000000 | 0.470004 | 0.000000 | 0.000000 |
| human | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.875469 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.470004 | 0.000000 | 0.000000 |
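As a spot check (tracing the counting loop by hand): "sweet" and "animal" co-occur once, "sweet" is the focus word of 2 pairs, "animal" the context word of 1 pair, out of $N = 48$, so

$$\mathrm{PMI}(\text{sweet}, \text{animal}) = \log\frac{1/48}{(2/48)\,(1/48)} = \log 24 \approx 3.178054,$$

which matches the table entry above.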
```python
U, sigma, Vt = np.linalg.svd(pmi_matrix)
word_embeddings = U * sigma  # broadcasting scales column j of U by sigma[j], i.e. U @ np.diag(sigma)
pd.DataFrame(U * sigma, index=vocab)
```
|  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| cat | -0.125528 | -0.082352 | -0.056639 | -0.154977 | 1.770129e-16 | -3.472768e-16 | 2.250913e-16 | -5.898841e-17 | 0.103016 | 0.950212 | -0.152147 | 0.030330 | -5.896389e-17 | -8.886491e-18 |
| different | -0.036187 | -0.397648 | -0.036706 | -1.837233 | -8.467333e-01 | -5.539689e-01 | -2.376266e+00 | -2.507431e-01 | -0.210186 | -0.083805 | 0.012984 | 0.002184 | 1.223874e-32 | 6.412951e-34 |
| animal | -1.399474 | 0.009853 | -1.144664 | 0.061518 | -2.343343e+00 | 9.292952e-01 | 6.095900e-01 | 8.310343e-02 | -0.019071 | -0.265032 | -0.152087 | 0.016544 | -4.343979e-33 | -1.361987e-33 |
| dog | -0.125528 | -0.082352 | -0.056639 | -0.154977 | -1.407845e-15 | 1.465704e-16 | 3.570005e-16 | 5.723916e-16 | 0.103016 | 0.950212 | -0.152147 | 0.030330 | -6.870092e-17 | 8.445844e-18 |
| sweet | -0.036187 | -0.397648 | -0.036706 | -1.837233 | 7.382504e-02 | -8.159933e-01 | 1.762162e+00 | -1.719547e+00 | -0.210186 | -0.083805 | 0.012984 | 0.002184 | 1.219035e-32 | 1.377710e-33 |
| nice | -0.036590 | -2.088265 | 0.044529 | 0.326219 | 1.017894e-15 | -1.242717e-15 | -1.230756e-15 | 1.167800e-15 | 0.001746 | 0.082020 | -0.166661 | -0.199703 | 4.333732e-31 | -4.584136e-33 |
| is | -0.019859 | -0.171200 | -0.013028 | -0.387229 | -1.320726e-15 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.720756 | -0.141082 | 0.018090 | 0.002996 | -4.170220e-31 | 0.000000e+00 |
| beast | -1.399474 | 0.009853 | -1.144664 | 0.061518 | 6.504474e-01 | -2.190208e+00 | 1.499843e-01 | 1.220968e+00 | -0.019071 | -0.265032 | -0.152087 | 0.016544 | 2.060855e-32 | 3.835476e-33 |
| pet | -2.170444 | 0.092111 | 2.695585 | -0.024651 | -8.828112e-16 | -1.487032e-15 | -1.830279e-15 | 2.331132e-15 | -0.007100 | -0.134905 | -0.083034 | 0.009092 | 7.846944e-33 | 2.243644e-34 |
| creature | -1.399474 | 0.009853 | -1.144664 | 0.061518 | 1.692895e+00 | 1.260913e+00 | -7.595744e-01 | -1.304071e+00 | -0.019071 | -0.265032 | -0.152087 | 0.016544 | -1.541832e-32 | 1.128522e-33 |
| also | -0.036888 | -2.694710 | 0.065685 | 0.599222 | 1.951203e-15 | -1.874111e-15 | -2.216379e-15 | 1.708128e-15 | -0.091009 | -0.110411 | 0.133121 | 0.151153 | -3.268027e-31 | 1.746002e-33 |
| a | -1.662965 | 0.007278 | -0.615354 | 0.016051 | 1.023148e-15 | 1.197922e-15 | -3.519035e-16 | -9.126765e-16 | 0.017296 | 0.637811 | 0.526446 | -0.059639 | -8.140555e-34 | 9.449184e-34 |
| mean | -0.036187 | -0.397648 | -0.036706 | -1.837233 | 7.729083e-01 | 1.369962e+00 | 6.141042e-01 | 1.970290e+00 | -0.210186 | -0.083805 | 0.012984 | 0.002184 | -1.920898e-32 | -7.791129e-35 |
| human | -0.125528 | -0.082352 | -0.056639 | -0.154977 | -8.785929e-16 | 8.522401e-16 | 4.452092e-16 | -3.537999e-16 | 0.103016 | 0.950212 | -0.152147 | 0.030330 | 1.276648e-16 | 4.406470e-19 |
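The last couple of columns above are numerically zero (their singular values are essentially machine noise), so in practice one would keep only the first few columns. A minimal sketch of that truncation; the cutoff `d = 4` is an arbitrary choice for this toy corpus, not part of the original code, and the rest of this walkthrough keeps all 14 dimensions:

```python
d = 4  # hypothetical cutoff: keep the d strongest singular directions
truncated_embeddings = (U * sigma)[:, :d]
pd.DataFrame(truncated_embeddings, index=vocab)
```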
```python
# look up each word's row: the rows of U * sigma are the word vectors;
# a second dictionary is built from the rows of Vt (note: the context
# vectors are properly the columns of Vt, i.e. the rows of Vt.T)
U_embeddings = {word: (U * sigma)[index, :] for index, word in enumerate(vocab)}
V_embeddings = {word: Vt[index, :] for index, word in enumerate(vocab)}

# raw dot products between a few of the word vectors
(U_embeddings['cat'] @ U_embeddings['dog'],
 U_embeddings['cat'] @ U_embeddings['is'],
 U_embeddings['cat'] @ U_embeddings['human'])
```
```
(0.987348921588194, 0.22090341150415482, 0.987348921588194)
```
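These are raw dot products, so vectors with larger norms dominate the comparison. A common alternative (a sketch, not something the original code does) is cosine similarity, which normalizes the lengths away:

```python
def cosine(u, v):
    # cosine similarity: dot product of the two unit-normalized vectors
    return (u @ v) / (np.linalg.norm(u) * np.linalg.norm(v))

(cosine(U_embeddings['cat'], U_embeddings['dog']),
 cosine(U_embeddings['cat'], U_embeddings['is']),
 cosine(U_embeddings['cat'], U_embeddings['human']))
```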