
Deep Learning

Assignment 5

The goal of this assignment is to train a skip-gram model over Text8 data.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

Download the data from the source website if necessary.

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)
Found and verified text8.zip
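
If you want an integrity check beyond the byte count, a cryptographic hash of the archive can be printed and compared against a digest you trust; the expected value isn't recorded in this notebook, so the sketch below only computes and prints it.

import hashlib

def file_sha256(path, chunk_size=1 << 20):
  """Stream the file through SHA-256 and return the hex digest."""
  h = hashlib.sha256()
  with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(chunk_size), b''):
      h.update(chunk)
  return h.hexdigest()

print(file_sha256(filename))  # compare against a digest you trust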

Read the data into a string.

In [3]:
def read_data(filename):
  """Extract the first file in the zip archive as a list of words."""
  with zipfile.ZipFile(filename) as f:
    return tf.compat.as_str(f.read(f.namelist()[0])).split()
  
words = read_data(filename)
print('Data size %d' % len(words))
Data size 17005207
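
As a quick sanity check (not part of the original assignment), the first few tokens can be printed to confirm that read_data returns a flat list of lowercase, whitespace-separated words:

print(type(words), len(words))
print(words[:8])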

Build the dictionary and replace rare words with UNK token.

In [4]:
vocabulary_size = 50000

def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words  # Hint to reduce memory.
Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156]
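
To confirm the encoding round-trips, the sample indices above can be decoded back through reverse_dictionary (an extra check, not in the original notebook); index 0 is the UNK bucket.

# Should print the first ten words of the corpus, with rare words shown as 'UNK'.
print([reverse_dictionary[i] for i in data[:10]])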

Function to generate a training batch for the skip-gram model.

In [5]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1 # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [ skip_window ]
    for j in range(num_skips):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels

print('data:', [reverse_dictionary[di] for di in data[:8]])

for num_skips, skip_window in [(2, 1), (4, 2)]:
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])
data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']

with num_skips = 2 and skip_window = 1:
    batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
    labels: ['as', 'anarchism', 'originated', 'a', 'as', 'term', 'of', 'a']

with num_skips = 4 and skip_window = 2:
    batch: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']
    labels: ['a', 'anarchism', 'originated', 'term', 'term', 'as', 'originated', 'of']
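
Reading the batch as consecutive groups of num_skips entries makes the structure clearer: each group shares one center word paired with num_skips sampled context words. A small grouping sketch, added here for illustration:

data_index = 0
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(0, 8, 2):
  # Each center word is paired with num_skips (= 2) of its context words.
  print(reverse_dictionary[batch[i]], '->',
        [reverse_dictionary[li] for li in labels[i:i+2, 0]])
data_index = 0  # reset so later cells see the same starting state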

Train a skip-gram model.

In [6]:
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit
# the validation samples to words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):

  # Input data.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
  # Variables.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                               train_labels, num_sampled, vocabulary_size))

  # Optimizer.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
  
  # Compute the similarity between minibatch examples and all embeddings.
  # We use the cosine distance:
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
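
The similarity op above is plain cosine similarity between L2-normalized embedding rows. A standalone NumPy sketch of the same computation, with toy shapes that are purely illustrative:

E = np.random.uniform(-1.0, 1.0, size=(5, 4))           # 5 toy "words", 4 dims
norm_np = np.sqrt(np.sum(np.square(E), axis=1, keepdims=True))
E_normalized = E / norm_np
sim_np = np.dot(E_normalized[:2], E_normalized.T)        # 2 "valid" words vs all
print(sim_np.shape)   # (2, 5); each row scores all words by cosine similarity
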
In [7]:
num_steps = 100001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    batch_data, batch_labels = generate_batch(
      batch_size, num_skips, skip_window)
    feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += l
    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step %d: %f' % (step, average_loss))
      average_loss = 0
    # note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8 # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k+1]
        log = 'Nearest to %s:' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log = '%s %s,' % (log, close_word)
        print(log)
  final_embeddings = normalized_embeddings.eval()
Initialized
Average loss at step 0: 8.264635
Nearest to not: ouadda, geldof, crystallizes, bioinformatics, flaminius, plight, conforms, cohn,
Nearest to of: hyperion, ecclesiae, avery, preliminary, anders, unilateral, ottawa, promotional,
Nearest to but: trusting, forgiving, mummification, gerry, estudios, coffins, fbi, sotho,
Nearest to war: authenticated, shirt, dysplasia, disengage, rourke, federline, operands, abitibi,
Nearest to its: prohibits, euskara, derleth, espionage, collation, deluge, bembo, wh,
Nearest to has: hog, architect, hydroelectric, reconciling, member, muhammed, fulda, audiences,
Nearest to known: ibiza, conversational, enjoys, ava, rare, boyer, thugs, phimosis,
Nearest to time: unsound, perceived, gilded, ndebele, tuxedo, progeny, lina, qassam,
Nearest to are: lunches, subjectivism, rigorous, zodiac, godfather, kappa, yoda, bonding,
Nearest to american: jeux, sourcewatch, calwell, brahms, twh, juveniles, dreamed, combining,
Nearest to over: carta, nikos, my, reprimanded, letterman, biconditional, demons, fanpage,
Nearest to no: wonder, ignaz, puerto, debrecen, horseback, greco, langston, transponder,
Nearest to had: embroidery, smallest, probes, rewrite, cimmerian, congresses, humpback, framework,
Nearest to as: flagella, bastion, cowpox, hilt, spices, subscribed, otherworldly, supposedly,
Nearest to nine: noah, regiments, dissipation, johansen, freising, suriname, weasel, associating,
Nearest to is: gtk, baptise, prefixes, meribbaal, xia, armando, gzip, brzezinski,
Average loss at step 2000: 4.369195
Average loss at step 4000: 3.863247
Average loss at step 6000: 3.785764
Average loss at step 8000: 3.691525
Average loss at step 10000: 3.617537
Nearest to not: it, they, also, sometimes, he, never, you, who,
Nearest to of: in, for, intermarriages, from, rejuvenated, and, cottages, preliminary,
Nearest to but: however, discusses, commute, superscription, did, until, estudios, qh,
Nearest to war: disengage, federline, abitibi, leyden, dysplasia, superlative, czech, tokelau,
Nearest to its: his, the, their, prohibits, collation, stirling, yah, a,
Nearest to has: had, is, was, have, crusades, calming, sweep, estuaries,
Nearest to known: conversational, ibiza, used, apostol, manually, enjoys, ava, supra,
Nearest to time: perceived, buckingham, qassam, wellesley, motivational, discernible, replicators, blaming,
Nearest to are: were, is, have, beauvoir, functional, was, complication, alvaro,
Nearest to american: jeux, calwell, tramlink, montenegrin, entire, fayyum, click, kgf,
Nearest to over: my, nikos, until, standalone, biconditional, tired, conrad, dignity,
Nearest to no: wonder, henley, debrecen, transponder, langston, rik, segmental, sabo,
Nearest to had: have, has, was, were, rosalynn, veered, embroidery, refugee,
Nearest to as: by, chosroes, referendum, played, flagella, myrdal, spectrometers, bede,
Nearest to nine: eight, seven, six, five, zero, three, four, two,
Nearest to is: was, are, has, be, does, cthulhu, slice, mountainous,
Average loss at step 12000: 3.604574
Average loss at step 14000: 3.574436
Average loss at step 16000: 3.406878
Average loss at step 18000: 3.457766
Average loss at step 20000: 3.542967
Nearest to not: it, also, never, usually, now, they, you, who,
Nearest to of: filters, discos, intermarriages, freitas, homogenized, gigabyte, preliminary, flybys,
Nearest to but: however, which, is, and, though, that, where, if,
Nearest to war: disengage, federline, leyden, cecil, superlative, bangkok, athenian, shrunk,
Nearest to its: their, his, the, collation, systems, infanticide, some, mentions,
Nearest to has: had, is, have, was, crusades, imminent, sweep, bus,
Nearest to known: used, ibiza, conversational, supra, transwomen, manually, forever, ava,
Nearest to time: buckingham, qassam, discernible, nazism, year, perceived, way, seems,
Nearest to are: were, is, have, was, functional, other, but, complication,
Nearest to american: british, calwell, jeux, french, canadian, sacrilegious, and, dreamed,
Nearest to over: my, nikos, curfew, after, biconditional, tired, dignity, until,
Nearest to no: wonder, langston, there, compatibles, nick, inferior, cheat, regionally,
Nearest to had: has, have, was, were, rosalynn, having, culver, beans,
Nearest to as: pensacola, bix, myrdal, yitzchak, dart, kingfishers, intracellular, bluescreen,
Nearest to nine: eight, seven, six, four, zero, three, five, two,
Nearest to is: was, has, are, but, does, were, gizzard, be,
Average loss at step 22000: 3.502944
Average loss at step 24000: 3.490890
Average loss at step 26000: 3.481496
Average loss at step 28000: 3.479962
Average loss at step 30000: 3.503358
Nearest to not: they, now, still, usually, it, never, you, only,
Nearest to of: in, preliminary, discos, from, for, and, own, ieung,
Nearest to but: however, which, or, and, although, where, when, though,
Nearest to war: disengage, bangkok, federline, leyden, superlative, cecil, shrunk, athenian,
Nearest to its: their, his, the, incorporating, registering, stirling, balderus, heater,
Nearest to has: had, have, is, was, since, calming, imminent, sweep,
Nearest to known: used, ibiza, transwomen, supra, such, jewell, manually, conversational,
Nearest to time: year, nazism, discernible, qassam, buckingham, way, explodes, course,
Nearest to are: were, have, is, include, while, maas, other, noether,
Nearest to american: british, english, canadian, autograph, french, jeux, australian, tramlink,
Nearest to over: until, nikos, off, my, standalone, biconditional, curfew, suspend,
Nearest to no: a, wonder, there, nick, langston, secaucus, compatibles, inferior,
Nearest to had: has, have, was, were, having, is, aryans, aan,
Nearest to as: by, bix, under, intracellular, yeomen, be, synoptic, unanimity,
Nearest to nine: eight, seven, six, four, five, three, zero, two,
Nearest to is: was, has, are, became, be, were, had, does,
Average loss at step 32000: 3.506307
Average loss at step 34000: 3.494333
Average loss at step 36000: 3.450909
Average loss at step 38000: 3.300008
Average loss at step 40000: 3.429721
Nearest to not: never, now, it, still, almost, usually, they, also,
Nearest to of: in, intermarriages, for, biafra, hysteria, from, decrement, garage,
Nearest to but: however, while, although, and, though, which, it, before,
Nearest to war: disengage, federline, leyden, bangkok, valuables, shrunk, autographs, afghan,
Nearest to its: their, his, the, her, balderus, registering, infanticide, cameras,
Nearest to has: had, have, was, is, sweep, phenomenally, imminent, px,
Nearest to known: used, such, ibiza, supra, transwomen, possible, conversational, described,
Nearest to time: year, way, discernible, explodes, buckingham, terrier, maury, day,
Nearest to are: were, have, is, maas, searle, while, be, can,
Nearest to american: british, australian, french, english, canadian, italian, autograph, winning,
Nearest to over: nikos, off, curfew, standalone, until, annan, my, sonar,
Nearest to no: any, wonder, langston, secaucus, nick, hingis, liquified, regionally,
Nearest to had: has, have, was, having, were, beans, since, aan,
Nearest to as: intracellular, bluescreen, duchamp, chosroes, leclerc, dart, fortify, bix,
Nearest to nine: eight, seven, six, five, four, zero, three, one,
Nearest to is: was, are, has, be, does, mitigated, became, hustler,
Average loss at step 42000: 3.434099
Average loss at step 44000: 3.452721
Average loss at step 46000: 3.451069
Average loss at step 48000: 3.357493
Average loss at step 50000: 3.383801
Nearest to not: never, now, they, usually, still, almost, who, preferential,
Nearest to of: in, and, preliminary, including, abusing, wilmot, during, stocking,
Nearest to but: however, although, and, while, though, when, during, where,
Nearest to war: disengage, bangkok, leyden, shrunk, federline, autographs, superlative, valuables,
Nearest to its: their, his, the, her, balderus, infanticide, vuoksi, your,
Nearest to has: had, have, is, was, ante, since, deems, watson,
Nearest to known: used, such, possible, described, transwomen, regarded, supra, ibiza,
Nearest to time: year, way, period, day, tarantino, lata, node, discernible,
Nearest to are: were, is, have, these, bloomsbury, complication, be, include,
Nearest to american: italian, australian, french, british, international, english, canadian, rsted,
Nearest to over: off, nikos, through, rudder, curfew, annan, dtmf, on,
Nearest to no: any, a, langston, rik, alison, regionally, nick, hingis,
Nearest to had: has, have, were, was, having, since, been, prohibit,
Nearest to as: authentication, stool, yeomen, shavuot, duchamp, eno, referendum, including,
Nearest to nine: eight, seven, six, four, three, five, zero, two,
Nearest to is: was, has, are, does, bloodstream, be, erlang, but,
Average loss at step 52000: 3.435205
Average loss at step 54000: 3.425108
Average loss at step 56000: 3.441868
Average loss at step 58000: 3.393682
Average loss at step 60000: 3.394030
Nearest to not: never, still, usually, now, you, they, almost, to,
Nearest to of: for, in, rejuvenated, beggars, impeachment, intermarriages, original, discos,
Nearest to but: however, although, and, though, see, determines, which, than,
Nearest to war: bangkok, disengage, leyden, shrunk, latinized, autographs, agnesi, valuables,
Nearest to its: their, his, the, her, your, vuoksi, gallico, rudy,
Nearest to has: had, have, was, is, having, sweep, ante, gnaeus,
Nearest to known: used, such, possible, transwomen, regarded, described, ibiza, supra,
Nearest to time: year, way, period, tarantino, process, explodes, motivation, place,
Nearest to are: were, is, have, searle, these, include, bloomsbury, those,
Nearest to american: british, italian, australian, english, international, french, canadian, european,
Nearest to over: through, until, off, dtmf, nikos, continuity, caudal, around,
Nearest to no: any, regionally, langston, uniformitarianism, little, alison, horch, parallels,
Nearest to had: has, have, was, having, were, thresholds, phenomenally, won,
Nearest to as: bix, authentication, gaylord, contra, angel, flagella, meddling, fortify,
Nearest to nine: eight, six, seven, four, five, three, zero, one,
Nearest to is: was, are, has, does, became, be, contains, expect,
Average loss at step 62000: 3.239307
Average loss at step 64000: 3.256177
Average loss at step 66000: 3.403254
Average loss at step 68000: 3.388942
Average loss at step 70000: 3.358393
Nearest to not: never, now, still, usually, generally, almost, partially, sheedy,
Nearest to of: including, preliminary, in, intermarriages, and, for, include, like,
Nearest to but: however, although, see, which, and, that, than, though,
Nearest to war: disengage, bangkok, leyden, wars, superlative, chalcedonian, autographs, agnesi,
Nearest to its: their, his, the, her, our, whose, hypnotized, superoxide,
Nearest to has: had, have, was, is, since, sweep, imminent, uppercase,
Nearest to known: used, such, regarded, possible, described, transwomen, viewed, considered,
Nearest to time: process, tarantino, period, year, place, way, course, encrypting,
Nearest to are: were, have, include, is, these, including, maas, those,
Nearest to american: british, italian, australian, canadian, english, international, dubs, rsted,
Nearest to over: through, dtmf, off, around, about, until, curfew, continuity,
Nearest to no: any, there, little, langston, regionally, parallels, liquified, wonder,
Nearest to had: has, have, having, was, were, thresholds, phenomenally, won,
Nearest to as: angel, like, bix, lyman, fortify, echelon, luminosity, before,
Nearest to nine: eight, seven, six, five, four, three, zero, one,
Nearest to is: was, has, are, became, be, does, being, becomes,
Average loss at step 72000: 3.373208
Average loss at step 74000: 3.345302
Average loss at step 76000: 3.310933
Average loss at step 78000: 3.349958
Average loss at step 80000: 3.379658
Nearest to not: still, never, usually, now, actually, they, almost, it,
Nearest to of: in, commutator, including, original, collegium, sammy, avery, simplest,
Nearest to but: however, although, see, while, and, which, than, though,
Nearest to war: disengage, bangkok, discursive, wars, leyden, chalcedonian, nebuchadrezzar, privatisation,
Nearest to its: their, his, her, the, our, heater, whose, your,
Nearest to has: had, have, is, was, px, since, having, fineness,
Nearest to known: used, possible, regarded, described, such, transwomen, considered, treated,
Nearest to time: year, course, tarantino, cover, process, explodes, coffin, day,
Nearest to are: were, include, is, have, maas, these, although, including,
Nearest to american: british, italian, canadian, australian, calwell, international, ling, french,
Nearest to over: off, around, caudal, through, until, dtmf, curfew, across,
Nearest to no: little, reliefs, langston, any, regionally, liquified, nick, there,
Nearest to had: has, have, were, was, having, phenomenally, prohibit, refused,
Nearest to as: brainwashing, hereford, mentors, before, fortify, uranus, stool, yeomen,
Nearest to nine: eight, seven, six, four, five, three, zero, one,
Nearest to is: was, has, are, remains, became, includes, planing, although,
Average loss at step 82000: 3.405994
Average loss at step 84000: 3.407185
Average loss at step 86000: 3.386949
Average loss at step 88000: 3.352540
Average loss at step 90000: 3.364312
Nearest to not: still, actually, never, they, usually, almost, nor, now,
Nearest to of: including, in, purdue, for, actual, preliminary, discos, simplest,
Nearest to but: however, and, although, though, while, which, he, they,
Nearest to war: disengage, bangkok, discursive, carefree, wars, chalcedonian, leyden, agnesi,
Nearest to its: their, his, her, the, our, infanticide, heater, gallico,
Nearest to has: had, have, is, was, since, additionally, having, maintains,
Nearest to known: used, such, described, possible, regarded, transwomen, seen, considered,
Nearest to time: tarantino, course, process, took, year, period, day, explodes,
Nearest to are: were, is, include, have, these, maas, contain, although,
Nearest to american: british, italian, australian, canadian, indian, french, spanish, regents,
Nearest to over: off, through, around, against, dtmf, caudal, about, across,
Nearest to no: little, any, reliefs, langston, only, liquified, there, inferior,
Nearest to had: has, have, were, was, having, refused, already, continued,
Nearest to as: brainwashing, stool, lombok, hereford, mentors, yeomen, chosroes, renormalization,
Nearest to nine: eight, seven, six, five, four, three, zero, two,
Nearest to is: was, has, are, does, became, be, although, becomes,
Average loss at step 92000: 3.396994
Average loss at step 94000: 3.249787
Average loss at step 96000: 3.357772
Average loss at step 98000: 3.245364
Average loss at step 100000: 3.357103
Nearest to not: never, still, almost, usually, nor, generally, sheedy, now,
Nearest to of: including, in, original, and, ritz, blitter, from, grasso,
Nearest to but: however, although, and, though, while, which, that, where,
Nearest to war: disengage, bangkok, swamps, leyden, wars, chalcedonian, discursive, agnesi,
Nearest to its: their, his, the, her, our, firms, infanticide, twiggy,
Nearest to has: had, have, is, was, since, additionally, having, px,
Nearest to known: possible, used, regarded, such, seen, described, considered, defined,
Nearest to time: year, process, course, reason, tarantino, day, constantinople, replicates,
Nearest to are: were, have, although, include, while, contain, these, is,
Nearest to american: british, canadian, italian, australian, belgian, indian, doped, french,
Nearest to over: caudal, across, around, nearly, through, off, costs, dtmf,
Nearest to no: little, any, langston, reliefs, there, uniformitarianism, pieter, inferior,
Nearest to had: has, have, having, was, were, won, thresholds, since,
Nearest to as: like, brainwashing, chosroes, stool, hennessy, vl, within, hereford,
Nearest to nine: eight, seven, six, four, five, three, zero, two,
Nearest to is: was, has, became, becomes, appears, are, makes, be,
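
With final_embeddings computed, the same nearest-neighbor lookup can be reproduced outside the graph for any word, using NumPy and the dictionary/reverse_dictionary built earlier. A small sketch; the query word is arbitrary and assumed to be in the 50,000-word vocabulary (unknown words fall back to UNK):

def nearest_words(word, embeddings, top_k=8):
  """Return the top_k nearest words under cosine similarity (rows are unit length)."""
  idx = dictionary.get(word, 0)                # 0 is the UNK bucket
  sims = np.dot(embeddings, embeddings[idx])
  nearest = (-sims).argsort()[1:top_k + 1]     # skip the word itself
  return [reverse_dictionary[i] for i in nearest]

print('Nearest to "france":', nearest_words('france', final_embeddings))
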
In [8]:
num_points = 400

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])
In [9]:
%matplotlib inline

def plot(embeddings, labels):
  assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
  pylab.figure(figsize=(15,15))  # in inches
  for i, label in enumerate(labels):
    x, y = embeddings[i,:]
    pylab.scatter(x, y)
    pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom')
  pylab.show()

words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)
[t-SNE projection of the 400 most frequent word embeddings]

Problem

An alternative to skip-gram is another Word2Vec model called CBOW (Continuous Bag of Words). In the CBOW model, instead of predicting a context word from a word vector, you predict a word from the sum of all the word vectors in its context. Implement and evaluate a CBOW model trained on the text8 dataset.

The main difference between skip-gram and CBOW is that the inputs and outputs are reversed: skip-gram predicts the context given the word, while CBOW predicts the word given the context. The second difference is that, unlike skip-gram where each context word is a separate label, in CBOW the context words are summed together in the input vector (so instead of a single bit being "activated", two or more bits are turned on). This blog post explains it pretty well, although the exact implementation used here isn't as complicated as the one suggested in the post.

The code is very similar to skip-gram, with the exception of the batch generation function.

Full disclosure: these code updates came from other course participants through discussions about the assignment. There are multiple ways to implement this, so there is no single right answer.
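
To make the "two or more bits turned on" remark concrete, here is a tiny NumPy sketch (toy sizes, purely illustrative) showing that multiplying a multi-hot context vector by the embedding matrix is the same as summing the individual context embeddings, which is what the CBOW graph below achieves with tf.segment_sum:

V, D = 6, 3                                    # toy vocabulary and embedding sizes
E = np.arange(V * D, dtype=np.float32).reshape(V, D)
context_ids = [1, 4]                           # the two context word ids
multi_hot = np.zeros(V, dtype=np.float32)
multi_hot[context_ids] = 1.0
projected = multi_hot.dot(E)                   # multi-hot input times embedding matrix
print(np.allclose(projected, E[context_ids].sum(axis=0)))   # True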

In [10]:
data_index = 0

def generate_batch_cbow(batch_size, skip_window):
  global data_index
  context_window = 2 * skip_window
  assert batch_size % context_window == 0
  num_labels = batch_size // context_window  # integer division: one label per context group
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(num_labels, 1), dtype=np.int32)
  span = 2 * skip_window + 1 # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(num_labels):
    target = skip_window  # target label at the center of the buffer
    labels[i, 0] = buffer[target]
    targets_to_avoid = [ skip_window ]
    for j in range(context_window):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * context_window + j] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels
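
Mirroring the skip-gram demo above, the CBOW generator can be spot-checked with a small usage sketch (added here): each entry of labels is a center word, and the corresponding group of context_window entries in batch holds its context.

data_index = 0
batch, labels = generate_batch_cbow(batch_size=8, skip_window=1)
print('batch:', [reverse_dictionary[bi] for bi in batch])
print('labels:', [reverse_dictionary[li] for li in labels.reshape(-1)])
data_index = 0  # reset so training below starts at the beginning of the corpus
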
In [11]:
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
context_window = 2 * skip_window
num_labels = batch_size // context_window
# num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit
# the validation samples to words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 32 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):

  # Input data.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[num_labels, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
  # Variables.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)

  # Build segment ids that group each label's context rows in the embed tensor so they can be summed together.
  seq_ids = np.zeros(batch_size, dtype=np.int32)
  cur_id = -1
  for i in range(batch_size):
    if i % context_window == 0:
      cur_id = cur_id + 1
    seq_ids[i] = cur_id
  
  # Use segment_sum to add together the related words and reduce the output to be num_labels in size.
  embed_sum = tf.segment_sum(embed, seq_ids)
  
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed_sum,
                               train_labels, num_sampled, vocabulary_size))

  # Optimizer.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
  
  # Compute the similarity between minibatch examples and all embeddings.
  # We use the cosine distance:
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
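
tf.segment_sum reduces consecutive rows of embed that share a segment id into a single row. A NumPy sketch of the same reduction with toy values (purely illustrative, not part of the graph):

rows = np.arange(12, dtype=np.float32).reshape(6, 2)   # 6 toy context embeddings
segment_starts = np.arange(0, 6, 2)                    # seq_ids = [0, 0, 1, 1, 2, 2]
summed = np.add.reduceat(rows, segment_starts, axis=0)
print(summed)   # 3 rows: one summed context vector per label
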
In [12]:
num_steps = 100001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    batch_data, batch_labels = generate_batch_cbow(batch_size, skip_window)
    feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += l
    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step %d: %f' % (step, average_loss))
      average_loss = 0
    # note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8 # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k+1]
        log = 'Nearest to %s:' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log = '%s %s,' % (log, close_word)
        print(log)
  final_embeddings = normalized_embeddings.eval()
Initialized
Average loss at step 0: 7.065996
Nearest to UNK: browsers, rotary, graded, poisonous, police, funchal, corn, freeciv,
Nearest to war: reconciled, glasnost, satan, affections, barbaric, separatist, yardbirds, enact,
Nearest to seven: bonnie, marriage, originated, brahma, consuls, justifies, usher, reshaped,
Nearest to united: alarms, excommunication, pic, laramie, mcgregor, grampus, responses, fgth,
Nearest to when: benito, illustrated, hydroponics, metamorphic, intriguing, melanesian, drugs, prohibitively,
Nearest to were: restoring, feel, relate, busch, king, cree, hedgehog, discriminated,
Nearest to had: handlebar, ech, chop, reporter, ridley, vast, improper, officiated,
Nearest to he: wardrobe, ing, functionalism, authoritarian, bio, humours, experimenting, olaf,
Nearest to during: trauma, ipa, hangul, greenhouses, schumpeter, specials, chat, infallibility,
Nearest to known: algebras, vladislav, sayyaf, medes, untrained, mimi, act, ed,
Nearest to into: forgo, pus, discord, abstractly, proliferate, infrastructural, plenty, crucible,
Nearest to all: nio, boac, unchecked, contrasts, jersey, overs, isao, stipulations,
Nearest to there: counterattack, reactance, shortening, marginal, polytheism, scientologists, saito, lamo,
Nearest to than: stellar, utah, precaution, jujutsu, chogm, boolean, earnestly, excision,
Nearest to five: stagnation, duke, haran, folksong, llu, stabilizes, behaviorism, stamps,
Nearest to be: departments, resorted, cdu, slovakian, scotty, reconsider, conscript, biconditional,
Average loss at step 2000: 3.689275
Average loss at step 4000: 3.127331
Average loss at step 6000: 3.030139
Average loss at step 8000: 2.877302
Average loss at step 10000: 2.793477
Nearest to UNK: frosts, foul, disregarding, coupled, gaborone, straightforwardly, fluctuations, insertions,
Nearest to war: paucity, verisign, reconciled, separatist, ear, decomposing, lufthansa, cm,
Nearest to seven: eight, six, nine, three, five, four, zero, two,
Nearest to united: ziegler, pic, grampus, responses, powerbook, purdue, kyushu, diner,
Nearest to when: after, viability, hydroponics, where, airlines, samara, took, rigs,
Nearest to were: are, have, but, had, was, kingdoms, ard, matrimonial,
Nearest to had: has, have, were, sqrat, was, aurelianus, stalker, ballads,
Nearest to he: she, it, they, there, who, dimitri, tama, troika,
Nearest to during: greenhouses, voter, under, arius, ipa, trauma, fen, bz,
Nearest to known: well, sayyaf, yardage, absolutive, used, vladislav, such, snoopy,
Nearest to into: with, flirt, out, gave, giulio, rowdy, almeida, from,
Nearest to all: many, howlin, institute, impending, meteorites, sketch, postumus, reagent,
Nearest to there: it, they, he, despised, invisibly, covertly, bebox, oregano,
Nearest to than: chogm, gliders, fptp, spontaneously, detonates, utah, jahan, procure,
Nearest to five: eight, six, nine, three, seven, four, zero, two,
Nearest to be: have, is, magnesians, uncut, protestors, become, arnauld, been,
Average loss at step 12000: 2.766466
Average loss at step 14000: 2.730358
Average loss at step 16000: 2.534443
Average loss at step 18000: 2.566399
Average loss at step 20000: 2.654789
Nearest to UNK: aldosterone, parole, vista, brilliance, bissette, divine, monopoles, facet,
Nearest to war: decomposing, reconciled, paucity, airplay, lufthansa, gunnery, hello, sunday,
Nearest to seven: eight, four, six, zero, nine, five, three, two,
Nearest to united: ziegler, grampus, city, powerbook, pic, responses, please, dactylic,
Nearest to when: if, hydroponics, because, where, denies, archtop, diamagnets, during,
Nearest to were: are, was, have, had, tripod, canetti, dagesh, hobby,
Nearest to had: has, have, was, having, waking, were, impatient, unsuitable,
Nearest to he: she, it, they, there, who, then, rue, never,
Nearest to during: under, on, greenhouses, in, with, at, breeze, sothoth,
Nearest to known: used, such, well, considered, defined, understood, regarded, described,
Nearest to into: through, from, rowdy, picks, under, in, disordered, within,
Nearest to all: these, many, several, omnivores, ethically, panspermia, seizing, suggest,
Nearest to there: they, it, he, usually, which, still, misguided, melatonin,
Nearest to than: or, chogm, utah, detonates, mond, hearts, acknowledgment, mao,
Nearest to five: zero, six, seven, eight, four, three, two, nine,
Nearest to be: been, have, produce, refer, become, magnesians, uncut, being,
Average loss at step 22000: 2.619580
Average loss at step 24000: 2.585866
Average loss at step 26000: 2.560046
Average loss at step 28000: 2.563704
Average loss at step 30000: 2.587996
Nearest to UNK: adventure, colonists, h, paralleling, vit, caricatured, kerosene, kurdistan,
Nearest to war: decomposing, sirach, jomo, edu, synchronize, martyrdom, sunday, spray,
Nearest to seven: nine, eight, five, six, four, three, zero, two,
Nearest to united: grampus, city, ziegler, georgian, rimet, gettier, powerbook, responses,
Nearest to when: if, where, after, repetitions, before, from, since, however,
Nearest to were: are, was, have, had, being, been, is, hostel,
Nearest to had: have, has, was, having, since, were, demography, halts,
Nearest to he: she, they, it, who, there, inkjet, lifespans, never,
Nearest to during: after, in, throughout, before, until, through, from, at,
Nearest to known: used, described, understood, well, defined, considered, regarded, available,
Nearest to into: from, within, under, through, with, on, rowdy, around,
Nearest to all: several, meteorites, any, various, many, taxing, civilized, almohades,
Nearest to there: they, usually, he, it, still, often, tambo, she,
Nearest to than: chogm, much, becoming, far, utah, algebraically, dada, reconsidered,
Nearest to five: eight, seven, six, nine, four, zero, three, two,
Nearest to be: refer, produce, have, xy, forster, become, say, were,
Average loss at step 32000: 2.578251
Average loss at step 34000: 2.536099
Average loss at step 36000: 2.510159
Average loss at step 38000: 2.320349
Average loss at step 40000: 2.483141
Nearest to UNK: b, dutton, brian, hinged, d, mazar, abhorrent, cfaf,
Nearest to war: decomposing, paternity, leblanc, synchronize, spray, decree, egypt, muhammad,
Nearest to seven: nine, eight, five, six, three, four, two, zero,
Nearest to united: grampus, pic, georgian, city, toluene, avtovaz, powerbook, trigonometric,
Nearest to when: while, after, if, before, by, denies, cruz, until,
Nearest to were: are, was, have, been, being, brimstone, those, exe,
Nearest to had: has, have, having, was, demography, impatient, shrugged, became,
Nearest to he: she, it, they, soon, subsequently, eventually, there, who,
Nearest to during: at, in, within, throughout, through, before, between, on,
Nearest to known: used, understood, considered, described, defined, such, regarded, available,
Nearest to into: through, from, around, back, picks, away, emission, painless,
Nearest to all: both, any, every, two, drayton, dol, argentinian, each,
Nearest to there: it, they, still, usually, she, which, he, now,
Nearest to than: chogm, or, tenses, hairless, shareware, volunteering, consecrate, becoming,
Nearest to five: seven, six, four, eight, nine, two, three, zero,
Nearest to be: produce, have, refer, become, been, are, is, steal,
Average loss at step 42000: 2.491943
Average loss at step 44000: 2.495752
Average loss at step 46000: 2.491358
Average loss at step 48000: 2.388357
Average loss at step 50000: 2.403683
Nearest to UNK: braille, ma, kryptonians, berry, mauritius, mennonites, iy, h,
Nearest to war: conflict, wars, computations, decomposing, synchronize, muhammad, leblanc, commodity,
Nearest to seven: eight, six, nine, three, four, five, two, zero,
Nearest to united: pic, past, georgian, rimet, grampus, ny, gettier, ziegler,
Nearest to when: before, if, after, while, though, where, however, foreman,
Nearest to were: are, was, have, is, had, hostel, modernists, ocular,
Nearest to had: has, have, having, was, wanted, were, impatient, explains,
Nearest to he: she, it, they, who, there, then, soon, eventually,
Nearest to during: in, throughout, within, after, from, through, nilpotent, at,
Nearest to known: used, understood, defined, described, available, regarded, considered, famous,
Nearest to into: from, through, around, across, under, within, toward, away,
Nearest to all: every, meteorites, some, taxing, howlin, civilized, ovid, panspermia,
Nearest to there: they, it, he, she, still, incomes, now, jabal,
Nearest to than: reconsidered, chogm, or, but, even, spontaneously, tenses, far,
Nearest to five: four, six, eight, seven, three, nine, zero, two,
Nearest to be: have, refer, become, produce, being, easily, been, officiate,
Average loss at step 52000: 2.451219
Average loss at step 54000: 2.423089
Average loss at step 56000: 2.458059
Average loss at step 58000: 2.389679
Average loss at step 60000: 2.408637
Nearest to UNK: haas, slammed, domingue, singularly, pfa, fatimid, seabed, relegated,
Nearest to war: conflict, muhammad, wars, elizabeth, leblanc, impacts, decomposing, am,
Nearest to seven: five, six, eight, four, nine, three, zero, two,
Nearest to united: following, past, rimet, pic, baltic, grampus, mutation, hawaii,
Nearest to when: if, before, although, while, after, during, by, though,
Nearest to were: are, was, had, have, be, been, including, romanticism,
Nearest to had: has, have, was, yet, already, were, gave, never,
Nearest to he: she, it, they, who, eventually, there, soon, never,
Nearest to during: in, before, after, within, when, under, although, following,
Nearest to known: used, defined, regarded, understood, considered, described, referred, famous,
Nearest to into: around, through, from, under, with, disordered, on, within,
Nearest to all: some, those, manpower, any, reset, every, many, both,
Nearest to there: they, it, he, this, still, sometimes, verbose, we,
Nearest to than: chogm, agena, far, tenses, utah, slightly, algebraically, acknowledgment,
Nearest to five: four, seven, three, six, eight, nine, zero, two,
Nearest to be: been, produce, remain, refer, were, become, easily, have,
Average loss at step 62000: 2.205583
Average loss at step 64000: 2.229476
Average loss at step 66000: 2.384556
Average loss at step 68000: 2.391171
Average loss at step 70000: 2.338602
Nearest to UNK: aussie, superconductor, lemnos, servitude, duo, ller, hemiparesis, zapata,
Nearest to war: wars, conflict, elizabeth, leblanc, series, coup, hacienda, commodity,
Nearest to seven: eight, six, nine, five, four, zero, three, two,
Nearest to united: nation, pic, past, baltic, purdue, goguryeo, west, toluene,
Nearest to when: if, where, before, however, after, rgb, californians, because,
Nearest to were: are, have, was, had, is, be, been, cusco,
Nearest to had: has, have, was, having, were, already, subsequently, saw,
Nearest to he: she, they, it, soon, there, we, hitler, who,
Nearest to during: before, until, throughout, after, in, at, under, despite,
Nearest to known: used, defined, understood, regarded, described, such, considered, famous,
Nearest to into: from, through, around, across, upside, back, within, away,
Nearest to all: some, various, every, both, many, panspermia, any, pharmacist,
Nearest to there: it, they, still, she, we, he, sometimes, often,
Nearest to than: becoming, dualities, but, detail, or, volunteering, chogm, fiorentina,
Nearest to five: four, seven, eight, six, nine, three, zero, two,
Nearest to be: been, become, produce, is, have, easily, provide, were,
Average loss at step 72000: 2.365039
Average loss at step 74000: 2.334780
Average loss at step 76000: 2.327395
Average loss at step 78000: 2.343068
Average loss at step 80000: 2.355344
Nearest to UNK: willfully, vulpes, parole, astronomer, manuel, riley, ross, zacharias,
Nearest to war: wars, conflict, leblanc, coup, pandit, elizabeth, commodity, rias,
Nearest to seven: five, six, eight, zero, four, nine, three, two,
Nearest to united: nation, pic, purdue, expressionism, median, baltic, toluene, newsreel,
Nearest to when: after, before, where, if, until, for, without, then,
Nearest to were: are, have, had, cusco, was, including, modernists, include,
Nearest to had: has, have, was, already, were, having, never, are,
Nearest to he: she, it, they, there, we, originally, never, hitler,
Nearest to during: after, throughout, before, until, from, in, through, despite,
Nearest to known: defined, used, understood, described, regarded, such, cited, referred,
Nearest to into: through, around, within, from, along, across, away, with,
Nearest to all: every, banishment, taxing, various, both, many, out, civilized,
Nearest to there: it, he, they, she, sometimes, we, said, strikers,
Nearest to than: but, chogm, much, dualities, fiorentina, becoming, agena, while,
Nearest to five: four, seven, eight, six, three, nine, zero, two,
Nearest to be: been, become, easily, being, remain, refer, seem, produce,
Average loss at step 82000: 2.377959
Average loss at step 84000: 2.393377
Average loss at step 86000: 2.356136
Average loss at step 88000: 2.312366
Average loss at step 90000: 2.316218
Nearest to UNK: ting, plummeted, johan, battista, parole, haas, clancy, konrad,
Nearest to war: wars, conflict, coup, elizabeth, muhammad, apparatus, commodity, sliders,
Nearest to seven: nine, eight, five, four, six, zero, three, two,
Nearest to united: baltic, confederate, nation, pic, papal, usa, purdue, southern,
Nearest to when: after, before, while, until, if, without, by, although,
Nearest to were: are, was, had, have, being, while, been, canetti,
Nearest to had: has, have, was, were, began, continued, already, having,
Nearest to he: she, they, it, there, subsequently, originally, initially, who,
Nearest to during: throughout, before, in, until, after, despite, while, from,
Nearest to known: described, defined, used, understood, regarded, such, seen, opposed,
Nearest to into: through, around, within, across, from, beyond, under, upside,
Nearest to all: many, several, some, various, every, aware, any, regardless,
Nearest to there: it, they, he, still, we, she, not, keeshond,
Nearest to than: chogm, much, algebraically, detail, fiorentina, becoming, far, spacetime,
Nearest to five: eight, four, seven, six, nine, three, two, zero,
Nearest to be: easily, have, is, provide, was, remain, produce, been,
Average loss at step 92000: 2.363738
Average loss at step 94000: 2.205631
Average loss at step 96000: 2.325339
Average loss at step 98000: 2.205846
Average loss at step 100000: 2.315003
Nearest to UNK: extraneous, ller, tsu, nocs, hallstatt, reggie, formalize, christiansen,
Nearest to war: wars, conflict, leblanc, coup, decomposing, muhammad, season, trade,
Nearest to seven: nine, eight, four, five, six, zero, three, two,
Nearest to united: baltic, confederate, pic, nation, newsreel, city, southern, georgian,
Nearest to when: if, while, after, although, before, though, where, because,
Nearest to were: are, have, brimstone, exist, those, had, modernists, cusco,
Nearest to had: has, have, having, already, refused, was, demography, enjoys,
Nearest to he: she, they, it, there, who, we, initially, eventually,
Nearest to during: throughout, in, despite, through, within, at, until, among,
Nearest to known: defined, described, understood, such, used, possible, opposed, referred,
Nearest to into: through, within, from, around, across, beyond, back, upside,
Nearest to all: various, these, every, both, some, several, many, any,
Nearest to there: they, he, she, still, it, believed, however, we,
Nearest to than: chogm, or, much, acknowledgment, fanfic, mauss, bskyb, bakersfield,
Nearest to five: six, seven, eight, four, nine, zero, two, three,
Nearest to be: have, been, produce, lead, remain, easily, refer, become,
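
Note that this run overwrites the skip-gram final_embeddings; if you want to compare the two models later, it can be worth persisting each set of embeddings first. A minimal sketch (the file name is arbitrary):

np.save('cbow_embeddings.npy', final_embeddings)   # reload later with np.load
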
In [13]:
num_points = 400

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])
In [14]:
%matplotlib inline

def plot(embeddings, labels):
  assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
  pylab.figure(figsize=(15,15))  # in inches
  for i, label in enumerate(labels):
    x, y = embeddings[i,:]
    pylab.scatter(x, y)
    pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom')
  pylab.show()

words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)
[t-SNE projection of the 400 most frequent word embeddings, CBOW model]
