# All Import Statements Defined Here
# Note: Do not add to this list.
# ----------------

import sys
assert sys.version_info[0]==3
assert sys.version_info[1] >= 5

from platform import python_version
assert int(python_version().split(".")[1]) >= 5, "Please upgrade your Python version following the instructions in \
    the README.txt file found in the same directory as this notebook. Your Python version is " + python_version()

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
import nltk
nltk.download('reuters') #to specify download location, optionally add the argument: download_dir='/specify/desired/path/'
from nltk.corpus import reuters
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from adjustText import adjust_text

# Sentinel tokens that read_corpus places at the start and end of every document.
START_TOKEN = '<START>'
END_TOKEN = '<END>'

# Seed both NumPy's and the standard library's random number generators.
np.random.seed(0)
random.seed(0)
# ----------------

[nltk_data] Downloading package reuters to /home/zstar/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


def read_corpus(category="grain"):
    """ Load and tokenize every Reuters file that belongs to one category.
        Params:
            category (string): category name
        Return:
            list of lists: one token list per file; tokens are lower-cased and
            each list is wrapped in START_TOKEN ... END_TOKEN
    """
    documents = []
    for file_id in reuters.fileids(category):
        tokens = [START_TOKEN]
        tokens.extend(token.lower() for token in reuters.words(file_id))
        tokens.append(END_TOKEN)
        documents.append(tokens)
    return documents


# Load the default ("grain") category and preview the first three tokenized documents.
reuters_corpus = read_corpus()
pprint.pprint(reuters_corpus[:3], compact=True, width=100)

[['<START>', 'china', 'daily', 'says', 'vermin', 'eat', '7', '-', '12', 'pct', 'grain', 'stocks',
  'a', 'survey', 'of', '19', 'provinces', 'and', 'seven', 'cities', 'showed', 'vermin', 'consume',
  'between', 'seven', 'and', '12', 'pct', 'of', 'china', "'", 's', 'grain', 'stocks', ',', 'the',
  'china', 'daily', 'said', '.', 'it', 'also', 'said', 'that', 'each', 'year', '1', '.', '575',
  'mln', 'tonnes', ',', 'or', '25', 'pct', ',', 'of', 'china', "'", 's', 'fruit', 'output', 'are',
  'left', 'to', 'rot', ',', 'and', '2', '.', '1', 'mln', 'tonnes', ',', 'or', 'up', 'to', '30',
  'pct', ',', 'of', 'its', 'vegetables', '.', 'the', 'paper', 'blamed', 'the', 'waste', 'on',
  'inadequate', 'storage', 'and', 'bad', 'preservation', 'methods', '.', 'it', 'said', 'the',
  'government', 'had', 'launched', 'a', 'national', 'programme', 'to', 'reduce', 'waste', ',',
  'calling', 'for', 'improved', 'technology', 'in', 'storage', 'and', 'preservation', ',', 'and',
  'greater', 'production', 'of', 'additives', '.', 'the', 'paper', 'gave', 'no', 'further',
  'details', '.', '<END>'],
 ['<START>', 'thai', 'trade', 'deficit', 'widens', 'in', 'first', 'quarter', 'thailand', "'", 's',
  'trade', 'deficit', 'widened', 'to', '4', '.', '5', 'billion', 'baht', 'in', 'the', 'first',
  'quarter', 'of', '1987', 'from', '2', '.', '1', 'billion', 'a', 'year', 'ago', ',', 'the',
  'business', 'economics', 'department', 'said', '.', 'it', 'said', 'janunary', '/', 'march',
  'imports', 'rose', 'to', '65', '.', '1', 'billion', 'baht', 'from', '58', '.', '7', 'billion',
  '.', 'thailand', "'", 's', 'improved', 'business', 'climate', 'this', 'year', 'resulted', 'in',
  'a', '27', 'pct', 'increase', 'in', 'imports', 'of', 'raw', 'materials', 'and', 'semi', '-',
  'finished', 'products', '.', 'the', 'country', "'", 's', 'oil', 'import', 'bill', ',', 'however',
  ',', 'fell', '23', 'pct', 'in', 'the', 'first', 'quarter', 'due', 'to', 'lower', 'oil', 'prices',
  '.', 'the', 'department', 'said', 'first', 'quarter', 'exports', 'expanded', 'to', '60', '.', '6',
  'billion', 'baht', 'from', '56', '.', '6', 'billion', '.', 'export', 'growth', 'was', 'smaller',
  'than', 'expected', 'due', 'to', 'lower', 'earnings', 'from', 'many', 'key', 'commodities',
  'including', 'rice', 'whose', 'earnings', 'declined', '18', 'pct', ',', 'maize', '66', 'pct', ',',
  'sugar', '45', 'pct', ',', 'tin', '26', 'pct', 'and', 'canned', 'pineapples', 'seven', 'pct', '.',
  'products', 'registering', 'high', 'export', 'growth', 'were', 'jewellery', 'up', '64', 'pct',
  ',', 'clothing', '57', 'pct', 'and', 'rubber', '35', 'pct', '.', '<END>'],
 ['<START>', 'sri', 'lanka', 'gets', 'usda', 'approval', 'for', 'wheat', 'price', 'food',
  'department', 'officials', 'said', 'the', 'u', '.', 's', '.', 'department', 'of', 'agriculture',
  'approved', 'the', 'continental', 'grain', 'co', 'sale', 'of', '52', ',', '500', 'tonnes', 'of',
  'soft', 'wheat', 'at', '89', 'u', '.', 's', '.', 'dlrs', 'a', 'tonne', 'c', 'and', 'f', 'from',
  'pacific', 'northwest', 'to', 'colombo', '.', 'they', 'said', 'the', 'shipment', 'was', 'for',
  'april', '8', 'to', '20', 'delivery', '.', '<END>']]


def distinct_words(corpus):
    """ Collect the vocabulary of a corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): sorted list of distinct words across the corpus
            n_corpus_words (integer): number of distinct words across the corpus
    """
    vocabulary = set()
    for document in corpus:
        # A set drops repeated words, both within and across documents.
        vocabulary.update(document)

    corpus_words = sorted(vocabulary)
    return corpus_words, len(corpus_words)


# ---------------------
# Run this sanity check
# Note that this is not an exhaustive check for correctness.
# ---------------------

# Define toy corpus (two whitespace-split sentences wrapped in START_TOKEN / END_TOKEN)
test_corpus = ["{} All that glitters isn't gold {}".format(START_TOKEN, END_TOKEN).split(" "), "{} All's well that ends well {}".format(START_TOKEN, END_TOKEN).split(" ")]
test_corpus_words, num_corpus_words = distinct_words(test_corpus)

# Correct answers
ans_test_corpus_words = sorted([START_TOKEN, "All", "ends", "that", "gold", "All's", "glitters", "isn't", "well", END_TOKEN])
ans_num_corpus_words = len(ans_test_corpus_words)

# Test correct number of words
assert(num_corpus_words == ans_num_corpus_words), "Incorrect number of distinct words. Correct: {}. Yours: {}".format(ans_num_corpus_words, num_corpus_words)

# Test correct words
assert (test_corpus_words == ans_test_corpus_words), "Incorrect corpus_words.\nCorrect: {}\nYours:   {}".format(str(ans_test_corpus_words), str(test_corpus_words))

# Print Success
print ("-" * 80)
print("Passed All Tests!")
print ("-" * 80)

--------------------------------------------------------------------------------
Passed All Tests!
--------------------------------------------------------------------------------


def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).

        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
              number of co-occurring words.

              For example, if we take the document "<START> All that glitters is not gold <END>" with window size of 4,
              "All" will co-occur with "<START>", "that", "glitters", "is", and "not".

        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window, i.e. the maximum distance (in tokens)
                between two words that still counts as a co-occurrence
        Return:
            M (a symmetric numpy matrix of shape (number of unique words in the corpus , number of unique words in the corpus)):
                Co-occurrence matrix of word counts.
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, n_words = distinct_words(corpus)
    word2ind = {word: index for index, word in enumerate(words)}
    M = np.zeros((n_words, n_words))

    for document in corpus:
        # Resolve every token to its row/column once per document.
        indices = [word2ind[token] for token in document]
        for position, center in enumerate(indices):
            # Only look at the tokens to the LEFT of the center (at most
            # window_size of them). Every pair of positions that lies within
            # the window is therefore visited exactly once, from its
            # right-hand member, and both symmetric cells are updated. This
            # gives each such pair the same weight regardless of distance.
            left_edge = max(position - window_size, 0)
            for context in indices[left_edge:position]:
                M[center, context] += 1
                M[context, center] += 1

    return M, word2ind


# ---------------------
# Run this sanity check
# Note that this is not an exhaustive check for correctness.
# ---------------------

# Define toy corpus and get student's co-occurrence matrix
test_corpus = ["{} All that glitters isn't gold {}".format(START_TOKEN, END_TOKEN).split(" "), "{} All's well that ends well {}".format(START_TOKEN, END_TOKEN).split(" ")]
M_test, word2ind_test = compute_co_occurrence_matrix(test_corpus, window_size=1)

# Correct M and word2ind
M_test_ans = np.array( 
    [[0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,],
     [0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,],
     [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,],
     [0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,],
     [0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,],
     [0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,],
     [1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,],
     [0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,],
     [0., 0., 1., 0., 1., 1., 0., 0., 0., 1.,],
     [1., 0., 0., 1., 1., 0., 0., 0., 1., 0.,]]
)
ans_test_corpus_words = sorted([START_TOKEN, "All", "ends", "that", "gold", "All's", "glitters", "isn't", "well", END_TOKEN])
word2ind_ans = dict(zip(ans_test_corpus_words, range(len(ans_test_corpus_words))))

# Test correct word2ind
assert (word2ind_ans == word2ind_test), "Your word2ind is incorrect:\nCorrect: {}\nYours: {}".format(word2ind_ans, word2ind_test)

# Test correct M shape
# The reference shape goes under "Correct" and the student's shape under "Yours".
assert (M_test.shape == M_test_ans.shape), "M matrix has incorrect shape.\nCorrect: {}\nYours: {}".format(M_test_ans.shape, M_test.shape)

# Test correct M values (cell by cell, so the first mismatch can be reported by word pair)
for w1 in word2ind_ans.keys():
    idx1 = word2ind_ans[w1]
    for w2 in word2ind_ans.keys():
        idx2 = word2ind_ans[w2]
        student = M_test[idx1, idx2]
        correct = M_test_ans[idx1, idx2]
        if student != correct:
            print("Correct M:")
            print(M_test_ans)
            print("Your M: ")
            print(M_test)
            raise AssertionError("Incorrect count at index ({}, {})=({}, {}) in matrix M. Yours has {} but should have {}.".format(idx1, idx2, w1, w2, student, correct))

# Print Success
print ("-" * 80)
print("Passed All Tests!")
print ("-" * 80)
# Trailing bare expression: in a notebook its value is displayed as the cell result.
[START_TOKEN, "All", "ends", "that", "gold", "All's", "glitters", "isn't", "well", END_TOKEN]

--------------------------------------------------------------------------------
Passed All Tests!
--------------------------------------------------------------------------------

['<START>',
 'All',
 'ends',
 'that',
 'gold',
 "All's",
 'glitters',
 "isn't",
 'well',
 '<END>']


def reduce_to_k_dim(M, k=2):
    """ Project a co-occurrence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        down to a matrix of dimensionality (num_corpus_words, k) with Scikit-Learn's truncated SVD:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

        Params:
            M (numpy matrix of shape (number of unique words in the corpus , number of unique words in the corpus)): co-occurrence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensional word embeddings.
                    In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10     # number of iterations handed to `TruncatedSVD`
    vocab_size = M.shape[0]
    print("Running Truncated SVD over %i words..." % vocab_size)

    # random_state is pinned so repeated runs give the same embedding.
    reducer = TruncatedSVD(n_components=k, n_iter=n_iters, random_state=22)
    M_reduced = reducer.fit_transform(M)

    print("Done.")
    return M_reduced


# ---------------------
# Run this sanity check
# Note that this is not an exhaustive check for correctness.
# In fact we only check that your M_reduced has the right dimensions.
# ---------------------

# Define toy corpus and run student code
test_corpus = ["{} All that glitters isn't gold {}".format(START_TOKEN, END_TOKEN).split(" "), "{} All's well that ends well {}".format(START_TOKEN, END_TOKEN).split(" ")]
M_test, word2ind_test = compute_co_occurrence_matrix(test_corpus, window_size=1)
M_test_reduced = reduce_to_k_dim(M_test, k=2)

# Test proper dimensions: one row per distinct word of the toy corpus (10), one column per component (k=2)
assert (M_test_reduced.shape[0] == 10), "M_reduced has {} rows; should have {}".format(M_test_reduced.shape[0], 10)
assert (M_test_reduced.shape[1] == 2), "M_reduced has {} columns; should have {}".format(M_test_reduced.shape[1], 2)

# Print Success
print ("-" * 80)
print("Passed All Tests!")
print ("-" * 80)

Running Truncated SVD over 10 words...
Done.
--------------------------------------------------------------------------------
Passed All Tests!
--------------------------------------------------------------------------------


def plot_embeddings(M_reduced, word2ind, words):
    """ Plot in a scatterplot the embeddings of the words specified in the list "words".
        NOTE: do not plot all the words listed in M_reduced / word2ind.
        Include a label next to each point.

        Params:
            M_reduced (numpy matrix of shape (number of unique words in the corpus , 2)): matrix of 2-dimensional word embeddings
            word2ind (dict): dictionary that maps word to indices for matrix M
            words (list of strings): words whose embeddings we want to visualize
        Raises:
            KeyError: if an entry of `words` is not a key of `word2ind`
    """
    texts = []
    for word in words:
        # The row must come from word2ind: the position of a word inside
        # `words` is unrelated to where its embedding is stored in M_reduced.
        row = word2ind[word]
        x = M_reduced[row][0]
        y = M_reduced[row][1]
        plt.scatter(x, y, marker='o', s=20, color='b', alpha=0.5)
        texts.append(plt.text(x, y, word, fontsize=9))
    # Nudge the labels apart so that they do not overlap.
    adjust_text(texts, arrowprops=dict(arrowstyle='->', color='b', lw=0.5))
    # ------------------


# ---------------------
# Run this sanity check
# Note that this is not an exhaustive check for correctness.
# The plot produced should look like the "test solution plot" depicted below. 
# ---------------------

print ("-" * 80)
print ("Outputted Plot:")

# Five hand-placed points; word2ind maps test1..test5 to rows 0..4 in order.
M_reduced_plot_test = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1], [0, 0]])
word2ind_plot_test = {'test1': 0, 'test2': 1, 'test3': 2, 'test4': 3, 'test5': 4}
words = ['test1', 'test2', 'test3', 'test4', 'test5']
plot_embeddings(M_reduced_plot_test, word2ind_plot_test, words)

print ("-" * 80)

--------------------------------------------------------------------------------
Outputted Plot:
--------------------------------------------------------------------------------


# -----------------------------
# Run This Cell to Produce Your Plot
# ------------------------------
# Pipeline: read the corpus -> co-occurrence counts -> reduce to 2 dimensions -> unit-length rows -> scatter plot.
reuters_corpus = read_corpus()
M_co_occurrence, word2ind_co_occurrence = compute_co_occurrence_matrix(reuters_corpus)
M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)

# Rescale (normalize) the rows to make them each of unit-length
# NOTE(review): a row whose norm is 0 would be divided by zero here -- confirm none occur.
M_lengths = np.linalg.norm(M_reduced_co_occurrence, axis=1)
M_normalized = M_reduced_co_occurrence / M_lengths[:, np.newaxis] # broadcasting

words = ['tonnes', 'grain', 'wheat',  'agriculture', 'corn', 'maize', 'export', 'department', 'barley', 'grains', 'soybeans', 'sorghum']

plot_embeddings(M_normalized, word2ind_co_occurrence, words)

Running Truncated SVD over 7146 words...
Done.


def load_embedding_model():
    """ Load GloVe Vectors
        Return:
            wv_from_bin: All 400000 embeddings, each length 200
    """
    import gensim.downloader as gensim_api

    glove_vectors = gensim_api.load("glove-wiki-gigaword-200")
    vocab_size = len(list(glove_vectors.index_to_key))
    print("Loaded vocab size %i" % vocab_size)
    return glove_vectors


# -----------------------------------
# Run Cell to Load Word Vectors
# Note: This will take a couple minutes
# -----------------------------------
# The loaded vectors are kept in a module-level name that the later cells query.
wv_from_bin = load_embedding_model()

[==================================================] 100.0% 252.1/252.1MB downloaded
Loaded vocab size 400000


def get_matrix_of_vectors(wv_from_bin, required_words=('tonnes', 'grain', 'wheat', 'agriculture', 'corn', 'maize', 'export', 'department', 'barley', 'grains', 'soybeans', 'sorghum')):
    """ Put a random sample of the GloVe vectors, plus some required words, into a matrix M.
        Param:
            wv_from_bin: KeyedVectors object; the 400000 GloVe vectors loaded from file
            required_words (iterable of strings): words that must end up in M in addition
                to the random sample; words without a vector are skipped silently
        Return:
            M: numpy array of shape (num words, embedding size) containing the vectors
            word2ind: dictionary mapping each word to its row number in M
    """
    import random
    words = list(wv_from_bin.index_to_key)
    print("Shuffling words ...")
    # Fixed seed: the same 10000-word sample is drawn on every call.
    random.seed(225)
    random.shuffle(words)
    words = words[:10000]
    print("Putting %i words into word2ind and matrix M..." % len(words))
    word2ind = {}
    M = []
    # Sampled words first, then the required ones.
    for w in words + list(required_words):
        # Skip anything that already has a row. This covers required words
        # that were sampled as well as repeats inside required_words, so
        # every row of M is referenced by exactly one word2ind entry.
        if w in word2ind:
            continue
        try:
            vector = wv_from_bin.get_vector(w)
        except KeyError:
            # No vector for this word.
            continue
        word2ind[w] = len(M)
        M.append(vector)
    M = np.stack(M)
    print("Done.")
    return M, word2ind


# -----------------------------------------------------------------
# Run Cell to Reduce 200-Dimensional Word Embeddings to k Dimensions
# Note: This should be quick to run
# -----------------------------------------------------------------
M, word2ind = get_matrix_of_vectors(wv_from_bin)
M_reduced = reduce_to_k_dim(M, k=2)

# Rescale (normalize) the rows to make them each of unit-length
# NOTE(review): a row whose norm is 0 would be divided by zero here -- confirm none occur.
M_lengths = np.linalg.norm(M_reduced, axis=1)
M_reduced_normalized = M_reduced / M_lengths[:, np.newaxis] # broadcasting

Shuffling words ...
Putting 10000 words into word2ind and matrix M...
Done.
Running Truncated SVD over 10012 words...
Done.


# Plot the normalized 2-D GloVe embeddings for the same word list used in the co-occurrence plot.
words = ['tonnes', 'grain', 'wheat',  'agriculture', 'corn', 'maize', 'export', 'department', 'barley', 'grains', 'soybeans', 'sorghum']
plot_embeddings(M_reduced_normalized, word2ind, words)


# ------------------
# Write your implementation here.
# Nearest neighbours of "set"; as the last expression of the cell, the result is displayed.
wv_from_bin.most_similar("set")
# ------------------

[('setting', 0.7918007969856262),
 ('sets', 0.7892743945121765),
 ('up', 0.7310757040977478),
 ('put', 0.6977022290229797),
 ('next', 0.6920910477638245),
 ('break', 0.6817953586578369),
 ('time', 0.677823543548584),
 ('out', 0.6720898151397705),
 ('before', 0.6685832738876343),
 ('three', 0.6600466370582581)]


# ------------------
# Write your implementation here.
# Compare the distance between w1 and a near-synonym (w2) with the distance
# between w1 and an antonym (w3).
w1 = "hungry"
w2 = "starve"
w3 = "full"
w1_w2_dist = wv_from_bin.distance(w1, w2)
w1_w3_dist = wv_from_bin.distance(w1, w3)

print("Synonyms {}, {} have cosine distance: {}".format(w1, w2, w1_w2_dist))
print("Antonyms {}, {} have cosine distance: {}".format(w1, w3, w1_w3_dist))

# ------------------

Synonyms hungry, starve have cosine distance: 0.5705483555793762
Antonyms hungry, full have cosine distance: 0.7731066644191742


# Run this cell to answer the analogy -- man : grandfather :: woman : x
# The candidates for x are printed together with their similarity scores.
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'grandfather'], negative=['man']))

[('grandmother', 0.7608445286750793),
 ('granddaughter', 0.7200808525085449),
 ('daughter', 0.7168302536010742),
 ('mother', 0.7151536345481873),
 ('niece', 0.7005682587623596),
 ('father', 0.6659887433052063),
 ('aunt', 0.6623408794403076),
 ('grandson', 0.6618767976760864),
 ('grandparents', 0.644661009311676),
 ('wife', 0.6445354223251343)]


# ------------------
# Write your implementation here.
# Analogy query -- man : king :: woman : x

pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'king'], negative=['man']))

# ------------------

[('queen', 0.6978678107261658),
 ('princess', 0.6081745028495789),
 ('monarch', 0.5889754891395569),
 ('throne', 0.5775108933448792),
 ('prince', 0.5750998258590698),
 ('elizabeth', 0.5463595986366272),
 ('daughter', 0.5399126410484314),
 ('kingdom', 0.5318052768707275),
 ('mother', 0.5168544054031372),
 ('crown', 0.5164473056793213)]


# ------------------
# Write your implementation here.
# Analogy query with positive words 'food' and 'beer' and negative word 'bar'
# (presumably meant as bar : beer :: food : x -- confirm the intended reading).

pprint.pprint(wv_from_bin.most_similar(positive=['food', 'beer'], negative=['bar']))

# ------------------

[('beverages', 0.5785805583000183),
 ('bottled', 0.5605046153068542),
 ('beverage', 0.5562623143196106),
 ('drinks', 0.5507627725601196),
 ('foods', 0.5487022399902344),
 ('foodstuffs', 0.5441000461578369),
 ('drink', 0.5392265915870667),
 ('supplies', 0.538281261920929),
 ('vegetables', 0.5174034237861633),
 ('consumption', 0.4991244971752167)]


# Run this cell
# Here `positive` indicates the list of words to be similar to and `negative` indicates the list of words to be
# most dissimilar from.
# The same query is run twice with 'girl' and 'boy' swapped so the two result lists can be compared.
pprint.pprint(wv_from_bin.most_similar(positive=['girl', 'toy'], negative=['boy']))
print()
pprint.pprint(wv_from_bin.most_similar(positive=['boy', 'toy'], negative=['girl']))

[('toys', 0.7094953060150146),
 ('doll', 0.5932914614677429),
 ('dolls', 0.570662260055542),
 ('barbie', 0.5407706499099731),
 ('mattel', 0.5328551530838013),
 ('accessories', 0.5206909775733948),
 ('hasbro', 0.49227219820022583),
 ('jewelry', 0.47385692596435547),
 ('lego', 0.4690813422203064),
 ('apparel', 0.46136239171028137)]

[('toys', 0.71570885181427),
 ('hasbro', 0.5164632797241211),
 ('robot', 0.47317108511924744),
 ('pet', 0.4670490324497223),
 ('manufacturer', 0.4668163061141968),
 ('mattel', 0.4582391679286957),
 ('lego', 0.45811763405799866),
 ('miniature', 0.4441472291946411),
 ('makers', 0.44298243522644043),
 ('manufactured', 0.44275349378585815)]


# ------------------
# Write your implementation here.
# The same query is run twice with 'black' and 'white' swapped so the two result lists can be compared.

pprint.pprint(wv_from_bin.most_similar(positive=['black', 'professor'], negative=['white']))
print()
pprint.pprint(wv_from_bin.most_similar(positive=['white', 'professor'], negative=['black']))
# ------------------

[('university', 0.6867130994796753),
 ('sociology', 0.683375358581543),
 ('emeritus', 0.6366648077964783),
 ('lecturer', 0.6357998847961426),
 ('graduate', 0.6073018908500671),
 ('harvard', 0.6018887162208557),
 ('psychology', 0.5925478339195251),
 ('teaches', 0.5869435667991638),
 ('anthropology', 0.586176335811615),
 ('scholar', 0.5861257314682007)]

[('university', 0.6411644220352173),
 ('harvard', 0.6299682855606079),
 ('lecturer', 0.6206913590431213),
 ('associate', 0.6085301637649536),
 ('researcher', 0.5977059006690979),
 ('scientist', 0.5916552543640137),
 ('yale', 0.5878740549087524),
 ('scholar', 0.5807212591171265),
 ('dr.', 0.5745213031768799),
 ('assistant', 0.5710031390190125)]

CS224N Assignment 1: Exploring Word Vectors (25 Points)¶

Due 3:15pm, Tue Jan 11 ¶

Word Vectors¶

Part 1: Count-Based Word Vectors (10 points)¶

Co-Occurrence¶

Plotting Co-Occurrence Word Embeddings¶

Question 1.1: Implement `distinct_words` [code] (2 points)¶

Question 1.2: Implement `compute_co_occurrence_matrix` [code] (3 points)¶

Question 1.3: Implement `reduce_to_k_dim` [code] (1 point)¶

Question 1.4: Implement `plot_embeddings` [code] (1 point)¶

Question 1.5: Co-Occurrence Plot Analysis [written] (3 points)¶

Write your answer here.¶

Part 2: Prediction-Based Word Vectors (15 points)¶

Note: If you are receiving a "reset by peer" error, rerun the cell to restart the download.¶

Reducing dimensionality of Word Embeddings¶

Question 2.1: GloVe Plot Analysis [written] (3 points)¶

Cosine Similarity¶

Question 2.2: Words with Multiple Meanings (1.5 points) [code + written]¶

Question 2.3: Synonyms & Antonyms (2 points) [code + written]¶

Question 2.4: Analogies with Word Vectors [written] (1.5 points)¶

Question 2.5: Finding Analogies [code + written] (1.5 points)¶

Question 2.6: Incorrect Analogy [code + written] (1.5 points)¶

Question 2.7: Guided Analysis of Bias in Word Vectors [written] (1 point)¶

Question 2.8: Independent Analysis of Bias in Word Vectors [code + written] (1 point)¶

Question 2.9: Thinking About Bias [written] (2 points)¶

Submission Instructions¶

*	`<START>`	all	that	glitters	is	not	gold	well	ends	`<END>`
`<START>`	0	2	0	0	0	0	0	0	0	0
all	2	0	1	0	1	0	0	0	0	0
that	0	1	0	1	0	0	0	1	1	0
glitters	0	0	1	0	1	0	0	0	0	0
is	0	1	0	1	0	1	0	1	0	0
not	0	0	0	0	1	0	1	0	0	0
gold	0	0	0	0	0	1	0	0	0	1
well	0	0	1	0	1	0	0	0	1	1
ends	0	0	1	0	0	0	0	1	0	0
`<END>`	0	0	0	0	0	0	1	1	0	0

CS224N Assignment 1: Exploring Word Vectors (25 Points)¶

Due 3:15pm, Tue Jan 11 ¶

Word Vectors¶

Part 1: Count-Based Word Vectors (10 points)¶

Co-Occurrence¶

Plotting Co-Occurrence Word Embeddings¶

Question 1.1: Implement distinct_words [code] (2 points)¶

Question 1.2: Implement compute_co_occurrence_matrix [code] (3 points)¶

Question 1.3: Implement reduce_to_k_dim [code] (1 point)¶

Question 1.4: Implement plot_embeddings [code] (1 point)¶

Question 1.5: Co-Occurrence Plot Analysis [written] (3 points)¶

Write your answer here.¶

Part 2: Prediction-Based Word Vectors (15 points)¶

Note: If you are receiving a "reset by peer" error, rerun the cell to restart the download.¶

Reducing dimensionality of Word Embeddings¶

Question 2.1: GloVe Plot Analysis [written] (3 points)¶

Cosine Similarity¶

Question 2.2: Words with Multiple Meanings (1.5 points) [code + written]¶

Question 2.3: Synonyms & Antonyms (2 points) [code + written]¶

Question 2.4: Analogies with Word Vectors [written] (1.5 points)¶

Question 2.5: Finding Analogies [code + written] (1.5 points)¶

Question 2.6: Incorrect Analogy [code + written] (1.5 points)¶

Question 2.7: Guided Analysis of Bias in Word Vectors [written] (1 point)¶

Question 2.8: Independent Analysis of Bias in Word Vectors [code + written] (1 point)¶

Question 2.9: Thinking About Bias [written] (2 points)¶

Submission Instructions¶

Question 1.1: Implement `distinct_words` [code] (2 points)¶

Question 1.2: Implement `compute_co_occurrence_matrix` [code] (3 points)¶

Question 1.3: Implement `reduce_to_k_dim` [code] (1 point)¶

Question 1.4: Implement `plot_embeddings` [code] (1 point)¶