def _parse_bool(text):
    """Convert a command-line string such as ``true``/``0``/``no`` to a bool.

    ``type=bool`` cannot be used with argparse for this purpose because
    ``bool('False')`` is ``True`` (every non-empty string is truthy).

    Raises:
        ValueError: if *text* is not a recognised boolean spelling; argparse
            reports this as an invalid argument value.
    """
    lowered = text.strip().lower()
    if lowered in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if lowered in ('', '0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise ValueError('expected a boolean value, got %r' % (text,))


def _load_embeddings(path):
    """Read ``word v1 v2 ...`` lines from *path* into ``{word: float32 array}``.

    Blank lines are skipped. When a word occurs more than once, the vector
    from its last occurrence is kept.
    """
    embeddings = {}
    with open(path) as handle:
        for line in handle:
            # Strip the newline so it never ends up glued to the last value.
            fields = line.rstrip().split(' ')
            if not fields[0]:
                continue
            embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return embeddings


def _cosine_scores(query, matrix):
    """Return the cosine similarity between *query* and every row of *matrix*.

    Zero-length vectors are given a norm of 1 so that their similarity is 0
    instead of a division by zero.
    """
    query_norm = np.linalg.norm(query) or 1.0
    row_norms = np.linalg.norm(matrix, axis=1)
    row_norms[row_norms == 0] = 1.0
    return matrix.dot(query) / (row_norms * query_norm)


def print_nearest_words(args):
    """Print the words whose vectors are closest to ``args.word``.

    Args:
        args: namespace providing
            ``vectors``   - path of the word-vector text file,
            ``word``      - the query word,
            ``num_words`` - how many neighbours to print,
            ``verbose``   - when true, print the score before each word.

    Raises:
        KeyError: if ``args.word`` has no vector in the file.
    """
    embeddings = _load_embeddings(args.vectors)
    if args.word not in embeddings:
        raise KeyError('word %r not found in %s' % (args.word, args.vectors))

    # Every word except the query itself is a candidate neighbour.
    candidates = [w for w in embeddings if w != args.word]
    if not candidates:
        return

    # One matrix-vector product scores the whole vocabulary at once.
    matrix = np.vstack([embeddings[w] for w in candidates])
    scores = _cosine_scores(embeddings[args.word], matrix)

    closest = Counter(dict(zip(candidates, scores))).most_common(args.num_words)

    for word, score in closest:
        if args.verbose:
            print(score, word)
        else:
            print(word)


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--vectors', default='vectors.txt', help='Word vector file')
    # --vocab is accepted for command-line compatibility; print_nearest_words
    # does not read it.
    parser.add_argument('--vocab', default='vocab.txt', help='Vocab file')
    parser.add_argument('--word', default='dollar', help='Input word')
    # nargs='?' keeps "--verbose True" working, makes a bare "--verbose" mean
    # True, and lets "--verbose False" really turn the flag off.
    parser.add_argument('--verbose', nargs='?', const=True, default=False, type=_parse_bool,
                        help='Print score')
    parser.add_argument('--num_words', type=int, default=5, help='Number of closest words to print')
    args = parser.parse_args()

    try:
        print_nearest_words(args)
    except KeyError as err:
        parser.error(str(err.args[0]))