-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_data.py
executable file
·109 lines (77 loc) · 3.31 KB
/
read_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 14 15:30:58 2019
@author: Binit Gajera
"""
#%%
import matplotlib.pyplot as plt
import numpy as np
import logging
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
#%matplotlib inline
# %%
class Embeddings:
    """Produce BERT-based embedding vectors for input text.

    The 'bert-base-uncased' tokenizer and model are loaded once at
    construction time and reused for every ``getEmbeddings`` call.
    """

    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased')
        # Inference only: switch the module to evaluation mode.
        self.model.eval()

    def getEmbeddings(self, word: str) -> torch.Tensor:
        """Return a flat embedding vector for the first token of *word*.

        *word* is split into tokens by the tokenizer (no ``[CLS]``/``[SEP]``
        markers are added) and the token ids are fed through the model
        without gradient tracking.  For every layer the model returns, the
        hidden state of the FIRST token is taken, and those per-layer
        vectors are concatenated in layer order.

        Only the first token contributes to the result, even when *word*
        is split into several tokens.

        Args:
            word: Text to embed.

        Returns:
            A 1-D tensor of length ``num_layers * hidden_size``.

        Raises:
            ValueError: If the tokenizer produces no tokens for *word*
                (for example, an empty string).
        """
        tokens = self.tokenizer.tokenize(word)
        if not tokens:
            # An empty id list would give the model a zero-length sequence
            # and leave no first token to return; fail early and clearly.
            raise ValueError(f"word produced no tokens: {word!r}")

        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # Wrap in an outer list so the tensor has a leading batch axis of 1.
        tokens_tensor = torch.tensor([token_ids])

        with torch.no_grad():
            encoded_layers, _ = self.model(tokens_tensor)

        # Each layer output is indexed [batch, token, hidden]; pick batch 0,
        # token 0 from every layer and stack into (num_layers, hidden).
        first_token_by_layer = torch.stack(
            [layer[0, 0] for layer in encoded_layers], dim=0
        )
        return first_token_by_layer.reshape(-1)
# %%