# get_data.py
import random
import json
import torch
import torch.optim as optim
import matplotlib.pyplot as plt
import torchtext
from torchtext import data
import spacy
import pandas as pd
import numpy as np
import time
import torch.nn.utils.rnn as tnt
import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# Download the spaCy English model (on spaCy v3+ the 'en' shortcut is gone;
# use 'en_core_web_sm' instead)
#!python -m spacy download en
# Get the data in TSV form
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
file = "/content/drive/My Drive/Colab Notebooks/Chat bot/intents.json"
path = "/content/drive/My Drive/Colab Notebooks/Chat bot"
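# Assumption: in Colab these Drive paths only resolve after mounting the
# drive, a setup step not shown in this script:
#   from google.colab import drive
#   drive.mount('/content/drive')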
def get_dataframe(filename):
    global label
    global tags
    global labeldecode
    with open(filename, 'r') as json_data:
        intents = json.load(json_data)
    # Converting to dataframe: collect each tag and its list of patterns
    tags = []
    patterns = []
    patterns_full = []
    for intent in intents['intents']:
        tag = intent['tag']
        # Add to the tag list
        tags.append(tag)
        for pattern in intent['patterns']:
            patterns.append(pattern)
        patterns_full.append(patterns)
        patterns = []
    # Label encode the tags: each tag's index becomes its integer label
    label = []
    for i, tag in enumerate(tags):
        label.append(i)
    # Reverse mapping from integer label back to tag name, for decoding predictions
    labeldecode = dict()
    for i in range(len(label)):
        labeldecode[label[i]] = tags[i]
    print(labeldecode)
    # Dictionary mapping every pattern string to its intent's integer label
    data1 = dict()
    print(len(patterns_full[0]))
    for i in range(len(tags)):
        for j in range(len(patterns_full[i])):
            data1[str(patterns_full[i][j])] = int(label[i])
    print(data1)
    df = pd.DataFrame(list(data1.items()), columns=['text', 'label'])
    print(df)
    print("=================================")
    # Show how many patterns each label has
    print(df.value_counts('label'))
    return df
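# For reference, get_dataframe assumes intents.json is shaped like this
# (a hypothetical example, not the project's actual data):
# {
#   "intents": [
#     {"tag": "greeting", "patterns": ["Hi", "Hello there"], "responses": ["..."]},
#     {"tag": "goodbye",  "patterns": ["Bye", "See you"],    "responses": ["..."]}
#   ]
# }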
df = get_dataframe(file)
print(label)
print(labeldecode)
# The validation set is just a copy of the training data; no held-out split
# is carved off this small dataset
train = df
validation = train
train.to_csv('/content/drive/My Drive/Colab Notebooks/Chat bot/train.tsv', sep='\t', index=False)
validation.to_csv('/content/drive/My Drive/Colab Notebooks/Chat bot/valid.tsv', sep='\t', index=False)
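# The TSVs written above contain a header row plus tab-separated (text, label)
# columns, which is why Tokenize below passes skip_header=True. A sketch of
# one row (hypothetical pattern): "What are your hours\t3"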
# Get the data in tabular and bucket-iterator form by tokenizing it.
# Note: data.Field, data.TabularDataset and data.BucketIterator belong to the
# legacy torchtext API (moved to torchtext.legacy in 0.9 and later removed),
# so this script needs an older torchtext release.
TEXT = data.Field(sequential=True, lower=True, tokenize='spacy', include_lengths=True)
LABELS = data.Field(sequential=False, use_vocab=False)
def Tokenize(path):
    global TEXT
    TEXT = data.Field(sequential=True, lower=True, tokenize='spacy', include_lengths=True)
    global LABELS
    LABELS = data.Field(sequential=False, use_vocab=False)
    # Both TSVs share the same schema: a header row, then (text, label) columns
    train_data, val_data = data.TabularDataset.splits(
        path=path, train='train.tsv', validation='valid.tsv',
        format='tsv', skip_header=True,
        fields=[('text', TEXT), ('label', LABELS)])
    # Bucket batches by text length so padding within a batch is minimal
    train_iter, val_iter = data.BucketIterator.splits(
        (train_data, val_data),
        batch_sizes=(9, 9),
        sort_key=lambda x: len(x.text),
        device=None, sort_within_batch=True,
        repeat=False)
    return train_iter, val_iter, train_data, val_data
train_iter, val_iter, train_data, val_data = Tokenize(path)
# Build the vocab over both splits and attach pretrained 100-d GloVe vectors
TEXT.build_vocab(train_data, val_data)
TEXT.vocab.load_vectors(torchtext.vocab.GloVe(name='6B', dim=100))
vocab = TEXT.vocab
print("Shape of Vocab:", TEXT.vocab.vectors.shape)