-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
56 lines (49 loc) · 2.18 KB
/
data.py
File metadata and controls
56 lines (49 loc) · 2.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def load_dataset(dataset=None):
    """Build a vocabulary and many-to-one training samples from sentences.

    Each sentence is a sequence of word tokens. Every distinct word receives
    an integer id in order of first appearance (naive label encoding), and
    every sentence is expanded into one training sample per proper prefix:
    the prefix is the input and the word right after it is the target.

    Args:
        dataset: Optional list of tokenized sentences (each a sequence of
            hashable words). When omitted, the small built-in "john ..."
            corpus is used, so ``load_dataset()`` keeps its original
            behavior. The argument is never mutated.

    Returns:
        A 4-tuple ``(corpus, word_to_idx, idx_to_word, train_dataset)``:

        * ``corpus`` -- list of unique words in first-seen order.
        * ``word_to_idx`` -- dict mapping each word to its integer id.
        * ``idx_to_word`` -- dict mapping each integer id back to its word.
        * ``train_dataset`` -- list of dicts shaped like
          ``{"data": ['<s>', 'john'], "target": "went"}``.
    """
    if dataset is None:
        # Built-in toy corpus; '<s>' and '<e>' mark sentence start and end.
        dataset = [
            ['<s>', "john", "went", "to", "the", "store", "<e>"],
            ['<s>', "john", "went", "to", "the", "mall", "<e>"],
            ['<s>', "john", "went", "home", "early", "<e>"],
            ['<s>', "john", "is", "a", "engineer", "<e>"],
            ['<s>', "john", "is", "at", "home", "<e>"],
            ['<s>', "john", "can", "run", "fast", "<e>"],
            ['<s>', "john", "can", "go", "rest", "<e>"],
            ['<s>', "john", "can", "fly", "<e>"],
            ['<s>', "john", "can", "fly", "high", "<e>"],
            ['<s>', "john", "is", "not", "a", "data", "<e>"],
            ['<s>', "john", "is", "only", "at", "home", "<e>"],
            ['<s>', "john", "will", "jump", "<e>"],
            ['<s>', "john", "will", "jump", "high", "<e>"],
            ['<s>', "john", "will", "jump", "fast", "<e>"],
            ['<s>', "john", "is", "faster", "than", "me", "<e>"],
            ['<s>', "john", "is", "faster", "than", "a", "dog", "<e>"],
            ['<s>', "john", "is", "really", "fast", "<e>"],
            ['<s>', "john", "is", "only", "dead", "<e>"],
            ['<s>', "john", "is", "in", "love", "<e>"],
            ['<s>', "john", "is", "fly", "tomorrow", "<e>"],
            ['<s>', "john", "is", "fly", "high", "now", "<e>"],
        ]

    corpus = []
    word_to_idx = {}
    idx_to_word = {}

    # Naive label encoding: give each new word the next free integer id.
    for sentence in dataset:
        for word in sentence:
            # Membership is checked against the dict (constant-time lookup)
            # rather than by scanning the corpus list for every token.
            if word not in word_to_idx:
                word_id = len(corpus)
                corpus.append(word)
                word_to_idx[word] = word_id
                idx_to_word[word_id] = word

    # Many-to-one format: each growing prefix of a sentence predicts the
    # word that immediately follows it.
    train_dataset = []
    for sentence in dataset:
        for end in range(1, len(sentence)):
            # list(...) gives every sample its own independent prefix list.
            train_dataset.append(
                {"data": list(sentence[:end]), "target": sentence[end]}
            )

    return corpus, word_to_idx, idx_to_word, train_dataset