Skip to content

Commit 290cf9f

Browse files
authored
Merge pull request #5 from egracheva/master
Random seed bug
2 parents: 9f5b5f4 + 57a562c — commit 290cf9f

6 files changed

Lines changed: 30 additions & 38 deletions

File tree

SMILESX/embeddingvis.py

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -46,18 +46,17 @@ def Embedding_Vis(data,
4646
print("***SMILES_X for embedding visualization starts...***\n\n")
4747
np.random.seed(seed=123)
4848
seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
49-
# Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
50-
selection_seed = seed_list[k_fold_index]
5149

5250
print("******")
53-
print("***Fold #{} initiated...***".format(selection_seed))
51+
print("***Fold #{} initiated...***".format(k_fold_index))
5452
print("******")
5553

5654
print("***Sampling and splitting of the dataset.***\n")
55+
# Reproducing the data split of the requested fold (k_fold_index)
5756
x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
5857
utils.random_split(smiles_input=data.smiles,
5958
prop_input=np.array(data.iloc[:,1]),
60-
random_state=selection_seed,
59+
random_state=seed_list[k_fold_index],
6160
scaling = True)
6261

6362
# data augmentation or not
@@ -102,7 +101,7 @@ def Embedding_Vis(data,
102101
train_unique_tokens.insert(0,'pad')
103102

104103
# Tokens as a list
105-
tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt')
104+
tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(k_fold_index)+'.txt')
106105
# Add 'pad', 'unk' tokens to the existing list
107106
tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
108107

@@ -116,7 +115,7 @@ def Embedding_Vis(data,
116115
token_to_int = token.get_tokentoint(tokens)
117116
int_to_token = token.get_inttotoken(tokens)
118117

119-
model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5',
118+
model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5',
120119
custom_objects={'AttentionM': model.AttentionM()})
121120

122121
print("Chosen model summary:\n")
@@ -183,6 +182,6 @@ def Embedding_Vis(data,
183182
plt.yticks([])
184183
ax.axis('tight')
185184

186-
plt.savefig(save_dir+'Visualization_'+data_name+'_Embedding_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
185+
plt.savefig(save_dir+'Visualization_'+data_name+'_Embedding_fold_'+str(k_fold_index)+'.png', bbox_inches='tight')
187186
plt.show()
188187
##

SMILESX/inference.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -44,9 +44,6 @@ def Inference(data_name,
4444
os.makedirs(save_dir, exist_ok=True)
4545

4646
print("***SMILES_X for inference starts...***\n\n")
47-
np.random.seed(seed=123)
48-
seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
49-
5047
print("***Checking the SMILES list for inference***\n")
5148
smiles_checked = list()
5249
smiles_rejected = list()
@@ -95,7 +92,7 @@ def Inference(data_name,
9592
for ifold in range(k_fold_number):
9693

9794
# Tokens as a list
98-
tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(seed_list[ifold])+'.txt')
95+
tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt')
9996
# Add 'pad', 'unk' tokens to the existing list
10097
vocab_size = len(tokens)
10198
tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
@@ -105,7 +102,7 @@ def Inference(data_name,
105102
int_to_token = token.get_inttotoken(tokens)
106103

107104
# Best architecture to visualize from
108-
model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(seed_list[ifold])+'.hdf5',
105+
model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5',
109106
custom_objects={'AttentionM': model.AttentionM()})
110107

111108
if ifold == 0:

SMILESX/interpret.py

Lines changed: 9 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -67,18 +67,17 @@ def Interpretation(data,
6767
print("***SMILES_X Interpreter starts...***\n\n")
6868
np.random.seed(seed=123)
6969
seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
70-
# Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
71-
selection_seed = seed_list[k_fold_index]
7270

7371
print("******")
74-
print("***Fold #{} initiated...***".format(selection_seed))
72+
print("***Fold #{} initiated...***".format(k_fold_index))
7573
print("******")
7674

7775
print("***Sampling and splitting of the dataset.***\n")
76+
# Reproducing the data split of the requested fold (k_fold_index)
7877
x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
7978
utils.random_split(smiles_input=data.smiles,
8079
prop_input=np.array(data.iloc[:,1]),
81-
random_state=selection_seed,
80+
random_state=seed_list[k_fold_index],
8281
scaling = True)
8382

8483
np.savetxt(save_dir+'smiles_train.txt', np.asarray(x_train), newline="\n", fmt='%s')
@@ -145,7 +144,7 @@ def Interpretation(data,
145144
train_unique_tokens.insert(0,'pad')
146145

147146
# Tokens as a list
148-
tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt')
147+
tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(k_fold_index)+'.txt')
149148
# Add 'pad', 'unk' tokens to the existing list
150149
tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
151150

@@ -160,7 +159,7 @@ def Interpretation(data,
160159
int_to_token = token.get_inttotoken(tokens)
161160

162161
# Best architecture to visualize from
163-
model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5',
162+
model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5',
164163
custom_objects={'AttentionM': model.AttentionM()})
165164
best_arch = [model_topredict.layers[2].output_shape[-1]/2,
166165
model_topredict.layers[3].output_shape[-1],
@@ -179,7 +178,7 @@ def Interpretation(data,
179178
print("\n")
180179

181180
print("***Interpretation from the best model.***\n")
182-
model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5')
181+
model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5')
183182
model_att.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])
184183

185184
smiles_toviz_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list= smiles_toviz_x_enum_tokens,
@@ -210,7 +209,7 @@ def Interpretation(data,
210209
fontsize = font_size,
211210
rotation = font_rotation)
212211
plt.yticks([])
213-
plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
212+
plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight')
214213
#plt.show()
215214

216215
smiles_tmp = smiles_toviz_x_enum[ienumcard]
@@ -233,7 +232,7 @@ def Interpretation(data,
233232
colorMap='Reds',
234233
contourLines = 10,
235234
alpha = 0.25)
236-
fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
235+
fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight')
237236
#fig.show()
238237

239238
model_topredict.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])
@@ -276,7 +275,7 @@ def Interpretation(data,
276275
rotation = font_rotation)
277276
plt.yticks(fontsize = 20)
278277
plt.ylabel('Temporal relative distance', fontsize = 25, labelpad = 15)
279-
plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
278+
plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight')
280279
#plt.show()
281280
##
282281

SMILESX/main.py

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -152,11 +152,10 @@ def Main(data,
152152
print("******")
153153

154154
print("***Sampling and splitting of the dataset.***\n")
155-
selection_seed = seed_list[ifold]
156155
x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
157156
utils.random_split(smiles_input=data.smiles,
158157
prop_input=np.array(data.iloc[:,1]),
159-
random_state=selection_seed,
158+
random_state=seed_list[ifold],
160159
scaling = True)
161160

162161
# data augmentation or not
@@ -217,9 +216,9 @@ def Main(data,
217216
print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))
218217

219218
# Save the vocabulary for re-use
220-
token.save_vocab(tokens, save_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt')
219+
token.save_vocab(tokens, save_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt')
221220
# Tokens as a list
222-
tokens = token.get_vocab(save_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt')
221+
tokens = token.get_vocab(save_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt')
223222
# Add 'pad', 'unk' tokens to the existing list
224223
tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
225224

@@ -355,7 +354,7 @@ def create_mod(params):
355354
multi_model.compile(loss="mse", optimizer=custom_adam, metrics=[metrics.mae,metrics.mse])
356355

357356
# Checkpoint, Early stopping and callbacks definition
358-
filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5'
357+
filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5'
359358

360359
checkpoint = ModelCheckpoint(filepath,
361360
monitor='val_loss',
@@ -394,14 +393,13 @@ def create_mod(params):
394393
plt.ylabel('Loss')
395394
plt.xlabel('Epoch')
396395
plt.legend(['Train', 'Validation'], loc='upper right')
397-
plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
396+
plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight')
398397
plt.close()
399398

400399
print("Best val_loss @ Epoch #{}\n".format(np.argmin(history.history['val_loss'])+1))
401400

402401
print("***Predictions from the best model.***\n")
403-
model_train.load_weights(save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5')
404-
# model.save(save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5')
402+
model_train.load_weights(save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5')
405403
model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])
406404

407405
# predict and compare for the training, validation and test sets
@@ -520,5 +518,5 @@ def create_mod(params):
520518
plt.legend()
521519

522520
# Added fold number
523-
plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_seed_'+str(selection_seed)+'_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80)
521+
plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80)
524522
plt.close()

SMILESX/token.py

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -117,26 +117,25 @@ def TokensFinder(data,
117117
data_name,
118118
data_units = '',
119119
k_fold_number = 8,
120-
k_fold_index=0,
120+
k_fold_index = 0,
121121
augmentation = False,
122122
token_tofind = '',
123123
verbose = 1):
124124

125125
print("***SMILES_X token's finder starts...***\n\n")
126126
np.random.seed(seed=123)
127127
seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
128-
# Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
129-
selection_seed = seed_list[k_fold_index]
130-
128+
131129
print("******")
132-
print("***Fold #{} initiated...***".format(selection_seed))
130+
print("***Fold #{} initiated...***".format(k_fold_index))
133131
print("******")
134132

135133
print("***Sampling and splitting of the dataset.***\n")
134+
# Reproducing the data split of the requested fold (k_fold_index)
136135
x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
137136
utils.random_split(smiles_input=data.smiles,
138137
prop_input=np.array(data.iloc[:,1]),
139-
random_state=selection_seed,
138+
random_state=seed_list[k_fold_index],
140139
scaling = True)
141140

142141
# data augmentation or not

SMILESX/utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717
# 3 arrays of properties for training, validation, test: y_train, y_valid, y_test,
1818
# the scaling function: scaler
1919
def random_split(smiles_input, prop_input, random_state, scaling = True):
20-
20+
np.random.seed(seed=random_state)
2121
full_idx = np.array([x for x in range(smiles_input.shape[0])])
2222
train_idx = np.random.choice(full_idx,
2323
size=math.ceil(0.8*smiles_input.shape[0]),

0 commit comments

Comments
 (0)