Merge pull request #5 from egracheva/master

GLambard · web-flow · commit 290cf9ffdfa4 · 2019-10-08T14:19:48.000+09:00
Fix random seed bug: name saved artifacts by fold index and seed the split inside random_split
diff --git a/SMILESX/embeddingvis.py b/SMILESX/embeddingvis.py
@@ -46,18 +46,17 @@ def Embedding_Vis(data,
     print("***SMILES_X for embedding visualization starts...***\n\n")
     np.random.seed(seed=123)
     seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
-    # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
-    selection_seed = seed_list[k_fold_index]
         
     print("******")
-    print("***Fold #{} initiated...***".format(selection_seed))
+    print("***Fold #{} initiated...***".format(k_fold_index))
     print("******")
 
     print("***Sampling and splitting of the dataset.***\n")
+    # Reproducing the data split of the requested fold (k_fold_index)
     x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
     utils.random_split(smiles_input=data.smiles, 
                        prop_input=np.array(data.iloc[:,1]), 
-                       random_state=selection_seed, 
+                       random_state=seed_list[k_fold_index], 
                        scaling = True)
   
     # data augmentation or not
@@ -102,7 +101,7 @@ def Embedding_Vis(data,
     train_unique_tokens.insert(0,'pad')
     
     # Tokens as a list
-    tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt')
+    tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(k_fold_index)+'.txt')
     # Add 'pad', 'unk' tokens to the existing list
     tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
     
@@ -116,7 +115,7 @@ def Embedding_Vis(data,
     token_to_int = token.get_tokentoint(tokens)
     int_to_token = token.get_inttotoken(tokens)
 
-    model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5', 
+    model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5', 
                              custom_objects={'AttentionM': model.AttentionM()})
 
     print("Chosen model summary:\n")
@@ -183,6 +182,6 @@ def Embedding_Vis(data,
     plt.yticks([])
     ax.axis('tight')
     
-    plt.savefig(save_dir+'Visualization_'+data_name+'_Embedding_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
+    plt.savefig(save_dir+'Visualization_'+data_name+'_Embedding_fold_'+str(k_fold_index)+'.png', bbox_inches='tight')
     plt.show()
 ##
diff --git a/SMILESX/inference.py b/SMILESX/inference.py
@@ -44,9 +44,6 @@ def Inference(data_name,
     os.makedirs(save_dir, exist_ok=True)
     
     print("***SMILES_X for inference starts...***\n\n")
-    np.random.seed(seed=123)
-    seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
-        
     print("***Checking the SMILES list for inference***\n")
     smiles_checked = list()
     smiles_rejected = list()
@@ -95,7 +92,7 @@ def Inference(data_name,
     for ifold in range(k_fold_number):
         
         # Tokens as a list
-        tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(seed_list[ifold])+'.txt')
+        tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt')
         # Add 'pad', 'unk' tokens to the existing list
         vocab_size = len(tokens)
         tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
@@ -105,7 +102,7 @@ def Inference(data_name,
         int_to_token = token.get_inttotoken(tokens)
         
         # Best architecture to visualize from
-        model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(seed_list[ifold])+'.hdf5', 
+        model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5', 
                                  custom_objects={'AttentionM': model.AttentionM()})
 
         if ifold == 0:
diff --git a/SMILESX/interpret.py b/SMILESX/interpret.py
@@ -67,18 +67,17 @@ def Interpretation(data,
     print("***SMILES_X Interpreter starts...***\n\n")
     np.random.seed(seed=123)
     seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
-    # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
-    selection_seed = seed_list[k_fold_index]
         
     print("******")
-    print("***Fold #{} initiated...***".format(selection_seed))
+    print("***Fold #{} initiated...***".format(k_fold_index))
     print("******")
 
     print("***Sampling and splitting of the dataset.***\n")
+    # Reproducing the data split of the requested fold (k_fold_index)
     x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
     utils.random_split(smiles_input=data.smiles, 
                        prop_input=np.array(data.iloc[:,1]), 
-                       random_state=selection_seed, 
+                       random_state=seed_list[k_fold_index], 
                        scaling = True)
 
     np.savetxt(save_dir+'smiles_train.txt', np.asarray(x_train), newline="\n", fmt='%s')
@@ -145,7 +144,7 @@ def Interpretation(data,
     train_unique_tokens.insert(0,'pad')
     
     # Tokens as a list
-    tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt')
+    tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(k_fold_index)+'.txt')
     # Add 'pad', 'unk' tokens to the existing list
     tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
     
@@ -160,7 +159,7 @@ def Interpretation(data,
     int_to_token = token.get_inttotoken(tokens)
 
     # Best architecture to visualize from
-    model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5', 
+    model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5', 
                                           custom_objects={'AttentionM': model.AttentionM()})
     best_arch = [model_topredict.layers[2].output_shape[-1]/2, 
                  model_topredict.layers[3].output_shape[-1], 
@@ -179,7 +178,7 @@ def Interpretation(data,
     print("\n")
 
     print("***Interpretation from the best model.***\n")
-    model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5')
+    model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5')
     model_att.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])
 
     smiles_toviz_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list= smiles_toviz_x_enum_tokens, 
@@ -210,7 +209,7 @@ def Interpretation(data,
                fontsize = font_size, 
                rotation = font_rotation)
     plt.yticks([])
-    plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
+    plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight')
     #plt.show()
     
     smiles_tmp = smiles_toviz_x_enum[ienumcard]
@@ -233,7 +232,7 @@ def Interpretation(data,
                                       colorMap='Reds', 
                                       contourLines = 10,
                                       alpha = 0.25)
-    fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
+    fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight')
     #fig.show()
     
     model_topredict.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])
@@ -276,7 +275,7 @@ def Interpretation(data,
                rotation = font_rotation)
     plt.yticks(fontsize = 20)
     plt.ylabel('Temporal relative distance', fontsize = 25, labelpad = 15)
-    plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
+    plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight')
     #plt.show()
 ##
 
diff --git a/SMILESX/main.py b/SMILESX/main.py
@@ -152,11 +152,10 @@ def Main(data,
         print("******")
         
         print("***Sampling and splitting of the dataset.***\n")
-        selection_seed = seed_list[ifold]
         x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
         utils.random_split(smiles_input=data.smiles, 
                            prop_input=np.array(data.iloc[:,1]), 
-                           random_state=selection_seed, 
+                           random_state=seed_list[ifold], 
                            scaling = True)
               
         # data augmentation or not
@@ -217,9 +216,9 @@ def Main(data,
         print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))
         
         # Save the vocabulary for re-use
-        token.save_vocab(tokens, save_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt')
+        token.save_vocab(tokens, save_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt')
         # Tokens as a list
-        tokens = token.get_vocab(save_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt')
+        tokens = token.get_vocab(save_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt')
         # Add 'pad', 'unk' tokens to the existing list
         tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
         
@@ -355,7 +354,7 @@ def create_mod(params):
         multi_model.compile(loss="mse", optimizer=custom_adam, metrics=[metrics.mae,metrics.mse])
         
         # Checkpoint, Early stopping and callbacks definition
-        filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5'
+        filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5'
         
         checkpoint = ModelCheckpoint(filepath, 
                                      monitor='val_loss', 
@@ -394,14 +393,13 @@ def create_mod(params):
         plt.ylabel('Loss')
         plt.xlabel('Epoch')
         plt.legend(['Train', 'Validation'], loc='upper right')
-        plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
+        plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight')
         plt.close()
         
         print("Best val_loss @ Epoch #{}\n".format(np.argmin(history.history['val_loss'])+1))
 
         print("***Predictions from the best model.***\n")
-        model_train.load_weights(save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5')
-#         model.save(save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5')  
+        model_train.load_weights(save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5')
         model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])
 
         # predict and compare for the training, validation and test sets
@@ -520,5 +518,5 @@ def create_mod(params):
         plt.legend()
 
         # Added fold number
-        plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_seed_'+str(selection_seed)+'_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80)
+        plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80)
         plt.close()
diff --git a/SMILESX/token.py b/SMILESX/token.py
@@ -117,26 +117,25 @@ def TokensFinder(data,
                  data_name, 
                  data_units = '',
                  k_fold_number = 8,
-                 k_fold_index=0,
+                 k_fold_index = 0,
                  augmentation = False, 
                  token_tofind = '', 
                  verbose = 1):
     
     print("***SMILES_X token's finder starts...***\n\n")
     np.random.seed(seed=123)
     seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
-    # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
-    selection_seed = seed_list[k_fold_index]
-        
+    
     print("******")
-    print("***Fold #{} initiated...***".format(selection_seed))
+    print("***Fold #{} initiated...***".format(k_fold_index))
     print("******")
 
     print("***Sampling and splitting of the dataset.***\n")
+    # Reproducing the data split of the requested fold (k_fold_index)
     x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
     utils.random_split(smiles_input=data.smiles, 
                        prop_input=np.array(data.iloc[:,1]), 
-                       random_state=selection_seed, 
+                       random_state=seed_list[k_fold_index], 
                        scaling = True)
     
     # data augmentation or not
diff --git a/SMILESX/utils.py b/SMILESX/utils.py
@@ -17,7 +17,7 @@
 #         3 arrays of properties for training, validation, test: y_train, y_valid, y_test, 
 #         the scaling function: scaler
 def random_split(smiles_input, prop_input, random_state, scaling = True):
-
+    np.random.seed(seed=random_state)
     full_idx = np.array([x for x in range(smiles_input.shape[0])])
     train_idx = np.random.choice(full_idx, 
                                  size=math.ceil(0.8*smiles_input.shape[0]),