#!/usr/bin/env python # coding: utf-8 # In[1]: #!/usr/bin/env python3 #Script that performs ML Task 1 for an antigen. #Usage in command line: #python ThisFile.py antigenID nNeurons=5 nSeqTot=1e7 nRepeats=10 condition=1 layers= batch_size=min(nSeqTot, sequences in dfrom __future__ import division, print_function, absolute_importataset) # In[2]: #Should be the first command from __future__ import division, print_function, absolute_import #tf.keras.backend.set_floatx('float64') problems with float32 versus float64. Don't get. # In[3]: #Local settings - Files necessary in that folder: antigenID + "_Task1_Slices[Imb/B]alancedData.txt" #LocationDataFiles = "C:/Users/pprobert/Desktop/Etulos/ML1_ResultsTask1/PythonNotebook/" LocationDataFiles = "../Task1/" runningInCommandLine = True nEpochs = 150 refMaxSeqFor20pctsTest = 100000 # test will be 20% of that at max. repeat = 1; #For the jupyter version # In[4]: #Default parameters (changeable by command line) antigenID = "1ADQ_A" nNeurons = 50 nSeqTot = 5000 nRepeats = 10 condition = 1 layers = 1 useAACompo = False # In[5]: import tensorflow as tf import collections import os import random import urllib import zipfile import numpy as np import tensorflow as tf import csv import sys import math #pip install tensorflow_datasets #import tensorflow_datasets as tfds if(not(runningInCommandLine)): import matplotlib.pyplot as plt # In[6]: #Hides the presence of the GPU, to avoid problems on immunhub os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # In[7]: #Parameters from command line. This cell does not work from Python notebook #print(len(sys.argv), str(sys.argv)) if(runningInCommandLine): if(len(sys.argv) > 1): antigenID = sys.argv[1] if(len(sys.argv) > 2): nNeurons = int(sys.argv[2]) if(len(sys.argv) > 3): nSeqTot = int(sys.argv[3]) if(len(sys.argv) > 4): nRepeats = int(sys.argv[4]) # 1 = balanced dataset + sigmoid # 2 = imbalanced dataset + sigmoid (a bit useless) # 3 = shuffling labels + sigmoid # First hard case is Mascotte (top 1%) versus loosers exclusive (top 5% to 1%) # 4 = HardV1 balanced + relu # 5 = HardV1 shuffling + relu # Second hard case is Mascotte exclusive (top 1% to 0.1%) versus Heroes (top 0.1%) # 6 = HardV2 balanced + relu # 7 = HardV2 shuffling + relu if(len(sys.argv) > 5): condition = int(sys.argv[5]) if(len(sys.argv) > 6): layers = int(sys.argv[6]) if(condition > 10): useAACompo = True condition = condition - 10 #see below for argument 7 # # In[8]: #Reading the data files and loading as two vectors, depending on antigenID and on the condition (balanced/imbalanced) Shuffling is made later fileToOpen = LocationDataFiles + antigenID + "_Task1_SlicesBalancedData.txt" if(condition == 2): fileToOpen = LocationDataFiles + antigenID + "_Task1_SlicesImbalancedData.txt" if((condition == 4) or (condition == 5)): fileToOpen = LocationDataFiles + antigenID + "_Task1aMvsL_SlicesBalancedData.txt" if((condition == 6) or (condition == 7)): fileToOpen = LocationDataFiles + antigenID + "_Task1bHvsM_SlicesBalancedData.txt" # In[9]: #Defining the one hot encoding functions and reverse alphabet = 'ACDEFGHIKLMNPQRSTVWY' # define a mapping of chars to integers char_to_int = dict((c, i) for i, c in enumerate(alphabet)) int_to_char = dict((i, c) for i, c in enumerate(alphabet)) def hotEncodingAAString(myString): onehot_encoded = list() integer_encoded = [char_to_int[char] for char in myString] for value in integer_encoded: letter = [0 for _ in range(len(alphabet))] letter[value] = 1 onehot_encoded.append(letter) #print(onehot_encoded) return[onehot_encoded] #input: [0,0,...,1,0,0] def retrieveAA(onehot_encodedAA): #print("AA", onehot_encodedAA) #if(sum(onehot_encodedAA) != 1): # return '?' foundAA = '?' for i in range(len(onehot_encodedAA)): if(onehot_encodedAA[i] == 1): foundAA = int_to_char[i] return foundAA #input: [[[0,0,...,1,0,0] , ... , [0,1,0,0... 0]]] def retrieveString(onehot_encodedString): #print("String", onehot_encodedString) foundString = "" for k in range(len(onehot_encodedString[0])): foundString = foundString + retrieveAA(onehot_encodedString[0][k]) return foundString # print(hotEncodingAAString("ACD")); print(retrieveString(hotEncodingAAString("ACD"))) # In[10]: balancedDataset = open(fileToOpen, newline = '') #one line is a text with \t and \n data_reader = csv.reader(balancedDataset, delimiter='\t') #transform lines into lists sequences = [] labels = [] for line in data_reader: if(not (line[0].startswith("#") or line[0].startswith("Slide"))): sequences.append(line[0]) labels.append(line[1]) # In[11]: #Conversion of the dataset into One Hot Encoding- takes 20 sec hotEncodedKeys = np.array(list(map(hotEncodingAAString, sequences))) if(useAACompo == True): hotEncodedKeys = np.sum(hotEncodedKeys, axis = 2) print(hotEncodedKeys.shape) print("AA Composition option ON\n") # In[12]: #Conversion of the Lavels into 0 or 1 (binary classification) binary = {"Binder": 1,"NonBinder": 0} binaryLabels = np.array([binary[item] for item in labels]) # In[13]: #Shuffle the labels if((condition == 3) or (condition == 5) or (condition == 7)): random.shuffle(binaryLabels) # In[14]: #If too many sequences are requested, will downscale, just to have this info in the output nSeqTot = min(nSeqTot,len(hotEncodedKeys)) train_size = int(0.8 * nSeqTot) test_size = int(0.2 * refMaxSeqFor20pctsTest) if test_size + train_size > len(hotEncodedKeys): test_size = min(test_size, int(0.2*len(hotEncodedKeys))) print('Requested: dataset=', nSeqTot, 'train=', train_size, 'test=', test_size, 'totUsed=', train_size + test_size) # In[15]: #By default, we will take 50 batches, so the size of a batch is nr sequences / 50 batch_size = math.floor(nSeqTot / 50) if(runningInCommandLine): if(len(sys.argv) > 7): batch_size = int(sys.argv[7]) # In[16]: possibleDataIDs = [*range(0,len(hotEncodedKeys))] random.shuffle(possibleDataIDs) # In[17]: #Transform the lists into tensorflow datasets (not yet batched) # First, we separate the test and train raw data manually because if we do it with datasets.take() or skip(), # there is a problem with shuffling (test and train might overlap), and then very difficult to get back # what was the train and test datasets. # Also, for getting multilabel confusion matrix later, better to keep the raw data aside and recreate # a non-shuffled dataset to do model.predict in the good order. trainKeys = hotEncodedKeys[possibleDataIDs[0:train_size]].astype(float); trainLabels = binaryLabels[possibleDataIDs[0:train_size]]; testKeys = hotEncodedKeys[possibleDataIDs[train_size: train_size + test_size]].astype(float); testLabels = binaryLabels[possibleDataIDs[train_size: train_size + test_size]]; #Transform into tensors TFkeysTrain = tf.data.Dataset.from_tensor_slices(trainKeys) TFvalTrain = tf.data.Dataset.from_tensor_slices(trainLabels) TFkeysTest = tf.data.Dataset.from_tensor_slices(testKeys) TFvalTest = tf.data.Dataset.from_tensor_slices(testLabels) #Transforms them into input and output elements (together) tfTrainElements = tf.data.Dataset.zip((TFkeysTrain, TFvalTrain)) tfTestElements = tf.data.Dataset.zip((TFkeysTest, TFvalTest)) #Finally, this is batched so can be used inside the training. train_dataset = tfTrainElements.shuffle(train_size).batch(batch_size, drop_remainder=True) test_dataset = tfTestElements.shuffle(test_size).batch(batch_size, drop_remainder=True) print('Obtained: train=', len(list(train_dataset)), ' test=', len(list(test_dataset))) # In[19]: #This is creating an additional test dataset, comparing Mascotte versus Loosers... #Should make a function at some point... fileToOpen2 = LocationDataFiles + antigenID + "_Task1aMvsL_SlicesBalancedData.txt" hardTestDataset = open(fileToOpen2, newline = '') #one line is a text with \t and \n data_reader = csv.reader(hardTestDataset, delimiter='\t') #transform lines into lists sequencesExternalTest = [] labelsExternalTest = [] for line in data_reader: if(not (line[0].startswith("#") or line[0].startswith("Slide"))): sequencesExternalTest.append(line[0]) labelsExternalTest.append(line[1]) # In[20]: #print(sequencesExternalTest[0:5]) #print(labelsExternalTest[0:5]) # In[21]: #This step is also a little long hotEncodedKeysExternal = np.array(list(map(hotEncodingAAString, sequencesExternalTest))) if(useAACompo == True): hotEncodedKeysExternal = np.sum(hotEncodedKeysExternal, axis = 2) binaryLabelsExternal = np.array([binary[item] for item in labelsExternalTest]) TFkeysExt = tf.data.Dataset.from_tensor_slices(hotEncodedKeysExternal) TFvalExt = tf.data.Dataset.from_tensor_slices(binaryLabelsExternal) tfElementsExt = tf.data.Dataset.zip((TFkeysExt, TFvalExt)) external_dataset = tfElementsExt.shuffle(len(sequencesExternalTest)).batch(batch_size, drop_remainder=True) # In[22]: if(not(runningInCommandLine)): print(sequences[0:10]) print(labels[0:10], binaryLabels[0:10]) print(len(sequences), len(labels)) print(hotEncodingAAString(sequences[0])) # In[23]: print('ML Task 1, Antigen ', antigenID, ' nNeurons=', nNeurons, 'nSeqTot=', nSeqTot, 'nRepeats=', nRepeats, 'condition=', condition, 'layers=', layers, 'batch_size=', batch_size, 'external_dataset', len(sequencesExternalTest), 'test_max_20pct_of', refMaxSeqFor20pctsTest) # In[24]: for repeat in range(nRepeats): #Finally, this is batched so can be used inside the training. train_dataset = tfTrainElements.shuffle(train_size).batch(batch_size, drop_remainder=True) test_dataset = tfTestElements.shuffle(test_size).batch(batch_size, drop_remainder=True) # Rebuild the model from scratch [not sure how initialized] model = tf.keras.Sequential(); model.add(tf.keras.layers.Flatten()); if(nNeurons > 1): model.add(tf.keras.layers.Dense(nNeurons, activation='relu')) if(layers > 1): model.add(tf.keras.layers.Dense(nNeurons, activation='relu')) if(layers > 2): model.add(tf.keras.layers.Dense(nNeurons, activation='relu')) if(layers > 3): model.add(tf.keras.layers.Dense(nNeurons, activation='relu')) model.add(tf.keras.layers.Dense(1, activation='sigmoid')) print('Sigmoid activation function') # In[25]: #Now try with more binary classification stuff #nEpochs = 40 optimizer = tf.keras.optimizers.Adam() loss = 'binary_crossentropy' metrics = ['accuracy', 'FalseNegatives', 'FalsePositives', 'Precision', 'Recall', 'TrueNegatives', 'TruePositives'] model.compile(loss=loss, optimizer=optimizer, metrics=metrics) # In[26]: input_shape = [1, 11, 20] if useAACompo == True: input_shape = [1, 20] model.build(input_shape) model.summary() # In[27]: history = model.fit(train_dataset, epochs=nEpochs, validation_data=test_dataset, validation_steps=1) # In[28]: history_dict = history.history print(history_dict.keys()) # In[29]: loss_values = history_dict['loss'] val_loss_values = history_dict['val_loss'] accuracy = history_dict['val_accuracy'] epochs = range(1, nEpochs+1) if(not(runningInCommandLine)): plt.plot(epochs, loss_values, 'bo', label='Training loss') plt.plot(epochs, val_loss_values, 'b', label='Validation loss') plt.plot(epochs, accuracy, 'r', label='Accuracy') plt.title('Training and validation loss + validation accuracy') plt.xlabel('Epochs') plt.ylabel('Loss') plt.legend() plt.show() # In[30]: print('Evaluation:', model.evaluate(test_dataset)) # In[31]: print('Evaluation External:', model.evaluate(external_dataset)) # In[32]: #file_object = open('History.txt', 'a') #file_object.write(antigenID + "\t" + str(nEpochs) + "\t" + str(nNeurons) + "\t" + str(nSeqTot) + "\t" + str(repeat) + "\t" + str(nRepeats)+ "\t" + str(condition) + "\t" + str(batch_size) + "\t" + str(layers) + '\t' + str(model.evaluate(test_dataset)) + "\n") #file_object.close() # In[33]: def getDataset(dataset): testLabels = [] testFeatures = [] #i = 0 for (features, labels) in dataset: for (seq , en1) in zip(features , labels): # for each batch testFeatures.append(seq) testLabels.append(en1.numpy().astype(float)); # tf.dtypes.cast(en, tf.int32)) return (testFeatures, testLabels) # In[34]: #if you want to retrieve the datasets #(trainX, trainY) = getDataset(train_dataset) #(testX, testY) = getDataset(test_dataset) #decodedTrainX = list(map(retrieveString, trainX)) #decodedTestX = list(map(retrieveString, testX)) # In[35]: #print(decodedTrainX[0:10]) #print(decodedTestX[0:10]) # In[36]: #print(len(decodedTrainX), len(decodedTestX)) #a = np.ndarray.flatten(np.array(decodedTrainX)) #b = np.ndarray.flatten(np.array(decodedTestX)) #common_elements = list(set(a).intersection(set(b))) #print(len(common_elements)) #common_elements[0:10] # In[37]: def myRound(logit): if(logit < 0.5): return 0 return 1 # In[38]: # Note: the lists of labels to build these datasets were: # trainLabels # testLabels train_dataset = tfTrainElements.batch(batch_size, drop_remainder=True) test_dataset = tfTestElements.batch(batch_size, drop_remainder=True) pred_train = model.predict(train_dataset) pred_test = model.predict(test_dataset) classes_train = list(map(myRound, pred_train)) classes_test = list(map(myRound, pred_test)) # In[39]: con_mat_train = tf.math.confusion_matrix(labels=trainLabels[0:len(classes_train)], predictions=classes_train).numpy() print(con_mat_train) # In[40]: con_mat_test = tf.math.confusion_matrix(labels=testLabels[0:len(classes_test)], predictions=classes_test).numpy() print(con_mat_test) # In[41]: from sklearn.metrics import roc_curve from sklearn.metrics import auc # In[42]: fpr, tpr, _ = roc_curve(testLabels, pred_test) roc_auc = auc(fpr, tpr) print(roc_auc) # In[43]: real_condition = condition if useAACompo == True: real_condition = real_condition + 10 file_object = open('History.txt', 'a') file_object.write(antigenID + "\t" + str(nEpochs) + "\t" + str(nNeurons) + "\t" + str(nSeqTot) + "\t" + str(repeat) + "\t" + str(nRepeats)+ "\t" + str(real_condition) + "\t" + str(batch_size) + "\t" + str(layers) + '\t' + str(model.evaluate(test_dataset)) + '\t' + str(roc_auc) + '\t' + str(model.evaluate(train_dataset)) + "\t" + str(model.evaluate(external_dataset)) + "\n") file_object.close() if(not(runningInCommandLine)): plt.figure() lw = 2 plt.plot(fpr, tpr, color='darkorange',lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic example') plt.legend(loc="lower right") plt.show() # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: