#!/usr/bin/env python # coding: utf-8 # In[ ]: #!/usr/bin/env python3 # coding: utf-8 from __future__ import division, print_function, absolute_import # In[ ]: # Usage: # ThisScript.py embedding_dim=6 units=500 nSeqTot=1000 nRepeats=10 condition=1 wordSizeIn=1 wordSizeOut=1 # fileInput=seeBelow colInput=0 colOutput=1 simSuffix="" batch_size=nSeqTot/50 # # Requires: # an input file with two columns: input and output. # the location is defined just below: # # Outputs: # creates a folder unique to this simulation # puts the weights of the fitted model inside # In[ ]: # local settings runningInCommandLine = True # In[ ]: import tensorflow as tf import collections import os import random import urllib import zipfile import numpy as np import tensorflow as tf import csv import time import sys import math # In[ ]: if(not(runningInCommandLine)): import matplotlib.pyplot as plt os.environ["CUDA_VISIBLE_DEVICES"] = "-1" #to get an ID of simulation from datetime import datetime from random import seed import random import jellyfish # In[ ]: #Default parameters (changeable by command line) #in the same order of command line arguments embedding_dim = 6 units = 500 #This is the hidden dimension, both encoder and decoder nSeqTot = 10000 nRepeats = 10 condition = 1 wordSizeIn = 1 wordSizeOut = 1 FileInput = "Task4_E_EpiSeq_ParaSeq_NoDeg.txt" colInput = 0 colOutput = 1 simSuffix = "" batch_size = 200 # In[ ]: #Non-command line options refMaxSeqFor20pctsTest = 1000000 paddingInputs = 'post' paddingOutputs = 'post' EPOCHS = 250 removeStar = True #Paratopes are distinguished by ending by a star. It means we will remove ending star from input and ouptuts sequences if they have it # In[ ]: #get a simID to save the model weights random.seed(datetime.now()) simID = random.randint(1,9999) #both numbers included print("This run has ID",str(simID)) #Note: the repeats will be called ID.1, etc... 
# ---------------------------------------------------------------------------
# Special vocabulary tokens.
# NOTE(review): in the collapsed source every one of these literals was an
# empty string (e.g. text.split('', 1), which raises "ValueError: empty
# separator", and a '' padding key that collides with the '' word produced by
# splitting ' A B '). The angle-bracket tokens look like they were stripped as
# markup; the names below are a reconstruction - confirm against the notebook.
# ---------------------------------------------------------------------------
PAD_TOKEN = '<pad>'
START_TOKEN = '<start>'
END_TOKEN = '<end>'

# Maximum number of rows written per evaluated batch / evaluated test sequences
MAX_ROWS_PER_BATCH = 100000
MAX_TEST_SEQUENCES = 100000
N_EVAL_BATCHES = 100

# ---------------------------------------------------------------------------
# Command line arguments (positional, all optional)
# ---------------------------------------------------------------------------
if runningInCommandLine:
    if len(sys.argv) > 1:
        embedding_dim = int(sys.argv[1])
    if len(sys.argv) > 2:
        units = int(sys.argv[2])
    if len(sys.argv) > 3:
        nSeqTot = int(sys.argv[3])
    if len(sys.argv) > 4:
        nRepeats = int(sys.argv[4])
    # 1 = only binders to exactly one of the selected antigen (maybe need to balance)
    # 3 = same + shuffling labels
    # 21 = multi-labels allowed
    # 23 = same + shuffling labels
    # 31 = only binders to exactly 1 + include 50% non-binders (but to other antigens) in the training
    # 33 = same + shuffling labels
    if len(sys.argv) > 5:
        condition = int(sys.argv[5])
    if len(sys.argv) > 6:
        wordSizeIn = int(sys.argv[6])
    if len(sys.argv) > 7:
        wordSizeOut = int(sys.argv[7])
    if len(sys.argv) > 8:
        FileInput = sys.argv[8]
        print("Input file has been manually given to ", FileInput)
    if len(sys.argv) > 9:
        colInput = int(sys.argv[9])
        print("Input column assign to be ", colInput)
    if len(sys.argv) > 10:
        colOutput = int(sys.argv[10])
        # The original message was a copy-paste of the "Input column" one.
        print("Output column assigned to be ", colOutput)
    if len(sys.argv) > 11:
        simSuffix = sys.argv[11]
        # The original message was a copy-paste of the "Input column" one.
        print("Simulation suffix assigned to be ", simSuffix)
    # argument 12 is the batch_size, see below


def RemoveEndingStar(text):
    """Returns text without its last character when that character is '*'."""
    if text.endswith('*'):
        return text[:-1]
    return text


def getFeaturesLabels(fileName, colI=0, colO=1):
    """Reads a tab-separated file and extracts one input and one output column.

    Rows whose first field starts with "#" or "seqAGEpitope" (comments and
    header) are ignored, as are completely empty rows. When the module-level
    flag removeStar is set, an ending '*' is removed from both fields.

    Args:
        fileName: path of the tab-separated file.
        colI: index of the column used as input sequence.
        colO: index of the column used as output sequence (label).

    Returns:
        (sequences, labels): two lists of strings of the same length.
    """
    sequences = []
    labels = []
    # 'with' guarantees that the file is closed (it was left open before)
    with open(fileName, newline='') as epiParaSeq:
        data_reader = csv.reader(epiParaSeq, delimiter='\t')   # transforms lines into lists
        for line in data_reader:
            if not line:
                continue   # blank line: csv gives [], line[0] would raise IndexError
            if line[0].startswith(("#", "seqAGEpitope")):
                continue
            seq = line[colI]
            lab = line[colO]
            if removeStar:
                seq = RemoveEndingStar(seq)
                lab = RemoveEndingStar(lab)
            sequences.append(seq)
            labels.append(lab)
    print("Got ", len(sequences), " sequences and ", len(labels), " from ", fileName)
    return (sequences, labels)


def _cutIntoWords(text, wordSize):
    """Cuts text into chunks of wordSize characters; a negative wordSize means 'split on spaces'."""
    if wordSize < 0:
        return text.split(' ')
    return [text[pos:pos + wordSize] for pos in range(0, len(text), wordSize)]


def createSentencePairs(sequences, labels, wordSizeSeq, wordSizeLabels):
    """Transforms sequences and labels into space-separated 'sentences'.

    Each sentence starts with START_TOKEN and ends with END_TOKEN, with the
    words separated by one space.

    Args:
        sequences: list of input strings.
        labels: list of output strings (same size as sequences).
        wordSizeSeq: size of words in sequences (negative: split on spaces).
        wordSizeLabels: size of words in labels (negative: split on spaces).

    Returns:
        A list of [inputSentence, outputSentence] pairs.

    Raises:
        ValueError: if sequences and labels do not have the same size.
    """
    if len(sequences) != len(labels):
        # Previously printed an error and returned (), which only failed later.
        raise ValueError("ERR: createSentencePairs, different size labels and sequences")

    print("First 5 sentence pairs generated:")
    pairs = []
    for rank, (inp, outp) in enumerate(zip(sequences, labels)):
        # The parameters are used here (the globals wordSizeIn / wordSizeOut
        # were read before, silently ignoring the arguments).
        wordsIn = _cutIntoWords(inp, wordSizeSeq)
        wordsOut = _cutIntoWords(outp, wordSizeLabels)
        pair = [START_TOKEN + ' ' + ' '.join(wordsIn) + ' ' + END_TOKEN,
                START_TOKEN + ' ' + ' '.join(wordsOut) + ' ' + END_TOKEN]
        if rank < 5:
            print(pair)
        pairs.append(pair)
    return pairs


def getVocabulary(pairs):
    """Generates the sorted lists of words found in the input and output sentences.

    Args:
        pairs: list of [inputSentence, outputSentence], words separated by spaces.

    Returns:
        (vocabIn, vocabOut): two sorted lists of distinct words.
    """
    vocabIn = set()
    vocabOut = set()
    for couple in pairs:
        vocabIn.update(couple[0].split(' '))
        vocabOut.update(couple[1].split(' '))
    vocabIn = sorted(vocabIn)
    vocabOut = sorted(vocabOut)
    print("Feature vocabulary: ", len(vocabIn), "words: ", vocabIn)
    print("Labels vocabulary: ", len(vocabOut), "words: ", vocabOut)
    return (vocabIn, vocabOut)


def generateDico(vocab, addPadding=True):
    """Builds the word -> index and index -> word dictionaries of a vocabulary.

    Words get indices 1, 2, ... in the order of vocab. Padding is included in
    the dictionary manually: when addPadding is True, PAD_TOKEN gets index 0.

    Returns:
        (word2idx, idx2word)
    """
    word2idx = {}
    if addPadding:
        word2idx[PAD_TOKEN] = 0
    for index, word in enumerate(vocab):
        word2idx[word] = index + 1
    idx2word = {index: word for word, index in word2idx.items()}
    return (word2idx, idx2word)


def processFile(fileName, wordSizeSeq=1, wordSizeLabels=1, colI=0, colO=1,
                paddingOptionInputs='post', paddingOptionOutputs='post'):
    """Reads a file and transforms it into padded arrays of word indices.

    Returns:
        (paddedInputTensor, paddedOutputTensor, word2idxIN, idx2wordIN,
         word2idxOUT, idx2wordOUT, NI, NO) where NI and NO are the maximum
        number of words (including start/end tokens) of inputs and outputs.
    """
    (sequences, labels) = getFeaturesLabels(fileName, colI, colO)
    featurePairs = createSentencePairs(sequences, labels, wordSizeSeq, wordSizeLabels)
    (vocabIn, vocabOut) = getVocabulary(featurePairs)
    (word2idxIN, idx2wordIN) = generateDico(vocabIn)
    (word2idxOUT, idx2wordOUT) = generateDico(vocabOut)

    input_tensor = [[word2idxIN[s] for s in first.split(' ')] for first, second in featurePairs]
    output_tensor = [[word2idxOUT[s] for s in second.split(' ')] for first, second in featurePairs]
    NI = max(len(t) for t in input_tensor)
    NO = max(len(t) for t in output_tensor)
    print("Max length input: ", NI, "Max length output:", NO)

    paddedInputTensor = tf.keras.preprocessing.sequence.pad_sequences(
        input_tensor, maxlen=NI, padding=paddingOptionInputs)
    paddedOutputTensor = tf.keras.preprocessing.sequence.pad_sequences(
        output_tensor, maxlen=NO, padding=paddingOptionOutputs)
    return (paddedInputTensor, paddedOutputTensor, word2idxIN, idx2wordIN,
            word2idxOUT, idx2wordOUT, NI, NO)


def padNewSentences(sentences, maxLength, word2IdDictionary, paddingOption):
    """Converts space-separated sentences into index arrays padded to maxLength."""
    input_tensor = [[word2IdDictionary[s] for s in sentence.split(' ')] for sentence in sentences]
    paddedSequences = tf.keras.preprocessing.sequence.pad_sequences(
        input_tensor, maxlen=maxLength, padding=paddingOption)
    return paddedSequences


class Encoder(tf.keras.Model):
    """Embedding followed by a GRU that returns all outputs and the last state."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        print("Initializing Encoder. Embedding into DE=", embedding_dim,
              ", GRU with hidden=", self.enc_units)

    def call(self, x, hidden):
        """Embeds x and runs the GRU from the given initial hidden state."""
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Returns a (batch_sz, enc_units) tensor of zeros."""
        return tf.zeros((self.batch_sz, self.enc_units))


class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(query) + W2(values)))."""

    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        print("Initializing BahdanauAttention for ", units, " hidden dimensions")

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights


class Decoder(tf.keras.Model):
    """Attention + embedding + GRU + dense layer over the output vocabulary."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        # used for attention
        self.attention = BahdanauAttention(self.dec_units)
        print("Initializing Decoder. Embedding from vocab size ", vocab_size,
              " into DE=", embedding_dim, ", GRU with hidden=", self.dec_units,
              " hidden dimensions; final layer outputs O=", vocab_size,
              " dimensions (alphabet)")

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        # (hidden is only used by the attention, it is not given as initial state)
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)
        return x, state, attention_weights


def vocalise(tensor, dico, wordsize=1):
    """Translates one sequence of indices into text, removing the special tokens.

    Words are concatenated, or joined by spaces when wordsize is negative.
    """
    separator = ' ' if wordsize < 0 else ''
    pulled = separator.join(dico[i] for i in tensor)
    return pulled.replace(START_TOKEN, "").replace(END_TOKEN, "").replace(PAD_TOKEN, "")


def vocaliseBatch(batched_tensor, dico, wordsize=1):
    """Translates a batch of index sequences into a list of texts.

    Start and padding tokens are removed, and everything from the first
    END_TOKEN on is discarded.
    """
    separator = ' ' if wordsize < 0 else ''
    result = []
    for tensor in batched_tensor:
        pulled = separator.join(dico[i] for i in tensor)
        text = pulled.replace(START_TOKEN, "").replace(PAD_TOKEN, "")
        # Now, removes anything happening after the end token
        stripped = text.split(END_TOKEN, 1)[0]
        result.append(stripped)
    return result


# Per-element loss (reduction='none') so that padded positions can be masked out.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


def loss_function(real, pred):
    """Mean cross-entropy where positions whose target is 0 (padding) count as 0."""
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)


def evaluateBatch(input_tensor, output_tensor, encoder, decoder, max_length_inp,
                  max_length_targ, word2idIN, word2idOUT):
    """Predicts the outputs of a batch of inputs, without teacher forcing.

    The decoder is always run for max_length_targ steps (it keeps predicting
    after the end token; the text is cut later by vocaliseBatch).

    Args:
        input_tensor: np array (nSequences, max_length_inp) of input indices.
            Note: it doesn't take a python array but only a np array with a type (float).
        output_tensor: np array (nSequences, max_length_targ) of expected indices.
        encoder, decoder: the models to use.
        max_length_inp, max_length_targ: maximum lengths of inputs / outputs.
        word2idIN, word2idOUT: word -> index dictionaries (word2idIN is unused).

    Returns:
        (result, attention_plot, predictionEachPos, loss):
        result is an int64 tensor (nSequences, 1 + max_length_targ) whose first
        column is 0 and the others are the predicted indices; attention_plot
        and predictionEachPos are returned as zeros (never filled); loss is the
        accumulated loss divided by max_length_targ.
    """
    input_batch_size = input_tensor.shape[0]
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    # Calculates the encoder hidden states
    hidden = tf.zeros((input_batch_size, encoder.enc_units))
    enc_out, enc_hidden = encoder(input_tensor, hidden)
    dec_hidden = enc_hidden

    # Every sequence starts with the start token. Uses the dictionary given as
    # argument (the global word2idxOUT was read here before).
    dec_input = tf.repeat(tf.expand_dims([word2idOUT[START_TOKEN]], 0), input_batch_size, axis=0)

    result = tf.zeros((input_batch_size, 1), tf.int64)
    predictionEachPos = tf.zeros((input_batch_size, len(word2idOUT)))
    loss = np.zeros((input_batch_size, 1))

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

        # NOTE(review): the prediction of step t is compared with
        # output_tensor[:, t], whereas train_step compares the prediction made
        # from position t-1 with targ[:, t]. Left untouched (the returned loss
        # is not used by the callers in this file) - confirm the intent.
        loss_t = loss_function(output_tensor[:, t], predictions)
        loss += loss_t

        predicted_id = tf.argmax(predictions, axis=1).numpy()
        result = tf.concat([result, tf.expand_dims(predicted_id, 1)], axis=1)

        # Here, we predict after the end token, and cut later.
        # The predicted ID is fed back into the model.
        dec_input = tf.expand_dims(predicted_id, 1)

    return (result, attention_plot, np.array(predictionEachPos), loss.numpy() / max_length_targ)


def writePredictions(keys, labels, resultsPath, encoder, decoder, nBatches=N_EVAL_BATCHES):
    """Predicts all the given sequences and appends one line per sequence to resultsPath.

    Columns: length of prediction, length of expected output, length of input,
    Levenshtein distance between prediction and expected output, this distance
    divided by the length of the prediction, then the three texts.
    Uses the module-level NI, NO, dictionaries and word sizes.
    """
    list_batched_inputs = np.array_split(keys, nBatches, 0)
    list_batched_outputs = np.array_split(labels, nBatches, 0)

    with open(resultsPath, "a") as f:
        # The header used to announce "lenout\tlenpred" while the rows are
        # written as (prediction length, expected length): names now match.
        f.write("lenpred\tlenout\tlenin\tld\tldnorm\tpred\tout\tin\n")
        for batched_inputs, batched_outputs in zip(list_batched_inputs, list_batched_outputs):
            if len(batched_inputs) == 0:
                continue   # happens when there are fewer sequences than batches
            (result, attention_plot, arrayPred, loss) = evaluateBatch(
                batched_inputs, batched_outputs, encoder, decoder, NI, NO, word2idxIN, word2idxOUT)
            batched_predictions = result.numpy()
            textIn = vocaliseBatch(batched_inputs, idx2wordIN, wordSizeIn)
            textPred = vocaliseBatch(batched_predictions, idx2wordOUT, wordSizeOut)
            textExpected = vocaliseBatch(batched_outputs, idx2wordOUT, wordSizeOut)

            rows = []
            for i in range(min(len(batched_predictions), MAX_ROWS_PER_BATCH)):
                ld = jellyfish.levenshtein_distance(textPred[i], textExpected[i])
                # An empty prediction used to raise ZeroDivisionError
                ldnorm = ld / len(textPred[i]) if textPred[i] else float("nan")
                rows.append("\t".join([str(len(textPred[i])), str(len(textExpected[i])),
                                       str(len(textIn[i])), str(ld), str(ldnorm),
                                       str(textPred[i]), str(textExpected[i]),
                                       str(textIn[i])]) + "\n")
            f.writelines(rows)


# ===========================================================================
# All functions defined, now the script is starting
# ===========================================================================

# For testing, different commands:
if False:
    embedding_dim = 6
    units = 500   # This is the hidden dimension, both encoder and decoder
    nSeqTot = 10000
    nRepeats = 10
    condition = 1
    wordSizeIn = 1
    wordSizeOut = 1
    FileInput = "Task4_E_EpiSeq_ParaSeq_NoDeg.txt"
    colInput = 0
    colOutput = 1
    simSuffix = "TestSim"
    # batch_size = 200 => so it's calculated as 1/50 of the usable data size
    refMaxSeqFor20pctsTest = 1000000
    paddingInputs = 'post'   # 'pre' or 'post'
    paddingOutputs = 'post'
    EPOCHS = 250

# ============== Now opens the file and preprocesses the data. ==================
# This step takes a long time.
# Note, we are actually working on np arrays. Tensorflow doesn't like the arrays from python.
(paddedInputTensor, paddedOutputTensor, word2idxIN, idx2wordIN, word2idxOUT, idx2wordOUT, NI, NO) = \
    processFile(FileInput, wordSizeIn, wordSizeOut, colInput, colOutput, paddingInputs, paddingOutputs)

# ============== Calculates the size of the test and training dataset ==================
nAvailable = len(paddedInputTensor)
nSeqTot = min(nSeqTot, nAvailable)
train_size = int(0.8 * nSeqTot)
test_size = int(0.2 * refMaxSeqFor20pctsTest)
if test_size + train_size > nAvailable:
    test_size = min(test_size, int(0.2 * nAvailable))
print('Requested sequences numbers: dataset=', nSeqTot, 'train=', train_size,
      'test=', test_size, 'totUsed=', train_size + test_size)

# Calculates batch sizes.
# By default, we will take 50 batches, so the size of a batch is nr sequences / 50
# (at least 1: fewer than 50 sequences used to give a batch size of 0)
batch_size = max(1, math.floor(nSeqTot / 50))
if runningInCommandLine:
    if len(sys.argv) > 12:
        batch_size = int(sys.argv[12])

# Defines the vocabulary sizes (the pad word is already included in the dictionary)
vocab_inp_size = len(word2idxIN)
vocab_tar_size = len(word2idxOUT)
print("Sizes including " + PAD_TOKEN + " word: vocab_inp_size=", vocab_inp_size,
      "vocab_tar_size=", vocab_tar_size)

# Sum up of requested task
print('ML Task 4, suffix', simSuffix, 'embedding_dim=', embedding_dim, 'units=', units,
      'nSeqTot=', nSeqTot, 'nRepeats=', nRepeats, 'condition=', condition,
      'wordSizeIn=', wordSizeIn, 'wordSizeOut', wordSizeOut, 'FileInput', FileInput,
      'colInput', colInput, 'colOutput', colOutput, 'batch_size=', batch_size,
      'refMaxSeqFor20pctsTest', refMaxSeqFor20pctsTest)

# This part can be repeated in the python script file
for repeat in range(nRepeats):

    # Creates code and folder: ex: T4_E6_H500_S10.0k_C1_W1_1_CI0CO1_B200_ID1.0
    simCode = ("T4_" + simSuffix + "_E" + str(embedding_dim) + "_H" + str(units)
               + "_S" + str(nSeqTot / 1000) + "k_C" + str(condition)
               + "_W" + str(wordSizeIn) + "_" + str(wordSizeOut)
               + "_CI" + str(colInput) + "CO" + str(colOutput)
               + "_B" + str(batch_size) + "_ID" + str(simID) + "." + str(repeat))
    print(simCode)
    if not os.path.exists(simCode):
        try:
            os.mkdir(simCode)
        except OSError:
            print("Creation of the directory %s failed" % simCode)
            sys.exit("Could not create folder => abort")
        else:
            print("Successfully created the directory %s " % simCode)
    else:
        print("WRN: Simulation folder", simCode, " already existed...")

    # Now randomly gets a subsampling
    possibleDataIDs = [*range(0, nAvailable)]
    random.shuffle(possibleDataIDs)

    # Extracts the train and test arrays - this step takes time
    # (2 mins for 100 000 sequences)
    trainKeys = paddedInputTensor[possibleDataIDs[0:train_size]].astype(float)
    testKeys = paddedInputTensor[possibleDataIDs[train_size: train_size + test_size]].astype(float)

    # Shuffled-labels control: labels are taken from a new random order.
    # NOTE(review): the argument description lists 3, 23 and 33 as the
    # label-shuffling conditions, but this test checks 3 and 13 - confirm.
    if (condition == 3) or (condition == 13):
        random.shuffle(possibleDataIDs)
    trainLabels = paddedOutputTensor[possibleDataIDs[0:train_size]]
    testLabels = paddedOutputTensor[possibleDataIDs[train_size: train_size + test_size]]

    # Transforms into tensorflow datasets
    TFkeysTrain = tf.data.Dataset.from_tensor_slices(trainKeys)
    TFvalTrain = tf.data.Dataset.from_tensor_slices(trainLabels)
    TFkeysTest = tf.data.Dataset.from_tensor_slices(testKeys)
    TFvalTest = tf.data.Dataset.from_tensor_slices(testLabels)

    # Transforms them into input and output elements (together)
    tfTrainElements = tf.data.Dataset.zip((TFkeysTrain, TFvalTrain))
    tfTestElements = tf.data.Dataset.zip((TFkeysTest, TFvalTest))
    train_dataset = tfTrainElements.shuffle(train_size).batch(batch_size, drop_remainder=True)
    test_dataset = tfTestElements.shuffle(test_size).batch(batch_size, drop_remainder=True)

    encoder = Encoder(vocab_inp_size, embedding_dim, units, batch_size)
    decoder = Decoder(vocab_tar_size, embedding_dim, units, batch_size)
    optimizer = tf.keras.optimizers.Adam()

    # This function implicitly needs:
    #   - an encoder and decoder
    #   - word2idxOUT
    #   - batch_size
    #   - an optimizer
    # It is (re)defined inside the repeat loop so that each repeat gets a fresh
    # tf.function bound to the models and optimizer created just above.
    @tf.function
    def train_step(inp, targ, enc_hidden, teacher_forcing=True):
        """Runs one optimisation step on one batch and returns its loss per position."""
        loss = 0
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, enc_hidden)
            dec_hidden = enc_hidden
            dec_input = tf.expand_dims([word2idxOUT[START_TOKEN]] * batch_size, 1)

            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
                loss_step = loss_function(targ[:, t], predictions)
                loss += loss_step
                # Note: impossible to get the content of tensors in mode
                # "@tf.function" => remove the decorator if you want to print

                if teacher_forcing:
                    # Teacher forcing - feeding the target as the next input
                    dec_input = tf.expand_dims(targ[:, t], 1)
                else:
                    # Non-teacher-forcing: the prediction is the next input
                    predicted_id = tf.argmax(predictions, axis=1)
                    dec_input = tf.expand_dims(predicted_id, 1)

        batch_loss = (loss / int(targ.shape[1]))
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        return batch_loss

    steps_per_epoch = len(trainKeys) // batch_size
    print("The nr of steps per epoch will be", steps_per_epoch)
    if steps_per_epoch == 0:
        # The average loss per epoch would otherwise divide by zero
        sys.exit("ERR: batch_size is larger than the number of training sequences => abort")

    # Two training phases: EPOCHS epochs with teacher forcing, then EPOCHS/10
    # epochs without; the weights are saved after each phase.
    trainingPhases = (("Forced", EPOCHS, True, ""),
                      ("Unforced", EPOCHS // 10, False, "Unforced"))
    for (phaseName, nEpochs, useTeacherForcing, weightSuffix) in trainingPhases:
        for epoch in range(nEpochs):
            start = time.time()
            enc_hidden = encoder.initialize_hidden_state()
            total_loss = 0
            for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
                batch_loss = train_step(inp, targ, enc_hidden, useTeacherForcing)
                total_loss += batch_loss
            print('{} Epoch {} Loss {:.4f}'.format(phaseName, epoch + 1, total_loss / steps_per_epoch))
            print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

        encoder.save_weights(simCode + "/" + 'encoder' + weightSuffix)
        decoder.save_weights(simCode + "/" + 'decoder' + weightSuffix)

    # Now writes the predictions for all training sequences...
    writePredictions(trainKeys, trainLabels, simCode + "/" + "TrainResults.txt", encoder, decoder)

    # ...and for the test sequences (at most the first MAX_TEST_SEQUENCES)
    if len(testKeys) > MAX_TEST_SEQUENCES:
        writePredictions(testKeys[0:MAX_TEST_SEQUENCES], testLabels[0:MAX_TEST_SEQUENCES],
                         simCode + "/" + "TestResults.txt", encoder, decoder)
    else:
        writePredictions(testKeys, testLabels, simCode + "/" + "TestResults.txt", encoder, decoder)