## ----echo=FALSE, results="hide", message=FALSE-------------------------------- knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) library(BiocStyle) set.seed(42) # Check Python modules once at the start keras_installed <- reticulate::py_module_available("keras") numpy_installed <- reticulate::py_module_available("numpy") # If not installed, skip evaluation of all subsequent chunks knitr::opts_chunk$set( eval = keras_installed && numpy_installed ) ## ----------------------------------------------------------------------------- # suppressMessages(library(immApex)) # suppressMessages(library(keras3)) # suppressMessages(library(ggplot2)) # suppressMessages(library(viridis)) # suppressMessages(library(magrittr)) ## ----tidy = FALSE------------------------------------------------------------- # sequences <- generateSequences(prefix.motif = "CAS", # suffix.motif = "YF", # number.of.sequences = 1000, # min.length = 8, # max.length = 16) # head(sequences) ## ----tidy = FALSE------------------------------------------------------------- # nucleotide.sequences <- generateSequences(number.of.sequences = 1000, # min.length = 8, # max.length = 16, # sequence.dictionary = c("A", "C", "T", "G")) # head(nucleotide.sequences) ## ----tidy = FALSE, eval = reticulate::py_module_available("keras") && reticulate::py_module_available("numpy")---- # variational.sequences <- variationalSequences(sequences, # encoder = "onehotEncoder", # number.of.sequences = 100, # encoder.hidden.dim = c(256, 128), # latent.dim = 16, # batch.size = 16, # call.threshold = 0.1) # head(variational.sequences) ## ----tidy = FALSE------------------------------------------------------------- # mutated.sequences <- mutateSequences(sequences, # n.sequence = 1, # position.start = 3, # position.end = 8) # head(sequences) # head(mutated.sequences) ## ----tidy = FALSE------------------------------------------------------------- # data("immapex_example.data") # Adaptive_example <- formatGenes(immapex_example.data[["Adaptive"]], # region = "v", # technology = "Adaptive", # simplify.format = TRUE) # # head(Adaptive_example[,c("aminoAcid","vGeneName", "v_IMGT", "v_IMGT.check")]) ## ----tidy = FALSE------------------------------------------------------------- # TRBV_aa <- getIMGT(species = "human", # chain = "TRB", # frame = "inframe", # region = "v", # sequence.type = "aa") # # TRBV_aa[[1]][1] ## ----tidy = FALSE------------------------------------------------------------- # Adaptive_example <- inferCDR(Adaptive_example, # chain = "TRB", # reference = TRBV_aa, # technology = "Adaptive", # sequence.type = "aa", # sequences = c("CDR1", "CDR2")) # # Adaptive_example[200:210,c("CDR1_IMGT", "CDR2_IMGT")] ## ----tidy = FALSE------------------------------------------------------------- # sequence.matrix <- onehotEncoder(input.sequences = c(sequences, mutated.sequences), # convert.to.matrix = TRUE) # head(sequence.matrix[,1:20]) ## ----tidy = FALSE------------------------------------------------------------- # property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences), # method.to.use = "FASGAI", # convert.to.matrix = TRUE) # # head(property.matrix[,1:20]) ## ----tidy = FALSE------------------------------------------------------------- # mulit.property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences), # method.to.use = c("atchleyFactors", "kideraFactors"), # convert.to.matrix = TRUE) # # head(mulit.property.matrix[,1:20]) ## ----tidy = 
## ----tidy = FALSE-------------------------------------------------------------
# median.property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences),
#                                           method.to.use = "crucianiProperties",
#                                           summary.function = "median")
#
# head(median.property.matrix[,1:3])

## ----tidy = FALSE-------------------------------------------------------------
# geometric.matrix <- geometricEncoder(sequences,
#                                      method.to.use = "BLOSUM62",
#                                      theta = pi/3)
# head(geometric.matrix)

## ----tidy = FALSE-------------------------------------------------------------
# token.matrix <- tokenizeSequences(input.sequences = c(sequences, mutated.sequences),
#                                   add.startstop = TRUE,
#                                   start.token = "!",
#                                   stop.token = "^",
#                                   convert.to.matrix = TRUE)
# head(token.matrix[,1:18])

## ----tidy = FALSE-------------------------------------------------------------
# ppm.matrix <- probabilityMatrix(sequences)
# head(ppm.matrix)

## ----tidy = FALSE-------------------------------------------------------------
# set.seed(42)
# back.freq <- sample(1:1000, 20)
# back.freq <- back.freq/sum(back.freq)
#
# pwm.matrix <- probabilityMatrix(sequences,
#                                 max.length = 20,
#                                 convert.PWM = TRUE,
#                                 background.frequencies = back.freq)
# head(pwm.matrix)

## ----tidy = FALSE-------------------------------------------------------------
# adj.matrix <- adjacencyMatrix(sequences,
#                               normalize = FALSE)
# adj.matrix

## ----tidy = FALSE-------------------------------------------------------------
# property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences),
#                                    method.to.use = "FASGAI",
#                                    convert.to.matrix = TRUE)
#
# property.sequences <- sequenceDecoder(property.matrix,
#                                       encoder = "propertyEncoder",
#                                       aa.method.to.use = "FASGAI",
#                                       call.threshold = 1)
# head(sequences)
# head(property.sequences)

## ----tidy=FALSE---------------------------------------------------------------
# sequence.matrix <- onehotEncoder(input.sequences = c(sequences, mutated.sequences),
#                                  convert.to.matrix = TRUE)
#
# OHE.sequences <- sequenceDecoder(sequence.matrix,
#                                  encoder = "onehotEncoder")
#
# head(OHE.sequences)
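## ----tidy = FALSE-------------------------------------------------------------
# # Editorial sketch (not part of the original vignette): a quick round-trip
# # check that decoding the one-hot matrix recovers the input sequences,
# # assuming sequenceDecoder returns sequences in input order. The proportion
# # should be 1 (or very close) for a lossless one-hot round trip.
# mean(OHE.sequences == c(sequences, mutated.sequences))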
## ----tidy = FALSE-------------------------------------------------------------
# # Sampling to make Training/Validation Data Cohorts
# set.seed(42)
# num_sequences <- nrow(sequence.matrix)
# indices <- 1:num_sequences
# train_indices <- sample(indices, size = floor(0.8 * num_sequences))
# val_indices <- setdiff(indices, train_indices)
#
# x_train <- sequence.matrix[train_indices,]
# x_val <- sequence.matrix[val_indices,]
#
# # Parameters
# input_shape <- dim(x_train)[2]
# epochs <- 20
# batch_size <- 128
# encoding_dim <- 40
# hidden_dim1 <- 256 # Hidden layer 1 size
# hidden_dim2 <- 128 # Hidden layer 2 size
#
# es <- callback_early_stopping(monitor = "val_loss",
#                               min_delta = 0,
#                               patience = 4,
#                               verbose = 1,
#                               mode = "min")
#
# # Define the Model
# input_seq <- layer_input(shape = c(input_shape))
#
# # Encoder Layers
# encoded <- input_seq %>%
#   layer_dense(units = hidden_dim1, name = "e.1") %>%
#   layer_batch_normalization(name = "bn.1") %>%
#   layer_activation('leaky_relu', name = "act.1") %>%
#   layer_dense(units = hidden_dim2, name = "e.2") %>%
#   layer_batch_normalization(name = "bn.2") %>%
#   layer_activation('leaky_relu', name = "act.2") %>%
#   layer_dense(units = encoding_dim, activation = 'selu', name = "latent")
#
# # Decoder Layers
# decoded <- encoded %>%
#   layer_dense(units = hidden_dim2, name = "d.2") %>%
#   layer_batch_normalization(name = "bn.3") %>%
#   layer_activation('leaky_relu', name = "act.3") %>%
#   layer_dense(units = hidden_dim1, name = "d.1") %>%
#   layer_batch_normalization(name = "bn.4") %>%
#   layer_activation('leaky_relu', name = "act.4") %>%
#   layer_dense(units = input_shape, activation = 'sigmoid')
#
# # Autoencoder Model
# autoencoder <- keras_model(input_seq, decoded)
# autoencoder %>% keras3::compile(optimizer = optimizer_adam(learning_rate = 0.0001),
#                                 loss = "mse")
#
# # Train the model
# history <- autoencoder %>% fit(x = x_train,
#                                y = x_train,
#                                validation_data = list(x_val, x_val),
#                                epochs = epochs,
#                                batch_size = batch_size,
#                                shuffle = TRUE,
#                                callbacks = es,
#                                verbose = 0)
#
# plot(history) +
#   scale_color_viridis(option = "B", discrete = TRUE) +
#   scale_fill_manual(values = c("black","black")) +
#   theme_classic()

## ----tidy = FALSE-------------------------------------------------------------
# class1.sequences <- generateSequences(prefix.motif = "CAS",
#                                       suffix.motif = "YF",
#                                       number.of.sequences = 10000,
#                                       min.length = 8,
#                                       max.length = 16)
#
# class2.sequences <- generateSequences(prefix.motif = "CASF",
#                                       suffix.motif = "YF",
#                                       number.of.sequences = 10000,
#                                       min.length = 8,
#                                       max.length = 16)
#
# labels <- as.numeric(c(rep(0, 10000), rep(1, 10000)))
#
# classifier.matrix <- onehotEncoder(input.sequences = c(class1.sequences, class2.sequences),
#                                    convert.to.matrix = TRUE)

## ----tidy = FALSE-------------------------------------------------------------
# # Input shape will be 1D as we are using a matrix
# input.shape <- dim(classifier.matrix)[2]
#
# # Simple model structure
# classifier.model <- keras_model_sequential() %>%
#   layer_dense(units = 128, activation = "relu",
#               input_shape = c(input.shape)) %>%
#   layer_dense(units = 32, activation = "relu") %>%
#   layer_dense(units = 1, activation = "sigmoid")
#
# classifier.model %>% compile(
#   optimizer = optimizer_adam(learning_rate = 0.00001),
#   loss = "binary_crossentropy",
#   metrics = c("accuracy")
# )
#
# # Separating data and labels; hold out 20% of all sequences for validation
# set.seed(42)
# val_indices <- sample(nrow(classifier.matrix), nrow(classifier.matrix) * 0.2)
# x_val <- classifier.matrix[val_indices,]
# x_train <- classifier.matrix[-val_indices,]
#
# val_labels <- labels[val_indices]
# train_labels <- labels[-val_indices]
#
# # Training the classifier.model
# history <- classifier.model %>% fit(x_train,
#                                     train_labels,
#                                     epochs = 20,
#                                     batch_size = 32,
#                                     validation_data = list(x_val, val_labels),
#                                     verbose = 0
# )
#
# plot(history) +
#   scale_color_viridis(option = "B", discrete = TRUE) +
#   scale_fill_manual(values = c("black","black")) +
#   theme_classic()
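## ----tidy = FALSE-------------------------------------------------------------
# # Editorial sketch (not part of the original vignette): evaluate the trained
# # classifier on the held-out cohort. predict() returns per-sequence
# # probabilities; thresholding at 0.5 gives class calls, table() a simple
# # confusion matrix, and the final line overall accuracy.
# val_probs <- predict(classifier.model, x_val)
# val_calls <- as.numeric(val_probs > 0.5)
# table(predicted = val_calls, actual = val_labels)
# mean(val_calls == val_labels)

## -----------------------------------------------------------------------------
# sessionInfo()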