## ----echo=FALSE, results="hide", message=FALSE-------------------------------- knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) library(BiocStyle) set.seed(42) # Check Python modules once at the start keras_installed <- reticulate::py_module_available("keras") numpy_installed <- reticulate::py_module_available("numpy") # If not installed, skip evaluation of all subsequent chunks knitr::opts_chunk$set( eval = keras_installed && numpy_installed ) ## ----------------------------------------------------------------------------- # suppressMessages(library(immApex)) # suppressMessages(library(keras3)) # suppressMessages(library(ggplot2)) # suppressMessages(library(viridis)) # suppressMessages(library(magrittr)) ## ----tidy = FALSE------------------------------------------------------------- # sequences <- generateSequences(prefix.motif = "CAS", # suffix.motif = "YF", # number.of.sequences = 1000, # min.length = 8, # max.length = 16) # head(sequences) ## ----tidy = FALSE------------------------------------------------------------- # nucleotide.sequences <- generateSequences(number.of.sequences = 1000, # min.length = 8, # max.length = 16, # sequence.dictionary = c("A", "C", "T", "G")) # head(nucleotide.sequences) ## ----tidy = FALSE, eval = reticulate::py_module_available("keras") && reticulate::py_module_available("numpy")---- # variational.sequences <- variationalSequences(sequences, # encoder = "onehotEncoder", # number.of.sequences = 100, # encoder.hidden.dim = c(256, 128), # latent.dim = 16, # batch.size = 16, # call.threshold = 0.1) # head(variational.sequences) ## ----tidy = FALSE------------------------------------------------------------- # mutated.sequences <- mutateSequences(sequences, # n.sequence = 1, # position.start = 3, # position.end = 8) # head(sequences) # head(mutated.sequences) ## ----tidy = FALSE------------------------------------------------------------- # data("immapex_example.data") # Adaptive_example <- formatGenes(immapex_example.data[["Adaptive"]], # region = "v", # technology = "Adaptive", # simplify.format = TRUE) # # head(Adaptive_example[,c("aminoAcid","vGeneName", "v_IMGT", "v_IMGT.check")]) ## ----tidy = FALSE------------------------------------------------------------- # TRBV_aa <- getIMGT(species = "human", # chain = "TRB", # frame = "inframe", # region = "v", # sequence.type = "aa") # # TRBV_aa[[1]][1] ## ----tidy = FALSE------------------------------------------------------------- # Adaptive_example <- inferCDR(Adaptive_example, # chain = "TRB", # reference = TRBV_aa, # technology = "Adaptive", # sequence.type = "aa", # sequences = c("CDR1", "CDR2")) # # Adaptive_example[200:210,c("CDR1_IMGT", "CDR2_IMGT")] ## ----tidy = FALSE------------------------------------------------------------- # sequence.matrix <- onehotEncoder(input.sequences = c(sequences, mutated.sequences), # convert.to.matrix = TRUE) # head(sequence.matrix[,1:20]) ## ----tidy = FALSE------------------------------------------------------------- # property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences), # method.to.use = "FASGAI", # convert.to.matrix = TRUE) # # head(property.matrix[,1:20]) ## ----tidy = FALSE------------------------------------------------------------- # mulit.property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences), # method.to.use = c("atchleyFactors", "kideraFactors"), # convert.to.matrix = TRUE) # # head(mulit.property.matrix[,1:20]) ## ----tidy = 
## ----tidy = FALSE-------------------------------------------------------------
# median.property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences),
#                                           method.to.use = "crucianiProperties",
#                                           summary.function = "median")
#
# head(median.property.matrix[,1:3])

## ----tidy = FALSE-------------------------------------------------------------
# geometric.matrix <- geometricEncoder(sequences,
#                                      method.to.use = "BLOSUM62",
#                                      theta = pi/3)
# head(geometric.matrix)

## ----tidy = FALSE-------------------------------------------------------------
# token.matrix <- tokenizeSequences(input.sequences = c(sequences, mutated.sequences),
#                                   add.startstop = TRUE,
#                                   start.token = "!",
#                                   stop.token = "^",
#                                   convert.to.matrix = TRUE)
# head(token.matrix[,1:18])

## ----tidy = FALSE-------------------------------------------------------------
# ppm.matrix <- probabilityMatrix(sequences)
# head(ppm.matrix)

## ----tidy = FALSE-------------------------------------------------------------
# set.seed(42)
# back.freq <- sample(1:1000, 20)
# back.freq <- back.freq/sum(back.freq)
#
# pwm.matrix <- probabilityMatrix(sequences,
#                                 max.length = 20,
#                                 convert.PWM = TRUE,
#                                 background.frequencies = back.freq)
# head(pwm.matrix)

## ----tidy = FALSE-------------------------------------------------------------
# adj.matrix <- adjacencyMatrix(sequences,
#                               normalize = FALSE)
# adj.matrix

## ----tidy = FALSE-------------------------------------------------------------
# property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences),
#                                    method.to.use = "FASGAI",
#                                    convert.to.matrix = TRUE)
#
# property.sequences <- sequenceDecoder(property.matrix,
#                                       encoder = "propertyEncoder",
#                                       aa.method.to.use = "FASGAI",
#                                       call.threshold = 1)
# head(sequences)
# head(property.sequences)

## ----tidy=FALSE---------------------------------------------------------------
# sequence.matrix <- onehotEncoder(input.sequences = c(sequences, mutated.sequences),
#                                  convert.to.matrix = TRUE)
#
# OHE.sequences <- sequenceDecoder(sequence.matrix,
#                                  encoder = "onehotEncoder")
#
# head(OHE.sequences)
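## ----tidy = FALSE-------------------------------------------------------------
# # Editorial sketch (not part of the original vignette): a quick round-trip
# # check that decoding the one-hot matrix recovers the input sequences,
# # assuming sequenceDecoder returns sequences in input order. The proportion
# # should be 1 (or very close) for a lossless one-hot round trip.
# mean(OHE.sequences == c(sequences, mutated.sequences))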
## ----tidy = FALSE-------------------------------------------------------------
# # Sampling to make Training/Validation Data Cohorts
# set.seed(42)
# num_sequences <- nrow(sequence.matrix)
# indices <- 1:num_sequences
# train_indices <- sample(indices, size = floor(0.8 * num_sequences))
# val_indices <- setdiff(indices, train_indices)
#
# x_train <- sequence.matrix[train_indices,]
# x_val <- sequence.matrix[val_indices,]
#
# # Parameters
# input_shape <- dim(x_train)[2]
# epochs <- 20
# batch_size <- 128
# encoding_dim <- 40
# hidden_dim1 <- 256 # Hidden layer 1 size
# hidden_dim2 <- 128 # Hidden layer 2 size
#
# es <- callback_early_stopping(monitor = "val_loss",
#                               min_delta = 0,
#                               patience = 4,
#                               verbose = 1,
#                               mode = "min")
#
# # Define the Model
# input_seq <- layer_input(shape = c(input_shape))
#
# # Encoder Layers
# encoded <- input_seq %>%
#   layer_dense(units = hidden_dim1, name = "e.1") %>%
#   layer_batch_normalization(name = "bn.1") %>%
#   layer_activation('leaky_relu', name = "act.1") %>%
#   layer_dense(units = hidden_dim2, name = "e.2") %>%
#   layer_batch_normalization(name = "bn.2") %>%
#   layer_activation('leaky_relu', name = "act.2") %>%
#   layer_dense(units = encoding_dim, activation = 'selu', name = "latent")
#
# # Decoder Layers
# decoded <- encoded %>%
#   layer_dense(units = hidden_dim2, name = "d.2") %>%
#   layer_batch_normalization(name = "bn.3") %>%
#   layer_activation('leaky_relu', name = "act.3") %>%
#   layer_dense(units = hidden_dim1, name = "d.1") %>%
#   layer_batch_normalization(name = "bn.4") %>%
#   layer_activation('leaky_relu', name = "act.4") %>%
#   layer_dense(units = input_shape, activation = 'sigmoid')
#
# # Autoencoder Model
# autoencoder <- keras_model(input_seq, decoded)
# autoencoder %>% keras3::compile(optimizer = optimizer_adam(learning_rate = 0.0001),
#                                 loss = "mse")
#
# # Train the model
# history <- autoencoder %>% fit(x = x_train,
#                                y = x_train,
#                                validation_data = list(x_val, x_val),
#                                epochs = epochs,
#                                batch_size = batch_size,
#                                shuffle = TRUE,
#                                callbacks = es,
#                                verbose = 0)
#
# plot(history) +
#   scale_color_viridis(option = "B", discrete = TRUE) +
#   scale_fill_manual(values = c("black","black")) +
#   theme_classic()

## ----tidy = FALSE-------------------------------------------------------------
# class1.sequences <- generateSequences(prefix.motif = "CAS",
#                                       suffix.motif = "YF",
#                                       number.of.sequences = 10000,
#                                       min.length = 8,
#                                       max.length = 16)
#
# class2.sequences <- generateSequences(prefix.motif = "CASF",
#                                       suffix.motif = "YF",
#                                       number.of.sequences = 10000,
#                                       min.length = 8,
#                                       max.length = 16)
#
# labels <- as.numeric(c(rep(0, 10000), rep(1, 10000)))
#
# classifier.matrix <- onehotEncoder(input.sequences = c(class1.sequences, class2.sequences),
#                                    convert.to.matrix = TRUE)

## ----tidy = FALSE-------------------------------------------------------------
# # Input shape will be 1D as we are using a matrix
# input.shape <- dim(classifier.matrix)[2]
#
# # Simple model structure
# classifier.model <- keras_model_sequential() %>%
#   layer_dense(units = 128, activation = "relu",
#               input_shape = c(input.shape)) %>%
#   layer_dense(units = 32, activation = "relu") %>%
#   layer_dense(units = 1, activation = "sigmoid")
#
# classifier.model %>% compile(
#   optimizer = optimizer_adam(learning_rate = 0.00001),
#   loss = "binary_crossentropy",
#   metrics = c("accuracy")
# )
#
# # Separating data and labels; hold out 20% of all sequences for validation
# set.seed(42)
# val_indices <- sample(nrow(classifier.matrix), nrow(classifier.matrix) * 0.2)
# x_val <- classifier.matrix[val_indices,]
# x_train <- classifier.matrix[-val_indices,]
#
# val_labels <- labels[val_indices]
# train_labels <- labels[-val_indices]
#
# # Training the classifier.model
# history <- classifier.model %>% fit(x_train,
#                                     train_labels,
#                                     epochs = 20,
#                                     batch_size = 32,
#                                     validation_data = list(x_val, val_labels),
#                                     verbose = 0
# )
#
# plot(history) +
#   scale_color_viridis(option = "B", discrete = TRUE) +
#   scale_fill_manual(values = c("black","black")) +
#   theme_classic()
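## ----tidy = FALSE-------------------------------------------------------------
# # Editorial sketch (not part of the original vignette): evaluate the trained
# # classifier on the held-out cohort. predict() returns per-sequence
# # probabilities; thresholding at 0.5 gives class calls, table() a simple
# # confusion matrix, and the final line overall accuracy.
# val_probs <- predict(classifier.model, x_val)
# val_calls <- as.numeric(val_probs > 0.5)
# table(predicted = val_calls, actual = val_labels)
# mean(val_calls == val_labels)

## -----------------------------------------------------------------------------
# sessionInfo()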