##### Potting tanglegrams for bat co-phylogeny paper ###

## set working dir 
#setwd("/Users/mah13c/Library/CloudStorage/OneDrive-CSIRO/Documents/Avirup_co-evolution/Tanglegrams")
#setwd("C:\\Users\\san260\\OneDrive - CSIRO\\Desktop\\R_codes\\All_virus")

#### -- Universal parameters -- #### (i.e. can be used for all trees and not repeated each tree)
## libraries
library(dplyr)
library(ape) # this will help me visualise the phylogenetic trees in a R environment
library(paco) #cophylogeny residuals
library(reshape2)
library(ggplot2)
library(brms) #Daniel's suggestion #Reference reading material: https://cran.r-project.org/web/packages/brms/index.html
library(RColorBrewer) # for cool colours
library(dplyr)
library(plotrix)
library(ggplotify) ## cophylogeny plotting
library(cowplot) ## arranging plots
library(scales)

## Set the graphical parameters for R’s base plotting system — specifically,removes all the margins around the plotting area.
par(oma=c(0,0,0,0),mar=c(0,0,0,0))

## Set the universal min and max line width size for lwd setting in the tanglegrams, and the level of transparency
lmin=1
lmax=3.5
ltran=0.85
## Set the universal space and gap settings for each tanglegram
sp=200
gp=0

#### Begin specific trees####
###bat paramyxovirus####
fam="Paramyxoviridae"
#input files
adata <- read.csv("paco_residuals.csv", stringsAsFactors = FALSE)
htree <- read.tree("Bat_tree.nwk")
ptree <- read.tree("Paramxyovirus_WGS_tree.newick")
print(ptree$tip.label)

## Clean virus tip labels
ptree$tip.label <- gsub("_$", "", ptree$tip.label)
print(ptree$tip.label)
plot(ptree)

## Remove duplicate rows (entire row match)
adata <- adata %>% distinct()

## View and check the cleaned data
#View(adata)
setdiff(adata$Host,htree$tip.label) ## finds any hosts listed in your metadata table (adata$Host) that do not appear as tip labels in the phylogenetic tree objec
setdiff(adata$Virus,ptree$tip.label) ## finds any virus names or IDs in your metadata table (adata$Virus) that are not found among the tip labels of the virus phylogenetic tree (ptree)

## Identify all zones in your data that have at least 2 entries (rows) — and then store the names of those qualifying zones in a vector called sufficient_links
sufficient_links <- adata %>%
  dplyr::group_by(zone) %>%
  dplyr::summarise(n_links = dplyr::n(), .groups = "drop") %>%
  dplyr::filter(n_links >= 2) %>%  # Change 2 to 10 if needed
  dplyr::pull(zone)
## Prune trees so that they only include the tips that are also found in the metadata table
htree=keep.tip(htree,adata$host)
ptree=keep.tip(ptree,adata$virus)
adata$phylo <- adata$host 

## Ensure categorical variables are factors
adata$zone <- as.factor(adata$zone)
adata$host <- as.factor(adata$host)
adata$phylo <- as.factor(adata$phylo)

## Filter zones with sufficient links (n >= 2 or 10) again
sufficient_links <- adata %>%
  dplyr::group_by(zone) %>%
  dplyr::summarise(n_links = dplyr::n(), .groups = "drop") %>%
  dplyr::filter(n_links >= 2) %>%  # Change 2 to 10 if needed
  dplyr::pull(zone)

## Prepare data for modeling
residual_data <- adata %>%
  filter(zone %in% sufficient_links) %>%
  select(host, virus, zone, residual)
#View(residual_data)

# set colors for cophylogeny 
cols=brewer.pal(3,"Pastel2")
cols=setNames(cols,unique(residual_data$zone))
residual_data$rcol=cols[residual_data$zone]

# Keep only valid host-virus pairs
residual_data <- residual_data %>%
  filter(host %in% htree$tip.label, virus %in% ptree$tip.label)

## Create the cophylogeny plot (no automatic ggplot legend)

p1 <- as.ggplot(~cophyloplot(htree, ptree,
                             assoc = residual_data[c("host", "virus")],
                             show.tip.label = FALSE,
                             use.edge.length = FALSE,
                             space = sp*0.6,
                             gap = gp,
                             length.line = 1,
                             col = adjustcolor(residual_data$rcol, alpha.f = ltran),
                             lwd = scales::rescale(residual_data$residual, c(lmin, lmax)),
                             lty = 1)) +
  xlim(0.15, 0.9) +
  ylim(0.15, 0.85) +
  ggtitle(fam) +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "italic"))

final_plot_paramyxo <- p1
p1

####bat coronavirus####
fam="Coronaviridae"
#input files
adata <- read.csv("merged_corona_residual.csv", stringsAsFactors = FALSE)
#View(adata)  
htree <- read.tree("Bat_tree.nwk")
htree$tip.label
ptree <- read.tree("BatCorona_Cophylo_231124_input.newick")
plot(ptree)

## Clean virus tip labels
ptree$tip.label <- gsub("_$", "", ptree$tip.label)
print(ptree$tip.label)
plot(ptree)

## Remove duplicate rows (entire row match)
adata <- adata %>% distinct()

## View and check the cleaned data
#View(adata)
setdiff(adata$Host,htree$tip.label) ## finds any hosts listed in your metadata table (adata$Host) that do not appear as tip labels in the phylogenetic tree objec
setdiff(adata$Virus,ptree$tip.label) ## finds any virus names or IDs in your metadata table (adata$Virus) that are not found among the tip labels of the virus phylogenetic tree (ptree)

## Identify all zones in your data that have at least 2 entries (rows) — and then store the names of those qualifying zones in a vector called sufficient_links
sufficient_links <- adata %>%
  dplyr::group_by(zone) %>%
  dplyr::summarise(n_links = dplyr::n(), .groups = "drop") %>%
  dplyr::filter(n_links >= 2) %>%  # Change 2 to 10 if needed
  dplyr::pull(zone)
## Prune trees so that they only include the tips that are also found in the metadata table
htree=keep.tip(htree,adata$host)
ptree=keep.tip(ptree,adata$virus)
adata$phylo <- adata$host 

## Ensure categorical variables are factors
adata$zone <- as.factor(adata$zone)
adata$host <- as.factor(adata$host)
adata$phylo <- as.factor(adata$phylo)

## Filter zones with sufficient links (n >= 2 or 10) again
sufficient_links <- adata %>%
  dplyr::group_by(zone) %>%
  dplyr::summarise(n_links = dplyr::n(), .groups = "drop") %>%
  dplyr::filter(n_links >= 2) %>%  # Change 2 to 10 if needed
  dplyr::pull(zone)

## Prepare data for modeling
residual_data <- adata %>%
  filter(zone %in% sufficient_links) %>%
  select(host, virus, zone, residual)
#View(residual_data)

## Keep only valid host-virus pairs --> specific to corona
residual_data <- residual_data %>%
  filter(host %in% htree$tip.label, virus %in% ptree$tip.label)
## Extra check specific for corona
setdiff(residual_data$host, htree$tip.label)  # Hosts missing in tree
setdiff(residual_data$virus, ptree$tip.label) # Viruses missing in tree

# set colors for cophylogeny and PGLMM
cols=brewer.pal(3,"Pastel2")
cols=setNames(cols,unique(residual_data$zone))
residual_data$rcol=cols[residual_data$zone]

## Create the cophylogeny plot (no automatic ggplot legend)

p1 <- as.ggplot(~cophyloplot(htree, ptree,
                             assoc = residual_data[c("host", "virus")],
                             show.tip.label = FALSE,
                             use.edge.length = FALSE,
                             space = sp,
                             gap = gp,
                             length.line = 1,
                             col = adjustcolor(residual_data$rcol, alpha.f = ltran),
                             lwd = scales::rescale(residual_data$residual, c(lmin, lmax)),
                             lty = 1)) +
  xlim(0.15, 0.9) +
  ylim(0.15, 0.85) +
  ggtitle(fam) +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "italic"))

final_plot_corona <- p1
p1
#####
# Prune trees and plot cophylogeny

# Ensure tip labels are characters (not factors)
htree$tip.label <- as.character(htree$tip.label)
ptree$tip.label <- as.character(ptree$tip.label)

# Ensure association names are characters
residual_data$host <- as.character(residual_data$host)
residual_data$virus <- as.character(residual_data$virus)

# Extract unique host and virus names from associations
hosts_in_assoc <- unique(residual_data$host)
viruses_in_assoc <- unique(residual_data$virus)

# Prune trees to include only taxa involved in associations
htree_pruned <- ape::keep.tip(htree, hosts_in_assoc[hosts_in_assoc %in% htree$tip.label])
ptree_pruned <- ape::keep.tip(ptree, viruses_in_assoc[viruses_in_assoc %in% ptree$tip.label])

# check that associations match pruned tree tips
stopifnot(all(residual_data$host %in% htree_pruned$tip.label))
stopifnot(all(residual_data$virus %in% ptree_pruned$tip.label))

# cophylogeny plot
p1 <- as.ggplot(~cophyloplot(htree_pruned, ptree_pruned,
                             assoc = residual_data[c("host", "virus")],
                             show.tip.label = FALSE,
                             use.edge.length = FALSE,
                             space = sp,
                             gap = gp,
                             length.line = 1,  # Short horizontal connector
                             col = adjustcolor(residual_data$rcol, alpha.f = ltran),
                             lwd = scales::rescale(residual_data$residual, c(lmin, lmax)),
                             lty = 1)) +
  xlim(0.15, 0.9) +
  ylim(0.15, 0.85) +
  ggtitle(fam) +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "italic"))

# Display plot
p1
final_plot_corona <- p1



#### bat rhabdovirus####
fam="Rhabdoviridae"
#input files
adata <- read.csv("residual_data_host_virus_zone.csv", stringsAsFactors = FALSE)
htree <- read.tree("Bat_tree.nwk")
ptree <- read.tree("Rhabdovirus_phylogenetic_tree250425.newick")

## Clean virus tip labels
ptree$tip.label <- gsub("_$", "", ptree$tip.label)
print(ptree$tip.label)
plot(ptree)

## Remove duplicate rows (entire row match)
adata <- adata %>% distinct()

## Optional: View and check the cleaned data
#View(adata)
setdiff(adata$Host,htree$tip.label) ## finds any hosts listed in your metadata table (adata$Host) that do not appear as tip labels in the phylogenetic tree objec
setdiff(adata$Virus,ptree$tip.label) ## finds any virus names or IDs in your metadata table (adata$Virus) that are not found among the tip labels of the virus phylogenetic tree (ptree)

## Identify all zones in your data that have at least 2 entries (rows) — and then store the names of those qualifying zones in a vector called sufficient_links
sufficient_links <- adata %>%
  dplyr::group_by(zone) %>%
  dplyr::summarise(n_links = dplyr::n(), .groups = "drop") %>%
  dplyr::filter(n_links >= 2) %>%  # Change 2 to 10 if needed
  dplyr::pull(zone)
## Prune trees so that they only include the tips that are also found in the metadata table
htree=keep.tip(htree,adata$host)
ptree=keep.tip(ptree,adata$virus)
adata$phylo <- adata$host 

## Ensure categorical variables are factors
adata$zone <- as.factor(adata$zone)
adata$host <- as.factor(adata$host)
adata$phylo <- as.factor(adata$phylo)

## Filter zones with sufficient links (n >= 2 or 10) again
sufficient_links <- adata %>%
  dplyr::group_by(zone) %>%
  dplyr::summarise(n_links = dplyr::n(), .groups = "drop") %>%
  dplyr::filter(n_links >= 2) %>%  # Change 2 to 10 if needed
  dplyr::pull(zone)

## Prepare data for modeling
residual_data <- adata %>%
  filter(zone %in% sufficient_links) %>%
  select(host, virus, zone, residual)
#View(residual_data)

# set colors for cophylogeny and PGLMM
cols=brewer.pal(4,"Pastel2")
cols=setNames(cols,unique(residual_data$zone))
residual_data$rcol=cols[residual_data$zone]


## Create the cophylogeny plot (no automatic ggplot legend)

p1 <- as.ggplot(~cophyloplot(htree, ptree,
                             assoc = residual_data[c("host", "virus")],
                             show.tip.label = FALSE,
                             use.edge.length = FALSE,
                             space = sp*0.8,
                             gap = gp,
                             length.line = 1,  # Short horizontal connector
                             col = adjustcolor(residual_data$rcol, alpha.f = ltran),
                             lwd = scales::rescale(residual_data$residual, c(lmin, lmax)),
                             lty = 1)) +
  xlim(0.15, 0.9) +
  ylim(0.15, 0.85) +
  ggtitle(fam) +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "italic"))

p1
final_plot_rhabdo <- p1

#### dummy plot for legend --> from rhabo
legend_data <- data.frame(zone = unique(residual_data$zone))
legend_colors <- unique(residual_data[, c("zone", "rcol")])
legend_colors <- legend_colors[order(legend_colors$zone), ]

legend_plot <- ggplot(legend_data, aes(x = zone, fill = zone)) +
  geom_bar() +
  scale_fill_manual(values = setNames(legend_colors$rcol, legend_colors$zone)) +
  theme_void() +
  guides(fill = guide_legend(title = "Zone")) +
  theme(legend.position = "top",
        legend.title.position = "top",
        legend.margin = margin(t = 0, r = 0, b = 28, l = 6, unit = "pt"),
        legend.direction = "horizontal",
        legend.title = element_text(size = 12, face = "bold.italic", hjust = 0.5),
        legend.text = element_text(size = 10)) +
  geom_bar(fill="white") #+
  # xlim(0.15, 0.9) +
  # ylim(0.15, 0.85)
  #guides(fill=guide_coloursteps(title.position="top"))
## remove bars by overwriting with white bars
legend_plot

#### Plot all tangelgrams and legend together####
## Stack coronaviridae plot on top of the legend for the left panel
left <- cowplot::plot_grid(final_plot_corona,
                           legend_plot,
                           ncol = 1,
                           nrow = 2,
                           labels = c("A", ""),  
                           label_size = 14,
                           rel_heights=c(60,1)
)
left
## Stack the Paramyxo and Rhabdo plots for the right panel
right <- plot_grid(final_plot_paramyxo,
                   final_plot_rhabdo,
                   ncol = 1,
                   nrow = 2,
                   labels = c("B", "C"),  
                   label_size = 14,
                   rel_heights=c(1,1.3)
)
right

all_plots <- plot_grid(left, right, ncol = 2, nrow = 1, 
                       align = "h", axis = "t", 
                       rel_widths=c(1.3,1), rel_heights=c(1,1))
all_plots
## Save to file at publication size
ggsave("tanglegrams.pdf", width = 17, height = 14, units = "cm")
ggsave("tanglegrams.png", width = 17, height = 14, units = "cm")
ggsave("tanglegrams.tiff", width = 17, height = 14, units = "cm")

# Restore default margins
#par(oma = c(0, 0, 0, 0), mar = c(5, 4, 4, 2) + 0.1)
