## Installations
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("DESeq2")
# BiocManager::install("tximport")
eukrhythmic compilation
Working with eukrhythmic output files
List of output directories:
final-files/
00-nucleotide_assembly/
MAD.filtered.nospace.fasta
: Full length sequences of merged assembly groups (MAD) that have been de-duplicated
01-predicted_proteins/
MAD.fasta.transdecoder.cds
: Identified coding regions of transcripts from the Merged assembly groups (MAD)MAD.fasta.transdecoder.pep
: Translated peptide sequences from the Merged assembly groups (MAD)
02-annotation_table/
TaxonomicAndFunctionalAnnotations.csv
: Taxonomic levels, GOs, PFAMs, and KEGG KOs - assigned by short seq ID and the full sequence ID.
03-abundance_tables/
SeqID_Dict_NoSpaceNames.csv
: key to link long sequence read IDs to ShortSeq IDs that will link to taxonomic and functional annotations
intermediate-files/
04-compare
14-MAD-mapping-filtered
: location of individual salmon outputs for all samples (quant.sf)
Compilation scripts
collate-results.py
: description of code
~Arianna - how do you want to handle this?~
Notes for Sarah & Arianna: hopefully we can include a description of taking outputs from eukrhythmic and putting them in specific directories, etc. that can go stright into the run_tximport.R below.
Obtain library normalized count data
R script to run this: scripts/run_tximport.R
.
Our goal is to take eukrhthymic estimated counts (from salmon) to get transcript-level abundance estimates that will account for gene length. The outcome from this is that we get average transcript length across samples. For this we want to import all data into DESeq using tximport as an interface.
- tximport
- Why tximport? tximport allows us to correct for the changes in gene length across our samples. This way we can use transcript-level estimates for our metatranscriptomic analysis. Also, tximport knows the output from salmon, so we can easily put the salmon output into tximport. Then it can link directly to edgeR or DESeq2.
- DESeq
- DESeq: Love, M.I., Huber, W., Anders, S. (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology, 15:550. 10.1186/s13059-014-0550-8
- Explaination using DESeq: https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html
- https://bioconductor.org/packages/release/bioc/manuals/DESeq2/man/DESeq2.pdf
Set up R environment
Library required to execute code in R
Load necessary libraries & get number of available threads for data.table.
# | message: false
# | warning: false
library(data.table)
library(tidyr)
library(dplyr)
library(tximport)
library(readr)
<- getDTthreads()
num_threads # Conflicts between tidyverse and data.table can be ignored
Output files fromrun_tximport.R
provide an RData file that can be used for the DESeq input file (tximport_metaT-ALOHA-CA.Rdata
).
DESeq2 analysis
Import as DESeq object from tximport.
library(DESeq2)
library(tidyverse)
load("tximport_metaT-ALOHA-CA.Rdata", verbose = TRUE)
# colData : Rows of colData correspond to columns of countData
# design ~ x == serves the purpose of considering biological replicates
Working with R objects, txi
and sample_merged
.
By sample name
For taxonomic bar plot and overall cluster analysis. (March and July are separate at ALOHA)
See how to set up subsetting? https://github.com/WHOIGit/super-waffle/blob/main/01_seqpain_deseq.R
<- DESeqDataSetFromTximport(txi,
ds_tpm_samplename colData = sample_merged,
design = ~0 + SAMPLENAME)
optional
<- rowSums(counts(ds_tpm_samplename)) >= 10
keep <- ds_tpm_samplename[keep,] dds
dds
includes > 10. Now dim is 6972094, it was originally, 14737135. Keeping about 47% of data.
Extract count table
library(tximport)
<- makeCountsFromAbundance(
counts_scaled as.matrix(txi$counts),
as.matrix(txi$abundance),
as.matrix(txi$length),
countsFromAbundance = "scaledTPM"
)
<- as.data.frame(counts_scaled) counts_df
Rename so replicates have the same name for counts
<- colnames(counts_df)
names_orig <- sub("_[^_]+$", "", names_orig)
names_new colnames(counts_df) <- names_new
Mean across columns that have the same name - which are replicates.
<- counts_df %>%
mean_counts_df cbind(as.list(.) %>%
Filter(is.numeric, .) %>%
split(names(.)) %>%
lapply(as.data.frame) %>%
lapply(rowMeans) %>%
setNames(paste0("mean.", names(.)))) %>%
select(starts_with("mean"))
save(mean_counts_df, sample_merged, file = "Avg_scaled_tpm_08222023.RData")
Extract centered data
See “Count data transformations” in https://introtogenomics.readthedocs.io/en/latest/2021.11.11.DeseqTutorial.html
Use variance Stabilizing Transformation (vst
) command to estimate dispersion.
# output is a DESeqtransform object
<- vst(ds_tpm_samplename)
vsd_all
# make a transformed count matrix
# vsd_blind <- vst(ds_tpm_samplename, blind = FALSE)
Example output: > using ‘avgTxLength’ from assays(dds), correcting for library size
Look at output with assay.
head(assay(vsd_all), 4)
Extract as a data frame, where values are center normalized - based on avgTxLength.
<- as.data.frame(assay(vsd_all)) df_ctr_norm
save(df_ctr_norm, file = "normed_center_df_08222023.RData")
DEseq2 reanalysis for DE genes
Repeat DESeq2 design to extracted differentially expressed genes for specific patterns below. Subset is required ahead of time - we can do this with txi directly.
There are a few different ways we want to perform the untargeted analysis. First we will do a ‘whole community’ series of analyses to look at overall amount of differentially expressed genes. Below we will set up parameters associated with the queries we’ve planned.
See script scripts/run_get_params.R
This is stored on GRACE.
Create subset parameters
Subset samples as parameters
Use the metadata data frame to create sample lists for the comparisons we want to do.
<- read.csv("input-data/sample-list-revised.txt") metadata
For the whole community work, we want to compare CA vs NPSG and euphotic vs. subeuphotic.
<- metadata %>%
all_samples select(sample = SAMPLE)
<- metadata %>%
npsg_only filter(PACIFIC_REGION == "NPSG") %>%
select(sample = SAMPLE)
# Compare euphotic and subeuphotic
# Use this to compare across months too
<- metadata %>%
ca_only filter(PACIFIC_REGION != "NPSG") %>%
select(sample = SAMPLE)
# Compare euphotic and subeuphotic
<- metadata %>%
euphotic filter(LIGHT == "Euphotic")%>%
select(sample = SAMPLE)
# Compare NPSG to CA euphotic zone
<- metadata %>%
subeuphotic filter(LIGHT != "Euphotic")%>%
select(sample = SAMPLE)
# Compare NPSG to CA subeuphotic zone
Subset genes & taxa as parameters
Need to make a list of the SequenceID
from the taxonomy and functional annotations.
<- read.csv("TaxonomicAndFunctionalAnnotations.csv") taxfxn
Genes
Determine character lists from gene list as subsets for our later queries.
# Get list of annotations where a GO, PFAM, and KEGG ID were assigned.
<- as.character(
genes_fxn_all filter(taxfxn, GOs != "-" & PFAMs != "-" & KEGG_ko != "-") %>%
select(SequenceID) %>%
"SequenceID"]]) #this line outputs the selected vector from the pipe
.[[
# Removing unclassified and unannotated sequences
<- as.character(
genes_tax_fxn_all filter(taxfxn, GOs != "-" & PFAMs != "-" & KEGG_ko != "-"
& Domain == "Eukaryota" & Supergroup != "Unclassified") %>%
select(SequenceID) %>%
"SequenceID"]]) .[[
From KEGG database reformatting make key_geneid
. Isolate genes that are curated.
<- read.csv("../KEGG_DB/combined_kegg.csv")
kegg head(kegg$KO_number)
<- read.csv("../KEGG_DB/reformat-kegg-pfam-skh.csv")
curated_kegg
<- curated_kegg %>%
key_geneid select(-X) %>%
right_join(kegg %>% select(KEGG = KO_number, everything(), -X)) %>%
distinct() %>%
select(starts_with("KeggOrthology_"), Category01, Category02, FullName, GeneID, Gene_identification, KEGG, PFAM, Descriptions, REF = REFs)
# write.csv(key_geneid, file = "keygene_id.csv")
<- as.character(key_geneid %>%
kegg_keep filter(Category01 != "" & !(is.na(Category01))) %>%
select(KEGG) %>%
"KEGG"]])
.[[# length(kegg_keep)
<- as.character(taxfxn %>%
genes_kegg_curated mutate(KEGG_mod = str_remove_all(KEGG_ko, "ko:")) %>%
filter(KEGG_mod %in% kegg_keep) %>%
select(SequenceID) %>%
"SequenceID"]]) .[[
Taxa
Isolate transcript IDs (SequenceID) of individual taxa. For each taxa subset, we are also only selecting transcript IDs with PFAM or KEGG IDs.
For queries, we want to isolate:
<- as.character(
euks_only filter(taxfxn, Domain == "Eukaryota" & (PFAMs != "-" | KEGG_ko != "-")) %>%
select(SequenceID) %>%
"SequenceID"]])
.[[
<- as.character(
diatom filter(taxfxn, Class == "Bacillariophyta" & (PFAMs != "-" | KEGG_ko != "-")) %>%
select(SequenceID) %>%
"SequenceID"]])
.[[
#Dinoflagellata pPhylum
<- as.character(
dinos filter(taxfxn, Phylum == "Dinoflagellata" & (PFAMs != "-" | KEGG_ko != "-")) %>%
select(SequenceID) %>%
"SequenceID"]])
.[[
#Haptophyta in phylum
<- as.character(
hapto filter(taxfxn, Phylum == "Haptophyta" & (PFAMs != "-" | KEGG_ko != "-")) %>%
select(SequenceID) %>%
"SequenceID"]])
.[[
# Chlorophyta
<- as.character(
chloro filter(taxfxn, Phylum == "Chlorophyta" & (PFAMs != "-" | KEGG_ko != "-")) %>%
select(SequenceID) %>%
"SequenceID"]])
.[[
#Ciliophora
<- as.character(
ciliate filter(taxfxn, Phylum == "Ciliophora" & (PFAMs != "-" | KEGG_ko != "-")) %>%
select(SequenceID) %>%
"SequenceID"]])
.[[
# Rhizaria at supergroup
<- as.character(
rhizaria filter(taxfxn, Supergroup == "Rhizaria" & (PFAMs != "-" | KEGG_ko != "-")) %>%
select(SequenceID) %>%
"SequenceID"]]) .[[
Save objects for subseting.
save(
# These are data frames
all_samples, npsg_only, ca_only, euphotic, subeuphotic,# these are lists (character lists)
genes_fxn_all, genes_tax_fxn_all, key_geneid, genes_kegg_curated,
euks_only, diatom, dinos, hapto, chloro, ciliate, rhizaria,file = "sample-gene-lists-forTXI.RData")
Txi function
Code for this function is from: https://rdrr.io/github/hcnh174/hlsgr/man/subsetTxi.html Code: https://rdrr.io/github/hcnh174/hlsgr/src/R/rnaseq.R
# Subset txi directly
<- function(txi, samples, include_genes=rownames(txi$counts))
subsetTxi
{<- rownames(txi$counts)[rownames(txi$counts) %in% include_genes]
genes $abundance <- txi$abundance[genes, samples$sample]
txi$counts <- txi$counts[genes, samples$sample]
txi$length <- txi$length[genes, samples$sample]
txireturn(txi)
}
# Example usage
# tmp <- sample(taxfxn$SequenceID,10,replace = FALSE)
# pola <- data.frame(sample = c("PortofLA_1", "PortofLA_2"))
# txi_pola <- subsetTxi(txi, pola, tmp)
Whole community DESeq script
Compare across light and pacific region, first isolate only eukaryotes and annotated transcripts.
All samples
See script scripts/run_txi_wholecomm.R
.
# Subset eukaryotes only, and keep all samples.
<- subsetTxi(txi, all_samples, euks_only)
txi_euk_annot
<- DESeqDataSetFromTximport(txi_euk_annot,
ds_tpm_light colData = sample_merged,
design = ~0 + LIGHT)
<- DESeqDataSetFromTximport(txi_euk_annot,
ds_tpm_pac colData = sample_merged,
design = ~0 + PACIFIC_REGION)
<- DESeqDataSetFromTximport(txi_euk_annot,
ds_tpm_lightpac colData = sample_merged,
design = ~0 + PACIFIC_REGION + LIGHT)
save(ds_tpm_light, ds_tpm_pac, ds_tpm_lightpac, file = "light-pacregion-deseq.RData")
By region
See script scripts/run_txi_region.r
.
# Subset txi
<- subsetTxi(txi, npsg_only, euks_only)
txi_npsg
# Reset sample_merged
## Subset
<- sample_merged %>%
tmp_sample_merged filter(Sample_rep %in% as.character(npsg_only$sample))
## Set names
rownames(tmp_sample_merged) <- tmp_sample_merged$Sample_rep
rownames(tmp_sample_merged) <- colnames(txi_npsg$counts)
# Compare Euphotic vs. sub-euphotic samples in the NPSG
<- DESeqDataSetFromTximport(txi_npsg,
ds_tpm_npsg_light colData = tmp_sample_merged,
design = ~0 + LIGHT)
# Compare July vs. March in the NPSG
<- DESeqDataSetFromTximport(txi_npsg,
ds_tpm_npsg_month colData = tmp_sample_merged,
design = ~0 + MONTH)
save(ds_tpm_npsg_light, ds_tpm_npsg_month, file = "/vortexfs1/scratch/sarahhu/txi-objs-metaT/npsg-deseq.RData")
<- subsetTxi(txi, ca_only, euks_only)
txi_ca
<- sample_merged %>%
tmp_sample_merged filter(Sample_rep %in% as.character(ca_only$sample))
rownames(tmp_sample_merged) <- tmp_sample_merged$Sample_rep
rownames(tmp_sample_merged) <- colnames(txi_ca$counts)
# Compare euphotic vs. subeuphotic in coastal California
## Includes Port of LA and Catalina
<- DESeqDataSetFromTximport(txi_ca,
ds_tpm_ca_light colData = tmp_sample_merged,
design = ~0 + LIGHT)
save(ds_tpm_ca_light, file = "/vortexfs1/scratch/sarahhu/txi-objs-metaT/ca-deseq.RData")
##
rm(txi_npsg);rm(txi_ca) #Save room
By light availability
See script scripts/run_txi_light.r
.
# Subset txi
<- subsetTxi(txi, euphotic, euks_only)
txi_euph
<- sample_merged %>%
tmp_sample_merged filter(Sample_rep %in% as.character(euphotic$sample))
rownames(tmp_sample_merged) <- tmp_sample_merged$Sample_rep
rownames(tmp_sample_merged) <- colnames(txi_euph$counts)
# Compare CA versus NPSG within euphotic samples
<- DESeqDataSetFromTximport(txi_euph,
ds_tpm_euphotic colData = tmp_sample_merged,
design = ~0 + PACIFIC_REGION)
<- subsetTxi(txi, subeuphotic, euks_only)
txi_subeuph
<- sample_merged %>%
tmp_sample_merged filter(Sample_rep %in% as.character(subeuphotic$sample))
rownames(tmp_sample_merged) <- tmp_sample_merged$Sample_rep
rownames(tmp_sample_merged) <- colnames(txi_subeuph$counts)
# Compare CA versus NPSG at subeuphotic depths
<- DESeqDataSetFromTximport(txi_subeuph,
ds_tpm_subeuphotic colData = tmp_sample_merged,
design = ~0 + PACIFIC_REGION)
save(ds_tpm_subeuphotic, ds_tpm_euphotic, file = "/vortexfs1/scratch/sarahhu/txi-objs-metaT/euphotic-subeuphotic-deseq.RData")
# save(txi_euph, txi_subeuph, file = "/vortexfs1/scratch/sarahhu/txi-objs-metaT/light_txi.RData")
By taxonomic groups - DESeq
Compare and contrast within regions. For part of our analysis we want to subset only the euphotic zone samples. Specifically targeting the difference between the SPOT station surface compared to: NPSG surface, NPSG DCM, NPSG 150m, Port of LA, and Catalina.
Question 1
For the first of our two core questions, we want to know what genes are significantly differentially expressed among the euphotic zone samples for the dominant protistan phytoplankton. We are specifically focusing on diatoms, chlorophytes, archaeplastidia (chlorophya), dinoflagellates, and haptophytes for this.
How does nutrient utilization among euphotic zone phytoplankton vary between coastal California and the NPSG?
/scripts/run_euphotic_deseq_taxa.R
# Function to subset txi by individual taxa and perform DESeq
<- function(sample_set, gene_set){
deseq_region_bytax # First incorporate the txi subset fxn
<- function(txi, samples, include_genes=rownames(txi$counts))
subsetTxi
{<- rownames(txi$counts)[rownames(txi$counts) %in% include_genes]
genes $abundance <- txi$abundance[genes, samples$sample]
txi$counts <- txi$counts[genes, samples$sample]
txi$length <- txi$length[genes, samples$sample]
txireturn(txi)
}# Run subset and sample merge re-set
<- subsetTxi(txi, sample_set, gene_set)
txi_output # This script will keep replacing tmp_sample_merged
<- sample_merged %>%
tmp_sample_merged filter(Sample_rep %in% as.character(sample_set$sample))
rownames(tmp_sample_merged) <- tmp_sample_merged$Sample_rep
rownames(tmp_sample_merged) <- colnames(txi_output$counts)
#
# Import as DESeq object - use PACIFIC_REGION in the design
# DESeq
<- DESeqDataSetFromTximport(txi_output,
ds_tpm_output colData = tmp_sample_merged,
design = ~0 + PACIFIC_REGION)
# return(ds_tpm_output)
# Further process DESeq
<- 2 # Transcript to consider, must be in at least 3 samples
groupsize <- rowSums(counts(ds_tpm_output) >= 10) >= groupsize # And have >= to 10 counts
keep <- ds_tpm_output[keep,]
ds_tpm_output_filtered ###
# Filtering stats:
cat("\nStarted with ", dim(ds_tpm_output)[1], "observations. Filtering by 2 samples and 10 counts resulted in,", dim(ds_tpm_output_filtered)[1], ", which is", (100*(dim(ds_tpm_output_filtered)[1]/dim(ds_tpm_output)[1])), "% of the data.\n\n")
###
#
## Positive log fold change == up regulared in CA, compared to NPSG
$PACIFIC_REGION <- factor(ds_tpm_output_filtered$PACIFIC_REGION, levels = c("NPSG", "CA"))
ds_tpm_output_filtered<- DESeq2::DESeq(ds_tpm_output_filtered)
de_tax_output resultsNames(de_tax_output)
summary(de_tax_output)
return(de_tax_output)
cat("\n\nCompleted.\n\n")
}
## Apply to each taxa
# From the euphotic samples, what are DE genes among haptophytes between CA and NPSG?
cat("\n\nDiatoms start.\n\n")
<- deseq_region_bytax(euphotic, diatom)
de_diatom
cat("\n\Haptophytes start.\n\n")
# From the euphotic samples, what are DE genes among haptophytes between CA and NPSG?
<- deseq_region_bytax(euphotic, hapto)
de_hapto
cat("\n\Dinoflagellates start.\n\n")
# From the euphotic samples, what are DE genes among Dinoflagellates between CA and NPSG?
<- deseq_region_bytax(euphotic, dinos)
de_dinos
cat("\n\Chlorophytes start.\n\n")
# From the euphotic samples, what are DE genes among chlorophytes between CA and NPSG?
<- deseq_region_bytax(euphotic, chloro)
de_chloro
save(de_diatom, de_hapto, de_dinos, de_chloro, file = "/scratch/group/hu-lab/pacocean-metaT/Robjs/euphotic_by_taxa.RData")
Question 2
We can repeat the above approach, but select different taxa.
Do we see a similar euphotic vs. sub-euphotic shift in the metabolic potential of taxa observed at coastal California and the NPSG?
Use scripts/run_bydepth_deseq_taxa.R
.
Curated genes only
Here, we isolated annotated transcripts that we curated ourselves.
library(tximport)
library(tidyverse)
<- metadata %>%
all_samples select(sample = SAMPLE)
<- subsetTxi(txi, all_samples, genes_kegg_curated)
txi_curated
<- makeCountsFromAbundance(
counts_scaled_curated as.matrix(txi_curated$counts),
as.matrix(txi_curated$abundance),
as.matrix(txi_curated$length),
countsFromAbundance = "scaledTPM"
)
<- as.data.frame(counts_scaled_curated)
counts_df_curated
<- colnames(counts_df_curated)
names_orig <- sub("_[^_]+$", "", names_orig)
names_new colnames(counts_df_curated) <- names_new
<- counts_df_curated %>%
mean_counts_df_curated cbind(as.list(.) %>%
Filter(is.numeric, .) %>%
split(names(.)) %>%
lapply(as.data.frame) %>%
lapply(rowMeans) %>%
setNames(paste0("mean.", names(.)))) %>%
select(starts_with("mean"))
<- mean_counts_df_curated %>%
counts_curated rownames_to_column(var = "SequenceID") %>%
left_join(taxfxn)
save(counts_curated, sample_merged, file = "Avg_scaled_tpm_curated_08252023.RData")
<- sample_merged %>%
tmp_sample_merged filter(Sample_rep %in% as.character(all_samples$sample))
rownames(tmp_sample_merged) <- tmp_sample_merged$Sample_rep
rownames(tmp_sample_merged) <- colnames(txi_curated$counts)
<- DESeqDataSetFromTximport(txi_curated,
ds_tpm_curated colData = tmp_sample_merged,
design = ~0 + SAMPLENAME)
<- vst(ds_tpm_curated)
vsd_all
<- as.data.frame(assay(vsd_all))
df_ctr_norm_curated
<- df_ctr_norm_curated %>%
ctr_norm_curated rownames_to_column(var = "SequenceID") %>%
left_join(taxfxn)
save(ctr_norm_curated, ds_tpm_curated, file = "normed_center_df_curated_08252023.RData")
July versus March for NPSG
library(tidyverse)
library(DESeq2)
load("/scratch/group/hu-lab/pacocean-metaT/Robjs/npsg-deseq.RData", verbose = TRUE)
# Compare July vs. March in the NPSG
# ds_tpm_npsg_month <- DESeqDataSetFromTximport(txi_npsg,
# colData = tmp_sample_merged,
# design = ~0 + MONTH)
How many genes are differentially expressed between July and March?
## Positive log fold change == up regulated in July, compared to March
$MONTH <- factor(ds_tpm_npsg_month$MONTH, levels = c("March", "July"))
ds_tpm_npsg_month
<- DESeq(ds_tpm_npsg_month)
de_month
<- results(de_month, alpha=0.05)
results_month # plotMA(results_month)
<- summary(results_month)
out write.csv(out, file = "summary-months.csv")
# Plot log fold change
svg("rplot-month.svg", width = 9, height = 8)
data.frame(results_month) %>%
mutate(REGULATION = case_when(
> 0 ~ "upregulated in July",
log2FoldChange < 0 ~ "upregulated in March"
log2FoldChange
),SIGNIFICANT = case_when(
<= 0.05 ~ "Significantly",
pvalue TRUE ~ "Not significantly"
%>%
)) ggplot(aes(x = baseMean, y = log2FoldChange, color = SIGNIFICANT)) +
geom_point(stat = "identity") +
scale_x_log10() +
theme_classic() +
scale_color_manual(values = c("#878787", "#d73027")) +
labs(title = mcols(results_month)$description[2])
dev.off()
# Get table
<- data.frame(results_month) %>%
month_transcripts mutate(REGULATION = case_when(
> 0 ~ "upregulated in July",
log2FoldChange < 0 ~ "upregulated in March"
log2FoldChange
),SIGNIFICANT = case_when(
<= 0.05 ~ "Significantly",
pvalue TRUE ~ "Not significantly"
%>%
)) filter(SIGNIFICANT == "Significantly") %>%
rownames_to_column(var = "SequenceID")
write.csv(month_transcripts, file = "/scratch/group/hu-lab/pacocean-metaT/month-transcripts.csv")