######### Genomic view of archaeal and bacterial diversity in skeleton of coral Porites lutea and Isopora palifera ##################

# 1. ########## Read QC ################
# a) FASTQC, b) MultiQC: default parameters
# Command line used:
# Entries matched by F* that cannot be entered are skipped, so the trailing
# "cd .." can never climb out of the working directory.
for i in F*; do
  cd "$i" || continue
  fastqc *.gz
  multiqc *.zip
  cd ..
done

# 2. ########## Trimming of Reads #######
# Using Trimmomatic v0.38
# Command line used:
for i in F*; do # iterating on sample folders
  cd "$i" || continue
  for j in *_1.fq.gz; do # iterating on sample files
    n=${j%%_1.fq.gz} # strip the file name
    java -jar /usr/local/easybuild/software/Trimmomatic/0.38-Java-1.8.0_71/trimmomatic-0.38.jar PE \
      -trimlog "log_${n}.txt" \
      "${n}_1.fq.gz" "${n}_2.fq.gz" \
      "${n}_1.paired.fq.gz" "${n}_1_unpaired.fq.gz" \
      "${n}_2.paired.fq.gz" "${n}_2.unpaired.fq.gz" \
      HEADCROP:5 SLIDINGWINDOW:4:20 MINLEN:30
  done
  cd ..
done

# 3. ######## Read Mapping to Porites lutea genome ###########
# NOTE: No host read mapping was performed for Isopora palifera samples due to lack of Host genome
# Using bowtie2 v2.4.2
# Command line used:
bowtie2-build plutea.fna plutea
for i in F*; do
  cd "$i" || continue
  bowtie2 -x /data/projects/punim1519/DP/new_reads/plutea_mapping/plutea \
    -1 *_read1.* -2 *_read2.* -S "${i}_R1_R2_mapped_unmapped.sam"
  # Convert exactly the SAM written above rather than whatever *.sam matches.
  samtools view -bS "${i}_R1_R2_mapped_unmapped.sam" >"${i}_R1_R2_mapped_unmapped.bam"
  # -f 12 keeps pairs where both the read and its mate are unmapped;
  # -F 256 drops secondary alignments.
  samtools view -b -f 12 -F 256 "${i}_R1_R2_mapped_unmapped.bam" >"${i}_R1R2Unmapped.bam"
  # Name-sort (-n) so that samtools fastq sees mates next to each other.
  samtools sort -n -m 10G -@ 5 "${i}_R1R2Unmapped.bam" -o "${i}_R1R2Unmapped_sorted.bam"
  samtools fastq -@ 10 "${i}_R1R2Unmapped_sorted.bam" \
    -1 "${i}_plutea_removed_R1.fastq.gz" -2 "${i}_plutea_removed_R2.fastq.gz"
  cd ..
done

# 4. ####### Metagenome assembly ##############
# Using MegaHIT v1.2.9
# Command line used:
megahit -1 Read1 -2 Read2 --k-list 33,55,77,99 -o Samplename_megahit_33_55_77_99 --min-contig-len 1000

# 5.
###### MetaWRAP Binning (Initial Binning) ##############
# Using MetaWRAP v1.3.2
# Command line used (text in {braces} is a placeholder to substitute):
metawrap binning -o {output folder} -t 30 -a {input folder/final_contigs.fa} --metabat2 --maxbin2 --concoct {reads folder}
metawrap binning -o {output folder} -t 30 -a {input folder/final_contigs.fa} --metabat1 --maxbin2 --concoct {reads folder} ## Only for IP31a_i sample

# 6. ###### MetaWRAP BinRefinement ##################
# Using MetaWRAP v1.3.2
metawrap bin_refinement -o {output folder} -t 24 -A {Sample/initial_binning/metabat2_bins/} -B {Sample/initial_binning/maxbin2_bins/} -C {Sample/initial_binning/concoct_bins/} -c 50 -x 10

# 7. ###### Dereplicating the bins ################
# Using dRep v3.0.0
# Command line used:
dRep dereplicate {output folder} -p 28 --completeness 50 --contamination 10 -g {BinRefinement folder with all the filtered bins/*.fa}

# 8. ###### Checking completeness and contamination of dereplicated bins ###########
# Using CheckM
# Command line used:
checkm lineage_wf -x fa -t 28 --pplacer_threads 28 dRep_all_samples_combined_withcheckm_input/dereplicated_genomes/ checkm_results_dereplicated_bins --tmpdir /data/tmp
# Bins with >80% completeness and <10% contamination were filtered and used for downstream analysis

# 9. ####### Taxonomic classification of Bins ##########
# Although CheckM provides a taxonomic classification of bins, we used GTDB-tk for this work to obtain congruency
# Using GTDB-tk
# Command line used:
####### Run for Bacteria ##########
gtdbtk de_novo_wf --genome_dir 80percent_MAGs_dereplicated/80percent_MAGs -x fa --bacteria --outgroup_taxon p__Patescibacteria --cpus 14 --out_dir gtdbtk_analysis_denovowf_dereplicated_80_percent_bins_bacteria
########## Run for Archaea ###########
# The domain switch is lowercase (--archaea), matching --bacteria above.
gtdbtk de_novo_wf --genome_dir 80percent_MAGs_dereplicated_Plutea/80percent_MAGs -x fa --archaea --outgroup_taxon p__Altarchaeota --cpus 14 --out_dir gtdbtk_analysis_denovowf_dereplicated_80_percent_bins_archaea

# 9.
###### Identifying Misbinned contigs in the filtered Bins #####################
# Using CAT and BAT v5.2.3
# Command line used:
for j in *fa; do
  CAT contigs -c "$j" \
    -d /data/softwares/CAT_prepare_20210107/2021-01-07_CAT_database/ \
    -t /data/softwares/CAT_prepare_20210107/2021-01-07_taxonomy/ \
    -n 24 -o "CAT_$j"
  CAT add_names -i "CAT_$j.contig2classification.txt" \
    -o "CAT_$j.contig2classification.official_names.txt" \
    -t /data/softwares/CAT_prepare_20210107/2021-01-07_taxonomy/ --only_official
done

# 10. ##### extracting misbinned contig id on a per genome basis #################
# Command line used: (to be run on CAT_$j.contig2classification.official_names.txt)
# The glob is limited to those files so that the loops never re-read their own
# misbinned_contigs_* / contig_ids_to_remove_* outputs.
# grep -w matches whole words only, so archaeal class names such as
# Halobacteria are not mistaken for the domain Bacteria.

# (for Archaea BINS)
for i in *.contig2classification.official_names.txt; do
  echo "$i"
  grep -iw "Bacteria\|Eukaryota\|Viruses" "$i" >"misbinned_contigs_$i"
  # Column 1 of the CAT table is the contig id.
  awk -F "\t" '{print $1}' "misbinned_contigs_$i" >"contig_ids_to_remove_$i"
done

# (for Bacteria BINS)
for i in *.contig2classification.official_names.txt; do
  echo "$i"
  grep -iw "Archaea\|Eukaryota\|Viruses" "$i" >"misbinned_contigs_$i"
  awk -F "\t" '{print $1}' "misbinned_contigs_$i" >"contig_ids_to_remove_$i"
done

# 11. #### Removing misbinned contigs from the bin ###############################
# Using: Python Script
# (Usage: python Misbinned_contig_remover.py Bin.fasta contigs_ids_to_remove.txt >Updated_Bin.fasta)
# The quoted heredoc writes the script to disk verbatim so the usage line above
# works as documented. The script prints every FASTA record whose name is NOT in
# the id list and reports on stderr how many listed ids were never seen.
cat >Misbinned_contig_remover.py <<'PYEOF'
#!/usr/bin/env python3
from Bio import SeqIO
import sys

ffile = SeqIO.parse(sys.argv[1], "fasta")
header_set = set(line.strip() for line in open(sys.argv[2]))

for seq_record in ffile:
    try:
        header_set.remove(seq_record.name)
    except KeyError:
        print(seq_record.format("fasta"))
        continue
if len(header_set) != 0:
    print(len(header_set),'of the headers from list were not identified in the input fasta file.', file=sys.stderr)
PYEOF

# 12. ###### CheckM on Bins again to see if any Bin goes below the threshold of high-quality (completeness >80% and contamination <10%) and medium-low quality (completeness >50% and <80%, contamination <10%) ##########
# Usage: as above
# Only high-quality Bins were used for downstream analysis

# 13.
##### Taxonomic classification again and phylogenetic tree construction ##############
# Using: GTDB-tk and IQ-Tree
# GTDBTK as above
# IQ-Tree
iqtree -s gtdbtk.bac120.user_msa.fasta -m LG+G -nt AUTO -B 1000 # (BACTERIA)
iqtree -s gtdbtk.ar122.user_msa.fasta -m LG+G -nt AUTO -B 1000  # (ARCHAEA)

# 14. ##### Bins Gene prediction and annotation #######################
# Using: Prodigal v2.6.3 implemented in Prokka v1.14.5
# Command line used:
# NOTE(review): --centre is recorded without a value in both commands below;
# confirm which sequencing-centre ID was actually passed.
# Bacterial bins:
for i in F*; do
  prokka "$i" --outdir "$i.prokka" --addgenes --addmrna --rfam --Kingdom Bacteria --compliant --centre --cpus 20 --prefix "$i"
done
# Archaeal bins:
for i in F*; do
  prokka "$i" --outdir "$i.prokka" --addgenes --addmrna --rfam --Kingdom Archaea --compliant --centre --cpus 20 --prefix "$i"
done

# Using: InterproScan (via pfam 34.0)
# Command line used:
for j in *.faa; do
  echo "$j"
  bash /data/softwares/interproscan-5.53-87.0/interproscan.sh -i "$j" -appl Pfam -d output_location -f TSV
done

#### Script to filter InterProScan results based on evalue (<=1e-5) and identify the unique hits from the raw results ####
for i in *.tsv; do
  # Skip outputs of a previous run of this loop; they also match *.tsv.
  case "$i" in
    evalue_filtered_*) continue ;;
  esac
  echo "$i"
  # Column 9 is compared against the e-value cutoff.
  awk -F "\t" '$9 <= 1e-5' "$i" >"evalue_filtered_$i"
  # Keep a single line per distinct value of column 1.
  sort -u -k1,1 "evalue_filtered_$i" >"evalue_filtered_uniq_$i"
done

# 15.
########## Annotation of KEGG pathways ##############################
# Using: METABOLIC (using METABOLIC-G), CSV/TSV tool kit and EnrichM
# Command line used (text in {braces} is a placeholder to substitute):
perl /data/softwares/METABOLIC/METABOLIC-G.pl -t 24 -m-cutoff 0.50 -in {Bin .faa files}/ -kofam-db full -o METABOLIC_analysis_output

# Collating results from METABOLIC-G
# Command line used:
# Give every per-genome result a header row: an empty name for column 1 and
# the file name for column 2.
for f in *.result.txt; do
  csvtk add-header -t -n ",$f" "$f" -o "$f.tsv"
done
csvtk join -t *.tsv >METABOLIC-G_BACTERIA_ko_results_combined.txt
# Replace blank cells with 0. The input is the file written by csvtk join above.
awk 'BEGIN { FS = OFS = "\t" } { for(i=1; i<=NF; i++) if($i ~ /^ *$/) $i = 0 }; 1' \
  METABOLIC-G_BACTERIA_ko_results_combined.txt \
  >METABOLIC-G_BACTERIA_ko_results_combined_corrected.txt

# Running EnrichM classify
# Command line used:
enrichm classify --output enrichm_classify_results --genome_and_annotation_matrix METABOLIC-G_BACTERIA_ko_results_combined_corrected.txt --cutoff 0

# 16. ########## Bin relative abundance ############################
# Using BBMap
# Command line used:
# BBMap parameters are key=value with no space after "=".
for i in *.fa; do
  echo "$i"
  bbmap.sh in1=Sample_R1.fastq in2=Sample_R2.fastq covstats="constats_$i.txt" ref="$i" nodisk
done

# Calculating average coverage of contig for each bin
# The awk below sums column 2 of each covstats table and appends
# "<file>\t<sum>" to the summary file. Only the constats_* tables are read,
# so the summary file is never fed back into itself.
for i in constats_*.txt; do
  echo "$i"
  awk -v N=2 '{ sum += $N } END { if (NR > 0) print ARGV[1]"\t"sum}' "$i" \
    >>summed_coverage_of_contigs_per_genome.txt
done

# 17.
# ELP, Nitrogen Fixation, DMSP lyase and DMSP synthase, Ammonia Oxidation (first and rate limiting step in nitrification), superoxide dismutase, catalase identification
# Using output of InterProScan
# Command line used
# Every grep below uses -c with --with-filename, so each output file receives
# one "<tsv file>:<number of matching lines>" record per input TSV.

# a) WD40 repeats
for i in *.tsv; do
  echo "$i"
  grep --with-filename "PF00400" -c "$i" >>WD40_PF00400.txt
done
for i in *.tsv; do
  echo "$i"
  grep --with-filename "PF07676" -c "$i" >>WD40_PF07676.txt
done

# b) Ankyrin repeats
for i in *.tsv; do
  echo "$i"
  for pf in PF00023 PF12796 PF13637 PF13857; do
    grep --with-filename "$pf" -c "$i" >>"Ank_${pf}.txt"
  done
done

# c) HEAT repeat
for i in *.tsv; do
  echo "$i"
  grep --with-filename "PF13646" -c "$i" >>HEAT_PF13646.txt
done

# d) TTP (tetratricopeptide repeat, TPR)
for i in *.tsv; do
  echo "$i"
  for pf in PF00515 PF07719 PF09976 PF13174 PF13181 PF13371 PF13374 PF13424 \
    PF13428 PF13429 PF13431 PF13432 PF14559 PF14561 PF16918; do
    grep --with-filename "$pf" -c "$i" >>"TTP_${pf}.txt"
  done
done

# e) Nitrogen Fixation Nfix
for i in *.tsv; do
  echo "$i"
  grep --with-filename "PF00142" -c "$i" >>Nfix_PF00142.txt
done

# f) dsyB dmsp-synthase (S-adenosylmethionine-dependent methyltransferase (AdoMet-Mtase) class I superfamily domain) and dimerization2 superfamily domain
for i in *.tsv; do
  echo "$i"
  grep --with-filename "PF00891" -c "$i" >>dsyB_PF00891.txt
done
for i in *.tsv; do
  echo "$i"
  grep --with-filename "PF16864" -c "$i" >>dsyB_PF16864.txt
done

# g) DMSP lyase
for i in *.tsv; do
  echo "$i"
  grep --with-filename "PF16867" -c "$i" >>dmsp_lyase_PF16867.txt
done

# h) Ammonia oxidation
for i in *.tsv; do
  echo "$i"
  grep --with-filename "PF12942" -c "$i" >>Ammonia_oxidation_AmoA_PF12942.txt
done

# i) Superoxide dismutase
for i in *.tsv; do
  echo "$i"
  grep --with-filename "PF00080" -c "$i" >>Sod_Copper_zinc.txt
  grep --with-filename "PF00081" -c "$i" >>Sod_Iron_manganese.txt
  grep --with-filename "PF02777" -c "$i" >>Sod_Iron_manganese_Cterminal.txt
done

# j) Catalase
# Runs in its own loop over every TSV, like the searches above.
for i in *.tsv; do
  echo "$i"
  grep --with-filename "PF00199" -c "$i" >>Catalase
done