######### Genomic view of archaeal and bacterial diversity in the skeletons of the corals Porites lutea and Isopora palifera ##################

1. ########## Read QC ################
     a) FASTQC ,
     b) MultiQC: default parameters
     
     Command line used:
        # Run FastQC on the gzipped reads of each sample folder (F*), then summarise
        # that folder's FastQC archives with MultiQC. The subshell confines the
        # directory change, so a folder that cannot be entered is skipped instead
        # of shifting every later iteration one level up via "cd ..".
        for i in F*; do
          [ -d "$i" ] || continue
          (
            cd "$i" || exit 1
            fastqc *.gz
            multiqc *.zip
          )
        done
     
2. ########## Trimming of Reads#######
    Using Trimmomatic v0.38
    
    Command line used:
       for i in F*; do  # iterate over sample folders
         [ -d "$i" ] || continue
         (
           # Subshell: a failed cd only skips this folder; the caller's working
           # directory is never changed.
           cd "$i" || exit 1
           for j in *_1.fq.gz; do  # iterate over forward-read files
             n=${j%_1.fq.gz}  # sample prefix shared by both mates
             # PE outputs, in order: forward paired, forward unpaired,
             # reverse paired, reverse unpaired.
             java -jar /usr/local/easybuild/software/Trimmomatic/0.38-Java-1.8.0_71/trimmomatic-0.38.jar PE \
               -trimlog "log_${n}.txt" \
               "${n}_1.fq.gz" "${n}_2.fq.gz" \
               "${n}_1.paired.fq.gz" "${n}_1_unpaired.fq.gz" \
               "${n}_2.paired.fq.gz" "${n}_2.unpaired.fq.gz" \
               HEADCROP:5 SLIDINGWINDOW:4:20 MINLEN:30
           done
         )
       done
       
3. ######## Read mapping to the Porites lutea genome ########### NOTE: No host read mapping was performed for the Isopora palifera samples because no host reference genome was available ##########
    Using bowtie2 v2.4.2
    
    Command line used:
       # Build the Bowtie2 index of the host genome (needed once).
       bowtie2-build plutea.fna plutea
       # Per sample folder: map the read pairs to the host index, keep pairs in
       # which both mates are unmapped (-f 12) while dropping secondary
       # alignments (-F 256), name-sort them and export them as FASTQ.
       for i in F*; do
         [ -d "$i" ] || continue
         (
           # Subshell: a failed cd only skips this folder and never moves the
           # caller's working directory.
           cd "$i" || exit 1
           bowtie2 -x /data/projects/punim1519/DP/new_reads/plutea_mapping/plutea -1 *_read1.* -2 *_read2.* -S "${i}_R1_R2_mapped_unmapped.sam"
           # Convert exactly the SAM written above; a bare *.sam glob breaks as
           # soon as the folder holds a second SAM file.
           samtools view -bS "${i}_R1_R2_mapped_unmapped.sam" >"${i}_R1_R2_mapped_unmapped.bam"
           samtools view -b -f 12 -F 256 "${i}_R1_R2_mapped_unmapped.bam" >"${i}_R1R2Unmapped.bam"
           samtools sort -n -m 10G -@ 5 "${i}_R1R2Unmapped.bam" -o "${i}_R1R2Unmapped_sorted.bam"
           samtools fastq -@ 10 "${i}_R1R2Unmapped_sorted.bam" -1 "${i}_plutea_removed_R1.fastq.gz" -2 "${i}_plutea_removed_R2.fastq.gz"
         )
       done
      
4. ####### Metagenome assembly ##############
    Using MegaHIT v1.2.9
    
    Command line used:
      # Assemble one sample's read pair (Read1, Read2 and Samplename are
      # placeholders) with a fixed k-mer list, keeping contigs of >= 1000 bp.
      megahit \
        -1 Read1 \
        -2 Read2 \
        --k-list 33,55,77,99 \
        --min-contig-len 1000 \
        -o Samplename_megahit_33_55_77_99
      
5. ###### MetaWRAP Binning (Initial Binning)##############
    Using MetaWRAP v1.3.2
    
    Command line used:
      # Initial binning with MetaBAT2, MaxBin2 and CONCOCT ({...} are placeholders).
      metawrap binning \
        -o {output folder} \
        -t 30 \
        -a {input folder/final_contigs.fa} \
        --metabat2 --maxbin2 --concoct \
        {reads folder}
      # Only for IP31a_i sample: MetaBAT1 is used in place of MetaBAT2.
      metawrap binning \
        -o {output folder} \
        -t 30 \
        -a {input folder/final_contigs.fa} \
        --metabat1 --maxbin2 --concoct \
        {reads folder}

6. ###### MetaWRAP BinRefinement ##################
    Using MetaWRAP v1.3.2
      # Consolidate the three initial bin sets; -c 50 / -x 10 are the minimum
      # completeness and maximum contamination passed to bin_refinement.
      metawrap bin_refinement \
        -o {output folder} \
        -t 24 \
        -A {Sample/initial_binning/metabat2_bins/} \
        -B {Sample/initial_binning/maxbin2_bins/} \
        -C {Sample/initial_binning/concoct_bins/} \
        -c 50 \
        -x 10

7. ###### Dereplicating the bins  ################
    Using dRep v3.0.0
    
    Command line used:
      # Dereplicate all refined bins with the same 50 % completeness /
      # 10 % contamination cut-offs ({...} are placeholders).
      dRep dereplicate {output folder} \
        -p 28 \
        --completeness 50 \
        --contamination 10 \
        -g {BinRefinement folder with all the filtered bins/*.fa}
      
8. ###### Checking completeness and contamination of dereplicated bins ###########
    Using CheckM
    
    Command line used:
      # CheckM lineage workflow on the dereplicated genomes (*.fa), with 28
      # threads for both CheckM and pplacer and scratch space in /data/tmp.
      checkm lineage_wf \
        -x fa \
        -t 28 \
        --pplacer_threads 28 \
        --tmpdir /data/tmp \
        dRep_all_samples_combined_withcheckm_input/dereplicated_genomes/ \
        checkm_results_dereplicated_bins
    
      Only bins with >80% completeness and <10% contamination were retained and used for downstream analysis
      
      
9. ####### Taxonomic classification of Bins##########
    Although CheckM provides a taxonomic classification of bins, we used GTDB-Tk for this work so that all taxonomic assignments are consistent with each other
    
      Using GTDB-tk
      
      Command line used:
      ####### Run for Bacteria ##########
      gtdbtk de_novo_wf \
        --genome_dir 80percent_MAGs_dereplicated/80percent_MAGs \
        -x fa \
        --bacteria \
        --outgroup_taxon p__Patescibacteria \
        --cpus 14 \
        --out_dir gtdbtk_analysis_denovowf_dereplicated_80_percent_bins_bacteria
      ########## Run for Archaea ###########
      # The domain switch is spelled in lower case (--archaea), like --bacteria
      # above; the capitalised form is not a recognised option.
      gtdbtk de_novo_wf \
        --genome_dir 80percent_MAGs_dereplicated_Plutea/80percent_MAGs \
        -x fa \
        --archaea \
        --outgroup_taxon p__Altarchaeota \
        --cpus 14 \
        --out_dir gtdbtk_analysis_denovowf_dereplicated_80_percent_bins_archaea
      
9b. ###### Identifying misbinned contigs in the filtered bins #####################
    Using CAT and BAT v5.2.3
    
    Command line used:
    # For every bin FASTA: classify its contigs with CAT, then add official
    # taxonomic names to the resulting contig2classification table.
    for j in *fa; do
      CAT contigs -c "$j" \
        -d /data/softwares/CAT_prepare_20210107/2021-01-07_CAT_database/ \
        -t /data/softwares/CAT_prepare_20210107/2021-01-07_taxonomy/ \
        -n 24 -o "CAT_$j"
      CAT add_names -i "CAT_$j.contig2classification.txt" \
        -o "CAT_$j.contig2classification.official_names.txt" \
        -t /data/softwares/CAT_prepare_20210107/2021-01-07_taxonomy/ \
        --only_official
    done
    
10. ##### extracting misbinned contig id on a per genome basis #################
    
    Command line used: (to be run on CAT_$j.contig2classification.official_names.txt)
    
    # The glob is limited to the CAT official-names tables so that the
    # misbinned_contigs_* / contig_ids_to_remove_* files written here are not
    # picked up as input on a later run.

    # For Archaea BINS: flag contigs whose CAT line mentions Bacteria,
    # Eukaryota or Viruses, then keep column 1 (the contig ID).
    # NOTE(review): the case-insensitive substring match on "Bacteria" also hits
    # any taxon name that merely contains "bacteria" — confirm this is intended.
    for i in *.contig2classification.official_names.txt; do
      echo "$i"
      grep -i "Bacteria\|Eukaryota\|Viruses" "$i" >"misbinned_contigs_$i"
      awk -F "\t" '{print $1}' "misbinned_contigs_$i" >"contig_ids_to_remove_$i"
    done

    # For Bacteria BINS: same procedure, flagging Archaea, Eukaryota or Viruses.
    for i in *.contig2classification.official_names.txt; do
      echo "$i"
      grep -i "Archaea\|Eukaryota\|Viruses" "$i" >"misbinned_contigs_$i"
      awk -F "\t" '{print $1}' "misbinned_contigs_$i" >"contig_ids_to_remove_$i"
    done
 
11. #### Removing misbinned contigs from the bin###############################
    Using: Python Script (Usage: python Misbinned_contig_remover.py Bin.fasta contigs_ids_to_remove.txt >Updated_Bin.fasta)
      
      #!/usr/bin/env python3
      """Print a FASTA file to stdout without the contigs named in an ID list.

      Usage: python Misbinned_contig_remover.py Bin.fasta contigs_ids_to_remove.txt >Updated_Bin.fasta

      sys.argv[1] is the FASTA file to filter; sys.argv[2] is a text file with
      one contig ID per line. The number of listed IDs that never occurred in
      the FASTA file is reported on stderr.
      """

      from Bio import SeqIO
      import sys

      # Read the ID list inside a context manager so the file gets closed, and
      # skip blank lines so they are not counted as unidentified headers.
      with open(sys.argv[2]) as id_file:
          header_set = {line.strip() for line in id_file if line.strip()}

      found = set()
      for seq_record in SeqIO.parse(sys.argv[1], "fasta"):
          if seq_record.name in header_set:
              # Listed contig: drop every occurrence and remember it was seen.
              found.add(seq_record.name)
              continue
          # SeqIO.write emits each record once, without the extra blank line
          # that print() appends to already newline-terminated FASTA text.
          SeqIO.write(seq_record, sys.stdout, "fasta")

      missing = header_set - found
      if missing:
          print(len(missing),'of the headers from list were not identified in the input fasta file.', file=sys.stderr)

12. ###### CheckM on the cleaned bins again, to see whether any bin falls below the high-quality threshold (completeness >80% and contamination <10%) or the medium-low quality threshold (completeness >50% to <80% and contamination <10%) ##########
    Usage: as above
    
    Only high-quality Bins were used for downstream analysis

13. ##### Taxonomic classification again and phylogenetic tree construction##############
    Using: GTDB-tk and IQ-Tree
      GTDBTK as above 
      
      IQ-Tree 
        # BACTERIA: tree from the GTDB-Tk bac120 user alignment, LG+G model,
        # automatic thread count, 1000 bootstrap replicates (-B).
        iqtree -s gtdbtk.bac120.user_msa.fasta -m LG+G -nt AUTO -B 1000
        # ARCHAEA: same settings on the ar122 user alignment.
        iqtree -s gtdbtk.ar122.user_msa.fasta -m LG+G -nt AUTO -B 1000


14. ##### Bins Gene prediction  and annotation ####################### 
    Using: Prodigal v2.6.3 implemented in Prokka v1.14.5
    
      Command line used:
        # Bacterial bins.
        # NOTE(review): --centre is given no value here, so the following
        # "--cpus" token may be taken as its argument — confirm against the
        # Prokka usage text and supply a centre ID if one is required.
        for i in F*; do
          prokka "$i" --outdir "$i.prokka" --addgenes --addmrna --rfam --Kingdom Bacteria --compliant --centre --cpus 20 --prefix "$i"
        done
        # Archaeal bins. Both loops iterate over F* and write to $i.prokka, so
        # they are presumably run in separate folders — TODO confirm.
        for i in F*; do
          prokka "$i" --outdir "$i.prokka" --addgenes --addmrna --rfam --Kingdom Archaea --compliant --centre --cpus 20 --prefix "$i"
        done
        
     
    Using: InterProScan v5.53-87.0 (Pfam 34.0 analysis only)
      Command line used:
        # Run the Pfam analysis of InterProScan on every protein FASTA and write
        # TSV output to output_location (placeholder).
        for j in *.faa; do
          echo "$j"
          bash /data/softwares/interproscan-5.53-87.0/interproscan.sh -i "$j" -appl Pfam -d output_location -f TSV
        done
        
      ####Script to filter InterProScan results based on evalue (<=1e-5) and identify the unique hits from the raw results#### 

        for i in *.tsv; do
          # Skip tables written by an earlier run of this loop, so filtered
          # files are not filtered (and prefixed) a second time.
          case "$i" in evalue_filtered_*) continue ;; esac
          echo "$i"
          # Keep rows whose 9th tab-separated column is <= 1e-5 ...
          awk -F "\t" '{if ($9 <=1e-5) print$0}' "$i" >"evalue_filtered_$i"
          # ... then keep a single row per distinct value of the first column.
          sort -u -k1,1 "evalue_filtered_$i" >"evalue_filtered_uniq_$i"
        done
        
15. ########## Annotation of KEGG pathways ##############################
       Using: METABOLIC (METABOLIC-G workflow), csvtk (CSV/TSV toolkit) and EnrichM
      
       Command line used:
      # Script path written as one word: a space after the directory makes perl
      # try to run the directory itself and pass METABOLIC-G.pl as an argument.
      perl /data/softwares/METABOLIC/METABOLIC-G.pl -t 24 -m-cutoff 0.50 -in {Bin .faa files}/ -kofam-db full -o METABOLIC_analysis_output
      
      Collating results from METABOLIC-G 
        Command line used:
          
          # Add a header to every METABOLIC-G result table; -n takes a
          # comma-separated list, here an empty first name followed by the
          # file's own name.
          for f in *.result.txt; do
            csvtk add-header -t -n ",$f" "$f" -o "$f.tsv"
          done

          # Join only the tables written by the loop above.
          csvtk join -t *.result.txt.tsv >METABOLIC-G_BACTERIA_ko_results_combined.txt

          # Replace blank cells with 0. The input is the joined table produced
          # by the previous command (no leading "i" in its name).
          awk 'BEGIN { FS = OFS = "\t" } { for(i=1; i<=NF; i++) if($i ~ /^ *$/) $i = 0 }; 1' METABOLIC-G_BACTERIA_ko_results_combined.txt >METABOLIC-G_BACTERIA_ko_results_combined_corrected.txt
          
      
      Running EnrichM classify 
        Command line used:
          # Classify the corrected KO matrix with EnrichM, using a cutoff of 0.
          enrichm classify \
            --genome_and_annotation_matrix METABOLIC-G_BACTERIA_ko_results_combined_corrected.txt \
            --cutoff 0 \
            --output enrichm_classify_results
          
          
16. ########## Bin relative abundance ############################
      Using BBMap 
        
        Command line used:
          # Map the sample's reads (placeholder file names) against each bin and
          # write per-contig coverage statistics. BBMap arguments are key=value
          # words, so there must be no space after "in1=" / "in2=".
          for i in *.fa; do
            echo "$i"
            bbmap.sh in1=Sample_R1.fastq in2=Sample_R2.fastq covstats="constats_$i.txt" ref="$i" nodisk
          done
          
        Summing column 2 of each covstats file (presumably the per-contig average coverage) to obtain one coverage total per bin
          # Only the covstats files written by BBMap (constats_*.txt) are read;
          # a bare *.txt glob would also feed the summary file appended to below
          # back into the loop on a second run.
          for i in constats_*.txt; do
            echo "$i"
            # Sum column N (=2) over all lines and append "<file>\t<sum>".
            awk -v N=2 '{ sum += $N } END { if (NR > 0) print ARGV[1]"\t"sum}' "$i" >>summed_coverage_of_contigs_per_genome.txt
          done
      
        
17. ELP, Nitrogen Fixation, DMSP lyase and DMSP synthase, Ammonia Oxidation (first and rate limiting step in nitrification), superoxide dismutase, catalase identification
    Using output of InterProScan
        Command line used 
         a) WD40 repeats 
         
            # For each WD40 Pfam accession, make one pass over all InterProScan
            # tables and append "file:count" of matching lines to that
            # accession's own list.
            for pfam in PF00400 PF07676; do
              for i in *.tsv; do
                echo "$i"
                grep --with-filename -c "$pfam" "$i" >>"WD40_${pfam}.txt"
              done
            done
            
         b) Ankyrin repeats
           # Per InterProScan table, append "file:count" of matching lines for
           # each ankyrin-repeat Pfam accession to that accession's own list.
           for i in *.tsv; do
             echo "$i"
             for pfam in PF00023 PF12796 PF13637 PF13857; do
               grep --with-filename -c "$pfam" "$i" >>"Ank_${pfam}.txt"
             done
           done
            
          c) HEAT repeat
            # Per InterProScan table, append "file:count" of lines that mention
            # the HEAT-repeat accession PF13646.
            for i in *.tsv; do
              echo "$i"
              grep --with-filename -c "PF13646" "$i" >>HEAT_PF13646.txt
            done
            
          d) TPR (tetratricopeptide repeats; the output files use the prefix TTP_)
            # Per InterProScan table, append "file:count" of matching lines for
            # each listed Pfam accession to TTP_<accession>.txt.
            for i in *.tsv; do
              echo "$i"
              for pfam in \
                PF00515 PF07719 PF09976 PF13174 PF13181 \
                PF13371 PF13374 PF13424 PF13428 PF13429 \
                PF13431 PF13432 PF14559 PF14561 PF16918; do
                grep --with-filename -c "$pfam" "$i" >>"TTP_${pfam}.txt"
              done
            done
          
          e) Nitrogen Fixation Nfix
            # Per InterProScan table, append "file:count" of lines that mention
            # the accession PF00142.
            for i in *.tsv; do
              echo "$i"
              grep --with-filename -c "PF00142" "$i" >>Nfix_PF00142.txt
            done
           
          f) dsyB dmsp-synthase  (S-adenosylmethionine-dependent methyltransferase (AdoMet-Mtase) class I superfamily domain) and dimerization2 superfamily domain
            # For each of the two dsyB-related Pfam accessions, make one pass
            # over all InterProScan tables and append "file:count" of matching
            # lines to that accession's own list.
            for pfam in PF00891 PF16864; do
              for i in *.tsv; do
                echo "$i"
                grep --with-filename -c "$pfam" "$i" >>"dsyB_${pfam}.txt"
              done
            done
 
          g) DMSP lyase
            # Per InterProScan table, append "file:count" of lines that mention
            # the accession PF16867.
            for i in *.tsv; do
              echo "$i"
              grep --with-filename -c "PF16867" "$i" >>dmsp_lyase_PF16867.txt
            done

          h) Ammonia oxidation
            # Per InterProScan table, append "file:count" of lines that mention
            # the accession PF12942.
            for i in *.tsv; do
              echo "$i"
              grep --with-filename -c "PF12942" "$i" >>Ammonia_oxidation_AmoA_PF12942.txt
            done
        
          i) Superoxide dismutase and catalase
            # Per InterProScan table, append "file:count" of matching lines for
            # the three superoxide-dismutase accessions and the catalase
            # accession. All four greps sit inside the loop and a single "done"
            # closes it; with the loop closed before the catalase grep, that
            # grep ran outside the loop and the second "done" was a syntax error.
            for i in *.tsv; do
              echo "$i"
              grep --with-filename -c "PF00080" "$i" >>Sod_Copper_zinc.txt
              grep --with-filename -c "PF00081" "$i" >>Sod_Iron_manganese.txt
              grep --with-filename -c "PF02777" "$i" >>Sod_Iron_manganese_Cterminal.txt
              # Output name kept as "Catalase" (no .txt extension).
              grep --with-filename -c "PF00199" "$i" >>Catalase
            done