
Commit 36b31899 authored by Florian Centler

initial commit

parent 8e51bef1
# Assuming unique taxonomic and functional info per read ID! Check this in the input data!
taxFile <- "SampleID-taxa.txt"
funcFile <- "SampleID-fct.txt"
# Get read ids as row headers, taxonomic info as column V2
taxData <- read.table(file=taxFile, sep="\t", row.names=1, quote="\"")
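# NOTE: the parsing below assumes (not checked here) that V2 holds a semicolon-separated string of the form
# "d__Kingdom;<percent>;p__Phylum;<percent>;...;s__Species;<percent>", i.e. prefixed rank names alternating
# with their confidence percentages.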
# Parse taxonomic info and put in columns
taxData$Kingdom <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*d__", replacement="", taxData$V2))
taxData$Phylum <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*p__", replacement="", taxData$V2))
taxData$Class <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*c__", replacement="", taxData$V2))
taxData$Order <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*o__", replacement="", taxData$V2))
taxData$Family <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*f__", replacement="", taxData$V2))
taxData$Genus <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*g__", replacement="", taxData$V2))
taxData$Species <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*s__", replacement="", taxData$V2))
taxData$taxInfo <- TRUE # mark reads as having taxonomic info
# Record percentage info
taxData$KingdomPercent <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*d__[^;]+;", replacement="", taxData$V2))
taxData$PhylumPercent <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*p__[^;]+;", replacement="", taxData$V2))
taxData$ClassPercent <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*c__[^;]+;", replacement="", taxData$V2))
taxData$OrderPercent <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*o__[^;]+;", replacement="", taxData$V2))
taxData$FamilyPercent <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*f__[^;]+;", replacement="", taxData$V2))
taxData$GenusPercent <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*g__[^;]+;", replacement="", taxData$V2))
taxData$SpeciesPercent <- gsub(pattern=";.*", replacement="", gsub(pattern="^.*s__[^;]+;", replacement="", taxData$V2))
# drop parsed taxonomic input string
taxData$V2 <- NULL
# set entries of missing levels to NA (unmatched cells still contain the '__' prefix pattern)
for (i in 1:7) {
taxData[grepl('__', taxData[,i]), i] <- NA
}
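# same for the percentage columns (columns 1-7: taxon names, column 8: taxInfo flag, columns 9-15: percentages)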
for (i in 9:15) {
taxData[grepl('__', taxData[,i]), i] <- NA
}
# get functional info
funcData <- read.table(file=funcFile, sep="\t", quote="\"")
#funcData <- read.table(file=funcFile, sep="\t", row.names=1, quote="\"")
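# The patterns below extract the n-th semicolon-separated field of V2 (Level1 = second field, Level2 = third field, ...),
# assuming an eggNOG-style hierarchy string.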
funcData$Level1 <- gsub(pattern=";.*", replacement="", gsub(pattern="^[^;]*;", replacement="", funcData$V2))
funcData$Level2 <- gsub(pattern=";.*", replacement="", gsub(pattern="^[^;]*;[^;]*;", replacement="", funcData$V2))
#funcData$Level3 <- gsub(pattern=";.*", replacement="", gsub(pattern="^[^;]*;[^;]*;[^;]*;", replacement="", funcData$V2))
#funcData$Level4 <- gsub(pattern=";.*", replacement="", gsub(pattern="^[^;]*;[^;]*;[^;]*;[^;]*;", replacement="", funcData$V2))
#funcData$Level5 <- gsub(pattern=";.*", replacement="", gsub(pattern="^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;", replacement="", funcData$V2))
#funcData$Level6 <- gsub(pattern=";.*", replacement="", gsub(pattern="^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;", replacement="", funcData$V2))
funcData$funcInfo <- TRUE # mark read to have functional info
# set missing entries to NA
funcData[funcData==" eggNOG"] <- ""
funcData[funcData=="eggNOG"] <- ""
funcData[funcData==""] <- NA
# drop parsed functional input string
funcData$V2 <- NULL
# if reads with only taxonomic or functional info are of interest:
#data <- merge(taxData, funcData, by = "row.names", all=TRUE)
# if only reads that have both taxonomic and functional info are of interest, use:
data <- merge(taxData, funcData, by = "row.names")
write.table(data, "datatxt.txt", sep="\t")
# let's get a table combining the levels of interest
# Select the taxonomic and eggNOG level of interest (here, as an example: Species and eggNOG Level1)
myTable <- table(data$Species, data$Level1, useNA="always")
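# Other level combinations work the same way, e.g. Genus vs. eggNOG Level2:
#myTable <- table(data$Genus, data$Level2, useNA="always")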
#rownames(myTable)[nrow(myTable)] <- "unclassified"
#colnames(myTable)[ncol(myTable)] <- "unclassified"
write.table(myTable, "myTable_sampleID_taxa-vs-fct.txt", sep = "\t")
#!/bin/bash
###########################################################################################
#DE NOVO ANALYSIS OF UNMAPPED (UNKNOWN) READS
###########################################################################################
echo -e "\033[46m############################################################################################\nDE NOVO ANALYZING OF UNMAPPED (UNKNOWN) READS\n############################################################################################\033[0m"
echo "--------------------------------------------------------------------------------"
echo "$d : Starts to assemble the unaligned (unmapped) reads with IDBA_UD"
###echo -e "\033[34mABySS: A parallel assembler for short read sequence data. Simpson JT, Wong K, Jackman SD, Schein JE, Jones SJ, Birol I. Genome Research, 2009-June. (Genome Research, PubMed)\nVersion:2.0.2\033[0m"
echo "--------------------------------------------------------------------------------"
mkdir $path/DeNovo
mkdir $path/DeNovo/Assembly
mkdir $path/DeNovo/Bin_Contig
mkdir $path/DeNovo/Bin_Contig/CheckM
mkdir $path/DeNovo/REAssembly
mkdir $path/DeNovo/Assembly/IDBA
mkdir $path/DeNovo/Assembly/Quast
for ((i=0; i<$f; i++)); do
###Start the assembly process with the IDBA_UD tool
### mkdir $path/DeNovo/Assembly/ABySS/abyss_${meinarrayNEW[$i]}
mkdir $path/DeNovo/Assembly/IDBA/idba_${meinarrayNEW[$i]}
### echo "--------------------------------------------------------------------------------"
### echo "$d : ABySS for Sample: ${meinarrayNEW[$i]}"
### echo "--------------------------------------------------------------------------------"
### ###select the optimal k-mer value based on k-mer check or calc based on average read length minus 10 bp
### abyss-pe name=$path/DeNovo/Assembly/ABySS/abyss_${meinarrayNEW[$i]}/${meinarrayNEW[$i]} k=$kdef in=$path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmapped_${meinarrayNEW[$i]}_total.fasta 2>&1 | tee $path/DeNovo/Assembly/ABySS/abyss_${meinarrayNEW[$i]}/log_abyss_${meinarrayNEW[$i]}.txt
done
###number of parallel jobs
N=$f
for ((i=0; i<$f; i++)); do
(
echo "--------------------------------------------------------------------------------"
echo "$d : IDBA_UD for Sample: ${meinarrayNEW[$i]}"
echo "--------------------------------------------------------------------------------"
$pathT/MaxBin-2.2.4/auxiliary/idba-1.1.1/bin/./idba_ud -r $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmapped_${meinarrayNEW[$i]}_total.fasta -o $path/DeNovo/Assembly/IDBA/idba_${meinarrayNEW[$i]} --mink 51 --maxk 151 --step 10 --min_contig 1000 --pre_correction --num_threads 1 2>&1 | tee $path/DeNovo/Assembly/IDBA/idba_${meinarrayNEW[$i]}/logfile_ibda_${meinarrayNEW[$i]}.txt
###abyss-pe name=$path/DeNovo/Assembly/${meinarrayNEW[$i]}/${meinarrayNEW[$i]} k=$kdef in=$path/DeNovo/Unmappedfiles/${meinarrayNEW[$i]}/unmapped_${meinarrayNEW[$i]}_F.fastq $path/DeNovo/Unmappedfiles/${meinarrayNEW[$i]}/unmapped_${meinarrayNEW[$i]}_R.fastq 2>&1 | tee $path/DeNovo/Assembly/${meinarrayNEW[$i]}/log_abyss_${meinarrayNEW[$i]}.txt
) &
###allow only $N jobs to execute in parallel
if [[ $(jobs -r -p | wc -l) -gt $N ]]; then
###wait only for first job
wait -n
fi
done
wait
sleep 20s
###Here the assembly results will be compared
echo "--------------------------------------------------------------------------------"
echo "$d : Statistical analyze of ABySS output (created a short summary of assembly) with QUAST"
###echo -e "\033[34mQUAST: Alexey Gurevich, Vladislav Saveliev, Nikolay Vyahhi and Glenn Tesler,QUAST: quality assessment tool for genome assemblies,Bioinformatics (2013) 29 (8): 1072-1075.\033[0m"
echo "--------------------------------------------------------------------------------"
###NOTE: Please adjust the number of input contig files depending on the number of input samples (in this case: 3)
###python $pathT/quast-4.5/./quast.py $path/DeNovo/Assembly/${meinarrayNEW[0]}/${meinarrayNEW[0]}-6.fa -o $path/DeNovo/Quast/$Exp
python $pathT/quast-4.5/./quast.py $path/DeNovo/Assembly/IDBA/idba_${meinarrayNEW[0]}/contig.fa $path/DeNovo/Assembly/IDBA/idba_${meinarrayNEW[1]}/contig.fa $path/DeNovo/Assembly/IDBA/idba_${meinarrayNEW[2]}/contig.fa -o $path/DeNovo/Assembly/Quast/idba_$Exp
###INPUT="$(cat $path/Reference/log_combifasta.txt)"
###mkdir $path/DeNovo/Assembly/Quast/metaquast
###python $pathT/quast-4.5/./metaquast.py -R $Input -o $path/DeNovo/Assembly/Quast/metaquast/ $path/DeNovo/Assembly/IDBA/idba_${meinarrayNEW[0]}/contig.fa $path/DeNovo/Assembly/IDBA/idba_${meinarrayNEW[1]}/contig.fa $path/DeNovo/Assembly/IDBA/idba_${meinarrayNEW[2]}/contig.fa
###Here the clustering (binning) based on the MaxBin algorithm will be performed
for ((i=0; i<$f; i++)); do
echo "$d : Binning for Sample: ${meinarrayNEW[$i]}"
mkdir $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}
done
for ((i=0; i<$f; i++)); do
###(
echo "--------------------------------------------------------------------------------"
echo "$d : Binning of contigs (based on IDBA_UD) with MaxBin2"
###echo -e "\033[34mMaxBin2: Wu YW, Tang YH, Tringe SG, Simmons BA, and Singer SW, 'MaxBin: an automated binning method to recover individual genomes from metagenomes using an expectation-maximization algorithm', Microbiome, 2:26, 2014.\033[0m"
echo "--------------------------------------------------------------------------------"
$pathT/MaxBin-2.2.4/./run_MaxBin.pl -contig $path/DeNovo/Assembly/IDBA/idba_${meinarrayNEW[$i]}/contig.fa -reads $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmapped_${meinarrayNEW[$i]}_total.fasta -reassembly -prob_threshold 0.8 -plotmarker -markerset 40 -out $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}/Bin_${meinarrayNEW[$i]} -thread 1 2>&1 | tee $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}/maxbin_log_${meinarrayNEW[$i]}.txt
###marker set 40 is recommended by MaxBin for archaea and bacteria communities
###) &
### ###allow only $N jobs to execute in parallel
### if [[ $(jobs -r -p | wc -l) -gt $N ]]; then
### ###wait only for first job
### wait -n
### fi
done
###wait
for ((i=0; i<$f; i++)); do
###Check the success of the binning process based on completeness and contamination rate with CheckM
###If CheckM fails, the path to pplacer may not be in your PATH; e.g. export PATH=$PATH:/home/administrator/pplacer-Linux-v1.1.alpha19
mkdir $path/DeNovo/Bin_Contig/CheckM/${meinarrayNEW[$i]}
echo "--------------------------------------------------------------------------------"
echo "$d : Check the completeness and quality of generated bins with CheckM"
###echo -e "\033[34mCheckM: Parks DH, Imelfort M, Skennerton CT, Hugenholtz P, Tyson GW. 2014. Assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes. Genome Research, 25: 1043-1055.\033[0m"
echo "--------------------------------------------------------------------------------"
rm -r $path/DeNovo/Bin_Contig/CheckM/${meinarrayNEW[$i]}/
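###CheckM flags: --reduced_tree uses the reduced reference tree (lower memory footprint),
###--nt additionally writes nucleotide gene sequences, -x fasta sets the bin file extension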
checkm lineage_wf -r -x fasta --nt -t 1 --reduced_tree --pplacer_threads 1 $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}/ $path/DeNovo/Bin_Contig/CheckM/${meinarrayNEW[$i]} >> $path/DeNovo/Bin_Contig/CheckM/logfile_checkm_${meinarrayNEW[$i]}
###checkm bin_compare -x fasta -y fa $path/DeNovo/Assembly/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}-6.fa $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}/ $path/DeNovo/Bin_Contig/CheckM/compare${meinarrayNEW[$i]}
###checkm qa -o 1 -f $path/DeNovo/Bin_Contig/CheckM/${meinarrayNEW[$i]} -t 1 $path/DeNovo/Bin_Contig/CheckM/${meinarrayNEW[$i]}/lineage.ms $path/DeNovo/Bin_Contig/CheckM/${meinarrayNEW[$i]}/
###-r -x fasta --nt -t 1 --pplacer_threads 1
w="$(find $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}/Bin_${meinarrayNEW[$i]}.reassem -type f -name "Bin_${meinarrayNEW[$i]}.reads.0*" | wc -w)"
sleep 1s
echo "Sample_${meinarrayNEW[$i]} had $w bins"
done
###wait
sleep 20s
for ((i=0; i<$f; i++)); do
mkdir $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas
###Prepare the possible input for the external annotation with Diamond/MEGAN6
mkdir $path/Annotation
mkdir $path/Annotation/${meinarrayNEW[$i]}
mkdir $path/Annotation/${meinarrayNEW[$i]}/mapped_annotation
mkdir $path/Annotation/${meinarrayNEW[$i]}/mapped_annotation/input
mkdir $path/Annotation/${meinarrayNEW[$i]}/mapped_annotation/output
mkdir $path/Annotation/${meinarrayNEW[$i]}/unmapped_annotation
mkdir $path/Annotation/${meinarrayNEW[$i]}/unmapped_annotation/input
mkdir $path/Annotation/${meinarrayNEW[$i]}/unmapped_annotation/output
w="$(find $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}/Bin_${meinarrayNEW[$i]}.reassem -type f -name "Bin_${meinarrayNEW[$i]}.reads.0*" | wc -w)"
sleep 2s
echo "$d : Reassembly for Sample: ${meinarrayNEW[$i]}"
echo "--------------------------------------------------------------------------------"
echo "$d : Reassemble the generated bins via IDBA_UD (integrated in MaxBin2) to obtain (maybe) full reconstructed genomes (it is a experimental step)"
###echo -e "\033[34mIDBA_UD: Peng, Y., et al. (2012) IDBA-UD: a de novo assembler for single-cell and metagenomic sequencing data with highly uneven depth, Bioinformatics, 28, 1420-1428.\033[0m"
###Start the reassembly of binned/clustered reads.
###Two different ways are possible:
###1) If Bin_*.reads files are present inside the reassembly folder --> use the per-bin clustered read files
###2) If reassembly was not possible --> then use the file of unclassified reads as one cluster of unmapped and unclassified reads (mainly for mock data)
echo "--------------------------------------------------------------------------------"
echo "$d : IDBA_UD Reassembly: ${meinarrayNEW[$i]}"
echo -e "\033[43m-- Is the contig file clustered and reassemled successfully within $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}/Bin_${meinarrayNEW[$i]}.reassem/Bin_${meinarrayNEW[$i]}.reads.0*?: \033[0m"
read -p "Bin_${meinarrayNEW[$i]}.reads.0* available (y/n)" response
if [ "$response" == "y" ]
then
### if [ "$answer" = y -o "$answer" = Y ] ; then
### if [ "'-e $reasfile'" ]; then
echo -e "\033[42mReassembly was successfully, folder contained reassemled read file (*.0X). Take the reassembled reads\033[0m"
###for ((j=1; j<$b; j++)); do
echo "Bin_${meinarrayNEW[$i]}.reassem contains $w bins"
for ((j=1; j<=$w; j++)); do
echo "$d : IDBA_UD Reassembly: ${meinarrayNEW[$i]} for the 0$j Bin"
mkdir $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/RAContigs_perBin
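###the two idba_ud calls below cover different zero-padding of the per-bin read file names
###(reads.00$j vs reads.0$j); the call whose input file does not exist is expected to simply fail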
$pathT/MaxBin-2.2.4/auxiliary/idba-1.1.1/bin/./idba_ud -r $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}/Bin_${meinarrayNEW[$i]}.reassem/Bin_${meinarrayNEW[$i]}.reads.00$j -o $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/${meinarrayNEW[$i]}_idba_$j --mink 51 --maxk 151 --step 10 --min_contig 1000 --pre_correction --num_threads 2 2>&1 | tee $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/logfile_RA_${meinarrayNEW[$i]}
$pathT/MaxBin-2.2.4/auxiliary/idba-1.1.1/bin/./idba_ud -r $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}/Bin_${meinarrayNEW[$i]}.reassem/Bin_${meinarrayNEW[$i]}.reads.0$j -o $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/${meinarrayNEW[$i]}_idba_$j --mink 51 --maxk 151 --step 10 --min_contig 1000 --pre_correction --num_threads 2 2>&1 | tee $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/logfile_RA_${meinarrayNEW[$i]}
cd $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/${meinarrayNEW[$i]}_idba_$j/
###to avoid duplicate headers after merging the different idba_ud contig files, the headers are renamed here with the sample ID and bin number
sed s/\>contig/\>contig_${meinarrayNEW[$i]}_$j/g contig.fa > contig_${meinarrayNEW[$i]}_bin_$j.fa
###sed s/\>contig/\>contig_${meinarrayNEW[$i]}_0$j/g contig.fa > contig_${meinarrayNEW[$i]}_bin_0$j.fa
cp $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/${meinarrayNEW[$i]}_idba_$j/contig_${meinarrayNEW[$i]}_bin_$j.fa $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/RAContigs_perBin
done
cat $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/RAContigs_perBin/*.fa > $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/RAContigs_perBin/unm_${meinarrayNEW[$i]}_RAContig_total.fasta
else
echo -e "\033[31mSwitch to unclass file\033[0m"
fi
if [ "$response" == "n" ]
then
cp $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}/Bin_${meinarrayNEW[$i]}.reads.noclass $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}/Bin_${meinarrayNEW[$i]}.reassem/
mkdir $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/RAContigs_perBin
echo -e "\033[41mReassembly failed, folder empty. Take the simple binned reads\033[0m"
echo -e "\033[31mUse alternative: Bin_${meinarrayNEW[$i]}.reads.noclass\033[0m"
$pathT/MaxBin-2.2.4/auxiliary/idba-1.1.1/bin/./idba_ud -r $path/DeNovo/Bin_Contig/${meinarrayNEW[$i]}/Bin_${meinarrayNEW[$i]}.reassem/Bin_${meinarrayNEW[$i]}.reads.noclass -o $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/${meinarrayNEW[$i]}_idba_noclass --mink 51 --maxk 151 --step 10 --min_contig 1000 --pre_correction --num_threads 2 2>&1 | tee $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/logfile_RA_${meinarrayNEW[$i]}
cd $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/${meinarrayNEW[$i]}_idba_noclass/
sed s/\>contig/\>contig_${meinarrayNEW[$i]}_unclass/g contig.fa > contig_${meinarrayNEW[$i]}_bin_unclass.fa
cat $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/${meinarrayNEW[$i]}_idba_noclass/contig_${meinarrayNEW[$i]}_bin_unclass.fa > $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/RAContigs_perBin/unm_${meinarrayNEW[$i]}_RAContig_total.fasta
else
echo -e "\033[32mPer bin files were used\033[0m"
fi
cp $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/RAContigs_perBin/unm_${meinarrayNEW[$i]}_RAContig_total.fasta $path/Annotation/${meinarrayNEW[$i]}/unmapped_annotation/input/
cp $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/all_mapped_${meinarrayNEW[$i]}_total.fasta $path/Annotation/${meinarrayNEW[$i]}/mapped_annotation/input/
done
###wait
mkdir $path/DeNovo/REAssembly/PostRA_QUAST
###Here the reassembly results will be compared
###for ((i=0; i<$f; i++)); do
echo "--------------------------------------------------------------------------------"
echo "$d : Statistical analyze of Reassembled output (created a short summary of assembly) with QUAST"
###echo -e "\033[34mQUAST: Alexey Gurevich, Vladislav Saveliev, Nikolay Vyahhi and Glenn Tesler,QUAST: quality assessment tool for genome assemblies,Bioinformatics (2013) 29 (8): 1072-1075.\033[0m"
echo "--------------------------------------------------------------------------------"
###NOTE: Please adjust the number of input files depending on the number of input samples (in this case: 3)
python $pathT/quast-4.5/./quast.py $path/DeNovo/REAssembly/${meinarrayNEW[0]}_reas/RAContigs_perBin/unm_${meinarrayNEW[0]}_RAContig_total.fasta $path/DeNovo/REAssembly/${meinarrayNEW[1]}_reas/RAContigs_perBin/unm_${meinarrayNEW[1]}_RAContig_total.fasta $path/DeNovo/REAssembly/${meinarrayNEW[2]}_reas/RAContigs_perBin/unm_${meinarrayNEW[2]}_RAContig_total.fasta -o $path/DeNovo/REAssembly/PostRA_QUAST/idba_RA_$Exp
#INPUT="$(cat $path/Reference/log_combifasta.txt)"
#mkdir $path/DeNovo/Assembly/Quast/metaquast
#python $pathT/quast-4.5/./metaquast.py -R $Input -o $path/DeNovo/Assembly/Quast/metaquast/ path/DeNovo/REAssembly/${meinarrayNEW[0]}_reas/RAContigs_perBin/unm_${meinarrayNEW[0]}_RAContig_total.fasta $path/DeNovo/REAssembly/${meinarrayNEW[1]}_reas/RAContigs_perBin/unm_${meinarrayNEW[1]}_RAContig_total.fasta $path/DeNovo/REAssembly/${meinarrayNEW[2]}_reas/RAContigs_perBin/unm_${meinarrayNEW[2]}_RAContig_total.fasta
###done
sleep 20s
echo "--------------------------------------------------------------------------------"
echo "$d : Map unmapped reads back to assembly (contigs) to calculate finally the potential community depending on read matches"
echo "--------------------------------------------------------------------------------"
######Map reads back to contigs######
###Find out which reads are included in each contig###
###count the number of reads per contig###
###Find out which contig encodes which taxon###
###weighting: use the number of reads per contig per taxon as the weighting###
###useful source: http://2014-5-metagenomics-workshop.readthedocs.io/en/latest/assembly/map.html ###
for ((i=0; i<$f; i++)); do
mkdir $path/Map_reads_backto_contigs
mkdir $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}
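###build a Bowtie2 index from the reassembled contigs and map the unmapped reads back onto them:
###-a reports all alignments, -f indicates FASTA input, -x names the index, -S writes SAM output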
$pathT/bowtie2-2.3.2/./bowtie2-build $path/DeNovo/REAssembly/${meinarrayNEW[$i]}_reas/RAContigs_perBin/unm_${meinarrayNEW[$i]}_RAContig_total.fasta $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_contig_idx 2>&1 | tee -a $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}/idx_logfile.txt
sleep 10s
$pathT/bowtie2-2.3.2/./bowtie2 -a -x $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_contig_idx -f $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmapped_${meinarrayNEW[$i]}_total.fasta -S $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_readsTOcontigs_map.sam 2>&1 | tee -a $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}/bowtie_${meinarrayNEW[$i]}_logfile.txt
sleep 10s
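###SAM -> BAM, coordinate-sort and index, then count mapped reads per contig with samtools idxstats
###(samtools index requires a coordinate-sorted BAM)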
samtools view -bS $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_readsTOcontigs_map.sam > $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_readsTOcontigs_map.bam
samtools sort $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_readsTOcontigs_map.bam -o $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_readsTOcontigs_map.sorted.bam
samtools index $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_readsTOcontigs_map.sorted.bam
samtools idxstats $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_readsTOcontigs_map.sorted.bam > $path/Map_reads_backto_contigs/${meinarrayNEW[$i]}/idx_stats_${meinarrayNEW[$i]}.txt
done
#wait
#!/bin/bash
############################################################################################
##PRESELECTION OF EXPECTED AND UNEXPECTED READS
############################################################################################
echo -e "\033[46m############################################################################################\nPRESELECTION OF EXPECTED AND UNEXPECTED READS\n############################################################################################\033[0m"
sleep 10s
###Input for Bowtie2 are the cleaned reads and reference genomes
###Please integrate all important reference genomes in the folder "Reference/BaseFasta"
echo "--------------------------------------------------------------------------------"
echo "$d : Create new output folders for Bowtie2"
###echo -e "\033[34mBOWTIE2: Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012, 9:357-359.Langmead B, Salzberg S.\nVersion2.3.2\033[0m"
echo "--------------------------------------------------------------------------------"
mkdir $path/Bowtie2
mkdir $path/Bowtie2/IDX/
###In order to include all reference fasta files in the bowtie index, a pseudofile must be generated.
###This pseudofile contains all file names in one line, separated by commas.
###This line will be read as input for the bowtie2-build command.
echo "--------------------------------------------------------------------------------"
echo "$d : Prepare combined different reference file in FASTA to community_total.fasta, as base for bowtie2 "
echo "--------------------------------------------------------------------------------"
###cat $path/Reference/BaseFasta/*.fasta >> $path/Reference/community_total.fasta
cd $path/Reference/
rm $path/Reference/log_basefasta.txt
rm $path/Reference/log_combifasta_prep.txt
rm $path/Reference/log_combifasta.txt
cd $path/Reference/BaseFasta/
ls *.fasta > $path/Reference/log_basefasta.txt
wc -l $path/Reference/log_basefasta.txt
while read line; do
echo -n "$line," >> $path/Reference/log_combifasta_prep.txt
cat $path/Reference/log_combifasta_prep.txt | sed -e 's/,$//g' > $path/Reference/log_combifasta.txt
INPUT="$(cat $path/Reference/log_combifasta.txt)"
echo "$INPUT"
### rm $path/Reference/log_combifasta_prep.txt
done < $path/Reference/log_basefasta.txt
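###(the same comma-separated list could also be built in a single step, e.g.:
### ls *.fasta | paste -s -d, - > $path/Reference/log_combifasta.txt )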
###Use the prepared pseudofile.
###Generate bowtie index of all potential reference sequences.
echo "--------------------------------------------------------------------------------"
echo "$d : Create Bowtie2 index based on community_total.fasta"
echo "--------------------------------------------------------------------------------"
echo "$d : NOTE: Depending on the number of input communities, it may take a while"
cd $path/Reference/BaseFasta/
$pathT/bowtie2-2.3.2/./bowtie2-build $INPUT $path/Bowtie2/IDX/community_idx 2>&1 | tee -a $path/Bowtie2/idx_logfile.txt
$pathT/bowtie2-2.3.2/./bowtie2-inspect -s $path/Bowtie2/IDX/community_idx 2>&1 | tee -a $path/Bowtie2/idx_inspect_logfile.txt
sleep 20s
echo "--------------------------------------------------------------------------------"
echo "$d : Mapped the cleaned and trimmed reads based on your own reference index"
echo "--------------------------------------------------------------------------------"
echo "$d : NOTE: depending on the number of input reads, it may take a while"
###number of parallel jobs
N=$f
for ((i=0; i<$f; i++)); do
echo "$d : Bowtie2 for Sample: ${meinarrayNEW[$i]}"
mkdir $path/Bowtie2/${meinarrayNEW[$i]}
mkdir $path/Bowtie2/${meinarrayNEW[$i]}/unconc
mkdir $path/Bowtie2/${meinarrayNEW[$i]}/alconc
done
for ((i=0; i<$f; i++)); do
(
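###Bowtie2 settings: --sensitive-local preset for local alignment, -a report all alignments,
###--fr expect forward/reverse mate orientation, -1/-2 paired input, -U unpaired input, -p number of threads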
$pathT/bowtie2-2.3.2/./bowtie2 --sensitive-local -a --fr -x $path/Bowtie2/IDX/community_idx -1 $path/CleanUp/Clean_${meinarrayNEW[$i]}/Clean_${meinarrayNEW[$i]}_1P.fastq -2 $path/CleanUp/Clean_${meinarrayNEW[$i]}/Clean_${meinarrayNEW[$i]}_2P.fastq -U $path/CleanUp/CombinedUP_${meinarrayNEW[$i]}.fastq -S $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.sam -p 1 2>&1 | tee -a $path/Bowtie2/${meinarrayNEW[$i]}/bowtie_${meinarrayNEW[$i]}_logfile.txt
) &
###allow only $N jobs to execute in parallel
if [[ $(jobs -r -p | wc -l) -gt $N ]]; then
###wait only for first job
wait -n
fi
done
wait
###for SE
### $pathT/bowtie2-2.3.2/./bowtie2 --sensitive-local --un $path/Bowtie2/${meinarrayNEW[$i]}/un/ --al $path/Bowtie2/${meinarrayNEW[$i]}/al/ -x $path/Bowtie2/IDX/community_idx -U $path/CleanUp/Total_${meinarrayNEW[$i]}.fastq -S $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.sam 2>&1 | tee -a $path/Bowtie2/${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}_logfile.txt
###done
sleep 20s
##Then the generated SAM file must be converted to a BAM file to generate a statistics file of the mapped reads.
echo "--------------------------------------------------------------------------------"
echo "$d : Transformed the SAM (Bowtie2 output) to BAM"
##echo -e "\033[34mThe Sequence alignment/map (SAM) format and SAMtools. Bioinformatics, 25, 2078-9. Li H.*, Handsaker B.*, Wysoker A., Fennell T., Ruan J., Homer N., Marth G., Abecasis G., Durbin R. and 1000 Genome Project Data Processing Subgroup (2009) [PMID: 19505943]\033[0m"
echo "--------------------------------------------------------------------------------"
for ((i=0; i<$f; i++)); do
echo "$d : Samtools for Sample: ${meinarrayNEW[$i]}"
samtools view -bS $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.sam > $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.bam
samtools sort $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.bam -o $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.sorted.bam
##alternatively you can use bedtools bam2fastq
## $pathT/BCL2BAM2FASTQ/bam2fastq/./bam2fastq $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.sorted.bam $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie_sorted.fastq
## $pathT/BCL2BAM2FASTQ/bam2fastq/./bam2fastq $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.bam $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.fastq
##this fastq file should be put into DIAMOND; see the ReadME to generate the *.sub file for cluster usage
samtools index $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.sorted.bam
samtools idxstats $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.sorted.bam > $path/Bowtie2/${meinarrayNEW[$i]}/idx_stats_${meinarrayNEW[$i]}
##The output is TAB-delimited with each line consisting of reference sequence name, sequence length, # mapped reads and # unmapped reads.
done
sleep 20s
###To make the BAM file useful for further steps, it must be converted to fastq/fasta files.
###Here the steps to sort, check and convert the BAM file are defined.
###Thereby duplicated sequences (e.g. PCR duplicates) can be deleted (so that only unique PE reads are used)
echo "--------------------------------------------------------------------------------"
echo "$d : Prepared the unmapped files (unmapped files) and combined forward & reverse of unmapped reads to one total files."
echo "--------------------------------------------------------------------------------"
mkdir $path/Bowtie2/Unmappedfiles
for ((i=0; i<$f; i++)); do
mkdir $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}
echo "$d : Unmap_${meinarrayNEW[$i]}: Select unmapped paired end alignments (with read unmapped and mate unmapped)"
samtools view -u -f 12 $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.bam > $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}.unmapped.bam
#### echo "$d : Unmap_${meinarrayNEW[$i]}: SAM to BAM"
#### samtools view -bS $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/unmap_${meinarrayNEW[$i]}.unmapped.sam > $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/unmap_${meinarrayNEW[$i]}.unmapped.bam
echo "$d : Unmap_${meinarrayNEW[$i]}: Sorted unmapped paired end the BAM file by name"
samtools sort -n $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}.unmapped.bam -o $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}.sorted.bam
echo "$d : Unmap_${meinarrayNEW[$i]}: Fixmate of unmapped paired end"
samtools fixmate $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}.sorted.bam $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}.fixed.bam
echo "$d : Unmap_${meinarrayNEW[$i]}: Sorted the fixed BAM file of unmapped paired end by name"
samtools sort -n $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}.fixed.bam -o $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}.fixed.sorted.bam
echo "$d : Unmap_${meinarrayNEW[$i]}: BAM to FastQ (unmapped paired end)"
###One combined file
###bamToFastq -i $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/unmap_${meinarrayNEW[$i]}.fixed.sorted.bam -fq $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/unmapped_${meinarrayNEW[$i]}_total.fastq
###Two selected files (FR)
sleep 20s
bamToFastq -i $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}.fixed.sorted.bam -fq $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}_total.fastq
###echo "$d : Unmap_${meinarrayNEW[$i]}: Delete possible read duplicates with clumpify.sh by bbmap (unmapped paired end)"
###$pathT/bbmap/clumpify.sh in=$path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}_total.fastq out=$path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}_total_checked.fastq dedupe
sleep 20s
echo "$d : Unmap_${meinarrayNEW[$i]}: FastQ to FastA (unmapped paired end)"
fastq_to_fasta -i $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}_total.fastq -o $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmapped_${meinarrayNEW[$i]}_total.fasta &
sleep 30s
echo "--------------------------------------------------------------------------------"
echo "$d : Calculate the read length disturbance depending on paired end unmapped reads (interleaved PE calc)"
echo -e "\033[34mLength vs. Amount will be illstrated on the log_readlgth_${meinarrayNEW[$i]}_unmapped.txt file\033[0m"
echo -e "\033[35mTo Calculate the k-mer, use the average read length minus 10 bp to define k on Main.sh (calc manual via Excel)[Source: Zerbino]\033[0m"
echo "--------------------------------------------------------------------------------"
awk '!/^>/ {lengths[length($0)]++} END {for (l in lengths) {print l, lengths[l]}}' $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmapped_${meinarrayNEW[$i]}_total.fasta 2>&1 | tee -a $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/readlgth_${meinarrayNEW[$i]}_intPE_unmap.txt
###$pathT/MaxBin-2.2.3/auxiliary/idba-1.1.1/bin/./fq2fa --merge --filter $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}_F.fastq $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}_R.fastq $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/interleavedPE_unmap_${meinarrayNEW[$i]}_total.fasta
echo "$d : Unmap_${meinarrayNEW[$i]}: Select the total unmapped alignments"
samtools view -u -f 4 $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.bam > $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmap_${meinarrayNEW[$i]}.unmapped.bam
echo "$d : Unmap_${meinarrayNEW[$i]}: Sorted total unmapped the BAM file by name"
samtools sort -n $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmap_${meinarrayNEW[$i]}.unmapped.bam -o $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmap_${meinarrayNEW[$i]}.sorted.bam
echo "$d : Unmap_${meinarrayNEW[$i]}: Fixmate of total unmapped"
samtools fixmate $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmap_${meinarrayNEW[$i]}.sorted.bam $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmap_${meinarrayNEW[$i]}.fixed.bam
echo "$d : Unmap_${meinarrayNEW[$i]}: Sorted the fixed BAM file of total unmapped by name"
samtools sort -n $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmap_${meinarrayNEW[$i]}.fixed.bam -o $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmap_${meinarrayNEW[$i]}.fixed.sorted.bam
echo "$d : Unmap_${meinarrayNEW[$i]}: BAM to FastQ (unmapped total)"
###For SE
bamToFastq -i $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmap_${meinarrayNEW[$i]}.fixed.sorted.bam -fq $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmapped_${meinarrayNEW[$i]}_total.fastq
###echo "$d : Unmap_${meinarrayNEW[$i]}: Delete possible read duplicates with clumpify.sh by bbmap"
###$pathT/bbmap/clumpify.sh in=$path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmapped_${meinarrayNEW[$i]}_total.fastq out=$path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmapped_${meinarrayNEW[$i]}_total_checked.fastq dedupe
sleep 20s
echo "$d : Unmap_${meinarrayNEW[$i]}: FastQ to FastA (unmapped total)"
fastq_to_fasta -i $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmapped_${meinarrayNEW[$i]}_total.fastq -o $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmapped_${meinarrayNEW[$i]}_total.fasta
sleep 20s
echo "--------------------------------------------------------------------------------"
echo "$d : Calculate the read length disturbance depending on unmapped reads (total unmapped reads)"
echo -e "\033[33mLength vs. Amount will be illstrated on the log_readlgth_${meinarrayNEW[$i]}_unmapped.txt file.\033[0m"
echo -e "\033[33mTo Calculate the k-mer, use the average read length minus 10 bp to define k on Main.sh (calc manual via Excel)[Source: Zerbino]\033[0m"
echo "--------------------------------------------------------------------------------"
awk '!/^>/ {lengths[length($0)]++} END {for (l in lengths) {print l, lengths[l]}}' $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/all_unmapped_${meinarrayNEW[$i]}_total.fasta 2>&1 | tee -a $path/Bowtie2/Unmappedfiles/unmap_${meinarrayNEW[$i]}/readlgth_${meinarrayNEW[$i]}_all_unmap.txt
done
sleep 15s
echo "--------------------------------------------------------------------------------"
echo "$d : Prepared the mapped files (mapped files) and combined forward & reverse of mapped reads to one total files."
echo "--------------------------------------------------------------------------------"
mkdir $path/Bowtie2/Mappedfiles
for ((i=0; i<$f; i++)); do
mkdir $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}
echo "$d : Map_${meinarrayNEW[$i]}: Select total mapped alignments (without read unmapped and mate unmapped)"
samtools view -u -F 2316 $path/Bowtie2/${meinarrayNEW[$i]}/${meinarrayNEW[$i]}_bowtie.bam > $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}.mapped.bam
#### echo "$d : Map_${meinarrayNEW[$i]}: SAM to BAM"
#### samtools view -bS $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}.mapped.sam > $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}.mapped.bam
echo "$d : Map_${meinarrayNEW[$i]}: Sorted the BAM file of total mapped by name"
samtools sort -n $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}.mapped.bam -o $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}.sorted.bam
echo "$d : Map_${meinarrayNEW[$i]}: Fixmate of total mapped"
samtools fixmate $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}.sorted.bam $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}.fixed.bam
echo "$d : Map_${meinarrayNEW[$i]}: Sorted the fixed BAM file of total mapped by name"
samtools sort -n $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}.fixed.bam -o $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}.fixed.sorted.bam
echo "$d : Map_${meinarrayNEW[$i]}: BAM to FastQ (all mapped reads)"
###One combined file
bamToFastq -i $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}.fixed.sorted.bam -fq $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}_total.fastq
###echo "$d : Map_${meinarrayNEW[$i]}: Delete possible read duplicates with clumpify.sh by bbmap"
###$pathT/bbmap/clumpify.sh in=$path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}_total.fastq out=$path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}_total_checked.fastq dedupe
###Two selected files (FR)
###bamToFastq -i $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}.fixed.sorted.bam -fq $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}_F.fastq -fq2 $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}_R.fastq
###perl $pathT/velvet_1.2.10/contrib/shuffleSequences_fasta/./shuffleSequences_fastq.pl $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/forward_reads.fastq $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/reverse_reads.fastq $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/output.fastq
echo "$d : Map_${meinarrayNEW[$i]}: FastQ to FastA (all mapped reads)"
fastq_to_fasta -i $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}_total.fastq -o $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/all_mapped_${meinarrayNEW[$i]}_total.fasta
###$pathT/MaxBin-2.2.3/auxiliary/idba-1.1.1/bin/./fq2fa --merge --filter $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}_F.fastq $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/map_${meinarrayNEW[$i]}_R.fastq $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/mapped_${meinarrayNEW[$i]}_total.fasta
###echo "$d : Map_${meinarrayNEW[$i]}: Format check of total mapped file --> generate the final mapped_total file"
###$pathSH/additionalSH/check_builded_fastq_script < $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/mapped_${meinarrayNEW[$i]}_total_unchecked.fastq > $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/mapped_${meinarrayNEW[$i]}_total.fastq
echo "--------------------------------------------------------------------------------"
echo "$d : Calculate the read length disturbance depending on mapped reads (all mapped reads)"
echo -e "Length vs. Amount will be illstrated on the log_readlgth_${meinarrayNEW[$i]}_mapped.txt file."
echo -e "To Calculate the k-mer, use the average read length minus 10 bp to define kdef on Main.sh (calc manual via Excel)[Source: Zerbino]"
echo "--------------------------------------------------------------------------------"
awk '!/^>/ {lengths[length($0)]++} END {for (l in lengths) {print l, lengths[l]}}' $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/all_mapped_${meinarrayNEW[$i]}_total.fasta 2>&1 | tee -a $path/Bowtie2/Mappedfiles/map_${meinarrayNEW[$i]}/log_all_readlgth_${meinarrayNEW[$i]}_mapped.txt
echo -e "\033[33mNOTE: these fasta files of mapped reads should be puted directly into DIAMOND. Look at the ReadME to generate the *.sub file for cluster usage\033[0m."
done
sleep 15s
#!/bin/bash
############################################################################################
##CLEAN UP & QUALITY CHECK
############################################################################################
echo -e "\033[46m############################################################################################\nCLEAN UP & QUALITY CHECK\n############################################################################################\033[0m"
echo "--------------------------------------------------------------------------------"
echo "$d : Starting point to create first output folders"
echo "--------------------------------------------------------------------------------"
mkdir $path/PreQualCheck
mkdir $path/CleanUp
mkdir $path/CleanUp/RenamedFiles
mkdir $path/PostQualCheck
echo "--------------------------------------------------------------------------------"
echo "$d : Unzip zipped original row data files"
echo "--------------------------------------------------------------------------------"
###If the org. files are zipped, unzip the files here
cd $path/OrgFiles/
gunzip *.fastq.gz
N=$f
#(
for ((i=0; i<$f; i++)); do
###((i=i%f)); ((i++)) && wait
###open the parallel work
( #echo "$i"
###Here the quality check with FastQC and read trimming with Trimmomatic will be performed.
echo "$d : CleanUp for Sample: ${meinarrayNEW[$i]}"
###touch $path/CleanUp/readDist_${meinarrayNEW[$i]}.txt
mkdir $path/PreQualCheck/Pre_${meinarrayNEW[$i]}
echo "--------------------------------------------------------------------------------"
echo "$d : Pre-FastQC scan: Of row reads: ${meinarrayNEW[$i]}"
###echo -e "\033[34mFastQC: a quality control tool for high throughput sequence data.Andrews S. (2010).\nVersion: 0.11.5\033[0m"
echo "--------------------------------------------------------------------------------"
$pathT/FastQC/./fastqc $path/OrgFiles/${meinarrayF[$i]}.fastq -o $path/PreQualCheck/Pre_${meinarrayNEW[$i]}/
$pathT/FastQC/./fastqc $path/OrgFiles/${meinarrayR[$i]}.fastq -o $path/PreQualCheck/Pre_${meinarrayNEW[$i]}/
###close the parallel work
) &
###allow only $N jobs to execute in parallel
if [[ $(jobs -r -p | wc -l) -gt $N ]]; then
###wait only for first job
wait -n
fi
done
wait
for ((i=0; i<$f; i++)); do
(
echo "--------------------------------------------------------------------------------"
echo "$d : Quality selection and row read trimming with Trimmomatic: ${meinarrayNEW[$i]}"
###echo -e "\033[34mTrimmomatic: A flexible trimmer for Illumina Sequence Data. Bioinformatics, btu170.Bolger, A. M., Lohse, M., & Usadel, B. (2014).\nVersion: 0.36\033[0m"
echo "--------------------------------------------------------------------------------"
###sudo apt-get install parallel
mkdir $path/CleanUp/Clean_${meinarrayNEW[$i]}
mkdir $path/CleanUp/RenamedFiles/${meinarrayNEW[$i]}
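###Trimmomatic settings: LEADING:3/TRAILING:3 trim low-quality leading/trailing bases, SLIDINGWINDOW:4:20 cuts
###once the average quality in a 4-base window drops below 20, AVGQUAL:20 drops reads with average quality below 20,
###MINLEN:35 drops reads shorter than 35 bp, CROP:151 cuts reads to at most 151 bp; -baseout writes the 1P/1U/2P/2U output files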
java -jar $pathT/Trimmomatic-0.36/./trimmomatic-0.36.jar PE -phred33 $path/OrgFiles/${meinarrayF[$i]}.fastq $path/OrgFiles/${meinarrayR[$i]}.fastq -baseout $path/CleanUp/Clean_${meinarrayNEW[$i]}/Clean_${meinarrayNEW[$i]}.fastq LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 AVGQUAL:20 MINLEN:35 CROP:151 -threads 1 2>&1 | tee -a $path/CleanUp/trim_logfile.txt
### CROP:$CROP or CROP:151 (maximum allowed read length)
) &
if [[ $(jobs -r -p | wc -l) -gt $N ]]; then
wait -n
fi
done
wait
for ((i=0; i<$f; i++)); do
echo "--------------------------------------------------------------------------------"
echo "$d : File header check"