9-跟着science学习宏基因组-三种组装方法混样组装(megahit/spades/idba)
全部样本组装;核糖体基因识别;序列回帖;单个样本组装;核糖体基因识别;序列回帖;
合并全部样本
将全部样本的前端和后端分别合并,未匹配的也合并。这里我们可以注意到明没有添加序列标签,因为做组装也不需要。
zcat `echo ./trimming/*_1.fq.gz | xargs -n1 | sort | xargs` | gzip > ./trimming/R1.fastq.gz
zcat `echo ./trimming/*_2.fq.gz | xargs -n1 | sort | xargs` | gzip > ./trimming/R2.fastq.gz
gzip ./trimming/*_unpaired.fq
zcat `echo ./trimming/*_unpaired.fq.gz | xargs -n1 | sort | xargs` | gzip > ./trimming/singular.fastq.gz
按照处理合并
这个步骤在教程2中已经做过了
#按照处理合并序列,建立一个文件夹
mkdir ./treatment
#合并同一个处理所有前端序列
cat ./host_filtering/raw/SUBERR793599_R1_filtered.fastq_pairs_R1.fastq ./host_filtering/raw/SUBERR793600_R1_filtered.fastq_pairs_R1.fastq ./host_filtering/raw/SUBERR793601_R1_filtered.fastq_pairs_R1.fastq > ./treatment/C_forward.fastq
# 合并同一个处理所有后端序列
cat ./host_filtering/raw/SUBERR793599_R2_filtered.fastq_pairs_R2.fastq ./host_filtering/raw/SUBERR793600_R2_filtered.fastq_pairs_R2.fastq ./host_filtering/raw/SUBERR793601_R2_filtered.fastq_pairs_R2.fastq > ./treatment/C_reverse.fastq
#合并双端未匹配上的序列
cat ./host_filtering/raw/SUBERR793599_R1_filtered.fastq_singles.fastq ./host_filtering/raw/SUBERR793600_R1_filtered.fastq_singles.fastq ./host_filtering/raw/SUBERR793601_R1_filtered.fastq_singles.fastq > ./treatment/C_unpaired.fastq
#前端
cat ./host_filtering/raw/SUBERR793603_R1_filtered.fastq_pairs_R1.fastq ./host_filtering/raw/SUBERR793604_R1_filtered.fastq_pairs_R1.fastq ./host_filtering/raw/SUBERR793605_R1_filtered.fastq_pairs_R1.fastq > ./treatment/S_forward.fastq
# 后端
cat ./host_filtering/raw/SUBERR793603_R2_filtered.fastq_pairs_R2.fastq ./host_filtering/raw/SUBERR793604_R2_filtered.fastq_pairs_R2.fastq ./host_filtering/raw/SUBERR793605_R2_filtered.fastq_pairs_R2.fastq > ./treatment/S_reverse.fastq
#未匹配
cat ./host_filtering/raw/SUBERR793603_R1_filtered.fastq_singles.fastq ./host_filtering/raw/SUBERR793604_R1_filtered.fastq_singles.fastq ./host_filtering/raw/SUBERR793605_R1_filtered.fastq_singles.fastq > ./treatment/S_unpaired.fastq
#前端
cat ./host_filtering/raw/SUBERR793606_R1_filtered.fastq_pairs_R1.fastq ./host_filtering/raw/SUBERR793607_R1_filtered.fastq_pairs_R1.fastq ./host_filtering/raw/SUBERR793608_R1_filtered.fastq_pairs_R1.fastq > ./treatment/Sr_forward.fastq
# 后端
cat ./host_filtering/raw/SUBERR793606_R2_filtered.fastq_pairs_R2.fastq ./host_filtering/raw/SUBERR793607_R2_filtered.fastq_pairs_R2.fastq ./host_filtering/raw/SUBERR793608_R2_filtered.fastq_pairs_R2.fastq > ./treatment/Sr_reverse.fastq
#未匹配
cat ./host_filtering/raw/SUBERR793606_R1_filtered.fastq_singles.fastq ./host_filtering/raw/SUBERR793607_R1_filtered.fastq_singles.fastq ./host_filtering/raw/SUBERR793608_R1_filtered.fastq_singles.fastq > ./treatment/Sr_unpaired.fastq
按照处理组装 megahit
作者提到了kmer的设置和适用条件:
# meta '--min-count 2 --k-list 21,41,61,81,99' (generic metagenomes, default)
# meta-sensitive '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more sensitive but slower)
# meta-large '--min-count 2 --k-list 27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
# bulk '--min-count 3 --k-list 31,51,71,91,99 --no-mercy' (experimental, standard bulk sequencing with >= 30x depth)
# single-cell '--min-count 3 --k-list 21,33,55,77,99,121 --merge_level 20,0.96' (experimental, single cell data)
作者的序列是300bp的,也就是长序列,因为宏基因组基本都是150甚至100的,这里300bp很长了。所以作者设置了kmer为:”longreads” : “21,33,55,77,99,127”。具体设置为:”kmers” : “33,55,77,99,127”这里少了21,可能没有必要了吧。
mkdir ./assembly
mkdir ./assembly/megahit/
mkdir ./assembly/megahit/C
megahit --continue --out-dir ./assembly/megahit/C/longreads/ -m 0.9 -t 6 --k-list 33,55,77,99,127 -1 ./treatment/C_forward.fastq -2 ./treatment/C_reverse.fastq -r ./treatment/C_unpaired.fastq
mkdir ./assembly/megahit/S
megahit --continue --out-dir ./assembly/megahit/S/longreads/ -m 0.9 -t 6 --k-list 33,55,77,99,127 -1 ./treatment/S_forward.fastq -2 ./treatment/S_reverse.fastq -r ./treatment/S_unpaired.fastq
mkdir ./assembly/megahit/Sr
megahit --continue --out-dir ./assembly/megahit/Sr/longreads/ -m 0.9 -t 6 --k-list 33,55,77,99,127 -1 ./treatment/Sr_forward.fastq -2 ./treatment/Sr_reverse.fastq -r ./treatment/Sr_unpaired.fastq
# 压缩
gzip -c ./assembly/megahit/C/longreads/final.contigs.fa > ./assembly/megahit/C/longreads/final.contigs.fa.gz
gzip -c ./assembly/megahit/S/longreads/final.contigs.fa > ./assembly/megahit/S/longreads/final.contigs.fa.gz
gzip -c ./assembly/megahit/Sr/longreads/final.contigs.fa > ./assembly/megahit/Sr/longreads/final.contigs.fa.gz
按照处理组装:spades
mkdir ./assembly/spades/
mkdir ./assembly/spades/C
# 33,55,77,99,127
metaspades.py -m 1200 -1 ./treatment/C_forward.fastq -2 ./treatment/C_reverse.fastq -s ./treatment/C_unpaired.fastq --only-assembler -k 33,55,77,99,127 -t 6 -o ./assembly/spades/C/longreads/contigs.fasta --tmp-dir ./assembly/spades/C/longreads/tmp/
mkdir ./assembly/spades/S
# 33,55,77,99,127
metaspades.py -m 1200 -1 ./treatment/S_forward.fastq -2 ./treatment/S_reverse.fastq -s ./treatment/S_unpaired.fastq --only-assembler -k 33,55,77,99,127 -t 6 -o ./assembly/spades/S/longreads/contigs.fasta --tmp-dir ./assembly/spades/S/longreads/tmp/
mkdir ./assembly/spades/Sr
# 33,55,77,99,127
metaspades.py -m 1200 -1 ./treatment/Sr_forward.fastq -2 ./treatment/Sr_reverse.fastq -s ./treatment/Sr_unpaired.fastq --only-assembler -k 33,55,77,99,127 -t 6 -o ./assembly/spades/Sr/longreads/contigs.fasta --tmp-dir ./assembly/spades/Sr/longreads/tmp/
idba 组装
注意这里到对序列转化为fa格式其次这里做的是全部序列的组装。
gunzip -c ./trimming/SUBERR793599_1.fq.gz > ./trimming/SUBERR793599_1.fastq
gunzip -c ./trimming/SUBERR793599_2.fq.gz > ./trimming/SUBERR793599_2.fastq
conda install -c bioconda idba
fq2fa --merge ./trimming/SUBERR793599_1.fastq ./trimming/SUBERR793599_2.fastq ./trimming/SUBERR793599.fasta
for i in ./rawdata_0.01/*_1.fq.gz
do
base=$(basename $i _1.fq.gz)
echo $base
gunzip -c ./trimming/${base}_1.fq.gz > ./trimming/${base}_1.fastq
gunzip -c ./trimming/${base}_2.fq.gz > ./trimming/${base}_2.fastq
fq2fa --merge ./trimming/${base}_1.fastq ./trimming/${base}_2.fastq ./trimming/${base}.fasta
done
# 全部样本合并起来
mkdir ./assembly/idba/
cat ./trimming/*.fasta > ./assembly/idba/input.fasta
idba_ud -r ./assembly/idba/input.fasta -o ./assembly/idba/ --num_threads 6 --mink 21 --maxk 121 --step 20 --pre_correction
quast 质量评估
这里可以都进行质量评价,但是作者只选择了一个,assembly
metaquast.py -o ./assembly/spades/C/longreads/quast --min-contig 0 --max-ref-number 0 -t 6 ./assembly/spades/C/longreads/assembly.fa.gz 2>&1 > ./assembly/spades/C/longreads/quast/quast.log
printf 'spades-C-longreads\t' > ./stats/spades/C/longreads/quast.report.txt && cat ./assembly/spades/C/longreads/quast/report.txt | sed 's/ */:/g' | cut -d : -f 2 | tr '\n' '\t' | cut -f 2- >> ./stats/spades/C/longreads/quast.report.txt
整理megahit按照处理组装的结果
zcat ./assembly/megahit/C/longreads/final.contigs.fa.gz | awk '{{print $1}}' | sed 's/_/contig/' > ./assembly/megahit/C/longreads/assembly.fa
gzip -c ./assembly/megahit/C/longreads/assembly.fa > ./assembly/megahit/C/longreads/assembly.fa.gz
整理spades按照处理组装的结果
cat ./assembly/spades/C/longreads/contigs.fasta/contigs.fasta | awk '{{print $1}}' | sed 's/_/contig/' > ./assembly/spades/C/longreads/contigs.fasta/assembly.fa
gzip -c ./assembly/spades/C/longreads/contigs.fasta/assembly.fa > ./assembly/spades/C/longreads/contigs.fasta/assembly.fa.gz
cat ./assembly/spades/S/longreads/contigs.fasta/contigs.fasta | awk '{{print $1}}' | sed 's/_/contig/' > ./assembly/spades/S/longreads/contigs.fasta/assembly.fa
gzip -c ./assembly/spades/S/longreads/contigs.fasta/assembly.fa > ./assembly/spades/S/longreads/contigs.fasta/assembly.fa.gz
cat ./assembly/spades/Sr/longreads/contigs.fasta/contigs.fasta | awk '{{print $1}}' | sed 's/_/contig/' > ./assembly/spades/Sr/longreads/contigs.fasta/assembly.fa
gzip -c ./assembly/spades/Sr/longreads/contigs.fasta/assembly.fa > ./assembly/spades/Sr/longreads/contigs.fasta/assembly.fa.gz
整理idea按照处理组装的结果
cat ./assembly/idba/scaffold.fa | awk '{{print $1}}' | sed 's/_/contig/' > ./assembly/idba/assembly.fa
gzip -c ./assembly/idba/assembly.fa > ./assembly/idba/assembly.fa
构建索引
bwa index ./assembly/spades/C/longreads/contigs.fasta/assembly.fa > ./assembly/spades/C/longreads/contigs.fasta/bwa-index.log
bwa index ./assembly/spades/S/longreads/contigs.fasta/assembly.fa > ./assembly/spades/S/longreads/contigs.fasta/bwa-index.log
bwa index ./assembly/spades/Sr/longreads/contigs.fasta/assembly.fa > ./assembly/spades/Sr/longreads/contigs.fasta/bwa-index.log
附
quast:合并质量评估结果
# 合并质量评估结果
firstfile = ./assembly/spades/C/longreads/quast/report.txt
cat {firstfile} | sed 's/ */:/g' | cut -d : -f 1 | tr '\n' '\t' | head -n 1 > {output} && printf '\n' >> {output}
# Add the result rows
cat {input.quast} >> {output}
megahit新版本去掉了一些参数
例如下面这两:
—max-read-len 302
—cpu-only
megahit --continue --out-dir ./assembly/megahit/C/longreads/ -m 0.9 --max-read-len 302 --cpu-only -t 6 --k-list 33,55,77,99,127 -1 ./treatment/C_forward.fastq -2 ./treatment/C_reverse.fastq -r ./treatment/C_unpaired.fastq 2> ./assembly/megahit/C/longreads/megahit.log
根际互作生物学研究室 简介
根际互作生物学研究室是沈其荣教授土壤微生物与有机肥团队下的一个关注于根际互作的研究小组。本小组由袁军副教授带领,主要关注:1.植物和微生物互作在抗病过程中的作用;2 环境微生物大数据整合研究;3 环境代谢组及其与微生物过程研究体系开发和应用。团队在过去三年中在 isme J, Microbiome, PCE,SBB,Horticulture Research等期刊上发表了多篇文章。欢迎关注 微生信生物 公众号对本研究小组进行了解。
团队工作及其成果 (点击查看)
了解 交流 合作
团队负责人邮箱 袁军:
junyuan@njau.edu.cn;
成员文涛:
2018203048@njau.edu.cn
团队公众号: