9-跟着science学习宏基因组-三种组装方法混样组装(megahit/spades/idba)

全部样本组装;核糖体基因识别;序列回帖;单个样本组装;核糖体基因识别;序列回帖;

合并全部样本

将全部样本的前端和后端分别合并,未匹配的也合并。这里我们可以注意到明没有添加序列标签,因为做组装也不需要。

zcat `echo ./trimming/*_1.fq.gz | xargs -n1 | sort | xargs` | gzip > ./trimming/R1.fastq.gz
zcat `echo ./trimming/*_2.fq.gz | xargs -n1 | sort | xargs` | gzip > ./trimming/R2.fastq.gz

gzip ./trimming/*_unpaired.fq

zcat `echo ./trimming/*_unpaired.fq.gz | xargs -n1 | sort | xargs` | gzip > ./trimming/singular.fastq.gz

按照处理合并

这个步骤在教程2中已经做过了

#按照处理合并序列,建立一个文件夹
mkdir ./treatment
#合并同一个处理所有前端序列
cat ./host_filtering/raw/SUBERR793599_R1_filtered.fastq_pairs_R1.fastq ./host_filtering/raw/SUBERR793600_R1_filtered.fastq_pairs_R1.fastq ./host_filtering/raw/SUBERR793601_R1_filtered.fastq_pairs_R1.fastq > ./treatment/C_forward.fastq
# 合并同一个处理所有后端序列
cat ./host_filtering/raw/SUBERR793599_R2_filtered.fastq_pairs_R2.fastq ./host_filtering/raw/SUBERR793600_R2_filtered.fastq_pairs_R2.fastq ./host_filtering/raw/SUBERR793601_R2_filtered.fastq_pairs_R2.fastq > ./treatment/C_reverse.fastq
#合并双端未匹配上的序列
cat ./host_filtering/raw/SUBERR793599_R1_filtered.fastq_singles.fastq ./host_filtering/raw/SUBERR793600_R1_filtered.fastq_singles.fastq ./host_filtering/raw/SUBERR793601_R1_filtered.fastq_singles.fastq > ./treatment/C_unpaired.fastq

#前端
cat ./host_filtering/raw/SUBERR793603_R1_filtered.fastq_pairs_R1.fastq ./host_filtering/raw/SUBERR793604_R1_filtered.fastq_pairs_R1.fastq ./host_filtering/raw/SUBERR793605_R1_filtered.fastq_pairs_R1.fastq > ./treatment/S_forward.fastq
# 后端
cat ./host_filtering/raw/SUBERR793603_R2_filtered.fastq_pairs_R2.fastq ./host_filtering/raw/SUBERR793604_R2_filtered.fastq_pairs_R2.fastq ./host_filtering/raw/SUBERR793605_R2_filtered.fastq_pairs_R2.fastq > ./treatment/S_reverse.fastq
#未匹配
cat ./host_filtering/raw/SUBERR793603_R1_filtered.fastq_singles.fastq ./host_filtering/raw/SUBERR793604_R1_filtered.fastq_singles.fastq ./host_filtering/raw/SUBERR793605_R1_filtered.fastq_singles.fastq > ./treatment/S_unpaired.fastq

#前端
cat ./host_filtering/raw/SUBERR793606_R1_filtered.fastq_pairs_R1.fastq ./host_filtering/raw/SUBERR793607_R1_filtered.fastq_pairs_R1.fastq ./host_filtering/raw/SUBERR793608_R1_filtered.fastq_pairs_R1.fastq > ./treatment/Sr_forward.fastq
# 后端
cat ./host_filtering/raw/SUBERR793606_R2_filtered.fastq_pairs_R2.fastq ./host_filtering/raw/SUBERR793607_R2_filtered.fastq_pairs_R2.fastq ./host_filtering/raw/SUBERR793608_R2_filtered.fastq_pairs_R2.fastq > ./treatment/Sr_reverse.fastq
#未匹配
cat ./host_filtering/raw/SUBERR793606_R1_filtered.fastq_singles.fastq ./host_filtering/raw/SUBERR793607_R1_filtered.fastq_singles.fastq ./host_filtering/raw/SUBERR793608_R1_filtered.fastq_singles.fastq > ./treatment/Sr_unpaired.fastq

按照处理组装 megahit

作者提到了kmer的设置和适用条件:


# meta '--min-count 2 --k-list 21,41,61,81,99' (generic metagenomes, default)
# meta-sensitive '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more sensitive but slower)
# meta-large '--min-count 2 --k-list 27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
# bulk '--min-count 3 --k-list 31,51,71,91,99 --no-mercy' (experimental, standard bulk sequencing with >= 30x depth)
# single-cell '--min-count 3 --k-list 21,33,55,77,99,121 --merge_level 20,0.96' (experimental, single cell data)

作者的序列是300bp的,也就是长序列,因为宏基因组基本都是150甚至100的,这里300bp很长了。所以作者设置了kmer为:”longreads” : “21,33,55,77,99,127”。具体设置为:”kmers” : “33,55,77,99,127”这里少了21,可能没有必要了吧。

mkdir ./assembly
mkdir ./assembly/megahit/
mkdir ./assembly/megahit/C

megahit --continue --out-dir ./assembly/megahit/C/longreads/ -m 0.9 -t 6 --k-list 33,55,77,99,127 -1 ./treatment/C_forward.fastq -2 ./treatment/C_reverse.fastq -r ./treatment/C_unpaired.fastq
mkdir ./assembly/megahit/S
megahit --continue --out-dir ./assembly/megahit/S/longreads/ -m 0.9 -t 6 --k-list 33,55,77,99,127 -1 ./treatment/S_forward.fastq -2 ./treatment/S_reverse.fastq -r ./treatment/S_unpaired.fastq
mkdir ./assembly/megahit/Sr
megahit --continue --out-dir ./assembly/megahit/Sr/longreads/ -m 0.9 -t 6 --k-list 33,55,77,99,127 -1 ./treatment/Sr_forward.fastq -2 ./treatment/Sr_reverse.fastq -r ./treatment/Sr_unpaired.fastq

# 压缩
gzip -c ./assembly/megahit/C/longreads/final.contigs.fa > ./assembly/megahit/C/longreads/final.contigs.fa.gz
gzip -c ./assembly/megahit/S/longreads/final.contigs.fa > ./assembly/megahit/S/longreads/final.contigs.fa.gz
gzip -c ./assembly/megahit/Sr/longreads/final.contigs.fa > ./assembly/megahit/Sr/longreads/final.contigs.fa.gz

按照处理组装:spades

mkdir ./assembly/spades/
mkdir ./assembly/spades/C
# 33,55,77,99,127
metaspades.py -m 1200 -1 ./treatment/C_forward.fastq -2 ./treatment/C_reverse.fastq -s ./treatment/C_unpaired.fastq --only-assembler -k 33,55,77,99,127 -t 6 -o ./assembly/spades/C/longreads/contigs.fasta --tmp-dir ./assembly/spades/C/longreads/tmp/

mkdir ./assembly/spades/S
# 33,55,77,99,127
metaspades.py -m 1200 -1 ./treatment/S_forward.fastq -2 ./treatment/S_reverse.fastq -s ./treatment/S_unpaired.fastq --only-assembler -k 33,55,77,99,127 -t 6 -o ./assembly/spades/S/longreads/contigs.fasta --tmp-dir ./assembly/spades/S/longreads/tmp/

mkdir ./assembly/spades/Sr
# 33,55,77,99,127
metaspades.py -m 1200 -1 ./treatment/Sr_forward.fastq -2 ./treatment/Sr_reverse.fastq -s ./treatment/Sr_unpaired.fastq --only-assembler -k 33,55,77,99,127 -t 6 -o ./assembly/spades/Sr/longreads/contigs.fasta --tmp-dir ./assembly/spades/Sr/longreads/tmp/

idba 组装

注意这里到对序列转化为fa格式其次这里做的是全部序列的组装。

gunzip -c ./trimming/SUBERR793599_1.fq.gz > ./trimming/SUBERR793599_1.fastq
gunzip -c ./trimming/SUBERR793599_2.fq.gz > ./trimming/SUBERR793599_2.fastq

conda install -c bioconda idba

fq2fa --merge ./trimming/SUBERR793599_1.fastq ./trimming/SUBERR793599_2.fastq ./trimming/SUBERR793599.fasta

for i in ./rawdata_0.01/*_1.fq.gz
do
base=$(basename $i _1.fq.gz)
echo $base
gunzip -c ./trimming/${base}_1.fq.gz > ./trimming/${base}_1.fastq
gunzip -c ./trimming/${base}_2.fq.gz > ./trimming/${base}_2.fastq

fq2fa --merge ./trimming/${base}_1.fastq ./trimming/${base}_2.fastq ./trimming/${base}.fasta

done

# 全部样本合并起来
mkdir ./assembly/idba/
cat ./trimming/*.fasta > ./assembly/idba/input.fasta

idba_ud -r ./assembly/idba/input.fasta -o ./assembly/idba/ --num_threads 6 --mink 21 --maxk 121 --step 20 --pre_correction

quast 质量评估

这里可以都进行质量评价,但是作者只选择了一个,assembly

metaquast.py -o ./assembly/spades/C/longreads/quast --min-contig 0 --max-ref-number 0 -t 6 ./assembly/spades/C/longreads/assembly.fa.gz 2>&1 > ./assembly/spades/C/longreads/quast/quast.log

printf 'spades-C-longreads\t' > ./stats/spades/C/longreads/quast.report.txt && cat ./assembly/spades/C/longreads/quast/report.txt | sed 's/ */:/g' | cut -d : -f 2 | tr '\n' '\t' | cut -f 2- >> ./stats/spades/C/longreads/quast.report.txt

整理megahit按照处理组装的结果

zcat ./assembly/megahit/C/longreads/final.contigs.fa.gz | awk '{{print $1}}' | sed 's/_/contig/' > ./assembly/megahit/C/longreads/assembly.fa

gzip -c ./assembly/megahit/C/longreads/assembly.fa > ./assembly/megahit/C/longreads/assembly.fa.gz

整理spades按照处理组装的结果

cat ./assembly/spades/C/longreads/contigs.fasta/contigs.fasta | awk '{{print $1}}' | sed 's/_/contig/' > ./assembly/spades/C/longreads/contigs.fasta/assembly.fa
gzip -c ./assembly/spades/C/longreads/contigs.fasta/assembly.fa > ./assembly/spades/C/longreads/contigs.fasta/assembly.fa.gz

cat ./assembly/spades/S/longreads/contigs.fasta/contigs.fasta | awk '{{print $1}}' | sed 's/_/contig/' > ./assembly/spades/S/longreads/contigs.fasta/assembly.fa
gzip -c ./assembly/spades/S/longreads/contigs.fasta/assembly.fa > ./assembly/spades/S/longreads/contigs.fasta/assembly.fa.gz

cat ./assembly/spades/Sr/longreads/contigs.fasta/contigs.fasta | awk '{{print $1}}' | sed 's/_/contig/' > ./assembly/spades/Sr/longreads/contigs.fasta/assembly.fa
gzip -c ./assembly/spades/Sr/longreads/contigs.fasta/assembly.fa > ./assembly/spades/Sr/longreads/contigs.fasta/assembly.fa.gz

整理idea按照处理组装的结果

cat ./assembly/idba/scaffold.fa | awk '{{print $1}}' | sed 's/_/contig/' > ./assembly/idba/assembly.fa
gzip -c ./assembly/idba/assembly.fa > ./assembly/idba/assembly.fa

构建索引

bwa index ./assembly/spades/C/longreads/contigs.fasta/assembly.fa > ./assembly/spades/C/longreads/contigs.fasta/bwa-index.log

bwa index ./assembly/spades/S/longreads/contigs.fasta/assembly.fa > ./assembly/spades/S/longreads/contigs.fasta/bwa-index.log

bwa index ./assembly/spades/Sr/longreads/contigs.fasta/assembly.fa > ./assembly/spades/Sr/longreads/contigs.fasta/bwa-index.log

quast:合并质量评估结果

# 合并质量评估结果
firstfile = ./assembly/spades/C/longreads/quast/report.txt
cat {firstfile} | sed 's/ */:/g' | cut -d : -f 1 | tr '\n' '\t' | head -n 1 > {output} && printf '\n' >> {output}
# Add the result rows
cat {input.quast} >> {output}

megahit新版本去掉了一些参数

例如下面这两:

—max-read-len 302

—cpu-only

megahit --continue --out-dir ./assembly/megahit/C/longreads/ -m 0.9 --max-read-len 302 --cpu-only -t 6 --k-list 33,55,77,99,127 -1 ./treatment/C_forward.fastq -2 ./treatment/C_reverse.fastq -r ./treatment/C_unpaired.fastq 2> ./assembly/megahit/C/longreads/megahit.log

根际互作生物学研究室 简介

根际互作生物学研究室是沈其荣教授土壤微生物与有机肥团队下的一个关注于根际互作的研究小组。本小组由袁军副教授带领,主要关注:1.植物和微生物互作在抗病过程中的作用;2 环境微生物大数据整合研究;3 环境代谢组及其与微生物过程研究体系开发和应用。团队在过去三年中在 isme J, Microbiome, PCE,SBB,Horticulture Research等期刊上发表了多篇文章。欢迎关注 微生信生物 公众号对本研究小组进行了解。

团队工作及其成果 (点击查看)

了解 交流 合作

  • 团队负责人邮箱 袁军:

    junyuan@njau.edu.cn;

    成员文涛:

    2018203048@njau.edu.cn

  • 团队公众号:

(0)

相关推荐