8.12 读取文件案例
1 求和计算100万个值
# Sum the integers stored one per line in a text file and print the total.
file1 = 'pydata/one_million.csv'
total_num = 0
# The with-statement closes the file even if a line fails to parse.
with open(file1, 'r') as f_object:
    for line in f_object:
        num = line.strip()
        if not num:
            # Skip blank lines (for example a trailing empty line), which
            # int() would reject with a ValueError.
            continue
        total_num += int(num)
print(total_num)
2 计算基因长度
# Sum the lengths of all features whose type column is "gene" in a GFF
# annotation file and print the total.
file1 = "pydata/H37Rv.gff"
total_lens = 0
# The with-statement closes the file even if a line fails to parse.
with open(file1, 'r') as f_object:
    for line in f_object:
        # Lines starting with '#' are header/directive lines, not features.
        if line.startswith('#'):
            continue
        # GFF columns are tab-delimited and a field may itself contain
        # spaces, so split on tabs only rather than on any whitespace.
        new_line = line.strip().split('\t')
        if len(new_line) < 5:
            # Blank or non-feature line: there are no start/end columns.
            continue
        if new_line[2] == 'gene':
            # Columns 4 and 5 hold the start and end coordinates; both ends
            # are counted, hence the + 1.
            gene_len = int(new_line[4]) - int(new_line[3]) + 1
            total_lens += gene_len
print(total_lens)
3 读取压缩格式
# Print the read ID of every record in a gzip-compressed FASTQ file.
import gzip

file1 = "pydata/clean.1.fq.gz"
# Text mode ('rt') decodes each line once; utf-8 matches the default of the
# bytes.decode() calls this replaces. The with-statement closes the file.
with gzip.open(file1, 'rt', encoding='utf-8') as f_object:
    for line_number, line in enumerate(f_object):
        # Assumes the usual four-line FASTQ layout (header, sequence, '+',
        # quality). Only the first line of each group is a header; checking
        # the position matters because a quality line may also begin with
        # '@' and would otherwise be reported as a read ID.
        if line_number % 4 == 0 and line.startswith('@'):
            # Keep the first whitespace-separated token, minus the '@'.
            fastqid = line.strip().split()[0][1:]
            print(fastqid)
4 写文件
当数据处理完成之后，需要将结果写入到文件中。在 Python 中写文件，首先也要打开一个文件，可以是空文件，也可以是已有内容的文件：如果模式选择“w”，则会先将文件清空再写入（文件不存在时会新建）；如果模式选择“a”，则在文件末尾追加写入。
# Write ten numbered greeting lines to output.txt.
# Mode 'w' truncates an existing file before writing; the with-statement
# flushes and closes the file even if a write fails.
with open("output.txt", 'w') as file_output:
    for i in range(10):
        file_output.write(f"{i} Hello,world!\n")
5 过滤blast比对结果
# Copy to blast_filter.out only those hits from a tab-separated BLAST result
# file whose third column is >= 50 and whose fourth column is >= 100.
# NOTE(review): in BLAST tabular (m8) output these columns are percent
# identity and alignment length -- confirm against how the file was produced.
file1 = 'pydata/blast_m8.out'
# Both files are closed by the with-statement, even if a line fails to parse.
with open(file1, 'r') as f_object, open("blast_filter.out", 'w') as f_output:
    for line in f_object:
        new_line = line.strip().split("\t")
        if len(new_line) < 4:
            # Blank or truncated line: nothing to evaluate.
            continue
        identity = float(new_line[2])
        align_len = float(new_line[3])
        # Use the logical 'and'. The bitwise '&' binds tighter than '>=', so
        # the previous test reduced to "identity >= 0" and kept every hit.
        if identity >= 50 and align_len >= 100:
            f_output.write(line)
6 fastq转换为fasta
# Convert a gzip-compressed FASTQ file to FASTA, printing the result to
# standard output.
import gzip

file1 = "pydata/clean.1.fq.gz"
# The with-statement closes the file even if the loop is interrupted.
with gzip.open(file1, 'rt') as f_object:
    # Each iteration consumes one four-line record: the loop variable is the
    # header line and the three readline() calls take the rest.
    for line in f_object:
        name = line.strip()[1:]          # header without the leading '@'
        line2 = f_object.readline()      # sequence line
        f_object.readline()              # '+' separator line, not needed
        f_object.readline()              # quality line, not needed
        # Built inline so the builtin id() is no longer shadowed.
        print(">" + name)
        print(line2.strip())
7 开发流程
# Print, for every sample listed in the sample sheet, the shell commands for
# quality control, read filtering and genome assembly.
# Each non-blank line is expected to hold three whitespace-separated columns:
# a sample name followed by the two FASTQ paths passed to fastp -i / -I.
file1 = "pydata/sample.txt"
# The with-statement closes the sample sheet even if a line is malformed.
with open(file1, 'r') as f_object:
    for line in f_object:
        new_line = line.strip().split()
        if not new_line:
            # Skip blank lines (for example a trailing empty line).
            continue
        sample, read1, read2 = new_line[0], new_line[1], new_line[2]
        # Quality control
        print(f"#Sample {sample}")
        print(f"mkdir {sample}; fastqc -f fastq -o {sample} {read1} {read2}")
        # Read filtering
        print(f"fastp -i {read1} -I {read2} -o {sample}_clean.1.fq.gz -O {sample}_clean.2.fq.gz -z 4 -q 20 -u 30 -n 10 ")
        # Genome assembly
        # NOTE(review): this path has no leading '/', so it is resolved
        # relative to the working directory -- confirm whether '/ifs1/...'
        # was intended.
        print(f"ifs1/Software/biosoft/SPAdes-3.14.0-Linux/bin/spades.py -o {sample}_assembly -t 24 -1 {sample}_clean.1.fq.gz -2 {sample}_clean.2.fq.gz")
        print("")