8.12 读取文件案例

1 求和计算100万个值

# Sum the integers stored one per line in the input file and print the total.
file1 = 'pydata/one_million.csv'

total_num = 0
# The context manager closes the file even if a line fails to parse.
with open(file1, 'r') as f_object:
    for line in f_object:
        num = line.strip()
        if not num:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no value and would make int() raise ValueError.
            continue
        total_num += int(num)
print(total_num)

2 计算基因长度

# Add up the lengths of all rows whose third column is "gene" in a GFF file
# and print the total. Columns 4 and 5 are read as the start and end
# coordinates, both inclusive, hence the "+ 1".
file1 = "pydata/H37Rv.gff"

total_lens = 0
# The context manager closes the file even if a row fails to parse.
with open(file1, 'r') as f_object:
    for line in f_object:
        # Lines starting with '#' are header/directive lines, not features.
        if line.startswith('#'):
            continue
        new_line = line.strip().split()
        if len(new_line) < 5:
            # Blank or truncated rows (for instance sequence lines of an
            # embedded FASTA section) have no type/start/end columns.
            continue
        if new_line[2] == 'gene':
            gene_len = int(new_line[4]) - int(new_line[3]) + 1
            total_lens += gene_len

print(total_lens)

3 读取压缩格式

# Read a gzip-compressed FASTQ file and print the ID of every read.
file1 = "pydata/clean.1.fq.gz"

import gzip

# Text mode decodes each line once while reading (UTF-8, matching the
# bytes.decode() default used before); the context manager closes the file.
with gzip.open(file1, 'rt', encoding='utf-8') as f_object:
    for line_number, line in enumerate(f_object):
        # A FASTQ record spans four lines and only the first one is the
        # header. Quality lines may also begin with '@', so the position
        # within the record is checked in addition to the prefix.
        if line_number % 4 == 0 and line.startswith('@'):
            # The ID is the first whitespace-separated token without '@'.
            fastqid = line.strip().split()[0][1:]
            print(fastqid)

4 写文件

当数据处理完成之后，需要将结果写入到文件中。在 Python 中写文件，首先也要打开一个文件，可以是空文件，也可以是已有内容的文件：如果模式选择“w”，则会先将文件清空再写入（文件不存在时会新建）；如果模式选择“a”，则在文件末尾追加写入。

# Write ten numbered greeting lines to output.txt.
# Mode 'w' truncates an existing file; the context manager flushes and
# closes the file even if a write fails.
with open("output.txt", 'w') as file_output:
    for i in range(10):
        file_output.write(f"{i} Hello,world!\n")

5 过滤blast比对结果

# Filter a tab-separated BLAST result table: copy to blast_filter.out only
# the rows whose third column is >= 50 and whose fourth column is >= 100
# (presumably percent identity and alignment length in the m8 layout).
file1 = 'pydata/blast_m8.out'

# Both files are closed automatically, even if a row fails to parse.
with open(file1, 'r') as f_object, open("blast_filter.out", 'w') as f_output:
    for line in f_object:
        new_line = line.strip().split("\t")
        if len(new_line) < 4:
            # Blank or truncated row: there is nothing to compare.
            continue
        # Use `and`, not `&`: the bitwise operator binds tighter than `>=`,
        # so `a >= 50 & (b >= 100)` is evaluated as `a >= (50 & bool)`,
        # i.e. `a >= 0`, which lets every row through.
        if float(new_line[2]) >= 50 and float(new_line[3]) >= 100:
            f_output.write(line)

6 fastq转换为fasta

# Convert a gzip-compressed FASTQ file to FASTA on standard output:
# for every four-line record print ">" + header (without its leading
# character) followed by the sequence line.
file1 = "pydata/clean.1.fq.gz"

import gzip

# The context manager closes the file even if processing stops early.
with gzip.open(file1, 'rt') as f_object:
    for line in f_object:
        if not line.strip():
            # A blank line (e.g. at the end of the file) is not a record
            # header; skipping it avoids printing an empty ">" entry.
            continue
        # Drop the first character of the header line ('@' in FASTQ).
        name = line.strip()[1:]
        fasta_header = ">" + name
        # The loop consumed the header; read the rest of the record here.
        sequence = f_object.readline()
        f_object.readline()  # separator line, not needed for FASTA
        f_object.readline()  # quality line, not needed for FASTA
        print(fasta_header)
        print(sequence.strip())

7 开发流程

# Print, for every sample listed in a plain-text sample sheet, the shell
# commands for quality control, read filtering and genome assembly.
# Each row is split on whitespace and read as: sample name, first read
# file, second read file.
file1 = "pydata/sample.txt"

# The context manager closes the sheet even if a row cannot be processed.
with open(file1, 'r') as f_object:
    for line in f_object:
        new_line = line.strip().split()
        if len(new_line) < 3:
            # Blank or incomplete row: the commands need all three fields.
            continue
        sample, read1, read2 = new_line[:3]
        # Quality control
        print(f"#Sample  {sample}")
        print(f"mkdir {sample}; fastqc -f fastq -o {sample}  {read1} {read2}")
        # Read filtering
        print(f"fastp -i {read1} -I {read2}  -o {sample}_clean.1.fq.gz -O {sample}_clean.2.fq.gz -z 4 -q 20 -u 30 -n 10 ")
        # Genome assembly
        # NOTE(review): the spades.py path has no leading "/", so it is
        # resolved relative to the working directory - confirm intended.
        print(f"ifs1/Software/biosoft/SPAdes-3.14.0-Linux/bin/spades.py -o {sample}_assembly -t 24 -1 {sample}_clean.1.fq.gz -2 {sample}_clean.2.fq.gz")
        print("")