I am dealing with a large protein sequence (FASTA) file (>8 GB), and my idea is to create a dictionary where the key and value are the protein ID and the sequence, respectively.
For now I build the dictionary and dump the data using pickle, then try to open it with cPickle (I read that pickle is faster for dumping data and cPickle is faster for loading it). But the main problem here is time: building and dumping the dictionary takes too much time and memory (the PC has 8 GB of memory).
Is there any faster option available for dealing with large files in Python?
Here is my Python code to create the dictionary and dump the data:
from Bio import SeqIO
import pickle

fastaSeq = {}
with open('uniref90.fasta') as fasta_file:
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):
        # UniRef90 IDs look like "UniRef90_P12345"; keep the accession part
        header = seq_record.id
        uniID = header.split('_')[1]
        seqs = str(seq_record.seq)
        fastaSeq[uniID] = seqs

with open('uniref90.obj', 'wb') as f:
    pickle.dump(fastaSeq, f, pickle.HIGHEST_PROTOCOL)
Loading the dictionary and doing some task in a separate Python program:
import cPickle as pickle

seq_dict = pickle.load(open("uniref90.obj", "rb"))
for skey in seq_dict.keys():
    # doing something
Databases are your friend, my son. Unlike a pickled dict, a database on disk lets you look up a single record by key without ever loading the whole 8 GB into memory.
import sqlite3
from Bio import SeqIO

db = sqlite3.connect("./db")
c = db.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS map (k TEXT UNIQUE, v TEXT)''')
db.commit()

def keys(db):
    cursor = db.cursor()
    return cursor.execute("""SELECT k FROM map""").fetchall()

def get(key, db, default=None):
    cursor = db.cursor()
    result = cursor.execute("""SELECT v FROM map WHERE k = ?""", (key,)).fetchone()
    if result is None:
        return default
    return result[0]

def save(key, value, db):
    cursor = db.cursor()
    cursor.execute("""INSERT INTO map VALUES (?, ?)""", (key, value))
    db.commit()  # committing per record is safe but slow; batch commits speed up the initial load

with open('uniref90.fasta') as fasta_file:
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):
        header = seq_record.id
        uniID = header.split('_')[1]
        seqs = str(seq_record.seq)
        save(uniID, seqs, db)
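Once the table is populated, lookups are cheap and memory use stays flat. A minimal sketch of using the helpers above (the accession 'P12345' is a made-up example, not a real key from your file):

# fetch one sequence by accession; returns '' if the key is absent
protein = get('P12345', db, default='')
print(len(protein))

# iterate over all stored IDs without holding the sequences in memory
for (k,) in keys(db):
    pass  # do something with each key

The point of the design is that SQLite indexes the unique k column for you, so each get() is a single indexed lookup on disk instead of an 8 GB unpickle.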