Python snippets for biology
Requires BioPython to be installed
Open common formats:
.gbk / .gbf / .gb
import SeqIO
with open('/my/path/file.gb','r') as file_handle:
record_dict = SeqIO.to_dict(SeqIO.parse(file_handle, 'gb'))
gbkFile = record_dict[list(record_dict.keys())[0]]
# the above is if there is only 1 record in the GBK file
# if multiple files are in the record, such as a genome&plas,
# this will only extract the first record
.fa / .fasta / .fna
import SeqIO
fastaList = list(SeqIO.parse("path/file.fasta", "fasta"))
Write common formats:
.gbk / .gbf / .gb
import SeqIO
SeqIO.write(seqRecordObj_or_list,'/my/path/file.gbk', 'gb')
.fa / .fasta / .fna
import SeqIO
SeqIO.write(seqRecordObj_or_list,'/my/path/file.fasta', 'fasta')
local BLASTing from Python
Also requires NCBI BLAST command line software, a local BLAST database, and Pandas
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import subprocess
from tempfile import NamedTemporaryFile
import pandas as pd
def BLAST(seq, db = 'path/to/db', Type = "blastn"):
# 'seq' is a sequence (as a str) of a protein or nucleotide sequence
# 'db' points to location of local BLAST database
# 'type' specifies the type of BLAST (e.g. 'n', 'p', 'x', etc)
query = NamedTemporaryFile()
tmp = NamedTemporaryFile()
SeqIO.write(SeqRecord(Seq(seq), id="temp"), query.name, "fasta")
flags = 'qstart qend sseqid sframe pident slen sseq length sstart send qlen'
# 'flags' specifies the specific outputs for 'output format 6' in the BLAST CL software
extras = "-max_target_seqs 20000 -culling_limit 10 -perc_identity 75"
# 'extras' are further flags that can be called on the CL
subprocess.call( #the actual CL BLAST
(f'{Type} -query {query.name} -out {tmp.name} '
f'-db {db} {extras} -outfmt "6 {flags}"'),
shell=True)
with open(tmp.name, "r") as file_handle: #opens BLAST file
align = file_handle.readlines()
tmp.close()
query.close()
df = pd.DataFrame([ele.split() for ele in align], columns = flags.split())
df = df.apply(pd.to_numeric, errors='ignore')
# puts the output of BLAST into a tidy Pandas dataframe
return df