# Source code for sugar._io.fasta
# (C) 2024, Tom Eulenfeld, MIT license
"""
`FASTA`_ IO
"""
import re
from sugar import BioSeq
from sugar._io.util import _add_fmt_doc
filename_extensions_fasta = ['fasta', 'fa', 'fna', 'faa', 'fas']
[docs]
def is_fasta(f, **kw):
    """
    Check whether an open text file looks like FASTA

    Reads the first 50 characters from ``f`` and reports whether the first
    non-whitespace character among them is ``'>'``. Additional keyword
    arguments are accepted and ignored.
    """
    head = f.read(50)
    # Trailing whitespace cannot influence a prefix test, so only the
    # leading whitespace needs to be removed.
    return head.lstrip().startswith('>')
def _create_bioseq(datalines, id_, header):
    """
    Build a `.BioSeq` object from the collected lines of one FASTA record

    :param datalines: iterable of sequence strings, concatenated without
        any separator
    :param id_: ID handed to the `.BioSeq` constructor
    :param header: header text, stored at ``seq.meta._fasta.header``
    :return: the newly created `.BioSeq` object
    """
    record = BioSeq(''.join(datalines), id=id_)
    # Create the FASTA-specific metadata entry first, then fill it
    record.meta._fasta = {}
    record.meta._fasta.header = header
    return record
# Character class for a single character allowed in the seq ID:
# anything except comma, pipe, semicolon and whitespace
CHS = r'[^,|;\s]'
# Regex alternatives capturing some of the header layouts listed at
# https://de.wikipedia.org/wiki/FASTA-Format
# Each alternative has exactly one capture group holding the ID, the
# alternatives are tried from left to right.
IDPATTERN = '|'.join((
    rf'[^\s]*gb[:|]({CHS}+)',  # gb:id, gb|id
    rf'[^\s]*(?:emb|dbj|sp|tr|ref|lcl)[|]({CHS}+)',  # xxx|id
    rf'({CHS}+)',  # "id ", "id;", "id|", "id,"
))
def _id_from_header(header):
    """
    Extract the sequence ID from a FASTA header

    The header is matched from its start against ``IDPATTERN``; the first
    capture group that took part in the match is the ID.

    :param str header: header text to inspect
    :return: the ID, or ``None`` for an empty header or if the pattern
        does not match
    """
    if header == '':
        return None
    match = re.match(IDPATTERN, header)
    if match is None:
        return None
    # Pick the first group that actually captured something
    return next((grp for grp in match.groups() if grp is not None), None)
[docs]
@_add_fmt_doc('read')
def iter_fasta(f, comments=None):
    """
    Iterate through a FASTA file and yield `.BioSeq` sequences

    :param list comments: comment lines inside the file are stored in
        the comments list (optional)
    :raises ValueError: if sequence data is encountered before the first
        header line
    """
    id_ = None
    header = None
    data = None  # stays None until the first header line has been seen
    for line in f:
        if line.startswith('>'):
            # A new header closes the record collected so far
            if data is not None:
                yield _create_bioseq(data, id_, header)
            header = line.lstrip('>').strip()
            id_ = _id_from_header(header)
            data = []
        elif line.startswith(';'):  # line is a comment
            if comments is not None:
                comments.append(line)
        else:
            line = line.strip()
            if not line:
                # blank lines carry no sequence data
                continue
            if data is None:
                # Without this check the append below would fail with an
                # opaque AttributeError on None
                raise ValueError(
                    'Invalid FASTA file: sequence data found before the '
                    "first header line starting with '>'"
                )
            data.append(line)
    # Emit the last record, which is not followed by another header
    if data is not None:
        yield _create_bioseq(data, id_, header)
[docs]
@_add_fmt_doc('write')
def append_fasta(seq, f):
    """
    Append a `.BioSeq` sequence to a FASTA file

    The header line consists of the sequence ID (empty if the ID is
    falsy), followed by the header stored at ``seq.meta._fasta.header``
    if present. The sequence data is written on a single line.
    """
    seqid = seq.id or ''
    description = ''
    if '_fasta' in seq.meta and 'header' in seq.meta._fasta:
        # Drop a leading copy of the ID, so that it is not written twice
        remainder = seq.meta._fasta.header.removeprefix(seqid).lstrip()
        # rstrip removes the separating space again if nothing is left
        description = f' {remainder}'.rstrip()
    f.write(f'>{seqid}{description}\n{seq.data}\n')