Source code for sugar._io.genbank
# (C) 2023, Tom Eulenfeld, MIT license
"""
`GenBank`_ reader
"""
from sugar.core.fts import Defect, Feature, FeatureList, Location
from sugar.core.seq import Attr, BioSeq, Meta
from sugar._io.util import _add_fmt_doc
[docs]
def is_genbank(f, **kw):
content = f.read(5)
return content.lower() == 'locus'
is_fts_genbank = is_genbank
def _parse_locs(loc: str):
# See https://www.insdc.org/submitting-standards/feature-table/#3.4
if loc.startswith('complement'):
locs = _parse_locs(loc[loc.index('(')+1:loc.rindex(')')])[::-1]
for locobj in locs:
locobj.strand = {'-': '+', '+': '-'}.get(locobj.strand, locobj.strand)
elif loc.startswith(('join', 'order')):
locs = [_parse_locs(subloc.strip())
for subloc in
loc[loc.index('(')+1:loc.rindex(')')].split(',')]
locs = [l for ll in locs for l in ll]
else:
locs = [_parse_single_loc(loc)]
return locs
def _parse_single_loc(loc: str):
if ':' in loc:
from warnings import warn
warn('Found seqid inside GenBank loc field, '
'the seqid is saved in Location.meta._genbank.seqid. '
'Other parts of sugar may ignore this information.')
seqid, loc = loc.split(':')
meta = {'_genbank': {'seqid': seqid}}
else:
meta = None
defect = Defect.NONE
if loc[0] == '<':
defect |= Defect.BEYOND_LEFT
loc = loc[1:]
if '>' in loc:
defect |= Defect.BEYOND_RIGHT
loc = loc.replace('>', '')
if ".." in loc:
splitter = ".."
elif "." in loc:
splitter = "."
defect |= Defect.UNKNOWN_SINGLE_BETWEEN
elif "^" in loc:
splitter = "^"
defect |= Defect.BETWEEN_CONSECUTIVE
else:
# single base
return Location(int(loc)-1, int(loc), defect=defect, meta=meta)
# base range
start, stop = loc.split(splitter)
return Location(int(start)-1, int(stop), defect=defect, meta=meta)
[docs]
@_add_fmt_doc('read_fts')
def read_fts_genbank(f, exclude=()):
"""
Read GenBank feature records from file into `.FeatureList`
:param tuple exclude: Tuple of feature names to exclude,
possible options are ``'translation', 'fts'``,
sequences are excluded anyway.
"""
fts = FeatureList()
for seq in iter_genbank(f, exclude=('seq',) + exclude):
fts.extend(seq.fts)
return fts
[docs]
@_add_fmt_doc('read')
def iter_genbank(f, exclude=()):
"""
Read GenBank records and sequences from file into `.BioBasket`
:param tuple exclude: Tuple of feature names to exclude,
possible options are ``'seq', 'translation', 'fts'``.
"""
# allowed entries in exclude: features, translation, seq
meta = Meta()
attrs = Attr()
fts = []
misc = []
fttype = None
ftmeta = None
locs = None
val = None
key = None
subkey = None
parse = 'header'
seq = ''
for line in f:
line = line.rstrip()
if line.strip() == '':
continue
if fttype is not None and (
line.strip() == '//' or parse == 'fts' and len(line[:20].strip()) > 0):
# create a new feature
ft = Feature(type=fttype, locs=_parse_locs(locs),
meta=Meta(_genbank=ftmeta))
fts.append(ft)
fttype = None
meta.fts = FeatureList(fts)
if line.strip() == '//':
# create a new sequence object
assert len(misc) == 0
assert fttype is None
meta._genbank = attrs
if 'accession' in meta._genbank:
meta.id = meta._genbank.accession.split()[0]
if 'fts' in meta:
for ft in meta.fts:
ft.meta.seqid = meta.id
if 'translation' in exclude and 'fts' in meta:
for feature in meta.fts:
try:
del feature.meta._genbank.translation
except Exception:
pass
yield BioSeq(seq.upper(), meta=meta)
meta = Meta()
attrs = Attr()
fts = []
misc = []
fttype = None
ftmeta = None
locs = None
val = None
key = None
subkey = None
parse = 'header'
seq = ''
continue
if parse in ('header', 'reference'):
if len(line)>0:
if line[:12].strip() != '':
key = line[:12].lower().strip()
if key == 'features':
parse = 'fts'
key = None
continue
try:
val = line.strip().split(maxsplit=1)[1]
except Exception:
val = ''
if key == 'locus':
val = ', '.join(val.split())
if key == 'reference':
parse = 'reference'
if line[0] == ' ' and parse == 'reference':
key = 'reference'
if key == 'organism' and line[:12].strip() == '':
key = 'taxonomy'
attrs[key] = val if key not in attrs else attrs[key] + '; ' + val
elif parse == 'fts':
if 'fts' in exclude:
continue
if len(line[:20].strip()) > 0:
assert fttype is None
key = line[:20].strip().split()[0]
# subkey = None
if key.lower() == 'origin':
parse = 'origin'
meta.fts = FeatureList(fts)
continue
fttype = key
locs = ''
ftmeta = {}
key2 = None
val = line.strip()
try:
val = val.split(maxsplit=1)[1]
except Exception:
pass
else:
locs = val
elif len(line.strip()) > 0:
line = line.strip()
if line.startswith('/'):
line = line.removeprefix('/')
if '=' in line:
key2, val = line.split('=', maxsplit=1)
if not val.startswith('"'):
try:
val = int(val)
except Exception:
pass
else:
val = val.strip('"')
ftmeta[key2] = val
else:
ftmeta.setdefault('misc', []).append(line)
elif key2 is None:
# location spanning multiple lines
locs = locs + line
else:
ftmeta[key2] = ftmeta[key2] + line.strip('"')
elif parse == 'origin':
if 'seq' in exclude:
seq = ''
continue
if len(line) > 10:
seq = seq + line[10:].replace(' ', '')
else:
assert False