Source code for sugar._io.genbank

# (C) 2023, Tom Eulenfeld, MIT license
"""
`GenBank`_ reader
"""

from sugar.core.fts import Defect, Feature, FeatureList, Location
from sugar.core.seq import Attr, BioSeq, Meta
from sugar._io.util import _add_fmt_doc


[docs] def is_genbank(f, **kw): content = f.read(5) return content.lower() == 'locus'
is_fts_genbank = is_genbank def _parse_locs(loc: str): # See https://www.insdc.org/submitting-standards/feature-table/#3.4 if loc.startswith('complement'): locs = _parse_locs(loc[loc.index('(')+1:loc.rindex(')')])[::-1] for locobj in locs: locobj.strand = {'-': '+', '+': '-'}.get(locobj.strand, locobj.strand) elif loc.startswith(('join', 'order')): locs = [_parse_locs(subloc.strip()) for subloc in loc[loc.index('(')+1:loc.rindex(')')].split(',')] locs = [l for ll in locs for l in ll] else: locs = [_parse_single_loc(loc)] return locs def _parse_single_loc(loc: str): if ':' in loc: from warnings import warn warn('Found seqid inside GenBank loc field, ' 'the seqid is saved in Location.meta._genbank.seqid. ' 'Other parts of sugar may ignore this information.') seqid, loc = loc.split(':') meta = {'_genbank': {'seqid': seqid}} else: meta = None defect = Defect.NONE if loc[0] == '<': defect |= Defect.BEYOND_LEFT loc = loc[1:] if '>' in loc: defect |= Defect.BEYOND_RIGHT loc = loc.replace('>', '') if ".." in loc: splitter = ".." elif "." in loc: splitter = "." defect |= Defect.UNKNOWN_SINGLE_BETWEEN elif "^" in loc: splitter = "^" defect |= Defect.BETWEEN_CONSECUTIVE else: # single base return Location(int(loc)-1, int(loc), defect=defect, meta=meta) # base range start, stop = loc.split(splitter) return Location(int(start)-1, int(stop), defect=defect, meta=meta)
[docs] @_add_fmt_doc('read_fts') def read_fts_genbank(f, exclude=()): """ Read GenBank feature records from file into `.FeatureList` :param tuple exclude: Tuple of feature names to exclude, possible options are ``'translation', 'fts'``, sequences are excluded anyway. """ fts = FeatureList() for seq in iter_genbank(f, exclude=('seq',) + exclude): fts.extend(seq.fts) return fts
[docs] @_add_fmt_doc('read') def iter_genbank(f, exclude=()): """ Read GenBank records and sequences from file into `.BioBasket` :param tuple exclude: Tuple of feature names to exclude, possible options are ``'seq', 'translation', 'fts'``. """ # allowed entries in exclude: features, translation, seq meta = Meta() attrs = Attr() fts = [] misc = [] fttype = None ftmeta = None locs = None val = None key = None subkey = None parse = 'header' seq = '' for line in f: line = line.rstrip() if line.strip() == '': continue if fttype is not None and ( line.strip() == '//' or parse == 'fts' and len(line[:20].strip()) > 0): # create a new feature ft = Feature(type=fttype, locs=_parse_locs(locs), meta=Meta(_genbank=ftmeta)) fts.append(ft) fttype = None meta.fts = FeatureList(fts) if line.strip() == '//': # create a new sequence object assert len(misc) == 0 assert fttype is None meta._genbank = attrs if 'accession' in meta._genbank: meta.id = meta._genbank.accession.split()[0] if 'fts' in meta: for ft in meta.fts: ft.meta.seqid = meta.id if 'translation' in exclude and 'fts' in meta: for feature in meta.fts: try: del feature.meta._genbank.translation except Exception: pass yield BioSeq(seq.upper(), meta=meta) meta = Meta() attrs = Attr() fts = [] misc = [] fttype = None ftmeta = None locs = None val = None key = None subkey = None parse = 'header' seq = '' continue if parse in ('header', 'reference'): if len(line)>0: if line[:12].strip() != '': key = line[:12].lower().strip() if key == 'features': parse = 'fts' key = None continue try: val = line.strip().split(maxsplit=1)[1] except Exception: val = '' if key == 'locus': val = ', '.join(val.split()) if key == 'reference': parse = 'reference' if line[0] == ' ' and parse == 'reference': key = 'reference' if key == 'organism' and line[:12].strip() == '': key = 'taxonomy' attrs[key] = val if key not in attrs else attrs[key] + '; ' + val elif parse == 'fts': if 'fts' in exclude: continue if len(line[:20].strip()) > 0: assert fttype is None key = line[:20].strip().split()[0] # subkey = None if key.lower() == 'origin': parse = 'origin' meta.fts = FeatureList(fts) continue fttype = key locs = '' ftmeta = {} key2 = None val = line.strip() try: val = val.split(maxsplit=1)[1] except Exception: pass else: locs = val elif len(line.strip()) > 0: line = line.strip() if line.startswith('/'): line = line.removeprefix('/') if '=' in line: key2, val = line.split('=', maxsplit=1) if not val.startswith('"'): try: val = int(val) except Exception: pass else: val = val.strip('"') ftmeta[key2] = val else: ftmeta.setdefault('misc', []).append(line) elif key2 is None: # location spanning multiple lines locs = locs + line else: ftmeta[key2] = ftmeta[key2] + line.strip('"') elif parse == 'origin': if 'seq' in exclude: seq = '' continue if len(line) > 10: seq = seq + line[10:].replace(' ', '') else: assert False