Source code for sugar._io.embl

# (C) 2023, Tom Eulenfeld, MIT license
"""
`EMBL`_ flat file reader for ENA and UniProt

``EMBL`` stands for the European Molecular Biology Laboratory.
``ENA`` stands for the European Nucleotide Archive.
``UniProt`` stands for the Universal Protein Resource.
"""

from sugar.core.fts import Defect, Feature, FeatureList, Location
from sugar.core.seq import Attr, BioSeq, Meta
from sugar._io.util import _add_fmt_doc
from sugar._io.genbank import _parse_locs


def is_embl(f, **kw):
    """
    Check whether an open text file looks like an EMBL flat file

    Reads the first five characters from ``f`` and compares them with the
    ``ID`` line code followed by three blanks, which is how the first
    line of an EMBL record starts.

    :param f: File-like object opened in text mode, it is read from its
        current position and not rewound here
    :param kw: Ignored, accepted for compatibility with other detectors
    :return: ``True`` if the file starts with an EMBL ``ID`` line
    """
    content = f.read(5)
    # Five characters are read, so the literal must be five characters long:
    # the two-letter line code plus the three blanks separating it from data.
    return content == 'ID   '
# Feature files are detected exactly like sequence files.
is_fts_embl = is_embl


# Translation of EMBL line codes to the GenBank-like attribute names
# stored in the ``_embl`` metadata of a record.
_EMBL2GENBANK = {
    'ID': 'locus',
    'AC': 'accession',
    'DT': 'date',
    'DE': 'definition',
    'GN': 'gene_name',
    'KW': 'keywords',
    'DR': 'dbsource',
    'CC': 'comment',
    'OS': 'organism',
    'OC': 'taxonomy',
    'OG': 'organelle',
    'PA': 'parent_accession',
    'PR': 'project',
    # All header lines starting with ``R`` are collected under this one key.
    'R': 'reference',
}
@_add_fmt_doc('read_fts')
def read_fts_embl(f, exclude=()):
    """
    Read EMBL feature records from file into `.FeatureList`

    All records of the file are parsed without their sequences and the
    features of every record are gathered in a single list.

    :param tuple exclude: Tuple of feature names to exclude,
        possible options are ``'translation', 'fts'``,
        sequences are excluded anyway.
    """
    # Sequence data is never needed for features, so always skip it.
    skipped = ('seq',) + exclude
    collected = FeatureList()
    for record in iter_embl(f, exclude=skipped):
        collected.extend(record.fts)
    return collected
@_add_fmt_doc('read')
def iter_embl(f, exclude=(), genbank=True):
    """
    Read EMBL records and sequences from file into `.BioBasket`

    The file is processed line by line, one `.BioSeq` object is yielded
    for each record that is terminated by a ``//`` line.

    :param tuple exclude: Tuple of feature names to exclude,
        possible options are ``'seq', 'translation', 'fts'``
        or line keys (e.g. ``'CC'``).
    :param genbank: By default, use genbank like key names in the
        ``_embl`` meta data, if set to ``False`` will use EMBL two
        character keys, except for references which will be saved in the
        ``_embl.R`` attribute.
    """
    from warnings import warn

    # State of the record currently being parsed, reset after each '//'.
    meta = Meta()
    attrs = Attr()
    fts = []
    fttype = None   # type of the feature being collected, None if no feature is open
    ftmeta = None   # qualifiers of the feature being collected
    locs = None     # raw location string of the feature being collected
    parse = 'header'
    seq_parts = []
    for line in f:
        line = line.rstrip()
        # A pending feature is complete as soon as the feature table ends
        # or a line with a new feature key starts.
        if fttype is not None and (not line.startswith('FT') or
                                   len(line[2:20].strip()) > 0):
            ft = Feature(type=fttype, locs=_parse_locs(locs),
                         meta=Meta(_embl=ftmeta))
            fts.append(ft)
            fttype = None
        # Skip blank lines, excluded line codes, spacers and feature headers.
        if (line.strip() == '' or line[:2] in exclude or
                line[:2] in ('XX', 'FH')):
            continue
        if line.strip() == '//':
            # End of record: create a new sequence object.
            # '//' does not start with 'FT', so any feature was flushed above.
            assert fttype is None
            if 'AC' in attrs:
                meta.id = attrs['AC'].split(';')[0].strip()
                # NOTE(review): meta.fts is only assigned further below, so
                # this branch looks unreachable for a fresh Meta -- confirm
                # whether seqid is propagated elsewhere (e.g. by BioSeq).
                if 'fts' in meta:
                    for ft in meta.fts:
                        ft.meta.seqid = meta.id
            if genbank:
                meta._embl = {gbkey: attrs[emblkey]
                              for emblkey, gbkey in _EMBL2GENBANK.items()
                              if emblkey in attrs}
            else:
                meta._embl = attrs
            meta.fts = FeatureList(fts)
            if 'translation' in exclude and 'fts' in meta:
                for feature in meta.fts:
                    # Best effort, most features carry no translation.
                    try:
                        del feature.meta._embl.translation
                    except (AttributeError, KeyError):
                        pass
            yield BioSeq(''.join(seq_parts).upper(), meta=meta)
            meta = Meta()
            attrs = Attr()
            fts = []
            fttype = None
            ftmeta = None
            locs = None
            parse = 'header'
            seq_parts = []
            continue
        elif line.startswith('FT'):
            parse = 'fts'
        elif line.startswith('SQ'):
            parse = 'origin'

        if parse == 'header':
            # All reference lines (codes starting with R) share the key 'R'.
            key = line[:2] if not line.startswith('R') else 'R'
            line = line[2:].strip()
            if line == '':
                continue
            if key == 'ID':
                # normalize the column padding of the ID line
                line = ' '.join(line.split())
            # Repeated line codes are concatenated into a single entry.
            attrs[key] = line if key not in attrs else attrs[key] + ' ' + line
        elif parse == 'fts':
            if 'fts' in exclude:
                continue
            line = line[2:]
            if len(line[:18].strip()) > 0:
                # Line with a feature key starts a new feature, the rest of
                # the line (if any) is the beginning of its location.
                fttype = line[:18].split()[0]
                ftmeta = {}
                key2 = None
                tokens = line.split(maxsplit=1)
                locs = tokens[1] if len(tokens) > 1 else ''
            elif len(line.strip()) > 0:
                line = line.strip()
                if line.startswith('/'):
                    line = line.removeprefix('/')
                    if '=' in line:
                        # Split at the first '=' only, the value itself may
                        # contain further '=' characters.
                        key2, val = line.split('=', 1)
                        if val.startswith('"'):
                            val = val.strip('"')
                        else:
                            # unquoted values are stored as int if possible
                            try:
                                val = int(val)
                            except ValueError:
                                pass
                        ftmeta[key2] = val
                    else:
                        # qualifier without value
                        ftmeta.setdefault('misc', []).append(line)
                elif key2 is None:
                    # location spanning multiple lines
                    locs = locs + line
                else:
                    # qualifier value spanning multiple lines
                    ftmeta[key2] = ftmeta[key2] + line.strip('"')
        elif parse in ('origin', 'ena', 'uniprot'):
            if ('seq' not in exclude and not line.startswith('SQ') and
                    line.strip() != ''):
                if parse == 'origin':
                    # The flavor is detected on the first sequence line:
                    # blanks at this position indicate the right-aligned
                    # character counts at the end of ENA sequence lines.
                    if line[-7:-5] == '  ':
                        parse = 'ena'
                        try:
                            int(line[-1])
                        except ValueError:
                            warn('Inconsistent EMBL file, please check the read content, contact developers')
                    else:
                        parse = 'uniprot'
                if parse == 'ena':
                    # drop the trailing counter column
                    line = line[:-10]
                seq_parts.append(line.replace(' ', ''))
        else:
            # Unknown parser state, explicit raise so it is not stripped by -O.
            raise AssertionError(f'Unexpected parser state {parse!r}')