Source code for sugar._io.embl

# (C) 2023, Tom Eulenfeld, MIT license
"""
`EMBL`_ flat file reader for ENA and UniProt

``EMBL`` stands for the European Molecular Biology Laboratory.
``ENA`` stands for the European Nucleotide Archive.
``UniProt`` stands for the Universal Protein Database.
"""

from sugar.core.fts import Defect, Feature, FeatureList, Location
from sugar.core.seq import Attr, BioSeq, Meta
from sugar._io.util import _add_fmt_doc
from sugar._io.genbank import _parse_locs



[docs]
def is_embl(f, **kw):
    content = f.read(5)
    return content == 'ID   '



is_fts_embl = is_embl


_EMBL2GENBANK = {
    'ID': 'locus',
    'AC': 'accession',
    'DT': 'date',
    'DE': 'definition',
    'GN': 'gene_name',
    'KW': 'keywords',
    'DR': 'dbsource',
    'CC': 'comment',
    'OS': 'organism',
    'OC': 'taxonomy',
    'OG': 'organelle',
    'PA': 'parent_accession',
    'PR': 'project',
    'R': 'reference',
    }



[docs]
@_add_fmt_doc('read_fts')
def read_fts_embl(f, exclude=()):
    """
    Read EMBL feature records from file into `.FeatureList`

    :param tuple exclude: Tuple of feature names to exclude,
        possible options are ``'translation', 'fts'``,
        sequences are excluded anyway.
    """
    fts = FeatureList()
    for seq in iter_embl(f, exclude=('seq',) + exclude):
        fts.extend(seq.fts)
    return fts




[docs]
@_add_fmt_doc('read')
def iter_embl(f, exclude=(), genbank=True):
    """
    Read EMBL records and sequences from file into `.BioBasket`

    :param tuple exclude: Tuple of feature names to exclude,
        possible options are ``'seq', 'translation', 'fts'``
        or line keys (e.g. ``'CC'``).
    :param genbank: By default, use genbank like key names in the ``_embl`` meta data,
        if set to ``False`` will use EMBL two character keys, except for references which will be
        saved in the ``_embl.R`` attribute.
    """
    # allowed entries in exclude: features, translation, seq
    meta = Meta()
    attrs = Attr()
    fts = []
    misc = []
    fttype = None
    ftmeta = None
    locs = None
    val = None
    key = None
    parse = 'header'
    seq = ''
    for line in f:
        line = line.rstrip()
        if fttype is not None and (not line.startswith('FT') or len(line[2:20].strip()) > 0):
            # create a new feature
            ft = Feature(type=fttype, locs=_parse_locs(locs),
                            meta=Meta(_embl=ftmeta))
            fts.append(ft)
            fttype = None
        if line.strip() == '' or line[:2] in exclude or line[:2] in ('XX', 'FH'):
            continue
        if line.strip() == '//':
            # create a new sequence object
            assert len(misc) == 0
            assert fttype is None
            if 'AC' in attrs:
                meta.id = attrs['AC'].split(';')[0].strip()
                if 'fts' in meta:
                    for ft in meta.fts:
                        ft.meta.seqid = meta.id
            if genbank:
                meta._embl = {gbkey: attrs[emblkey] for emblkey, gbkey in _EMBL2GENBANK.items() if emblkey in attrs}
            else:
                meta._embl = attrs
            meta.fts = FeatureList(fts)
            if 'translation' in exclude and 'fts' in meta:
                for feature in meta.fts:
                    try:
                        del feature.meta._embl.translation
                    except Exception:
                        pass
            yield BioSeq(seq.upper(), meta=meta)
            meta = Meta()
            attrs = Attr()
            fts = []
            misc = []
            fttype = None
            ftmeta = None
            locs = None
            val = None
            key = None
            parse = 'header'
            seq = ''
            continue
        elif line.startswith('FT'):
            parse = 'fts'
        elif line.startswith('SQ'):
            parse = 'origin'
        if parse == 'header':
            key = line[:2] if not line.startswith('R') else 'R'
            line = line[2:].strip()
            if line == '':
                continue
            if key == 'ID':
                line = ' '.join(line.split())
            attrs[key] = line if key not in attrs else attrs[key] + ' ' + line
        elif parse == 'fts':
            if 'fts' in exclude:
                continue
            line = line[2:]
            if len(line[:18].strip()) > 0:
                key = line[:18].strip().split()[0]
                fttype = line[:18].strip().split()[0]
                locs = ''
                ftmeta = {}
                key2 = None
                val = line.strip()
                try:
                    val = val.split(maxsplit=1)[1]
                except Exception:
                    pass
                else:
                    locs = val
            elif len(line.strip()) > 0:
                line = line.strip()
                if line.startswith('/'):
                    line = line.removeprefix('/')
                    if '=' in line:
                        key2, val = line.split('=')
                        if not val.startswith('"'):
                            try:
                                val = int(val)
                            except Exception:
                                pass
                        else:
                            val = val.strip('"')
                        ftmeta[key2] = val
                    else:
                        ftmeta.setdefault('misc', []).append(line)
                elif key2 is None:
                    # location spanning multiple lines
                    locs = locs + line
                else:
                    ftmeta[key2] = ftmeta[key2] + line.strip('"')
        elif parse in ('origin', 'ena', 'uniprot'):
            if 'seq' not in exclude and not line.startswith('SQ') and line.strip() != '':
                if parse == 'origin':
                    if line[-7:-5] == '  ':
                        parse = 'ena'  # char counts at the end of line
                        try:
                            int(line[-1])
                        except Exception:
                            from warnings import warn
                            warn('Inconsistent EMBL file, please check the read content, contact developers')
                    else:
                        parse = 'uniprot'
                if parse == 'ena':
                    line = line[:-10]
                seq = seq + line.replace(' ', '')
        else:
            assert False