Source code for sugar._io.tab.core

# (C) 2024, Tom Eulenfeld, MIT license
"""
Generic tabular feature reader that is used by several plugins
"""

from collections import namedtuple
from sugar.core.fts import Feature, FeatureList, Location
from sugar.core.meta import Meta
from sugar._io.util import _add_fmt_doc


_BHV = namedtuple('_BlastHeaderValue', 'name long_name type')
_MHV = namedtuple('_MMseqsHeaderValue', 'name type blast_equivalent')
_IHV = namedtuple('_InfernalHeaderValue', 'name type blast_equivalent')
_HEADER = {
    'blast': [
        _BHV('qseqid', 'query id', str),
        _BHV('qgi', 'query gi', str),
        _BHV('qacc', 'query acc.', str),
        _BHV('qaccver', 'query acc.ver', str),
        _BHV('qlen', 'query length', int),
        _BHV('sseqid', 'subject id', str),
        _BHV('sallseqid', 'subject ids', str),
        _BHV('sgi', 'subject gi', str),
        _BHV('sallgi', 'subject gis', str),
        _BHV('sacc', 'subject acc.', str),
        _BHV('saccver', 'subject acc.ver', str),
        _BHV('sallacc', 'subject accs.', str),
        _BHV('slen', 'subject length', int),
        _BHV('qstart', 'q. start', int),
        _BHV('qend', 'q. end', int),
        _BHV('sstart', 's. start', int),
        _BHV('send', 's. end', int),
        _BHV('qseq', 'query seq', str),
        _BHV('sseq', 'subject seq', str),
        _BHV('evalue', 'evalue', float),
        _BHV('bitscore', 'bit score', float),
        _BHV('score', 'score', float),
        _BHV('length', 'alignment length', int),
        _BHV('pident', '% identity', float),
        _BHV('nident', 'identical', int),
        _BHV('mismatch', 'mismatches', int),
        _BHV('positive', 'positives', int),
        _BHV('gapopen', 'gap opens', int),
        _BHV('gaps', 'gaps', int),
        _BHV('ppos', '% positives', float),
        _BHV('frames', 'query/sbjct frames', str),
        _BHV('qframe', 'query frame', int),
        _BHV('sframe', 'sbjct frame', int),
        _BHV('btop', 'BTOP', str),
        _BHV('staxid', 'subject tax id', str),
        _BHV('ssciname', 'subject sci name', str),
        _BHV('scomname', 'subject com name', str),
        _BHV('sblastname', 'subject blast name', str),
        _BHV('sskingdom', 'subject super kingdom', str),
        _BHV('staxids', 'subject tax ids', str),
        _BHV('sscinames', 'subject sci names', str),
        _BHV('scomnames', 'subject com names', str),
        _BHV('sskingdoms', 'subject super kingdoms', str),
        _BHV('stitle', 'subject title', str),
        _BHV('salltitles', 'subject titles', str),
        _BHV('sstrand', 'subject strand', str),
        _BHV('qcovs', '% query coverage per subject', float),
        _BHV('qcovhsp', '% query coverage per hsp', float),
        _BHV('qcovus', '% query coverage per uniq subject', float),
    ],
    'mmseqs': [
        _MHV('query', str, 'qseqid'),
        _MHV('qlen', int, 'qlen'),
        _MHV('target', str, 'sseqid'),
        _MHV('tlen', int, 'slen'),
        _MHV('qstart', int, 'qstart'),
        _MHV('qend', int, 'qend'),
        _MHV('tstart', int, 'sstart'),
        _MHV('tend', int, 'send'),
        _MHV('qseq', str, 'qseq'),
        _MHV('tseq', str, 'sseq'),
        _MHV('evalue', float, 'evalue'),
        _MHV('bits', float, 'bitscore'),
        _MHV('alnlen', int, 'length'),
        _MHV('pident', float, 'pident'),
        _MHV('nident', int, 'nident'),
        _MHV('mismatch', int, 'mismatch'),
        _MHV('gapopen', int, 'gapopen'),
        _MHV('ppos', float, 'ppos'),
        _MHV('qframe', str, 'qframe'),
        _MHV('tframe', str, 'sframe'),
        _MHV('qcov', float, 'qcovs'),
        _MHV('fident', float, None),
        _MHV('raw', str, None),
        _MHV('cigar', str, None),
        _MHV('qheader', str, None),
        _MHV('theader', str, None),
        _MHV('qaln', str, None),
        _MHV('taln', str, None),
        _MHV('tcov', float, None),
        _MHV('qset', str, None),
        _MHV('qsetid', str, None),
        _MHV('tset', str, None),
        _MHV('tsetid', str, None),
        _MHV('taxid', str, None),
        _MHV('taxname', str, None),
        _MHV('taxlineage', str, None),
        _MHV('qorfstart', int, None),
        _MHV('qorfend', int, None),
        _MHV('torfstart', int, None),
        _MHV('torfend', int, None),
    ],
    'infernal': [
        # 18 fields from tblout fmt 1
        _IHV('target', str, 'sseqid'),
        _IHV('target_acc', str, None),
        _IHV('query', str, 'qseqid'),
        _IHV('query_acc', str, None),
        _IHV('model', str, None),
        _IHV('mstart', int, 'qstart'),
        _IHV('mend', int, 'qend'),
        _IHV('sstart', int, 'sstart'),
        _IHV('send', int, 'send'),
        _IHV('sstrand', str, 'sstrand'),
        _IHV('trunc', str, None),
        _IHV('pass', int, None),
        _IHV('gc', float, None),
        _IHV('bias', float, None),
        _IHV('bitscore', float, 'bitscore'),
        _IHV('evalue', float, 'evalue'),
        _IHV('inc', str, None),
        _IHV('description', str, None),
        # 11 additional field from tblout fmt 2
        _IHV('idx', int, None),
        _IHV('clan', str, None),
        _IHV('overlap', str, None),
        _IHV('anyidx', float, None),
        _IHV('anyfrct1', float, None),
        _IHV('anyfrct2', float, None),
        _IHV('winidx', float, None),
        _IHV('winfrct1', float, None),
        _IHV('winfrct2', float, None),
        _IHV('mlen', int, 'qlen'),
        _IHV('slen', int, 'slen'),
    ],
}
_CONVERTH = {
    'blast': {hv.name: hv.name for hv in _HEADER['blast']},
    'mmseqs': {hv.blast_equivalent: hv.name for hv in _HEADER['mmseqs']},
    'infernal': {hv.blast_equivalent: hv.name for hv in _HEADER['infernal']},
}
_DEFAULT_OUTFMT = {
    'blast': (  # outfmt 6
        'qseqid sseqid pident length mismatch gapopen '
        'qstart qend sstart send evalue bitscore'
    ).split(),
    'mmseqs': (  # fmtmode 0
        'query target fident alnlen mismatch gapopen '
        'qstart qend tstart tend evalue bits'
    ).split(),
    'infernal_1': (  # fmt 1
        'target target_acc query query_acc '
        'model mstart mend sstart send sstrand '
        'trunc pass gc bias bitscore evalue inc '
        'description'
    ).split(),
    'infernal_2': (  # fmt 2
        'idx target target_acc query query_acc clan '
        'model mstart mend sstart send sstrand '
        'trunc pass gc bias bitscore evalue inc '
        'overlap anyidx anyfrct1 anyfrct2 winidx winfrct1 winfrct2 '
        'mlen slen description'
    ).split(),
    # fmt 2, old? version found in the Infernal guide
    # and here https://docs.rfam.org/en/latest/genome-annotation.html
    'infernal_2old': (
        'idx target target_acc query query_acc clan '
        'model mstart mend sstart send sstrand '
        'trunc pass gc bias bitscore evalue inc '
        'overlap anyidx anyfrct1 anyfrct2 winidx winfrct1 winfrct2 '
        'description'
    ).split(),
    'infernal_3': (  # fmt 3
        'target target_acc query query_acc '
        'model mstart mend sstart send sstrand '
        'trunc pass gc bias bitscore evalue inc '
        'mlen slen description'
    ).split(),
}
_MMSEQS_HEADER_NAMES = [hv.name for hv in _HEADER['mmseqs']]

copyattrs = [('bitscore', 'score'),
             ('evalue', 'evalue'),
             ('saccver', 'seqid'),
             ('sacc', 'seqid'),
             ('sseqid', 'seqid'),
             ('qaccver', 'name'),
             ('qacc', 'name'),
             ('qseqid', 'name'),
             ]



[docs]
def extract_seqs(fts, x='source', change_id=False):
    """
    Extract sequences from BLAST features and return `.BioBasket`

    :param x: One of ``'source'`` or ``'query'``
    """
    # TODO remove change_id parameter
    # TODO move these functions to new helper.py module
    # TODO make them more general for all blast-like tools
    from sugar import BioSeq, BioBasket
    x = x[0]
    seqs = []
    for ft in fts:
        seq = BioSeq(ft.meta._blast[f'{x}seq'],
                     id=ft.meta._blast[f'{x}seqid'],
                     meta={'fts': FeatureList([ft])})
        if change_id:
            evalue = ft.meta.evalue
            from math import log10
            if abs(evalue) > 0:
                evalue = log10(evalue)
            start = min(ft.meta._blast[f'{x}start'], ft.meta._blast[f'{x}end'])
            seq.id = f'{seq.id}/{start}{ft.loc.strand}/e{evalue:.0f}'
        seqs.append(seq)
    return BioBasket(seqs)




[docs]
def get_query_fts(fts):
    """
    Convert BLAST source features to query features
    """
    fts = fts.copy()
    for ft in fts:
        _b = ft.meta._blast
        if 'qseqid' in _b:
            ft.meta.seqid = _b.qseqid
        if 'sseqid' in _b:
            ft.meta.name = _b.sseqid
        start, stop = _b.qstart, _b.qend
        if start > stop:
            start, stop, strand = stop, start, '-'
        elif start < stop:
            strand = '+'
        else:
            strand = '.'
        loc = Location(start-1, stop, strand)
        ft.locs = [loc]
    return fts



def _headers_from_fmtstrings(fmtstrs, fmt='blast', attr='name'):
    """
    Takes a list of format header names and returns a list of header objects
    """
    headers = []
    for fmtstr in fmtstrs:
        for h in _HEADER[fmt]:
            if getattr(h, attr) == fmtstr:
                headers.append(h)
                break
        else:
            raise ValueError(f'Unknown header string: {fmtstr}')
    return headers



[docs]
def read_tabular(f, *, sep='\t', outfmt=None, ftype=None, fmt='blast',
                  comments=None):
    """
    Reader function used by BLAST, Infernal and MMseqs2 readers
    """
    assert fmt in ('blast', 'infernal', 'mmseqs')
    fts = []
    headers = None
    maxsplit = -1
    if outfmt is not None:
        headers = _headers_from_fmtstrings(outfmt.split(), fmt=fmt)
    for line in f:
        if comments is not None and line.startswith('#'):
            comments.append(line)
        if fmt == 'blast' and outfmt is None and line.startswith('# Fields:'):
            headerline = line.removeprefix('# Fields:')
            headers = _headers_from_fmtstrings(map(str.strip, headerline.split(',')),
                                               fmt=fmt, attr='long_name')
        elif fmt == 'infernal' and headers is None and '--' in line:
            nheaders = len(line.lstrip('#').split())
            fmtv = {18: '1', 29: '2', 20: '3', 27: '2old'}[nheaders]
            headers = _headers_from_fmtstrings(_DEFAULT_OUTFMT[f'{fmt}_{fmtv}'], fmt=fmt)
            maxsplit = nheaders - 1
        elif line.startswith('#') or line.strip() == '':
            pass
        elif (fmt == 'mmseqs' and
                  len(header_fmtstrings := line.strip().split(sep)) > 1 and
                  set(header_fmtstrings) <= set(_MMSEQS_HEADER_NAMES)):
                if headers is None:
                    headers = _headers_from_fmtstrings(header_fmtstrings, fmt=fmt)
        else:
            if headers is None:
                # outfmt not defined in header
                # (as with BLAST outfmt 7 or MMseqs2 fmtmode 4)
                # and not supplied by user
                # assume default
                headers = _headers_from_fmtstrings(_DEFAULT_OUTFMT[fmt], fmt=fmt)
            splitted_line = line.strip().split(sep, maxsplit=maxsplit)
            if len(splitted_line) != len(headers):
                raise ValueError('Number of header fields does not equal '
                                 'number of data values')
            attrs = {}
            for i, v in enumerate(splitted_line):
                try:
                    v = headers[i].type(v)
                except ValueError:
                    pass
                attrs[headers[i].name] = v
            c = _CONVERTH[fmt]
            start = attrs[c['sstart']]
            stop = attrs[c['send']]
            qstart = attrs[c['qstart']]
            qstop = attrs[c['qend']]
            if attrs.get('sstrand') == 'N/A':  # only applicable for BLAST
                if start > stop:
                    # here old BLAST versions were inconsequenct
                    # sstrand is N/A, but start is smaller than stop indicating a ft on the - strand
                    from warnings import warn
                    warn('Inconsequent values: sstrand N/A, but start larger than stop. Strand is set to -.'
                         'Consider updating BLAST, as this was a problem with old BLAST versions. '
                         'If you are already using the latest BLAST version, '
                         'please submit a bug report to sugar developers.')
                    start, stop = stop, start
                    strand = '-'
                else:
                    # in some cases, setting '+' would be more appropriate
                    # but from the available data it is impossible to say
                    strand = '.'
            elif (start > stop and qstart < qstop) or (start < stop and qstart > qstop):
                if start > stop:
                    start, stop = stop, start
                else:
                    qstart, qstop = qstop, qstart
                strand = '-'
                if attrs.get('sstrand') not in (None, '-', 'minus'):
                    raise ValueError('Expected strand -, got + in sstrand')
            elif (start < stop and qstart < qstop) or (start > stop and qstart > qstop):
                if start > stop:
                    start, stop = stop, start
                    qstart, qstop = qstop, qstart
                strand = '+'
                if attrs.get('sstrand') not in (None, '+', 'plus'):
                    raise ValueError('Expected strand +, got - in sstrand')
            else:
                # single residuu features
                strand = attrs.get('sstrand', '.')
            loc = Location(start-1, stop, strand)
            _fmt = '_' + fmt
            ft = Feature(attrs.get(ftype, ftype), [loc],
                         meta=Meta({_fmt: attrs}))
            if 'pident' in ft.meta[_fmt] and 'fident' not in ft.meta[_fmt]:
                ft.meta[_fmt].fident = ft.meta[_fmt].pident / 100
            if 'fident' in ft.meta[_fmt] and 'pident' not in ft.meta[_fmt]:
                ft.meta[_fmt].pident = ft.meta[_fmt].fident * 100
            # TODO adapt
            for blastattr, metaattr in copyattrs:
                if blastattr in c and c[blastattr] in ft.meta[_fmt]:
                    ft.meta[metaattr] = ft.meta[_fmt][c[blastattr]]
            fts.append(ft)
    return FeatureList(fts)