# Source code for sugar._io.tab.mmseqs

# (C) 2024, Tom Eulenfeld, MIT license
"""
`MMseqs2`_ reader for output generated with option fmtmode 4 (preferred) or 0
"""

from sugar._io.tab.core import read_tabular, _MMSEQS_HEADER_NAMES
from sugar._io.util import _add_fmt_doc



# [docs]
def is_fts_mmseqs(f, *, sep='\t', outfmt=None, **kw):
    """
    Check whether a file-like object looks like MMseqs2 tabular output

    Only the first line found within the first 1000 characters read
    from ``f`` is inspected.

    :param f: Object with a ``read()`` method returning text
    :param str sep: Separator of fields, default ``'\\t'``
    :param str outfmt: Passed on to `read_fts_mmseqs`; if given, the
        plausibility check of the percentage identity is skipped
    :param \\*\\*kw: Additional kwargs are passed to `read_fts_mmseqs`
    :return: ``True`` if the first line is a MMseqs2 header or can be read
        as a single MMseqs2 record, otherwise ``False``
    """
    content = f.read(1000)
    lines = content.splitlines()
    if not lines:
        # nothing to inspect, e.g. an empty file; this cannot be
        # MMseqs2 output (avoids an IndexError on lines[0])
        return False
    # read first line
    line = lines[0]
    fields = line.strip().split(sep)
    if len(fields) > 0 and set(fields) <= set(_MMSEQS_HEADER_NAMES):
        # MMseqs header present, fmtmode 4 was used
        return True
    # try to read it
    fts = read_fts_mmseqs([line], sep=sep, outfmt=outfmt, **kw)
    # BLAST and MMseqs2 have a *similar* default output
    # (BLAST outfmt 6 vs MMseqs2 fmtmode 0)
    # All header fields are the same, except the 3rd field.
    # In the 3rd field BLAST uses pident (percentage identity),
    # while MMseqs2 uses fident (fraction identity)!
    # If a BLAST file is interpreted as written by MMSeqs2,
    # the pident field is interpreted as fident resulting in
    # pident values above 100%
    # (if pident > 1% which is reasonable for most files).
    # This is a way to discriminate both file types,
    # but we need to check first if a file was written by MMSeqs2,
    # and only afterwards if it was written by BLAST.
    return len(fts) == 1 and (outfmt is not None or 0 <= fts[0].meta._mmseqs.pident <= 100)




# [docs]
@_add_fmt_doc('read_fts')
def read_fts_mmseqs(f, *, sep='\t', outfmt=None, ftype=None, comments=None):
    """
    Read features from tabular output written by MMseqs2

    All arguments are handed over to the generic tabular reader,
    which is called with ``fmt='mmseqs'``.

    :param str sep: Field separator, defaults to ``'\\t'``;
        use ``None`` to split on any whitespace.
    :param str outfmt: String that was passed to the MMseqs2 option
        ``--format-output``; not needed if MMseqs2 was run with
        ``--fmtmode 4`` or with ``--fmtmode 0`` and the default
        format-output
    :param str ftype: Parameter used as ftype
    :param list comments: Optional list in which comment lines found
        inside the file are stored
    """
    # collect the pass-through options once, then delegate
    options = dict(sep=sep, outfmt=outfmt, ftype=ftype, comments=comments)
    return read_tabular(f, fmt='mmseqs', **options)