Source code for sugar._io.tab.mmseqs

# (C) 2024, Tom Eulenfeld, MIT license
"""
`MMseqs2`_ reader for output generated with option fmtmode 4 (preferred) or 0
"""

from sugar._io.tab.core import read_tabular, _MMSEQS_HEADER_NAMES
from sugar._io.util import _add_fmt_doc


def is_fts_mmseqs(f, *, sep='\t', outfmt=None, **kw):
    """
    Check whether a file-like object holds tabular MMseqs2 output

    Only the first line found within the first 1000 characters of ``f``
    is inspected.

    :param f: File-like object providing ``read()``
    :param str sep: Separator of fields, default ``'\\t'``, can be set to
        ``None`` for any whitespace.
    :param str outfmt: Passed on to `read_fts_mmseqs`; if it is given,
        the plausibility check of the identity value is skipped
    :param kw: Additional keyword arguments are passed on to
        `read_fts_mmseqs`
    :return: ``True`` if the first line is a MMseqs2 header
        (fmtmode 4) or can be read as a single MMseqs2 record,
        ``False`` otherwise, including for empty content

    Exceptions raised by `read_fts_mmseqs` while reading the first line
    are not caught here.
    """
    content = f.read(1000)
    lines = content.splitlines()
    if not lines:
        # Empty content has no first line to inspect,
        # it cannot be a MMseqs2 file
        return False
    # read first line
    line = lines[0]
    fields = line.strip().split(sep)
    if fields and set(fields) <= set(_MMSEQS_HEADER_NAMES):
        # MMseqs header present, fmtmode 4 was used
        return True
    # try to read it
    fts = read_fts_mmseqs([line], sep=sep, outfmt=outfmt, **kw)
    # BLAST and MMseqs2 have a *similar* default output
    # (BLAST outfmt 6 vs MMseqs2 fmtmode 0)
    # All header fields are the same, except the 3rd field.
    # In the 3rd field BLAST uses pident (percentage identity),
    # while MMseqs2 uses fident (fraction identity)!
    # If a BLAST file is interpreted as written by MMSeqs2,
    # the pident field is interpreted as fident resulting in
    # pident values above 100%
    # (if pident > 1% which is reasonable for most files).
    # This is a way to discriminate both file types,
    # but we need to check first if a file was written by MMSeqs2,
    # and only afterwards if it was written by BLAST.
    return len(fts) == 1 and (
        outfmt is not None or 0 <= fts[0].meta._mmseqs.pident <= 100)
@_add_fmt_doc('read_fts')
def read_fts_mmseqs(f, *, sep='\t', outfmt=None, ftype=None, comments=None):
    """
    Read features from tabular output written by MMseqs2

    :param str sep: Field separator, defaults to ``'\\t'``; pass ``None``
        to split on any whitespace.
    :param str outfmt: The string that was handed to the MMseqs2 option
        ``--format-output``; may be left out if MMseqs2 was run with
        ``--fmtmode 4``, or with ``--fmtmode 0`` and the default
        format-output
    :param str ftype: Parameter used as ftype
    :param list comments: Optional list, comment lines found inside the
        file are stored in it
    """
    # Collect the pass-through options once; the format is fixed to mmseqs
    options = dict(sep=sep, outfmt=outfmt, ftype=ftype, comments=comments)
    return read_tabular(f, fmt='mmseqs', **options)