Source code for sugar.core.seq

# (C) 2024, Tom Eulenfeld, MIT license
"""
Sequence related classes, `.BioSeq`, `.BioBasket`
"""

import collections
import collections.abc
import copy
from functools import reduce
import io
import math
import sys
from warnings import warn

from sugar.data import CODES
from sugar.core.fts import Feature, FeatureList, Location, LocationTuple
from sugar.core.meta import Attr, Meta
from sugar.core.util import _add_inplace_doc


CODES_INV = {frozenset(v): k for k, v in CODES.items()}
COMPLEMENT = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', '.': '.', '-': '-'}
COMPLEMENT_ALL = {c: CODES_INV[frozenset(COMPLEMENT[nt] for nt in nts)] for c, nts in CODES.items()}
COMPLEMENT_TRANS = str.maketrans(COMPLEMENT_ALL)


class _Sliceable_GetItem():
    def __init__(self, obj, method='_getitem', **kw):
        self.kw = kw
        self.call = getattr(obj, method)

    def __getitem__(self, i):
        return self.call(i, **self.kw)



[docs]
class _BioSeqStr():
    """
    Helper class to hold all string methods in the `BioSeq.str` namespace.

    The methods modify the data in-place, if applicable,
    which is different from the behavior of the original string methods.

    See `str` for documentation of the methods.

    :meta public:
    """
    def __init__(self, parent):
        self.__parent = parent


[docs]
    def center(self, width, *args):
        self.__parent.data = self.__parent.data.center(width, *args)
        return self.__parent



[docs]
    def count(self, sub, start=0, end=sys.maxsize):
        return self.__parent.data.count(str(sub), start, end)



[docs]
    def removeprefix(self, prefix, /):
        self.__parent.data = self.__parent.data.removeprefix(str(prefix))
        return self.__parent



[docs]
    def removesuffix(self, suffix, /):
        self.__parent.data = self.__parent.data.removesuffix(str(suffix))
        return self.__parent



[docs]
    def encode(self, encoding='utf-8', errors='strict'):
        encoding = 'utf-8' if encoding is None else encoding
        errors = 'strict' if errors is None else errors
        return self.__parent.data.encode(encoding, errors)



[docs]
    def endswith(self, suffix, start=0, end=sys.maxsize):
        return self.__parent.data.endswith(str(suffix), start, end)



[docs]
    def find(self, sub, start=0, end=sys.maxsize):
        return self.__parent.data.find(str(sub), start, end)



[docs]
    def index(self, sub, start=0, end=sys.maxsize):
        return self.__parent.data.index(str(sub), start, end)



[docs]
    def isalpha(self):
        return self.__parent.data.isalpha()



[docs]
    def isascii(self):
        return self.__parent.data.isascii()



[docs]
    def islower(self):
        return self.__parent.data.islower()



[docs]
    def isupper(self):
        return self.__parent.data.isupper()



[docs]
    def ljust(self, width, *args):
        """
        The ``ljust()`` and ``rjust()`` methods can be used to fill up an alignment with gaps.

        Example: ``seqs.ljust(500, '-')``
        """
        self.__parent.data = self.__parent.data.ljust(width, *args)
        return self.__parent



[docs]
    def lower(self):
        self.__parent.data = self.__parent.data.lower()
        return self.__parent



[docs]
    def lstrip(self, chars=None):
        if chars is not None:
            chars = str(chars)
        self.__parent.data = self.__parent.data.lstrip(chars)
        return self.__parent



[docs]
    @staticmethod
    def maketrans(*args):
        return str.maketrans(*args)



[docs]
    def replace(self, old, new, maxsplit=-1):
        self.__parent.data = self.__parent.data.replace(
            str(old), str(new), maxsplit)
        return self.__parent



[docs]
    def rfind(self, sub, start=0, end=sys.maxsize):
        return self.__parent.data.rfind(str(sub), start, end)



[docs]
    def rindex(self, sub, start=0, end=sys.maxsize):
        return self.__parent.data.rindex(str(sub), start, end)



[docs]
    def rjust(self, width, *args):
        self.__parent.data = self.__parent.data.rjust(width, *args)
        return self.__parent



[docs]
    def rstrip(self, chars=None):
        if chars is not None:
            chars = str(chars)
        self.__parent.data = self.__parent.data.rstrip(chars)
        return self.__parent



[docs]
    def split(self, sep=None, maxsplit=-1):
        if sep is not None:
            sep = str(sep)
        return self.__parent.data.split(sep, maxsplit)



[docs]
    def rsplit(self, sep=None, maxsplit=-1):
        if sep is not None:
            sep = str(sep)
        return self.__parent.data.rsplit(sep, maxsplit)



[docs]
    def splitlines(self, keepends=False):
        return self.__parent.data.splitlines(keepends)



[docs]
    def startswith(self, prefix, start=0, end=sys.maxsize):
        return self.__parent.data.startswith(str(prefix), start, end)



[docs]
    def strip(self, chars=None):
        if chars is not None:
            chars = str(chars)
        self.__parent.data = self.__parent.data.strip(chars)
        return self.__parent



[docs]
    def swapcase(self):
        self.__parent.data = self.__parent.data.swapcase()
        return self.__parent



[docs]
    def translate(self, *args):
        self.__parent.data = self.__parent.data.translate(*args)
        return self.__parent



[docs]
    def upper(self):
        self.__parent.data = self.__parent.data.upper()
        return self.__parent





[docs]
class _BioBasketStr():
    """
    Helper class to move all string methods into the `BioBasket.str` namespace

    It calls the corresponding `BioSeq.str` method under the hood and returns
    either the modified `BioBasket` object or a list of results.

    :meta public:
    """
    def __init__(self, parent):
        self.__parent = parent

    def __getattr__(self, name):
        def method(*args, **kw):
            results = [
                getattr(seq.str, name)(*args, **kw)
                for seq in self.__parent
            ]
            if name in (
                'center', 'remove_prefix', 'ljust', 'lower',
                'lstrip', 'replace', 'rjust', 'rstrip',
                'strip', 'swapcase', 'translate', 'upper'
                ):
                return self.__parent
            else:
                return results
        return method




[docs]
class BioSeq():
    """
    Class holding sequence data and metadata, exposing bioinformatics methods.

    Most methods work in-place by default, but return the BioSeq object again.
    Therefore, method chaining can be used.
    """

    def __init__(self, data, id='', meta=None, type=None):
        #: Property holding the data string
        self.data = str(data).upper()
        if hasattr(data, 'meta'):
            meta = data.meta
        elif 'meta' in data:
            meta = data['meta']
        elif meta is None:
            meta = {}
        #: Property holding metadata
        self.meta = Meta(meta)
        if id or 'id' not in self.meta:
            self.meta.id = id
        assert type in (None, 'nt', 'aa')
        if type is None:
            codes = set(CODES) | {'U'}
            type = 'nt' if all(nb in codes for nb in self.data) else 'aa'
        if type == 'nt' and 'U' in data:
            from warnings import warn
            warn('Found U in nucleotide sequence, '
                 'some methods will not work as expected')
        #: type of the sequence, either ``'nt'`` or ``'aa'``
        self.type = type

    def _repr_pretty_(self, p, cycle):
        if cycle:
            p.text('...')
        else:
            p.text(str(self))

    def __str__(self):
        return str(self.data)

    def __repr__(self):
        metastr = ', '.join(f'{prop}={repr(val)}' for prop, val in vars(self.meta).items())
        return f'{type(self).__name__}({repr(self.data)}, meta=dict({metastr}))'

    def __eq__(self, string):
        if isinstance(string, BioSeq):
            return self.data == string.data and self.meta == string.meta
        return self.data == string

    def __lt__(self, other):
        if isinstance(other, BioSeq):
            return self.meta.get('id', '') < other.meta.get('id', '')
        msg = f"'<' not supported between instances of '{type(self).__name__}' and '{type(other).__name__}'"
        raise TypeError(msg)

    def __len__(self):
        return len(self.data)

    def __setitem__(self, index, value):
        l = list(self.data)
        l[index] = str(value)
        self.data = ''.join(l)

    def __add__(self, other):
        if isinstance(other, BioSeq) and self.meta != other.meta:
            warn('Join two BioSeq objects with different meta data')
        return self.__class__(self.data + str(other), meta=self.meta)

    def __iadd__(self, other):
        if isinstance(other, BioSeq) and self.meta != other.meta:
            warn('Join two BioSeq objects with different meta data')
        self.data = self.data + str(other)
        return self

    def __radd__(self, other):
        return self.__class__(str(other) + self.data, meta=self.meta)

    @property
    def str(self):
        """
        Namespace holding all available string methods,
        see `_BioSeqStr` for available methods
        and `str` for documentation of the methods

        .. rubric:: Example:

        >>> seq = read()[0]
        >>> seq.str.find('ATG')  # Use string method
        30
        """
        return _BioSeqStr(self)

    @property
    def id(self):
        """Alias for ``BioSeq.meta.id``"""
        return self.meta.id

    @id.setter
    def id(self, value):
        self.meta.id = value

    @property
    def fts(self):
        """
        Alias for ``BioSeq.meta.fts``

        The fts object holds all feature metadata.
        It is an instance of `.FeatureList`.
        """
        return self.meta.setdefault('fts', FeatureList())

    @fts.setter
    def fts(self, value):
        self.meta.fts = FeatureList(value)
        for ft in self.meta.fts:
            if ft.seqid != self.id:
                warn('Feature seqid and sequence id mismatch')


[docs]
    def add_fts(self, fts):
        """
        Add some features to the feature list.

        If you want to set all features, use the `BioSeq.fts` attribute.

        :param fts: features to add
        """
        self.fts = self.fts + FeatureList(fts)
        self.fts.sort()


    @property
    def gc(self):
        """
        GC content of the sequence
        """
        GC = self.str.count('G') + self.str.count('C')
        AT = self.str.count('A') + self.str.count('T') + self.str.count('U')
        if GC + AT > 0:
            return GC / (GC + AT)
        else:
            return 0

    _add_inplace_doc

[docs]
    def rc(self, update_fts=False):
        """
        Reverse complement, alias for ``BioSeq.reverse().complement()``
        """
        self.reverse().complement()
        if update_fts:
            self.fts = self.fts.rc(seqlen=len(self))
        return self


    def __getitem__(self, index):
        return self._getitem(index)


[docs]
    def sl(self, **kw):
        """
        Method that allows you to slice the `BioSeq` object with non-default options.

        If you want to use the default options, you can slice the BioSeq object directly.
        For non-default options, slice the sliceable object returned by this method.

        :param bool inplace:
            Not only will the subsequence be returned,
            but the original sequence will be modified in-place
            (default: False).
        :param str gap:
            Gaps of the given characters are taken into account
            when slicing the sequence
            (default: gaps are not taken into account)

        .. rubric:: Slicing options:

        The slice specifies which part of the sequence is returned, and
        is defined inside the square brackets ``[]``
        The following types are supported.

        int,slice
            The location is specified by int or slice
        `.Location`
            specified by location
        `.Feature`
            specified by feature
        str
            Position of the first feature of the given type, e.g. ``'cds'``
            will return the sequence with the first coding sequence.

        .. rubric:: Example:

        >>> from sugar import read
        >>> seq = read()[0]
        >>> print(seq[:5])  # use direct slicing for default options
        ACCTG
        >>> print(seq[4])
        G
        >>> print(seq['cds'][:3])
        ATG
        >>> print(seq.sl(inplace=True, gap='-')[:5:2])  # non-default options
        ACG
        >>> print(seq)  # has been modified in-place
        ACG
        """
        return _Sliceable_GetItem(self, **kw)



[docs]
    def slindex(self, gap=None):
        """
        Method that translates an index to account for gaps

        .. rubric:: Example:

        >>> from sugar import BioSeq
        >>> seq = BioSeq('ATG---GGA')
        >>> print(seq)
        ATG---GGA
        >>> print(seq[1:5])
        TG--
        >>> print(seq.sl(gap='-')[1:5])
        TG---GG
        >>> print(seq.slindex(gap='-')[1:5])
        slice(1, 8, None)
        >>> print(seq[seq.slindex(gap='-')[1:5]])
        TG---GG
        """
        return _Sliceable_GetItem(self, method='_getindex', gap=gap)


    def _slice_locs(self, locs, splitter=None, filler=None, gap=None, update_fts=False):
        # Merge the sequences corresponding to the ordered locations
        sub_seqs = []
        prev_loc = None
        for i, loc in enumerate(locs):
            add_seq = self.sl(gap=gap)[loc.start:loc.stop]
            if loc.strand == '-':
                add_seq = add_seq.rc()
            if filler is not None and prev_loc is not None:
                if loc.strand == '-':
                    num = prev_loc.start - loc.stop
                else:
                    num = loc.start - prev_loc.stop
                if num > 0:
                    sub_seqs.append(num * filler)
            if splitter is not None and prev_loc is not None:
                sub_seqs.append(splitter)
            sub_seqs.append(add_seq.data)
            prev_loc = loc
        sub_seq = BioSeq(''.join(sub_seqs), meta=self.meta.copy())
        if update_fts:
            if len(locs) > 1:
                raise ValueError('update_fts is only allowed for locs wit a single Location. Sorry.')
            start, stop = locs.range
            fts = FeatureList()
            for loc in locs:
                fts.extend(self.fts.slice(loc.start, loc.stop, rel=start))
            if locs[0].strand == '-':
                fts = fts.rc(seqlen=stop-start)
            sub_seq.fts = fts
        return sub_seq

    def _getindex(self, index, gap=None):
        if not isinstance(index, (int, slice)):
            if isinstance(index, str):
                ft = self.fts.get(index)
                if ft is None:
                    raise ValueError(f'Feature of type {index} not found')
                index = ft
            if isinstance(index, Location):
                index = LocationTuple([Location])
            elif isinstance(index, Feature):
                index = index.locs
            if isinstance(index, LocationTuple):
                start, stop = index.range
                return self.slindex(gap=gap)[start:stop]
            else:
                raise TypeError(f"Index of type '{type(index).__name__}' not supported")
        # from bisect import bisect
        nogaps = [i for i, nt in enumerate(self.data) if nt not in gap]
        adj = lambda i: nogaps[i] if i is not None and i < len(nogaps) else None
        adj_stop = lambda i: nogaps[i-1]+1 if i is not None and 0 < i < len(nogaps) else 0 if i == 0 else None
        if isinstance(index, int):
            index = adj(index)
        elif isinstance(index, slice):
            index = slice(adj(index.start), adj_stop(index.stop), index.step)
        return index

    def _getitem(self, index, inplace=False, gap=None, update_fts=False, **kw):
        """
        Slice the sequence and return a subsequence

        This is the method which is called if you slice with ``BioSeq[]`` syntax
        or with ``BioSeq.sl()[]`` syntax
        If you want to use non-default options call this method directly,
        or by the `BioSeq.sl` attribute.
        """
        # TODO: doc for splitter and filler
        if not isinstance(index, (int, slice)):
            if isinstance(index, str):
                ft = self.fts.get(index)
                if ft is None:
                    raise ValueError(f'Feature of type {index} not found')
                index = ft
            if isinstance(index, Location):
                index = LocationTuple([index])
            elif isinstance(index, Feature):
                index = index.locs
            if isinstance(index, LocationTuple):
                subseq = self._slice_locs(index, gap=gap, update_fts=update_fts, **kw)
            else:
                raise TypeError(f"Index of type '{type(index).__name__}' not supported")
        else:
            subseq = self.__class__(
                self.data[index if gap is None else self._getindex(index, gap=gap)],
                meta=self.meta
                )
            if update_fts:
                if isinstance(index, int):
                    start, stop = index, index + 1
                elif isinstance(index, slice):
                    start, stop = index.start, index.stop
                    if index.step not in (None, 1):
                        raise ValueError('update_fts for slices only supported with step==1')
                subseq.fts = self.fts.slice(start, stop, rel=start or 0)
        if inplace:
            self.data = subseq.data
        return subseq


[docs]
    @_add_inplace_doc
    def complement(self):
        """
        Complementary sequence, i.e. transcription
        """
        if 'U' in self.data:
            self.str.replace('U', 'T').str.translate(COMPLEMENT_TRANS).str.replace('T', 'U')
        else:
            self.str.translate(COMPLEMENT_TRANS)
        return self



[docs]
    def countall(self, *args, **kw):
        return BioBasket([self]).countall(*args, **kw)



[docs]
    def countplot(self, *args, hue=None, **kw):
        return BioBasket([self]).countplot(*args, hue=hue, **kw)



[docs]
    def copy(self):
        """
        Return a deep copy of the object
        """
        return copy.deepcopy(self)



[docs]
    @classmethod
    def frombiopython(cls, obj):
        """
        Create a `.BioSeq` object from a biopython_ `~Bio.SeqRecord.SeqRecord` or `~Bio.Seq.Seq` object.

        :param obj: The object to convert.

        .. note::
            BioPython Features in the ``SeqRecord.features`` attribute are automatically converted.
        """
        from sugar.core._adapter import biopython2seq
        return biopython2seq(obj, cls=cls)



[docs]
    @classmethod
    def frombiotite(cls, obj):
        """
        Create a `.BioSeq` object from a biotite_ sequence object.

        :param obj: The object to convert.
        """
        from sugar.core._adapter import biotite2seq
        return biotite2seq(obj, cls=cls)



[docs]
    def match(self, *args, **kw):
        """
        Search regex and return match, see ``~.cane.match()``
        """
        from sugar.core.cane import match as _match
        return _match(self, *args, **kw)



[docs]
    def matchall(self, *args, **kw):
        """
        Search regex and return `.BioMatchList` with all matches, see `~.cane.match()`
        """
        kw['matchall'] = True
        return self.match(*args, **kw)



[docs]
    def find_orfs(self, *args, **kw):
        """
        Find ORFS in the sequence, see `~.cane.find_orfs()`
        """
        from sugar.core.cane import find_orfs
        return find_orfs(self, *args, **kw)



[docs]
    def plot_ftsviewer(self, *args, **kw):
        """
        Plot features of the sequence using DNAFeaturesViewer_, see `~.imaging.ftsviewer.plot_ftsviewer()`

        .. note::
            Using `BioSeq <.BioSeq.plot_ftsviewer>` or `.BioBasket.plot_ftsviewer()`
            over `.FeatureList.plot_ftsviewer()` has the advantage,
            that sequence lengths are used automatically.
        """
        return BioBasket([self]).plot_ftsviewer(*args, **kw)



[docs]
    def tostr(self, **kw):
        """
        Return a nice string, see `BioBasket.tostr()`
        """
        kw.setdefault('add_header', False)
        return BioBasket([self]).tostr(**kw)



[docs]
    def tofmtstr(self, fmt, **kw):
        """
        Write object to a string of given format, see `~.main.write()`
        """
        return BioBasket([self]).tofmtstr(fmt, **kw)



[docs]
    def tobiopython(self):
        """
        Convert BioSeq to biopython_ `~Bio.SeqRecord.SeqRecord` instance

        Attached ``BioSeq.fts`` features are automatically converted.
        """
        from sugar.core._adapter import seq2biopython
        return seq2biopython(self)



[docs]
    def tobiotite(self, **kw):
        """
        Convert BioSeq to biotite_ `~biotite.sequence.NucleotideSequence` or `~biotite.sequence.ProteinSequence` instance

        :param str type: ``'nt'`` creates a `~biotite.sequence.NucleotideSequence` instance,
            ``'aa'`` creates a `~biotite.sequence.ProteinSequence` instance,
            by default the class is inferred from the sequence itself.
        :param str gap: Gap characters that must be removed from the sequence string.
        :param bool warn: Whether to warn if gap characters have been removed, default is True.
        """
        from sugar.core._adapter import seq2biotite
        return seq2biotite(self, **kw)



[docs]
    def toftsviewer(self, **kw):
        r"""
        Convert features of this sequence to DNAFeaturesViewer_ `~dna_features_viewer.GraphicRecord`

        See `.FeatureList.toftsviewer`.
        """
        return self.fts.toftsviewer(seq=self, **kw)



[docs]
    @_add_inplace_doc
    def reverse(self):
        """
        Reverse the sequence
        """
        self.data = self.data[::-1]
        return self



[docs]
    @_add_inplace_doc
    def translate(self, *args, update_fts=False, **kw):
        """
        Translate nucleotide sequence to amino acid sequence, see `~.cane.translate()`.

        The original translate method of the str class can be used via ``BioBasket.str.translate()``.
        """
        from sugar.core.cane import translate
        self.data = translate(self.data, *args, **kw)
        self.type = 'aa'
        if update_fts:
            fts = self.fts.slice(0, len(self) * 3)
            for ft in fts:
                for loc in ft.locs:
                    loc.start = loc.start // 3
                    loc.stop = loc.stop // 3
            self.fts = fts
        return self



[docs]
    def write(self, fname=None, fmt=None, **kw):
        """
        Write sequence to file, see `~.main.write()`
        """
        return BioBasket([self]).write(fname=fname, fmt=fmt, **kw)




def _si_format(v, l=4):
    if v == 0:
        return '0'
    l2 = int(math.log10(v)) + 1
    if l2 > l:
        si = {1: 'k', 2: 'M', 3: 'G', 4: 'T'}
        j = (3 + l2 - l) // 3
        s = si.get(j, '?')
        v = v / 10 ** (3*j)
        # p = min(max(0, l - (l2 - 3*j) - 1), 1)
        p = 1 if l2 % 3 == l % 3 else 0
        return f'{v:.{p}f}{s}'
    else:
        return str(v)



[docs]
class BioBasket(collections.UserList):
    """
    Class holding a list of `BioSeq` objects

    The BioBasket object can be used like a list.
    It has useful bioinformatics methods attached to it.

    The list itself is stored in the ``data`` property.
    The BioBasket object may also have a metadata
    attribute.
    """
    def __init__(self, data=None, meta=None):
        if isinstance(data, (str, BioSeq)):
            warn(f'{self.__class__.__name__} object initialized with a sequence. '
                 'To hide this warning initialize with a list of sequences.')
            data = [data]
        if data is None:
            data = []
        if hasattr(data, 'meta'):
            meta = data.meta
        elif 'meta' in data:
            meta = data['meta']
        elif meta is None:
            meta = {}
        super().__init__(data)
        # FAKE, just for documenting tha data property
        #: Property holding the list of sequences
        self.data = self.data
        #: Property holding metadata
        self.meta = Meta(meta)

    def __eq__(self, other):
        if isinstance(other, BioBasket):
            return self.data == other.data and self.meta == other.meta
        return self.data == other

    # Implement all variants of &, |, -, ^
    def __and__(self, other):
        return self.__class__([seq for seq in self if seq in other])

    def __rand__(self, other):
        return self & other

    def __iand__(self, other):
        self.data = [seq for seq in self if seq in other]
        return self

    def __or__(self, other):
        return self + [seq for seq in other if seq not in self]

    def __ror__(self, other):
        return self | other

    def __ior__(self, other):
        self.data += [seq for seq in other if seq not in self]
        return self

    def __sub__(self, other):
        return self.__class__([seq for seq in self if seq not in other])

    def __rsub__(self, other):
        return self.__class__(other) - self

    def __isub__(self, other):
        self.data = [seq for seq in self if seq not in other]
        return self

    def __xor__(self, other):
        return (self | other) - (self & other)

    def __rxor__(self, other):
        return self ^ other

    def __ixor__(self, other):
        self.data = (self ^ other).data
        return self

    @property
    def str(self):
        """
        Namespace holding all available string methods.

        The `BioBasket.str` methods call the corresponding `BioSeq.str` methods under the hood
        and return either the modified `BioBasket` object or a list of results.
        See `_BioSeqStr` for available methods
        and `str` for method documentation.

        .. rubric:: Example:

        >>> seqs = read()
        >>> seqs.str.find('ATG')  # Use string method
        [30, 12]
        """
        return _BioBasketStr(self)

    @property
    def ids(self):
        """List of sequence ids"""
        return [seq.meta.id for seq in self]

    @property
    def fts(self):
        """
        `.FeatureList` of containing features of all sequences

        Can also be used as setter.
        Code example: ``seqs.fts = new_fts``.
        """
        fts = [ft for seq in self for ft in seq.fts]
        return FeatureList(fts)

    @fts.setter
    def fts(self, value):
        fts = FeatureList(value).groupby('seqid')
        for seq in self:
            if seq.id in fts:
                seq.fts = fts.pop(seq.id)
            else:
                try:
                    del seq.meta.fts
                except KeyError:
                    pass
        if len(fts) > 0:
            missing_ids = ', '.join(fts.keys())
            warn(f'Features for seqids {missing_ids} could not be '
                 'attached to any sequence')


[docs]
    def add_fts(self, fts):
        """
        Add some features to the feature list of the corresponding sequences.

        If you want to set all features, use the `BioBasket.fts` attribute.

        :param fts: features to add
        """
        fts = FeatureList(fts).groupby('seqid')
        for seq in self:
            if seq.id in fts:
                seq.fts = seq.fts + fts.pop(seq.id)
                seq.fts.sort()
        if len(fts) > 0:
            missing_ids = ', '.join(fts.keys())
            warn(f'Features for seqids {missing_ids} could not be '
                 'attached to any sequence')



[docs]
    @_add_inplace_doc
    def rc(self, **kw):
        """
        Reverse complement, alias for ``BioBasket.reverse().complement()``
        """
        for seq in self:
            seq.rc(**kw)
        return self


    def __str__(self):
        return self.tostr(add_hint=True)

    def _repr_pretty_(self, p, cycle):
        if cycle:
            p.text('...')
        else:
            p.text(str(self))

    def __repr__(self):
        metastr = ', '.join(f'{prop}={repr(val)}' for prop, val in vars(self.meta).items())
        return f'{type(self).__name__}({super().__repr__()}, meta=dict({metastr}))'

    def __getitem__(self, i):
        return self._getitem(i)


[docs]
    def sl(self, **kw):
        r"""
        Method that allows you to slice the `BioBasket` object with non-default options.

        If you want to use the default options, you can slice the BioBasket object directly.
        For non-default options, slice the sliceable object returned by this method.

        :param \*\*kw:
            All kwargs are documented in `BioSeq.sl()`.

        .. rubric:: Slice options:

        The slice specifies which part of the sequence(s) are returned and
        is defined inside the square brackets ``[]``
        The following options are supported.

        int
            Returns a `BioSeq` from the basket
        slice
            Returns a new `BioBasket` object with a subset of the sequences
        str,feature,location
            Returns a new `BioBasket` object with updated sequences inside, see `BioSeq.sl()`
        (int, object)
            Returns a `BioSeq` from the basket and slices it with the object, see `BioSeq.sl()`
        (slice, object)
            Returns a new `BioBasket` object with a subset of the sequences which are replaced
            by subsequences according to `BioSeq.sl()`

        .. rubric:: Example:

        >>> from sugar import read
        >>> seqs = read()
        >>> print(seqs[:2, 5:10])
        2 seqs in basket
        AB047639  5  CCCCT  ...
        AB677533  5  CCCCC  ...
        >>> print(seqs[:2, 'cds'][:, 0:3])
        2 seqs in basket
        AB047639  3  ATG  ...
        AB677533  3  ATG  ...
        """
        return _Sliceable_GetItem(self, **kw)


    def _getitem(self, i, **kw):
        """
        Slice sequences

        This is the method which is called if you slice with ``BioBasket[]`` syntax.
        If you want to use non-default options call this method directly,
        or by the `BioBasket.sl` attribute.
        """
        if isinstance(i, int):
            return self.data[i]
        elif isinstance(i, slice):
            seqs = self.__class__(self.data[i], meta=self.meta)
        elif isinstance(i, (Feature, FeatureList)):
            seqs = self.__class__(self.data, meta=self.meta)
            if isinstance(i, Feature):
                seqs.data = [seq._getitem(i, **kw) for seq in seqs.data if seq.id == i.seqid]
            else:
                seqs.data = [seq._getitem(ft, **kw) for ft in i for seq in seqs.data if seq.id == ft.seqid]
        elif isinstance(i, (str, LocationTuple, Location)):
            seqs = self.__class__(self.data, meta=self.meta)
            seqs.data = [seq._getitem(i, **kw) for seq in seqs.data]
        elif len(i) == 2:
            i, j = i
            if isinstance(i, int):
                return self.data[i]._getitem(j, **kw)
            elif isinstance(i, slice):
                seqs = self.__class__(self.data[i], meta=self.meta)
                seqs.data = [seq._getitem(j, **kw) for seq in seqs.data]
            else:
                raise TypeError('First dimension of 2D index must be of type int or slice')
        else:
            raise TypeError(f"Index of type '{type(i).__name__}' not supported")
        return seqs

    def __setitem__(self, i, value):
        if isinstance(i, (int, slice)):
            if isinstance(value, (str, BioSeq)):
                value = BioSeq(value)
            else:  # assume its list or BioBasket or similar
                value = [BioSeq(v) for v in value]
            self.data[i] = value
        elif len(i) == 2:
            i, j = i
            for seq in self[i]:
                seq[j] = value
        else:
            raise TypeError(f"Index of type '{type(i).__name__}' not supported")


[docs]
    @_add_inplace_doc
    def complement(self):
        """
        Complementary sequences, i.e. transcription
        """
        for seq in self:
            seq.complement()
        return self



[docs]
    @_add_inplace_doc
    def translate(self, *args, **kw):
        """
        Translate nucleotide sequences to amino acid sequences, see `~.cane.translate()`.

        The original translate method of the str class can be used via ``BioBasket.str.translate()``.
        """
        for seq in self:
            seq.translate(*args, **kw)
        return self



[docs]
    @_add_inplace_doc
    def reverse(self, *args, **kw):
        """
        Reverse sequences
        """
        for seq in self:
            seq.reverse(*args, **kw)
        return self



[docs]
    def copy(self):
        """
        Return a deep copy of the BioBasket object.
        """
        return copy.deepcopy(self)



[docs]
    def countall(self, rtype='counter', k=1):
        """
        Count letters in sequences

        This method may undergo disrupting changes or it may be removed in a later release.

        :param rtype:
          * ``'counter'`` Return `~collections.Counter` object
          * ``'prob'`` Return dictionary with normalized counts
          * ``'df'`` Return pandas DataFrame object with count, prob and tprob (total prob) fields
        """
        from operator import add
        if rtype == 'df':
            import pandas as pd
            records = [{'id': seq.id, 'word': word, 'count': count}
                       for seq in self for word, count in collections.Counter(
                           seq.data if k == 1 else
                           [reduce(add, kmer) for kmer in zip(*[seq.data[i:] for i in range(k)])]
                           ).items()]
            df = pd.DataFrame.from_records(records)
            df['prob'] =  df.groupby('id', group_keys=False)['count'].apply(lambda c: c/c.sum())
            df['tprob']= df['count'] / df['count'].sum()
            return df
        else:
            counters = [
                collections.Counter(
                    seq.data if k == 1 else
                    [reduce(add, kmer) for kmer in zip(*[seq.data[i:] for i in range(k)])]
                    ) for seq in self]
            counter = reduce(add, counters)
            if rtype == 'counter':
                return counter
            elif rtype == 'prob':
                s = counter.total()
                return {k: v / s for k, v in counter.items()}



[docs]
    def countplot(self, y='word', x='count', hue='id', order=None, plot='show',
                  figsize=None, ax=None, savefigkw={}, **kw):
        """
        Create a plot of letter counts

        This method may undergo disruptive changes, or it may be removed in a later release.

        Under the hood this method uses the pandas and seaborn libraries.
        For a help on most of the arguments, see
        `seaborn.barplot() <https://seaborn.pydata.org/generated/seaborn.barplot.html#seaborn.barplot>`_.
        """
        import matplotlib.pyplot as plt
        import seaborn as sns
        df = self.countall(rtype='df')
        if order in df:
            df = df.sort_values(order, ascending=False)
            order=None
        if ax is None:
            fig = plt.figure(figsize=figsize)
            ax = fig.add_subplot(111)
        ax = sns.barplot(df, y=y, x=x, hue=hue, order=order, ax=ax, **kw)
        if plot == 'show':
            plt.show()
        elif plot == 'return':
            return ax
        else:
            import matplotlib.pyplot as plt
            fig = ax.get_figure()
            fig.savefig(plot, **savefigkw)
            plt.close(fig)



[docs]
    @classmethod
    def frombiopython(cls, obj):
        """
        Create a `.BioBasket` object from a list of biopython_ `~Bio.SeqRecord.SeqRecord` or `~Bio.Seq.Seq` objects.

        :param obj: The object to convert, can also be a `~Bio.Align.MultipleSeqAlignment` object.

        .. note::
            BioPython Features in the ``SeqRecord.features`` attribute are automatically converted.
        """
        from sugar.core._adapter import biopython2seqs
        return biopython2seqs(obj, cls=cls)



[docs]
    @classmethod
    def frombiotite(cls, obj):
        """
        Create a `.BioBasket` object from a list of biotite_ sequence objects.

        :param obj: The object to convert, can also be a biotite ``Alignment`` object.
        """
        from sugar.core._adapter import biotite2seqs
        return biotite2seqs(obj, cls=cls)



[docs]
    @staticmethod
    def fromfmtstr(in_, fmt=None, **kw):
        """
        Read sequences from a string
        """
        from sugar import read
        if not isinstance(in_, bytes):
            in_ = in_.encode('latin1')
        return read(io.BytesIO(in_), fmt=fmt, **kw)



[docs]
    def todict(self):
        """
        Return a dictionary with sequence ids as keys and sequences as values

        .. note::
            This method is different from the `BioBasket.groupby()` method.
            Each value of the dict returned by ``todict()`` is a sequence,
            while each value of the dict returned by ``groupby()`` is a
            BioBasket.
        """
        return {seq.id: seq for seq in self}


    @property
    def d(self):
        """
        Alias for `BioBasket.todict()`
        """
        return self.todict()


[docs]
    def match(self, *args, **kw):
        """
        Search regex and return `.BioMatchList` of matches, see `~.cane.match()`
        """
        from sugar.core.cane import BioMatchList
        matches = BioMatchList()
        for seq in self:
            m = seq.match(*args, **kw)
            if kw.get('matchall'):
                matches.extend(m)
            else:
                matches.append(m)
        return matches



[docs]
    def matchall(self, *args, **kw):
        """
        Search regex and return `.BioMatchList` of all matches, see `~.cane.match()`
        """
        kw['matchall'] = True
        return self.match(*args, **kw)



[docs]
    def find_orfs(self, *args, **kw):
        """
        Find ORFS in sequences, see `~.cane.find_orfs()`
        """
        return reduce(lambda orfs1, orfs2: orfs1 + orfs2,
                      [seq.find_orfs(*args, **kw) for seq in self])



[docs]
    @_add_inplace_doc
    def sort(self, keys=('id',), reverse=False):
        """
        Sort sequences in-place

        :param keys: Tuple of meta keys or functions to use for sorting.
            Can also be a single string or a callable.
            Defaults to sorting by id.
        :param reverse: Use reverse order (default: False)

        :return: Sorted sequences

        .. rubric:: Example:

        >>> from sugar import read
        >>> seqs = read()
        >>> seqs.sort(len)  # doctest: +SKIP
        """
        from sugar.core.cane import _sorted
        self.data = _sorted(self.data, keys=keys, reverse=reverse, attr='meta')
        return self



[docs]
    def groupby(self, keys=('id',), flatten=False):
        """
        Group sequences

        :param keys: Tuple of meta keys or functions to use for grouping.
            Can also be a single string or a callable.
            By default, the method groups only by id.
        :return: Nested dict structure

        .. rubric:: Example:

        >>> from sugar import read
        >>> seqs = read()
        >>> grouped = seqs.groupby()
        """
        from sugar.core.cane import _groupby
        return _groupby(self, keys, attr='meta', flatten=flatten)



[docs]
    def select(self, inplace=False, **kw):
        r"""
        Select sequences

        :param \*\*kw: All kwargs must be of the form
            ``key_op=value``, where op is one of
            the operators from the `operator` module.
            Additionally, the operator ``'in'`` (membership) is supported.
            The different select conditions are combined with
            the *and* operator. If you need *or*, call select twice
            and combine the results with the ``|`` operator, e.g.
            ``seqs.select(...) | seqs.select(...)``
        :param inplace: Whether to modify the original object (default: False)
        :return: Selected sequences

        .. rubric:: Example:

        >>> from sugar import read
        >>> seqs = read()
        >>> seqs2 = seqs.select(len_gt=9500)  # Select sequences with length > 9500
        """
        from sugar.core.cane import _select
        selected = _select(self.data, **kw)
        if inplace:
            self.data = selected
            return self
        else:
            return self.__class__(selected)



[docs]
    def tofmtstr(self, fmt, **kw):
        """
        Write sequences to a string of the specified format, see `~.main.write()`
        """
        return self.write(None, fmt)



[docs]
    def tostr(self, h=19, w=80, wid=19, wlen=4, showgc=True,
              add_hint=False, raw=False, add_header=True):
        """
        Return string with information about sequences, used by ``__str__()`` method
        """
        if raw:
            return '\n'.join(str(seq) for seq in self)
        if len(self) == 0:
            return '0 seqs in basket!' * add_header
        out = [f'{len(self)} seqs in basket'] * add_header
        lenid = min(max([len(seq.id) for seq in self if seq.id], default=-2), wid)+2
        maxwlen = max(len(str(len(seq))) for seq in self)
        if 0 < wlen < 4:
            wlen = max(wlen, min(4, maxwlen))
        elif wlen in (0, None) or wlen > maxwlen:
            wlen = maxwlen
        sgclen = 11 if showgc and any(getattr(seq, 'type') == 'nt' for seq in self) else 0
        for i, seq in enumerate(self):
            if (h in (None, 0) or (h2:=(n:=len(self))-h) <= 0 or
                    not (n) // 2 - (h2+1) // 2 <= i <= (n) // 2 + (h2) // 2):
                id_ = ('' if not seq.id else
                       seq.id if wid in (None, 0) or len(seq.id) <= wid else
                       seq.id[:wid-3] + '...').ljust(lenid)
                data = seq.data if w in (None, 0) or len(seq) <= w-lenid-wlen-3-sgclen else seq.data[:w-lenid-wlen-6-sgclen] + '...'
                if showgc and getattr(seq, 'type') == 'nt':
                    data = data + f'  GC:{100*seq.gc:.2f}%'
                out.append(id_ + _si_format(len(seq), max(4, wlen)).rjust(wlen) + '  ' + data)
            elif i == len(self) // 2:
                out.append('...')
        if add_hint:
            out.append('  customize output with BioBasket.tostr() method')
        return '\n'.join(out)



[docs]
    def tobiopython(self, *, msa=False):
        """
        Convert the BioBasket to a list of biopython_ `~Bio.SeqRecord.SeqRecord` objects

        :param bool msa: Return a biopython `~Bio.Align.MultipleSeqAlignment` object instead of a list.

        Attached ``BioSeq.fts`` features are not converted.
        """
        from sugar.core._adapter import seqs2biopython
        return seqs2biopython(self, msa=msa)



[docs]
    def tobiotite(self, **kw):
        """
        Convert BioBasket to a list of biotite_ `~biotite.sequence.NucleotideSequence` or `~biotite.sequence.ProteinSequence` instance

        :param str type: ``'nt'`` creates a `~biotite.sequence.NucleotideSequence` instance,
            ``'aa'`` creates a `~biotite.sequence.ProteinSequence` instance,
            by default the class is inferred from the sequence itself.
        :param bool msa: Return a biotite ``Alignment`` object instead of a list, default is False
        :param str gap: Gap characters that must be removed from the sequence strings.
        :param bool warn: Wether to warn if gap characters have been removed, default is True,
            not used with ``msa=True``
        """
        from sugar.core._adapter import seqs2biotite
        return seqs2biotite(self, **kw)



[docs]
    def write(self, fname=None, fmt=None, **kw):
        """
        Write sequences to file, see `~.main.write()`
        """
        from sugar._io import write
        return write(self, fname=fname, fmt=fmt, **kw)



[docs]
    def plot_alignment(self, *args, **kw):
        """
        Plot an alignment, see `~.imaging.alignment.plot_alignment()`
        """
        from sugar.imaging import plot_alignment
        return plot_alignment(self, *args, **kw)



[docs]
    def plot_ftsviewer(self, *args, **kw):
        """
        Plot features of the sequences using DNAFeaturesViewer_, see `~.imaging.ftsviewer.plot_ftsviewer()`

        .. note::
            Using `BioSeq <.BioSeq.plot_ftsviewer>` or `.BioBasket.plot_ftsviewer()`
            over `.FeatureList.plot_ftsviewer()` has the advantage,
            that sequence lengths are used automatically.
        """
        from sugar.imaging import plot_ftsviewer
        return plot_ftsviewer(self.fts, *args, seqs=self, **kw)



[docs]
    def merge(self, spacer='', update_fts=False, keys=('id',)):
        data = []
        for group in self.groupby(keys, flatten=True).values():
            seq = group[0]
            for oseq in group[1:]:
                seq.data = seq.data + spacer + oseq.data
                if update_fts:
                    nfts = []
                    for ft in oseq.fts:
                        rel = len(seq) - len(oseq)
                        locs = [Location(l.start + rel, l.stop + rel, l.strand, l.defect) for l in ft.locs]
                        nfts.append(Feature(locs=locs, meta=ft.meta))
                    seq.add_fts(nfts)
            data.append(seq)
        self.data = data
        return self




    # def consensus(self, gap='-'):
    #     n = len(self)
    #     data = [seq.data for seq in self]
    #     cons = []
    #     perc = []
    #     percnongaps = []
    #     for nt in zip(*data):
    #         max_count = 0
    #         nt = str(nt)
    #         num_gaps = nt.count(gap)
    #         for letter in list(set(nt) - {gap}):
    #             count = nt.count(letter)