Source code for sugar.core.fts

# (C) 2024, Tom Eulenfeld, MIT license

"""
Feature related classes `.Feature`, `.FeatureList`, `.Location`, `.Strand`, `.Defect`
"""

# Originally, this file was based on the annotation module from the biotite package.
# Later, it was rewritten. Constants in Defect and Strand classes mostly retained their names.

from copy import deepcopy
import collections
import io
import sys
from enum import IntFlag, StrEnum, auto
from sugar.core.meta import Meta
from sugar.core.util import _add_inplace_doc



[docs]
class Defect(IntFlag):
    """
    Types of location defects

    A location has a defect,
    when the feature is not exactly located between start and stop base
    """
    #: No location defect
    NONE = 0
    #: Part of the feature has been truncated
    #: before the start base
    #: (e.g. by slicing with `FeatureList.slice()`)
    MISS_LEFT = auto()
    #: Part of the feature has been truncated
    #: after or at the stop base
    #: (e.g. by slicing with `FeatureList.slice()`)
    MISS_RIGHT = auto()
    #: The feature starts at an unknown position
    #: before the start base
    BEYOND_LEFT = auto()
    #: The feature stops at an unknown position
    #: after or at the stop base
    BEYOND_RIGHT = auto()
    #: The feature starts at an unknown position
    UNKNOWN_LEFT = auto()
    #: The feature stops at an unknown position
    UNKNOWN_RIGHT = auto()
    #: The position is between two consecutive bases
    BETWEEN_CONSECUTIVE = auto()
    #: The exact position is unknown, but it is at a
    #: single base between the start and stop residue
    UNKNOWN_SINGLE_BETWEEN = auto()

    def _reverse(self):
        """
        Return reversed defect

        i.e. describe the same defect from the view of the reverse strand.
        """
        defect = Defect(self)
        if len((self.MISS_LEFT | self.MISS_RIGHT) & self) == 1:
            defect ^= self.MISS_LEFT | self.MISS_RIGHT
        if len((self.BEYOND_LEFT | self.BEYOND_RIGHT) & self) == 1:
            defect ^= self.BEYOND_LEFT | self.BEYOND_RIGHT
        if len((self.UNKNOWN_LEFT | self.UNKNOWN_RIGHT) & self) == 1:
            defect ^= self.UNKNOWN_LEFT | self.UNKNOWN_RIGHT
        return defect




[docs]
class Strand(StrEnum):
    """
    Types of strand of feature location
    """
    #: The feature is located on the forward strand
    FORWARD = '+'
    #: The feature is located on the reverse strand
    REVERSE = '-'
    #: The feature is not associated with any strand
    NONE = '.'
    #: The strandness of the feature is unknown
    UNKNOWN = '?'

    def _reverse(self):
        """
        Return reversed strand
        """
        return Strand({'+': '-', '-': '+'}.get(self, self))




[docs]
class Location():
    """
    Contiguous position of a feature

    :param start: Start location (zero-based numbering)
    :param stop: Stop location (zero-based numbering), must be larger than start
    :param strand: Strand of the location, converted to `.Strand`
    :param defect: Defect of the location, converted to `.Defect`
    :param meta: Optional metadata of the location
    :raises ValueError: if start is not lower than stop
    """
    def __init__(self, start, stop, strand='+', defect=0, meta=None):
        if not start < stop:
            raise ValueError('start must be lower than stop')
        #: Start location (zero-based numbering)
        self.start = start
        #: Stop location (zero-based numbering)
        self.stop = stop
        # The following three assignments go through the property setters
        self.strand = strand
        self.defect = defect
        self.meta = meta

    def __repr__(self):
        strand = self.strand
        defect = self.defect.value
        return f"Location({self.start}, {self.stop}, strand='{strand}', defect={defect})"

    def __eq__(self, other):
        if not isinstance(other, Location):
            return False
        # Attributes are compared lazily and in this order, so that the
        # metadata is only looked at when everything else matches
        compared = ('start', 'stop', 'strand', 'defect', 'meta')
        return all(getattr(self, name) == getattr(other, name)
                   for name in compared)

    def __hash__(self):
        # The metadata is deliberately not part of the hash
        key = (self.start, self.stop, self.strand, self.defect)
        return hash(key)

    @property
    def meta(self):
        """
        Location can optionally have metadata
        """
        # An empty Meta object is created on first access
        if self._meta is None:
            self._meta = Meta()
        return self._meta

    @meta.setter
    def meta(self, v):
        if v is None:
            self._meta = None
        else:
            self._meta = Meta(v)

    @property
    def range(self):
        """
        Get the range of the location or location tuple

        :returns:
            tuple ``start, stop`` with start and stop location
            (zero-based numbering)
        """
        return (self.start, self.stop)

    @property
    def _stride(self):
        """
        Stride is -1 for the reverse strand, else +1
        """
        if self.strand == '-':
            return -1
        return 1

    def __len__(self):
        first, last = self.start, self.stop
        return last - first

    @property
    def strand(self):
        """Strand of the location"""
        return self._strand

    @strand.setter
    def strand(self, v):
        # Strand() validates the value
        self._strand = Strand(v)

    @property
    def defect(self):
        """Defect of the location"""
        return self._defect

    @defect.setter
    def defect(self, v):
        # Defect() validates the value
        self._defect = Defect(v)

    def _reverse(self, seqlen=0):
        """
        Return reversed location

        :param seqlen: Length of the sequence which the location belongs to,
            the default 0 will return negative start and stop base locations.
        """
        # Mirroring swaps the roles of start and stop
        return Location(
            seqlen - self.stop,
            seqlen - self.start,
            self.strand._reverse(),
            self.defect._reverse(),
            meta=self.meta,
        )

    # The ordering methods below only use the ``range`` attribute of both
    # operands; locations are ordered by start first and by stop second.

    def __lt__(self, other):
        if not isinstance(other, (Location, LocationTuple)):
            msg = f"'<' not supported between instances of '{type(self).__name__}' and '{type(other).__name__}'"
            raise TypeError(msg)
        return tuple(self.range) < tuple(other.range)

    def __le__(self, other):
        if not isinstance(other, (Location, LocationTuple)):
            msg = f"'<=' not supported between instances of '{type(self).__name__}' and '{type(other).__name__}'"
            raise TypeError(msg)
        return tuple(self.range) <= tuple(other.range)

    def __gt__(self, other):
        if not isinstance(other, (Location, LocationTuple)):
            msg = f"'>' not supported between instances of '{type(self).__name__}' and '{type(other).__name__}'"
            raise TypeError(msg)
        return tuple(self.range) > tuple(other.range)

    def __ge__(self, other):
        if not isinstance(other, (Location, LocationTuple)):
            msg = f"'>=' not supported between instances of '{type(self).__name__}' and '{type(other).__name__}'"
            raise TypeError(msg)
        return tuple(self.range) >= tuple(other.range)

    @property
    def mid(self):
        """
        Return the middle position of the location or location tuple
        """
        first, last = self.range
        return (first + last) // 2


[docs]
    def shift(self, offset):
        """
        Shift the location by the given offset in-place

        :param int offset: The offset to shift the location
        :return: The shifted location
        :rtype: `.Location`
        """
        self.start += offset
        self.stop  += offset
        return self



[docs]
    def overlaps(self, other):
        """
        Whether the range of this location (or location tuple) overlaps with the range of other

        :param other: `.Location` or `.LocationTuple`
        :raises TypeError: for any other type
        """
        if not isinstance(other, (Location, LocationTuple)):
            msg = f"{type(self).__name__}.overlaps() not supported for instances of '{type(other).__name__}'"
            raise TypeError(msg)
        start, stop = self.range
        other_start, other_stop = other.range
        # Ranges that merely touch do not overlap
        return start < other_stop and stop > other_start



[docs]
    def overlaplen(self, other):
        """
        Length of the overlap between the range of this object and the range of other

        :param other: `.Location` or `.LocationTuple`
        :return: overlap length, 0 if the ranges do not overlap
        :raises TypeError: for any other type
        """
        if not isinstance(other, (Location, LocationTuple)):
            msg = f"{type(self).__name__}.overlaplen() not supported for instances of '{type(other).__name__}'"
            raise TypeError(msg)
        start, stop = self.range
        other_start, other_stop = other.range
        shared_start = max(start, other_start)
        shared_stop = min(stop, other_stop)
        return max(0, shared_stop - shared_start)



[docs]
    def contains(self, other):
        """
        Whether the location range contains the other location range

        :param other: `.Location` or `.LocationTuple`
        :return: True if the range of other lies completely inside the
            range of this object (equal boundaries count as contained)
        :raises TypeError: for any other type
        """
        if isinstance(other, (Location, LocationTuple)):
            lr1 = self.range
            lr2 = other.range
            return lr1[0] <= lr2[0] and lr2[1] <= lr1[1]
        # The message used to name overlaps() by mistake
        msg = f"{type(self).__name__}.contains() not supported for instances of '{type(other).__name__}'"
        raise TypeError(msg)



[docs]
    def distance(self, other, *, pos='inner', sign=False):
        """
        Distance to other location or location tuple

        Overlapping ranges always have distance 0.

        :param other: `.Location` or `.LocationTuple`
        :param str pos: ``'inner'`` returns the shortest distance between the locations,
            ``'middle'`` returns the distance between the mid locations
        :param bool sign: If set to True, the returned distance is negative
            if the other location has a smaller position.
            Otherwise the distance is always greater than or equal to zero.
        :raises TypeError: if other is neither a location nor a location tuple
        """
        assert pos in ('inner', 'middle')
        if not isinstance(other, (Location, LocationTuple)):
            msg = f"{type(self).__name__}.distance() not supported for instances of '{type(other).__name__}'"
            raise TypeError(msg)
        if self.overlaps(other):
            return 0
        if pos == 'middle':
            signed = other.mid - self.mid
        elif self > other:
            # other lies before self, measure from its stop to our start
            signed = other.stop - self.start
        else:
            # other lies behind self, measure from our stop to its start
            signed = other.start - self.stop
        return signed if sign else abs(signed)




[docs]
class LocationTuple(tuple):
    """
    Tuple of contiguous locations, describing the position of a feature

    :param locs: Iterable of `.Location` objects or of argument tuples
        accepted by `.Location`, e.g. ``(start, stop, strand)``
    :param start,stop,strand: Alternatively, a single location can be
        specified by start, stop and optionally strand.
    :raises ValueError: if both or none of locs and start/stop are given,
        if locs is empty or if the locations are on different strands
    :raises TypeError: if an item cannot be converted to a `.Location`

    The locations are sorted, by descending stop for the reverse strand
    and by ascending start otherwise.
    """
    def __new__(cls, locs=None, start=None, stop=None, strand='+'):
        if start is not None or stop is not None:
            if locs is not None:
                raise ValueError('One of locs or start/stop can be given')
            locs = (Location(start, stop, strand),)
        if locs is None:
            raise ValueError('No location specified')
        try:
            # Build a new list instead of assigning into locs: the input may
            # be an immutable tuple and a list of the caller must not be
            # modified behind its back
            locs = [loc if isinstance(loc, Location) else Location(*loc)
                    for loc in locs]
        except Exception as ex:
            msg = 'LocationTuple needs ot be initialized with a tuple of Locations or tuples'
            raise TypeError(msg) from ex
        if len(locs) == 0:
            raise ValueError('LocationTuple must include at least one location')
        strands = set(loc.strand for loc in locs)
        if len(strands) > 1:
            # Sorted to get a reproducible message
            msg = f'Found multiple strand values in Locations: {" ".join(sorted(strands))}'
            raise ValueError(msg)
        if locs[0].strand == '-':
            locs = sorted(locs, key=lambda loc: loc.stop, reverse=True)
        else:
            locs = sorted(locs, key=lambda loc: loc.start)
        return super().__new__(cls, locs)

    @property
    def start(self):
        """
        Get the start position of location tuple, i.e. the smallest start of its locations
        """
        return min(loc.start for loc in self)

    @property
    def stop(self):
        """
        Get the stop position of location tuple, i.e. the largest stop of its locations
        """
        return max(loc.stop for loc in self)


[docs]
    def shift(self, offset):
        """
        Shift every location of the tuple by ``offset``, in-place

        :param int offset: The offset to shift the locations
        :return: The shifted location tuple (the same object)
        :rtype: `.LocationTuple`
        """
        # The tuple itself is immutable, but its members are shifted in-place
        for member in self:
            member.shift(offset)
        return self


    # Ordering, range queries and distance computation are borrowed from
    # Location. This works because those implementations only rely on the
    # start, stop, range and mid attributes and on the comparison operators,
    # all of which are also available on a LocationTuple.
    __lt__ = Location.__lt__
    __le__ = Location.__le__
    __gt__ = Location.__gt__
    __ge__ = Location.__ge__
    range = Location.range
    mid = Location.mid
    contains = Location.contains
    distance = Location.distance
    overlaplen = Location.overlaplen
    overlaps = Location.overlaps

    def _reverse(self, seqlen=0):
        """
        Return reversed LocationTuple

        A new tuple is built from the reversed member locations.

        :param seqlen: Sequence length, passed on to ``Location._reverse()``
        """
        return LocationTuple([loc._reverse(seqlen=seqlen) for loc in self])




[docs]
class Feature():
    """
    A single feature/annotation

    :param str type: The name of the feature class, e.g. *gene* or *CDS*
    :param list locs:
        A list of feature locations. Usually the list holds a single
        location, but several locations are possible,
        for example in virus genomes (due to frame shifts).
    :param start,stop,strand:
        Instead of specifying the locations, a single location can be given
        by start and stop indices and optionally strand.
    :param dict meta:
        The metadata describing the feature.

    .. note::
        The following metadata attributes are directly accessible as
        attributes of Feature: *type*, *name*, *id* and *seqid*.
        For example, the feature id can be obtained by both `Feature.id`
        and ``Feature.meta.id``.
    """

    def __init__(self, type=None, locs=None, meta=None, **kw):
        self.meta = Meta({} if meta is None else meta)
        # An explicitly given type takes precedence over the type in meta
        if type is not None:
            self.meta.type = type
        self._locs = LocationTuple(locs=locs, **kw)

    @property
    def locs(self):
        """
        `LocationTuple` of feature locations
        """
        return self._locs

    @locs.setter
    def locs(self, value):
        # Any input is normalized to a LocationTuple
        self._locs = LocationTuple(value)

    @property
    def type(self):
        """
        Alias for ``Feature.meta.type``, None if not set
        """
        return self.meta.get('type')

    @type.setter
    def type(self, value):
        self.meta.type = value

    @property
    def id(self):
        """
        Alias for ``Feature.meta.id``, None if not set
        """
        return self.meta.get('id')

    @id.setter
    def id(self, value):
        self.meta.id = value

    @property
    def seqid(self):
        """
        Alias for ``Feature.meta.seqid``, None if not set
        """
        return self.meta.get('seqid')

    @seqid.setter
    def seqid(self, value):
        self.meta.seqid = value

    @property
    def name(self):
        """
        Alias for ``Feature.meta.name``, None if not set
        """
        return self.meta.get('name')

    @name.setter
    def name(self, value):
        self.meta.name = value

    def __repr__(self):
        # The type is shown as first argument, so it is dropped from the
        # copy of the metadata which is displayed
        shown_meta = self.meta.copy()
        shown_meta.pop('type', None)
        locs_repr = ", ".join(repr(loc) for loc in self.locs)
        return f'Feature("{self.type}", [{locs_repr}], meta={shown_meta!r})'

    @property
    def loc(self):
        """
        Access first location
        """
        first, *_rest = self.locs
        return first

    def __eq__(self, other):
        if not isinstance(other, Feature):
            return False
        # Compared lazily in this order: type, locations, metadata
        return all(getattr(self, name) == getattr(other, name)
                   for name in ('type', 'locs', 'meta'))

    def __lt__(self, other):
        if isinstance(other, Feature):
            # Features are ordered by sequence id first, then by location
            own_seqid, other_seqid = self.seqid, other.seqid
            if own_seqid != other_seqid:
                return own_seqid < other_seqid
            return self.locs < other.locs
        if isinstance(other, LocationTuple):
            return self.locs < other
        msg = f"'<' not supported between instances of '{type(self).__name__}' and '{type(other).__name__}'"
        raise TypeError(msg)

    def __len__(self):
        # Length of the whole range, gaps between locations included
        start, stop = self.locs.range
        return stop - start


[docs]
    def contains(self, other):
        """
        Whether the feature location range contains other
        """
        if isinstance(other, Feature):
            return self.locs.contains(other.locs)
        if isinstance(other, (Location, LocationTuple)):
            return self.locs.contains(other)
        msg = f"Feature.contains() not supported for instances of '{type(other).__name__}'"
        raise TypeError(msg)



[docs]
    def distance(self, other, **kw):
        """
        Distance to other location or location tuple, see `LocationTuple.distance()`
        """
        if isinstance(other, Feature):
            return self.locs.distance(other.locs, **kw)
        if isinstance(other, (Location, LocationTuple)):
            return self.locs.distance(other, **kw)
        msg = f"Feature.distance() not supported for instances of '{type(other).__name__}'"
        raise TypeError(msg)



[docs]
    def overlaps(self, other):
        """
        Whether the feature location overlaps with the other
        """
        if isinstance(other, Feature):
            return self.locs.overlaps(other.locs)
        if isinstance(other, (Location, LocationTuple)):
            return self.locs.overlaps(other)
        msg = f"Feature.overlaps() not supported for instances of '{type(other).__name__}'"
        raise TypeError(msg)



[docs]
    def overlaplen(self, other):
        """
        Return overlap length with the other location or location tuple
        """
        if isinstance(other, Feature):
            return self.locs.overlaplen(other.locs)
        if isinstance(other, (Location, LocationTuple)):
            return self.locs.overlaplen(other)
        msg = f"Feature.overlaplen() not supported for instances of '{type(other).__name__}'"
        raise TypeError(msg)



[docs]
    def rc(self, seqlen=0):
        """
        Reverse complement the feature, in-place

        Afterwards the locations of the feature are described from the
        view of the reverse complement strand.

        :param int seqlen: The sequence length, the default of 0 will result in negative
            location indices.
        :return: The feature itself
        """
        reversed_locs = self.locs._reverse(seqlen=seqlen)
        self.locs = reversed_locs
        return self



[docs]
    def write(self, fname=None, fmt=None, **kw):
        """
        Write feature to file, see `~.main.write_fts()`

        The feature is wrapped into a `.FeatureList` with a single item
        and written with ``FeatureList.write()``.
        """
        single = FeatureList([self])
        return single.write(fname=fname, fmt=fmt, **kw)


    # def __hash__(self):
    #     return hash((self.type, self.locs, frozenset(self.meta.items())))


[docs]
    @classmethod
    def frombiopython(cls, obj):
        """
        Create a `.Feature` object from a biopython_ `~Bio.SeqFeature.SeqFeature` object.

        :param obj: The object to convert.

        Location defects are ignored.
        """
        # Imported here, not at module level
        from sugar.core._adapter import biopython2ft
        feature = biopython2ft(obj, cls=cls)
        return feature



[docs]
    def tobiopython(self):
        """
        Convert the feature to a biopython_ `~Bio.SeqFeature.SeqFeature` instance
        """
        # Imported here, not at module level
        from sugar.core._adapter import ft2biopython
        converted = ft2biopython(self)
        return converted



[docs]
    def toftsviewer(self, *, label='default', **kw):
        r"""
        Convert feature to DNAFeaturesViewer_ `~dna_features_viewer.GraphicFeature`

        :param label: The label of the feature,
            may be a str key of the meta dictionary,
            or a function taking the feature and returning the label,
            or the str label itself,
            or None.
            The default uses the ``'name'`` metadata and, if that is missing or empty,
            the feature type.
        :param \*\*kw: All other kwargs are passed to `~dna_features_viewer.GraphicFeature`.
        :raises ImportError: if dna_features_viewer is not installed

        Label, color and hatch can also be set via the ``Feature.meta`` attribute
        with the keys ``'_ftsviewer_label'``, ``'_ftsviewer_color'`` and ``'_ftsviewer_hatch'``,
        these values take precedence over the arguments.
        """
        try:
            from dna_features_viewer import GraphicFeature
        except ImportError as ex:
            raise ImportError('Please install dna_features_viewer to use ftsviewer functionality') from ex
        start, stop = self.locs.range
        strand_codes = {'+': 1, '-': -1, '.': 0, '?': 0}
        strand = strand_codes[self.loc.strand]
        meta = self.meta
        if '_ftsviewer_label' in meta:
            label = meta['_ftsviewer_label']
        elif label == 'default':
            label = meta.get('name') or self.type
        elif isinstance(label, str):
            # Either a metadata key or, if there is no such key, the label itself
            label = meta.get(label, label)
        elif label is not None:
            label = str(label(self))
        for option, meta_key in (('color', '_ftsviewer_color'),
                                 ('hatch', '_ftsviewer_hatch')):
            if meta_key in meta:
                kw[option] = meta[meta_key]
        # Open ends are taken from the first and the last location, respectively
        open_left = Defect.MISS_LEFT in self.loc.defect
        open_right = Defect.MISS_RIGHT in self.locs[-1].defect
        return GraphicFeature(
            start=start, end=stop, strand=strand,
            open_left=open_left,
            open_right=open_right,
            label=label,
            **kw)





[docs]
class FeatureList(collections.UserList):
    def __init__(self, data=None):
        """
        A `FeatureList` is a list of features belonging to a single sequence or to different sequences.

        :param list data: the features, objects with a ``data`` attribute
            (e.g. another `FeatureList`) are unwrapped first
        """
        items = getattr(data, 'data', data)
        super().__init__(items)


[docs]
    @classmethod
    def frompandas(cls, df, ftype=None, one_based=False):
        """
        Convert `pandas.DataFrame` object to `FeatureList`

        :param df: Dataframe with at least start and stop columns.
            The following columns can be used: type, start, stop, len, strand, defect.
            One of start or stop may be replaced by a len column.
            Other columns are stored as metadata.
        :param ftype: If the dataframe has no type column,
            the ``ftype`` column is used instead,
            if it does not exist, ``ftype`` is used directly as type.
        :param one_based: Whether the data uses one-based numbering.
            It will be converted to the zero-based numbering used by sugar.

        :return: created `FeatureList` instance
        """
        def _record2feature(rec):
            # Location columns are removed from the record,
            # everything left over ends up in the feature metadata
            loc = Location(rec.pop('start') - one_based,
                           rec.pop('stop'),
                           strand=rec.pop('strand', '?'),
                           defect=rec.pop('defect', Defect.NONE))
            return Feature(locs=[loc], meta=rec)

        if ftype is not None and 'type' not in df:
            # Work on a copy to leave the dataframe of the caller untouched
            df = df.copy()
            df['type'] = df[ftype] if ftype in df else ftype
        if 'len' in df:
            df = df.copy()
            has_start, has_stop = 'start' in df, 'stop' in df
            if has_start and not has_stop:
                df['stop'] = df['start'] + df['len']
            elif has_stop and not has_start:
                df['start'] = df['stop'] - df['len']
            del df['len']
        return cls([_record2feature(rec) for rec in df.to_dict('records')])



[docs]
    @classmethod
    def frombiopython(cls, obj):
        """
        Create a `.FeatureList` object from a list of biopython_ `~Bio.SeqFeature.SeqFeature` objects.

        :param obj: The object to convert.

        Location defects are ignored.
        """
        # Imported here, not at module level
        from sugar.core._adapter import biopython2fts
        features = biopython2fts(obj, cls=cls)
        return features


    def __str__(self):
        return self.tostr()

    def _repr_pretty_(self, p, cycle):
        if cycle:
            p.text('...')
        else:
            p.text(str(self))

    # Implement all variants of &, |, -, ^
    def __and__(self, other):
        return self.__class__([ft for ft in self if ft in other])

    def __rand__(self, other):
        return self & other

    def __iand__(self, other):
        self.data = [ft for ft in self if ft in other]
        return self

    def __or__(self, other):
        return self + [ft for ft in other if ft not in self]

    def __ror__(self, other):
        return self | other

    def __ior__(self, other):
        self.data += [ft for ft in other if ft not in self]
        return self

    def __sub__(self, other):
        return self.__class__([ft for ft in self if ft not in other])

    def __rsub__(self, other):
        return self.__class__(other) - self

    def __isub__(self, other):
        self.data = [ft for ft in self if ft not in other]
        return self

    def __xor__(self, other):
        return (self | other) - (self & other)

    def __rxor__(self, other):
        return self ^ other

    def __ixor__(self, other):
        self.data = (self ^ other).data
        return self


[docs]
    def tostr(self, raw=False, w=80, wt=12, wl=20, h=80, exclude_fts=()):
        """
        Return string with information about features, used by ``__str__()`` method

        :param raw: If True, return one line per location consisting of
            type, start, stop, strand, name, id and seqid separated by spaces,
            missing name, id or seqid are written as ``'.'``.
            The remaining layout options are not used in this mode.
        :param w: Maximum line width, longer lines are cut and end with ``'...'``,
            a false value disables the cut
        :param wt: Maximum width of the type column
        :param wl: Used to cap the width of the length column,
            see the comment in the code
        :param h: Listing stops before the ``h``-th feature and a line
            ``'... and N more'`` is appended instead,
            a false value disables the limit
        :param exclude_fts: Feature types which are not printed
        :return: The lines joined with newlines
        """
        def _sort_meta_key(m):
            # Sort key for (key, value) metadata items: 'name' first, 'gene'
            # second, then all other keys, keys with leading underscore last
            order = ['name', 'gene']
            try:
                return order.index(m[0])
            except ValueError:
                return len(order) + m[0].startswith('_')
        if raw:
            out = []
            for ft in self:
                if str(getattr(ft, 'type', None)) in exclude_fts:
                    continue
                for l in ft.locs:
                    ftstr = (f'{getattr(ft, "type", ".")} {l.start} {l.stop} {l.strand}'
                             f' {ft.meta.get("name", ".")} {ft.meta.get("id", ".")} {ft.meta.get("seqid", ".")}')
                    out.append(ftstr)
            return '\n'.join(out)
        out = []
        # From here on wt and wl hold the column widths found in the data,
        # the values passed by the caller are kept as upper limits
        wt, wtmax = 0, wt
        wl, wlmax = 0, wl
        wlstart = 0
        wllen = 0
        # First pass: determine the column widths from the listed features
        # (excluded feature types are not skipped in this pass)
        for ift, ft in enumerate(self):
            if h and ift+1 == h:
                break
            wt = min(max(len(str(ft.type)), wt), wtmax)
            wlstart = max(max(len(f'{l.start:_}') for l in ft.locs), wlstart)
            wllen = max(max(len(f'{len(l):_}') for l in ft.locs), wllen)
        # The length column gets what is left of wlmax after the start column,
        # but the cap is never lower than 5 characters
        wllen = min(wllen, max(wlmax-wlstart-3, 5))
        # Second pass: create the lines
        for ift, ft in enumerate(self):
            if h and ift+1 == h:
                out.append(f'... and {len(self)-h+1:_} more')
                break
            t = str(getattr(ft, 'type', None))
            if t in exclude_fts:
                continue
            exclude_types = ('translation', 'type')
            metastr = ';'.join(f'{k}={v}' for k, v in
                               sorted(vars(ft.meta).items(), key=_sort_meta_key)
                               if k not in exclude_types)
            for i, l in enumerate(ft.locs):
                # Start and length use '_' as thousands separator
                locstr = f'{l.start:>{wlstart}_}{l.strand} {len(l):>{wllen}_}'
                # The type is only printed in the line of the first location
                ftstr = f'{t if i == 0 else "":>{wt}} {locstr}'
                if i == 0:
                    # Feature metadata goes to the line of the first location
                    ftstr = ftstr + f'  {metastr}'
                elif l.meta is not None:
                    # Further locations show their own metadata
                    locmetastr = ';'.join(f'{k}={v}' for k, v in
                                          sorted(l.meta.items(), key=_sort_meta_key)
                                          if k not in exclude_types)
                    ftstr = ftstr + f'  {locmetastr}'

                if w and len(ftstr) > w:
                    ftstr = ftstr[:w-3] + '...'
                out.append(ftstr)
        return '\n'.join(out)



[docs]
    def tofmtstr(self, fmt, **kw):
        """
        Return the features as a string of the given format, see `~.main.write_fts()`

        :param fmt: The format
        :param \\*\\*kw: passed on to ``write()``
        """
        # Calls write() with None in place of a file name
        fname = None
        return self.write(fname, fmt, **kw)



[docs]
    def tolists(self, keys='type start stop strand'):
        """
        Return a generator yielding a list for each feature

        :param keys: Parameters from the metadata or location to return,
            ``'len'`` is also allowed,
            can be a string or tuple, defaults to ``'type start stop strand'``

        .. rubric:: Example:

        >>> from sugar import read_fts
        >>> fts = read_fts().select('cDNA_match')
        >>> for record in fts.tolists('type start strand len'):
        ...     print(*record)
        cDNA_match 101888622 - 4245
        cDNA_match 103140200 - 30745
        cDNA_match 103944892 - 7136
        cDNA_match 107859806 - 2392
        """
        if isinstance(keys, str):
            keys = keys.split()
        for ft in self:
            yield [
                ft.loc.strand if k == 'strand' else
                ft.loc.defect if k == 'defect' else
                ft.locs.range[0] if k == 'start' else
                ft.locs.range[0] + 1 if k == 'start1' else
                ft.locs.range[1] if k == 'stop' else
                len(ft) if k == 'len' else
                ft.meta.get(k)
                for k in keys
            ]



[docs]
    def topandas(self, keys='type start stop strand', **kw):
        """
        Return a `pandas.DataFrame` of the features

        :param keys: Parameters from the metadata or location to return,
            ``'len'`` is also allowed,
            can be a string or tuple, defaults to ``'type start stop strand'``.

        .. rubric:: Example:

        >>> from sugar import read_fts
        >>> fts = read_fts().select('cDNA_match')
        >>> df = fts.topandas()  # doctest: +SKIP
        >>> print(df)  # doctest: +SKIP
                type      start      stop   strand
        0  cDNA_match  101888622  101892867      -
        1  cDNA_match  103140200  103170945      -
        2  cDNA_match  103944892  103952028      -
        3  cDNA_match  107859806  107862198      -
        """
        import pandas
        if isinstance(keys, str):
            keys = keys.split()
        kw.setdefault('columns', keys)
        return pandas.DataFrame(self.tolists(keys=keys), **kw)



[docs]
    def get(self, type):
        """
        Return the first feature of given feature type, e.g. ``'cds'``

        :param type: String or list of multiple strings
        """
        type_ = type
        if not isinstance(type_, str):
            type_ = tuple(t.lower() for t in type_)
        for ft in self.data:
            if (isinstance(type_, str) and ft.type.lower() == type_.lower() or
                    isinstance(type_, tuple) and ft.type.lower() in type_):
                return ft



[docs]
    def select(self, type=None, *, inplace=False, strand=None, **kw):
        r"""
        Select features

        Two different operating modi can be used, or both.
        Use the ``type`` argument to select features of one type (use a string)
        or of different types (use a list).
        The type comparison is case-insensitive, features without a type
        never match.

        All other kwargs must be of the form
        ``key_op=value``, where op is one of
        the operators from the `operator` module.
        Additionally, the operator ``'in'`` (membership) is supported.
        The different selection criteria are combined with
        the *and* operator. If you need *or*, call select twice
        and combine the results with ``|`` operator, e.g.
        ``fts.select(...) | fts.select(...)``

        :param type: String or list of multiple strings
        :param inplace: Whether to modify the original object (default: False)
        :param strand: If given, only features whose first location is on
            this strand are selected
        :param \*\*kw: Selection criteria
        :return: Selected features

        .. rubric:: Example:

        >>> from sugar import read_fts
        >>> fts = read_fts()
        >>> fts2 = fts.select('CDS')  # select all CDS fts
        >>> fts3 = fts.select(len_gt=100_000)  # select all fts with length > 100 kB
        """
        from sugar.core.cane import _select
        selected = self.data
        if type is not None:
            # Lower-case the requested type(s) once instead of once per feature
            if isinstance(type, str):
                wanted = (type.lower(),)
            else:
                wanted = tuple(t.lower() for t in type)
            # Feature.type is None for features without type,
            # these must be skipped instead of raising AttributeError
            selected = [
                ft for ft in selected
                if ft.type is not None and ft.type.lower() in wanted]
        if strand is not None:
            selected = [ft for ft in selected if ft.loc.strand == strand]
        selected = _select(selected, **kw)
        if inplace:
            self.data = selected
            return self
        return self.__class__(selected)



[docs]
    def tobiopython(self):
        """
        Convert the features to a list of biopython_ `~Bio.SeqFeature.SeqFeature` objects
        """
        # Imported here, not at module level
        from sugar.core._adapter import fts2biopython
        converted = fts2biopython(self)
        return converted



[docs]
    def todict(self):
        """
        Build a dictionary mapping each feature id to its feature

        If several features share the same id, the feature occurring
        last in the list is the one stored under that id.

        .. note::
            Do not confuse this method with `FeatureList.groupby()`.
            The values of the dict returned by ``todict()`` are single
            features, whereas the values of the dict returned by
            ``groupby()`` are FeatureList objects.

        :return: dict with feature ids as keys and features as values
        """
        mapping = {}
        for feature in self:
            mapping[feature.id] = feature
        return mapping



[docs]
    def toftsviewer(self, *, label='default', colorby='type', color=None,
                            circular=False,
                            seqlen=None, seq=None,
                            first_index=0,
                            **kw):
        r"""
        Convert features to DNAFeaturesViewer_ `~dna_features_viewer.GraphicRecord`

        :param label: The label of the feature,
            may be a str key of the meta dictionary,
            or a function taking the feature and returning the label,
            or the str label itself,
            defaults to ``'name'`` and if that is not present in the metadata, ``'type'``.
        :param colorby: How to define the color of the features, might be any key in the metadata,
            defaults to ``'type'``, but can also be a function taking a Feature and returning an identifier
        :param color: The color of the features,
            this might be a constant color,
            a list of colors, or
            None for the default matplotlib color cycle (the default), or
            a dictionary mapping the feature identifiers to colors.
        :param circular: If True return an instance of `~dna_features_viewer.CircularGraphicRecord` instead
        :param seq: sequence or sequence data, it is converted with ``str()``
            before it is handed to the graphic record;
            None (the default) means that no sequence is attached to the record.
        :param seqlen: length of sequence, defaults to the length of ``seq`` or the stop location of the last feature.
        :param first_index: Passed to the graphic record as ``first_index``,
            it is also subtracted from the largest stop location
            when ``seqlen`` has to be derived from the features (default: 0).
        :param \*\*kw: All other kwargs are passed to
            `~dna_features_viewer.GraphicFeature` or
            `~dna_features_viewer.GraphicRecord` or
            `~dna_features_viewer.CircularGraphicRecord`, respectively.
        :return: Instance of `~dna_features_viewer.GraphicRecord` or
            `~dna_features_viewer.CircularGraphicRecord`
        :raises ImportError: if dna_features_viewer is not installed
        """
        from sugar.core.util import _pop_kws_for_func
        from sugar.imaging.alignment import _get_fts_colordict
        try:
            if circular:
                from dna_features_viewer import CircularGraphicRecord as GR
            else:
                from dna_features_viewer import GraphicRecord as GR
        except ImportError as ex:
            raise ImportError('Please install dna_features_viewer to use ftsviewer functionality') from ex
        # kw2 is handed to the record class, the remaining kw to each feature
        # (presumably the helper moves the kwargs accepted by GR out of kw -- confirm).
        kw2 = _pop_kws_for_func(kw, GR)
        color, colorby = _get_fts_colordict(self, color, colorby)
        gfts = [ft.toftsviewer(label=label, color=color[colorby(ft)], **kw) for ft in self]
        if seqlen is None:
            try:
                seqlen = len(seq)
            except TypeError:
                # No usable sequence given, fall back to the extent of the features
                seqlen = self.loc_range[1] - first_index
        # Do not stringify a missing sequence: str(None) would hand the
        # literal text 'None' to the record as if it were a real sequence.
        sequence = None if seq is None else str(seq)
        return GR(sequence_length=seqlen, sequence=sequence, features=gfts, first_index=first_index, **kw2)



[docs]
    def plot_ftsviewer(self, *args, **kw):
        """
        Plot features using DNAFeaturesViewer_, see `~.imaging.ftsviewer.plot_ftsviewer()`

        All positional and keyword arguments are forwarded unchanged,
        the feature list itself is passed as first argument.

        :return: The return value of the called plotting function
        """
        # Imported inside the method, not at module level
        from sugar.imaging import plot_ftsviewer
        return plot_ftsviewer(self, *args, **kw)



[docs]
    def groupby(self, keys=('seqid',), flatten=False):
        """
        Group features

        :param keys: Tuple of meta keys or functions to use for grouping.
            Can also be a single string or a callable.
            By default, the method groups by seqid only.
        :param flatten: Passed on unchanged to the grouping helper (default: False).
            NOTE(review): presumably selects a flat instead of a nested
            result -- confirm against ``sugar.core.cane._groupby``.
        :return: Nested dict structure

        .. rubric:: Example:

        >>> from sugar import read_fts
        >>> fts = read_fts()
        >>> fts.groupby('type')  # doctest: +SKIP
        """
        # The generic grouping helper is called with attr='meta'
        # (presumably so that string keys are looked up in the feature metadata).
        from sugar.core.cane import _groupby
        return _groupby(self, keys, attr='meta', flatten=flatten)


    @property
    def d(self):
        """
        Alias for ``FeatureList.todict()``

        The dictionary is rebuilt by calling ``todict()`` on every access.

        :return: The return value of ``todict()``
        """
        return self.todict()

    @property
    def loc_range(self):
        """
        Overall location range spanned by the features in the list

        :returns:
            tuple ``start, stop`` holding the smallest start and the
            largest stop location of all features (zero-based numbering),
            or None if the list is empty
        """
        if len(self) > 0:
            # One (start, stop) pair per feature
            ranges = [ft.locs.range for ft in self]
            starts = [start for start, _ in ranges]
            stops = [stop for _, stop in ranges]
            return min(starts), max(stops)
        return None


[docs]
    def write(self, fname=None, fmt=None, **kw):
        """
        Write features to file, see `~.main.write_fts()`

        :param fname: Passed on as ``fname`` to the writing function (default: None)
        :param fmt: Passed on as ``fmt`` to the writing function (default: None)
        :param kw: All other kwargs are passed on unchanged
        :return: The return value of the writing function
        """
        # Imported inside the method, not at module level
        from sugar._io import write_fts
        return write_fts(self, fname=fname, fmt=fmt, **kw)



[docs]
    def slice(self, start, stop, *, rel=0):
        """
        Return the parts of all features located between start and stop

        Locations which do not intersect the requested range are dropped,
        the remaining locations are clipped to the range.
        Clipped locations are flagged with `Defect.MISS_LEFT` and/or
        `Defect.MISS_RIGHT`. Features left without any location are
        not part of the result. The meta objects of locations and features
        are handed over to the new objects as they are.

        :param start,stop: start and stop locations,
            None stands for an open boundary
        :param int rel: Subtracts the value ``rel`` from each location position.
        :return: New feature list with the clipped features
        """
        lower = -sys.maxsize if start is None else start
        upper = sys.maxsize if stop is None else stop
        sliced_fts = []
        for ft in self:
            kept_locs = []
            for loc in ft.locs:
                # Skip locations lying completely outside of the window
                if loc.start >= upper or loc.stop <= lower:
                    continue
                defect = loc.defect
                if loc.start < lower:
                    defect |= Defect.MISS_LEFT
                if loc.stop > upper:
                    defect |= Defect.MISS_RIGHT
                clipped_start = max(lower, loc.start) - rel
                clipped_stop = min(upper, loc.stop) - rel
                kept_locs.append(
                    Location(clipped_start, clipped_stop, loc.strand, defect, meta=loc.meta))
            if kept_locs:
                sliced_fts.append(Feature(locs=kept_locs, meta=ft.meta))
        return self.__class__(sliced_fts)



[docs]
    @_add_inplace_doc
    def rc(self, seqlen=0):
        """
        Reverse complement all features, see `Feature.rc()`

        :param int seqlen: The sequence length, the default 0 will result in negative
            location positions.
        :return: The feature list itself
        """
        # The return value of Feature.rc() is discarded, so this relies on
        # Feature.rc() modifying the feature in place.
        for ft in self:
            ft.rc(seqlen=seqlen)
        return self



[docs]
    @_add_inplace_doc
    def sort(self, keys=None, reverse=False):
        """
        Sort features in-place

        :param keys: Tuple of meta keys or functions to use for sorting.
            None can be used as a single value or in the tuple
            to apply the default sorting by position.
            Can also be a single string or a callable.
        :param reverse: Use reverse order (default: False)

        :return: Sorted features (the feature list itself)

        .. rubric:: Example:

        >>> from sugar import read_fts
        >>> fts = read_fts()
        >>> fts.sort(('type', len))  # doctest: +SKIP
        """
        from sugar.core.cane import _sorted
        # The result of the sorting helper replaces the underlying data list
        self.data = _sorted(self.data, keys=keys, reverse=reverse, attr='meta')
        return self



[docs]
    def copy(self):
        """
        Return a deep copy of the object

        The copy is created with `copy.deepcopy()`.

        :return: The deep copy
        """
        return deepcopy(self)



[docs]
    def remove_duplicates(self):
        """
        Remove duplicate features

        Features are compared with ``==``. Of several equal features
        the one occurring first in the list is kept, the order of the
        remaining features is preserved.

        The previous underlying data list is not touched,
        it is replaced by a new list.

        :return: The feature list itself
        """
        unique = []
        for ft in self.data:
            # Membership test instead of a set, features are compared by equality
            if ft not in unique:
                unique.append(ft)
        self.data = unique
        return self



[docs]
    def remove_nested(self):
        """
        Remove nested features, i.e. features contained within others

        Duplicates are removed first. Afterwards the features are checked
        from the longest to the shortest one, each feature which is
        contained in a longer (or equally long, but earlier) kept feature
        is dropped. The order of the remaining features is preserved.

        :return: The feature list itself
        """
        self.remove_duplicates()
        longest_first = sorted(self.data, key=len, reverse=True)
        nested = []
        for pos, outer in enumerate(longest_first):
            # A feature already marked as nested does not remove others
            if outer in nested:
                continue
            for inner in longest_first[pos + 1:]:
                if inner not in nested and outer.contains(inner):
                    nested.append(inner)
        self.data = list(filter(lambda ft: ft not in nested, self.data))
        return self



[docs]
    def remove_overlapping(self):
        """
        Remove overlapping features

        Duplicates are removed first. Of two overlapping features
        the one located earlier in the list wins, the later one is dropped.
        Sort the list beforehand to control which features survive,
        e.g. use ``fts.sort(len, reverse=True)`` to prefer longer features.

        :return: The feature list itself
        """
        self.remove_duplicates()
        dropped = []
        for pos, keeper in enumerate(self):
            # Only features which are kept themselves can remove later ones
            if keeper not in dropped:
                for later in self[pos + 1:]:
                    if later not in dropped and keeper.overlaps(later):
                        dropped.append(later)
        survivors = []
        for ft in self.data:
            if ft not in dropped:
                survivors.append(ft)
        self.data = survivors
        return self