# (C) 2024, Tom Eulenfeld, MIT license
"""
Sequence related classes, `.BioSeq`, `.BioBasket`
"""
import collections
import collections.abc
import copy
from functools import reduce
import io
import math
import sys
from warnings import warn
from sugar.data import CODES
from sugar.core.fts import Feature, FeatureList, Location, LocationTuple
from sugar.core.meta import Attr, Meta
from sugar.core.util import _add_inplace_doc
CODES_INV = {frozenset(v): k for k, v in CODES.items()}
COMPLEMENT = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', '.': '.', '-': '-'}
COMPLEMENT_ALL = {c: CODES_INV[frozenset(COMPLEMENT[nt] for nt in nts)] for c, nts in CODES.items()}
COMPLEMENT_TRANS = str.maketrans(COMPLEMENT_ALL)
class _Sliceable_GetItem():
def __init__(self, obj, method='_getitem', **kw):
self.kw = kw
self.call = getattr(obj, method)
def __getitem__(self, i):
return self.call(i, **self.kw)
[docs]
class _BioSeqStr():
"""
Helper class to hold all string methods in the `BioSeq.str` namespace.
The methods modify the data in-place, if applicable,
which is different from the behavior of the original string methods.
See `str` for documentation of the methods.
:meta public:
"""
def __init__(self, parent):
self.__parent = parent
[docs]
def center(self, width, *args):
self.__parent.data = self.__parent.data.center(width, *args)
return self.__parent
[docs]
def count(self, sub, start=0, end=sys.maxsize):
return self.__parent.data.count(str(sub), start, end)
[docs]
def removeprefix(self, prefix, /):
self.__parent.data = self.__parent.data.removeprefix(str(prefix))
return self.__parent
[docs]
def removesuffix(self, suffix, /):
self.__parent.data = self.__parent.data.removesuffix(str(suffix))
return self.__parent
[docs]
def encode(self, encoding='utf-8', errors='strict'):
encoding = 'utf-8' if encoding is None else encoding
errors = 'strict' if errors is None else errors
return self.__parent.data.encode(encoding, errors)
[docs]
def endswith(self, suffix, start=0, end=sys.maxsize):
return self.__parent.data.endswith(str(suffix), start, end)
[docs]
def find(self, sub, start=0, end=sys.maxsize):
return self.__parent.data.find(str(sub), start, end)
[docs]
def index(self, sub, start=0, end=sys.maxsize):
return self.__parent.data.index(str(sub), start, end)
[docs]
def isalpha(self):
return self.__parent.data.isalpha()
[docs]
def isascii(self):
return self.__parent.data.isascii()
[docs]
def islower(self):
return self.__parent.data.islower()
[docs]
def isupper(self):
return self.__parent.data.isupper()
[docs]
def ljust(self, width, *args):
"""
The ``ljust()`` and ``rjust()`` methods can be used to fill up an alignment with gaps.
Example: ``seqs.ljust(500, '-')``
"""
self.__parent.data = self.__parent.data.ljust(width, *args)
return self.__parent
[docs]
def lower(self):
self.__parent.data = self.__parent.data.lower()
return self.__parent
[docs]
def lstrip(self, chars=None):
if chars is not None:
chars = str(chars)
self.__parent.data = self.__parent.data.lstrip(chars)
return self.__parent
[docs]
@staticmethod
def maketrans(*args):
return str.maketrans(*args)
[docs]
def replace(self, old, new, maxsplit=-1):
self.__parent.data = self.__parent.data.replace(
str(old), str(new), maxsplit)
return self.__parent
[docs]
def rfind(self, sub, start=0, end=sys.maxsize):
return self.__parent.data.rfind(str(sub), start, end)
[docs]
def rindex(self, sub, start=0, end=sys.maxsize):
return self.__parent.data.rindex(str(sub), start, end)
[docs]
def rjust(self, width, *args):
self.__parent.data = self.__parent.data.rjust(width, *args)
return self.__parent
[docs]
def rstrip(self, chars=None):
if chars is not None:
chars = str(chars)
self.__parent.data = self.__parent.data.rstrip(chars)
return self.__parent
[docs]
def split(self, sep=None, maxsplit=-1):
if sep is not None:
sep = str(sep)
return self.__parent.data.split(sep, maxsplit)
[docs]
def rsplit(self, sep=None, maxsplit=-1):
if sep is not None:
sep = str(sep)
return self.__parent.data.rsplit(sep, maxsplit)
[docs]
def splitlines(self, keepends=False):
return self.__parent.data.splitlines(keepends)
[docs]
def startswith(self, prefix, start=0, end=sys.maxsize):
return self.__parent.data.startswith(str(prefix), start, end)
[docs]
def strip(self, chars=None):
if chars is not None:
chars = str(chars)
self.__parent.data = self.__parent.data.strip(chars)
return self.__parent
[docs]
def swapcase(self):
self.__parent.data = self.__parent.data.swapcase()
return self.__parent
[docs]
def translate(self, *args):
self.__parent.data = self.__parent.data.translate(*args)
return self.__parent
[docs]
def upper(self):
self.__parent.data = self.__parent.data.upper()
return self.__parent
[docs]
class _BioBasketStr():
"""
Helper class to move all string methods into the `BioBasket.str` namespace
It calls the corresponding `BioSeq.str` method under the hood and returns
either the modified `BioBasket` object or a list of results.
:meta public:
"""
def __init__(self, parent):
self.__parent = parent
def __getattr__(self, name):
def method(*args, **kw):
results = [
getattr(seq.str, name)(*args, **kw)
for seq in self.__parent
]
if name in (
'center', 'remove_prefix', 'ljust', 'lower',
'lstrip', 'replace', 'rjust', 'rstrip',
'strip', 'swapcase', 'translate', 'upper'
):
return self.__parent
else:
return results
return method
[docs]
class BioSeq():
"""
Class holding sequence data and metadata, exposing bioinformatics methods.
Most methods work in-place by default, but return the BioSeq object again.
Therefore, method chaining can be used.
"""
def __init__(self, data, id='', meta=None, type=None):
#: Property holding the data string
self.data = str(data).upper()
if hasattr(data, 'meta'):
meta = data.meta
elif 'meta' in data:
meta = data['meta']
elif meta is None:
meta = {}
#: Property holding metadata
self.meta = Meta(meta)
if id or 'id' not in self.meta:
self.meta.id = id
assert type in (None, 'nt', 'aa')
if type is None:
codes = set(CODES) | {'U'}
type = 'nt' if all(nb in codes for nb in self.data) else 'aa'
if type == 'nt' and 'U' in data:
from warnings import warn
warn('Found U in nucleotide sequence, '
'some methods will not work as expected')
#: type of the sequence, either ``'nt'`` or ``'aa'``
self.type = type
def _repr_pretty_(self, p, cycle):
if cycle:
p.text('...')
else:
p.text(str(self))
def __str__(self):
return str(self.data)
def __repr__(self):
metastr = ', '.join(f'{prop}={repr(val)}' for prop, val in vars(self.meta).items())
return f'{type(self).__name__}({repr(self.data)}, meta=dict({metastr}))'
def __eq__(self, string):
if isinstance(string, BioSeq):
return self.data == string.data and self.meta == string.meta
return self.data == string
def __lt__(self, other):
if isinstance(other, BioSeq):
return self.meta.get('id', '') < other.meta.get('id', '')
msg = f"'<' not supported between instances of '{type(self).__name__}' and '{type(other).__name__}'"
raise TypeError(msg)
def __len__(self):
return len(self.data)
def __setitem__(self, index, value):
l = list(self.data)
l[index] = str(value)
self.data = ''.join(l)
def __add__(self, other):
if isinstance(other, BioSeq) and self.meta != other.meta:
warn('Join two BioSeq objects with different meta data')
return self.__class__(self.data + str(other), meta=self.meta)
def __iadd__(self, other):
if isinstance(other, BioSeq) and self.meta != other.meta:
warn('Join two BioSeq objects with different meta data')
self.data = self.data + str(other)
return self
def __radd__(self, other):
return self.__class__(str(other) + self.data, meta=self.meta)
@property
def str(self):
"""
Namespace holding all available string methods,
see `_BioSeqStr` for available methods
and `str` for documentation of the methods
.. rubric:: Example:
>>> seq = read()[0]
>>> seq.str.find('ATG') # Use string method
30
"""
return _BioSeqStr(self)
@property
def id(self):
"""Alias for ``BioSeq.meta.id``"""
return self.meta.id
@id.setter
def id(self, value):
self.meta.id = value
@property
def fts(self):
"""
Alias for ``BioSeq.meta.fts``
The fts object holds all feature metadata.
It is an instance of `.FeatureList`.
"""
return self.meta.setdefault('fts', FeatureList())
@fts.setter
def fts(self, value):
self.meta.fts = FeatureList(value)
for ft in self.meta.fts:
if ft.seqid != self.id:
warn('Feature seqid and sequence id mismatch')
[docs]
def add_fts(self, fts):
"""
Add some features to the feature list.
If you want to set all features, use the `BioSeq.fts` attribute.
:param fts: features to add
"""
self.fts = self.fts + FeatureList(fts)
self.fts.sort()
@property
def gc(self):
"""
GC content of the sequence
"""
GC = self.str.count('G') + self.str.count('C')
AT = self.str.count('A') + self.str.count('T') + self.str.count('U')
if GC + AT > 0:
return GC / (GC + AT)
else:
return 0
_add_inplace_doc
[docs]
def rc(self, update_fts=False):
"""
Reverse complement, alias for ``BioSeq.reverse().complement()``
"""
self.reverse().complement()
if update_fts:
self.fts = self.fts.rc(seqlen=len(self))
return self
def __getitem__(self, index):
return self._getitem(index)
[docs]
def sl(self, **kw):
"""
Method that allows you to slice the `BioSeq` object with non-default options.
If you want to use the default options, you can slice the BioSeq object directly.
For non-default options, slice the sliceable object returned by this method.
:param bool inplace:
Not only will the subsequence be returned,
but the original sequence will be modified in-place
(default: False).
:param str gap:
Gaps of the given characters are taken into account
when slicing the sequence
(default: gaps are not taken into account)
.. rubric:: Slicing options:
The slice specifies which part of the sequence is returned, and
is defined inside the square brackets ``[]``
The following types are supported.
int,slice
The location is specified by int or slice
`.Location`
specified by location
`.Feature`
specified by feature
str
Position of the first feature of the given type, e.g. ``'cds'``
will return the sequence with the first coding sequence.
.. rubric:: Example:
>>> from sugar import read
>>> seq = read()[0]
>>> print(seq[:5]) # use direct slicing for default options
ACCTG
>>> print(seq[4])
G
>>> print(seq['cds'][:3])
ATG
>>> print(seq.sl(inplace=True, gap='-')[:5:2]) # non-default options
ACG
>>> print(seq) # has been modified in-place
ACG
"""
return _Sliceable_GetItem(self, **kw)
[docs]
def slindex(self, gap=None):
"""
Method that translates an index to account for gaps
.. rubric:: Example:
>>> from sugar import BioSeq
>>> seq = BioSeq('ATG---GGA')
>>> print(seq)
ATG---GGA
>>> print(seq[1:5])
TG--
>>> print(seq.sl(gap='-')[1:5])
TG---GG
>>> print(seq.slindex(gap='-')[1:5])
slice(1, 8, None)
>>> print(seq[seq.slindex(gap='-')[1:5]])
TG---GG
"""
return _Sliceable_GetItem(self, method='_getindex', gap=gap)
def _slice_locs(self, locs, splitter=None, filler=None, gap=None, update_fts=False):
# Merge the sequences corresponding to the ordered locations
sub_seqs = []
prev_loc = None
for i, loc in enumerate(locs):
add_seq = self.sl(gap=gap)[loc.start:loc.stop]
if loc.strand == '-':
add_seq = add_seq.rc()
if filler is not None and prev_loc is not None:
if loc.strand == '-':
num = prev_loc.start - loc.stop
else:
num = loc.start - prev_loc.stop
if num > 0:
sub_seqs.append(num * filler)
if splitter is not None and prev_loc is not None:
sub_seqs.append(splitter)
sub_seqs.append(add_seq.data)
prev_loc = loc
sub_seq = BioSeq(''.join(sub_seqs), meta=self.meta.copy())
if update_fts:
if len(locs) > 1:
raise ValueError('update_fts is only allowed for locs wit a single Location. Sorry.')
start, stop = locs.range
fts = FeatureList()
for loc in locs:
fts.extend(self.fts.slice(loc.start, loc.stop, rel=start))
if locs[0].strand == '-':
fts = fts.rc(seqlen=stop-start)
sub_seq.fts = fts
return sub_seq
def _getindex(self, index, gap=None):
if not isinstance(index, (int, slice)):
if isinstance(index, str):
ft = self.fts.get(index)
if ft is None:
raise ValueError(f'Feature of type {index} not found')
index = ft
if isinstance(index, Location):
index = LocationTuple([Location])
elif isinstance(index, Feature):
index = index.locs
if isinstance(index, LocationTuple):
start, stop = index.range
return self.slindex(gap=gap)[start:stop]
else:
raise TypeError(f"Index of type '{type(index).__name__}' not supported")
# from bisect import bisect
nogaps = [i for i, nt in enumerate(self.data) if nt not in gap]
adj = lambda i: nogaps[i] if i is not None and i < len(nogaps) else None
adj_stop = lambda i: nogaps[i-1]+1 if i is not None and 0 < i < len(nogaps) else 0 if i == 0 else None
if isinstance(index, int):
index = adj(index)
elif isinstance(index, slice):
index = slice(adj(index.start), adj_stop(index.stop), index.step)
return index
def _getitem(self, index, inplace=False, gap=None, update_fts=False, **kw):
"""
Slice the sequence and return a subsequence
This is the method which is called if you slice with ``BioSeq[]`` syntax
or with ``BioSeq.sl()[]`` syntax
If you want to use non-default options call this method directly,
or by the `BioSeq.sl` attribute.
"""
# TODO: doc for splitter and filler
if not isinstance(index, (int, slice)):
if isinstance(index, str):
ft = self.fts.get(index)
if ft is None:
raise ValueError(f'Feature of type {index} not found')
index = ft
if isinstance(index, Location):
index = LocationTuple([index])
elif isinstance(index, Feature):
index = index.locs
if isinstance(index, LocationTuple):
subseq = self._slice_locs(index, gap=gap, update_fts=update_fts, **kw)
else:
raise TypeError(f"Index of type '{type(index).__name__}' not supported")
else:
subseq = self.__class__(
self.data[index if gap is None else self._getindex(index, gap=gap)],
meta=self.meta
)
if update_fts:
if isinstance(index, int):
start, stop = index, index + 1
elif isinstance(index, slice):
start, stop = index.start, index.stop
if index.step not in (None, 1):
raise ValueError('update_fts for slices only supported with step==1')
subseq.fts = self.fts.slice(start, stop, rel=start or 0)
if inplace:
self.data = subseq.data
return subseq
[docs]
@_add_inplace_doc
def complement(self):
"""
Complementary sequence, i.e. transcription
"""
if 'U' in self.data:
self.str.replace('U', 'T').str.translate(COMPLEMENT_TRANS).str.replace('T', 'U')
else:
self.str.translate(COMPLEMENT_TRANS)
return self
[docs]
def countall(self, *args, **kw):
return BioBasket([self]).countall(*args, **kw)
[docs]
def countplot(self, *args, hue=None, **kw):
return BioBasket([self]).countplot(*args, hue=hue, **kw)
[docs]
def copy(self):
"""
Return a deep copy of the object
"""
return copy.deepcopy(self)
[docs]
@classmethod
def frombiopython(cls, obj):
"""
Create a `.BioSeq` object from a biopython_ `~Bio.SeqRecord.SeqRecord` or `~Bio.Seq.Seq` object.
:param obj: The object to convert.
.. note::
BioPython Features in the ``SeqRecord.features`` attribute are automatically converted.
"""
from sugar.core._adapter import biopython2seq
return biopython2seq(obj, cls=cls)
[docs]
@classmethod
def frombiotite(cls, obj):
"""
Create a `.BioSeq` object from a biotite_ sequence object.
:param obj: The object to convert.
"""
from sugar.core._adapter import biotite2seq
return biotite2seq(obj, cls=cls)
[docs]
def match(self, *args, **kw):
"""
Search regex and return match, see ``~.cane.match()``
"""
from sugar.core.cane import match as _match
return _match(self, *args, **kw)
[docs]
def matchall(self, *args, **kw):
"""
Search regex and return `.BioMatchList` with all matches, see `~.cane.match()`
"""
kw['matchall'] = True
return self.match(*args, **kw)
[docs]
def find_orfs(self, *args, **kw):
"""
Find ORFS in the sequence, see `~.cane.find_orfs()`
"""
from sugar.core.cane import find_orfs
return find_orfs(self, *args, **kw)
[docs]
def plot_ftsviewer(self, *args, **kw):
"""
Plot features of the sequence using DNAFeaturesViewer_, see `~.imaging.ftsviewer.plot_ftsviewer()`
.. note::
Using `BioSeq <.BioSeq.plot_ftsviewer>` or `.BioBasket.plot_ftsviewer()`
over `.FeatureList.plot_ftsviewer()` has the advantage,
that sequence lengths are used automatically.
"""
return BioBasket([self]).plot_ftsviewer(*args, **kw)
[docs]
def tostr(self, **kw):
"""
Return a nice string, see `BioBasket.tostr()`
"""
kw.setdefault('add_header', False)
return BioBasket([self]).tostr(**kw)
[docs]
def tofmtstr(self, fmt, **kw):
"""
Write object to a string of given format, see `~.main.write()`
"""
return BioBasket([self]).tofmtstr(fmt, **kw)
[docs]
def tobiopython(self):
"""
Convert BioSeq to biopython_ `~Bio.SeqRecord.SeqRecord` instance
Attached ``BioSeq.fts`` features are automatically converted.
"""
from sugar.core._adapter import seq2biopython
return seq2biopython(self)
[docs]
def tobiotite(self, **kw):
"""
Convert BioSeq to biotite_ `~biotite.sequence.NucleotideSequence` or `~biotite.sequence.ProteinSequence` instance
:param str type: ``'nt'`` creates a `~biotite.sequence.NucleotideSequence` instance,
``'aa'`` creates a `~biotite.sequence.ProteinSequence` instance,
by default the class is inferred from the sequence itself.
:param str gap: Gap characters that must be removed from the sequence string.
:param bool warn: Whether to warn if gap characters have been removed, default is True.
"""
from sugar.core._adapter import seq2biotite
return seq2biotite(self, **kw)
[docs]
def toftsviewer(self, **kw):
r"""
Convert features of this sequence to DNAFeaturesViewer_ `~dna_features_viewer.GraphicRecord`
See `.FeatureList.toftsviewer`.
"""
return self.fts.toftsviewer(seq=self, **kw)
[docs]
@_add_inplace_doc
def reverse(self):
"""
Reverse the sequence
"""
self.data = self.data[::-1]
return self
[docs]
@_add_inplace_doc
def translate(self, *args, update_fts=False, **kw):
"""
Translate nucleotide sequence to amino acid sequence, see `~.cane.translate()`.
The original translate method of the str class can be used via ``BioBasket.str.translate()``.
"""
from sugar.core.cane import translate
self.data = translate(self.data, *args, **kw)
self.type = 'aa'
if update_fts:
fts = self.fts.slice(0, len(self) * 3)
for ft in fts:
for loc in ft.locs:
loc.start = loc.start // 3
loc.stop = loc.stop // 3
self.fts = fts
return self
[docs]
def write(self, fname=None, fmt=None, **kw):
"""
Write sequence to file, see `~.main.write()`
"""
return BioBasket([self]).write(fname=fname, fmt=fmt, **kw)
def _si_format(v, l=4):
if v == 0:
return '0'
l2 = int(math.log10(v)) + 1
if l2 > l:
si = {1: 'k', 2: 'M', 3: 'G', 4: 'T'}
j = (3 + l2 - l) // 3
s = si.get(j, '?')
v = v / 10 ** (3*j)
# p = min(max(0, l - (l2 - 3*j) - 1), 1)
p = 1 if l2 % 3 == l % 3 else 0
return f'{v:.{p}f}{s}'
else:
return str(v)
[docs]
class BioBasket(collections.UserList):
"""
Class holding a list of `BioSeq` objects
The BioBasket object can be used like a list.
It has useful bioinformatics methods attached to it.
The list itself is stored in the ``data`` property.
The BioBasket object may also have a metadata
attribute.
"""
def __init__(self, data=None, meta=None):
if isinstance(data, (str, BioSeq)):
warn(f'{self.__class__.__name__} object initialized with a sequence. '
'To hide this warning initialize with a list of sequences.')
data = [data]
if data is None:
data = []
if hasattr(data, 'meta'):
meta = data.meta
elif 'meta' in data:
meta = data['meta']
elif meta is None:
meta = {}
super().__init__(data)
# FAKE, just for documenting tha data property
#: Property holding the list of sequences
self.data = self.data
#: Property holding metadata
self.meta = Meta(meta)
def __eq__(self, other):
if isinstance(other, BioBasket):
return self.data == other.data and self.meta == other.meta
return self.data == other
# Implement all variants of &, |, -, ^
def __and__(self, other):
return self.__class__([seq for seq in self if seq in other])
def __rand__(self, other):
return self & other
def __iand__(self, other):
self.data = [seq for seq in self if seq in other]
return self
def __or__(self, other):
return self + [seq for seq in other if seq not in self]
def __ror__(self, other):
return self | other
def __ior__(self, other):
self.data += [seq for seq in other if seq not in self]
return self
def __sub__(self, other):
return self.__class__([seq for seq in self if seq not in other])
def __rsub__(self, other):
return self.__class__(other) - self
def __isub__(self, other):
self.data = [seq for seq in self if seq not in other]
return self
def __xor__(self, other):
return (self | other) - (self & other)
def __rxor__(self, other):
return self ^ other
def __ixor__(self, other):
self.data = (self ^ other).data
return self
@property
def str(self):
"""
Namespace holding all available string methods.
The `BioBasket.str` methods call the corresponding `BioSeq.str` methods under the hood
and return either the modified `BioBasket` object or a list of results.
See `_BioSeqStr` for available methods
and `str` for method documentation.
.. rubric:: Example:
>>> seqs = read()
>>> seqs.str.find('ATG') # Use string method
[30, 12]
"""
return _BioBasketStr(self)
@property
def ids(self):
"""List of sequence ids"""
return [seq.meta.id for seq in self]
@property
def fts(self):
"""
`.FeatureList` of containing features of all sequences
Can also be used as setter.
Code example: ``seqs.fts = new_fts``.
"""
fts = [ft for seq in self for ft in seq.fts]
return FeatureList(fts)
@fts.setter
def fts(self, value):
fts = FeatureList(value).groupby('seqid')
for seq in self:
if seq.id in fts:
seq.fts = fts.pop(seq.id)
else:
try:
del seq.meta.fts
except KeyError:
pass
if len(fts) > 0:
missing_ids = ', '.join(fts.keys())
warn(f'Features for seqids {missing_ids} could not be '
'attached to any sequence')
[docs]
def add_fts(self, fts):
"""
Add some features to the feature list of the corresponding sequences.
If you want to set all features, use the `BioBasket.fts` attribute.
:param fts: features to add
"""
fts = FeatureList(fts).groupby('seqid')
for seq in self:
if seq.id in fts:
seq.fts = seq.fts + fts.pop(seq.id)
seq.fts.sort()
if len(fts) > 0:
missing_ids = ', '.join(fts.keys())
warn(f'Features for seqids {missing_ids} could not be '
'attached to any sequence')
[docs]
@_add_inplace_doc
def rc(self, **kw):
"""
Reverse complement, alias for ``BioBasket.reverse().complement()``
"""
for seq in self:
seq.rc(**kw)
return self
def __str__(self):
return self.tostr(add_hint=True)
def _repr_pretty_(self, p, cycle):
if cycle:
p.text('...')
else:
p.text(str(self))
def __repr__(self):
metastr = ', '.join(f'{prop}={repr(val)}' for prop, val in vars(self.meta).items())
return f'{type(self).__name__}({super().__repr__()}, meta=dict({metastr}))'
def __getitem__(self, i):
return self._getitem(i)
[docs]
def sl(self, **kw):
r"""
Method that allows you to slice the `BioBasket` object with non-default options.
If you want to use the default options, you can slice the BioBasket object directly.
For non-default options, slice the sliceable object returned by this method.
:param \*\*kw:
All kwargs are documented in `BioSeq.sl()`.
.. rubric:: Slice options:
The slice specifies which part of the sequence(s) are returned and
is defined inside the square brackets ``[]``
The following options are supported.
int
Returns a `BioSeq` from the basket
slice
Returns a new `BioBasket` object with a subset of the sequences
str,feature,location
Returns a new `BioBasket` object with updated sequences inside, see `BioSeq.sl()`
(int, object)
Returns a `BioSeq` from the basket and slices it with the object, see `BioSeq.sl()`
(slice, object)
Returns a new `BioBasket` object with a subset of the sequences which are replaced
by subsequences according to `BioSeq.sl()`
.. rubric:: Example:
>>> from sugar import read
>>> seqs = read()
>>> print(seqs[:2, 5:10])
2 seqs in basket
AB047639 5 CCCCT ...
AB677533 5 CCCCC ...
>>> print(seqs[:2, 'cds'][:, 0:3])
2 seqs in basket
AB047639 3 ATG ...
AB677533 3 ATG ...
"""
return _Sliceable_GetItem(self, **kw)
def _getitem(self, i, **kw):
"""
Slice sequences
This is the method which is called if you slice with ``BioBasket[]`` syntax.
If you want to use non-default options call this method directly,
or by the `BioBasket.sl` attribute.
"""
if isinstance(i, int):
return self.data[i]
elif isinstance(i, slice):
seqs = self.__class__(self.data[i], meta=self.meta)
elif isinstance(i, (Feature, FeatureList)):
seqs = self.__class__(self.data, meta=self.meta)
if isinstance(i, Feature):
seqs.data = [seq._getitem(i, **kw) for seq in seqs.data if seq.id == i.seqid]
else:
seqs.data = [seq._getitem(ft, **kw) for ft in i for seq in seqs.data if seq.id == ft.seqid]
elif isinstance(i, (str, LocationTuple, Location)):
seqs = self.__class__(self.data, meta=self.meta)
seqs.data = [seq._getitem(i, **kw) for seq in seqs.data]
elif len(i) == 2:
i, j = i
if isinstance(i, int):
return self.data[i]._getitem(j, **kw)
elif isinstance(i, slice):
seqs = self.__class__(self.data[i], meta=self.meta)
seqs.data = [seq._getitem(j, **kw) for seq in seqs.data]
else:
raise TypeError('First dimension of 2D index must be of type int or slice')
else:
raise TypeError(f"Index of type '{type(i).__name__}' not supported")
return seqs
def __setitem__(self, i, value):
if isinstance(i, (int, slice)):
if isinstance(value, (str, BioSeq)):
value = BioSeq(value)
else: # assume its list or BioBasket or similar
value = [BioSeq(v) for v in value]
self.data[i] = value
elif len(i) == 2:
i, j = i
for seq in self[i]:
seq[j] = value
else:
raise TypeError(f"Index of type '{type(i).__name__}' not supported")
[docs]
@_add_inplace_doc
def complement(self):
"""
Complementary sequences, i.e. transcription
"""
for seq in self:
seq.complement()
return self
[docs]
@_add_inplace_doc
def translate(self, *args, **kw):
"""
Translate nucleotide sequences to amino acid sequences, see `~.cane.translate()`.
The original translate method of the str class can be used via ``BioBasket.str.translate()``.
"""
for seq in self:
seq.translate(*args, **kw)
return self
[docs]
@_add_inplace_doc
def reverse(self, *args, **kw):
"""
Reverse sequences
"""
for seq in self:
seq.reverse(*args, **kw)
return self
[docs]
def copy(self):
"""
Return a deep copy of the BioBasket object.
"""
return copy.deepcopy(self)
[docs]
def countall(self, rtype='counter', k=1):
"""
Count letters in sequences
This method may undergo disrupting changes or it may be removed in a later release.
:param rtype:
* ``'counter'`` Return `~collections.Counter` object
* ``'prob'`` Return dictionary with normalized counts
* ``'df'`` Return pandas DataFrame object with count, prob and tprob (total prob) fields
"""
from operator import add
if rtype == 'df':
import pandas as pd
records = [{'id': seq.id, 'word': word, 'count': count}
for seq in self for word, count in collections.Counter(
seq.data if k == 1 else
[reduce(add, kmer) for kmer in zip(*[seq.data[i:] for i in range(k)])]
).items()]
df = pd.DataFrame.from_records(records)
df['prob'] = df.groupby('id', group_keys=False)['count'].apply(lambda c: c/c.sum())
df['tprob']= df['count'] / df['count'].sum()
return df
else:
counters = [
collections.Counter(
seq.data if k == 1 else
[reduce(add, kmer) for kmer in zip(*[seq.data[i:] for i in range(k)])]
) for seq in self]
counter = reduce(add, counters)
if rtype == 'counter':
return counter
elif rtype == 'prob':
s = counter.total()
return {k: v / s for k, v in counter.items()}
[docs]
def countplot(self, y='word', x='count', hue='id', order=None, plot='show',
figsize=None, ax=None, savefigkw={}, **kw):
"""
Create a plot of letter counts
This method may undergo disruptive changes, or it may be removed in a later release.
Under the hood this method uses the pandas and seaborn libraries.
For a help on most of the arguments, see
`seaborn.barplot() <https://seaborn.pydata.org/generated/seaborn.barplot.html#seaborn.barplot>`_.
"""
import matplotlib.pyplot as plt
import seaborn as sns
df = self.countall(rtype='df')
if order in df:
df = df.sort_values(order, ascending=False)
order=None
if ax is None:
fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(111)
ax = sns.barplot(df, y=y, x=x, hue=hue, order=order, ax=ax, **kw)
if plot == 'show':
plt.show()
elif plot == 'return':
return ax
else:
import matplotlib.pyplot as plt
fig = ax.get_figure()
fig.savefig(plot, **savefigkw)
plt.close(fig)
[docs]
@classmethod
def frombiopython(cls, obj):
"""
Create a `.BioBasket` object from a list of biopython_ `~Bio.SeqRecord.SeqRecord` or `~Bio.Seq.Seq` objects.
:param obj: The object to convert, can also be a `~Bio.Align.MultipleSeqAlignment` object.
.. note::
BioPython Features in the ``SeqRecord.features`` attribute are automatically converted.
"""
from sugar.core._adapter import biopython2seqs
return biopython2seqs(obj, cls=cls)
[docs]
@classmethod
def frombiotite(cls, obj):
"""
Create a `.BioBasket` object from a list of biotite_ sequence objects.
:param obj: The object to convert, can also be a biotite ``Alignment`` object.
"""
from sugar.core._adapter import biotite2seqs
return biotite2seqs(obj, cls=cls)
[docs]
@staticmethod
def fromfmtstr(in_, fmt=None, **kw):
"""
Read sequences from a string
"""
from sugar import read
if not isinstance(in_, bytes):
in_ = in_.encode('latin1')
return read(io.BytesIO(in_), fmt=fmt, **kw)
[docs]
def todict(self):
"""
Return a dictionary with sequence ids as keys and sequences as values
.. note::
This method is different from the `BioBasket.groupby()` method.
Each value of the dict returned by ``todict()`` is a sequence,
while each value of the dict returned by ``groupby()`` is a
BioBasket.
"""
return {seq.id: seq for seq in self}
@property
def d(self):
"""
Alias for `BioBasket.todict()`
"""
return self.todict()
[docs]
def match(self, *args, **kw):
"""
Search regex and return `.BioMatchList` of matches, see `~.cane.match()`
"""
from sugar.core.cane import BioMatchList
matches = BioMatchList()
for seq in self:
m = seq.match(*args, **kw)
if kw.get('matchall'):
matches.extend(m)
else:
matches.append(m)
return matches
[docs]
def matchall(self, *args, **kw):
"""
Search regex and return `.BioMatchList` of all matches, see `~.cane.match()`
"""
kw['matchall'] = True
return self.match(*args, **kw)
[docs]
def find_orfs(self, *args, **kw):
"""
Find ORFS in sequences, see `~.cane.find_orfs()`
"""
return reduce(lambda orfs1, orfs2: orfs1 + orfs2,
[seq.find_orfs(*args, **kw) for seq in self])
[docs]
@_add_inplace_doc
def sort(self, keys=('id',), reverse=False):
"""
Sort sequences in-place
:param keys: Tuple of meta keys or functions to use for sorting.
Can also be a single string or a callable.
Defaults to sorting by id.
:param reverse: Use reverse order (default: False)
:return: Sorted sequences
.. rubric:: Example:
>>> from sugar import read
>>> seqs = read()
>>> seqs.sort(len) # doctest: +SKIP
"""
from sugar.core.cane import _sorted
self.data = _sorted(self.data, keys=keys, reverse=reverse, attr='meta')
return self
[docs]
def groupby(self, keys=('id',), flatten=False):
"""
Group sequences
:param keys: Tuple of meta keys or functions to use for grouping.
Can also be a single string or a callable.
By default, the method groups only by id.
:return: Nested dict structure
.. rubric:: Example:
>>> from sugar import read
>>> seqs = read()
>>> grouped = seqs.groupby()
"""
from sugar.core.cane import _groupby
return _groupby(self, keys, attr='meta', flatten=flatten)
[docs]
def select(self, inplace=False, **kw):
r"""
Select sequences
:param \*\*kw: All kwargs must be of the form
``key_op=value``, where op is one of
the operators from the `operator` module.
Additionally, the operator ``'in'`` (membership) is supported.
The different select conditions are combined with
the *and* operator. If you need *or*, call select twice
and combine the results with the ``|`` operator, e.g.
``seqs.select(...) | seqs.select(...)``
:param inplace: Whether to modify the original object (default: False)
:return: Selected sequences
.. rubric:: Example:
>>> from sugar import read
>>> seqs = read()
>>> seqs2 = seqs.select(len_gt=9500) # Select sequences with length > 9500
"""
from sugar.core.cane import _select
selected = _select(self.data, **kw)
if inplace:
self.data = selected
return self
else:
return self.__class__(selected)
[docs]
def tofmtstr(self, fmt, **kw):
"""
Write sequences to a string of the specified format, see `~.main.write()`
"""
return self.write(None, fmt)
[docs]
def tostr(self, h=19, w=80, wid=19, wlen=4, showgc=True,
add_hint=False, raw=False, add_header=True):
"""
Return string with information about sequences, used by ``__str__()`` method
"""
if raw:
return '\n'.join(str(seq) for seq in self)
if len(self) == 0:
return '0 seqs in basket!' * add_header
out = [f'{len(self)} seqs in basket'] * add_header
lenid = min(max([len(seq.id) for seq in self if seq.id], default=-2), wid)+2
maxwlen = max(len(str(len(seq))) for seq in self)
if 0 < wlen < 4:
wlen = max(wlen, min(4, maxwlen))
elif wlen in (0, None) or wlen > maxwlen:
wlen = maxwlen
sgclen = 11 if showgc and any(getattr(seq, 'type') == 'nt' for seq in self) else 0
for i, seq in enumerate(self):
if (h in (None, 0) or (h2:=(n:=len(self))-h) <= 0 or
not (n) // 2 - (h2+1) // 2 <= i <= (n) // 2 + (h2) // 2):
id_ = ('' if not seq.id else
seq.id if wid in (None, 0) or len(seq.id) <= wid else
seq.id[:wid-3] + '...').ljust(lenid)
data = seq.data if w in (None, 0) or len(seq) <= w-lenid-wlen-3-sgclen else seq.data[:w-lenid-wlen-6-sgclen] + '...'
if showgc and getattr(seq, 'type') == 'nt':
data = data + f' GC:{100*seq.gc:.2f}%'
out.append(id_ + _si_format(len(seq), max(4, wlen)).rjust(wlen) + ' ' + data)
elif i == len(self) // 2:
out.append('...')
if add_hint:
out.append(' customize output with BioBasket.tostr() method')
return '\n'.join(out)
[docs]
def tobiopython(self, *, msa=False):
"""
Convert the BioBasket to a list of biopython_ `~Bio.SeqRecord.SeqRecord` objects
:param bool msa: Return a biopython `~Bio.Align.MultipleSeqAlignment` object instead of a list.
Attached ``BioSeq.fts`` features are not converted.
"""
from sugar.core._adapter import seqs2biopython
return seqs2biopython(self, msa=msa)
[docs]
def tobiotite(self, **kw):
"""
Convert BioBasket to a list of biotite_ `~biotite.sequence.NucleotideSequence` or `~biotite.sequence.ProteinSequence` instance
:param str type: ``'nt'`` creates a `~biotite.sequence.NucleotideSequence` instance,
``'aa'`` creates a `~biotite.sequence.ProteinSequence` instance,
by default the class is inferred from the sequence itself.
:param bool msa: Return a biotite ``Alignment`` object instead of a list, default is False
:param str gap: Gap characters that must be removed from the sequence strings.
:param bool warn: Wether to warn if gap characters have been removed, default is True,
not used with ``msa=True``
"""
from sugar.core._adapter import seqs2biotite
return seqs2biotite(self, **kw)
[docs]
def write(self, fname=None, fmt=None, **kw):
"""
Write sequences to file, see `~.main.write()`
"""
from sugar._io import write
return write(self, fname=fname, fmt=fmt, **kw)
[docs]
def plot_alignment(self, *args, **kw):
"""
Plot an alignment, see `~.imaging.alignment.plot_alignment()`
"""
from sugar.imaging import plot_alignment
return plot_alignment(self, *args, **kw)
[docs]
def plot_ftsviewer(self, *args, **kw):
"""
Plot features of the sequences using DNAFeaturesViewer_, see `~.imaging.ftsviewer.plot_ftsviewer()`
.. note::
Using `BioSeq <.BioSeq.plot_ftsviewer>` or `.BioBasket.plot_ftsviewer()`
over `.FeatureList.plot_ftsviewer()` has the advantage,
that sequence lengths are used automatically.
"""
from sugar.imaging import plot_ftsviewer
return plot_ftsviewer(self.fts, *args, seqs=self, **kw)
[docs]
def merge(self, spacer='', update_fts=False, keys=('id',)):
data = []
for group in self.groupby(keys, flatten=True).values():
seq = group[0]
for oseq in group[1:]:
seq.data = seq.data + spacer + oseq.data
if update_fts:
nfts = []
for ft in oseq.fts:
rel = len(seq) - len(oseq)
locs = [Location(l.start + rel, l.stop + rel, l.strand, l.defect) for l in ft.locs]
nfts.append(Feature(locs=locs, meta=ft.meta))
seq.add_fts(nfts)
data.append(seq)
self.data = data
return self
# def consensus(self, gap='-'):
# n = len(self)
# data = [seq.data for seq in self]
# cons = []
# perc = []
# percnongaps = []
# for nt in zip(*data):
# max_count = 0
# nt = str(nt)
# num_gaps = nt.count(gap)
# for letter in list(set(nt) - {gap}):
# count = nt.count(letter)