/*
Copyright (c) 2006-2018 Elmar Pruesse

This file is part of SINA.
SINA is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your
option) any later version.

SINA is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with SINA. If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify SINA, or any covered work, by linking or combining it
with components of ARB (or a modified version of that software),
containing parts covered by the terms of the
ARB-public-library-license, the licensors of SINA grant you additional
permission to convey the resulting work. Corresponding Source for a
non-source form of such a combination shall include the source code
for the parts of ARB used as well as that of the covered work.
*/ #ifndef _ALIGNED_BASE_H_ #define _ALIGNED_BASE_H_ #include #include #include #include namespace sina { enum base_types { BASE_A=0, BASE_G=1, BASE_C=2, BASE_TU=3, BASE_MAX=4, BASE_LC=4 }; enum base_types_bitmask { BASEM_A=1<(__builtin_ctz(_data & 0xf)); } base_iupac& complement() { _data = ((_data & BASEM_G) << (BASE_C - BASE_G)) | ((_data & BASEM_C) >> (BASE_C - BASE_G)) | ((_data & BASEM_A) << (BASE_TU - BASE_A)) | ((_data & BASEM_TU) >> (BASE_TU - BASE_A)) | (_data & BASEM_LC); return *this; } base_iupac& setLowerCase() { _data |= BASEM_LC; return *this; } base_iupac& setUpperCase() { _data &= ~BASEM_LC; return *this; } bool isLowerCase() const { return (_data & BASEM_LC) != 0; } int ambig_order() const { return count_bits(_data & 0xf); } bool is_ambig() const { return ambig_order() > 1; } bool has_A() const { return (_data & BASEM_A) != 0; } bool has_G() const { return (_data & BASEM_G) != 0; } bool has_C() const { return (_data & BASEM_C) != 0; } bool has_TU() const { return (_data & BASEM_TU) != 0; } bool comp(const base_iupac& rhs) const{ //optimistic, match if IUPAC suggests match possible return (0xf & _data & rhs._data) != 0; //this would compute average //return 1.f - (2.f/count_bits(_data | rhs._data)) * // count_bits(_data & rhs._data) ; } bool comp_pessimistic(const base_iupac& rhs) const { return !is_ambig() && (0xf & _data) == (0xf & rhs._data); } bool comp_exact(const base_iupac& rhs) const { return (0xf & _data) == (0xf & rhs._data); } struct matrix_type { float v[BASE_MAX*BASE_MAX]; }; // this does an IUPAC aware comparison using the given scoring matrix float comp(const base_iupac& rhs, const matrix_type& m) const { float rval = 0; int c = 0; // use some mean bit magic to do a real fast // log2(x) with x in 0,1,2,4 const unsigned int t = 0x30002010; // given this array we can compute log2(x) as follows: // log2(x) = (t >> (x*4)) & 0xF // (shift x nibbles to right, mask everything but leftmost nibble) // "a &= a-1" unsets least significant bit 
// "a & -a" unsets all but least significant bit for(value_type lm = _data & 0xf; lm != 0u; lm &= lm-1) { unsigned char l = (t >> (((lm & -lm)-1)*4)) & 0xF; for (value_type rm = rhs._data & 0xf; rm != 0u; rm &= rm-1) { unsigned char r = (t >> (((rm &-rm)-1)*4)) & 0xF; rval += m.v[l*BASE_MAX+r]; c++; } } #if 0 // enable to verify above code float tval = 0; for (int l=0; l class aligned : public T { public: using idx_type = unsigned int; using base_type = T; aligned(const idx_type& pos=0, const base_type& base='-') : T(base), _idx(pos) {} base_type getBase() const { return *this;} void setBase(const T& b) { T::operator=(b); } idx_type getPosition() const { return _idx;} void setPosition(const idx_type& i) { _idx=i; } bool operator<(const aligned &rhs) const { return _idx < rhs._idx; } float getWeight() const { return 1; } private: idx_type _idx; friend struct aligned_base_reverse_position; }; /** * This implementation of "aligned" is more compact (4 vs 8 bytes), * but slower (needs shifts to access position) and definitely * hacky. 
It casts around happily assuming that the byte order is * little endian that and sizeof(T)==1 */ class position { protected: unsigned char data[3]{0,0,0}; }; template class aligned_compact : public position, public T { public: using idx_type = uint32_t; using base_type = T; aligned_compact(idx_type pos=0, unsigned char value='-') : T(value) { setPosition(pos); } base_type getBase() const { return *this;} void setBase(const T& b) { T::operator=(b); } idx_type getPosition() const { return (*(uint32_t*)this & 0xFFFFFF); } void setPosition(idx_type pos) { *(uint32_t*)&data = (pos & 0xFFFFFF) | (T::getData() << 24) ; } bool operator<(const aligned_compact &rhs) const { return getPosition() < rhs.getPosition(); } float getWeight() const { return 1; } private: friend struct aligned_base_reverse_position; }; #define COMPACT_ALIGNED_BASE #ifndef COMPACT_ALIGNED_BASE using aligned_base = aligned; #else using aligned_base = aligned_compact; #endif }// namespace sina namespace std { template<> struct numeric_limits : numeric_limits {}; } // namespace std std::ostream& operator<<(std::ostream& out, sina::aligned_base ab); #endif // _ALIGNED_BASE_H_ /* Local Variables: mode:c++ c-file-style:"stroustrup" c-file-offsets:((innamespace . 0)(inline-open . 0)(case-label . +)) indent-tabs-mode:nil fill-column:99 End: */ // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=99 :