#ifndef SEQ_VECTOR__HPP
#define SEQ_VECTOR__HPP

/*  $Id: seq_vector.hpp 62074 2014-03-12 16:29:48Z vasilche $
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Author: Aleksey Grichenko, Michael Kimelman, Eugene Vasilchenko
*
* File Description:
*   Sequence data container for object manager
*
*/

#include <objmgr/bioseq_handle.hpp>
#include <objmgr/scope.hpp>
#include <objmgr/seq_map.hpp>
#include <objmgr/seq_vector_ci.hpp>
#include <objects/seq/Seq_literal.hpp>

BEGIN_NCBI_SCOPE

/** @addtogroup ObjectManagerSequenceRep
 *
 * @{
 */

class CRandom;

BEGIN_SCOPE(objects)

class CScope;
class CSeq_loc;
class CSeqMap;
class CSeqVector_CI;

/////////////////////////////////////////////////////////////////////////////
///
///  CSeqVector --
///
///  Provide sequence data in the selected coding.
///
///  The vector reports its length through size()/empty(), gives random
///  access through operator[], iteration through begin()/end() (which
///  return CSeqVector_CI), and range extraction through GetSeqData().
///
///  Thread-safety note: operator[] and IsInGap() go through one cached
///  iterator (m_Iterator) that is repositioned on every call, so they must
///  not be used concurrently unless the caller locks GetMutex().

class NCBI_XOBJMGR_EXPORT CSeqVector : public CObject, public CSeqVectorTypes
{
public:
    /// Coding selector type, shared with CBioseq_Handle.
    typedef CBioseq_Handle::EVectorCoding EVectorCoding;
    /// Iterator type returned by begin() and end().
    typedef CSeqVector_CI const_iterator;
    /// Type of the mutex returned by GetMutex().
    typedef CMutex TMutex;
    /// Guard type associated with TMutex.
    typedef TMutex::TWriteLockGuard TMutexGuard;

    /// Default constructor.
    CSeqVector(void);
    /// Construct from a bioseq handle.
    /// In every constructor below that takes them, 'coding' defaults to
    /// CBioseq_Handle::eCoding_Ncbi and 'strand' to eNa_strand_unknown.
    explicit
    CSeqVector(const CBioseq_Handle& bioseq,
               EVectorCoding coding = CBioseq_Handle::eCoding_Ncbi,
               ENa_strand strand = eNa_strand_unknown);
    /// Construct from a sequence map and a scope.
    CSeqVector(const CSeqMap& seqMap, CScope& scope,
               EVectorCoding coding = CBioseq_Handle::eCoding_Ncbi,
               ENa_strand strand = eNa_strand_unknown);
    /// Construct from a sequence map and a TSE handle.
    CSeqVector(const CSeqMap& seqMap, const CTSE_Handle& top_tse,
               EVectorCoding coding = CBioseq_Handle::eCoding_Ncbi,
               ENa_strand strand = eNa_strand_unknown);
    /// Construct from a Seq-loc and a scope.
    CSeqVector(const CSeq_loc& loc, CScope& scope,
               EVectorCoding coding = CBioseq_Handle::eCoding_Ncbi,
               ENa_strand strand = eNa_strand_unknown);
    /// Construct from a Seq-loc and a TSE handle.
    CSeqVector(const CSeq_loc& loc, const CTSE_Handle& top_tse,
               EVectorCoding coding = CBioseq_Handle::eCoding_Ncbi,
               ENa_strand strand = eNa_strand_unknown);
    /// Construct directly from a CBioseq object; 'scope' is optional and
    /// defaults to a null pointer.
    /// Note: this constructor is not 'explicit', so a CBioseq can convert
    /// implicitly to a CSeqVector.
    CSeqVector(const CBioseq& bioseq,
               CScope* scope = 0,
               EVectorCoding coding = CBioseq_Handle::eCoding_Ncbi,
               ENa_strand strand = eNa_strand_unknown);
    /// Copy constructor.
    CSeqVector(const CSeqVector& vec);

    virtual ~CSeqVector(void);

    /// Copy assignment.
    CSeqVector& operator= (const CSeqVector& vec);

    /// true if size() is zero
    bool empty(void) const;
    /// Number of residues in the vector (value of m_Size)
    TSeqPos size(void) const;

    /// Get mutex for a few non-MT-safe methods to make them MT-safe at a cost
    /// of performance.
    TMutex& GetMutex(void) const;

    /// 0-based array of residues
    /// Note: this method is not MT-safe,
    /// do not call it in parallel with any other method, even an MT-safe one.
    /// It will be MT-safe to call this method after locking GetMutex().
    TResidue operator[] (TSeqPos pos) const;

    /// true if sequence at 0-based position 'pos' has gap
    /// Note: this method is not MT-safe,
    /// do not call it in parallel with any other method, even an MT-safe one.
    /// It will be MT-safe to call this method after locking GetMutex().
    bool IsInGap(TSeqPos pos) const;

    /// returns number of gap symbols ahead including base at position 'pos'
    /// returns 0 if the position is not in gap
    TSeqPos GetGapSizeForward(TSeqPos pos) const;

    /// returns gap Seq-literal object ref
    /// returns null if it's not a gap or an unspecified gap
    CConstRef<CSeq_literal> GetGapSeq_literal(TSeqPos pos) const;

    /// Check if the sequence data is available for the interval [start, stop).
    bool CanGetRange(TSeqPos start, TSeqPos stop) const;
    /// Iterator overload: forwards the iterators' positions (GetPos()) to
    /// the position-based CanGetRange().
    bool CanGetRange(const const_iterator& start,
                     const const_iterator& stop) const;

    /// Fill the buffer string with the sequence data for the interval
    /// [start, stop).
    void GetSeqData(TSeqPos start, TSeqPos stop, string& buffer) const;
    /// Iterator overload: forwards the iterators' positions (GetPos()) to
    /// the position-based GetSeqData().
    void GetSeqData(const const_iterator& start,
                    const const_iterator& stop,
                    string& buffer) const;
    /// Fill the buffer with sequence data for [start, stop) in packed form.
    /// Unlike GetSeqData() this method is non-const.
    /// NOTE(review): the exact packing per coding, and the meaning of the
    /// default stop = kInvalidSeqPos (presumably "up to the end of the
    /// sequence"), are defined in the implementation file -- confirm there.
    void GetPackedSeqData(string& buffer,
                          TSeqPos start = 0,
                          TSeqPos stop = kInvalidSeqPos);

    /// Molecule type, as used by CSeq_inst.
    typedef CSeq_inst::TMol TMol;

    /// Molecule type of the sequence (value of m_Mol)
    TMol GetSequenceType(void) const;
    /// true if CSeq_inst::IsAa() holds for GetSequenceType()
    bool IsProtein(void) const;
    /// true if CSeq_inst::IsNa() holds for GetSequenceType()
    bool IsNucleotide(void) const;

    /// Scope associated with this vector
    CScope& GetScope(void) const;
    /// Sequence map this vector reads from
    const CSeqMap& GetSeqMap(void) const;
    /// Currently selected strand (value of m_Strand)
    ENa_strand GetStrand(void) const;
    /// Select the strand
    void SetStrand(ENa_strand strand);

    /// Target sequence coding. CSeq_data::e_not_set -- do not
    /// convert sequence (use GetCoding() to check the real coding).
    TCoding GetCoding(void) const;
    void SetCoding(TCoding coding);
    /// Set coding to either Iupacaa or Iupacna depending on molecule type
    void SetIupacCoding(void);
    /// Set coding to either Ncbi8aa or Ncbi8na depending on molecule type
    void SetNcbiCoding(void);
    /// Set coding to either Iupac or Ncbi8xx
    void SetCoding(EVectorCoding coding);

    /// Return gap symbol corresponding to the selected coding
    TResidue GetGapChar(ECaseConversion case_cvt = eCaseConversion_none) const;

    /// Iterator positioned at 0
    const_iterator begin(void) const;
    /// Iterator positioned at size()
    const_iterator end(void) const;

    /// Randomization of ambiguities and gaps in ncbi2na coding
    /// NOTE(review): the overloads differ in where the randomness comes from
    /// (default, numeric seed, caller's CRandom, or a ready-made randomizer
    /// object); the details live in the implementation file.
    void SetRandomizeAmbiguities(void);
    void SetRandomizeAmbiguities(Uint4 seed);
    void SetRandomizeAmbiguities(CRandom& random_gen);
    void SetRandomizeAmbiguities(CRef<INcbi2naRandomizer> randomizer);
    /// NOTE(review): presumably turns the randomization off again -- confirm.
    void SetNoAmbiguities(void);

private:

    friend class CBioseq_Handle;
    friend class CSeqVector_CI;

    // NOTE(review): presumably initializes m_Mol -- confirm in the .cpp file
    void x_InitSequenceType(void);

    // Return the cached iterator positioned at 'pos', creating it through
    // x_CreateIterator() when no iterator is cached yet.
    // this internal method is not MT-safe and must be guarded if necessary
    CSeqVector_CI& x_GetIterator(TSeqPos pos) const;
    // this internal method is not MT-safe and must be guarded if necessary
    CSeqVector_CI* x_CreateIterator(TSeqPos pos) const;

    // NOTE(review): presumably drops the cached iterator (m_Iterator)
    void x_ResetIterator(void) const;

    void x_InitRandomizer(CRandom& random_gen);

    // Helpers for packed output, one per packing width.
    // NOTE(review): presumably called from GetPackedSeqData() depending on
    // the current coding -- confirm in the .cpp file.
    void x_GetPacked8SeqData(string& dst_str,
                             TSeqPos src_pos, TSeqPos src_end);
    void x_GetPacked4naSeqData(string& dst_str,
                               TSeqPos src_pos, TSeqPos src_end);
    void x_GetPacked2naSeqData(string& dst_str,
                               TSeqPos src_pos, TSeqPos src_end);

    CHeapScope               m_Scope;       // returned by GetScope()
    CConstRef<CSeqMap>       m_SeqMap;      // returned by GetSeqMap()
    CTSE_Handle              m_TSE;
    TSeqPos                  m_Size;        // returned by size()
    TMol                     m_Mol;         // returned by GetSequenceType()
    ENa_strand               m_Strand;      // returned by GetStrand()
    TCoding                  m_Coding;      // returned by GetCoding()
    CRef<INcbi2naRandomizer> m_Randomizer;

    // Both members are mutable so that const methods (GetMutex(),
    // operator[], IsInGap()) can use them.
    mutable TMutex                  m_IteratorMutex;  // returned by GetMutex()
    mutable AutoPtr<CSeqVector_CI>  m_Iterator;       // cache for x_GetIterator()
};


/////////////////////////////////////////////////////////////////////////////
///
///  CNcbi2naRandomizer --
///
///  Implementation of the INcbi2naRandomizer interface that is built from
///  a CRandom generator and keeps its lookup tables as plain char arrays.
///

class NCBI_XOBJMGR_EXPORT CNcbi2naRandomizer : public INcbi2naRandomizer
{
public:
    // Construct the randomizer from the supplied random number generator.
    // NOTE(review): an older comment here described a "seed == 0" convention,
    // but this constructor takes a CRandom&, not a seed -- confirm the
    // intended contract against the implementation.
    CNcbi2naRandomizer(CRandom& gen);
    ~CNcbi2naRandomizer(void);

    void RandomizeData(char* buffer,  // buffer to be randomized
                       size_t count,  // number of bases in the buffer
                       TSeqPos pos);  // sequence pos of the buffer

private:
    enum {
        kRandomizerPosMask = 0x3f,                    // low 6 bits
        kRandomDataSize    = kRandomizerPosMask + 1,  // 64 entries per row
        kRandomValue       = 16
    };

    // NOTE(review): both tables have 16 rows/entries, presumably one per
    // 4-bit input base code; kRandomValue is presumably a marker meaning
    // "take the value from m_RandomTable" -- confirm in the .cpp file.
    char m_FixedTable[16];
    char m_RandomTable[16][kRandomDataSize];
};


/////////////////////////////////////////////////////////////////////
//
//  Inline methods
//
/////////////////////////////////////////////////////////////////////


/// Return the cached iterator positioned at 'pos'.
/// An already cached iterator is repositioned with SetPos(); when none is
/// cached yet, x_CreateIterator(pos) supplies it.
/// Not MT-safe: the cached iterator is shared by all callers.
inline
CSeqVector_CI& CSeqVector::x_GetIterator(TSeqPos pos) const
{
    if ( CSeqVector_CI* cached = m_Iterator.get() ) {
        cached->SetPos(pos);
        return *cached;
    }
    return *x_CreateIterator(pos);
}


/// Return the mutex (m_IteratorMutex) that callers may lock to serialize
/// the methods documented as not MT-safe (operator[] and IsInGap()).
/// The member is declared mutable, so it can be handed out by non-const
/// reference from this const method.
inline
CSeqVector::TMutex& CSeqVector::GetMutex(void) const
{
    return m_IteratorMutex;
}


/// Residue at 0-based position 'pos'.
/// Moves the shared cached iterator to 'pos' and dereferences it, hence
/// not MT-safe unless GetMutex() is held.
inline
CSeqVector::TResidue CSeqVector::operator[] (TSeqPos pos) const
{
    CSeqVector_CI& cursor = x_GetIterator(pos);
    return *cursor;
}


/// true if the sequence has a gap at 0-based position 'pos'.
/// Moves the shared cached iterator to 'pos' and asks it, hence not
/// MT-safe unless GetMutex() is held.
inline
bool CSeqVector::IsInGap(TSeqPos pos) const
{
    CSeqVector_CI& cursor = x_GetIterator(pos);
    return cursor.IsInGap();
}


/// true if the vector holds no residues (m_Size is zero).
inline
bool CSeqVector::empty(void) const
{
    const bool no_residues = (m_Size == 0);
    return no_residues;
}


/// Number of residues in the vector; simply reports the stored m_Size.
inline
TSeqPos CSeqVector::size(void) const
{
    return m_Size;
}


/// Iterator over this vector positioned at 0.
inline
CSeqVector_CI CSeqVector::begin(void) const
{
    const TSeqPos first_pos = 0;
    return const_iterator(*this, first_pos);
}


/// Iterator over this vector positioned one past the last residue,
/// i.e. at size().
inline
CSeqVector_CI CSeqVector::end(void) const
{
    const TSeqPos past_last = size();
    return const_iterator(*this, past_last);
}


/// Currently selected target coding; simply reports the stored m_Coding.
inline
CSeqVector::TCoding CSeqVector::GetCoding(void) const
{
    return m_Coding;
}

/// Gap symbol for the currently selected coding, obtained from
/// sx_GetGapChar() with the requested case conversion.
inline
CSeqVector::TResidue CSeqVector::GetGapChar(ECaseConversion case_cvt) const
{
    const TCoding current_coding = GetCoding();
    return sx_GetGapChar(current_coding, case_cvt);
}

/// Sequence map held by this vector, obtained by dereferencing m_SeqMap.
/// NOTE(review): behavior when m_SeqMap is unset (e.g. on a
/// default-constructed vector) depends on CConstRef's dereference --
/// confirm before calling on such an object.
inline
const CSeqMap& CSeqVector::GetSeqMap(void) const
{
    return *m_SeqMap;
}

/// Scope associated with this vector.
/// m_Scope is a CHeapScope; the CScope& result comes from its implicit
/// conversion.
/// NOTE(review): behavior when no scope was supplied depends on that
/// conversion -- confirm in CHeapScope.
inline
CScope& CSeqVector::GetScope(void) const
{
    return m_Scope;
}

/// Currently selected strand; simply reports the stored m_Strand.
inline
ENa_strand CSeqVector::GetStrand(void) const
{
    return m_Strand;
}


/// Molecule type of the sequence; simply reports the stored m_Mol.
inline
CSeqVector::TMol CSeqVector::GetSequenceType(void) const
{
    return m_Mol;
}


inline
bool CSeqVector::IsProtein(void) const
{
    return CSeq_inst::IsAa(GetSequenceType());
}


inline
bool CSeqVector::IsNucleotide(void) const
{
    return CSeq_inst::IsNa(GetSequenceType());
}


inline
bool CSeqVector::CanGetRange(const const_iterator& start,
                             const const_iterator& stop) const
{
    return CanGetRange(start.GetPos(), stop.GetPos());
}


inline
void CSeqVector::GetSeqData(const const_iterator& start,
                            const const_iterator& stop,
                            string& buffer) const
{
    GetSeqData(start.GetPos(), stop.GetPos(), buffer);
}


/* @} */


END_SCOPE(objects)
END_NCBI_SCOPE

#endif  // SEQ_VECTOR__HPP
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199
0200
0201
0202
0203
0204
0205
0206
0207
0208
0209
0210
0211
0212
0213
0214
0215
0216
0217
0218
0219
0220
0221
0222
0223
0224
0225
0226
0227
0228
0229
0230
0231
0232
0233
0234
0235
0236
0237
0238
0239
0240
0241
0242
0243
0244
0245
0246
0247
0248
0249
0250
0251
0252
0253
0254
0255
0256
0257
0258
0259
0260
0261
0262
0263
0264
0265
0266
0267
0268
0269
0270
0271
0272
0273
0274
0275
0276
0277
0278
0279
0280
0281
0282
0283
0284
0285
0286
0287
0288
0289
0290
0291
0292
0293
0294
0295
0296
0297
0298
0299
0300
0301
0302
0303
0304
0305
0306
0307
0308
0309
0310
0311
0312
0313
0314
0315
0316
0317
0318
0319
0320
0321
0322
0323
0324
0325
0326
0327
0328
0329
0330
0331
0332
0333
0334
0335
0336
0337
0338
0339
0340
0341
0342
0343
0344
0345
0346
0347
0348
0349
0350
0351
0352
0353
0354
0355
0356
0357
0358
0359
0360
0361
0362
0363
0364
0365
0366
0367
0368
0369
0370
0371
0372
0373
0374
0375
0376
0377
0378
0379
0380
0381
0382
0383
0384
0385
0386