src/objtools/blast/seqdb_writer/writedb_general.hpp

Go to the documentation of this file.
00001 #ifndef OBJTOOLS_WRITERS_WRITEDB__WRITEDB_GENERAL_HPP
00002 #define OBJTOOLS_WRITERS_WRITEDB__WRITEDB_GENERAL_HPP
00003 
00004 /*  $Id: writedb_general.hpp 177192 2009-11-27 02:47:04Z ucko $
00005  * ===========================================================================
00006  *
00007  *                            PUBLIC DOMAIN NOTICE
00008  *               National Center for Biotechnology Information
00009  *
00010  *  This software/database is a "United States Government Work" under the
00011  *  terms of the United States Copyright Act.  It was written as part of
00012  *  the author's official duties as a United States Government employee and
00013  *  thus cannot be copyrighted.  This software/database is freely available
00014  *  to the public for use. The National Library of Medicine and the U.S.
00015  *  Government have not placed any restriction on its use or reproduction.
00016  *
00017  *  Although all reasonable efforts have been taken to ensure the accuracy
00018  *  and reliability of the software and data, the NLM and the U.S.
00019  *  Government do not and cannot warrant the performance or results that
00020  *  may be obtained by using this software or data. The NLM and the U.S.
00021  *  Government disclaim all warranties, express or implied, including
00022  *  warranties of performance, merchantability or fitness for any particular
00023  *  purpose.
00024  *
00025  *  Please cite the author in any work or product based on this material.
00026  *
00027  * ===========================================================================
00028  *
00029  * Author:  Kevin Bealer
00030  *
00031  */
00032 
00033 /// @file writedb_general.hpp
00034 /// Implementation for general purpose utilities for WriteDB.
00035 /// 
00036 /// Defines classes:
00037 ///     CWriteDB_PackedStringsCompare
00038 ///     CWriteDB_PackedBuffer
00039 ///     CWriteDB_PackedStrings
00040 ///     CArrayString
00041 ///     CWriteDB_PackedSemiTree
00042 /// 
00043 /// Implemented for: UNIX, MS-Windows
00044 
00045 #include <objects/seq/seq__.hpp>
00046 
00047 BEGIN_NCBI_SCOPE
00048 
00049 /// Import definitions from the objects namespace.
00050 USING_SCOPE(objects);
00051 
00052 /// Divide by a number, rounding up to a whole integer.
00053 inline int s_DivideRoundUp(int value, int blocksize)
00054 {
00055     return ((value + (blocksize - 1)) / blocksize);
00056 }
00057 
00058 /// Round up to the next multiple of some number.
00059 inline int s_RoundUp(int value, int blocksize)
00060 {
00061     return s_DivideRoundUp(value, blocksize)*blocksize;
00062 }
00063 
00064 /// Comparator for sorting null terminated strings.
00065 class CWriteDB_PackedStringsCompare {
00066 public:
00067 #ifdef NCBI_OS_SOLARIS
00068     /// Constructor.
00069     ///
00070     /// This definition silences a false-alarm warning on solaris.
00071     CWriteDB_PackedStringsCompare()
00072     {
00073     }
00074 #endif // NCBI_OS_SOLARIS
00075     
00076     /// Compare two null terminated strings.
00077     bool operator()(const char * a, const char * b) const
00078     {
00079         return strcmp(a, b) < 0;
00080     }
00081 };
00082 
00083 /// Sortable packed array of null terminated strings.
00084 ///
00085 /// This class holds NUL terminated string data for multiple strings.
00086 /// Compared to a vector<string>, it is more efficient in terms of
00087 /// both time and space, but it omits many general purpose features
00088 /// such as iteration, deletion and modification of stored data.
00089 template<int ALLOCSIZE>
00090 class CWriteDB_PackedBuffer : public CObject {
00091 public:
00092     /// Constructor
00093     CWriteDB_PackedBuffer()
00094     {
00095         m_Zero[0] = char(0);
00096     }
00097     
00098     /// Destructor
00099     ~CWriteDB_PackedBuffer()
00100     {
00101         Clear();
00102     }
00103     
00104     /// Insert a new string plus a NUL terminator.
00105     ///
00106     /// This inserts a string and returns the location of that string
00107     /// in the packed buffer.  The returned address will not change
00108     /// over the lifetime of this object.
00109     ///
00110     /// @param data The string data.
00111     /// @param length The length of the string data.
00112     /// @return A pointer to the packed string.
00113     const char * Insert(const char * data, int length)
00114     {
00115         if (m_Packed.empty()) {
00116             x_AddBlock();
00117         }
00118         
00119         string * back = m_Packed.back();
00120         
00121         if ((back->size() + length + 1) > back->capacity()) {
00122             x_AddBlock();
00123             back = m_Packed.back();
00124         }
00125         
00126         const char * rv = back->data() + back->size();
00127         back->append(data, length);
00128         back->append(& m_Zero[0], 1);
00129         
00130         return rv;
00131     }
00132     
00133     /// Free all held memory.
00134     void Clear()
00135     {
00136         vector<string*> p2;
00137         m_Packed.swap(p2);
00138         
00139         NON_CONST_ITERATE(vector<string*>, iter, p2) {
00140             string * pstr = *iter;
00141             delete pstr;
00142             *iter = NULL;
00143         }
00144     }
00145     
00146 private:
00147     /// This object type.
00148     typedef CWriteDB_PackedBuffer<ALLOCSIZE> TThis;
00149     
00150     /// Prevent copy constructor.
00151     CWriteDB_PackedBuffer(const TThis & other);
00152     
00153     /// Prevent copy operator.
00154     TThis & operator=(const TThis & other);
00155     
00156     /// Add an block of ALLOCSIZE bytes to the sorted string vector.
00157     void x_AddBlock()
00158     {
00159         string * p = 0;
00160         
00161         try {
00162             p = new string;
00163             int blksz = ALLOCSIZE;
00164             p->reserve(blksz);
00165         }
00166         catch(...) {
00167             delete p;
00168             throw;
00169         }
00170         
00171         m_Packed.push_back(p);
00172     }
00173     
00174     /// Array of pointers to the start of the strings stored here.
00175     vector<string*> m_Packed;
00176     
00177     /// Array containing NUL byte for convenience.
00178     char m_Zero[1];
00179 };
00180 
00181 /// Sortable packed array of strings, optimized for space.
00182 ///
00183 /// This stores a sortable packed collection of strings.  Features
00184 /// here are minimal and targeted for the ISAM code, which follows a
00185 /// simple (input, sort, output) data processing pattern.
00186 
00187 template<int ALLOCSIZE>
00188 class CWriteDB_PackedStrings : public CObject {
00189 public:
00190     /// Constructor
00191     CWriteDB_PackedStrings(CWriteDB_PackedBuffer<ALLOCSIZE> & buffer)
00192         : m_Buffer(buffer)
00193     {
00194     }
00195     
00196     /// Destructor
00197     ~CWriteDB_PackedStrings()
00198     {
00199     }
00200     
00201     /// Insert string data - must be null terminated.
00202     void Insert(const char * x, int length)
00203     {
00204         m_KeyLoc.push_back(m_Buffer.Insert(x, length));
00205     }
00206     
00207     /// Sort the keyloc array (in place).
00208     void Sort()
00209     {
00210         CWriteDB_PackedStringsCompare cmp;
00211         std::sort(m_KeyLoc.begin(), m_KeyLoc.end(), cmp);
00212     }
00213     
00214     /// Sort the keyloc array (in place).
00215     int Size()
00216     {
00217         return m_KeyLoc.size();
00218     }
00219     
00220     /// Get the list of null terminated strings.
00221     const vector<const char *> & GetList()
00222     {
00223         return m_KeyLoc;
00224     }
00225     
00226 private:
00227     /// This object type.
00228     typedef CWriteDB_PackedStrings<ALLOCSIZE> TThis;
00229     
00230     /// Prevent copy constructor.
00231     CWriteDB_PackedStrings(const TThis & other);
00232     
00233     /// Prevent assignment.
00234     TThis & operator=(const TThis & other);
00235     
00236     /// Reference to shared packed-buffer object.
00237     CWriteDB_PackedBuffer<ALLOCSIZE> & m_Buffer;
00238     
00239     /// Pointers to the beginning of the strings stored here.
00240     vector<const char *> m_KeyLoc;
00241 };
00242 
00243 /// Fixed-buffer string type.
00244 ///
00245 /// This type stores a string in a fixed buffer.  The string length
00246 /// may be up to the specific size (the string is not NUL terminated
00247 /// in the internal array).  Compared to std::string, this type is
00248 /// compact, more cache-aware, and does not use dynamic allocation.
00249 /// It cannot handle embedded NULs or strings larger than STR_SIZE.
00250 
00251 template<int STR_SIZE>
00252 class CArrayString {
00253 public:
00254     /// Constructor.
00255     CArrayString()
00256     {
00257         memset(m_Data, 0, STR_SIZE);
00258     }
00259     
00260     /// Copy constructor.
00261     CArrayString(const CArrayString<STR_SIZE> & other)
00262     {
00263         memcpy(m_Data, other.m_Data, STR_SIZE);
00264     }
00265     
00266     /// Construct from a string in a memory location.
00267     CArrayString(const char * x, int L)
00268     {
00269         _ASSERT(L <= STR_SIZE);
00270         memcpy(m_Data, x, L);
00271         if (L < STR_SIZE) {
00272             m_Data[L] = 0;
00273         }
00274     }
00275     
00276     /// Compare two strings (lexicographically).
00277     /// @param other Another string against which to compare.
00278     /// @return -1, 0, or 1 if string is less, equal, or greater than other.
00279     int Cmp(const CArrayString<STR_SIZE> & other) const
00280     {
00281         for(int i = 0; i<STR_SIZE; i++) {
00282             char ch1 = m_Data[i], ch2 = other.m_Data[i];
00283             
00284             if (ch1 < ch2)
00285                 return -1;
00286             
00287             if (ch1 > ch2)
00288                 return 1;
00289             
00290             if ((! ch1) && (! ch2))
00291                 break;
00292         }
00293         return 0;
00294     }
00295     
00296     /// Return true if this string is less than 'other'.
00297     /// @param other Another string against which to compare.
00298     /// @return True if this string is less than other.
00299     bool operator <(const CArrayString<STR_SIZE> & other) const
00300     {
00301         return Cmp(other) < 0;
00302     }
00303     
00304     /// Return true if this string is less than 'other'.
00305     /// @param other Another string against which to compare.
00306     /// @return True if this string is equal to other.
00307     bool operator==(const CArrayString<STR_SIZE> & other) const
00308     {
00309         return Cmp(other) == 0;
00310     }
00311     
00312     /// Assign this string from another.
00313     /// @param other Another string to assign from.
00314     CArrayString<STR_SIZE>& operator=(const CArrayString<STR_SIZE> & other)
00315     {
00316         memcpy(m_Data, other.m_Data, STR_SIZE);
00317         return *this;
00318     }
00319     
00320     /// Get a pointer to the start of this string's data.
00321     /// @return A pointer to the start of this string's data.
00322     const char * Data() const
00323     {
00324         return & m_Data[0];
00325     }
00326     
00327     /// Get this string's length.
00328     /// @return This string's length.
00329     int Size() const
00330     {
00331         for(int i = 0; i<STR_SIZE; i++) {
00332             if (! m_Data[i])
00333                 break;
00334         }
00335         return STR_SIZE;
00336     }
00337     
00338 private:
00339     /// Data for this string, NUL terminated iff less than STR_SIZE bytes.
00340     char m_Data[STR_SIZE];
00341 };
00342 
00343 
00344 /// Packed string data container with sorting and iteration.
00345 /// 
00346 /// This class efficiently stores a packed array of string data.
00347 /// Strings can be added, sorted, and iterated over.  The actual data
00348 /// is stored in packed buffers, but the beginning of each string is
00349 /// removed and used as a first level index to a packed list of the
00350 /// strings (minus this first part).  This reduces the total string
00351 /// storage required, because each string is shorter, and makes the
00352 /// final sorting easier (because each subset of the string table is
00353 /// partially sorted and comparisons are done with shorter strings).
00354 class CWriteDB_PackedSemiTree {
00355 public:
00356     // The capacity() of a string is not necessarily what was provided
00357     // to std::string::reserve(int).  Instead, reserve can provide
00358     // extra space to shift the capacity to something that matches the
00359     // underlying new or malloc implementation.  The value 65000 here
00360     // is designed to get approximately 64K.  Since C++ string buffers
00361     // are normally allocated with metadata prefixes, asking for 2^16
00362     // bytes is unlikely to result in clean 64k block allocations.
00363     
00364     enum {
00365         /// This is the number of bytes for the first level index.
00366         PREFIX = 6,
00367         
00368         /// This is the amount of string capacity to request.
00369         BLOCK = 65000
00370     };
00371     
00372     /// A packed list of buffers.
00373     typedef CWriteDB_PackedStrings<BLOCK> TPacked;
00374     
00375     /// A map from the string prefixes to a packed buffer.
00376     typedef map< CArrayString<PREFIX>, CRef<TPacked> > TPackedMap;
00377     
00378     /// Constructor.
00379     CWriteDB_PackedSemiTree()
00380         : m_Size(0)
00381     {
00382     }
00383     
00384     /// Destructor.
00385     ~CWriteDB_PackedSemiTree()
00386     {
00387         Clear();
00388     }
00389     
00390     /// Insert string data into the container.
00391     void Insert(const char * x, int L);
00392     
00393     /// Sort all contained data.
00394     void Sort();
00395     
00396     /// Return the number of contained entries.
00397     int Size() const
00398     {
00399         return m_Size;
00400     }
00401     
00402     /// Class providing iteration over string data.
00403     ///
00404     /// This class assumes all PackedStrings objects are non-empty.
00405     class Iterator {
00406     public:
00407         /// Create an iterator.
00408         ///
00409         /// The specified TPacked iterator will normally point to the
00410         /// first TPacked object (in which case iteration will start
00411         /// there) or to pile::end(), in which case this is the end.
00412         ///
00413         /// @param The total string collection.
00414         /// @param i The PackedString object to start with.
00415         Iterator(TPackedMap & pile, TPackedMap::iterator i)
00416             : m_Packed (pile),
00417               m_Pos1 (i),
00418               m_Pos2 (0)
00419         {
00420         }
00421         
00422         /// Move forward one item (prefix).
00423         Iterator & operator++()
00424         {
00425             if (m_Pos1 != m_Packed.end()) {
00426                 m_Pos2 ++;
00427                 
00428                 if (m_Pos2 >= m_Pos1->second->Size()) {
00429                     m_Pos1 ++;
00430                     m_Pos2 = 0;
00431                 }
00432             }
00433             return *this;
00434         }
00435         
00436         /// Compare two iterators for equality.
00437         /// @param Iterator to compare this iterator to.
00438         /// @return true If the iterators are equal.
00439         bool operator ==(Iterator & other)
00440         {
00441             return (m_Pos1 == other.m_Pos1 &&
00442                     m_Pos2 == other.m_Pos2);
00443         }
00444         
00445         /// Compare two iterators for inequality.
00446         /// @param Iterator to compare this iterator to.
00447         /// @return true If the iterators are unequal.
00448         bool operator !=(Iterator & other)
00449         {
00450             return ! ((*this) == other);
00451         }
00452         
00453         /// Get the string pointed to by this iterator.
00454         ///
00455         /// The C++ tradition of using operator*() for this task is
00456         /// not followed here, because this string needs to be pieced
00457         /// together, and that would require a string allocation for
00458         /// each returned item.  Instead, I allow the user to pass in
00459         /// a string, allowing the calling code to use a single string
00460         /// for all allocations.
00461         ///
00462         /// @param data The returned string.
00463         void Get(string & data)
00464         {
00465             _ASSERT(m_Pos1 != m_Packed.end());
00466             
00467             data.resize(0);
00468             data.append(m_Pos1->first.Data(), m_Pos1->first.Size());
00469             data.append(m_Pos1->second->GetList()[m_Pos2]);
00470         }
00471         
00472     private:
00473         /// The CWriteDB_PackedString container.
00474         TPackedMap & m_Packed;
00475         
00476         /// The iterator for the current TPacked object.
00477         TPackedMap::iterator m_Pos1;
00478         
00479         /// An integer to iterate within the TPacked object.
00480         int m_Pos2;
00481     };
00482     
00483     /// Get an iterator to the beginning of this collection.
00484     Iterator Begin()
00485     {
00486         return Iterator(m_Packed, m_Packed.begin());
00487     }
00488     
00489     /// Get an iterator to the end of this collection.
00490     Iterator End()
00491     {
00492         return Iterator(m_Packed, m_Packed.end());
00493     }
00494     
00495     /// Clear all objects from this container.
00496     void Clear();
00497     
00498 private:
00499     /// Number of elements stored in this container.
00500     int m_Size;
00501     
00502     /// Map of string prefixes to packed string lists.
00503     TPackedMap m_Packed;
00504     
00505     /// Shared list of packed string buffers.
00506     CWriteDB_PackedBuffer<BLOCK> m_Buffer;
00507 };
00508 
00509 /// Compute length of sequence from raw packing.
00510 /// @param protein Specify true for protein formats, false for nucleotide.
00511 /// @param seq Sequence data (in na2 format for nucletide).
00512 int WriteDB_FindSequenceLength(bool protein, const string & seq);
00513 
00514 END_NCBI_SCOPE
00515 
00516 
00517 #endif // OBJTOOLS_WRITERS_WRITEDB__WRITEDB_GENERAL_HPP
00518 
00519 
00520 
00521 

Generated on Sun Dec 6 22:41:04 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Mon Dec 07 16:21:11 2009 by modify_doxy.py rev. 173732