00001 #ifndef OBJTOOLS_WRITERS_WRITEDB__WRITEDB_GENERAL_HPP 00002 #define OBJTOOLS_WRITERS_WRITEDB__WRITEDB_GENERAL_HPP 00003 00004 /* $Id: writedb_general.hpp 177192 2009-11-27 02:47:04Z ucko $ 00005 * =========================================================================== 00006 * 00007 * PUBLIC DOMAIN NOTICE 00008 * National Center for Biotechnology Information 00009 * 00010 * This software/database is a "United States Government Work" under the 00011 * terms of the United States Copyright Act. It was written as part of 00012 * the author's official duties as a United States Government employee and 00013 * thus cannot be copyrighted. This software/database is freely available 00014 * to the public for use. The National Library of Medicine and the U.S. 00015 * Government have not placed any restriction on its use or reproduction. 00016 * 00017 * Although all reasonable efforts have been taken to ensure the accuracy 00018 * and reliability of the software and data, the NLM and the U.S. 00019 * Government do not and cannot warrant the performance or results that 00020 * may be obtained by using this software or data. The NLM and the U.S. 00021 * Government disclaim all warranties, express or implied, including 00022 * warranties of performance, merchantability or fitness for any particular 00023 * purpose. 00024 * 00025 * Please cite the author in any work or product based on this material. 00026 * 00027 * =========================================================================== 00028 * 00029 * Author: Kevin Bealer 00030 * 00031 */ 00032 00033 /// @file writedb_general.hpp 00034 /// Implementation for general purpose utilities for WriteDB. 00035 /// 00036 /// Defines classes: 00037 /// CWriteDB_PackedStringsCompare 00038 /// CWriteDB_PackedBuffer 00039 /// CWriteDB_PackedStrings 00040 /// CArrayString 00041 /// CWriteDB_PackedSemiTree 00042 /// 00043 /// Implemented for: UNIX, MS-Windows 00044 00045 #include <objects/seq/seq__.hpp> 00046 00047 BEGIN_NCBI_SCOPE 00048 00049 /// Import definitions from the objects namespace. 00050 USING_SCOPE(objects); 00051 00052 /// Divide by a number, rounding up to a whole integer. 00053 inline int s_DivideRoundUp(int value, int blocksize) 00054 { 00055 return ((value + (blocksize - 1)) / blocksize); 00056 } 00057 00058 /// Round up to the next multiple of some number. 00059 inline int s_RoundUp(int value, int blocksize) 00060 { 00061 return s_DivideRoundUp(value, blocksize)*blocksize; 00062 } 00063 00064 /// Comparator for sorting null terminated strings. 00065 class CWriteDB_PackedStringsCompare { 00066 public: 00067 #ifdef NCBI_OS_SOLARIS 00068 /// Constructor. 00069 /// 00070 /// This definition silences a false-alarm warning on solaris. 00071 CWriteDB_PackedStringsCompare() 00072 { 00073 } 00074 #endif // NCBI_OS_SOLARIS 00075 00076 /// Compare two null terminated strings. 00077 bool operator()(const char * a, const char * b) const 00078 { 00079 return strcmp(a, b) < 0; 00080 } 00081 }; 00082 00083 /// Sortable packed array of null terminated strings. 00084 /// 00085 /// This class holds NUL terminated string data for multiple strings. 00086 /// Compared to a vector<string>, it is more efficient in terms of 00087 /// both time and space, but it omits many general purpose features 00088 /// such as iteration, deletion and modification of stored data. 00089 template<int ALLOCSIZE> 00090 class CWriteDB_PackedBuffer : public CObject { 00091 public: 00092 /// Constructor 00093 CWriteDB_PackedBuffer() 00094 { 00095 m_Zero[0] = char(0); 00096 } 00097 00098 /// Destructor 00099 ~CWriteDB_PackedBuffer() 00100 { 00101 Clear(); 00102 } 00103 00104 /// Insert a new string plus a NUL terminator. 00105 /// 00106 /// This inserts a string and returns the location of that string 00107 /// in the packed buffer. The returned address will not change 00108 /// over the lifetime of this object. 00109 /// 00110 /// @param data The string data. 00111 /// @param length The length of the string data. 00112 /// @return A pointer to the packed string. 00113 const char * Insert(const char * data, int length) 00114 { 00115 if (m_Packed.empty()) { 00116 x_AddBlock(); 00117 } 00118 00119 string * back = m_Packed.back(); 00120 00121 if ((back->size() + length + 1) > back->capacity()) { 00122 x_AddBlock(); 00123 back = m_Packed.back(); 00124 } 00125 00126 const char * rv = back->data() + back->size(); 00127 back->append(data, length); 00128 back->append(& m_Zero[0], 1); 00129 00130 return rv; 00131 } 00132 00133 /// Free all held memory. 00134 void Clear() 00135 { 00136 vector<string*> p2; 00137 m_Packed.swap(p2); 00138 00139 NON_CONST_ITERATE(vector<string*>, iter, p2) { 00140 string * pstr = *iter; 00141 delete pstr; 00142 *iter = NULL; 00143 } 00144 } 00145 00146 private: 00147 /// This object type. 00148 typedef CWriteDB_PackedBuffer<ALLOCSIZE> TThis; 00149 00150 /// Prevent copy constructor. 00151 CWriteDB_PackedBuffer(const TThis & other); 00152 00153 /// Prevent copy operator. 00154 TThis & operator=(const TThis & other); 00155 00156 /// Add an block of ALLOCSIZE bytes to the sorted string vector. 00157 void x_AddBlock() 00158 { 00159 string * p = 0; 00160 00161 try { 00162 p = new string; 00163 int blksz = ALLOCSIZE; 00164 p->reserve(blksz); 00165 } 00166 catch(...) { 00167 delete p; 00168 throw; 00169 } 00170 00171 m_Packed.push_back(p); 00172 } 00173 00174 /// Array of pointers to the start of the strings stored here. 00175 vector<string*> m_Packed; 00176 00177 /// Array containing NUL byte for convenience. 00178 char m_Zero[1]; 00179 }; 00180 00181 /// Sortable packed array of strings, optimized for space. 00182 /// 00183 /// This stores a sortable packed collection of strings. Features 00184 /// here are minimal and targeted for the ISAM code, which follows a 00185 /// simple (input, sort, output) data processing pattern. 00186 00187 template<int ALLOCSIZE> 00188 class CWriteDB_PackedStrings : public CObject { 00189 public: 00190 /// Constructor 00191 CWriteDB_PackedStrings(CWriteDB_PackedBuffer<ALLOCSIZE> & buffer) 00192 : m_Buffer(buffer) 00193 { 00194 } 00195 00196 /// Destructor 00197 ~CWriteDB_PackedStrings() 00198 { 00199 } 00200 00201 /// Insert string data - must be null terminated. 00202 void Insert(const char * x, int length) 00203 { 00204 m_KeyLoc.push_back(m_Buffer.Insert(x, length)); 00205 } 00206 00207 /// Sort the keyloc array (in place). 00208 void Sort() 00209 { 00210 CWriteDB_PackedStringsCompare cmp; 00211 std::sort(m_KeyLoc.begin(), m_KeyLoc.end(), cmp); 00212 } 00213 00214 /// Sort the keyloc array (in place). 00215 int Size() 00216 { 00217 return m_KeyLoc.size(); 00218 } 00219 00220 /// Get the list of null terminated strings. 00221 const vector<const char *> & GetList() 00222 { 00223 return m_KeyLoc; 00224 } 00225 00226 private: 00227 /// This object type. 00228 typedef CWriteDB_PackedStrings<ALLOCSIZE> TThis; 00229 00230 /// Prevent copy constructor. 00231 CWriteDB_PackedStrings(const TThis & other); 00232 00233 /// Prevent assignment. 00234 TThis & operator=(const TThis & other); 00235 00236 /// Reference to shared packed-buffer object. 00237 CWriteDB_PackedBuffer<ALLOCSIZE> & m_Buffer; 00238 00239 /// Pointers to the beginning of the strings stored here. 00240 vector<const char *> m_KeyLoc; 00241 }; 00242 00243 /// Fixed-buffer string type. 00244 /// 00245 /// This type stores a string in a fixed buffer. The string length 00246 /// may be up to the specific size (the string is not NUL terminated 00247 /// in the internal array). Compared to std::string, this type is 00248 /// compact, more cache-aware, and does not use dynamic allocation. 00249 /// It cannot handle embedded NULs or strings larger than STR_SIZE. 00250 00251 template<int STR_SIZE> 00252 class CArrayString { 00253 public: 00254 /// Constructor. 00255 CArrayString() 00256 { 00257 memset(m_Data, 0, STR_SIZE); 00258 } 00259 00260 /// Copy constructor. 00261 CArrayString(const CArrayString<STR_SIZE> & other) 00262 { 00263 memcpy(m_Data, other.m_Data, STR_SIZE); 00264 } 00265 00266 /// Construct from a string in a memory location. 00267 CArrayString(const char * x, int L) 00268 { 00269 _ASSERT(L <= STR_SIZE); 00270 memcpy(m_Data, x, L); 00271 if (L < STR_SIZE) { 00272 m_Data[L] = 0; 00273 } 00274 } 00275 00276 /// Compare two strings (lexicographically). 00277 /// @param other Another string against which to compare. 00278 /// @return -1, 0, or 1 if string is less, equal, or greater than other. 00279 int Cmp(const CArrayString<STR_SIZE> & other) const 00280 { 00281 for(int i = 0; i<STR_SIZE; i++) { 00282 char ch1 = m_Data[i], ch2 = other.m_Data[i]; 00283 00284 if (ch1 < ch2) 00285 return -1; 00286 00287 if (ch1 > ch2) 00288 return 1; 00289 00290 if ((! ch1) && (! ch2)) 00291 break; 00292 } 00293 return 0; 00294 } 00295 00296 /// Return true if this string is less than 'other'. 00297 /// @param other Another string against which to compare. 00298 /// @return True if this string is less than other. 00299 bool operator <(const CArrayString<STR_SIZE> & other) const 00300 { 00301 return Cmp(other) < 0; 00302 } 00303 00304 /// Return true if this string is less than 'other'. 00305 /// @param other Another string against which to compare. 00306 /// @return True if this string is equal to other. 00307 bool operator==(const CArrayString<STR_SIZE> & other) const 00308 { 00309 return Cmp(other) == 0; 00310 } 00311 00312 /// Assign this string from another. 00313 /// @param other Another string to assign from. 00314 CArrayString<STR_SIZE>& operator=(const CArrayString<STR_SIZE> & other) 00315 { 00316 memcpy(m_Data, other.m_Data, STR_SIZE); 00317 return *this; 00318 } 00319 00320 /// Get a pointer to the start of this string's data. 00321 /// @return A pointer to the start of this string's data. 00322 const char * Data() const 00323 { 00324 return & m_Data[0]; 00325 } 00326 00327 /// Get this string's length. 00328 /// @return This string's length. 00329 int Size() const 00330 { 00331 for(int i = 0; i<STR_SIZE; i++) { 00332 if (! m_Data[i]) 00333 break; 00334 } 00335 return STR_SIZE; 00336 } 00337 00338 private: 00339 /// Data for this string, NUL terminated iff less than STR_SIZE bytes. 00340 char m_Data[STR_SIZE]; 00341 }; 00342 00343 00344 /// Packed string data container with sorting and iteration. 00345 /// 00346 /// This class efficiently stores a packed array of string data. 00347 /// Strings can be added, sorted, and iterated over. The actual data 00348 /// is stored in packed buffers, but the beginning of each string is 00349 /// removed and used as a first level index to a packed list of the 00350 /// strings (minus this first part). This reduces the total string 00351 /// storage required, because each string is shorter, and makes the 00352 /// final sorting easier (because each subset of the string table is 00353 /// partially sorted and comparisons are done with shorter strings). 00354 class CWriteDB_PackedSemiTree { 00355 public: 00356 // The capacity() of a string is not necessarily what was provided 00357 // to std::string::reserve(int). Instead, reserve can provide 00358 // extra space to shift the capacity to something that matches the 00359 // underlying new or malloc implementation. The value 65000 here 00360 // is designed to get approximately 64K. Since C++ string buffers 00361 // are normally allocated with metadata prefixes, asking for 2^16 00362 // bytes is unlikely to result in clean 64k block allocations. 00363 00364 enum { 00365 /// This is the number of bytes for the first level index. 00366 PREFIX = 6, 00367 00368 /// This is the amount of string capacity to request. 00369 BLOCK = 65000 00370 }; 00371 00372 /// A packed list of buffers. 00373 typedef CWriteDB_PackedStrings<BLOCK> TPacked; 00374 00375 /// A map from the string prefixes to a packed buffer. 00376 typedef map< CArrayString<PREFIX>, CRef<TPacked> > TPackedMap; 00377 00378 /// Constructor. 00379 CWriteDB_PackedSemiTree() 00380 : m_Size(0) 00381 { 00382 } 00383 00384 /// Destructor. 00385 ~CWriteDB_PackedSemiTree() 00386 { 00387 Clear(); 00388 } 00389 00390 /// Insert string data into the container. 00391 void Insert(const char * x, int L); 00392 00393 /// Sort all contained data. 00394 void Sort(); 00395 00396 /// Return the number of contained entries. 00397 int Size() const 00398 { 00399 return m_Size; 00400 } 00401 00402 /// Class providing iteration over string data. 00403 /// 00404 /// This class assumes all PackedStrings objects are non-empty. 00405 class Iterator { 00406 public: 00407 /// Create an iterator. 00408 /// 00409 /// The specified TPacked iterator will normally point to the 00410 /// first TPacked object (in which case iteration will start 00411 /// there) or to pile::end(), in which case this is the end. 00412 /// 00413 /// @param The total string collection. 00414 /// @param i The PackedString object to start with. 00415 Iterator(TPackedMap & pile, TPackedMap::iterator i) 00416 : m_Packed (pile), 00417 m_Pos1 (i), 00418 m_Pos2 (0) 00419 { 00420 } 00421 00422 /// Move forward one item (prefix). 00423 Iterator & operator++() 00424 { 00425 if (m_Pos1 != m_Packed.end()) { 00426 m_Pos2 ++; 00427 00428 if (m_Pos2 >= m_Pos1->second->Size()) { 00429 m_Pos1 ++; 00430 m_Pos2 = 0; 00431 } 00432 } 00433 return *this; 00434 } 00435 00436 /// Compare two iterators for equality. 00437 /// @param Iterator to compare this iterator to. 00438 /// @return true If the iterators are equal. 00439 bool operator ==(Iterator & other) 00440 { 00441 return (m_Pos1 == other.m_Pos1 && 00442 m_Pos2 == other.m_Pos2); 00443 } 00444 00445 /// Compare two iterators for inequality. 00446 /// @param Iterator to compare this iterator to. 00447 /// @return true If the iterators are unequal. 00448 bool operator !=(Iterator & other) 00449 { 00450 return ! ((*this) == other); 00451 } 00452 00453 /// Get the string pointed to by this iterator. 00454 /// 00455 /// The C++ tradition of using operator*() for this task is 00456 /// not followed here, because this string needs to be pieced 00457 /// together, and that would require a string allocation for 00458 /// each returned item. Instead, I allow the user to pass in 00459 /// a string, allowing the calling code to use a single string 00460 /// for all allocations. 00461 /// 00462 /// @param data The returned string. 00463 void Get(string & data) 00464 { 00465 _ASSERT(m_Pos1 != m_Packed.end()); 00466 00467 data.resize(0); 00468 data.append(m_Pos1->first.Data(), m_Pos1->first.Size()); 00469 data.append(m_Pos1->second->GetList()[m_Pos2]); 00470 } 00471 00472 private: 00473 /// The CWriteDB_PackedString container. 00474 TPackedMap & m_Packed; 00475 00476 /// The iterator for the current TPacked object. 00477 TPackedMap::iterator m_Pos1; 00478 00479 /// An integer to iterate within the TPacked object. 00480 int m_Pos2; 00481 }; 00482 00483 /// Get an iterator to the beginning of this collection. 00484 Iterator Begin() 00485 { 00486 return Iterator(m_Packed, m_Packed.begin()); 00487 } 00488 00489 /// Get an iterator to the end of this collection. 00490 Iterator End() 00491 { 00492 return Iterator(m_Packed, m_Packed.end()); 00493 } 00494 00495 /// Clear all objects from this container. 00496 void Clear(); 00497 00498 private: 00499 /// Number of elements stored in this container. 00500 int m_Size; 00501 00502 /// Map of string prefixes to packed string lists. 00503 TPackedMap m_Packed; 00504 00505 /// Shared list of packed string buffers. 00506 CWriteDB_PackedBuffer<BLOCK> m_Buffer; 00507 }; 00508 00509 /// Compute length of sequence from raw packing. 00510 /// @param protein Specify true for protein formats, false for nucleotide. 00511 /// @param seq Sequence data (in na2 format for nucletide). 00512 int WriteDB_FindSequenceLength(bool protein, const string & seq); 00513 00514 END_NCBI_SCOPE 00515 00516 00517 #endif // OBJTOOLS_WRITERS_WRITEDB__WRITEDB_GENERAL_HPP 00518 00519 00520 00521
1.4.6
Modified on Mon Dec 07 16:21:11 2009 by modify_doxy.py rev. 173732