NCBI C++ ToolKit
writedb_impl.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: writedb_impl.cpp 77844 2017-05-10 13:38:07Z rackerst $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file writedb_impl.cpp
31 /// Implementation for the CWriteDB_Impl class.
32 /// class for WriteDB.
33 #include <ncbi_pch.hpp>
40 #include <objects/blastdb/defline_extra.hpp> // for kAsnDeflineObjLabel
41 #include <serial/typeinfo.hpp>
42 #include <corelib/ncbi_bswap.hpp>
43 
44 #include "writedb_impl.hpp"
46 
47 #include <iostream>
48 #include <sstream>
49 
51 
52 /// Import C++ std namespace.
54 
56  bool protein,
57  const string & title,
58  EIndexType indices,
59  bool parse_ids,
60  bool long_ids,
61  bool use_gi_mask)
62  : m_Dbname (dbname),
63  m_Protein (protein),
64  m_Title (title),
65  m_MaxFileSize (0),
66  m_MaxVolumeLetters (0),
67  m_Indices (indices),
68  m_Closed (false),
69  m_MaskDataColumn (-1),
70  m_ParseIDs (parse_ids),
71  m_UseGiMask (use_gi_mask),
72  m_Pig (0),
73  m_Hash (0),
74  m_SeqLength (0),
75  m_HaveSequence (false),
76  m_LongSeqId (long_ids)
77 {
79 
80  m_Date = now.AsString("b d, Y ");
81  string t = now.AsString("H:m P");
82 
83  if (t[0] == '0') {
84  t.assign(t, 1, t.size() - 1);
85  }
86 
87  m_Date += t;
88 }
89 
91 {
92  try {
93  Close();
94  } catch (const CWriteDBException& e) {
95  LOG_POST(Error << "BLAST Database creation error: " << e.GetMsg());
96  }
97 
98 }
99 
101 {
102  m_Bioseq.Reset();
104  m_Deflines.Reset();
105  m_Ids.clear();
106  m_Linkouts.clear();
107  m_Memberships.clear();
108  m_Pig = 0;
109  m_Hash = 0;
110  m_SeqLength = 0;
111 
112  m_Sequence.erase();
113  m_Ambig.erase();
114  m_BinHdr.erase();
115 
116  NON_CONST_ITERATE(vector<int>, iter, m_HaveBlob) {
117  *iter = 0;
118  }
119 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
120  (!defined(NCBI_COMPILER_MIPSPRO)) )
121  NON_CONST_ITERATE(vector< CRef<CBlastDbBlob> >, iter, m_Blobs) {
122  (**iter).Clear();
123  }
124 #endif
125 }
126 
128  const CTempString & ambig)
129 {
130  // Publish previous sequence (if any)
131  x_Publish();
132 
133  // Blank slate for new sequence.
135 
136  m_Sequence.assign(seq.data(), seq.length());
137  m_Ambig.assign(ambig.data(), ambig.length());
138 
140  x_ComputeHash(seq, ambig);
141  }
142 
144 }
145 
147 {
148  // Publish previous sequence
149  x_Publish();
150 
151  // Blank slate for new sequence.
153 
154  m_Bioseq.Reset(& bs);
155  if (m_Bioseq->GetInst().CanGetMol() && (m_Bioseq->IsAa() != m_Protein)) {
157  msg << "Invalid molecule type of sequence added ("
158  << (m_Bioseq->IsAa() ? "protein" : "nucleotide")
159  << "); expected " << (m_Protein ? "protein" : "nucleotide");
161  }
162 
164  x_ComputeHash(bs);
165  }
166 
168 }
169 
171 {
172  AddSequence(bs);
173  m_SeqVector = sv;
174 }
175 
177 {
178  CSeqVector sv(bsh);
179  AddSequence(*bsh.GetCompleteBioseq(), sv);
180 }
181 
182 
183 /// class to support searching for duplicate isam keys
184 template <class T>
186 
187  public:
188  // data member
191 
192  // constructor
193  CWriteDB_IsamKey(const string &fn) {
194  source = new CNcbiIfstream(fn.c_str(),
195  IOS_BASE::in | IOS_BASE::binary);
196  key = x_GetNextKey();
197  };
198 
200  delete source;
201  };
202 
203  // advance key to catch up other
204  bool AdvanceKey(const CWriteDB_IsamKey & other) {
205  while (!source->eof()) {
206  T next_key = x_GetNextKey();
207  if (next_key >= other.key) {
208  key = next_key;
209  return true;
210  }
211  }
212  return false;
213  };
214 
215  // less_than, used for sorting
216  bool operator <(const CWriteDB_IsamKey &other) const {
217  return (key < other.key);
218  };
219 
220  private:
221  // read in the next key, for numeric id
223 #define INT4_SIZE 4
224  char s[INT4_SIZE] = { '\0' };
225  source->read(s, INT4_SIZE);
226  if ((source->gcount() != INT4_SIZE) || source->eof()) {
227  return T();
228  }
229  source->seekg(INT4_SIZE, ios_base::cur);
230 #ifdef WORDS_BIGENDIAN
231  Int4 next_key = (Int4) *((Int4 *) s);
232 #else
233  Int4 next_key = CByteSwap::GetInt4((const unsigned char *)s);
234 #endif
235  return next_key;
236  };
237 };
238 
239 // customized string file reading
240 template <> inline string
242 #define CHAR_BUFFER_SIZE 256
243  char s[CHAR_BUFFER_SIZE] = { '\0' };
244  source->getline(s, CHAR_BUFFER_SIZE);
245  if ((source->gcount() == 0) || source->eof()) {
246  return kEmptyStr;
247  }
248  char * p = s;
249  while (*p != 0x02) ++p;
250  string in(s, p);
251 
252  // check if the current key is PDB-like,
253  // if so, advance for the next
254  // PDB key must be [0-9]...
255  if ( (in.size() == 4)
256  && ((in[0] - '0') * (in[0] - '9') <= 0) ) {
257 
258  // probing the next key to make sure this is pdb id
259  char next_token[4];
260  source->read(next_token, 4);
261  source->seekg(-4, ios_base::cur);
262  string next_key(next_token, 4);
263 
264  if (next_key == in) {
265  // automatically advance to next key
266  return x_GetNextKey();
267  }
268  }
269  return in;
270 };
271 
272 /// Comparison function for set<CWriteDB_IsamKey<T> *>
273 template <class T>
276  const CWriteDB_IsamKey<T> * rhs) const {
277  return (*lhs < *rhs);
278  }
279 };
280 
281 /// Check for duplicate ids across volumes
282 template <class T>
284  CWriteDB_IsamKey_Compare<T> > & keys) {
285  while (!keys.empty()) {
286  // pick the smallest key
287  CWriteDB_IsamKey<T> * key = *(keys.begin());
288 
289  keys.erase(key);
290 
291  if (keys.empty()) {
292  delete key;
293  return;
294  }
295 
296  const CWriteDB_IsamKey<T> * next = *(keys.begin());
297  if (key->AdvanceKey(*next)) {
298  if (keys.find(key) != keys.end()) {
300  msg << "Error: Duplicate seq_id <"
301  << key->key
302  << "> is found multiple times across volumes.";
304  }
305  keys.insert(key);
306  } else {
307  delete key;
308  }
309  }
310 };
311 
313 {
314  if (m_Closed)
315  return;
316 
317  m_Closed = true;
318 
319  x_Publish();
320  m_Sequence.erase();
321  m_Ambig.erase();
322 
323  if (! m_Volume.Empty()) {
324  m_Volume->Close();
325 
326  if (m_UseGiMask) {
327  for (unsigned int i=0; i<m_GiMasks.size(); ++i) {
328  m_GiMasks[i]->Close();
329  }
330  }
331 
332  if (m_VolumeList.size() == 1) {
334  }
335 
336  // disable the check for duplicate ids across volumes
337  /*
338  else if (m_Indices != CWriteDB::eNoIndex) {
339  set<CWriteDB_IsamKey<string> *, CWriteDB_IsamKey_Compare<string> > sids;
340  ITERATE(vector< CRef<CWriteDB_Volume> >, iter, m_VolumeList) {
341  string fn = (*iter)->GetVolumeName() + (m_Protein ? ".psd" : ".nsd");
342  if (CFile(fn).Exists()) {
343  sids.insert(new CWriteDB_IsamKey<string>(fn));
344  }
345  }
346  s_CheckDuplicateIds(sids);
347 
348  set<CWriteDB_IsamKey<Int4> *, CWriteDB_IsamKey_Compare<Int4> > nids;
349  ITERATE(vector< CRef<CWriteDB_Volume> >, iter, m_VolumeList) {
350  string fn = (*iter)->GetVolumeName() + (m_Protein ? ".pnd" : ".nnd");
351  if (CFile(fn).Exists()) {
352  nids.insert(new CWriteDB_IsamKey<Int4>(fn));
353  }
354  }
355  s_CheckDuplicateIds(nids);
356  } */
357 
358  if (m_VolumeList.size() > 1 || m_UseGiMask) {
359  x_MakeAlias();
360  }
361 
362  m_Volume.Reset();
363  }
364 }
365 
367 {
368  return m_Dbname + (m_Protein ? ".pal" : ".nal");
369 }
370 
372 {
373  string dblist;
374  if (m_VolumeList.size() > 1) {
375  for(unsigned i = 0; i < m_VolumeList.size(); i++) {
376  if (dblist.size())
377  dblist += " ";
378 
380  }
381  } else {
382  dblist = m_Dbname;
383  }
384 
385  string masklist("");
386  if (m_UseGiMask) {
387  for (unsigned i = 0; i < m_GiMasks.size(); i++) {
388  const string & x = m_GiMasks[i]->GetName();
389  if (x != "") {
390  masklist += x + " ";
391  }
392  }
393  }
394 
395  string nm = x_MakeAliasName();
396 
397  ofstream alias(nm.c_str());
398 
399  alias << "#\n# Alias file created: " << m_Date << "\n#\n"
400  << "TITLE " << m_Title << "\n"
401  << "DBLIST " << dblist << "\n";
402 
403  if (masklist != "") {
404  alias << "MASKLIST " << masklist << "\n";
405  }
406 }
407 
409  string & bin_hdr)
410 {
411  if (! bin_hdr.empty()) {
412  return;
413  }
414 
415  if (! bioseq.CanGetDescr()) {
416  return;
417  }
418 
419  // Getting the binary headers, when they exist, is probably faster
420  // than building new deflines from the 'visible' CBioseq parts.
421 
422  vector< vector< char >* > bindata;
423 
424  ITERATE(list< CRef< CSeqdesc > >, iter, bioseq.GetDescr().Get()) {
425  if ((**iter).IsUser()) {
426  const CUser_object & uo = (**iter).GetUser();
427  const CObject_id & oi = uo.GetType();
428 
429  if (oi.IsStr() && oi.GetStr() == kAsnDeflineObjLabel) {
430  if (uo.CanGetData()) {
431  const vector< CRef< CUser_field > > & D = uo.GetData();
432 
433  if (D.size() &&
434  D[0].NotEmpty() &&
435  D[0]->CanGetLabel() &&
436  D[0]->GetLabel().IsStr() &&
437  D[0]->GetLabel().GetStr() == kAsnDeflineObjLabel &&
438  D[0]->CanGetData() &&
439  D[0]->GetData().IsOss()) {
440 
441  bindata = D[0]->GetData().GetOss();
442  break;
443  }
444  }
445  }
446  }
447  }
448 
449  if (! bindata.empty()) {
450  if (bindata[0] && (! bindata[0]->empty())) {
451  vector<char> & b = *bindata[0];
452 
453  bin_hdr.assign(& b[0], b.size());
454  }
455  }
456 }
457 
458 static void
459 s_CheckEmptyLists(CRef<CBlast_def_line_set> & deflines, bool owner);
460 
463 {
465  SerialAssign(*bdls, *deflines);
466  s_CheckEmptyLists(bdls, true);
467  return bdls;
468 }
469 
470 static void
472 {
473  CBlast_def_line_set * bdls = 0;
474  CConstRef<CBlast_def_line_set> here(&*deflines);
475 
476  if (! owner) {
477  here = s_EditDeflineSet(here);
478  return;
479  }
480 
481  bdls = const_cast<CBlast_def_line_set*>(here.GetPointer());
482 
483  NON_CONST_ITERATE(list< CRef< CBlast_def_line > >, iter, bdls->Set()) {
484  CRef<CBlast_def_line> defline = *iter;
485  if (defline->CanGetMemberships() &&
486  defline->GetMemberships().size() == 0) {
487 
488  defline->ResetMemberships();
489  }
490 
491  if (defline->CanGetLinks() &&
492  defline->GetLinks().size() == 0) {
493 
494  defline->ResetLinks();
495  }
496  }
497 
498  deflines.Reset(bdls);
499 }
500 
501 void
504  const vector< vector<int> > & membbits,
505  const vector< vector<int> > & linkouts,
506  int pig)
507 {
508  if (! (bioseq.CanGetDescr() && bioseq.CanGetId())) {
509  return;
510  }
511 
512  vector<int> taxids;
513  string titles;
514 
515  // Scan the CBioseq for taxids and the title string.
516 
517  ITERATE(list< CRef< CSeqdesc > >, iter, bioseq.GetDescr().Get()) {
518  const CSeqdesc & desc = **iter;
519 
520  if (desc.IsTitle()) {
521  //defline->SetTitle((**iter)->GetTitle());
522  titles = (**iter).GetTitle();
523  }
524  else {
525  const COrg_ref * org_pt = NULL;
526  if (desc.IsSource()) {
527  org_pt = &(desc.GetSource().GetOrg());
528  }
529  else if( desc.IsOrg()) {
530  org_pt = &(desc.GetOrg());
531  }
532 
533  if((NULL != org_pt) && org_pt->CanGetDb()) {
534  ITERATE(vector< CRef< CDbtag > >,
535  dbiter,
536  org_pt->GetDb()) {
537 
538  if ((**dbiter).CanGetDb() &&
539  (**dbiter).GetDb() == "taxon") {
540 
541  const CObject_id & oi = (**dbiter).GetTag();
542 
543  if (oi.IsId()) {
544  //defline->SetTaxid(oi.GetId());
545  taxids.push_back(oi.GetId());
546  }
547  }
548  }
549  }
550  }
551  }
552 
553  // The bioseq has a field containing the ids for the first
554  // defline. The title string contains the title for the first
555  // defline, plus all the other defline titles and ids. This code
556  // unpacks them and builds a normal blast defline set.
557 
558  list< CRef<CSeq_id> > ids = bioseq.GetId();
559 
560  unsigned taxid_i(0), mship_i(0), links_i(0);
561  bool used_pig(false);
562 
563  // Build the deflines.
564 
566  CRef<CBlast_def_line> defline;
567 
568  while(! ids.empty()) {
569  defline.Reset(new CBlast_def_line);
570 
571  defline->SetSeqid() = ids;
572  ids.clear();
573 
574  /*
575  size_t pos = titles.find(" >");
576  string T;
577 
578  if (pos != titles.npos) {
579  T.assign(titles, 0, pos);
580  titles.erase(0, pos + 2);
581 
582  pos = titles.find(" ");
583  string nextid;
584 
585  if (pos != titles.npos) {
586  nextid.assign(titles, 0, pos);
587  titles.erase(0, pos + 1);
588  } else {
589  nextid.swap(titles);
590  }
591 
592  // Parse '|' separated ids.
593  if ( nextid.find('|') == NPOS
594  || !isalpha((unsigned char)(nextid[0]))) {
595  ids.push_back(CRef<CSeq_id> (new CSeq_id(CSeq_id::e_Local, nextid)));
596  } else {
597  CSeq_id::ParseFastaIds(ids, nextid);
598  }
599  } else {
600  T = titles;
601  }
602 
603  */
604  defline->SetTitle(titles);
605 
606  if (taxid_i < taxids.size()) {
607  defline->SetTaxid(taxids[taxid_i++]);
608  }
609 
610  if (mship_i < membbits.size()) {
611  const vector<int> & V = membbits[mship_i++];
612  defline->SetMemberships().assign(V.begin(), V.end());
613  }
614 
615  if (links_i < linkouts.size()) {
616  const vector<int> & V = linkouts[mship_i++];
617  defline->SetLinks().assign(V.begin(), V.end());
618  }
619 
620  if ((! used_pig) && pig) {
621  defline->SetOther_info().push_back(pig);
622  used_pig = true;
623  }
624 
625  bdls->Set().push_back(defline);
626  }
627 
628  s_CheckEmptyLists(bdls, true);
629  deflines = bdls;
630 }
631 
632 void CWriteDB_Impl::
633 x_SetDeflinesFromBinary(const string & bin_hdr,
635 {
637 
638  istringstream iss(bin_hdr);
639  iss >> MSerial_AsnBinary >> *bdls;
640 
641  s_CheckEmptyLists(bdls, true);
642  deflines.Reset(&* bdls);
643 }
644 
645 
646 static bool s_UseFastaReaderDeflines(CConstRef<CBioseq> & bioseq, CConstRef<CBlast_def_line_set> & deflines, bool long_seqid)
647 {
648  if(deflines.Empty())
649  return false;
650 
651  const CSeq_id * bioseq_id = bioseq->GetNonLocalId();
652 
653  if(bioseq_id == NULL ||
654  // For bare pir and prf ids go with the one from defline.
655  // This is to parse bare ids as local ones. The bare pdb ids are pdb in
656  // bioseq (parsed by CFastaReader), but local in deflines (parsed by
657  // CSeq_id).
658  (!long_seqid && (bioseq_id->IsPrf() || bioseq_id->IsPir()))) {
659  return true;
660  }
661 
662  // Bioseq has non-local id, make sure at least one id is non-local from CFastaReader
663  // defline
664  ITERATE(list< CRef<CBlast_def_line> >, iter, deflines->Get()) {
665  CRef<CSeq_id> id = FindBestChoice((**iter).GetSeqid(), &CSeq_id::BestRank);
666  if (id.NotEmpty() && !id->IsLocal()) {
667  return true;
668  }
669  }
670  return false;
671 
672 }
673 
674 void
677  string & bin_hdr,
678  const vector< vector<int> > & membbits,
679  const vector< vector<int> > & linkouts,
680  int pig,
681  int OID,
682  bool parse_ids,
683  bool long_ids)
684 {
685  bool use_bin = (deflines.Empty() && pig == 0);
686 
687  if (! bin_hdr.empty() && OID<0) {
688  return;
689  }
690 
691  if (deflines.Empty()) {
692  // Use bioseq if deflines are not provided.
693 
694  if (bioseq.Empty()) {
696  eArgErr,
697  "Error: Cannot find CBioseq or deflines.");
698  }
699 
700  // CBioseq objects from SeqDB have binary headers embedded in
701  // them. If these are found, we try to use them. However,
702  // using binary headers may not help us much if we also want
703  // lists of sequence identifiers (for building ISAM files).
704 
705  if (use_bin) {
706  x_GetBioseqBinaryHeader(*bioseq, bin_hdr);
707  }
708 
709  if (bin_hdr.empty()) {
710  x_GetFastaReaderDeflines(*bioseq,
711  deflines,
712  membbits,
713  linkouts,
714  pig,
715  false,
716  parse_ids,
717  long_ids);
718  }
719 
720  if(!s_UseFastaReaderDeflines(bioseq, deflines, long_ids)) {
721  deflines.Reset();
722  }
723 
724  if (bin_hdr.empty() && deflines.Empty()) {
726  deflines,
727  membbits,
728  linkouts,
729  pig);
730  }
731  }
732 
733  if (bin_hdr.empty() &&
734  (deflines.Empty() || deflines->Get().empty())) {
735 
737  eArgErr,
738  "Error: No deflines provided.");
739  }
740 
741  if (pig != 0) {
742  const list<int> * L = 0;
743 
744  if (deflines->Get().front()->CanGetOther_info()) {
745  L = & deflines->Get().front()->GetOther_info();
746  }
747 
748  // If the pig does not agree with the current value, set the
749  // new value and force a rebuild of the binary headers. If
750  // there is more than one value in the list, leave the others
751  // in place.
752 
753  if ((L == 0) || L->empty()) {
755  bdls->Set().front()->SetOther_info().push_back(pig);
756 
757  deflines.Reset(&* bdls);
758  bin_hdr.erase();
759  } else if (L->front() != pig) {
761  bdls->Set().front()->SetOther_info().front() = pig;
762 
763  deflines.Reset(&* bdls);
764  bin_hdr.erase();
765  }
766  }
767 
768  if (OID>=0) {
769  // Re-inject the BL_ORD_ID
770  CRef<CSeq_id> gnl_id(new CSeq_id);
771  gnl_id->SetGeneral().SetDb("BL_ORD_ID");
772  gnl_id->SetGeneral().SetTag().SetId(OID);
774  bdls->Set().front()->SetSeqid().front() = gnl_id;
775 
776  deflines.Reset(&* bdls);
777  }
778 
779  if (bin_hdr.empty() || OID>=0) {
780  // Compress the deflines to binary.
781 
782  CNcbiOstrstream oss;
783  oss << MSerial_AsnBinary << *deflines;
784  bin_hdr = CNcbiOstrstreamToString(oss);
785  }
786 
787  if (deflines.Empty() && (! bin_hdr.empty())) {
788  // Uncompress the deflines from binary.
789 
790  x_SetDeflinesFromBinary(bin_hdr, deflines);
791  }
792 }
793 
795 {
796  int OID = -1;
797  if (! m_ParseIDs) {
798  OID = (m_Volume ) ? m_Volume->GetOID() : 0;
799  }
801  m_Deflines,
802  m_BinHdr,
804  m_Linkouts,
805  m_Pig,
806  OID,
807  m_ParseIDs,
808  m_LongSeqId);
809 }
810 
812 {
813  if (! m_Ids.empty()) {
814  return;
815  }
816 
817  if (m_Deflines.Empty()) {
818  if (m_BinHdr.empty()) {
820  eArgErr,
821  "Error: Cannot find IDs or deflines.");
822  }
823 
825  }
826 
827  ITERATE(list< CRef<CBlast_def_line> >, iter, m_Deflines->Get()) {
828  const list< CRef<CSeq_id> > & ids = (**iter).GetSeqid();
829  // m_Ids.insert(m_Ids.end(), ids.begin(), ids.end());
830  // Spelled out for WorkShop. :-/
831  m_Ids.reserve(m_Ids.size() + ids.size());
832  ITERATE (list<CRef<CSeq_id> >, it, ids) {
833  m_Ids.push_back(*it);
834  }
835  }
836 }
837 
839 {
840  // Scan and mask the sequence itself.
841  for(unsigned i = 0; i < m_Sequence.size(); i++) {
842  if (m_MaskLookup[m_Sequence[i] & 0xFF] != 0) {
843  m_Sequence[i] = m_MaskByte[0];
844  }
845  }
846 }
847 
849 {
850  if (! m_SeqLength) {
851  if (! m_Sequence.empty()) {
853  } else if (m_SeqVector.size()) {
855  } else if (! (m_Bioseq &&
856  m_Bioseq->CanGetInst() &&
857  m_Bioseq->GetInst().GetLength())) {
858 
860  eArgErr,
861  "Need sequence data.");
862  }
863 
864  if (m_Bioseq.NotEmpty()) {
865  const CSeq_inst & si = m_Bioseq->GetInst();
866  m_SeqLength = si.GetLength();
867  }
868  }
869 
870  return m_SeqLength;
871 }
872 
874 {
875  if (! m_Sequence.empty())
876  return;
877 
878  if (! (m_Bioseq.NotEmpty() && m_Bioseq->CanGetInst())) {
880  eArgErr,
881  "Need sequence data.");
882  }
883 
884  const CSeq_inst & si = m_Bioseq->GetInst();
885 
886  if (m_Bioseq->GetInst().CanGetSeq_data()) {
887  const CSeq_data & sd = si.GetSeq_data();
888 
889  string msg;
890 
891  switch(sd.Which()) {
894  break;
895 
898  break;
899 
902  break;
903 
906  break;
907 
910  break;
911 
914  break;
915 
916  default:
917  msg = "Need to write conversion for data type [";
918  msg += NStr::IntToString((int) sd.Which());
919  msg += "].";
920  }
921 
922  if (! msg.empty()) {
923  NCBI_THROW(CWriteDBException, eArgErr, msg);
924  }
925  } else {
926  int sz = m_SeqVector.size();
927 
928  if (sz == 0) {
930  eArgErr,
931  "No sequence data in Bioseq, "
932  "and no Bioseq_Handle available.");
933  }
934 
935  if (m_Protein) {
936  // I add one to the string length to allow the "i+1" in
937  // the loop to be done safely.
938 
939  m_Sequence.reserve(sz);
941  } else {
942  // I add one to the string length to allow the "i+1" in the
943  // loop to be done safely.
944 
945  string na8;
946  na8.reserve(sz + 1);
947  m_SeqVector.GetSeqData(0, sz, na8);
948  na8.resize(sz + 1);
949 
950  string na4;
951  na4.resize((sz + 1) / 2);
952 
953  for(int i = 0; i < sz; i += 2) {
954  na4[i/2] = (na8[i] << 4) + na8[i+1];
955  }
956 
957  WriteDB_Ncbi4naToBinary(na4.data(),
958  (int) na4.size(),
959  (int) si.GetLength(),
960  m_Sequence,
961  m_Ambig);
962  }
963  }
964 }
965 
967 {
968 }
969 
970 // The CPU should be kept at 190 degrees for 10 minutes.
972 {
973  // We need sequence, ambiguity, and binary deflines. If any of
974  // these is missing, it is created from other data if possible.
975 
976  // For now I am disabling binary headers, because in normal usage
977  // I would expect to see sequences from ID1 or similar, and the
978  // non-binary case is slightly more complex.
979 
980  x_CookHeader();
981  x_CookIds();
982  x_CookSequence();
983  x_CookColumns();
984 
985  if (m_Protein && m_MaskedLetters.size()) {
986  x_MaskSequence();
987  }
988 }
989 
991 {
992  return m_HaveSequence;
993 }
994 
996 {
998  m_HaveSequence = true;
999 }
1000 
1002 {
1004  m_HaveSequence = false;
1005 }
1006 
1008 {
1009  // This test should fail only on the first call, or if an
1010  // exception was thrown.
1011 
1012  if (x_HaveSequence()) {
1013  _ASSERT(! (m_Bioseq.Empty() && m_Sequence.empty()));
1014 
1016  } else {
1017  return;
1018  }
1019 
1020  x_CookData();
1021 
1022  bool done = false;
1023 
1024  if (! m_Volume.Empty()) {
1026  m_Ambig,
1027  m_BinHdr,
1028  m_Ids,
1029  m_Pig,
1030  m_Hash,
1031  m_Blobs,
1033  }
1034 
1035  if (! done) {
1036  int index = (int) m_VolumeList.size();
1037 
1038  if (m_Volume.NotEmpty()) {
1039  m_Volume->Close();
1040  }
1041 
1042  {
1044  m_Protein,
1045  m_Title,
1046  m_Date,
1047  index,
1048  m_MaxFileSize,
1050  m_Indices));
1051 
1052  m_VolumeList.push_back(m_Volume);
1053 
1054 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1055  (!defined(NCBI_COMPILER_MIPSPRO)) )
1056  _ASSERT(m_Blobs.size() == m_ColumnTitles.size() * 2);
1057  _ASSERT(m_Blobs.size() == m_ColumnMetas.size() * 2);
1058  _ASSERT(m_Blobs.size() == m_HaveBlob.size() * 2);
1059 
1060  for(size_t i = 0; i < m_ColumnTitles.size(); i++) {
1062  m_ColumnMetas[i],
1063  m_MaxFileSize);
1064  }
1065 #endif
1066  }
1067 
1068  // need to reset OID, hence recalculate the header and ids
1069  x_CookHeader();
1070  x_CookIds();
1071 
1073  m_Ambig,
1074  m_BinHdr,
1075  m_Ids,
1076  m_Pig,
1077  m_Hash,
1078  m_Blobs,
1080 
1081  if (! done) {
1083  eArgErr,
1084  "Cannot write sequence to volume.");
1085  }
1086  }
1087 }
1088 
1090 {
1092  bdls(const_cast<CBlast_def_line_set*>(& deflines));
1093 
1094  s_CheckEmptyLists(bdls, true);
1095  m_Deflines = bdls;
1096 }
1097 
/// Return the larger of the absolute values of two integers.
///
/// @param a  First value.
/// @param b  Second value.
/// @return max(|a|, |b|).
inline int s_AbsMax(int a, int b)
{
    const int abs_a = (a < 0) ? -a : a;
    const int abs_b = (b < 0) ? -b : b;

    // Same selection rule as std::max: the second wins only if strictly larger.
    return (abs_a < abs_b) ? abs_b : abs_a;
}
1103 
1104 // Filtering data format on disk:
1105 //
1106 // Size of integer type for this blob (1, 2, or 4) (4 bytes).
1107 //
1108 // Array of filtering types:
1109 // Filter-type (enumeration)
1110 // Array of offsets:
1111 // Start Offset
1112 // End Offset
1113 //
1114 // The isize is one of 1, 2, or 4, written in the first byte, and
1115 // followed by 0, 1, or 3 NUL bytes to align the data offset to a
1116 // multiple of `isize'.
1117 //
1118 // All other integer values in this array use isize bytes, including
1119 // array counts and the `type' enumerations. After all the offsets are
1120 // written, the blob is aligned to a multiple of 4 using the `eSimple'
1121 // method.
1122 //
1123 // Each array is an element count followed by that many elements.
1124 
1125 #if 0
1126 
1127 // I think this is a better approach; but it needs more testing,
1128 // particularly with regard to platform portability.
1129 
1130 struct SWriteInt1 {
1131  static void WriteInt(CBlastDbBlob & blob, int value)
1132  {
1133  blob.WriteInt1(value);
1134  }
1135 };
1136 
1137 struct SWriteInt2 {
1138  static void WriteInt(CBlastDbBlob & blob, int value)
1139  {
1140  blob.WriteInt2(value);
1141  }
1142 };
1143 
1144 struct SWriteInt4 {
1145  static void WriteInt(CBlastDbBlob & blob, int value)
1146  {
1147  blob.WriteInt4(value);
1148  }
1149 };
1150 
1151 template<class TWriteSize, class TRanges>
1152 void s_WriteRanges(CBlastDbBlob & blob,
1153  int count,
1154  const TRanges & ranges)
1155 {
1156  typedef vector< pair<TSeqPos, TSeqPos> > TPairVector;
1157 
1158  Int4 num_written = 0;
1159  TWriteSize::WriteInt(blob, count);
1160 
1161  for ( typename TRanges::const_iterator r1 = (ranges).begin(),
1162  r1_end = (ranges).end();
1163  r1 != r1_end;
1164  ++r1 ) {
1165 
1166  if (r1->offsets.size()) {
1167  num_written ++;
1168  TWriteSize::WriteInt(blob, r1->algorithm_id);
1169  TWriteSize::WriteInt(blob, r1->offsets.size());
1170 
1171  ITERATE(TPairVector, r2, r1->offsets) {
1172  TWriteSize::WriteInt(blob, r2->first);
1173  TWriteSize::WriteInt(blob, r2->second);
1174  }
1175  }
1176  }
1177 
1178  _ASSERT(num_written == count);
1179 }
1180 
1181 #endif
1182 
1183 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1184  (!defined(NCBI_COMPILER_MIPSPRO)) )
1185 
1187  const vector <TGi> & gis)
1188 {
1189  // No GI is found for the sequence
1190  // TODO should we generate a warning?
1191  if (m_UseGiMask && !gis.size()) {
1192  return;
1193  }
1194 
1195  TSeqPos seq_length = x_ComputeSeqLength();
1196 
1197  // Check validity of data and determine maximum integer value
1198  // stored here before writing anything. The best numeric_size
1199  // will be selected; this numeric size is applied uniformly to all
1200  // integers in this blob (except for the first one, which is the
1201  // integer size itself, and which is always a single byte.)
1202 
1203  typedef vector< pair<TSeqPos, TSeqPos> > TPairVector;
1204 
1205  int range_list_count = 0;
1206  int offset_pairs_count = 0;
1207 
1208 
1209  ITERATE(CMaskedRangesVector, r1, ranges) {
1210  if (r1->empty()) {
1211  continue;
1212  }
1213 
1214  range_list_count ++;
1215  offset_pairs_count += r1->offsets.size();
1216 
1217  if ( !m_MaskAlgoRegistry.IsRegistered(r1->algorithm_id) ) {
1218  string msg("Error: Algorithm IDs must be registered before use.");
1219  msg += " Unknown algorithm ID = " +
1220  NStr::IntToString((int)r1->algorithm_id);
1221  NCBI_THROW(CWriteDBException, eArgErr, msg);
1222  }
1223 
1224 
1225  ITERATE(TPairVector, r2, r1->offsets) {
1226  if ((r2->first > r2->second) ||
1227  (r2->second > seq_length)) {
1228 
1230  eArgErr,
1231  "Error: Masked data offsets out of bounds.");
1232  }
1233  }
1234  }
1235 
1236 
1237  // Gi-based masks
1238  if (m_UseGiMask) {
1239  ITERATE(CMaskedRangesVector, r1, ranges) {
1240  if (r1->offsets.size()) {
1241  m_GiMasks[m_MaskAlgoMap[r1->algorithm_id]]
1242  ->AddGiMask(gis, r1->offsets);
1243  }
1244  }
1245  return;
1246  }
1247 
1248  // OID-based masks
1249  const int col_id = x_GetMaskDataColumnId();
1250  CBlastDbBlob & blob = SetBlobData(col_id);
1251  blob.Clear();
1252  blob.WriteInt4(range_list_count);
1253 
1254  CBlastDbBlob & blob2 = SetBlobData(col_id);
1255  blob2.Clear();
1256  blob2.WriteInt4(range_list_count);
1257 
1258  ITERATE(CMaskedRangesVector, r1, ranges) {
1259  if (r1->offsets.size()) {
1260  blob.WriteInt4(r1->algorithm_id);
1261  blob.WriteInt4(r1->offsets.size());
1262  blob2.WriteInt4(r1->algorithm_id);
1263  blob2.WriteInt4(r1->offsets.size());
1264 
1265  ITERATE(TPairVector, r2, r1->offsets) {
1266  blob.WriteInt4(r2->first);
1267  blob.WriteInt4(r2->second);
1268  blob2.WriteInt4_LE(r2->first);
1269  blob2.WriteInt4_LE(r2->second);
1270  }
1271  }
1272  }
1273 
1276 }
1277 
1278 static const string s_EscapeColon(const string &in) {
1279  const char l = 0x1;
1280  return NStr::Replace(in, ":", string(l,1));
1281 }
1282 
1283 int CWriteDB_Impl::
1285  const string & options,
1286  const string & name)
1287 {
1288  int algorithm_id = m_MaskAlgoRegistry.Add(program, options);
1289 
1290  string key = NStr::IntToString(algorithm_id);
1291  string value = NStr::IntToString((int)program) + ":" +
1292  s_EscapeColon(options);
1293 
1294  if (m_UseGiMask) {
1295  m_MaskAlgoMap[algorithm_id] = m_GiMasks.size();
1297  (new CWriteDB_GiMask(name, value, m_MaxFileSize)));
1298  } else {
1300  }
1301 
1302  return algorithm_id;
1303 }
1304 
1305 int CWriteDB_Impl::
1306 RegisterMaskAlgorithm(const string &id,
1307  const string &description,
1308  const string &options)
1309 {
1310  int algorithm_id = m_MaskAlgoRegistry.Add(id);
1311 
1312  string key = NStr::IntToString(algorithm_id);
1313  string value = "100:" +
1314  s_EscapeColon(options) + ":" +
1315  s_EscapeColon(id) + ":" +
1316  s_EscapeColon(description);
1317 
1319 
1320  return algorithm_id;
1321 }
1322 
1323 int CWriteDB_Impl::FindColumn(const string & title) const
1324 {
1325  for(int i = 0; i < (int) m_ColumnTitles.size(); i++) {
1326  if (title == m_ColumnTitles[i]) {
1327  return i;
1328  }
1329  }
1330 
1331  return -1;
1332 }
1333 
1334 int CWriteDB_Impl::CreateColumn(const string & title, bool mbo)
1335 {
1336  _ASSERT(FindColumn(title) == -1);
1337 
1338  size_t col_id = m_Blobs.size() / 2;
1339 
1340  _ASSERT(m_HaveBlob.size() == col_id);
1341  _ASSERT(m_ColumnTitles.size() == col_id);
1342  _ASSERT(m_ColumnMetas.size() == col_id);
1343 
1344  CRef<CBlastDbBlob> new_blob(new CBlastDbBlob);
1345  CRef<CBlastDbBlob> new_blob2(new CBlastDbBlob);
1346 
1347  m_Blobs .push_back(new_blob);
1348  m_Blobs .push_back(new_blob2);
1349  m_HaveBlob .push_back(0);
1350  m_ColumnTitles.push_back(title);
1351  m_ColumnMetas .push_back(TColumnMeta());
1352 
1353  if (m_Volume.NotEmpty()) {
1354  size_t id2 = m_Volume->CreateColumn(title, m_ColumnMetas.back(), mbo);
1355  _ASSERT(id2 == col_id);
1356  (void)id2; // get rid of compiler warning
1357  }
1358 
1359  return col_id;
1360 }
1361 
1363  const string & key,
1364  const string & value)
1365 {
1366  if ((col_id < 0) || (col_id >= (int) m_ColumnMetas.size())) {
1367  NCBI_THROW(CWriteDBException, eArgErr,
1368  "Error: provided column ID is not valid");
1369  }
1370 
1371  m_ColumnMetas[col_id][key] = value;
1372 
1373  if (m_Volume.NotEmpty()) {
1374  m_Volume->AddColumnMetaData(col_id, key, value);
1375  }
1376 }
1377 
1379 {
1380  if ((col_id < 0) || (col_id * 2 >= (int) m_Blobs.size())) {
1381  NCBI_THROW(CWriteDBException, eArgErr,
1382  "Error: provided column ID is not valid");
1383  }
1384 
1385  if (m_HaveBlob[col_id] > 1) {
1386  NCBI_THROW(CWriteDBException, eArgErr,
1387  "Error: Already have blob for this sequence and column");
1388  }
1389 
1390  ++m_HaveBlob[col_id];
1391 
1392  // Blobs are reused to reduce buffer reallocation; a missing blob
1393  // means the corresponding column does not exist.
1394 
1395  return *m_Blobs[col_id * 2 + m_HaveBlob[col_id] - 1];
1396 }
1397 #endif
1398 
1400 {
1401  m_Pig = pig;
1402 }
1403 
1405 {
1406  m_MaxFileSize = sz;
1407 }
1408 
1410 {
1411  m_MaxVolumeLetters = sz;
1412 }
1413 
1416  bool long_seqids)
1417 {
1418  // Get information
1419 
1421  string binary_header;
1422  vector< vector<int> > v1, v2;
1423 
1424  CConstRef<CBioseq> bsref(& bs);
1425  x_ExtractDeflines(bsref, deflines, binary_header, v2, v2, 0, -1, parse_ids,
1426  long_seqids);
1427 
1428  // Convert to return type
1429 
1431  bdls.Reset(const_cast<CBlast_def_line_set*>(&*deflines));
1432 
1433  return bdls;
1434 }
1435 
1436 void CWriteDB_Impl::SetMaskedLetters(const string & masked)
1437 {
1438  // Only supported for protein.
1439 
1440  if (! m_Protein) {
1442  eArgErr,
1443  "Error: Nucleotide masking not supported.");
1444  }
1445 
1446  m_MaskedLetters = masked;
1447 
1448  if (masked.empty()) {
1449  vector<char> none;
1450  m_MaskLookup.swap(none);
1451  return;
1452  }
1453 
1454  // Convert set of masked letters to stdaa, use the result to build
1455  // a lookup table.
1456 
1457  string mask_bytes;
1460  0,
1461  (int) m_MaskedLetters.size(),
1462  mask_bytes,
1464 
1465  _ASSERT(mask_bytes.size() == m_MaskedLetters.size());
1466 
1467  // Build a table of character-to-bool.
1468  // (Bool is represented by char 0 and 1.)
1469 
1470  m_MaskLookup.resize(256, (char)0);
1471 
1472  for (unsigned i = 0; i < mask_bytes.size(); i++) {
1473  int ch = ((int) mask_bytes[i]) & 0xFF;
1474  m_MaskLookup[ch] = (char)1;
1475  }
1476 
1477  // Convert the masking character - always 'X' - to stdaa.
1478 
1479  if (m_MaskByte.empty()) {
1480  string mask_byte = "X";
1481 
1482  CSeqConvert::Convert(mask_byte,
1484  0,
1485  1,
1486  m_MaskByte,
1488 
1489  _ASSERT(m_MaskByte.size() == 1);
1490  }
1491 }
1492 
1493 void CWriteDB_Impl::ListVolumes(vector<string> & vols)
1494 {
1495  vols.clear();
1496 
1497  ITERATE(vector< CRef<CWriteDB_Volume> >, iter, m_VolumeList) {
1498  vols.push_back((**iter).GetVolumeName());
1499  }
1500 }
1501 
1502 void CWriteDB_Impl::ListFiles(vector<string> & files)
1503 {
1504  files.clear();
1505 
1506  ITERATE(vector< CRef<CWriteDB_Volume> >, iter, m_VolumeList) {
1507  (**iter).ListFiles(files);
1508  }
1509 
1510  if (m_VolumeList.size() > 1) {
1511  files.push_back(x_MakeAliasName());
1512  }
1513 }
1514 
1515 /// Compute the hash of a (raw) sequence.
1516 ///
1517 /// The hash of the provided sequence will be computed and assigned to
1518 /// the m_Hash field. For protein, the sequence is in the Ncbistdaa
1519 /// format. For nucleotide, the sequence and optional ambiguities are
1520 /// in 'raw' format, meaning they are packed just as sequences are
1521 /// packed in nsq files.
1522 ///
1523 /// @param sequence The sequence data. [in]
1524 /// @param ambiguities Nucleotide ambiguities are provided here. [in]
1526  const CTempString & ambig)
1527 {
1528  if (m_Protein) {
1529  m_Hash = SeqDB_SequenceHash(sequence.data(), sequence.size());
1530  } else {
1531  string na8;
1532  SeqDB_UnpackAmbiguities(sequence, ambig, na8);
1533  m_Hash = SeqDB_SequenceHash(na8.data(), na8.size());
1534  }
1535 }
1536 
1537 /// Compute the hash of a (Bioseq) sequence.
1538 ///
1539 /// The hash of the provided sequence will be computed and
1540 /// assigned to the m_Hash member. The sequence is packed as a
1541 /// CBioseq.
1542 ///
1543 /// @param sequence The sequence as a CBioseq. [in]
1545 {
1546  m_Hash = SeqDB_SequenceHash(sequence);
1547 }
1548 
1549 #define TAB_REPLACEMENT " "
1550 
1551 void CWriteDB_Impl::
1553  CConstRef<CBlast_def_line_set> & deflines,
1554  const vector< vector<int> > & membits,
1555  const vector< vector<int> > & linkout,
1556  int pig,
1557  bool accept_gt,
1558  bool parse_ids,
1559  bool long_seqids)
1560 {
1561  if (! bioseq.CanGetDescr()) {
1562  return;
1563  }
1564 
1565  string fasta;
1566 
1567  // Scan the CBioseq for the CFastaReader user object.
1568 
1569  ITERATE(list< CRef< CSeqdesc > >, iter, bioseq.GetDescr().Get()) {
1570  const CSeqdesc & desc = **iter;
1571 
1572  if (desc.IsUser() &&
1573  desc.GetUser().CanGetType() &&
1574  desc.GetUser().GetType().IsStr() &&
1575  desc.GetUser().GetType().GetStr() == "CFastaReader" &&
1576  desc.GetUser().CanGetData()) {
1577 
1578  const vector< CRef< CUser_field > > & D = desc.GetUser().GetData();
1579 
1580  ITERATE(vector< CRef< CUser_field > >, iter, D) {
1581  const CUser_field & f = **iter;
1582 
1583  if (f.CanGetLabel() &&
1584  f.GetLabel().IsStr() &&
1585  f.GetLabel().GetStr() == "DefLine" &&
1586  f.CanGetData() &&
1587  f.GetData().IsStr()) {
1588  fasta = NStr::Replace(f.GetData().GetStr(), "\\t", TAB_REPLACEMENT);
1589  fasta = NStr::ParseEscapes(fasta);
1590  break;
1591  }
1592  }
1593  }
1594  }
1595 
1596  if (fasta.empty())
1597  return;
1598 
1599  // The bioseq has a field contianing the ids for the first
1600  // defline. The title string contains the title for the first
1601  // defline, plus all the other defline titles and ids. This code
1602  // unpacks them and builds a normal blast defline set.
1603 
1604  unsigned mship_i(0), links_i(0);
1605  bool used_pig(false);
1606 
1607  // Build the deflines.
1608 
1610  CRef<CBlast_def_line> defline;
1611 
1612  if (!parse_ids) {
1613 
1614  // Generate an BL_ORD_ID in case no parse is needed
1615  CRef<CSeq_id> gnl_id(new CSeq_id());
1616  gnl_id->SetGeneral().SetDb("BL_ORD_ID");
1617  gnl_id->SetGeneral().SetTag().SetId(0); // will be filled later
1618 
1619  // Build the local defline.
1620  defline.Reset(new CBlast_def_line);
1621  defline->SetSeqid().push_back(gnl_id);
1622 
1623  string title(fasta, 1, fasta.size());
1624  // Replace ^A with space
1625  NStr::ReplaceInPlace(title, "\001", " ");
1626  // Replace tabs with three spaces
1627  NStr::ReplaceInPlace(title, "\t", TAB_REPLACEMENT);
1628  defline->SetTitle(title);
1629 
1630  if (mship_i < membits.size()) {
1631  const vector<int> & V = membits[mship_i++];
1632  defline->SetMemberships().assign(V.begin(), V.end());
1633  }
1634 
1635  if (links_i < linkout.size()) {
1636  const vector<int> & V = linkout[mship_i++];
1637  defline->SetLinks().assign(V.begin(), V.end());
1638  }
1639 
1640  if ((! used_pig) && pig) {
1641  defline->SetOther_info().push_back(pig);
1642  used_pig = true;
1643  }
1644 
1645  bdls->Set().push_back(defline);
1646 
1647  } else {
1648 
1649  int skip = 1;
1650  while(fasta.size()) {
1651  size_t id_start = skip;
1652  size_t pos_title = fasta.find(" ", skip);
1653  size_t pos_next = fasta.find("\001", skip);
1654  skip = 1;
1655 
1656  if (pos_next == fasta.npos) {
1657  if (accept_gt) {
1658  pos_next = fasta.find(" >");
1659  skip = 2;
1660  }
1661  } else {
1662  // If there is a ^A, turn off GT checking.
1663  accept_gt = false;
1664  }
1665 
1666  if (pos_next == fasta.npos) {
1667  pos_next = fasta.size();
1668  skip = 0;
1669  }
1670 
1671  if (pos_title == fasta.npos || pos_title >= pos_next) {
1672  // title field is missing
1673  pos_title = pos_next;
1674  }
1675 
1676  string ids(fasta, id_start, pos_title - id_start);
1677  if (pos_title == pos_next) pos_title--;
1678  string title(fasta, pos_title + 1, pos_next-pos_title - 1);
1679  string remaining(fasta, pos_next, fasta.size() - pos_next);
1680  fasta.swap(remaining);
1681 
1682  // Parse '|' seperated ids.
1683  list< CRef<CSeq_id> > seqids;
1684  if ( (ids.find('|') == NPOS && long_seqids)
1685  || !isalpha((unsigned char)(ids[0]))) {
1686 
1687  seqids.push_back(CRef<CSeq_id> (new CSeq_id(CSeq_id::e_Local, ids)));
1688  } else {
1689  CSeq_id::ParseFastaIds(seqids, ids);
1690 
1691  if (!long_seqids) {
1692 
1693  // If accession's molecule type is different than
1694  // expected, change sequence id to local. CFastaReader
1695  // cannot distingush between bare pir protein ids genbank
1696  // nucleotide ids.
1697  for (auto& it: seqids) {
1698  CSeq_id::EAccessionInfo info = it->IdentifyAccession();
1699  if (!it->IsLocal() && !it->IsGi() &&
1700  (info & (CSeq_id::fAcc_prot | CSeq_id::fAcc_nuc)) &&
1701  bioseq.IsAa() == !!(info & CSeq_id::fAcc_nuc)) {
1702 
1703  string label = it->GetSeqIdString(true);
1704  it.Reset(new CSeq_id(CSeq_id::e_Local, label));
1705  }
1706  }
1707  }
1708  }
1709 
1710  // Build the actual defline.
1711 
1712  defline.Reset(new CBlast_def_line);
1713  defline->SetSeqid().swap(seqids);
1714  defline->SetTitle(title);
1715 
1716  if (mship_i < membits.size()) {
1717  const vector<int> & V = membits[mship_i++];
1718  defline->SetMemberships().assign(V.begin(), V.end());
1719  }
1720 
1721  if (links_i < linkout.size()) {
1722  const vector<int> & V = linkout[mship_i++];
1723  defline->SetLinks().assign(V.begin(), V.end());
1724  }
1725 
1726  if ((! used_pig) && pig) {
1727  defline->SetOther_info().push_back(pig);
1728  used_pig = true;
1729  }
1730 
1731  bdls->Set().push_back(defline);
1732  }
1733  }
1734  s_CheckEmptyLists(bdls, true);
1735  deflines = bdls;
1736 }
1737 
1738 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1739  (!defined(NCBI_COMPILER_MIPSPRO)) )
1741 {
1742  if (m_MaskDataColumn == -1) {
1743  m_MaskDataColumn = CreateColumn("BlastDb/MaskData", true);
1744  }
1745  return m_MaskDataColumn;
1746 }
1747 #endif
1748 
1750 
1751 
EIndexType
Whether and what kind of indices to build.
Definition: writedb.hpp:103
CBioseq_Handle –.
bool CanGetLinks(void) const
Check if it is safe to call GetLinks method.
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:167
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:182
Uint8 m_MaxVolumeLetters
Max letters per volume.
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:802
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:783
string m_BinHdr
Binary header in format that will be written to disk.
static void x_SetDeflinesFromBinary(const string &bin_hdr, CConstRef< CBlast_def_line_set > &deflines)
Extract a defline set from a binary ASN.1 blob.
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
int WriteDB_FindSequenceLength(bool protein, const string &seq)
Compute length of sequence from raw packing.
void SetMaskedLetters(const string &masked)
Set bases that should not be used in sequences.
Definition: dbpivot.c:60
static const unsigned char msg[]
Definition: ccm.c:378
void WriteInt4(Int4 x)
Write a 4 byte integer to the blob.
Definition: seqdbblob.cpp:323
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:220
bool m_LongSeqId
If true, use long sequence id format (database|accession) for all accessions.
int m_MaskDataColumn
Column ID for masking data column.
void SetPig(int pig)
Set the PIG identifier of this sequence.
static void x_GetFastaReaderDeflines(const CBioseq &bioseq, CConstRef< CBlast_def_line_set > &deflines, const vector< vector< int > > &membits, const vector< vector< int > > &linkout, int pig, bool accept_gt, bool parse_ids, bool long_seqids)
Extract a defline set from a CFastaReader generated CBioseq.
bool WriteSequence(const string &seq, const string &ambig, const string &binhdr, const TIdList &ids, int pig, int hash, const TBlobList &blobs, int maskcol_id=-1)
Add a sequence to this volume.
CWriteDB_Column::TColumnMeta TColumnMeta
Per-column metadata.
#define T(s)
Definition: common.h:225
done
Definition: token1.c:1
static void s_CheckEmptyLists(CRef< CBlast_def_line_set > &deflines, bool owner)
unsigned NCBI_INT8_TYPE Uint8
Unsigned 8 byte sized integer.
Definition: ncbitype.h:146
signed int Int4
Alias for signed int.
Definition: ncbitype.h:120
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:62
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1296
vector< CRef< CBlastDbBlob > > m_Blobs
Blob data for the current sequence, indexed by letter.
int FindColumn(const string &title) const
Find an existing column.
~CWriteDB_Impl()
Destructor.
const struct ncbi::grid::netcache::search::fields::KEY key
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:836
bool CanGetLabel(void) const
Check if it is safe to call GetLabel method.
void WriteInt2(int x)
Write a 2 byte integer to the blob.
Definition: seqdbblob.cpp:313
bool IsAa(void) const
Definition: Bioseq.cpp:350
string m_MaskedLetters
Masked protein letters (IUPAC).
void Close()
Close the file and flush any remaining data to disk.
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3306
void SetMaxVolumeLetters(Uint8 sz)
Set the maximum letters in one volume.
void x_ComputeHash(const CTempString &sequence, const CTempString &ambiguities)
Compute the hash of a (raw) sequence.
const TStr & GetStr(void) const
Get the variant data.
void x_CookColumns()
Prepare column data to be appended to disk.
Tdata & Set(void)
Assign a value to data member.
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
TSeqid & SetSeqid(void)
Assign a value to Seqid data member.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids, bool long_seqids)
Extract deflines from a CBioseq.
bool m_Protein
True if DB is protein.
const TLabel & GetLabel(void) const
Get the Label member data.
vector< TRangeWithFuzz > TRanges
Definition: Seq_loc.cpp:4221
#define MSerial_AsnBinary
Definition: serialbase.hpp:682
const TData & GetData(void) const
Get the Data member data.
CWriteDB_Volume class.
CDirEntry –.
Definition: ncbifile.hpp:263
int m_Hash
Sequence hash for this sequence.
static Int4 GetInt4(const unsigned char *ptr)
Definition: ncbi_bswap.hpp:121
void SetTaxid(TTaxid value)
Assign a value to Taxid data member.
string m_Ambig
Ambiguities in format that will be written to disk.
static string MakeShortName(const string &base, int index)
Construct the short name for a volume.
bool AdvanceKey(const CWriteDB_IsamKey &other)
#define TAB_REPLACEMENT
string m_Date
Time stamp (for all volumes.)
void WriteDB_IupacnaToBinary(const CSeq_inst &si, string &seq, string &amb)
Build blast db nucleotide format from Iupacna Seq-inst.
CWriteDB_GiMask class.
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
bool x_HaveSequence() const
Returns true if we have unwritten sequence data.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a column.
const CSeq_id * GetNonLocalId() const
Find a non-local ID if present, consulting assembly details if all IDs for the overall sequence are l...
Definition: Bioseq.cpp:292
Add an index from sequence hash to OID.
Definition: writedb.hpp:125
STL namespace.
int CreateColumn(const string &title, bool mbo=false)
Set up a generic CWriteDB metadata column.
const TMemberships & GetMemberships(void) const
Get the Memberships member data.
void SetDeflines(const CBlast_def_line_set &deflines)
This method replaces any stored header data for the current sequence with the provided CBlast_def_lin...
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
#define NULL
Definition: ncbistd.hpp:225
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1343
bool CanGetDb(void) const
Check if it is safe to call GetDb method.
Definition: Org_ref_.hpp:457
CBlastDbBlob & SetBlobData(int col_id)
Get a blob to use for a given column letter.
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:262
#define kEmptyStr
Definition: ncbistr.hpp:120
int s_AbsMax(int a, int b)
consecutive codes for std aas
Definition: Seq_data_.hpp:113
void RenameSingle()
Rename all volumes files to single-volume names.
bool CanGetInst(void) const
Check if it is safe to call GetInst method.
Definition: Bioseq_.hpp:320
void x_ResetSequenceData()
Clear sequence data from last sequence.
vector< vector< int > > m_Linkouts
Linkout bits - outer vector is per-defline, inner is bits.
vector< char > m_MaskLookup
Is (blast-aa) byte masked?
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:776
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:280
#define NPOS
Definition: ncbistr.hpp:130
map< int, int > m_MaskAlgoMap
Mapping from algo_id to gi-mask id.
const TType & GetType(void) const
Get the Type member data.
C & SerialAssign(C &dest, const C &src, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
Definition: serialbase.hpp:483
int i
void WriteDB_Ncbi2naToBinary(const CSeq_inst &si, string &seq)
Build blast db nucleotide format from Ncbi2na Seq-inst.
int CreateColumn(const string &title, const TColumnMeta &meta, Uint8 max_sz, bool mbo=true)
Create a new database column.
vector< TColumnMeta > m_ColumnMetas
Meta data for all columns.
void ResetMemberships(void)
Reset Memberships data member.
vector< vector< int > > m_Memberships
Membership bits - outer vector is per-defline, inner is bits.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5079
bool IsSource(void) const
Check if variant Source is selected.
Definition: Seqdesc_.hpp:1190
void AddSequence(const CTempString &sequence, const CTempString &ambiguities)
Add a new sequence as raw sequence and ambiguity data.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:101
static const char si[8][64]
Definition: des.c:152
vector< string > m_ColumnTitles
Column titles.
bool CanGetType(void) const
Check if it is safe to call GetType method.
void ListVolumes(vector< string > &vols)
List Volumes.
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
bool IsStr(void) const
Check if variant Str is selected.
const TOrg & GetOrg(void) const
Get the variant data.
Definition: Seqdesc_.cpp:240
void ListFiles(vector< string > &files)
List Filenames.
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a column.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:644
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS. ...
Definition: Seq_id.cpp:1895
bool IsTitle(void) const
Check if variant Title is selected.
Definition: Seqdesc_.hpp:1026
Defines implementation class of WriteDB.
void SeqDB_UnpackAmbiguities(const CTempString &sequence, const CTempString &ambiguities, string &result)
Unpack an ambiguous nucleotide sequence.
Definition: seqdbvol.cpp:1597
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:702
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
void SetMaxFileSize(Uint8 sz)
Set the maximum size for any file in the database.
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
CWriteDBException.
bool m_ParseIDs
Generate ISAM files.
CSeqVector –.
Definition: seq_vector.hpp:64
CRef< CWriteDB_Volume > m_Volume
This volume is currently accepting sequences.
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
string GetName(void) const
Get the base entry name with extension (if any).
Definition: ncbifile.hpp:3855
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
CNcbiIfstream * source
void Clear()
Clear all owned data and reference an empty string.
Definition: seqdbblob.cpp:58
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
bool IsPrf(void) const
Check if variant Prf is selected.
Definition: Seq_id_.hpp:916
string m_Title
Title field of database.
const int & GetOID() const
Get the current OID of the volume.
CConstRef< CBlast_def_line_set > m_Deflines
Deflines to write as header.
static const string s_EscapeColon(const string &in)
void Close()
Close the volume.
void WriteDB_StdaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Stdaa protein Seq-inst.
const CharType(& source)[N]
Definition: pointer.h:1107
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
Definition: ncbitime.cpp:1515
static void x_ExtractDeflines(CConstRef< CBioseq > &bioseq, CConstRef< CBlast_def_line_set > &deflines, string &bin_hdr, const vector< vector< int > > &membbits, const vector< vector< int > > &linkouts, int pig, int OID=-1, bool parse_ids=true, bool long_seqid=false)
Get deflines from a CBioseq and other meta-data.
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value...
Definition: ncbiobj.hpp:1289
string m_Dbname
Database base name.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:198
bool CanGetMemberships(void) const
Check if it is safe to call GetMemberships method.
CWriteDB_IsamKey(const string &fn)
void WritePadBytes(int align, EPadding fmt)
Align the offset by writing pad bytes.
Definition: seqdbblob.cpp:562
static void x_BuildDeflinesFromBioseq(const CBioseq &bioseq, CConstRef< CBlast_def_line_set > &deflines, const vector< vector< int > > &membits, const vector< vector< int > > &linkout, int pig)
Construct deflines from a CBioseq and other meta-data.
void SetTitle(const TTitle &value)
Assign a value to Title data member.
class to support searching for duplicate isam keys
static MDB_envinfo info
Definition: mdb_load.c:37
T max(T x_, T y_)
const TLinks & GetLinks(void) const
Get the Links member data.
string m_MaskByte
Byte that replaced masked letters.
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:463
void ResetLinks(void)
Reset Links data member.
CTime –.
Definition: ncbitime.hpp:290
void x_SetHaveSequence()
Records that we now have unwritten sequence data.
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Register a type of filtering data found in this database.
int Add(EBlast_filter_program program, const string &options=string())
Attempt to register the information about a masking algorithm.
char value[7]
Definition: config.c:428
bool CanGetId(void) const
Check if it is safe to call GetId method.
Definition: Bioseq_.hpp:274
int m_Pig
PIG to attach to headers for protein sequences.
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition: pointer.h:1084
const CVect2< U > & v2
Definition: globals.hpp:440
static void x_GetBioseqBinaryHeader(const CBioseq &bioseq, string &binhdr)
Get binary version of deflines from 'user' data in Bioseq.
#define CHAR_BUFFER_SIZE
CSeqVector m_SeqVector
SeqVector for next sequence to write.
void x_MaskSequence()
Replace masked input letters with m_MaskByte value.
bool CanGetDescr(void) const
Check if it is safe to call GetDescr method.
Definition: Bioseq_.hpp:299
Blast defline related defines.
void x_MakeAlias()
Flush accumulated sequence data to volume.
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
TGeneral & SetGeneral(void)
Select the variant.
Definition: Seq_id_.cpp:375
bool m_Closed
True if database has been closed.
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3215
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1014
#define kAsnDeflineObjLabel
bool CanGetSeq_data(void) const
Check if it is safe to call GetSeq_data method.
Definition: Seq_inst_.hpp:796
static int BestRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:564
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6854
4 bit nucleic acid code
Definition: Seq_data_.hpp:107
Comparison function for set<CWriteDB_IsamKey<T> *>
static CRef< CBlast_def_line_set > s_EditDeflineSet(CConstRef< CBlast_def_line_set > &deflines)
USING_SCOPE(std)
Import C++ std namespace.
bool operator<(const CWriteDB_IsamKey &other) const
Uint8 m_MaxFileSize
Maximum size of any file.
bool m_UseGiMask
Generate GI-based mask files.
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
2 bit nucleic acid code
Definition: Seq_data_.hpp:106
vector< int > m_HaveBlob
List of blob columns that are active for this sequence.
int m_SeqLength
When a sequence is added, this will be populated with the length of that sequence.
void x_ClearHaveSequence()
Records that we no longer have unwritten sequence data.
TLinks & SetLinks(void)
Assign a value to Links data member.
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1153
void x_CookHeader()
Convert header data into usable forms.
TMemberships & SetMemberships(void)
Assign a value to Memberships data member.
void WriteInt1(int x)
Write a 1 byte integer to the blob.
Definition: seqdbblob.cpp:303
void x_CookSequence()
Convert sequence data into usable forms.
TOther_info & SetOther_info(void)
Assign a value to Other_info data member.
Data conversion tools for CWriteDB and associated code.
bool CanGetMol(void) const
Check if it is safe to call GetMol method.
Definition: Seq_inst_.hpp:584
static void s_CheckDuplicateIds(set< CWriteDB_IsamKey< T > *, CWriteDB_IsamKey_Compare< T > > &keys)
Check for duplicate ids across volumes.
static string ParseEscapes(const CTempString str, EEscSeqRange mode=eEscSeqRange_Standard, char user_char= '?')
Parse C-style escape sequences in the specified string.
Definition: ncbistr.cpp:4627
bool IsPir(void) const
Check if variant Pir is selected.
Definition: Seq_id_.hpp:853
vector< CRef< CSeq_id > > m_Ids
Ids for next sequence to write, for use during ISAM construction.
void WriteInt4_LE(Int4 x)
Definition: seqdbblob.cpp:363
IO_PREFIX::ostrstream CNcbiOstrstream
Portable alias for ostrstream.
Definition: ncbistre.hpp:155
#define _ASSERT
int x_GetMaskDataColumnId()
Get the mask data column id.
Use current time.
Definition: ncbitime.hpp:295
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:367
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:1564
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:326
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string...
Definition: ncbiexpt.hpp:546
void x_Publish()
Flush accumulated sequence data to volume.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:245
bool CanGetData(void) const
Check if it is safe to call GetData method.
void x_CookIds()
Collect ids for ISAM files.
CMaskInfoRegistry m_MaskAlgoRegistry
Registry for masking algorithms in this database.
bool operator()(const CWriteDB_IsamKey< T > *lhs, const CWriteDB_IsamKey< T > *rhs) const
Just write NUL bytes until aligned.
Definition: seqdbblob.hpp:271
bool IsUser(void) const
Check if variant User is selected.
Definition: Seqdesc_.hpp:1122
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:709
const TData & GetData(void) const
Get the Data member data.
CConstRef< CBioseq > m_Bioseq
Bioseq object for next sequence to write.
void WriteDB_IupacaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Iupacaa protein Seq-inst.
static bool ambig(char c)
This represents a set of masks for a given sequence.
Definition: writedb.hpp:63
std::istream & in(std::istream &in_, double &x_)
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:360
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:353
bool m_HaveSequence
True if we have a sequence to write.
const Tdata & Get(void) const
Get the member data.
int x_ComputeSeqLength()
Compute the length of the current sequence.
Definition: set.hpp:44
void WriteDB_EaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Eaa protein Seq-inst.
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:443
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
void WriteDB_Ncbi4naToBinary(const CSeq_inst &seqinst, string &seq, string &amb)
Build blast db nucleotide format from Ncbi4na Seq-inst.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:98
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:497
unsigned SeqDB_SequenceHash(const char *sequence, int length)
Returns a path minus filename.
Definition: seqdbobj.cpp:146
bool IsOrg(void) const
Check if variant Org is selected.
Definition: Seqdesc_.hpp:1046
IUPAC 1 letter amino acid code.
Definition: Seq_data_.hpp:105
void x_CookData()
Convert and compute final data formats.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
vector< CRef< CWriteDB_GiMask > > m_GiMasks
Gi-based masks.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:756
vector< CRef< CWriteDB_Volume > > m_VolumeList
List of all volumes so far, up to and including m_Volume.
bool CanGetData(void) const
Check if it is safe to call GetData method.
CWriteDB_Impl(const string &dbname, bool protein, const string &title, EIndexType indices, bool parse_ids, bool long_ids, bool use_gi_mask)
Constructor.
bool IsRegistered(int algo_id) const
Verify whether the provided algorithm ID has been registered with this object.
#define INT4_SIZE
string m_Sequence
Sequence data in format that will be written to disk.
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:305
string x_MakeAliasName()
Compute name of alias file produced.
static bool s_UseFastaReaderDeflines(CConstRef< CBioseq > &bioseq, CConstRef< CBlast_def_line_set > &deflines, bool long_seqid)
EIndexType m_Indices
Indexing mode.
Modified on Wed Aug 16 06:04:52 2017 by modify_doxy.py rev. 533848