NCBI C++ ToolKit
alnmrg.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: alnmrg.cpp 40800 2009-01-22 00:07:02Z todorov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kamen Todorov, NCBI
27 *
28 * File Description:
29 * Alignment merger. Demonstration of CAlnMix usage.
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <corelib/ncbiargs.hpp>
36 #include <corelib/ncbienv.hpp>
37 
38 #include <serial/iterator.hpp>
39 #include <serial/objistr.hpp>
40 #include <serial/objostr.hpp>
41 #include <serial/serial.hpp>
42 
43 #include <objects/seq/Bioseq.hpp>
47 
53 
56 #include <objmgr/scope.hpp>
57 #include <objmgr/seq_vector.hpp>
58 
61 
65 
66 #include <common/test_assert.h> /* This header must go last */
67 
70 
72 {
73  virtual void Init (void);
74  virtual int Run (void);
75  CScope& GetScope (void) const;
76  void SetOptions (void);
77  void LoadInputAlns (void);
78  void PrintMergedAln(void);
79  void ViewMergedAln (void);
80  void LoadSeqEntry (CNcbiIstream& is);
81  void LoadFasta (CNcbiIstream& is);
82  void LoadBlastDb (const string& db);
// Per-alignment callback: LoadInputAlns() binds this method and passes it
// to CAlnAsnReader::Read().
// Dereferences aln (no null check is made here) and forwards it to
// m_Mix->Add() together with the m_AddFlags member.
// Always returns true.
83  bool AddAlnToMix (const CSeq_align* aln) {
84  m_Mix->Add(*aln, m_AddFlags);
85  return true;
86  }
87 
88 private:
93  CRef<CAlnMix> m_Mix; // must appear AFTER m_ObjMgr!
94 };
95 
96 
98  public CObject,
100 {
101 public:
// Writes the task name followed by "..." and a newline to cerr
// (endl also flushes the stream).
102  virtual void SetTaskName (const string& name)
103  {
104  cerr << name << "..." << endl;
105  }; // the ';' after the body is redundant
// Writes "<completed> out of <m_Total>" and a newline to cerr.
// NOTE(review): m_Total is only assigned in SetTaskTotal(); no constructor
// or initializer for it is visible in this class, so the printed total
// looks indeterminate if this is called before SetTaskTotal() -- confirm.
106  virtual void SetTaskCompleted (int completed)
107  {
108  cerr << completed << " out of " << m_Total << endl;
109  }
// Stores the total in m_Total; SetTaskCompleted() prints it later.
110  virtual void SetTaskTotal (int total)
111  {
112  m_Total = total;
113  }
// Always returns false.
// NOTE(review): presumably false means "do not interrupt the running task"
// -- confirm against the callback interface this class implements.
114  virtual bool InterruptTask ()
115  {
116  return false;
117  }
118 private:
119  int m_Total;
120 };
121 
122 
124 {
125  // Create command-line argument descriptions class
126  auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
127 
128  // Specify USAGE context
129  arg_desc->SetUsageContext
130  (GetArguments().GetProgramBasename(),
131  "Alignment merger demo program");
132 
133  // Describe the expected command-line arguments
134  arg_desc->AddDefaultKey
135  ("in", "input_file_name",
136  "Name of file to read from (standard input by default)",
138 
139  arg_desc->AddDefaultKey
140  ("asnout", "asn_out_file_name",
141  "Text ASN output",
143 
144  arg_desc->AddOptionalKey
145  ("asnoutb", "asn_out_file_name_b",
146  "Text ASN output, to a file opened in binary mode (for MS-Win tests)",
148 
149  arg_desc->AddDefaultKey
150  ("b", "bin_obj_type",
151  "This forced the input file to be read in binary ASN.1 mode\n"
152  "and specifies the type of the top-level ASN.1 object.\n",
154 
155  arg_desc->AddDefaultKey
156  ("bout", "bool",
157  "This forced the output file to be written in binary ASN.1 mode.\n",
159 
160  arg_desc->AddOptionalKey
161  ("log", "log_file_name",
162  "Name of log file to write to",
164 
165  arg_desc->AddDefaultKey
166  ("dsout", "bool",
167  "Output in Dense-seg format",
169 
170  arg_desc->AddDefaultKey
171  ("gapjoin", "bool",
172  "Consolidate segments of equal lens with a gap on the query sequence",
174 
175  arg_desc->AddDefaultKey
176  ("mingap", "bool",
177  "Consolidate all segments with a gap on the query sequence",
179 
180  arg_desc->AddDefaultKey
181  ("rmleadtrailgaps", "bool",
182  "Remove leading and trailing gaps",
184 
185  arg_desc->AddDefaultKey
186  ("minusstrand", "bool",
187  "Minus strand on the refseq when merging.",
189 
190  arg_desc->AddDefaultKey
191  ("fillunaln", "bool",
192  "Fill unaligned regions.",
194 
195  arg_desc->AddDefaultKey
196  ("calcscore", "bool",
197  "Calculate each aligned seq pair score and use it when merging."
198  "(Don't stitch off ObjMgr for this).",
200 
201  arg_desc->AddDefaultKey
202  ("sortseqsbyscore", "bool",
203  "Sort sequences by score.",
205 
206  arg_desc->AddDefaultKey
207  ("sortinputbyscore", "bool",
208  "Sort input by score.",
210 
211  arg_desc->AddDefaultKey
212  ("noobjmgr", "bool",
213  // ObjMgr is used to identify sequences and obtain a bioseqhandle.
214  // Also used to calc scores and determine the type of molecule
215  "Skip ObjMgr in identifying sequences, calculating scores, etc.",
217 
218  arg_desc->AddOptionalKey
219  ("se_in", "SeqEntryInputFile",
220  "An optional Seq-entry file to load a local top level seq entry from.",
222 
223  arg_desc->AddOptionalKey
224  ("fasta_in", "FastaFile",
225  "An optional FASTA file to load into ObjMgr's scope.",
227 
228  arg_desc->AddOptionalKey
229  ("blastdb", "BlastDb",
230  "Add an optional BLAST dataloader to ObjMgr's scope.",
232 
233  arg_desc->AddDefaultKey
234  ("queryseqmergeonly", "bool",
235  "Merge the query seq only, keep subject seqs on separate rows "
236  "(even if the same seq).",
238 
239  arg_desc->AddDefaultKey
240  ("truncateoverlaps", "bool",
241  "Truncate overlaps",
243 
244  arg_desc->AddDefaultKey
245  ("allowtranslocation", "bool",
246  "Allow translocation",
248 
249  arg_desc->AddDefaultKey
250  ("forcetranslation", "bool",
251  "Force translation of nucleotides",
253 
254  arg_desc->AddDefaultKey
255  ("preserverows", "bool",
256  "Preserve rows",
258 
259 
260  // Viewing args:
261  arg_desc->AddOptionalKey
262  ("v", "",
263  "View format:\n"
264  " 1. CSV table\n"
265  " 2. Print segments\n"
266  " 3. Print chunks\n"
267  " 4. Popset style using GetAlnSeqString\n"
268  " (memory efficient for large alns, but slower)\n"
269  " 5. Popset style using GetSeqString\n"
270  " (memory inefficient)\n"
271  " 6. Popset style using GetWholeAlnSeqString\n"
272  " (fastest, but memory inefficient)\n",
274 
275  arg_desc->AddOptionalKey
276  ("a", "AnchorRow",
277  "Anchor row (zero based)",
279 
280  arg_desc->AddDefaultKey
281  ("w", "ScreenWidth",
282  "Screen width for some of the viewers",
284 
285  arg_desc->AddDefaultKey
286  ("cf", "GetChunkFlags",
287  "Flags for GetChunks (CAlnMap::TGetChunkFlags)",
289 
290  arg_desc->AddDefaultKey
291  ("progress", "bool",
292  "Show progress feedback on stderr",
294 
295  // Setup arg.descriptions for this application
296  SetupArgDescriptions(arg_desc.release());
297 }
298 
299 
301 {
302  if (!m_Scope) {
305 
306  m_Scope = new CScope(*m_ObjMgr);
307  m_Scope->AddDefaults();
308  }
309  return *m_Scope;
310 }
311 
312 
314 {
315  const CArgs& args = GetArgs();
316 
317  if ( args["log"] ) {
318  SetDiagStream( &args["log"].AsOutputFile() );
319  } else {
321  }
322 
323  m_MergeFlags = 0;
324  m_AddFlags = 0;
325 
326  if (args["gapjoin"] && args["gapjoin"].AsBoolean()) {
328  }
329 
330  if (args["mingap"] && args["mingap"].AsBoolean()) {
332  }
333 
334  if (args["rmleadtrailgaps"] && args["rmleadtrailgaps"].AsBoolean()) {
336  }
337 
338  if (args["minusstrand"] && args["minusstrand"].AsBoolean()) {
340  }
341 
342  if (args["queryseqmergeonly"] && args["queryseqmergeonly"].AsBoolean()) {
344  }
345 
346  if (args["fillunaln"] && args["fillunaln"].AsBoolean()) {
348  }
349 
350  if (args["truncateoverlaps"] && args["truncateoverlaps"].AsBoolean()) {
352  }
353 
354  if (args["allowtranslocation"] && args["allowtranslocation"].AsBoolean()) {
356  }
357 
358  if (args["forcetranslation"] && args["forcetranslation"].AsBoolean()) {
360  }
361 
362  if (args["preserverows"] && args["preserverows"].AsBoolean()) {
364  }
365 
366  if (args["calcscore"] && args["calcscore"].AsBoolean()) {
368  }
369 
370  if (args["sortseqsbyscore"] && args["sortseqsbyscore"].AsBoolean()) {
372  }
373 
374  if (args["sortinputbyscore"] && args["sortinputbyscore"].AsBoolean()) {
376  }
377 
378  if ( !(args["noobjmgr"] && args["noobjmgr"].AsBoolean()) ) {
379  GetScope(); // first call creates the scope
380  if (args["se_in"]) {
381  LoadSeqEntry(args["se_in"].AsInputFile());
382  }
383  if (args["fasta_in"]) {
384  LoadFasta(args["fasta_in"].AsInputFile());
385  }
386  if (args["blastdb"]) {
387  LoadBlastDb(args["blastdb"].AsString());
388  }
389  }
390 }
391 
392 
394 {
395  string se_asn_type;
396  {{
397  auto_ptr<CObjectIStream> obj_is
399 
400  se_asn_type = obj_is->ReadFileHeader();
401  obj_is->Close();
402  is.seekg(0);
403  }}
404 
405  auto_ptr<CObjectIStream> obj_is
407 
408  if (se_asn_type == "Seq-entry") {
409  CRef<CSeq_entry> se (new CSeq_entry);
410  *obj_is >> *se;
412  } else {
413  NCBI_THROW(CAlnException, eInvalidRequest,
414  "se_in only accepts a Seq-entry should be supplied in a text asn.1 file.");
415  }
416 }
417 
418 
420 {
421  CFastaReader fasta_reader(is,
423  GetScope().AddTopLevelSeqEntry(*fasta_reader.ReadSet());
424 }
425 
426 
427 void CAlnMrgApp::LoadBlastDb(const string& dbname)
428 {
429  // Create a BLAST database data loader for dbname and register it with the OM.
430  // * NOTE(review): the last argument is presumably "eDefault", which would tell
431  // * the OM to include the loader in scopes during CScope::AddDefaults() -- confirm.
432 
433  GetScope(); /* make sure m_ObjMgr and m_Scope are created */
434  CDataLoader* blast_loader =
436  (*m_ObjMgr,
437  dbname,
439  true,
441 
442  _ASSERT(blast_loader);
443  GetScope().AddDataLoader(blast_loader->GetName());
444 }
445 
446 
448 {
449  const CArgs& args = GetArgs();
450  auto_ptr<CObjectOStream> asn_out
452  (args["bout"] && args["bout"].AsBoolean() ?
454  args["asnoutb"] ?
455  args["asnoutb"].AsOutputFile() : args["asnout"].AsOutputFile()));
456 
457  if (args["dsout"] && args["dsout"].AsBoolean()) {
458  *asn_out << m_Mix->GetDenseg();
459  } else {
460  *asn_out << m_Mix->GetSeqAlign();
461  }
462 }
463 
464 
466 {
467  const CArgs& args = GetArgs();
468 
469  int screen_width = args["w"].AsInteger();
470 
471  CAlnVec aln_vec(m_Mix->GetDenseg(), GetScope());
472  aln_vec.SetGapChar('-');
473  aln_vec.SetEndChar('.');
474  if (args["a"]) {
475  aln_vec.SetAnchor(args["a"].AsInteger());
476  }
477 
478  if (args["v"]) {
479  switch (args["v"].AsInteger()) {
480  case 1:
481  CAlnMapPrinter(aln_vec, NcbiCout).CsvTable();
482  break;
483  case 2:
484  CAlnMapPrinter(aln_vec, NcbiCout).Segments();
485  break;
486  case 3:
487  CAlnMapPrinter(aln_vec, NcbiCout).Chunks(args["cf"].AsInteger());
488  break;
489  case 4:
490  CAlnVecPrinter(aln_vec, NcbiCout)
492  break;
493  case 5:
494  CAlnVecPrinter(aln_vec, NcbiCout)
496  break;
497  case 6:
498  CAlnVecPrinter(aln_vec, NcbiCout)
499  .PopsetStyle(screen_width,
501  break;
502  default:
503  NcbiCout << "Unknown view format." << NcbiEndl;
504  }
505  }
506 }
507 
508 
510 {
511  const CArgs& args = GetArgs();
512  string sname = args["in"].AsString();
513 
514  // get the asn type of the top-level object
515  string asn_type = args["b"].AsString();
516  bool binary = !asn_type.empty();
517  auto_ptr<CObjectIStream> in
519 
520  CAlnAsnReader reader(&GetScope());
521  reader.Read(in.get(),
522  bind1st(mem_fun(&CAlnMrgApp::AddAlnToMix), this),
523  asn_type);
524 }
525 
526 
528 {
529  _TRACE("Run()");
530  const CArgs& args = GetArgs();
531 
532  SetOptions();
533 
534  m_Mix = m_Scope ? new CAlnMix(GetScope()) : new CAlnMix();
535  CRef<CAlnMrgTaskProgressCallback> progress_callback;
536  if (args["progress"] && args["progress"].AsBoolean()) {
537  progress_callback.Reset(new CAlnMrgTaskProgressCallback);
538  }
539  m_Mix->SetTaskProgressCallback(progress_callback.GetPointerOrNull());
540  LoadInputAlns();
541 
543 
544  PrintMergedAln();
545  if ( args["v"] ) {
546  ViewMergedAln();
547  }
548  return 0;
549 }
550 
551 
552 /////////////////////////////////////////////////////////////////////////////
553 // MAIN
554 
555 
// Program entry point: constructs a temporary CAlnMrgApp and returns the
// value of its AppMain() call as the process exit status.
// The trailing arguments are passed explicitly: 0 for envp, eDS_Default
// for the diagnostic stream, and 0 for conf.
// NOTE(review): conf is passed as 0 rather than left at its declared
// default (NcbiEmptyCStr); presumably this suppresses configuration-file
// loading -- confirm against the CNcbiApplication::AppMain documentation.
556 int main(int argc, const char* argv[])
557 {
558  // Execute main application function
559  return CAlnMrgApp().AppMain(argc, argv, 0, eDS_Default, 0);
560 }
static CObjectOStream * Open(ESerialDataFormat format, CNcbiOstream &outStream, bool deleteOutStream)
Create serial object writer and attach it to an output stream.
Definition: objostr.cpp:126
CRef< CAlnMix > m_Mix
Definition: alnmrg.cpp:93
void AddDataLoader(const string &loader_name, TPriority pri=kPriority_Default)
Add data loader by name.
Definition: scope.cpp:476
#define NcbiCerr
Definition: ncbistre.hpp:399
const CSeq_align & GetSeqAlign(void) const
Definition: alnmix.cpp:302
void PopsetStyle(int scrn_width=70, EAlgorithm algorithm=eUseAlnSeqString)
Printing methods.
Definition: alnvecprint.cpp:87
USING_SCOPE(ncbi)
CScope & GetScope(void) const
Definition: alnmrg.cpp:300
#define dbname
Defines command line argument related classes.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:948
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:568
virtual void SetTaskTotal(int total)
Definition: alnmrg.cpp:110
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:470
Data loader implementation that uses the blast databases.
ASN.1 text.
Definition: serialdef.hpp:73
CAlnMix::TMergeFlags m_MergeFlags
Definition: alnmrg.cpp:89
int main(int argc, const char *argv[])
Definition: alnmrg.cpp:556
#define NcbiCout
Definition: ncbistre.hpp:398
#define NcbiEndl
Definition: ncbistre.hpp:403
virtual void SetTaskCompleted(int completed)
Definition: alnmrg.cpp:106
void ViewMergedAln(void)
Definition: alnmrg.cpp:465
Defines unified interface to application:
void SetTaskProgressCallback(ITaskProgressCallback *callback)
Hook a callback to a task.
Helper class for reading seq-align objects from a CObjectIStream.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:628
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:488
CRef< CSeq_entry > ReadSet(int max_seqs=kMax_Int, ILineErrorListener *pMessageListener=0)
Read multiple sequences (by default, as many as are available.)
Definition: fasta.cpp:482
TObjectType * GetPointerOrNull(void) THROWS_NONE
Get pointer value.
Definition: ncbiobj.hpp:977
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:243
void SetOptions(void)
Definition: alnmrg.cpp:313
Open as binary file; for eInputFile, eOutputFile, eIOFile.
Definition: ncbiargs.hpp:593
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:566
void CsvTable(char delim= ',')
Printing methods.
Definition: alnmapprint.cpp:92
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:7794
void LoadFasta(CNcbiIstream &is)
Definition: alnmrg.cpp:419
CRef< CScope > m_Scope
Definition: alnmrg.cpp:92
void PrintMergedAln(void)
Definition: alnmrg.cpp:447
virtual void SetTaskName(const string &name)
Definition: alnmrg.cpp:102
bool AddAlnToMix(const CSeq_align *aln)
Definition: alnmrg.cpp:83
Operators to edit gaps in sequences.
Task clients implement this callback interface.
CAlnMix::TAddFlags m_AddFlags
Definition: alnmrg.cpp:90
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
Definition: ncbidiag.hpp:1763
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:200
virtual bool InterruptTask()
Definition: alnmrg.cpp:114
The Object manager core.
void Chunks(CAlnMap::TGetChunkFlags flags=CAlnMap::fAlnSegsOnly)
int TMergeFlags
Definition: alnmix.hpp:114
CArgDescriptions –.
Definition: ncbiargs.hpp:514
Magic spell ;-) needed for some weird compilers... very empiric.
An arbitrary string.
Definition: ncbiargs.hpp:563
Open file right away; for eInputFile, eOutputFile, eIOFile.
Definition: ncbiargs.hpp:591
string GetName(void) const
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
Definition: ncbiapp.hpp:692
virtual void Init(void)
Initialize the application.
Definition: alnmrg.cpp:123
Base class for reading FASTA sequences.
Definition: fasta.hpp:78
CScope –.
Definition: scope.hpp:90
CArgs –.
Definition: ncbiargs.hpp:356
Name of file (must be writable)
Definition: ncbiargs.hpp:569
void Read(CObjectIStream *obj_in_stream, TCallback callback, const string &top_level_asn_object=kEmptyStr)
Read all seq-align objects from the stream.
Definition: Seq_entry.hpp:55
CObject –.
Definition: ncbiobj.hpp:180
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
void LoadInputAlns(void)
Definition: alnmrg.cpp:509
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CNcbiApplication –.
Definition: ncbiapp.hpp:120
CRef< CObjectManager > m_ObjMgr
Definition: alnmrg.cpp:91
void LoadSeqEntry(CNcbiIstream &is)
Definition: alnmrg.cpp:393
Reject deflines that lack IDs.
Definition: fasta.hpp:93
#define _ASSERT
int TAddFlags
Definition: alnmix.hpp:82
void Add(const CDense_seg &ds, TAddFlags flags=0)
Definition: alnmix.cpp:120
nucleotide database
Definition: bdbloader.hpp:57
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string...
Definition: ncbiexpt.hpp:547
virtual int Run(void)
Run the application.
Definition: alnmrg.cpp:527
void SetGapChar(TResidue gap_char)
Definition: alnvec.hpp:339
std::istream & in(std::istream &in_, double &x_)
memory efficient, recommended for large alns
Definition: alnvec.hpp:214
const CDense_seg & GetDenseg(void) const
Definition: alnmix.cpp:295
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:139
#define _TRACE(message)
Definition: ncbidbg.hpp:120
{'true', 't', 'false', 'f'}, case-insensitive
Definition: ncbiargs.hpp:564
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, const string &dbname="nr", const EDbType dbtype=eUnknown, bool use_fixed_size_slices=true, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: bdbloader.cpp:52
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:768
TLoader * GetLoader(void) const
Get pointer to the loader.
User-defined methods of the data storage class.
ASN.1 binary.
Definition: serialdef.hpp:74
void Merge(TMergeFlags flags=0)
Definition: alnmix.cpp:273
void LoadBlastDb(const string &db)
Definition: alnmrg.cpp:427
Modified on Wed Apr 25 11:19:27 2018 by modify_doxy.py rev. 546573