NCBI C++ ToolKit
segmasker.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: segmasker.cpp 71171 2016-02-16 16:55:37Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file segmasker.cpp
31  * SEG filtering application
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 
37 // Objects includes
39 
40 // Filtering applications IO
50 
51 // Object manager includes
53 #include <objmgr/bioseq_handle.hpp>
54 
55 #include <algo/segmask/segmask.hpp>
56 
57 #ifndef SKIP_DOXYGEN_PROCESSING
60 #endif /* SKIP_DOXYGEN_PROCESSING */
61 
62 /////////////////////////////////////////////////////////////////////////////
63 // SegMaskerApplication::
64 
65 
// Body of the SegMaskerApplication class: it overrides the Init/Run/Exit
// hooks declared below and carries the usage text in USAGE_LINE.
// NOTE(review): listing lines 66, 70-71, 85 and 87 are absent from this
// extract (presumably the class header, the constructor signature and the
// x_GetReader/x_GetWriter declarations) - confirm against the repository.
67 {
68 public:
69  /// Application constructor
// Registers the application version as 1.0.0 and hands it to SetFullVersion().
72  version->SetVersionInfo(1, 0, 0);
73  SetFullVersion(version);
74  }
75 
76 private:
77  /** @inheritDoc */
78  virtual void Init(void);
79  /** @inheritDoc */
80  virtual int Run(void);
81  /** @inheritDoc */
82  virtual void Exit(void);
83 
84  /// Retrieves the sequence reader interface for the application
86  /// Retrieves the output writer interface for the application
88 
89  /// Contains the description of this application
// Defined out of class below; Init() passes it to SetUsageContext().
90  static const char * const USAGE_LINE;
91 };
92 
93 /////////////////////////////////////////////////////////////////////////////
94 // Init: usage text and command-line argument descriptions
95 
// One-line program description; Init() passes it to SetUsageContext() as the
// usage text for this application.
96 const char * const SegMaskerApplication::USAGE_LINE
97  = "Low complexity region masker based on the SEG algorithm";
98 
// Body of Init(): builds the command-line argument descriptions (input/output
// files and formats, the parse_seqids flag, and the SEG window/locut/hicut
// parameters) and registers them with SetupArgDescriptions().
// NOTE(review): several listing lines are absent from this extract (99, 101,
// and the trailing arguments of each AddDefaultKey call), so argument types
// and default values cannot be read from here.
100 {
102 
103  // Create command-line argument descriptions class
104  auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
105 
106  // Specify USAGE context
107  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
108  USAGE_LINE);
109 
110  arg_desc->SetCurrentGroup("Input/output options");
111  arg_desc->AddDefaultKey(kInput, "input_file_name",
112  "input file name",
114  arg_desc->AddDefaultKey(kOutput, "output_file_name",
115  "output file name",
117  arg_desc->AddDefaultKey(kInputFormat, "input_format",
118  "controls the format of the masker input",
// Restrict the input format to the names listed in kInputFormats.
// NOTE(review): the raw pointer is handed to SetConstraint(); presumably the
// descriptions object takes ownership - confirm.
120  CArgAllow_Strings* strings_allowed = new CArgAllow_Strings();
121  for (size_t i = 0; i < kNumInputFormats; i++) {
122  strings_allowed->Allow(kInputFormats[i]);
123  }
124  arg_desc->SetConstraint(kInputFormat, strings_allowed);
125  arg_desc->AddFlag ( "parse_seqids",
126  "Parse Seq-ids in FASTA input", true );
127 
128  arg_desc->AddDefaultKey(kOutputFormat, "output_format",
129  "controls the format of the masker output",
// Same constraint pattern for the output format, using kOutputFormats; the
// pointer variable is reused for a freshly allocated constraint object.
131  strings_allowed = new CArgAllow_Strings();
132  for (size_t i = 0; i < kNumOutputFormats; i++) {
133  strings_allowed->Allow(kOutputFormats[i]);
134  }
135  arg_desc->SetConstraint(kOutputFormat, strings_allowed);
136 
137  arg_desc->SetCurrentGroup("SEG algorithm options");
138  arg_desc->AddDefaultKey("window", "integer_value", "SEG window",
141  arg_desc->AddDefaultKey("locut", "float_value", "SEG locut",
144  arg_desc->AddDefaultKey("hicut", "float_value", "SEG hicut",
147 
148  // Setup arg.descriptions for this application
// release() gives up the auto_ptr's ownership before the pointer is passed on.
149  SetupArgDescriptions(arg_desc.release());
150 }
151 
154 {
155  const CArgs& args = GetArgs();
156  const string& format(args[kInputFormat].AsString());
157  CMaskReader* retval = NULL;
158 
159  if (format == "fasta") {
160  CNcbiIstream& input = args[kInput].AsInputFile();
161  retval = new CMaskFastaReader(input, false, args["parse_seqids"]);
162  } else if (format == "blastdb") {
163  retval = new CMaskBDBReader(args[kInput].AsString(), false);
164  } else {
165  _ASSERT("Unknown input format" == 0);
166  }
167  return retval;
168 }
169 
// Body of x_GetWriter(): picks the output writer that matches the
// output-format argument and returns it as a heap-allocated object (Run()
// stores the result in an auto_ptr). Throws runtime_error for a format name
// that none of the branches recognizes.
// NOTE(review): listing lines 170-171, 193-194 and 199-200 are absent from
// this extract, so the function signature and the trailing arguments of the
// CMaskWriterBlastDbMaskInfo constructor calls are not visible here.
172 {
173  const CArgs& args = GetArgs();
174  const string& format(args[kOutputFormat].AsString());
175  CMaskWriter* retval = NULL;
176 
177  if (format == "interval") {
178  CNcbiOstream& output = args[kOutput].AsOutputFile();
179  retval = new CMaskWriterInt(output);
180  } else if (format == "fasta") {
181  CNcbiOstream& output = args[kOutput].AsOutputFile();
182  retval = new CMaskWriterFasta(output);
// The binary-format prefix must be tested before the general "seqloc_"
// prefix below, because both are StartsWith() checks and the general one
// would match first otherwise. Binary formats open the output with fBinary.
183  } else if (NStr::StartsWith(format, "seqloc_asn1_binary")) {
184  CNcbiOstream& output = args[kOutput].AsOutputFile(CArgValue::fBinary);
185  retval = new CMaskWriterSeqLoc(output, format);
186  } else if (NStr::StartsWith(format, "seqloc_")) {
187  CNcbiOstream& output = args[kOutput].AsOutputFile();
188  retval = new CMaskWriterSeqLoc(output, format);
// Same ordering rule: the "maskinfo_asn1_bin" prefix is checked before
// the general "maskinfo_" prefix.
189  } else if (NStr::StartsWith(format, "maskinfo_asn1_bin")) {
190  CNcbiOstream& output = args[kOutput].AsOutputFile(CArgValue::fBinary);
191  retval =
192  new CMaskWriterBlastDbMaskInfo(output, format, 1,
195  } else if (NStr::StartsWith(format, "maskinfo_")) {
196  CNcbiOstream& output = args[kOutput].AsOutputFile();
197  retval =
198  new CMaskWriterBlastDbMaskInfo(output, format, 1,
201  } else {
202  throw runtime_error("Unknown output format");
203  }
204  return retval;
205 }
206 
207 /////////////////////////////////////////////////////////////////////////////
208 // Run: mask every input sequence and write out the masked locations
209 
210 
// Body of Run(): constructs a CSegMasker from the window/locut/hicut
// arguments, then for every sequence delivered by the reader computes the
// masks and prints them through the writer.
// Returns 0 on success, or 1 after a CException was caught and reported on
// cerr.
// NOTE(review): listing lines 211, 218 and 238 are absent from this extract,
// so the function signature, the definition of `objmgr` and the initializer
// of `sequence_data` are not visible here.
212 {
213  int retval = 0;
214  const CArgs& args = GetArgs();
215 
216  try {
217 
219 
220  CSegMasker masker(args["window"].AsInteger(),
221  args["locut"].AsDouble(),
222  args["hicut"].AsDouble());
223 
224  CRef<CSeq_entry> seq_entry;
225  auto_ptr<CMaskReader> reader(x_GetReader());
226  auto_ptr<CMaskWriter> writer(x_GetWriter());
227 
// Loop until the reader hands back an empty CRef.
228  while ( (seq_entry = reader->GetNextSequence()).NotEmpty() ) {
229 
230  // Allow skipping of oid
231  if(seq_entry->Which() == CSeq_entry::e_not_set)
232  continue;
233 
// A new CScope is created for each sequence and the entry is added to it.
234  CScope scope(*objmgr);
235  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*seq_entry);
236  CBioseq_Handle bioseq_handle = seh.GetSeq();
237  CSeqVector sequence_data =
// The mask list returned by the masker is held in an auto_ptr.
// NOTE(review): this call goes through GetArgs() again although the local
// `args` reference is already in scope.
239  auto_ptr<CSegMasker::TMaskList> masks(masker(sequence_data));
240  writer->Print(bioseq_handle, *masks, GetArgs()["parse_seqids"]);
241  // writer->Print(bioseq_handle, *masks);
242 
243  }
244 
// Only CException is handled here; any other exception type (for example the
// runtime_error thrown by x_GetWriter) propagates to the caller of Run().
245  } catch (const CException& e) {
246  cerr << e.what() << endl;
247  retval = 1;
248  }
249 
250  return retval;
251 }
252 
253 
254 /////////////////////////////////////////////////////////////////////////////
255 // Cleanup
256 
257 
// Body of Exit(): passes a null stream pointer to SetDiagStream().
// NOTE(review): listing line 258 (the function signature) is absent from this
// extract.
259 {
260  SetDiagStream(0);
261 }
262 
263 
264 /////////////////////////////////////////////////////////////////////////////
265 // MAIN
266 
267 
268 #ifndef SKIP_DOXYGEN_PROCESSING
269 int main(int argc, const char* argv[])
270 {
271  // Execute main application function
272  return SegMaskerApplication().AppMain(argc, argv);
273 }
274 #endif /* SKIP_DOXYGEN_PROCESSING */
275 
CBioseq_Handle –.
const int kSegWindow
Window that SEG examines at once.
Definition: blast_seg.c:45
const char * kInputFormats[]
Input formats allowed, the first one is the default.
const string version
version string
Definition: variables.hpp:66
const std::string kInput
Command line flag to specify the input.
Class for reading sequences from fasta files.
void SetFullVersion(CRef< CVersion > version)
Set version data for the program.
Definition: ncbiapp.cpp:905
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:926
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:568
int main(int argc, const char *argv[])
Definition: segmasker.cpp:269
Hide log file description.
Definition: ncbiapp.hpp:402
USING_SCOPE(objects)
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_entry_.hpp:228
const double kSegLocut
Locut parameter for SEG.
Definition: blast_seg.c:46
Hide configuration file description.
Definition: ncbiapp.hpp:403
Class for reading sequences from BLAST databases.
#define NULL
Definition: ncbistd.hpp:225
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:611
virtual const char * what(void) const
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:324
CMaskWriter * x_GetWriter()
Retrieves the output writer interface for the application.
Definition: segmasker.cpp:171
const char * kOutputFormats[]
Output formats allowed, the first one is the default.
Set coding to binary coding (Ncbi4na or Ncbistdaa)
ESerialDataFormat format
CVersion –.
Definition: version.hpp:247
int i
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:479
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5079
virtual int Run(void)
Definition: segmasker.cpp:211
const double kSegHicut
Hicut parameter for SEG.
Definition: blast_seg.c:47
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:566
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:7360
virtual void Init(void)
Definition: segmasker.cpp:99
Contains the command line options common to filtering algorithms.
const std::string kOutputFormat
Command line flag to specify the output format.
CSeqVector –.
Definition: seq_vector.hpp:64
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
This class encapsulates the SEG filtering algorithm.
Definition: segmask.hpp:46
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:142
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1061
CSeq_entry_Handle –.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:190
The Object manager core.
Output filter to print masked sequence locations as Blast-db-mask-info objects.
Hide version description.
Definition: ncbiapp.hpp:404
virtual void Exit(void)
Definition: segmasker.cpp:258
SegMaskerApplication()
Application constructor.
Definition: segmasker.cpp:70
CArgDescriptions –.
Definition: ncbiargs.hpp:514
An arbitrary string.
Definition: ncbiargs.hpp:563
CException –.
Definition: ncbiexpt.hpp:709
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
Definition: ncbiapp.hpp:681
CScope –.
Definition: scope.hpp:90
CArgs –.
Definition: ncbiargs.hpp:356
No variant selected.
Definition: Seq_entry_.hpp:88
Name of file (must be writable)
Definition: ncbiargs.hpp:569
Virtual base class for all input readers.
Definition: mask_reader.hpp:49
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
USING_NCBI_SCOPE
Definition: segmasker.cpp:58
const size_t kNumOutputFormats
Number of elements in kOutputFormats.
void SetVersionInfo(int ver_major, int ver_minor, int patch_level=0, const string &ver_name=kEmptyStr)
Set version information.
Definition: version.cpp:580
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CNcbiApplication –.
Definition: ncbiapp.hpp:120
Output filter to print masked sequences as sets of intervals.
#define _ASSERT
Output filter to write masked data in fasta format.
const size_t kNumInputFormats
Number of elements in kInputFormats.
CArgAllow_Strings * Allow(const string &value)
Add allowed string values.
Definition: ncbiargs.cpp:4329
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5182
const std::string kInputFormat
Command line flag to specify the input format.
string BuildAlgorithmParametersString(const CArgs &args)
Builds an algorithm options string for the filtering applications (segmasker, dustmasker) by examinin...
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:567
static const char *const USAGE_LINE
Contains the description of this application.
Definition: segmasker.cpp:90
Hide dryrun description.
Definition: ncbiapp.hpp:406
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:139
Open file in binary mode.
Definition: ncbiargs.hpp:239
const std::string kOutput
Command line flag to specify the output.
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1570
Output filter to print masked sequence locations as NCBI Seq-loc objects.
CMaskReader * x_GetReader()
Retrieves the sequence reader interface for the application.
Definition: segmasker.cpp:153
TSeq GetSeq(void) const
A base class for winmasker output writers.
Definition: mask_writer.hpp:51
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
static int input()
Modified on Sun Jun 25 17:51:09 2017 by modify_doxy.py rev. 533848