NCBI C++ ToolKit
win_mask_gen_counts.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: win_mask_gen_counts.cpp 69832 2015-11-16 19:29:56Z mozese2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Implementation of CWinMaskCountsGenerator class.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <stdlib.h>
35 
36 #include <vector>
37 #include <sstream>
38 
39 #include <objects/seq/Bioseq.hpp>
40 #include <objects/seq/Seq_inst.hpp>
41 #include <objects/seq/Seq_data.hpp>
43 #include <objects/seq/IUPACna.hpp>
44 
46 #include <objmgr/scope.hpp>
48 #include <objmgr/bioseq_ci.hpp>
49 #include <objmgr/seq_vector.hpp>
50 
52 
57 
60 
61 //------------------------------------------------------------------------------
62 static Uint4 letter( char c )
63 {
64  switch( c )
65  {
66  case 'a': case 'A': return 0;
67  case 'c': case 'C': return 1;
68  case 'g': case 'G': return 2;
69  case 't': case 'T': return 3;
70  default: return 0;
71  }
72 }
73 
74 //------------------------------------------------------------------------------
// Report whether a character is anything other than one of the four
// unambiguous nucleotide letters A, C, G, T (in either case).
static inline bool ambig( char c )
{
    switch( c )
    {
        case 'a': case 'A':
        case 'c': case 'C':
        case 'g': case 'G':
        case 't': case 'T':
            return false;

        default:
            return true;
    }
}
80 
81 #if 0
82 //------------------------------------------------------------------------------
83 string mkdata( const CSeq_entry & entry )
84 {
85  const CBioseq & bioseq( entry.GetSeq() );
86 
87  if( bioseq.CanGetInst()
88  && bioseq.GetInst().CanGetLength()
89  && bioseq.GetInst().CanGetSeq_data() )
90  {
91  TSeqPos len( bioseq.GetInst().GetLength() );
92  const CSeq_data & seqdata( bioseq.GetInst().GetSeq_data() );
93  auto_ptr< CSeq_data > dest( new CSeq_data );
94  CSeqportUtil::Convert( seqdata, dest.get(), CSeq_data::e_Iupacna,
95  0, len );
96  return dest->GetIupacna().Get();
97  }
98 
99  return string( "" );
100 }
101 #endif
102 
103 //------------------------------------------------------------------------------
104 Uint8 CWinMaskCountsGenerator::fastalen( const string & fname ) const
105 {
106  Uint8 result = 0;
107 
108  for(CWinMaskUtil::CInputBioseq_CI bs_iter(fname, infmt); bs_iter; ++bs_iter)
109  {
110  CBioseq_Handle bsh = *bs_iter;
111 
112  if( CWinMaskUtil::consider( bsh, ids, exclude_ids ) )
113  result += bsh.GetBioseqLength();
114  }
115 
116  return result;
117 }
118 
119 //------------------------------------------------------------------------------
121 { return CSeqMaskerUtil::reverse_complement( seq, size ); }
122 
123 //------------------------------------------------------------------------------
125  const string & arg_input,
126  CNcbiOstream & os,
127  const string & infmt_arg,
128  const string & sformat,
129  const string & arg_th,
130  Uint4 mem_avail,
131  Uint1 arg_unit_size,
132  Uint8 arg_genome_size,
133  Uint4 arg_min_count,
134  Uint4 arg_max_count,
135  bool arg_check_duplicates,
136  bool arg_use_list,
137  const CWinMaskUtil::CIdSet * arg_ids,
138  const CWinMaskUtil::CIdSet * arg_exclude_ids,
139  bool use_ba, string const & metadata )
140 : input( arg_input ),
141  ustat( CSeqMaskerOstatFactory::create(
142  sformat, os, use_ba, metadata ) ),
143  max_mem( mem_avail*1024*1024ULL ), unit_size( arg_unit_size ),
144  genome_size( arg_genome_size ),
145  min_count( arg_min_count == 0 ? 1 : arg_min_count ),
146  max_count( 500 ),
147  t_high( arg_max_count ),
148  has_min_count( arg_min_count != 0 ),
149  no_extra_pass( arg_min_count != 0 && arg_max_count != 0 ),
150  check_duplicates( arg_check_duplicates ),use_list( arg_use_list ),
151  total_ecodes( 0 ),
152  score_counts( max_count, 0 ),
153  ids( arg_ids ), exclude_ids( arg_exclude_ids ),
154  infmt( infmt_arg )
155 {
156  // Parse arg_th to set up th[].
157  string::size_type pos( 0 );
158  Uint1 count( 0 );
159 
160  while( pos != string::npos && count < 4 )
161  {
162  string::size_type newpos = arg_th.find_first_of( ",", pos );
163  th[count++] = atof( arg_th.substr( pos, newpos - pos ).c_str() );
164  pos = (newpos == string::npos ) ? newpos : newpos + 1;
165  }
166 }
167 
168 //------------------------------------------------------------------------------
170  const string & arg_input,
171  const string & output,
172  const string & infmt_arg,
173  const string & sformat,
174  const string & arg_th,
175  Uint4 mem_avail,
176  Uint1 arg_unit_size,
177  Uint8 arg_genome_size,
178  Uint4 arg_min_count,
179  Uint4 arg_max_count,
180  bool arg_check_duplicates,
181  bool arg_use_list,
182  const CWinMaskUtil::CIdSet * arg_ids,
183  const CWinMaskUtil::CIdSet * arg_exclude_ids,
184  bool use_ba, string const & metadata )
185 : input( arg_input ),
186  ustat( CSeqMaskerOstatFactory::create(
187  sformat, output, use_ba, metadata ) ),
188  max_mem( mem_avail*1024*1024ULL ), unit_size( arg_unit_size ),
189  genome_size( arg_genome_size ),
190  min_count( arg_min_count == 0 ? 1 : arg_min_count ),
191  max_count( 500 ),
192  t_high( arg_max_count ),
193  has_min_count( arg_min_count != 0 ),
194  no_extra_pass( arg_min_count != 0 && arg_max_count != 0 ),
195  check_duplicates( arg_check_duplicates ),use_list( arg_use_list ),
196  total_ecodes( 0 ),
197  score_counts( max_count, 0 ),
198  ids( arg_ids ), exclude_ids( arg_exclude_ids ),
199  infmt( infmt_arg )
200 {
201  // Parse arg_th to set up th[].
202  string::size_type pos( 0 );
203  Uint1 count( 0 );
204 
205  while( pos != string::npos && count < 4 )
206  {
207  string::size_type newpos = arg_th.find_first_of( ",", pos );
208  th[count++] = atof( arg_th.substr( pos, newpos - pos ).c_str() );
209  pos = (newpos == string::npos ) ? newpos : newpos + 1;
210  }
211 }
212 
213 //------------------------------------------------------------------------------
215 
216 //------------------------------------------------------------------------------
218 {
219  // Generate a list of files to process.
220  vector< string > file_list;
221 
222  if( !use_list ) {
223  NStr::Split(input, ",", file_list);
224  } else {
225  string line;
226  CNcbiIfstream fl_stream( input.c_str() );
227 
228  while( getline( fl_stream, line ) ) {
229  if( !line.empty() ) {
230  file_list.push_back( line );
231  }
232  }
233  }
234 
235  // Check for duplicates, if necessary.
236  if( check_duplicates )
237  {
238  CheckDuplicates( file_list, infmt, ids, exclude_ids );
239  }
240 
241  if( unit_size == 0 )
242  {
243  if( genome_size == 0 )
244  {
245  LOG_POST( "computing the genome length" );
246  Uint8 total = 0;
247 
248  for( vector< string >::const_iterator i = file_list.begin();
249  i != file_list.end(); ++i )
250  {
251  total += fastalen( *i );
252  }
253 
254  genome_size = total;
255 
256  if( genome_size == 0 ) {
257  NCBI_THROW( GenCountsException, eNullGenome, "" );
258  }
259  }
260 
261  for( unit_size = 15; unit_size > 0; --unit_size ) {
262  if( (genome_size>>(2*unit_size)) >= 5 ) {
263  break;
264  }
265  }
266 
267  ++unit_size;
268  _TRACE( "unit size is: " << unit_size );
269  }
270 
271  // Estimate the length of the prefix.
272  // Prefix length is unit_size - suffix length, where suffix length
273  // is max N: (4**N) < max_mem.
274  Uint1 prefix_size( 0 ), suffix_size( unit_size );
275  Uint8 n_units( max_mem/sizeof( Uint4 ) );
276 
277  while( suffix_size > 0 ) {
278  Uint8 units_needed( 1<<(2*suffix_size) );
279  if( units_needed <= n_units ) break;
280  --suffix_size;
281  }
282 
283  NCBI_ASSERT( suffix_size > 0, "suffix size is 0" );
284  prefix_size = unit_size - suffix_size;
286 
287  // Now process for each prefix.
288  Uint4 prefix_exp( 1<<(2*prefix_size) );
289  Uint4 passno = 1;
290  LOG_POST( "pass " << passno );
291 
292  for( Uint4 prefix( 0 ); prefix < prefix_exp; ++prefix ) {
293  process( prefix, prefix_size, file_list, no_extra_pass );
294  }
295 
296  ++passno;
297 
298  // Now put the final statistics as comments at the end of the output.
299  for( Uint4 i( 1 ); i < max_count; ++i )
300  score_counts[i] += score_counts[i-1];
301 
302  Uint4 offset( total_ecodes - score_counts[max_count - 1] );
303  Uint4 index[4] = {0, 0, 0, 0};
304  double previous( 0.0 );
305  double current;
306 
307  if( no_extra_pass )
308  {
309  ostringstream s;
310  s << " " << total_ecodes << " ecodes";
311  ustat->setComment( s.str() );
312  }
313 
314  for( Uint4 i( 1 ); i <= max_count; ++i )
315  {
316  current = 100.0*(((double)(score_counts[i - 1] + offset))
317  /((double)total_ecodes));
318 
319  if( no_extra_pass )
320  {
321  ostringstream s;
322  s << " " << dec << i << "\t" << score_counts[i - 1] + offset << "\t"
323  << current;
324  ustat->setComment( s.str() );
325  }
326 
327  for( Uint1 j( 0 ); j < 4; ++j )
328  if( previous < th[j] && current >= th[j] )
329  index[j] = i;
330 
331  previous = current;
332  }
333 
334  // If min_count or t_high must be deduced do it and reprocess.
335  if( !no_extra_pass )
336  {
337  total_ecodes = 0;
338 
339  if( !has_min_count )
340  min_count = index[0];
341 
342  if( t_high == 0 )
343  t_high = index[3];
344 
345  if( min_count == 0 )
346  min_count = 1;
347 
348  for( Uint4 i( 0 ); i < max_count; ++i )
349  score_counts[i] = 0;
350 
351  LOG_POST( "pass " << passno );
352 
353  for( Uint4 prefix( 0 ); prefix < prefix_exp; ++prefix )
354  process( prefix, prefix_size, file_list, true );
355 
356  for( Uint4 i( 1 ); i < max_count; ++i )
357  score_counts[i] += score_counts[i-1];
358 
359  offset = total_ecodes - score_counts[max_count - 1];
360 
361  {
362  ostringstream s;
363  s << " " << total_ecodes << " ecodes";
364  ustat->setComment( s.str() );
365  }
366 
367  for( Uint4 i( 1 ); i <= max_count; ++i )
368  {
369  current
370  = 100.0*(((double)(score_counts[i - 1] + offset))
371  /((double)total_ecodes));
372  ostringstream s;
373  s << " " << dec << i << "\t" << score_counts[i - 1] + offset << "\t"
374  << current;
375  ustat->setComment( s.str() );
376  }
377  }
378 
379  ustat->setComment( "" );
380 
381  for( Uint1 i( 0 ); i < 4; ++i )
382  {
383  ostringstream s;
384  s << " " << th[i] << "%% threshold at " << index[i];
385  ustat->setComment( s.str() );
386  }
387 
388  ustat->setParam( "t_low ", index[0] );
389  ustat->setParam( "t_extend ", index[1] );
390  ustat->setParam( "t_threshold", index[2] );
391  ustat->setParam( "t_high ", index[3] );
392  ustat->finalize();
393 }
394 
395 //------------------------------------------------------------------------------
397  Uint1 prefix_size,
398  const vector< string > & input_list,
399  bool do_output )
400 {
401  Uint1 suffix_size( unit_size - prefix_size );
402  Uint4 vector_size( 1<<(2*suffix_size) );
403  vector< Uint4 > counts( vector_size, 0 );
404  Uint4 unit_mask( (1<<(2*unit_size)) - 1 );
405  Uint4 prefix_mask( ((1<<(2*prefix_size)) - 1)<<(2*suffix_size) );
406  Uint4 suffix_mask( (1<<2*suffix_size) - 1 );
407  if( unit_size == 16 ) unit_mask = 0xFFFFFFFF;
408  prefix <<= (2*suffix_size);
410 
411  for( vector< string >::const_iterator it( input_list.begin() );
412  it != input_list.end(); ++it )
413  {
414  for(CWinMaskUtil::CInputBioseq_CI bs_iter(*it, infmt); bs_iter; ++bs_iter)
415  {
416  CBioseq_Handle bsh = *bs_iter;
417 
418  if( CWinMaskUtil::consider( bsh, ids, exclude_ids ) )
419  {
420  CSeqVector data =
421  bs_iter->GetSeqVector(CBioseq_Handle::eCoding_Iupac);
422 
423  if( data.empty() )
424  continue;
425 
426  TSeqPos length( data.size() );
427  Uint4 count( 0 );
428  Uint4 unit( 0 );
429 
430  for( Uint4 i( 0 ); i < length; ++i ) {
431  if( ambig( data[i] ) )
432  {
433  count = 0;
434  unit = 0;
435  continue;
436  }
437  else
438  {
439  unit = ((unit<<2)&unit_mask) + letter( data[i] );
440 
441  if( count >= unit_size - 1 )
442  {
443  Uint4 runit( reverse_complement( unit, unit_size ) );
444 
445  if( unit <= runit && (unit&prefix_mask) == prefix )
446  ++counts[unit&suffix_mask];
447 
448  if( runit <= unit && (runit&prefix_mask) == prefix )
449  ++counts[runit&suffix_mask];
450  }
451 
452  ++count;
453  }
454  }
455  }
456  }
457  }
458 
459  for( Uint4 i( 0 ); i < vector_size; ++i )
460  {
461  Uint4 u( prefix + i ), ru( 0 );
462 
463  if( counts[i] > 0 )
464  {
465  ru = reverse_complement( u, unit_size );
466  if( u == ru ) ++total_ecodes; else total_ecodes += 2;
467  }
468 
469  if( counts[i] >= min_count )
470  {
471  if( counts[i] >= max_count )
472  if( u == ru ) ++score_counts[max_count - 1];
473  else score_counts[max_count - 1] += 2;
474  else if( u == ru ) ++score_counts[counts[i] - 1];
475  else score_counts[counts[i] - 1] += 2;
476 
477  if( do_output )
479  u, (counts[i] > t_high) ? t_high : counts[i] );
480  }
481  }
482 }
483 
484 //------------------------------------------------------------------------------
485 const char *
487 {
488  switch( GetErrCode() ) {
489  case eNullGenome: return "empty genome";
490  default: return CException::GetErrCodeString();
491  }
492 }
493 
CBioseq_Handle –.
Set coding to printable coding (Iupacna or Iupacaa)
~CWinMaskCountsGenerator()
Object destructor.
unsigned NCBI_INT8_TYPE Uint8
Unsigned 8 byte sized integer.
Definition: ncbitype.h:146
unsigned int Uint4
Alias for unsigned int.
Definition: ncbitype.h:121
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:836
void setParam(const string &name, Uint4 value)
Set a value of a WindowMasker parameter.
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
string
Definition: cgiapp.hpp:498
const TSeqPos offset(200)
TSeqPos size(void) const
Definition: seq_vector.hpp:291
Factory of CSeqMaskerOstat objects.
User-defined methods of the data storage class.
int i
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:101
unsigned char Uint1
Alias for unsigned char.
Definition: ncbitype.h:117
static Uint4 reverse_complement(Uint4 seq, Uint1 size)
Reverse complement of a unit.
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const CWinMaskUtil::CIdSet * exclude_ids
Exceptions that CWinMaskCountsGenerator may throw.
bool empty(void) const
Definition: seq_vector.hpp:284
void process(Uint4 prefix, Uint1 prefix_size, const vector< string > &input, bool do_output)
static Uint4 letter(char c)
void operator()()
This function does the actual n-mer counting.
CRef< CSeqMaskerOstat > ustat
CSeqVector –.
Definition: seq_vector.hpp:64
Uint8 fastalen(const string &fname) const
USING_SCOPE(objects)
void finalize()
Perform any final tasks required to generate unit counts in the particular format.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:142
void setUnitCount(Uint4 unit, Uint4 count)
Add count value for a particular unit.
CRef< objects::CObjectManager > om
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:198
int size
The Object manager core.
Function iterating over bioseqs in input.
TSeqPos GetBioseqLength(void) const
Base class for sets of seq_id representations used with -ids and -exclude-ids options.
Definition: Seq_entry.hpp:55
int len
const CWinMaskUtil::CIdSet * ids
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:435
#define NCBI_ASSERT(expr, mess)
Definition: ncbidbg.hpp:128
else result
Definition: token2.c:20
void setUnitSize(Uint1 us)
Set the unit size value.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CWinMaskCountsGenerator(const string &input, const string &output, const string &infmt, const string &sformat, const string &th, Uint4 mem_avail, Uint1 unit_size, Uint8 genome_size, Uint4 min_count, Uint4 max_count, bool check_duplicates, bool use_list, const CWinMaskUtil::CIdSet *ids, const CWinMaskUtil::CIdSet *exclude_ids, bool use_ba, string const &metadata)
Constructor.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string...
Definition: ncbiexpt.hpp:546
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:245
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3362
static Uint4 reverse_complement(Uint4 seq, Uint1 size)
static bool ambig(char c)
static const char * prefix[]
Definition: pcregrep.c:251
static bool consider(const objects::CBioseq_Handle &bsh, const CIdSet *ids, const CIdSet *exclude_ids)
Check if the given bioseq should be considered for processing.
virtual const char * GetErrCodeString() const
Return description string corresponding to an error code.
void CheckDuplicates(const vector< string > &input, const string &infmt, const CWinMaskUtil::CIdSet *ids, const CWinMaskUtil::CIdSet *exclude_ids)
Check for possibly duplicate sequences in the input.
#define _TRACE(message)
Definition: ncbidbg.hpp:120
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:98
void setComment(const string &msg)
Add a comment to the unit counts file.
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:426
static int input()
Modified on Sun Jun 25 17:44:26 2017 by modify_doxy.py rev. 533848