00001 #include <ncbi_pch.hpp>
00002 #include "coligofarapp.hpp"
00003 #include "csamformatter.hpp"
00004 #include "coutputformatter.hpp"
00005 #include "cprogressindicator.hpp"
00006 #include "cscoringfactory.hpp"
00007 #include "cbitmaskaccess.hpp"
00008 #include "cseqscanner.hpp"
00009 #include "cguidesam.hpp"
00010 #include "cfeatmap.hpp"
00011 #include "caligner.hpp"
00012 #include "cfilter.hpp"
00013 #include "cbatch.hpp"
00014 #include "csnpdb.hpp"
00015 #include "oligofar-version.hpp"
00016
00017 #include "string-util.hpp"
00018
00019 #include <iostream>
00020 #include <iomanip>
00021 #include <fstream>
00022 #include <sstream>
00023
00024 #ifndef _WIN32
00025 #include <sys/resource.h>
00026 #include <sysexits.h>
00027 #else
00028 #define strtoll( a, b, c ) strtoui64( a, b, c )
00029 #define EX_USAGE 64
00030 #endif
00031
00032 USING_OLIGOFAR_SCOPES;
00033
00034 COligoFarApp::COligoFarApp( int argc, char ** argv ) :
00035 CApp( argc, argv ),
00036 m_hashPass( 0 ),
00037 m_strands( 0x03 ),
00038 m_readsPerRun( 250000 ),
00039 m_minRun(0),
00040 m_maxRun( numeric_limits<int>::max() ),
00041 m_topCnt( 10 ),
00042 m_topPct( 99 ),
00043 m_minPctid( 60 ),
00044 m_identityScore( 1.0 ),
00045 m_mismatchScore( -1.0 ),
00046 m_gapOpeningScore( -3.0 ),
00047 m_gapExtentionScore( -1.5 ),
00048 m_extentionPenaltyDropoff( -100 ),
00049 m_qualityChannels( 0 ),
00050 m_qualityBase( 33 ),
00051 m_minBlockLength( 1000 ),
00052
00053 m_memoryLimit( Uint8( sizeof(void*) == 4 ? 3 : 8 ) * int(kGigaByte) ),
00054 m_performTests( false ),
00055 m_colorSpace( false ),
00056 m_sodiumBisulfiteCuration( false ),
00057 m_outputSam( true ),
00058 #ifdef _WIN32
00059
00060 m_outputFile( "con:" ),
00061 #else
00062 m_guideFile( "/dev/null" ),
00063 m_outputFile( "/dev/stdout" ),
00064 #endif
00065 m_outputFlags( "z" ),
00066 m_geometry( "p" )
00067 {
00068 m_passParam.push_back( CPassParam() );
00069 #ifndef _WIN32
00070 ifstream meminfo( "/proc/meminfo" );
00071 string buff;
00072 if( !meminfo.fail() ) {
00073 m_memoryLimit = 0;
00074
00075 while( getline( meminfo, buff ) ) {
00076 istringstream line(buff);
00077 string name, units;
00078 Uint8 value;
00079 line >> name >> value >> units;
00080 if( units.length() ) {
00081 switch( tolower( units[0] ) ) {
00082 case 'g': value *= kKiloByte;
00083 case 'm': value *= kKiloByte;
00084 case 'k': value *= kKiloByte;
00085 }
00086 }
00087 if( name == "MemFree:" || name == "Buffers:" || name == "Cached:" || name == "SwapCached:" ) {
00088 m_memoryLimit += value;
00089 }
00090 }
00091 }
00092 #endif
00093 }
00094
00095 int COligoFarApp::RevNo()
00096 {
00097 return strtol( "$Rev: 175071 $"+6, 0, 10 );
00098 }
00099
00100 void COligoFarApp::Version( const char * )
00101 {
00102 cout << GetProgramBasename() << " ver " OLIGOFAR_VERSION " (Rev:" << RevNo() << ") " NCBI_SIGNATURE << endl;
00103 }
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139 void COligoFarApp::Help( const char * arg )
00140 {
00141 enum EFlags {
00142 fSynopsis = 0x01,
00143 fDetails = 0x02,
00144 fExtended = 0x04
00145 };
00146 int flags = fSynopsis;
00147 if( arg ) {
00148 switch( *arg ) {
00149 case 'b': case 'B': break;
00150 case 'e': case 'E': flags = fExtended; break;
00151 case 'f': case 'F': flags = fDetails; break;
00152 }
00153 }
00154 if( flags & fSynopsis )
00155 cout << "usage: [-hV] [--help[=full|brief|extended]] [-U version]\n"
00156 << " [short-read-options] [-0 qbase] [-d genomedb] [-b snpdb] [-g guidefile]\n"
00157 << " [-v featfile] [-l gilist|-y seqID] [--hash-bitmap-file=file]\n"
00158 << " [-o output] [-O -eumxtdhz] [-B batchsz] [-s 1|2|3] [-k skipPos]\n"
00159 << " [--pass0 hash-options] [--pass1 hash-options]\n"
00160 << " [-a maxamb] [-A maxamb] [-P phrap] [-F dust] [-X xdropoff] [-Y bandhw]\n"
00161 << " [-I idscore] [-M mismscore] [-G gapcore] [-Q gapextscore]\n"
00162 << " [-D minPair[-maxPair]] [-m margin] [-R geometry]\n"
00163 << " [-p cutoff] [-x dropoff] [-u topcnt] [-t toppct] [-L memlimit] [-T +|-]\n"
00164 << " [--NaHSO3=yes|no]\n"
00165 << "where hash-options are:\n"
00166 << " [-w win[/word]] [-N wcnt] [-f wstep] [-r wstart] [-S stride] [-H bits]\n"
00167 << " [-n mism] [-e gaps] [-j ins] [-J del] [-E dist]\n"
00168 << " [--add-splice=pos([min:]max)] [--longest-del=val] [--longest-ins=val]\n"
00169 << " [--max-inserted=val] [--max-deleted=val]\n"
00170 << "and short-read-options are:\n"
00171 << " [-i reads.col] [-1 reads1] [-2 reads2] [-q 0|1|4] [-c yes|no]\n"
00172 << "for more details and to see effective option values run:\n"
00173 << " oligofar [options] --help=full\n";
00174
00175 if( flags & fDetails ) {
00176 cout
00177 << "\nFile options:\n"
00178 << " --input-file=file -i file short reads tab-separated input file [" << m_readFile << "]\n"
00179 << " --fasta-file=file -d file database (fasta or basename of blastdb) file [" << m_fastaFile << "]\n"
00180 << " --snpdb-file=file -b file snp database subject file [" << m_snpdbFile << "]\n"
00181 << " --guide-file=file -g file guide file (output of sr-search in SAM 1.2 format [" << m_guideFile << "]\n"
00182 << " --feat-file=file -v file limit scanning to features listed in this file [" << m_featFile << "]\n"
00183 << " --gi-list=file -l file gi list to use for the blast db [" << m_gilistFile << "]\n"
00184 << " --read-1-file=file -1 file read 1 4-channel quality file (requires -i), fasta or fastq file [" << m_read1qualityFile << "]\n"
00185 << " --read-2-file=file -2 file read 2 4-channel quality file (requires -i), fasta or fastq file [" << m_read2qualityFile << "]\n"
00186 << " --output-file=output -o output set output file [" << m_outputFile << "]\n"
00187 << " --only-seqid=seqId -y seqId make database scan only seqIds indicated here [" << Join( ", ", m_seqIds ) << "]\n"
00188
00189 << " --colorspace=+|- -c +|- *reads are set in dibase colorspace [" << (m_colorSpace?"yes":"no") << "]\n"
00190 << " --quality-channels=cnt -q 0|1|4 number of channels in input file quality columns [" << m_qualityChannels << "]\n"
00191 << " --quality-base=value -0 value base quality number (ASCII value for character representing phrap score 0) [" << m_qualityBase << "]\n"
00192 << " --quality-base=+char -0 +char base quality char (character representing phrap score 0) [+" << char(m_qualityBase) << "]\n"
00193 << " --output-flags=flags -O flags add output flags (-huxmtdaez) [" << m_outputFlags << "]\n"
00194 << " --batch-size=count -B count how many short seqs to map at once [" << m_readsPerRun << "]\n"
00195 << " --batch-range=min[-max] which batches to run [" << m_minRun << "-" << m_maxRun << "]\n"
00196 << " --NaHSO3=+|- subject sequences sodium bisulfite curation [" << (m_sodiumBisulfiteCuration?"yes":"no") << "]\n"
00197
00198 << "\nGeneral hashing and scanning options:\n"
00199 << " --strands=1|2|3 -s 1|2|3 hash and lookup for strands (bitmask: 1 for +, 2 for -, 3 for both) [" << m_strands << "]\n"
00200 << " --hash-bitmap-file=file hash bitmap file [" << m_hashBitMask << "]\n"
00201 ;
00202
00203 cout
00204 << "\nPass-specific hashing and scanning options:\n";
00205 for( unsigned i = 0; i < max( size_t(2), m_passParam.size() ); ++i ) {
00206 cout
00207 << " --pass" << i << " following options will be used for pass " << i;
00208 if( i >= m_passParam.size() ) cout << " [off]\n";
00209 else {
00210 set<int> skipPos;
00211 copy( m_passParam[i].GetHashParam().GetSkipPositions().begin(), m_passParam[i].GetHashParam().GetSkipPositions().end(), inserter( skipPos, skipPos.end() ) );
00212 cout
00213 << ":\n"
00214 << " --window-size=win[/word] -w win[/word] hash using window size and word size [" << m_passParam[i].GetHashParam().GetWindowSize() << "/" << m_passParam[i].GetHashParam().GetWordSize() << "]\n"
00215 << " --window-skip=pos[,...] -k pos[,...] skip read positions when hashing (1-based) [" << Join( ",", skipPos ) << "]\n"
00216 << " --window-step=bases -f bases step between windows to hash (0 - consecutive) [" << m_passParam[i].GetHashParam().GetWindowStep() << "]\n"
00217 << " --window-start=bases -r bases start position for first window to hash (1 - default) [" << (m_passParam[i].GetHashParam().GetWindowStart() + 1) << "]\n"
00218 << " --stride-size=stride -S stride hash with given stride size [" << m_passParam[i].GetHashParam().GetStrideSize() << "]\n"
00219 << " --index-bits=bits -H bits set number of bits for index part of hash table [" << m_passParam[i].GetHashParam().GetHashBits() << "]\n"
00220 << " --max-windows=count -N count hash using maximum number of windows [" << m_passParam[i].GetHashParam().GetWindowCount() << "]\n"
00221 << " --input-max-amb=amb -a amb maximal number of ambiguities in hash window [" << m_passParam[i].GetHashParam().GetHashMaxAmb() << "]\n"
00222 << " --fasta-max-amb=amb -A amb maximal number of ambiguities in fasta window [" << m_passParam[i].GetMaxSubjAmb() << "]\n"
00223 << " --phrap-cutoff=score -P score set maximal phrap score to consider base as ambiguous [" << m_passParam[i].GetPhrapCutoff() << "]\n"
00224 << " --max-simplicity=val -F simpl low complexity filter cutoff for hash window [" << m_passParam[i].GetHashParam().GetMaxSimplicity() << "]\n"
00225 << " --max-mism=mismatch -n mismatch hash allowing up to given number of mismatches (0-3) [" << m_passParam[i].GetHashParam().GetHashMismatches() << "]\n"
00226 << " --max-indel=len -e len hash allowing indel of up to given length (0-2) [" << max( m_passParam[i].GetHashParam().GetHashInsertions(), m_passParam[i].GetHashParam().GetHashDeletions() ) << "]\n"
00227 << " --max-ins=len -j len hash allowing insertion of up to given length (0-2) [" << m_passParam[i].GetHashParam().GetHashInsertions() << "]\n"
00228 << " --max-del=len -J len hash allowing deletion of up to given length (0-2) [" << m_passParam[i].GetHashParam().GetHashDeletions() << "]\n"
00229 << " --max-hash-dist=cnt -E cnt hash allowing up to given total number of mismatches and indels (0-5) [" << m_passParam[i].GetHashParam().GetHashMaxDistance() << "]\n"
00230 << " --indel-pos=pos -K pos hash allowing indels only at this position [" << m_passParam[i].GetHashParam().GetHashIndelPosition() << "]\n"
00231 << " --indel-dropoff=value -X value set longest indel for alignment [" << m_passParam[i].GetAlignParam().GetMaxIndelLength() << "]\n"
00232 << " --band-half-width=value -Y value set maximal number of consecutive indels of same type for alignment [" << m_passParam[i].GetAlignParam().GetMaxIndelCount() << "]\n"
00233 << " --longest-ins=value set maximal length for insertions to be reliably found [" << m_passParam[i].GetAlignParam().GetMaxInsertionLength() << "]\n"
00234 << " --longest-del=value set maximal length for deletions to be reliably found [" << m_passParam[i].GetAlignParam().GetMaxDeletionLength() << "]\n"
00235 << " --max-inserted=value set maximal number of inserted bases to be allowed [" << m_passParam[i].GetAlignParam().GetMaxInsertionsCount() << "]\n"
00236 << " --max-deleted=value set maximal number of deleted bases to be allowed [" << m_passParam[i].GetAlignParam().GetMaxDeletionsCount() << "]\n"
00237 << " --add-splice=pos([min:]max) add non-penalized splice site for alignment [" << ReportSplices( i ) << "]\n"
00238 << " --pair-distance=min[-max] -D min[-max] pair distance [" << m_passParam[i].GetMinPair() << "-" << m_passParam[i].GetMaxPair() << "]\n"
00239 << " --pair-margin=len -m dist pair distance margin [" << m_passParam[i].GetPairMargin() << "]\n"
00240 ;
00241 }
00242 }
00243 cout
00244 << "\nAlignment and scoring options:\n"
00245 << " --identity-score=score -I score set identity score [" << m_identityScore << "]\n"
00246 << " --mismatch-score=score -M score set mismatch score [" << m_mismatchScore << "]\n"
00247 << " --gap-opening-score=score -G score set gap opening score [" << m_gapOpeningScore << "]\n"
00248 << " --gap-extention-score=val -Q score set gap extention score [" << m_gapExtentionScore << "]\n"
00249 << " --extention-dropoff -x score the worst penalty possible when extending alignment [" << m_extentionPenaltyDropoff << "]\n"
00250 << "\nFiltering and ranking options:\n"
00251 << " --min-pctid=pctid -p pctid set global percent identity cutoff [" << m_minPctid << "]\n"
00252 << " --top-count=val -u topcnt maximal number of top hits per read [" << m_topCnt << "]\n"
00253 << " --top-percent=val -t toppct maximal score of hit (in % to best) to be reported [" << m_topPct << "]\n"
00254 << " --geometry=value -R value restrictions on relative hit orientation and order for paired hits [" << (m_geometry) << "]\n"
00255 << "\nOther options:\n"
00256 << " --help=[brief|full|ext] -h print help with current parameter values and exit after parsing cmdline\n"
00257 << " --version -V print version and exit after parsing cmdline\n"
00258 << " --assert-version=version -U version make sure that the oligofar version is what expected [" OLIGOFAR_VERSION "]\n"
00259 << " --memory-limit=value -L value set rlimit for the program (k|M|G suffix is allowed) [" << m_memoryLimit << "]\n"
00260 << " --test-suite=+|- -T +|- turn test suite on/off [" << (m_performTests?"on":"off") << "]\n"
00261 << "\nRelative orientation flags recognized:\n"
00262 << " p|centripetal|inside|pcr|solexa reads are oriented so that vectors 5'->3' pointing to each other\n"
00263 << " f|centrifugal|outside reads are oriented so that vectors 5'->3' are pointing outside\n"
00264 << " i|incr|incremental|solid reads are on same strand, first preceeds second on this strand\n"
00265 << " d|decr|decremental reads are on same strand, first succeeds second on this strand\n"
00266 << "\nOutput flags (for -O):\n"
00267 << " - reset all flags\n"
00268 << " h report all hits before ranking\n"
00269 << " u report unmapped reads\n"
00270 << " x indicate that there are more reads of this rank\n"
00271 << " m indicate that there are more reads of lower ranks\n"
00272 << " t indicate that there were no more hits\n"
00273 << " d report differences between query and subject\n"
00274 << " e print empty line after all hits of the read are reported\n"
00275 << " r print raw scores rather then relative scores\n"
00276 << " z output in SAM 0.1.1 format (other flags are unsupported in this format)\n"
00277 << "Read file data options may be used only in combinations:\n"
00278 << " 1. with column file:\n"
00279 << " -q0 -i input.col -c no \n"
00280 << " -q1 -i input.col -c no \n"
00281 << " -q0 -i input.col -c yes\n"
00282 << " 2. with fasta or fastq files:\n"
00283 << " -q0 -1 reads1.fa [-2 reads2.fa] -c yes|no\n"
00284 << " -q1 -1 reads1.faq [-2 reads2.faq] -c no\n"
00285 << " 3. with Solexa 4-channel data\n"
00286 << " -q4 -i input.id -1 reads1.prb [-2 reads2.prb] -c no\n"
00287 << "\nNB: although -L flag is optional, it is strongly recommended to use it!\n"
00288 ;
00289 }
00290 if( flags & fExtended )
00291 cout << "\nExtended options:\n"
00292 << " --min-block-length=bases Length for subject sequence to be scanned at once [" << m_minBlockLength << "]\n"
00293 ;
00294 }
00295
00296 const option * COligoFarApp::GetLongOptions() const
00297 {
00298 static struct option opt[] = {
00299 {"help", 2, 0, 'h'},
00300 {"version", 0, 0, 'V'},
00301 {"assert-version", 1, 0, 'U'},
00302 {"window-size", 1, 0, 'w'},
00303 {"window-skip",1,0,'k'},
00304 {"window-step",1,0,'f'},
00305 {"window-start",1,0,'r'},
00306 {"max-windows",1,0,'N'},
00307 {"max-mism", 1, 0, 'n'},
00308 {"max-indel", 1, 0, 'e'},
00309 {"max-ins", 1, 0, 'j'},
00310 {"max-del", 1, 0, 'J'},
00311 {"max-hash-dist", 1, 0, 'E'},
00312 {"indel-pos", 1, 0, 'K'},
00313 {"input-max-amb", 1, 0, 'a'},
00314 {"fasta-max-amb", 1, 0, 'A'},
00315 {"colorspace", 1, 0, 'c'},
00316 {"NaHSO3", 1, 0, kLongOpt_NaHSO3},
00317 {"input-file", 1, 0, 'i'},
00318 {"fasta-file", 1, 0, 'd'},
00319 {"snpdb-file", 1, 0, 'b'},
00320 {"guide-file", 1, 0, 'g'},
00321 {"feat-file", 1, 0, 'v'},
00322 {"output-file", 1, 0, 'o'},
00323 {"output-flags", 1, 0, 'O'},
00324
00325 {"only-seqid", 1, 0, 'y'},
00326 {"gi-list", 1, 0, 'l'},
00327 {"strands", 1, 0, 's'},
00328 {"hash-bitmap-file", 1, 0, kLongOpt_hashBitMask },
00329 {"batch-size", 1, 0, 'B'},
00330 {"batch-range", 1, 0, kLongOpt_batchRange },
00331 {"guide-max-mism", 1, 0, 'x'},
00332 {"min-pctid", 1, 0, 'p'},
00333 {"top-count", 1, 0, 'u'},
00334 {"top-percent", 1, 0, 't'},
00335 {"read-1-file", 1, 0, '1'},
00336 {"read-2-file", 1, 0, '2'},
00337 {"quality-channels", 1, 0, 'q'},
00338 {"quality-base", 1, 0, '0'},
00339 {"phrap-score", 1, 0, 'P'},
00340 {"pair-distance", 1, 0, 'D'},
00341 {"pair-margin", 1, 0, 'm'},
00342 {"geometry", 1, 0, 'R'},
00343 {"max-simplicity", 1, 0, 'F'},
00344 {"identity-score", 1, 0, 'I'},
00345 {"mismatch-score", 1, 0, 'M'},
00346 {"gap-opening-score", 1, 0, 'G'},
00347 {"gap-extention-score", 1, 0, 'Q'},
00348 {"extention-dropoff", 1, 0, 'x'},
00349 {"indel-dropoff", 1, 0, 'X'},
00350 {"band-half-width", 1, 0, 'Y'},
00351 {"longest-ins", 1, 0, kLongOpt_maxInsertion},
00352 {"longest-del", 1, 0, kLongOpt_maxDeletion},
00353 {"max-inserted", 1, 0, kLongOpt_maxInsertions},
00354 {"max-deleted", 1, 0, kLongOpt_maxDeletions},
00355 {"add-splice", 1, 0, kLongOpt_addSplice},
00356 {"memory-limit", 1, 0, 'L'},
00357 {"test-suite", 1, 0, 'T'},
00358 {"index-bits", 1, 0, 'H'},
00359 {"pass0", 0, 0, kLongOpt_pass0},
00360 {"pass1", 0, 0, kLongOpt_pass1},
00361 {"min-block-length", 1, 0, kLongOpt_min_block_length },
00362 {0,0,0,0}
00363 };
00364 return opt;
00365 }
00366
00367 const char * COligoFarApp::GetOptString() const
00368 {
00369 return "U:H:S:w:N:f:r:k:n:e:E:j:J:K:a:A:c:i:d:b:v:g:o:O:l:y:s:B:x:p:u:t:1:2:q:0:P:m:D:R:F:I:M:G:Q:X:Y:L:T:";
00370 }
00371
00372 int COligoFarApp::ParseArg( int opt, const char * arg, int longindex )
00373 {
00374 switch( opt ) {
00375 case kLongOpt_hashBitMask: m_hashBitMask = arg; break;
00376 case kLongOpt_min_block_length: m_minBlockLength = NStr::StringToInt( arg ); break;
00377 case kLongOpt_NaHSO3: m_sodiumBisulfiteCuration = *arg == '+' ? true : *arg == '-' ? false : NStr::StringToBool( arg ); break;
00378 case kLongOpt_pass0: m_hashPass = 0; break;
00379 case kLongOpt_pass1: if( m_passParam.size() < 2 ) m_passParam.push_back( m_passParam.back() ); m_hashPass = 1; break;
00380 case kLongOpt_maxInsertion: m_passParam[m_hashPass].SetAlignParam().SetMaxInsertionLength( abs( NStr::StringToInt( arg ) ) ); break;
00381 case kLongOpt_maxDeletion: m_passParam[m_hashPass].SetAlignParam().SetMaxDeletionLength( abs( NStr::StringToInt( arg ) ) ); break;
00382 case kLongOpt_maxInsertions: m_passParam[m_hashPass].SetAlignParam().SetMaxInsertionCount( abs( NStr::StringToInt( arg ) ) ); break;
00383 case kLongOpt_maxDeletions: m_passParam[m_hashPass].SetAlignParam().SetMaxDeletionCount( abs( NStr::StringToInt( arg ) ) ); break;
00384 case kLongOpt_addSplice: AddSplice( arg ); break;
00385 case kLongOpt_batchRange: ParseRange( m_minRun, m_maxRun, arg, "-" ); break;
00386 case 'U': if( strcmp( arg, OLIGOFAR_VERSION ) ) THROW( runtime_error, "Expected oligofar version " << arg << ", called " OLIGOFAR_VERSION ); break;
00387
00388 case 'k':
00389 do {
00390 list<string> x;
00391 Split( arg, ",", back_inserter( x ) );
00392 CHashParam::TSkipPositions& sp = m_passParam[m_hashPass].SetHashParam().SetSkipPositions();
00393 ITERATE( list<string>, t, x ) sp.insert( sp.end(), NStr::StringToInt( *t ) );
00394 } while(0);
00395 break;
00396 case 'w':
00397 do {
00398 int win, word;
00399 ParseRange( win, word, arg, "/" );
00400 m_passParam[m_hashPass].SetHashParam().SetWindowSize( win );
00401 m_passParam[m_hashPass].SetHashParam().SetWordSize( word );
00402 } while(0);
00403 break;
00404 case 'N': m_passParam[m_hashPass].SetHashParam().SetWindowCount( NStr::StringToInt( arg ) ); break;
00405 case 'f': m_passParam[m_hashPass].SetHashParam().SetWindowStep( abs( NStr::StringToInt( arg ) ) ); break;
00406 case 'r': m_passParam[m_hashPass].SetHashParam().SetWindowStart( abs( NStr::StringToInt( arg ) - 1 ) ); break;
00407 case 'H': m_passParam[m_hashPass].SetHashParam().SetHashBits( NStr::StringToInt( arg ) ); break;
00408 case 'S': m_passParam[m_hashPass].SetHashParam().SetStrideSize( NStr::StringToInt( arg ) ); break;
00409 case 'n': m_passParam[m_hashPass].SetHashParam().SetHashMismatches( NStr::StringToInt( arg ) ); break;
00410 case 'e': m_passParam[m_hashPass].SetHashParam().SetHashIndels( NStr::StringToInt( arg ) ); break;
00411 case 'j': m_passParam[m_hashPass].SetHashParam().SetHashInsertions( NStr::StringToInt( arg ) ); break;
00412 case 'J': m_passParam[m_hashPass].SetHashParam().SetHashDeletions( NStr::StringToInt( arg ) ); break;
00413 case 'E': m_passParam[m_hashPass].SetHashParam().SetMaxHashDistance( NStr::StringToInt( arg ) ); break;
00414 case 'K': m_passParam[m_hashPass].SetHashParam().SetHashIndelPosition( NStr::StringToInt( arg ) ); break;
00415 case 'a': m_passParam[m_hashPass].SetHashParam().SetHashMaxAmb( strtol( arg, 0, 10 ) ); break;
00416 case 'A': m_passParam[m_hashPass].SetMaxSubjAmb() = strtol( arg, 0, 10 ); break;
00417 case 'c': m_colorSpace = *arg == '+' ? true : *arg == '-' ? false : NStr::StringToBool( arg ); break;
00418 case 'i': m_readFile = arg; break;
00419 case 'd': m_fastaFile = arg; break;
00420 case 'b': m_snpdbFile = arg; break;
00421 case 'g': m_guideFile = arg; break;
00422 case 'v': m_featFile = arg; break;
00423 case 'o': m_outputFile = arg; break;
00424 case 'O': m_outputFlags += arg; if( const char * m = strrchr( m_outputFlags.c_str(), '-' ) ) m_outputFlags = m + 1; break;
00425 case 'l': m_gilistFile = arg; break;
00426 case 'y': m_seqIds.push_back( CSeq_id( arg ).AsFastaString() ); break;
00427 case 's': m_strands = strtol( arg, 0, 10 ); break;
00428 case 'B': m_readsPerRun = strtol( arg, 0, 10 ); break;
00429
00430 case 'p': m_minPctid = NStr::StringToDouble( arg ); break;
00431 case 'u': m_topCnt = strtol( arg, 0, 10 ); break;
00432 case 't': m_topPct = NStr::StringToDouble( arg ); break;
00433 case '1': m_read1qualityFile = arg; break;
00434 case '2': m_read2qualityFile = arg; break;
00435 case 'q': m_qualityChannels = NStr::StringToInt( arg ); break;
00436 case '0': m_qualityBase = ( arg[0] && arg[0] == '+' ) ? arg[1] : NStr::StringToInt( arg ); break;
00437 case 'D': ParseRange( m_passParam[m_hashPass].SetMinPair(), m_passParam[m_hashPass].SetMaxPair(), arg ); break;
00438 case 'm': m_passParam[m_hashPass].SetPairMargin() = strtol( arg, 0, 10 ); break;
00439 case 'P': m_passParam[m_hashPass].SetPhrapCutoff() = strtol( arg, 0, 10 ); break;
00440 case 'R': m_geometry = arg; break;
00441 case 'F': m_passParam[m_hashPass].SetHashParam().SetMaxSimplicity( NStr::StringToDouble( arg ) ); break;
00442 case 'X': m_passParam[m_hashPass].SetAlignParam().SetMaxIndelLength( abs( strtol( arg, 0, 10 ) ) ); break;
00443 case 'Y': m_passParam[m_hashPass].SetAlignParam().SetMaxIndelCount( abs( strtol( arg, 0, 10 ) ) ); break;
00444 case 'I': m_identityScore = fabs( NStr::StringToDouble( arg ) ); break;
00445 case 'M': m_mismatchScore = -fabs( NStr::StringToDouble( arg ) ); break;
00446 case 'G': m_gapOpeningScore = -fabs( NStr::StringToDouble( arg ) ); break;
00447 case 'Q': m_gapExtentionScore = -fabs( NStr::StringToDouble( arg ) ); break;
00448 case 'x': m_extentionPenaltyDropoff = -fabs( NStr::StringToDouble( arg ) ); break;
00449 case 'L':
00450 #ifndef _WIN32
00451 do {
00452 char * t = 0;
00453 m_memoryLimit = strtoll( arg, &t, 10 );
00454 if( t ) {
00455 switch( tolower(*t) ) {
00456 case 'g': m_memoryLimit *= 1024;
00457 case 'm': m_memoryLimit *= 1024;
00458 case 'k': m_memoryLimit *= 1024;
00459 }
00460 }
00461 } while(0);
00462 #else
00463 cerr << "[" << GetProgramBasename() << "] Warning: -L is ignored in win32\n";
00464 #endif
00465 break;
00466 case 'T': m_performTests = (*arg == '+') ? true : (*arg == '-') ? false : NStr::StringToBool( arg ); break;
00467 default: return CApp::ParseArg( opt, arg, longindex );
00468 }
00469 return 0;
00470 }
00471
00472 void COligoFarApp::ParseConfig( const string& cfg )
00473 {
00474 ifstream in( cfg.c_str() );
00475 if( !in.good() ) THROW( runtime_error, "Failed to read config file " << cfg );
00476 CNcbiRegistry reg( in );
00477 ParseConfig( ® );
00478 }
00479
00480 void COligoFarApp::ParseConfig( IRegistry * reg )
00481 {
00482 const option * opts = GetLongOptions();
00483 for( int index = 0 ; opts && opts->name != 0 ; ++opts, ++index ) {
00484 if( reg->HasEntry( "oligofar", opts->name ) ) {
00485 ParseArg( opts->val, reg->Get( "oligofar", opts->name ).c_str(), index );
00486 }
00487 }
00488 }
00489
00490 int COligoFarApp::RunTestSuite()
00491 {
00492 if( m_performTests ) {
00493 if( int rc = TestSuite() ) {
00494 cerr << "[" << GetProgramBasename() << "] internal tests failed: " << rc << endl;
00495 return rc;
00496 } else {
00497 cerr << "[" << GetProgramBasename() << "] internal tests succeeded!\n";
00498 }
00499 }
00500 return 0;
00501 }
00502
00503 int COligoFarApp::SetLimits()
00504 {
00505 #ifndef _WIN32
00506 if( m_memoryLimit ) {
00507 struct rlimit rl;
00508 rl.rlim_cur = m_memoryLimit;
00509 rl.rlim_max = RLIM_INFINITY;
00510 cerr << "[" << GetProgramBasename() << "] Setting memory limit to " << m_memoryLimit << ": ";
00511 errno = 0;
00512 int rc = setrlimit( RLIMIT_AS, &rl );
00513 cerr << "\b\b failed: " << strerror( errno ) << endl;
00514 return rc;
00515 } else
00516 cerr << "[" << GetProgramBasename() << "] Memory limit is not set.\n";
00517 #else
00518 cerr << "[" << GetProgramBasename() << "] Setting memory limit is not implemented for win32 yet, ignored.\n";
00519 #endif
00520 return 0;
00521 }
00522
00523 int COligoFarApp::Execute()
00524 {
00525 if( int rc = RunTestSuite() ) return rc;
00526
00527 SetLimits();
00528 return ProcessData();
00529 }
00530
00531 int COligoFarApp::GetOutputFlags()
00532 {
00533 int oflags = 0;
00534 for( const char * f = m_outputFlags.c_str(); *f; ++f ) {
00535 switch( tolower(*f) ) {
00536 case '-': oflags = 0; break;
00537 case 'z': m_outputSam = true; break;
00538 case 'e': oflags |= COutputFormatter::fReportEmptyLines; break;
00539 case 'u': oflags |= COutputFormatter::fReportUnmapped; break;
00540 case 'm': oflags |= COutputFormatter::fReportMany; break;
00541 case 'x': oflags |= COutputFormatter::fReportMore; break;
00542 case 't': oflags |= COutputFormatter::fReportTerminator; break;
00543 case 'd': oflags |= COutputFormatter::fReportDifferences; break;
00544 case 'h': oflags |= COutputFormatter::fReportAllHits; break;
00545 case 'r': oflags |= COutputFormatter::fReportRawScore; break;
00546 default: cerr << "[" << GetProgramBasename() << "] Warning: unknown format flag `" << *f << "'\n"; break;
00547 }
00548 }
00549 return oflags;
00550 }
00551
00552 void COligoFarApp::SetupGeometries( map<string,int>& geometries )
00553 {
00554 geometries.insert( make_pair("p",CFilter::eCentripetal) );
00555 geometries.insert( make_pair("centripetal",CFilter::eCentripetal) );
00556 geometries.insert( make_pair("inside",CFilter::eCentripetal) );
00557 geometries.insert( make_pair("pcr",CFilter::eCentripetal) );
00558 geometries.insert( make_pair("solexa",CFilter::eCentripetal) );
00559 geometries.insert( make_pair("f",CFilter::eCentrifugal) );
00560 geometries.insert( make_pair("centrifugal",CFilter::eCentrifugal) );
00561 geometries.insert( make_pair("outside",CFilter::eCentrifugal) );
00562 geometries.insert( make_pair("i",CFilter::eIncremental) );
00563 geometries.insert( make_pair("incr",CFilter::eIncremental) );
00564 geometries.insert( make_pair("incremental",CFilter::eIncremental) );
00565 geometries.insert( make_pair("solid",CFilter::eIncremental) );
00566 geometries.insert( make_pair("d",CFilter::eDecremental) );
00567 geometries.insert( make_pair("decr",CFilter::eDecremental) );
00568 geometries.insert( make_pair("decremental",CFilter::eDecremental) );
00569 }
00570
00571 string COligoFarApp::ReportSplices( int i ) const
00572 {
00573 ostringstream o;
00574 o << Join( ",", m_passParam[i].GetAlignParam().GetSpliceSet().begin(), m_passParam[i].GetAlignParam().GetSpliceSet().end() );
00575 return o.str();
00576 }
00577
00578 void COligoFarApp::AddSplice( const char* arg )
00579 {
00580 m_passParam[m_hashPass].SetAlignParam().AddSplice( CIntron( arg ) );
00581 }
00582
00583 int COligoFarApp::ProcessData()
00584 {
00585 for( unsigned p = 0; p < m_passParam.size(); ++p ) {
00586 string msg;
00587 if( ! m_passParam[p].GetHashParam().ValidateParam( msg ) ) {
00588 cerr << "Incompatible set of hash parameters for pass" << p
00589 << ( m_passParam[p].GetHashParam().GetSkipPositions().size() ? " with skip ppositions:" : ":" )
00590 << msg << "\n";
00591 return EX_USAGE;
00592 }
00593 }
00594
00595 if( m_readFile == "-" ) m_readFile = "/dev/stdin";
00596 if( m_fastaFile == "-" ) m_fastaFile = "/dev/stdin";
00597 if( m_guideFile == "-" ) m_guideFile = "/dev/stdin";
00598 if( m_outputFile == "-" ) m_outputFile = "/dev/stdout";
00599
00600 CReaderFactory rfactory;
00601 rfactory.SetColorspace( m_colorSpace );
00602 rfactory.SetQualityChannels( m_qualityChannels );
00603 rfactory.SetReadIdFile( m_readFile );
00604 rfactory.SetReadDataFile1( m_read1qualityFile );
00605 rfactory.SetReadDataFile2( m_read2qualityFile );
00606 auto_ptr<IShortReader> qreader( rfactory.CreateReader() );
00607
00608 if( qreader.get() == 0 ) {
00609 cerr << "Failed to understand input specifications: incompatible set of parameters (see oligofar -h).\n";
00610 return EX_USAGE;
00611 }
00612
00613 ofstream o( m_outputFile.c_str() );
00614 if( m_readFile == "/dev/stdin" ) cerr << "* Notice: expecting read tab-separated data on STDIN\n";
00615 if( m_guideFile == "/dev/stdin" ) cerr << "* Notice: expecting guide data on STDIN\n";
00616 if( m_fastaFile == "/dev/stdin" ) cerr << "* Notice: expecting reference sequence data on STDIN\n";
00617
00618 map<string,int> geometries;
00619 SetupGeometries( geometries );
00620
00621 if( geometries.find( m_geometry ) == geometries.end() ) {
00622 THROW( runtime_error, "Unknown geometry `" << m_geometry << "'" );
00623 }
00624
00625 for( vector<CPassParam>::iterator p = m_passParam.begin(); p != m_passParam.end(); ++p ) {
00626 p->SetHashParam().Validate( cerr, "[" + string( GetProgramBasename() ) + "] " );
00627 }
00628
00629 CSeqIds seqIds;
00630 CFilter filter;
00631 CSeqVecProcessor seqVecProcessor;
00632 CSeqScanner seqScanner;
00633
00634 CFeatMap featMap;
00635 if( m_featFile.length() ) {
00636 featMap.SetSeqIds( &seqIds );
00637 featMap.ReadFile( m_featFile );
00638 seqScanner.SetFeatMap( &featMap );
00639 }
00640
00641 auto_ptr<AOutputFormatter> formatter( 0 );
00642 int oflags = GetOutputFlags();
00643 if( m_outputSam ) formatter.reset( new CSamFormatter( o, seqIds ) );
00644 else {
00645 COutputFormatter * of = new COutputFormatter( o, seqIds );
00646 formatter.reset( of );
00647 of->AssignFlags( oflags );
00648 of->SetTopCount( m_topCnt );
00649 of->SetTopPct( m_topPct );
00650 }
00651
00652
00653 CQueryHash queryHash( m_passParam[0].GetHashParam().GetHashMismatches(), m_passParam[0].GetHashParam().GetHashMaxAmb() );
00654 CScoreParam scoreParam( m_identityScore, m_mismatchScore, m_gapOpeningScore, m_gapExtentionScore );
00655 CScoringFactory scoringFactory( &scoreParam, &m_passParam[0].GetAlignParam() );
00656 CAligner aligner( &scoringFactory );
00657 CGuideSamFile guideFile( m_guideFile, filter, seqIds, scoreParam );
00658 CBatch batch;
00659
00660 batch.SetReadCount( m_readsPerRun );
00661 batch.SetFastaFile( m_fastaFile );
00662 batch.SetQueryHash( &queryHash );
00663 batch.SetSeqVecProcessor( &seqVecProcessor );
00664 batch.SetSeqScanner( &seqScanner );
00665 batch.SetFilter( &filter );
00666 batch.SetOutputFormatter( formatter.get() );
00667 batch.SetScoringFactory( &scoringFactory );
00668 batch.SetPassParam( &m_passParam );
00669 batch.SetRange( m_minRun, m_maxRun );
00670
00671 CSnpDb snpDb( CSnpDb::eSeqId_integer );
00672 if( m_snpdbFile.length() ) {
00673 snpDb.Open( m_snpdbFile, CBDB_File::eReadOnly );
00674 seqScanner.SetSnpDb( &snpDb );
00675 }
00676
00677
00678
00679
00680 formatter->SetGuideFile( guideFile );
00681
00682 filter.SetGeometry( geometries[m_geometry] );
00683 filter.SetAligner( &aligner );
00684 filter.SetSeqIds( &seqIds );
00685 filter.SetTopPct( m_topPct );
00686 filter.SetTopCnt( m_topCnt );
00687 filter.SetScorePctCutoff( m_minPctid );
00688 filter.SetOutputFormatter( formatter.get() );
00689
00690 if( m_minPctid > m_topPct ) {
00691 cerr << "[" << GetProgramBasename() << "] Warning: top% is greater then %cutoff ("
00692 << m_topPct << " < " << m_minPctid << ")\n";
00693 }
00694
00695 aligner.SetSubjectCoding( CSeqCoding::eCoding_ncbi8na );
00696 aligner.SetQueryStrand( CSeqCoding::eStrand_pos );
00697 aligner.SetExtentionPenaltyDropoff( m_extentionPenaltyDropoff );
00698
00699
00700 auto_ptr<CBitmaskAccess> bitmaskAccess( 0 );
00701 if( m_hashBitMask.size() ) bitmaskAccess.reset( new CBitmaskAccess( m_hashBitMask ) );
00702
00703 queryHash.SetStrands( m_strands );
00704 queryHash.SetNaHSO3mode( m_sodiumBisulfiteCuration );
00705 queryHash.SetHashWordBitmask( bitmaskAccess.get() );
00706
00707 seqScanner.SetFilter( &filter );
00708 seqScanner.SetQueryHash( &queryHash );
00709 seqScanner.SetSeqIds( &seqIds );
00710 seqScanner.SetMinBlockLength( m_minBlockLength );
00711 seqScanner.SetInputChunk( batch.GetInputChunk() );
00712
00713 seqVecProcessor.SetTargetCoding( m_colorSpace ? CSeqCoding::eCoding_colorsp : CSeqCoding::eCoding_ncbi8na );
00714 seqVecProcessor.AddCallback( 1, &filter );
00715 seqVecProcessor.AddCallback( 0, &seqScanner );
00716
00717 if( !ValidateSplices( queryHash ) ) return EX_USAGE;
00718
00719 if( m_seqIds.size() ) seqVecProcessor.SetSeqIdList( m_seqIds );
00720 if( m_gilistFile.length() ) seqVecProcessor.SetGiListFile( m_gilistFile );
00721
00722 Uint8 queriesTotal = 0;
00723 Uint8 entriesTotal = 0;
00724
00725 CProgressIndicator p( "Reading input data", "lines" );
00726 batch.SetReadProgressIndicator( &p );
00727 batch.Start();
00728
00729 for( int count = 0; (!batch.Done()) && qreader->NextRead(); ++count ) {
00730
00731 CQuery * query = new CQuery(
00732 qreader->GetSeqCoding(),
00733 qreader->GetReadId(),
00734 qreader->GetReadData(0),
00735 qreader->GetReadData(1),
00736 m_qualityBase );
00737
00738
00739
00740
00741
00742
00743 query->ComputeBestScore( &scoreParam, 0 );
00744 query->ComputeBestScore( &scoreParam, 1 );
00745 while( guideFile.NextHit( queriesTotal, query ) );
00746 entriesTotal += batch.AddQuery( query );
00747 queriesTotal ++;
00748 p.Increment();
00749 }
00750 batch.Purge();
00751 batch.SetReadProgressIndicator( 0 );
00752 p.Summary();
00753 cerr << "Queries processed: " << queriesTotal << " (" << entriesTotal << " hash entries)\n";
00754 cerr << "Memory usage:\n"
00755 << " hits left: " << CHit::GetCount() << "\n"
00756 << "queries left: " << CQuery::GetCount() << "\n";
00757
00758 return 0;
00759 }
00760
00761 bool COligoFarApp::ValidateSplices( CQueryHash& queryHash )
00762 {
00763
00764
00765
00766
00767
00768
00769
00770
00771
00772
00773
00774
00775
00776
00777
00778
00779
00780
00781
00782
00783
00784
00785 return true;
00786 }
00787
00788
00789