src/app/oligofar/coligofarapp.cpp

Go to the documentation of this file.
00001 #include <ncbi_pch.hpp>
00002 #include "coligofarapp.hpp"
00003 #include "csamformatter.hpp"
00004 #include "coutputformatter.hpp"
00005 #include "cprogressindicator.hpp"
00006 #include "cscoringfactory.hpp"
00007 #include "cbitmaskaccess.hpp"
00008 #include "cseqscanner.hpp"
00009 #include "cguidesam.hpp"
00010 #include "cfeatmap.hpp"
00011 #include "caligner.hpp"
00012 #include "cfilter.hpp"
00013 #include "cbatch.hpp"
00014 #include "csnpdb.hpp"
00015 #include "oligofar-version.hpp"
00016 
00017 #include "string-util.hpp"
00018 
00019 #include <iostream>
00020 #include <iomanip>
00021 #include <fstream>
00022 #include <sstream>
00023 
00024 #ifndef _WIN32
00025 #include <sys/resource.h>
00026 #include <sysexits.h>
00027 #else
00028 #define strtoll( a, b, c ) strtoui64( a, b, c )
00029 #define EX_USAGE 64
00030 #endif
00031 
00032 USING_OLIGOFAR_SCOPES;
00033 
00034 COligoFarApp::COligoFarApp( int argc, char ** argv ) :
00035     CApp( argc, argv ),
00036     m_hashPass( 0 ),
00037     m_strands( 0x03 ),
00038     m_readsPerRun( 250000 ),
00039     m_minRun(0),
00040     m_maxRun( numeric_limits<int>::max() ),
00041     m_topCnt( 10 ),
00042     m_topPct( 99 ),
00043     m_minPctid( 60 ),
00044     m_identityScore(  1.0 ),
00045     m_mismatchScore( -1.0 ),
00046     m_gapOpeningScore( -3.0 ),
00047     m_gapExtentionScore( -1.5 ),
00048     m_extentionPenaltyDropoff( -100 ),
00049     m_qualityChannels( 0 ),
00050     m_qualityBase( 33 ),
00051     m_minBlockLength( 1000 ),
00052 //    m_guideFilemaxMismatch( 0 ),
00053     m_memoryLimit( Uint8( sizeof(void*) == 4 ? 3 : 8 ) * int(kGigaByte) ),
00054     m_performTests( false ),
00055     m_colorSpace( false ),
00056     m_sodiumBisulfiteCuration( false ),
00057     m_outputSam( true ),
00058 #ifdef _WIN32
00059     //m_guideFile( "nul:" ),
00060     m_outputFile( "con:" ),
00061 #else
00062     m_guideFile( "/dev/null" ),
00063     m_outputFile( "/dev/stdout" ),
00064 #endif
00065     m_outputFlags( "z" ),
00066     m_geometry( "p" )
00067 {
00068     m_passParam.push_back( CPassParam() );
00069 #ifndef _WIN32
00070     ifstream meminfo( "/proc/meminfo" );
00071     string buff;
00072     if( !meminfo.fail() ) {
00073         m_memoryLimit = 0;
00074 
00075         while( getline( meminfo, buff ) ) {
00076             istringstream line(buff);
00077             string name, units;
00078             Uint8 value;
00079             line >> name >> value >> units;
00080             if( units.length() ) {
00081                 switch( tolower( units[0] ) ) {
00082                 case 'g': value *= kKiloByte;
00083                 case 'm': value *= kKiloByte;
00084                 case 'k': value *= kKiloByte;
00085                 }
00086             }
00087             if( name == "MemFree:" || name == "Buffers:" || name == "Cached:" || name == "SwapCached:" ) {
00088                 m_memoryLimit += value;
00089             }
00090         }
00091     }
00092 #endif
00093 }
00094 
00095 int COligoFarApp::RevNo()
00096 {
00097     return strtol( "$Rev: 175071 $"+6, 0, 10 );
00098 }
00099 
00100 void COligoFarApp::Version( const char * )
00101 {
00102     cout << GetProgramBasename() << " ver " OLIGOFAR_VERSION " (Rev:" << RevNo() << ") " NCBI_SIGNATURE << endl;
00103 }
00104 
00105 // letter   lower       letter  upper
00106 // =============================================
00107 // a        ambCnt      A       ambCnt
00108 // b        snpdb       B       batchSz
00109 // c        colorsp     C       /config/
00110 // d        database    D       pairRange
00111 // e        maxIndelW   E       maxHashDist*
00112 // f        windowStep  F       dustScore
00113 // g        guidefile   G       gapScore
00114 // h        help        H       hashBits
00115 // i        input       I       idScore
00116 // j        maxInsWid*  J       maxDelWid*   
00117 // k        skipPos     K       indelPos*
00118 // l        gilist      L       memLimit
00119 // m        margin      M       mismScore
00120 // n        maxMism     N       maxWindows
00121 // o        output      O       outputFmt
00122 // p        pctCutoff   P       phrapScore
00123 // q        qualChn     Q       gapExtScore
00124 // r        windowStart R       geometry
00125 // s        strands     S       stride
00126 // t        topPct      T       runTests
00127 // u        topCnt      U       assertVer
00128 // v        featfile    V       version
00129 // w        win/word    W       POSIX 
00130 // x        extDropOff  X       xDropoff
00131 // y        onlySeqid   Y       bandHalfWidth
00132 // z                    Z
00133 // 0        baseQual    5
00134 // 1        solexa1     6
00135 // 2        solexa2     7
00136 // 3                    8
00137 // 4                    9
00138 
00139 void COligoFarApp::Help( const char * arg )
00140 {
00141     enum EFlags {
00142         fSynopsis = 0x01,
00143         fDetails  = 0x02,
00144         fExtended = 0x04
00145     };
00146     int flags = fSynopsis;
00147     if( arg ) {
00148         switch( *arg ) {
00149         case 'b': case 'B': break;
00150         case 'e': case 'E': flags = fExtended; break;
00151         case 'f': case 'F': flags = fDetails; break;
00152         }
00153     }
00154     if( flags & fSynopsis ) 
00155         cout << "usage: [-hV] [--help[=full|brief|extended]] [-U version]\n"
00156              << "  [short-read-options] [-0 qbase] [-d genomedb] [-b snpdb] [-g guidefile]\n"
00157              << "  [-v featfile] [-l gilist|-y seqID] [--hash-bitmap-file=file]\n"
00158              << "  [-o output] [-O -eumxtdhz] [-B batchsz] [-s 1|2|3] [-k skipPos]\n"
00159              << "  [--pass0 hash-options] [--pass1 hash-options]\n"
00160              << "  [-a maxamb] [-A maxamb] [-P phrap] [-F dust] [-X xdropoff] [-Y bandhw]\n"
00161              << "  [-I idscore] [-M mismscore] [-G gapcore] [-Q gapextscore]\n"
00162              << "  [-D minPair[-maxPair]] [-m margin] [-R geometry]\n"
00163              << "  [-p cutoff] [-x dropoff] [-u topcnt] [-t toppct] [-L memlimit] [-T +|-]\n"
00164              << "  [--NaHSO3=yes|no]\n"
00165              << "where hash-options are:\n"
00166              << "  [-w win[/word]] [-N wcnt] [-f wstep] [-r wstart] [-S stride] [-H bits]\n"
00167              << "  [-n mism] [-e gaps] [-j ins] [-J del] [-E dist]\n"
00168              << "  [--add-splice=pos([min:]max)] [--longest-del=val] [--longest-ins=val]\n"
00169              << "  [--max-inserted=val] [--max-deleted=val]\n"
00170              << "and short-read-options are:\n"
00171              << "  [-i reads.col] [-1 reads1] [-2 reads2] [-q 0|1|4] [-c yes|no]\n"
00172              << "for more details and to see effective option values run:\n"
00173              << "  oligofar [options] --help=full\n";
00174 
00175     if( flags & fDetails ) {
00176         cout 
00177             << "\nFile options:\n" 
00178             << "   --input-file=file         -i file       short reads tab-separated input file [" << m_readFile << "]\n"
00179             << "   --fasta-file=file         -d file       database (fasta or basename of blastdb) file [" << m_fastaFile << "]\n"
00180             << "   --snpdb-file=file         -b file       snp database subject file [" << m_snpdbFile << "]\n"
00181             << "   --guide-file=file         -g file       guide file (output of sr-search in SAM 1.2 format [" << m_guideFile << "]\n"
00182             << "   --feat-file=file          -v file       limit scanning to features listed in this file [" << m_featFile << "]\n"
00183             << "   --gi-list=file            -l file       gi list to use for the blast db [" << m_gilistFile << "]\n"
00184             << "   --read-1-file=file        -1 file       read 1 4-channel quality file (requires -i), fasta or fastq file [" << m_read1qualityFile << "]\n"
00185             << "   --read-2-file=file        -2 file       read 2 4-channel quality file (requires -i), fasta or fastq file [" << m_read2qualityFile << "]\n"
00186             << "   --output-file=output      -o output     set output file [" << m_outputFile << "]\n"
00187             << "   --only-seqid=seqId        -y seqId      make database scan only seqIds indicated here [" << Join( ", ", m_seqIds ) << "]\n"
00188 //            << "   --guide-max-mism=count    -x count      set maximal number of mismatches for hits in guide file [" << m_guideFilemaxMismatch << "]\n"
00189             << "   --colorspace=+|-          -c +|-       *reads are set in dibase colorspace [" << (m_colorSpace?"yes":"no") << "]\n"
00190             << "   --quality-channels=cnt    -q 0|1|4      number of channels in input file quality columns [" << m_qualityChannels << "]\n"
00191             << "   --quality-base=value      -0 value      base quality number (ASCII value for character representing phrap score 0) [" << m_qualityBase << "]\n"
00192             << "   --quality-base=+char      -0 +char      base quality char (character representing phrap score 0) [+" << char(m_qualityBase) << "]\n"
00193             << "   --output-flags=flags      -O flags      add output flags (-huxmtdaez) [" << m_outputFlags << "]\n"
00194             << "   --batch-size=count        -B count      how many short seqs to map at once [" << m_readsPerRun << "]\n"
00195             << "   --batch-range=min[-max]                 which batches to run [" << m_minRun << "-" << m_maxRun << "]\n"
00196             << "   --NaHSO3=+|-                            subject sequences sodium bisulfite curation [" << (m_sodiumBisulfiteCuration?"yes":"no") << "]\n"
00197 //            << "  -C config     --config-file=file         take parameters from config file section `oligofar' and continue parsing commandline\n"
00198             << "\nGeneral hashing and scanning options:\n"
00199             << "   --strands=1|2|3           -s 1|2|3      hash and lookup for strands (bitmask: 1 for +, 2 for -, 3 for both) [" << m_strands << "]\n"
00200             << "   --hash-bitmap-file=file                 hash bitmap file [" << m_hashBitMask << "]\n"
00201             ;
00202     
00203         cout
00204             << "\nPass-specific hashing and scanning options:\n";
00205         for( unsigned i = 0; i < max( size_t(2), m_passParam.size() ); ++i ) {
00206             cout 
00207                 << "   --pass" << i << "                                 following options will be used for pass " << i;
00208             if( i >= m_passParam.size() ) cout << " [off]\n";
00209             else {
00210                 set<int> skipPos;
00211                 copy( m_passParam[i].GetHashParam().GetSkipPositions().begin(), m_passParam[i].GetHashParam().GetSkipPositions().end(), inserter( skipPos, skipPos.end() ) );
00212                 cout 
00213                     << ":\n"
00214                     << "   --window-size=win[/word]  -w win[/word] hash using window size and word size [" << m_passParam[i].GetHashParam().GetWindowSize() << "/" << m_passParam[i].GetHashParam().GetWordSize() << "]\n"
00215                     << "   --window-skip=pos[,...]   -k pos[,...]  skip read positions when hashing (1-based) [" << Join( ",", skipPos ) << "]\n"
00216                     << "   --window-step=bases       -f bases      step between windows to hash (0 - consecutive) [" << m_passParam[i].GetHashParam().GetWindowStep() << "]\n"
00217                     << "   --window-start=bases      -r bases      start position for first window to hash (1 - default) [" << (m_passParam[i].GetHashParam().GetWindowStart() + 1) << "]\n"
00218                     << "   --stride-size=stride      -S stride     hash with given stride size [" << m_passParam[i].GetHashParam().GetStrideSize() << "]\n"
00219                     << "   --index-bits=bits         -H bits       set number of bits for index part of hash table [" << m_passParam[i].GetHashParam().GetHashBits() << "]\n"   
00220                     << "   --max-windows=count       -N count      hash using maximum number of windows [" << m_passParam[i].GetHashParam().GetWindowCount() << "]\n"
00221                     << "   --input-max-amb=amb       -a amb        maximal number of ambiguities in hash window [" << m_passParam[i].GetHashParam().GetHashMaxAmb() << "]\n"
00222                     << "   --fasta-max-amb=amb       -A amb        maximal number of ambiguities in fasta window [" << m_passParam[i].GetMaxSubjAmb() << "]\n"
00223                     << "   --phrap-cutoff=score      -P score      set maximal phrap score to consider base as ambiguous [" << m_passParam[i].GetPhrapCutoff() << "]\n"
00224                     << "   --max-simplicity=val      -F simpl      low complexity filter cutoff for hash window [" << m_passParam[i].GetHashParam().GetMaxSimplicity() << "]\n"
00225                     << "   --max-mism=mismatch       -n mismatch   hash allowing up to given number of mismatches (0-3) [" << m_passParam[i].GetHashParam().GetHashMismatches() << "]\n"
00226                     << "   --max-indel=len           -e len        hash allowing indel of up to given length (0-2) [" << max( m_passParam[i].GetHashParam().GetHashInsertions(), m_passParam[i].GetHashParam().GetHashDeletions() ) << "]\n"
00227                     << "   --max-ins=len             -j len        hash allowing insertion of up to given length (0-2) [" << m_passParam[i].GetHashParam().GetHashInsertions() << "]\n"
00228                     << "   --max-del=len             -J len        hash allowing deletion  of up to given length (0-2) [" << m_passParam[i].GetHashParam().GetHashDeletions() << "]\n"
00229                     << "   --max-hash-dist=cnt       -E cnt        hash allowing up to given total number of mismatches and indels (0-5) [" << m_passParam[i].GetHashParam().GetHashMaxDistance() << "]\n"
00230                     << "   --indel-pos=pos           -K pos        hash allowing indels only at this position [" << m_passParam[i].GetHashParam().GetHashIndelPosition() << "]\n"
00231                     << "   --indel-dropoff=value     -X value      set longest indel for alignment [" << m_passParam[i].GetAlignParam().GetMaxIndelLength() << "]\n"
00232                     << "   --band-half-width=value   -Y value      set maximal number of consecutive indels of same type for alignment [" << m_passParam[i].GetAlignParam().GetMaxIndelCount() << "]\n"
00233                     << "   --longest-ins=value                     set maximal length for insertions to be reliably found [" << m_passParam[i].GetAlignParam().GetMaxInsertionLength() << "]\n"
00234                     << "   --longest-del=value                     set maximal length for deletions to be reliably found [" << m_passParam[i].GetAlignParam().GetMaxDeletionLength() << "]\n"
00235                     << "   --max-inserted=value                    set maximal number of inserted bases to be allowed [" << m_passParam[i].GetAlignParam().GetMaxInsertionsCount() << "]\n"
00236                     << "   --max-deleted=value                     set maximal number of deleted bases to be allowed [" << m_passParam[i].GetAlignParam().GetMaxDeletionsCount() << "]\n"
00237                     << "   --add-splice=pos([min:]max)             add non-penalized splice site for alignment [" << ReportSplices( i ) << "]\n"
00238                     << "   --pair-distance=min[-max] -D min[-max]  pair distance [" << m_passParam[i].GetMinPair() << "-" << m_passParam[i].GetMaxPair() << "]\n"
00239                     << "   --pair-margin=len         -m dist       pair distance margin [" << m_passParam[i].GetPairMargin() << "]\n"
00240                     ;
00241             }
00242         }
00243         cout
00244             << "\nAlignment and scoring options:\n"
00245             << "   --identity-score=score    -I score      set identity score [" << m_identityScore << "]\n"
00246             << "   --mismatch-score=score    -M score      set mismatch score [" << m_mismatchScore << "]\n"
00247             << "   --gap-opening-score=score -G score      set gap opening score [" << m_gapOpeningScore << "]\n"
00248             << "   --gap-extention-score=val -Q score      set gap extention score [" << m_gapExtentionScore << "]\n"
00249             << "   --extention-dropoff       -x score      the worst penalty possible when extending alignment [" << m_extentionPenaltyDropoff << "]\n"
00250             << "\nFiltering and ranking options:\n"
00251             << "   --min-pctid=pctid         -p pctid      set global percent identity cutoff [" << m_minPctid << "]\n"
00252             << "   --top-count=val           -u topcnt     maximal number of top hits per read [" << m_topCnt << "]\n"
00253             << "   --top-percent=val         -t toppct     maximal score of hit (in % to best) to be reported [" << m_topPct << "]\n"
00254             << "   --geometry=value          -R value      restrictions on relative hit orientation and order for paired hits [" << (m_geometry) << "]\n"
00255             << "\nOther options:\n"
00256             << "   --help=[brief|full|ext]   -h            print help with current parameter values and exit after parsing cmdline\n"
00257             << "   --version                 -V            print version and exit after parsing cmdline\n"
00258             << "   --assert-version=version  -U version    make sure that the oligofar version is what expected [" OLIGOFAR_VERSION "]\n"
00259             << "   --memory-limit=value      -L value      set rlimit for the program (k|M|G suffix is allowed) [" << m_memoryLimit << "]\n"
00260             << "   --test-suite=+|-          -T +|-        turn test suite on/off [" << (m_performTests?"on":"off") << "]\n"
00261             << "\nRelative orientation flags recognized:\n"
00262             << "     p|centripetal|inside|pcr|solexa       reads are oriented so that vectors 5'->3' pointing to each other\n"
00263             << "     f|centrifugal|outside                 reads are oriented so that vectors 5'->3' are pointing outside\n"
00264             << "     i|incr|incremental|solid              reads are on same strand, first preceeds second on this strand\n"
00265             << "     d|decr|decremental                    reads are on same strand, first succeeds second on this strand\n"
00266             << "\nOutput flags (for -O):\n"
00267             << "     -   reset all flags\n"
00268             << "     h   report all hits before ranking\n"
00269             << "     u   report unmapped reads\n"
00270             << "     x   indicate that there are more reads of this rank\n"
00271             << "     m   indicate that there are more reads of lower ranks\n"
00272             << "     t   indicate that there were no more hits\n"
00273             << "     d   report differences between query and subject\n"
00274             << "     e   print empty line after all hits of the read are reported\n"
00275             << "     r   print raw scores rather then relative scores\n"
00276             << "     z   output in SAM 0.1.1 format (other flags are unsupported in this format)\n"
00277             << "Read file data options may be used only in combinations:\n"
00278             << "     1. with column file:\n"
00279             << "        -q0 -i input.col -c no \n"
00280             << "        -q1 -i input.col -c no \n"
00281             << "        -q0 -i input.col -c yes\n"
00282             << "     2. with fasta or fastq files:\n"
00283             << "        -q0 -1 reads1.fa  [-2 reads2.fa]  -c yes|no\n"
00284             << "        -q1 -1 reads1.faq [-2 reads2.faq] -c no\n"
00285             << "     3. with Solexa 4-channel data\n"
00286             << "        -q4 -i input.id -1 reads1.prb [-2 reads2.prb] -c no\n"
00287             << "\nNB: although -L flag is optional, it is strongly recommended to use it!\n"
00288             ;
00289     }
00290     if( flags & fExtended ) 
00291         cout << "\nExtended options:\n"
00292              << "   --min-block-length=bases   Length for subject sequence to be scanned at once [" << m_minBlockLength << "]\n"
00293              ;
00294 }
00295 
00296 const option * COligoFarApp::GetLongOptions() const
00297 {
00298     static struct option opt[] = {
00299         {"help", 2, 0, 'h'},
00300         {"version", 0, 0, 'V'},
00301         {"assert-version", 1, 0, 'U'},
00302         {"window-size", 1, 0, 'w'},
00303         {"window-skip",1,0,'k'},
00304         {"window-step",1,0,'f'},
00305         {"window-start",1,0,'r'},
00306         {"max-windows",1,0,'N'},
00307         {"max-mism", 1, 0, 'n'},
00308         {"max-indel", 1, 0, 'e'},
00309         {"max-ins", 1, 0, 'j'},
00310         {"max-del", 1, 0, 'J'},
00311         {"max-hash-dist", 1, 0, 'E'},
00312         {"indel-pos", 1, 0, 'K'},
00313         {"input-max-amb", 1, 0, 'a'},
00314         {"fasta-max-amb", 1, 0, 'A'},
00315         {"colorspace", 1, 0, 'c'},
00316         {"NaHSO3", 1, 0, kLongOpt_NaHSO3},
00317         {"input-file", 1, 0, 'i'},
00318         {"fasta-file", 1, 0, 'd'},
00319         {"snpdb-file", 1, 0, 'b'},
00320         {"guide-file", 1, 0, 'g'},
00321         {"feat-file", 1, 0, 'v'},
00322         {"output-file", 1, 0, 'o'},
00323         {"output-flags", 1, 0, 'O'},
00324 //        {"config-file", 1, 0, 'C'},
00325         {"only-seqid", 1, 0, 'y'},
00326         {"gi-list", 1, 0, 'l'},
00327         {"strands", 1, 0, 's'},
00328         {"hash-bitmap-file", 1, 0, kLongOpt_hashBitMask },
00329         {"batch-size", 1, 0, 'B'},
00330         {"batch-range", 1, 0, kLongOpt_batchRange },
00331         {"guide-max-mism", 1, 0, 'x'},
00332         {"min-pctid", 1, 0, 'p'},
00333         {"top-count", 1, 0, 'u'},
00334         {"top-percent", 1, 0, 't'},
00335         {"read-1-file", 1, 0, '1'},
00336         {"read-2-file", 1, 0, '2'},
00337         {"quality-channels", 1, 0, 'q'},
00338         {"quality-base", 1, 0, '0'},
00339         {"phrap-score", 1, 0, 'P'},
00340         {"pair-distance", 1, 0, 'D'},
00341         {"pair-margin", 1, 0, 'm'},
00342         {"geometry", 1, 0, 'R'},
00343         {"max-simplicity", 1, 0, 'F'},
00344         {"identity-score", 1, 0, 'I'},
00345         {"mismatch-score", 1, 0, 'M'},
00346         {"gap-opening-score", 1, 0, 'G'},
00347         {"gap-extention-score", 1, 0, 'Q'},
00348         {"extention-dropoff", 1, 0, 'x'},
00349         {"indel-dropoff", 1, 0, 'X'},
00350         {"band-half-width", 1, 0, 'Y'},
00351         {"longest-ins", 1, 0, kLongOpt_maxInsertion},
00352         {"longest-del", 1, 0, kLongOpt_maxDeletion},
00353         {"max-inserted", 1, 0, kLongOpt_maxInsertions},
00354         {"max-deleted", 1, 0,  kLongOpt_maxDeletions},
00355         {"add-splice", 1, 0, kLongOpt_addSplice},
00356         {"memory-limit", 1, 0, 'L'},
00357         {"test-suite", 1, 0, 'T'},
00358         {"index-bits", 1, 0, 'H'},
00359         {"pass0", 0, 0, kLongOpt_pass0},
00360         {"pass1", 0, 0, kLongOpt_pass1},
00361         {"min-block-length", 1, 0, kLongOpt_min_block_length },
00362         {0,0,0,0}
00363     };
00364     return opt;
00365 }
00366 
00367 const char * COligoFarApp::GetOptString() const
00368 {
00369     return "U:H:S:w:N:f:r:k:n:e:E:j:J:K:a:A:c:i:d:b:v:g:o:O:l:y:s:B:x:p:u:t:1:2:q:0:P:m:D:R:F:I:M:G:Q:X:Y:L:T:";
00370 }
00371 
00372 int COligoFarApp::ParseArg( int opt, const char * arg, int longindex )
00373 {
00374     switch( opt ) {
00375     case kLongOpt_hashBitMask: m_hashBitMask = arg; break;
00376     case kLongOpt_min_block_length: m_minBlockLength = NStr::StringToInt( arg ); break;
00377     case kLongOpt_NaHSO3: m_sodiumBisulfiteCuration = *arg == '+' ? true : *arg == '-' ? false : NStr::StringToBool( arg ); break;
00378     case kLongOpt_pass0: m_hashPass = 0; break;
00379     case kLongOpt_pass1: if( m_passParam.size() < 2 ) m_passParam.push_back( m_passParam.back() ); m_hashPass = 1; break;
00380     case kLongOpt_maxInsertion: m_passParam[m_hashPass].SetAlignParam().SetMaxInsertionLength( abs( NStr::StringToInt( arg ) ) ); break;
00381     case kLongOpt_maxDeletion:  m_passParam[m_hashPass].SetAlignParam().SetMaxDeletionLength( abs( NStr::StringToInt( arg ) ) ); break;
00382     case kLongOpt_maxInsertions: m_passParam[m_hashPass].SetAlignParam().SetMaxInsertionCount( abs( NStr::StringToInt( arg ) ) ); break;
00383     case kLongOpt_maxDeletions:  m_passParam[m_hashPass].SetAlignParam().SetMaxDeletionCount( abs( NStr::StringToInt( arg ) ) ); break;
00384     case kLongOpt_addSplice: AddSplice( arg ); break;
00385     case kLongOpt_batchRange: ParseRange( m_minRun, m_maxRun, arg, "-" ); break;
00386     case 'U': if( strcmp( arg, OLIGOFAR_VERSION ) ) THROW( runtime_error, "Expected oligofar version " << arg << ", called " OLIGOFAR_VERSION ); break;
00387 //    case 'C': ParseConfig( arg ); break;
00388     case 'k': 
00389         do {
00390             list<string> x;
00391             Split( arg, ",", back_inserter( x ) );
00392             CHashParam::TSkipPositions& sp = m_passParam[m_hashPass].SetHashParam().SetSkipPositions();
00393             ITERATE( list<string>, t, x ) sp.insert( sp.end(), NStr::StringToInt( *t ) );
00394         } while(0);
00395         break;
00396     case 'w': 
00397         do {
00398             int win, word;
00399             ParseRange( win, word, arg, "/" ); 
00400             m_passParam[m_hashPass].SetHashParam().SetWindowSize( win );
00401             m_passParam[m_hashPass].SetHashParam().SetWordSize( word );
00402         } while(0);
00403         break;
00404     case 'N': m_passParam[m_hashPass].SetHashParam().SetWindowCount( NStr::StringToInt( arg ) ); break;
00405     case 'f': m_passParam[m_hashPass].SetHashParam().SetWindowStep( abs( NStr::StringToInt( arg ) ) ); break;
00406     case 'r': m_passParam[m_hashPass].SetHashParam().SetWindowStart( abs( NStr::StringToInt( arg ) - 1 ) ); break;
00407     case 'H': m_passParam[m_hashPass].SetHashParam().SetHashBits( NStr::StringToInt( arg ) ); break;
00408     case 'S': m_passParam[m_hashPass].SetHashParam().SetStrideSize( NStr::StringToInt( arg ) ); break;
00409     case 'n': m_passParam[m_hashPass].SetHashParam().SetHashMismatches( NStr::StringToInt( arg ) ); break;
00410     case 'e': m_passParam[m_hashPass].SetHashParam().SetHashIndels( NStr::StringToInt( arg ) ); break;
00411     case 'j': m_passParam[m_hashPass].SetHashParam().SetHashInsertions( NStr::StringToInt( arg ) ); break;
00412     case 'J': m_passParam[m_hashPass].SetHashParam().SetHashDeletions( NStr::StringToInt( arg ) ); break;
00413     case 'E': m_passParam[m_hashPass].SetHashParam().SetMaxHashDistance( NStr::StringToInt( arg ) ); break;
00414     case 'K': m_passParam[m_hashPass].SetHashParam().SetHashIndelPosition( NStr::StringToInt( arg ) ); break;
00415     case 'a': m_passParam[m_hashPass].SetHashParam().SetHashMaxAmb( strtol( arg, 0, 10 ) ); break;
00416     case 'A': m_passParam[m_hashPass].SetMaxSubjAmb() = strtol( arg, 0, 10 ); break;
00417     case 'c': m_colorSpace = *arg == '+' ? true : *arg == '-' ? false : NStr::StringToBool( arg ); break;
00418     case 'i': m_readFile = arg; break;
00419     case 'd': m_fastaFile = arg; break;
00420     case 'b': m_snpdbFile = arg; break;
00421     case 'g': m_guideFile = arg; break;
00422     case 'v': m_featFile = arg; break;
00423     case 'o': m_outputFile = arg; break;
00424     case 'O': m_outputFlags += arg; if( const char * m = strrchr( m_outputFlags.c_str(), '-' ) ) m_outputFlags = m + 1; break;
00425     case 'l': m_gilistFile = arg; break;
00426     case 'y': m_seqIds.push_back( CSeq_id( arg ).AsFastaString() ); break;
00427     case 's': m_strands = strtol( arg, 0, 10 ); break;
00428     case 'B': m_readsPerRun = strtol( arg, 0, 10 ); break;
00429 //    case 'x': m_guideFilemaxMismatch = strtol( arg, 0, 10 ); break;
00430     case 'p': m_minPctid = NStr::StringToDouble( arg ); break;
00431     case 'u': m_topCnt = strtol( arg, 0, 10 ); break;
00432     case 't': m_topPct = NStr::StringToDouble( arg ); break;
00433     case '1': m_read1qualityFile = arg; break;
00434     case '2': m_read2qualityFile = arg; break;
00435     case 'q': m_qualityChannels = NStr::StringToInt( arg ); break;
00436     case '0': m_qualityBase = ( arg[0] && arg[0] == '+' ) ? arg[1] : NStr::StringToInt( arg ); break;
00437     case 'D': ParseRange( m_passParam[m_hashPass].SetMinPair(), m_passParam[m_hashPass].SetMaxPair(), arg ); break;
00438     case 'm': m_passParam[m_hashPass].SetPairMargin() = strtol( arg, 0, 10 ); break;
00439     case 'P': m_passParam[m_hashPass].SetPhrapCutoff() = strtol( arg, 0, 10 ); break;
00440     case 'R': m_geometry = arg; break;
00441     case 'F': m_passParam[m_hashPass].SetHashParam().SetMaxSimplicity( NStr::StringToDouble( arg ) ); break;
00442     case 'X': m_passParam[m_hashPass].SetAlignParam().SetMaxIndelLength( abs( strtol( arg, 0, 10 ) ) ); break;
00443     case 'Y': m_passParam[m_hashPass].SetAlignParam().SetMaxIndelCount( abs( strtol( arg, 0, 10 ) ) ); break;
00444     case 'I': m_identityScore = fabs( NStr::StringToDouble( arg ) ); break;
00445     case 'M': m_mismatchScore = -fabs( NStr::StringToDouble( arg ) ); break;
00446     case 'G': m_gapOpeningScore = -fabs( NStr::StringToDouble( arg ) ); break;
00447     case 'Q': m_gapExtentionScore = -fabs( NStr::StringToDouble( arg ) ); break;
00448     case 'x': m_extentionPenaltyDropoff = -fabs( NStr::StringToDouble( arg ) ); break; 
00449     case 'L':
00450 #ifndef _WIN32
00451         do {
00452             char * t = 0;
00453             m_memoryLimit = strtoll( arg, &t, 10 );
00454             if( t ) {
00455                 switch( tolower(*t) ) {
00456                 case 'g': m_memoryLimit *= 1024;
00457                 case 'm': m_memoryLimit *= 1024;
00458                 case 'k': m_memoryLimit *= 1024;
00459                 }
00460             }
00461         } while(0);
00462 #else
00463         cerr << "[" << GetProgramBasename() << "] Warning: -L is ignored in win32\n";
00464 #endif
00465         break;
00466     case 'T': m_performTests = (*arg == '+') ? true : (*arg == '-') ? false : NStr::StringToBool( arg ); break;
00467     default: return CApp::ParseArg( opt, arg, longindex );
00468     }
00469     return 0;
00470 }
00471 
00472 void COligoFarApp::ParseConfig( const string& cfg ) 
00473 {
00474     ifstream in( cfg.c_str() );
00475     if( !in.good() ) THROW( runtime_error, "Failed to read config file " << cfg );
00476     CNcbiRegistry reg( in );
00477     ParseConfig( &reg );
00478 }
00479 
00480 void COligoFarApp::ParseConfig( IRegistry * reg )
00481 {
00482     const option * opts = GetLongOptions();
00483     for( int index = 0 ; opts && opts->name != 0 ; ++opts, ++index ) {
00484         if( reg->HasEntry( "oligofar", opts->name ) ) {
00485             ParseArg( opts->val, reg->Get( "oligofar", opts->name ).c_str(), index );
00486         }
00487     }
00488 }
00489 
00490 int COligoFarApp::RunTestSuite()
00491 {
00492     if( m_performTests ) {
00493         if( int rc = TestSuite() ) {
00494             cerr << "[" << GetProgramBasename() << "] internal tests failed: " << rc << endl;
00495             return rc;
00496         } else {
00497             cerr << "[" << GetProgramBasename() << "] internal tests succeeded!\n";
00498         }
00499     }
00500     return 0;
00501 }
00502 
00503 int COligoFarApp::SetLimits()
00504 {
00505 #ifndef _WIN32
00506     if( m_memoryLimit ) {
00507         struct rlimit rl;
00508         rl.rlim_cur = m_memoryLimit;
00509         rl.rlim_max = RLIM_INFINITY;
00510         cerr << "[" << GetProgramBasename() << "] Setting memory limit to " << m_memoryLimit << ": ";
00511         errno = 0;
00512         int rc = setrlimit( RLIMIT_AS, &rl );
00513         cerr << "\b\b failed: " << strerror( errno ) << endl;
00514         return rc; 
00515     } else 
00516         cerr << "[" << GetProgramBasename() << "] Memory limit is not set.\n";
00517 #else
00518     cerr << "[" << GetProgramBasename() << "] Setting memory limit is not implemented for win32 yet, ignored.\n";
00519 #endif
00520     return 0;
00521 }
00522 
00523 int COligoFarApp::Execute()
00524 {
00525     if( int rc = RunTestSuite() ) return rc;
00526     //if( int rc = SetLimits() ) return rc;
00527     SetLimits();
00528     return ProcessData();
00529 }
00530 
00531 int COligoFarApp::GetOutputFlags() 
00532 {
00533     int oflags = 0;
00534     for( const char * f = m_outputFlags.c_str(); *f; ++f ) {
00535         switch( tolower(*f) ) {
00536         case '-': oflags = 0; break;
00537         case 'z': m_outputSam = true; break;
00538         case 'e': oflags |= COutputFormatter::fReportEmptyLines; break;
00539         case 'u': oflags |= COutputFormatter::fReportUnmapped; break;
00540         case 'm': oflags |= COutputFormatter::fReportMany; break;
00541         case 'x': oflags |= COutputFormatter::fReportMore; break;
00542         case 't': oflags |= COutputFormatter::fReportTerminator; break;
00543         case 'd': oflags |= COutputFormatter::fReportDifferences; break;
00544         case 'h': oflags |= COutputFormatter::fReportAllHits; break;
00545         case 'r': oflags |= COutputFormatter::fReportRawScore; break;
00546         default: cerr << "[" << GetProgramBasename() << "] Warning: unknown format flag `" << *f << "'\n"; break;
00547         }
00548     }
00549     return oflags;
00550 }
00551 
00552 void COligoFarApp::SetupGeometries( map<string,int>& geometries )
00553 {
00554     geometries.insert( make_pair("p",CFilter::eCentripetal) );
00555     geometries.insert( make_pair("centripetal",CFilter::eCentripetal) );
00556     geometries.insert( make_pair("inside",CFilter::eCentripetal) );
00557     geometries.insert( make_pair("pcr",CFilter::eCentripetal) );
00558     geometries.insert( make_pair("solexa",CFilter::eCentripetal) );
00559     geometries.insert( make_pair("f",CFilter::eCentrifugal) );
00560     geometries.insert( make_pair("centrifugal",CFilter::eCentrifugal) );
00561     geometries.insert( make_pair("outside",CFilter::eCentrifugal) );
00562     geometries.insert( make_pair("i",CFilter::eIncremental) );
00563     geometries.insert( make_pair("incr",CFilter::eIncremental) );
00564     geometries.insert( make_pair("incremental",CFilter::eIncremental) );
00565     geometries.insert( make_pair("solid",CFilter::eIncremental) );
00566     geometries.insert( make_pair("d",CFilter::eDecremental) );
00567     geometries.insert( make_pair("decr",CFilter::eDecremental) );
00568     geometries.insert( make_pair("decremental",CFilter::eDecremental) );
00569 }
00570 
00571 string COligoFarApp::ReportSplices( int i ) const 
00572 {
00573     ostringstream o;
00574     o << Join( ",", m_passParam[i].GetAlignParam().GetSpliceSet().begin(), m_passParam[i].GetAlignParam().GetSpliceSet().end() );
00575     return o.str();
00576 }
00577 
00578 void COligoFarApp::AddSplice( const char* arg ) 
00579 {
00580     m_passParam[m_hashPass].SetAlignParam().AddSplice( CIntron( arg ) );
00581 }
00582 
00583 int COligoFarApp::ProcessData()
00584 {
00585     for( unsigned p = 0; p < m_passParam.size(); ++p ) {
00586         string msg;
00587         if( ! m_passParam[p].GetHashParam().ValidateParam( msg ) ) {
00588             cerr << "Incompatible set of hash parameters for pass" << p 
00589                  << ( m_passParam[p].GetHashParam().GetSkipPositions().size() ? " with skip ppositions:" : ":" ) 
00590                  << msg << "\n";
00591             return EX_USAGE;
00592         }
00593     }
00594 
00595     if( m_readFile == "-" ) m_readFile = "/dev/stdin";
00596     if( m_fastaFile == "-" ) m_fastaFile = "/dev/stdin";
00597     if( m_guideFile == "-" ) m_guideFile = "/dev/stdin";
00598     if( m_outputFile == "-" ) m_outputFile = "/dev/stdout";
00599 
00600     CReaderFactory rfactory;
00601     rfactory.SetColorspace( m_colorSpace );
00602     rfactory.SetQualityChannels( m_qualityChannels );
00603     rfactory.SetReadIdFile( m_readFile );
00604     rfactory.SetReadDataFile1( m_read1qualityFile );
00605     rfactory.SetReadDataFile2( m_read2qualityFile );
00606     auto_ptr<IShortReader> qreader( rfactory.CreateReader() );
00607     
00608     if( qreader.get() == 0 ) {
00609         cerr << "Failed to understand input specifications: incompatible set of parameters (see oligofar -h).\n";
00610         return EX_USAGE;
00611     }
00612 
00613     ofstream o( m_outputFile.c_str() );
00614     if( m_readFile == "/dev/stdin" ) cerr << "* Notice: expecting read tab-separated data on STDIN\n";
00615     if( m_guideFile == "/dev/stdin" ) cerr << "* Notice: expecting guide data on STDIN\n";
00616     if( m_fastaFile == "/dev/stdin" ) cerr << "* Notice: expecting reference sequence data on STDIN\n";
00617 
00618     map<string,int> geometries;
00619     SetupGeometries( geometries );
00620 
00621     if( geometries.find( m_geometry ) == geometries.end() ) {
00622         THROW( runtime_error, "Unknown geometry `" << m_geometry << "'" );
00623     }
00624 
00625     for( vector<CPassParam>::iterator p = m_passParam.begin(); p != m_passParam.end(); ++p ) {
00626         p->SetHashParam().Validate( cerr, "[" + string( GetProgramBasename() ) + "] " );
00627     }
00628 
00629     CSeqIds seqIds;
00630     CFilter filter;
00631     CSeqVecProcessor seqVecProcessor;
00632     CSeqScanner seqScanner;
00633 
00634     CFeatMap featMap;
00635     if( m_featFile.length() ) {
00636         featMap.SetSeqIds( &seqIds );
00637         featMap.ReadFile( m_featFile );
00638         seqScanner.SetFeatMap( &featMap );
00639     }
00640 
00641     auto_ptr<AOutputFormatter> formatter( 0 );
00642     int oflags = GetOutputFlags();
00643     if( m_outputSam ) formatter.reset( new CSamFormatter( o, seqIds ) );
00644     else {
00645         COutputFormatter * of = new COutputFormatter( o, seqIds );
00646         formatter.reset( of );
00647         of->AssignFlags( oflags );
00648         of->SetTopCount( m_topCnt );
00649         of->SetTopPct( m_topPct );
00650     }
00651     // o-lala!!! here something should be changed....
00652 
00653     CQueryHash queryHash( m_passParam[0].GetHashParam().GetHashMismatches(), m_passParam[0].GetHashParam().GetHashMaxAmb() ); 
00654     CScoreParam scoreParam( m_identityScore, m_mismatchScore, m_gapOpeningScore, m_gapExtentionScore );
00655     CScoringFactory scoringFactory( &scoreParam, &m_passParam[0].GetAlignParam() );
00656     CAligner aligner( &scoringFactory );
00657     CGuideSamFile guideFile( m_guideFile, filter, seqIds, scoreParam );
00658     CBatch batch;
00659     
00660     batch.SetReadCount( m_readsPerRun );
00661     batch.SetFastaFile( m_fastaFile );
00662     batch.SetQueryHash( &queryHash );
00663     batch.SetSeqVecProcessor( &seqVecProcessor );
00664     batch.SetSeqScanner( &seqScanner );
00665     batch.SetFilter( &filter );
00666     batch.SetOutputFormatter( formatter.get() );
00667     batch.SetScoringFactory( &scoringFactory );
00668     batch.SetPassParam( &m_passParam );
00669     batch.SetRange( m_minRun, m_maxRun );
00670 
00671     CSnpDb snpDb( CSnpDb::eSeqId_integer );
00672     if( m_snpdbFile.length() ) {
00673         snpDb.Open( m_snpdbFile, CBDB_File::eReadOnly );
00674         seqScanner.SetSnpDb( &snpDb );
00675     }
00676     
00677 //     guideFile.SetMismatchPenalty( scoreParam );
00678 //     guideFile.SetMaxMismatch( m_guideFilemaxMismatch );
00679 
00680     formatter->SetGuideFile( guideFile );
00681 
00682     filter.SetGeometry( geometries[m_geometry] );
00683     filter.SetAligner( &aligner );
00684     filter.SetSeqIds( &seqIds );
00685     filter.SetTopPct( m_topPct );
00686     filter.SetTopCnt( m_topCnt );
00687     filter.SetScorePctCutoff( m_minPctid );
00688     filter.SetOutputFormatter( formatter.get() );
00689 
00690     if( m_minPctid > m_topPct ) {
00691         cerr << "[" << GetProgramBasename() << "] Warning: top% is greater then %cutoff ("
00692              << m_topPct << " < " << m_minPctid << ")\n";
00693     }
00694 
00695     aligner.SetSubjectCoding( CSeqCoding::eCoding_ncbi8na );
00696     aligner.SetQueryStrand( CSeqCoding::eStrand_pos );
00697     aligner.SetExtentionPenaltyDropoff( m_extentionPenaltyDropoff );
00698 
00699 //    for( TSkipPositions::iterator i = m_skipPositions.begin(); i != m_skipPositions.end(); ++i ) --*i; // 1-based to 0-based
00700     auto_ptr<CBitmaskAccess> bitmaskAccess( 0 );
00701     if( m_hashBitMask.size() ) bitmaskAccess.reset( new CBitmaskAccess( m_hashBitMask ) );
00702 
00703     queryHash.SetStrands( m_strands );
00704     queryHash.SetNaHSO3mode( m_sodiumBisulfiteCuration );
00705     queryHash.SetHashWordBitmask( bitmaskAccess.get() );
00706 
00707     seqScanner.SetFilter( &filter );
00708     seqScanner.SetQueryHash( &queryHash );
00709     seqScanner.SetSeqIds( &seqIds );
00710     seqScanner.SetMinBlockLength( m_minBlockLength );
00711     seqScanner.SetInputChunk( batch.GetInputChunk() );
00712 
00713     seqVecProcessor.SetTargetCoding( m_colorSpace ? CSeqCoding::eCoding_colorsp : CSeqCoding::eCoding_ncbi8na );
00714     seqVecProcessor.AddCallback( 1, &filter );
00715     seqVecProcessor.AddCallback( 0, &seqScanner );
00716 
00717     if( !ValidateSplices( queryHash ) ) return EX_USAGE;
00718 
00719     if( m_seqIds.size() ) seqVecProcessor.SetSeqIdList( m_seqIds );
00720     if( m_gilistFile.length() ) seqVecProcessor.SetGiListFile( m_gilistFile );
00721 
00722     Uint8 queriesTotal = 0;
00723     Uint8 entriesTotal = 0;
00724 
00725     CProgressIndicator p( "Reading input data", "lines" );
00726     batch.SetReadProgressIndicator( &p );
00727     batch.Start();
00728 
00729     for( int count = 0; (!batch.Done()) && qreader->NextRead(); ++count ) {
00730 
00731         CQuery * query = new CQuery( 
00732                 qreader->GetSeqCoding(), 
00733                 qreader->GetReadId(), 
00734                 qreader->GetReadData(0), 
00735                 qreader->GetReadData(1), 
00736                 m_qualityBase );
00737         /*
00738         ITERATE( TSkipPositions, k, m_skipPositions ) {
00739             query->MarkPositionAmbiguous( 0, *k - 1 );
00740             query->MarkPositionAmbiguous( 1, *k - 1 );
00741         }
00742         */
00743         query->ComputeBestScore( &scoreParam, 0 );
00744         query->ComputeBestScore( &scoreParam, 1 );
00745         while( guideFile.NextHit( queriesTotal, query ) ); // add guided hits
00746         entriesTotal += batch.AddQuery( query );
00747         queriesTotal ++;
00748         p.Increment();
00749     }
00750     batch.Purge();
00751     batch.SetReadProgressIndicator( 0 );
00752     p.Summary();
00753     cerr << "Queries processed: " << queriesTotal << " (" << entriesTotal << " hash entries)\n";
00754     cerr << "Memory usage:\n"
00755          << "  hits  left: " << CHit::GetCount() << "\n"
00756          << "queries left: " << CQuery::GetCount() << "\n";
00757 
00758     return 0;
00759 }
00760 
00761 bool COligoFarApp::ValidateSplices( CQueryHash& queryHash )
00762 {
00763     /*
00764     int wlen =    queryHash.ComputeHasherWindowLength();
00765     int wstep =   queryHash.GetHasherWindowStep();
00766     int wcnt =    queryHash.GetMaxWindowCount();
00767     int wstart =  queryHash.GetWindowStart();
00768     CIntron::TSet::const_iterator sp = m_intronSet.begin();
00769     for( int wp = wstart, wn = 0; wn != wcnt && sp != m_intronSet.end(); ++wn, (wp += wstep) ) {
00770         while( sp->GetPos() < wp ) if( ++sp == m_intronSet.end() ) return true;
00771         int sc = 0;
00772         for( CIntron::TSet::const_iterator so = sp; so != m_intronSet.end(); ++so ) {
00773             if( so->GetPos() >= wp + wlen ) break;
00774             ++sc;
00775         }
00776         if( sc > 1 ) {
00777             cerr << "[" << GetProgramBasename() << "] ERROR: window " << wn 
00778                 << " [" << wp << ".." << (wp + wlen - 1) 
00779                 << "] has more then one (" << sc << ") splices defined, which oligoFAR can't handle. "
00780                 << "Please consider using different window size, window step, window start or splice definitions.";
00781             return false;
00782         }
00783     }
00784     */
00785     return true;
00786 }
00787 
00788 
00789 

Generated on Sun Dec 6 22:21:06 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Mon Dec 07 16:20:55 2009 by modify_doxy.py rev. 173732