NCBI C Toolkit Cross Reference

C/demo/blast_driver.c


  1 /* $Id: blast_driver.c,v 1.129 2007/10/23 16:03:23 madden Exp $
  2 * ===========================================================================
  3 *
  4 *                            PUBLIC DOMAIN NOTICE
  5 *               National Center for Biotechnology Information
  6 *
  7 *  This software/database is a "United States Government Work" under the
  8 *  terms of the United States Copyright Act.  It was written as part of
  9 *  the author's offical duties as a United States Government employee and
 10 *  thus cannot be copyrighted.  This software/database is freely available
 11 *  to the public for use. The National Library of Medicine and the U.S.
 12 *  Government have not placed any restriction on its use or reproduction.
 13 *
 14 *  Although all reasonable efforts have been taken to ensure the accuracy
 15 *  and reliability of the software and data, the NLM and the U.S.
 16 *  Government do not and cannot warrant the performance or results that
 17 *  may be obtained by using this software or data. The NLM and the U.S.
 18 *  Government disclaim all warranties, express or implied, including
 19 *  warranties of performance, merchantability or fitness for any particular
 20 *  purpose.
 21 *
 22 *  Please cite the author in any work or product based on this material.
 23 *
 24 * ===========================================================================*/
 25 
 26 /*****************************************************************************
 27 
 28 File name: blast_driver.c
 29 
 30 Author: Ilya Dondoshansky
 31 
 32 Contents: Main function for running BLAST
 33 
 34 ******************************************************************************
 35  * $Revision: 1.129 $
 36  * */
 37 
 38 static char const rcsid[] = "$Id: blast_driver.c,v 1.129 2007/10/23 16:03:23 madden Exp $";
 39 
 40 #include <ncbi.h>
 41 #include <sqnutils.h>
 42 #include <readdb.h>
 43 #include <algo/blast/core/blast_options.h>
 44 #include <algo/blast/api/blast_seq.h>
 45 #include <algo/blast/api/blast_input.h>
 46 #include <algo/blast/api/blast_format.h>
 47 #include <algo/blast/api/blast_seqalign.h>
 48 #include <algo/blast/api/blast_tabular.h>
 49 #include <algo/blast/api/blast_api.h>
 50 #include <algo/blast/api/repeats_filter.h>
 51 
 52 #define NUMARG (sizeof(myargs)/sizeof(myargs[0]))
 53 
 54 typedef enum {
 55    ARG_PROGRAM = 0,
 56    ARG_DB,
 57    ARG_QUERY,
 58    ARG_QUERY_LOC,
 59    ARG_SUBJECT,
 60    ARG_SUBJECT_LOC,
 61    ARG_STRAND,
 62    ARG_GENCODE,
 63    ARG_DBGENCODE,
 64    ARG_FILTER,
 65    ARG_LCASE,
 66    ARG_LOOKUP,
 67    ARG_MATRIX,
 68    ARG_MISMATCH,
 69    ARG_MATCH,
 70    ARG_WORDSIZE,
 71    ARG_TEMPL_LEN,
 72    ARG_TEMPL_TYPE,
 73    ARG_EVERYBASE,
 74    ARG_PHI,
 75    ARG_THRESHOLD,
 76    ARG_WINDOW,
 77    ARG_XDROP_UNGAPPED,
 78    ARG_UNGAPPED,
 79    ARG_GREEDY,
 80    ARG_GAPOPEN,
 81    ARG_GAPEXT,
 82    ARG_FRAMESHIFT,
 83    ARG_XDROP,
 84    ARG_XDROP_FINAL,
 85    ARG_EVALUE,
 86    ARG_SEARCHSP,
 87    ARG_PERC_IDENT,
 88    ARG_INTRON,
 89    ARG_DESCRIPTIONS,
 90    ARG_CULLING,
 91    ARG_ALIGNMENTS,
 92    ARG_OUT,
 93    ARG_FORMAT,
 94    ARG_HTML,
 95    ARG_TABULAR,
 96    ARG_THREADS,
 97    ARG_SHOWGI,
 98    ARG_ACCESSION,
 99    ARG_COMP_BASED_STATS,
100    ARG_SMITH_WATERMAN
101 } BlastArguments;
102 
103 static Args myargs[] = {
104    { "Program Name",           /* ARG_PROGRAM */
105       NULL, NULL, NULL, FALSE, 'p', ARG_STRING, 0.0, 0, NULL},
106    { "Database name (if not set, second sequence FASTA must be provided)",
107      NULL, NULL, NULL, TRUE, 'd', ARG_STRING, 0.0, 0, NULL}, /* ARG_DB */
108    { "Query File",               /* ARG_QUERY */
109      "stdin", NULL, NULL, FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
110    { "Query location offsets; format: \"start stop\";\n"
111      "Applies only if query file contains 1 sequence", /* ARG_QUERY_LOC */
112      NULL, NULL, NULL, TRUE, 'I', ARG_STRING, 0.0, 0, NULL},
113    { "Subject File (for sequence sets comparison)", /* ARG_SUBJECT */
114      "stdin", NULL, NULL, FALSE, 'j', ARG_FILE_IN, 0.0, 0, NULL},
115    { "Subject location offsets; format: \"start stop\";\n"/* ARG_SUBJECT_LOC */
116      "Applies only if subject file (-j) contains 1 sequence",
117      NULL, NULL, NULL, TRUE, 'J', ARG_STRING, 0.0, 0, NULL},
118    { "Query strands to search against database: 0 or 3 is both, 1 is top, "
119      "2 is bottom", /* ARG_STRAND */
120      "0", NULL, NULL, FALSE, 'S', ARG_INT, 0.0, 0, NULL},
121    { "Genetic code for translation of the query sequence", /* ARG_GENCODE */
122      "0", NULL, NULL, FALSE, 'Q', ARG_INT, 0.0, 0, NULL},
123    { "Genetic code for translation of the database", /* ARG_DBGENCODE */
124      "0", NULL, NULL, FALSE, 'D', ARG_INT, 0.0, 0, NULL},
125    { "Filter query sequence (DUST with blastn, SEG with others)", /* ARG_FILTER */
126      "T", NULL, NULL, FALSE, 'F', ARG_STRING, 0.0, 0, NULL},
127    { "Mask lower case",
128      "F", NULL, NULL, FALSE, 'c', ARG_BOOLEAN, 0.0, 0, NULL}, /* ARG_LCASE */
129    { "Use (classical Mega BLAST) lookup table with width 12", 
130      "F", NULL, NULL, FALSE, 'L', ARG_BOOLEAN, 0.0, 0, NULL},/* ARG_LOOKUP */
131    { "Matrix",                 /* ARG_MATRIX */
132      "BLOSUM62", NULL, NULL, FALSE, 'M', ARG_STRING, 0.0, 0, NULL},
133    { "Penalty for a nucleotide mismatch (blastn only)", /* ARG_MISMATCH */
134      "-3", NULL, NULL, FALSE, 'q', ARG_INT, 0.0, 0, NULL},
135    { "Reward for a nucleotide match (blastn only)", /* ARG_MATCH */
136      "1", NULL, NULL, FALSE, 'r', ARG_INT, 0.0, 0, NULL},
137    { "Word size, default if 0 (blastn 11, others 3) ", /* ARG_WORDSIZE */  
138      "0", NULL, NULL, FALSE, 'W', ARG_INT, 0.0, 0, NULL}, 
139    { "Length of a discontiguous word template (contiguous word if 0)",
140      "0", NULL, NULL, FALSE, 't', ARG_INT, 0.0, 0, NULL}, /* ARG_TEMPL_LEN */
141    { "Type of a discontiguous word template (0 - coding, 1 - optimal, "
142      "2 - two simultaneous", /* ARG_TEMPL_TYPE */
143      "0", NULL, NULL, FALSE, 'T', ARG_INT, 0.0, 0, NULL},
144    {"Generate words for every base of the database; this option is ignored",
145         "F", NULL, NULL, TRUE, 's', ARG_BOOLEAN, 0.0, 0, NULL},    /* ARG_EVERYBASE */
146    { "Pattern for PHI BLAST",
147      NULL, NULL, NULL, TRUE, 'k', ARG_STRING, 0.0, 0, NULL}, /* ARG_PHI */
148    { "Threshold for extending hits, default if zero\n" /* ARG_THRESHOLD */
149      "      blastp 11, blastn 0, blastx 12, tblastn 13\n"
150      "      tblastx 13, megablast 0",
151      "0", NULL, NULL, FALSE, 'f', ARG_FLOAT, 0.0, 0, NULL},
152    { "Window size (max. allowed distance between a pair of initial hits;\n"
153      "      0 causes default behavior, -1 turns off multiple hits)", 
154      "0", NULL, NULL, FALSE, 'w', ARG_INT, 0.0, 0, NULL}, /* ARG_WINDOW */
155    { "X dropoff value for ungapped extensions in bits (0 invokes default "
156      "behavior)\n      blastn 20, others 7",/*ARG_XDROP_UNGAPPED*/
157       "0", NULL, NULL, FALSE, 'y', ARG_INT, 0.0, 0, NULL},
158    { "Do only ungapped alignment (always TRUE for tblastx)",/*ARG_UNGAPPED*/
159      "F", NULL, NULL, FALSE, 'u', ARG_BOOLEAN, 0.0, 0, NULL},
160    { "Use greedy algorithm for gapped extensions", /* ARG_GREEDY */
161      "F", NULL, NULL, FALSE, 'g', ARG_BOOLEAN, 0.0, 0, NULL},
162    { "Gap open penalty (-1 means default: non-affine if greedy; 5 if dyn. prog.)", 
163      "-1", NULL, NULL, FALSE, 'G', ARG_INT, 0.0, 0, NULL}, /* ARG_GAPOPEN */
164    { "Gap extension penalty (-1 means default: non-affine if greedy; 2 otherwise)",
165      "-1", NULL, NULL, FALSE, 'E', ARG_INT, 0.0, 0, NULL}, /* ARG_GAPEXT */
166    { "Frame shift penalty for out-of-frame gapping (blastx, tblastn only)",
167      "0", NULL, NULL, FALSE, 'h', ARG_INT, 0.0, 0, NULL}, /* ARG_FRAMESHIFT */
168    { "X dropoff value for gapped alignment (in bits) (zero invokes default "
169      "behavior)\n      blastn 30, tblastx 0, others 15", /* ARG_XDROP */
170      "0", NULL, NULL, FALSE, 'X', ARG_INT, 0.0, 0, NULL},
171    { "X dropoff value for final gapped alignment in bits "
172      "(0 invokes default behavior)\n"
173      "      blastn 50, tblastx 0, others 25",  /* ARG_XDROP_FINAL */
174      "0", NULL, NULL, FALSE, 'Z', ARG_INT, 0.0, 0, NULL},
175    { "Expected value",                         /* ARG_EVALUE */
176      "10.0", NULL, NULL, FALSE, 'e', ARG_FLOAT, 0.0, 0, NULL},
177    { "Effective length of the search space (use zero for the real size)", 
178      "0", NULL, NULL, FALSE, 'Y', ARG_FLOAT, 0.0, 0, NULL}, /* ARG_SEARCHSP */
179    {"Identity percentage cut-off",  /* ARG_PERC_IDENT */
180     "0", NULL, NULL, FALSE, 'P', ARG_FLOAT, 0.0, 0, NULL},
181    { "Longest intron length for uneven gap HSP linking (tblastn only)",
182      "0", NULL, NULL, FALSE, 'z', ARG_INT, 0.0, 0, NULL}, /* ARG_INTRON */
183    { "Number of database sequences to show one-line descriptions for (V)",
184      "500", NULL, NULL, FALSE, 'v', ARG_INT, 0.0, 0, NULL}, /* ARG_DESCRIPTIONS */
185    { "Remove hits whose query range is contained in at least "
186       "this many higher-scoring hits (ignored if zero)", /* ARG_CULLING */
187      "0", NULL, NULL, FALSE, 'K', ARG_INT, 0.0, 0, NULL},
188    { "Number of database sequence to show alignments for (B)", /* ARG_ALIGNMENTS */
189      "250", NULL, NULL, FALSE, 'b', ARG_INT, 0.0, 0, NULL},
190    { "Final output file name",             /* ARG_OUT */
191      "stdout", NULL, NULL, TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL}, 
192    { "alignment view options:\n0 = pairwise,\n1 = query-anchored showing "
193      "identities,\n2 = query-anchored no identities,\n3 = flat "
194      "query-anchored, show identities,\n4 = flat query-anchored, no "
195      "identities,\n5 = query-anchored no identities and blunt ends,\n6 = "
196      "flat query-anchored, no identities and blunt ends,\n7 = XML Blast "
197      "output,\n8 = tabular, \n9 tabular with comment lines\n10 ASN, text\n"
198      "11 ASN, binary",                                    /* ARG_FORMAT */
199      "0", NULL, NULL, FALSE, 'm', ARG_INT, 0.0, 0, NULL},
200    { "Produce HTML output",                    /* ARG_HTML */
201      "F", NULL, NULL, FALSE, 'H', ARG_BOOLEAN, 0.0, 0, NULL},
202    { "Produce on-the-fly tabular output;\n"
203      "1 - just offsets and quality values;\n"
204      "2 - add sequence data;\n3 - on-the-fly text ASN.1;\n"
205      "4 - on-the-fly binary ASN.1",
206      "0", NULL, NULL, FALSE, 'B', ARG_INT, 0.0, 0, NULL}, /* ARG_TABULAR */
207    { "Number of threads to use in preliminary search stage",
208      "1", NULL, NULL, FALSE, 'a', ARG_INT, 0.0, 0, NULL}, /* ARG_THREADS */
209    { "Show gis in sequence ids",
210      "F", NULL, NULL, FALSE, 'n', ARG_BOOLEAN, 0.0, 0, NULL}, /* ARG_SHOWGI */
211    { "Show only accessions for sequence ids in tabular output",
212      "F", NULL, NULL, FALSE, 'N', ARG_BOOLEAN, 0.0, 0, NULL},/* ARG_ACCESSION */
213    { "Use composition-based statistics for blastp or tblastn:\n"                /* ARG_COMP_BASED_STATS */
214      "      D or d: default (equivalent to T)\n"
215      "      0 or F or f: no composition-based statistics\n"
216      "      1 or T or t: Composition-based statistics as in "
217      "NAR 29:2994-3005, 2001\n"
218      "      2: Composition-based score adjustment as in "
219      "Bioinformatics 21:902-911,\n"
220      "          2005, conditioned on sequence properties\n"
221      "      3: Composition-based score adjustment as in "
222      "Bioinformatics 21:902-911,\n"
223      "          2005, unconditionally\n"
224      "      For programs other than tblastn, must either be absent "
225      "or be D, F or 0.\n     ",
226      "D", NULL, NULL, FALSE, 'C', ARG_STRING, 0.0, 0, NULL},
227    { "Compute locally optimal Smith-Waterman alignments "
228      "(This option is only\n"
229      "      available for gapped tblastn.)",                          /* ARG_SMITH_WATERMAN */
230      "F", NULL, NULL, FALSE, 'R', ARG_BOOLEAN, 0.0, 0, NULL},
231 };
232 
233 extern void PrintTabularOutputHeader PROTO((char* blast_database, 
234                BioseqPtr query_bsp, SeqLocPtr query_slp, char* blast_program, 
235                Int4 iteration, Boolean believe_query, FILE *outfp));
236 
237 /** Fills all the options structures with user defined values. Uses the 
238  * myargs global structure obtained from GetArgs.
239  * @param lookup_options Lookup table options [in]
240  * @param query_setup_options Query options [in]
241  * @param word_options Initial word processing options [in]
242  * @param ext_options Extension options [in]
243  * @param hit_options Hit saving options [out]
244  * @param score_options Scoring options [out]
245  * @param eff_len_options Effective length options [out]
246  * @param psi_options Protein BLAST options [out]
247  * @param db_options BLAST database options [out]
248  */
249 static Int2 
250 s_FillOptions(SBlastOptions* options)
251 {
252    LookupTableOptions* lookup_options = options->lookup_options;
253    QuerySetUpOptions* query_setup_options = options->query_options; 
254    BlastInitialWordOptions* word_options = options->word_options;
255    BlastExtensionOptions* ext_options = options->ext_options;
256    BlastHitSavingOptions* hit_options = options->hit_options ;
257    BlastScoringOptions* score_options = options->score_options;
258    BlastEffectiveLengthsOptions* eff_len_options = options->eff_len_options;
259 
260    Boolean mb_lookup = FALSE;
261    Int4 greedy_extension = 0;
262    Int4 diag_separation = 0;
263    Boolean is_gapped = FALSE;
264    EBlastProgramType program_number = options->program;
265 
266    /* The following options are for blastn only */
267    if (program_number == eBlastTypeBlastn) {
268       if (myargs[ARG_TEMPL_LEN].intvalue == 0) {
269          mb_lookup = (Boolean) myargs[ARG_LOOKUP].intvalue;
270       } else {
271          /* Discontiguous words */
272          mb_lookup = TRUE;
273       }
274       greedy_extension = myargs[ARG_GREEDY].intvalue;
275    }
276 
277    BLAST_FillLookupTableOptions(lookup_options, program_number, mb_lookup,
278       myargs[ARG_THRESHOLD].floatvalue, (Int2)myargs[ARG_WORDSIZE].intvalue);
279    /* Fill the rest of the lookup table options */
280    lookup_options->mb_template_length = 
281       (Uint1) myargs[ARG_TEMPL_LEN].intvalue;
282    lookup_options->mb_template_type = 
283       (Uint1) myargs[ARG_TEMPL_TYPE].intvalue;
284 
285    if (myargs[ARG_PHI].strvalue) {
286       lookup_options->phi_pattern = strdup(myargs[ARG_PHI].strvalue);
287       /* Set the lookup table type, and also change program type to 
288          indicate a PHI BLAST search. */
289       if (program_number == eBlastTypeBlastn) {
290           lookup_options->lut_type = ePhiNaLookupTable;
291           options->program = eBlastTypePhiBlastn;
292       } else {
293           lookup_options->lut_type = ePhiLookupTable;
294           options->program = eBlastTypePhiBlastp;
295       }
296    }
297 
298    BLAST_FillQuerySetUpOptions(query_setup_options, program_number, 
299       myargs[ARG_FILTER].strvalue, (Uint1)myargs[ARG_STRAND].intvalue);
300 
301    if (myargs[ARG_GENCODE].intvalue &&
302        (program_number == eBlastTypeBlastx || 
303         program_number == eBlastTypeTblastx))
304       query_setup_options->genetic_code = myargs[ARG_GENCODE].intvalue;
305 
306    BLAST_FillInitialWordOptions(word_options, program_number, 
307                             myargs[ARG_WINDOW].intvalue, 
308                             myargs[ARG_XDROP_UNGAPPED].intvalue);
309 
310    if (myargs[ARG_WINDOW].intvalue < 0) {
311       word_options->window_size = 0;
312    }
313    else if (word_options->window_size == 0 &&
314             lookup_options->mb_template_length > 0) {
315       word_options->window_size = 40;
316    }
317 
318    BLAST_FillExtensionOptions(ext_options, program_number, greedy_extension, 
319       myargs[ARG_XDROP].intvalue, myargs[ARG_XDROP_FINAL].intvalue);
320 
321    BLAST_FillScoringOptions(score_options, program_number, 
322        (Boolean)greedy_extension, myargs[ARG_MISMATCH].intvalue, 
323         myargs[ARG_MATCH].intvalue, myargs[ARG_MATRIX].strvalue, 
324         myargs[ARG_GAPOPEN].intvalue, myargs[ARG_GAPEXT].intvalue);
325 
326    if (program_number != eBlastTypeTblastx)
327       is_gapped = !myargs[ARG_UNGAPPED].intvalue;
328 
329    score_options->gapped_calculation = is_gapped;
330    if (myargs[ARG_FRAMESHIFT].intvalue) {
331       score_options->shift_pen = myargs[ARG_FRAMESHIFT].intvalue;
332       score_options->is_ooframe = TRUE;
333    }
334 
335    if (mb_lookup)
336        diag_separation = 6;
337 
338    BLAST_FillHitSavingOptions(hit_options, 
339       myargs[ARG_EVALUE].floatvalue, 
340       MAX(myargs[ARG_DESCRIPTIONS].intvalue, 
341           myargs[ARG_ALIGNMENTS].intvalue),
342       is_gapped,
343       myargs[ARG_CULLING].intvalue,
344       diag_separation);
345  
346    hit_options->percent_identity = myargs[ARG_PERC_IDENT].floatvalue;
347    hit_options->longest_intron = myargs[ARG_INTRON].intvalue;
348 
349    if (myargs[ARG_SEARCHSP].floatvalue != 0) {
350       Int8 searchsp = (Int8) myargs[ARG_SEARCHSP].floatvalue; 
351       BLAST_FillEffectiveLengthsOptions(eff_len_options, 0, 0, &searchsp, 1);
352    }
353    if ((program_number == eBlastTypeTblastn ||
354         program_number == eBlastTypeBlastp) && is_gapped) {
355        /* Set options specific to gapped blastp and tblastn */
356        switch (myargs[ARG_COMP_BASED_STATS].strvalue[0]) {
357        case '0': case 'F': case 'f':
358            ext_options->compositionBasedStats = 0;
359            break;
360        case 'D': case 'd':
361        case '1': case 'T': case 't':
362             ext_options->compositionBasedStats = 1;
363             break;
364        case '2':
365            ErrPostEx(SEV_WARNING, 1, 0, "the -C 2 argument "
366                      "is currently experimental\n");
367            ext_options->compositionBasedStats = 2;
368            break;
369        case '3':
370            ErrPostEx(SEV_WARNING, 1, 0, "the -C 3 argument "
371                      "is currently experimental\n");
372            ext_options->compositionBasedStats = 3;
373            break;
374        default:
375            ErrPostEx(SEV_FATAL, 1, 0, "invalid argument for composition-"
376                      "based statistics; see -C options\n");
377            break;
378        }
379        if (myargs[ARG_SMITH_WATERMAN].intvalue) {
380            ext_options->eTbackExt = eSmithWatermanTbck;
381        }
382    } else {
383        /* Make sure tblastn and blastp parameters were not set for
384         * other programs */
385        
386        switch (myargs[ARG_COMP_BASED_STATS].strvalue[0]) {
387        case '0': case 'D': case 'd': case 'F': case 'f':
388            break;
389        default:
390            ErrPostEx(SEV_FATAL, 1, 0,
391                      "Invalid option -C: only gapped tblastn may use"
392                      " composition based statistics.");
393            break;
394        }
395        if(myargs[ARG_SMITH_WATERMAN].intvalue) {
396            ErrPostEx(SEV_FATAL, 1, 0,
397                      "Invalid option -s: Smith-Waterman alignments are only "
398                      "available for gapped tblastn and blastp.");
399        }
400    }
401    if (program_number == eBlastTypeTblastn ||
402        program_number == eBlastTypeRpsTblastn ||
403        program_number == eBlastTypeTblastx) {
404        SBlastOptionsSetDbGeneticCode(options, myargs[ARG_DBGENCODE].intvalue);
405    }
406 
407    if (lookup_options->lut_type == eCompressedAaLookupTable) {
408        if (lookup_options->threshold < 16) {
409            ErrPostEx(SEV_WARNING, 1, 0,
410                      "Threshold is probably too small for protein "
411                      "searches with a compressed alphabet");
412        }
413        if (word_options->window_size > 0) {
414            ErrPostEx(SEV_WARNING, 1, 0,
415                      "Multiple hits may not work with compressed alphabets");
416        }
417    }
418 
419    return 0;
420 }
421 
422 /** Parses an argument specifying an interval on a sequence to search.
423  * Argument has form "Start[ ,;]Stop"
424  * @param arg Argument string (no restriction if NULL) [in]
425  * @param from_ptr Start of the location [out]
426  * @param to_ptr End of the location [out]
427  * @return -1 if invalid location, otherwise 0.
428  */
429 static Int2
430 s_ParseIntervalLocationArgument(char* arg, Int4* from_ptr, Int4* to_ptr)
431 {
432    const char* delimiters = " ,;";
433    Int4 from = 0, to = 0;
434 
435    if (!arg)
436       return 0;
437    
438    from = atoi(StringTokMT(arg, delimiters, &arg));
439    to = atoi(arg);
440       
441    from = MAX(from, 0);
442    to = MAX(to, 0);
443 
444    *from_ptr = from;
445    *to_ptr = to;
446 
447    if (to > 0 && from > to)
448       return -1;
449    else
450       return 0;
451 }
452 
453 Int2 Nlm_Main(void)
454 {
455    SeqLoc* subject_slp = NULL; /* SeqLoc for the subject sequence in two
456                                     sequences case */
457    Boolean query_is_na, db_is_na;
458    char buf[256] = { '\0' };
459    char* blast_program;
460    EBlastProgramType program_number;
461    char* dbname = NULL;
462    Int2 status = 0;
463    SeqLoc* lcase_mask = NULL;
464    SeqLoc* query_slp = NULL;
465    FILE *infp, *outfp = NULL;
466    AsnIoPtr asn_outfp = NULL;
467    SBlastOptions* options = NULL;
468    BlastFormattingInfo* format_info = NULL;
469    Int4 ctr = 1;
470    Int4 num_queries=0;
471    Int4 batch_size;
472    int tabular_output = FALSE;
473    BlastTabularFormatData* tf_data = NULL;
474    Boolean believe_query = FALSE;
475    Int4 q_from = 0, q_to = 0;
476    Blast_SummaryReturn* sum_returns = NULL;
477    Boolean phi_blast = FALSE;
478    ValNode* phivnps = NULL;
479    Blast_SummaryReturn* full_sum_returns = NULL;
480    GeneticCodeSingletonInit();
481 
482    if (! GetArgs (buf, NUMARG, myargs))
483       return (1);
484    
485    UseLocalAsnloadDataAndErrMsg ();
486    
487    if (! SeqEntryLoad())
488       return 1;
489    
490    ErrSetMessageLevel(SEV_WARNING);
491    
492    blast_program = myargs[ARG_PROGRAM].strvalue;
493 
494    sum_returns = Blast_SummaryReturnNew();
495    status = SBlastOptionsNew(blast_program, &options, sum_returns);
496 
497    if (status) {
498        if (sum_returns->error) {
499            SBlastMessageErrPost(sum_returns->error);
500            sum_returns = Blast_SummaryReturnFree(sum_returns);
501        }
502        return -1;
503    }
504 
505    program_number = options->program;
506 
507    db_is_na = Blast_SubjectIsNucleotide(program_number); 
508    query_is_na = Blast_QueryIsNucleotide(program_number); 
509 
510    phi_blast = Blast_ProgramIsPhiBlast(program_number);
511 
512    if ((dbname = myargs[ARG_DB].strvalue) == NULL) {
513       Int4 letters_read;
514       FILE *infp2;
515       Int4 s_from = 0, s_to = 0;
516       char *subject_file = strdup(myargs[ARG_SUBJECT].strvalue);
517       if ((infp2 = FileOpen(subject_file, "r")) == NULL) {
518          ErrPostEx(SEV_FATAL, 1, 0, 
519                    "blast: Unable to open second input file %s\n", 
520                    subject_file);
521          return (1);
522       }
523       sfree(subject_file);
524 
525       if (s_ParseIntervalLocationArgument(myargs[ARG_SUBJECT_LOC].strvalue,
526                                           &s_from, &s_to)) {
527          ErrPostEx(SEV_FATAL, 1, 0, "Invalid subject sequence location\n");
528          return -1;
529       }
530 
531       letters_read = 
532          BLAST_GetQuerySeqLoc(infp2, db_is_na, 0, 0, s_from, s_to, NULL, 
533                               &subject_slp, &ctr, NULL, FALSE,
534                               myargs[ARG_DBGENCODE].intvalue);
535 
536       if (letters_read <= 0)
537       {
538            ErrPostEx(SEV_FATAL, 1, 0, "Unable to read subject sequence\n");
539            return -1;
540       }
541       FileClose(infp2);
542    }
543 
544    s_FillOptions(options);
545 
546    switch(program_number) {
547        case eBlastTypeBlastn:
548            batch_size = 40000;
549            if (myargs[ARG_LOOKUP].intvalue)
550                batch_size = 5000000;
551            break;
552        case eBlastTypeTblastn:
553        case eBlastTypePsiTblastn:
554            batch_size = 20000;
555            break;
556        case eBlastTypeBlastp:
557            batch_size = 10000;
558            if (options->lookup_options->lut_type == 
559                        eCompressedAaLookupTable) {
560                batch_size = 20000;
561            }
562            break;
563        case eBlastTypeBlastx:
564        case eBlastTypeTblastx:
565        default:
566            batch_size = 10000;
567    }
568 
569    if ((infp = FileOpen(myargs[ARG_QUERY].strvalue, "r")) == NULL) {
570       ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open input file %s\n", 
571                 myargs[ARG_QUERY].strvalue);
572       return (1);
573    }
574 
575    if (s_ParseIntervalLocationArgument(myargs[ARG_QUERY_LOC].strvalue,
576                                        &q_from, &q_to)) {
577          ErrPostEx(SEV_FATAL, 1, 0, "Invalid query sequence location\n");
578          return -1;
579    }
580 
581    tabular_output = myargs[ARG_TABULAR].intvalue;
582    
583    /* For on-the-fly tabular and for ASN.1 output, the believe query defline 
584       option must be set to true, otherwise it should be false. */
585    believe_query = 
586        (myargs[ARG_FORMAT].intvalue == eAlignViewAsnText ||
587         myargs[ARG_FORMAT].intvalue == eAlignViewAsnBinary ||
588         tabular_output == eBlastIncrementalASN);
589 
590 
591    switch (tabular_output) {
592    case eBlastTabularDefault:
593    case eBlastTabularAddSequences:
594       if ((outfp = FileOpen(myargs[ARG_OUT].strvalue, "w")) == NULL) {
595          ErrPostEx(SEV_FATAL, 1, 0, "Unable to open output file %s\n", 
596                    myargs[ARG_OUT].strvalue);
597         return (1);
598       }
599       break;
600    case 3:
601       if ((asn_outfp = AsnIoOpen(myargs[ARG_OUT].strvalue, "w")) == NULL) {
602          ErrPostEx(SEV_FATAL, 1, 0, "Unable to open text ASN outfile %s\n", 
603                    myargs[ARG_OUT].strvalue);
604         return (2);
605       }
606       break;
607    case 4:
608       if ((asn_outfp = AsnIoOpen(myargs[ARG_OUT].strvalue, "wb")) == NULL) {
609          ErrPostEx(SEV_FATAL, 1, 0, "Unable to open binary ASN outfile %s\n", 
610                    myargs[ARG_OUT].strvalue);
611         return (2);
612       }
613       break;
614    default:
615        BlastFormattingInfoNew(myargs[ARG_FORMAT].intvalue, options, 
616                               blast_program, dbname, 
617                               myargs[ARG_OUT].strvalue, &format_info);
618 
619        BlastFormattingInfoSetUpOptions(format_info, 
620                                        myargs[ARG_DESCRIPTIONS].intvalue, 
621                                        myargs[ARG_ALIGNMENTS].intvalue,
622                                        (Boolean) myargs[ARG_HTML].intvalue,
623                                        (Boolean) myargs[ARG_GREEDY].intvalue,
624                                        (Boolean) myargs[ARG_SHOWGI].intvalue,
625                                        believe_query);
626 
627        if (dbname)
628            BLAST_PrintOutputHeader(format_info);
629        break;
630    }
631 
632    /* Get the query (queries), loop if necessary. */
633    while (1) {
634        SBlastSeqalignArray* seqalign_arr = NULL;
635        SeqLoc* filter_loc=NULL; /* All masking locations */
636        SeqLoc* repeat_mask = NULL; /* Repeat mask locations */
637        Int4 letters_read;
638 
639        if ((Boolean)myargs[ARG_LCASE].intvalue) {
640            letters_read = 
641                BLAST_GetQuerySeqLoc(infp, query_is_na, 
642                    (Uint1)myargs[ARG_STRAND].intvalue, batch_size, 
643                    q_from, q_to, &lcase_mask,
644                    &query_slp, &ctr, &num_queries, believe_query,
645                    myargs[ARG_GENCODE].intvalue);
646        } else {
647            letters_read = 
648                BLAST_GetQuerySeqLoc(infp, query_is_na,
649                    (Uint1)myargs[ARG_STRAND].intvalue, batch_size,
650                    q_from, q_to, NULL, &query_slp, &ctr, &num_queries, 
651                    believe_query, myargs[ARG_GENCODE].intvalue);
652        }
653 
654        /* If there is no sequence data left in the input file, break out of 
655           the loop. */
656        if (letters_read == 0)
657            break;
658 
659        if (letters_read < 0) {
660            ErrPostEx(SEV_FATAL, 1, 0, "Unable to read query sequence(s)\n");
661            return -1;
662        }
663 
664        if (believe_query && BlastSeqlocsHaveDuplicateIDs(query_slp)) {
665           ErrPostEx(SEV_FATAL, 1, 0, 
666                   "Duplicate IDs detected; please ensure that "
667                   "all query sequence identifiers are unique");
668        }
669 
670        if (tabular_output) {
671            EBlastTabularFormatOptions tab_option = eBlastTabularDefault;
672            if (tabular_output == 3 || tabular_output == 4) {
673                tab_option = eBlastIncrementalASN;
674            } else {
675                /* Print the header of tabular output. */
676                PrintTabularOutputHeader(dbname, NULL, query_slp, 
677                                         blast_program, 0, FALSE, outfp);
678                if (tabular_output == 2) {
679                    if (program_number == eBlastTypeBlastn) {
680                        tab_option = eBlastTabularAddSequences;
681                    } else {
682                        fprintf(stderr, 
683                                "WARNING: Sequences printout in tabular output"
684                                " allowed only for blastn\n");
685                    }
686                } 
687            }
688            
689            tf_data = BlastTabularFormatDataNew(outfp, asn_outfp, query_slp, 
690                                                tab_option, believe_query);
691            tf_data->show_gi = (Boolean) myargs[ARG_SHOWGI].intvalue;
692            tf_data->show_accession = (Boolean) myargs[ARG_ACCESSION].intvalue;
693            tf_data->is_ooframe = (Boolean)(myargs[ARG_FRAMESHIFT].intvalue > 0);
694        }
695 
696        options->num_cpus = myargs[ARG_THREADS].intvalue;
697 
698        /* Find repeat mask, if necessary */
699        if ((status = Blast_FindRepeatFilterSeqLoc(query_slp, myargs[ARG_FILTER].strvalue,
700                                 &repeat_mask, &sum_returns->error)) != 0)
701        {
702             if (sum_returns && sum_returns->error && sum_returns->error->sev >= SEV_ERROR)
703             {
704                    ErrPostEx(SEV_ERROR, 1, 0, sum_returns->error->message);
705                    return status;
706             }
707        }
708 
709        /* Combine repeat mask with lower case mask */
710        if (repeat_mask)
711            lcase_mask = ValNodeLink(&lcase_mask, repeat_mask);
712 
713        /* Do the main search */
714        if (phi_blast) {
715            status = PHIBlastRunSearch(query_slp, dbname, lcase_mask, options, 
716                                       &phivnps, &filter_loc, sum_returns);
717        } else if (dbname) {
718            status =
719                Blast_DatabaseSearch(query_slp, (Blast_PsiCheckpointLoc *) NULL,
720                                     dbname, lcase_mask, options,
721                                     tf_data, &seqalign_arr,
722                                     &filter_loc, sum_returns);
723        } else {
724            status = 
725                Blast_TwoSeqLocSetsAdvanced(query_slp, subject_slp, lcase_mask, 
726                                            options, tf_data, &seqalign_arr, 
727                                            &filter_loc, sum_returns);
728        }
729 
730        /* Deallocate the data structure used for tabular formatting. */
731        BlastTabularFormatDataFree(tf_data);
732 
733        /* Free the lower case mask in SeqLoc form. */
734        lcase_mask = Blast_ValNodeMaskListFree(lcase_mask);
735 
736        /* If masking was done for lookup table only, free the masking locations,
737           because they will not be used for formatting. */
738        if (SBlastOptionsGetMaskAtHash(options))
739            filter_loc = Blast_ValNodeMaskListFree(filter_loc);
740 
741        /* Post warning or error messages, no matter what the search status was. */
742        SBlastMessageErrPost(sum_returns->error);
743 
744        if (!status) {
745            if (phi_blast) {
746                status =
747                    PHIBlastFormatResults(phivnps, query_slp, format_info, 
748                                          sum_returns);
749                phivnps = PHIBlastResultsFree(phivnps);
750            } else if (!tabular_output) {
751                
752                /* Format the results */
753                status = 
754                    BLAST_FormatResults(seqalign_arr, num_queries, query_slp, 
755                                        filter_loc, format_info, sum_returns);
756                
757                seqalign_arr = SBlastSeqalignArrayFree(seqalign_arr);
758            }
759        }
760        /* Update the cumulative summary returns structure and clean the returns
761           substructures for the current search iteration. */
762        Blast_SummaryReturnUpdate(sum_returns, &full_sum_returns);
763        Blast_SummaryReturnClean(sum_returns);
764        filter_loc = Blast_ValNodeMaskListFree(filter_loc);
765        FreeSeqLocSetComponents(query_slp);
766        query_slp = SeqLocSetFree(query_slp);
767    } /* End loop on sets of queries */
768    
769    if (infp)
770       FileClose(infp);
771    
772    subject_slp = SeqLocSetFree(subject_slp);
773 
774    /* Print the footer with cumulative summary information. */
775    Blast_PrintOutputFooter(format_info, full_sum_returns);
776 
777    sum_returns = Blast_SummaryReturnFree(sum_returns);
778    full_sum_returns = Blast_SummaryReturnFree(full_sum_returns);
779 
780    if (tabular_output == 3 || tabular_output == 4)
781        AsnIoClose(asn_outfp);
782    else if (tabular_output) 
783        FileClose(outfp);
784    else
785        format_info = BlastFormattingInfoFree(format_info);       
786 
787    options = SBlastOptionsFree(options);
788    GeneticCodeSingletonFini();
789 
790    return status;
791 }
792 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.