|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/demo/blast_driver.c |
source navigation diff markup identifier search freetext search file search |
1 /* $Id: blast_driver.c,v 1.129 2007/10/23 16:03:23 madden Exp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25
26 /*****************************************************************************
27
28 File name: blast_driver.c
29
30 Author: Ilya Dondoshansky
31
32 Contents: Main function for running BLAST
33
34 ******************************************************************************
35 * $Revision: 1.129 $
36 * */
37
38 static char const rcsid[] = "$Id: blast_driver.c,v 1.129 2007/10/23 16:03:23 madden Exp $";
39
40 #include <ncbi.h>
41 #include <sqnutils.h>
42 #include <readdb.h>
43 #include <algo/blast/core/blast_options.h>
44 #include <algo/blast/api/blast_seq.h>
45 #include <algo/blast/api/blast_input.h>
46 #include <algo/blast/api/blast_format.h>
47 #include <algo/blast/api/blast_seqalign.h>
48 #include <algo/blast/api/blast_tabular.h>
49 #include <algo/blast/api/blast_api.h>
50 #include <algo/blast/api/repeats_filter.h>
51
52 #define NUMARG (sizeof(myargs)/sizeof(myargs[0]))
53
54 typedef enum {
55 ARG_PROGRAM = 0,
56 ARG_DB,
57 ARG_QUERY,
58 ARG_QUERY_LOC,
59 ARG_SUBJECT,
60 ARG_SUBJECT_LOC,
61 ARG_STRAND,
62 ARG_GENCODE,
63 ARG_DBGENCODE,
64 ARG_FILTER,
65 ARG_LCASE,
66 ARG_LOOKUP,
67 ARG_MATRIX,
68 ARG_MISMATCH,
69 ARG_MATCH,
70 ARG_WORDSIZE,
71 ARG_TEMPL_LEN,
72 ARG_TEMPL_TYPE,
73 ARG_EVERYBASE,
74 ARG_PHI,
75 ARG_THRESHOLD,
76 ARG_WINDOW,
77 ARG_XDROP_UNGAPPED,
78 ARG_UNGAPPED,
79 ARG_GREEDY,
80 ARG_GAPOPEN,
81 ARG_GAPEXT,
82 ARG_FRAMESHIFT,
83 ARG_XDROP,
84 ARG_XDROP_FINAL,
85 ARG_EVALUE,
86 ARG_SEARCHSP,
87 ARG_PERC_IDENT,
88 ARG_INTRON,
89 ARG_DESCRIPTIONS,
90 ARG_CULLING,
91 ARG_ALIGNMENTS,
92 ARG_OUT,
93 ARG_FORMAT,
94 ARG_HTML,
95 ARG_TABULAR,
96 ARG_THREADS,
97 ARG_SHOWGI,
98 ARG_ACCESSION,
99 ARG_COMP_BASED_STATS,
100 ARG_SMITH_WATERMAN
101 } BlastArguments;
102
103 static Args myargs[] = {
104 { "Program Name", /* ARG_PROGRAM */
105 NULL, NULL, NULL, FALSE, 'p', ARG_STRING, 0.0, 0, NULL},
106 { "Database name (if not set, second sequence FASTA must be provided)",
107 NULL, NULL, NULL, TRUE, 'd', ARG_STRING, 0.0, 0, NULL}, /* ARG_DB */
108 { "Query File", /* ARG_QUERY */
109 "stdin", NULL, NULL, FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
110 { "Query location offsets; format: \"start stop\";\n"
111 "Applies only if query file contains 1 sequence", /* ARG_QUERY_LOC */
112 NULL, NULL, NULL, TRUE, 'I', ARG_STRING, 0.0, 0, NULL},
113 { "Subject File (for sequence sets comparison)", /* ARG_SUBJECT */
114 "stdin", NULL, NULL, FALSE, 'j', ARG_FILE_IN, 0.0, 0, NULL},
115 { "Subject location offsets; format: \"start stop\";\n"/* ARG_SUBJECT_LOC */
116 "Applies only if subject file (-j) contains 1 sequence",
117 NULL, NULL, NULL, TRUE, 'J', ARG_STRING, 0.0, 0, NULL},
118 { "Query strands to search against database: 0 or 3 is both, 1 is top, "
119 "2 is bottom", /* ARG_STRAND */
120 "0", NULL, NULL, FALSE, 'S', ARG_INT, 0.0, 0, NULL},
121 { "Genetic code for translation of the query sequence", /* ARG_GENCODE */
122 "0", NULL, NULL, FALSE, 'Q', ARG_INT, 0.0, 0, NULL},
123 { "Genetic code for translation of the database", /* ARG_DBGENCODE */
124 "0", NULL, NULL, FALSE, 'D', ARG_INT, 0.0, 0, NULL},
125 { "Filter query sequence (DUST with blastn, SEG with others)", /* ARG_FILTER */
126 "T", NULL, NULL, FALSE, 'F', ARG_STRING, 0.0, 0, NULL},
127 { "Mask lower case",
128 "F", NULL, NULL, FALSE, 'c', ARG_BOOLEAN, 0.0, 0, NULL}, /* ARG_LCASE */
129 { "Use (classical Mega BLAST) lookup table with width 12",
130 "F", NULL, NULL, FALSE, 'L', ARG_BOOLEAN, 0.0, 0, NULL},/* ARG_LOOKUP */
131 { "Matrix", /* ARG_MATRIX */
132 "BLOSUM62", NULL, NULL, FALSE, 'M', ARG_STRING, 0.0, 0, NULL},
133 { "Penalty for a nucleotide mismatch (blastn only)", /* ARG_MISMATCH */
134 "-3", NULL, NULL, FALSE, 'q', ARG_INT, 0.0, 0, NULL},
135 { "Reward for a nucleotide match (blastn only)", /* ARG_MATCH */
136 "1", NULL, NULL, FALSE, 'r', ARG_INT, 0.0, 0, NULL},
137 { "Word size, default if 0 (blastn 11, others 3) ", /* ARG_WORDSIZE */
138 "0", NULL, NULL, FALSE, 'W', ARG_INT, 0.0, 0, NULL},
139 { "Length of a discontiguous word template (contiguous word if 0)",
140 "0", NULL, NULL, FALSE, 't', ARG_INT, 0.0, 0, NULL}, /* ARG_TEMPL_LEN */
141 { "Type of a discontiguous word template (0 - coding, 1 - optimal, "
142 "2 - two simultaneous", /* ARG_TEMPL_TYPE */
143 "0", NULL, NULL, FALSE, 'T', ARG_INT, 0.0, 0, NULL},
144 {"Generate words for every base of the database; this option is ignored",
145 "F", NULL, NULL, TRUE, 's', ARG_BOOLEAN, 0.0, 0, NULL}, /* ARG_EVERYBASE */
146 { "Pattern for PHI BLAST",
147 NULL, NULL, NULL, TRUE, 'k', ARG_STRING, 0.0, 0, NULL}, /* ARG_PHI */
148 { "Threshold for extending hits, default if zero\n" /* ARG_THRESHOLD */
149 " blastp 11, blastn 0, blastx 12, tblastn 13\n"
150 " tblastx 13, megablast 0",
151 "0", NULL, NULL, FALSE, 'f', ARG_FLOAT, 0.0, 0, NULL},
152 { "Window size (max. allowed distance between a pair of initial hits;\n"
153 " 0 causes default behavior, -1 turns off multiple hits)",
154 "0", NULL, NULL, FALSE, 'w', ARG_INT, 0.0, 0, NULL}, /* ARG_WINDOW */
155 { "X dropoff value for ungapped extensions in bits (0 invokes default "
156 "behavior)\n blastn 20, others 7",/*ARG_XDROP_UNGAPPED*/
157 "0", NULL, NULL, FALSE, 'y', ARG_INT, 0.0, 0, NULL},
158 { "Do only ungapped alignment (always TRUE for tblastx)",/*ARG_UNGAPPED*/
159 "F", NULL, NULL, FALSE, 'u', ARG_BOOLEAN, 0.0, 0, NULL},
160 { "Use greedy algorithm for gapped extensions", /* ARG_GREEDY */
161 "F", NULL, NULL, FALSE, 'g', ARG_BOOLEAN, 0.0, 0, NULL},
162 { "Gap open penalty (-1 means default: non-affine if greedy; 5 if dyn. prog.)",
163 "-1", NULL, NULL, FALSE, 'G', ARG_INT, 0.0, 0, NULL}, /* ARG_GAPOPEN */
164 { "Gap extension penalty (-1 means default: non-affine if greedy; 2 otherwise)",
165 "-1", NULL, NULL, FALSE, 'E', ARG_INT, 0.0, 0, NULL}, /* ARG_GAPEXT */
166 { "Frame shift penalty for out-of-frame gapping (blastx, tblastn only)",
167 "0", NULL, NULL, FALSE, 'h', ARG_INT, 0.0, 0, NULL}, /* ARG_FRAMESHIFT */
168 { "X dropoff value for gapped alignment (in bits) (zero invokes default "
169 "behavior)\n blastn 30, tblastx 0, others 15", /* ARG_XDROP */
170 "0", NULL, NULL, FALSE, 'X', ARG_INT, 0.0, 0, NULL},
171 { "X dropoff value for final gapped alignment in bits "
172 "(0 invokes default behavior)\n"
173 " blastn 50, tblastx 0, others 25", /* ARG_XDROP_FINAL */
174 "0", NULL, NULL, FALSE, 'Z', ARG_INT, 0.0, 0, NULL},
175 { "Expected value", /* ARG_EVALUE */
176 "10.0", NULL, NULL, FALSE, 'e', ARG_FLOAT, 0.0, 0, NULL},
177 { "Effective length of the search space (use zero for the real size)",
178 "0", NULL, NULL, FALSE, 'Y', ARG_FLOAT, 0.0, 0, NULL}, /* ARG_SEARCHSP */
179 {"Identity percentage cut-off", /* ARG_PERC_IDENT */
180 "0", NULL, NULL, FALSE, 'P', ARG_FLOAT, 0.0, 0, NULL},
181 { "Longest intron length for uneven gap HSP linking (tblastn only)",
182 "0", NULL, NULL, FALSE, 'z', ARG_INT, 0.0, 0, NULL}, /* ARG_INTRON */
183 { "Number of database sequences to show one-line descriptions for (V)",
184 "500", NULL, NULL, FALSE, 'v', ARG_INT, 0.0, 0, NULL}, /* ARG_DESCRIPTIONS */
185 { "Remove hits whose query range is contained in at least "
186 "this many higher-scoring hits (ignored if zero)", /* ARG_CULLING */
187 "0", NULL, NULL, FALSE, 'K', ARG_INT, 0.0, 0, NULL},
188 { "Number of database sequence to show alignments for (B)", /* ARG_ALIGNMENTS */
189 "250", NULL, NULL, FALSE, 'b', ARG_INT, 0.0, 0, NULL},
190 { "Final output file name", /* ARG_OUT */
191 "stdout", NULL, NULL, TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
192 { "alignment view options:\n0 = pairwise,\n1 = query-anchored showing "
193 "identities,\n2 = query-anchored no identities,\n3 = flat "
194 "query-anchored, show identities,\n4 = flat query-anchored, no "
195 "identities,\n5 = query-anchored no identities and blunt ends,\n6 = "
196 "flat query-anchored, no identities and blunt ends,\n7 = XML Blast "
197 "output,\n8 = tabular, \n9 tabular with comment lines\n10 ASN, text\n"
198 "11 ASN, binary", /* ARG_FORMAT */
199 "0", NULL, NULL, FALSE, 'm', ARG_INT, 0.0, 0, NULL},
200 { "Produce HTML output", /* ARG_HTML */
201 "F", NULL, NULL, FALSE, 'H', ARG_BOOLEAN, 0.0, 0, NULL},
202 { "Produce on-the-fly tabular output;\n"
203 "1 - just offsets and quality values;\n"
204 "2 - add sequence data;\n3 - on-the-fly text ASN.1;\n"
205 "4 - on-the-fly binary ASN.1",
206 "0", NULL, NULL, FALSE, 'B', ARG_INT, 0.0, 0, NULL}, /* ARG_TABULAR */
207 { "Number of threads to use in preliminary search stage",
208 "1", NULL, NULL, FALSE, 'a', ARG_INT, 0.0, 0, NULL}, /* ARG_THREADS */
209 { "Show gis in sequence ids",
210 "F", NULL, NULL, FALSE, 'n', ARG_BOOLEAN, 0.0, 0, NULL}, /* ARG_SHOWGI */
211 { "Show only accessions for sequence ids in tabular output",
212 "F", NULL, NULL, FALSE, 'N', ARG_BOOLEAN, 0.0, 0, NULL},/* ARG_ACCESSION */
213 { "Use composition-based statistics for blastp or tblastn:\n" /* ARG_COMP_BASED_STATS */
214 " D or d: default (equivalent to T)\n"
215 " 0 or F or f: no composition-based statistics\n"
216 " 1 or T or t: Composition-based statistics as in "
217 "NAR 29:2994-3005, 2001\n"
218 " 2: Composition-based score adjustment as in "
219 "Bioinformatics 21:902-911,\n"
220 " 2005, conditioned on sequence properties\n"
221 " 3: Composition-based score adjustment as in "
222 "Bioinformatics 21:902-911,\n"
223 " 2005, unconditionally\n"
224 " For programs other than tblastn, must either be absent "
225 "or be D, F or 0.\n ",
226 "D", NULL, NULL, FALSE, 'C', ARG_STRING, 0.0, 0, NULL},
227 { "Compute locally optimal Smith-Waterman alignments "
228 "(This option is only\n"
229 " available for gapped tblastn.)", /* ARG_SMITH_WATERMAN */
230 "F", NULL, NULL, FALSE, 'R', ARG_BOOLEAN, 0.0, 0, NULL},
231 };
232
233 extern void PrintTabularOutputHeader PROTO((char* blast_database,
234 BioseqPtr query_bsp, SeqLocPtr query_slp, char* blast_program,
235 Int4 iteration, Boolean believe_query, FILE *outfp));
236
237 /** Fills all the options structures with user defined values. Uses the
238 * myargs global structure obtained from GetArgs.
239 * @param lookup_options Lookup table options [in]
240 * @param query_setup_options Query options [in]
241 * @param word_options Initial word processing options [in]
242 * @param ext_options Extension options [in]
243 * @param hit_options Hit saving options [out]
244 * @param score_options Scoring options [out]
245 * @param eff_len_options Effective length options [out]
246 * @param psi_options Protein BLAST options [out]
247 * @param db_options BLAST database options [out]
248 */
249 static Int2
250 s_FillOptions(SBlastOptions* options)
251 {
252 LookupTableOptions* lookup_options = options->lookup_options;
253 QuerySetUpOptions* query_setup_options = options->query_options;
254 BlastInitialWordOptions* word_options = options->word_options;
255 BlastExtensionOptions* ext_options = options->ext_options;
256 BlastHitSavingOptions* hit_options = options->hit_options ;
257 BlastScoringOptions* score_options = options->score_options;
258 BlastEffectiveLengthsOptions* eff_len_options = options->eff_len_options;
259
260 Boolean mb_lookup = FALSE;
261 Int4 greedy_extension = 0;
262 Int4 diag_separation = 0;
263 Boolean is_gapped = FALSE;
264 EBlastProgramType program_number = options->program;
265
266 /* The following options are for blastn only */
267 if (program_number == eBlastTypeBlastn) {
268 if (myargs[ARG_TEMPL_LEN].intvalue == 0) {
269 mb_lookup = (Boolean) myargs[ARG_LOOKUP].intvalue;
270 } else {
271 /* Discontiguous words */
272 mb_lookup = TRUE;
273 }
274 greedy_extension = myargs[ARG_GREEDY].intvalue;
275 }
276
277 BLAST_FillLookupTableOptions(lookup_options, program_number, mb_lookup,
278 myargs[ARG_THRESHOLD].floatvalue, (Int2)myargs[ARG_WORDSIZE].intvalue);
279 /* Fill the rest of the lookup table options */
280 lookup_options->mb_template_length =
281 (Uint1) myargs[ARG_TEMPL_LEN].intvalue;
282 lookup_options->mb_template_type =
283 (Uint1) myargs[ARG_TEMPL_TYPE].intvalue;
284
285 if (myargs[ARG_PHI].strvalue) {
286 lookup_options->phi_pattern = strdup(myargs[ARG_PHI].strvalue);
287 /* Set the lookup table type, and also change program type to
288 indicate a PHI BLAST search. */
289 if (program_number == eBlastTypeBlastn) {
290 lookup_options->lut_type = ePhiNaLookupTable;
291 options->program = eBlastTypePhiBlastn;
292 } else {
293 lookup_options->lut_type = ePhiLookupTable;
294 options->program = eBlastTypePhiBlastp;
295 }
296 }
297
298 BLAST_FillQuerySetUpOptions(query_setup_options, program_number,
299 myargs[ARG_FILTER].strvalue, (Uint1)myargs[ARG_STRAND].intvalue);
300
301 if (myargs[ARG_GENCODE].intvalue &&
302 (program_number == eBlastTypeBlastx ||
303 program_number == eBlastTypeTblastx))
304 query_setup_options->genetic_code = myargs[ARG_GENCODE].intvalue;
305
306 BLAST_FillInitialWordOptions(word_options, program_number,
307 myargs[ARG_WINDOW].intvalue,
308 myargs[ARG_XDROP_UNGAPPED].intvalue);
309
310 if (myargs[ARG_WINDOW].intvalue < 0) {
311 word_options->window_size = 0;
312 }
313 else if (word_options->window_size == 0 &&
314 lookup_options->mb_template_length > 0) {
315 word_options->window_size = 40;
316 }
317
318 BLAST_FillExtensionOptions(ext_options, program_number, greedy_extension,
319 myargs[ARG_XDROP].intvalue, myargs[ARG_XDROP_FINAL].intvalue);
320
321 BLAST_FillScoringOptions(score_options, program_number,
322 (Boolean)greedy_extension, myargs[ARG_MISMATCH].intvalue,
323 myargs[ARG_MATCH].intvalue, myargs[ARG_MATRIX].strvalue,
324 myargs[ARG_GAPOPEN].intvalue, myargs[ARG_GAPEXT].intvalue);
325
326 if (program_number != eBlastTypeTblastx)
327 is_gapped = !myargs[ARG_UNGAPPED].intvalue;
328
329 score_options->gapped_calculation = is_gapped;
330 if (myargs[ARG_FRAMESHIFT].intvalue) {
331 score_options->shift_pen = myargs[ARG_FRAMESHIFT].intvalue;
332 score_options->is_ooframe = TRUE;
333 }
334
335 if (mb_lookup)
336 diag_separation = 6;
337
338 BLAST_FillHitSavingOptions(hit_options,
339 myargs[ARG_EVALUE].floatvalue,
340 MAX(myargs[ARG_DESCRIPTIONS].intvalue,
341 myargs[ARG_ALIGNMENTS].intvalue),
342 is_gapped,
343 myargs[ARG_CULLING].intvalue,
344 diag_separation);
345
346 hit_options->percent_identity = myargs[ARG_PERC_IDENT].floatvalue;
347 hit_options->longest_intron = myargs[ARG_INTRON].intvalue;
348
349 if (myargs[ARG_SEARCHSP].floatvalue != 0) {
350 Int8 searchsp = (Int8) myargs[ARG_SEARCHSP].floatvalue;
351 BLAST_FillEffectiveLengthsOptions(eff_len_options, 0, 0, &searchsp, 1);
352 }
353 if ((program_number == eBlastTypeTblastn ||
354 program_number == eBlastTypeBlastp) && is_gapped) {
355 /* Set options specific to gapped blastp and tblastn */
356 switch (myargs[ARG_COMP_BASED_STATS].strvalue[0]) {
357 case '0': case 'F': case 'f':
358 ext_options->compositionBasedStats = 0;
359 break;
360 case 'D': case 'd':
361 case '1': case 'T': case 't':
362 ext_options->compositionBasedStats = 1;
363 break;
364 case '2':
365 ErrPostEx(SEV_WARNING, 1, 0, "the -C 2 argument "
366 "is currently experimental\n");
367 ext_options->compositionBasedStats = 2;
368 break;
369 case '3':
370 ErrPostEx(SEV_WARNING, 1, 0, "the -C 3 argument "
371 "is currently experimental\n");
372 ext_options->compositionBasedStats = 3;
373 break;
374 default:
375 ErrPostEx(SEV_FATAL, 1, 0, "invalid argument for composition-"
376 "based statistics; see -C options\n");
377 break;
378 }
379 if (myargs[ARG_SMITH_WATERMAN].intvalue) {
380 ext_options->eTbackExt = eSmithWatermanTbck;
381 }
382 } else {
383 /* Make sure tblastn and blastp parameters were not set for
384 * other programs */
385
386 switch (myargs[ARG_COMP_BASED_STATS].strvalue[0]) {
387 case '0': case 'D': case 'd': case 'F': case 'f':
388 break;
389 default:
390 ErrPostEx(SEV_FATAL, 1, 0,
391 "Invalid option -C: only gapped tblastn may use"
392 " composition based statistics.");
393 break;
394 }
395 if(myargs[ARG_SMITH_WATERMAN].intvalue) {
396 ErrPostEx(SEV_FATAL, 1, 0,
397 "Invalid option -s: Smith-Waterman alignments are only "
398 "available for gapped tblastn and blastp.");
399 }
400 }
401 if (program_number == eBlastTypeTblastn ||
402 program_number == eBlastTypeRpsTblastn ||
403 program_number == eBlastTypeTblastx) {
404 SBlastOptionsSetDbGeneticCode(options, myargs[ARG_DBGENCODE].intvalue);
405 }
406
407 if (lookup_options->lut_type == eCompressedAaLookupTable) {
408 if (lookup_options->threshold < 16) {
409 ErrPostEx(SEV_WARNING, 1, 0,
410 "Threshold is probably too small for protein "
411 "searches with a compressed alphabet");
412 }
413 if (word_options->window_size > 0) {
414 ErrPostEx(SEV_WARNING, 1, 0,
415 "Multiple hits may not work with compressed alphabets");
416 }
417 }
418
419 return 0;
420 }
421
422 /** Parses an argument specifying an interval on a sequence to search.
423 * Argument has form "Start[ ,;]Stop"
424 * @param arg Argument string (no restriction if NULL) [in]
425 * @param from_ptr Start of the location [out]
426 * @param to_ptr End of the location [out]
427 * @return -1 if invalid location, otherwise 0.
428 */
429 static Int2
430 s_ParseIntervalLocationArgument(char* arg, Int4* from_ptr, Int4* to_ptr)
431 {
432 const char* delimiters = " ,;";
433 Int4 from = 0, to = 0;
434
435 if (!arg)
436 return 0;
437
438 from = atoi(StringTokMT(arg, delimiters, &arg));
439 to = atoi(arg);
440
441 from = MAX(from, 0);
442 to = MAX(to, 0);
443
444 *from_ptr = from;
445 *to_ptr = to;
446
447 if (to > 0 && from > to)
448 return -1;
449 else
450 return 0;
451 }
452
453 Int2 Nlm_Main(void)
454 {
455 SeqLoc* subject_slp = NULL; /* SeqLoc for the subject sequence in two
456 sequences case */
457 Boolean query_is_na, db_is_na;
458 char buf[256] = { '\0' };
459 char* blast_program;
460 EBlastProgramType program_number;
461 char* dbname = NULL;
462 Int2 status = 0;
463 SeqLoc* lcase_mask = NULL;
464 SeqLoc* query_slp = NULL;
465 FILE *infp, *outfp = NULL;
466 AsnIoPtr asn_outfp = NULL;
467 SBlastOptions* options = NULL;
468 BlastFormattingInfo* format_info = NULL;
469 Int4 ctr = 1;
470 Int4 num_queries=0;
471 Int4 batch_size;
472 int tabular_output = FALSE;
473 BlastTabularFormatData* tf_data = NULL;
474 Boolean believe_query = FALSE;
475 Int4 q_from = 0, q_to = 0;
476 Blast_SummaryReturn* sum_returns = NULL;
477 Boolean phi_blast = FALSE;
478 ValNode* phivnps = NULL;
479 Blast_SummaryReturn* full_sum_returns = NULL;
480 GeneticCodeSingletonInit();
481
482 if (! GetArgs (buf, NUMARG, myargs))
483 return (1);
484
485 UseLocalAsnloadDataAndErrMsg ();
486
487 if (! SeqEntryLoad())
488 return 1;
489
490 ErrSetMessageLevel(SEV_WARNING);
491
492 blast_program = myargs[ARG_PROGRAM].strvalue;
493
494 sum_returns = Blast_SummaryReturnNew();
495 status = SBlastOptionsNew(blast_program, &options, sum_returns);
496
497 if (status) {
498 if (sum_returns->error) {
499 SBlastMessageErrPost(sum_returns->error);
500 sum_returns = Blast_SummaryReturnFree(sum_returns);
501 }
502 return -1;
503 }
504
505 program_number = options->program;
506
507 db_is_na = Blast_SubjectIsNucleotide(program_number);
508 query_is_na = Blast_QueryIsNucleotide(program_number);
509
510 phi_blast = Blast_ProgramIsPhiBlast(program_number);
511
512 if ((dbname = myargs[ARG_DB].strvalue) == NULL) {
513 Int4 letters_read;
514 FILE *infp2;
515 Int4 s_from = 0, s_to = 0;
516 char *subject_file = strdup(myargs[ARG_SUBJECT].strvalue);
517 if ((infp2 = FileOpen(subject_file, "r")) == NULL) {
518 ErrPostEx(SEV_FATAL, 1, 0,
519 "blast: Unable to open second input file %s\n",
520 subject_file);
521 return (1);
522 }
523 sfree(subject_file);
524
525 if (s_ParseIntervalLocationArgument(myargs[ARG_SUBJECT_LOC].strvalue,
526 &s_from, &s_to)) {
527 ErrPostEx(SEV_FATAL, 1, 0, "Invalid subject sequence location\n");
528 return -1;
529 }
530
531 letters_read =
532 BLAST_GetQuerySeqLoc(infp2, db_is_na, 0, 0, s_from, s_to, NULL,
533 &subject_slp, &ctr, NULL, FALSE,
534 myargs[ARG_DBGENCODE].intvalue);
535
536 if (letters_read <= 0)
537 {
538 ErrPostEx(SEV_FATAL, 1, 0, "Unable to read subject sequence\n");
539 return -1;
540 }
541 FileClose(infp2);
542 }
543
544 s_FillOptions(options);
545
546 switch(program_number) {
547 case eBlastTypeBlastn:
548 batch_size = 40000;
549 if (myargs[ARG_LOOKUP].intvalue)
550 batch_size = 5000000;
551 break;
552 case eBlastTypeTblastn:
553 case eBlastTypePsiTblastn:
554 batch_size = 20000;
555 break;
556 case eBlastTypeBlastp:
557 batch_size = 10000;
558 if (options->lookup_options->lut_type ==
559 eCompressedAaLookupTable) {
560 batch_size = 20000;
561 }
562 break;
563 case eBlastTypeBlastx:
564 case eBlastTypeTblastx:
565 default:
566 batch_size = 10000;
567 }
568
569 if ((infp = FileOpen(myargs[ARG_QUERY].strvalue, "r")) == NULL) {
570 ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open input file %s\n",
571 myargs[ARG_QUERY].strvalue);
572 return (1);
573 }
574
575 if (s_ParseIntervalLocationArgument(myargs[ARG_QUERY_LOC].strvalue,
576 &q_from, &q_to)) {
577 ErrPostEx(SEV_FATAL, 1, 0, "Invalid query sequence location\n");
578 return -1;
579 }
580
581 tabular_output = myargs[ARG_TABULAR].intvalue;
582
583 /* For on-the-fly tabular and for ASN.1 output, the believe query defline
584 option must be set to true, otherwise it should be false. */
585 believe_query =
586 (myargs[ARG_FORMAT].intvalue == eAlignViewAsnText ||
587 myargs[ARG_FORMAT].intvalue == eAlignViewAsnBinary ||
588 tabular_output == eBlastIncrementalASN);
589
590
591 switch (tabular_output) {
592 case eBlastTabularDefault:
593 case eBlastTabularAddSequences:
594 if ((outfp = FileOpen(myargs[ARG_OUT].strvalue, "w")) == NULL) {
595 ErrPostEx(SEV_FATAL, 1, 0, "Unable to open output file %s\n",
596 myargs[ARG_OUT].strvalue);
597 return (1);
598 }
599 break;
600 case 3:
601 if ((asn_outfp = AsnIoOpen(myargs[ARG_OUT].strvalue, "w")) == NULL) {
602 ErrPostEx(SEV_FATAL, 1, 0, "Unable to open text ASN outfile %s\n",
603 myargs[ARG_OUT].strvalue);
604 return (2);
605 }
606 break;
607 case 4:
608 if ((asn_outfp = AsnIoOpen(myargs[ARG_OUT].strvalue, "wb")) == NULL) {
609 ErrPostEx(SEV_FATAL, 1, 0, "Unable to open binary ASN outfile %s\n",
610 myargs[ARG_OUT].strvalue);
611 return (2);
612 }
613 break;
614 default:
615 BlastFormattingInfoNew(myargs[ARG_FORMAT].intvalue, options,
616 blast_program, dbname,
617 myargs[ARG_OUT].strvalue, &format_info);
618
619 BlastFormattingInfoSetUpOptions(format_info,
620 myargs[ARG_DESCRIPTIONS].intvalue,
621 myargs[ARG_ALIGNMENTS].intvalue,
622 (Boolean) myargs[ARG_HTML].intvalue,
623 (Boolean) myargs[ARG_GREEDY].intvalue,
624 (Boolean) myargs[ARG_SHOWGI].intvalue,
625 believe_query);
626
627 if (dbname)
628 BLAST_PrintOutputHeader(format_info);
629 break;
630 }
631
632 /* Get the query (queries), loop if necessary. */
633 while (1) {
634 SBlastSeqalignArray* seqalign_arr = NULL;
635 SeqLoc* filter_loc=NULL; /* All masking locations */
636 SeqLoc* repeat_mask = NULL; /* Repeat mask locations */
637 Int4 letters_read;
638
639 if ((Boolean)myargs[ARG_LCASE].intvalue) {
640 letters_read =
641 BLAST_GetQuerySeqLoc(infp, query_is_na,
642 (Uint1)myargs[ARG_STRAND].intvalue, batch_size,
643 q_from, q_to, &lcase_mask,
644 &query_slp, &ctr, &num_queries, believe_query,
645 myargs[ARG_GENCODE].intvalue);
646 } else {
647 letters_read =
648 BLAST_GetQuerySeqLoc(infp, query_is_na,
649 (Uint1)myargs[ARG_STRAND].intvalue, batch_size,
650 q_from, q_to, NULL, &query_slp, &ctr, &num_queries,
651 believe_query, myargs[ARG_GENCODE].intvalue);
652 }
653
654 /* If there is no sequence data left in the input file, break out of
655 the loop. */
656 if (letters_read == 0)
657 break;
658
659 if (letters_read < 0) {
660 ErrPostEx(SEV_FATAL, 1, 0, "Unable to read query sequence(s)\n");
661 return -1;
662 }
663
664 if (believe_query && BlastSeqlocsHaveDuplicateIDs(query_slp)) {
665 ErrPostEx(SEV_FATAL, 1, 0,
666 "Duplicate IDs detected; please ensure that "
667 "all query sequence identifiers are unique");
668 }
669
670 if (tabular_output) {
671 EBlastTabularFormatOptions tab_option = eBlastTabularDefault;
672 if (tabular_output == 3 || tabular_output == 4) {
673 tab_option = eBlastIncrementalASN;
674 } else {
675 /* Print the header of tabular output. */
676 PrintTabularOutputHeader(dbname, NULL, query_slp,
677 blast_program, 0, FALSE, outfp);
678 if (tabular_output == 2) {
679 if (program_number == eBlastTypeBlastn) {
680 tab_option = eBlastTabularAddSequences;
681 } else {
682 fprintf(stderr,
683 "WARNING: Sequences printout in tabular output"
684 " allowed only for blastn\n");
685 }
686 }
687 }
688
689 tf_data = BlastTabularFormatDataNew(outfp, asn_outfp, query_slp,
690 tab_option, believe_query);
691 tf_data->show_gi = (Boolean) myargs[ARG_SHOWGI].intvalue;
692 tf_data->show_accession = (Boolean) myargs[ARG_ACCESSION].intvalue;
693 tf_data->is_ooframe = (Boolean)(myargs[ARG_FRAMESHIFT].intvalue > 0);
694 }
695
696 options->num_cpus = myargs[ARG_THREADS].intvalue;
697
698 /* Find repeat mask, if necessary */
699 if ((status = Blast_FindRepeatFilterSeqLoc(query_slp, myargs[ARG_FILTER].strvalue,
700 &repeat_mask, &sum_returns->error)) != 0)
701 {
702 if (sum_returns && sum_returns->error && sum_returns->error->sev >= SEV_ERROR)
703 {
704 ErrPostEx(SEV_ERROR, 1, 0, sum_returns->error->message);
705 return status;
706 }
707 }
708
709 /* Combine repeat mask with lower case mask */
710 if (repeat_mask)
711 lcase_mask = ValNodeLink(&lcase_mask, repeat_mask);
712
713 /* Do the main search */
714 if (phi_blast) {
715 status = PHIBlastRunSearch(query_slp, dbname, lcase_mask, options,
716 &phivnps, &filter_loc, sum_returns);
717 } else if (dbname) {
718 status =
719 Blast_DatabaseSearch(query_slp, (Blast_PsiCheckpointLoc *) NULL,
720 dbname, lcase_mask, options,
721 tf_data, &seqalign_arr,
722 &filter_loc, sum_returns);
723 } else {
724 status =
725 Blast_TwoSeqLocSetsAdvanced(query_slp, subject_slp, lcase_mask,
726 options, tf_data, &seqalign_arr,
727 &filter_loc, sum_returns);
728 }
729
730 /* Deallocate the data structure used for tabular formatting. */
731 BlastTabularFormatDataFree(tf_data);
732
733 /* Free the lower case mask in SeqLoc form. */
734 lcase_mask = Blast_ValNodeMaskListFree(lcase_mask);
735
736 /* If masking was done for lookup table only, free the masking locations,
737 because they will not be used for formatting. */
738 if (SBlastOptionsGetMaskAtHash(options))
739 filter_loc = Blast_ValNodeMaskListFree(filter_loc);
740
741 /* Post warning or error messages, no matter what the search status was. */
742 SBlastMessageErrPost(sum_returns->error);
743
744 if (!status) {
745 if (phi_blast) {
746 status =
747 PHIBlastFormatResults(phivnps, query_slp, format_info,
748 sum_returns);
749 phivnps = PHIBlastResultsFree(phivnps);
750 } else if (!tabular_output) {
751
752 /* Format the results */
753 status =
754 BLAST_FormatResults(seqalign_arr, num_queries, query_slp,
755 filter_loc, format_info, sum_returns);
756
757 seqalign_arr = SBlastSeqalignArrayFree(seqalign_arr);
758 }
759 }
760 /* Update the cumulative summary returns structure and clean the returns
761 substructures for the current search iteration. */
762 Blast_SummaryReturnUpdate(sum_returns, &full_sum_returns);
763 Blast_SummaryReturnClean(sum_returns);
764 filter_loc = Blast_ValNodeMaskListFree(filter_loc);
765 FreeSeqLocSetComponents(query_slp);
766 query_slp = SeqLocSetFree(query_slp);
767 } /* End loop on sets of queries */
768
769 if (infp)
770 FileClose(infp);
771
772 subject_slp = SeqLocSetFree(subject_slp);
773
774 /* Print the footer with cumulative summary information. */
775 Blast_PrintOutputFooter(format_info, full_sum_returns);
776
777 sum_returns = Blast_SummaryReturnFree(sum_returns);
778 full_sum_returns = Blast_SummaryReturnFree(full_sum_returns);
779
780 if (tabular_output == 3 || tabular_output == 4)
781 AsnIoClose(asn_outfp);
782 else if (tabular_output)
783 FileClose(outfp);
784 else
785 format_info = BlastFormattingInfoFree(format_info);
786
787 options = SBlastOptionsFree(options);
788 GeneticCodeSingletonFini();
789
790 return status;
791 }
792 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |