|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross Reference: C/algo/blast/api/twoseq_api.c |
source navigation diff markup identifier search freetext search file search |
1 /* $Id: twoseq_api.c,v 1.61 2009/05/28 14:55:09 camacho Exp $
2 ***************************************************************************
3 * *
4 * COPYRIGHT NOTICE *
5 * *
6 * This software/database is categorized as "United States Government *
7 * Work" under the terms of the United States Copyright Act. It was *
8 * produced as part of the author's official duties as a Government *
9 * employee and thus can not be copyrighted. This software/database is *
10 * freely available to the public for use without a copyright notice. *
11 * Restrictions can not be placed on its present or future use. *
12 * *
13 * Although all reasonable efforts have been taken to ensure the accuracy *
14 * and reliability of the software and data, the National Library of *
15 * Medicine (NLM) and the U.S. Government do not and can not warrant the *
16 * performance or results that may be obtained by using this software, *
17 * data, or derivative works thereof. The NLM and the U.S. Government *
18 * disclaim any and all warranties, expressed or implied, as to the *
19 * performance, merchantability or fitness for any particular purpose or *
20 * use. *
21 * *
22 * In any work or product derived from this material, proper attribution *
23 * of the author(s) as the source of the software or data would be *
24 * appreciated. *
25 * *
26 * Author: Jason Papadopoulos *
27 ***************************************************************************/
28
29 /** @file twoseq_api.c
30 * Functions for C toolkit applications to compare two sequences using the
31 * rewritten blast engine.
32 */
33
34 #include <algo/blast/core/blast_options.h>
35 #include <algo/blast/core/blast_setup.h>
36 #include <algo/blast/core/blast_message.h>
37 #include <algo/blast/core/blast_util.h>
38 #include <algo/blast/core/blast_engine.h>
39 #include <algo/blast/core/blast_filter.h>
40 #include <algo/blast/core/blast_nalookup.h>
41 #include <algo/blast/core/gencode_singleton.h>
42 #include <algo/blast/api/seqsrc_multiseq.h>
43 #include <algo/blast/api/blast_seqalign.h>
44 #include <algo/blast/api/blast_seq.h>
45 #include <algo/blast/api/twoseq_api.h>
46 #include <algo/blast/api/blast_returns.h>
47
48 #include <algo/blast/api/blast_api.h>
49
50 /** @addtogroup CToolkitAlgoBlast
51 *
52 * @{
53 */
54
55 Int2 BLAST_SummaryOptionsInit(BLAST_SummaryOptions **options)
56 {
57 BLAST_SummaryOptions *new_options = (BLAST_SummaryOptions *)calloc(1,
58 sizeof(BLAST_SummaryOptions));
59 if (new_options == NULL) {
60 *options = NULL;
61 return -1;
62 }
63
64 new_options->hint = eBlastHint_Sensitive;
65 new_options->program = eChoose;
66 new_options->strand = Seq_strand_both;
67 new_options->cutoff_evalue = 10.0;
68 new_options->gapped_calculation = TRUE;
69 new_options->use_megablast = FALSE;
70 new_options->nucleotide_match = 1;
71 new_options->nucleotide_mismatch = -3;
72 new_options->longest_intron = 0;
73 new_options->init_seed_method = eDefaultSeedType;
74 new_options->gap_open = -1;
75 new_options->gap_extend = -1;
76
77 *options = new_options;
78 return 0;
79 }
80
81 BLAST_SummaryOptions*
82 BLAST_SummaryOptionsFree(BLAST_SummaryOptions *options)
83 {
84 sfree(options->matrix);
85 sfree(options->filter_string);
86 sfree(options);
87 return NULL;
88 }
89
90 /** Fills the core options structures, given the summary options.
91 * @param basic_options Basic options set by client [in]
92 * @param options All internal options structures [in]
93 * @param query_length Length of query sequence [in]
94 */
95 static Int2
96 s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options,
97 SBlastOptions* options,
98 Int4 query_length)
99 {
100 Int2 status = 0;
101 EBlastProgramType program_number = options->program;
102 LookupTableOptions* lookup_options = options->lookup_options;
103 QuerySetUpOptions* query_setup_options = options->query_options;
104 BlastInitialWordOptions* word_options = options->word_options;
105 BlastExtensionOptions* ext_options = options->ext_options;
106 BlastHitSavingOptions* hit_options = options->hit_options;
107 BlastScoringOptions* score_options = options->score_options;
108 BlastEffectiveLengthsOptions* eff_len_options = options->eff_len_options;
109 BlastDatabaseOptions* db_options = options->db_options;
110 Boolean do_megablast = FALSE;
111 Boolean do_discontig = FALSE;
112 Int4 greedy_align = 0;
113 Int4 diag_separation = 0;
114 Int2 word_size = basic_options->word_size;
115 char *matrix;
116
117 if (Blast_SubjectIsTranslated(program_number)) {
118 Uint1* gc = NULL;
119 BLAST_GeneticCodeFind(db_options->genetic_code, &gc);
120 GenCodeSingletonAdd(db_options->genetic_code, gc);
121 free(gc);
122 }
123
124 if (program_number == eBlastTypeBlastn) {
125 if (basic_options->strand != Seq_strand_plus &&
126 basic_options->strand != Seq_strand_minus &&
127 basic_options->strand != Seq_strand_both) {
128 return -2;
129 }
130
131 if (basic_options->use_megablast == TRUE)
132 do_megablast = TRUE;
133
134 /* If the query sequence is large enough, set up a megablast search */
135
136 if (basic_options->hint != eBlastHint_None &&
137 query_length > MEGABLAST_CUTOFF) {
138 do_megablast = TRUE;
139 if (basic_options->gapped_calculation)
140 greedy_align = 1;
141 }
142
143
144 /* If megablast was turned on but the input indicates a sensitive search
145 is desired, or if word size is <=12, which is not used in contiguous
146 megablast, switch to discontiguous megablast.
147 Because a sensitive search is the default, discontig. megablast will
148 be used by default when the first input sequence is large. */
149 if (do_megablast &&
150 (basic_options->hint == eBlastHint_Sensitive ||
151 (word_size != 0 && word_size <= 12))) {
152 if (word_size == 0 || word_size > 12)
153 word_size = 11;
154 do_discontig = TRUE;
155 }
156
157 if (do_megablast && !do_discontig)
158 greedy_align = 1;
159 }
160
161
162 BLAST_FillLookupTableOptions(lookup_options,
163 program_number,
164 do_megablast,
165 basic_options->word_threshold,
166 word_size);
167
168 /* If discontiguous megablast is specified, choose
169 the 11-of-21 optimal template).*/
170
171 if (do_discontig) {
172 lookup_options->mb_template_length = 21;
173 lookup_options->mb_template_type = eMBWordOptimal;
174 }
175 else {
176 lookup_options->mb_template_length = 0;
177 lookup_options->mb_template_type = 0;
178 }
179
180 BLAST_FillQuerySetUpOptions(query_setup_options,
181 program_number,
182 basic_options->filter_string,
183 basic_options->strand);
184
185 BLAST_FillInitialWordOptions(word_options,
186 program_number,
187 0, /* default window size. */
188 0); /* default ungapped X dropoff */
189
190 /* If we need to enforce a single-hit method, reset window size to 0.
191 To enforce two-hit method, set window size to a default non-zero
192 value */
193 if (basic_options->init_seed_method == eOneHit)
194 word_options->window_size = 0;
195 else if (basic_options->init_seed_method == eTwoHits)
196 word_options->window_size = BLAST_WINDOW_SIZE_PROT;
197
198 BLAST_FillExtensionOptions(ext_options,
199 program_number,
200 greedy_align,
201 basic_options->gap_x_dropoff,
202 0); /* default final X dropoff */
203
204 if (basic_options->matrix == NULL)
205 matrix = BLAST_DEFAULT_MATRIX; /* BLOSUM62 */
206 else
207 matrix = basic_options->matrix;
208
209 BLAST_FillScoringOptions(score_options,
210 program_number,
211 (Boolean)greedy_align,
212 basic_options->nucleotide_mismatch,
213 basic_options->nucleotide_match,
214 matrix,
215 basic_options->gap_open,
216 basic_options->gap_extend);
217
218 score_options->gapped_calculation = basic_options->gapped_calculation;
219
220 if (do_megablast)
221 diag_separation = 6;
222
223 BLAST_FillHitSavingOptions(hit_options,
224 basic_options->cutoff_evalue,
225 0, /* default number of alignments saved */
226 score_options->gapped_calculation,
227 0, /* do not perform culling */
228 diag_separation);
229
230 hit_options->percent_identity = 0; /* no percent identity cutoff */
231 hit_options->longest_intron = basic_options->longest_intron; /* For uneven gap statistics. */
232
233 eff_len_options->db_length = (Int8)basic_options->db_length;
234
235 return 0;
236 }
237
238 Int2
239 BLAST_TwoSequencesSearch(BLAST_SummaryOptions *basic_options,
240 BioseqPtr bsp1, BioseqPtr bsp2,
241 SeqAlign **seqalign_out)
242 {
243 enum blast_type program_type = eChoose;
244 SeqLocPtr query_slp = NULL; /* sequence variables */
245 SeqLocPtr subject_slp = NULL;
246 Boolean seq1_is_aa, seq2_is_aa;
247 Int2 status = 0;
248 SBlastSeqalignArray* seqalign_arr=NULL;
249
250 /* sanity checks */
251
252 *seqalign_out = NULL;
253 if (bsp1 == NULL || bsp2 == NULL)
254 return 0;
255
256 seq1_is_aa = ISA_aa(bsp1->mol);
257 seq2_is_aa = ISA_aa(bsp2->mol);
258
259 /* Find program type consistent with the sequences. */
260 if (!seq1_is_aa && !seq2_is_aa) {
261 if (basic_options->program == eTblastx)
262 program_type = eTblastx;
263 else
264 program_type = eBlastn;
265 } else if (seq1_is_aa && seq2_is_aa) {
266 program_type = eBlastp;
267 } else if (!seq1_is_aa && seq2_is_aa) {
268 program_type = eBlastx;
269 } else if (seq1_is_aa && !seq2_is_aa) {
270 program_type = eTblastn;
271 }
272
273 /* Check if program type in options is consistent with the one determined
274 from sequences. */
275 if (basic_options->program == eChoose)
276 basic_options->program = program_type;
277 else if (basic_options->program != program_type)
278 return -1;
279
280 /* Convert the bioseqs into seqlocs. */
281
282 ValNodeAddPointer(&query_slp, SEQLOC_WHOLE,
283 SeqIdDup(SeqIdFindBest(bsp1->id, SEQID_GI)));
284 if (!query_slp)
285 return -1;
286 ValNodeAddPointer(&subject_slp, SEQLOC_WHOLE,
287 SeqIdDup(SeqIdFindBest(bsp2->id, SEQID_GI)));
288 if (!subject_slp)
289 return -1;
290
291 status = BLAST_TwoSeqLocSets(basic_options, query_slp, subject_slp,
292 NULL, &seqalign_arr, NULL, NULL, NULL);
293
294 if (seqalign_arr && seqalign_arr->num_queries)
295 {
296 *seqalign_out = seqalign_arr->array[0];
297 seqalign_arr->array[0] = NULL;
298 SBlastSeqalignArrayFree(seqalign_arr);
299 }
300
301 SeqLocFree(query_slp);
302 SeqLocFree(subject_slp);
303
304 return status;
305 }
306
307 /** Calculates total length of a list of sequence locations.
308 * @param seqloc List of SeqLoc's [in]
309 * @return Total length of all SeqLoc's in the list.
310 */
311 static Int4
312 s_SeqLocListLen(SeqLoc* seqloc)
313 {
314 Int4 length = 0;
315
316 for ( ; seqloc; seqloc = seqloc->next)
317 length += SeqLocLen(seqloc);
318
319 return length;
320 }
321
322 Int2
323 Blast_SearchOptionsFromSummaryOptions(const BLAST_SummaryOptions *basic_options,
324 SeqLoc* query_seqloc,
325 Blast_SummaryReturn* extra_returns,
326 SBlastOptions* *search_options,
327 char* *program_name)
328 {
329 const char *kProgram = NULL;
330 Int2 status = 0;
331
332 switch(basic_options->program) {
333 case eBlastn:
334 kProgram = "blastn";
335 break;
336 case eBlastp:
337 kProgram = "blastp";
338 break;
339 case eBlastx:
340 kProgram = "blastx";
341 break;
342 case eTblastn:
343 kProgram = "tblastn";
344 break;
345 case eTblastx:
346 kProgram = "tblastx";
347 break;
348 default:
349 return -1;
350 }
351
352 status = SBlastOptionsNew(kProgram, search_options, extra_returns);
353
354 if (status)
355 return -1;
356
357 status = s_TwoSeqBasicFillOptions(basic_options, *search_options,
358 s_SeqLocListLen(query_seqloc));
359
360 if (program_name)
361 *program_name = strdup(kProgram);
362
363 return status;
364 }
365
366 Int2
367 BLAST_TwoSeqLocSets(const BLAST_SummaryOptions *basic_options,
368 SeqLoc* query_seqloc, SeqLoc* subject_seqloc,
369 SeqLoc* masking_locs,
370 SBlastSeqalignArray* *seqalign_arr,
371 SeqLoc** filter_out,
372 Boolean* mask_at_hash,
373 Blast_SummaryReturn* *extra_returns_ptr)
374 {
375 SBlastOptions* options = NULL;
376 char *program_name = NULL;
377 Int2 status = 0;
378 Blast_SummaryReturn* extra_returns;
379
380 if (!basic_options || !query_seqloc || !subject_seqloc)
381 return -1;
382
383 extra_returns = Blast_SummaryReturnNew();
384
385 status =
386 Blast_SearchOptionsFromSummaryOptions(basic_options, query_seqloc,
387 extra_returns, &options,
388 &program_name);
389 sfree(program_name);
390
391 if (!status) {
392 status =
393 Blast_TwoSeqLocSetsAdvanced(query_seqloc, subject_seqloc,
394 masking_locs, options, NULL, seqalign_arr, filter_out,
395 extra_returns);
396 }
397
398 if (mask_at_hash)
399 *mask_at_hash = SBlastOptionsGetMaskAtHash(options);
400
401 options = SBlastOptionsFree(options);
402
403 if (extra_returns_ptr)
404 *extra_returns_ptr = extra_returns;
405 else
406 Blast_SummaryReturnFree(extra_returns);
407
408 return status;
409 }
410
411 /* @} */
412
413 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |