|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross Reference: C/algo/blast/api/twoseq_api.c |
source navigation diff markup identifier search freetext search file search |
1 /* $Id: twoseq_api.c,v 1.59 2007/03/20 15:17:16 kans Exp $
2 ***************************************************************************
3 * *
4 * COPYRIGHT NOTICE *
5 * *
6 * This software/database is categorized as "United States Government *
7 * Work" under the terms of the United States Copyright Act. It was *
8 * produced as part of the author's official duties as a Government *
9 * employee and thus can not be copyrighted. This software/database is *
10 * freely available to the public for use without a copyright notice. *
11 * Restrictions can not be placed on its present or future use. *
12 * *
13 * Although all reasonable efforts have been taken to ensure the accuracy *
14 * and reliability of the software and data, the National Library of *
15 * Medicine (NLM) and the U.S. Government do not and can not warrant the *
16 * performance or results that may be obtained by using this software, *
17 * data, or derivative works thereof. The NLM and the U.S. Government *
18 * disclaim any and all warranties, expressed or implied, as to the *
19 * performance, merchantability or fitness for any particular purpose or *
20 * use. *
21 * *
22 * In any work or product derived from this material, proper attribution *
23 * of the author(s) as the source of the software or data would be *
24 * appreciated. *
25 * *
26 * Author: Jason Papadopoulos *
27 ***************************************************************************/
28
29 /** @file twoseq_api.c
30 * Functions for C toolkit applications to compare two sequences using the
31 * rewritten blast engine.
32 */
33
34 #include <algo/blast/core/blast_options.h>
35 #include <algo/blast/core/blast_setup.h>
36 #include <algo/blast/core/blast_message.h>
37 #include <algo/blast/core/blast_util.h>
38 #include <algo/blast/core/blast_engine.h>
39 #include <algo/blast/core/blast_filter.h>
40 #include <algo/blast/core/blast_nalookup.h>
41 #include <algo/blast/core/hspstream_collector.h>
42 #include <algo/blast/core/gencode_singleton.h>
43 #include <algo/blast/api/seqsrc_multiseq.h>
44 #include <algo/blast/api/blast_seqalign.h>
45 #include <algo/blast/api/blast_seq.h>
46 #include <algo/blast/api/twoseq_api.h>
47 #include <algo/blast/api/blast_returns.h>
48
49 #include <algo/blast/api/blast_api.h>
50
51 /** @addtogroup CToolkitAlgoBlast
52 *
53 * @{
54 */
55
56 Int2 BLAST_SummaryOptionsInit(BLAST_SummaryOptions **options)
57 {
58 BLAST_SummaryOptions *new_options = (BLAST_SummaryOptions *)calloc(1,
59 sizeof(BLAST_SummaryOptions));
60 if (new_options == NULL) {
61 *options = NULL;
62 return -1;
63 }
64
65 new_options->hint = eSensitive;
66 new_options->program = eChoose;
67 new_options->strand = Seq_strand_both;
68 new_options->cutoff_evalue = 10.0;
69 new_options->gapped_calculation = TRUE;
70 new_options->use_megablast = FALSE;
71 new_options->nucleotide_match = 1;
72 new_options->nucleotide_mismatch = -3;
73 new_options->longest_intron = 0;
74 new_options->init_seed_method = eDefaultSeedType;
75 new_options->gap_open = -1;
76 new_options->gap_extend = -1;
77
78 *options = new_options;
79 return 0;
80 }
81
82 BLAST_SummaryOptions*
83 BLAST_SummaryOptionsFree(BLAST_SummaryOptions *options)
84 {
85 sfree(options->matrix);
86 sfree(options->filter_string);
87 sfree(options);
88 return NULL;
89 }
90
91 /** Fills the core options structures, given the summary options.
92 * @param basic_options Basic options set by client [in]
93 * @param options All internal options structures [in]
94 * @param query_length Length of query sequence [in]
95 */
96 static Int2
97 s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options,
98 SBlastOptions* options,
99 Int4 query_length)
100 {
101 Int2 status = 0;
102 EBlastProgramType program_number = options->program;
103 LookupTableOptions* lookup_options = options->lookup_options;
104 QuerySetUpOptions* query_setup_options = options->query_options;
105 BlastInitialWordOptions* word_options = options->word_options;
106 BlastExtensionOptions* ext_options = options->ext_options;
107 BlastHitSavingOptions* hit_options = options->hit_options;
108 BlastScoringOptions* score_options = options->score_options;
109 BlastEffectiveLengthsOptions* eff_len_options = options->eff_len_options;
110 BlastDatabaseOptions* db_options = options->db_options;
111 Boolean do_megablast = FALSE;
112 Boolean do_discontig = FALSE;
113 Int4 greedy_align = 0;
114 Int4 diag_separation = 0;
115 Int2 word_size = basic_options->word_size;
116 char *matrix;
117
118 if (Blast_SubjectIsTranslated(program_number)) {
119 Uint1* gc = NULL;
120 BLAST_GeneticCodeFind(db_options->genetic_code, &gc);
121 GenCodeSingletonAdd(db_options->genetic_code, gc);
122 free(gc);
123 }
124
125 if (program_number == eBlastTypeBlastn) {
126 if (basic_options->strand != Seq_strand_plus &&
127 basic_options->strand != Seq_strand_minus &&
128 basic_options->strand != Seq_strand_both) {
129 return -2;
130 }
131
132 if (basic_options->use_megablast == TRUE)
133 do_megablast = TRUE;
134
135 /* If the query sequence is large enough, set up a megablast search */
136
137 if (basic_options->hint != eNone &&
138 query_length > MEGABLAST_CUTOFF) {
139 do_megablast = TRUE;
140 if (basic_options->gapped_calculation)
141 greedy_align = 1;
142 }
143
144
145 /* If megablast was turned on but the input indicates a sensitive search
146 is desired, or if word size is <=12, which is not used in contiguous
147 megablast, switch to discontiguous megablast.
148 Because a sensitive search is the default, discontig. megablast will
149 be used by default when the first input sequence is large. */
150 if (do_megablast &&
151 (basic_options->hint == eSensitive ||
152 (word_size != 0 && word_size <= 12))) {
153 if (word_size == 0 || word_size > 12)
154 word_size = 11;
155 do_discontig = TRUE;
156 }
157
158 if (do_megablast && !do_discontig)
159 greedy_align = 1;
160 }
161
162
163 BLAST_FillLookupTableOptions(lookup_options,
164 program_number,
165 do_megablast,
166 basic_options->word_threshold,
167 word_size);
168
169 /* If discontiguous megablast is specified, choose
170 the 11-of-21 optimal template).*/
171
172 if (do_discontig) {
173 lookup_options->mb_template_length = 21;
174 lookup_options->mb_template_type = eMBWordOptimal;
175 }
176 else {
177 lookup_options->mb_template_length = 0;
178 lookup_options->mb_template_type = 0;
179 }
180
181 BLAST_FillQuerySetUpOptions(query_setup_options,
182 program_number,
183 basic_options->filter_string,
184 basic_options->strand);
185
186 BLAST_FillInitialWordOptions(word_options,
187 program_number,
188 0, /* default window size. */
189 0); /* default ungapped X dropoff */
190
191 /* If we need to enforce a single-hit method, reset window size to 0.
192 To enforce two-hit method, set window size to a default non-zero
193 value */
194 if (basic_options->init_seed_method == eOneHit)
195 word_options->window_size = 0;
196 else if (basic_options->init_seed_method == eTwoHits)
197 word_options->window_size = BLAST_WINDOW_SIZE_PROT;
198
199 BLAST_FillExtensionOptions(ext_options,
200 program_number,
201 greedy_align,
202 basic_options->gap_x_dropoff,
203 0); /* default final X dropoff */
204
205 if (basic_options->matrix == NULL)
206 matrix = BLAST_DEFAULT_MATRIX; /* BLOSUM62 */
207 else
208 matrix = basic_options->matrix;
209
210 BLAST_FillScoringOptions(score_options,
211 program_number,
212 (Boolean)greedy_align,
213 basic_options->nucleotide_mismatch,
214 basic_options->nucleotide_match,
215 matrix,
216 basic_options->gap_open,
217 basic_options->gap_extend);
218
219 score_options->gapped_calculation = basic_options->gapped_calculation;
220
221 if (do_megablast)
222 diag_separation = 6;
223
224 BLAST_FillHitSavingOptions(hit_options,
225 basic_options->cutoff_evalue,
226 0, /* default number of alignments saved */
227 score_options->gapped_calculation,
228 0, /* do not perform culling */
229 diag_separation);
230
231 hit_options->percent_identity = 0; /* no percent identity cutoff */
232 hit_options->longest_intron = basic_options->longest_intron; /* For uneven gap statistics. */
233
234 eff_len_options->db_length = (Int8)basic_options->db_length;
235
236 return 0;
237 }
238
239 Int2
240 BLAST_TwoSequencesSearch(BLAST_SummaryOptions *basic_options,
241 BioseqPtr bsp1, BioseqPtr bsp2,
242 SeqAlign **seqalign_out)
243 {
244 enum blast_type program_type = eChoose;
245 SeqLocPtr query_slp = NULL; /* sequence variables */
246 SeqLocPtr subject_slp = NULL;
247 Boolean seq1_is_aa, seq2_is_aa;
248 Int2 status = 0;
249 SBlastSeqalignArray* seqalign_arr=NULL;
250
251 /* sanity checks */
252
253 *seqalign_out = NULL;
254 if (bsp1 == NULL || bsp2 == NULL)
255 return 0;
256
257 seq1_is_aa = ISA_aa(bsp1->mol);
258 seq2_is_aa = ISA_aa(bsp2->mol);
259
260 /* Find program type consistent with the sequences. */
261 if (!seq1_is_aa && !seq2_is_aa) {
262 if (basic_options->program == eTblastx)
263 program_type = eTblastx;
264 else
265 program_type = eBlastn;
266 } else if (seq1_is_aa && seq2_is_aa) {
267 program_type = eBlastp;
268 } else if (!seq1_is_aa && seq2_is_aa) {
269 program_type = eBlastx;
270 } else if (seq1_is_aa && !seq2_is_aa) {
271 program_type = eTblastn;
272 }
273
274 /* Check if program type in options is consistent with the one determined
275 from sequences. */
276 if (basic_options->program == eChoose)
277 basic_options->program = program_type;
278 else if (basic_options->program != program_type)
279 return -1;
280
281 /* Convert the bioseqs into seqlocs. */
282
283 ValNodeAddPointer(&query_slp, SEQLOC_WHOLE,
284 SeqIdDup(SeqIdFindBest(bsp1->id, SEQID_GI)));
285 if (!query_slp)
286 return -1;
287 ValNodeAddPointer(&subject_slp, SEQLOC_WHOLE,
288 SeqIdDup(SeqIdFindBest(bsp2->id, SEQID_GI)));
289 if (!subject_slp)
290 return -1;
291
292 status = BLAST_TwoSeqLocSets(basic_options, query_slp, subject_slp,
293 NULL, &seqalign_arr, NULL, NULL, NULL);
294
295 if (seqalign_arr && seqalign_arr->num_queries)
296 {
297 *seqalign_out = seqalign_arr->array[0];
298 seqalign_arr->array[0] = NULL;
299 SBlastSeqalignArrayFree(seqalign_arr);
300 }
301
302 SeqLocFree(query_slp);
303 SeqLocFree(subject_slp);
304
305 return status;
306 }
307
308 /** Calculates total length of a list of sequence locations.
309 * @param seqloc List of SeqLoc's [in]
310 * @return Total length of all SeqLoc's in the list.
311 */
312 static Int4
313 s_SeqLocListLen(SeqLoc* seqloc)
314 {
315 Int4 length = 0;
316
317 for ( ; seqloc; seqloc = seqloc->next)
318 length += SeqLocLen(seqloc);
319
320 return length;
321 }
322
323 Int2
324 Blast_SearchOptionsFromSummaryOptions(const BLAST_SummaryOptions *basic_options,
325 SeqLoc* query_seqloc,
326 Blast_SummaryReturn* extra_returns,
327 SBlastOptions* *search_options,
328 char* *program_name)
329 {
330 const char *kProgram = NULL;
331 Int2 status = 0;
332
333 switch(basic_options->program) {
334 case eBlastn:
335 kProgram = "blastn";
336 break;
337 case eBlastp:
338 kProgram = "blastp";
339 break;
340 case eBlastx:
341 kProgram = "blastx";
342 break;
343 case eTblastn:
344 kProgram = "tblastn";
345 break;
346 case eTblastx:
347 kProgram = "tblastx";
348 break;
349 default:
350 return -1;
351 }
352
353 status = SBlastOptionsNew(kProgram, search_options, extra_returns);
354
355 if (status)
356 return -1;
357
358 status = s_TwoSeqBasicFillOptions(basic_options, *search_options,
359 s_SeqLocListLen(query_seqloc));
360
361 if (program_name)
362 *program_name = strdup(kProgram);
363
364 return status;
365 }
366
367 Int2
368 BLAST_TwoSeqLocSets(const BLAST_SummaryOptions *basic_options,
369 SeqLoc* query_seqloc, SeqLoc* subject_seqloc,
370 SeqLoc* masking_locs,
371 SBlastSeqalignArray* *seqalign_arr,
372 SeqLoc** filter_out,
373 Boolean* mask_at_hash,
374 Blast_SummaryReturn* *extra_returns_ptr)
375 {
376 SBlastOptions* options = NULL;
377 char *program_name = NULL;
378 Int2 status = 0;
379 Blast_SummaryReturn* extra_returns;
380
381 if (!basic_options || !query_seqloc || !subject_seqloc)
382 return -1;
383
384 extra_returns = Blast_SummaryReturnNew();
385
386 status =
387 Blast_SearchOptionsFromSummaryOptions(basic_options, query_seqloc,
388 extra_returns, &options,
389 &program_name);
390 sfree(program_name);
391
392 if (!status) {
393 status =
394 Blast_TwoSeqLocSetsAdvanced(query_seqloc, subject_seqloc,
395 masking_locs, options, NULL, seqalign_arr, filter_out,
396 extra_returns);
397 }
398
399 if (mask_at_hash)
400 *mask_at_hash = SBlastOptionsGetMaskAtHash(options);
401
402 options = SBlastOptionsFree(options);
403
404 if (extra_returns_ptr)
405 *extra_returns_ptr = extra_returns;
406 else
407 Blast_SummaryReturnFree(extra_returns);
408
409 return status;
410 }
411
412 /* @} */
413
414 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |