NCBI C Toolkit Cross Reference

C/algo/blast/api/twoseq_api.c


  1 /* $Id: twoseq_api.c,v 1.59 2007/03/20 15:17:16 kans Exp $
  2 ***************************************************************************
  3 *                                                                         *
  4 *                             COPYRIGHT NOTICE                            *
  5 *                                                                         *
  6 * This software/database is categorized as "United States Government      *
  7 * Work" under the terms of the United States Copyright Act.  It was       *
  8 * produced as part of the author's official duties as a Government        *
  9 * employee and thus can not be copyrighted.  This software/database is    *
 10 * freely available to the public for use without a copyright notice.      *
 11 * Restrictions can not be placed on its present or future use.            *
 12 *                                                                         *
 13 * Although all reasonable efforts have been taken to ensure the accuracy  *
 14 * and reliability of the software and data, the National Library of       *
 15 * Medicine (NLM) and the U.S. Government do not and can not warrant the   *
 16 * performance or results that may be obtained by using this software,     *
 17 * data, or derivative works thereof.  The NLM and the U.S. Government     *
 18 * disclaim any and all warranties, expressed or implied, as to the        *
 19 * performance, merchantability or fitness for any particular purpose or   *
 20 * use.                                                                    *
 21 *                                                                         *
 22 * In any work or product derived from this material, proper attribution   *
 23 * of the author(s) as the source of the software or data would be         *
 24 * appreciated.                                                            *
 25 *                                                                         *
 26 * Author: Jason Papadopoulos                                              *
 27 ***************************************************************************/
 28 
 29 /** @file twoseq_api.c
 30  * Functions for C toolkit applications to compare two sequences using the 
 31  * rewritten blast engine. 
 32  */
 33 
 34 #include <algo/blast/core/blast_options.h>
 35 #include <algo/blast/core/blast_setup.h>
 36 #include <algo/blast/core/blast_message.h>
 37 #include <algo/blast/core/blast_util.h>
 38 #include <algo/blast/core/blast_engine.h>
 39 #include <algo/blast/core/blast_filter.h>
 40 #include <algo/blast/core/blast_nalookup.h>
 41 #include <algo/blast/core/hspstream_collector.h>
 42 #include <algo/blast/core/gencode_singleton.h>
 43 #include <algo/blast/api/seqsrc_multiseq.h>
 44 #include <algo/blast/api/blast_seqalign.h>
 45 #include <algo/blast/api/blast_seq.h>
 46 #include <algo/blast/api/twoseq_api.h>
 47 #include <algo/blast/api/blast_returns.h>
 48 
 49 #include <algo/blast/api/blast_api.h>
 50 
 51 /** @addtogroup CToolkitAlgoBlast
 52  *
 53  * @{
 54  */
 55 
 56 Int2 BLAST_SummaryOptionsInit(BLAST_SummaryOptions **options)
 57 {
 58     BLAST_SummaryOptions *new_options = (BLAST_SummaryOptions *)calloc(1,
 59                                                  sizeof(BLAST_SummaryOptions));
 60     if (new_options == NULL) {
 61         *options = NULL;
 62         return -1;
 63     }
 64     
 65     new_options->hint = eSensitive;
 66     new_options->program = eChoose;
 67     new_options->strand = Seq_strand_both;
 68     new_options->cutoff_evalue = 10.0;
 69     new_options->gapped_calculation = TRUE;
 70     new_options->use_megablast = FALSE;
 71     new_options->nucleotide_match = 1;
 72     new_options->nucleotide_mismatch = -3;
 73     new_options->longest_intron = 0;
 74     new_options->init_seed_method = eDefaultSeedType;
 75     new_options->gap_open = -1;
 76     new_options->gap_extend = -1;
 77 
 78     *options = new_options;
 79     return 0;
 80 } 
 81 
 82 BLAST_SummaryOptions*
 83 BLAST_SummaryOptionsFree(BLAST_SummaryOptions *options)
 84 {
 85     sfree(options->matrix);
 86     sfree(options->filter_string);
 87     sfree(options);
 88     return NULL;
 89 }
 90 
 91 /** Fills the core options structures, given the summary options.
 92  * @param basic_options Basic options set by client [in]
 93  * @param options All internal options structures [in]
 94  * @param query_length Length of query sequence [in]
 95  */
 96 static Int2 
 97 s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options,
 98                          SBlastOptions* options,
 99                          Int4 query_length)
100 {
101     Int2 status = 0;
102     EBlastProgramType program_number = options->program;
103     LookupTableOptions* lookup_options = options->lookup_options;
104     QuerySetUpOptions* query_setup_options = options->query_options; 
105     BlastInitialWordOptions* word_options = options->word_options;
106     BlastExtensionOptions* ext_options = options->ext_options;
107     BlastHitSavingOptions* hit_options = options->hit_options;
108     BlastScoringOptions* score_options = options->score_options;
109     BlastEffectiveLengthsOptions* eff_len_options = options->eff_len_options;
110     BlastDatabaseOptions* db_options = options->db_options; 
111     Boolean do_megablast = FALSE;
112     Boolean do_discontig = FALSE;
113     Int4 greedy_align = 0;
114     Int4 diag_separation = 0;
115     Int2 word_size = basic_options->word_size;
116     char *matrix;
117 
118     if (Blast_SubjectIsTranslated(program_number)) {
119         Uint1* gc = NULL;
120         BLAST_GeneticCodeFind(db_options->genetic_code, &gc);
121         GenCodeSingletonAdd(db_options->genetic_code, gc);
122         free(gc);
123     }
124 
125     if (program_number == eBlastTypeBlastn) {
126         if (basic_options->strand != Seq_strand_plus &&
127             basic_options->strand != Seq_strand_minus &&
128             basic_options->strand != Seq_strand_both) {
129             return -2;
130         }
131         
132         if (basic_options->use_megablast == TRUE)
133            do_megablast = TRUE;
134 
135         /* If the query sequence is large enough, set up a megablast search */
136 
137         if (basic_options->hint != eNone && 
138             query_length > MEGABLAST_CUTOFF) {
139             do_megablast = TRUE;
140             if (basic_options->gapped_calculation)
141                 greedy_align = 1;
142         }
143 
144 
145         /* If megablast was turned on but the input indicates a sensitive search
146            is desired, or if word size is <=12, which is not used in contiguous
147            megablast, switch to discontiguous megablast. 
148            Because a sensitive search is the default, discontig. megablast will 
149            be used by default when the first input sequence is large. */
150         if (do_megablast && 
151             (basic_options->hint == eSensitive || 
152              (word_size != 0 && word_size <= 12))) {
153             if (word_size == 0 || word_size > 12)
154                 word_size = 11;
155             do_discontig = TRUE;
156         }
157 
158         if (do_megablast && !do_discontig)
159             greedy_align = 1;
160     }
161     
162 
163     BLAST_FillLookupTableOptions(lookup_options, 
164                                  program_number, 
165                                  do_megablast,
166                                  basic_options->word_threshold,
167                                  word_size);
168  
169     /* If discontiguous megablast is specified, choose
170        the 11-of-21 optimal template).*/
171 
172     if (do_discontig) {
173         lookup_options->mb_template_length = 21; 
174         lookup_options->mb_template_type = eMBWordOptimal;
175     }
176     else {
177         lookup_options->mb_template_length = 0; 
178         lookup_options->mb_template_type = 0;
179     }
180     
181     BLAST_FillQuerySetUpOptions(query_setup_options, 
182                                 program_number, 
183                                 basic_options->filter_string,
184                                 basic_options->strand);
185  
186     BLAST_FillInitialWordOptions(word_options, 
187                                  program_number, 
188                                  0,      /* default window size. */
189                                  0);     /* default ungapped X dropoff */
190  
191     /* If we need to enforce a single-hit method, reset window size to 0. 
192        To enforce two-hit method, set window size to a default non-zero 
193        value */
194     if (basic_options->init_seed_method == eOneHit)
195        word_options->window_size = 0;
196     else if (basic_options->init_seed_method == eTwoHits)
197        word_options->window_size = BLAST_WINDOW_SIZE_PROT;
198 
199     BLAST_FillExtensionOptions(ext_options, 
200                                program_number, 
201                                greedy_align, 
202                                basic_options->gap_x_dropoff,
203                                0);       /* default final X dropoff */
204  
205     if (basic_options->matrix == NULL)
206         matrix = BLAST_DEFAULT_MATRIX; /* BLOSUM62 */
207     else
208         matrix = basic_options->matrix;
209 
210     BLAST_FillScoringOptions(score_options, 
211                              program_number, 
212                              (Boolean)greedy_align, 
213                              basic_options->nucleotide_mismatch,
214                              basic_options->nucleotide_match,
215                              matrix,
216                              basic_options->gap_open,
217                              basic_options->gap_extend);
218  
219     score_options->gapped_calculation = basic_options->gapped_calculation;
220  
221     if (do_megablast)
222         diag_separation = 6;
223 
224     BLAST_FillHitSavingOptions(hit_options, 
225                                basic_options->cutoff_evalue,
226                                0,     /* default number of alignments saved */
227                                score_options->gapped_calculation,
228                                0,     /* do not perform culling */
229                                diag_separation);
230 
231     hit_options->percent_identity = 0;   /* no percent identity cutoff */
232     hit_options->longest_intron = basic_options->longest_intron;   /* For uneven gap statistics. */
233   
234     eff_len_options->db_length = (Int8)basic_options->db_length;
235 
236     return 0;
237 }
238 
239 Int2 
240 BLAST_TwoSequencesSearch(BLAST_SummaryOptions *basic_options,
241                          BioseqPtr bsp1, BioseqPtr bsp2, 
242                          SeqAlign **seqalign_out)
243 {
244     enum blast_type program_type = eChoose;
245     SeqLocPtr query_slp = NULL;      /* sequence variables */
246     SeqLocPtr subject_slp = NULL;
247     Boolean seq1_is_aa, seq2_is_aa;
248     Int2 status = 0;
249     SBlastSeqalignArray* seqalign_arr=NULL;
250 
251     /* sanity checks */
252 
253     *seqalign_out = NULL;
254     if (bsp1 == NULL || bsp2 == NULL)
255         return 0;
256 
257     seq1_is_aa = ISA_aa(bsp1->mol);
258     seq2_is_aa = ISA_aa(bsp2->mol);
259 
260     /* Find program type consistent with the sequences. */
261     if (!seq1_is_aa && !seq2_is_aa) {
262        if (basic_options->program == eTblastx)
263           program_type = eTblastx;
264        else
265           program_type = eBlastn;
266     } else if (seq1_is_aa && seq2_is_aa) {
267        program_type = eBlastp;
268     } else if (!seq1_is_aa && seq2_is_aa) {
269        program_type = eBlastx;
270     } else if (seq1_is_aa && !seq2_is_aa) {
271        program_type = eTblastn;
272     }
273 
274     /* Check if program type in options is consistent with the one determined
275        from sequences. */
276     if (basic_options->program == eChoose)
277        basic_options->program = program_type;
278     else if (basic_options->program != program_type)
279        return -1;
280 
281     /* Convert the bioseqs into seqlocs. */
282 
283     ValNodeAddPointer(&query_slp, SEQLOC_WHOLE,
284                       SeqIdDup(SeqIdFindBest(bsp1->id, SEQID_GI)));
285     if (!query_slp)
286        return -1;
287     ValNodeAddPointer(&subject_slp, SEQLOC_WHOLE,
288                       SeqIdDup(SeqIdFindBest(bsp2->id, SEQID_GI)));
289     if (!subject_slp)
290        return -1;
291 
292     status = BLAST_TwoSeqLocSets(basic_options, query_slp, subject_slp, 
293                                  NULL, &seqalign_arr, NULL, NULL, NULL);
294 
295     if (seqalign_arr && seqalign_arr->num_queries)
296     {
297            *seqalign_out = seqalign_arr->array[0];
298            seqalign_arr->array[0] = NULL;
299            SBlastSeqalignArrayFree(seqalign_arr);
300     }
301 
302     SeqLocFree(query_slp);
303     SeqLocFree(subject_slp);
304 
305     return status;
306 }
307 
308 /** Calculates total length of a list of sequence locations. 
309  * @param seqloc List of SeqLoc's [in]
310  * @return Total length of all SeqLoc's in the list.
311  */
312 static Int4 
313 s_SeqLocListLen(SeqLoc* seqloc)
314 {
315    Int4 length = 0;
316 
317    for ( ; seqloc; seqloc = seqloc->next)
318       length += SeqLocLen(seqloc);
319 
320    return length;
321 }
322 
323 Int2 
324 Blast_SearchOptionsFromSummaryOptions(const BLAST_SummaryOptions *basic_options,
325                                       SeqLoc* query_seqloc,
326                                       Blast_SummaryReturn* extra_returns, 
327                                       SBlastOptions* *search_options,
328                                       char* *program_name)
329 {
330     const char *kProgram = NULL;
331     Int2 status = 0;
332 
333     switch(basic_options->program) {
334     case eBlastn:
335        kProgram = "blastn";
336        break;
337     case eBlastp:
338        kProgram = "blastp";
339        break;
340     case eBlastx:
341        kProgram = "blastx";
342        break;
343     case eTblastn:
344        kProgram = "tblastn";
345        break;
346     case eTblastx:
347        kProgram = "tblastx";
348        break;
349     default:
350        return -1;
351     }
352 
353     status = SBlastOptionsNew(kProgram, search_options, extra_returns);
354 
355     if (status)
356         return -1;
357     
358     status = s_TwoSeqBasicFillOptions(basic_options, *search_options, 
359                                       s_SeqLocListLen(query_seqloc));
360 
361     if (program_name)
362         *program_name = strdup(kProgram);
363 
364     return status;
365 }
366 
367 Int2 
368 BLAST_TwoSeqLocSets(const BLAST_SummaryOptions *basic_options,
369                     SeqLoc* query_seqloc, SeqLoc* subject_seqloc, 
370                     SeqLoc* masking_locs,
371                     SBlastSeqalignArray* *seqalign_arr,
372                     SeqLoc** filter_out,
373                     Boolean* mask_at_hash,
374                     Blast_SummaryReturn* *extra_returns_ptr)
375 {
376     SBlastOptions* options = NULL;
377     char *program_name = NULL;
378     Int2 status = 0;
379     Blast_SummaryReturn* extra_returns;
380 
381     if (!basic_options || !query_seqloc || !subject_seqloc)
382         return -1;
383 
384     extra_returns = Blast_SummaryReturnNew();
385 
386     status = 
387         Blast_SearchOptionsFromSummaryOptions(basic_options, query_seqloc, 
388                                               extra_returns, &options, 
389                                               &program_name);
390     sfree(program_name);
391 
392     if (!status) {
393         status = 
394             Blast_TwoSeqLocSetsAdvanced(query_seqloc, subject_seqloc, 
395                 masking_locs, options, NULL, seqalign_arr, filter_out, 
396                 extra_returns);
397     }
398 
399     if (mask_at_hash)
400         *mask_at_hash = SBlastOptionsGetMaskAtHash(options);
401 
402     options = SBlastOptionsFree(options);
403 
404     if (extra_returns_ptr)
405         *extra_returns_ptr = extra_returns;
406     else 
407         Blast_SummaryReturnFree(extra_returns);
408 
409     return status;
410 }
411 
412 /* @} */
413 
414 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.