NCBI C Toolkit Cross Reference

C/algo/blast/api/twoseq_api.c


  1 /* $Id: twoseq_api.c,v 1.61 2009/05/28 14:55:09 camacho Exp $
  2 ***************************************************************************
  3 *                                                                         *
  4 *                             COPYRIGHT NOTICE                            *
  5 *                                                                         *
  6 * This software/database is categorized as "United States Government      *
  7 * Work" under the terms of the United States Copyright Act.  It was       *
  8 * produced as part of the author's official duties as a Government        *
  9 * employee and thus can not be copyrighted.  This software/database is    *
 10 * freely available to the public for use without a copyright notice.      *
 11 * Restrictions can not be placed on its present or future use.            *
 12 *                                                                         *
 13 * Although all reasonable efforts have been taken to ensure the accuracy  *
 14 * and reliability of the software and data, the National Library of       *
 15 * Medicine (NLM) and the U.S. Government do not and can not warrant the   *
 16 * performance or results that may be obtained by using this software,     *
 17 * data, or derivative works thereof.  The NLM and the U.S. Government     *
 18 * disclaim any and all warranties, expressed or implied, as to the        *
 19 * performance, merchantability or fitness for any particular purpose or   *
 20 * use.                                                                    *
 21 *                                                                         *
 22 * In any work or product derived from this material, proper attribution   *
 23 * of the author(s) as the source of the software or data would be         *
 24 * appreciated.                                                            *
 25 *                                                                         *
 26 * Author: Jason Papadopoulos                                              *
 27 ***************************************************************************/
 28 
 29 /** @file twoseq_api.c
 30  * Functions for C toolkit applications to compare two sequences using the 
 31  * rewritten blast engine. 
 32  */
 33 
 34 #include <algo/blast/core/blast_options.h>
 35 #include <algo/blast/core/blast_setup.h>
 36 #include <algo/blast/core/blast_message.h>
 37 #include <algo/blast/core/blast_util.h>
 38 #include <algo/blast/core/blast_engine.h>
 39 #include <algo/blast/core/blast_filter.h>
 40 #include <algo/blast/core/blast_nalookup.h>
 41 #include <algo/blast/core/gencode_singleton.h>
 42 #include <algo/blast/api/seqsrc_multiseq.h>
 43 #include <algo/blast/api/blast_seqalign.h>
 44 #include <algo/blast/api/blast_seq.h>
 45 #include <algo/blast/api/twoseq_api.h>
 46 #include <algo/blast/api/blast_returns.h>
 47 
 48 #include <algo/blast/api/blast_api.h>
 49 
 50 /** @addtogroup CToolkitAlgoBlast
 51  *
 52  * @{
 53  */
 54 
 55 Int2 BLAST_SummaryOptionsInit(BLAST_SummaryOptions **options)
 56 {
 57     BLAST_SummaryOptions *new_options = (BLAST_SummaryOptions *)calloc(1,
 58                                                  sizeof(BLAST_SummaryOptions));
 59     if (new_options == NULL) {
 60         *options = NULL;
 61         return -1;
 62     }
 63     
 64     new_options->hint = eBlastHint_Sensitive;
 65     new_options->program = eChoose;
 66     new_options->strand = Seq_strand_both;
 67     new_options->cutoff_evalue = 10.0;
 68     new_options->gapped_calculation = TRUE;
 69     new_options->use_megablast = FALSE;
 70     new_options->nucleotide_match = 1;
 71     new_options->nucleotide_mismatch = -3;
 72     new_options->longest_intron = 0;
 73     new_options->init_seed_method = eDefaultSeedType;
 74     new_options->gap_open = -1;
 75     new_options->gap_extend = -1;
 76 
 77     *options = new_options;
 78     return 0;
 79 } 
 80 
 81 BLAST_SummaryOptions*
 82 BLAST_SummaryOptionsFree(BLAST_SummaryOptions *options)
 83 {
 84     sfree(options->matrix);
 85     sfree(options->filter_string);
 86     sfree(options);
 87     return NULL;
 88 }
 89 
 90 /** Fills the core options structures, given the summary options.
 91  * @param basic_options Basic options set by client [in]
 92  * @param options All internal options structures [in]
 93  * @param query_length Length of query sequence [in]
 94  */
 95 static Int2 
 96 s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options,
 97                          SBlastOptions* options,
 98                          Int4 query_length)
 99 {
100     Int2 status = 0;
101     EBlastProgramType program_number = options->program;
102     LookupTableOptions* lookup_options = options->lookup_options;
103     QuerySetUpOptions* query_setup_options = options->query_options; 
104     BlastInitialWordOptions* word_options = options->word_options;
105     BlastExtensionOptions* ext_options = options->ext_options;
106     BlastHitSavingOptions* hit_options = options->hit_options;
107     BlastScoringOptions* score_options = options->score_options;
108     BlastEffectiveLengthsOptions* eff_len_options = options->eff_len_options;
109     BlastDatabaseOptions* db_options = options->db_options; 
110     Boolean do_megablast = FALSE;
111     Boolean do_discontig = FALSE;
112     Int4 greedy_align = 0;
113     Int4 diag_separation = 0;
114     Int2 word_size = basic_options->word_size;
115     char *matrix;
116 
117     if (Blast_SubjectIsTranslated(program_number)) {
118         Uint1* gc = NULL;
119         BLAST_GeneticCodeFind(db_options->genetic_code, &gc);
120         GenCodeSingletonAdd(db_options->genetic_code, gc);
121         free(gc);
122     }
123 
124     if (program_number == eBlastTypeBlastn) {
125         if (basic_options->strand != Seq_strand_plus &&
126             basic_options->strand != Seq_strand_minus &&
127             basic_options->strand != Seq_strand_both) {
128             return -2;
129         }
130         
131         if (basic_options->use_megablast == TRUE)
132            do_megablast = TRUE;
133 
134         /* If the query sequence is large enough, set up a megablast search */
135 
136         if (basic_options->hint != eBlastHint_None && 
137             query_length > MEGABLAST_CUTOFF) {
138             do_megablast = TRUE;
139             if (basic_options->gapped_calculation)
140                 greedy_align = 1;
141         }
142 
143 
144         /* If megablast was turned on but the input indicates a sensitive search
145            is desired, or if word size is <=12, which is not used in contiguous
146            megablast, switch to discontiguous megablast. 
147            Because a sensitive search is the default, discontig. megablast will 
148            be used by default when the first input sequence is large. */
149         if (do_megablast && 
150             (basic_options->hint == eBlastHint_Sensitive || 
151              (word_size != 0 && word_size <= 12))) {
152             if (word_size == 0 || word_size > 12)
153                 word_size = 11;
154             do_discontig = TRUE;
155         }
156 
157         if (do_megablast && !do_discontig)
158             greedy_align = 1;
159     }
160     
161 
162     BLAST_FillLookupTableOptions(lookup_options, 
163                                  program_number, 
164                                  do_megablast,
165                                  basic_options->word_threshold,
166                                  word_size);
167  
168     /* If discontiguous megablast is specified, choose
169        the 11-of-21 optimal template).*/
170 
171     if (do_discontig) {
172         lookup_options->mb_template_length = 21; 
173         lookup_options->mb_template_type = eMBWordOptimal;
174     }
175     else {
176         lookup_options->mb_template_length = 0; 
177         lookup_options->mb_template_type = 0;
178     }
179     
180     BLAST_FillQuerySetUpOptions(query_setup_options, 
181                                 program_number, 
182                                 basic_options->filter_string,
183                                 basic_options->strand);
184  
185     BLAST_FillInitialWordOptions(word_options, 
186                                  program_number, 
187                                  0,      /* default window size. */
188                                  0);     /* default ungapped X dropoff */
189  
190     /* If we need to enforce a single-hit method, reset window size to 0. 
191        To enforce two-hit method, set window size to a default non-zero 
192        value */
193     if (basic_options->init_seed_method == eOneHit)
194        word_options->window_size = 0;
195     else if (basic_options->init_seed_method == eTwoHits)
196        word_options->window_size = BLAST_WINDOW_SIZE_PROT;
197 
198     BLAST_FillExtensionOptions(ext_options, 
199                                program_number, 
200                                greedy_align, 
201                                basic_options->gap_x_dropoff,
202                                0);       /* default final X dropoff */
203  
204     if (basic_options->matrix == NULL)
205         matrix = BLAST_DEFAULT_MATRIX; /* BLOSUM62 */
206     else
207         matrix = basic_options->matrix;
208 
209     BLAST_FillScoringOptions(score_options, 
210                              program_number, 
211                              (Boolean)greedy_align, 
212                              basic_options->nucleotide_mismatch,
213                              basic_options->nucleotide_match,
214                              matrix,
215                              basic_options->gap_open,
216                              basic_options->gap_extend);
217  
218     score_options->gapped_calculation = basic_options->gapped_calculation;
219  
220     if (do_megablast)
221         diag_separation = 6;
222 
223     BLAST_FillHitSavingOptions(hit_options, 
224                                basic_options->cutoff_evalue,
225                                0,     /* default number of alignments saved */
226                                score_options->gapped_calculation,
227                                0,     /* do not perform culling */
228                                diag_separation);
229 
230     hit_options->percent_identity = 0;   /* no percent identity cutoff */
231     hit_options->longest_intron = basic_options->longest_intron;   /* For uneven gap statistics. */
232   
233     eff_len_options->db_length = (Int8)basic_options->db_length;
234 
235     return 0;
236 }
237 
238 Int2 
239 BLAST_TwoSequencesSearch(BLAST_SummaryOptions *basic_options,
240                          BioseqPtr bsp1, BioseqPtr bsp2, 
241                          SeqAlign **seqalign_out)
242 {
243     enum blast_type program_type = eChoose;
244     SeqLocPtr query_slp = NULL;      /* sequence variables */
245     SeqLocPtr subject_slp = NULL;
246     Boolean seq1_is_aa, seq2_is_aa;
247     Int2 status = 0;
248     SBlastSeqalignArray* seqalign_arr=NULL;
249 
250     /* sanity checks */
251 
252     *seqalign_out = NULL;
253     if (bsp1 == NULL || bsp2 == NULL)
254         return 0;
255 
256     seq1_is_aa = ISA_aa(bsp1->mol);
257     seq2_is_aa = ISA_aa(bsp2->mol);
258 
259     /* Find program type consistent with the sequences. */
260     if (!seq1_is_aa && !seq2_is_aa) {
261        if (basic_options->program == eTblastx)
262           program_type = eTblastx;
263        else
264           program_type = eBlastn;
265     } else if (seq1_is_aa && seq2_is_aa) {
266        program_type = eBlastp;
267     } else if (!seq1_is_aa && seq2_is_aa) {
268        program_type = eBlastx;
269     } else if (seq1_is_aa && !seq2_is_aa) {
270        program_type = eTblastn;
271     }
272 
273     /* Check if program type in options is consistent with the one determined
274        from sequences. */
275     if (basic_options->program == eChoose)
276        basic_options->program = program_type;
277     else if (basic_options->program != program_type)
278        return -1;
279 
280     /* Convert the bioseqs into seqlocs. */
281 
282     ValNodeAddPointer(&query_slp, SEQLOC_WHOLE,
283                       SeqIdDup(SeqIdFindBest(bsp1->id, SEQID_GI)));
284     if (!query_slp)
285        return -1;
286     ValNodeAddPointer(&subject_slp, SEQLOC_WHOLE,
287                       SeqIdDup(SeqIdFindBest(bsp2->id, SEQID_GI)));
288     if (!subject_slp)
289        return -1;
290 
291     status = BLAST_TwoSeqLocSets(basic_options, query_slp, subject_slp, 
292                                  NULL, &seqalign_arr, NULL, NULL, NULL);
293 
294     if (seqalign_arr && seqalign_arr->num_queries)
295     {
296            *seqalign_out = seqalign_arr->array[0];
297            seqalign_arr->array[0] = NULL;
298            SBlastSeqalignArrayFree(seqalign_arr);
299     }
300 
301     SeqLocFree(query_slp);
302     SeqLocFree(subject_slp);
303 
304     return status;
305 }
306 
307 /** Calculates total length of a list of sequence locations. 
308  * @param seqloc List of SeqLoc's [in]
309  * @return Total length of all SeqLoc's in the list.
310  */
311 static Int4 
312 s_SeqLocListLen(SeqLoc* seqloc)
313 {
314    Int4 length = 0;
315 
316    for ( ; seqloc; seqloc = seqloc->next)
317       length += SeqLocLen(seqloc);
318 
319    return length;
320 }
321 
322 Int2 
323 Blast_SearchOptionsFromSummaryOptions(const BLAST_SummaryOptions *basic_options,
324                                       SeqLoc* query_seqloc,
325                                       Blast_SummaryReturn* extra_returns, 
326                                       SBlastOptions* *search_options,
327                                       char* *program_name)
328 {
329     const char *kProgram = NULL;
330     Int2 status = 0;
331 
332     switch(basic_options->program) {
333     case eBlastn:
334        kProgram = "blastn";
335        break;
336     case eBlastp:
337        kProgram = "blastp";
338        break;
339     case eBlastx:
340        kProgram = "blastx";
341        break;
342     case eTblastn:
343        kProgram = "tblastn";
344        break;
345     case eTblastx:
346        kProgram = "tblastx";
347        break;
348     default:
349        return -1;
350     }
351 
352     status = SBlastOptionsNew(kProgram, search_options, extra_returns);
353 
354     if (status)
355         return -1;
356     
357     status = s_TwoSeqBasicFillOptions(basic_options, *search_options, 
358                                       s_SeqLocListLen(query_seqloc));
359 
360     if (program_name)
361         *program_name = strdup(kProgram);
362 
363     return status;
364 }
365 
366 Int2 
367 BLAST_TwoSeqLocSets(const BLAST_SummaryOptions *basic_options,
368                     SeqLoc* query_seqloc, SeqLoc* subject_seqloc, 
369                     SeqLoc* masking_locs,
370                     SBlastSeqalignArray* *seqalign_arr,
371                     SeqLoc** filter_out,
372                     Boolean* mask_at_hash,
373                     Blast_SummaryReturn* *extra_returns_ptr)
374 {
375     SBlastOptions* options = NULL;
376     char *program_name = NULL;
377     Int2 status = 0;
378     Blast_SummaryReturn* extra_returns;
379 
380     if (!basic_options || !query_seqloc || !subject_seqloc)
381         return -1;
382 
383     extra_returns = Blast_SummaryReturnNew();
384 
385     status = 
386         Blast_SearchOptionsFromSummaryOptions(basic_options, query_seqloc, 
387                                               extra_returns, &options, 
388                                               &program_name);
389     sfree(program_name);
390 
391     if (!status) {
392         status = 
393             Blast_TwoSeqLocSetsAdvanced(query_seqloc, subject_seqloc, 
394                 masking_locs, options, NULL, seqalign_arr, filter_out, 
395                 extra_returns);
396     }
397 
398     if (mask_at_hash)
399         *mask_at_hash = SBlastOptionsGetMaskAtHash(options);
400 
401     options = SBlastOptionsFree(options);
402 
403     if (extra_returns_ptr)
404         *extra_returns_ptr = extra_returns;
405     else 
406         Blast_SummaryReturnFree(extra_returns);
407 
408     return status;
409 }
410 
411 /* @} */
412 
413 

[ source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.