NCBI C Toolkit Cross Reference

C/algo/blast/api/blast_seq.c


  1 #ifndef SKIP_DOXYGEN_PROCESSING
  2 static char const rcsid[] = "$Id: blast_seq.c,v 1.89 2008/06/09 17:29:14 madden Exp $";
  3 #endif /* SKIP_DOXYGEN_PROCESSING */
  4 /*
  5 * ===========================================================================
  6 *
  7 *                            PUBLIC DOMAIN NOTICE
  8 *               National Center for Biotechnology Information
  9 *
 10 *  This software/database is a "United States Government Work" under the
 11 *  terms of the United States Copyright Act.  It was written as part of
 12 *  the author's official duties as a United States Government employee and
 13 *  thus cannot be copyrighted.  This software/database is freely available
 14 *  to the public for use. The National Library of Medicine and the U.S.
 15 *  Government have not placed any restriction on its use or reproduction.
 16 *
 17 *  Although all reasonable efforts have been taken to ensure the accuracy
 18 *  and reliability of the software and data, the NLM and the U.S.
 19 *  Government do not and cannot warrant the performance or results that
 20 *  may be obtained by using this software or data. The NLM and the U.S.
 21 *  Government disclaim all warranties, express or implied, including
 22 *  warranties of performance, merchantability or fitness for any particular
 23 *  purpose.
 24 *
 25 *  Please cite the author in any work or product based on this material.
 26 *
 27 *  Author: Ilya Dondoshansky
 28 * ===========================================================================*/
 29 
 30 /** @file blast_seq.c
 31  * Functions converting between SeqLocs and structures used in BLAST.
 32  */
 33 
 34 #include <seqport.h>
 35 #include <sequtil.h>
 36 #include <objloc.h>
 37 #include <readdb.h>
 38 #include <algo/blast/api/blast_seq.h>
 39 #include <algo/blast/core/blast_filter.h>
 40 #include <algo/blast/core/blast_util.h>
 41 #include <algo/blast/core/blast_encoding.h>
 42 #include <algo/blast/core/blast_setup.h> /* For BlastSeqLoc_RestrictToInterval */
 43 
 44 /** @addtogroup CToolkitAlgoBlast
 45  *
 46  * @{
 47  */
 48 
 49 
 50 /** Structure used for hash-based comparison of sequence IDs */
 51 typedef struct SeqIdHash {
 52     SeqId *id;           /**< The ID of this entry */
 53     Int4 query_index;    /**< index of query with this ID */
 54     Int4 next_id;        /**< Offset of the next hash entry in a chain */
 55 } SeqIdHash;
 56 
 57 Boolean
 58 BlastSeqlocsHaveDuplicateIDs(SeqLoc* query_seqlocs)
 59 {
 60    Boolean retval = FALSE;
 61    const Int4 kNumSeqs = ValNodeLen(query_seqlocs);
 62    const Int4 kLog2HashSize = 11;
 63    SeqIdHash *id_entries;
 64    Uint4 *hashtable;
 65    Int4 curr_id_num;
 66    SeqLocPtr slp;
 67 
 68    if (kNumSeqs == 1)
 69       return FALSE;
 70 
 71    /* allocate hashtable */
 72    hashtable = (Uint4 *)calloc((size_t)1 << kLog2HashSize, sizeof(Uint4));
 73    id_entries = (SeqIdHash *)malloc((kNumSeqs + 1) * sizeof(SeqIdHash));
 74 
 75    for (slp = query_seqlocs, curr_id_num = 1; slp; slp = slp->next) {
 76 
 77        Uint4 hashval;
 78        SeqIdPtr id = SeqLocId(slp);
 79        Char buffer[64];
 80 
 81        /* hash the ID of the next query sequence */
 82        SeqIdLabel(id, buffer, sizeof(buffer), OM_LABEL_CONTENT);
 83        hashval = readdb_sequence_hash(buffer, (int)strlen(buffer));
 84        hashval = hashval >> (32 - kLog2HashSize);
 85        if (hashtable[hashval] != 0) {
 86           Int4 offset = hashtable[hashval];
 87           /* check the chain of ID's in the resulting hash
 88              entry for a duplicate */
 89           while (offset != 0) {
 90              SeqIdHash *curr_entry = id_entries + offset;
 91              if (SeqIdMatch(id, curr_entry->id)) {
 92                  retval = TRUE;
 93                  goto clean_up;
 94              }
 95              offset = curr_entry->next_id;
 96           }
 97        }
 98 
 99        /* no duplicate found; add the ID to the hashtable */
100        id_entries[curr_id_num].id = id;
101        id_entries[curr_id_num].next_id = hashtable[hashval];
102        hashtable[hashval] = curr_id_num++;
103    }
104 
105 clean_up:
106    sfree(hashtable);
107    sfree(id_entries);
108    return retval;
109 }
110 
111 /** Converts a SeqLocPtr to a BlastSeqLoc, used for formatting.
112  * @param slp SeqLocPtr to be converted [in]
113  * @param head_loc BlastSeqLoc returned from last call [in]
114  * @return pointer to BlastSeqLoc
115  */
116 static BlastSeqLoc* 
117 s_BlastSeqLocFromSeqLoc(SeqLocPtr slp, BlastSeqLoc* head_loc)
118 {
119    BlastSeqLoc* last_loc = head_loc;
120 
121    if (slp == NULL)
122       return NULL;
123 
124    if (slp->choice == SEQLOC_PACKED_INT)
125       slp = (SeqLocPtr) slp->data.ptrvalue;
126 
127    for ( ; slp; slp = slp->next) {
128       SeqIntPtr si = (SeqIntPtr) slp->data.ptrvalue;
129       if (!head_loc) {
130          last_loc = head_loc = BlastSeqLocNew(&last_loc, si->from, si->to);
131       } else {
132          last_loc = BlastSeqLocNew(&last_loc, si->from, si->to);
133       }
134    }
135    return head_loc;
136 }
137 
138 BlastMaskLoc* 
139 BlastMaskLocFromSeqLoc(SeqLoc* mask_seqlocs, SeqLoc* query_seqlocs, 
140                        EBlastProgramType program_number)
141 {
142     const Int4 kNumSeqs = ValNodeLen(query_seqlocs);
143     BlastMaskLoc* retval = NULL;
144     Int4 query_index = 0;
145     const unsigned int kNumContexts = BLAST_GetNumberOfContexts(program_number);
146     const Int4 kLog2HashSize = 11;
147     SeqIdHash *id_entries;
148     Uint4 *hashtable;
149     Int4 curr_id_num = 1;
150     SeqLocPtr query_slp, mask_slp;
151 
152     if (!mask_seqlocs)
153         return NULL;
154 
155     retval = BlastMaskLocNew(kNumSeqs*kNumContexts);
156 
157     /* create hashtable for query IDs */
158     hashtable = (Uint4 *)calloc((size_t)1 << kLog2HashSize, sizeof(Uint4));
159     id_entries = (SeqIdHash *)malloc((kNumSeqs + 1) * sizeof(SeqIdHash));
160  
161     /* add the ID of each query sequence to the hashtable */
162     for (query_slp = query_seqlocs; query_slp; query_slp = query_slp->next) {
163         Uint4 hashval;
164         SeqIdPtr seq_id = SeqLocId(query_slp);
165         Char buffer[64];
166  
167         SeqIdLabel(seq_id, buffer, sizeof(buffer), OM_LABEL_CONTENT);
168         hashval = readdb_sequence_hash(buffer, (int)strlen(buffer));
169         hashval = hashval >> (32 - kLog2HashSize);
170  
171         id_entries[curr_id_num].id = seq_id;
172         id_entries[curr_id_num].query_index = query_index++;
173         id_entries[curr_id_num].next_id = hashtable[hashval];
174         hashtable[hashval] = curr_id_num++;
175     }
176 
177     /* for each mask location, find the query sequence containing
178        that mask and add to the list of filter locations for
179        that query. Note that this assumes IDs for all query
180        sequences are unique */
181 
182     for (mask_slp = mask_seqlocs; mask_slp; mask_slp = mask_slp->next) {
183        SeqLocPtr current_mask = (SeqLocPtr) mask_slp->data.ptrvalue;
184        Uint4 hashval;
185        SeqIdPtr mask_id;
186        Char buffer[64];
187 
188        if (current_mask == NULL)
189            continue;
190 
191        mask_id = SeqLocId(current_mask);
192        SeqIdLabel(mask_id, buffer, sizeof(buffer), OM_LABEL_CONTENT);
193        hashval = readdb_sequence_hash(buffer, (int)strlen(buffer));
194        hashval = hashval >> (32 - kLog2HashSize);
195 
196        /* examine only the query IDs that hash to the same value */
197        if (hashtable[hashval] != 0) {
198           Int4 offset = hashtable[hashval];
199           while (offset != 0) {
200 
201              SeqIdHash *q_entry = id_entries + offset;
202 
203              if (SeqIdMatch(mask_id, q_entry->id)) {
204                 Int4 context_idx = kNumContexts * q_entry->query_index;
205                 retval->seqloc_array[context_idx] = 
206                               s_BlastSeqLocFromSeqLoc(current_mask,
207                                          retval->seqloc_array[context_idx]);
208                 break;
209              }
210              offset = q_entry->next_id;
211           }
212        }
213     }
214 
215     sfree(hashtable);
216     sfree(id_entries);
217 
218     /* iterate through the query sequences and compute
219        the complement of the filtering locations for each */
220 
221     for (query_slp = query_seqlocs, query_index = 0; 
222          query_slp; 
223          query_slp = query_slp->next, query_index++) {
224 
225         const int kCtxIndex = kNumContexts * query_index; /* context index */
226          
227         if (retval->seqloc_array[kCtxIndex])
228         {
229             const Boolean kIsNa = Blast_QueryIsNucleotide(program_number) &&
230                 !Blast_QueryIsTranslated(program_number) &&
231                 !Blast_ProgramIsPhiBlast(program_number);
232             BlastSeqLoc_RestrictToInterval(&retval->seqloc_array[kCtxIndex], 
233                                            SeqLocStart(query_slp), 
234                                            SeqLocStop(query_slp));
235             if (kIsNa) {
236                 /* N.B.: Unlike in the C++ APIs, this logic is only applied to
237                  * non-translated nucleotide queries. See comment for
238                  * BlastMaskLocDNAToProtein */
239                 Uint1 strand = SeqLocStrand(query_slp);
240                 if (strand == Seq_strand_minus) {
241                     retval->seqloc_array[kCtxIndex+1] = 
242                         retval->seqloc_array[kCtxIndex];
243                     retval->seqloc_array[kCtxIndex] = NULL;
244                 } else if (strand == Seq_strand_plus) {
245                     retval->seqloc_array[kCtxIndex+1] = NULL;
246                 } else {
247                     retval->seqloc_array[kCtxIndex+1] = 
248                         BlastSeqLocListDup(retval->seqloc_array[kCtxIndex]);
249                 }
250             }
251         }
252     }
253     
254     return retval;
255 }
256 
257 SeqLoc*
258 Blast_ValNodeMaskListFree(SeqLoc* mask_loc)
259 {
260     ValNode* mask_var;
261     for (mask_var = mask_loc; mask_var; mask_var = mask_var->next) 
262         SeqLocSetFree((SeqLoc*)mask_var->data.ptrvalue);
263     mask_loc = ValNodeFree(mask_loc);
264     return mask_loc;
265 }
266 
267 SeqLocPtr BlastMaskLocToSeqLoc(EBlastProgramType program_number, 
268                                const BlastMaskLoc* mask_loc, 
269                                SeqLoc* query_loc)
270 {
271    SeqLocPtr retval = NULL, retval_tail = NULL;
272    Int4 index;
273    const Boolean k_translate = Blast_QueryIsTranslated(program_number);
274    const Uint1 k_num_frames = BLAST_GetNumberOfContexts(program_number);
275    const Boolean kIsNucl = (program_number == eBlastTypeBlastn);
276    SeqLoc* slp;
277    Boolean all_minus = TRUE;
278 
279    if (mask_loc == NULL || mask_loc->seqloc_array == NULL)
280       return NULL;
281 
282    for (slp = query_loc; slp; slp = slp->next)
283    {
284         Uint1 strand = SeqLocStrand(slp);
285         if (strand != Seq_strand_minus)
286         {
287            all_minus = FALSE;
288            break;
289         }
290    }
291 
292    for (index=0, slp = query_loc; slp; ++index, slp = slp->next)
293    {
294       const int kCtxIndex = k_num_frames * index; /* context index */
295       Int4 tmp_index;
296       Int4 slp_from = SeqLocStart(slp);
297       SeqIdPtr seqid = SeqLocId(slp);
298       for (tmp_index=kCtxIndex; tmp_index<(kCtxIndex+k_num_frames); tmp_index++)
299       {
300          BlastSeqLoc* loc = NULL;
301          SeqLocPtr mask_slp_head = NULL, mask_slp_tail = NULL;
302          if (all_minus || BlastIsReverseStrand(kIsNucl , tmp_index) == FALSE)
303          {
304             for (loc = mask_loc->seqloc_array[tmp_index]; loc; loc = loc->next)
305             {
306                SeqIntPtr si = SeqIntNew();
307                si->from = loc->ssr->left + slp_from;
308                si->to = loc->ssr->right + slp_from;
309                si->id = SeqIdDup(seqid);
310                /* Append the pointer, but also keep track of the tail of the list
311                 * so that appending to the list is a constant operation */
312                mask_slp_tail = ValNodeAddPointer
313                    ( (mask_slp_tail ? &mask_slp_tail : &mask_slp_head), 
314                      SEQLOC_INT, si);
315             }
316          }
317 
318          if (mask_slp_head) {
319             SeqLocPtr new_mask_slp = ValNodeAddPointer(NULL, SEQLOC_PACKED_INT, 
320                                              mask_slp_head);
321             Uint1 tmp_choice = 0;
322             /* The 'choice' of the SeqLoc in masks should show the frame,
323                with values 1..6 when queries are translated; otherwise
324                it does not matter. */
325             if (k_translate)  
326                 tmp_choice = (tmp_index % NUM_FRAMES) + 1;
327             else
328                 tmp_choice = 0;
329 
330             /* Append the pointer, but also keep track of the tail of the list
331              * so that appending to the list is a constant operation */
332             retval_tail = ValNodeAddPointer
333                 ( (retval_tail ? &retval_tail : &retval), 
334                   tmp_choice, new_mask_slp);
335         }
336       }
337    }
338    return retval;
339 }
340 
341 /** Set field values for one element of the context array of a
342  * concatenated query.  All previous contexts should have already been
343  * assigned correct values.
344  * @param qinfo  Query info structure containing contexts. [in/out]
345  * @param index  Index of the context to fill. [in]
346  * @param length Length of this context. [in]
347  */
348 static void
349 s_QueryInfoSetContextInfo(BlastQueryInfo*   qinfo,
350                           Uint4             index,
351                           Uint4             length)
352 {
353     if (index) {
354         Uint4 prev_loc = qinfo->contexts[index-1].query_offset;
355         Uint4 prev_len = qinfo->contexts[index-1].query_length;
356         
357         Uint4 shift = prev_len ? prev_len + 1 : 0;
358         
359         qinfo->contexts[index].query_offset = prev_loc + shift;
360         qinfo->contexts[index].query_length = length;
361         if (length == 0)
362             qinfo->contexts[index].is_valid = FALSE;
363 
364     } else {
365         /* First context */
366         qinfo->contexts[0].query_offset = 0;
367         qinfo->contexts[0].query_length = length;
368         if (length == 0)
369             qinfo->contexts[0].is_valid = FALSE;
370     }
371 }
372 
373 /** Sets up the query information structure with all contexts' data.
374  * @param slp List of query Seq-loc's [in]
375  * @param program Type of BLAST program [in]
376  * @param query_info_ptr Pointer to the structure to populate. [out]
377  */
378 static Int4 
379 s_QueryInfoSetUp(SeqLocPtr slp, EBlastProgramType program, 
380                  BlastQueryInfo** query_info_ptr)
381 {
382    Uint4 length, protein_length;
383    Boolean translate = 
384       (program == eBlastTypeBlastx || program == eBlastTypeTblastx ||
385        program == eBlastTypeRpsTblastn);
386    Boolean is_na = (program == eBlastTypeBlastn || 
387                     program == eBlastTypePhiBlastn);
388    Int2 num_frames, frame;
389    Uint1 strand;
390    BlastQueryInfo* query_info;
391    Int4 index;
392    Uint4 max_length = 0;
393 
394    if (translate)
395       num_frames = NUM_FRAMES;
396    else if (is_na)
397       num_frames = 2;
398    else
399       num_frames = 1;
400 
401    if ((query_info = BlastQueryInfoNew(program, ValNodeLen(slp))) == NULL) 
402       return -1;
403 
404    if ((strand = SeqLocStrand(slp)) == Seq_strand_minus) {
405       if (translate)
406          query_info->first_context = 3;
407       else
408          query_info->first_context = 1;
409    }
410    
411    /* Fill the context offsets */
412    for (index = 0; slp; slp = slp->next, index += num_frames) {
413       length = SeqLocLen(slp);  /* FIXME: could return -1 */
414       strand = SeqLocStrand(slp);
415       if (translate) {
416          Int2 first_frame, last_frame;
417          if (strand == Seq_strand_plus) {
418             first_frame = 0;
419             last_frame = 2;
420          } else if (strand == Seq_strand_minus) {
421             first_frame = 3;
422             last_frame = 5;
423          } else {
424             first_frame = 0;
425             last_frame = 5;
426          }
427 
428          /* Set the unused initial contexts if any */
429          for (frame = 0; frame < first_frame; ++frame) {
430              s_QueryInfoSetContextInfo(query_info, index+frame, 0);
431          }
432          
433          for (frame = first_frame; frame <= last_frame; ++frame) {
434             protein_length = BLAST_GetTranslatedProteinLength(length, index+frame);
435             max_length = MAX(max_length, protein_length);
436 
437             s_QueryInfoSetContextInfo(query_info,
438                                      index+frame,
439                                      protein_length);
440          }
441 
442          /* Set the unused trailing contexts if any */
443          for (frame = last_frame + 1; frame < num_frames; ++frame) {
444              s_QueryInfoSetContextInfo(query_info, index+frame, 0);
445          }
446       } else {
447          max_length = MAX(max_length, length);
448          
449          if (is_na) {
450             if (strand == Seq_strand_plus) {
451                 s_QueryInfoSetContextInfo(query_info, index,   length);
452                 s_QueryInfoSetContextInfo(query_info, index+1, 0);
453             } else if (strand == Seq_strand_minus) {
454                 s_QueryInfoSetContextInfo(query_info, index,   0);
455                 s_QueryInfoSetContextInfo(query_info, index+1, length);
456             } else {
457                 s_QueryInfoSetContextInfo(query_info, index,   length);
458                 s_QueryInfoSetContextInfo(query_info, index+1, length);
459             }
460          } else {
461              s_QueryInfoSetContextInfo(query_info, index, length);
462          }
463       }
464    }
465    query_info->max_length = max_length;
466 
467    *query_info_ptr = query_info;
468    return 0;
469 }
470 
471 /** Given a SeqLoc, fills a preallocated sequence buffer in the correct 
472  * encoding.
473  * @param slp SeqLoc structure to get data from. [in]
474  * @param encoding What encoding to fill sequence buffer in? [in]
475  * @param buffer Buffer to fill.
476  */
477 static Int2 
478 s_SeqLocReadSequence(SeqLocPtr slp, EBlastEncoding encoding, Uint1** buffer)
479 {
480    Uint1* buffer_var = *buffer;
481    Int4 size, index;
482 
483    if (!buffer_var || !slp)
484       return -1;
485 
486    size = SeqPortStreamLoc(slp, STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
487                           buffer_var, NULL);
488 
489    switch (encoding) {
490    case eBlastEncodingProtein: 
491       for (index = 0; index < size; index++)
492          buffer_var[index] = AMINOACID_TO_NCBISTDAA[buffer_var[index]];
493       break;
494    case eBlastEncodingNcbi4na:
495       for (index = 0; index < size; index++) 
496          buffer_var[index] = IUPACNA_TO_NCBI4NA[buffer_var[index]];
497       break;
498    case eBlastEncodingNucleotide:
499       for (index = 0; index < size; index++) 
500          buffer_var[index] = IUPACNA_TO_BLASTNA[buffer_var[index]];
501       break;
502    default:
503        /* This function should not be called for any other encodings - 
504           return an error status. */
505        return -1;
506    }
507 
508    *buffer = buffer_var + size;
509    return 0;
510 }
511 
512 /** Fills sequence buffer for a single SeqLoc; fills both strands if necessary.
513  * @param slp Sequence location [in]
514  * @param encoding Encoding to use for the sequence buffer. [in]
515  * @param add_sentinel_bytes Should sentinel bytes be added at the ends of the 
516  *                           buffer? [in]
517  * @param both_strands Should buffer include both strands for a nucleotide 
518  *                     sequence? [in]
519  * @param buffer Buffer to populate. Must be already allocated. [in] [out]
520  */
521 static Int2 
522 s_SeqLocFillSequenceBuffer(SeqLocPtr slp, EBlastEncoding encoding, 
523     Boolean add_sentinel_bytes, Boolean both_strands, Uint1* buffer)
524 {
525    Uint1* buffer_var;
526    Uint1 sentinel = (encoding == eBlastEncodingNucleotide 
527                      ? NCBI4NA_TO_BLASTNA[NULLB] 
528                      : NULLB);
529    Uint1 seq_code, strand;
530 
531    buffer_var = buffer;
532 
533    if (add_sentinel_bytes) {
534       *buffer_var = sentinel;
535       ++buffer_var;
536    }
537 
538    if (encoding == eBlastEncodingProtein) {
539       seq_code = Seq_code_ncbistdaa;
540       strand = Seq_strand_unknown;
541    } else {
542       seq_code = Seq_code_ncbi4na;
543       strand = SeqLocStrand(slp);
544    }
545 
546    s_SeqLocReadSequence(slp, encoding, &buffer_var);
547 
548    if (add_sentinel_bytes)
549       *buffer_var = sentinel;
550 
551    if (both_strands && strand == Seq_strand_both) {
552       SeqLocPtr tmp_slp=NULL;
553 
554       ++buffer_var;
555 
556       tmp_slp = SeqLocIntNew(SeqLocStart(slp), SeqLocStop(slp),
557                              Seq_strand_minus, SeqLocId(slp));
558             
559       s_SeqLocReadSequence(tmp_slp, encoding, &buffer_var);
560       if (add_sentinel_bytes)
561          *buffer_var = sentinel;
562 
563       SeqLocFree(tmp_slp);
564    }
565 
566    return 0;
567 }
568 
569 Int2 BLAST_GeneticCodeFind(Int4 gc, Uint1** genetic_code)
570 {
571    ValNodePtr vnp;
572    GeneticCodePtr gcp;
573    char* gen_code_eaa = NULL;
574    Uint1* gen_code_stdaa = NULL;
575    Int4 gen_code_length = 0, index;
576    SeqMapTablePtr smtp;
577 
578    gcp = GeneticCodeFind(gc, NULL);
579    for (vnp = (ValNodePtr)gcp->data.ptrvalue; vnp != NULL; 
580         vnp = vnp->next) {
581       if (vnp->choice == 3) {  /* ncbieaa */
582          gen_code_eaa = (char*)vnp->data.ptrvalue;
583          break;
584       }
585    }
586 
587    if (!gen_code_eaa)
588       return -1;
589    smtp = SeqMapTableFind(Seq_code_ncbistdaa, Seq_code_ncbieaa);
590    gen_code_length = (Int4)StrLen(gen_code_eaa);
591    *genetic_code = gen_code_stdaa = (Uint1*) calloc(gen_code_length+1, 1);
592 
593    if (!gen_code_stdaa)
594       return -2;
595 
596    for (index = 0; index < gen_code_length; ++index) {
597       gen_code_stdaa[index] = 
598          SeqMapTableConvert(smtp, gen_code_eaa[index]);
599    }
600    
601    return 0;
602 }
603 
604 /** s_GetSequence
605  * Purpose:     Get the sequence for the BLAST engine, put in a Uint1 buffer
606  * @param slp SeqLoc to extract sequence for [in]
607  * @param query_info The query information structure, pre-initialized,
608  *                   but filled here [in]
609  * @param query_options Query setup options, containing the genetic code for
610  *                      translation. N.B.: its strand_option field is ignored [in]
611  * @param num_frames How many frames to get for this sequence? [in]
612  * @param encoding In what encoding to retrieve the sequence? [in]
613  * @param buffer_out Buffer to hold plus strand or protein [out]
614  * @param buffer_length Length of buffer allocated [out]
615  */
616 static Int2 
617 s_GetSequence(SeqLocPtr slp, BlastQueryInfo* query_info, 
618    const QuerySetUpOptions* query_options, Uint1 num_frames, 
619    EBlastEncoding encoding, Uint1* *buffer_out, Int4 *buffer_length)
620 {
621    Int2         status=0; /* return value. */
622    Int4 total_length; /* Total length of all queries/frames/strands */
623    Int4         index; /* Loop counter */
624    SeqLocPtr    slp_var; /* loop variable */
625    Uint1*       buffer; /* buffer to fill. */
626    Boolean add_sentinel_bytes = TRUE;
627    Uint1* genetic_code=NULL;
628    Boolean translate = FALSE;
629    Int4 offset = 0;
630 
631    if (query_info) {
632        *buffer_length = total_length = QueryInfo_GetSeqBufLen(query_info);
633    } else {
634       /* Subject sequence in 2 sequences comparison */
635       *buffer_length = SeqLocLen(slp);
636       /* allow two extra bytes for sentinels or a trailing
637          null appended by the low-level sequence conversion */
638       total_length = (*buffer_length) + 2;
639       if (encoding == eBlastEncodingNcbi4na) {
640          /* Searches with translated subjects (tblastn, tblastx) */
641          add_sentinel_bytes = FALSE;
642       }
643    }
644 
645    if (num_frames == NUM_FRAMES) {
646       /* Sequence must be translated in 6 frames. This can only happen
647          for query - subject sequences are translated later. */
648       Int4 gc;
649       
650       translate = TRUE;
651       gc = (query_options ? query_options->genetic_code : 1);
652 
653       if ((status = BLAST_GeneticCodeFind(gc, &genetic_code)) != 0)
654          return status;
655    }
656 
657    *buffer_out = buffer = (Uint1 *) malloc((total_length)*sizeof(Uint1));
658    
659    for (index = 0, slp_var = slp; slp_var; 
660         slp_var = slp_var->next, index += num_frames)
661    {
662       if (translate) {
663          Uint1* na_buffer, *buffer_rev = NULL;
664          Int4 context, context_start, context_end;
665          Int4 na_length;
666          Uint1 strand;
667          
668 
669          na_length = SeqLocLen(slp_var);
670          strand = SeqLocStrand(slp_var);
671          /* Retrieve nucleotide sequence in an auxiliary buffer; 
672             then translate into the appropriate place in the 
673             preallocated buffer */
674          if (strand == Seq_strand_plus) {
675             na_buffer = (Uint1 *) malloc(na_length + 2);
676             context_start = 0;
677             context_end = 2;
678          } else if (strand == Seq_strand_minus) {
679             na_buffer = (Uint1 *) malloc(na_length + 2);
680             context_start = 3;
681             context_end = 5;
682          } else {
683             na_buffer = (Uint1*) malloc(2*na_length + 3);
684             context_start = 0;
685             context_end = 5;
686          }
687          s_SeqLocFillSequenceBuffer(slp_var, encoding, TRUE, TRUE, na_buffer);
688          if (strand == Seq_strand_both)
689             buffer_rev = na_buffer + na_length + 1;
690          else if (strand == Seq_strand_minus)
691             buffer_rev = na_buffer;
692 
693          for (context = context_start; context <= context_end; context++) {
694              offset = query_info->contexts[index+context].query_offset;
695              
696              BLAST_GetTranslation(na_buffer+1, buffer_rev, na_length,
697                                   BLAST_ContextToFrame(eBlastTypeBlastx, context),
698                                   &buffer[offset], genetic_code);
699          }
700          sfree(na_buffer);
701       } else {
702          /* This can happen both for query and subject, so query_info 
703             might not be initialized here. */
704           if (query_info)
705               offset = query_info->contexts[index].query_offset;
706           
707           s_SeqLocFillSequenceBuffer(slp_var, encoding, add_sentinel_bytes, 
708                                      (Boolean)(num_frames == 2), &buffer[offset]);
709       }
710       /* For subjects, do only one SeqLoc at a time */
711       if (!query_info)
712          break;
713    }
714 
715    sfree(genetic_code);
716 
717    return status;
718 }
719 
720 Int2 BLAST_SetUpQuery(EBlastProgramType program_number, 
721         SeqLocPtr query_slp, const QuerySetUpOptions* query_options, 
722         SeqLoc* masking_locs, BlastQueryInfo** query_info, 
723         BLAST_SequenceBlk* *query_blk)
724 {
725    Uint1* buffer;       /* holds sequence for plus strand or protein. */
726    Int4 buffer_length;
727    Int2 status;
728    Uint1 num_frames;
729    EBlastEncoding encoding;
730 
731    if (query_slp == NULL || query_options == NULL ||
732        query_info == NULL || query_blk == NULL)
733       return -1;
734 
735    if ((status = s_QueryInfoSetUp(query_slp, program_number, query_info)))
736       return status;
737 
738    if (program_number == eBlastTypeBlastn || 
739        program_number == eBlastTypePhiBlastn) {
740       encoding = eBlastEncodingNucleotide;
741       num_frames = 2;
742    } else if (Blast_QueryIsProtein(program_number)) {
743       encoding = eBlastEncodingProtein;
744       num_frames = 1;
745    } else { /* blastx or rpstblastn, which is also essentially blastx */
746       encoding = eBlastEncodingNcbi4na;
747       num_frames = NUM_FRAMES;
748    }
749 
750    if ((status=s_GetSequence(query_slp, *query_info, query_options,
751                   num_frames, encoding, &buffer, &buffer_length)))
752       return status; 
753         
754    /* Do not count the first and last sentinel bytes in the 
755       query length */
756    if ((status=BlastSetUp_SeqBlkNew(buffer, buffer_length-2, 
757                                     query_blk, TRUE)))
758       return status;
759 
760    if (masking_locs) {
761        BlastMaskLoc* lcase_mask = BlastMaskLocFromSeqLoc(masking_locs, 
762                                                          query_slp,
763                                                          program_number);
764        if (Blast_QueryIsTranslated(program_number))
765            BlastMaskLocDNAToProtein(lcase_mask, *query_info);
766        (*query_blk)->lcase_mask = lcase_mask;
767        (*query_blk)->lcase_mask_allocated = TRUE;
768    }
769 
770    return 0;
771 }
772 
773 Int2 BLAST_SetUpSubject(EBlastProgramType program_number, 
774         SeqLocPtr subject_slp, BLAST_SequenceBlk** subject)
775 {
776    Int2 status = 0;
777    Uint1* subject_buffer = NULL; /* Buffer for the compressed subject 
778                                       sequence in two sequences case */
779    Int4 buffer_length=0; /* Length of subject sequence for two sequences 
780                             case */
781    EBlastEncoding encoding;
782    const Boolean kNucleotide = (program_number == eBlastTypeBlastn || 
783                                 program_number == eBlastTypePhiBlastn);
784    const Boolean kTranslated = Blast_SubjectIsTranslated(program_number);
785 
786    if (kNucleotide)
787       encoding = eBlastEncodingNucleotide;
788    else if (kTranslated) {
789       encoding = eBlastEncodingNcbi4na;
790    } else {
791       encoding = eBlastEncodingProtein;
792    }
793 
794    if ((status = s_GetSequence(subject_slp, NULL, NULL, 1, encoding,
795                                    &subject_buffer, &buffer_length)))
796       return status;
797    
798    /* Initialize the sequence block, saving the sequence buffer in 
799       'sequence_start'. */
800    if ((status=BlastSetUp_SeqBlkNew(subject_buffer, buffer_length,
801                                     subject, TRUE)))
802       return status;
803 
804    /* If subject sequence is nucleotide, create compressed sequence buffer
805       and save it in 'sequence'. For blastn, the sentinel bytes should not 
806       be included in the packed sequence. */
807    if (kNucleotide)
808       ++subject_buffer;
809 
810    if (kNucleotide || kTranslated) {
811       BLAST_PackDNA(subject_buffer, buffer_length, encoding, 
812                     &((*subject)->sequence));
813       (*subject)->sequence_allocated = TRUE;
814    }
815 
816    return 0;
817 }
818 /* @} */
819 
820 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.