|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/algo/blast/api/blast_seq.c |
source navigation diff markup identifier search freetext search file search |
1 #ifndef SKIP_DOXYGEN_PROCESSING
2 static char const rcsid[] = "$Id: blast_seq.c,v 1.89 2008/06/09 17:29:14 madden Exp $";
3 #endif /* SKIP_DOXYGEN_PROCESSING */
4 /*
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * Author: Ilya Dondoshansky
28 * ===========================================================================*/
29
30 /** @file blast_seq.c
31 * Functions converting between SeqLocs and structures used in BLAST.
32 */
33
34 #include <seqport.h>
35 #include <sequtil.h>
36 #include <objloc.h>
37 #include <readdb.h>
38 #include <algo/blast/api/blast_seq.h>
39 #include <algo/blast/core/blast_filter.h>
40 #include <algo/blast/core/blast_util.h>
41 #include <algo/blast/core/blast_encoding.h>
42 #include <algo/blast/core/blast_setup.h> /* For BlastSeqLoc_RestrictToInterval */
43
44 /** @addtogroup CToolkitAlgoBlast
45 *
46 * @{
47 */
48
49
50 /** Structure used for hash-based comparison of sequence IDs */
51 typedef struct SeqIdHash {
52 SeqId *id; /**< The ID of this entry */
53 Int4 query_index; /**< index of query with this ID */
54 Int4 next_id; /**< Offset of the next hash entry in a chain */
55 } SeqIdHash;
56
57 Boolean
58 BlastSeqlocsHaveDuplicateIDs(SeqLoc* query_seqlocs)
59 {
60 Boolean retval = FALSE;
61 const Int4 kNumSeqs = ValNodeLen(query_seqlocs);
62 const Int4 kLog2HashSize = 11;
63 SeqIdHash *id_entries;
64 Uint4 *hashtable;
65 Int4 curr_id_num;
66 SeqLocPtr slp;
67
68 if (kNumSeqs == 1)
69 return FALSE;
70
71 /* allocate hashtable */
72 hashtable = (Uint4 *)calloc((size_t)1 << kLog2HashSize, sizeof(Uint4));
73 id_entries = (SeqIdHash *)malloc((kNumSeqs + 1) * sizeof(SeqIdHash));
74
75 for (slp = query_seqlocs, curr_id_num = 1; slp; slp = slp->next) {
76
77 Uint4 hashval;
78 SeqIdPtr id = SeqLocId(slp);
79 Char buffer[64];
80
81 /* hash the ID of the next query sequence */
82 SeqIdLabel(id, buffer, sizeof(buffer), OM_LABEL_CONTENT);
83 hashval = readdb_sequence_hash(buffer, (int)strlen(buffer));
84 hashval = hashval >> (32 - kLog2HashSize);
85 if (hashtable[hashval] != 0) {
86 Int4 offset = hashtable[hashval];
87 /* check the chain of ID's in the resulting hash
88 entry for a duplicate */
89 while (offset != 0) {
90 SeqIdHash *curr_entry = id_entries + offset;
91 if (SeqIdMatch(id, curr_entry->id)) {
92 retval = TRUE;
93 goto clean_up;
94 }
95 offset = curr_entry->next_id;
96 }
97 }
98
99 /* no duplicate found; add the ID to the hashtable */
100 id_entries[curr_id_num].id = id;
101 id_entries[curr_id_num].next_id = hashtable[hashval];
102 hashtable[hashval] = curr_id_num++;
103 }
104
105 clean_up:
106 sfree(hashtable);
107 sfree(id_entries);
108 return retval;
109 }
110
111 /** Converts a SeqLocPtr to a BlastSeqLoc, used for formatting.
112 * @param slp SeqLocPtr to be converted [in]
113 * @param head_loc BlastSeqLoc returned from last call [in]
114 * @return pointer to BlastSeqLoc
115 */
116 static BlastSeqLoc*
117 s_BlastSeqLocFromSeqLoc(SeqLocPtr slp, BlastSeqLoc* head_loc)
118 {
119 BlastSeqLoc* last_loc = head_loc;
120
121 if (slp == NULL)
122 return NULL;
123
124 if (slp->choice == SEQLOC_PACKED_INT)
125 slp = (SeqLocPtr) slp->data.ptrvalue;
126
127 for ( ; slp; slp = slp->next) {
128 SeqIntPtr si = (SeqIntPtr) slp->data.ptrvalue;
129 if (!head_loc) {
130 last_loc = head_loc = BlastSeqLocNew(&last_loc, si->from, si->to);
131 } else {
132 last_loc = BlastSeqLocNew(&last_loc, si->from, si->to);
133 }
134 }
135 return head_loc;
136 }
137
138 BlastMaskLoc*
139 BlastMaskLocFromSeqLoc(SeqLoc* mask_seqlocs, SeqLoc* query_seqlocs,
140 EBlastProgramType program_number)
141 {
142 const Int4 kNumSeqs = ValNodeLen(query_seqlocs);
143 BlastMaskLoc* retval = NULL;
144 Int4 query_index = 0;
145 const unsigned int kNumContexts = BLAST_GetNumberOfContexts(program_number);
146 const Int4 kLog2HashSize = 11;
147 SeqIdHash *id_entries;
148 Uint4 *hashtable;
149 Int4 curr_id_num = 1;
150 SeqLocPtr query_slp, mask_slp;
151
152 if (!mask_seqlocs)
153 return NULL;
154
155 retval = BlastMaskLocNew(kNumSeqs*kNumContexts);
156
157 /* create hashtable for query IDs */
158 hashtable = (Uint4 *)calloc((size_t)1 << kLog2HashSize, sizeof(Uint4));
159 id_entries = (SeqIdHash *)malloc((kNumSeqs + 1) * sizeof(SeqIdHash));
160
161 /* add the ID of each query sequence to the hashtable */
162 for (query_slp = query_seqlocs; query_slp; query_slp = query_slp->next) {
163 Uint4 hashval;
164 SeqIdPtr seq_id = SeqLocId(query_slp);
165 Char buffer[64];
166
167 SeqIdLabel(seq_id, buffer, sizeof(buffer), OM_LABEL_CONTENT);
168 hashval = readdb_sequence_hash(buffer, (int)strlen(buffer));
169 hashval = hashval >> (32 - kLog2HashSize);
170
171 id_entries[curr_id_num].id = seq_id;
172 id_entries[curr_id_num].query_index = query_index++;
173 id_entries[curr_id_num].next_id = hashtable[hashval];
174 hashtable[hashval] = curr_id_num++;
175 }
176
177 /* for each mask location, find the query sequence containing
178 that mask and add to the list of filter locations for
179 that query. Note that this assumes IDs for all query
180 sequences are unique */
181
182 for (mask_slp = mask_seqlocs; mask_slp; mask_slp = mask_slp->next) {
183 SeqLocPtr current_mask = (SeqLocPtr) mask_slp->data.ptrvalue;
184 Uint4 hashval;
185 SeqIdPtr mask_id;
186 Char buffer[64];
187
188 if (current_mask == NULL)
189 continue;
190
191 mask_id = SeqLocId(current_mask);
192 SeqIdLabel(mask_id, buffer, sizeof(buffer), OM_LABEL_CONTENT);
193 hashval = readdb_sequence_hash(buffer, (int)strlen(buffer));
194 hashval = hashval >> (32 - kLog2HashSize);
195
196 /* examine only the query IDs that hash to the same value */
197 if (hashtable[hashval] != 0) {
198 Int4 offset = hashtable[hashval];
199 while (offset != 0) {
200
201 SeqIdHash *q_entry = id_entries + offset;
202
203 if (SeqIdMatch(mask_id, q_entry->id)) {
204 Int4 context_idx = kNumContexts * q_entry->query_index;
205 retval->seqloc_array[context_idx] =
206 s_BlastSeqLocFromSeqLoc(current_mask,
207 retval->seqloc_array[context_idx]);
208 break;
209 }
210 offset = q_entry->next_id;
211 }
212 }
213 }
214
215 sfree(hashtable);
216 sfree(id_entries);
217
218 /* iterate through the query sequences and compute
219 the complement of the filtering locations for each */
220
221 for (query_slp = query_seqlocs, query_index = 0;
222 query_slp;
223 query_slp = query_slp->next, query_index++) {
224
225 const int kCtxIndex = kNumContexts * query_index; /* context index */
226
227 if (retval->seqloc_array[kCtxIndex])
228 {
229 const Boolean kIsNa = Blast_QueryIsNucleotide(program_number) &&
230 !Blast_QueryIsTranslated(program_number) &&
231 !Blast_ProgramIsPhiBlast(program_number);
232 BlastSeqLoc_RestrictToInterval(&retval->seqloc_array[kCtxIndex],
233 SeqLocStart(query_slp),
234 SeqLocStop(query_slp));
235 if (kIsNa) {
236 /* N.B.: Unlike in the C++ APIs, this logic is only applied to
237 * non-translated nucleotide queries. See comment for
238 * BlastMaskLocDNAToProtein */
239 Uint1 strand = SeqLocStrand(query_slp);
240 if (strand == Seq_strand_minus) {
241 retval->seqloc_array[kCtxIndex+1] =
242 retval->seqloc_array[kCtxIndex];
243 retval->seqloc_array[kCtxIndex] = NULL;
244 } else if (strand == Seq_strand_plus) {
245 retval->seqloc_array[kCtxIndex+1] = NULL;
246 } else {
247 retval->seqloc_array[kCtxIndex+1] =
248 BlastSeqLocListDup(retval->seqloc_array[kCtxIndex]);
249 }
250 }
251 }
252 }
253
254 return retval;
255 }
256
257 SeqLoc*
258 Blast_ValNodeMaskListFree(SeqLoc* mask_loc)
259 {
260 ValNode* mask_var;
261 for (mask_var = mask_loc; mask_var; mask_var = mask_var->next)
262 SeqLocSetFree((SeqLoc*)mask_var->data.ptrvalue);
263 mask_loc = ValNodeFree(mask_loc);
264 return mask_loc;
265 }
266
267 SeqLocPtr BlastMaskLocToSeqLoc(EBlastProgramType program_number,
268 const BlastMaskLoc* mask_loc,
269 SeqLoc* query_loc)
270 {
271 SeqLocPtr retval = NULL, retval_tail = NULL;
272 Int4 index;
273 const Boolean k_translate = Blast_QueryIsTranslated(program_number);
274 const Uint1 k_num_frames = BLAST_GetNumberOfContexts(program_number);
275 const Boolean kIsNucl = (program_number == eBlastTypeBlastn);
276 SeqLoc* slp;
277 Boolean all_minus = TRUE;
278
279 if (mask_loc == NULL || mask_loc->seqloc_array == NULL)
280 return NULL;
281
282 for (slp = query_loc; slp; slp = slp->next)
283 {
284 Uint1 strand = SeqLocStrand(slp);
285 if (strand != Seq_strand_minus)
286 {
287 all_minus = FALSE;
288 break;
289 }
290 }
291
292 for (index=0, slp = query_loc; slp; ++index, slp = slp->next)
293 {
294 const int kCtxIndex = k_num_frames * index; /* context index */
295 Int4 tmp_index;
296 Int4 slp_from = SeqLocStart(slp);
297 SeqIdPtr seqid = SeqLocId(slp);
298 for (tmp_index=kCtxIndex; tmp_index<(kCtxIndex+k_num_frames); tmp_index++)
299 {
300 BlastSeqLoc* loc = NULL;
301 SeqLocPtr mask_slp_head = NULL, mask_slp_tail = NULL;
302 if (all_minus || BlastIsReverseStrand(kIsNucl , tmp_index) == FALSE)
303 {
304 for (loc = mask_loc->seqloc_array[tmp_index]; loc; loc = loc->next)
305 {
306 SeqIntPtr si = SeqIntNew();
307 si->from = loc->ssr->left + slp_from;
308 si->to = loc->ssr->right + slp_from;
309 si->id = SeqIdDup(seqid);
310 /* Append the pointer, but also keep track of the tail of the list
311 * so that appending to the list is a constant operation */
312 mask_slp_tail = ValNodeAddPointer
313 ( (mask_slp_tail ? &mask_slp_tail : &mask_slp_head),
314 SEQLOC_INT, si);
315 }
316 }
317
318 if (mask_slp_head) {
319 SeqLocPtr new_mask_slp = ValNodeAddPointer(NULL, SEQLOC_PACKED_INT,
320 mask_slp_head);
321 Uint1 tmp_choice = 0;
322 /* The 'choice' of the SeqLoc in masks should show the frame,
323 with values 1..6 when queries are translated; otherwise
324 it does not matter. */
325 if (k_translate)
326 tmp_choice = (tmp_index % NUM_FRAMES) + 1;
327 else
328 tmp_choice = 0;
329
330 /* Append the pointer, but also keep track of the tail of the list
331 * so that appending to the list is a constant operation */
332 retval_tail = ValNodeAddPointer
333 ( (retval_tail ? &retval_tail : &retval),
334 tmp_choice, new_mask_slp);
335 }
336 }
337 }
338 return retval;
339 }
340
341 /** Set field values for one element of the context array of a
342 * concatenated query. All previous contexts should have already been
343 * assigned correct values.
344 * @param qinfo Query info structure containing contexts. [in/out]
345 * @param index Index of the context to fill. [in]
346 * @param length Length of this context. [in]
347 */
348 static void
349 s_QueryInfoSetContextInfo(BlastQueryInfo* qinfo,
350 Uint4 index,
351 Uint4 length)
352 {
353 if (index) {
354 Uint4 prev_loc = qinfo->contexts[index-1].query_offset;
355 Uint4 prev_len = qinfo->contexts[index-1].query_length;
356
357 Uint4 shift = prev_len ? prev_len + 1 : 0;
358
359 qinfo->contexts[index].query_offset = prev_loc + shift;
360 qinfo->contexts[index].query_length = length;
361 if (length == 0)
362 qinfo->contexts[index].is_valid = FALSE;
363
364 } else {
365 /* First context */
366 qinfo->contexts[0].query_offset = 0;
367 qinfo->contexts[0].query_length = length;
368 if (length == 0)
369 qinfo->contexts[0].is_valid = FALSE;
370 }
371 }
372
373 /** Sets up the query information structure with all contexts' data.
374 * @param slp List of query Seq-loc's [in]
375 * @param program Type of BLAST program [in]
376 * @param query_info_ptr Pointer to the structure to populate. [out]
377 */
378 static Int4
379 s_QueryInfoSetUp(SeqLocPtr slp, EBlastProgramType program,
380 BlastQueryInfo** query_info_ptr)
381 {
382 Uint4 length, protein_length;
383 Boolean translate =
384 (program == eBlastTypeBlastx || program == eBlastTypeTblastx ||
385 program == eBlastTypeRpsTblastn);
386 Boolean is_na = (program == eBlastTypeBlastn ||
387 program == eBlastTypePhiBlastn);
388 Int2 num_frames, frame;
389 Uint1 strand;
390 BlastQueryInfo* query_info;
391 Int4 index;
392 Uint4 max_length = 0;
393
394 if (translate)
395 num_frames = NUM_FRAMES;
396 else if (is_na)
397 num_frames = 2;
398 else
399 num_frames = 1;
400
401 if ((query_info = BlastQueryInfoNew(program, ValNodeLen(slp))) == NULL)
402 return -1;
403
404 if ((strand = SeqLocStrand(slp)) == Seq_strand_minus) {
405 if (translate)
406 query_info->first_context = 3;
407 else
408 query_info->first_context = 1;
409 }
410
411 /* Fill the context offsets */
412 for (index = 0; slp; slp = slp->next, index += num_frames) {
413 length = SeqLocLen(slp); /* FIXME: could return -1 */
414 strand = SeqLocStrand(slp);
415 if (translate) {
416 Int2 first_frame, last_frame;
417 if (strand == Seq_strand_plus) {
418 first_frame = 0;
419 last_frame = 2;
420 } else if (strand == Seq_strand_minus) {
421 first_frame = 3;
422 last_frame = 5;
423 } else {
424 first_frame = 0;
425 last_frame = 5;
426 }
427
428 /* Set the unused initial contexts if any */
429 for (frame = 0; frame < first_frame; ++frame) {
430 s_QueryInfoSetContextInfo(query_info, index+frame, 0);
431 }
432
433 for (frame = first_frame; frame <= last_frame; ++frame) {
434 protein_length = BLAST_GetTranslatedProteinLength(length, index+frame);
435 max_length = MAX(max_length, protein_length);
436
437 s_QueryInfoSetContextInfo(query_info,
438 index+frame,
439 protein_length);
440 }
441
442 /* Set the unused trailing contexts if any */
443 for (frame = last_frame + 1; frame < num_frames; ++frame) {
444 s_QueryInfoSetContextInfo(query_info, index+frame, 0);
445 }
446 } else {
447 max_length = MAX(max_length, length);
448
449 if (is_na) {
450 if (strand == Seq_strand_plus) {
451 s_QueryInfoSetContextInfo(query_info, index, length);
452 s_QueryInfoSetContextInfo(query_info, index+1, 0);
453 } else if (strand == Seq_strand_minus) {
454 s_QueryInfoSetContextInfo(query_info, index, 0);
455 s_QueryInfoSetContextInfo(query_info, index+1, length);
456 } else {
457 s_QueryInfoSetContextInfo(query_info, index, length);
458 s_QueryInfoSetContextInfo(query_info, index+1, length);
459 }
460 } else {
461 s_QueryInfoSetContextInfo(query_info, index, length);
462 }
463 }
464 }
465 query_info->max_length = max_length;
466
467 *query_info_ptr = query_info;
468 return 0;
469 }
470
471 /** Given a SeqLoc, fills a preallocated sequence buffer in the correct
472 * encoding.
473 * @param slp SeqLoc structure to get data from. [in]
474 * @param encoding What encoding to fill sequence buffer in? [in]
475 * @param buffer Buffer to fill.
476 */
477 static Int2
478 s_SeqLocReadSequence(SeqLocPtr slp, EBlastEncoding encoding, Uint1** buffer)
479 {
480 Uint1* buffer_var = *buffer;
481 Int4 size, index;
482
483 if (!buffer_var || !slp)
484 return -1;
485
486 size = SeqPortStreamLoc(slp, STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
487 buffer_var, NULL);
488
489 switch (encoding) {
490 case eBlastEncodingProtein:
491 for (index = 0; index < size; index++)
492 buffer_var[index] = AMINOACID_TO_NCBISTDAA[buffer_var[index]];
493 break;
494 case eBlastEncodingNcbi4na:
495 for (index = 0; index < size; index++)
496 buffer_var[index] = IUPACNA_TO_NCBI4NA[buffer_var[index]];
497 break;
498 case eBlastEncodingNucleotide:
499 for (index = 0; index < size; index++)
500 buffer_var[index] = IUPACNA_TO_BLASTNA[buffer_var[index]];
501 break;
502 default:
503 /* This function should not be called for any other encodings -
504 return an error status. */
505 return -1;
506 }
507
508 *buffer = buffer_var + size;
509 return 0;
510 }
511
512 /** Fills sequence buffer for a single SeqLoc; fills both strands if necessary.
513 * @param slp Sequence location [in]
514 * @param encoding Encoding to use for the sequence buffer. [in]
515 * @param add_sentinel_bytes Should sentinel bytes be added at the ends of the
516 * buffer? [in]
517 * @param both_strands Should buffer include both strands for a nucleotide
518 * sequence? [in]
519 * @param buffer Buffer to populate. Must be already allocated. [in] [out]
520 */
521 static Int2
522 s_SeqLocFillSequenceBuffer(SeqLocPtr slp, EBlastEncoding encoding,
523 Boolean add_sentinel_bytes, Boolean both_strands, Uint1* buffer)
524 {
525 Uint1* buffer_var;
526 Uint1 sentinel = (encoding == eBlastEncodingNucleotide
527 ? NCBI4NA_TO_BLASTNA[NULLB]
528 : NULLB);
529 Uint1 seq_code, strand;
530
531 buffer_var = buffer;
532
533 if (add_sentinel_bytes) {
534 *buffer_var = sentinel;
535 ++buffer_var;
536 }
537
538 if (encoding == eBlastEncodingProtein) {
539 seq_code = Seq_code_ncbistdaa;
540 strand = Seq_strand_unknown;
541 } else {
542 seq_code = Seq_code_ncbi4na;
543 strand = SeqLocStrand(slp);
544 }
545
546 s_SeqLocReadSequence(slp, encoding, &buffer_var);
547
548 if (add_sentinel_bytes)
549 *buffer_var = sentinel;
550
551 if (both_strands && strand == Seq_strand_both) {
552 SeqLocPtr tmp_slp=NULL;
553
554 ++buffer_var;
555
556 tmp_slp = SeqLocIntNew(SeqLocStart(slp), SeqLocStop(slp),
557 Seq_strand_minus, SeqLocId(slp));
558
559 s_SeqLocReadSequence(tmp_slp, encoding, &buffer_var);
560 if (add_sentinel_bytes)
561 *buffer_var = sentinel;
562
563 SeqLocFree(tmp_slp);
564 }
565
566 return 0;
567 }
568
569 Int2 BLAST_GeneticCodeFind(Int4 gc, Uint1** genetic_code)
570 {
571 ValNodePtr vnp;
572 GeneticCodePtr gcp;
573 char* gen_code_eaa = NULL;
574 Uint1* gen_code_stdaa = NULL;
575 Int4 gen_code_length = 0, index;
576 SeqMapTablePtr smtp;
577
578 gcp = GeneticCodeFind(gc, NULL);
579 for (vnp = (ValNodePtr)gcp->data.ptrvalue; vnp != NULL;
580 vnp = vnp->next) {
581 if (vnp->choice == 3) { /* ncbieaa */
582 gen_code_eaa = (char*)vnp->data.ptrvalue;
583 break;
584 }
585 }
586
587 if (!gen_code_eaa)
588 return -1;
589 smtp = SeqMapTableFind(Seq_code_ncbistdaa, Seq_code_ncbieaa);
590 gen_code_length = (Int4)StrLen(gen_code_eaa);
591 *genetic_code = gen_code_stdaa = (Uint1*) calloc(gen_code_length+1, 1);
592
593 if (!gen_code_stdaa)
594 return -2;
595
596 for (index = 0; index < gen_code_length; ++index) {
597 gen_code_stdaa[index] =
598 SeqMapTableConvert(smtp, gen_code_eaa[index]);
599 }
600
601 return 0;
602 }
603
604 /** s_GetSequence
605 * Purpose: Get the sequence for the BLAST engine, put in a Uint1 buffer
606 * @param slp SeqLoc to extract sequence for [in]
607 * @param query_info The query information structure, pre-initialized,
608 * but filled here [in]
609 * @param query_options Query setup options, containing the genetic code for
610 * translation. N.B.: its strand_option field is ignored [in]
611 * @param num_frames How many frames to get for this sequence? [in]
612 * @param encoding In what encoding to retrieve the sequence? [in]
613 * @param buffer_out Buffer to hold plus strand or protein [out]
614 * @param buffer_length Length of buffer allocated [out]
615 */
616 static Int2
617 s_GetSequence(SeqLocPtr slp, BlastQueryInfo* query_info,
618 const QuerySetUpOptions* query_options, Uint1 num_frames,
619 EBlastEncoding encoding, Uint1* *buffer_out, Int4 *buffer_length)
620 {
621 Int2 status=0; /* return value. */
622 Int4 total_length; /* Total length of all queries/frames/strands */
623 Int4 index; /* Loop counter */
624 SeqLocPtr slp_var; /* loop variable */
625 Uint1* buffer; /* buffer to fill. */
626 Boolean add_sentinel_bytes = TRUE;
627 Uint1* genetic_code=NULL;
628 Boolean translate = FALSE;
629 Int4 offset = 0;
630
631 if (query_info) {
632 *buffer_length = total_length = QueryInfo_GetSeqBufLen(query_info);
633 } else {
634 /* Subject sequence in 2 sequences comparison */
635 *buffer_length = SeqLocLen(slp);
636 /* allow two extra bytes for sentinels or a trailing
637 null appended by the low-level sequence conversion */
638 total_length = (*buffer_length) + 2;
639 if (encoding == eBlastEncodingNcbi4na) {
640 /* Searches with translated subjects (tblastn, tblastx) */
641 add_sentinel_bytes = FALSE;
642 }
643 }
644
645 if (num_frames == NUM_FRAMES) {
646 /* Sequence must be translated in 6 frames. This can only happen
647 for query - subject sequences are translated later. */
648 Int4 gc;
649
650 translate = TRUE;
651 gc = (query_options ? query_options->genetic_code : 1);
652
653 if ((status = BLAST_GeneticCodeFind(gc, &genetic_code)) != 0)
654 return status;
655 }
656
657 *buffer_out = buffer = (Uint1 *) malloc((total_length)*sizeof(Uint1));
658
659 for (index = 0, slp_var = slp; slp_var;
660 slp_var = slp_var->next, index += num_frames)
661 {
662 if (translate) {
663 Uint1* na_buffer, *buffer_rev = NULL;
664 Int4 context, context_start, context_end;
665 Int4 na_length;
666 Uint1 strand;
667
668
669 na_length = SeqLocLen(slp_var);
670 strand = SeqLocStrand(slp_var);
671 /* Retrieve nucleotide sequence in an auxiliary buffer;
672 then translate into the appropriate place in the
673 preallocated buffer */
674 if (strand == Seq_strand_plus) {
675 na_buffer = (Uint1 *) malloc(na_length + 2);
676 context_start = 0;
677 context_end = 2;
678 } else if (strand == Seq_strand_minus) {
679 na_buffer = (Uint1 *) malloc(na_length + 2);
680 context_start = 3;
681 context_end = 5;
682 } else {
683 na_buffer = (Uint1*) malloc(2*na_length + 3);
684 context_start = 0;
685 context_end = 5;
686 }
687 s_SeqLocFillSequenceBuffer(slp_var, encoding, TRUE, TRUE, na_buffer);
688 if (strand == Seq_strand_both)
689 buffer_rev = na_buffer + na_length + 1;
690 else if (strand == Seq_strand_minus)
691 buffer_rev = na_buffer;
692
693 for (context = context_start; context <= context_end; context++) {
694 offset = query_info->contexts[index+context].query_offset;
695
696 BLAST_GetTranslation(na_buffer+1, buffer_rev, na_length,
697 BLAST_ContextToFrame(eBlastTypeBlastx, context),
698 &buffer[offset], genetic_code);
699 }
700 sfree(na_buffer);
701 } else {
702 /* This can happen both for query and subject, so query_info
703 might not be initialized here. */
704 if (query_info)
705 offset = query_info->contexts[index].query_offset;
706
707 s_SeqLocFillSequenceBuffer(slp_var, encoding, add_sentinel_bytes,
708 (Boolean)(num_frames == 2), &buffer[offset]);
709 }
710 /* For subjects, do only one SeqLoc at a time */
711 if (!query_info)
712 break;
713 }
714
715 sfree(genetic_code);
716
717 return status;
718 }
719
720 Int2 BLAST_SetUpQuery(EBlastProgramType program_number,
721 SeqLocPtr query_slp, const QuerySetUpOptions* query_options,
722 SeqLoc* masking_locs, BlastQueryInfo** query_info,
723 BLAST_SequenceBlk* *query_blk)
724 {
725 Uint1* buffer; /* holds sequence for plus strand or protein. */
726 Int4 buffer_length;
727 Int2 status;
728 Uint1 num_frames;
729 EBlastEncoding encoding;
730
731 if (query_slp == NULL || query_options == NULL ||
732 query_info == NULL || query_blk == NULL)
733 return -1;
734
735 if ((status = s_QueryInfoSetUp(query_slp, program_number, query_info)))
736 return status;
737
738 if (program_number == eBlastTypeBlastn ||
739 program_number == eBlastTypePhiBlastn) {
740 encoding = eBlastEncodingNucleotide;
741 num_frames = 2;
742 } else if (Blast_QueryIsProtein(program_number)) {
743 encoding = eBlastEncodingProtein;
744 num_frames = 1;
745 } else { /* blastx or rpstblastn, which is also essentially blastx */
746 encoding = eBlastEncodingNcbi4na;
747 num_frames = NUM_FRAMES;
748 }
749
750 if ((status=s_GetSequence(query_slp, *query_info, query_options,
751 num_frames, encoding, &buffer, &buffer_length)))
752 return status;
753
754 /* Do not count the first and last sentinel bytes in the
755 query length */
756 if ((status=BlastSetUp_SeqBlkNew(buffer, buffer_length-2,
757 query_blk, TRUE)))
758 return status;
759
760 if (masking_locs) {
761 BlastMaskLoc* lcase_mask = BlastMaskLocFromSeqLoc(masking_locs,
762 query_slp,
763 program_number);
764 if (Blast_QueryIsTranslated(program_number))
765 BlastMaskLocDNAToProtein(lcase_mask, *query_info);
766 (*query_blk)->lcase_mask = lcase_mask;
767 (*query_blk)->lcase_mask_allocated = TRUE;
768 }
769
770 return 0;
771 }
772
773 Int2 BLAST_SetUpSubject(EBlastProgramType program_number,
774 SeqLocPtr subject_slp, BLAST_SequenceBlk** subject)
775 {
776 Int2 status = 0;
777 Uint1* subject_buffer = NULL; /* Buffer for the compressed subject
778 sequence in two sequences case */
779 Int4 buffer_length=0; /* Length of subject sequence for two sequences
780 case */
781 EBlastEncoding encoding;
782 const Boolean kNucleotide = (program_number == eBlastTypeBlastn ||
783 program_number == eBlastTypePhiBlastn);
784 const Boolean kTranslated = Blast_SubjectIsTranslated(program_number);
785
786 if (kNucleotide)
787 encoding = eBlastEncodingNucleotide;
788 else if (kTranslated) {
789 encoding = eBlastEncodingNcbi4na;
790 } else {
791 encoding = eBlastEncodingProtein;
792 }
793
794 if ((status = s_GetSequence(subject_slp, NULL, NULL, 1, encoding,
795 &subject_buffer, &buffer_length)))
796 return status;
797
798 /* Initialize the sequence block, saving the sequence buffer in
799 'sequence_start'. */
800 if ((status=BlastSetUp_SeqBlkNew(subject_buffer, buffer_length,
801 subject, TRUE)))
802 return status;
803
804 /* If subject sequence is nucleotide, create compressed sequence buffer
805 and save it in 'sequence'. For blastn, the sentinel bytes should not
806 be included in the packed sequence. */
807 if (kNucleotide)
808 ++subject_buffer;
809
810 if (kNucleotide || kTranslated) {
811 BLAST_PackDNA(subject_buffer, buffer_length, encoding,
812 &((*subject)->sequence));
813 (*subject)->sequence_allocated = TRUE;
814 }
815
816 return 0;
817 }
818 /* @} */
819
820 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |